Browse code

cluster: Renew the context after communicating with the registry

When pinning by digest, the registry might be slow or unresponsive. This
could cause the context to already be expired by the time UpdateService
or CreateService is called. We want digest pinning to be a best-effort
operation, so it's problematic if a slow or misbehaving registry
prevents the service operation from completing. Replace the context
after communicating with the registry, so we have a fresh timeout for
the gRPC call.

Signed-off-by: Aaron Lehmann <aaron.lehmann@docker.com>
(cherry picked from commit f8273a216ed35e22ac157dee8055393f07d4be39)

Aaron Lehmann authored on 2017/03/07 09:05:56
Showing 2 changed files
... ...
@@ -32,6 +32,7 @@ be found.
32 32
  * Retry failed network allocations less aggressively [docker/swarmkit#2021](https://github.com/docker/swarmkit/pull/2021)
33 33
  * Avoid network allocation for tasks that are no longer running [docker/swarmkit#2017](https://github.com/docker/swarmkit/pull/2017)
34 34
  * Bookkeeping fixes inside network allocator [docker/swarmkit#2019](https://github.com/docker/swarmkit/pull/2019) [docker/swarmkit#2020](https://github.com/docker/swarmkit/pull/2020)
35
+* Avoid timing out service create or update when a registry is slow to respond [#31861](https://github.com/docker/docker/pull/31861)
35 36
 
36 37
 ### Windows
37 38
 
... ...
@@ -1119,6 +1119,16 @@ func (c *Cluster) CreateService(s types.ServiceSpec, encodedAuth string) (*apity
1119 1119
 		} else {
1120 1120
 			logrus.Debugf("creating service using supplied digest reference %s", ctnr.Image)
1121 1121
 		}
1122
+
1123
+		// Replace the context with a fresh one.
1124
+		// If we timed out while communicating with the
1125
+		// registry, then "ctx" will already be expired, which
1126
+		// would cause CreateService below to fail. Reusing
1127
+		// "ctx" could make it impossible to create a service
1128
+		// if the registry is slow or unresponsive.
1129
+		var newCancel func()
1130
+		ctx, newCancel = c.getRequestContext()
1131
+		defer newCancel()
1122 1132
 	}
1123 1133
 
1124 1134
 	r, err := c.client.CreateService(ctx, &swarmapi.CreateServiceRequest{Spec: &serviceSpec})
... ...
@@ -1230,6 +1240,16 @@ func (c *Cluster) UpdateService(serviceIDOrName string, version uint64, spec typ
1230 1230
 		} else {
1231 1231
 			logrus.Debugf("updating service using supplied digest reference %s", newCtnr.Image)
1232 1232
 		}
1233
+
1234
+		// Replace the context with a fresh one.
1235
+		// If we timed out while communicating with the
1236
+		// registry, then "ctx" will already be expired, which
1237
+		// would cause UpdateService below to fail. Reusing
1238
+		// "ctx" could make it impossible to update a service
1239
+		// if the registry is slow or unresponsive.
1240
+		var newCancel func()
1241
+		ctx, newCancel = c.getRequestContext()
1242
+		defer newCancel()
1233 1243
 	}
1234 1244
 
1235 1245
 	_, err = c.client.UpdateService(