Browse code

ContainerWait on remove: don't stuck on rm fail

Currently, if a container removal has failed for some reason,
any client waiting for removal (e.g. `docker run --rm`) is
stuck, waiting for removal to succeed while it has failed already.
For more details and the reproducer, please check
https://github.com/moby/moby/issues/34945

This commit addresses that by allowing `ContainerWait()` with
`container.WaitCondition == "removed"` argument to return an
error in case of removal failure. The `ContainerWaitOKBody`
stucture returned to a client is amended with a pointer to `struct Error`,
containing an error message string, and the `Client.ContainerWait()`
is modified to return the error, if any, to the client.

Note that this feature is only available for API version >= 1.34.
In order for the old clients to be unstuck, we just close the connection
without writing anything -- this causes client's error.

Now, docker-cli would need a separate commit to bump the API to 1.34
and to show an error returned, if any.

[v2: recreate the waitRemove channel after closing]
[v3: document; keep legacy behavior for older clients]
[v4: convert Error from string to pointer to a struct]
[v5: don't emulate old behavior, send empty response in error case]
[v6: rename legacy* vars to include version suffix]

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>

Kir Kolyshkin authored on 2017/09/28 03:49:22
Showing 6 changed files
... ...
@@ -280,11 +280,12 @@ func (s *containerRouter) postContainersWait(ctx context.Context, w http.Respons
280 280
 	// Behavior changed in version 1.30 to handle wait condition and to
281 281
 	// return headers immediately.
282 282
 	version := httputils.VersionFromContext(ctx)
283
-	legacyBehavior := versions.LessThan(version, "1.30")
283
+	legacyBehaviorPre130 := versions.LessThan(version, "1.30")
284
+	legacyRemovalWaitPre134 := false
284 285
 
285 286
 	// The wait condition defaults to "not-running".
286 287
 	waitCondition := containerpkg.WaitConditionNotRunning
287
-	if !legacyBehavior {
288
+	if !legacyBehaviorPre130 {
288 289
 		if err := httputils.ParseForm(r); err != nil {
289 290
 			return err
290 291
 		}
... ...
@@ -293,6 +294,7 @@ func (s *containerRouter) postContainersWait(ctx context.Context, w http.Respons
293 293
 			waitCondition = containerpkg.WaitConditionNextExit
294 294
 		case container.WaitConditionRemoved:
295 295
 			waitCondition = containerpkg.WaitConditionRemoved
296
+			legacyRemovalWaitPre134 = versions.LessThan(version, "1.34")
296 297
 		}
297 298
 	}
298 299
 
... ...
@@ -306,7 +308,7 @@ func (s *containerRouter) postContainersWait(ctx context.Context, w http.Respons
306 306
 
307 307
 	w.Header().Set("Content-Type", "application/json")
308 308
 
309
-	if !legacyBehavior {
309
+	if !legacyBehaviorPre130 {
310 310
 		// Write response header immediately.
311 311
 		w.WriteHeader(http.StatusOK)
312 312
 		if flusher, ok := w.(http.Flusher); ok {
... ...
@@ -317,8 +319,22 @@ func (s *containerRouter) postContainersWait(ctx context.Context, w http.Respons
317 317
 	// Block on the result of the wait operation.
318 318
 	status := <-waitC
319 319
 
320
+	// With API < 1.34, wait on WaitConditionRemoved did not return
321
+	// in case container removal failed. The only way to report an
322
+	// error back to the client is to not write anything (i.e. send
323
+	// an empty response which will be treated as an error).
324
+	if legacyRemovalWaitPre134 && status.Err() != nil {
325
+		return nil
326
+	}
327
+
328
+	var waitError *container.ContainerWaitOKBodyError
329
+	if status.Err() != nil {
330
+		waitError = &container.ContainerWaitOKBodyError{Message: status.Err().Error()}
331
+	}
332
+
320 333
 	return json.NewEncoder(w).Encode(&container.ContainerWaitOKBody{
321 334
 		StatusCode: int64(status.ExitCode()),
335
+		Error:      waitError,
322 336
 	})
323 337
 }
324 338
 
... ...
@@ -5723,6 +5723,13 @@ paths:
5723 5723
                 description: "Exit code of the container"
5724 5724
                 type: "integer"
5725 5725
                 x-nullable: false
5726
+              Error:
5727
+                description: "container waiting error, if any"
5728
+                type: "object"
5729
+                properties:
5730
+                  Message:
5731
+                    description: "Details of an error"
5732
+                    type: "string"
5726 5733
         404:
5727 5734
           description: "no such container"
5728 5735
           schema:
... ...
@@ -7,10 +7,22 @@ package container
7 7
 // See hack/generate-swagger-api.sh
8 8
 // ----------------------------------------------------------------------------
9 9
 
10
+// ContainerWaitOKBodyError container waiting error, if any
11
+// swagger:model ContainerWaitOKBodyError
12
+type ContainerWaitOKBodyError struct {
13
+
14
+	// Details of an error
15
+	Message string `json:"Message,omitempty"`
16
+}
17
+
10 18
 // ContainerWaitOKBody container wait o k body
11 19
 // swagger:model ContainerWaitOKBody
12 20
 type ContainerWaitOKBody struct {
13 21
 
22
+	// error
23
+	// Required: true
24
+	Error *ContainerWaitOKBodyError `json:"Error"`
25
+
14 26
 	// Exit code of the container
15 27
 	// Required: true
16 28
 	StatusCode int64 `json:"StatusCode"`
... ...
@@ -29,7 +29,7 @@ type State struct {
29 29
 	Dead              bool
30 30
 	Pid               int
31 31
 	ExitCodeValue     int    `json:"ExitCode"`
32
-	ErrorMsg          string `json:"Error"` // contains last known error when starting the container
32
+	ErrorMsg          string `json:"Error"` // contains last known error during container start or remove
33 33
 	StartedAt         time.Time
34 34
 	FinishedAt        time.Time
35 35
 	Health            *Health
... ...
@@ -319,7 +319,10 @@ func (s *State) SetRestarting(exitStatus *ExitStatus) {
319 319
 // know the error that occurred when container transits to another state
320 320
 // when inspecting it
321 321
 func (s *State) SetError(err error) {
322
-	s.ErrorMsg = err.Error()
322
+	s.ErrorMsg = ""
323
+	if err != nil {
324
+		s.ErrorMsg = err.Error()
325
+	}
323 326
 }
324 327
 
325 328
 // IsPaused returns whether the container is paused or not.
... ...
@@ -385,8 +388,18 @@ func (s *State) IsDead() bool {
385 385
 // closes the internal waitRemove channel to unblock callers waiting for a
386 386
 // container to be removed.
387 387
 func (s *State) SetRemoved() {
388
+	s.SetRemovalError(nil)
389
+}
390
+
391
+// SetRemovalError is to be called in case a container remove failed.
392
+// It sets an error and closes the internal waitRemove channel to unblock
393
+// callers waiting for the container to be removed.
394
+func (s *State) SetRemovalError(err error) {
395
+	s.SetError(err)
388 396
 	s.Lock()
389 397
 	close(s.waitRemove) // Unblock those waiting on remove.
398
+	// Recreate the channel so next ContainerWait will work
399
+	s.waitRemove = make(chan struct{})
390 400
 	s.Unlock()
391 401
 }
392 402
 
... ...
@@ -120,12 +120,16 @@ func (daemon *Daemon) cleanupContainer(container *container.Container, forceRemo
120 120
 		metadata, err := daemon.stores[container.OS].layerStore.ReleaseRWLayer(container.RWLayer)
121 121
 		layer.LogReleaseMetadata(metadata)
122 122
 		if err != nil && err != layer.ErrMountDoesNotExist && !os.IsNotExist(errors.Cause(err)) {
123
-			return errors.Wrapf(err, "driver %q failed to remove root filesystem for %s", daemon.GraphDriverName(container.OS), container.ID)
123
+			e := errors.Wrapf(err, "driver %q failed to remove root filesystem for %s", daemon.GraphDriverName(container.OS), container.ID)
124
+			container.SetRemovalError(e)
125
+			return e
124 126
 		}
125 127
 	}
126 128
 
127 129
 	if err := system.EnsureRemoveAll(container.Root); err != nil {
128
-		return errors.Wrapf(err, "unable to remove filesystem for %s", container.ID)
130
+		e := errors.Wrapf(err, "unable to remove filesystem for %s", container.ID)
131
+		container.SetRemovalError(e)
132
+		return e
129 133
 	}
130 134
 
131 135
 	linkNames := daemon.linkIndex.delete(container)
... ...
@@ -17,12 +17,19 @@ keywords: "API, Docker, rcli, REST, documentation"
17 17
 
18 18
 [Docker Engine API v1.34](https://docs.docker.com/engine/api/v1.34/) documentation
19 19
 
20
+* `POST /containers/(name)/wait?condition=removed` now also also returns
21
+  in case of container removal failure. A pointer to a structure named
22
+  `Error` added to the response JSON in order to indicate a failure.
23
+  If `Error` is `null`, container removal has succeeded, otherwise
24
+  the test of an error message indicating why container removal has failed
25
+  is available from `Error.Message` field.
26
+
20 27
 ## v1.33 API changes
21 28
 
22 29
 [Docker Engine API v1.33](https://docs.docker.com/engine/api/v1.33/) documentation
23 30
 
24 31
 * `GET /events` now supports filtering 4 more kinds of events: `config`, `node`,
25
-`secret` and `service`. 
32
+`secret` and `service`.
26 33
 
27 34
 ## v1.32 API changes
28 35