Browse code

Handle missing c8d task on stop

In this case, we are sending a signal to the container (typically this
would be SIGKILL or SIGTERM, but could be any signal), but container
reports that the process does not exist.

At the point this code is happening, dockerd thinks that the container
is running, but containerd reports that it is not.

Since containerd reports that it is not running, try to collect the exit
status of the container from containerd, and mark the container as
stopped in dockerd.

Repro this problem like so:

```
id=$(docker run -d busybox top)
pkill containerd && pkill top
docker stop $id
```

Without this change, `docker stop $id` will first try to send SIGTERM,
wait for exit, then try SIGKILL.
Because the process doesn't exist to begin with, no signal is sent, and
so nothing happens.
Since we won't receive any event here to process, the container can
never be marked as stopped until the daemon is restarted.

With the change `docker stop` succeeds immediately (since the process is
already stopped) and we mark the container as stopped. We handle the
case as if we missed a exit event.

There are definitely some other places in the stack that could use some
improvement here, but this helps people get out of a sticky situation.

With io.containerd.runc.v2, no event is ever recieved by docker because
the shim quits trying to send the event.

With io.containerd.runtime.v1.linux the TastExit event is sent before
dockerd can reconnect to the event stream and we miss the event.

No matter what, we shouldn't be reliant on the shim doing the right
thing here, nor can we rely on a steady event stream.

Signed-off-by: Brian Goff <cpuguy83@gmail.com>

Brian Goff authored on 2020/07/18 03:47:40
Showing 2 changed files
... ...
@@ -99,6 +99,7 @@ func (daemon *Daemon) killWithSignal(container *containerpkg.Container, sig int)
99 99
 		if errdefs.IsNotFound(err) {
100 100
 			unpause = false
101 101
 			logrus.WithError(err).WithField("container", container.ID).WithField("action", "kill").Debug("container kill failed because of 'container not found' or 'no such process'")
102
+			go daemon.handleContainerExit(container, nil)
102 103
 		} else {
103 104
 			return errors.Wrapf(err, "Cannot kill container %s", container.ID)
104 105
 		}
... ...
@@ -24,6 +24,82 @@ func (daemon *Daemon) setStateCounter(c *container.Container) {
24 24
 	}
25 25
 }
26 26
 
27
+func (daemon *Daemon) handleContainerExit(c *container.Container, e *libcontainerdtypes.EventInfo) error {
28
+	c.Lock()
29
+
30
+	ec, et, err := daemon.containerd.DeleteTask(context.Background(), c.ID)
31
+	if err != nil {
32
+		logrus.WithError(err).Warnf("failed to delete container %s from containerd", c.ID)
33
+	}
34
+
35
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
36
+	c.StreamConfig.Wait(ctx)
37
+	cancel()
38
+	c.Reset(false)
39
+
40
+	exitStatus := container.ExitStatus{
41
+		ExitCode: int(ec),
42
+		ExitedAt: et,
43
+	}
44
+	if e != nil {
45
+		exitStatus.ExitCode = int(e.ExitCode)
46
+		exitStatus.ExitedAt = e.ExitedAt
47
+		exitStatus.OOMKilled = e.OOMKilled
48
+		if e.Error != nil {
49
+			c.SetError(e.Error)
50
+		}
51
+	}
52
+
53
+	restart, wait, err := c.RestartManager().ShouldRestart(ec, daemon.IsShuttingDown() || c.HasBeenManuallyStopped, time.Since(c.StartedAt))
54
+	if err == nil && restart {
55
+		c.RestartCount++
56
+		c.SetRestarting(&exitStatus)
57
+	} else {
58
+		c.SetStopped(&exitStatus)
59
+		defer daemon.autoRemove(c)
60
+	}
61
+	defer c.Unlock() // needs to be called before autoRemove
62
+
63
+	// cancel healthcheck here, they will be automatically
64
+	// restarted if/when the container is started again
65
+	daemon.stopHealthchecks(c)
66
+	attributes := map[string]string{
67
+		"exitCode": strconv.Itoa(int(ec)),
68
+	}
69
+	daemon.LogContainerEventWithAttributes(c, "die", attributes)
70
+	daemon.Cleanup(c)
71
+	daemon.setStateCounter(c)
72
+	cpErr := c.CheckpointTo(daemon.containersReplica)
73
+
74
+	if err == nil && restart {
75
+		go func() {
76
+			err := <-wait
77
+			if err == nil {
78
+				// daemon.netController is initialized when daemon is restoring containers.
79
+				// But containerStart will use daemon.netController segment.
80
+				// So to avoid panic at startup process, here must wait util daemon restore done.
81
+				daemon.waitForStartupDone()
82
+				if err = daemon.containerStart(c, "", "", false); err != nil {
83
+					logrus.Debugf("failed to restart container: %+v", err)
84
+				}
85
+			}
86
+			if err != nil {
87
+				c.Lock()
88
+				c.SetStopped(&exitStatus)
89
+				daemon.setStateCounter(c)
90
+				c.CheckpointTo(daemon.containersReplica)
91
+				c.Unlock()
92
+				defer daemon.autoRemove(c)
93
+				if err != restartmanager.ErrRestartCanceled {
94
+					logrus.Errorf("restartmanger wait error: %+v", err)
95
+				}
96
+			}
97
+		}()
98
+	}
99
+
100
+	return cpErr
101
+}
102
+
27 103
 // ProcessEvent is called by libcontainerd whenever an event occurs
28 104
 func (daemon *Daemon) ProcessEvent(id string, e libcontainerdtypes.EventType, ei libcontainerdtypes.EventInfo) error {
29 105
 	c, err := daemon.GetContainer(id)
... ...
@@ -48,72 +124,7 @@ func (daemon *Daemon) ProcessEvent(id string, e libcontainerdtypes.EventType, ei
48 48
 		daemon.LogContainerEvent(c, "oom")
49 49
 	case libcontainerdtypes.EventExit:
50 50
 		if int(ei.Pid) == c.Pid {
51
-			c.Lock()
52
-			_, _, err := daemon.containerd.DeleteTask(context.Background(), c.ID)
53
-			if err != nil {
54
-				logrus.WithError(err).Warnf("failed to delete container %s from containerd", c.ID)
55
-			}
56
-			ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
57
-			c.StreamConfig.Wait(ctx)
58
-			cancel()
59
-			c.Reset(false)
60
-
61
-			exitStatus := container.ExitStatus{
62
-				ExitCode:  int(ei.ExitCode),
63
-				ExitedAt:  ei.ExitedAt,
64
-				OOMKilled: ei.OOMKilled,
65
-			}
66
-			restart, wait, err := c.RestartManager().ShouldRestart(ei.ExitCode, daemon.IsShuttingDown() || c.HasBeenManuallyStopped, time.Since(c.StartedAt))
67
-			if err == nil && restart {
68
-				c.RestartCount++
69
-				c.SetRestarting(&exitStatus)
70
-			} else {
71
-				if ei.Error != nil {
72
-					c.SetError(ei.Error)
73
-				}
74
-				c.SetStopped(&exitStatus)
75
-				defer daemon.autoRemove(c)
76
-			}
77
-			defer c.Unlock() // needs to be called before autoRemove
78
-
79
-			// cancel healthcheck here, they will be automatically
80
-			// restarted if/when the container is started again
81
-			daemon.stopHealthchecks(c)
82
-			attributes := map[string]string{
83
-				"exitCode": strconv.Itoa(int(ei.ExitCode)),
84
-			}
85
-			daemon.LogContainerEventWithAttributes(c, "die", attributes)
86
-			daemon.Cleanup(c)
87
-			daemon.setStateCounter(c)
88
-			cpErr := c.CheckpointTo(daemon.containersReplica)
89
-
90
-			if err == nil && restart {
91
-				go func() {
92
-					err := <-wait
93
-					if err == nil {
94
-						// daemon.netController is initialized when daemon is restoring containers.
95
-						// But containerStart will use daemon.netController segment.
96
-						// So to avoid panic at startup process, here must wait util daemon restore done.
97
-						daemon.waitForStartupDone()
98
-						if err = daemon.containerStart(c, "", "", false); err != nil {
99
-							logrus.Debugf("failed to restart container: %+v", err)
100
-						}
101
-					}
102
-					if err != nil {
103
-						c.Lock()
104
-						c.SetStopped(&exitStatus)
105
-						daemon.setStateCounter(c)
106
-						c.CheckpointTo(daemon.containersReplica)
107
-						c.Unlock()
108
-						defer daemon.autoRemove(c)
109
-						if err != restartmanager.ErrRestartCanceled {
110
-							logrus.Errorf("restartmanger wait error: %+v", err)
111
-						}
112
-					}
113
-				}()
114
-			}
115
-
116
-			return cpErr
51
+			return daemon.handleContainerExit(c, &ei)
117 52
 		}
118 53
 
119 54
 		exitCode := 127