Browse code

Fix critical bug: can't restart a restarting container

When user try to restart a restarting container, docker client report
error: "container is already active", and container will be stopped
instead be restarted which is seriously wrong.

What's more critical is that when user try to start this container
again, it will always fail.

This error can also be reproduced with a `docker stop`+`docker start`.

And this commit will fix the bug.

Signed-off-by: Zhang Wei <zhangwei555@huawei.com>
(cherry picked from commit a705e166cf3bcca62543150c2b3f9bfeae45ecfa)

Zhang Wei authored on 2016/04/07 23:05:29
Showing 5 changed files
... ...
@@ -3,6 +3,7 @@ package daemon
3 3
 import (
4 4
 	"fmt"
5 5
 	"runtime"
6
+	"strings"
6 7
 	"syscall"
7 8
 	"time"
8 9
 
... ...
@@ -81,7 +82,14 @@ func (daemon *Daemon) killWithSignal(container *container.Container, sig int) er
81 81
 	}
82 82
 
83 83
 	if err := daemon.kill(container, sig); err != nil {
84
-		return fmt.Errorf("Cannot kill container %s: %s", container.ID, err)
84
+		err = fmt.Errorf("Cannot kill container %s: %s", container.ID, err)
85
+		// if container or process not exists, ignore the error
86
+		if strings.Contains(err.Error(), "container not found") ||
87
+			strings.Contains(err.Error(), "no such process") {
88
+			logrus.Warnf("%s", err.Error())
89
+		} else {
90
+			return err
91
+		}
85 92
 	}
86 93
 
87 94
 	attributes := map[string]string{
... ...
@@ -218,3 +218,32 @@ func (s *DockerSuite) TestRestartPolicyAfterRestart(c *check.C) {
218 218
 	err = waitInspect(id, "{{.State.Status}}", "running", 30*time.Second)
219 219
 	c.Assert(err, check.IsNil)
220 220
 }
221
+
222
+func (s *DockerSuite) TestRestartContainerwithRestartPolicy(c *check.C) {
223
+	out1, _ := dockerCmd(c, "run", "-d", "--restart=on-failure:3", "busybox", "false")
224
+	out2, _ := dockerCmd(c, "run", "-d", "--restart=always", "busybox", "false")
225
+
226
+	id1 := strings.TrimSpace(string(out1))
227
+	id2 := strings.TrimSpace(string(out2))
228
+	err := waitInspect(id1, "{{ .State.Restarting }} {{ .State.Running }}", "false false", 30*time.Second)
229
+	c.Assert(err, checker.IsNil)
230
+
231
+	// TODO: fix racey problem during restart:
232
+	// https://jenkins.dockerproject.org/job/Docker-PRs-Win2Lin/24665/console
233
+	// Error response from daemon: Cannot restart container 6655f620d90b390527db23c0a15b3e46d86a58ecec20a5697ab228d860174251: remove /var/run/docker/libcontainerd/6655f620d90b390527db23c0a15b3e46d86a58ecec20a5697ab228d860174251/rootfs: device or resource busy
234
+	if _, _, err := dockerCmdWithError("restart", id1); err != nil {
235
+		// if restart met racey problem, try again
236
+		time.Sleep(500 * time.Millisecond)
237
+		dockerCmd(c, "restart", id1)
238
+	}
239
+	if _, _, err := dockerCmdWithError("restart", id2); err != nil {
240
+		// if restart met racey problem, try again
241
+		time.Sleep(500 * time.Millisecond)
242
+		dockerCmd(c, "restart", id2)
243
+	}
244
+
245
+	dockerCmd(c, "stop", id1)
246
+	dockerCmd(c, "stop", id2)
247
+	dockerCmd(c, "start", id1)
248
+	dockerCmd(c, "start", id2)
249
+}
... ...
@@ -134,7 +134,7 @@ func (clnt *client) Create(containerID string, spec Spec, options ...CreateOptio
134 134
 	defer clnt.unlock(containerID)
135 135
 
136 136
 	if ctr, err := clnt.getContainer(containerID); err == nil {
137
-		if ctr.restarting { // docker doesn't actually call start if restart is on atm, but probably should in the future
137
+		if ctr.restarting {
138 138
 			ctr.restartManager.Cancel()
139 139
 			ctr.clean()
140 140
 		} else {
... ...
@@ -121,6 +121,7 @@ func (ctr *container) handleEvent(e *containerd.Event) error {
121 121
 			} else if restart {
122 122
 				st.State = StateRestart
123 123
 				ctr.restarting = true
124
+				ctr.client.deleteContainer(e.Id)
124 125
 				go func() {
125 126
 					err := <-wait
126 127
 					ctr.restarting = false
... ...
@@ -51,7 +51,7 @@ func (rm *restartManager) ShouldRestart(exitCode uint32) (bool, chan error, erro
51 51
 	}()
52 52
 
53 53
 	if rm.canceled {
54
-		return false, nil, nil
54
+		return false, nil, fmt.Errorf("restartmanager canceled")
55 55
 	}
56 56
 
57 57
 	if rm.active {