Browse code

Update mount state of live containers after a daemon crash.

Fix unmount issues in the daemon crash and restart lifecycle, w.r.t
graph drivers. This change sets a live container RWLayer's activity
count to 1, so that the RWLayer is aware of the mount. Note that
containerd has experimental support for restore live containers.

Added/updated corresponding tests.

Signed-off-by: Anusha Ragunathan <anusha@docker.com>

Anusha Ragunathan authored on 2016/03/05 07:41:53
Showing 8 changed files
... ...
@@ -293,6 +293,11 @@ func (daemon *Daemon) restore() error {
293 293
 		go func(c *container.Container) {
294 294
 			defer wg.Done()
295 295
 			if c.IsRunning() || c.IsPaused() {
296
+				// Fix activityCount such that graph mounts can be unmounted later
297
+				if err := daemon.layerStore.ReinitRWLayer(c.RWLayer); err != nil {
298
+					logrus.Errorf("Failed to ReinitRWLayer for %s due to %s", c.ID, err)
299
+					return
300
+				}
296 301
 				if err := daemon.containerd.Restore(c.ID, libcontainerd.WithRestartManager(c.RestartManager(true))); err != nil {
297 302
 					logrus.Errorf("Failed to restore with containerd: %q", err)
298 303
 					return
... ...
@@ -121,6 +121,10 @@ func (ls *mockLayerStore) GetMountID(string) (string, error) {
121 121
 	return "", errors.New("not implemented")
122 122
 }
123 123
 
124
+func (ls *mockLayerStore) ReinitRWLayer(layer.RWLayer) error {
125
+	return errors.New("not implemented")
126
+}
127
+
124 128
 func (ls *mockLayerStore) Cleanup() error {
125 129
 	return nil
126 130
 }
... ...
@@ -3,6 +3,8 @@
3 3
 package main
4 4
 
5 5
 import (
6
+	"io/ioutil"
7
+	"os"
6 8
 	"os/exec"
7 9
 	"strings"
8 10
 	"time"
... ...
@@ -56,6 +58,49 @@ func (s *DockerDaemonSuite) TestDaemonRestartWithKilledRunningContainer(t *check
56 56
 
57 57
 }
58 58
 
59
+// os.Kill should kill daemon ungracefully, leaving behind live containers.
60
+// The live containers should be known to the restarted daemon. Stopping
61
+// them now, should remove the mounts.
62
+func (s *DockerDaemonSuite) TestCleanupMountsAfterDaemonCrash(c *check.C) {
63
+	testRequires(c, DaemonIsLinux)
64
+	c.Assert(s.d.StartWithBusybox(), check.IsNil)
65
+
66
+	out, err := s.d.Cmd("run", "-d", "busybox", "top")
67
+	c.Assert(err, check.IsNil, check.Commentf("Output: %s", out))
68
+	id := strings.TrimSpace(out)
69
+
70
+	c.Assert(s.d.cmd.Process.Signal(os.Kill), check.IsNil)
71
+	mountOut, err := ioutil.ReadFile("/proc/self/mountinfo")
72
+	c.Assert(err, check.IsNil, check.Commentf("Output: %s", mountOut))
73
+
74
+	// container mounts should exist even after daemon has crashed.
75
+	comment := check.Commentf("%s should stay mounted from older daemon start:\nDaemon root repository %s\n%s", id, s.d.folder, mountOut)
76
+	c.Assert(strings.Contains(string(mountOut), id), check.Equals, true, comment)
77
+
78
+	// restart daemon.
79
+	if err := s.d.Restart(); err != nil {
80
+		c.Fatal(err)
81
+	}
82
+
83
+	// container should be running.
84
+	out, err = s.d.Cmd("inspect", "--format='{{.State.Running}}'", id)
85
+	c.Assert(err, check.IsNil, check.Commentf("Output: %s", out))
86
+	out = strings.TrimSpace(out)
87
+	if out != "true" {
88
+		c.Fatalf("Container %s expected to stay alive after daemon restart", id)
89
+	}
90
+
91
+	// 'docker stop' should work.
92
+	out, err = s.d.Cmd("stop", id)
93
+	c.Assert(err, check.IsNil, check.Commentf("Output: %s", out))
94
+
95
+	// Now, container mounts should be gone.
96
+	mountOut, err = ioutil.ReadFile("/proc/self/mountinfo")
97
+	c.Assert(err, check.IsNil, check.Commentf("Output: %s", mountOut))
98
+	comment = check.Commentf("%s is still mounted from older daemon start:\nDaemon root repository %s\n%s", id, s.d.folder, mountOut)
99
+	c.Assert(strings.Contains(string(mountOut), id), check.Equals, false, comment)
100
+}
101
+
59 102
 // TestDaemonRestartWithPausedRunningContainer requires live restore of running containers
60 103
 func (s *DockerDaemonSuite) TestDaemonRestartWithPausedRunningContainer(t *check.C) {
61 104
 	if err := s.d.StartWithBusybox(); err != nil {
62 105
new file mode 100644
... ...
@@ -0,0 +1,39 @@
0
+// +build daemon,!windows,!experimental
1
+
2
+package main
3
+
4
+import (
5
+	"io/ioutil"
6
+	"os"
7
+	"strings"
8
+
9
+	"github.com/go-check/check"
10
+)
11
+
12
+// os.Kill should kill daemon ungracefully, leaving behind container mounts.
13
+// A subsequent daemon restart shoud clean up said mounts.
14
+func (s *DockerDaemonSuite) TestCleanupMountsAfterDaemonKill(c *check.C) {
15
+	c.Assert(s.d.StartWithBusybox(), check.IsNil)
16
+
17
+	out, err := s.d.Cmd("run", "-d", "busybox", "top")
18
+	c.Assert(err, check.IsNil, check.Commentf("Output: %s", out))
19
+	id := strings.TrimSpace(out)
20
+	c.Assert(s.d.cmd.Process.Signal(os.Kill), check.IsNil)
21
+	mountOut, err := ioutil.ReadFile("/proc/self/mountinfo")
22
+	c.Assert(err, check.IsNil, check.Commentf("Output: %s", mountOut))
23
+
24
+	// container mounts should exist even after daemon has crashed.
25
+	comment := check.Commentf("%s should stay mounted from older daemon start:\nDaemon root repository %s\n%s", id, s.d.folder, mountOut)
26
+	c.Assert(strings.Contains(string(mountOut), id), check.Equals, true, comment)
27
+
28
+	// restart daemon.
29
+	if err := s.d.Restart(); err != nil {
30
+		c.Fatal(err)
31
+	}
32
+
33
+	// Now, container mounts should be gone.
34
+	mountOut, err = ioutil.ReadFile("/proc/self/mountinfo")
35
+	c.Assert(err, check.IsNil, check.Commentf("Output: %s", mountOut))
36
+	comment = check.Commentf("%s is still mounted from older daemon start:\nDaemon root repository %s\n%s", id, s.d.folder, mountOut)
37
+	c.Assert(strings.Contains(string(mountOut), id), check.Equals, false, comment)
38
+}
... ...
@@ -1501,25 +1501,54 @@ func (s *DockerDaemonSuite) TestDaemonRestartWithSocketAsVolume(c *check.C) {
1501 1501
 	c.Assert(s.d.Restart(), check.IsNil)
1502 1502
 }
1503 1503
 
1504
-func (s *DockerDaemonSuite) TestCleanupMountsAfterCrash(c *check.C) {
1504
+// os.Kill should kill daemon ungracefully, leaving behind container mounts.
1505
+// A subsequent daemon restart shoud clean up said mounts.
1506
+func (s *DockerDaemonSuite) TestCleanupMountsAfterDaemonAndContainerKill(c *check.C) {
1505 1507
 	c.Assert(s.d.StartWithBusybox(), check.IsNil)
1506 1508
 
1507 1509
 	out, err := s.d.Cmd("run", "-d", "busybox", "top")
1508 1510
 	c.Assert(err, check.IsNil, check.Commentf("Output: %s", out))
1509 1511
 	id := strings.TrimSpace(out)
1510
-	c.Assert(s.d.Kill(), check.IsNil)
1512
+	c.Assert(s.d.cmd.Process.Signal(os.Kill), check.IsNil)
1513
+	mountOut, err := ioutil.ReadFile("/proc/self/mountinfo")
1514
+	c.Assert(err, check.IsNil, check.Commentf("Output: %s", mountOut))
1515
+
1516
+	// container mounts should exist even after daemon has crashed.
1517
+	comment := check.Commentf("%s should stay mounted from older daemon start:\nDaemon root repository %s\n%s", id, s.d.folder, mountOut)
1518
+	c.Assert(strings.Contains(string(mountOut), id), check.Equals, true, comment)
1511 1519
 
1512 1520
 	// kill the container
1513 1521
 	runCmd := exec.Command(ctrBinary, "--address", "/var/run/docker/libcontainerd/docker-containerd.sock", "containers", "kill", id)
1514 1522
 	if out, ec, err := runCommandWithOutput(runCmd); err != nil {
1515
-		c.Fatalf("Failed to run ctr, ExitCode: %d, err: '%v' output: '%s' cid: '%s'\n", ec, err, out, id)
1523
+		c.Fatalf("Failed to run ctr, ExitCode: %d, err: %v output: %s id: %s\n", ec, err, out, id)
1524
+	}
1525
+
1526
+	// restart daemon.
1527
+	if err := s.d.Restart(); err != nil {
1528
+		c.Fatal(err)
1516 1529
 	}
1517 1530
 
1518
-	// Give time to containerd to process the command if we don't
1519
-	// the exit event might be received after we do the inspect
1531
+	// Now, container mounts should be gone.
1532
+	mountOut, err = ioutil.ReadFile("/proc/self/mountinfo")
1533
+	c.Assert(err, check.IsNil, check.Commentf("Output: %s", mountOut))
1534
+	comment = check.Commentf("%s is still mounted from older daemon start:\nDaemon root repository %s\n%s", id, s.d.folder, mountOut)
1535
+	c.Assert(strings.Contains(string(mountOut), id), check.Equals, false, comment)
1536
+}
1537
+
1538
+// os.Interrupt should perform a graceful daemon shutdown and hence cleanup mounts.
1539
+func (s *DockerDaemonSuite) TestCleanupMountsAfterGracefulShutdown(c *check.C) {
1540
+	c.Assert(s.d.StartWithBusybox(), check.IsNil)
1541
+
1542
+	out, err := s.d.Cmd("run", "-d", "busybox", "top")
1543
+	c.Assert(err, check.IsNil, check.Commentf("Output: %s", out))
1544
+	id := strings.TrimSpace(out)
1545
+
1546
+	// Send SIGINT and daemon should clean up
1547
+	c.Assert(s.d.cmd.Process.Signal(os.Interrupt), check.IsNil)
1548
+
1549
+	// Wait a bit for the daemon to handle cleanups.
1520 1550
 	time.Sleep(3 * time.Second)
1521 1551
 
1522
-	c.Assert(s.d.Start(), check.IsNil)
1523 1552
 	mountOut, err := ioutil.ReadFile("/proc/self/mountinfo")
1524 1553
 	c.Assert(err, check.IsNil, check.Commentf("Output: %s", mountOut))
1525 1554
 
... ...
@@ -174,6 +174,7 @@ type Store interface {
174 174
 	CreateRWLayer(id string, parent ChainID, mountLabel string, initFunc MountInit) (RWLayer, error)
175 175
 	GetRWLayer(id string) (RWLayer, error)
176 176
 	GetMountID(id string) (string, error)
177
+	ReinitRWLayer(l RWLayer) error
177 178
 	ReleaseRWLayer(RWLayer) ([]Metadata, error)
178 179
 
179 180
 	Cleanup() error
... ...
@@ -487,11 +487,30 @@ func (ls *layerStore) GetMountID(id string) (string, error) {
487 487
 	if !ok {
488 488
 		return "", ErrMountDoesNotExist
489 489
 	}
490
-	logrus.Debugf("GetRWLayer id: %s -> mountID: %s", id, mount.mountID)
490
+	logrus.Debugf("GetMountID id: %s -> mountID: %s", id, mount.mountID)
491 491
 
492 492
 	return mount.mountID, nil
493 493
 }
494 494
 
495
+// ReinitRWLayer reinitializes a given mount to the layerstore, specifically
496
+// initializing the usage count. It should strictly only be used in the
497
+// daemon's restore path to restore state of live containers.
498
+func (ls *layerStore) ReinitRWLayer(l RWLayer) error {
499
+	ls.mountL.Lock()
500
+	defer ls.mountL.Unlock()
501
+
502
+	m, ok := ls.mounts[l.Name()]
503
+	if !ok {
504
+		return ErrMountDoesNotExist
505
+	}
506
+
507
+	if err := m.incActivityCount(l); err != nil {
508
+		return err
509
+	}
510
+
511
+	return nil
512
+}
513
+
495 514
 func (ls *layerStore) ReleaseRWLayer(l RWLayer) ([]Metadata, error) {
496 515
 	ls.mountL.Lock()
497 516
 	defer ls.mountL.Unlock()
... ...
@@ -83,6 +83,18 @@ func (ml *mountedLayer) hasReferences() bool {
83 83
 	return len(ml.references) > 0
84 84
 }
85 85
 
86
+func (ml *mountedLayer) incActivityCount(ref RWLayer) error {
87
+	rl, ok := ml.references[ref]
88
+	if !ok {
89
+		return ErrLayerNotRetained
90
+	}
91
+
92
+	if err := rl.acquire(); err != nil {
93
+		return err
94
+	}
95
+	return nil
96
+}
97
+
86 98
 func (ml *mountedLayer) deleteReference(ref RWLayer) error {
87 99
 	rl, ok := ml.references[ref]
88 100
 	if !ok {
... ...
@@ -111,6 +123,15 @@ type referencedRWLayer struct {
111 111
 	activityCount int
112 112
 }
113 113
 
114
+func (rl *referencedRWLayer) acquire() error {
115
+	rl.activityL.Lock()
116
+	defer rl.activityL.Unlock()
117
+
118
+	rl.activityCount++
119
+
120
+	return nil
121
+}
122
+
114 123
 func (rl *referencedRWLayer) release() error {
115 124
 	rl.activityL.Lock()
116 125
 	defer rl.activityL.Unlock()