Fix unmount issues in the daemon crash and restart lifecycle, w.r.t
graph drivers. This change sets a live container RWLayer's activity
count to 1, so that the RWLayer is aware of the mount. Note that
containerd has experimental support for restore live containers.
Added/updated corresponding tests.
Signed-off-by: Anusha Ragunathan <anusha@docker.com>
| ... | ... |
@@ -293,6 +293,11 @@ func (daemon *Daemon) restore() error {
|
| 293 | 293 |
go func(c *container.Container) {
|
| 294 | 294 |
defer wg.Done() |
| 295 | 295 |
if c.IsRunning() || c.IsPaused() {
|
| 296 |
+ // Fix activityCount such that graph mounts can be unmounted later |
|
| 297 |
+ if err := daemon.layerStore.ReinitRWLayer(c.RWLayer); err != nil {
|
|
| 298 |
+ logrus.Errorf("Failed to ReinitRWLayer for %s due to %s", c.ID, err)
|
|
| 299 |
+ return |
|
| 300 |
+ } |
|
| 296 | 301 |
if err := daemon.containerd.Restore(c.ID, libcontainerd.WithRestartManager(c.RestartManager(true))); err != nil {
|
| 297 | 302 |
logrus.Errorf("Failed to restore with containerd: %q", err)
|
| 298 | 303 |
return |
| ... | ... |
@@ -121,6 +121,10 @@ func (ls *mockLayerStore) GetMountID(string) (string, error) {
|
| 121 | 121 |
return "", errors.New("not implemented")
|
| 122 | 122 |
} |
| 123 | 123 |
|
| 124 |
+func (ls *mockLayerStore) ReinitRWLayer(layer.RWLayer) error {
|
|
| 125 |
+ return errors.New("not implemented")
|
|
| 126 |
+} |
|
| 127 |
+ |
|
| 124 | 128 |
func (ls *mockLayerStore) Cleanup() error {
|
| 125 | 129 |
return nil |
| 126 | 130 |
} |
| ... | ... |
@@ -3,6 +3,8 @@ |
| 3 | 3 |
package main |
| 4 | 4 |
|
| 5 | 5 |
import ( |
| 6 |
+ "io/ioutil" |
|
| 7 |
+ "os" |
|
| 6 | 8 |
"os/exec" |
| 7 | 9 |
"strings" |
| 8 | 10 |
"time" |
| ... | ... |
@@ -56,6 +58,49 @@ func (s *DockerDaemonSuite) TestDaemonRestartWithKilledRunningContainer(t *check |
| 56 | 56 |
|
| 57 | 57 |
} |
| 58 | 58 |
|
| 59 |
+// os.Kill should kill daemon ungracefully, leaving behind live containers. |
|
| 60 |
+// The live containers should be known to the restarted daemon. Stopping |
|
| 61 |
+// them now, should remove the mounts. |
|
| 62 |
+func (s *DockerDaemonSuite) TestCleanupMountsAfterDaemonCrash(c *check.C) {
|
|
| 63 |
+ testRequires(c, DaemonIsLinux) |
|
| 64 |
+ c.Assert(s.d.StartWithBusybox(), check.IsNil) |
|
| 65 |
+ |
|
| 66 |
+ out, err := s.d.Cmd("run", "-d", "busybox", "top")
|
|
| 67 |
+ c.Assert(err, check.IsNil, check.Commentf("Output: %s", out))
|
|
| 68 |
+ id := strings.TrimSpace(out) |
|
| 69 |
+ |
|
| 70 |
+ c.Assert(s.d.cmd.Process.Signal(os.Kill), check.IsNil) |
|
| 71 |
+ mountOut, err := ioutil.ReadFile("/proc/self/mountinfo")
|
|
| 72 |
+ c.Assert(err, check.IsNil, check.Commentf("Output: %s", mountOut))
|
|
| 73 |
+ |
|
| 74 |
+ // container mounts should exist even after daemon has crashed. |
|
| 75 |
+ comment := check.Commentf("%s should stay mounted from older daemon start:\nDaemon root repository %s\n%s", id, s.d.folder, mountOut)
|
|
| 76 |
+ c.Assert(strings.Contains(string(mountOut), id), check.Equals, true, comment) |
|
| 77 |
+ |
|
| 78 |
+ // restart daemon. |
|
| 79 |
+ if err := s.d.Restart(); err != nil {
|
|
| 80 |
+ c.Fatal(err) |
|
| 81 |
+ } |
|
| 82 |
+ |
|
| 83 |
+ // container should be running. |
|
| 84 |
+ out, err = s.d.Cmd("inspect", "--format='{{.State.Running}}'", id)
|
|
| 85 |
+ c.Assert(err, check.IsNil, check.Commentf("Output: %s", out))
|
|
| 86 |
+ out = strings.TrimSpace(out) |
|
| 87 |
+ if out != "true" {
|
|
| 88 |
+ c.Fatalf("Container %s expected to stay alive after daemon restart", id)
|
|
| 89 |
+ } |
|
| 90 |
+ |
|
| 91 |
+ // 'docker stop' should work. |
|
| 92 |
+ out, err = s.d.Cmd("stop", id)
|
|
| 93 |
+ c.Assert(err, check.IsNil, check.Commentf("Output: %s", out))
|
|
| 94 |
+ |
|
| 95 |
+ // Now, container mounts should be gone. |
|
| 96 |
+ mountOut, err = ioutil.ReadFile("/proc/self/mountinfo")
|
|
| 97 |
+ c.Assert(err, check.IsNil, check.Commentf("Output: %s", mountOut))
|
|
| 98 |
+ comment = check.Commentf("%s is still mounted from older daemon start:\nDaemon root repository %s\n%s", id, s.d.folder, mountOut)
|
|
| 99 |
+ c.Assert(strings.Contains(string(mountOut), id), check.Equals, false, comment) |
|
| 100 |
+} |
|
| 101 |
+ |
|
| 59 | 102 |
// TestDaemonRestartWithPausedRunningContainer requires live restore of running containers |
| 60 | 103 |
func (s *DockerDaemonSuite) TestDaemonRestartWithPausedRunningContainer(t *check.C) {
|
| 61 | 104 |
if err := s.d.StartWithBusybox(); err != nil {
|
| 62 | 105 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,39 @@ |
| 0 |
+// +build daemon,!windows,!experimental |
|
| 1 |
+ |
|
| 2 |
+package main |
|
| 3 |
+ |
|
| 4 |
+import ( |
|
| 5 |
+ "io/ioutil" |
|
| 6 |
+ "os" |
|
| 7 |
+ "strings" |
|
| 8 |
+ |
|
| 9 |
+ "github.com/go-check/check" |
|
| 10 |
+) |
|
| 11 |
+ |
|
| 12 |
+// os.Kill should kill daemon ungracefully, leaving behind container mounts. |
|
| 13 |
+// A subsequent daemon restart shoud clean up said mounts. |
|
| 14 |
+func (s *DockerDaemonSuite) TestCleanupMountsAfterDaemonKill(c *check.C) {
|
|
| 15 |
+ c.Assert(s.d.StartWithBusybox(), check.IsNil) |
|
| 16 |
+ |
|
| 17 |
+ out, err := s.d.Cmd("run", "-d", "busybox", "top")
|
|
| 18 |
+ c.Assert(err, check.IsNil, check.Commentf("Output: %s", out))
|
|
| 19 |
+ id := strings.TrimSpace(out) |
|
| 20 |
+ c.Assert(s.d.cmd.Process.Signal(os.Kill), check.IsNil) |
|
| 21 |
+ mountOut, err := ioutil.ReadFile("/proc/self/mountinfo")
|
|
| 22 |
+ c.Assert(err, check.IsNil, check.Commentf("Output: %s", mountOut))
|
|
| 23 |
+ |
|
| 24 |
+ // container mounts should exist even after daemon has crashed. |
|
| 25 |
+ comment := check.Commentf("%s should stay mounted from older daemon start:\nDaemon root repository %s\n%s", id, s.d.folder, mountOut)
|
|
| 26 |
+ c.Assert(strings.Contains(string(mountOut), id), check.Equals, true, comment) |
|
| 27 |
+ |
|
| 28 |
+ // restart daemon. |
|
| 29 |
+ if err := s.d.Restart(); err != nil {
|
|
| 30 |
+ c.Fatal(err) |
|
| 31 |
+ } |
|
| 32 |
+ |
|
| 33 |
+ // Now, container mounts should be gone. |
|
| 34 |
+ mountOut, err = ioutil.ReadFile("/proc/self/mountinfo")
|
|
| 35 |
+ c.Assert(err, check.IsNil, check.Commentf("Output: %s", mountOut))
|
|
| 36 |
+ comment = check.Commentf("%s is still mounted from older daemon start:\nDaemon root repository %s\n%s", id, s.d.folder, mountOut)
|
|
| 37 |
+ c.Assert(strings.Contains(string(mountOut), id), check.Equals, false, comment) |
|
| 38 |
+} |
| ... | ... |
@@ -1501,25 +1501,54 @@ func (s *DockerDaemonSuite) TestDaemonRestartWithSocketAsVolume(c *check.C) {
|
| 1501 | 1501 |
c.Assert(s.d.Restart(), check.IsNil) |
| 1502 | 1502 |
} |
| 1503 | 1503 |
|
| 1504 |
-func (s *DockerDaemonSuite) TestCleanupMountsAfterCrash(c *check.C) {
|
|
| 1504 |
+// os.Kill should kill daemon ungracefully, leaving behind container mounts. |
|
| 1505 |
+// A subsequent daemon restart shoud clean up said mounts. |
|
| 1506 |
+func (s *DockerDaemonSuite) TestCleanupMountsAfterDaemonAndContainerKill(c *check.C) {
|
|
| 1505 | 1507 |
c.Assert(s.d.StartWithBusybox(), check.IsNil) |
| 1506 | 1508 |
|
| 1507 | 1509 |
out, err := s.d.Cmd("run", "-d", "busybox", "top")
|
| 1508 | 1510 |
c.Assert(err, check.IsNil, check.Commentf("Output: %s", out))
|
| 1509 | 1511 |
id := strings.TrimSpace(out) |
| 1510 |
- c.Assert(s.d.Kill(), check.IsNil) |
|
| 1512 |
+ c.Assert(s.d.cmd.Process.Signal(os.Kill), check.IsNil) |
|
| 1513 |
+ mountOut, err := ioutil.ReadFile("/proc/self/mountinfo")
|
|
| 1514 |
+ c.Assert(err, check.IsNil, check.Commentf("Output: %s", mountOut))
|
|
| 1515 |
+ |
|
| 1516 |
+ // container mounts should exist even after daemon has crashed. |
|
| 1517 |
+ comment := check.Commentf("%s should stay mounted from older daemon start:\nDaemon root repository %s\n%s", id, s.d.folder, mountOut)
|
|
| 1518 |
+ c.Assert(strings.Contains(string(mountOut), id), check.Equals, true, comment) |
|
| 1511 | 1519 |
|
| 1512 | 1520 |
// kill the container |
| 1513 | 1521 |
runCmd := exec.Command(ctrBinary, "--address", "/var/run/docker/libcontainerd/docker-containerd.sock", "containers", "kill", id) |
| 1514 | 1522 |
if out, ec, err := runCommandWithOutput(runCmd); err != nil {
|
| 1515 |
- c.Fatalf("Failed to run ctr, ExitCode: %d, err: '%v' output: '%s' cid: '%s'\n", ec, err, out, id)
|
|
| 1523 |
+ c.Fatalf("Failed to run ctr, ExitCode: %d, err: %v output: %s id: %s\n", ec, err, out, id)
|
|
| 1524 |
+ } |
|
| 1525 |
+ |
|
| 1526 |
+ // restart daemon. |
|
| 1527 |
+ if err := s.d.Restart(); err != nil {
|
|
| 1528 |
+ c.Fatal(err) |
|
| 1516 | 1529 |
} |
| 1517 | 1530 |
|
| 1518 |
- // Give time to containerd to process the command if we don't |
|
| 1519 |
- // the exit event might be received after we do the inspect |
|
| 1531 |
+ // Now, container mounts should be gone. |
|
| 1532 |
+ mountOut, err = ioutil.ReadFile("/proc/self/mountinfo")
|
|
| 1533 |
+ c.Assert(err, check.IsNil, check.Commentf("Output: %s", mountOut))
|
|
| 1534 |
+ comment = check.Commentf("%s is still mounted from older daemon start:\nDaemon root repository %s\n%s", id, s.d.folder, mountOut)
|
|
| 1535 |
+ c.Assert(strings.Contains(string(mountOut), id), check.Equals, false, comment) |
|
| 1536 |
+} |
|
| 1537 |
+ |
|
| 1538 |
+// os.Interrupt should perform a graceful daemon shutdown and hence cleanup mounts. |
|
| 1539 |
+func (s *DockerDaemonSuite) TestCleanupMountsAfterGracefulShutdown(c *check.C) {
|
|
| 1540 |
+ c.Assert(s.d.StartWithBusybox(), check.IsNil) |
|
| 1541 |
+ |
|
| 1542 |
+ out, err := s.d.Cmd("run", "-d", "busybox", "top")
|
|
| 1543 |
+ c.Assert(err, check.IsNil, check.Commentf("Output: %s", out))
|
|
| 1544 |
+ id := strings.TrimSpace(out) |
|
| 1545 |
+ |
|
| 1546 |
+ // Send SIGINT and daemon should clean up |
|
| 1547 |
+ c.Assert(s.d.cmd.Process.Signal(os.Interrupt), check.IsNil) |
|
| 1548 |
+ |
|
| 1549 |
+ // Wait a bit for the daemon to handle cleanups. |
|
| 1520 | 1550 |
time.Sleep(3 * time.Second) |
| 1521 | 1551 |
|
| 1522 |
- c.Assert(s.d.Start(), check.IsNil) |
|
| 1523 | 1552 |
mountOut, err := ioutil.ReadFile("/proc/self/mountinfo")
|
| 1524 | 1553 |
c.Assert(err, check.IsNil, check.Commentf("Output: %s", mountOut))
|
| 1525 | 1554 |
|
| ... | ... |
@@ -174,6 +174,7 @@ type Store interface {
|
| 174 | 174 |
CreateRWLayer(id string, parent ChainID, mountLabel string, initFunc MountInit) (RWLayer, error) |
| 175 | 175 |
GetRWLayer(id string) (RWLayer, error) |
| 176 | 176 |
GetMountID(id string) (string, error) |
| 177 |
+ ReinitRWLayer(l RWLayer) error |
|
| 177 | 178 |
ReleaseRWLayer(RWLayer) ([]Metadata, error) |
| 178 | 179 |
|
| 179 | 180 |
Cleanup() error |
| ... | ... |
@@ -487,11 +487,30 @@ func (ls *layerStore) GetMountID(id string) (string, error) {
|
| 487 | 487 |
if !ok {
|
| 488 | 488 |
return "", ErrMountDoesNotExist |
| 489 | 489 |
} |
| 490 |
- logrus.Debugf("GetRWLayer id: %s -> mountID: %s", id, mount.mountID)
|
|
| 490 |
+ logrus.Debugf("GetMountID id: %s -> mountID: %s", id, mount.mountID)
|
|
| 491 | 491 |
|
| 492 | 492 |
return mount.mountID, nil |
| 493 | 493 |
} |
| 494 | 494 |
|
| 495 |
+// ReinitRWLayer reinitializes a given mount to the layerstore, specifically |
|
| 496 |
+// initializing the usage count. It should strictly only be used in the |
|
| 497 |
+// daemon's restore path to restore state of live containers. |
|
| 498 |
+func (ls *layerStore) ReinitRWLayer(l RWLayer) error {
|
|
| 499 |
+ ls.mountL.Lock() |
|
| 500 |
+ defer ls.mountL.Unlock() |
|
| 501 |
+ |
|
| 502 |
+ m, ok := ls.mounts[l.Name()] |
|
| 503 |
+ if !ok {
|
|
| 504 |
+ return ErrMountDoesNotExist |
|
| 505 |
+ } |
|
| 506 |
+ |
|
| 507 |
+ if err := m.incActivityCount(l); err != nil {
|
|
| 508 |
+ return err |
|
| 509 |
+ } |
|
| 510 |
+ |
|
| 511 |
+ return nil |
|
| 512 |
+} |
|
| 513 |
+ |
|
| 495 | 514 |
func (ls *layerStore) ReleaseRWLayer(l RWLayer) ([]Metadata, error) {
|
| 496 | 515 |
ls.mountL.Lock() |
| 497 | 516 |
defer ls.mountL.Unlock() |
| ... | ... |
@@ -83,6 +83,18 @@ func (ml *mountedLayer) hasReferences() bool {
|
| 83 | 83 |
return len(ml.references) > 0 |
| 84 | 84 |
} |
| 85 | 85 |
|
| 86 |
+func (ml *mountedLayer) incActivityCount(ref RWLayer) error {
|
|
| 87 |
+ rl, ok := ml.references[ref] |
|
| 88 |
+ if !ok {
|
|
| 89 |
+ return ErrLayerNotRetained |
|
| 90 |
+ } |
|
| 91 |
+ |
|
| 92 |
+ if err := rl.acquire(); err != nil {
|
|
| 93 |
+ return err |
|
| 94 |
+ } |
|
| 95 |
+ return nil |
|
| 96 |
+} |
|
| 97 |
+ |
|
| 86 | 98 |
func (ml *mountedLayer) deleteReference(ref RWLayer) error {
|
| 87 | 99 |
rl, ok := ml.references[ref] |
| 88 | 100 |
if !ok {
|
| ... | ... |
@@ -111,6 +123,15 @@ type referencedRWLayer struct {
|
| 111 | 111 |
activityCount int |
| 112 | 112 |
} |
| 113 | 113 |
|
| 114 |
+func (rl *referencedRWLayer) acquire() error {
|
|
| 115 |
+ rl.activityL.Lock() |
|
| 116 |
+ defer rl.activityL.Unlock() |
|
| 117 |
+ |
|
| 118 |
+ rl.activityCount++ |
|
| 119 |
+ |
|
| 120 |
+ return nil |
|
| 121 |
+} |
|
| 122 |
+ |
|
| 114 | 123 |
func (rl *referencedRWLayer) release() error {
|
| 115 | 124 |
rl.activityL.Lock() |
| 116 | 125 |
defer rl.activityL.Unlock() |