Add --live-restore flag
Michael Crosby authored on 2016/06/14 12:57:19... | ... |
@@ -71,6 +71,9 @@ func (cli *DaemonCli) getPlatformRemoteOptions() []libcontainerd.RemoteOption { |
71 | 71 |
args := []string{"--systemd-cgroup=true"} |
72 | 72 |
opts = append(opts, libcontainerd.WithRuntimeArgs(args)) |
73 | 73 |
} |
74 |
+ if cli.Config.LiveRestore { |
|
75 |
+ opts = append(opts, libcontainerd.WithLiveRestore(true)) |
|
76 |
+ } |
|
74 | 77 |
return opts |
75 | 78 |
} |
76 | 79 |
|
... | ... |
@@ -90,6 +90,7 @@ type CommonConfig struct { |
90 | 90 |
TrustKeyPath string `json:"-"` |
91 | 91 |
CorsHeaders string `json:"api-cors-header,omitempty"` |
92 | 92 |
EnableCors bool `json:"api-enable-cors,omitempty"` |
93 |
+ LiveRestore bool `json:"live-restore,omitempty"` |
|
93 | 94 |
|
94 | 95 |
// ClusterStore is the storage backend used for the cluster information. It is used by both |
95 | 96 |
// multihost networking (to store networks and endpoints information) and by the node discovery |
... | ... |
@@ -82,6 +82,7 @@ func (config *Config) InstallFlags(cmd *flag.FlagSet, usageFn func(string) strin |
82 | 82 |
cmd.StringVar(&config.CgroupParent, []string{"-cgroup-parent"}, "", usageFn("Set parent cgroup for all containers")) |
83 | 83 |
cmd.StringVar(&config.RemappedRoot, []string{"-userns-remap"}, "", usageFn("User/Group setting for user namespaces")) |
84 | 84 |
cmd.StringVar(&config.ContainerdAddr, []string{"-containerd"}, "", usageFn("Path to containerd socket")) |
85 |
+ cmd.BoolVar(&config.LiveRestore, []string{"-live-restore"}, false, usageFn("Enable live restore of docker when containers are still running")) |
|
85 | 86 |
|
86 | 87 |
config.attachExperimentalFlags(cmd, usageFn) |
87 | 88 |
} |
... | ... |
@@ -92,6 +92,7 @@ type Daemon struct { |
92 | 92 |
nameIndex *registrar.Registrar |
93 | 93 |
linkIndex *linkIndex |
94 | 94 |
containerd libcontainerd.Client |
95 |
+ containerdRemote libcontainerd.Remote |
|
95 | 96 |
defaultIsolation containertypes.Isolation // Default isolation mode on Windows |
96 | 97 |
} |
97 | 98 |
|
... | ... |
@@ -552,6 +553,7 @@ func NewDaemon(config *Config, registryService registry.Service, containerdRemot |
552 | 552 |
|
553 | 553 |
d.nameIndex = registrar.NewRegistrar() |
554 | 554 |
d.linkIndex = newLinkIndex() |
555 |
+ d.containerdRemote = containerdRemote |
|
555 | 556 |
|
556 | 557 |
go d.execCommandGC() |
557 | 558 |
|
... | ... |
@@ -609,6 +611,11 @@ func (daemon *Daemon) shutdownContainer(c *container.Container) error { |
609 | 609 |
// Shutdown stops the daemon. |
610 | 610 |
func (daemon *Daemon) Shutdown() error { |
611 | 611 |
daemon.shutdown = true |
612 |
+ // Keep mounts and networking running on daemon shutdown if |
|
613 |
+ // we are to keep containers running and restore them. |
|
614 |
+ if daemon.configStore.LiveRestore { |
|
615 |
+ return nil |
|
616 |
+ } |
|
612 | 617 |
if daemon.containers != nil { |
613 | 618 |
logrus.Debug("starting clean shutdown of all containers...") |
614 | 619 |
daemon.containers.ApplyAll(func(c *container.Container) { |
... | ... |
@@ -794,6 +801,7 @@ func (daemon *Daemon) initDiscovery(config *Config) error { |
794 | 794 |
// - Daemon max concurrent downloads |
795 | 795 |
// - Daemon max concurrent uploads |
796 | 796 |
// - Cluster discovery (reconfigure and restart). |
797 |
+// - Daemon live restore |
|
797 | 798 |
func (daemon *Daemon) Reload(config *Config) error { |
798 | 799 |
daemon.configStore.reloadLock.Lock() |
799 | 800 |
defer daemon.configStore.reloadLock.Unlock() |
... | ... |
@@ -808,6 +816,13 @@ func (daemon *Daemon) Reload(config *Config) error { |
808 | 808 |
if config.IsValueSet("debug") { |
809 | 809 |
daemon.configStore.Debug = config.Debug |
810 | 810 |
} |
811 |
+ if config.IsValueSet("live-restore") { |
|
812 |
+ daemon.configStore.LiveRestore = config.LiveRestore |
|
813 |
+ if err := daemon.containerdRemote.UpdateOptions(libcontainerd.WithLiveRestore(config.LiveRestore)); err != nil { |
|
814 |
+ return err |
|
815 |
+ } |
|
816 |
+ |
|
817 |
+ } |
|
811 | 818 |
|
812 | 819 |
// If no value is set for max-concurrent-downloads we assume it is the default value |
813 | 820 |
// We always "reset" as the cost is lightweight and easy to maintain. |
... | ... |
@@ -278,3 +278,16 @@ be viewed using `journalctl -u docker` |
278 | 278 |
May 06 00:22:06 localhost.localdomain docker[2495]: time="2015-05-06T00:22:06Z" level="info" msg="-job acceptconnections() = OK (0)" |
279 | 279 |
|
280 | 280 |
_Note: Using and configuring journal is an advanced topic and is beyond the scope of this article._ |
281 |
+ |
|
282 |
+ |
|
283 |
+### Daemonless Containers |
|
284 |
+ |
|
285 |
+Starting with Docker 1.12, containers can run without Docker or containerd running. This allows the |
|
286 |
+Docker daemon to exit, be upgraded, or recover from a crash without affecting running containers |
|
287 |
+on the system. To enable this functionality you need to add the `--live-restore` flag when |
|
288 |
+launching `dockerd`. This will ensure that Docker does not kill containers on graceful shutdown or |
|
289 |
+on restart, leaving the containers running. |
|
290 |
+ |
|
291 |
+While the Docker daemon is down, logging will still be captured; however, it is capped at the kernel's pipe buffer size. Once the buffer fills up, the process blocks on further writes. |
|
292 |
+Docker will need to be restarted to flush these buffers. |
|
293 |
+You can modify the kernel's buffer size by changing `/proc/sys/fs/pipe-max-size`. |
... | ... |
@@ -63,7 +63,7 @@ func (s *DockerDaemonSuite) TestDaemonRestartWithKilledRunningContainer(t *check |
63 | 63 |
// them now, should remove the mounts. |
64 | 64 |
func (s *DockerDaemonSuite) TestCleanupMountsAfterDaemonCrash(c *check.C) { |
65 | 65 |
testRequires(c, DaemonIsLinux) |
66 |
- c.Assert(s.d.StartWithBusybox(), check.IsNil) |
|
66 |
+ c.Assert(s.d.StartWithBusybox("--live-restore"), check.IsNil) |
|
67 | 67 |
|
68 | 68 |
out, err := s.d.Cmd("run", "-d", "busybox", "top") |
69 | 69 |
c.Assert(err, check.IsNil, check.Commentf("Output: %s", out)) |
... | ... |
@@ -78,7 +78,7 @@ func (s *DockerDaemonSuite) TestCleanupMountsAfterDaemonCrash(c *check.C) { |
78 | 78 |
c.Assert(strings.Contains(string(mountOut), id), check.Equals, true, comment) |
79 | 79 |
|
80 | 80 |
// restart daemon. |
81 |
- if err := s.d.Restart(); err != nil { |
|
81 |
+ if err := s.d.Restart("--live-restore"); err != nil { |
|
82 | 82 |
c.Fatal(err) |
83 | 83 |
} |
84 | 84 |
|
... | ... |
@@ -103,7 +103,7 @@ func (s *DockerDaemonSuite) TestCleanupMountsAfterDaemonCrash(c *check.C) { |
103 | 103 |
|
104 | 104 |
// TestDaemonRestartWithPausedRunningContainer requires live restore of running containers |
105 | 105 |
func (s *DockerDaemonSuite) TestDaemonRestartWithPausedRunningContainer(t *check.C) { |
106 |
- if err := s.d.StartWithBusybox(); err != nil { |
|
106 |
+ if err := s.d.StartWithBusybox("--live-restore"); err != nil { |
|
107 | 107 |
t.Fatal(err) |
108 | 108 |
} |
109 | 109 |
|
... | ... |
@@ -130,7 +130,7 @@ func (s *DockerDaemonSuite) TestDaemonRestartWithPausedRunningContainer(t *check |
130 | 130 |
time.Sleep(3 * time.Second) |
131 | 131 |
|
132 | 132 |
// restart the daemon |
133 |
- if err := s.d.Start(); err != nil { |
|
133 |
+ if err := s.d.Start("--live-restore"); err != nil { |
|
134 | 134 |
t.Fatal(err) |
135 | 135 |
} |
136 | 136 |
|
... | ... |
@@ -148,7 +148,7 @@ func (s *DockerDaemonSuite) TestDaemonRestartWithPausedRunningContainer(t *check |
148 | 148 |
func (s *DockerDaemonSuite) TestDaemonRestartWithUnpausedRunningContainer(t *check.C) { |
149 | 149 |
// TODO(mlaventure): Not sure what would the exit code be on windows |
150 | 150 |
testRequires(t, DaemonIsLinux) |
151 |
- if err := s.d.StartWithBusybox(); err != nil { |
|
151 |
+ if err := s.d.StartWithBusybox("--live-restore"); err != nil { |
|
152 | 152 |
t.Fatal(err) |
153 | 153 |
} |
154 | 154 |
|
... | ... |
@@ -180,7 +180,7 @@ func (s *DockerDaemonSuite) TestDaemonRestartWithUnpausedRunningContainer(t *che |
180 | 180 |
time.Sleep(3 * time.Second) |
181 | 181 |
|
182 | 182 |
// restart the daemon |
183 |
- if err := s.d.Start(); err != nil { |
|
183 |
+ if err := s.d.Start("--live-restore"); err != nil { |
|
184 | 184 |
t.Fatal(err) |
185 | 185 |
} |
186 | 186 |
|
... | ... |
@@ -8,6 +8,7 @@ import ( |
8 | 8 |
"strings" |
9 | 9 |
"sync" |
10 | 10 |
"syscall" |
11 |
+ "time" |
|
11 | 12 |
|
12 | 13 |
"github.com/Sirupsen/logrus" |
13 | 14 |
containerd "github.com/docker/containerd/api/grpc/types" |
... | ... |
@@ -24,6 +25,7 @@ type client struct { |
24 | 24 |
remote *remote |
25 | 25 |
q queue |
26 | 26 |
exitNotifiers map[string]*exitNotifier |
27 |
+ liveRestore bool |
|
27 | 28 |
} |
28 | 29 |
|
29 | 30 |
func (clnt *client) AddProcess(containerID, processFriendlyName string, specp Process) error { |
... | ... |
@@ -456,13 +458,48 @@ func (clnt *client) restore(cont *containerd.Container, options ...CreateOption) |
456 | 456 |
} |
457 | 457 |
|
458 | 458 |
func (clnt *client) Restore(containerID string, options ...CreateOption) error { |
459 |
+ if clnt.liveRestore { |
|
460 |
+ cont, err := clnt.getContainerdContainer(containerID) |
|
461 |
+ if err == nil && cont.Status != "stopped" { |
|
462 |
+ if err := clnt.restore(cont, options...); err != nil { |
|
463 |
+ logrus.Errorf("error restoring %s: %v", containerID, err) |
|
464 |
+ } |
|
465 |
+ return nil |
|
466 |
+ } |
|
467 |
+ return clnt.setExited(containerID) |
|
468 |
+ } |
|
469 |
+ |
|
459 | 470 |
cont, err := clnt.getContainerdContainer(containerID) |
460 | 471 |
if err == nil && cont.Status != "stopped" { |
461 |
- if err := clnt.restore(cont, options...); err != nil { |
|
462 |
- logrus.Errorf("error restoring %s: %v", containerID, err) |
|
472 |
+ w := clnt.getOrCreateExitNotifier(containerID) |
|
473 |
+ clnt.lock(cont.Id) |
|
474 |
+ container := clnt.newContainer(cont.BundlePath) |
|
475 |
+ container.systemPid = systemPid(cont) |
|
476 |
+ clnt.appendContainer(container) |
|
477 |
+ clnt.unlock(cont.Id) |
|
478 |
+ |
|
479 |
+ container.discardFifos() |
|
480 |
+ |
|
481 |
+ if err := clnt.Signal(containerID, int(syscall.SIGTERM)); err != nil { |
|
482 |
+ logrus.Errorf("error sending sigterm to %v: %v", containerID, err) |
|
483 |
+ } |
|
484 |
+ select { |
|
485 |
+ case <-time.After(10 * time.Second): |
|
486 |
+ if err := clnt.Signal(containerID, int(syscall.SIGKILL)); err != nil { |
|
487 |
+ logrus.Errorf("error sending sigkill to %v: %v", containerID, err) |
|
488 |
+ } |
|
489 |
+ select { |
|
490 |
+ case <-time.After(2 * time.Second): |
|
491 |
+ case <-w.wait(): |
|
492 |
+ return nil |
|
493 |
+ } |
|
494 |
+ case <-w.wait(): |
|
495 |
+ return nil |
|
463 | 496 |
} |
464 |
- return nil |
|
465 | 497 |
} |
498 |
+ |
|
499 |
+ clnt.deleteContainer(containerID) |
|
500 |
+ |
|
466 | 501 |
return clnt.setExited(containerID) |
467 | 502 |
} |
468 | 503 |
|
... | ... |
@@ -2,6 +2,7 @@ package libcontainerd |
2 | 2 |
|
3 | 3 |
import ( |
4 | 4 |
"encoding/json" |
5 |
+ "io" |
|
5 | 6 |
"io/ioutil" |
6 | 7 |
"os" |
7 | 8 |
"path/filepath" |
... | ... |
@@ -194,3 +195,18 @@ func (ctr *container) handleEvent(e *containerd.Event) error { |
194 | 194 |
} |
195 | 195 |
return nil |
196 | 196 |
} |
197 |
+ |
|
198 |
+// discardFifos attempts to fully read the container fifos to unblock processes |
|
199 |
+// that may be blocked on the writer side. |
|
200 |
+func (ctr *container) discardFifos() { |
|
201 |
+ for _, i := range []int{syscall.Stdout, syscall.Stderr} { |
|
202 |
+ f := ctr.fifo(i) |
|
203 |
+ c := make(chan struct{}) |
|
204 |
+ go func() { |
|
205 |
+ close(c) // this channel is used to not close the writer too early, before readonly open has been called. |
|
206 |
+ io.Copy(ioutil.Discard, openReaderFromFifo(f)) |
|
207 |
+ }() |
|
208 |
+ <-c |
|
209 |
+ closeReaderFifo(f) // avoid blocking permanently on open if there is no writer side |
|
210 |
+ } |
|
211 |
+} |
... | ... |
@@ -9,6 +9,8 @@ type Remote interface { |
9 | 9 |
// Cleanup stops containerd if it was started by libcontainerd. |
10 | 10 |
// Note this is not used on Windows as there is no remote containerd. |
11 | 11 |
Cleanup() |
12 |
+ // UpdateOptions allows various remote options to be updated at runtime. |
|
13 |
+ UpdateOptions(...RemoteOption) error |
|
12 | 14 |
} |
13 | 15 |
|
14 | 16 |
// RemoteOption allows to configure parameters of remotes. |
... | ... |
@@ -52,6 +52,7 @@ type remote struct { |
52 | 52 |
pastEvents map[string]*containerd.Event |
53 | 53 |
runtimeArgs []string |
54 | 54 |
daemonWaitCh chan struct{} |
55 |
+ liveRestore bool |
|
55 | 56 |
} |
56 | 57 |
|
57 | 58 |
// New creates a fresh instance of libcontainerd remote. |
... | ... |
@@ -111,6 +112,15 @@ func New(stateDir string, options ...RemoteOption) (_ Remote, err error) { |
111 | 111 |
return r, nil |
112 | 112 |
} |
113 | 113 |
|
114 |
+func (r *remote) UpdateOptions(options ...RemoteOption) error { |
|
115 |
+ for _, option := range options { |
|
116 |
+ if err := option.Apply(r); err != nil { |
|
117 |
+ return err |
|
118 |
+ } |
|
119 |
+ } |
|
120 |
+ return nil |
|
121 |
+} |
|
122 |
+ |
|
114 | 123 |
func (r *remote) handleConnectionChange() { |
115 | 124 |
var transientFailureCount = 0 |
116 | 125 |
state := grpc.Idle |
... | ... |
@@ -184,6 +194,7 @@ func (r *remote) Client(b Backend) (Client, error) { |
184 | 184 |
}, |
185 | 185 |
remote: r, |
186 | 186 |
exitNotifiers: make(map[string]*exitNotifier), |
187 |
+ liveRestore: r.liveRestore, |
|
187 | 188 |
} |
188 | 189 |
|
189 | 190 |
r.Lock() |
... | ... |
@@ -461,3 +472,21 @@ func (d debugLog) Apply(r Remote) error { |
461 | 461 |
} |
462 | 462 |
return fmt.Errorf("WithDebugLog option not supported for this remote") |
463 | 463 |
} |
464 |
+ |
|
465 |
+// WithLiveRestore defines if containers are stopped on shutdown or restored. |
|
466 |
+func WithLiveRestore(v bool) RemoteOption { |
|
467 |
+ return liveRestore(v) |
|
468 |
+} |
|
469 |
+ |
|
470 |
+type liveRestore bool |
|
471 |
+ |
|
472 |
+func (l liveRestore) Apply(r Remote) error { |
|
473 |
+ if remote, ok := r.(*remote); ok { |
|
474 |
+ remote.liveRestore = bool(l) |
|
475 |
+ for _, c := range remote.clients { |
|
476 |
+ c.liveRestore = bool(l) |
|
477 |
+ } |
|
478 |
+ return nil |
|
479 |
+ } |
|
480 |
+ return fmt.Errorf("WithLiveRestore option not supported for this remote") |
|
481 |
+} |
... | ... |
@@ -19,7 +19,16 @@ func (r *remote) Client(b Backend) (Client, error) { |
19 | 19 |
func (r *remote) Cleanup() { |
20 | 20 |
} |
21 | 21 |
|
22 |
+func (r *remote) UpdateOptions(opts ...RemoteOption) error { |
|
23 |
+ return nil |
|
24 |
+} |
|
25 |
+ |
|
22 | 26 |
// New creates a fresh instance of libcontainerd remote. |
23 | 27 |
func New(_ string, _ ...RemoteOption) (Remote, error) { |
24 | 28 |
return &remote{}, nil |
25 | 29 |
} |
30 |
+ |
|
31 |
+// WithLiveRestore is a noop on solaris. |
|
32 |
+func WithLiveRestore(v bool) RemoteOption { |
|
33 |
+ return nil |
|
34 |
+} |
... | ... |
@@ -20,8 +20,17 @@ func (r *remote) Client(b Backend) (Client, error) { |
20 | 20 |
func (r *remote) Cleanup() { |
21 | 21 |
} |
22 | 22 |
|
23 |
+func (r *remote) UpdateOptions(opts ...RemoteOption) error { |
|
24 |
+ return nil |
|
25 |
+} |
|
26 |
+ |
|
23 | 27 |
// New creates a fresh instance of libcontainerd remote. On Windows, |
24 | 28 |
// this is not used as there is no remote containerd process. |
25 | 29 |
func New(_ string, _ ...RemoteOption) (Remote, error) { |
26 | 30 |
return &remote{}, nil |
27 | 31 |
} |
32 |
+ |
|
33 |
+// WithLiveRestore is a noop on windows. |
|
34 |
+func WithLiveRestore(v bool) RemoteOption { |
|
35 |
+ return nil |
|
36 |
+} |
... | ... |
@@ -42,6 +42,7 @@ dockerd - Enable daemon mode |
42 | 42 |
[**--isolation**[=*default*]] |
43 | 43 |
[**-l**|**--log-level**[=*info*]] |
44 | 44 |
[**--label**[=*[]*]] |
45 |
+[**--live-restore**[=*false*]] |
|
45 | 46 |
[**--log-driver**[=*json-file*]] |
46 | 47 |
[**--log-opt**[=*map[]*]] |
47 | 48 |
[**--mtu**[=*0*]] |
... | ... |
@@ -195,6 +196,9 @@ is `hyperv`. Linux only supports `default`. |
195 | 195 |
**--label**="[]" |
196 | 196 |
Set key=value labels to the daemon (displayed in `docker info`) |
197 | 197 |
|
198 |
+**--live-restore**=*false* |
|
199 |
+ Enable live restore of running containers when the daemon starts so that they are not restarted. |
|
200 |
+ |
|
198 | 201 |
**--log-driver**="*json-file*|*syslog*|*journald*|*gelf*|*fluentd*|*awslogs*|*splunk*|*etwlogs*|*gcplogs*|*none*" |
199 | 202 |
Default driver for container logs. Default is `json-file`. |
200 | 203 |
**Warning**: `docker logs` command works only for `json-file` logging driver. |