Browse code

Initial implementation of containerd Checkpoint API.

Signed-off-by: boucher <rboucher@gmail.com>

boucher authored on 2016/05/12 23:52:00
Showing 43 changed files
... ...
@@ -57,12 +57,17 @@ RUN apt-get update && apt-get install -y \
57 57
 	libapparmor-dev \
58 58
 	libcap-dev \
59 59
 	libltdl-dev \
60
+	libnl-3-dev \
61
+	libprotobuf-c0-dev \
62
+	libprotobuf-dev	\
60 63
 	libsqlite3-dev \
61 64
 	libsystemd-journal-dev \
62 65
 	libtool \
63 66
 	mercurial \
64 67
 	net-tools \
65 68
 	pkg-config \
69
+	protobuf-compiler \
70
+	protobuf-c-compiler \
66 71
 	python-dev \
67 72
 	python-mock \
68 73
 	python-pip \
... ...
@@ -145,6 +150,14 @@ RUN git clone https://github.com/golang/lint.git /go/src/github.com/golang/lint
145 145
 	&& (cd /go/src/github.com/golang/lint && git checkout -q $GO_LINT_COMMIT) \
146 146
 	&& go install -v github.com/golang/lint/golint
147 147
 
148
+# Install CRIU for checkpoint/restore support
149
+ENV CRIU_VERSION 2.2
150
+RUN mkdir -p /usr/src/criu \
151
+	&& curl -sSL https://github.com/xemul/criu/archive/v${CRIU_VERSION}.tar.gz | tar -v -C /usr/src/criu/ -xz --strip-components=1 \
152
+	&& cd /usr/src/criu \
153
+	&& make \
154
+	&& make install-criu
155
+
148 156
 # Install two versions of the registry. The first is an older version that
149 157
 # only supports schema1 manifests. The second is a newer version that supports
150 158
 # both. This allows integration-cli tests to cover push/pull with both schema1
151 159
new file mode 100644
... ...
@@ -0,0 +1,12 @@
0
+// +build experimental
1
+
2
+package checkpoint
3
+
4
+import "github.com/docker/docker/api/types"
5
+
6
+// Backend for Checkpoint
7
+type Backend interface {
8
+	CheckpointCreate(container string, config types.CheckpointCreateOptions) error
9
+	CheckpointDelete(container string, checkpointID string) error
10
+	CheckpointList(container string) ([]types.Checkpoint, error)
11
+}
0 12
new file mode 100644
... ...
@@ -0,0 +1,28 @@
0
+package checkpoint
1
+
2
+import (
3
+	"github.com/docker/docker/api/server/httputils"
4
+	"github.com/docker/docker/api/server/router"
5
+)
6
+
7
+// checkpointRouter is a router to talk with the checkpoint controller
8
+type checkpointRouter struct {
9
+	backend Backend
10
+	decoder httputils.ContainerDecoder
11
+	routes  []router.Route
12
+}
13
+
14
+// NewRouter initializes a new checkpoint router
15
+func NewRouter(b Backend, decoder httputils.ContainerDecoder) router.Router {
16
+	r := &checkpointRouter{
17
+		backend: b,
18
+		decoder: decoder,
19
+	}
20
+	r.initRoutes()
21
+	return r
22
+}
23
+
24
+// Routes returns the routes registered on the checkpoint router
25
+func (r *checkpointRouter) Routes() []router.Route {
26
+	return r.routes
27
+}
0 28
new file mode 100644
... ...
@@ -0,0 +1,15 @@
0
+// +build experimental
1
+
2
+package checkpoint
3
+
4
+import (
5
+	"github.com/docker/docker/api/server/router"
6
+)
7
+
8
+func (r *checkpointRouter) initRoutes() {
9
+	r.routes = []router.Route{
10
+		router.NewGetRoute("/containers/{name:.*}/checkpoints", r.getContainerCheckpoints),
11
+		router.NewPostRoute("/containers/{name:.*}/checkpoints", r.postContainerCheckpoint),
12
+		router.NewDeleteRoute("/containers/{name:.*}/checkpoints/{checkpoint:.*}", r.deleteContainerCheckpoint),
13
+	}
14
+}
0 15
new file mode 100644
... ...
@@ -0,0 +1,8 @@
0
+// +build !experimental
1
+
2
+package checkpoint
3
+
4
+func (r *checkpointRouter) initRoutes() {}
5
+
6
+// Backend is empty so that the package can compile in non-experimental builds
7
+type Backend interface{}
0 8
new file mode 100644
... ...
@@ -0,0 +1,60 @@
0
+// +build experimental
1
+
2
+package checkpoint
3
+
4
+import (
5
+	"encoding/json"
6
+	"net/http"
7
+
8
+	"github.com/docker/docker/api/server/httputils"
9
+	"github.com/docker/docker/api/types"
10
+	"golang.org/x/net/context"
11
+)
12
+
13
+func (s *checkpointRouter) postContainerCheckpoint(ctx context.Context, w http.ResponseWriter, r *http.Request, vars map[string]string) error {
14
+	if err := httputils.ParseForm(r); err != nil {
15
+		return err
16
+	}
17
+
18
+	var options types.CheckpointCreateOptions
19
+
20
+	decoder := json.NewDecoder(r.Body)
21
+	if err := decoder.Decode(&options); err != nil {
22
+		return err
23
+	}
24
+
25
+	err := s.backend.CheckpointCreate(vars["name"], options)
26
+	if err != nil {
27
+		return err
28
+	}
29
+
30
+	w.WriteHeader(http.StatusNoContent)
31
+	return nil
32
+}
33
+
34
+func (s *checkpointRouter) getContainerCheckpoints(ctx context.Context, w http.ResponseWriter, r *http.Request, vars map[string]string) error {
35
+	if err := httputils.ParseForm(r); err != nil {
36
+		return err
37
+	}
38
+
39
+	checkpoints, err := s.backend.CheckpointList(vars["name"])
40
+	if err != nil {
41
+		return err
42
+	}
43
+
44
+	return httputils.WriteJSON(w, http.StatusOK, checkpoints)
45
+}
46
+
47
+func (s *checkpointRouter) deleteContainerCheckpoint(ctx context.Context, w http.ResponseWriter, r *http.Request, vars map[string]string) error {
48
+	if err := httputils.ParseForm(r); err != nil {
49
+		return err
50
+	}
51
+
52
+	err := s.backend.CheckpointDelete(vars["name"], vars["checkpoint"])
53
+	if err != nil {
54
+		return err
55
+	}
56
+
57
+	w.WriteHeader(http.StatusNoContent)
58
+	return nil
59
+}
... ...
@@ -39,7 +39,7 @@ type stateBackend interface {
39 39
 	ContainerResize(name string, height, width int) error
40 40
 	ContainerRestart(name string, seconds int) error
41 41
 	ContainerRm(name string, config *types.ContainerRmConfig) error
42
-	ContainerStart(name string, hostConfig *container.HostConfig, validateHostname bool) error
42
+	ContainerStart(name string, hostConfig *container.HostConfig, validateHostname bool, checkpoint string) error
43 43
 	ContainerStop(name string, seconds int) error
44 44
 	ContainerUnpause(name string) error
45 45
 	ContainerUpdate(name string, hostConfig *container.HostConfig, validateHostname bool) (types.ContainerUpdateResponse, error)
... ...
@@ -151,10 +151,16 @@ func (s *containerRouter) postContainersStart(ctx context.Context, w http.Respon
151 151
 		hostConfig = c
152 152
 	}
153 153
 
154
+	if err := httputils.ParseForm(r); err != nil {
155
+		return err
156
+	}
157
+
158
+	checkpoint := r.Form.Get("checkpoint")
154 159
 	validateHostname := versions.GreaterThanOrEqualTo(version, "1.24")
155
-	if err := s.backend.ContainerStart(vars["name"], hostConfig, validateHostname); err != nil {
160
+	if err := s.backend.ContainerStart(vars["name"], hostConfig, validateHostname, checkpoint); err != nil {
156 161
 		return err
157 162
 	}
163
+
158 164
 	w.WriteHeader(http.StatusNoContent)
159 165
 	return nil
160 166
 }
... ...
@@ -124,12 +124,19 @@ type Backend interface {
124 124
 	// ContainerKill stops the container execution abruptly.
125 125
 	ContainerKill(containerID string, sig uint64) error
126 126
 	// ContainerStart starts a new container
127
-	ContainerStart(containerID string, hostConfig *container.HostConfig, validateHostname bool) error
127
+	ContainerStart(containerID string, hostConfig *container.HostConfig, validateHostname bool, checkpoint string) error
128 128
 	// ContainerWait stops processing until the given container is stopped.
129 129
 	ContainerWait(containerID string, timeout time.Duration) (int, error)
130 130
 	// ContainerUpdateCmdOnBuild updates container.Path and container.Args
131 131
 	ContainerUpdateCmdOnBuild(containerID string, cmd []string) error
132 132
 
133
+	// CheckpointCreate checkpoints a running container
134
+	CheckpointCreate(container string, config types.CheckpointCreateOptions) error
135
+	// CheckpointDelete deletes a container's checkpoint
136
+	CheckpointDelete(container string, checkpoint string) error
137
+	// CheckpointList lists the available checkpoints for a container
138
+	CheckpointList(container string) ([]types.Checkpoint, error)
139
+
133 140
 	// ContainerCopy copies/extracts a source FileInfo to a destination path inside a container
134 141
 	// specified by a container object.
135 142
 	// TODO: make an Extract method instead of passing `decompress`
... ...
@@ -555,7 +555,7 @@ func (b *Builder) run(cID string) (err error) {
555 555
 		}
556 556
 	}()
557 557
 
558
-	if err := b.docker.ContainerStart(cID, nil, true); err != nil {
558
+	if err := b.docker.ContainerStart(cID, nil, true, ""); err != nil {
559 559
 		return err
560 560
 	}
561 561
 
562 562
new file mode 100644
... ...
@@ -0,0 +1,12 @@
0
+// +build !experimental
1
+
2
+package checkpoint
3
+
4
+import (
5
+	"github.com/docker/docker/cli/command"
6
+	"github.com/spf13/cobra"
7
+)
8
+
9
+// NewCheckpointCommand is a no-op in non-experimental builds; no `checkpoint` command is added to rootCmd
10
+func NewCheckpointCommand(rootCmd *cobra.Command, dockerCli *command.DockerCli) {
11
+}
0 12
new file mode 100644
... ...
@@ -0,0 +1,31 @@
0
+// +build experimental
1
+
2
+package checkpoint
3
+
4
+import (
5
+	"fmt"
6
+
7
+	"github.com/spf13/cobra"
8
+
9
+	"github.com/docker/docker/cli"
10
+	"github.com/docker/docker/cli/command"
11
+)
12
+
13
+// NewCheckpointCommand adds the `checkpoint` command and its subcommands to rootCmd
14
+func NewCheckpointCommand(rootCmd *cobra.Command, dockerCli *command.DockerCli) {
15
+	cmd := &cobra.Command{
16
+		Use:   "checkpoint",
17
+		Short: "Manage Container Checkpoints",
18
+		Args:  cli.NoArgs,
19
+		Run: func(cmd *cobra.Command, args []string) {
20
+			fmt.Fprintf(dockerCli.Err(), "\n"+cmd.UsageString())
21
+		},
22
+	}
23
+	cmd.AddCommand(
24
+		newCreateCommand(dockerCli),
25
+		newListCommand(dockerCli),
26
+		newRemoveCommand(dockerCli),
27
+	)
28
+
29
+	rootCmd.AddCommand(cmd)
30
+}
0 31
new file mode 100644
... ...
@@ -0,0 +1,54 @@
0
+// +build experimental
1
+
2
+package checkpoint
3
+
4
+import (
5
+	"golang.org/x/net/context"
6
+
7
+	"github.com/docker/docker/api/types"
8
+	"github.com/docker/docker/cli"
9
+	"github.com/docker/docker/cli/command"
10
+	"github.com/spf13/cobra"
11
+)
12
+
13
+type createOptions struct {
14
+	container    string
15
+	checkpoint   string
16
+	leaveRunning bool
17
+}
18
+
19
+func newCreateCommand(dockerCli *command.DockerCli) *cobra.Command {
20
+	var opts createOptions
21
+
22
+	cmd := &cobra.Command{
23
+		Use:   "create CONTAINER CHECKPOINT",
24
+		Short: "Create a checkpoint from a running container",
25
+		Args:  cli.ExactArgs(2),
26
+		RunE: func(cmd *cobra.Command, args []string) error {
27
+			opts.container = args[0]
28
+			opts.checkpoint = args[1]
29
+			return runCreate(dockerCli, opts)
30
+		},
31
+	}
32
+
33
+	flags := cmd.Flags()
34
+	flags.BoolVar(&opts.leaveRunning, "leave-running", false, "leave the container running after checkpoing")
35
+
36
+	return cmd
37
+}
38
+
39
+func runCreate(dockerCli *command.DockerCli, opts createOptions) error {
40
+	client := dockerCli.Client()
41
+
42
+	checkpointOpts := types.CheckpointCreateOptions{
43
+		CheckpointID: opts.checkpoint,
44
+		Exit:         !opts.leaveRunning,
45
+	}
46
+
47
+	err := client.CheckpointCreate(context.Background(), opts.container, checkpointOpts)
48
+	if err != nil {
49
+		return err
50
+	}
51
+
52
+	return nil
53
+}
0 54
new file mode 100644
... ...
@@ -0,0 +1,47 @@
0
+// +build experimental
1
+
2
+package checkpoint
3
+
4
+import (
5
+	"fmt"
6
+	"text/tabwriter"
7
+
8
+	"golang.org/x/net/context"
9
+
10
+	"github.com/docker/docker/cli"
11
+	"github.com/docker/docker/cli/command"
12
+	"github.com/spf13/cobra"
13
+)
14
+
15
+func newListCommand(dockerCli *command.DockerCli) *cobra.Command {
16
+	return &cobra.Command{
17
+		Use:     "ls CONTAINER",
18
+		Aliases: []string{"list"},
19
+		Short:   "List checkpoints for a container",
20
+		Args:    cli.ExactArgs(1),
21
+		RunE: func(cmd *cobra.Command, args []string) error {
22
+			return runList(dockerCli, args[0])
23
+		},
24
+	}
25
+}
26
+
27
+func runList(dockerCli *command.DockerCli, container string) error {
28
+	client := dockerCli.Client()
29
+
30
+	checkpoints, err := client.CheckpointList(context.Background(), container)
31
+	if err != nil {
32
+		return err
33
+	}
34
+
35
+	w := tabwriter.NewWriter(dockerCli.Out(), 20, 1, 3, ' ', 0)
36
+	fmt.Fprintf(w, "CHECKPOINT NAME")
37
+	fmt.Fprintf(w, "\n")
38
+
39
+	for _, checkpoint := range checkpoints {
40
+		fmt.Fprintf(w, "%s\t", checkpoint.Name)
41
+		fmt.Fprint(w, "\n")
42
+	}
43
+
44
+	w.Flush()
45
+	return nil
46
+}
0 47
new file mode 100644
... ...
@@ -0,0 +1,28 @@
0
+// +build experimental
1
+
2
+package checkpoint
3
+
4
+import (
5
+	"golang.org/x/net/context"
6
+
7
+	"github.com/docker/docker/cli"
8
+	"github.com/docker/docker/cli/command"
9
+	"github.com/spf13/cobra"
10
+)
11
+
12
+func newRemoveCommand(dockerCli *command.DockerCli) *cobra.Command {
13
+	return &cobra.Command{
14
+		Use:     "rm CONTAINER CHECKPOINT",
15
+		Aliases: []string{"remove"},
16
+		Short:   "Remove a checkpoint",
17
+		Args:    cli.ExactArgs(2),
18
+		RunE: func(cmd *cobra.Command, args []string) error {
19
+			return runRemove(dockerCli, args[0], args[1])
20
+		},
21
+	}
22
+}
23
+
24
+func runRemove(dockerCli *command.DockerCli, container string, checkpoint string) error {
25
+	client := dockerCli.Client()
26
+	return client.CheckpointDelete(context.Background(), container, checkpoint)
27
+}
... ...
@@ -2,6 +2,7 @@ package commands
2 2
 
3 3
 import (
4 4
 	"github.com/docker/docker/cli/command"
5
+	"github.com/docker/docker/cli/command/checkpoint"
5 6
 	"github.com/docker/docker/cli/command/container"
6 7
 	"github.com/docker/docker/cli/command/image"
7 8
 	"github.com/docker/docker/cli/command/network"
... ...
@@ -67,5 +68,6 @@ func AddCommands(cmd *cobra.Command, dockerCli *command.DockerCli) {
67 67
 		volume.NewVolumeCommand(dockerCli),
68 68
 		system.NewInfoCommand(dockerCli),
69 69
 	)
70
+	checkpoint.NewCheckpointCommand(cmd, dockerCli)
70 71
 	plugin.NewPluginCommand(cmd, dockerCli)
71 72
 }
... ...
@@ -20,6 +20,7 @@ type startOptions struct {
20 20
 	attach     bool
21 21
 	openStdin  bool
22 22
 	detachKeys string
23
+	checkpoint string
23 24
 
24 25
 	containers []string
25 26
 }
... ...
@@ -42,6 +43,9 @@ func NewStartCommand(dockerCli *command.DockerCli) *cobra.Command {
42 42
 	flags.BoolVarP(&opts.attach, "attach", "a", false, "Attach STDOUT/STDERR and forward signals")
43 43
 	flags.BoolVarP(&opts.openStdin, "interactive", "i", false, "Attach container's STDIN")
44 44
 	flags.StringVar(&opts.detachKeys, "detach-keys", "", "Override the key sequence for detaching a container")
45
+
46
+	addExperimentalStartFlags(flags, &opts)
47
+
45 48
 	return cmd
46 49
 }
47 50
 
... ...
@@ -105,9 +109,12 @@ func runStart(dockerCli *command.DockerCli, opts *startOptions) error {
105 105
 		// 3. We should open a channel for receiving status code of the container
106 106
 		// no matter it's detached, removed on daemon side(--rm) or exit normally.
107 107
 		statusChan, statusErr := waitExitOrRemoved(dockerCli, context.Background(), c.ID, c.HostConfig.AutoRemove)
108
+		startOptions := types.ContainerStartOptions{
109
+			CheckpointID: opts.checkpoint,
110
+		}
108 111
 
109 112
 		// 4. Start the container.
110
-		if err := dockerCli.Client().ContainerStart(ctx, c.ID, types.ContainerStartOptions{}); err != nil {
113
+		if err := dockerCli.Client().ContainerStart(ctx, c.ID, startOptions); err != nil {
111 114
 			cancelFun()
112 115
 			<-cErr
113 116
 			if c.HostConfig.AutoRemove && statusErr == nil {
... ...
@@ -134,6 +141,16 @@ func runStart(dockerCli *command.DockerCli, opts *startOptions) error {
134 134
 		if status := <-statusChan; status != 0 {
135 135
 			return cli.StatusError{StatusCode: status}
136 136
 		}
137
+	} else if opts.checkpoint != "" {
138
+		if len(opts.containers) > 1 {
139
+			return fmt.Errorf("You cannot restore multiple containers at once.")
140
+		}
141
+		container := opts.containers[0]
142
+		startOptions := types.ContainerStartOptions{
143
+			CheckpointID: opts.checkpoint,
144
+		}
145
+		return dockerCli.Client().ContainerStart(ctx, container, startOptions)
146
+
137 147
 	} else {
138 148
 		// We're not going to attach to anything.
139 149
 		// Start as many containers as we want.
140 150
new file mode 100644
... ...
@@ -0,0 +1,8 @@
0
+// +build !experimental
1
+
2
+package container
3
+
4
+import "github.com/spf13/pflag"
5
+
6
+func addExperimentalStartFlags(flags *pflag.FlagSet, opts *startOptions) {
7
+}
0 8
new file mode 100644
... ...
@@ -0,0 +1,9 @@
0
+// +build experimental
1
+
2
+package container
3
+
4
+import "github.com/spf13/pflag"
5
+
6
+func addExperimentalStartFlags(flags *pflag.FlagSet, opts *startOptions) {
7
+	flags.StringVar(&opts.checkpoint, "checkpoint", "", "Restore from this checkpoint")
8
+}
... ...
@@ -409,7 +409,7 @@ func initRouter(s *apiserver.Server, d *daemon.Daemon, c *cluster.Cluster) {
409 409
 	if d.NetworkControllerEnabled() {
410 410
 		routers = append(routers, network.NewRouter(d, c))
411 411
 	}
412
-	routers = addExperimentalRouters(routers)
412
+	routers = addExperimentalRouters(routers, d, decoder)
413 413
 
414 414
 	s.InitRouter(utils.IsDebugEnabled(), routers...)
415 415
 }
... ...
@@ -2,8 +2,12 @@
2 2
 
3 3
 package main
4 4
 
5
-import "github.com/docker/docker/api/server/router"
5
+import (
6
+	"github.com/docker/docker/api/server/httputils"
7
+	"github.com/docker/docker/api/server/router"
8
+	"github.com/docker/docker/daemon"
9
+)
6 10
 
7
-func addExperimentalRouters(routers []router.Router) []router.Router {
11
+func addExperimentalRouters(routers []router.Router, d *daemon.Daemon, decoder httputils.ContainerDecoder) []router.Router {
8 12
 	return routers
9 13
 }
... ...
@@ -3,11 +3,14 @@
3 3
 package main
4 4
 
5 5
 import (
6
+	"github.com/docker/docker/api/server/httputils"
6 7
 	"github.com/docker/docker/api/server/router"
8
+	checkpointrouter "github.com/docker/docker/api/server/router/checkpoint"
7 9
 	pluginrouter "github.com/docker/docker/api/server/router/plugin"
10
+	"github.com/docker/docker/daemon"
8 11
 	"github.com/docker/docker/plugin"
9 12
 )
10 13
 
11
-func addExperimentalRouters(routers []router.Router) []router.Router {
12
-	return append(routers, pluginrouter.NewRouter(plugin.GetManager()))
14
+func addExperimentalRouters(routers []router.Router, d *daemon.Daemon, decoder httputils.ContainerDecoder) []router.Router {
15
+	return append(routers, checkpointrouter.NewRouter(d, decoder), pluginrouter.NewRouter(plugin.GetManager()))
13 16
 }
... ...
@@ -306,6 +306,11 @@ func (container *Container) ConfigPath() (string, error) {
306 306
 	return container.GetRootResourcePath(configFileName)
307 307
 }
308 308
 
309
+// CheckpointDir returns the directory checkpoints are stored in
310
+func (container *Container) CheckpointDir() string {
311
+	return filepath.Join(container.Root, "checkpoints")
312
+}
313
+
309 314
 // StartLogger starts a new logger driver for the container.
310 315
 func (container *Container) StartLogger(cfg containertypes.LogConfig) (logger.Logger, error) {
311 316
 	c, err := logger.GetLogDriver(cfg.Type)
312 317
new file mode 100644
... ...
@@ -0,0 +1,82 @@
0
+package daemon
1
+
2
+import (
3
+	"encoding/json"
4
+	"fmt"
5
+	"io/ioutil"
6
+	"os"
7
+	"path/filepath"
8
+
9
+	"github.com/docker/docker/api/types"
10
+)
11
+
12
+// CheckpointCreate checkpoints the process running in a container with CRIU
13
+func (daemon *Daemon) CheckpointCreate(name string, config types.CheckpointCreateOptions) error {
14
+	container, err := daemon.GetContainer(name)
15
+	if err != nil {
16
+		return err
17
+	}
18
+
19
+	if !container.IsRunning() {
20
+		return fmt.Errorf("Container %s not running", name)
21
+	}
22
+
23
+	err = daemon.containerd.CreateCheckpoint(container.ID, config.CheckpointID, container.CheckpointDir(), config.Exit)
24
+	if err != nil {
25
+		return fmt.Errorf("Cannot checkpoint container %s: %s", name, err)
26
+	}
27
+
28
+	daemon.LogContainerEvent(container, "checkpoint")
29
+
30
+	return nil
31
+}
32
+
33
+// CheckpointDelete deletes the specified checkpoint
34
+func (daemon *Daemon) CheckpointDelete(name string, checkpoint string) error {
35
+	container, err := daemon.GetContainer(name)
36
+	if err != nil {
37
+		return err
38
+	}
39
+
40
+	checkpointDir := container.CheckpointDir()
41
+	return os.RemoveAll(filepath.Join(checkpointDir, checkpoint))
42
+}
43
+
44
+// CheckpointList lists the checkpoints stored for the specified container
45
+func (daemon *Daemon) CheckpointList(name string) ([]types.Checkpoint, error) {
46
+	response := []types.Checkpoint{}
47
+
48
+	container, err := daemon.GetContainer(name)
49
+	if err != nil {
50
+		return response, err
51
+	}
52
+
53
+	checkpointDir := container.CheckpointDir()
54
+	if err := os.MkdirAll(checkpointDir, 0755); err != nil {
55
+		return nil, err
56
+	}
57
+
58
+	dirs, err := ioutil.ReadDir(checkpointDir)
59
+	if err != nil {
60
+		return nil, err
61
+	}
62
+
63
+	var out []types.Checkpoint
64
+	for _, d := range dirs {
65
+		if !d.IsDir() {
66
+			continue
67
+		}
68
+		path := filepath.Join(checkpointDir, d.Name(), "config.json")
69
+		data, err := ioutil.ReadFile(path)
70
+		if err != nil {
71
+			return nil, err
72
+		}
73
+		var cpt types.Checkpoint
74
+		if err := json.Unmarshal(data, &cpt); err != nil {
75
+			return nil, err
76
+		}
77
+		out = append(out, cpt)
78
+	}
79
+
80
+	return out, nil
81
+}
... ...
@@ -24,7 +24,7 @@ type Backend interface {
24 24
 	SetupIngress(req clustertypes.NetworkCreateRequest, nodeIP string) error
25 25
 	PullImage(ctx context.Context, image, tag string, metaHeaders map[string][]string, authConfig *types.AuthConfig, outStream io.Writer) error
26 26
 	CreateManagedContainer(config types.ContainerCreateConfig, validateHostname bool) (types.ContainerCreateResponse, error)
27
-	ContainerStart(name string, hostConfig *container.HostConfig, validateHostname bool) error
27
+	ContainerStart(name string, hostConfig *container.HostConfig, validateHostname bool, checkpoint string) error
28 28
 	ContainerStop(name string, seconds int) error
29 29
 	ConnectContainerToNetwork(containerName, networkName string, endpointConfig *network.EndpointSettings) error
30 30
 	UpdateContainerServiceConfig(containerName string, serviceConfig *clustertypes.ServiceConfig) error
... ...
@@ -220,7 +220,7 @@ func (c *containerAdapter) create(ctx context.Context) error {
220 220
 func (c *containerAdapter) start(ctx context.Context) error {
221 221
 	version := httputils.VersionFromContext(ctx)
222 222
 	validateHostname := versions.GreaterThanOrEqualTo(version, "1.24")
223
-	return c.backend.ContainerStart(c.container.name(), nil, validateHostname)
223
+	return c.backend.ContainerStart(c.container.name(), nil, validateHostname, "")
224 224
 }
225 225
 
226 226
 func (c *containerAdapter) inspect(ctx context.Context) (types.ContainerJSON, error) {
... ...
@@ -115,6 +115,9 @@ func (daemon *Daemon) create(params types.ContainerCreateConfig, managed bool) (
115 115
 	if err := idtools.MkdirAs(container.Root, 0700, rootUID, rootGID); err != nil {
116 116
 		return nil, err
117 117
 	}
118
+	if err := idtools.MkdirAs(container.CheckpointDir(), 0700, rootUID, rootGID); err != nil {
119
+		return nil, err
120
+	}
118 121
 
119 122
 	if err := daemon.setHostConfig(container, params.HostConfig); err != nil {
120 123
 		return nil, err
... ...
@@ -287,7 +287,7 @@ func (daemon *Daemon) restore() error {
287 287
 
288 288
 			// Make sure networks are available before starting
289 289
 			daemon.waitForNetworks(c)
290
-			if err := daemon.containerStart(c); err != nil {
290
+			if err := daemon.containerStart(c, ""); err != nil {
291 291
 				logrus.Errorf("Failed to start container %s: %s", c.ID, err)
292 292
 			}
293 293
 			close(chNotify)
... ...
@@ -28,7 +28,7 @@ func (daemon *Daemon) postRunProcessing(container *container.Container, e libcon
28 28
 
29 29
 		// Create a new servicing container, which will start, complete the update, and merge back the
30 30
 		// results if it succeeded, all as part of the below function call.
31
-		if err := daemon.containerd.Create((container.ID + "_servicing"), *spec, servicingOption); err != nil {
31
+		if err := daemon.containerd.Create((container.ID + "_servicing"), "", "", *spec, servicingOption); err != nil {
32 32
 			container.SetExitCode(-1)
33 33
 			return fmt.Errorf("Post-run update servicing failed: %s", err)
34 34
 		}
... ...
@@ -56,7 +56,7 @@ func (daemon *Daemon) containerRestart(container *container.Container, seconds i
56 56
 		}
57 57
 	}
58 58
 
59
-	if err := daemon.containerStart(container); err != nil {
59
+	if err := daemon.containerStart(container, ""); err != nil {
60 60
 		return err
61 61
 	}
62 62
 
... ...
@@ -19,7 +19,7 @@ import (
19 19
 )
20 20
 
21 21
 // ContainerStart starts a container.
22
-func (daemon *Daemon) ContainerStart(name string, hostConfig *containertypes.HostConfig, validateHostname bool) error {
22
+func (daemon *Daemon) ContainerStart(name string, hostConfig *containertypes.HostConfig, validateHostname bool, checkpoint string) error {
23 23
 	container, err := daemon.GetContainer(name)
24 24
 	if err != nil {
25 25
 		return err
... ...
@@ -78,19 +78,19 @@ func (daemon *Daemon) ContainerStart(name string, hostConfig *containertypes.Hos
78 78
 		return err
79 79
 	}
80 80
 
81
-	return daemon.containerStart(container)
81
+	return daemon.containerStart(container, checkpoint)
82 82
 }
83 83
 
84 84
 // Start starts a container
85 85
 func (daemon *Daemon) Start(container *container.Container) error {
86
-	return daemon.containerStart(container)
86
+	return daemon.containerStart(container, "")
87 87
 }
88 88
 
89 89
 // containerStart prepares the container to run by setting up everything the
90 90
 // container needs, such as storage and networking, as well as links
91 91
 // between containers. The container is left waiting for a signal to
92 92
 // begin running.
93
-func (daemon *Daemon) containerStart(container *container.Container) (err error) {
93
+func (daemon *Daemon) containerStart(container *container.Container, checkpoint string) (err error) {
94 94
 	container.Lock()
95 95
 	defer container.Unlock()
96 96
 
... ...
@@ -150,7 +150,7 @@ func (daemon *Daemon) containerStart(container *container.Container) (err error)
150 150
 		createOptions = append(createOptions, *copts...)
151 151
 	}
152 152
 
153
-	if err := daemon.containerd.Create(container.ID, *spec, createOptions...); err != nil {
153
+	if err := daemon.containerd.Create(container.ID, checkpoint, container.CheckpointDir(), *spec, createOptions...); err != nil {
154 154
 		errDesc := grpc.ErrorDesc(err)
155 155
 		logrus.Errorf("Create container failed with error: %s", errDesc)
156 156
 		// if we receive an internal error from the initial start of a container then lets
... ...
@@ -2,7 +2,7 @@
2 2
 
3 3
 This page contains a list of features in the Docker engine which are
4 4
 experimental. Experimental features are **not** ready for production. They are
5
-provided for test and evaluation in your sandbox environments.  
5
+provided for test and evaluation in your sandbox environments.
6 6
 
7 7
 The information below describes each feature and the GitHub pull requests and
8 8
 issues associated with it. If necessary, links are provided to additional
... ...
@@ -74,9 +74,10 @@ to build a Docker binary with the experimental features enabled:
74 74
  * [External graphdriver plugins](plugins_graphdriver.md)
75 75
  * [Macvlan and Ipvlan Network Drivers](vlan-networks.md)
76 76
  * [Docker Stacks and Distributed Application Bundles](docker-stacks-and-bundles.md)
77
+ * [Checkpoint & Restore](checkpoint-restore.md)
77 78
 
78 79
 ## How to comment on an experimental feature
79 80
 
80
-Each feature's documentation includes a list of proposal pull requests or PRs associated with the feature. If you want to comment on or suggest a change to a feature, please add it to the existing feature PR.  
81
+Each feature's documentation includes a list of proposal pull requests or PRs associated with the feature. If you want to comment on or suggest a change to a feature, please add it to the existing feature PR.
81 82
 
82
-Issues or problems with a feature? Inquire for help on the `#docker` IRC channel or in on the [Docker Google group](https://groups.google.com/forum/#!forum/docker-user).  
83
+Issues or problems with a feature? Inquire for help on the `#docker` IRC channel or in on the [Docker Google group](https://groups.google.com/forum/#!forum/docker-user).
83 84
new file mode 100644
... ...
@@ -0,0 +1,75 @@
0
+# Docker Checkpoint & Restore
1
+
2
+Checkpoint & Restore is a new feature that allows you to freeze a running
3
+container by checkpointing it, which turns its state into a collection of files
4
+on disk. Later, the container can be restored from the point it was frozen.
5
+
6
+This is accomplished using a tool called [CRIU](http://criu.org), which is an
7
+external dependency of this feature. A good overview of the history of
8
+checkpoint and restore in Docker is available in this
9
+[Kubernetes blog post](http://blog.kubernetes.io/2015/07/how-did-quake-demo-from-dockercon-work.html).
10
+
11
+## Installing CRIU
12
+
13
+If you use a Debian system, you can add the CRIU PPA and install with apt-get
14
+[from the criu launchpad](https://launchpad.net/~criu/+archive/ubuntu/ppa).
15
+
16
+Alternatively, you can [build CRIU from source](http://criu.org/Installation).
17
+
18
+You need at least version 2.0 of CRIU to run checkpoint/restore in Docker.
19
+
20
+## Use cases for checkpoint & restore
21
+
22
+This feature is currently focused on single-host use cases for checkpoint and
23
+restore. Here are a few:
24
+
25
+- Restarting the host machine without stopping/starting containers
26
+- Speeding up the start time of slow start applications
27
+- "Rewinding" processes to an earlier point in time
28
+- "Forensic debugging" of running processes
29
+
30
+Another primary use case of checkpoint & restore outside of Docker is the live
31
+migration of a server from one machine to another. This is possible with the
32
+current implementation, but not currently a priority (and so the workflow is
33
+not optimized for the task).
34
+
35
+## Using Checkpoint & Restore
36
+
37
+A new top-level command, `docker checkpoint`, is introduced, with three subcommands:
38
+- `create` (creates a new checkpoint)
39
+- `ls` (lists existing checkpoints)
40
+- `rm` (deletes an existing checkpoint)
41
+
42
+Additionally, a `--checkpoint` flag is added to the container start command.
43
+
44
+The options for checkpoint create:
45
+
46
+    Usage:  docker checkpoint create [OPTIONS] CONTAINER CHECKPOINT_ID
47
+
48
+    Checkpoint the specified container
49
+
50
+      --leave-running=false    leave the container running after checkpoint
51
+
52
+And to restore a container:
53
+
54
+    Usage:  docker start --checkpoint CHECKPOINT_ID [OTHER OPTIONS] CONTAINER
55
+
56
+
57
+A simple example of using checkpoint & restore on a container:
58
+
59
+    $ docker run --security-opt=seccomp:unconfined --name cr -d busybox /bin/sh -c 'i=0; while true; do echo $i; i=$(expr $i + 1); sleep 1; done'
60
+    > abc0123
61
+
62
+    $ docker checkpoint create cr checkpoint1
63
+
64
+    # <later>
65
+    $ docker start --checkpoint checkpoint1 cr
66
+    > abc0123
67
+
68
+This process just logs an incrementing counter to stdout. If you `docker logs`
69
+in between running/checkpointing/restoring you should see that the counter
70
+increases while the process is running, stops while it's checkpointed, and
71
+resumes from the point it left off once you restore.
72
+
73
+Note that CRIU supports seccomp only on very recent kernels.
74
+
... ...
@@ -10,6 +10,7 @@ import (
10 10
 	"github.com/docker/docker/pkg/homedir"
11 11
 	"github.com/docker/docker/pkg/integration/checker"
12 12
 	icmd "github.com/docker/docker/pkg/integration/cmd"
13
+	"github.com/docker/docker/utils"
13 14
 	"github.com/go-check/check"
14 15
 )
15 16
 
... ...
@@ -122,6 +123,12 @@ func (s *DockerSuite) TestHelpTextVerify(c *check.C) {
122 122
 		cmdsToTest = append(cmdsToTest, "network ls")
123 123
 		cmdsToTest = append(cmdsToTest, "network rm")
124 124
 
125
+		if utils.ExperimentalBuild() {
126
+			cmdsToTest = append(cmdsToTest, "checkpoint create")
127
+			cmdsToTest = append(cmdsToTest, "checkpoint ls")
128
+			cmdsToTest = append(cmdsToTest, "checkpoint rm")
129
+		}
130
+
125 131
 		// Divide the list of commands into go routines and  run the func testcommand on the commands in parallel
126 132
 		// to save runtime of test
127 133
 
... ...
@@ -133,7 +133,7 @@ func (clnt *client) prepareBundleDir(uid, gid int) (string, error) {
133 133
 	return p, nil
134 134
 }
135 135
 
136
-func (clnt *client) Create(containerID string, spec Spec, options ...CreateOption) (err error) {
136
+func (clnt *client) Create(containerID string, checkpoint string, checkpointDir string, spec Spec, options ...CreateOption) (err error) {
137 137
 	clnt.lock(containerID)
138 138
 	defer clnt.unlock(containerID)
139 139
 
... ...
@@ -180,7 +180,7 @@ func (clnt *client) Create(containerID string, spec Spec, options ...CreateOptio
180 180
 		return err
181 181
 	}
182 182
 
183
-	return container.start()
183
+	return container.start(checkpoint, checkpointDir)
184 184
 }
185 185
 
186 186
 func (clnt *client) Signal(containerID string, sig int) error {
... ...
@@ -625,3 +625,57 @@ func (en *exitNotifier) close() {
625 625
 func (en *exitNotifier) wait() <-chan struct{} {
626 626
 	return en.c
627 627
 }
628
+
629
+func (clnt *client) CreateCheckpoint(containerID string, checkpointID string, checkpointDir string, exit bool) error {
630
+	clnt.lock(containerID)
631
+	defer clnt.unlock(containerID)
632
+	if _, err := clnt.getContainer(containerID); err != nil {
633
+		return err
634
+	}
635
+
636
+	_, err := clnt.remote.apiClient.CreateCheckpoint(context.Background(), &containerd.CreateCheckpointRequest{
637
+		Id: containerID,
638
+		Checkpoint: &containerd.Checkpoint{
639
+			Name:        checkpointID,
640
+			Exit:        exit,
641
+			Tcp:         true,
642
+			UnixSockets: true,
643
+			Shell:       false,
644
+			EmptyNS:     []string{"network"},
645
+		},
646
+		CheckpointDir: checkpointDir,
647
+	})
648
+	return err
649
+}
650
+
651
+func (clnt *client) DeleteCheckpoint(containerID string, checkpointID string, checkpointDir string) error {
652
+	clnt.lock(containerID)
653
+	defer clnt.unlock(containerID)
654
+	if _, err := clnt.getContainer(containerID); err != nil {
655
+		return err
656
+	}
657
+
658
+	_, err := clnt.remote.apiClient.DeleteCheckpoint(context.Background(), &containerd.DeleteCheckpointRequest{
659
+		Id:            containerID,
660
+		Name:          checkpointID,
661
+		CheckpointDir: checkpointDir,
662
+	})
663
+	return err
664
+}
665
+
666
+func (clnt *client) ListCheckpoints(containerID string, checkpointDir string) (*Checkpoints, error) {
667
+	clnt.lock(containerID)
668
+	defer clnt.unlock(containerID)
669
+	if _, err := clnt.getContainer(containerID); err != nil {
670
+		return nil, err
671
+	}
672
+
673
+	resp, err := clnt.remote.apiClient.ListCheckpoint(context.Background(), &containerd.ListCheckpointRequest{
674
+		Id:            containerID,
675
+		CheckpointDir: checkpointDir,
676
+	})
677
+	if err != nil {
678
+		return nil, err
679
+	}
680
+	return (*Checkpoints)(resp), nil
681
+}
... ...
@@ -12,7 +12,7 @@ func (clnt *client) AddProcess(ctx context.Context, containerID, processFriendly
12 12
 	return nil
13 13
 }
14 14
 
15
-func (clnt *client) Create(containerID string, spec Spec, options ...CreateOption) (err error) {
15
+func (clnt *client) Create(containerID string, checkpoint string, checkpointDir string, spec Spec, options ...CreateOption) (err error) {
16 16
 	return nil
17 17
 }
18 18
 
... ...
@@ -37,7 +37,7 @@ const defaultOwner = "docker"
37 37
 
38 38
 // Create is the entrypoint to create a container from a spec, and if successfully
39 39
 // created, start it too.
40
-func (clnt *client) Create(containerID string, spec Spec, options ...CreateOption) error {
40
+func (clnt *client) Create(containerID string, checkpoint string, checkpointDir string, spec Spec, options ...CreateOption) error {
41 41
 	logrus.Debugln("libcontainerd: client.Create() with spec", spec)
42 42
 
43 43
 	configuration := &hcsshim.ContainerConfig{
... ...
@@ -435,3 +435,15 @@ func (clnt *client) UpdateResources(containerID string, resources Resources) err
435 435
 	// but we should return nil for enabling updating container
436 436
 	return nil
437 437
 }
438
+
439
+func (clnt *client) CreateCheckpoint(containerID string, checkpointID string, checkpointDir string, exit bool) error {
440
+	return errors.New("Windows: Containers do not support checkpoints")
441
+}
442
+
443
+func (clnt *client) DeleteCheckpoint(containerID string, checkpointID string, checkpointDir string) error {
444
+	return errors.New("Windows: Containers do not support checkpoints")
445
+}
446
+
447
+func (clnt *client) ListCheckpoints(containerID string, checkpointDir string) (*Checkpoints, error) {
448
+	return nil, errors.New("Windows: Containers do not support checkpoints")
449
+}
... ...
@@ -86,7 +86,7 @@ func (ctr *container) spec() (*specs.Spec, error) {
86 86
 	return &spec, nil
87 87
 }
88 88
 
89
-func (ctr *container) start() error {
89
+func (ctr *container) start(checkpoint string, checkpointDir string) error {
90 90
 	spec, err := ctr.spec()
91 91
 	if err != nil {
92 92
 		return nil
... ...
@@ -97,11 +97,13 @@ func (ctr *container) start() error {
97 97
 	}
98 98
 
99 99
 	r := &containerd.CreateContainerRequest{
100
-		Id:         ctr.containerID,
101
-		BundlePath: ctr.dir,
102
-		Stdin:      ctr.fifo(syscall.Stdin),
103
-		Stdout:     ctr.fifo(syscall.Stdout),
104
-		Stderr:     ctr.fifo(syscall.Stderr),
100
+		Id:            ctr.containerID,
101
+		BundlePath:    ctr.dir,
102
+		Stdin:         ctr.fifo(syscall.Stdin),
103
+		Stdout:        ctr.fifo(syscall.Stdout),
104
+		Stderr:        ctr.fifo(syscall.Stderr),
105
+		Checkpoint:    checkpoint,
106
+		CheckpointDir: checkpointDir,
105 107
 		// check to see if we are running in ramdisk to disable pivot root
106 108
 		NoPivotRoot: os.Getenv("DOCKER_RAMDISK") != "",
107 109
 		Runtime:     ctr.runtime,
... ...
@@ -191,7 +193,7 @@ func (ctr *container) handleEvent(e *containerd.Event) error {
191 191
 					defer ctr.client.unlock(ctr.containerID)
192 192
 					ctr.restarting = false
193 193
 					if err == nil {
194
-						if err = ctr.start(); err != nil {
194
+						if err = ctr.start("", ""); err != nil {
195 195
 							logrus.Errorf("libcontainerd: error restarting %v", err)
196 196
 						}
197 197
 					}
... ...
@@ -261,7 +261,7 @@ func (ctr *container) waitExit(process *process, isFirstProcessToStart bool) err
261 261
 			ctr.restarting = false
262 262
 			ctr.client.deleteContainer(ctr.friendlyName)
263 263
 			if err == nil {
264
-				if err = ctr.client.Create(ctr.containerID, ctr.ociSpec, ctr.options...); err != nil {
264
+				if err = ctr.client.Create(ctr.containerID, "", "", ctr.ociSpec, ctr.options...); err != nil {
265 265
 					logrus.Errorf("libcontainerd: error restarting %v", err)
266 266
 				}
267 267
 			}
... ...
@@ -36,7 +36,7 @@ type Backend interface {
36 36
 
37 37
 // Client provides access to containerd features.
38 38
 type Client interface {
39
-	Create(containerID string, spec Spec, options ...CreateOption) error
39
+	Create(containerID string, checkpoint string, checkpointDir string, spec Spec, options ...CreateOption) error
40 40
 	Signal(containerID string, sig int) error
41 41
 	SignalProcess(containerID string, processFriendlyName string, sig int) error
42 42
 	AddProcess(ctx context.Context, containerID, processFriendlyName string, process Process) error
... ...
@@ -48,6 +48,9 @@ type Client interface {
48 48
 	GetPidsForContainer(containerID string) ([]int, error)
49 49
 	Summary(containerID string) ([]Summary, error)
50 50
 	UpdateResources(containerID string, resources Resources) error
51
+	CreateCheckpoint(containerID string, checkpointID string, checkpointDir string, exit bool) error
52
+	DeleteCheckpoint(containerID string, checkpointID string, checkpointDir string) error
53
+	ListCheckpoints(containerID string, checkpointDir string) (*Checkpoints, error)
51 54
 }
52 55
 
53 56
 // CreateOption allows to configure parameters of container creation.
... ...
@@ -53,3 +53,6 @@ type User specs.User
53 53
 
54 54
 // Resources defines updatable container resource values.
55 55
 type Resources containerd.UpdateResource
56
+
57
+// Checkpoints contains the list of checkpoints returned by a containerd ListCheckpoint request
58
+type Checkpoints containerd.ListCheckpointResponse
... ...
@@ -37,3 +37,13 @@ type Resources struct{}
37 37
 type ServicingOption struct {
38 38
 	IsServicing bool
39 39
 }
40
+
41
+// Checkpoint holds the details of a checkpoint (not supported in windows)
42
+type Checkpoint struct {
43
+	Name string
44
+}
45
+
46
+// Checkpoints contains a list of checkpoints
47
+type Checkpoints struct {
48
+	Checkpoints []*Checkpoint
49
+}
... ...
@@ -27,7 +27,7 @@ func (pm *Manager) enable(p *v2.Plugin, force bool) error {
27 27
 	}
28 28
 
29 29
 	p.RestartManager = restartmanager.New(container.RestartPolicy{Name: "always"}, 0)
30
-	if err := pm.containerdClient.Create(p.GetID(), libcontainerd.Spec(*spec), libcontainerd.WithRestartManager(p.RestartManager)); err != nil {
30
+	if err := pm.containerdClient.Create(p.GetID(), "", "", libcontainerd.Spec(*spec), libcontainerd.WithRestartManager(p.RestartManager)); err != nil {
31 31
 		if err := p.RestartManager.Cancel(); err != nil {
32 32
 			logrus.Errorf("enable: restartManager.Cancel failed due to %v", err)
33 33
 		}