Browse code

Merge pull request #40174 from AkihiroSuda/cgroup2

support cgroup2

Sebastiaan van Stijn authored on 2020/01/10 04:09:11
Showing 15 changed files
... ...
@@ -9,6 +9,7 @@ import (
9 9
 	"github.com/docker/docker/opts"
10 10
 	"github.com/docker/docker/rootless"
11 11
 	units "github.com/docker/go-units"
12
+	"github.com/opencontainers/runc/libcontainer/cgroups"
12 13
 	"github.com/pkg/errors"
13 14
 	"github.com/spf13/pflag"
14 15
 )
... ...
@@ -64,6 +65,10 @@ func installConfigFlags(conf *config.Config, flags *pflag.FlagSet) error {
64 64
 	// rootless needs to be explicitly specified for running "rootful" dockerd in rootless dockerd (#38702)
65 65
 	// Note that defaultUserlandProxyPath and honorXDG are configured according to the value of rootless.RunningWithRootlessKit, not the value of --rootless.
66 66
 	flags.BoolVar(&conf.Rootless, "rootless", rootless.RunningWithRootlessKit(), "Enable rootless mode; typically used with RootlessKit (experimental)")
67
-	flags.StringVar(&conf.CgroupNamespaceMode, "default-cgroupns-mode", config.DefaultCgroupNamespaceMode, `Default mode for containers cgroup namespace ("host" | "private")`)
67
+	defaultCgroupNamespaceMode := "host"
68
+	if cgroups.IsCgroup2UnifiedMode() {
69
+		defaultCgroupNamespaceMode = "private"
70
+	}
71
+	flags.StringVar(&conf.CgroupNamespaceMode, "default-cgroupns-mode", defaultCgroupNamespaceMode, `Default mode for containers cgroup namespace ("host" | "private")`)
68 72
 	return nil
69 73
 }
... ...
@@ -11,8 +11,6 @@ import (
11 11
 )
12 12
 
13 13
 const (
14
-	// DefaultCgroupNamespaceMode is the default for a container's CgroupnsMode, if not set otherwise
15
-	DefaultCgroupNamespaceMode = "host" // TODO: change to private
16 14
 	// DefaultIpcMode is the default for a container's IpcMode, if not set otherwise
17 15
 	DefaultIpcMode = "private"
18 16
 )
... ...
@@ -794,6 +794,7 @@ func NewDaemon(ctx context.Context, config *config.Config, pluginStore *plugin.S
794 794
 		PluginStore: pluginStore,
795 795
 		startupDone: make(chan struct{}),
796 796
 	}
797
+
797 798
 	// Ensure the daemon is properly shutdown if there is a failure during
798 799
 	// initialization
799 800
 	defer func() {
... ...
@@ -914,7 +915,7 @@ func NewDaemon(ctx context.Context, config *config.Config, pluginStore *plugin.S
914 914
 			}
915 915
 		}
916 916
 
917
-		return pluginexec.New(ctx, getPluginExecRoot(config.Root), pluginCli, config.ContainerdPluginNamespace, m)
917
+		return pluginexec.New(ctx, getPluginExecRoot(config.Root), pluginCli, config.ContainerdPluginNamespace, m, d.useShimV2())
918 918
 	}
919 919
 
920 920
 	// Plugin system initialization should happen before restore. Do not change order.
... ...
@@ -1063,7 +1064,7 @@ func NewDaemon(ctx context.Context, config *config.Config, pluginStore *plugin.S
1063 1063
 
1064 1064
 	go d.execCommandGC()
1065 1065
 
1066
-	d.containerd, err = libcontainerd.NewClient(ctx, d.containerdCli, filepath.Join(config.ExecRoot, "containerd"), config.ContainerdNamespace, d)
1066
+	d.containerd, err = libcontainerd.NewClient(ctx, d.containerdCli, filepath.Join(config.ExecRoot, "containerd"), config.ContainerdNamespace, d, d.useShimV2())
1067 1067
 	if err != nil {
1068 1068
 		return nil, err
1069 1069
 	}
... ...
@@ -364,10 +364,15 @@ func (daemon *Daemon) adaptContainerSettings(hostConfig *containertypes.HostConf
364 364
 
365 365
 	// Set default cgroup namespace mode, if unset for container
366 366
 	if hostConfig.CgroupnsMode.IsEmpty() {
367
-		if hostConfig.Privileged {
367
+		// for cgroup v2: unshare cgroupns even for privileged containers
368
+		// https://github.com/containers/libpod/pull/4374#issuecomment-549776387
369
+		if hostConfig.Privileged && !cgroups.IsCgroup2UnifiedMode() {
368 370
 			hostConfig.CgroupnsMode = containertypes.CgroupnsMode("host")
369 371
 		} else {
370
-			m := config.DefaultCgroupNamespaceMode
372
+			m := "host"
373
+			if cgroups.IsCgroup2UnifiedMode() {
374
+				m = "private"
375
+			}
371 376
 			if daemon.configStore != nil {
372 377
 				m = daemon.configStore.CgroupNamespaceMode
373 378
 			}
... ...
@@ -708,8 +713,8 @@ func verifyPlatformContainerSettings(daemon *Daemon, hostConfig *containertypes.
708 708
 			warnings = append(warnings, "Your kernel does not support cgroup namespaces.  Cgroup namespace setting discarded.")
709 709
 		}
710 710
 
711
-		if hostConfig.Privileged {
712
-			return warnings, fmt.Errorf("privileged mode is incompatible with private cgroup namespaces.  You must run the container in the host cgroup namespace when running privileged mode")
711
+		if hostConfig.Privileged && !cgroups.IsCgroup2UnifiedMode() {
712
+			return warnings, fmt.Errorf("privileged mode is incompatible with private cgroup namespaces on cgroup v1 host.  You must run the container in the host cgroup namespace when running privileged mode")
713 713
 		}
714 714
 	}
715 715
 
... ...
@@ -1594,6 +1599,10 @@ func (daemon *Daemon) initCgroupsPath(path string) error {
1594 1594
 		return nil
1595 1595
 	}
1596 1596
 
1597
+	if cgroups.IsCgroup2UnifiedMode() {
1598
+		return fmt.Errorf("daemon-scoped cpu-rt-period and cpu-rt-runtime are not implemented for cgroup v2")
1599
+	}
1600
+
1597 1601
 	// Recursively create cgroup to ensure that the system and all parent cgroups have values set
1598 1602
 	// for the period and runtime as this limits what the children can be set to.
1599 1603
 	daemon.initCgroupsPath(filepath.Dir(path))
... ...
@@ -1639,3 +1648,7 @@ func (daemon *Daemon) setupSeccompProfile() error {
1639 1639
 	}
1640 1640
 	return nil
1641 1641
 }
1642
+
1643
+func (daemon *Daemon) useShimV2() bool {
1644
+	return cgroups.IsCgroup2UnifiedMode()
1645
+}
... ...
@@ -653,3 +653,7 @@ func (daemon *Daemon) initRuntimes(_ map[string]types.Runtime) error {
653 653
 
654 654
 func setupResolvConf(config *config.Config) {
655 655
 }
656
+
657
+func (daemon *Daemon) useShimV2() bool {
658
+	return true
659
+}
... ...
@@ -316,7 +316,9 @@ func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
316 316
 				return fmt.Errorf("invalid cgroup namespace mode: %v", cgroupNsMode)
317 317
 			}
318 318
 
319
-			if cgroupNsMode.IsPrivate() && !c.HostConfig.Privileged {
319
+			// for cgroup v2: unshare cgroupns even for privileged containers
320
+			// https://github.com/containers/libpod/pull/4374#issuecomment-549776387
321
+			if cgroupNsMode.IsPrivate() && (cgroups.IsCgroup2UnifiedMode() || !c.HostConfig.Privileged) {
320 322
 				nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
321 323
 				setNamespace(s, nsCgroup)
322 324
 			}
... ...
@@ -8,6 +8,7 @@ import (
8 8
 	"path/filepath"
9 9
 
10 10
 	"github.com/containerd/containerd/runtime/linux/runctypes"
11
+	v2runcoptions "github.com/containerd/containerd/runtime/v2/runc/options"
11 12
 	"github.com/docker/docker/container"
12 13
 	"github.com/docker/docker/errdefs"
13 14
 	"github.com/pkg/errors"
... ...
@@ -43,6 +44,20 @@ func (daemon *Daemon) getLibcontainerdCreateOptions(container *container.Contain
43 43
 	if err != nil {
44 44
 		return nil, err
45 45
 	}
46
+	if daemon.useShimV2() {
47
+		opts := &v2runcoptions.Options{
48
+			BinaryName: path,
49
+			Root: filepath.Join(daemon.configStore.ExecRoot,
50
+				fmt.Sprintf("runtime-%s", container.HostConfig.Runtime)),
51
+		}
52
+
53
+		if UsingSystemd(daemon.configStore) {
54
+			opts.SystemdCgroup = true
55
+		}
56
+
57
+		return opts, nil
58
+
59
+	}
46 60
 	opts := &runctypes.RuncOptions{
47 61
 		Runtime: path,
48 62
 		RuntimeRoot: filepath.Join(daemon.configStore.ExecRoot,
... ...
@@ -115,7 +115,7 @@ func TestCgroupNamespacesRunPrivilegedAndPrivate(t *testing.T) {
115 115
 	skip.If(t, !requirement.CgroupNamespacesEnabled())
116 116
 
117 117
 	// Running with both privileged and cgroupns=private is not allowed
118
-	errStr := "privileged mode is incompatible with private cgroup namespaces.  You must run the container in the host cgroup namespace when running privileged mode"
118
+	errStr := "privileged mode is incompatible with private cgroup namespaces on cgroup v1 host.  You must run the container in the host cgroup namespace when running privileged mode"
119 119
 	testCreateFailureWithCgroupNs(t, "private", errStr, container.WithPrivileged(true), container.WithCgroupnsMode("private"))
120 120
 }
121 121
 
... ...
@@ -9,6 +9,6 @@ import (
9 9
 )
10 10
 
11 11
 // NewClient creates a new libcontainerd client from a containerd client
12
-func NewClient(ctx context.Context, cli *containerd.Client, stateDir, ns string, b libcontainerdtypes.Backend) (libcontainerdtypes.Client, error) {
13
-	return remote.NewClient(ctx, cli, stateDir, ns, b)
12
+func NewClient(ctx context.Context, cli *containerd.Client, stateDir, ns string, b libcontainerdtypes.Backend, useShimV2 bool) (libcontainerdtypes.Client, error) {
13
+	return remote.NewClient(ctx, cli, stateDir, ns, b, useShimV2)
14 14
 }
... ...
@@ -11,9 +11,10 @@ import (
11 11
 )
12 12
 
13 13
 // NewClient creates a new libcontainerd client from a containerd client
14
-func NewClient(ctx context.Context, cli *containerd.Client, stateDir, ns string, b libcontainerdtypes.Backend) (libcontainerdtypes.Client, error) {
14
+func NewClient(ctx context.Context, cli *containerd.Client, stateDir, ns string, b libcontainerdtypes.Backend, useShimV2 bool) (libcontainerdtypes.Client, error) {
15 15
 	if !system.ContainerdRuntimeSupported() {
16
+		// useShimV2 is ignored for windows
16 17
 		return local.NewClient(ctx, cli, stateDir, ns, b)
17 18
 	}
18
-	return remote.NewClient(ctx, cli, stateDir, ns, b)
19
+	return remote.NewClient(ctx, cli, stateDir, ns, b, useShimV2)
19 20
 }
... ...
@@ -23,6 +23,7 @@ import (
23 23
 	"github.com/containerd/containerd/events"
24 24
 	"github.com/containerd/containerd/images"
25 25
 	"github.com/containerd/containerd/runtime/linux/runctypes"
26
+	v2runcoptions "github.com/containerd/containerd/runtime/v2/runc/options"
26 27
 	"github.com/containerd/typeurl"
27 28
 	"github.com/docker/docker/errdefs"
28 29
 	"github.com/docker/docker/libcontainerd/queue"
... ...
@@ -45,21 +46,27 @@ type client struct {
45 45
 	logger   *logrus.Entry
46 46
 	ns       string
47 47
 
48
-	backend libcontainerdtypes.Backend
49
-	eventQ  queue.Queue
50
-	oomMu   sync.Mutex
51
-	oom     map[string]bool
48
+	backend         libcontainerdtypes.Backend
49
+	eventQ          queue.Queue
50
+	oomMu           sync.Mutex
51
+	oom             map[string]bool
52
+	useShimV2       bool
53
+	v2runcoptionsMu sync.Mutex
54
+	// v2runcoptions is used for copying options specified on Create() to Start()
55
+	v2runcoptions map[string]v2runcoptions.Options
52 56
 }
53 57
 
54 58
 // NewClient creates a new libcontainerd client from a containerd client
55
-func NewClient(ctx context.Context, cli *containerd.Client, stateDir, ns string, b libcontainerdtypes.Backend) (libcontainerdtypes.Client, error) {
59
+func NewClient(ctx context.Context, cli *containerd.Client, stateDir, ns string, b libcontainerdtypes.Backend, useShimV2 bool) (libcontainerdtypes.Client, error) {
56 60
 	c := &client{
57
-		client:   cli,
58
-		stateDir: stateDir,
59
-		logger:   logrus.WithField("module", "libcontainerd").WithField("namespace", ns),
60
-		ns:       ns,
61
-		backend:  b,
62
-		oom:      make(map[string]bool),
61
+		client:        cli,
62
+		stateDir:      stateDir,
63
+		logger:        logrus.WithField("module", "libcontainerd").WithField("namespace", ns),
64
+		ns:            ns,
65
+		backend:       b,
66
+		oom:           make(map[string]bool),
67
+		useShimV2:     useShimV2,
68
+		v2runcoptions: make(map[string]v2runcoptions.Options),
63 69
 	}
64 70
 
65 71
 	go c.processEventStream(ctx, ns)
... ...
@@ -126,9 +133,13 @@ func (c *client) Create(ctx context.Context, id string, ociSpec *specs.Spec, run
126 126
 	bdir := c.bundleDir(id)
127 127
 	c.logger.WithField("bundle", bdir).WithField("root", ociSpec.Root.Path).Debug("bundle dir created")
128 128
 
129
+	rt := runtimeName
130
+	if c.useShimV2 {
131
+		rt = shimV2RuntimeName
132
+	}
129 133
 	newOpts := []containerd.NewContainerOpts{
130 134
 		containerd.WithSpec(ociSpec),
131
-		containerd.WithRuntime(runtimeName, runtimeOptions),
135
+		containerd.WithRuntime(rt, runtimeOptions),
132 136
 		WithBundle(bdir, ociSpec),
133 137
 	}
134 138
 	opts = append(opts, newOpts...)
... ...
@@ -140,6 +151,13 @@ func (c *client) Create(ctx context.Context, id string, ociSpec *specs.Spec, run
140 140
 		}
141 141
 		return wrapError(err)
142 142
 	}
143
+	if c.useShimV2 {
144
+		if x, ok := runtimeOptions.(*v2runcoptions.Options); ok {
145
+			c.v2runcoptionsMu.Lock()
146
+			c.v2runcoptions[id] = *x
147
+			c.v2runcoptionsMu.Unlock()
148
+		}
149
+	}
143 150
 	return nil
144 151
 }
145 152
 
... ...
@@ -200,11 +218,26 @@ func (c *client) Start(ctx context.Context, id, checkpointDir string, withStdin
200 200
 
201 201
 	if runtime.GOOS != "windows" {
202 202
 		taskOpts = append(taskOpts, func(_ context.Context, _ *containerd.Client, info *containerd.TaskInfo) error {
203
-			info.Options = &runctypes.CreateOptions{
204
-				IoUid:       uint32(uid),
205
-				IoGid:       uint32(gid),
206
-				NoPivotRoot: os.Getenv("DOCKER_RAMDISK") != "",
203
+			if c.useShimV2 {
204
+				// For v2, we need to inherit options specified on Create
205
+				c.v2runcoptionsMu.Lock()
206
+				opts, ok := c.v2runcoptions[id]
207
+				c.v2runcoptionsMu.Unlock()
208
+				if !ok {
209
+					opts = v2runcoptions.Options{}
210
+				}
211
+				opts.IoUid = uint32(uid)
212
+				opts.IoGid = uint32(gid)
213
+				opts.NoPivotRoot = os.Getenv("DOCKER_RAMDISK") != ""
214
+				info.Options = &opts
215
+			} else {
216
+				info.Options = &runctypes.CreateOptions{
217
+					IoUid:       uint32(uid),
218
+					IoGid:       uint32(gid),
219
+					NoPivotRoot: os.Getenv("DOCKER_RAMDISK") != "",
220
+				}
207 221
 			}
222
+
208 223
 			return nil
209 224
 		})
210 225
 	} else {
... ...
@@ -466,6 +499,9 @@ func (c *client) Delete(ctx context.Context, containerID string) error {
466 466
 	c.oomMu.Lock()
467 467
 	delete(c.oom, containerID)
468 468
 	c.oomMu.Unlock()
469
+	c.v2runcoptionsMu.Lock()
470
+	delete(c.v2runcoptions, containerID)
471
+	c.v2runcoptionsMu.Unlock()
469 472
 	if os.Getenv("LIBCONTAINERD_NOCLEAN") != "1" {
470 473
 		if err := os.RemoveAll(bundle); err != nil {
471 474
 			c.logger.WithError(err).WithFields(logrus.Fields{
... ...
@@ -16,7 +16,10 @@ import (
16 16
 	"github.com/sirupsen/logrus"
17 17
 )
18 18
 
19
-const runtimeName = "io.containerd.runtime.v1.linux"
19
+const (
20
+	runtimeName       = "io.containerd.runtime.v1.linux"
21
+	shimV2RuntimeName = "io.containerd.runc.v2"
22
+)
20 23
 
21 24
 func summaryFromInterface(i interface{}) (*libcontainerdtypes.Summary, error) {
22 25
 	return &libcontainerdtypes.Summary{}, nil
... ...
@@ -16,7 +16,10 @@ import (
16 16
 	"github.com/sirupsen/logrus"
17 17
 )
18 18
 
19
-const runtimeName = "io.containerd.runhcs.v1"
19
+const (
20
+	runtimeName       = "io.containerd.runhcs.v1"
21
+	shimV2RuntimeName = runtimeName
22
+)
20 23
 
21 24
 func summaryFromInterface(i interface{}) (*libcontainerdtypes.Summary, error) {
22 25
 	switch pd := i.(type) {
... ...
@@ -60,6 +60,9 @@ func New(quiet bool) *SysInfo {
60 60
 		w := o(sysInfo, cgMounts)
61 61
 		warnings = append(warnings, w...)
62 62
 	}
63
+	if cgroups.IsCgroup2UnifiedMode() {
64
+		warnings = append(warnings, "Your system is running cgroup v2 (unsupported)")
65
+	}
63 66
 	if !quiet {
64 67
 		for _, w := range warnings {
65 68
 			logrus.Warn(w)
... ...
@@ -70,6 +73,15 @@ func New(quiet bool) *SysInfo {
70 70
 
71 71
 // applyMemoryCgroupInfo reads the memory information from the memory cgroup mount point.
72 72
 func applyMemoryCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
73
+	if cgroups.IsCgroup2UnifiedMode() {
74
+		// TODO: check cgroup2 info correctly
75
+		info.MemoryLimit = true
76
+		info.SwapLimit = true
77
+		info.MemoryReservation = true
78
+		info.OomKillDisable = true
79
+		info.MemorySwappiness = true
80
+		return nil
81
+	}
73 82
 	var warnings []string
74 83
 	mountPoint, ok := cgMounts["memory"]
75 84
 	if !ok {
... ...
@@ -108,6 +120,15 @@ func applyMemoryCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
108 108
 
109 109
 // applyCPUCgroupInfo reads the cpu information from the cpu cgroup mount point.
110 110
 func applyCPUCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
111
+	if cgroups.IsCgroup2UnifiedMode() {
112
+		// TODO: check cgroup2 info correctly
113
+		info.CPUShares = true
114
+		info.CPUCfsPeriod = true
115
+		info.CPUCfsQuota = true
116
+		info.CPURealtimePeriod = true
117
+		info.CPURealtimeRuntime = true
118
+		return nil
119
+	}
111 120
 	var warnings []string
112 121
 	mountPoint, ok := cgMounts["cpu"]
113 122
 	if !ok {
... ...
@@ -145,6 +166,15 @@ func applyCPUCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
145 145
 
146 146
 // applyBlkioCgroupInfo reads the blkio information from the blkio cgroup mount point.
147 147
 func applyBlkioCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
148
+	if cgroups.IsCgroup2UnifiedMode() {
149
+		// TODO: check cgroup2 info correctly
150
+		info.BlkioWeight = true
151
+		info.BlkioReadBpsDevice = true
152
+		info.BlkioWriteBpsDevice = true
153
+		info.BlkioReadIOpsDevice = true
154
+		info.BlkioWriteIOpsDevice = true
155
+		return nil
156
+	}
148 157
 	var warnings []string
149 158
 	mountPoint, ok := cgMounts["blkio"]
150 159
 	if !ok {
... ...
@@ -186,6 +216,11 @@ func applyBlkioCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
186 186
 
187 187
 // applyCPUSetCgroupInfo reads the cpuset information from the cpuset cgroup mount point.
188 188
 func applyCPUSetCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
189
+	if cgroups.IsCgroup2UnifiedMode() {
190
+		// TODO: check cgroup2 info correctly
191
+		info.Cpuset = true
192
+		return nil
193
+	}
189 194
 	var warnings []string
190 195
 	mountPoint, ok := cgMounts["cpuset"]
191 196
 	if !ok {
... ...
@@ -213,6 +248,11 @@ func applyCPUSetCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
213 213
 
214 214
 // applyPIDSCgroupInfo reads the pids information from the pids cgroup mount point.
215 215
 func applyPIDSCgroupInfo(info *SysInfo, _ map[string]string) []string {
216
+	if cgroups.IsCgroup2UnifiedMode() {
217
+		// TODO: check cgroup2 info correctly
218
+		info.PidsLimit = true
219
+		return nil
220
+	}
216 221
 	var warnings []string
217 222
 	_, err := cgroups.FindCgroupMountpoint("", "pids")
218 223
 	if err != nil {
... ...
@@ -225,6 +265,11 @@ func applyPIDSCgroupInfo(info *SysInfo, _ map[string]string) []string {
225 225
 
226 226
 // applyDevicesCgroupInfo reads the pids information from the devices cgroup mount point.
227 227
 func applyDevicesCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
228
+	if cgroups.IsCgroup2UnifiedMode() {
229
+		// TODO: check cgroup2 info correctly
230
+		info.CgroupDevicesEnabled = true
231
+		return nil
232
+	}
228 233
 	var warnings []string
229 234
 	_, ok := cgMounts["devices"]
230 235
 	info.CgroupDevicesEnabled = ok
... ...
@@ -26,13 +26,13 @@ type ExitHandler interface {
26 26
 }
27 27
 
28 28
 // New creates a new containerd plugin executor
29
-func New(ctx context.Context, rootDir string, cli *containerd.Client, ns string, exitHandler ExitHandler) (*Executor, error) {
29
+func New(ctx context.Context, rootDir string, cli *containerd.Client, ns string, exitHandler ExitHandler, useShimV2 bool) (*Executor, error) {
30 30
 	e := &Executor{
31 31
 		rootDir:     rootDir,
32 32
 		exitHandler: exitHandler,
33 33
 	}
34 34
 
35
-	client, err := libcontainerd.NewClient(ctx, cli, rootDir, ns, e)
35
+	client, err := libcontainerd.NewClient(ctx, cli, rootDir, ns, e, useShimV2)
36 36
 	if err != nil {
37 37
 		return nil, errors.Wrap(err, "error creating containerd exec client")
38 38
 	}