Browse code

Start containers in their own cgroup namespaces

This is enabled for all containers that are not run with --privileged,
if the kernel supports it.

Fixes #38332

Signed-off-by: Rob Gulewich <rgulewich@netflix.com>

Rob Gulewich authored on 2018/12/15 08:07:19
Showing 10 changed files
... ...
@@ -81,26 +81,27 @@ var (
81 81
 
82 82
 // Daemon holds information about the Docker daemon.
83 83
 type Daemon struct {
84
-	ID                string
85
-	repository        string
86
-	containers        container.Store
87
-	containersReplica container.ViewDB
88
-	execCommands      *exec.Store
89
-	imageService      *images.ImageService
90
-	idIndex           *truncindex.TruncIndex
91
-	configStore       *config.Config
92
-	statsCollector    *stats.Collector
93
-	defaultLogConfig  containertypes.LogConfig
94
-	RegistryService   registry.Service
95
-	EventsService     *events.Events
96
-	netController     libnetwork.NetworkController
97
-	volumes           *volumesservice.VolumesService
98
-	discoveryWatcher  discovery.Reloader
99
-	root              string
100
-	seccompEnabled    bool
101
-	apparmorEnabled   bool
102
-	shutdown          bool
103
-	idMapping         *idtools.IdentityMapping
84
+	ID                      string
85
+	repository              string
86
+	containers              container.Store
87
+	containersReplica       container.ViewDB
88
+	execCommands            *exec.Store
89
+	imageService            *images.ImageService
90
+	idIndex                 *truncindex.TruncIndex
91
+	configStore             *config.Config
92
+	statsCollector          *stats.Collector
93
+	defaultLogConfig        containertypes.LogConfig
94
+	RegistryService         registry.Service
95
+	EventsService           *events.Events
96
+	netController           libnetwork.NetworkController
97
+	volumes                 *volumesservice.VolumesService
98
+	discoveryWatcher        discovery.Reloader
99
+	root                    string
100
+	seccompEnabled          bool
101
+	apparmorEnabled         bool
102
+	cgroupNamespacesEnabled bool
103
+	shutdown                bool
104
+	idMapping               *idtools.IdentityMapping
104 105
 	// TODO: move graphDrivers field to an InfoService
105 106
 	graphDrivers map[string]string // By operating system
106 107
 
... ...
@@ -1020,6 +1021,7 @@ func NewDaemon(ctx context.Context, config *config.Config, pluginStore *plugin.S
1020 1020
 	d.idMapping = idMapping
1021 1021
 	d.seccompEnabled = sysInfo.Seccomp
1022 1022
 	d.apparmorEnabled = sysInfo.AppArmor
1023
+	d.cgroupNamespacesEnabled = sysInfo.CgroupNamespaces
1023 1024
 
1024 1025
 	d.linkIndex = newLinkIndex()
1025 1026
 
... ...
@@ -307,8 +307,13 @@ func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
307 307
 			s.Hostname = ""
308 308
 		}
309 309
 
310
-		return nil
310
+	// cgroup
311
+	if daemon.cgroupNamespacesEnabled && !c.HostConfig.Privileged {
312
+		nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
313
+		setNamespace(s, nsCgroup)
311 314
 	}
315
+
316
+	return nil
312 317
 }
313 318
 
314 319
 func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
... ...
@@ -3984,18 +3984,40 @@ func (s *DockerSuite) TestBuildContainerWithCgroupParent(c *check.C) {
3984 3984
 	if !found {
3985 3985
 		c.Fatalf("unable to find self memory cgroup path. CgroupsPath: %v", selfCgroupPaths)
3986 3986
 	}
3987
-	result := buildImage("buildcgroupparent",
3988
-		cli.WithFlags("--cgroup-parent", cgroupParent),
3989
-		build.WithDockerfile(`
3987
+
3988
+	doneCh := make(chan string)
3989
+
3990
+	// If cgroup namespaces are enabled, then processes running inside the container won't
3991
+	// be able to see the parent namespace. Check that they have the correct parents from
3992
+	// the host, which has the non-namespaced view of the hierarchy.
3993
+
3994
+	go func() {
3995
+		result := buildImage("buildcgroupparent",
3996
+			cli.WithFlags("--cgroup-parent", cgroupParent),
3997
+			build.WithDockerfile(`
3990 3998
 FROM busybox
3991
-RUN cat /proc/self/cgroup
3992
-`))
3993
-	result.Assert(c, icmd.Success)
3994
-	m, err := regexp.MatchString(fmt.Sprintf("memory:.*/%s/.*", cgroupParent), result.Combined())
3995
-	assert.NilError(c, err)
3999
+RUN sleep 10
4000
+			`))
4001
+		result.Assert(c, icmd.Success)
4002
+		doneCh <- "done"
4003
+	}()
4004
+
4005
+	// Wait until the build is well into the sleep
4006
+	time.Sleep(3 * time.Second)
4007
+	out, _, err := dockerCmdWithError("ps", "-q", "-l")
4008
+	c.Assert(err, check.IsNil)
4009
+	cID := strings.TrimSpace(out)
4010
+
4011
+	pid := inspectField(c, cID, "State.Pid")
4012
+	paths := ReadCgroupPathsForPid(c, pid)
4013
+	m, err := regexp.MatchString(fmt.Sprintf("memory:.*/%s/.*", cgroupParent), paths)
4014
+	c.Assert(err, check.IsNil)
3996 4015
 	if !m {
3997
-		c.Fatalf("There is no expected memory cgroup with parent /%s/: %s", cgroupParent, result.Combined())
4016
+		c.Fatalf("There is no expected memory cgroup with parent /%s/: %s", cgroupParent, paths)
3998 4017
 	}
4018
+
4019
+	// Wait for the build to complete, otherwise it will exit with an error
4020
+	<-doneCh
3999 4021
 }
4000 4022
 
4001 4023
 // FIXME(vdemeester) could be a unit test
... ...
@@ -1787,7 +1787,8 @@ func (s *DockerDaemonSuite) TestDaemonRestartContainerLinksRestart(c *check.C) {
1787 1787
 }
1788 1788
 
1789 1789
 func (s *DockerDaemonSuite) TestDaemonCgroupParent(c *check.C) {
1790
-	testRequires(c, DaemonIsLinux)
1790
+	// Test requires local filesystem access on a Linux host
1791
+	testRequires(c, DaemonIsLinux, testEnv.IsLocalDaemon)
1791 1792
 
1792 1793
 	cgroupParent := "test"
1793 1794
 	name := "cgroup-test"
... ...
@@ -1795,10 +1796,20 @@ func (s *DockerDaemonSuite) TestDaemonCgroupParent(c *check.C) {
1795 1795
 	s.d.StartWithBusybox(c, "--cgroup-parent", cgroupParent)
1796 1796
 	defer s.d.Restart(c)
1797 1797
 
1798
-	out, err := s.d.Cmd("run", "--name", name, "busybox", "cat", "/proc/self/cgroup")
1799
-	assert.NilError(c, err)
1800
-	cgroupPaths := ParseCgroupPaths(string(out))
1801
-	c.Assert(len(cgroupPaths), checker.Not(checker.Equals), 0, check.Commentf("unexpected output - %q", string(out)))
1798
+	out, err := s.d.Cmd("run", "--name", name, "-d", "busybox", "top")
1799
+	c.Assert(err, checker.IsNil)
1800
+
1801
+	// If cgroup namespaces are enabled, then processes running inside the container won't
1802
+	// be able to see the parent namespace. Check that they have the correct parents from
1803
+	// the host, which has the non-namespaced view of the hierarchy.
1804
+
1805
+	pid, err := s.d.Cmd("inspect", "-f", "{{.State.Pid}}", name)
1806
+	c.Assert(err, checker.IsNil)
1807
+	pid = strings.TrimSpace(string(pid))
1808
+	paths := ReadCgroupPathsForPid(c, pid)
1809
+	cgroupPaths := ParseCgroupPaths(paths)
1810
+	c.Assert(len(cgroupPaths), checker.Not(checker.Equals), 0, check.Commentf("unexpected output - %q", paths))
1811
+
1802 1812
 	out, err = s.d.Cmd("inspect", "-f", "{{.Id}}", name)
1803 1813
 	assert.NilError(c, err)
1804 1814
 	id := strings.TrimSpace(string(out))
... ...
@@ -3241,8 +3241,8 @@ func (s *DockerSuite) TestRunWithUlimits(c *check.C) {
3241 3241
 }
3242 3242
 
3243 3243
 func (s *DockerSuite) TestRunContainerWithCgroupParent(c *check.C) {
3244
-	// Not applicable on Windows as uses Unix specific functionality
3245
-	testRequires(c, DaemonIsLinux)
3244
+	// Test requires local filesystem access on a Linux host
3245
+	testRequires(c, DaemonIsLinux, testEnv.IsLocalDaemon)
3246 3246
 
3247 3247
 	// cgroup-parent relative path
3248 3248
 	testRunContainerWithCgroupParent(c, "test", "cgroup-test")
... ...
@@ -3252,14 +3252,23 @@ func (s *DockerSuite) TestRunContainerWithCgroupParent(c *check.C) {
3252 3252
 }
3253 3253
 
3254 3254
 func testRunContainerWithCgroupParent(c *check.C, cgroupParent, name string) {
3255
-	out, _, err := dockerCmdWithError("run", "--cgroup-parent", cgroupParent, "--name", name, "busybox", "cat", "/proc/self/cgroup")
3255
+	out, _, err := dockerCmdWithError("run", "--cgroup-parent", cgroupParent, "--name", name, "-d", "busybox", "top")
3256 3256
 	if err != nil {
3257 3257
 		c.Fatalf("unexpected failure when running container with --cgroup-parent option - %s\n%v", string(out), err)
3258 3258
 	}
3259
-	cgroupPaths := ParseCgroupPaths(string(out))
3259
+	cID := strings.TrimSpace(out)
3260
+
3261
+	// If cgroup namespaces are enabled, then processes running inside the container won't
3262
+	// be able to see the parent namespace. Check that they have the correct parents from
3263
+	// the host, which has the non-namespaced view of the hierarchy.
3264
+
3265
+	pid := inspectField(c, cID, "State.Pid")
3266
+	paths := ReadCgroupPathsForPid(c, pid)
3267
+	cgroupPaths := ParseCgroupPaths(paths)
3260 3268
 	if len(cgroupPaths) == 0 {
3261
-		c.Fatalf("unexpected output - %q", string(out))
3269
+		c.Fatalf("unexpected output - %q", string(paths))
3262 3270
 	}
3271
+
3263 3272
 	id := getIDByName(c, name)
3264 3273
 	expectedCgroup := path.Join(cgroupParent, id)
3265 3274
 	found := false
... ...
@@ -3285,21 +3294,29 @@ func (s *DockerSuite) TestRunInvalidCgroupParent(c *check.C) {
3285 3285
 }
3286 3286
 
3287 3287
 func testRunInvalidCgroupParent(c *check.C, cgroupParent, cleanCgroupParent, name string) {
3288
-	out, _, err := dockerCmdWithError("run", "--cgroup-parent", cgroupParent, "--name", name, "busybox", "cat", "/proc/self/cgroup")
3288
+	out, _, err := dockerCmdWithError("run", "--cgroup-parent", cgroupParent, "--name", name, "-d", "busybox", "top")
3289 3289
 	if err != nil {
3290 3290
 		// XXX: This may include a daemon crash.
3291 3291
 		c.Fatalf("unexpected failure when running container with --cgroup-parent option - %s\n%v", string(out), err)
3292 3292
 	}
3293
+	cID := strings.TrimSpace(out)
3293 3294
 
3294 3295
 	// We expect "/SHOULD_NOT_EXIST" to not exist. If not, we have a security issue.
3295 3296
 	if _, err := os.Stat("/SHOULD_NOT_EXIST"); err == nil || !os.IsNotExist(err) {
3296 3297
 		c.Fatalf("SECURITY: --cgroup-parent with ../../ relative paths cause files to be created in the host (this is bad) !!")
3297 3298
 	}
3298 3299
 
3299
-	cgroupPaths := ParseCgroupPaths(string(out))
3300
+	// If cgroup namespaces are enabled, then processes running inside the container won't
3301
+	// be able to see the parent namespace. Check that they have the correct parents from
3302
+	// the host, which has the non-namespaced view of the hierarchy.
3303
+
3304
+	pid := inspectField(c, cID, "State.Pid")
3305
+	paths := ReadCgroupPathsForPid(c, pid)
3306
+	cgroupPaths := ParseCgroupPaths(paths)
3300 3307
 	if len(cgroupPaths) == 0 {
3301
-		c.Fatalf("unexpected output - %q", string(out))
3308
+		c.Fatalf("unexpected output - %q", string(paths))
3302 3309
 	}
3310
+
3303 3311
 	id := getIDByName(c, name)
3304 3312
 	expectedCgroup := path.Join(cleanCgroupParent, id)
3305 3313
 	found := false
... ...
@@ -2,6 +2,7 @@ package main
2 2
 
3 3
 import (
4 4
 	"fmt"
5
+	"io/ioutil"
5 6
 	"os"
6 7
 	"os/exec"
7 8
 	"path/filepath"
... ...
@@ -38,6 +39,17 @@ func transformCmd(execCmd *exec.Cmd) icmd.Cmd {
38 38
 	}
39 39
 }
40 40
 
41
+// ReadCgroupPathsForPid reads the cgroup path file for a pid in '/proc/<pid>/cgroup'
42
+func ReadCgroupPathsForPid(c *check.C, pid string) string {
43
+	cgroupFile := fmt.Sprintf("/proc/%s/cgroup", pid)
44
+	out, err := ioutil.ReadFile(cgroupFile)
45
+	if err != nil {
46
+		c.Fatalf("unexpected failure when reading cgroup file %s\n%v", cgroupFile, err)
47
+	}
48
+
49
+	return string(out)
50
+}
51
+
41 52
 // ParseCgroupPaths parses 'procCgroupData', which is output of '/proc/<pid>/cgroup', and returns
42 53
 // a map which cgroup name as key and path as value.
43 54
 func ParseCgroupPaths(procCgroupData string) map[string]string {
... ...
@@ -2,6 +2,10 @@ package container // import "github.com/docker/docker/integration/container"
2 2
 
3 3
 import (
4 4
 	"context"
5
+	"fmt"
6
+	"io/ioutil"
7
+	"os"
8
+	"path/filepath"
5 9
 	"strconv"
6 10
 	"strings"
7 11
 	"testing"
... ...
@@ -93,3 +97,32 @@ func TestNISDomainname(t *testing.T) {
93 93
 	assert.Equal(t, 0, res.ExitCode)
94 94
 	assert.Check(t, is.Equal(domainname, strings.TrimSpace(res.Stdout())))
95 95
 }
96
+
97
+func TestCgroupNamespaces(t *testing.T) {
98
+	skip.If(t, testEnv.DaemonInfo.OSType != "linux")
99
+	skip.If(t, testEnv.IsRemoteDaemon())
100
+
101
+	if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
102
+		t.Skip("cgroup namespaces are unsupported")
103
+	}
104
+
105
+	defer setupTest(t)()
106
+	client := testEnv.APIClient()
107
+	ctx := context.Background()
108
+
109
+	cID := container.Run(t, ctx, client)
110
+	poll.WaitOn(t, container.IsInState(ctx, client, cID, "running"), poll.WithDelay(100*time.Millisecond))
111
+
112
+	path := filepath.Join(os.Getenv("DEST"), "docker.pid")
113
+	b, err := ioutil.ReadFile(path)
114
+	assert.NilError(t, err)
115
+	link, err := os.Readlink(fmt.Sprintf("/proc/%s/ns/cgroup", string(b)))
116
+	assert.NilError(t, err)
117
+
118
+	// Check that the container's cgroup doesn't match the docker daemon's
119
+	res, err := container.Exec(ctx, client, cID, []string{"readlink", "/proc/1/ns/cgroup"})
120
+	assert.NilError(t, err)
121
+	assert.Assert(t, is.Len(res.Stderr(), 0))
122
+	assert.Equal(t, 0, res.ExitCode)
123
+	assert.Assert(t, link != strings.TrimSpace(res.Stdout()))
124
+}
... ...
@@ -16,6 +16,9 @@ type SysInfo struct {
16 16
 	cgroupCpusetInfo
17 17
 	cgroupPids
18 18
 
19
+	// Whether the kernel supports cgroup namespaces or not
20
+	CgroupNamespaces bool
21
+
19 22
 	// Whether IPv4 forwarding is supported or not, if this was disabled, networking will not work
20 23
 	IPv4ForwardingDisabled bool
21 24
 
... ...
@@ -53,6 +53,7 @@ func New(quiet bool) *SysInfo {
53 53
 		applyNetworkingInfo,
54 54
 		applyAppArmorInfo,
55 55
 		applySeccompInfo,
56
+		applyCgroupNsInfo,
56 57
 	}...)
57 58
 
58 59
 	for _, o := range ops {
... ...
@@ -250,6 +251,15 @@ func applyAppArmorInfo(info *SysInfo, _ map[string]string) []string {
250 250
 	return warnings
251 251
 }
252 252
 
253
+// applyCgroupNsInfo adds cgroup namespace information to the info.
254
+func applyCgroupNsInfo(info *SysInfo, _ map[string]string) []string {
255
+	var warnings []string
256
+	if _, err := os.Stat("/proc/self/ns/cgroup"); !os.IsNotExist(err) {
257
+		info.CgroupNamespaces = true
258
+	}
259
+	return warnings
260
+}
261
+
253 262
 // applySeccompInfo checks if Seccomp is supported, via CONFIG_SECCOMP.
254 263
 func applySeccompInfo(info *SysInfo, _ map[string]string) []string {
255 264
 	var warnings []string
... ...
@@ -96,6 +96,26 @@ func TestNewAppArmorDisabled(t *testing.T) {
96 96
 	assert.Assert(t, !sysInfo.AppArmor)
97 97
 }
98 98
 
99
+func TestNewCgroupNamespacesEnabled(t *testing.T) {
100
+	// If cgroup namespaces are supported in the kernel, then sysInfo.CgroupNamespaces should be TRUE
101
+	if _, err := os.Stat("/proc/self/ns/cgroup"); err != nil {
102
+		t.Skip("cgroup namespaces must be enabled")
103
+	}
104
+
105
+	sysInfo := New(true)
106
+	assert.Assert(t, sysInfo.CgroupNamespaces)
107
+}
108
+
109
+func TestNewCgroupNamespacesDisabled(t *testing.T) {
110
+	// If cgroup namespaces are *not* supported in the kernel, then sysInfo.CgroupNamespaces should be FALSE
111
+	if _, err := os.Stat("/proc/self/ns/cgroup"); !os.IsNotExist(err) {
112
+		t.Skip("cgroup namespaces must be disabled")
113
+	}
114
+
115
+	sysInfo := New(true)
116
+	assert.Assert(t, !sysInfo.CgroupNamespaces)
117
+}
118
+
99 119
 func TestNumCPU(t *testing.T) {
100 120
 	cpuNumbers := NumCPU()
101 121
 	if cpuNumbers <= 0 {