Browse code

Merge pull request #5411 from crosbymichael/lockdown

Update default restrictions for exec drivers

unclejack authored on 2014/04/26 09:27:56
Showing 23 changed files
... ...
@@ -59,9 +59,10 @@ func init() {
59 59
 }
60 60
 
61 61
 type driver struct {
62
-	root       string // root path for the driver to use
63
-	apparmor   bool
64
-	sharedRoot bool
62
+	root            string // root path for the driver to use
63
+	apparmor        bool
64
+	sharedRoot      bool
65
+	restrictionPath string
65 66
 }
66 67
 
67 68
 func NewDriver(root string, apparmor bool) (*driver, error) {
... ...
@@ -69,10 +70,15 @@ func NewDriver(root string, apparmor bool) (*driver, error) {
69 69
 	if err := linkLxcStart(root); err != nil {
70 70
 		return nil, err
71 71
 	}
72
+	restrictionPath := filepath.Join(root, "empty")
73
+	if err := os.MkdirAll(restrictionPath, 0700); err != nil {
74
+		return nil, err
75
+	}
72 76
 	return &driver{
73
-		apparmor:   apparmor,
74
-		root:       root,
75
-		sharedRoot: rootIsShared(),
77
+		apparmor:        apparmor,
78
+		root:            root,
79
+		sharedRoot:      rootIsShared(),
80
+		restrictionPath: restrictionPath,
76 81
 	}, nil
77 82
 }
78 83
 
... ...
@@ -403,14 +409,16 @@ func (d *driver) generateLXCConfig(c *execdriver.Command) (string, error) {
403 403
 
404 404
 	if err := LxcTemplateCompiled.Execute(fo, struct {
405 405
 		*execdriver.Command
406
-		AppArmor     bool
407
-		ProcessLabel string
408
-		MountLabel   string
406
+		AppArmor          bool
407
+		ProcessLabel      string
408
+		MountLabel        string
409
+		RestrictionSource string
409 410
 	}{
410
-		Command:      c,
411
-		AppArmor:     d.apparmor,
412
-		ProcessLabel: process,
413
-		MountLabel:   mount,
411
+		Command:           c,
412
+		AppArmor:          d.apparmor,
413
+		ProcessLabel:      process,
414
+		MountLabel:        mount,
415
+		RestrictionSource: d.restrictionPath,
414 416
 	}); err != nil {
415 417
 		return "", err
416 418
 	}
... ...
@@ -88,7 +88,9 @@ lxc.mount.entry = proc {{escapeFstabSpaces $ROOTFS}}/proc proc nosuid,nodev,noex
88 88
 
89 89
 # WARNING: sysfs is a known attack vector and should probably be disabled
90 90
 # if your userspace allows it. eg. see http://bit.ly/T9CkqJ
91
+{{if .Privileged}}
91 92
 lxc.mount.entry = sysfs {{escapeFstabSpaces $ROOTFS}}/sys sysfs nosuid,nodev,noexec 0 0
93
+{{end}}
92 94
 
93 95
 {{if .Tty}}
94 96
 lxc.mount.entry = {{.Console}} {{escapeFstabSpaces $ROOTFS}}/dev/console none bind,rw 0 0
... ...
@@ -109,8 +111,15 @@ lxc.mount.entry = {{$value.Source}} {{escapeFstabSpaces $ROOTFS}}/{{escapeFstabS
109 109
 {{if .AppArmor}}
110 110
 lxc.aa_profile = unconfined
111 111
 {{else}}
112
-#lxc.aa_profile = unconfined
112
+# not unconfined
113 113
 {{end}}
114
+{{else}}
115
+# restrict access to proc
116
+lxc.mount.entry = {{.RestrictionSource}} {{escapeFstabSpaces $ROOTFS}}/proc/sys none bind,ro 0 0
117
+lxc.mount.entry = {{.RestrictionSource}} {{escapeFstabSpaces $ROOTFS}}/proc/irq none bind,ro 0 0
118
+lxc.mount.entry = {{.RestrictionSource}} {{escapeFstabSpaces $ROOTFS}}/proc/acpi none bind,ro 0 0
119
+lxc.mount.entry = {{escapeFstabSpaces $ROOTFS}}/dev/null {{escapeFstabSpaces $ROOTFS}}/proc/sysrq-trigger none bind,ro 0 0
120
+lxc.mount.entry = {{escapeFstabSpaces $ROOTFS}}/dev/null {{escapeFstabSpaces $ROOTFS}}/proc/kcore none bind,ro 0 0
114 121
 {{end}}
115 122
 
116 123
 # limits
... ...
@@ -25,6 +25,7 @@ func (d *driver) createContainer(c *execdriver.Command) (*libcontainer.Container
25 25
 	container.Cgroups.Name = c.ID
26 26
 	// check to see if we are running in ramdisk to disable pivot root
27 27
 	container.NoPivotRoot = os.Getenv("DOCKER_RAMDISK") != ""
28
+	container.Context["restriction_path"] = d.restrictionPath
28 29
 
29 30
 	if err := d.createNetwork(container, c); err != nil {
30 31
 		return nil, err
... ...
@@ -33,6 +34,8 @@ func (d *driver) createContainer(c *execdriver.Command) (*libcontainer.Container
33 33
 		if err := d.setPrivileged(container); err != nil {
34 34
 			return nil, err
35 35
 		}
36
+	} else {
37
+		container.Mounts = append(container.Mounts, libcontainer.Mount{Type: "devtmpfs"})
36 38
 	}
37 39
 	if err := d.setupCgroups(container, c); err != nil {
38 40
 		return nil, err
... ...
@@ -81,6 +84,11 @@ func (d *driver) setPrivileged(container *libcontainer.Container) error {
81 81
 		c.Enabled = true
82 82
 	}
83 83
 	container.Cgroups.DeviceAccess = true
84
+
85
+	// add sysfs as a mount for privileged containers
86
+	container.Mounts = append(container.Mounts, libcontainer.Mount{Type: "sysfs"})
87
+	delete(container.Context, "restriction_path")
88
+
84 89
 	if apparmor.IsEnabled() {
85 90
 		container.Context["apparmor_profile"] = "unconfined"
86 91
 	}
... ...
@@ -99,7 +107,13 @@ func (d *driver) setupCgroups(container *libcontainer.Container, c *execdriver.C
99 99
 
100 100
 func (d *driver) setupMounts(container *libcontainer.Container, c *execdriver.Command) error {
101 101
 	for _, m := range c.Mounts {
102
-		container.Mounts = append(container.Mounts, libcontainer.Mount{m.Source, m.Destination, m.Writable, m.Private})
102
+		container.Mounts = append(container.Mounts, libcontainer.Mount{
103
+			Type:        "bind",
104
+			Source:      m.Source,
105
+			Destination: m.Destination,
106
+			Writable:    m.Writable,
107
+			Private:     m.Private,
108
+		})
103 109
 	}
104 110
 	return nil
105 111
 }
... ...
@@ -23,7 +23,7 @@ import (
23 23
 
24 24
 const (
25 25
 	DriverName                = "native"
26
-	Version                   = "0.1"
26
+	Version                   = "0.2"
27 27
 	BackupApparmorProfilePath = "apparmor/docker.back" // relative to docker root
28 28
 )
29 29
 
... ...
@@ -62,6 +62,7 @@ type driver struct {
62 62
 	root             string
63 63
 	initPath         string
64 64
 	activeContainers map[string]*exec.Cmd
65
+	restrictionPath  string
65 66
 }
66 67
 
67 68
 func NewDriver(root, initPath string) (*driver, error) {
... ...
@@ -72,8 +73,14 @@ func NewDriver(root, initPath string) (*driver, error) {
72 72
 	if err := apparmor.InstallDefaultProfile(filepath.Join(root, "../..", BackupApparmorProfilePath)); err != nil {
73 73
 		return nil, err
74 74
 	}
75
+	restrictionPath := filepath.Join(root, "empty")
76
+	if err := os.MkdirAll(restrictionPath, 0700); err != nil {
77
+		return nil, err
78
+	}
79
+
75 80
 	return &driver{
76 81
 		root:             root,
82
+		restrictionPath:  restrictionPath,
77 83
 		initPath:         initPath,
78 84
 		activeContainers: make(map[string]*exec.Cmd),
79 85
 	}, nil
... ...
@@ -665,3 +665,25 @@ func TestUnPrivilegedCannotMount(t *testing.T) {
665 665
 
666 666
 	logDone("run - test un-privileged cannot mount")
667 667
 }
668
+
669
+func TestSysNotAvaliableInNonPrivilegedContainers(t *testing.T) {
670
+	cmd := exec.Command(dockerBinary, "run", "busybox", "ls", "/sys/kernel")
671
+	if code, err := runCommand(cmd); err == nil || code == 0 {
672
+		t.Fatal("sys should not be available in a non privileged container")
673
+	}
674
+
675
+	deleteAllContainers()
676
+
677
+	logDone("run - sys not avaliable in non privileged container")
678
+}
679
+
680
+func TestSysAvaliableInPrivilegedContainers(t *testing.T) {
681
+	cmd := exec.Command(dockerBinary, "run", "--privileged", "busybox", "ls", "/sys/kernel")
682
+	if code, err := runCommand(cmd); err != nil || code != 0 {
683
+		t.Fatalf("sys should be available in privileged container")
684
+	}
685
+
686
+	deleteAllContainers()
687
+
688
+	logDone("run - sys avaliable in privileged container")
689
+}
... ...
@@ -16,76 +16,149 @@ process are specified in this file.  The configuration is used for each process
16 16
 Sample `container.json` file:
17 17
 ```json
18 18
 {
19
-   "hostname" : "koye",
20
-   "networks" : [
19
+   "mounts" : [
21 20
       {
22
-         "gateway" : "172.17.42.1",
23
-         "context" : {
24
-            "bridge" : "docker0",
25
-            "prefix" : "veth"
26
-         },
27
-         "address" : "172.17.0.2/16",
28
-         "type" : "veth",
29
-         "mtu" : 1500
21
+         "type" : "devtmpfs"
30 22
       }
31 23
    ],
32
-   "cgroups" : {
33
-      "parent" : "docker",
34
-      "name" : "11bb30683fb0bdd57fab4d3a8238877f1e4395a2cfc7320ea359f7a02c1a5620"
35
-   },
36 24
    "tty" : true,
37 25
    "environment" : [
38 26
       "HOME=/",
39
-      "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
40
-      "HOSTNAME=11bb30683fb0",
41
-      "TERM=xterm"
42
-   ],
43
-   "capabilities_mask" : [
44
-      "SETPCAP",
45
-      "SYS_MODULE",
46
-      "SYS_RAWIO",
47
-      "SYS_PACCT",
48
-      "SYS_ADMIN",
49
-      "SYS_NICE",
50
-      "SYS_RESOURCE",
51
-      "SYS_TIME",
52
-      "SYS_TTY_CONFIG",
53
-      "MKNOD",
54
-      "AUDIT_WRITE",
55
-      "AUDIT_CONTROL",
56
-      "MAC_OVERRIDE",
57
-      "MAC_ADMIN",
58
-      "NET_ADMIN"
27
+      "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin",
28
+      "container=docker",
29
+      "TERM=xterm-256color"
59 30
    ],
60
-   "context" : {
61
-      "apparmor_profile" : "docker-default"
31
+   "hostname" : "koye",
32
+   "cgroups" : {
33
+      "parent" : "docker",
34
+      "name" : "docker-koye"
62 35
    },
63
-   "mounts" : [
36
+   "capabilities_mask" : [
37
+      {
38
+         "value" : 8,
39
+         "key" : "SETPCAP",
40
+         "enabled" : false
41
+      },
42
+      {
43
+         "enabled" : false,
44
+         "value" : 16,
45
+         "key" : "SYS_MODULE"
46
+      },
47
+      {
48
+         "value" : 17,
49
+         "key" : "SYS_RAWIO",
50
+         "enabled" : false
51
+      },
52
+      {
53
+         "key" : "SYS_PACCT",
54
+         "value" : 20,
55
+         "enabled" : false
56
+      },
57
+      {
58
+         "value" : 21,
59
+         "key" : "SYS_ADMIN",
60
+         "enabled" : false
61
+      },
62
+      {
63
+         "value" : 23,
64
+         "key" : "SYS_NICE",
65
+         "enabled" : false
66
+      },
67
+      {
68
+         "value" : 24,
69
+         "key" : "SYS_RESOURCE",
70
+         "enabled" : false
71
+      },
64 72
       {
65
-         "source" : "/var/lib/docker/containers/11bb30683fb0bdd57fab4d3a8238877f1e4395a2cfc7320ea359f7a02c1a5620/resolv.conf",
66
-         "writable" : false,
67
-         "destination" : "/etc/resolv.conf",
68
-         "private" : true
73
+         "key" : "SYS_TIME",
74
+         "value" : 25,
75
+         "enabled" : false
69 76
       },
70 77
       {
71
-         "source" : "/var/lib/docker/containers/11bb30683fb0bdd57fab4d3a8238877f1e4395a2cfc7320ea359f7a02c1a5620/hostname",
72
-         "writable" : false,
73
-         "destination" : "/etc/hostname",
74
-         "private" : true
78
+         "enabled" : false,
79
+         "value" : 26,
80
+         "key" : "SYS_TTY_CONFIG"
75 81
       },
76 82
       {
77
-         "source" : "/var/lib/docker/containers/11bb30683fb0bdd57fab4d3a8238877f1e4395a2cfc7320ea359f7a02c1a5620/hosts",
78
-         "writable" : false,
79
-         "destination" : "/etc/hosts",
80
-         "private" : true
83
+         "key" : "AUDIT_WRITE",
84
+         "value" : 29,
85
+         "enabled" : false
86
+      },
87
+      {
88
+         "value" : 30,
89
+         "key" : "AUDIT_CONTROL",
90
+         "enabled" : false
91
+      },
92
+      {
93
+         "enabled" : false,
94
+         "key" : "MAC_OVERRIDE",
95
+         "value" : 32
96
+      },
97
+      {
98
+         "enabled" : false,
99
+         "key" : "MAC_ADMIN",
100
+         "value" : 33
101
+      },
102
+      {
103
+         "key" : "NET_ADMIN",
104
+         "value" : 12,
105
+         "enabled" : false
106
+      },
107
+      {
108
+         "value" : 27,
109
+         "key" : "MKNOD",
110
+         "enabled" : true
111
+      }
112
+   ],
113
+   "networks" : [
114
+      {
115
+         "mtu" : 1500,
116
+         "address" : "127.0.0.1/0",
117
+         "type" : "loopback",
118
+         "gateway" : "localhost"
119
+      },
120
+      {
121
+         "mtu" : 1500,
122
+         "address" : "172.17.42.2/16",
123
+         "type" : "veth",
124
+         "context" : {
125
+            "bridge" : "docker0",
126
+            "prefix" : "veth"
127
+         },
128
+         "gateway" : "172.17.42.1"
81 129
       }
82 130
    ],
83 131
    "namespaces" : [
84
-      "NEWNS",
85
-      "NEWUTS",
86
-      "NEWIPC",
87
-      "NEWPID",
88
-      "NEWNET"
132
+      {
133
+         "key" : "NEWNS",
134
+         "value" : 131072,
135
+         "enabled" : true,
136
+         "file" : "mnt"
137
+      },
138
+      {
139
+         "key" : "NEWUTS",
140
+         "value" : 67108864,
141
+         "enabled" : true,
142
+         "file" : "uts"
143
+      },
144
+      {
145
+         "enabled" : true,
146
+         "file" : "ipc",
147
+         "key" : "NEWIPC",
148
+         "value" : 134217728
149
+      },
150
+      {
151
+         "file" : "pid",
152
+         "enabled" : true,
153
+         "value" : 536870912,
154
+         "key" : "NEWPID"
155
+      },
156
+      {
157
+         "enabled" : true,
158
+         "file" : "net",
159
+         "key" : "NEWNET",
160
+         "value" : 1073741824
161
+      }
89 162
    ]
90 163
 }
91 164
 ```
92 165
deleted file mode 100644
... ...
@@ -1,35 +0,0 @@
1
-package capabilities
2
-
3
-import (
4
-	"github.com/dotcloud/docker/pkg/libcontainer"
5
-	"github.com/syndtr/gocapability/capability"
6
-	"os"
7
-)
8
-
9
-// DropCapabilities drops capabilities for the current process based
10
-// on the container's configuration.
11
-func DropCapabilities(container *libcontainer.Container) error {
12
-	if drop := getCapabilitiesMask(container); len(drop) > 0 {
13
-		c, err := capability.NewPid(os.Getpid())
14
-		if err != nil {
15
-			return err
16
-		}
17
-		c.Unset(capability.CAPS|capability.BOUNDS, drop...)
18
-
19
-		if err := c.Apply(capability.CAPS | capability.BOUNDS); err != nil {
20
-			return err
21
-		}
22
-	}
23
-	return nil
24
-}
25
-
26
-// getCapabilitiesMask returns the specific cap mask values for the libcontainer types
27
-func getCapabilitiesMask(container *libcontainer.Container) []capability.Cap {
28
-	drop := []capability.Cap{}
29
-	for _, c := range container.CapabilitiesMask {
30
-		if !c.Enabled {
31
-			drop = append(drop, c.Value)
32
-		}
33
-	}
34
-	return drop
35
-}
36 1
new file mode 100644
... ...
@@ -0,0 +1,60 @@
0
+// +build linux
1
+
2
+package console
3
+
4
+import (
5
+	"fmt"
6
+	"github.com/dotcloud/docker/pkg/label"
7
+	"github.com/dotcloud/docker/pkg/system"
8
+	"os"
9
+	"path/filepath"
10
+	"syscall"
11
+)
12
+
13
+// Setup initializes the proper /dev/console inside the rootfs path
14
+func Setup(rootfs, consolePath, mountLabel string) error {
15
+	oldMask := system.Umask(0000)
16
+	defer system.Umask(oldMask)
17
+
18
+	stat, err := os.Stat(consolePath)
19
+	if err != nil {
20
+		return fmt.Errorf("stat console %s %s", consolePath, err)
21
+	}
22
+	var (
23
+		st   = stat.Sys().(*syscall.Stat_t)
24
+		dest = filepath.Join(rootfs, "dev/console")
25
+	)
26
+	if err := os.Remove(dest); err != nil && !os.IsNotExist(err) {
27
+		return fmt.Errorf("remove %s %s", dest, err)
28
+	}
29
+	if err := os.Chmod(consolePath, 0600); err != nil {
30
+		return err
31
+	}
32
+	if err := os.Chown(consolePath, 0, 0); err != nil {
33
+		return err
34
+	}
35
+	if err := system.Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil {
36
+		return fmt.Errorf("mknod %s %s", dest, err)
37
+	}
38
+	if err := label.SetFileLabel(consolePath, mountLabel); err != nil {
39
+		return fmt.Errorf("set file label %s %s", dest, err)
40
+	}
41
+	if err := system.Mount(consolePath, dest, "bind", syscall.MS_BIND, ""); err != nil {
42
+		return fmt.Errorf("bind %s to %s %s", consolePath, dest, err)
43
+	}
44
+	return nil
45
+}
46
+
47
+func OpenAndDup(consolePath string) error {
48
+	slave, err := system.OpenTerminal(consolePath, syscall.O_RDWR)
49
+	if err != nil {
50
+		return fmt.Errorf("open terminal %s", err)
51
+	}
52
+	if err := system.Dup2(slave.Fd(), 0); err != nil {
53
+		return err
54
+	}
55
+	if err := system.Dup2(slave.Fd(), 1); err != nil {
56
+		return err
57
+	}
58
+	return system.Dup2(slave.Fd(), 2)
59
+}
... ...
@@ -23,7 +23,7 @@ type Container struct {
23 23
 	Networks         []*Network      `json:"networks,omitempty"`          // nil for host's network stack
24 24
 	Cgroups          *cgroups.Cgroup `json:"cgroups,omitempty"`           // cgroups
25 25
 	Context          Context         `json:"context,omitempty"`           // generic context for specific options (apparmor, selinux)
26
-	Mounts           []Mount         `json:"mounts,omitempty"`
26
+	Mounts           Mounts          `json:"mounts,omitempty"`
27 27
 }
28 28
 
29 29
 // Network defines configuration for a container's networking stack
... ...
@@ -37,12 +37,3 @@ type Network struct {
37 37
 	Gateway string  `json:"gateway,omitempty"`
38 38
 	Mtu     int     `json:"mtu,omitempty"`
39 39
 }
40
-
41
-// Bind mounts from the host system to the container
42
-//
43
-type Mount struct {
44
-	Source      string `json:"source"`      // Source path, in the host namespace
45
-	Destination string `json:"destination"` // Destination path, in the container
46
-	Writable    bool   `json:"writable"`
47
-	Private     bool   `json:"private"`
48
-}
... ...
@@ -1,50 +1,146 @@
1 1
 {
2
-    "hostname": "koye",
3
-    "tty": true,
4
-    "environment": [
5
-        "HOME=/",
6
-        "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin",
7
-        "container=docker",
8
-        "TERM=xterm-256color"
9
-    ],
10
-    "namespaces": [
11
-        "NEWIPC",
12
-        "NEWNS",
13
-        "NEWPID",
14
-        "NEWUTS",
15
-        "NEWNET"
16
-    ],
17
-    "capabilities_mask": [
18
-        "SETPCAP",
19
-        "SYS_MODULE",
20
-        "SYS_RAWIO",
21
-        "SYS_PACCT",
22
-        "SYS_ADMIN",
23
-        "SYS_NICE",
24
-        "SYS_RESOURCE",
25
-        "SYS_TIME",
26
-        "SYS_TTY_CONFIG",
27
-        "MKNOD",
28
-        "AUDIT_WRITE",
29
-        "AUDIT_CONTROL",
30
-        "MAC_OVERRIDE",
31
-        "MAC_ADMIN",
32
-        "NET_ADMIN"
33
-    ],
34
-    "networks": [{
35
-            "type": "veth",
36
-            "context": {
37
-                "bridge": "docker0",
38
-                "prefix": "dock"
39
-            },
40
-            "address": "172.17.0.100/16",
41
-            "gateway": "172.17.42.1",
42
-            "mtu": 1500
43
-        }
44
-    ],
45
-    "cgroups": {
46
-        "name": "docker-koye",
47
-        "parent": "docker",
48
-        "memory": 5248000
49
-    }
2
+   "mounts" : [
3
+      {
4
+         "type" : "devtmpfs"
5
+      }
6
+   ],
7
+   "tty" : true,
8
+   "environment" : [
9
+      "HOME=/",
10
+      "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin",
11
+      "container=docker",
12
+      "TERM=xterm-256color"
13
+   ],
14
+   "hostname" : "koye",
15
+   "cgroups" : {
16
+      "parent" : "docker",
17
+      "name" : "docker-koye"
18
+   },
19
+   "capabilities_mask" : [
20
+      {
21
+         "value" : 8,
22
+         "key" : "SETPCAP",
23
+         "enabled" : false
24
+      },
25
+      {
26
+         "enabled" : false,
27
+         "value" : 16,
28
+         "key" : "SYS_MODULE"
29
+      },
30
+      {
31
+         "value" : 17,
32
+         "key" : "SYS_RAWIO",
33
+         "enabled" : false
34
+      },
35
+      {
36
+         "key" : "SYS_PACCT",
37
+         "value" : 20,
38
+         "enabled" : false
39
+      },
40
+      {
41
+         "value" : 21,
42
+         "key" : "SYS_ADMIN",
43
+         "enabled" : false
44
+      },
45
+      {
46
+         "value" : 23,
47
+         "key" : "SYS_NICE",
48
+         "enabled" : false
49
+      },
50
+      {
51
+         "value" : 24,
52
+         "key" : "SYS_RESOURCE",
53
+         "enabled" : false
54
+      },
55
+      {
56
+         "key" : "SYS_TIME",
57
+         "value" : 25,
58
+         "enabled" : false
59
+      },
60
+      {
61
+         "enabled" : false,
62
+         "value" : 26,
63
+         "key" : "SYS_TTY_CONFIG"
64
+      },
65
+      {
66
+         "key" : "AUDIT_WRITE",
67
+         "value" : 29,
68
+         "enabled" : false
69
+      },
70
+      {
71
+         "value" : 30,
72
+         "key" : "AUDIT_CONTROL",
73
+         "enabled" : false
74
+      },
75
+      {
76
+         "enabled" : false,
77
+         "key" : "MAC_OVERRIDE",
78
+         "value" : 32
79
+      },
80
+      {
81
+         "enabled" : false,
82
+         "key" : "MAC_ADMIN",
83
+         "value" : 33
84
+      },
85
+      {
86
+         "key" : "NET_ADMIN",
87
+         "value" : 12,
88
+         "enabled" : false
89
+      },
90
+      {
91
+         "value" : 27,
92
+         "key" : "MKNOD",
93
+         "enabled" : true
94
+      }
95
+   ],
96
+   "networks" : [
97
+      {
98
+         "mtu" : 1500,
99
+         "address" : "127.0.0.1/0",
100
+         "type" : "loopback",
101
+         "gateway" : "localhost"
102
+      },
103
+      {
104
+         "mtu" : 1500,
105
+         "address" : "172.17.42.2/16",
106
+         "type" : "veth",
107
+         "context" : {
108
+            "bridge" : "docker0",
109
+            "prefix" : "veth"
110
+         },
111
+         "gateway" : "172.17.42.1"
112
+      }
113
+   ],
114
+   "namespaces" : [
115
+      {
116
+         "key" : "NEWNS",
117
+         "value" : 131072,
118
+         "enabled" : true,
119
+         "file" : "mnt"
120
+      },
121
+      {
122
+         "key" : "NEWUTS",
123
+         "value" : 67108864,
124
+         "enabled" : true,
125
+         "file" : "uts"
126
+      },
127
+      {
128
+         "enabled" : true,
129
+         "file" : "ipc",
130
+         "key" : "NEWIPC",
131
+         "value" : 134217728
132
+      },
133
+      {
134
+         "file" : "pid",
135
+         "enabled" : true,
136
+         "value" : 536870912,
137
+         "key" : "NEWPID"
138
+      },
139
+      {
140
+         "enabled" : true,
141
+         "file" : "net",
142
+         "key" : "NEWNET",
143
+         "value" : 1073741824
144
+      }
145
+   ]
50 146
 }
51 147
new file mode 100644
... ...
@@ -0,0 +1,143 @@
0
+// +build linux
1
+
2
+package mount
3
+
4
+import (
5
+	"fmt"
6
+	"github.com/dotcloud/docker/pkg/label"
7
+	"github.com/dotcloud/docker/pkg/libcontainer"
8
+	"github.com/dotcloud/docker/pkg/libcontainer/mount/nodes"
9
+	"github.com/dotcloud/docker/pkg/libcontainer/security/restrict"
10
+	"github.com/dotcloud/docker/pkg/system"
11
+	"os"
12
+	"path/filepath"
13
+	"syscall"
14
+)
15
+
16
+// default mount point flags
17
+const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
18
+
19
+type mount struct {
20
+	source string
21
+	path   string
22
+	device string
23
+	flags  int
24
+	data   string
25
+}
26
+
27
+// InitializeMountNamespace sets up the devices, mount points, and filesystems for use inside a
28
+// new mount namespace
29
+func InitializeMountNamespace(rootfs, console string, container *libcontainer.Container) error {
30
+	var (
31
+		err  error
32
+		flag = syscall.MS_PRIVATE
33
+	)
34
+	if container.NoPivotRoot {
35
+		flag = syscall.MS_SLAVE
36
+	}
37
+	if err := system.Mount("", "/", "", uintptr(flag|syscall.MS_REC), ""); err != nil {
38
+		return fmt.Errorf("mounting / as slave %s", err)
39
+	}
40
+	if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
41
+		return fmt.Errorf("mouting %s as bind %s", rootfs, err)
42
+	}
43
+	if err := mountSystem(rootfs, container); err != nil {
44
+		return fmt.Errorf("mount system %s", err)
45
+	}
46
+	if err := setupBindmounts(rootfs, container.Mounts); err != nil {
47
+		return fmt.Errorf("bind mounts %s", err)
48
+	}
49
+	if err := nodes.CopyN(rootfs, nodes.DefaultNodes); err != nil {
50
+		return fmt.Errorf("copy dev nodes %s", err)
51
+	}
52
+	if restrictionPath := container.Context["restriction_path"]; restrictionPath != "" {
53
+		if err := restrict.Restrict(rootfs, restrictionPath); err != nil {
54
+			return fmt.Errorf("restrict %s", err)
55
+		}
56
+	}
57
+	if err := SetupPtmx(rootfs, console, container.Context["mount_label"]); err != nil {
58
+		return err
59
+	}
60
+	if err := system.Chdir(rootfs); err != nil {
61
+		return fmt.Errorf("chdir into %s %s", rootfs, err)
62
+	}
63
+
64
+	if container.NoPivotRoot {
65
+		err = MsMoveRoot(rootfs)
66
+	} else {
67
+		err = PivotRoot(rootfs)
68
+	}
69
+	if err != nil {
70
+		return err
71
+	}
72
+
73
+	if container.ReadonlyFs {
74
+		if err := SetReadonly(); err != nil {
75
+			return fmt.Errorf("set readonly %s", err)
76
+		}
77
+	}
78
+
79
+	system.Umask(0022)
80
+
81
+	return nil
82
+}
83
+
84
+// mountSystem sets up linux specific system mounts like sys, proc, shm, and devpts
85
+// inside the mount namespace
86
+func mountSystem(rootfs string, container *libcontainer.Container) error {
87
+	for _, m := range newSystemMounts(rootfs, container.Context["mount_label"], container.Mounts) {
88
+		if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) {
89
+			return fmt.Errorf("mkdirall %s %s", m.path, err)
90
+		}
91
+		if err := system.Mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil {
92
+			return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err)
93
+		}
94
+	}
95
+	return nil
96
+}
97
+
98
+func setupBindmounts(rootfs string, bindMounts libcontainer.Mounts) error {
99
+	for _, m := range bindMounts.OfType("bind") {
100
+		var (
101
+			flags = syscall.MS_BIND | syscall.MS_REC
102
+			dest  = filepath.Join(rootfs, m.Destination)
103
+		)
104
+		if !m.Writable {
105
+			flags = flags | syscall.MS_RDONLY
106
+		}
107
+		if err := system.Mount(m.Source, dest, "bind", uintptr(flags), ""); err != nil {
108
+			return fmt.Errorf("mounting %s into %s %s", m.Source, dest, err)
109
+		}
110
+		if !m.Writable {
111
+			if err := system.Mount(m.Source, dest, "bind", uintptr(flags|syscall.MS_REMOUNT), ""); err != nil {
112
+				return fmt.Errorf("remounting %s into %s %s", m.Source, dest, err)
113
+			}
114
+		}
115
+		if m.Private {
116
+			if err := system.Mount("", dest, "none", uintptr(syscall.MS_PRIVATE), ""); err != nil {
117
+				return fmt.Errorf("mounting %s private %s", dest, err)
118
+			}
119
+		}
120
+	}
121
+	return nil
122
+}
123
+
124
+// TODO: this is crappy right now and should be cleaned up with a better way of handling system and
125
+// standard bind mounts allowing them to be more dynamic
126
+func newSystemMounts(rootfs, mountLabel string, mounts libcontainer.Mounts) []mount {
127
+	systemMounts := []mount{
128
+		{source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags},
129
+	}
130
+
131
+	if len(mounts.OfType("devtmpfs")) == 1 {
132
+		systemMounts = append(systemMounts, mount{source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: "mode=755"})
133
+	}
134
+	systemMounts = append(systemMounts,
135
+		mount{source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: label.FormatMountLabel("mode=1777,size=65536k", mountLabel)},
136
+		mount{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)})
137
+
138
+	if len(mounts.OfType("sysfs")) == 1 {
139
+		systemMounts = append(systemMounts, mount{source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags})
140
+	}
141
+	return systemMounts
142
+}
0 143
new file mode 100644
... ...
@@ -0,0 +1,19 @@
0
+// +build linux
1
+
2
+package mount
3
+
4
+import (
5
+	"fmt"
6
+	"github.com/dotcloud/docker/pkg/system"
7
+	"syscall"
8
+)
9
+
10
+func MsMoveRoot(rootfs string) error {
11
+	if err := system.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil {
12
+		return fmt.Errorf("mount move %s into / %s", rootfs, err)
13
+	}
14
+	if err := system.Chroot("."); err != nil {
15
+		return fmt.Errorf("chroot . %s", err)
16
+	}
17
+	return system.Chdir("/")
18
+}
0 19
new file mode 100644
... ...
@@ -0,0 +1,49 @@
0
+// +build linux
1
+
2
+package nodes
3
+
4
+import (
5
+	"fmt"
6
+	"github.com/dotcloud/docker/pkg/system"
7
+	"os"
8
+	"path/filepath"
9
+	"syscall"
10
+)
11
+
12
+// Default list of device nodes to copy
13
+var DefaultNodes = []string{
14
+	"null",
15
+	"zero",
16
+	"full",
17
+	"random",
18
+	"urandom",
19
+	"tty",
20
+}
21
+
22
+// CopyN copies the given device nodes from the host into the rootfs
23
+func CopyN(rootfs string, nodesToCopy []string) error {
24
+	oldMask := system.Umask(0000)
25
+	defer system.Umask(oldMask)
26
+
27
+	for _, node := range nodesToCopy {
28
+		if err := Copy(rootfs, node); err != nil {
29
+			return err
30
+		}
31
+	}
32
+	return nil
33
+}
34
+
35
+func Copy(rootfs, node string) error {
36
+	stat, err := os.Stat(filepath.Join("/dev", node))
37
+	if err != nil {
38
+		return err
39
+	}
40
+	var (
41
+		dest = filepath.Join(rootfs, "dev", node)
42
+		st   = stat.Sys().(*syscall.Stat_t)
43
+	)
44
+	if err := system.Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) {
45
+		return fmt.Errorf("copy %s %s", node, err)
46
+	}
47
+	return nil
48
+}
0 49
new file mode 100644
... ...
@@ -0,0 +1,31 @@
0
+// +build linux
1
+
2
+package mount
3
+
4
+import (
5
+	"fmt"
6
+	"github.com/dotcloud/docker/pkg/system"
7
+	"io/ioutil"
8
+	"os"
9
+	"path/filepath"
10
+	"syscall"
11
+)
12
+
13
+func PivotRoot(rootfs string) error {
14
+	pivotDir, err := ioutil.TempDir(rootfs, ".pivot_root")
15
+	if err != nil {
16
+		return fmt.Errorf("can't create pivot_root dir %s, error %v", pivotDir, err)
17
+	}
18
+	if err := system.Pivotroot(rootfs, pivotDir); err != nil {
19
+		return fmt.Errorf("pivot_root %s", err)
20
+	}
21
+	if err := system.Chdir("/"); err != nil {
22
+		return fmt.Errorf("chdir / %s", err)
23
+	}
24
+	// rootfs is now "/", so the temp dir is addressed as /<basename>; recompute its path before unmounting and removing it
25
+	pivotDir = filepath.Join("/", filepath.Base(pivotDir))
26
+	if err := system.Unmount(pivotDir, syscall.MNT_DETACH); err != nil {
27
+		return fmt.Errorf("unmount pivot_root dir %s", err)
28
+	}
29
+	return os.Remove(pivotDir)
30
+}
0 31
new file mode 100644
... ...
@@ -0,0 +1,26 @@
0
+// +build linux
1
+
2
+package mount
3
+
4
+import (
5
+	"fmt"
6
+	"github.com/dotcloud/docker/pkg/libcontainer/console"
7
+	"os"
8
+	"path/filepath"
9
+)
10
+
11
+func SetupPtmx(rootfs, consolePath, mountLabel string) error {
12
+	ptmx := filepath.Join(rootfs, "dev/ptmx")
13
+	if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) {
14
+		return err
15
+	}
16
+	if err := os.Symlink("pts/ptmx", ptmx); err != nil {
17
+		return fmt.Errorf("symlink dev ptmx %s", err)
18
+	}
19
+	if consolePath != "" {
20
+		if err := console.Setup(rootfs, consolePath, mountLabel); err != nil {
21
+			return err
22
+		}
23
+	}
24
+	return nil
25
+}
0 26
new file mode 100644
... ...
@@ -0,0 +1,12 @@
0
+// +build linux
1
+
2
+package mount
3
+
4
+import (
5
+	"github.com/dotcloud/docker/pkg/system"
6
+	"syscall"
7
+)
8
+
9
+func SetReadonly() error {
10
+	return system.Mount("/", "/", "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, "")
11
+}
0 12
new file mode 100644
... ...
@@ -0,0 +1,31 @@
0
+// +build linux
1
+
2
+package mount
3
+
4
+import (
5
+	"github.com/dotcloud/docker/pkg/system"
6
+	"syscall"
7
+)
8
+
9
+func RemountProc() error {
10
+	if err := system.Unmount("/proc", syscall.MNT_DETACH); err != nil {
11
+		return err
12
+	}
13
+	if err := system.Mount("proc", "/proc", "proc", uintptr(defaultMountFlags), ""); err != nil {
14
+		return err
15
+	}
16
+	return nil
17
+}
18
+
19
+func RemountSys() error {
20
+	if err := system.Unmount("/sys", syscall.MNT_DETACH); err != nil {
21
+		if err != syscall.EINVAL {
22
+			return err
23
+		}
24
+	} else {
25
+		if err := system.Mount("sysfs", "/sys", "sysfs", uintptr(defaultMountFlags), ""); err != nil {
26
+			return err
27
+		}
28
+	}
29
+	return nil
30
+}
... ...
@@ -6,6 +6,7 @@ import (
6 6
 	"fmt"
7 7
 	"github.com/dotcloud/docker/pkg/label"
8 8
 	"github.com/dotcloud/docker/pkg/libcontainer"
9
+	"github.com/dotcloud/docker/pkg/libcontainer/mount"
9 10
 	"github.com/dotcloud/docker/pkg/system"
10 11
 	"os"
11 12
 	"path/filepath"
... ...
@@ -63,10 +64,10 @@ func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []s
63 63
 			if err := system.Unshare(syscall.CLONE_NEWNS); err != nil {
64 64
 				return -1, err
65 65
 			}
66
-			if err := remountProc(); err != nil {
66
+			if err := mount.RemountProc(); err != nil {
67 67
 				return -1, fmt.Errorf("remount proc %s", err)
68 68
 			}
69
-			if err := remountSys(); err != nil {
69
+			if err := mount.RemountSys(); err != nil {
70 70
 				return -1, fmt.Errorf("remount sys %s", err)
71 71
 			}
72 72
 			goto dropAndExec
... ...
@@ -11,8 +11,10 @@ import (
11 11
 	"github.com/dotcloud/docker/pkg/apparmor"
12 12
 	"github.com/dotcloud/docker/pkg/label"
13 13
 	"github.com/dotcloud/docker/pkg/libcontainer"
14
-	"github.com/dotcloud/docker/pkg/libcontainer/capabilities"
14
+	"github.com/dotcloud/docker/pkg/libcontainer/console"
15
+	"github.com/dotcloud/docker/pkg/libcontainer/mount"
15 16
 	"github.com/dotcloud/docker/pkg/libcontainer/network"
17
+	"github.com/dotcloud/docker/pkg/libcontainer/security/capabilities"
16 18
 	"github.com/dotcloud/docker/pkg/libcontainer/utils"
17 19
 	"github.com/dotcloud/docker/pkg/system"
18 20
 	"github.com/dotcloud/docker/pkg/user"
... ...
@@ -20,7 +22,7 @@ import (
20 20
 
21 21
 // Init is the init process that first runs inside a new namespace to setup mounts, users, networking,
22 22
 // and other options required for the new container.
23
-func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error {
23
+func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consolePath string, syncPipe *SyncPipe, args []string) error {
24 24
 	rootfs, err := utils.ResolveRootfs(uncleanRootfs)
25 25
 	if err != nil {
26 26
 		return err
... ...
@@ -36,20 +38,16 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol
36 36
 	ns.logger.Println("received context from parent")
37 37
 	syncPipe.Close()
38 38
 
39
-	if console != "" {
40
-		ns.logger.Printf("setting up %s as console\n", console)
41
-		slave, err := system.OpenTerminal(console, syscall.O_RDWR)
42
-		if err != nil {
43
-			return fmt.Errorf("open terminal %s", err)
44
-		}
45
-		if err := dupSlave(slave); err != nil {
46
-			return fmt.Errorf("dup2 slave %s", err)
39
+	if consolePath != "" {
40
+		ns.logger.Printf("setting up %s as console\n", consolePath)
41
+		if err := console.OpenAndDup(consolePath); err != nil {
42
+			return err
47 43
 		}
48 44
 	}
49 45
 	if _, err := system.Setsid(); err != nil {
50 46
 		return fmt.Errorf("setsid %s", err)
51 47
 	}
52
-	if console != "" {
48
+	if consolePath != "" {
53 49
 		if err := system.Setctty(); err != nil {
54 50
 			return fmt.Errorf("setctty %s", err)
55 51
 		}
... ...
@@ -60,7 +58,7 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol
60 60
 
61 61
 	label.Init()
62 62
 	ns.logger.Println("setup mount namespace")
63
-	if err := setupNewMountNamespace(rootfs, container.Mounts, console, container.ReadonlyFs, container.NoPivotRoot, container.Context["mount_label"]); err != nil {
63
+	if err := mount.InitializeMountNamespace(rootfs, consolePath, container); err != nil {
64 64
 		return fmt.Errorf("setup mount namespace %s", err)
65 65
 	}
66 66
 	if err := system.Sethostname(container.Hostname); err != nil {
... ...
@@ -114,21 +112,6 @@ func setupUser(container *libcontainer.Container) error {
114 114
 	return nil
115 115
 }
116 116
 
117
-// dupSlave dup2 the pty slave's fd into stdout and stdin and ensures that
118
-// the slave's fd is 0, or stdin
119
-func dupSlave(slave *os.File) error {
120
-	if err := system.Dup2(slave.Fd(), 0); err != nil {
121
-		return err
122
-	}
123
-	if err := system.Dup2(slave.Fd(), 1); err != nil {
124
-		return err
125
-	}
126
-	if err := system.Dup2(slave.Fd(), 2); err != nil {
127
-		return err
128
-	}
129
-	return nil
130
-}
131
-
132 117
 // setupVethNetwork uses the Network config if it is not nil to initialize
133 118
 // the new veth interface inside the container for use by changing the name to eth0
134 119
 // setting the MTU and IP address along with the default gateway
135 120
deleted file mode 100644
... ...
@@ -1,265 +0,0 @@
1
-// +build linux
2
-
3
-package nsinit
4
-
5
-import (
6
-	"fmt"
7
-	"github.com/dotcloud/docker/pkg/label"
8
-	"github.com/dotcloud/docker/pkg/libcontainer"
9
-	"github.com/dotcloud/docker/pkg/system"
10
-	"io/ioutil"
11
-	"os"
12
-	"path/filepath"
13
-	"syscall"
14
-)
15
-
16
-// default mount point flags
17
-const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
18
-
19
-// setupNewMountNamespace is used to initialize a new mount namespace for a new
20
-// container in the rootfs that is specified.
21
-//
22
-// There is no need to unmount the new mounts because as soon as the mount namespace
23
-// is no longer in use, the mounts will be removed automatically
24
-func setupNewMountNamespace(rootfs string, bindMounts []libcontainer.Mount, console string, readonly, noPivotRoot bool, mountLabel string) error {
25
-	flag := syscall.MS_PRIVATE
26
-	if noPivotRoot {
27
-		flag = syscall.MS_SLAVE
28
-	}
29
-	if err := system.Mount("", "/", "", uintptr(flag|syscall.MS_REC), ""); err != nil {
30
-		return fmt.Errorf("mounting / as slave %s", err)
31
-	}
32
-	if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
33
-		return fmt.Errorf("mouting %s as bind %s", rootfs, err)
34
-	}
35
-	if err := mountSystem(rootfs, mountLabel); err != nil {
36
-		return fmt.Errorf("mount system %s", err)
37
-	}
38
-
39
-	for _, m := range bindMounts {
40
-		var (
41
-			flags = syscall.MS_BIND | syscall.MS_REC
42
-			dest  = filepath.Join(rootfs, m.Destination)
43
-		)
44
-		if !m.Writable {
45
-			flags = flags | syscall.MS_RDONLY
46
-		}
47
-		if err := system.Mount(m.Source, dest, "bind", uintptr(flags), ""); err != nil {
48
-			return fmt.Errorf("mounting %s into %s %s", m.Source, dest, err)
49
-		}
50
-		if !m.Writable {
51
-			if err := system.Mount(m.Source, dest, "bind", uintptr(flags|syscall.MS_REMOUNT), ""); err != nil {
52
-				return fmt.Errorf("remounting %s into %s %s", m.Source, dest, err)
53
-			}
54
-		}
55
-		if m.Private {
56
-			if err := system.Mount("", dest, "none", uintptr(syscall.MS_PRIVATE), ""); err != nil {
57
-				return fmt.Errorf("mounting %s private %s", dest, err)
58
-			}
59
-		}
60
-	}
61
-
62
-	if err := copyDevNodes(rootfs); err != nil {
63
-		return fmt.Errorf("copy dev nodes %s", err)
64
-	}
65
-	if err := setupPtmx(rootfs, console, mountLabel); err != nil {
66
-		return err
67
-	}
68
-	if err := system.Chdir(rootfs); err != nil {
69
-		return fmt.Errorf("chdir into %s %s", rootfs, err)
70
-	}
71
-
72
-	if noPivotRoot {
73
-		if err := rootMsMove(rootfs); err != nil {
74
-			return err
75
-		}
76
-	} else {
77
-		if err := rootPivot(rootfs); err != nil {
78
-			return err
79
-		}
80
-	}
81
-
82
-	if readonly {
83
-		if err := system.Mount("/", "/", "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil {
84
-			return fmt.Errorf("mounting %s as readonly %s", rootfs, err)
85
-		}
86
-	}
87
-
88
-	system.Umask(0022)
89
-
90
-	return nil
91
-}
92
-
93
-// use a pivot root to setup the rootfs
94
-func rootPivot(rootfs string) error {
95
-	pivotDir, err := ioutil.TempDir(rootfs, ".pivot_root")
96
-	if err != nil {
97
-		return fmt.Errorf("can't create pivot_root dir %s", pivotDir, err)
98
-	}
99
-	if err := system.Pivotroot(rootfs, pivotDir); err != nil {
100
-		return fmt.Errorf("pivot_root %s", err)
101
-	}
102
-	if err := system.Chdir("/"); err != nil {
103
-		return fmt.Errorf("chdir / %s", err)
104
-	}
105
-	// path to pivot dir now changed, update
106
-	pivotDir = filepath.Join("/", filepath.Base(pivotDir))
107
-	if err := system.Unmount(pivotDir, syscall.MNT_DETACH); err != nil {
108
-		return fmt.Errorf("unmount pivot_root dir %s", err)
109
-	}
110
-	if err := os.Remove(pivotDir); err != nil {
111
-		return fmt.Errorf("remove pivot_root dir %s", err)
112
-	}
113
-	return nil
114
-}
115
-
116
-// use MS_MOVE and chroot to setup the rootfs
117
-func rootMsMove(rootfs string) error {
118
-	if err := system.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil {
119
-		return fmt.Errorf("mount move %s into / %s", rootfs, err)
120
-	}
121
-	if err := system.Chroot("."); err != nil {
122
-		return fmt.Errorf("chroot . %s", err)
123
-	}
124
-	if err := system.Chdir("/"); err != nil {
125
-		return fmt.Errorf("chdir / %s", err)
126
-	}
127
-	return nil
128
-}
129
-
130
-// copyDevNodes mknods the hosts devices so the new container has access to them
131
-func copyDevNodes(rootfs string) error {
132
-	oldMask := system.Umask(0000)
133
-	defer system.Umask(oldMask)
134
-
135
-	for _, node := range []string{
136
-		"null",
137
-		"zero",
138
-		"full",
139
-		"random",
140
-		"urandom",
141
-		"tty",
142
-	} {
143
-		if err := copyDevNode(rootfs, node); err != nil {
144
-			return err
145
-		}
146
-	}
147
-	return nil
148
-}
149
-
150
-func copyDevNode(rootfs, node string) error {
151
-	stat, err := os.Stat(filepath.Join("/dev", node))
152
-	if err != nil {
153
-		return err
154
-	}
155
-	var (
156
-		dest = filepath.Join(rootfs, "dev", node)
157
-		st   = stat.Sys().(*syscall.Stat_t)
158
-	)
159
-	if err := system.Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) {
160
-		return fmt.Errorf("copy %s %s", node, err)
161
-	}
162
-	return nil
163
-}
164
-
165
-// setupConsole ensures that the container has a proper /dev/console setup
166
-func setupConsole(rootfs, console string, mountLabel string) error {
167
-	oldMask := system.Umask(0000)
168
-	defer system.Umask(oldMask)
169
-
170
-	stat, err := os.Stat(console)
171
-	if err != nil {
172
-		return fmt.Errorf("stat console %s %s", console, err)
173
-	}
174
-	var (
175
-		st   = stat.Sys().(*syscall.Stat_t)
176
-		dest = filepath.Join(rootfs, "dev/console")
177
-	)
178
-	if err := os.Remove(dest); err != nil && !os.IsNotExist(err) {
179
-		return fmt.Errorf("remove %s %s", dest, err)
180
-	}
181
-	if err := os.Chmod(console, 0600); err != nil {
182
-		return err
183
-	}
184
-	if err := os.Chown(console, 0, 0); err != nil {
185
-		return err
186
-	}
187
-	if err := system.Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil {
188
-		return fmt.Errorf("mknod %s %s", dest, err)
189
-	}
190
-	if err := label.SetFileLabel(console, mountLabel); err != nil {
191
-		return fmt.Errorf("SetFileLabel Failed %s %s", dest, err)
192
-	}
193
-	if err := system.Mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil {
194
-		return fmt.Errorf("bind %s to %s %s", console, dest, err)
195
-	}
196
-	return nil
197
-}
198
-
199
-// mountSystem sets up linux specific system mounts like sys, proc, shm, and devpts
200
-// inside the mount namespace
201
-func mountSystem(rootfs string, mountLabel string) error {
202
-	for _, m := range []struct {
203
-		source string
204
-		path   string
205
-		device string
206
-		flags  int
207
-		data   string
208
-	}{
209
-		{source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags},
210
-		{source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags},
211
-		{source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: label.FormatMountLabel("mode=1777,size=65536k", mountLabel)},
212
-		{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)},
213
-	} {
214
-		if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) {
215
-			return fmt.Errorf("mkdirall %s %s", m.path, err)
216
-		}
217
-		if err := system.Mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil {
218
-			return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err)
219
-		}
220
-	}
221
-	return nil
222
-}
223
-
224
-// setupPtmx adds a symlink to pts/ptmx for /dev/ptmx and
225
-// finishes setting up /dev/console
226
-func setupPtmx(rootfs, console string, mountLabel string) error {
227
-	ptmx := filepath.Join(rootfs, "dev/ptmx")
228
-	if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) {
229
-		return err
230
-	}
231
-	if err := os.Symlink("pts/ptmx", ptmx); err != nil {
232
-		return fmt.Errorf("symlink dev ptmx %s", err)
233
-	}
234
-	if console != "" {
235
-		if err := setupConsole(rootfs, console, mountLabel); err != nil {
236
-			return err
237
-		}
238
-	}
239
-	return nil
240
-}
241
-
242
-// remountProc is used to detach and remount the proc filesystem
243
-// commonly needed with running a new process inside an existing container
244
-func remountProc() error {
245
-	if err := system.Unmount("/proc", syscall.MNT_DETACH); err != nil {
246
-		return err
247
-	}
248
-	if err := system.Mount("proc", "/proc", "proc", uintptr(defaultMountFlags), ""); err != nil {
249
-		return err
250
-	}
251
-	return nil
252
-}
253
-
254
-func remountSys() error {
255
-	if err := system.Unmount("/sys", syscall.MNT_DETACH); err != nil {
256
-		if err != syscall.EINVAL {
257
-			return err
258
-		}
259
-	} else {
260
-		if err := system.Mount("sysfs", "/sys", "sysfs", uintptr(defaultMountFlags), ""); err != nil {
261
-			return err
262
-		}
263
-	}
264
-	return nil
265
-}
266 1
new file mode 100644
... ...
@@ -0,0 +1,35 @@
0
+package capabilities
1
+
2
+import (
3
+	"github.com/dotcloud/docker/pkg/libcontainer"
4
+	"github.com/syndtr/gocapability/capability"
5
+	"os"
6
+)
7
+
8
+// DropCapabilities drops capabilities for the current process based
9
+// on the container's configuration.
10
+func DropCapabilities(container *libcontainer.Container) error {
11
+	if drop := getCapabilitiesMask(container); len(drop) > 0 {
12
+		c, err := capability.NewPid(os.Getpid())
13
+		if err != nil {
14
+			return err
15
+		}
16
+		c.Unset(capability.CAPS|capability.BOUNDS, drop...)
17
+
18
+		if err := c.Apply(capability.CAPS | capability.BOUNDS); err != nil {
19
+			return err
20
+		}
21
+	}
22
+	return nil
23
+}
24
+
25
+// getCapabilitiesMask returns the specific cap mask values for the libcontainer types
26
+func getCapabilitiesMask(container *libcontainer.Container) []capability.Cap {
27
+	drop := []capability.Cap{}
28
+	for _, c := range container.CapabilitiesMask {
29
+		if !c.Enabled {
30
+			drop = append(drop, c.Value)
31
+		}
32
+	}
33
+	return drop
34
+}
0 35
new file mode 100644
... ...
@@ -0,0 +1,51 @@
0
+package restrict
1
+
2
+import (
3
+	"fmt"
4
+	"os"
5
+	"path/filepath"
6
+	"syscall"
7
+
8
+	"github.com/dotcloud/docker/pkg/system"
9
+)
10
+
11
+const flags = syscall.MS_BIND | syscall.MS_REC | syscall.MS_RDONLY
12
+
13
+var restrictions = map[string]string{
14
+	// dirs
15
+	"/proc/sys":  "",
16
+	"/proc/irq":  "",
17
+	"/proc/acpi": "",
18
+
19
+	// files
20
+	"/proc/sysrq-trigger": "/dev/null",
21
+	"/proc/kcore":         "/dev/null",
22
+}
23
+
24
+// Restrict locks down access to many areas of proc
25
+// by using the assumption that the user does not have mount caps to
26
+// revert the changes made here
27
+func Restrict(rootfs, empty string) error {
28
+	for dest, source := range restrictions {
29
+		dest = filepath.Join(rootfs, dest)
30
+
31
+		// we don't have a "/dev/null" for dirs so have the requester pass a dir
32
+		// for us to bind mount
33
+		switch source {
34
+		case "":
35
+			source = empty
36
+		default:
37
+			source = filepath.Join(rootfs, source)
38
+		}
39
+		if err := system.Mount(source, dest, "bind", flags, ""); err != nil {
40
+			if os.IsNotExist(err) {
41
+				continue
42
+			}
43
+			return fmt.Errorf("unable to mount %s over %s %s", source, dest, err)
44
+		}
45
+		if err := system.Mount("", dest, "bind", flags|syscall.MS_REMOUNT, ""); err != nil {
46
+			return fmt.Errorf("unable to mount %s over %s %s", source, dest, err)
47
+		}
48
+	}
49
+	return nil
50
+}
... ...
@@ -11,6 +11,26 @@ var (
11 11
 	ErrUnsupported      = errors.New("Unsupported method")
12 12
 )
13 13
 
14
+type Mounts []Mount
15
+
16
+func (s Mounts) OfType(t string) Mounts {
17
+	out := Mounts{}
18
+	for _, m := range s {
19
+		if m.Type == t {
20
+			out = append(out, m)
21
+		}
22
+	}
23
+	return out
24
+}
25
+
26
+type Mount struct {
27
+	Type        string `json:"type,omitempty"`
28
+	Source      string `json:"source,omitempty"`      // Source path, in the host namespace
29
+	Destination string `json:"destination,omitempty"` // Destination path, in the container
30
+	Writable    bool   `json:"writable,omitempty"`
31
+	Private     bool   `json:"private,omitempty"`
32
+}
33
+
14 34
 // namespaceList is used to convert the libcontainer types
15 35
 // into the names of the files located in /proc/<pid>/ns/* for
16 36
 // each namespace