Browse code

Move all bind-mounts in the container inside the namespace

This moves bind mounts such as /.dockerinit, /etc/hostname, and volumes
into the container namespace by having the exec driver (lxc or native) set them up.

This is useful to avoid littering the global namespace with a lot of
mounts that are internal to each container and are not generally
needed on the outside. In particular, having a large number of mounts
appears to scale poorly to many containers on systems where the root
filesystem is mounted --rshared.

Note that the "private" option is only supported by the native driver, as
lxc doesn't support setting this. This is not a huge problem, but it does
mean that some mounts are unnecessarily shared inside the container if you're
using the lxc driver.

Docker-DCO-1.1-Signed-off-by: Alexander Larsson <alexl@redhat.com> (github: alexlarsson)

Alexander Larsson authored on 2014/03/04 00:15:29
Showing 11 changed files
... ...
@@ -97,6 +97,13 @@ type Resources struct {
97 97
 	CpuShares  int64 `json:"cpu_shares"`
98 98
 }
99 99
 
100
+type Mount struct {
101
+	Source      string `json:"source"`
102
+	Destination string `json:"destination"`
103
+	Writable    bool   `json:"writable"`
104
+	Private     bool   `json:"private"`
105
+}
106
+
100 107
 // Command wraps an os/exec.Cmd to add more metadata
101 108
 type Command struct {
102 109
 	exec.Cmd `json:"-"`
... ...
@@ -114,6 +121,7 @@ type Command struct {
114 114
 	Network    *Network   `json:"network"` // if network is nil then networking is disabled
115 115
 	Config     []string   `json:"config"`  //  generic values that specific drivers can consume
116 116
 	Resources  *Resources `json:"resources"`
117
+	Mounts     []Mount    `json:"mounts"`
117 118
 
118 119
 	Terminal     Terminal `json:"-"`             // standard or tty terminal
119 120
 	Console      string   `json:"-"`             // dev/console path
... ...
@@ -9,7 +9,7 @@ import (
9 9
 	"path"
10 10
 )
11 11
 
12
-func NewDriver(name, root string, sysInfo *sysinfo.SysInfo) (execdriver.Driver, error) {
12
+func NewDriver(name, root, initPath string, sysInfo *sysinfo.SysInfo) (execdriver.Driver, error) {
13 13
 	switch name {
14 14
 	case "lxc":
15 15
 		// we want to give the lxc driver the full docker root because it needs
... ...
@@ -17,7 +17,7 @@ func NewDriver(name, root string, sysInfo *sysinfo.SysInfo) (execdriver.Driver,
17 17
 		// to be backwards compatible
18 18
 		return lxc.NewDriver(root, sysInfo.AppArmor)
19 19
 	case "native":
20
-		return native.NewDriver(path.Join(root, "execdriver", "native"))
20
+		return native.NewDriver(path.Join(root, "execdriver", "native"), initPath)
21 21
 	}
22 22
 	return nil, fmt.Errorf("unknown exec driver %s", name)
23 23
 }
... ...
@@ -88,6 +88,14 @@ lxc.mount.entry = {{.Console}} {{escapeFstabSpaces $ROOTFS}}/dev/console none bi
88 88
 lxc.mount.entry = devpts {{escapeFstabSpaces $ROOTFS}}/dev/pts devpts newinstance,ptmxmode=0666,nosuid,noexec 0 0
89 89
 lxc.mount.entry = shm {{escapeFstabSpaces $ROOTFS}}/dev/shm tmpfs size=65536k,nosuid,nodev,noexec 0 0
90 90
 
91
+{{range $value := .Mounts}}
92
+{{if $value.Writable}}
93
+lxc.mount.entry = {{$value.Source}} {{escapeFstabSpaces $ROOTFS}}/{{escapeFstabSpaces $value.Destination}} none bind,rw 0 0
94
+{{else}}
95
+lxc.mount.entry = {{$value.Source}} {{escapeFstabSpaces $ROOTFS}}/{{escapeFstabSpaces $value.Destination}} none bind,ro 0 0
96
+{{end}}
97
+{{end}}
98
+
91 99
 {{if .Privileged}}
92 100
 {{if .AppArmor}}
93 101
 lxc.aa_profile = unconfined
... ...
@@ -48,6 +48,10 @@ func createContainer(c *execdriver.Command) *libcontainer.Container {
48 48
 	// check to see if we are running in ramdisk to disable pivot root
49 49
 	container.NoPivotRoot = os.Getenv("DOCKER_RAMDISK") != ""
50 50
 
51
+	for _, m := range c.Mounts {
52
+		container.Mounts = append(container.Mounts, libcontainer.Mount{m.Source, m.Destination, m.Writable, m.Private})
53
+	}
54
+
51 55
 	return container
52 56
 }
53 57
 
... ...
@@ -55,10 +55,11 @@ func init() {
55 55
 }
56 56
 
57 57
 type driver struct {
58
-	root string
58
+	root     string
59
+	initPath string
59 60
 }
60 61
 
61
-func NewDriver(root string) (*driver, error) {
62
+func NewDriver(root, initPath string) (*driver, error) {
62 63
 	if err := os.MkdirAll(root, 0700); err != nil {
63 64
 		return nil, err
64 65
 	}
... ...
@@ -66,7 +67,8 @@ func NewDriver(root string) (*driver, error) {
66 66
 		return nil, err
67 67
 	}
68 68
 	return &driver{
69
-		root: root,
69
+		root:     root,
70
+		initPath: initPath,
70 71
 	}, nil
71 72
 }
72 73
 
... ...
@@ -210,7 +212,7 @@ func (d *dockerCommandFactory) Create(container *libcontainer.Container, console
210 210
 	// we need to join the rootfs because nsinit will setup the rootfs and chroot
211 211
 	initPath := filepath.Join(d.c.Rootfs, d.c.InitPath)
212 212
 
213
-	d.c.Path = initPath
213
+	d.c.Path = d.driver.initPath
214 214
 	d.c.Args = append([]string{
215 215
 		initPath,
216 216
 		"-driver", DriverName,
... ...
@@ -23,6 +23,7 @@ type Container struct {
23 23
 	Networks     []*Network      `json:"networks,omitempty"`      // nil for host's network stack
24 24
 	Cgroups      *cgroups.Cgroup `json:"cgroups,omitempty"`       // cgroups
25 25
 	Context      Context         `json:"context,omitempty"`       // generic context for specific options (apparmor, selinux)
26
+	Mounts       []Mount         `json:"mounts,omitempty"`
26 27
 }
27 28
 
28 29
 // Network defines configuration for a container's networking stack
... ...
@@ -36,3 +37,12 @@ type Network struct {
36 36
 	Gateway string  `json:"gateway,omitempty"`
37 37
 	Mtu     int     `json:"mtu,omitempty"`
38 38
 }
39
+
40
+// Bind mounts from the host system to the container
41
+//
42
+type Mount struct {
43
+	Source      string `json:"source"`      // Source path, in the host namespace
44
+	Destination string `json:"destination"` // Destination path, in the container
45
+	Writable    bool   `json:"writable"`
46
+	Private     bool   `json:"private"`
47
+}
... ...
@@ -51,7 +51,7 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol
51 51
 	if err := system.ParentDeathSignal(); err != nil {
52 52
 		return fmt.Errorf("parent death signal %s", err)
53 53
 	}
54
-	if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs, container.NoPivotRoot); err != nil {
54
+	if err := setupNewMountNamespace(rootfs, container.Mounts, console, container.ReadonlyFs, container.NoPivotRoot); err != nil {
55 55
 		return fmt.Errorf("setup mount namespace %s", err)
56 56
 	}
57 57
 	if err := setupNetwork(container, context); err != nil {
... ...
@@ -4,6 +4,7 @@ package nsinit
4 4
 
5 5
 import (
6 6
 	"fmt"
7
+	"github.com/dotcloud/docker/pkg/libcontainer"
7 8
 	"github.com/dotcloud/docker/pkg/system"
8 9
 	"io/ioutil"
9 10
 	"os"
... ...
@@ -19,7 +20,7 @@ const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NOD
19 19
 //
20 20
 // There is no need to unmount the new mounts because as soon as the mount namespace
21 21
 // is no longer in use, the mounts will be removed automatically
22
-func setupNewMountNamespace(rootfs, console string, readonly, noPivotRoot bool) error {
22
+func setupNewMountNamespace(rootfs string, bindMounts []libcontainer.Mount, console string, readonly, noPivotRoot bool) error {
23 23
 	flag := syscall.MS_PRIVATE
24 24
 	if noPivotRoot {
25 25
 		flag = syscall.MS_SLAVE
... ...
@@ -38,6 +39,23 @@ func setupNewMountNamespace(rootfs, console string, readonly, noPivotRoot bool)
38 38
 	if err := mountSystem(rootfs); err != nil {
39 39
 		return fmt.Errorf("mount system %s", err)
40 40
 	}
41
+
42
+	for _, m := range bindMounts {
43
+		flags := syscall.MS_BIND | syscall.MS_REC
44
+		if !m.Writable {
45
+			flags = flags | syscall.MS_RDONLY
46
+		}
47
+		dest := filepath.Join(rootfs, m.Destination)
48
+		if err := system.Mount(m.Source, dest, "bind", uintptr(flags), ""); err != nil {
49
+			return fmt.Errorf("mounting %s into %s %s", m.Source, dest, err)
50
+		}
51
+		if m.Private {
52
+			if err := system.Mount("", dest, "none", uintptr(syscall.MS_PRIVATE), ""); err != nil {
53
+				return fmt.Errorf("mounting %s private %s", dest, err)
54
+			}
55
+		}
56
+	}
57
+
41 58
 	if err := copyDevNodes(rootfs); err != nil {
42 59
 		return fmt.Errorf("copy dev nodes %s", err)
43 60
 	}
... ...
@@ -529,13 +529,13 @@ func (container *Container) Start() (err error) {
529 529
 		return err
530 530
 	}
531 531
 
532
+	populateCommand(container)
533
+	container.command.Env = env
534
+
532 535
 	if err := mountVolumesForContainer(container, envPath); err != nil {
533 536
 		return err
534 537
 	}
535 538
 
536
-	populateCommand(container)
537
-	container.command.Env = env
538
-
539 539
 	// Setup logging of stdout and stderr to disk
540 540
 	if err := container.runtime.LogToDisk(container.stdout, container.logPath("json"), "stdout"); err != nil {
541 541
 		return err
... ...
@@ -733,7 +733,7 @@ func NewRuntimeFromDirectory(config *daemonconfig.Config, eng *engine.Engine) (*
733 733
 	}
734 734
 
735 735
 	sysInfo := sysinfo.New(false)
736
-	ed, err := execdrivers.NewDriver(config.ExecDriver, config.Root, sysInfo)
736
+	ed, err := execdrivers.NewDriver(config.ExecDriver, config.Root, sysInitPath, sysInfo)
737 737
 	if err != nil {
738 738
 		return nil, err
739 739
 	}
... ...
@@ -3,6 +3,7 @@ package runtime
3 3
 import (
4 4
 	"fmt"
5 5
 	"github.com/dotcloud/docker/archive"
6
+	"github.com/dotcloud/docker/execdriver"
6 7
 	"github.com/dotcloud/docker/pkg/mount"
7 8
 	"github.com/dotcloud/docker/utils"
8 9
 	"io/ioutil"
... ...
@@ -55,70 +56,33 @@ func mountVolumesForContainer(container *Container, envPath string) error {
55 55
 		return err
56 56
 	}
57 57
 
58
-	// Mount docker specific files into the containers root fs
59
-	if err := mount.Mount(runtime.sysInitPath, filepath.Join(root, "/.dockerinit"), "none", "bind,ro"); err != nil {
60
-		return err
61
-	}
62
-	if err := mount.Mount(envPath, filepath.Join(root, "/.dockerenv"), "none", "bind,ro"); err != nil {
63
-		return err
64
-	}
65
-	if err := mount.Mount(container.ResolvConfPath, filepath.Join(root, "/etc/resolv.conf"), "none", "bind,ro"); err != nil {
66
-		return err
58
+	mounts := []execdriver.Mount{
59
+		{runtime.sysInitPath, "/.dockerinit", false, true},
60
+		{envPath, "/.dockerenv", false, true},
61
+		{container.ResolvConfPath, "/etc/resolv.conf", false, true},
67 62
 	}
68 63
 
69 64
 	if container.HostnamePath != "" && container.HostsPath != "" {
70
-		if err := mount.Mount(container.HostnamePath, filepath.Join(root, "/etc/hostname"), "none", "bind,ro"); err != nil {
71
-			return err
72
-		}
73
-		if err := mount.Mount(container.HostsPath, filepath.Join(root, "/etc/hosts"), "none", "bind,ro"); err != nil {
74
-			return err
75
-		}
65
+		mounts = append(mounts, execdriver.Mount{container.HostnamePath, "/etc/hostname", false, true})
66
+		mounts = append(mounts, execdriver.Mount{container.HostsPath, "/etc/hosts", false, true})
76 67
 	}
77 68
 
78 69
 	// Mount user specified volumes
70
+	// Note, these are not private because you may want propagation of (un)mounts from host
71
+	// volumes. For instance if you use -v /usr:/usr and the host later mounts /usr/share you
72
+	// want this new mount in the container
79 73
 	for r, v := range container.Volumes {
80
-		mountAs := "ro"
81
-		if container.VolumesRW[r] {
82
-			mountAs = "rw"
83
-		}
74
+		mounts = append(mounts, execdriver.Mount{v, r, container.VolumesRW[r], false})
75
+	}
84 76
 
85
-		r = filepath.Join(root, r)
86
-		if p, err := utils.FollowSymlinkInScope(r, root); err != nil {
87
-			return err
88
-		} else {
89
-			r = p
90
-		}
77
+	container.command.Mounts = mounts
91 78
 
92
-		if err := mount.Mount(v, r, "none", fmt.Sprintf("bind,%s", mountAs)); err != nil {
93
-			return err
94
-		}
95
-	}
96 79
 	return nil
97 80
 }
98 81
 
99 82
 func unmountVolumesForContainer(container *Container) {
100
-	var (
101
-		root   = container.RootfsPath()
102
-		mounts = []string{
103
-			root,
104
-			filepath.Join(root, "/.dockerinit"),
105
-			filepath.Join(root, "/.dockerenv"),
106
-			filepath.Join(root, "/etc/resolv.conf"),
107
-		}
108
-	)
109
-
110
-	if container.HostnamePath != "" && container.HostsPath != "" {
111
-		mounts = append(mounts, filepath.Join(root, "/etc/hostname"), filepath.Join(root, "/etc/hosts"))
112
-	}
113
-
114
-	for r := range container.Volumes {
115
-		mounts = append(mounts, filepath.Join(root, r))
116
-	}
117
-
118
-	for i := len(mounts) - 1; i >= 0; i-- {
119
-		if lastError := mount.Unmount(mounts[i]); lastError != nil {
120
-			log.Printf("Failed to umount %v: %v", mounts[i], lastError)
121
-		}
83
+	if err := mount.Unmount(container.RootfsPath()); err != nil {
84
+		log.Printf("Failed to umount container: %v", err)
122 85
 	}
123 86
 }
124 87