Browse code

Merge pull request #4327 from crosbymichael/add-libcontainer

Add native execution driver to docker and make it the default

Guillaume J. Charmes authored on 2014/03/04 09:34:20
Showing 43 changed files
... ...
@@ -532,6 +532,7 @@ func (container *Container) Start() (err error) {
532 532
 	}
533 533
 
534 534
 	populateCommand(container)
535
+	container.command.Env = env
535 536
 
536 537
 	// Setup logging of stdout and stderr to disk
537 538
 	if err := container.runtime.LogToDisk(container.stdout, container.logPath("json"), "stdout"); err != nil {
... ...
@@ -17,7 +17,7 @@ import (
17 17
 )
18 18
 
19 19
 func main() {
20
-	if selfPath := utils.SelfPath(); selfPath == "/sbin/init" || selfPath == "/.dockerinit" {
20
+	if selfPath := utils.SelfPath(); strings.Contains(selfPath, ".dockerinit") {
21 21
 		// Running in init mode
22 22
 		sysinit.SysInit()
23 23
 		return
... ...
@@ -39,7 +39,7 @@ func main() {
39 39
 		flDefaultIp          = flag.String([]string{"#ip", "-ip"}, "0.0.0.0", "Default IP address to use when binding container ports")
40 40
 		flInterContainerComm = flag.Bool([]string{"#icc", "-icc"}, true, "Enable inter-container communication")
41 41
 		flGraphDriver        = flag.String([]string{"s", "-storage-driver"}, "", "Force the docker runtime to use a specific storage driver")
42
-		flExecDriver         = flag.String([]string{"e", "-exec-driver"}, "", "Force the docker runtime to use a specific exec driver")
42
+		flExecDriver         = flag.String([]string{"e", "-exec-driver"}, "native", "Force the docker runtime to use a specific exec driver")
43 43
 		flHosts              = opts.NewListOpts(api.ValidateHost)
44 44
 		flMtu                = flag.Int([]string{"#mtu", "-mtu"}, 0, "Set the containers network MTU; if no value is provided: default to the default route MTU or 1500 if no default route is available")
45 45
 	)
46 46
deleted file mode 100644
... ...
@@ -1,101 +0,0 @@
1
-package chroot
2
-
3
-import (
4
-	"fmt"
5
-	"github.com/dotcloud/docker/execdriver"
6
-	"github.com/dotcloud/docker/pkg/mount"
7
-	"os"
8
-	"os/exec"
9
-	"syscall"
10
-)
11
-
12
-const (
13
-	DriverName = "chroot"
14
-	Version    = "0.1"
15
-)
16
-
17
-func init() {
18
-	execdriver.RegisterInitFunc(DriverName, func(args *execdriver.InitArgs) error {
19
-		if err := mount.ForceMount("proc", "proc", "proc", ""); err != nil {
20
-			return err
21
-		}
22
-		defer mount.ForceUnmount("proc")
23
-		cmd := exec.Command(args.Args[0], args.Args[1:]...)
24
-
25
-		cmd.Stderr = os.Stderr
26
-		cmd.Stdout = os.Stdout
27
-		cmd.Stdin = os.Stdin
28
-
29
-		return cmd.Run()
30
-	})
31
-}
32
-
33
-type driver struct {
34
-}
35
-
36
-func NewDriver() (*driver, error) {
37
-	return &driver{}, nil
38
-}
39
-
40
-func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallback execdriver.StartCallback) (int, error) {
41
-	params := []string{
42
-		"chroot",
43
-		c.Rootfs,
44
-		"/.dockerinit",
45
-		"-driver",
46
-		DriverName,
47
-	}
48
-	params = append(params, c.Entrypoint)
49
-	params = append(params, c.Arguments...)
50
-
51
-	var (
52
-		name = params[0]
53
-		arg  = params[1:]
54
-	)
55
-	aname, err := exec.LookPath(name)
56
-	if err != nil {
57
-		aname = name
58
-	}
59
-	c.Path = aname
60
-	c.Args = append([]string{name}, arg...)
61
-
62
-	if err := c.Start(); err != nil {
63
-		return -1, err
64
-	}
65
-
66
-	if startCallback != nil {
67
-		startCallback(c)
68
-	}
69
-
70
-	err = c.Wait()
71
-	return getExitCode(c), err
72
-}
73
-
74
-/// Return the exit code of the process
75
-// if the process has not exited -1 will be returned
76
-func getExitCode(c *execdriver.Command) int {
77
-	if c.ProcessState == nil {
78
-		return -1
79
-	}
80
-	return c.ProcessState.Sys().(syscall.WaitStatus).ExitStatus()
81
-}
82
-
83
-func (d *driver) Kill(p *execdriver.Command, sig int) error {
84
-	return p.Process.Kill()
85
-}
86
-
87
-func (d *driver) Restore(c *execdriver.Command) error {
88
-	panic("Not Implemented")
89
-}
90
-
91
-func (d *driver) Info(id string) execdriver.Info {
92
-	panic("Not implemented")
93
-}
94
-
95
-func (d *driver) Name() string {
96
-	return fmt.Sprintf("%s-%s", DriverName, Version)
97
-}
98
-
99
-func (d *driver) GetPidsForContainer(id string) ([]int, error) {
100
-	return nil, fmt.Errorf("Not supported")
101
-}
... ...
@@ -51,6 +51,9 @@ type InitArgs struct {
51 51
 	Args       []string
52 52
 	Mtu        int
53 53
 	Driver     string
54
+	Console    string
55
+	Pipe       int
56
+	Root       string
54 57
 }
55 58
 
56 59
 // Driver specific information based on
57 60
new file mode 100644
... ...
@@ -0,0 +1,82 @@
0
+package native
1
+
2
+import (
3
+	"fmt"
4
+	"github.com/dotcloud/docker/execdriver"
5
+	"github.com/dotcloud/docker/pkg/cgroups"
6
+	"github.com/dotcloud/docker/pkg/libcontainer"
7
+)
8
+
9
+// createContainer populates and configures the container type with the
10
+// data provided by the execdriver.Command
11
+func createContainer(c *execdriver.Command) *libcontainer.Container {
12
+	container := getDefaultTemplate()
13
+
14
+	container.Hostname = getEnv("HOSTNAME", c.Env)
15
+	container.Tty = c.Tty
16
+	container.User = c.User
17
+	container.WorkingDir = c.WorkingDir
18
+	container.Env = c.Env
19
+
20
+	if c.Network != nil {
21
+		container.Networks = []*libcontainer.Network{
22
+			{
23
+				Mtu:     c.Network.Mtu,
24
+				Address: fmt.Sprintf("%s/%d", c.Network.IPAddress, c.Network.IPPrefixLen),
25
+				Gateway: c.Network.Gateway,
26
+				Type:    "veth",
27
+				Context: libcontainer.Context{
28
+					"prefix": "dock",
29
+					"bridge": c.Network.Bridge,
30
+				},
31
+			},
32
+		}
33
+	}
34
+
35
+	container.Cgroups.Name = c.ID
36
+	if c.Privileged {
37
+		container.Capabilities = nil
38
+		container.Cgroups.DeviceAccess = true
39
+	}
40
+	if c.Resources != nil {
41
+		container.Cgroups.CpuShares = c.Resources.CpuShares
42
+		container.Cgroups.Memory = c.Resources.Memory
43
+		container.Cgroups.MemorySwap = c.Resources.MemorySwap
44
+	}
45
+	return container
46
+}
47
+
48
+// getDefaultTemplate returns the docker default for
49
+// the libcontainer configuration file
50
+func getDefaultTemplate() *libcontainer.Container {
51
+	return &libcontainer.Container{
52
+		Capabilities: libcontainer.Capabilities{
53
+			libcontainer.GetCapability("SETPCAP"),
54
+			libcontainer.GetCapability("SYS_MODULE"),
55
+			libcontainer.GetCapability("SYS_RAWIO"),
56
+			libcontainer.GetCapability("SYS_PACCT"),
57
+			libcontainer.GetCapability("SYS_ADMIN"),
58
+			libcontainer.GetCapability("SYS_NICE"),
59
+			libcontainer.GetCapability("SYS_RESOURCE"),
60
+			libcontainer.GetCapability("SYS_TIME"),
61
+			libcontainer.GetCapability("SYS_TTY_CONFIG"),
62
+			libcontainer.GetCapability("MKNOD"),
63
+			libcontainer.GetCapability("AUDIT_WRITE"),
64
+			libcontainer.GetCapability("AUDIT_CONTROL"),
65
+			libcontainer.GetCapability("MAC_OVERRIDE"),
66
+			libcontainer.GetCapability("MAC_ADMIN"),
67
+			libcontainer.GetCapability("NET_ADMIN"),
68
+		},
69
+		Namespaces: libcontainer.Namespaces{
70
+			libcontainer.GetNamespace("NEWNS"),
71
+			libcontainer.GetNamespace("NEWUTS"),
72
+			libcontainer.GetNamespace("NEWIPC"),
73
+			libcontainer.GetNamespace("NEWPID"),
74
+			libcontainer.GetNamespace("NEWNET"),
75
+		},
76
+		Cgroups: &cgroups.Cgroup{
77
+			Parent:       "docker",
78
+			DeviceAccess: false,
79
+		},
80
+	}
81
+}
0 82
new file mode 100644
... ...
@@ -0,0 +1,277 @@
0
+package native
1
+
2
+import (
3
+	"encoding/json"
4
+	"fmt"
5
+	"github.com/dotcloud/docker/execdriver"
6
+	"github.com/dotcloud/docker/pkg/cgroups"
7
+	"github.com/dotcloud/docker/pkg/libcontainer"
8
+	"github.com/dotcloud/docker/pkg/libcontainer/nsinit"
9
+	"github.com/dotcloud/docker/pkg/system"
10
+	"io/ioutil"
11
+	"os"
12
+	"os/exec"
13
+	"path/filepath"
14
+	"strconv"
15
+	"strings"
16
+	"syscall"
17
+	"time"
18
+)
19
+
20
+const (
21
+	DriverName = "native"
22
+	Version    = "0.1"
23
+)
24
+
25
+func init() {
26
+	execdriver.RegisterInitFunc(DriverName, func(args *execdriver.InitArgs) error {
27
+		var (
28
+			container *libcontainer.Container
29
+			ns        = nsinit.NewNsInit(&nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{args.Root})
30
+		)
31
+		f, err := os.Open(filepath.Join(args.Root, "container.json"))
32
+		if err != nil {
33
+			return err
34
+		}
35
+		if err := json.NewDecoder(f).Decode(&container); err != nil {
36
+			f.Close()
37
+			return err
38
+		}
39
+		f.Close()
40
+
41
+		cwd, err := os.Getwd()
42
+		if err != nil {
43
+			return err
44
+		}
45
+		syncPipe, err := nsinit.NewSyncPipeFromFd(0, uintptr(args.Pipe))
46
+		if err != nil {
47
+			return err
48
+		}
49
+		if err := ns.Init(container, cwd, args.Console, syncPipe, args.Args); err != nil {
50
+			return err
51
+		}
52
+		return nil
53
+	})
54
+}
55
+
56
+type driver struct {
57
+	root string
58
+}
59
+
60
+func NewDriver(root string) (*driver, error) {
61
+	if err := os.MkdirAll(root, 0700); err != nil {
62
+		return nil, err
63
+	}
64
+	return &driver{
65
+		root: root,
66
+	}, nil
67
+}
68
+
69
+func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallback execdriver.StartCallback) (int, error) {
70
+	if err := d.validateCommand(c); err != nil {
71
+		return -1, err
72
+	}
73
+	var (
74
+		term        nsinit.Terminal
75
+		container   = createContainer(c)
76
+		factory     = &dockerCommandFactory{c: c, driver: d}
77
+		stateWriter = &dockerStateWriter{
78
+			callback: startCallback,
79
+			c:        c,
80
+			dsw:      &nsinit.DefaultStateWriter{filepath.Join(d.root, c.ID)},
81
+		}
82
+		ns   = nsinit.NewNsInit(factory, stateWriter)
83
+		args = append([]string{c.Entrypoint}, c.Arguments...)
84
+	)
85
+	if err := d.createContainerRoot(c.ID); err != nil {
86
+		return -1, err
87
+	}
88
+	defer d.removeContainerRoot(c.ID)
89
+
90
+	if c.Tty {
91
+		term = &dockerTtyTerm{
92
+			pipes: pipes,
93
+		}
94
+	} else {
95
+		term = &dockerStdTerm{
96
+			pipes: pipes,
97
+		}
98
+	}
99
+	c.Terminal = term
100
+	if err := d.writeContainerFile(container, c.ID); err != nil {
101
+		return -1, err
102
+	}
103
+	return ns.Exec(container, term, args)
104
+}
105
+
106
+func (d *driver) Kill(p *execdriver.Command, sig int) error {
107
+	return syscall.Kill(p.Process.Pid, syscall.Signal(sig))
108
+}
109
+
110
+func (d *driver) Restore(c *execdriver.Command) error {
111
+	var nspid int
112
+	f, err := os.Open(filepath.Join(d.root, c.ID, "pid"))
113
+	if err != nil {
114
+		return err
115
+	}
116
+	defer d.removeContainerRoot(c.ID)
117
+
118
+	if _, err := fmt.Fscanf(f, "%d", &nspid); err != nil {
119
+		f.Close()
120
+		return err
121
+	}
122
+	f.Close()
123
+
124
+	if _, err := os.FindProcess(nspid); err != nil {
125
+		return fmt.Errorf("finding existing pid %d %s", nspid, err)
126
+	}
127
+	c.Process = &os.Process{
128
+		Pid: nspid,
129
+	}
130
+	ticker := time.NewTicker(500 * time.Millisecond)
131
+	defer ticker.Stop()
132
+
133
+	for _ = range ticker.C {
134
+		if err := syscall.Kill(nspid, 0); err != nil {
135
+			if strings.Contains(err.Error(), "no such process") {
136
+				return nil
137
+			}
138
+			return fmt.Errorf("signal error %s", err)
139
+		}
140
+	}
141
+	return nil
142
+}
143
+
144
+func (d *driver) Info(id string) execdriver.Info {
145
+	return &info{
146
+		ID:     id,
147
+		driver: d,
148
+	}
149
+}
150
+
151
+func (d *driver) Name() string {
152
+	return fmt.Sprintf("%s-%s", DriverName, Version)
153
+}
154
+
155
+// TODO: this can be improved with our driver
156
+// there has to be a better way to do this
157
+func (d *driver) GetPidsForContainer(id string) ([]int, error) {
158
+	pids := []int{}
159
+
160
+	subsystem := "devices"
161
+	cgroupRoot, err := cgroups.FindCgroupMountpoint(subsystem)
162
+	if err != nil {
163
+		return pids, err
164
+	}
165
+	cgroupDir, err := cgroups.GetThisCgroupDir(subsystem)
166
+	if err != nil {
167
+		return pids, err
168
+	}
169
+
170
+	filename := filepath.Join(cgroupRoot, cgroupDir, id, "tasks")
171
+	if _, err := os.Stat(filename); os.IsNotExist(err) {
172
+		filename = filepath.Join(cgroupRoot, cgroupDir, "docker", id, "tasks")
173
+	}
174
+
175
+	output, err := ioutil.ReadFile(filename)
176
+	if err != nil {
177
+		return pids, err
178
+	}
179
+	for _, p := range strings.Split(string(output), "\n") {
180
+		if len(p) == 0 {
181
+			continue
182
+		}
183
+		pid, err := strconv.Atoi(p)
184
+		if err != nil {
185
+			return pids, fmt.Errorf("Invalid pid '%s': %s", p, err)
186
+		}
187
+		pids = append(pids, pid)
188
+	}
189
+	return pids, nil
190
+}
191
+
192
+func (d *driver) writeContainerFile(container *libcontainer.Container, id string) error {
193
+	data, err := json.Marshal(container)
194
+	if err != nil {
195
+		return err
196
+	}
197
+	return ioutil.WriteFile(filepath.Join(d.root, id, "container.json"), data, 0655)
198
+}
199
+
200
+func (d *driver) createContainerRoot(id string) error {
201
+	return os.MkdirAll(filepath.Join(d.root, id), 0655)
202
+}
203
+
204
+func (d *driver) removeContainerRoot(id string) error {
205
+	return os.RemoveAll(filepath.Join(d.root, id))
206
+}
207
+
208
+func (d *driver) validateCommand(c *execdriver.Command) error {
209
+	// we need to check the Config of the command to make sure that we
210
+	// do not have any of the lxc-conf variables
211
+	for _, conf := range c.Config {
212
+		if strings.Contains(conf, "lxc") {
213
+			return fmt.Errorf("%s is not supported by the native driver", conf)
214
+		}
215
+	}
216
+	return nil
217
+}
218
+
219
+func getEnv(key string, env []string) string {
220
+	for _, pair := range env {
221
+		parts := strings.Split(pair, "=")
222
+		if parts[0] == key {
223
+			return parts[1]
224
+		}
225
+	}
226
+	return ""
227
+}
228
+
229
+type dockerCommandFactory struct {
230
+	c      *execdriver.Command
231
+	driver *driver
232
+}
233
+
234
+// createCommand will return an exec.Cmd with the Cloneflags set to the proper namespaces
235
+// defined on the container's configuration and use the current binary as the init with the
236
+// args provided
237
+func (d *dockerCommandFactory) Create(container *libcontainer.Container, console string, syncFd uintptr, args []string) *exec.Cmd {
238
+	// we need to join the rootfs because nsinit will setup the rootfs and chroot
239
+	initPath := filepath.Join(d.c.Rootfs, d.c.InitPath)
240
+
241
+	d.c.Path = initPath
242
+	d.c.Args = append([]string{
243
+		initPath,
244
+		"-driver", DriverName,
245
+		"-console", console,
246
+		"-pipe", fmt.Sprint(syncFd),
247
+		"-root", filepath.Join(d.driver.root, d.c.ID),
248
+	}, args...)
249
+
250
+	// set this to nil so that when we set the clone flags anything else is reset
251
+	d.c.SysProcAttr = nil
252
+	system.SetCloneFlags(&d.c.Cmd, uintptr(nsinit.GetNamespaceFlags(container.Namespaces)))
253
+
254
+	d.c.Env = container.Env
255
+	d.c.Dir = d.c.Rootfs
256
+
257
+	return &d.c.Cmd
258
+}
259
+
260
+type dockerStateWriter struct {
261
+	dsw      nsinit.StateWriter
262
+	c        *execdriver.Command
263
+	callback execdriver.StartCallback
264
+}
265
+
266
+func (d *dockerStateWriter) WritePid(pid int) error {
267
+	err := d.dsw.WritePid(pid)
268
+	if d.callback != nil {
269
+		d.callback(d.c)
270
+	}
271
+	return err
272
+}
273
+
274
+func (d *dockerStateWriter) DeletePid() error {
275
+	return d.dsw.DeletePid()
276
+}
0 277
new file mode 100644
... ...
@@ -0,0 +1,21 @@
0
+package native
1
+
2
+import (
3
+	"os"
4
+	"path/filepath"
5
+)
6
+
7
+type info struct {
8
+	ID     string
9
+	driver *driver
10
+}
11
+
12
+// IsRunning is determined by looking for the
13
+// pid file for a container.  If the file exists then the
14
+// container is currently running
15
+func (i *info) IsRunning() bool {
16
+	if _, err := os.Stat(filepath.Join(i.driver.root, i.ID, "pid")); err == nil {
17
+		return true
18
+	}
19
+	return false
20
+}
0 21
new file mode 100644
... ...
@@ -0,0 +1,42 @@
0
+/*
1
+   These types are wrappers around the libcontainer Terminal interface so that
2
+   we can reuse the docker implementations where possible.
3
+*/
4
+package native
5
+
6
+import (
7
+	"github.com/dotcloud/docker/execdriver"
8
+	"io"
9
+	"os"
10
+	"os/exec"
11
+)
12
+
13
+type dockerStdTerm struct {
14
+	execdriver.StdConsole
15
+	pipes *execdriver.Pipes
16
+}
17
+
18
+func (d *dockerStdTerm) Attach(cmd *exec.Cmd) error {
19
+	return d.AttachPipes(cmd, d.pipes)
20
+}
21
+
22
+func (d *dockerStdTerm) SetMaster(master *os.File) {
23
+	// do nothing
24
+}
25
+
26
+type dockerTtyTerm struct {
27
+	execdriver.TtyConsole
28
+	pipes *execdriver.Pipes
29
+}
30
+
31
+func (t *dockerTtyTerm) Attach(cmd *exec.Cmd) error {
32
+	go io.Copy(t.pipes.Stdout, t.MasterPty)
33
+	if t.pipes.Stdin != nil {
34
+		go io.Copy(t.MasterPty, t.pipes.Stdin)
35
+	}
36
+	return nil
37
+}
38
+
39
+func (t *dockerTtyTerm) SetMaster(master *os.File) {
40
+	t.MasterPty = master
41
+}
... ...
@@ -5,6 +5,7 @@ import (
5 5
 	"github.com/kr/pty"
6 6
 	"io"
7 7
 	"os"
8
+	"os/exec"
8 9
 )
9 10
 
10 11
 func SetTerminal(command *Command, pipes *Pipes) error {
... ...
@@ -25,8 +26,8 @@ func SetTerminal(command *Command, pipes *Pipes) error {
25 25
 }
26 26
 
27 27
 type TtyConsole struct {
28
-	master *os.File
29
-	slave  *os.File
28
+	MasterPty *os.File
29
+	SlavePty  *os.File
30 30
 }
31 31
 
32 32
 func NewTtyConsole(command *Command, pipes *Pipes) (*TtyConsole, error) {
... ...
@@ -35,28 +36,28 @@ func NewTtyConsole(command *Command, pipes *Pipes) (*TtyConsole, error) {
35 35
 		return nil, err
36 36
 	}
37 37
 	tty := &TtyConsole{
38
-		master: ptyMaster,
39
-		slave:  ptySlave,
38
+		MasterPty: ptyMaster,
39
+		SlavePty:  ptySlave,
40 40
 	}
41
-	if err := tty.attach(command, pipes); err != nil {
41
+	if err := tty.AttachPipes(&command.Cmd, pipes); err != nil {
42 42
 		tty.Close()
43 43
 		return nil, err
44 44
 	}
45
+	command.Console = tty.SlavePty.Name()
45 46
 	return tty, nil
46 47
 }
47 48
 
48 49
 func (t *TtyConsole) Master() *os.File {
49
-	return t.master
50
+	return t.MasterPty
50 51
 }
51 52
 
52 53
 func (t *TtyConsole) Resize(h, w int) error {
53
-	return term.SetWinsize(t.master.Fd(), &term.Winsize{Height: uint16(h), Width: uint16(w)})
54
+	return term.SetWinsize(t.MasterPty.Fd(), &term.Winsize{Height: uint16(h), Width: uint16(w)})
54 55
 }
55 56
 
56
-func (t *TtyConsole) attach(command *Command, pipes *Pipes) error {
57
-	command.Stdout = t.slave
58
-	command.Stderr = t.slave
59
-	command.Console = t.slave.Name()
57
+func (t *TtyConsole) AttachPipes(command *exec.Cmd, pipes *Pipes) error {
58
+	command.Stdout = t.SlavePty
59
+	command.Stderr = t.SlavePty
60 60
 
61 61
 	go func() {
62 62
 		if wb, ok := pipes.Stdout.(interface {
... ...
@@ -64,24 +65,24 @@ func (t *TtyConsole) attach(command *Command, pipes *Pipes) error {
64 64
 		}); ok {
65 65
 			defer wb.CloseWriters()
66 66
 		}
67
-		io.Copy(pipes.Stdout, t.master)
67
+		io.Copy(pipes.Stdout, t.MasterPty)
68 68
 	}()
69 69
 
70 70
 	if pipes.Stdin != nil {
71
-		command.Stdin = t.slave
71
+		command.Stdin = t.SlavePty
72 72
 		command.SysProcAttr.Setctty = true
73 73
 
74 74
 		go func() {
75 75
 			defer pipes.Stdin.Close()
76
-			io.Copy(t.master, pipes.Stdin)
76
+			io.Copy(t.MasterPty, pipes.Stdin)
77 77
 		}()
78 78
 	}
79 79
 	return nil
80 80
 }
81 81
 
82 82
 func (t *TtyConsole) Close() error {
83
-	t.slave.Close()
84
-	return t.master.Close()
83
+	t.SlavePty.Close()
84
+	return t.MasterPty.Close()
85 85
 }
86 86
 
87 87
 type StdConsole struct {
... ...
@@ -90,13 +91,13 @@ type StdConsole struct {
90 90
 func NewStdConsole(command *Command, pipes *Pipes) (*StdConsole, error) {
91 91
 	std := &StdConsole{}
92 92
 
93
-	if err := std.attach(command, pipes); err != nil {
93
+	if err := std.AttachPipes(&command.Cmd, pipes); err != nil {
94 94
 		return nil, err
95 95
 	}
96 96
 	return std, nil
97 97
 }
98 98
 
99
-func (s *StdConsole) attach(command *Command, pipes *Pipes) error {
99
+func (s *StdConsole) AttachPipes(command *exec.Cmd, pipes *Pipes) error {
100 100
 	command.Stdout = pipes.Stdout
101 101
 	command.Stderr = pipes.Stderr
102 102
 
... ...
@@ -1044,7 +1044,6 @@ func TestEnv(t *testing.T) {
1044 1044
 	goodEnv := []string{
1045 1045
 		"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1046 1046
 		"HOME=/",
1047
-		"container=lxc",
1048 1047
 		"HOSTNAME=" + utils.TruncateID(container.ID),
1049 1048
 		"FALSE=true",
1050 1049
 		"TRUE=false",
... ...
@@ -1581,8 +1580,8 @@ func TestPrivilegedCanMknod(t *testing.T) {
1581 1581
 	eng := NewTestEngine(t)
1582 1582
 	runtime := mkRuntimeFromEngine(eng, t)
1583 1583
 	defer runtime.Nuke()
1584
-	if output, _ := runContainer(eng, runtime, []string{"-privileged", "_", "sh", "-c", "mknod /tmp/sda b 8 0 && echo ok"}, t); output != "ok\n" {
1585
-		t.Fatal("Could not mknod into privileged container")
1584
+	if output, err := runContainer(eng, runtime, []string{"-privileged", "_", "sh", "-c", "mknod /tmp/sda b 8 0 && echo ok"}, t); output != "ok\n" {
1585
+		t.Fatalf("Could not mknod into privileged container %s %v", output, err)
1586 1586
 	}
1587 1587
 }
1588 1588
 
... ...
@@ -85,7 +85,7 @@ func init() {
85 85
 	os.Setenv("TEST", "1")
86 86
 
87 87
 	// Hack to run sys init during unit testing
88
-	if selfPath := utils.SelfPath(); selfPath == "/sbin/init" || selfPath == "/.dockerinit" {
88
+	if selfPath := utils.SelfPath(); strings.Contains(selfPath, ".dockerinit") {
89 89
 		sysinit.SysInit()
90 90
 		return
91 91
 	}
... ...
@@ -190,6 +190,7 @@ func newTestEngine(t utils.Fataler, autorestart bool, root string) *engine.Engin
190 190
 	job := eng.Job("initserver")
191 191
 	job.Setenv("Root", root)
192 192
 	job.SetenvBool("AutoRestart", autorestart)
193
+	job.Setenv("ExecDriver", "native")
193 194
 	// TestGetEnabledCors and TestOptionsRoute require EnableCors=true
194 195
 	job.SetenvBool("EnableCors", true)
195 196
 	if err := job.Run(); err != nil {
... ...
@@ -5,10 +5,23 @@ import (
5 5
 	"fmt"
6 6
 	"github.com/dotcloud/docker/pkg/mount"
7 7
 	"io"
8
+	"io/ioutil"
8 9
 	"os"
10
+	"path/filepath"
11
+	"strconv"
9 12
 	"strings"
10 13
 )
11 14
 
15
+type Cgroup struct {
16
+	Name   string `json:"name,omitempty"`
17
+	Parent string `json:"parent,omitempty"`
18
+
19
+	DeviceAccess bool  `json:"device_access,omitempty"` // name of parent cgroup or slice
20
+	Memory       int64 `json:"memory,omitempty"`        // Memory limit (in bytes)
21
+	MemorySwap   int64 `json:"memory_swap,omitempty"`   // Total memory usage (memory + swap); set `-1' to disable swap
22
+	CpuShares    int64 `json:"cpu_shares,omitempty"`    // CPU shares (relative weight vs. other containers)
23
+}
24
+
12 25
 // https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt
13 26
 func FindCgroupMountpoint(subsystem string) (string, error) {
14 27
 	mounts, err := mount.GetMounts()
... ...
@@ -25,7 +38,6 @@ func FindCgroupMountpoint(subsystem string) (string, error) {
25 25
 			}
26 26
 		}
27 27
 	}
28
-
29 28
 	return "", fmt.Errorf("cgroup mountpoint not found for %s", subsystem)
30 29
 }
31 30
 
... ...
@@ -40,18 +52,199 @@ func GetThisCgroupDir(subsystem string) (string, error) {
40 40
 	return parseCgroupFile(subsystem, f)
41 41
 }
42 42
 
43
+func GetInitCgroupDir(subsystem string) (string, error) {
44
+	f, err := os.Open("/proc/1/cgroup")
45
+	if err != nil {
46
+		return "", err
47
+	}
48
+	defer f.Close()
49
+
50
+	return parseCgroupFile(subsystem, f)
51
+}
52
+
53
+func (c *Cgroup) Path(root, subsystem string) (string, error) {
54
+	cgroup := c.Name
55
+	if c.Parent != "" {
56
+		cgroup = filepath.Join(c.Parent, cgroup)
57
+	}
58
+	initPath, err := GetInitCgroupDir(subsystem)
59
+	if err != nil {
60
+		return "", err
61
+	}
62
+	return filepath.Join(root, subsystem, initPath, cgroup), nil
63
+}
64
+
65
+func (c *Cgroup) Join(root, subsystem string, pid int) (string, error) {
66
+	path, err := c.Path(root, subsystem)
67
+	if err != nil {
68
+		return "", err
69
+	}
70
+	if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) {
71
+		return "", err
72
+	}
73
+	if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil {
74
+		return "", err
75
+	}
76
+	return path, nil
77
+}
78
+
79
+func (c *Cgroup) Cleanup(root string) error {
80
+	get := func(subsystem string) string {
81
+		path, _ := c.Path(root, subsystem)
82
+		return path
83
+	}
84
+
85
+	for _, path := range []string{
86
+		get("memory"),
87
+		get("devices"),
88
+		get("cpu"),
89
+	} {
90
+		os.RemoveAll(path)
91
+	}
92
+	return nil
93
+}
94
+
43 95
 func parseCgroupFile(subsystem string, r io.Reader) (string, error) {
44 96
 	s := bufio.NewScanner(r)
45
-
46 97
 	for s.Scan() {
47 98
 		if err := s.Err(); err != nil {
48 99
 			return "", err
49 100
 		}
50 101
 		text := s.Text()
51 102
 		parts := strings.Split(text, ":")
52
-		if parts[1] == subsystem {
53
-			return parts[2], nil
103
+		for _, subs := range strings.Split(parts[1], ",") {
104
+			if subs == subsystem {
105
+				return parts[2], nil
106
+			}
54 107
 		}
55 108
 	}
56 109
 	return "", fmt.Errorf("cgroup '%s' not found in /proc/self/cgroup", subsystem)
57 110
 }
111
+
112
+func writeFile(dir, file, data string) error {
113
+	return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
114
+}
115
+
116
+func (c *Cgroup) Apply(pid int) error {
117
+	// We have two implementation of cgroups support, one is based on
118
+	// systemd and the dbus api, and one is based on raw cgroup fs operations
119
+	// following the pre-single-writer model docs at:
120
+	// http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/
121
+	//
122
+	// we can pick any subsystem to find the root
123
+	cgroupRoot, err := FindCgroupMountpoint("cpu")
124
+	if err != nil {
125
+		return err
126
+	}
127
+	cgroupRoot = filepath.Dir(cgroupRoot)
128
+
129
+	if _, err := os.Stat(cgroupRoot); err != nil {
130
+		return fmt.Errorf("cgroups fs not found")
131
+	}
132
+	if err := c.setupDevices(cgroupRoot, pid); err != nil {
133
+		return err
134
+	}
135
+	if err := c.setupMemory(cgroupRoot, pid); err != nil {
136
+		return err
137
+	}
138
+	if err := c.setupCpu(cgroupRoot, pid); err != nil {
139
+		return err
140
+	}
141
+	return nil
142
+}
143
+
144
+func (c *Cgroup) setupDevices(cgroupRoot string, pid int) (err error) {
145
+	if !c.DeviceAccess {
146
+		dir, err := c.Join(cgroupRoot, "devices", pid)
147
+		if err != nil {
148
+			return err
149
+		}
150
+
151
+		defer func() {
152
+			if err != nil {
153
+				os.RemoveAll(dir)
154
+			}
155
+		}()
156
+
157
+		if err := writeFile(dir, "devices.deny", "a"); err != nil {
158
+			return err
159
+		}
160
+
161
+		allow := []string{
162
+			// /dev/null, zero, full
163
+			"c 1:3 rwm",
164
+			"c 1:5 rwm",
165
+			"c 1:7 rwm",
166
+
167
+			// consoles
168
+			"c 5:1 rwm",
169
+			"c 5:0 rwm",
170
+			"c 4:0 rwm",
171
+			"c 4:1 rwm",
172
+
173
+			// /dev/urandom,/dev/random
174
+			"c 1:9 rwm",
175
+			"c 1:8 rwm",
176
+
177
+			// /dev/pts/ - pts namespaces are "coming soon"
178
+			"c 136:* rwm",
179
+			"c 5:2 rwm",
180
+
181
+			// tuntap
182
+			"c 10:200 rwm",
183
+		}
184
+
185
+		for _, val := range allow {
186
+			if err := writeFile(dir, "devices.allow", val); err != nil {
187
+				return err
188
+			}
189
+		}
190
+	}
191
+	return nil
192
+}
193
+
194
+func (c *Cgroup) setupMemory(cgroupRoot string, pid int) (err error) {
195
+	if c.Memory != 0 || c.MemorySwap != 0 {
196
+		dir, err := c.Join(cgroupRoot, "memory", pid)
197
+		if err != nil {
198
+			return err
199
+		}
200
+		defer func() {
201
+			if err != nil {
202
+				os.RemoveAll(dir)
203
+			}
204
+		}()
205
+
206
+		if c.Memory != 0 {
207
+			if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil {
208
+				return err
209
+			}
210
+			if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil {
211
+				return err
212
+			}
213
+		}
214
+		// By default, MemorySwap is set to twice the size of RAM.
215
+		// If you want to omit MemorySwap, set it to `-1'.
216
+		if c.MemorySwap != -1 {
217
+			if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Memory*2, 10)); err != nil {
218
+				return err
219
+			}
220
+		}
221
+	}
222
+	return nil
223
+}
224
+
225
+func (c *Cgroup) setupCpu(cgroupRoot string, pid int) (err error) {
226
+	// We always want to join the cpu group, to allow fair cpu scheduling
227
+	// on a container basis
228
+	dir, err := c.Join(cgroupRoot, "cpu", pid)
229
+	if err != nil {
230
+		return err
231
+	}
232
+	if c.CpuShares != 0 {
233
+		if err := writeFile(dir, "cpu.shares", strconv.FormatInt(c.CpuShares, 10)); err != nil {
234
+			return err
235
+		}
236
+	}
237
+	return nil
238
+}
58 239
new file mode 100644
... ...
@@ -0,0 +1,2 @@
0
+Michael Crosby <michael@crosbymichael.com> (@crosbymichael)
1
+Guillaume Charmes <guillaume@dotcloud.com> (@creack)
0 2
new file mode 100644
... ...
@@ -0,0 +1,90 @@
0
+## libcontainer - reference implementation for containers
1
+
2
+#### background
3
+
4
+libcontainer specifies configuration options for what a container is.  It provides a native Go implementation 
5
+for using linux namespaces with no external dependencies.  libcontainer provides many convenience functions for working with namespaces, networking, and management.  
6
+
7
+
8
+#### container
9
+A container is a self contained directory that is able to run one or more processes without 
10
+affecting the host system.  The directory is usually a full system tree.  Inside the directory
11
+a `container.json` file is placed with the runtime configuration for how the processes 
12
+should be contained and run.  Environment, networking, and different capabilities for the 
13
+process are specified in this file.  The configuration is used for each process executed inside the container.
14
+
15
+Sample `container.json` file:
16
+```json
17
+{
18
+    "hostname": "koye",
19
+    "tty": true,
20
+    "environment": [
21
+        "HOME=/",
22
+        "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin",
23
+        "container=docker",
24
+        "TERM=xterm-256color"
25
+    ],
26
+    "namespaces": [
27
+        "NEWIPC",
28
+        "NEWNS",
29
+        "NEWPID",
30
+        "NEWUTS",
31
+        "NEWNET"
32
+    ],
33
+    "capabilities": [
34
+        "SETPCAP",
35
+        "SYS_MODULE",
36
+        "SYS_RAWIO",
37
+        "SYS_PACCT",
38
+        "SYS_ADMIN",
39
+        "SYS_NICE",
40
+        "SYS_RESOURCE",
41
+        "SYS_TIME",
42
+        "SYS_TTY_CONFIG",
43
+        "MKNOD",
44
+        "AUDIT_WRITE",
45
+        "AUDIT_CONTROL",
46
+        "MAC_OVERRIDE",
47
+        "MAC_ADMIN",
48
+        "NET_ADMIN"
49
+    ],
50
+    "networks": [{
51
+            "type": "veth",
52
+            "context": {
53
+                "bridge": "docker0",
54
+                "prefix": "dock"
55
+            },
56
+            "address": "172.17.0.100/16",
57
+            "gateway": "172.17.42.1",
58
+            "mtu": 1500
59
+        }
60
+    ],
61
+    "cgroups": {
62
+        "name": "docker-koye",
63
+        "parent": "docker",
64
+        "memory": 5248000
65
+    }
66
+}
67
+```
68
+
69
+Using this configuration and the current directory holding the rootfs for a process, one can use libcontainer to exec the container. During the life of the namespace, a `pid` file 
70
+is written to the current directory with the pid of the namespaced process to the external world.  A client can use this pid to wait, kill, or perform other operations with the container.  If a user tries to run a new process inside an existing container with a live namespace the namespace will be joined by the new process.
71
+
72
+
73
+You may also specify an alternate root place where the `container.json` file is read and where the `pid` file will be saved.
74
+
75
+#### nsinit
76
+
77
+`nsinit` is a cli application used as the reference implementation of libcontainer.  It is able to 
78
+spawn or join new containers given the current directory.  To use `nsinit` cd into a linux 
79
+rootfs and copy a `container.json` file into the directory with your specified configuration.
80
+
81
+To execute `/bin/bash` in the current directory as a container just run:
82
+```bash
83
+nsinit exec /bin/bash
84
+```
85
+
86
+If you wish to spawn another process inside the container while your current bash session is 
87
+running just run the exact same command again to get another bash shell or change the command.  If the original process dies, PID 1, all other processes spawned inside the container will also be killed and the namespace will be removed. 
88
+
89
+You can identify if a process is running in a container by looking to see if `pid` is in the root of the directory.   
0 90
new file mode 100644
... ...
@@ -0,0 +1,17 @@
0
+#### goals
1
+* small and simple - line count is not everything but less code is better
2
+* clean lines between what we do in the pkg 
3
+* provide primitives for working with namespaces, not cater to every option
4
+* extend via configuration, not by features - host networking, no networking, and veth networking can all be accomplished by adjusting the container.json, with no code changes
5
+
6
+#### tasks
7
+* proper tty for a new process in an existing container
8
+* use exec or raw syscalls for new process in existing container
9
+* setup proper user in namespace if specified
10
+* implement hook or clean interface for cgroups
11
+* example configs for different setups (host networking, boot init)
12
+* improve pkg documentation with comments
13
+* testing - this is hard in a low level pkg but we could do some, maybe
14
+* pivot root
15
+* selinux
16
+* apparmor
0 17
new file mode 100644
... ...
@@ -0,0 +1,33 @@
0
+package capabilities
1
+
2
+import (
3
+	"github.com/dotcloud/docker/pkg/libcontainer"
4
+	"github.com/syndtr/gocapability/capability"
5
+	"os"
6
+)
7
+
8
+// DropCapabilities drops capabilities for the current process based
9
+// on the container's configuration.
10
+func DropCapabilities(container *libcontainer.Container) error {
11
+	if drop := getCapabilities(container); len(drop) > 0 {
12
+		c, err := capability.NewPid(os.Getpid())
13
+		if err != nil {
14
+			return err
15
+		}
16
+		c.Unset(capability.CAPS|capability.BOUNDS, drop...)
17
+
18
+		if err := c.Apply(capability.CAPS | capability.BOUNDS); err != nil {
19
+			return err
20
+		}
21
+	}
22
+	return nil
23
+}
24
+
25
+// getCapabilities returns the specific cap values for the libcontainer types
26
+func getCapabilities(container *libcontainer.Container) []capability.Cap {
27
+	drop := []capability.Cap{}
28
+	for _, c := range container.Capabilities {
29
+		drop = append(drop, c.Value)
30
+	}
31
+	return drop
32
+}
0 33
new file mode 100644
... ...
@@ -0,0 +1,36 @@
0
+package libcontainer
1
+
2
+import (
3
+	"github.com/dotcloud/docker/pkg/cgroups"
4
+)
5
+
6
+// Context is a generic map of key/value pairs that allows
7
+// arbitrary data to be sent
8
+type Context map[string]string
9
+
10
+// Container defines configuration options for how a
11
+// container is setup inside a directory and how a process should be executed
12
+type Container struct {
13
+	Hostname     string          `json:"hostname,omitempty"`     // hostname
14
+	ReadonlyFs   bool            `json:"readonly_fs,omitempty"`  // set the containers rootfs as readonly
15
+	User         string          `json:"user,omitempty"`         // user to execute the process as
16
+	WorkingDir   string          `json:"working_dir,omitempty"`  // current working directory
17
+	Env          []string        `json:"environment,omitempty"`  // environment to set
18
+	Tty          bool            `json:"tty,omitempty"`          // setup a proper tty or not
19
+	Namespaces   Namespaces      `json:"namespaces,omitempty"`   // namespaces to apply
20
+	Capabilities Capabilities    `json:"capabilities,omitempty"` // capabilities to drop
21
+	Networks     []*Network      `json:"networks,omitempty"`     // nil for host's network stack
22
+	Cgroups      *cgroups.Cgroup `json:"cgroups,omitempty"`
23
+}
24
+
25
+// Network defines configuration for a container's networking stack
26
+//
27
+// The network configuration can be omitted from a container, causing the
28
+// container to be setup with the host's networking stack
29
+type Network struct {
30
+	Type    string  `json:"type,omitempty"`    // type of networking to setup i.e. veth, macvlan, etc
31
+	Context Context `json:"context,omitempty"` // generic context for type specific networking options
32
+	Address string  `json:"address,omitempty"`
33
+	Gateway string  `json:"gateway,omitempty"`
34
+	Mtu     int     `json:"mtu,omitempty"`
35
+}
0 36
new file mode 100644
... ...
@@ -0,0 +1,50 @@
0
+{
1
+    "hostname": "koye",
2
+    "tty": true,
3
+    "environment": [
4
+        "HOME=/",
5
+        "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin",
6
+        "container=docker",
7
+        "TERM=xterm-256color"
8
+    ],
9
+    "namespaces": [
10
+        "NEWIPC",
11
+        "NEWNS",
12
+        "NEWPID",
13
+        "NEWUTS",
14
+        "NEWNET"
15
+    ],
16
+    "capabilities": [
17
+        "SETPCAP",
18
+        "SYS_MODULE",
19
+        "SYS_RAWIO",
20
+        "SYS_PACCT",
21
+        "SYS_ADMIN",
22
+        "SYS_NICE",
23
+        "SYS_RESOURCE",
24
+        "SYS_TIME",
25
+        "SYS_TTY_CONFIG",
26
+        "MKNOD",
27
+        "AUDIT_WRITE",
28
+        "AUDIT_CONTROL",
29
+        "MAC_OVERRIDE",
30
+        "MAC_ADMIN",
31
+        "NET_ADMIN"
32
+    ],
33
+    "networks": [{
34
+            "type": "veth",
35
+            "context": {
36
+                "bridge": "docker0",
37
+                "prefix": "dock"
38
+            },
39
+            "address": "172.17.0.100/16",
40
+            "gateway": "172.17.42.1",
41
+            "mtu": 1500
42
+        }
43
+    ],
44
+    "cgroups": {
45
+        "name": "docker-koye",
46
+        "parent": "docker",
47
+        "memory": 5248000
48
+    }
49
+}
0 50
new file mode 100644
... ...
@@ -0,0 +1,78 @@
0
+package network
1
+
2
+import (
3
+	"github.com/dotcloud/docker/pkg/netlink"
4
+	"net"
5
+)
6
+
7
+func InterfaceUp(name string) error {
8
+	iface, err := net.InterfaceByName(name)
9
+	if err != nil {
10
+		return err
11
+	}
12
+	return netlink.NetworkLinkUp(iface)
13
+}
14
+
15
+func InterfaceDown(name string) error {
16
+	iface, err := net.InterfaceByName(name)
17
+	if err != nil {
18
+		return err
19
+	}
20
+	return netlink.NetworkLinkDown(iface)
21
+}
22
+
23
+func ChangeInterfaceName(old, newName string) error {
24
+	iface, err := net.InterfaceByName(old)
25
+	if err != nil {
26
+		return err
27
+	}
28
+	return netlink.NetworkChangeName(iface, newName)
29
+}
30
+
31
+func CreateVethPair(name1, name2 string) error {
32
+	return netlink.NetworkCreateVethPair(name1, name2)
33
+}
34
+
35
+func SetInterfaceInNamespacePid(name string, nsPid int) error {
36
+	iface, err := net.InterfaceByName(name)
37
+	if err != nil {
38
+		return err
39
+	}
40
+	return netlink.NetworkSetNsPid(iface, nsPid)
41
+}
42
+
43
+func SetInterfaceMaster(name, master string) error {
44
+	iface, err := net.InterfaceByName(name)
45
+	if err != nil {
46
+		return err
47
+	}
48
+	masterIface, err := net.InterfaceByName(master)
49
+	if err != nil {
50
+		return err
51
+	}
52
+	return netlink.NetworkSetMaster(iface, masterIface)
53
+}
54
+
55
+func SetDefaultGateway(ip string) error {
56
+	return netlink.AddDefaultGw(net.ParseIP(ip))
57
+}
58
+
59
+func SetInterfaceIp(name string, rawIp string) error {
60
+	iface, err := net.InterfaceByName(name)
61
+	if err != nil {
62
+		return err
63
+	}
64
+	ip, ipNet, err := net.ParseCIDR(rawIp)
65
+	if err != nil {
66
+		return err
67
+	}
68
+	return netlink.NetworkLinkAddIp(iface, ip, ipNet)
69
+}
70
+
71
+func SetMtu(name string, mtu int) error {
72
+	iface, err := net.InterfaceByName(name)
73
+	if err != nil {
74
+		return err
75
+	}
76
+	return netlink.NetworkSetMTU(iface, mtu)
77
+}
0 78
new file mode 100644
... ...
@@ -0,0 +1,32 @@
0
+package network
1
+
2
+import (
3
+	"errors"
4
+	"github.com/dotcloud/docker/pkg/libcontainer"
5
+)
6
+
7
+var (
8
+	ErrNotValidStrategyType = errors.New("not a valid network strategy type")
9
+)
10
+
11
+var strategies = map[string]NetworkStrategy{
12
+	"veth": &Veth{},
13
+}
14
+
15
+// NetworkStrategy represents a specific network configuration for
16
+// a container's networking stack
17
+type NetworkStrategy interface {
18
+	Create(*libcontainer.Network, int, libcontainer.Context) error
19
+	Initialize(*libcontainer.Network, libcontainer.Context) error
20
+}
21
+
22
+// GetStrategy returns the specific network strategy for the
23
+// provided type.  If no strategy is registered for the type an
24
+// ErrNotValidStrategyType is returned.
25
+func GetStrategy(tpe string) (NetworkStrategy, error) {
26
+	s, exists := strategies[tpe]
27
+	if !exists {
28
+		return nil, ErrNotValidStrategyType
29
+	}
30
+	return s, nil
31
+}
0 32
new file mode 100644
... ...
@@ -0,0 +1,100 @@
0
+package network
1
+
2
+import (
3
+	"fmt"
4
+	"github.com/dotcloud/docker/pkg/libcontainer"
5
+	"github.com/dotcloud/docker/pkg/libcontainer/utils"
6
+)
7
+
8
+// Veth is a network strategy that uses a bridge and creates
9
+// a veth pair, one that stays outside on the host and the other
10
+// is placed inside the container's namespace
11
+type Veth struct {
12
+}
13
+
14
+func (v *Veth) Create(n *libcontainer.Network, nspid int, context libcontainer.Context) error {
15
+	var (
16
+		bridge string
17
+		prefix string
18
+		exists bool
19
+	)
20
+	if bridge, exists = n.Context["bridge"]; !exists {
21
+		return fmt.Errorf("bridge does not exist in network context")
22
+	}
23
+	if prefix, exists = n.Context["prefix"]; !exists {
24
+		return fmt.Errorf("veth prefix does not exist in network context")
25
+	}
26
+	name1, name2, err := createVethPair(prefix)
27
+	if err != nil {
28
+		return err
29
+	}
30
+	context["veth-host"] = name1
31
+	context["veth-child"] = name2
32
+	if err := SetInterfaceMaster(name1, bridge); err != nil {
33
+		return err
34
+	}
35
+	if err := SetMtu(name1, n.Mtu); err != nil {
36
+		return err
37
+	}
38
+	if err := InterfaceUp(name1); err != nil {
39
+		return err
40
+	}
41
+	if err := SetInterfaceInNamespacePid(name2, nspid); err != nil {
42
+		return err
43
+	}
44
+	return nil
45
+}
46
+
47
+func (v *Veth) Initialize(config *libcontainer.Network, context libcontainer.Context) error {
48
+	var (
49
+		vethChild string
50
+		exists    bool
51
+	)
52
+	if vethChild, exists = context["veth-child"]; !exists {
53
+		return fmt.Errorf("vethChild does not exist in network context")
54
+	}
55
+	if err := InterfaceDown(vethChild); err != nil {
56
+		return fmt.Errorf("interface down %s %s", vethChild, err)
57
+	}
58
+	if err := ChangeInterfaceName(vethChild, "eth0"); err != nil {
59
+		return fmt.Errorf("change %s to eth0 %s", vethChild, err)
60
+	}
61
+	if err := SetInterfaceIp("eth0", config.Address); err != nil {
62
+		return fmt.Errorf("set eth0 ip %s", err)
63
+	}
64
+	if err := SetMtu("eth0", config.Mtu); err != nil {
65
+		return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err)
66
+	}
67
+	if err := InterfaceUp("eth0"); err != nil {
68
+		return fmt.Errorf("eth0 up %s", err)
69
+	}
70
+	if err := SetMtu("lo", config.Mtu); err != nil {
71
+		return fmt.Errorf("set lo mtu to %d %s", config.Mtu, err)
72
+	}
73
+	if err := InterfaceUp("lo"); err != nil {
74
+		return fmt.Errorf("lo up %s", err)
75
+	}
76
+	if config.Gateway != "" {
77
+		if err := SetDefaultGateway(config.Gateway); err != nil {
78
+			return fmt.Errorf("set gateway to %s %s", config.Gateway, err)
79
+		}
80
+	}
81
+	return nil
82
+}
83
+
84
+// createVethPair will automatically generate two random names for
85
+// the veth pair and ensure that they have been created
86
+func createVethPair(prefix string) (name1 string, name2 string, err error) {
87
+	name1, err = utils.GenerateRandomName(prefix, 4)
88
+	if err != nil {
89
+		return
90
+	}
91
+	name2, err = utils.GenerateRandomName(prefix, 4)
92
+	if err != nil {
93
+		return
94
+	}
95
+	if err = CreateVethPair(name1, name2); err != nil {
96
+		return
97
+	}
98
+	return
99
+}
0 100
new file mode 100644
... ...
@@ -0,0 +1,45 @@
0
+package nsinit
1
+
2
+import (
3
+	"fmt"
4
+	"github.com/dotcloud/docker/pkg/libcontainer"
5
+	"github.com/dotcloud/docker/pkg/system"
6
+	"os"
7
+	"os/exec"
8
+)
9
+
10
+// CommandFactory takes the container's configuration and options passed by the
11
+// parent processes and creates an *exec.Cmd that will be used to fork/exec the
12
+// namespaced init process
13
+type CommandFactory interface {
14
+	Create(container *libcontainer.Container, console string, syncFd uintptr, args []string) *exec.Cmd
15
+}
16
+
17
+type DefaultCommandFactory struct {
18
+	Root string
19
+}
20
+
21
+// Create will return an exec.Cmd with the Cloneflags set to the proper namespaces
22
+// defined on the container's configuration and use the current binary as the init with the
23
+// args provided
24
+func (c *DefaultCommandFactory) Create(container *libcontainer.Container, console string, pipe uintptr, args []string) *exec.Cmd {
25
+	// get our binary name from arg0 so we can always reexec ourself
26
+	command := exec.Command(os.Args[0], append([]string{
27
+		"-console", console,
28
+		"-pipe", fmt.Sprint(pipe),
29
+		"-root", c.Root,
30
+		"init"}, args...)...)
31
+
32
+	system.SetCloneFlags(command, uintptr(GetNamespaceFlags(container.Namespaces)))
33
+	command.Env = container.Env
34
+	return command
35
+}
36
+
37
+// GetNamespaceFlags parses the container's Namespaces options to set the correct
38
+// flags on clone, unshare, and setns
39
+func GetNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) {
40
+	for _, ns := range namespaces {
41
+		flag |= ns.Value
42
+	}
43
+	return flag
44
+}
0 45
new file mode 100644
... ...
@@ -0,0 +1,96 @@
0
+// +build linux
1
+
2
+package nsinit
3
+
4
+import (
5
+	"github.com/dotcloud/docker/pkg/libcontainer"
6
+	"github.com/dotcloud/docker/pkg/libcontainer/network"
7
+	"github.com/dotcloud/docker/pkg/system"
8
+	"os"
9
+	"os/exec"
10
+	"syscall"
11
+)
12
+
13
+// Exec performs setup outside of a namespace so that a container can be
14
+// executed.  Exec is a high level function for working with container namespaces.
15
+func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) {
16
+	var (
17
+		master  *os.File
18
+		console string
19
+		err     error
20
+	)
21
+
22
+	// create a pipe so that we can synchronize with the namespaced process and
23
+	// pass the veth name to the child
24
+	syncPipe, err := NewSyncPipe()
25
+	if err != nil {
26
+		return -1, err
27
+	}
28
+
29
+	if container.Tty {
30
+		master, console, err = system.CreateMasterAndConsole()
31
+		if err != nil {
32
+			return -1, err
33
+		}
34
+		term.SetMaster(master)
35
+	}
36
+
37
+	command := ns.commandFactory.Create(container, console, syncPipe.child.Fd(), args)
38
+	if err := term.Attach(command); err != nil {
39
+		return -1, err
40
+	}
41
+	defer term.Close()
42
+
43
+	if err := command.Start(); err != nil {
44
+		return -1, err
45
+	}
46
+	if err := ns.stateWriter.WritePid(command.Process.Pid); err != nil {
47
+		command.Process.Kill()
48
+		return -1, err
49
+	}
50
+	defer ns.stateWriter.DeletePid()
51
+
52
+	// Do this before syncing with child so that no children
53
+	// can escape the cgroup
54
+	if err := ns.SetupCgroups(container, command.Process.Pid); err != nil {
55
+		command.Process.Kill()
56
+		return -1, err
57
+	}
58
+	if err := ns.InitializeNetworking(container, command.Process.Pid, syncPipe); err != nil {
59
+		command.Process.Kill()
60
+		return -1, err
61
+	}
62
+
63
+	// Sync with child
64
+	syncPipe.Close()
65
+
66
+	if err := command.Wait(); err != nil {
67
+		if _, ok := err.(*exec.ExitError); !ok {
68
+			return -1, err
69
+		}
70
+	}
71
+	return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil
72
+}
73
+
74
+func (ns *linuxNs) SetupCgroups(container *libcontainer.Container, nspid int) error {
75
+	if container.Cgroups != nil {
76
+		if err := container.Cgroups.Apply(nspid); err != nil {
77
+			return err
78
+		}
79
+	}
80
+	return nil
81
+}
82
+
83
+func (ns *linuxNs) InitializeNetworking(container *libcontainer.Container, nspid int, pipe *SyncPipe) error {
84
+	context := libcontainer.Context{}
85
+	for _, config := range container.Networks {
86
+		strategy, err := network.GetStrategy(config.Type)
87
+		if err != nil {
88
+			return err
89
+		}
90
+		if err := strategy.Create(config, nspid, context); err != nil {
91
+			return err
92
+		}
93
+	}
94
+	return pipe.SendToChild(context)
95
+}
0 96
new file mode 100644
... ...
@@ -0,0 +1,94 @@
0
+// +build linux
1
+
2
+package nsinit
3
+
4
+import (
5
+	"fmt"
6
+	"github.com/dotcloud/docker/pkg/libcontainer"
7
+	"github.com/dotcloud/docker/pkg/system"
8
+	"os"
9
+	"path/filepath"
10
+	"strconv"
11
+	"syscall"
12
+)
13
+
14
+// ExecIn uses an existing pid and joins the pid's namespaces with the new command.
15
+func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) {
16
+	for _, ns := range container.Namespaces {
17
+		if err := system.Unshare(ns.Value); err != nil {
18
+			return -1, err
19
+		}
20
+	}
21
+	fds, err := ns.getNsFds(nspid, container)
22
+	closeFds := func() {
23
+		for _, f := range fds {
24
+			system.Closefd(f)
25
+		}
26
+	}
27
+	if err != nil {
28
+		closeFds()
29
+		return -1, err
30
+	}
31
+
32
+	// foreach namespace fd, use setns to join an existing container's namespaces
33
+	for _, fd := range fds {
34
+		if fd > 0 {
35
+			if err := system.Setns(fd, 0); err != nil {
36
+				closeFds()
37
+				return -1, fmt.Errorf("setns %s", err)
38
+			}
39
+		}
40
+		system.Closefd(fd)
41
+	}
42
+
43
+	// if the container has a new pid and mount namespace we need to
44
+	// remount proc and sys to pick up the changes
45
+	if container.Namespaces.Contains("NEWNS") && container.Namespaces.Contains("NEWPID") {
46
+		pid, err := system.Fork()
47
+		if err != nil {
48
+			return -1, err
49
+		}
50
+		if pid == 0 {
51
+			// TODO: make all raw syscalls to be fork safe
52
+			if err := system.Unshare(syscall.CLONE_NEWNS); err != nil {
53
+				return -1, err
54
+			}
55
+			if err := remountProc(); err != nil {
56
+				return -1, fmt.Errorf("remount proc %s", err)
57
+			}
58
+			if err := remountSys(); err != nil {
59
+				return -1, fmt.Errorf("remount sys %s", err)
60
+			}
61
+			goto dropAndExec
62
+		}
63
+		proc, err := os.FindProcess(pid)
64
+		if err != nil {
65
+			return -1, err
66
+		}
67
+		state, err := proc.Wait()
68
+		if err != nil {
69
+			return -1, err
70
+		}
71
+		os.Exit(state.Sys().(syscall.WaitStatus).ExitStatus())
72
+	}
73
+dropAndExec:
74
+	if err := finalizeNamespace(container); err != nil {
75
+		return -1, err
76
+	}
77
+	if err := system.Execv(args[0], args[0:], container.Env); err != nil {
78
+		return -1, err
79
+	}
80
+	panic("unreachable")
81
+}
82
+
83
+func (ns *linuxNs) getNsFds(pid int, container *libcontainer.Container) ([]uintptr, error) {
84
+	fds := make([]uintptr, len(container.Namespaces))
85
+	for i, ns := range container.Namespaces {
86
+		f, err := os.OpenFile(filepath.Join("/proc/", strconv.Itoa(pid), "ns", ns.File), os.O_RDONLY, 0)
87
+		if err != nil {
88
+			return fds, err
89
+		}
90
+		fds[i] = f.Fd()
91
+	}
92
+	return fds, nil
93
+}
0 94
new file mode 100644
... ...
@@ -0,0 +1,153 @@
0
+// +build linux
1
+
2
+package nsinit
3
+
4
+import (
5
+	"fmt"
6
+	"github.com/dotcloud/docker/pkg/libcontainer"
7
+	"github.com/dotcloud/docker/pkg/libcontainer/capabilities"
8
+	"github.com/dotcloud/docker/pkg/libcontainer/network"
9
+	"github.com/dotcloud/docker/pkg/libcontainer/utils"
10
+	"github.com/dotcloud/docker/pkg/system"
11
+	"github.com/dotcloud/docker/pkg/user"
12
+	"os"
13
+	"syscall"
14
+)
15
+
16
+// Init is the init process that first runs inside a new namespace to setup mounts, users, networking,
17
+// and other options required for the new container.
18
+func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error {
19
+	rootfs, err := utils.ResolveRootfs(uncleanRootfs)
20
+	if err != nil {
21
+		return err
22
+	}
23
+
24
+	// We always read this as it is a way to sync with the parent as well
25
+	context, err := syncPipe.ReadFromParent()
26
+	if err != nil {
27
+		syncPipe.Close()
28
+		return err
29
+	}
30
+	syncPipe.Close()
31
+
32
+	if console != "" {
33
+		// close pipes so that we can replace it with the pty
34
+		closeStdPipes()
35
+		slave, err := system.OpenTerminal(console, syscall.O_RDWR)
36
+		if err != nil {
37
+			return fmt.Errorf("open terminal %s", err)
38
+		}
39
+		if err := dupSlave(slave); err != nil {
40
+			return fmt.Errorf("dup2 slave %s", err)
41
+		}
42
+	}
43
+	if _, err := system.Setsid(); err != nil {
44
+		return fmt.Errorf("setsid %s", err)
45
+	}
46
+	if console != "" {
47
+		if err := system.Setctty(); err != nil {
48
+			return fmt.Errorf("setctty %s", err)
49
+		}
50
+	}
51
+
52
+	/*
53
+		if err := system.ParentDeathSignal(); err != nil {
54
+			return fmt.Errorf("parent death signal %s", err)
55
+		}
56
+	*/
57
+	if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil {
58
+		return fmt.Errorf("setup mount namespace %s", err)
59
+	}
60
+	if err := setupNetwork(container, context); err != nil {
61
+		return fmt.Errorf("setup networking %s", err)
62
+	}
63
+	if err := system.Sethostname(container.Hostname); err != nil {
64
+		return fmt.Errorf("sethostname %s", err)
65
+	}
66
+	if err := finalizeNamespace(container); err != nil {
67
+		return fmt.Errorf("finalize namespace %s", err)
68
+	}
69
+	return system.Execv(args[0], args[0:], container.Env)
70
+}
71
+
72
+func closeStdPipes() {
73
+	os.Stdin.Close()
74
+	os.Stdout.Close()
75
+	os.Stderr.Close()
76
+}
77
+
78
+func setupUser(container *libcontainer.Container) error {
79
+	switch container.User {
80
+	case "root", "":
81
+		if err := system.Setgroups(nil); err != nil {
82
+			return err
83
+		}
84
+		if err := system.Setresgid(0, 0, 0); err != nil {
85
+			return err
86
+		}
87
+		if err := system.Setresuid(0, 0, 0); err != nil {
88
+			return err
89
+		}
90
+	default:
91
+		uid, gid, suppGids, err := user.GetUserGroupSupplementary(container.User, syscall.Getuid(), syscall.Getgid())
92
+		if err != nil {
93
+			return err
94
+		}
95
+		if err := system.Setgroups(suppGids); err != nil {
96
+			return err
97
+		}
98
+		if err := system.Setgid(gid); err != nil {
99
+			return err
100
+		}
101
+		if err := system.Setuid(uid); err != nil {
102
+			return err
103
+		}
104
+	}
105
+	return nil
106
+}
107
+
108
+// dupSlave dup2s the pty slave's fd onto stdout and stderr and ensures that
109
+// the slave's fd is 0, or stdin
110
+func dupSlave(slave *os.File) error {
111
+	if slave.Fd() != 0 {
112
+		return fmt.Errorf("slave fd not 0 %d", slave.Fd())
113
+	}
114
+	if err := system.Dup2(slave.Fd(), 1); err != nil {
115
+		return err
116
+	}
117
+	if err := system.Dup2(slave.Fd(), 2); err != nil {
118
+		return err
119
+	}
120
+	return nil
121
+}
122
+
123
+// setupNetwork uses the container's Network configs, if any, to initialize
124
+// the new veth interface inside the container for use by changing the name to eth0,
125
+// setting the MTU and IP address along with the default gateway
126
+func setupNetwork(container *libcontainer.Container, context libcontainer.Context) error {
127
+	for _, config := range container.Networks {
128
+		strategy, err := network.GetStrategy(config.Type)
129
+		if err != nil {
130
+			return err
131
+		}
132
+		return strategy.Initialize(config, context)
133
+	}
134
+	return nil
135
+}
136
+
137
+// finalizeNamespace drops the caps and sets the correct user
138
+// and working dir before execing the command inside the namespace
139
+func finalizeNamespace(container *libcontainer.Container) error {
140
+	if err := capabilities.DropCapabilities(container); err != nil {
141
+		return fmt.Errorf("drop capabilities %s", err)
142
+	}
143
+	if err := setupUser(container); err != nil {
144
+		return fmt.Errorf("setup user %s", err)
145
+	}
146
+	if container.WorkingDir != "" {
147
+		if err := system.Chdir(container.WorkingDir); err != nil {
148
+			return fmt.Errorf("chdir to %s %s", container.WorkingDir, err)
149
+		}
150
+	}
151
+	return nil
152
+}
0 153
new file mode 100644
... ...
@@ -0,0 +1,254 @@
0
+// +build linux
1
+
2
+package nsinit
3
+
4
+import (
5
+	"fmt"
6
+	"github.com/dotcloud/docker/pkg/system"
7
+	"os"
8
+	"path/filepath"
9
+	"syscall"
10
+)
11
+
12
+// default mount point flags
13
+const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
14
+
15
+// setupNewMountNamespace is used to initialize a new mount namespace for a new
16
+// container in the rootfs that is specified.
17
+//
18
+// There is no need to unmount the new mounts because as soon as the mount namespace
19
+// is no longer in use, the mounts will be removed automatically
20
+func setupNewMountNamespace(rootfs, console string, readonly bool) error {
21
+	// mount as slave so that the new mounts do not propagate to the host
22
+	if err := system.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil {
23
+		return fmt.Errorf("mounting / as slave %s", err)
24
+	}
25
+	if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
26
+		return fmt.Errorf("mouting %s as bind %s", rootfs, err)
27
+	}
28
+	if readonly {
29
+		if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil {
30
+			return fmt.Errorf("mounting %s as readonly %s", rootfs, err)
31
+		}
32
+	}
33
+	if err := mountSystem(rootfs); err != nil {
34
+		return fmt.Errorf("mount system %s", err)
35
+	}
36
+	if err := copyDevNodes(rootfs); err != nil {
37
+		return fmt.Errorf("copy dev nodes %s", err)
38
+	}
39
+	if err := setupLoopbackDevices(rootfs); err != nil {
40
+		return fmt.Errorf("setup loopback devices %s", err)
41
+	}
42
+	if err := setupDev(rootfs); err != nil {
43
+		return err
44
+	}
45
+	if console != "" {
46
+		if err := setupPtmx(rootfs, console); err != nil {
47
+			return err
48
+		}
49
+	}
50
+	if err := system.Chdir(rootfs); err != nil {
51
+		return fmt.Errorf("chdir into %s %s", rootfs, err)
52
+	}
53
+	if err := system.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil {
54
+		return fmt.Errorf("mount move %s into / %s", rootfs, err)
55
+	}
56
+	if err := system.Chroot("."); err != nil {
57
+		return fmt.Errorf("chroot . %s", err)
58
+	}
59
+	if err := system.Chdir("/"); err != nil {
60
+		return fmt.Errorf("chdir / %s", err)
61
+	}
62
+
63
+	system.Umask(0022)
64
+
65
+	return nil
66
+}
67
+
68
+// copyDevNodes mknods the host's devices so the new container has access to them
69
+func copyDevNodes(rootfs string) error {
70
+	oldMask := system.Umask(0000)
71
+	defer system.Umask(oldMask)
72
+
73
+	for _, node := range []string{
74
+		"null",
75
+		"zero",
76
+		"full",
77
+		"random",
78
+		"urandom",
79
+		"tty",
80
+	} {
81
+		if err := copyDevNode(rootfs, node); err != nil {
82
+			return err
83
+		}
84
+	}
85
+	return nil
86
+}
87
+
88
+func setupLoopbackDevices(rootfs string) error {
89
+	for i := 0; ; i++ {
90
+		var (
91
+			device = fmt.Sprintf("loop%d", i)
92
+			source = filepath.Join("/dev", device)
93
+			dest   = filepath.Join(rootfs, "dev", device)
94
+		)
95
+
96
+		if _, err := os.Stat(source); err != nil {
97
+			if !os.IsNotExist(err) {
98
+				return err
99
+			}
100
+			return nil
101
+		}
102
+		if _, err := os.Stat(dest); err == nil {
103
+			os.Remove(dest)
104
+		}
105
+		f, err := os.Create(dest)
106
+		if err != nil {
107
+			return err
108
+		}
109
+		f.Close()
110
+		if err := system.Mount(source, dest, "none", syscall.MS_BIND, ""); err != nil {
111
+			return err
112
+		}
113
+	}
114
+	return nil
115
+}
116
+
117
+func copyDevNode(rootfs, node string) error {
118
+	stat, err := os.Stat(filepath.Join("/dev", node))
119
+	if err != nil {
120
+		return err
121
+	}
122
+	var (
123
+		dest = filepath.Join(rootfs, "dev", node)
124
+		st   = stat.Sys().(*syscall.Stat_t)
125
+	)
126
+	if err := system.Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) {
127
+		return fmt.Errorf("copy %s %s", node, err)
128
+	}
129
+	return nil
130
+}
131
+
132
+// setupDev symlinks the current process's pipes into the
133
+// appropriate destination on the containers rootfs
134
+func setupDev(rootfs string) error {
135
+	for _, link := range []struct {
136
+		from string
137
+		to   string
138
+	}{
139
+		{"/proc/kcore", "/dev/core"},
140
+		{"/proc/self/fd", "/dev/fd"},
141
+		{"/proc/self/fd/0", "/dev/stdin"},
142
+		{"/proc/self/fd/1", "/dev/stdout"},
143
+		{"/proc/self/fd/2", "/dev/stderr"},
144
+	} {
145
+		dest := filepath.Join(rootfs, link.to)
146
+		if err := os.Remove(dest); err != nil && !os.IsNotExist(err) {
147
+			return fmt.Errorf("remove %s %s", dest, err)
148
+		}
149
+		if err := os.Symlink(link.from, dest); err != nil {
150
+			return fmt.Errorf("symlink %s %s", dest, err)
151
+		}
152
+	}
153
+	return nil
154
+}
155
+
156
+// setupConsole ensures that the container has a proper /dev/console setup
157
+func setupConsole(rootfs, console string) error {
158
+	oldMask := system.Umask(0000)
159
+	defer system.Umask(oldMask)
160
+
161
+	stat, err := os.Stat(console)
162
+	if err != nil {
163
+		return fmt.Errorf("stat console %s %s", console, err)
164
+	}
165
+	var (
166
+		st   = stat.Sys().(*syscall.Stat_t)
167
+		dest = filepath.Join(rootfs, "dev/console")
168
+	)
169
+	if err := os.Remove(dest); err != nil && !os.IsNotExist(err) {
170
+		return fmt.Errorf("remove %s %s", dest, err)
171
+	}
172
+	if err := os.Chmod(console, 0600); err != nil {
173
+		return err
174
+	}
175
+	if err := os.Chown(console, 0, 0); err != nil {
176
+		return err
177
+	}
178
+	if err := system.Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil {
179
+		return fmt.Errorf("mknod %s %s", dest, err)
180
+	}
181
+	if err := system.Mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil {
182
+		return fmt.Errorf("bind %s to %s %s", console, dest, err)
183
+	}
184
+	return nil
185
+}
186
+
187
+// mountSystem sets up linux specific system mounts like sys, proc, shm, and devpts
188
+// inside the mount namespace
189
+func mountSystem(rootfs string) error {
190
+	for _, m := range []struct {
191
+		source string
192
+		path   string
193
+		device string
194
+		flags  int
195
+		data   string
196
+	}{
197
+		{source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags},
198
+		{source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags},
199
+		{source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: "mode=755"},
200
+		{source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: "mode=1777"},
201
+		{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: "newinstance,ptmxmode=0666,mode=620,gid=5"},
202
+		{source: "tmpfs", path: filepath.Join(rootfs, "run"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_STRICTATIME, data: "mode=755"},
203
+	} {
204
+		if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) {
205
+			return fmt.Errorf("mkdirall %s %s", m.path, err)
206
+		}
207
+		if err := system.Mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil {
208
+			return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err)
209
+		}
210
+	}
211
+	return nil
212
+}
213
+
214
+// setupPtmx adds a symlink to pts/ptmx for /dev/ptmx and
215
+// finishes setting up /dev/console
216
+func setupPtmx(rootfs, console string) error {
217
+	ptmx := filepath.Join(rootfs, "dev/ptmx")
218
+	if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) {
219
+		return err
220
+	}
221
+	if err := os.Symlink("pts/ptmx", ptmx); err != nil {
222
+		return fmt.Errorf("symlink dev ptmx %s", err)
223
+	}
224
+	if err := setupConsole(rootfs, console); err != nil {
225
+		return err
226
+	}
227
+	return nil
228
+}
229
+
230
+// remountProc is used to detach and remount the proc filesystem
231
+// commonly needed when running a new process inside an existing container
232
+func remountProc() error {
233
+	if err := system.Unmount("/proc", syscall.MNT_DETACH); err != nil {
234
+		return err
235
+	}
236
+	if err := system.Mount("proc", "/proc", "proc", uintptr(defaultMountFlags), ""); err != nil {
237
+		return err
238
+	}
239
+	return nil
240
+}
241
+
242
+func remountSys() error {
243
+	if err := system.Unmount("/sys", syscall.MNT_DETACH); err != nil {
244
+		if err != syscall.EINVAL {
245
+			return err
246
+		}
247
+	} else {
248
+		if err := system.Mount("sysfs", "/sys", "sysfs", uintptr(defaultMountFlags), ""); err != nil {
249
+			return err
250
+		}
251
+	}
252
+	return nil
253
+}
0 254
new file mode 100644
... ...
@@ -0,0 +1,26 @@
0
+package nsinit
1
+
2
+import (
3
+	"github.com/dotcloud/docker/pkg/libcontainer"
4
+)
5
+
6
+// NsInit is an interface with the public facing methods to provide high level
7
+// exec operations on a container
8
+type NsInit interface {
9
+	Exec(container *libcontainer.Container, term Terminal, args []string) (int, error)
10
+	ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error)
11
+	Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error
12
+}
13
+
14
+type linuxNs struct {
15
+	root           string
16
+	commandFactory CommandFactory
17
+	stateWriter    StateWriter
18
+}
19
+
20
+func NewNsInit(command CommandFactory, state StateWriter) NsInit {
21
+	return &linuxNs{
22
+		commandFactory: command,
23
+		stateWriter:    state,
24
+	}
25
+}
0 26
new file mode 100644
... ...
@@ -0,0 +1,110 @@
0
+package main
1
+
2
+import (
3
+	"encoding/json"
4
+	"flag"
5
+	"github.com/dotcloud/docker/pkg/libcontainer"
6
+	"github.com/dotcloud/docker/pkg/libcontainer/nsinit"
7
+	"io/ioutil"
8
+	"log"
9
+	"os"
10
+	"path/filepath"
11
+	"strconv"
12
+)
13
+
14
+var (
15
+	root, console string
16
+	pipeFd        int
17
+)
18
+
19
+func registerFlags() {
20
+	flag.StringVar(&console, "console", "", "console (pty slave) path")
21
+	flag.IntVar(&pipeFd, "pipe", 0, "sync pipe fd")
22
+	flag.StringVar(&root, "root", ".", "root for storing configuration data")
23
+
24
+	flag.Parse()
25
+}
26
+
27
+func main() {
28
+	registerFlags()
29
+
30
+	if flag.NArg() < 1 {
31
+		log.Fatalf("wrong number of argments %d", flag.NArg())
32
+	}
33
+	container, err := loadContainer()
34
+	if err != nil {
35
+		log.Fatal(err)
36
+	}
37
+	ns, err := newNsInit()
38
+	if err != nil {
39
+		log.Fatal(err)
40
+	}
41
+
42
+	switch flag.Arg(0) {
43
+	case "exec": // this is executed outside of the namespace in the cwd
44
+		var exitCode int
45
+		nspid, err := readPid()
46
+		if err != nil {
47
+			if !os.IsNotExist(err) {
48
+				log.Fatal(err)
49
+			}
50
+		}
51
+		if nspid > 0 {
52
+			exitCode, err = ns.ExecIn(container, nspid, flag.Args()[1:])
53
+		} else {
54
+			term := nsinit.NewTerminal(os.Stdin, os.Stdout, os.Stderr, container.Tty)
55
+			exitCode, err = ns.Exec(container, term, flag.Args()[1:])
56
+		}
57
+		if err != nil {
58
+			log.Fatal(err)
59
+		}
60
+		os.Exit(exitCode)
61
+	case "init": // this is executed inside of the namespace to setup the container
62
+		cwd, err := os.Getwd()
63
+		if err != nil {
64
+			log.Fatal(err)
65
+		}
66
+		if flag.NArg() < 2 {
67
+			log.Fatalf("wrong number of argments %d", flag.NArg())
68
+		}
69
+		syncPipe, err := nsinit.NewSyncPipeFromFd(0, uintptr(pipeFd))
70
+		if err != nil {
71
+			log.Fatal(err)
72
+		}
73
+		if err := ns.Init(container, cwd, console, syncPipe, flag.Args()[1:]); err != nil {
74
+			log.Fatal(err)
75
+		}
76
+	default:
77
+		log.Fatalf("command not supported for nsinit %s", flag.Arg(0))
78
+	}
79
+}
80
+
81
+func loadContainer() (*libcontainer.Container, error) {
82
+	f, err := os.Open(filepath.Join(root, "container.json"))
83
+	if err != nil {
84
+		return nil, err
85
+	}
86
+	defer f.Close()
87
+
88
+	var container *libcontainer.Container
89
+	if err := json.NewDecoder(f).Decode(&container); err != nil {
90
+		return nil, err
91
+	}
92
+	return container, nil
93
+}
94
+
95
+func readPid() (int, error) {
96
+	data, err := ioutil.ReadFile(filepath.Join(root, "pid"))
97
+	if err != nil {
98
+		return -1, err
99
+	}
100
+	pid, err := strconv.Atoi(string(data))
101
+	if err != nil {
102
+		return -1, err
103
+	}
104
+	return pid, nil
105
+}
106
+
107
+func newNsInit() (nsinit.NsInit, error) {
108
+	return nsinit.NewNsInit(&nsinit.DefaultCommandFactory{root}, &nsinit.DefaultStateWriter{root}), nil
109
+}
0 110
new file mode 100644
... ...
@@ -0,0 +1,28 @@
0
+package nsinit
1
+
2
+import (
3
+	"fmt"
4
+	"io/ioutil"
5
+	"os"
6
+	"path/filepath"
7
+)
8
+
9
+// StateWriter handles writing and deleting the pid file
10
+// on disk
11
+type StateWriter interface {
12
+	WritePid(pid int) error
13
+	DeletePid() error
14
+}
15
+
16
+type DefaultStateWriter struct {
17
+	Root string
18
+}
19
+
20
+// WritePid writes the namespaced process's pid to the "pid" file under the writer's Root directory
21
+func (d *DefaultStateWriter) WritePid(pid int) error {
22
+	return ioutil.WriteFile(filepath.Join(d.Root, "pid"), []byte(fmt.Sprint(pid)), 0655)
23
+}
24
+
25
+func (d *DefaultStateWriter) DeletePid() error {
26
+	return os.Remove(filepath.Join(d.Root, "pid"))
27
+}
0 28
new file mode 100644
... ...
@@ -0,0 +1,73 @@
0
+package nsinit
1
+
2
+import (
3
+	"encoding/json"
4
+	"fmt"
5
+	"github.com/dotcloud/docker/pkg/libcontainer"
6
+	"github.com/dotcloud/docker/pkg/system"
7
+	"io/ioutil"
8
+	"os"
9
+)
10
+
11
+// SyncPipe allows communication to and from the child processes
12
+// to its parent and allows the two independent processes to
13
+// synchronize their state.
14
+type SyncPipe struct {
15
+	parent, child *os.File
16
+}
17
+
18
+func NewSyncPipe() (s *SyncPipe, err error) {
19
+	s = &SyncPipe{}
20
+	s.child, s.parent, err = os.Pipe()
21
+	if err != nil {
22
+		return nil, err
23
+	}
24
+	system.UsetCloseOnExec(s.child.Fd())
25
+	return s, nil
26
+}
27
+
28
+func NewSyncPipeFromFd(parendFd, childFd uintptr) (*SyncPipe, error) {
29
+	s := &SyncPipe{}
30
+	if parendFd > 0 {
31
+		s.parent = os.NewFile(parendFd, "parendPipe")
32
+	} else if childFd > 0 {
33
+		s.child = os.NewFile(childFd, "childPipe")
34
+	} else {
35
+		return nil, fmt.Errorf("no valid sync pipe fd specified")
36
+	}
37
+	return s, nil
38
+}
39
+
40
+func (s *SyncPipe) SendToChild(context libcontainer.Context) error {
41
+	data, err := json.Marshal(context)
42
+	if err != nil {
43
+		return err
44
+	}
45
+	s.parent.Write(data)
46
+	return nil
47
+}
48
+
49
+func (s *SyncPipe) ReadFromParent() (libcontainer.Context, error) {
50
+	data, err := ioutil.ReadAll(s.child)
51
+	if err != nil {
52
+		return nil, fmt.Errorf("error reading from sync pipe %s", err)
53
+	}
54
+	var context libcontainer.Context
55
+	if len(data) > 0 {
56
+		if err := json.Unmarshal(data, &context); err != nil {
57
+			return nil, err
58
+		}
59
+	}
60
+	return context, nil
61
+
62
+}
63
+
64
+func (s *SyncPipe) Close() error {
65
+	if s.parent != nil {
66
+		s.parent.Close()
67
+	}
68
+	if s.child != nil {
69
+		s.child.Close()
70
+	}
71
+	return nil
72
+}
0 73
new file mode 100644
... ...
@@ -0,0 +1,118 @@
0
+package nsinit
1
+
2
+import (
3
+	"github.com/dotcloud/docker/pkg/term"
4
+	"io"
5
+	"os"
6
+	"os/exec"
7
+)
8
+
9
+type Terminal interface {
10
+	io.Closer
11
+	SetMaster(*os.File)
12
+	Attach(*exec.Cmd) error
13
+	Resize(h, w int) error
14
+}
15
+
16
+func NewTerminal(stdin io.Reader, stdout, stderr io.Writer, tty bool) Terminal {
17
+	if tty {
18
+		return &TtyTerminal{
19
+			stdin:  stdin,
20
+			stdout: stdout,
21
+			stderr: stderr,
22
+		}
23
+	}
24
+	return &StdTerminal{
25
+		stdin:  stdin,
26
+		stdout: stdout,
27
+		stderr: stderr,
28
+	}
29
+}
30
+
31
+type TtyTerminal struct {
32
+	stdin          io.Reader
33
+	stdout, stderr io.Writer
34
+	master         *os.File
35
+	state          *term.State
36
+}
37
+
38
+func (t *TtyTerminal) Resize(h, w int) error {
39
+	return term.SetWinsize(t.master.Fd(), &term.Winsize{Height: uint16(h), Width: uint16(w)})
40
+}
41
+
42
+func (t *TtyTerminal) SetMaster(master *os.File) {
43
+	t.master = master
44
+}
45
+
46
+func (t *TtyTerminal) Attach(command *exec.Cmd) error {
47
+	go io.Copy(t.stdout, t.master)
48
+	go io.Copy(t.master, t.stdin)
49
+
50
+	state, err := t.setupWindow(t.master, os.Stdin)
51
+	if err != nil {
52
+		command.Process.Kill()
53
+		return err
54
+	}
55
+	t.state = state
56
+	return err
57
+}
58
+
59
+// setupWindow gets the parent window size and sets the master
60
+// pty to the current size and sets the parent's mode to RAW
61
+func (t *TtyTerminal) setupWindow(master, parent *os.File) (*term.State, error) {
62
+	ws, err := term.GetWinsize(parent.Fd())
63
+	if err != nil {
64
+		return nil, err
65
+	}
66
+	if err := term.SetWinsize(master.Fd(), ws); err != nil {
67
+		return nil, err
68
+	}
69
+	return term.SetRawTerminal(parent.Fd())
70
+}
71
+
72
+func (t *TtyTerminal) Close() error {
73
+	term.RestoreTerminal(os.Stdin.Fd(), t.state)
74
+	return t.master.Close()
75
+}
76
+
77
+type StdTerminal struct {
78
+	stdin          io.Reader
79
+	stdout, stderr io.Writer
80
+}
81
+
82
+func (s *StdTerminal) SetMaster(*os.File) {
83
+	// no need to set master on non tty
84
+}
85
+
86
+func (s *StdTerminal) Close() error {
87
+	return nil
88
+}
89
+
90
+func (s *StdTerminal) Resize(h, w int) error {
91
+	return nil
92
+}
93
+
94
+func (s *StdTerminal) Attach(command *exec.Cmd) error {
95
+	inPipe, err := command.StdinPipe()
96
+	if err != nil {
97
+		return err
98
+	}
99
+	outPipe, err := command.StdoutPipe()
100
+	if err != nil {
101
+		return err
102
+	}
103
+	errPipe, err := command.StderrPipe()
104
+	if err != nil {
105
+		return err
106
+	}
107
+
108
+	go func() {
109
+		defer inPipe.Close()
110
+		io.Copy(inPipe, s.stdin)
111
+	}()
112
+
113
+	go io.Copy(s.stdout, outPipe)
114
+	go io.Copy(s.stderr, errPipe)
115
+
116
+	return nil
117
+}
0 118
new file mode 100644
... ...
@@ -0,0 +1,19 @@
0
+// +build !linux
1
+
2
+package nsinit
3
+
4
+import (
5
+	"github.com/dotcloud/docker/pkg/libcontainer"
6
+)
7
+
8
+func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) {
9
+	return -1, libcontainer.ErrUnsupported
10
+}
11
+
12
+func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) {
13
+	return -1, libcontainer.ErrUnsupported
14
+}
15
+
16
+func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error {
17
+	return libcontainer.ErrUnsupported
18
+}
0 19
new file mode 100644
... ...
@@ -0,0 +1,134 @@
0
+package libcontainer
1
+
2
+import (
3
+	"encoding/json"
4
+	"errors"
5
+	"github.com/syndtr/gocapability/capability"
6
+	"os"
7
+)
8
+
9
+var (
10
+	ErrUnkownNamespace  = errors.New("Unknown namespace")
11
+	ErrUnkownCapability = errors.New("Unknown capability")
12
+	ErrUnsupported      = errors.New("Unsupported method")
13
+)
14
+
15
+// namespaceList is used to convert the libcontainer types
16
+// into the names of the files located in /proc/<pid>/ns/* for
17
+// each namespace
18
+var (
19
+	namespaceList = Namespaces{}
20
+
21
+	capabilityList = Capabilities{
22
+		{Key: "SETPCAP", Value: capability.CAP_SETPCAP},
23
+		{Key: "SYS_MODULE", Value: capability.CAP_SYS_MODULE},
24
+		{Key: "SYS_RAWIO", Value: capability.CAP_SYS_RAWIO},
25
+		{Key: "SYS_PACCT", Value: capability.CAP_SYS_PACCT},
26
+		{Key: "SYS_ADMIN", Value: capability.CAP_SYS_ADMIN},
27
+		{Key: "SYS_NICE", Value: capability.CAP_SYS_NICE},
28
+		{Key: "SYS_RESOURCE", Value: capability.CAP_SYS_RESOURCE},
29
+		{Key: "SYS_TIME", Value: capability.CAP_SYS_TIME},
30
+		{Key: "SYS_TTY_CONFIG", Value: capability.CAP_SYS_TTY_CONFIG},
31
+		{Key: "MKNOD", Value: capability.CAP_MKNOD},
32
+		{Key: "AUDIT_WRITE", Value: capability.CAP_AUDIT_WRITE},
33
+		{Key: "AUDIT_CONTROL", Value: capability.CAP_AUDIT_CONTROL},
34
+		{Key: "MAC_OVERRIDE", Value: capability.CAP_MAC_OVERRIDE},
35
+		{Key: "MAC_ADMIN", Value: capability.CAP_MAC_ADMIN},
36
+		{Key: "NET_ADMIN", Value: capability.CAP_NET_ADMIN},
37
+	}
38
+)
39
+
40
+type (
41
+	Namespace struct {
42
+		Key   string
43
+		Value int
44
+		File  string
45
+	}
46
+	Namespaces []*Namespace
47
+)
48
+
49
+func (ns *Namespace) String() string {
50
+	return ns.Key
51
+}
52
+
53
+func (ns *Namespace) MarshalJSON() ([]byte, error) {
54
+	return json.Marshal(ns.Key)
55
+}
56
+
57
+func (ns *Namespace) UnmarshalJSON(src []byte) error {
58
+	var nsName string
59
+	if err := json.Unmarshal(src, &nsName); err != nil {
60
+		return err
61
+	}
62
+	ret := GetNamespace(nsName)
63
+	if ret == nil {
64
+		return ErrUnkownNamespace
65
+	}
66
+	*ns = *ret
67
+	return nil
68
+}
69
+
70
+func GetNamespace(key string) *Namespace {
71
+	for _, ns := range namespaceList {
72
+		if ns.Key == key {
73
+			return ns
74
+		}
75
+	}
76
+	if os.Getenv("DEBUG") != "" {
77
+		panic("Unreachable: Namespace not found")
78
+	}
79
+	return nil
80
+}
81
+
82
+// Contains returns true if the specified Namespace is
83
+// in the slice
84
+func (n Namespaces) Contains(ns string) bool {
85
+	return GetNamespace(ns) != nil
86
+}
87
+
88
+type (
89
+	Capability struct {
90
+		Key   string
91
+		Value capability.Cap
92
+	}
93
+	Capabilities []*Capability
94
+)
95
+
96
+func (c *Capability) String() string {
97
+	return c.Key
98
+}
99
+
100
+func (c *Capability) MarshalJSON() ([]byte, error) {
101
+	return json.Marshal(c.Key)
102
+}
103
+
104
+func (c *Capability) UnmarshalJSON(src []byte) error {
105
+	var capName string
106
+	if err := json.Unmarshal(src, &capName); err != nil {
107
+		return err
108
+	}
109
+	ret := GetCapability(capName)
110
+	if ret == nil {
111
+		return ErrUnkownCapability
112
+	}
113
+	*c = *ret
114
+	return nil
115
+}
116
+
117
+func GetCapability(key string) *Capability {
118
+	for _, capp := range capabilityList {
119
+		if capp.Key == key {
120
+			return capp
121
+		}
122
+	}
123
+	if os.Getenv("DEBUG") != "" {
124
+		panic("Unreachable: Capability not found")
125
+	}
126
+	return nil
127
+}
128
+
129
+// Contains returns true if the specified Capability is
130
+// in the slice
131
+func (c Capabilities) Contains(capp string) bool {
132
+	return GetCapability(capp) != nil
133
+}
0 134
new file mode 100644
... ...
@@ -0,0 +1,16 @@
0
+package libcontainer
1
+
2
+import (
3
+	"syscall"
4
+)
5
+
6
+func init() {
7
+	namespaceList = Namespaces{
8
+		{Key: "NEWNS", Value: syscall.CLONE_NEWNS, File: "mnt"},
9
+		{Key: "NEWUTS", Value: syscall.CLONE_NEWUTS, File: "uts"},
10
+		{Key: "NEWIPC", Value: syscall.CLONE_NEWIPC, File: "ipc"},
11
+		{Key: "NEWUSER", Value: syscall.CLONE_NEWUSER, File: "user"},
12
+		{Key: "NEWPID", Value: syscall.CLONE_NEWPID, File: "pid"},
13
+		{Key: "NEWNET", Value: syscall.CLONE_NEWNET, File: "net"},
14
+	}
15
+}
0 16
new file mode 100644
... ...
@@ -0,0 +1,28 @@
0
+package utils
1
+
2
+import (
3
+	"crypto/rand"
4
+	"encoding/hex"
5
+	"io"
6
+	"path/filepath"
7
+)
8
+
9
+// GenerateRandomName returns a new name joined with a prefix.  The size
10
+// specified is used to truncate the randomly generated value
11
+func GenerateRandomName(prefix string, size int) (string, error) {
12
+	id := make([]byte, 32)
13
+	if _, err := io.ReadFull(rand.Reader, id); err != nil {
14
+		return "", err
15
+	}
16
+	return prefix + hex.EncodeToString(id)[:size], nil
17
+}
18
+
19
+// ResolveRootfs ensures that the current working directory is
20
+// not a symlink and returns the absolute path to the rootfs
21
+func ResolveRootfs(uncleanRootfs string) (string, error) {
22
+	rootfs, err := filepath.Abs(uncleanRootfs)
23
+	if err != nil {
24
+		return "", err
25
+	}
26
+	return filepath.EvalSymlinks(rootfs)
27
+}
0 28
new file mode 100644
... ...
@@ -0,0 +1,145 @@
0
+package system
1
+
2
+import (
3
+	"os/exec"
4
+	"syscall"
5
+)
6
+
7
+func Chroot(dir string) error {
8
+	return syscall.Chroot(dir)
9
+}
10
+
11
+func Chdir(dir string) error {
12
+	return syscall.Chdir(dir)
13
+}
14
+
15
+func Exec(cmd string, args []string, env []string) error {
16
+	return syscall.Exec(cmd, args, env)
17
+}
18
+
19
+func Execv(cmd string, args []string, env []string) error {
20
+	name, err := exec.LookPath(cmd)
21
+	if err != nil {
22
+		return err
23
+	}
24
+	return Exec(name, args, env)
25
+}
26
+
27
+func Fork() (int, error) {
28
+	syscall.ForkLock.Lock()
29
+	pid, _, err := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0)
30
+	syscall.ForkLock.Unlock()
31
+	if err != 0 {
32
+		return -1, err
33
+	}
34
+	return int(pid), nil
35
+}
36
+
37
+func Mount(source, target, fstype string, flags uintptr, data string) error {
38
+	return syscall.Mount(source, target, fstype, flags, data)
39
+}
40
+
41
+func Unmount(target string, flags int) error {
42
+	return syscall.Unmount(target, flags)
43
+}
44
+
45
+func Pivotroot(newroot, putold string) error {
46
+	return syscall.PivotRoot(newroot, putold)
47
+}
48
+
49
+func Unshare(flags int) error {
50
+	return syscall.Unshare(flags)
51
+}
52
+
53
+func Clone(flags uintptr) (int, error) {
54
+	syscall.ForkLock.Lock()
55
+	pid, _, err := syscall.RawSyscall(syscall.SYS_CLONE, flags, 0, 0)
56
+	syscall.ForkLock.Unlock()
57
+	if err != 0 {
58
+		return -1, err
59
+	}
60
+	return int(pid), nil
61
+}
62
+
63
+func UsetCloseOnExec(fd uintptr) error {
64
+	if _, _, err := syscall.Syscall(syscall.SYS_FCNTL, fd, syscall.F_SETFD, 0); err != 0 {
65
+		return err
66
+	}
67
+	return nil
68
+}
69
+
70
+func Setgroups(gids []int) error {
71
+	return syscall.Setgroups(gids)
72
+}
73
+
74
+func Setresgid(rgid, egid, sgid int) error {
75
+	return syscall.Setresgid(rgid, egid, sgid)
76
+}
77
+
78
+func Setresuid(ruid, euid, suid int) error {
79
+	return syscall.Setresuid(ruid, euid, suid)
80
+}
81
+
82
+func Setgid(gid int) error {
83
+	return syscall.Setgid(gid)
84
+}
85
+
86
+func Setuid(uid int) error {
87
+	return syscall.Setuid(uid)
88
+}
89
+
90
+func Sethostname(name string) error {
91
+	return syscall.Sethostname([]byte(name))
92
+}
93
+
94
+func Setsid() (int, error) {
95
+	return syscall.Setsid()
96
+}
97
+
98
+func Ioctl(fd uintptr, flag, data uintptr) error {
99
+	if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data); err != 0 {
100
+		return err
101
+	}
102
+	return nil
103
+}
104
+
105
+func Closefd(fd uintptr) error {
106
+	return syscall.Close(int(fd))
107
+}
108
+
109
+func Dup2(fd1, fd2 uintptr) error {
110
+	return syscall.Dup2(int(fd1), int(fd2))
111
+}
112
+
113
+func Mknod(path string, mode uint32, dev int) error {
114
+	return syscall.Mknod(path, mode, dev)
115
+}
116
+
117
+func ParentDeathSignal() error {
118
+	if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0); err != 0 {
119
+		return err
120
+	}
121
+	return nil
122
+}
123
+
124
+func Setctty() error {
125
+	if _, _, err := syscall.RawSyscall(syscall.SYS_IOCTL, 0, uintptr(syscall.TIOCSCTTY), 0); err != 0 {
126
+		return err
127
+	}
128
+	return nil
129
+}
130
+
131
+func Mkfifo(name string, mode uint32) error {
132
+	return syscall.Mkfifo(name, mode)
133
+}
134
+
135
+func Umask(mask int) int {
136
+	return syscall.Umask(mask)
137
+}
138
+
139
+func SetCloneFlags(cmd *exec.Cmd, flag uintptr) {
140
+	if cmd.SysProcAttr == nil {
141
+		cmd.SysProcAttr = &syscall.SysProcAttr{}
142
+	}
143
+	cmd.SysProcAttr.Cloneflags = flag
144
+}
0 145
new file mode 100644
... ...
@@ -0,0 +1,9 @@
0
+package system
1
+
2
+import (
3
+	"errors"
4
+)
5
+
6
+var (
7
+	ErrNotSupportedPlatform = errors.New("platform and architecture is not supported")
8
+)
0 9
new file mode 100644
... ...
@@ -0,0 +1,58 @@
0
+package system
1
+
2
+import (
3
+	"fmt"
4
+	"os"
5
+	"syscall"
6
+	"unsafe"
7
+)
8
+
9
+// Unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f.
10
+// Unlockpt should be called before opening the slave side of a pseudoterminal.
11
+func Unlockpt(f *os.File) error {
12
+	var u int
13
+	return Ioctl(f.Fd(), syscall.TIOCSPTLCK, uintptr(unsafe.Pointer(&u)))
14
+}
15
+
16
+// Ptsname retrieves the name of the first available pts for the given master.
17
+func Ptsname(f *os.File) (string, error) {
18
+	var n int
19
+
20
+	if err := Ioctl(f.Fd(), syscall.TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil {
21
+		return "", err
22
+	}
23
+	return fmt.Sprintf("/dev/pts/%d", n), nil
24
+}
25
+
26
+// CreateMasterAndConsole will open /dev/ptmx on the host and retrieve the
27
+// pts name for use as the pty slave inside the container
28
+func CreateMasterAndConsole() (*os.File, string, error) {
29
+	master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0)
30
+	if err != nil {
31
+		return nil, "", err
32
+	}
33
+	console, err := Ptsname(master)
34
+	if err != nil {
35
+		return nil, "", err
36
+	}
37
+	if err := Unlockpt(master); err != nil {
38
+		return nil, "", err
39
+	}
40
+	return master, console, nil
41
+}
42
+
43
+// OpenPtmx opens /dev/ptmx, i.e. the PTY master.
44
+func OpenPtmx() (*os.File, error) {
45
+	// O_NOCTTY and O_CLOEXEC are not present in os package so we use the syscall's one for all.
46
+	return os.OpenFile("/dev/ptmx", syscall.O_RDONLY|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0)
47
+}
48
+
49
+// OpenTerminal is a clone of os.OpenFile without the O_CLOEXEC
50
+// used to open the pty slave inside the container namespace
51
+func OpenTerminal(name string, flag int) (*os.File, error) {
52
+	r, e := syscall.Open(name, flag, 0)
53
+	if e != nil {
54
+		return nil, &os.PathError{"open", name, e}
55
+	}
56
+	return os.NewFile(uintptr(r), name), nil
57
+}
0 58
new file mode 100644
... ...
@@ -0,0 +1,27 @@
0
+package system
1
+
2
+import (
3
+	"fmt"
4
+	"runtime"
5
+	"syscall"
6
+)
7
+
8
+// Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092
9
+//
10
+// We need different setns values for the different platforms and arch
11
+// We are declaring the macro here because the SETNS syscall does not exist in the stdlib
12
+var setNsMap = map[string]uintptr{
13
+	"linux/amd64": 308,
14
+}
15
+
16
+func Setns(fd uintptr, flags uintptr) error {
17
+	ns, exists := setNsMap[fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH)]
18
+	if !exists {
19
+		return ErrNotSupportedPlatform
20
+	}
21
+	_, _, err := syscall.RawSyscall(ns, fd, flags, 0)
22
+	if err != 0 {
23
+		return err
24
+	}
25
+	return nil
26
+}
0 27
new file mode 100644
... ...
@@ -0,0 +1,15 @@
0
+// +build !linux
1
+
2
+package system
3
+
4
+import (
5
+	"os/exec"
6
+)
7
+
8
+func SetCloneFlags(cmd *exec.Cmd, flag uintptr) {
9
+
10
+}
11
+
12
+func UsetCloseOnExec(fd uintptr) error {
13
+	return ErrNotSupportedPlatform
14
+}
... ...
@@ -7,8 +7,8 @@ import (
7 7
 	"github.com/dotcloud/docker/dockerversion"
8 8
 	"github.com/dotcloud/docker/engine"
9 9
 	"github.com/dotcloud/docker/execdriver"
10
-	"github.com/dotcloud/docker/execdriver/chroot"
11 10
 	"github.com/dotcloud/docker/execdriver/lxc"
11
+	"github.com/dotcloud/docker/execdriver/native"
12 12
 	"github.com/dotcloud/docker/graphdriver"
13 13
 	"github.com/dotcloud/docker/graphdriver/aufs"
14 14
 	_ "github.com/dotcloud/docker/graphdriver/btrfs"
... ...
@@ -702,17 +702,21 @@ func NewRuntimeFromDirectory(config *DaemonConfig, eng *engine.Engine) (*Runtime
702 702
 		sysInitPath = localCopy
703 703
 	}
704 704
 
705
-	sysInfo := sysinfo.New(false)
705
+	var (
706
+		ed      execdriver.Driver
707
+		sysInfo = sysinfo.New(false)
708
+	)
706 709
 
707
-	var ed execdriver.Driver
708
-	utils.Debugf("execDriver: provided %s", config.ExecDriver)
709
-	if config.ExecDriver == "chroot" && false {
710
-		// chroot is presently a noop driver https://github.com/dotcloud/docker/pull/4189#issuecomment-35330655
711
-		ed, err = chroot.NewDriver()
712
-		utils.Debugf("execDriver: using chroot")
713
-	} else {
710
+	switch config.ExecDriver {
711
+	case "lxc":
712
+		// we want to five the lxc driver the full docker root because it needs
713
+		// to access and write config and template files in /var/lib/docker/containers/*
714
+		// to be backwards compatible
714 715
 		ed, err = lxc.NewDriver(config.Root, sysInfo.AppArmor)
715
-		utils.Debugf("execDriver: using lxc")
716
+	case "native":
717
+		ed, err = native.NewDriver(path.Join(config.Root, "execdriver", "native"))
718
+	default:
719
+		return nil, fmt.Errorf("unknown exec driver %s", config.ExecDriver)
716 720
 	}
717 721
 	if err != nil {
718 722
 		return nil, err
... ...
@@ -5,8 +5,8 @@ import (
5 5
 	"flag"
6 6
 	"fmt"
7 7
 	"github.com/dotcloud/docker/execdriver"
8
-	_ "github.com/dotcloud/docker/execdriver/chroot"
9 8
 	_ "github.com/dotcloud/docker/execdriver/lxc"
9
+	_ "github.com/dotcloud/docker/execdriver/native"
10 10
 	"io/ioutil"
11 11
 	"log"
12 12
 	"os"
... ...
@@ -53,19 +53,21 @@ func SysInit() {
53 53
 		privileged = flag.Bool("privileged", false, "privileged mode")
54 54
 		mtu        = flag.Int("mtu", 1500, "interface mtu")
55 55
 		driver     = flag.String("driver", "", "exec driver")
56
+		pipe       = flag.Int("pipe", 0, "sync pipe fd")
57
+		console    = flag.String("console", "", "console (pty slave) path")
58
+		root       = flag.String("root", ".", "root path for configuration files")
56 59
 	)
57 60
 	flag.Parse()
58 61
 
59 62
 	// Get env
60 63
 	var env []string
61
-	content, err := ioutil.ReadFile("/.dockerenv")
64
+	content, err := ioutil.ReadFile(".dockerenv")
62 65
 	if err != nil {
63 66
 		log.Fatalf("Unable to load environment variables: %v", err)
64 67
 	}
65 68
 	if err := json.Unmarshal(content, &env); err != nil {
66 69
 		log.Fatalf("Unable to unmarshal environment variables: %v", err)
67 70
 	}
68
-
69 71
 	// Propagate the plugin-specific container env variable
70 72
 	env = append(env, "container="+os.Getenv("container"))
71 73
 
... ...
@@ -79,6 +81,9 @@ func SysInit() {
79 79
 		Args:       flag.Args(),
80 80
 		Mtu:        *mtu,
81 81
 		Driver:     *driver,
82
+		Console:    *console,
83
+		Pipe:       *pipe,
84
+		Root:       *root,
82 85
 	}
83 86
 
84 87
 	if err := executeProgram(args); err != nil {