Browse code

Allow IPC namespace to be shared between containers or with the host

Some workloads rely on IPC for communication with other processes. We
would like to split workloads between two containers but still allow them
to communicate through shared IPC.

This patch mimics the --net code: --ipc=host keeps the container in the
host's IPC namespace instead of splitting off a new one, and
--ipc=container:CONTAINERID shares the IPC namespace between containers.

If you share IPC between containers, then you need to make sure SELinux labels
match.

Docker-DCO-1.1-Signed-off-by: Dan Walsh <dwalsh@redhat.com> (github: rhatdan)

Dan Walsh authored on 2014/11/11 06:14:17
Showing 9 changed files
... ...
@@ -233,6 +233,18 @@ func populateCommand(c *Container, env []string) error {
233 233
 		return fmt.Errorf("invalid network mode: %s", c.hostConfig.NetworkMode)
234 234
 	}
235 235
 
236
+	ipc := &execdriver.Ipc{}
237
+
238
+	if c.hostConfig.IpcMode.IsContainer() {
239
+		ic, err := c.getIpcContainer()
240
+		if err != nil {
241
+			return err
242
+		}
243
+		ipc.ContainerID = ic.ID
244
+	} else {
245
+		ipc.HostIpc = c.hostConfig.IpcMode.IsHost()
246
+	}
247
+
236 248
 	// Build lists of devices allowed and created within the container.
237 249
 	userSpecifiedDevices := make([]*devices.Device, len(c.hostConfig.Devices))
238 250
 	for i, deviceMapping := range c.hostConfig.Devices {
... ...
@@ -274,6 +286,7 @@ func populateCommand(c *Container, env []string) error {
274 274
 		InitPath:           "/.dockerinit",
275 275
 		WorkingDir:         c.Config.WorkingDir,
276 276
 		Network:            en,
277
+		Ipc:                ipc,
277 278
 		Resources:          resources,
278 279
 		AllowedDevices:     allowedDevices,
279 280
 		AutoCreatedDevices: autoCreatedDevices,
... ...
@@ -1250,10 +1263,25 @@ func (container *Container) GetMountLabel() string {
1250 1250
 	return container.MountLabel
1251 1251
 }
1252 1252
 
1253
+func (container *Container) getIpcContainer() (*Container, error) {
1254
+	containerID := container.hostConfig.IpcMode.Container()
1255
+	c := container.daemon.Get(containerID)
1256
+	if c == nil {
1257
+		return nil, fmt.Errorf("no such container to join IPC: %s", containerID)
1258
+	}
1259
+	if !c.IsRunning() {
1260
+		return nil, fmt.Errorf("cannot join IPC of a non running container: %s", containerID)
1261
+	}
1262
+	return c, nil
1263
+}
1264
+
1253 1265
 func (container *Container) getNetworkedContainer() (*Container, error) {
1254 1266
 	parts := strings.SplitN(string(container.hostConfig.NetworkMode), ":", 2)
1255 1267
 	switch parts[0] {
1256 1268
 	case "container":
1269
+		if len(parts) != 2 {
1270
+			return nil, fmt.Errorf("no container specified to join network")
1271
+		}
1257 1272
 		nc := container.daemon.Get(parts[1])
1258 1273
 		if nc == nil {
1259 1274
 			return nil, fmt.Errorf("no such container to join network: %s", parts[1])
... ...
@@ -1,10 +1,13 @@
1 1
 package daemon
2 2
 
3 3
 import (
4
+	"fmt"
5
+
4 6
 	"github.com/docker/docker/engine"
5 7
 	"github.com/docker/docker/graph"
6 8
 	"github.com/docker/docker/pkg/parsers"
7 9
 	"github.com/docker/docker/runconfig"
10
+	"github.com/docker/libcontainer/label"
8 11
 )
9 12
 
10 13
 func (daemon *Daemon) ContainerCreate(job *engine.Job) engine.Status {
... ...
@@ -80,6 +83,12 @@ func (daemon *Daemon) Create(config *runconfig.Config, hostConfig *runconfig.Hos
80 80
 	if warnings, err = daemon.mergeAndVerifyConfig(config, img); err != nil {
81 81
 		return nil, nil, err
82 82
 	}
83
+	if hostConfig != nil && config.SecurityOpt == nil {
84
+		config.SecurityOpt, err = daemon.GenerateSecurityOpt(hostConfig.IpcMode)
85
+		if err != nil {
86
+			return nil, nil, err
87
+		}
88
+	}
83 89
 	if container, err = daemon.newContainer(name, config, img); err != nil {
84 90
 		return nil, nil, err
85 91
 	}
... ...
@@ -99,3 +108,20 @@ func (daemon *Daemon) Create(config *runconfig.Config, hostConfig *runconfig.Hos
99 99
 	}
100 100
 	return container, warnings, nil
101 101
 }
102
+func (daemon *Daemon) GenerateSecurityOpt(ipcMode runconfig.IpcMode) ([]string, error) {
103
+	if ipcMode.IsHost() {
104
+		return label.DisableSecOpt(), nil
105
+	}
106
+	if ipcContainer := ipcMode.Container(); ipcContainer != "" {
107
+		c := daemon.Get(ipcContainer)
108
+		if c == nil {
109
+			return nil, fmt.Errorf("no such container to join IPC: %s", ipcContainer)
110
+		}
111
+		if !c.IsRunning() {
112
+			return nil, fmt.Errorf("cannot join IPC of a non running container: %s", ipcContainer)
113
+		}
114
+
115
+		return label.DupSecOpt(c.ProcessLabel), nil
116
+	}
117
+	return nil, nil
118
+}
... ...
@@ -62,6 +62,12 @@ type Network struct {
62 62
 	HostNetworking bool              `json:"host_networking"`
63 63
 }
64 64
 
65
// Ipc holds the IPC settings of the container.
type Ipc struct {
	// ContainerID is the id of the container whose IPC namespace is joined.
	ContainerID string `json:"container_id"`

	// HostIpc reports whether the container uses the host's IPC namespace.
	HostIpc bool `json:"host_ipc"`
}
70
+
65 71
 type NetworkInterface struct {
66 72
 	Gateway     string `json:"gateway"`
67 73
 	IPAddress   string `json:"ip"`
... ...
@@ -106,6 +112,7 @@ type Command struct {
106 106
 	WorkingDir         string            `json:"working_dir"`
107 107
 	ConfigPath         string            `json:"config_path"` // this should be able to be removed when the lxc template is moved into the driver
108 108
 	Network            *Network          `json:"network"`
109
+	Ipc                *Ipc              `json:"ipc"`
109 110
 	Resources          *Resources        `json:"resources"`
110 111
 	Mounts             []Mount           `json:"mounts"`
111 112
 	AllowedDevices     []*devices.Device `json:"allowed_devices"`
... ...
@@ -36,6 +36,10 @@ func (d *driver) createContainer(c *execdriver.Command) (*libcontainer.Config, e
36 36
 	container.MountConfig.NoPivotRoot = os.Getenv("DOCKER_RAMDISK") != ""
37 37
 	container.RestrictSys = true
38 38
 
39
+	if err := d.createIpc(container, c); err != nil {
40
+		return nil, err
41
+	}
42
+
39 43
 	if err := d.createNetwork(container, c); err != nil {
40 44
 		return nil, err
41 45
 	}
... ...
@@ -124,6 +128,28 @@ func (d *driver) createNetwork(container *libcontainer.Config, c *execdriver.Com
124 124
 	return nil
125 125
 }
126 126
 
127
+func (d *driver) createIpc(container *libcontainer.Config, c *execdriver.Command) error {
128
+	if c.Ipc.HostIpc {
129
+		container.Namespaces["NEWIPC"] = false
130
+		return nil
131
+	}
132
+
133
+	if c.Ipc.ContainerID != "" {
134
+		d.Lock()
135
+		active := d.activeContainers[c.Ipc.ContainerID]
136
+		d.Unlock()
137
+
138
+		if active == nil || active.cmd.Process == nil {
139
+			return fmt.Errorf("%s is not a valid running container to join", c.Ipc.ContainerID)
140
+		}
141
+		cmd := active.cmd
142
+
143
+		container.IpcNsPath = filepath.Join("/proc", fmt.Sprint(cmd.Process.Pid), "ns", "ipc")
144
+	}
145
+
146
+	return nil
147
+}
148
+
127 149
 func (d *driver) setPrivileged(container *libcontainer.Config) (err error) {
128 150
 	container.Capabilities = capabilities.GetAllCapabilities()
129 151
 	container.Cgroups.AllowAllDevices = true
... ...
@@ -23,6 +23,7 @@ docker-run - Run a command in a new container
23 23
 [**--expose**[=*[]*]]
24 24
 [**-h**|**--hostname**[=*HOSTNAME*]]
25 25
 [**-i**|**--interactive**[=*false*]]
26
+[**--ipc**[=*[]*]]
26 27
 [**--security-opt**[=*[]*]]
27 28
 [**--link**[=*[]*]]
28 29
 [**--lxc-conf**[=*[]*]]
... ...
@@ -142,6 +143,12 @@ ENTRYPOINT.
142 142
 **-i**, **--interactive**=*true*|*false*
143 143
    When set to true, keep stdin open even if not attached. The default is false.
144 144
 
145
+**--ipc**=[]
146
+   Set the IPC mode for the container
147
+     **container**:<*name*|*id*>: reuses another container's IPC stack
148
+     **host**: use the host's IPC stack inside the container.  
149
+     Note: the host mode gives the container full access to local IPC and is therefore considered insecure.
150
+
145 151
 **--security-opt**=*secdriver*:*name*:*value*
146 152
     "label:user:USER"   : Set the label user for the container
147 153
     "label:role:ROLE"   : Set the label role for the container
... ...
@@ -183,10 +190,11 @@ and foreground Docker containers.
183 183
 
184 184
 **--net**="bridge"
185 185
    Set the Network mode for the container
186
-                               'bridge': creates a new network stack for the container on the docker bridge
187
-                               'none': no networking for this container
188
-                               'container:<name|id>': reuses another container network stack
189
-                               'host': use the host network stack inside the container.  Note: the host mode gives the container full access to local system services such as D-bus and is therefore considered insecure.
186
+   **bridge**: creates a new network stack for the container on the docker bridge
187
+   **none**: no networking for this container
188
+   **container**:<*name*|*id*>: reuses another container's network stack
189
+   **host**: use the host network stack inside the container.  
190
+   Note: the host mode gives the container full access to local system services such as D-bus and is therefore considered insecure.
190 191
 
191 192
 **--mac-address**=*macaddress*
192 193
    Set the MAC address for the container's Ethernet device:
... ...
@@ -310,6 +318,71 @@ you’d like to connect instead, as in:
310 310
 
311 311
     # docker run -a stdin -a stdout -i -t fedora /bin/bash
312 312
 
313
+## Sharing IPC between containers
314
+
315
+Using shm_server.c available here: http://www.cs.cf.ac.uk/Dave/C/node27.html
316
+
317
+Testing `--ipc=host` mode:
318
+
319
+Host shows a shared memory segment with 7 pids attached, happens to be from httpd:
320
+
321
+```
322
+ $ sudo ipcs -m
323
+
324
+ ------ Shared Memory Segments --------
325
+ key        shmid      owner      perms      bytes      nattch     status      
326
+ 0x01128e25 0          root       600        1000       7                       
327
+```
328
+
329
+Now run a regular container, and it correctly does NOT see the shared memory segment from the host:
330
+
331
+```
332
+ $ sudo docker run -it shm ipcs -m
333
+
334
+ ------ Shared Memory Segments --------	
335
+ key        shmid      owner      perms      bytes      nattch     status      
336
+```
337
+
338
+Run a container with the new `--ipc=host` option, and it now sees the shared memory segment from the host httpd:
339
+
340
+ ```
341
+ $ sudo docker run -it --ipc=host shm ipcs -m
342
+
343
+ ------ Shared Memory Segments --------
344
+ key        shmid      owner      perms      bytes      nattch     status      
345
+ 0x01128e25 0          root       600        1000       7                   
346
+```
347
+Testing `--ipc=container:CONTAINERID` mode:
348
+
349
+Start a container with a program to create a shared memory segment:
350
+```
351
+ sudo docker run -it shm bash
352
+ $ sudo shm/shm_server &
353
+ $ sudo ipcs -m
354
+
355
+ ------ Shared Memory Segments --------
356
+ key        shmid      owner      perms      bytes      nattch     status      
357
+ 0x0000162e 0          root       666        27         1                       
358
+```
359
+Create a 2nd container; it correctly shows no shared memory segment from the 1st container:
360
+```
361
+ $ sudo docker run shm ipcs -m
362
+
363
+ ------ Shared Memory Segments --------
364
+ key        shmid      owner      perms      bytes      nattch     status      
365
+```
366
+
367
+Create a 3rd container using the new --ipc=container:CONTAINERID option, now it shows the shared memory segment from the first:
368
+
369
+```
370
+ $ sudo docker run -it --ipc=container:ed735b2264ac shm ipcs -m
371
+ $ sudo ipcs -m
372
+
373
+ ------ Shared Memory Segments --------
374
+ key        shmid      owner      perms      bytes      nattch     status      
375
+ 0x0000162e 0          root       666        27         1
376
+```
377
+
313 378
 ## Linking Containers
314 379
 
315 380
 The link feature allows multiple containers to communicate with each other. For
... ...
@@ -50,6 +50,7 @@ following options.
50 50
  - [Container Identification](#container-identification)
51 51
      - [Name (--name)](#name-name)
52 52
      - [PID Equivalent](#pid-equivalent)
53
+ - [IPC Settings](#ipc-settings)
53 54
  - [Network Settings](#network-settings)
54 55
  - [Clean Up (--rm)](#clean-up-rm)
55 56
  - [Runtime Constraints on CPU and Memory](#runtime-constraints-on-cpu-and-memory)
... ...
@@ -131,6 +132,22 @@ While not strictly a means of identifying a container, you can specify a version
131 131
 image you'd like to run the container with by adding `image[:tag]` to the command. For
132 132
 example, `docker run ubuntu:14.04`.
133 133
 
134
+## IPC Settings
135
+    --ipc=""  : Set the IPC mode for the container,
136
+                                 'container:<name|id>': reuses another container's IPC namespace
137
+                                 'host': use the host's IPC namespace inside the container
138
+By default, all containers have the IPC namespace enabled.
139
+
140
+IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores and message queues.  
141
+
142
+Shared memory segments are used to accelerate inter-process communication at
143
+memory speed, rather than through pipes or through the network stack. Shared
144
+memory is commonly used by databases and custom-built (typically C/OpenMPI, 
145
+C++/using boost libraries) high performance applications for scientific
146
+computing and financial services industries. If these types of applications
147
+are broken into multiple containers, you might need to share the IPC mechanisms
148
+of the containers.
149
+
134 150
 ## Network settings
135 151
 
136 152
     --dns=[]         : Set custom dns servers for the container
... ...
@@ -2568,3 +2568,73 @@ func TestRunUnknownCommand(t *testing.T) {
2568 2568
 
2569 2569
 	logDone("run - Unknown Command")
2570 2570
 }
2571
+
2572
+func TestRunModeIpcHost(t *testing.T) {
2573
+	hostIpc, err := os.Readlink("/proc/1/ns/ipc")
2574
+	if err != nil {
2575
+		t.Fatal(err)
2576
+	}
2577
+
2578
+	cmd := exec.Command(dockerBinary, "run", "--ipc=host", "busybox", "readlink", "/proc/self/ns/ipc")
2579
+	out2, _, err := runCommandWithOutput(cmd)
2580
+	if err != nil {
2581
+		t.Fatal(err, out2)
2582
+	}
2583
+
2584
+	out2 = strings.Trim(out2, "\n")
2585
+	if hostIpc != out2 {
2586
+		t.Fatalf("IPC different with --ipc=host %s != %s\n", hostIpc, out2)
2587
+	}
2588
+
2589
+	cmd = exec.Command(dockerBinary, "run", "busybox", "readlink", "/proc/self/ns/ipc")
2590
+	out2, _, err = runCommandWithOutput(cmd)
2591
+	if err != nil {
2592
+		t.Fatal(err, out2)
2593
+	}
2594
+
2595
+	out2 = strings.Trim(out2, "\n")
2596
+	if hostIpc == out2 {
2597
+		t.Fatalf("IPC should be different without --ipc=host %s != %s\n", hostIpc, out2)
2598
+	}
2599
+	deleteAllContainers()
2600
+
2601
+	logDone("run - hostname and several network modes")
2602
+}
2603
+
2604
+func TestRunModeIpcContainer(t *testing.T) {
2605
+	cmd := exec.Command(dockerBinary, "run", "-d", "busybox", "top")
2606
+	out, _, err := runCommandWithOutput(cmd)
2607
+	if err != nil {
2608
+		t.Fatal(err, out)
2609
+	}
2610
+	id := strings.TrimSpace(out)
2611
+	state, err := inspectField(id, "State.Running")
2612
+	if err != nil {
2613
+		t.Fatal(err)
2614
+	}
2615
+	if state != "true" {
2616
+		t.Fatal("Container state is 'not running'")
2617
+	}
2618
+	pid1, err := inspectField(id, "State.Pid")
2619
+	if err != nil {
2620
+		t.Fatal(err)
2621
+	}
2622
+
2623
+	parentContainerIpc, err := os.Readlink(fmt.Sprintf("/proc/%s/ns/ipc", pid1))
2624
+	if err != nil {
2625
+		t.Fatal(err)
2626
+	}
2627
+	cmd = exec.Command(dockerBinary, "run", fmt.Sprintf("--ipc=container:%s", id), "busybox", "readlink", "/proc/self/ns/ipc")
2628
+	out2, _, err := runCommandWithOutput(cmd)
2629
+	if err != nil {
2630
+		t.Fatal(err, out2)
2631
+	}
2632
+
2633
+	out2 = strings.Trim(out2, "\n")
2634
+	if parentContainerIpc != out2 {
2635
+		t.Fatalf("IPC different with --ipc=container:%s %s != %s\n", id, parentContainerIpc, out2)
2636
+	}
2637
+	deleteAllContainers()
2638
+
2639
+	logDone("run - hostname and several network modes")
2640
+}
... ...
@@ -28,6 +28,44 @@ func (n NetworkMode) IsNone() bool {
28 28
 	return n == "none"
29 29
 }
30 30
 
31
// IpcMode is the textual IPC mode of a container: "" for a private IPC
// namespace, "host", or "container:<name|id>".
type IpcMode string

// IsPrivate indicates whether the container uses its own private IPC stack,
// that is, the mode is neither host nor container.
func (n IpcMode) IsPrivate() bool {
	return !(n.IsHost() || n.IsContainer())
}

// IsHost indicates whether the mode is exactly "host".
func (n IpcMode) IsHost() bool {
	return n == "host"
}

// IsContainer indicates whether the mode has the form "container:<rest>".
// It does not check that <rest> is non-empty; use Valid for that.
func (n IpcMode) IsContainer() bool {
	parts := strings.SplitN(string(n), ":", 2)
	return len(parts) > 1 && parts[0] == "container"
}

// Valid indicates whether the mode is well formed: "", "host", or
// "container:<name|id>" with a non-empty name and no further colon.
func (n IpcMode) Valid() bool {
	parts := strings.Split(string(n), ":")
	switch mode := parts[0]; mode {
	case "", "host":
	case "container":
		if len(parts) != 2 || parts[1] == "" {
			return false
		}
	default:
		return false
	}
	return true
}

// Container returns the name or id that follows "container:" in the mode.
// It returns "" for every mode that is not of that form, so the result
// agrees with IsContainer even for malformed values such as "foo:bar".
func (n IpcMode) Container() string {
	parts := strings.SplitN(string(n), ":", 2)
	if len(parts) > 1 && parts[0] == "container" {
		return parts[1]
	}
	return ""
}
68
+
31 69
 type DeviceMapping struct {
32 70
 	PathOnHost        string
33 71
 	PathInContainer   string
... ...
@@ -53,6 +91,7 @@ type HostConfig struct {
53 53
 	VolumesFrom     []string
54 54
 	Devices         []DeviceMapping
55 55
 	NetworkMode     NetworkMode
56
+	IpcMode         IpcMode
56 57
 	CapAdd          []string
57 58
 	CapDrop         []string
58 59
 	RestartPolicy   RestartPolicy
... ...
@@ -84,6 +123,7 @@ func ContainerHostConfigFromJob(job *engine.Job) *HostConfig {
84 84
 		Privileged:      job.GetenvBool("Privileged"),
85 85
 		PublishAllPorts: job.GetenvBool("PublishAllPorts"),
86 86
 		NetworkMode:     NetworkMode(job.Getenv("NetworkMode")),
87
+		IpcMode:         IpcMode(job.Getenv("IpcMode")),
87 88
 	}
88 89
 
89 90
 	job.GetenvJson("LxcConf", &hostConfig.LxcConf)
... ...
@@ -60,6 +60,7 @@ func Parse(cmd *flag.FlagSet, args []string, sysInfo *sysinfo.SysInfo) (*Config,
60 60
 		flCpuset          = cmd.String([]string{"-cpuset"}, "", "CPUs in which to allow execution (0-3, 0,1)")
61 61
 		flNetMode         = cmd.String([]string{"-net"}, "bridge", "Set the Network mode for the container\n'bridge': creates a new network stack for the container on the docker bridge\n'none': no networking for this container\n'container:<name|id>': reuses another container network stack\n'host': use the host network stack inside the container.  Note: the host mode gives the container full access to local system services such as D-bus and is therefore considered insecure.")
62 62
 		flMacAddress      = cmd.String([]string{"-mac-address"}, "", "Container MAC address (e.g. 92:d0:c6:0a:29:33)")
63
+		flIpcMode         = cmd.String([]string{"-ipc"}, "", "Default is to create a private IPC namespace (POSIX SysV IPC) for the container\n'container:<name|id>': reuses another container shared memory, semaphores and message queues\n'host': use the host shared memory,semaphores and message queues inside the container.  Note: the host mode gives the container full access to local shared memory and is therefore considered insecure.")
63 64
 		flRestartPolicy   = cmd.String([]string{"-restart"}, "", "Restart policy to apply when a container exits (no, on-failure[:max-retry], always)")
64 65
 	)
65 66
 
... ...
@@ -241,6 +242,11 @@ func Parse(cmd *flag.FlagSet, args []string, sysInfo *sysinfo.SysInfo) (*Config,
241 241
 	// parse the '-e' and '--env' after, to allow override
242 242
 	envVariables = append(envVariables, flEnv.GetAll()...)
243 243
 
244
+	ipcMode := IpcMode(*flIpcMode)
245
+	if !ipcMode.Valid() {
246
+		return nil, nil, cmd, fmt.Errorf("--ipc: invalid IPC mode: %v", err)
247
+	}
248
+
244 249
 	netMode, err := parseNetMode(*flNetMode)
245 250
 	if err != nil {
246 251
 		return nil, nil, cmd, fmt.Errorf("--net: invalid net mode: %v", err)
... ...
@@ -289,6 +295,7 @@ func Parse(cmd *flag.FlagSet, args []string, sysInfo *sysinfo.SysInfo) (*Config,
289 289
 		ExtraHosts:      flExtraHosts.GetAll(),
290 290
 		VolumesFrom:     flVolumesFrom.GetAll(),
291 291
 		NetworkMode:     netMode,
292
+		IpcMode:         ipcMode,
292 293
 		Devices:         deviceMappings,
293 294
 		CapAdd:          flCapAdd.GetAll(),
294 295
 		CapDrop:         flCapDrop.GetAll(),