
libcontainer: Initial version of cgroups support

This is a minimal version of raw cgroup support for libcontainer.
It has only enough for what docker needs, and it has no support
for systemd yet.

Docker-DCO-1.1-Signed-off-by: Alexander Larsson <alexl@redhat.com> (github: alexlarsson)

Alexander Larsson authored on 2014/02/21 07:12:08
Showing 6 changed files
@@ -40,6 +40,16 @@ func GetThisCgroupDir(subsystem string) (string, error) {
 	return parseCgroupFile(subsystem, f)
 }
 
+func GetInitCgroupDir(subsystem string) (string, error) {
+	f, err := os.Open("/proc/1/cgroup")
+	if err != nil {
+		return "", err
+	}
+	defer f.Close()
+
+	return parseCgroupFile(subsystem, f)
+}
+
 func parseCgroupFile(subsystem string, r io.Reader) (string, error) {
 	s := bufio.NewScanner(r)
 
@@ -49,8 +59,10 @@ func parseCgroupFile(subsystem string, r io.Reader) (string, error) {
 		}
 		text := s.Text()
 		parts := strings.Split(text, ":")
-		if parts[1] == subsystem {
-			return parts[2], nil
+		for _, subs := range strings.Split(parts[1], ",") {
+			if subs == subsystem {
+				return parts[2], nil
+			}
 		}
 	}
 	return "", fmt.Errorf("cgroup '%s' not found in /proc/self/cgroup", subsystem)
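For context, each line of /proc/self/cgroup (and /proc/1/cgroup) has the form hierarchy-id:subsystem-list:cgroup-path, e.g. 3:cpu,cpuacct:/docker/abc, so a single entry can name several comma-separated subsystems; the new loop matches any of them. A minimal standalone sketch of the same matching logic (the sample input and function name are illustrative, not part of the commit):

package main

import (
	"bufio"
	"fmt"
	"strings"
)

// findCgroupDir mirrors the parseCgroupFile logic above: split each line on
// ":" and match the requested subsystem against the comma-separated list in
// the second field.
func findCgroupDir(subsystem, contents string) (string, error) {
	s := bufio.NewScanner(strings.NewReader(contents))
	for s.Scan() {
		parts := strings.Split(s.Text(), ":")
		if len(parts) < 3 {
			continue
		}
		for _, subs := range strings.Split(parts[1], ",") {
			if subs == subsystem {
				return parts[2], nil
			}
		}
	}
	return "", fmt.Errorf("cgroup '%s' not found", subsystem)
}

func main() {
	// Hypothetical /proc/self/cgroup contents.
	sample := "4:devices:/docker\n3:cpu,cpuacct:/docker\n2:memory:/docker\n"
	dir, err := findCgroupDir("cpuacct", sample)
	fmt.Println(dir, err) // prints: /docker <nil>
}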
new file mode 100644
@@ -0,0 +1,177 @@
+package cgroup
+
+import (
+	"fmt"
+	"github.com/dotcloud/docker/pkg/cgroups"
+	"github.com/dotcloud/docker/pkg/libcontainer"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strconv"
+)
+
+// We have two implementations of cgroups support, one is based on
+// systemd and the dbus api, and one is based on raw cgroup fs operations
+// following the pre-single-writer model docs at:
+// http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/
+const (
+	cgroupRoot = "/sys/fs/cgroup"
+)
+
+func useSystemd() bool {
+	return false
+}
+
+func applyCgroupSystemd(container *libcontainer.Container, pid int) error {
+	return fmt.Errorf("not supported yet")
+}
+
+func writeFile(dir, file, data string) error {
+	return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
+}
+
+func getCgroup(subsystem string, container *libcontainer.Container) (string, error) {
+	cgroup := container.CgroupName
+	if container.CgroupParent != "" {
+		cgroup = filepath.Join(container.CgroupParent, cgroup)
+	}
+
+	initPath, err := cgroups.GetInitCgroupDir(subsystem)
+	if err != nil {
+		return "", err
+	}
+
+	path := filepath.Join(cgroupRoot, subsystem, initPath, cgroup)
+
+	return path, nil
+}
+
+func joinCgroup(subsystem string, container *libcontainer.Container, pid int) (string, error) {
+	path, err := getCgroup(subsystem, container)
+	if err != nil {
+		return "", err
+	}
+
+	if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) {
+		return "", err
+	}
+
+	if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil {
+		return "", err
+	}
+
+	return path, nil
+}
+
+func applyCgroupRaw(container *libcontainer.Container, pid int) (retErr error) {
+	if _, err := os.Stat(cgroupRoot); err != nil {
+		return fmt.Errorf("cgroups fs not found")
+	}
+
+	if !container.DeviceAccess {
+		dir, err := joinCgroup("devices", container, pid)
+		if err != nil {
+			return err
+		}
+		defer func() {
+			if retErr != nil {
+				os.RemoveAll(dir)
+			}
+		}()
+
+		if err := writeFile(dir, "devices.deny", "a"); err != nil {
+			return err
+		}
+
+		allow := []string{
+			// /dev/null, zero, full
+			"c 1:3 rwm",
+			"c 1:5 rwm",
+			"c 1:7 rwm",
+
+			// consoles
+			"c 5:1 rwm",
+			"c 5:0 rwm",
+			"c 4:0 rwm",
+			"c 4:1 rwm",
+
+			// /dev/urandom,/dev/random
+			"c 1:9 rwm",
+			"c 1:8 rwm",
+
+			// /dev/pts/ - pts namespaces are "coming soon"
+			"c 136:* rwm",
+			"c 5:2 rwm",
+
+			// tuntap
+			"c 10:200 rwm",
+		}
+
+		for _, val := range allow {
+			if err := writeFile(dir, "devices.allow", val); err != nil {
+				return err
+			}
+		}
+	}
+
+	if container.Memory != 0 || container.MemorySwap != 0 {
+		dir, err := joinCgroup("memory", container, pid)
+		if err != nil {
+			return err
+		}
+		defer func() {
+			if retErr != nil {
+				os.RemoveAll(dir)
+			}
+		}()
+
+		if container.Memory != 0 {
+			if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(container.Memory, 10)); err != nil {
+				return err
+			}
+			if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(container.Memory, 10)); err != nil {
+				return err
+			}
+		}
+		if container.MemorySwap != 0 {
+			if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(container.MemorySwap, 10)); err != nil {
+				return err
+			}
+		}
+	}
+
+	// We always want to join the cpu group, to allow fair cpu scheduling
+	// on a container basis
+	dir, err := joinCgroup("cpu", container, pid)
+	if err != nil {
+		return err
+	}
+	if container.CpuShares != 0 {
+		if err := writeFile(dir, "cpu.shares", strconv.FormatInt(container.CpuShares, 10)); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func CleanupCgroup(container *libcontainer.Container) error {
+	path, _ := getCgroup("memory", container)
+	os.RemoveAll(path)
+	path, _ = getCgroup("devices", container)
+	os.RemoveAll(path)
+	path, _ = getCgroup("cpu", container)
+	os.RemoveAll(path)
+	return nil
+}
+
+func ApplyCgroup(container *libcontainer.Container, pid int) error {
+	if container.CgroupName == "" {
+		return nil
+	}
+
+	if useSystemd() {
+		return applyCgroupSystemd(container, pid)
+	} else {
+		return applyCgroupRaw(container, pid)
+	}
+}
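A rough usage sketch of the new package, assuming a libcontainer.Container with the cgroup fields added in the Container struct hunk below; the real call site is the execCommand hunk further down, and the pid and cgroup values here are placeholders:

package main

import (
	"log"

	"github.com/dotcloud/docker/pkg/libcontainer"
	"github.com/dotcloud/docker/pkg/libcontainer/cgroup"
)

// placeInCgroups is a hypothetical helper showing the intended call pattern:
// set the new cgroup fields on the container config, apply them to the child
// pid right after forking it, and remove the groups again on cleanup.
func placeInCgroups(pid int) error {
	container := &libcontainer.Container{
		CgroupName:   "docker-koye",     // placeholder name, as in the sample config
		CgroupParent: "docker",
		Memory:       512 * 1024 * 1024, // illustrative 512 MB limit
		CpuShares:    512,               // illustrative relative weight
		DeviceAccess: false,             // deny all devices except the built-in whitelist
	}

	if err := cgroup.ApplyCgroup(container, pid); err != nil {
		return err
	}
	defer cgroup.CleanupCgroup(container)

	// ... wait for the container process here ...
	return nil
}

func main() {
	if err := placeInCgroups(1234); err != nil { // 1234 is a placeholder pid
		log.Fatal(err)
	}
}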
@@ -11,6 +11,13 @@ type Container struct {
 	Namespaces   Namespaces   `json:"namespaces,omitempty"`   // namespaces to apply
 	Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop
 	Network      *Network     `json:"network,omitempty"`      // nil for host's network stack
+
+	CgroupName   string `json:"cgroup_name,omitempty"`   // name of cgroup
+	CgroupParent string `json:"cgroup_parent,omitempty"` // name of parent cgroup or slice
+	DeviceAccess bool   `json:"device_access,omitempty"` // allow the container full access to devices
+	Memory       int64  `json:"memory,omitempty"`        // memory limit (in bytes)
+	MemorySwap   int64  `json:"memory_swap,omitempty"`   // total memory limit (memory + swap); set to -1 to disable swap
+	CpuShares    int64  `json:"cpu_shares,omitempty"`    // CPU shares (relative weight vs. other containers)
 }
 
 // Network defines configuration for a container's networking stack
@@ -34,5 +34,8 @@
         "gateway": "172.17.42.1",
         "bridge": "docker0",
         "mtu": 1500
-    }
+    },
+    "cgroup_name": "docker-koye",
+    "cgroup_parent": "docker",
+    "memory": 524800
 }
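The new keys in this sample map directly onto the Container fields above via their JSON tags. A small sketch of loading them (the file path and error handling are illustrative, not part of the commit):

package main

import (
	"encoding/json"
	"fmt"
	"io/ioutil"

	"github.com/dotcloud/docker/pkg/libcontainer"
)

func main() {
	// Hypothetical path to the sample configuration shown above.
	data, err := ioutil.ReadFile("container.json")
	if err != nil {
		panic(err)
	}

	var container libcontainer.Container
	if err := json.Unmarshal(data, &container); err != nil {
		panic(err)
	}

	// "cgroup_name", "cgroup_parent" and "memory" end up in these fields:
	fmt.Println(container.CgroupName, container.CgroupParent, container.Memory)
}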
@@ -5,6 +5,7 @@ package main
 import (
 	"fmt"
 	"github.com/dotcloud/docker/pkg/libcontainer"
+	"github.com/dotcloud/docker/pkg/libcontainer/cgroup"
 	"github.com/dotcloud/docker/pkg/libcontainer/network"
 	"github.com/dotcloud/docker/pkg/libcontainer/utils"
 	"github.com/dotcloud/docker/pkg/system"
@@ -33,10 +34,18 @@ func execCommand(container *libcontainer.Container, args []string) (int, error)
 		return -1, err
 	}
 	if err := writePidFile(command); err != nil {
+		command.Process.Kill()
 		return -1, err
 	}
 	defer deletePidFile()
 
+	// Do this before syncing with child so that no children
+	// can escape the cgroup
+	if err := cgroup.ApplyCgroup(container, command.Process.Pid); err != nil {
+		command.Process.Kill()
+		return -1, err
+	}
+
 	if container.Network != nil {
 		vethPair, err := initializeContainerVeth(container.Network.Bridge, command.Process.Pid)
 		if err != nil {
@@ -45,6 +54,9 @@ func execCommand(container *libcontainer.Container, args []string) (int, error)
 		sendVethName(vethPair, inPipe)
 	}
 
+	// Sync with child
+	inPipe.Close()
+
 	go io.Copy(os.Stdout, master)
 	go io.Copy(master, os.Stdin)
 
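The ordering above is the point of the change: the parent applies the cgroups and writes the veth name (if any) before closing inPipe, and the child blocks reading that pipe until EOF, so it cannot spawn anything that would escape the cgroup. A minimal sketch of this pipe handshake (assumed shape, with a goroutine standing in for the child process; not the commit's actual plumbing):

package main

import (
	"fmt"
	"io/ioutil"
	"os"
)

// The child side blocks reading the pipe until the parent has finished its
// setup (cgroups, veth name) and closed the write end.
func main() {
	r, w, err := os.Pipe()
	if err != nil {
		panic(err)
	}

	done := make(chan struct{})
	go func() { // stands in for the container's init process
		payload, _ := ioutil.ReadAll(r) // blocks until the parent closes w
		fmt.Printf("child unblocked, got %q\n", payload)
		close(done)
	}()

	// Parent side: finish all setup first (cgroups, network, ...).
	fmt.Fprint(w, "veth0abc") // optional payload, e.g. the temporary veth name
	w.Close()                 // ... then close the pipe to release the child
	<-done
}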
@@ -67,7 +79,6 @@ func execCommand(container *libcontainer.Container, args []string) (int, error)
 // pipe so that the child stops waiting for more data
 func sendVethName(name string, pipe io.WriteCloser) {
 	fmt.Fprint(pipe, name)
-	pipe.Close()
 }
 
 // initializeContainerVeth will create a veth pair and setup the host's
@@ -20,12 +20,10 @@ func initCommand(container *libcontainer.Container, console string, args []strin
 		return err
 	}
 
-	var tempVethName string
-	if container.Network != nil {
-		tempVethName, err = getVethName()
-		if err != nil {
-			return err
-		}
+	// We always read this as it is a way to sync with the parent as well
+	tempVethName, err := getVethName()
+	if err != nil {
+		return err
 	}
 
 	// close pipes so that we can replace it with the pty