This is a minimal version of raw cgroup support for libcontainer.
It has only enough for what docker needs, and it has no support
for systemd yet.
Docker-DCO-1.1-Signed-off-by: Alexander Larsson <alexl@redhat.com> (github: alexlarsson)
| ... | ... |
@@ -40,6 +40,16 @@ func GetThisCgroupDir(subsystem string) (string, error) {
|
| 40 | 40 |
return parseCgroupFile(subsystem, f) |
| 41 | 41 |
} |
| 42 | 42 |
|
| 43 |
+func GetInitCgroupDir(subsystem string) (string, error) {
|
|
| 44 |
+ f, err := os.Open("/proc/1/cgroup")
|
|
| 45 |
+ if err != nil {
|
|
| 46 |
+ return "", err |
|
| 47 |
+ } |
|
| 48 |
+ defer f.Close() |
|
| 49 |
+ |
|
| 50 |
+ return parseCgroupFile(subsystem, f) |
|
| 51 |
+} |
|
| 52 |
+ |
|
| 43 | 53 |
func parseCgroupFile(subsystem string, r io.Reader) (string, error) {
|
| 44 | 54 |
s := bufio.NewScanner(r) |
| 45 | 55 |
|
| ... | ... |
@@ -49,8 +59,10 @@ func parseCgroupFile(subsystem string, r io.Reader) (string, error) {
|
| 49 | 49 |
} |
| 50 | 50 |
text := s.Text() |
| 51 | 51 |
parts := strings.Split(text, ":") |
| 52 |
- if parts[1] == subsystem {
|
|
| 53 |
- return parts[2], nil |
|
| 52 |
+ for _, subs := range strings.Split(parts[1], ",") {
|
|
| 53 |
+ if subs == subsystem {
|
|
| 54 |
+ return parts[2], nil |
|
| 55 |
+ } |
|
| 54 | 56 |
} |
| 55 | 57 |
} |
| 56 | 58 |
return "", fmt.Errorf("cgroup '%s' not found in /proc/self/cgroup", subsystem)
|
| 57 | 59 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,177 @@ |
| 0 |
+package cgroup |
|
| 1 |
+ |
|
| 2 |
+import ( |
|
| 3 |
+ "fmt" |
|
| 4 |
+ "github.com/dotcloud/docker/pkg/cgroups" |
|
| 5 |
+ "github.com/dotcloud/docker/pkg/libcontainer" |
|
| 6 |
+ "io/ioutil" |
|
| 7 |
+ "os" |
|
| 8 |
+ "path/filepath" |
|
| 9 |
+ "strconv" |
|
| 10 |
+) |
|
| 11 |
+ |
|
| 12 |
// We have two implementations of cgroups support: one is based on
// systemd and the dbus api, and one is based on raw cgroup fs operations
// following the pre-single-writer model docs at:
// http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/

// cgroupRoot is the directory under which each subsystem's cgroup hierarchy
// is looked up (as cgroupRoot/<subsystem>/...).
const cgroupRoot = "/sys/fs/cgroup"
|
| 19 |
+ |
|
| 20 |
// useSystemd reports whether cgroups should be managed through systemd.
// The systemd backend is not implemented yet, so the answer is always false.
func useSystemd() bool {
	const systemdBackendAvailable = false
	return systemdBackendAvailable
}
|
| 23 |
+ |
|
| 24 |
+func applyCgroupSystemd(container *libcontainer.Container, pid int) error {
|
|
| 25 |
+ return fmt.Errorf("not supported yet")
|
|
| 26 |
+} |
|
| 27 |
+ |
|
| 28 |
// writeFile stores data in the file named file inside dir, replacing any
// previous content. If the file has to be created it gets mode 0700.
// The error from ioutil.WriteFile is returned unchanged.
func writeFile(dir, file, data string) error {
	target := filepath.Join(dir, file)
	payload := []byte(data)
	return ioutil.WriteFile(target, payload, 0700)
}
|
| 31 |
+ |
|
| 32 |
+func getCgroup(subsystem string, container *libcontainer.Container) (string, error) {
|
|
| 33 |
+ cgroup := container.CgroupName |
|
| 34 |
+ if container.CgroupParent != "" {
|
|
| 35 |
+ cgroup = filepath.Join(container.CgroupParent, cgroup) |
|
| 36 |
+ } |
|
| 37 |
+ |
|
| 38 |
+ initPath, err := cgroups.GetInitCgroupDir(subsystem) |
|
| 39 |
+ if err != nil {
|
|
| 40 |
+ return "", err |
|
| 41 |
+ } |
|
| 42 |
+ |
|
| 43 |
+ path := filepath.Join(cgroupRoot, subsystem, initPath, cgroup) |
|
| 44 |
+ |
|
| 45 |
+ return path, nil |
|
| 46 |
+} |
|
| 47 |
+ |
|
| 48 |
+func joinCgroup(subsystem string, container *libcontainer.Container, pid int) (string, error) {
|
|
| 49 |
+ path, err := getCgroup(subsystem, container) |
|
| 50 |
+ if err != nil {
|
|
| 51 |
+ return "", err |
|
| 52 |
+ } |
|
| 53 |
+ |
|
| 54 |
+ if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) {
|
|
| 55 |
+ return "", err |
|
| 56 |
+ } |
|
| 57 |
+ |
|
| 58 |
+ if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil {
|
|
| 59 |
+ return "", err |
|
| 60 |
+ } |
|
| 61 |
+ |
|
| 62 |
+ return path, nil |
|
| 63 |
+} |
|
| 64 |
+ |
|
| 65 |
+func applyCgroupRaw(container *libcontainer.Container, pid int) (retErr error) {
|
|
| 66 |
+ if _, err := os.Stat(cgroupRoot); err != nil {
|
|
| 67 |
+ return fmt.Errorf("cgroups fs not found")
|
|
| 68 |
+ } |
|
| 69 |
+ |
|
| 70 |
+ if !container.DeviceAccess {
|
|
| 71 |
+ dir, err := joinCgroup("devices", container, pid)
|
|
| 72 |
+ if err != nil {
|
|
| 73 |
+ return err |
|
| 74 |
+ } |
|
| 75 |
+ defer func() {
|
|
| 76 |
+ if retErr != nil {
|
|
| 77 |
+ os.RemoveAll(dir) |
|
| 78 |
+ } |
|
| 79 |
+ }() |
|
| 80 |
+ |
|
| 81 |
+ if err := writeFile(dir, "devices.deny", "a"); err != nil {
|
|
| 82 |
+ return err |
|
| 83 |
+ } |
|
| 84 |
+ |
|
| 85 |
+ allow := []string{
|
|
| 86 |
+ // /dev/null, zero, full |
|
| 87 |
+ "c 1:3 rwm", |
|
| 88 |
+ "c 1:5 rwm", |
|
| 89 |
+ "c 1:7 rwm", |
|
| 90 |
+ |
|
| 91 |
+ // consoles |
|
| 92 |
+ "c 5:1 rwm", |
|
| 93 |
+ "c 5:0 rwm", |
|
| 94 |
+ "c 4:0 rwm", |
|
| 95 |
+ "c 4:1 rwm", |
|
| 96 |
+ |
|
| 97 |
+ // /dev/urandom,/dev/random |
|
| 98 |
+ "c 1:9 rwm", |
|
| 99 |
+ "c 1:8 rwm", |
|
| 100 |
+ |
|
| 101 |
+ // /dev/pts/ - pts namespaces are "coming soon" |
|
| 102 |
+ "c 136:* rwm", |
|
| 103 |
+ "c 5:2 rwm", |
|
| 104 |
+ |
|
| 105 |
+ // tuntap |
|
| 106 |
+ "c 10:200 rwm", |
|
| 107 |
+ } |
|
| 108 |
+ |
|
| 109 |
+ for _, val := range allow {
|
|
| 110 |
+ if err := writeFile(dir, "devices.allow", val); err != nil {
|
|
| 111 |
+ return err |
|
| 112 |
+ } |
|
| 113 |
+ } |
|
| 114 |
+ } |
|
| 115 |
+ |
|
| 116 |
+ if container.Memory != 0 || container.MemorySwap != 0 {
|
|
| 117 |
+ dir, err := joinCgroup("memory", container, pid)
|
|
| 118 |
+ if err != nil {
|
|
| 119 |
+ return err |
|
| 120 |
+ } |
|
| 121 |
+ defer func() {
|
|
| 122 |
+ if retErr != nil {
|
|
| 123 |
+ os.RemoveAll(dir) |
|
| 124 |
+ } |
|
| 125 |
+ }() |
|
| 126 |
+ |
|
| 127 |
+ if container.Memory != 0 {
|
|
| 128 |
+ if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(container.Memory, 10)); err != nil {
|
|
| 129 |
+ return err |
|
| 130 |
+ } |
|
| 131 |
+ if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(container.Memory, 10)); err != nil {
|
|
| 132 |
+ return err |
|
| 133 |
+ } |
|
| 134 |
+ } |
|
| 135 |
+ if container.MemorySwap != 0 {
|
|
| 136 |
+ if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(container.MemorySwap, 10)); err != nil {
|
|
| 137 |
+ return err |
|
| 138 |
+ } |
|
| 139 |
+ } |
|
| 140 |
+ } |
|
| 141 |
+ |
|
| 142 |
+ // We always want to join the cpu group, to allow fair cpu scheduling |
|
| 143 |
+ // on a container basis |
|
| 144 |
+ dir, err := joinCgroup("cpu", container, pid)
|
|
| 145 |
+ if err != nil {
|
|
| 146 |
+ return err |
|
| 147 |
+ } |
|
| 148 |
+ if container.CpuShares != 0 {
|
|
| 149 |
+ if err := writeFile(dir, "cpu.shares", strconv.FormatInt(container.CpuShares, 10)); err != nil {
|
|
| 150 |
+ return err |
|
| 151 |
+ } |
|
| 152 |
+ } |
|
| 153 |
+ return nil |
|
| 154 |
+} |
|
| 155 |
+ |
|
| 156 |
+func CleanupCgroup(container *libcontainer.Container) error {
|
|
| 157 |
+ path, _ := getCgroup("memory", container)
|
|
| 158 |
+ os.RemoveAll(path) |
|
| 159 |
+ path, _ = getCgroup("devices", container)
|
|
| 160 |
+ os.RemoveAll(path) |
|
| 161 |
+ path, _ = getCgroup("cpu", container)
|
|
| 162 |
+ os.RemoveAll(path) |
|
| 163 |
+ return nil |
|
| 164 |
+} |
|
| 165 |
+ |
|
| 166 |
+func ApplyCgroup(container *libcontainer.Container, pid int) error {
|
|
| 167 |
+ if container.CgroupName == "" {
|
|
| 168 |
+ return nil |
|
| 169 |
+ } |
|
| 170 |
+ |
|
| 171 |
+ if useSystemd() {
|
|
| 172 |
+ return applyCgroupSystemd(container, pid) |
|
| 173 |
+ } else {
|
|
| 174 |
+ return applyCgroupRaw(container, pid) |
|
| 175 |
+ } |
|
| 176 |
+} |
| ... | ... |
@@ -11,6 +11,13 @@ type Container struct {
|
| 11 | 11 |
Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply |
| 12 | 12 |
Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop |
| 13 | 13 |
Network *Network `json:"network,omitempty"` // nil for host's network stack |
| 14 |
+ |
|
| 15 |
+ CgroupName string `json:"cgroup_name,omitempty"` // name of cgroup |
|
| 16 |
+ CgroupParent string `json:"cgroup_parent,omitempty"` // name of parent cgroup or slice |
|
| 17 |
+ DeviceAccess bool `json:"device_access,omitempty"` // if false, all devices are denied except a built-in allow-list |
|
| 18 |
+ Memory int64 `json:"memory,omitempty"` // Memory limit (in bytes) |
|
| 19 |
+ MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap |
|
| 20 |
+ CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. other containers) |
|
| 14 | 21 |
} |
| 15 | 22 |
|
| 16 | 23 |
// Network defines configuration for a container's networking stack |
| ... | ... |
@@ -5,6 +5,7 @@ package main |
| 5 | 5 |
import ( |
| 6 | 6 |
"fmt" |
| 7 | 7 |
"github.com/dotcloud/docker/pkg/libcontainer" |
| 8 |
+ "github.com/dotcloud/docker/pkg/libcontainer/cgroup" |
|
| 8 | 9 |
"github.com/dotcloud/docker/pkg/libcontainer/network" |
| 9 | 10 |
"github.com/dotcloud/docker/pkg/libcontainer/utils" |
| 10 | 11 |
"github.com/dotcloud/docker/pkg/system" |
| ... | ... |
@@ -33,10 +34,18 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) |
| 33 | 33 |
return -1, err |
| 34 | 34 |
} |
| 35 | 35 |
if err := writePidFile(command); err != nil {
|
| 36 |
+ command.Process.Kill() |
|
| 36 | 37 |
return -1, err |
| 37 | 38 |
} |
| 38 | 39 |
defer deletePidFile() |
| 39 | 40 |
|
| 41 |
+ // Do this before syncing with child so that no children |
|
| 42 |
+ // can escape the cgroup |
|
| 43 |
+ if err := cgroup.ApplyCgroup(container, command.Process.Pid); err != nil {
|
|
| 44 |
+ command.Process.Kill() |
|
| 45 |
+ return -1, err |
|
| 46 |
+ } |
|
| 47 |
+ |
|
| 40 | 48 |
if container.Network != nil {
|
| 41 | 49 |
vethPair, err := initializeContainerVeth(container.Network.Bridge, command.Process.Pid) |
| 42 | 50 |
if err != nil {
|
| ... | ... |
@@ -45,6 +54,9 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) |
| 45 | 45 |
sendVethName(vethPair, inPipe) |
| 46 | 46 |
} |
| 47 | 47 |
|
| 48 |
+ // Sync with child |
|
| 49 |
+ inPipe.Close() |
|
| 50 |
+ |
|
| 48 | 51 |
go io.Copy(os.Stdout, master) |
| 49 | 52 |
go io.Copy(master, os.Stdin) |
| 50 | 53 |
|
| ... | ... |
@@ -67,7 +79,6 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) |
| 67 | 67 |
// pipe so that the child stops waiting for more data |
| 68 | 68 |
func sendVethName(name string, pipe io.WriteCloser) {
	// Only the name is written here; the pipe is left open for the caller
	// to close. A write error is not reported.
	_, _ = fmt.Fprint(pipe, name)
}
| 72 | 71 |
|
| 73 | 72 |
// initializeContainerVeth will create a veth pair and setup the host's |
| ... | ... |
@@ -20,12 +20,10 @@ func initCommand(container *libcontainer.Container, console string, args []strin |
| 20 | 20 |
return err |
| 21 | 21 |
} |
| 22 | 22 |
|
| 23 |
- var tempVethName string |
|
| 24 |
- if container.Network != nil {
|
|
| 25 |
- tempVethName, err = getVethName() |
|
| 26 |
- if err != nil {
|
|
| 27 |
- return err |
|
| 28 |
- } |
|
| 23 |
+ // We always read this as it is a way to sync with the parent as well |
|
| 24 |
+ tempVethName, err := getVethName() |
|
| 25 |
+ if err != nil {
|
|
| 26 |
+ return err |
|
| 29 | 27 |
} |
| 30 | 28 |
|
| 31 | 29 |
// close pipes so that we can replace it with the pty |