This includes all of v0.0.8 as well as a few bug fixes that popped up
during vendoring.
Signed-off-by: Aleksa Sarai <asarai@suse.com>
| ... | ... |
@@ -59,7 +59,7 @@ clone git github.com/miekg/pkcs11 80f102b5cac759de406949c47f0928b99bd64cdf |
| 59 | 59 |
clone git github.com/docker/go v1.5.1-1-1-gbaf439e |
| 60 | 60 |
clone git github.com/agl/ed25519 d2b94fd789ea21d12fac1a4443dd3a3f79cda72c |
| 61 | 61 |
|
| 62 |
-clone git github.com/opencontainers/runc 3d8a20bb772defc28c355534d83486416d1719b4 # libcontainer |
|
| 62 |
+clone git github.com/opencontainers/runc ce72f86a2b54bc114d6ffb51f6500479b2d42154 # libcontainer |
|
| 63 | 63 |
clone git github.com/seccomp/libseccomp-golang 1b506fc7c24eec5a3693cdcbed40d9c226cfc6a1 |
| 64 | 64 |
# libcontainer deps (see src/github.com/opencontainers/runc/Godeps/Godeps.json) |
| 65 | 65 |
clone git github.com/coreos/go-systemd v4 |
| ... | ... |
@@ -10,80 +10,165 @@ host system and which is (optionally) isolated from other containers in the syst |
| 10 | 10 |
|
| 11 | 11 |
#### Using libcontainer |
| 12 | 12 |
|
| 13 |
-To create a container you first have to initialize an instance of a factory |
|
| 14 |
-that will handle the creation and initialization for a container. |
|
| 13 |
+Because containers are spawned in a two step process you will need a binary that |
|
| 14 |
+will be executed as the init process for the container. In libcontainer, we use |
|
| 15 |
+the current binary (/proc/self/exe) to be executed as the init process, and use |
|
| 16 |
+arg "init". We call the first-step process "bootstrap", so you always need an "init" |
|
| 17 |
+function as the entry of "bootstrap". |
|
| 15 | 18 |
|
| 16 |
-Because containers are spawned in a two step process you will need to provide |
|
| 17 |
-arguments to a binary that will be executed as the init process for the container. |
|
| 18 |
-To use the current binary that is spawning the containers and acting as the parent |
|
| 19 |
-you can use `os.Args[0]` and we have a command called `init` setup. |
|
| 19 |
+```go |
|
| 20 |
+func init() {
|
|
| 21 |
+ if len(os.Args) > 1 && os.Args[1] == "init" {
|
|
| 22 |
+ runtime.GOMAXPROCS(1) |
|
| 23 |
+ runtime.LockOSThread() |
|
| 24 |
+ factory, _ := libcontainer.New("")
|
|
| 25 |
+ if err := factory.StartInitialization(); err != nil {
|
|
| 26 |
+ logrus.Fatal(err) |
|
| 27 |
+ } |
|
| 28 |
+ panic("--this line should have never been executed, congratulations--")
|
|
| 29 |
+ } |
|
| 30 |
+} |
|
| 31 |
+``` |
|
| 32 |
+ |
|
| 33 |
+Then to create a container you first have to initialize an instance of a factory |
|
| 34 |
+that will handle the creation and initialization for a container. |
|
| 20 | 35 |
|
| 21 | 36 |
```go |
| 22 |
-root, err := libcontainer.New("/var/lib/container", libcontainer.InitArgs(os.Args[0], "init"))
|
|
| 37 |
+factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init"))
|
|
| 23 | 38 |
if err != nil {
|
| 24 |
- log.Fatal(err) |
|
| 39 |
+ logrus.Fatal(err) |
|
| 40 |
+ return |
|
| 25 | 41 |
} |
| 26 | 42 |
``` |
| 27 | 43 |
|
| 28 | 44 |
Once you have an instance of the factory created we can create a configuration |
| 29 |
-struct describing how the container is to be created. A sample would look similar to this: |
|
| 45 |
+struct describing how the container is to be created. A sample would look similar to this: |
|
| 30 | 46 |
|
| 31 | 47 |
```go |
| 48 |
+defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV |
|
| 32 | 49 |
config := &configs.Config{
|
| 33 |
- Rootfs: rootfs, |
|
| 34 |
- Capabilities: []string{
|
|
| 35 |
- "CAP_CHOWN", |
|
| 36 |
- "CAP_DAC_OVERRIDE", |
|
| 37 |
- "CAP_FSETID", |
|
| 38 |
- "CAP_FOWNER", |
|
| 39 |
- "CAP_MKNOD", |
|
| 40 |
- "CAP_NET_RAW", |
|
| 41 |
- "CAP_SETGID", |
|
| 42 |
- "CAP_SETUID", |
|
| 43 |
- "CAP_SETFCAP", |
|
| 44 |
- "CAP_SETPCAP", |
|
| 45 |
- "CAP_NET_BIND_SERVICE", |
|
| 46 |
- "CAP_SYS_CHROOT", |
|
| 47 |
- "CAP_KILL", |
|
| 48 |
- "CAP_AUDIT_WRITE", |
|
| 49 |
- }, |
|
| 50 |
- Namespaces: configs.Namespaces([]configs.Namespace{
|
|
| 51 |
- {Type: configs.NEWNS},
|
|
| 52 |
- {Type: configs.NEWUTS},
|
|
| 53 |
- {Type: configs.NEWIPC},
|
|
| 54 |
- {Type: configs.NEWPID},
|
|
| 55 |
- {Type: configs.NEWNET},
|
|
| 56 |
- }), |
|
| 57 |
- Cgroups: &configs.Cgroup{
|
|
| 58 |
- Name: "test-container", |
|
| 59 |
- Parent: "system", |
|
| 60 |
- AllowAllDevices: false, |
|
| 61 |
- AllowedDevices: configs.DefaultAllowedDevices, |
|
| 62 |
- }, |
|
| 63 |
- |
|
| 64 |
- Devices: configs.DefaultAutoCreatedDevices, |
|
| 65 |
- Hostname: "testing", |
|
| 66 |
- Networks: []*configs.Network{
|
|
| 67 |
- {
|
|
| 68 |
- Type: "loopback", |
|
| 69 |
- Address: "127.0.0.1/0", |
|
| 70 |
- Gateway: "localhost", |
|
| 71 |
- }, |
|
| 72 |
- }, |
|
| 73 |
- Rlimits: []configs.Rlimit{
|
|
| 74 |
- {
|
|
| 75 |
- Type: syscall.RLIMIT_NOFILE, |
|
| 76 |
- Hard: uint64(1024), |
|
| 77 |
- Soft: uint64(1024), |
|
| 78 |
- }, |
|
| 79 |
- }, |
|
| 50 |
+ Rootfs: "/your/path/to/rootfs", |
|
| 51 |
+ Capabilities: []string{
|
|
| 52 |
+ "CAP_CHOWN", |
|
| 53 |
+ "CAP_DAC_OVERRIDE", |
|
| 54 |
+ "CAP_FSETID", |
|
| 55 |
+ "CAP_FOWNER", |
|
| 56 |
+ "CAP_MKNOD", |
|
| 57 |
+ "CAP_NET_RAW", |
|
| 58 |
+ "CAP_SETGID", |
|
| 59 |
+ "CAP_SETUID", |
|
| 60 |
+ "CAP_SETFCAP", |
|
| 61 |
+ "CAP_SETPCAP", |
|
| 62 |
+ "CAP_NET_BIND_SERVICE", |
|
| 63 |
+ "CAP_SYS_CHROOT", |
|
| 64 |
+ "CAP_KILL", |
|
| 65 |
+ "CAP_AUDIT_WRITE", |
|
| 66 |
+ }, |
|
| 67 |
+ Namespaces: configs.Namespaces([]configs.Namespace{
|
|
| 68 |
+ {Type: configs.NEWNS},
|
|
| 69 |
+ {Type: configs.NEWUTS},
|
|
| 70 |
+ {Type: configs.NEWIPC},
|
|
| 71 |
+ {Type: configs.NEWPID},
|
|
| 72 |
+ {Type: configs.NEWUSER},
|
|
| 73 |
+ {Type: configs.NEWNET},
|
|
| 74 |
+ }), |
|
| 75 |
+ Cgroups: &configs.Cgroup{
|
|
| 76 |
+ Name: "test-container", |
|
| 77 |
+ Parent: "system", |
|
| 78 |
+ Resources: &configs.Resources{
|
|
| 79 |
+ MemorySwappiness: -1, |
|
| 80 |
+ AllowAllDevices: false, |
|
| 81 |
+ AllowedDevices: configs.DefaultAllowedDevices, |
|
| 82 |
+ }, |
|
| 83 |
+ }, |
|
| 84 |
+ MaskPaths: []string{
|
|
| 85 |
+ "/proc/kcore", |
|
| 86 |
+ }, |
|
| 87 |
+ ReadonlyPaths: []string{
|
|
| 88 |
+ "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus", |
|
| 89 |
+ }, |
|
| 90 |
+ Devices: configs.DefaultAutoCreatedDevices, |
|
| 91 |
+ Hostname: "testing", |
|
| 92 |
+ Mounts: []*configs.Mount{
|
|
| 93 |
+ {
|
|
| 94 |
+ Source: "proc", |
|
| 95 |
+ Destination: "/proc", |
|
| 96 |
+ Device: "proc", |
|
| 97 |
+ Flags: defaultMountFlags, |
|
| 98 |
+ }, |
|
| 99 |
+ {
|
|
| 100 |
+ Source: "tmpfs", |
|
| 101 |
+ Destination: "/dev", |
|
| 102 |
+ Device: "tmpfs", |
|
| 103 |
+ Flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, |
|
| 104 |
+ Data: "mode=755", |
|
| 105 |
+ }, |
|
| 106 |
+ {
|
|
| 107 |
+ Source: "devpts", |
|
| 108 |
+ Destination: "/dev/pts", |
|
| 109 |
+ Device: "devpts", |
|
| 110 |
+ Flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, |
|
| 111 |
+ Data: "newinstance,ptmxmode=0666,mode=0620,gid=5", |
|
| 112 |
+ }, |
|
| 113 |
+ {
|
|
| 114 |
+ Device: "tmpfs", |
|
| 115 |
+ Source: "shm", |
|
| 116 |
+ Destination: "/dev/shm", |
|
| 117 |
+ Data: "mode=1777,size=65536k", |
|
| 118 |
+ Flags: defaultMountFlags, |
|
| 119 |
+ }, |
|
| 120 |
+ {
|
|
| 121 |
+ Source: "mqueue", |
|
| 122 |
+ Destination: "/dev/mqueue", |
|
| 123 |
+ Device: "mqueue", |
|
| 124 |
+ Flags: defaultMountFlags, |
|
| 125 |
+ }, |
|
| 126 |
+ {
|
|
| 127 |
+ Source: "sysfs", |
|
| 128 |
+ Destination: "/sys", |
|
| 129 |
+ Device: "sysfs", |
|
| 130 |
+ Flags: defaultMountFlags | syscall.MS_RDONLY, |
|
| 131 |
+ }, |
|
| 132 |
+ }, |
|
| 133 |
+ UidMappings: []configs.IDMap{
|
|
| 134 |
+ {
|
|
| 135 |
+ ContainerID: 0, |
|
| 136 |
+ HostID: 1000, |
|
| 137 |
+ Size: 65536, |
|
| 138 |
+ }, |
|
| 139 |
+ }, |
|
| 140 |
+ GidMappings: []configs.IDMap{
|
|
| 141 |
+ {
|
|
| 142 |
+ ContainerID: 0, |
|
| 143 |
+ HostID: 1000, |
|
| 144 |
+ Size: 65536, |
|
| 145 |
+ }, |
|
| 146 |
+ }, |
|
| 147 |
+ Networks: []*configs.Network{
|
|
| 148 |
+ {
|
|
| 149 |
+ Type: "loopback", |
|
| 150 |
+ Address: "127.0.0.1/0", |
|
| 151 |
+ Gateway: "localhost", |
|
| 152 |
+ }, |
|
| 153 |
+ }, |
|
| 154 |
+ Rlimits: []configs.Rlimit{
|
|
| 155 |
+ {
|
|
| 156 |
+ Type: syscall.RLIMIT_NOFILE, |
|
| 157 |
+ Hard: uint64(1025), |
|
| 158 |
+ Soft: uint64(1025), |
|
| 159 |
+ }, |
|
| 160 |
+ }, |
|
| 80 | 161 |
} |
| 81 | 162 |
``` |
| 82 | 163 |
|
| 83 | 164 |
Once you have the configuration populated you can create a container: |
| 84 | 165 |
|
| 85 | 166 |
```go |
| 86 |
-container, err := root.Create("container-id", config)
|
|
| 167 |
+container, err := factory.Create("container-id", config)
|
|
| 168 |
+if err != nil {
|
|
| 169 |
+ logrus.Fatal(err) |
|
| 170 |
+ return |
|
| 171 |
+} |
|
| 87 | 172 |
``` |
| 88 | 173 |
|
| 89 | 174 |
To spawn bash as the initial process inside the container and have the |
| ... | ... |
@@ -91,23 +176,25 @@ processes pid returned in order to wait, signal, or kill the process: |
| 91 | 91 |
|
| 92 | 92 |
```go |
| 93 | 93 |
process := &libcontainer.Process{
|
| 94 |
- Args: []string{"/bin/bash"},
|
|
| 95 |
- Env: []string{"PATH=/bin"},
|
|
| 96 |
- User: "daemon", |
|
| 97 |
- Stdin: os.Stdin, |
|
| 98 |
- Stdout: os.Stdout, |
|
| 99 |
- Stderr: os.Stderr, |
|
| 94 |
+ Args: []string{"/bin/bash"},
|
|
| 95 |
+ Env: []string{"PATH=/bin"},
|
|
| 96 |
+ User: "daemon", |
|
| 97 |
+ Stdin: os.Stdin, |
|
| 98 |
+ Stdout: os.Stdout, |
|
| 99 |
+ Stderr: os.Stderr, |
|
| 100 | 100 |
} |
| 101 | 101 |
|
| 102 | 102 |
err := container.Start(process) |
| 103 | 103 |
if err != nil {
|
| 104 |
- log.Fatal(err) |
|
| 104 |
+ logrus.Fatal(err) |
|
| 105 |
+ container.Destroy() |
|
| 106 |
+ return |
|
| 105 | 107 |
} |
| 106 | 108 |
|
| 107 | 109 |
// wait for the process to finish. |
| 108 |
-status, err := process.Wait() |
|
| 110 |
+_, err := process.Wait() |
|
| 109 | 111 |
if err != nil {
|
| 110 |
- log.Fatal(err) |
|
| 112 |
+ logrus.Fatal(err) |
|
| 111 | 113 |
} |
| 112 | 114 |
|
| 113 | 115 |
// destroy the container. |
| ... | ... |
@@ -124,7 +211,6 @@ processes, err := container.Processes() |
| 124 | 124 |
// its processes. |
| 125 | 125 |
stats, err := container.Stats() |
| 126 | 126 |
|
| 127 |
- |
|
| 128 | 127 |
// pause all processes inside the container. |
| 129 | 128 |
container.Pause() |
| 130 | 129 |
|
| ... | ... |
@@ -60,7 +60,7 @@ are required to be mounted within the rootfs that the runtime will setup. |
| 60 | 60 |
After a container's filesystems are mounted within the newly created |
| 61 | 61 |
mount namespace `/dev` will need to be populated with a set of device nodes. |
| 62 | 62 |
It is expected that a rootfs does not need to have any device nodes specified |
| 63 |
-for `/dev` witin the rootfs as the container will setup the correct devices |
|
| 63 |
+for `/dev` within the rootfs as the container will setup the correct devices |
|
| 64 | 64 |
that are required for executing a container's process. |
| 65 | 65 |
|
| 66 | 66 |
| Path | Mode | Access | |
| ... | ... |
@@ -142,6 +142,7 @@ system resources like cpu, memory, and device access. |
| 142 | 142 |
| perf_event | 1 | |
| 143 | 143 |
| freezer | 1 | |
| 144 | 144 |
| hugetlb | 1 | |
| 145 |
+| pids | 1 | |
|
| 145 | 146 |
|
| 146 | 147 |
|
| 147 | 148 |
All cgroup subsystems are joined so that statistics can be collected from |
| ... | ... |
@@ -199,7 +200,7 @@ provide a good default for security and flexibility for the applications. |
| 199 | 199 |
| CAP_SYS_BOOT | 0 | |
| 200 | 200 |
| CAP_LEASE | 0 | |
| 201 | 201 |
| CAP_WAKE_ALARM | 0 | |
| 202 |
-| CAP_BLOCK_SUSPE | 0 | |
|
| 202 |
+| CAP_BLOCK_SUSPEND | 0 | |
|
| 203 | 203 |
|
| 204 | 204 |
|
| 205 | 205 |
Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor) |
| ... | ... |
@@ -15,6 +15,9 @@ type Manager interface {
|
| 15 | 15 |
// Returns the PIDs inside the cgroup set |
| 16 | 16 |
GetPids() ([]int, error) |
| 17 | 17 |
|
| 18 |
+ // Returns the PIDs inside the cgroup set & all sub-cgroups |
|
| 19 |
+ GetAllPids() ([]int, error) |
|
| 20 |
+ |
|
| 18 | 21 |
// Returns statistics for the cgroup set |
| 19 | 22 |
GetStats() (*Stats, error) |
| 20 | 23 |
|
| ... | ... |
@@ -14,6 +14,7 @@ import ( |
| 14 | 14 |
|
| 15 | 15 |
"github.com/opencontainers/runc/libcontainer/cgroups" |
| 16 | 16 |
"github.com/opencontainers/runc/libcontainer/configs" |
| 17 |
+ libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" |
|
| 17 | 18 |
) |
| 18 | 19 |
|
| 19 | 20 |
var ( |
| ... | ... |
@@ -23,6 +24,7 @@ var ( |
| 23 | 23 |
&MemoryGroup{},
|
| 24 | 24 |
&CpuGroup{},
|
| 25 | 25 |
&CpuacctGroup{},
|
| 26 |
+ &PidsGroup{},
|
|
| 26 | 27 |
&BlkioGroup{},
|
| 27 | 28 |
&HugetlbGroup{},
|
| 28 | 29 |
&NetClsGroup{},
|
| ... | ... |
@@ -93,11 +95,10 @@ func getCgroupRoot() (string, error) {
|
| 93 | 93 |
} |
| 94 | 94 |
|
| 95 | 95 |
type cgroupData struct {
|
| 96 |
- root string |
|
| 97 |
- parent string |
|
| 98 |
- name string |
|
| 99 |
- config *configs.Cgroup |
|
| 100 |
- pid int |
|
| 96 |
+ root string |
|
| 97 |
+ innerPath string |
|
| 98 |
+ config *configs.Cgroup |
|
| 99 |
+ pid int |
|
| 101 | 100 |
} |
| 102 | 101 |
|
| 103 | 102 |
func (m *Manager) Apply(pid int) (err error) {
|
| ... | ... |
@@ -112,6 +113,22 @@ func (m *Manager) Apply(pid int) (err error) {
|
| 112 | 112 |
return err |
| 113 | 113 |
} |
| 114 | 114 |
|
| 115 |
+ if c.Paths != nil {
|
|
| 116 |
+ paths := make(map[string]string) |
|
| 117 |
+ for name, path := range c.Paths {
|
|
| 118 |
+ _, err := d.path(name) |
|
| 119 |
+ if err != nil {
|
|
| 120 |
+ if cgroups.IsNotFound(err) {
|
|
| 121 |
+ continue |
|
| 122 |
+ } |
|
| 123 |
+ return err |
|
| 124 |
+ } |
|
| 125 |
+ paths[name] = path |
|
| 126 |
+ } |
|
| 127 |
+ m.Paths = paths |
|
| 128 |
+ return cgroups.EnterPid(m.Paths, pid) |
|
| 129 |
+ } |
|
| 130 |
+ |
|
| 115 | 131 |
paths := make(map[string]string) |
| 116 | 132 |
defer func() {
|
| 117 | 133 |
if err != nil {
|
| ... | ... |
@@ -135,17 +152,13 @@ func (m *Manager) Apply(pid int) (err error) {
|
| 135 | 135 |
paths[sys.Name()] = p |
| 136 | 136 |
} |
| 137 | 137 |
m.Paths = paths |
| 138 |
- |
|
| 139 |
- if paths["cpu"] != "" {
|
|
| 140 |
- if err := CheckCpushares(paths["cpu"], c.Resources.CpuShares); err != nil {
|
|
| 141 |
- return err |
|
| 142 |
- } |
|
| 143 |
- } |
|
| 144 |
- |
|
| 145 | 138 |
return nil |
| 146 | 139 |
} |
| 147 | 140 |
|
| 148 | 141 |
func (m *Manager) Destroy() error {
|
| 142 |
+ if m.Cgroups.Paths != nil {
|
|
| 143 |
+ return nil |
|
| 144 |
+ } |
|
| 149 | 145 |
m.mu.Lock() |
| 150 | 146 |
defer m.mu.Unlock() |
| 151 | 147 |
if err := cgroups.RemovePaths(m.Paths); err != nil {
|
| ... | ... |
@@ -179,15 +192,28 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
|
| 179 | 179 |
} |
| 180 | 180 |
|
| 181 | 181 |
func (m *Manager) Set(container *configs.Config) error {
|
| 182 |
- for name, path := range m.Paths {
|
|
| 183 |
- sys, err := subsystems.Get(name) |
|
| 184 |
- if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
|
|
| 185 |
- continue |
|
| 182 |
+ for _, sys := range subsystems {
|
|
| 183 |
+ // Generate fake cgroup data. |
|
| 184 |
+ d, err := getCgroupData(container.Cgroups, -1) |
|
| 185 |
+ if err != nil {
|
|
| 186 |
+ return err |
|
| 187 |
+ } |
|
| 188 |
+ // Get the path, but don't error out if the cgroup wasn't found. |
|
| 189 |
+ path, err := d.path(sys.Name()) |
|
| 190 |
+ if err != nil && !cgroups.IsNotFound(err) {
|
|
| 191 |
+ return err |
|
| 186 | 192 |
} |
| 193 |
+ |
|
| 187 | 194 |
if err := sys.Set(path, container.Cgroups); err != nil {
|
| 188 | 195 |
return err |
| 189 | 196 |
} |
| 190 | 197 |
} |
| 198 |
+ |
|
| 199 |
+ if m.Paths["cpu"] != "" {
|
|
| 200 |
+ if err := CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
|
|
| 201 |
+ return err |
|
| 202 |
+ } |
|
| 203 |
+ } |
|
| 191 | 204 |
return nil |
| 192 | 205 |
} |
| 193 | 206 |
|
| ... | ... |
@@ -217,41 +243,28 @@ func (m *Manager) Freeze(state configs.FreezerState) error {
|
| 217 | 217 |
} |
| 218 | 218 |
|
| 219 | 219 |
func (m *Manager) GetPids() ([]int, error) {
|
| 220 |
- d, err := getCgroupData(m.Cgroups, 0) |
|
| 220 |
+ dir, err := getCgroupPath(m.Cgroups) |
|
| 221 | 221 |
if err != nil {
|
| 222 | 222 |
return nil, err |
| 223 | 223 |
} |
| 224 |
+ return cgroups.GetPids(dir) |
|
| 225 |
+} |
|
| 224 | 226 |
|
| 225 |
- dir, err := d.path("devices")
|
|
| 227 |
+func (m *Manager) GetAllPids() ([]int, error) {
|
|
| 228 |
+ dir, err := getCgroupPath(m.Cgroups) |
|
| 226 | 229 |
if err != nil {
|
| 227 | 230 |
return nil, err |
| 228 | 231 |
} |
| 229 |
- |
|
| 230 |
- return cgroups.GetPids(dir) |
|
| 232 |
+ return cgroups.GetAllPids(dir) |
|
| 231 | 233 |
} |
| 232 | 234 |
|
| 233 |
-// pathClean makes a path safe for use with filepath.Join. This is done by not |
|
| 234 |
-// only cleaning the path, but also (if the path is relative) adding a leading |
|
| 235 |
-// '/' and cleaning it (then removing the leading '/'). This ensures that a |
|
| 236 |
-// path resulting from prepending another path will always resolve to lexically |
|
| 237 |
-// be a subdirectory of the prefixed path. This is all done lexically, so paths |
|
| 238 |
-// that include symlinks won't be safe as a result of using pathClean. |
|
| 239 |
-func pathClean(path string) string {
|
|
| 240 |
- // Ensure that all paths are cleaned (especially problematic ones like |
|
| 241 |
- // "/../../../../../" which can cause lots of issues). |
|
| 242 |
- path = filepath.Clean(path) |
|
| 243 |
- |
|
| 244 |
- // If the path isn't absolute, we need to do more processing to fix paths |
|
| 245 |
- // such as "../../../../<etc>/some/path". We also shouldn't convert absolute |
|
| 246 |
- // paths to relative ones. |
|
| 247 |
- if !filepath.IsAbs(path) {
|
|
| 248 |
- path = filepath.Clean(string(os.PathSeparator) + path) |
|
| 249 |
- // This can't fail, as (by definition) all paths are relative to root. |
|
| 250 |
- path, _ = filepath.Rel(string(os.PathSeparator), path) |
|
| 251 |
- } |
|
| 252 |
- |
|
| 253 |
- // Clean the path again for good measure. |
|
| 254 |
- return filepath.Clean(path) |
|
| 235 |
+func getCgroupPath(c *configs.Cgroup) (string, error) {
|
|
| 236 |
+ d, err := getCgroupData(c, 0) |
|
| 237 |
+ if err != nil {
|
|
| 238 |
+ return "", err |
|
| 239 |
+ } |
|
| 240 |
+ |
|
| 241 |
+ return d.path("devices")
|
|
| 255 | 242 |
} |
| 256 | 243 |
|
| 257 | 244 |
func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
|
| ... | ... |
@@ -260,15 +273,25 @@ func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
|
| 260 | 260 |
return nil, err |
| 261 | 261 |
} |
| 262 | 262 |
|
| 263 |
- // Clean the parent slice path. |
|
| 264 |
- c.Parent = pathClean(c.Parent) |
|
| 263 |
+ if (c.Name != "" || c.Parent != "") && c.Path != "" {
|
|
| 264 |
+ return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used")
|
|
| 265 |
+ } |
|
| 266 |
+ |
|
| 267 |
+ // XXX: Do not remove this code. Path safety is important! -- cyphar |
|
| 268 |
+ cgPath := libcontainerUtils.CleanPath(c.Path) |
|
| 269 |
+ cgParent := libcontainerUtils.CleanPath(c.Parent) |
|
| 270 |
+ cgName := libcontainerUtils.CleanPath(c.Name) |
|
| 271 |
+ |
|
| 272 |
+ innerPath := cgPath |
|
| 273 |
+ if innerPath == "" {
|
|
| 274 |
+ innerPath = filepath.Join(cgParent, cgName) |
|
| 275 |
+ } |
|
| 265 | 276 |
|
| 266 | 277 |
return &cgroupData{
|
| 267 |
- root: root, |
|
| 268 |
- parent: c.Parent, |
|
| 269 |
- name: c.Name, |
|
| 270 |
- config: c, |
|
| 271 |
- pid: pid, |
|
| 278 |
+ root: root, |
|
| 279 |
+ innerPath: innerPath, |
|
| 280 |
+ config: c, |
|
| 281 |
+ pid: pid, |
|
| 272 | 282 |
}, nil |
| 273 | 283 |
} |
| 274 | 284 |
|
| ... | ... |
@@ -296,11 +319,10 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
|
| 296 | 296 |
return "", err |
| 297 | 297 |
} |
| 298 | 298 |
|
| 299 |
- cgPath := filepath.Join(raw.parent, raw.name) |
|
| 300 | 299 |
// If the cgroup name/path is absolute do not look relative to the cgroup of the init process. |
| 301 |
- if filepath.IsAbs(cgPath) {
|
|
| 300 |
+ if filepath.IsAbs(raw.innerPath) {
|
|
| 302 | 301 |
// Sometimes subsystems can be mounted together as 'cpu,cpuacct'. |
| 303 |
- return filepath.Join(raw.root, filepath.Base(mnt), cgPath), nil |
|
| 302 |
+ return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil |
|
| 304 | 303 |
} |
| 305 | 304 |
|
| 306 | 305 |
parentPath, err := raw.parentPath(subsystem, mnt, root) |
| ... | ... |
@@ -308,7 +330,7 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
|
| 308 | 308 |
return "", err |
| 309 | 309 |
} |
| 310 | 310 |
|
| 311 |
- return filepath.Join(parentPath, cgPath), nil |
|
| 311 |
+ return filepath.Join(parentPath, raw.innerPath), nil |
|
| 312 | 312 |
} |
| 313 | 313 |
|
| 314 | 314 |
func (raw *cgroupData) join(subsystem string) (string, error) {
|
| ... | ... |
@@ -22,15 +22,10 @@ func (s *BlkioGroup) Name() string {
|
| 22 | 22 |
} |
| 23 | 23 |
|
| 24 | 24 |
func (s *BlkioGroup) Apply(d *cgroupData) error {
|
| 25 |
- dir, err := d.join("blkio")
|
|
| 25 |
+ _, err := d.join("blkio")
|
|
| 26 | 26 |
if err != nil && !cgroups.IsNotFound(err) {
|
| 27 | 27 |
return err |
| 28 | 28 |
} |
| 29 |
- |
|
| 30 |
- if err := s.Set(dir, d.config); err != nil {
|
|
| 31 |
- return err |
|
| 32 |
- } |
|
| 33 |
- |
|
| 34 | 29 |
return nil |
| 35 | 30 |
} |
| 36 | 31 |
|
| ... | ... |
@@ -22,15 +22,10 @@ func (s *CpuGroup) Name() string {
|
| 22 | 22 |
func (s *CpuGroup) Apply(d *cgroupData) error {
|
| 23 | 23 |
// We always want to join the cpu group, to allow fair cpu scheduling |
| 24 | 24 |
// on a container basis |
| 25 |
- dir, err := d.join("cpu")
|
|
| 25 |
+ _, err := d.join("cpu")
|
|
| 26 | 26 |
if err != nil && !cgroups.IsNotFound(err) {
|
| 27 | 27 |
return err |
| 28 | 28 |
} |
| 29 |
- |
|
| 30 |
- if err := s.Set(dir, d.config); err != nil {
|
|
| 31 |
- return err |
|
| 32 |
- } |
|
| 33 |
- |
|
| 34 | 29 |
return nil |
| 35 | 30 |
} |
| 36 | 31 |
|
| ... | ... |
@@ -12,6 +12,7 @@ import ( |
| 12 | 12 |
|
| 13 | 13 |
"github.com/opencontainers/runc/libcontainer/cgroups" |
| 14 | 14 |
"github.com/opencontainers/runc/libcontainer/configs" |
| 15 |
+ libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" |
|
| 15 | 16 |
) |
| 16 | 17 |
|
| 17 | 18 |
type CpusetGroup struct {
|
| ... | ... |
@@ -64,11 +65,6 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro |
| 64 | 64 |
if err := s.ensureParent(dir, root); err != nil {
|
| 65 | 65 |
return err |
| 66 | 66 |
} |
| 67 |
- // the default values inherit from parent cgroup are already set in |
|
| 68 |
- // s.ensureParent, cover these if we have our own |
|
| 69 |
- if err := s.Set(dir, cgroup); err != nil {
|
|
| 70 |
- return err |
|
| 71 |
- } |
|
| 72 | 67 |
// because we are not using d.join we need to place the pid into the procs file |
| 73 | 68 |
// unlike the other subsystems |
| 74 | 69 |
if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil {
|
| ... | ... |
@@ -93,7 +89,7 @@ func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []b |
| 93 | 93 |
// its parent. |
| 94 | 94 |
func (s *CpusetGroup) ensureParent(current, root string) error {
|
| 95 | 95 |
parent := filepath.Dir(current) |
| 96 |
- if filepath.Clean(parent) == root {
|
|
| 96 |
+ if libcontainerUtils.CleanPath(parent) == root {
|
|
| 97 | 97 |
return nil |
| 98 | 98 |
} |
| 99 | 99 |
// Avoid infinite recursion. |
| ... | ... |
@@ -15,21 +15,29 @@ func (s *DevicesGroup) Name() string {
|
| 15 | 15 |
} |
| 16 | 16 |
|
| 17 | 17 |
func (s *DevicesGroup) Apply(d *cgroupData) error {
|
| 18 |
- dir, err := d.join("devices")
|
|
| 18 |
+ _, err := d.join("devices")
|
|
| 19 | 19 |
if err != nil {
|
| 20 | 20 |
// We will return the error even if it's a `not found` error, devices |
| 21 | 21 |
// cgroup is hard requirement for container's security. |
| 22 | 22 |
return err |
| 23 | 23 |
} |
| 24 |
- |
|
| 25 |
- if err := s.Set(dir, d.config); err != nil {
|
|
| 26 |
- return err |
|
| 27 |
- } |
|
| 28 |
- |
|
| 29 | 24 |
return nil |
| 30 | 25 |
} |
| 31 | 26 |
|
| 32 | 27 |
func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
|
| 28 |
+ devices := cgroup.Resources.Devices |
|
| 29 |
+ if len(devices) > 0 {
|
|
| 30 |
+ for _, dev := range devices {
|
|
| 31 |
+ file := "devices.deny" |
|
| 32 |
+ if dev.Allow {
|
|
| 33 |
+ file = "devices.allow" |
|
| 34 |
+ } |
|
| 35 |
+ if err := writeFile(path, file, dev.CgroupString()); err != nil {
|
|
| 36 |
+ return err |
|
| 37 |
+ } |
|
| 38 |
+ } |
|
| 39 |
+ return nil |
|
| 40 |
+ } |
|
| 33 | 41 |
if !cgroup.Resources.AllowAllDevices {
|
| 34 | 42 |
if err := writeFile(path, "devices.deny", "a"); err != nil {
|
| 35 | 43 |
return err |
| ... | ... |
@@ -19,15 +19,10 @@ func (s *FreezerGroup) Name() string {
|
| 19 | 19 |
} |
| 20 | 20 |
|
| 21 | 21 |
func (s *FreezerGroup) Apply(d *cgroupData) error {
|
| 22 |
- dir, err := d.join("freezer")
|
|
| 22 |
+ _, err := d.join("freezer")
|
|
| 23 | 23 |
if err != nil && !cgroups.IsNotFound(err) {
|
| 24 | 24 |
return err |
| 25 | 25 |
} |
| 26 |
- |
|
| 27 |
- if err := s.Set(dir, d.config); err != nil {
|
|
| 28 |
- return err |
|
| 29 |
- } |
|
| 30 |
- |
|
| 31 | 26 |
return nil |
| 32 | 27 |
} |
| 33 | 28 |
|
| ... | ... |
@@ -19,15 +19,10 @@ func (s *HugetlbGroup) Name() string {
|
| 19 | 19 |
} |
| 20 | 20 |
|
| 21 | 21 |
func (s *HugetlbGroup) Apply(d *cgroupData) error {
|
| 22 |
- dir, err := d.join("hugetlb")
|
|
| 22 |
+ _, err := d.join("hugetlb")
|
|
| 23 | 23 |
if err != nil && !cgroups.IsNotFound(err) {
|
| 24 | 24 |
return err |
| 25 | 25 |
} |
| 26 |
- |
|
| 27 |
- if err := s.Set(dir, d.config); err != nil {
|
|
| 28 |
- return err |
|
| 29 |
- } |
|
| 30 |
- |
|
| 31 | 26 |
return nil |
| 32 | 27 |
} |
| 33 | 28 |
|
| ... | ... |
@@ -32,8 +32,9 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
|
| 32 | 32 |
return err |
| 33 | 33 |
} |
| 34 | 34 |
} |
| 35 |
- |
|
| 36 |
- if err := s.Set(path, d.config); err != nil {
|
|
| 35 |
+ // We have to set kernel memory here, as we can't change it once |
|
| 36 |
+ // processes have been attached. |
|
| 37 |
+ if err := s.SetKernelMemory(path, d.config); err != nil {
|
|
| 37 | 38 |
return err |
| 38 | 39 |
} |
| 39 | 40 |
} |
| ... | ... |
@@ -50,7 +51,17 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
|
| 50 | 50 |
if err != nil && !cgroups.IsNotFound(err) {
|
| 51 | 51 |
return err |
| 52 | 52 |
} |
| 53 |
+ return nil |
|
| 54 |
+} |
|
| 53 | 55 |
|
| 56 |
+func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error {
|
|
| 57 |
+ // This has to be done separately because it has special constraints (it |
|
| 58 |
+ // can't be done after there are processes attached to the cgroup). |
|
| 59 |
+ if cgroup.Resources.KernelMemory > 0 {
|
|
| 60 |
+ if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil {
|
|
| 61 |
+ return err |
|
| 62 |
+ } |
|
| 63 |
+ } |
|
| 54 | 64 |
return nil |
| 55 | 65 |
} |
| 56 | 66 |
|
| ... | ... |
@@ -70,12 +81,6 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
|
| 70 | 70 |
return err |
| 71 | 71 |
} |
| 72 | 72 |
} |
| 73 |
- if cgroup.Resources.KernelMemory > 0 {
|
|
| 74 |
- if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil {
|
|
| 75 |
- return err |
|
| 76 |
- } |
|
| 77 |
- } |
|
| 78 |
- |
|
| 79 | 73 |
if cgroup.Resources.OomKillDisable {
|
| 80 | 74 |
if err := writeFile(path, "memory.oom_control", "1"); err != nil {
|
| 81 | 75 |
return err |
| ... | ... |
@@ -157,6 +162,7 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
|
| 157 | 157 |
usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".")
|
| 158 | 158 |
maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".")
|
| 159 | 159 |
failcnt := strings.Join([]string{moduleName, "failcnt"}, ".")
|
| 160 |
+ limit := strings.Join([]string{moduleName, "limit_in_bytes"}, ".")
|
|
| 160 | 161 |
|
| 161 | 162 |
value, err := getCgroupParamUint(path, usage) |
| 162 | 163 |
if err != nil {
|
| ... | ... |
@@ -182,6 +188,14 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
|
| 182 | 182 |
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err)
|
| 183 | 183 |
} |
| 184 | 184 |
memoryData.Failcnt = value |
| 185 |
+ value, err = getCgroupParamUint(path, limit) |
|
| 186 |
+ if err != nil {
|
|
| 187 |
+ if moduleName != "memory" && os.IsNotExist(err) {
|
|
| 188 |
+ return cgroups.MemoryData{}, nil
|
|
| 189 |
+ } |
|
| 190 |
+ return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err)
|
|
| 191 |
+ } |
|
| 192 |
+ memoryData.Limit = value |
|
| 185 | 193 |
|
| 186 | 194 |
return memoryData, nil |
| 187 | 195 |
} |
| ... | ... |
@@ -15,15 +15,10 @@ func (s *NetClsGroup) Name() string {
|
| 15 | 15 |
} |
| 16 | 16 |
|
| 17 | 17 |
func (s *NetClsGroup) Apply(d *cgroupData) error {
|
| 18 |
- dir, err := d.join("net_cls")
|
|
| 18 |
+ _, err := d.join("net_cls")
|
|
| 19 | 19 |
if err != nil && !cgroups.IsNotFound(err) {
|
| 20 | 20 |
return err |
| 21 | 21 |
} |
| 22 |
- |
|
| 23 |
- if err := s.Set(dir, d.config); err != nil {
|
|
| 24 |
- return err |
|
| 25 |
- } |
|
| 26 |
- |
|
| 27 | 22 |
return nil |
| 28 | 23 |
} |
| 29 | 24 |
|
| ... | ... |
@@ -15,15 +15,10 @@ func (s *NetPrioGroup) Name() string {
|
| 15 | 15 |
} |
| 16 | 16 |
|
| 17 | 17 |
func (s *NetPrioGroup) Apply(d *cgroupData) error {
|
| 18 |
- dir, err := d.join("net_prio")
|
|
| 18 |
+ _, err := d.join("net_prio")
|
|
| 19 | 19 |
if err != nil && !cgroups.IsNotFound(err) {
|
| 20 | 20 |
return err |
| 21 | 21 |
} |
| 22 |
- |
|
| 23 |
- if err := s.Set(dir, d.config); err != nil {
|
|
| 24 |
- return err |
|
| 25 |
- } |
|
| 26 |
- |
|
| 27 | 22 |
return nil |
| 28 | 23 |
} |
| 29 | 24 |
|
| 30 | 25 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,57 @@ |
| 0 |
+// +build linux |
|
| 1 |
+ |
|
| 2 |
+package fs |
|
| 3 |
+ |
|
| 4 |
+import ( |
|
| 5 |
+ "fmt" |
|
| 6 |
+ "strconv" |
|
| 7 |
+ |
|
| 8 |
+ "github.com/opencontainers/runc/libcontainer/cgroups" |
|
| 9 |
+ "github.com/opencontainers/runc/libcontainer/configs" |
|
| 10 |
+) |
|
| 11 |
+ |
|
| 12 |
+type PidsGroup struct {
|
|
| 13 |
+} |
|
| 14 |
+ |
|
| 15 |
+func (s *PidsGroup) Name() string {
|
|
| 16 |
+ return "pids" |
|
| 17 |
+} |
|
| 18 |
+ |
|
| 19 |
+func (s *PidsGroup) Apply(d *cgroupData) error {
|
|
| 20 |
+ _, err := d.join("pids")
|
|
| 21 |
+ if err != nil && !cgroups.IsNotFound(err) {
|
|
| 22 |
+ return err |
|
| 23 |
+ } |
|
| 24 |
+ return nil |
|
| 25 |
+} |
|
| 26 |
+ |
|
| 27 |
+func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error {
|
|
| 28 |
+ if cgroup.Resources.PidsLimit != 0 {
|
|
| 29 |
+ // "max" is the fallback value. |
|
| 30 |
+ limit := "max" |
|
| 31 |
+ |
|
| 32 |
+ if cgroup.Resources.PidsLimit > 0 {
|
|
| 33 |
+ limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10) |
|
| 34 |
+ } |
|
| 35 |
+ |
|
| 36 |
+ if err := writeFile(path, "pids.max", limit); err != nil {
|
|
| 37 |
+ return err |
|
| 38 |
+ } |
|
| 39 |
+ } |
|
| 40 |
+ |
|
| 41 |
+ return nil |
|
| 42 |
+} |
|
| 43 |
+ |
|
| 44 |
+func (s *PidsGroup) Remove(d *cgroupData) error {
|
|
| 45 |
+ return removePath(d.path("pids"))
|
|
| 46 |
+} |
|
| 47 |
+ |
|
| 48 |
+func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
|
|
| 49 |
+ value, err := getCgroupParamUint(path, "pids.current") |
|
| 50 |
+ if err != nil {
|
|
| 51 |
+ return fmt.Errorf("failed to parse pids.current - %s", err)
|
|
| 52 |
+ } |
|
| 53 |
+ |
|
| 54 |
+ stats.PidsStats.Current = value |
|
| 55 |
+ return nil |
|
| 56 |
+} |
| ... | ... |
@@ -36,7 +36,9 @@ type MemoryData struct {
|
| 36 | 36 |
Usage uint64 `json:"usage,omitempty"` |
| 37 | 37 |
MaxUsage uint64 `json:"max_usage,omitempty"` |
| 38 | 38 |
Failcnt uint64 `json:"failcnt"` |
| 39 |
+ Limit uint64 `json:"limit"` |
|
| 39 | 40 |
} |
| 41 |
+ |
|
| 40 | 42 |
type MemoryStats struct {
|
| 41 | 43 |
// memory used for cache |
| 42 | 44 |
Cache uint64 `json:"cache,omitempty"` |
| ... | ... |
@@ -49,6 +51,11 @@ type MemoryStats struct {
|
| 49 | 49 |
Stats map[string]uint64 `json:"stats,omitempty"` |
| 50 | 50 |
} |
| 51 | 51 |
|
| 52 |
+type PidsStats struct {
|
|
| 53 |
+ // number of pids in the cgroup |
|
| 54 |
+ Current uint64 `json:"current,omitempty"` |
|
| 55 |
+} |
|
| 56 |
+ |
|
| 52 | 57 |
type BlkioStatEntry struct {
|
| 53 | 58 |
Major uint64 `json:"major,omitempty"` |
| 54 | 59 |
Minor uint64 `json:"minor,omitempty"` |
| ... | ... |
@@ -80,6 +87,7 @@ type HugetlbStats struct {
|
| 80 | 80 |
type Stats struct {
|
| 81 | 81 |
CpuStats CpuStats `json:"cpu_stats,omitempty"` |
| 82 | 82 |
MemoryStats MemoryStats `json:"memory_stats,omitempty"` |
| 83 |
+ PidsStats PidsStats `json:"pids_stats,omitempty"` |
|
| 83 | 84 |
BlkioStats BlkioStats `json:"blkio_stats,omitempty"` |
| 84 | 85 |
// the map is in the format "size of hugepage: stats of the hugepage" |
| 85 | 86 |
HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` |
| ... | ... |
@@ -26,6 +26,10 @@ func (m *Manager) GetPids() ([]int, error) {
|
| 26 | 26 |
return nil, fmt.Errorf("Systemd not supported")
|
| 27 | 27 |
} |
| 28 | 28 |
|
| 29 |
+func (m *Manager) GetAllPids() ([]int, error) {
|
|
| 30 |
+ return nil, fmt.Errorf("Systemd not supported")
|
|
| 31 |
+} |
|
| 32 |
+ |
|
| 29 | 33 |
func (m *Manager) Destroy() error {
|
| 30 | 34 |
return fmt.Errorf("Systemd not supported")
|
| 31 | 35 |
} |
| ... | ... |
@@ -55,6 +55,7 @@ var subsystems = subsystemSet{
|
| 55 | 55 |
&fs.MemoryGroup{},
|
| 56 | 56 |
&fs.CpuGroup{},
|
| 57 | 57 |
&fs.CpuacctGroup{},
|
| 58 |
+ &fs.PidsGroup{},
|
|
| 58 | 59 |
&fs.BlkioGroup{},
|
| 59 | 60 |
&fs.HugetlbGroup{},
|
| 60 | 61 |
&fs.PerfEventGroup{},
|
| ... | ... |
@@ -167,6 +168,23 @@ func (m *Manager) Apply(pid int) error {
|
| 167 | 167 |
properties []systemdDbus.Property |
| 168 | 168 |
) |
| 169 | 169 |
|
| 170 |
+ if c.Paths != nil {
|
|
| 171 |
+ paths := make(map[string]string) |
|
| 172 |
+ for name, path := range c.Paths {
|
|
| 173 |
+ _, err := getSubsystemPath(m.Cgroups, name) |
|
| 174 |
+ if err != nil {
|
|
| 175 |
+ // Don't fail if a cgroup hierarchy was not found, just skip this subsystem |
|
| 176 |
+ if cgroups.IsNotFound(err) {
|
|
| 177 |
+ continue |
|
| 178 |
+ } |
|
| 179 |
+ return err |
|
| 180 |
+ } |
|
| 181 |
+ paths[name] = path |
|
| 182 |
+ } |
|
| 183 |
+ m.Paths = paths |
|
| 184 |
+ return cgroups.EnterPid(m.Paths, pid) |
|
| 185 |
+ } |
|
| 186 |
+ |
|
| 170 | 187 |
if c.Parent != "" {
|
| 171 | 188 |
slice = c.Parent |
| 172 | 189 |
} |
| ... | ... |
@@ -233,7 +251,7 @@ func (m *Manager) Apply(pid int) error {
|
| 233 | 233 |
return err |
| 234 | 234 |
} |
| 235 | 235 |
|
| 236 |
- // we need to manually join the freezer, net_cls, net_prio and cpuset cgroup in systemd |
|
| 236 |
+ // we need to manually join the freezer, net_cls, net_prio, pids and cpuset cgroup in systemd |
|
| 237 | 237 |
// because it does not currently support it via the dbus api. |
| 238 | 238 |
if err := joinFreezer(c, pid); err != nil {
|
| 239 | 239 |
return err |
| ... | ... |
@@ -246,6 +264,10 @@ func (m *Manager) Apply(pid int) error {
|
| 246 | 246 |
return err |
| 247 | 247 |
} |
| 248 | 248 |
|
| 249 |
+ if err := joinPids(c, pid); err != nil {
|
|
| 250 |
+ return err |
|
| 251 |
+ } |
|
| 252 |
+ |
|
| 249 | 253 |
if err := joinCpuset(c, pid); err != nil {
|
| 250 | 254 |
return err |
| 251 | 255 |
} |
| ... | ... |
@@ -277,17 +299,13 @@ func (m *Manager) Apply(pid int) error {
|
| 277 | 277 |
paths[s.Name()] = subsystemPath |
| 278 | 278 |
} |
| 279 | 279 |
m.Paths = paths |
| 280 |
- |
|
| 281 |
- if paths["cpu"] != "" {
|
|
| 282 |
- if err := fs.CheckCpushares(paths["cpu"], c.Resources.CpuShares); err != nil {
|
|
| 283 |
- return err |
|
| 284 |
- } |
|
| 285 |
- } |
|
| 286 |
- |
|
| 287 | 280 |
return nil |
| 288 | 281 |
} |
| 289 | 282 |
|
| 290 | 283 |
func (m *Manager) Destroy() error {
|
| 284 |
+ if m.Cgroups.Paths != nil {
|
|
| 285 |
+ return nil |
|
| 286 |
+ } |
|
| 291 | 287 |
m.mu.Lock() |
| 292 | 288 |
defer m.mu.Unlock() |
| 293 | 289 |
theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil) |
| ... | ... |
@@ -330,68 +348,74 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
|
| 330 | 330 |
} |
| 331 | 331 |
|
| 332 | 332 |
func joinCpu(c *configs.Cgroup, pid int) error {
|
| 333 |
- path, err := getSubsystemPath(c, "cpu") |
|
| 333 |
+ _, err := join(c, "cpu", pid) |
|
| 334 | 334 |
if err != nil && !cgroups.IsNotFound(err) {
|
| 335 | 335 |
return err |
| 336 | 336 |
} |
| 337 |
- if c.Resources.CpuQuota != 0 {
|
|
| 338 |
- if err = writeFile(path, "cpu.cfs_quota_us", strconv.FormatInt(c.Resources.CpuQuota, 10)); err != nil {
|
|
| 339 |
- return err |
|
| 340 |
- } |
|
| 341 |
- } |
|
| 342 |
- if c.Resources.CpuPeriod != 0 {
|
|
| 343 |
- if err = writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(c.Resources.CpuPeriod, 10)); err != nil {
|
|
| 344 |
- return err |
|
| 345 |
- } |
|
| 346 |
- } |
|
| 347 |
- if c.Resources.CpuRtPeriod != 0 {
|
|
| 348 |
- if err = writeFile(path, "cpu.rt_period_us", strconv.FormatInt(c.Resources.CpuRtPeriod, 10)); err != nil {
|
|
| 349 |
- return err |
|
| 350 |
- } |
|
| 351 |
- } |
|
| 352 |
- if c.Resources.CpuRtRuntime != 0 {
|
|
| 353 |
- if err = writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(c.Resources.CpuRtRuntime, 10)); err != nil {
|
|
| 354 |
- return err |
|
| 355 |
- } |
|
| 356 |
- } |
|
| 357 |
- |
|
| 358 | 337 |
return nil |
| 359 | 338 |
} |
| 360 | 339 |
|
| 361 | 340 |
func joinFreezer(c *configs.Cgroup, pid int) error {
|
| 362 |
- path, err := join(c, "freezer", pid) |
|
| 341 |
+ _, err := join(c, "freezer", pid) |
|
| 363 | 342 |
if err != nil && !cgroups.IsNotFound(err) {
|
| 364 | 343 |
return err |
| 365 | 344 |
} |
| 366 |
- freezer, err := subsystems.Get("freezer")
|
|
| 367 |
- if err != nil {
|
|
| 368 |
- return err |
|
| 369 |
- } |
|
| 370 |
- return freezer.Set(path, c) |
|
| 345 |
+ return nil |
|
| 371 | 346 |
} |
| 372 | 347 |
|
| 373 | 348 |
func joinNetPrio(c *configs.Cgroup, pid int) error {
|
| 374 |
- path, err := join(c, "net_prio", pid) |
|
| 349 |
+ _, err := join(c, "net_prio", pid) |
|
| 375 | 350 |
if err != nil && !cgroups.IsNotFound(err) {
|
| 376 | 351 |
return err |
| 377 | 352 |
} |
| 378 |
- netPrio, err := subsystems.Get("net_prio")
|
|
| 379 |
- if err != nil {
|
|
| 380 |
- return err |
|
| 381 |
- } |
|
| 382 |
- return netPrio.Set(path, c) |
|
| 353 |
+ return nil |
|
| 383 | 354 |
} |
| 384 | 355 |
|
| 385 | 356 |
func joinNetCls(c *configs.Cgroup, pid int) error {
|
| 386 |
- path, err := join(c, "net_cls", pid) |
|
| 357 |
+ _, err := join(c, "net_cls", pid) |
|
| 387 | 358 |
if err != nil && !cgroups.IsNotFound(err) {
|
| 388 | 359 |
return err |
| 389 | 360 |
} |
| 390 |
- netcls, err := subsystems.Get("net_cls")
|
|
| 391 |
- if err != nil {
|
|
| 361 |
+ return nil |
|
| 362 |
+} |
|
| 363 |
+ |
|
| 364 |
+func joinPids(c *configs.Cgroup, pid int) error {
|
|
| 365 |
+ _, err := join(c, "pids", pid) |
|
| 366 |
+ if err != nil && !cgroups.IsNotFound(err) {
|
|
| 392 | 367 |
return err |
| 393 | 368 |
} |
| 394 |
- return netcls.Set(path, c) |
|
| 369 |
+ return nil |
|
| 370 |
+} |
|
| 371 |
+ |
|
| 372 |
+// systemd represents slice hierarchy using `-`, so we need to follow suit when |
|
| 373 |
+// generating the path of slice. Essentially, test-a-b.slice becomes |
|
| 374 |
+// test.slice/test-a.slice/test-a-b.slice. |
|
| 375 |
+func expandSlice(slice string) (string, error) {
|
|
| 376 |
+ suffix := ".slice" |
|
| 377 |
+ // Name has to end with ".slice", but can't be just ".slice". |
|
| 378 |
+ if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
|
|
| 379 |
+ return "", fmt.Errorf("invalid slice name: %s", slice)
|
|
| 380 |
+ } |
|
| 381 |
+ |
|
| 382 |
+ // Path-separators are not allowed. |
|
| 383 |
+ if strings.Contains(slice, "/") {
|
|
| 384 |
+ return "", fmt.Errorf("invalid slice name: %s", slice)
|
|
| 385 |
+ } |
|
| 386 |
+ |
|
| 387 |
+ var path, prefix string |
|
| 388 |
+ sliceName := strings.TrimSuffix(slice, suffix) |
|
| 389 |
+ for _, component := range strings.Split(sliceName, "-") {
|
|
| 390 |
+ // test--a.slice isn't permitted, nor is -test.slice. |
|
| 391 |
+ if component == "" {
|
|
| 392 |
+ return "", fmt.Errorf("invalid slice name: %s", slice)
|
|
| 393 |
+ } |
|
| 394 |
+ |
|
| 395 |
+ // Append the component to the path and to the prefix. |
|
| 396 |
+ path += prefix + component + suffix + "/" |
|
| 397 |
+ prefix += component + "-" |
|
| 398 |
+ } |
|
| 399 |
+ |
|
| 400 |
+ return path, nil |
|
| 395 | 401 |
} |
| 396 | 402 |
|
| 397 | 403 |
func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
|
| ... | ... |
@@ -410,6 +434,11 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
|
| 410 | 410 |
slice = c.Parent |
| 411 | 411 |
} |
| 412 | 412 |
|
| 413 |
+ slice, err = expandSlice(slice) |
|
| 414 |
+ if err != nil {
|
|
| 415 |
+ return "", err |
|
| 416 |
+ } |
|
| 417 |
+ |
|
| 413 | 418 |
return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil |
| 414 | 419 |
} |
| 415 | 420 |
|
| ... | ... |
@@ -440,6 +469,14 @@ func (m *Manager) GetPids() ([]int, error) {
|
| 440 | 440 |
return cgroups.GetPids(path) |
| 441 | 441 |
} |
| 442 | 442 |
|
| 443 |
+func (m *Manager) GetAllPids() ([]int, error) {
|
|
| 444 |
+ path, err := getSubsystemPath(m.Cgroups, "devices") |
|
| 445 |
+ if err != nil {
|
|
| 446 |
+ return nil, err |
|
| 447 |
+ } |
|
| 448 |
+ return cgroups.GetAllPids(path) |
|
| 449 |
+} |
|
| 450 |
+ |
|
| 443 | 451 |
func (m *Manager) GetStats() (*cgroups.Stats, error) {
|
| 444 | 452 |
m.mu.Lock() |
| 445 | 453 |
defer m.mu.Unlock() |
| ... | ... |
@@ -458,16 +495,23 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
|
| 458 | 458 |
} |
| 459 | 459 |
|
| 460 | 460 |
func (m *Manager) Set(container *configs.Config) error {
|
| 461 |
- for name, path := range m.Paths {
|
|
| 462 |
- sys, err := subsystems.Get(name) |
|
| 463 |
- if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
|
|
| 464 |
- continue |
|
| 461 |
+ for _, sys := range subsystems {
|
|
| 462 |
+ // Get the subsystem path, but don't error out for not found cgroups. |
|
| 463 |
+ path, err := getSubsystemPath(container.Cgroups, sys.Name()) |
|
| 464 |
+ if err != nil && !cgroups.IsNotFound(err) {
|
|
| 465 |
+ return err |
|
| 465 | 466 |
} |
| 467 |
+ |
|
| 466 | 468 |
if err := sys.Set(path, container.Cgroups); err != nil {
|
| 467 | 469 |
return err |
| 468 | 470 |
} |
| 469 | 471 |
} |
| 470 | 472 |
|
| 473 |
+ if m.Paths["cpu"] != "" {
|
|
| 474 |
+ if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
|
|
| 475 |
+ return err |
|
| 476 |
+ } |
|
| 477 |
+ } |
|
| 471 | 478 |
return nil |
| 472 | 479 |
} |
| 473 | 480 |
|
| ... | ... |
@@ -487,17 +531,13 @@ func getUnitName(c *configs.Cgroup) string {
|
| 487 | 487 |
// because systemd will re-write the device settings if it needs to re-apply the cgroup context. |
| 488 | 488 |
// This happens at least for v208 when any sibling unit is started. |
| 489 | 489 |
func joinDevices(c *configs.Cgroup, pid int) error {
|
| 490 |
- path, err := join(c, "devices", pid) |
|
| 490 |
+ _, err := join(c, "devices", pid) |
|
| 491 | 491 |
// Even if it's `not found` error, we'll return err because devices cgroup |
| 492 | 492 |
// is hard requirement for container security. |
| 493 | 493 |
if err != nil {
|
| 494 | 494 |
return err |
| 495 | 495 |
} |
| 496 |
- devices, err := subsystems.Get("devices")
|
|
| 497 |
- if err != nil {
|
|
| 498 |
- return err |
|
| 499 |
- } |
|
| 500 |
- return devices.Set(path, c) |
|
| 496 |
+ return nil |
|
| 501 | 497 |
} |
| 502 | 498 |
|
| 503 | 499 |
func setKernelMemory(c *configs.Cgroup) error {
|
| ... | ... |
@@ -510,52 +550,16 @@ func setKernelMemory(c *configs.Cgroup) error {
|
| 510 | 510 |
return err |
| 511 | 511 |
} |
| 512 | 512 |
|
| 513 |
- if c.Resources.KernelMemory > 0 {
|
|
| 514 |
- err = writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(c.Resources.KernelMemory, 10)) |
|
| 515 |
- if err != nil {
|
|
| 516 |
- return err |
|
| 517 |
- } |
|
| 518 |
- } |
|
| 519 |
- |
|
| 520 |
- return nil |
|
| 513 |
+ // This doesn't get called by manager.Set, so we need to do it here. |
|
| 514 |
+ s := &fs.MemoryGroup{}
|
|
| 515 |
+ return s.SetKernelMemory(path, c) |
|
| 521 | 516 |
} |
| 522 | 517 |
|
| 523 | 518 |
func joinMemory(c *configs.Cgroup, pid int) error {
|
| 524 |
- path, err := getSubsystemPath(c, "memory") |
|
| 519 |
+ _, err := join(c, "memory", pid) |
|
| 525 | 520 |
if err != nil && !cgroups.IsNotFound(err) {
|
| 526 | 521 |
return err |
| 527 | 522 |
} |
| 528 |
- |
|
| 529 |
- // -1 disables memoryswap |
|
| 530 |
- if c.Resources.MemorySwap > 0 {
|
|
| 531 |
- err = writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Resources.MemorySwap, 10)) |
|
| 532 |
- if err != nil {
|
|
| 533 |
- return err |
|
| 534 |
- } |
|
| 535 |
- } |
|
| 536 |
- if c.Resources.MemoryReservation > 0 {
|
|
| 537 |
- err = writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Resources.MemoryReservation, 10)) |
|
| 538 |
- if err != nil {
|
|
| 539 |
- return err |
|
| 540 |
- } |
|
| 541 |
- } |
|
| 542 |
- if c.Resources.OomKillDisable {
|
|
| 543 |
- if err := writeFile(path, "memory.oom_control", "1"); err != nil {
|
|
| 544 |
- return err |
|
| 545 |
- } |
|
| 546 |
- } |
|
| 547 |
- |
|
| 548 |
- if c.Resources.MemorySwappiness >= 0 && c.Resources.MemorySwappiness <= 100 {
|
|
| 549 |
- err = writeFile(path, "memory.swappiness", strconv.FormatInt(c.Resources.MemorySwappiness, 10)) |
|
| 550 |
- if err != nil {
|
|
| 551 |
- return err |
|
| 552 |
- } |
|
| 553 |
- } else if c.Resources.MemorySwappiness == -1 {
|
|
| 554 |
- return nil |
|
| 555 |
- } else {
|
|
| 556 |
- return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", c.Resources.MemorySwappiness)
|
|
| 557 |
- } |
|
| 558 |
- |
|
| 559 | 523 |
return nil |
| 560 | 524 |
} |
| 561 | 525 |
|
| ... | ... |
@@ -577,68 +581,25 @@ func joinCpuset(c *configs.Cgroup, pid int) error {
|
| 577 | 577 |
// expects device path instead of major minor numbers, which is also confusing |
| 578 | 578 |
// for users. So we use fs work around for now. |
| 579 | 579 |
func joinBlkio(c *configs.Cgroup, pid int) error {
|
| 580 |
- path, err := getSubsystemPath(c, "blkio") |
|
| 580 |
+ _, err := join(c, "blkio", pid) |
|
| 581 | 581 |
if err != nil {
|
| 582 | 582 |
return err |
| 583 | 583 |
} |
| 584 |
- // systemd doesn't directly support this in the dbus properties |
|
| 585 |
- if c.Resources.BlkioLeafWeight != 0 {
|
|
| 586 |
- if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(c.Resources.BlkioLeafWeight), 10)); err != nil {
|
|
| 587 |
- return err |
|
| 588 |
- } |
|
| 589 |
- } |
|
| 590 |
- for _, wd := range c.Resources.BlkioWeightDevice {
|
|
| 591 |
- if err := writeFile(path, "blkio.weight_device", wd.WeightString()); err != nil {
|
|
| 592 |
- return err |
|
| 593 |
- } |
|
| 594 |
- if err := writeFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
|
|
| 595 |
- return err |
|
| 596 |
- } |
|
| 597 |
- } |
|
| 598 |
- for _, td := range c.Resources.BlkioThrottleReadBpsDevice {
|
|
| 599 |
- if err := writeFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
|
|
| 600 |
- return err |
|
| 601 |
- } |
|
| 602 |
- } |
|
| 603 |
- for _, td := range c.Resources.BlkioThrottleWriteBpsDevice {
|
|
| 604 |
- if err := writeFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
|
|
| 605 |
- return err |
|
| 606 |
- } |
|
| 607 |
- } |
|
| 608 |
- for _, td := range c.Resources.BlkioThrottleReadIOPSDevice {
|
|
| 609 |
- if err := writeFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
|
|
| 610 |
- return err |
|
| 611 |
- } |
|
| 612 |
- } |
|
| 613 |
- for _, td := range c.Resources.BlkioThrottleWriteIOPSDevice {
|
|
| 614 |
- if err := writeFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
|
|
| 615 |
- return err |
|
| 616 |
- } |
|
| 617 |
- } |
|
| 618 |
- |
|
| 619 | 584 |
return nil |
| 620 | 585 |
} |
| 621 | 586 |
|
| 622 | 587 |
func joinHugetlb(c *configs.Cgroup, pid int) error {
|
| 623 |
- path, err := join(c, "hugetlb", pid) |
|
| 588 |
+ _, err := join(c, "hugetlb", pid) |
|
| 624 | 589 |
if err != nil && !cgroups.IsNotFound(err) {
|
| 625 | 590 |
return err |
| 626 | 591 |
} |
| 627 |
- hugetlb, err := subsystems.Get("hugetlb")
|
|
| 628 |
- if err != nil {
|
|
| 629 |
- return err |
|
| 630 |
- } |
|
| 631 |
- return hugetlb.Set(path, c) |
|
| 592 |
+ return nil |
|
| 632 | 593 |
} |
| 633 | 594 |
|
| 634 | 595 |
func joinPerfEvent(c *configs.Cgroup, pid int) error {
|
| 635 |
- path, err := join(c, "perf_event", pid) |
|
| 596 |
+ _, err := join(c, "perf_event", pid) |
|
| 636 | 597 |
if err != nil && !cgroups.IsNotFound(err) {
|
| 637 | 598 |
return err |
| 638 | 599 |
} |
| 639 |
- perfEvent, err := subsystems.Get("perf_event")
|
|
| 640 |
- if err != nil {
|
|
| 641 |
- return err |
|
| 642 |
- } |
|
| 643 |
- return perfEvent.Set(path, c) |
|
| 600 |
+ return nil |
|
| 644 | 601 |
} |
| ... | ... |
@@ -5,6 +5,7 @@ package cgroups |
| 5 | 5 |
import ( |
| 6 | 6 |
"bufio" |
| 7 | 7 |
"fmt" |
| 8 |
+ "io" |
|
| 8 | 9 |
"io/ioutil" |
| 9 | 10 |
"os" |
| 10 | 11 |
"path/filepath" |
| ... | ... |
@@ -12,7 +13,6 @@ import ( |
| 12 | 12 |
"strings" |
| 13 | 13 |
"time" |
| 14 | 14 |
|
| 15 |
- "github.com/docker/docker/pkg/mount" |
|
| 16 | 15 |
"github.com/docker/go-units" |
| 17 | 16 |
) |
| 18 | 17 |
|
| ... | ... |
@@ -84,10 +84,19 @@ func FindCgroupMountpointDir() (string, error) {
|
| 84 | 84 |
// Safe as mountinfo encodes mountpoints with spaces as \040. |
| 85 | 85 |
index := strings.Index(text, " - ") |
| 86 | 86 |
postSeparatorFields := strings.Fields(text[index+3:]) |
| 87 |
- if len(postSeparatorFields) < 3 {
|
|
| 88 |
- return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
|
|
| 87 |
+ numPostFields := len(postSeparatorFields) |
|
| 88 |
+ |
|
| 89 |
+ // This is an error as we can't detect if the mount is for "cgroup" |
|
| 90 |
+ if numPostFields == 0 {
|
|
| 91 |
+ return "", fmt.Errorf("Found no fields post '-' in %q", text)
|
|
| 89 | 92 |
} |
| 93 |
+ |
|
| 90 | 94 |
if postSeparatorFields[0] == "cgroup" {
|
| 95 |
+ // Check that the mount is properly formatted. |
|
| 96 |
+ if numPostFields < 3 {
|
|
| 97 |
+ return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
|
|
| 98 |
+ } |
|
| 99 |
+ |
|
| 91 | 100 |
return filepath.Dir(fields[4]), nil |
| 92 | 101 |
} |
| 93 | 102 |
} |
| ... | ... |
@@ -112,11 +121,45 @@ func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) {
|
| 112 | 112 |
return getControllerPath(m.Subsystems[0], cgroups) |
| 113 | 113 |
} |
| 114 | 114 |
|
| 115 |
+func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) {
|
|
| 116 |
+ res := make([]Mount, 0, len(ss)) |
|
| 117 |
+ scanner := bufio.NewScanner(mi) |
|
| 118 |
+ for scanner.Scan() {
|
|
| 119 |
+ txt := scanner.Text() |
|
| 120 |
+ sepIdx := strings.IndexByte(txt, '-') |
|
| 121 |
+ if sepIdx == -1 {
|
|
| 122 |
+ return nil, fmt.Errorf("invalid mountinfo format")
|
|
| 123 |
+ } |
|
| 124 |
+ if txt[sepIdx+2:sepIdx+8] != "cgroup" {
|
|
| 125 |
+ continue |
|
| 126 |
+ } |
|
| 127 |
+ fields := strings.Split(txt, " ") |
|
| 128 |
+ m := Mount{
|
|
| 129 |
+ Mountpoint: fields[4], |
|
| 130 |
+ Root: fields[3], |
|
| 131 |
+ } |
|
| 132 |
+ for _, opt := range strings.Split(fields[len(fields)-1], ",") {
|
|
| 133 |
+ if strings.HasPrefix(opt, cgroupNamePrefix) {
|
|
| 134 |
+ m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):]) |
|
| 135 |
+ } |
|
| 136 |
+ if ss[opt] {
|
|
| 137 |
+ m.Subsystems = append(m.Subsystems, opt) |
|
| 138 |
+ } |
|
| 139 |
+ } |
|
| 140 |
+ res = append(res, m) |
|
| 141 |
+ } |
|
| 142 |
+ if err := scanner.Err(); err != nil {
|
|
| 143 |
+ return nil, err |
|
| 144 |
+ } |
|
| 145 |
+ return res, nil |
|
| 146 |
+} |
|
| 147 |
+ |
|
| 115 | 148 |
func GetCgroupMounts() ([]Mount, error) {
|
| 116 |
- mounts, err := mount.GetMounts() |
|
| 149 |
+ f, err := os.Open("/proc/self/mountinfo")
|
|
| 117 | 150 |
if err != nil {
|
| 118 | 151 |
return nil, err |
| 119 | 152 |
} |
| 153 |
+ defer f.Close() |
|
| 120 | 154 |
|
| 121 | 155 |
all, err := GetAllSubsystems() |
| 122 | 156 |
if err != nil {
|
| ... | ... |
@@ -127,24 +170,7 @@ func GetCgroupMounts() ([]Mount, error) {
|
| 127 | 127 |
for _, s := range all {
|
| 128 | 128 |
allMap[s] = true |
| 129 | 129 |
} |
| 130 |
- |
|
| 131 |
- res := []Mount{}
|
|
| 132 |
- for _, mount := range mounts {
|
|
| 133 |
- if mount.Fstype == "cgroup" {
|
|
| 134 |
- m := Mount{Mountpoint: mount.Mountpoint, Root: mount.Root}
|
|
| 135 |
- |
|
| 136 |
- for _, opt := range strings.Split(mount.VfsOpts, ",") {
|
|
| 137 |
- if strings.HasPrefix(opt, cgroupNamePrefix) {
|
|
| 138 |
- m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):]) |
|
| 139 |
- } |
|
| 140 |
- if allMap[opt] {
|
|
| 141 |
- m.Subsystems = append(m.Subsystems, opt) |
|
| 142 |
- } |
|
| 143 |
- } |
|
| 144 |
- res = append(res, m) |
|
| 145 |
- } |
|
| 146 |
- } |
|
| 147 |
- return res, nil |
|
| 130 |
+ return getCgroupMountsHelper(allMap, f) |
|
| 148 | 131 |
} |
| 149 | 132 |
|
| 150 | 133 |
// Returns all the cgroup subsystems supported by the kernel |
| ... | ... |
@@ -323,9 +349,14 @@ func GetHugePageSize() ([]string, error) {
|
| 323 | 323 |
return pageSizes, nil |
| 324 | 324 |
} |
| 325 | 325 |
|
| 326 |
-// GetPids returns all pids, that were added to cgroup at path and to all its |
|
| 327 |
-// subcgroups. |
|
| 326 |
+// GetPids returns all pids, that were added to cgroup at path. |
|
| 328 | 327 |
func GetPids(path string) ([]int, error) {
|
| 328 |
+ return readProcsFile(path) |
|
| 329 |
+} |
|
| 330 |
+ |
|
| 331 |
+// GetAllPids returns all pids, that were added to cgroup at path and to all its |
|
| 332 |
+// subcgroups. |
|
| 333 |
+func GetAllPids(path string) ([]int, error) {
|
|
| 329 | 334 |
var pids []int |
| 330 | 335 |
// collect pids from all sub-cgroups |
| 331 | 336 |
err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error {
|
| ... | ... |
@@ -11,25 +11,38 @@ const ( |
| 11 | 11 |
) |
| 12 | 12 |
|
| 13 | 13 |
type Cgroup struct {
|
| 14 |
- Name string `json:"name"` |
|
| 14 |
+ // Deprecated, use Path instead |
|
| 15 |
+ Name string `json:"name,omitempty"` |
|
| 15 | 16 |
|
| 16 |
- // name of parent cgroup or slice |
|
| 17 |
- Parent string `json:"parent"` |
|
| 17 |
+ // name of parent of cgroup or slice |
|
| 18 |
+ // Deprecated, use Path instead |
|
| 19 |
+ Parent string `json:"parent,omitempty"` |
|
| 20 |
+ |
|
| 21 |
+ // Path specifies the path to cgroups that are created and/or joined by the container. |
|
| 22 |
+ // The path is assumed to be relative to the host system cgroup mountpoint. |
|
| 23 |
+ Path string `json:"path"` |
|
| 18 | 24 |
|
| 19 | 25 |
// ScopePrefix describes prefix for the scope name |
| 20 | 26 |
ScopePrefix string `json:"scope_prefix"` |
| 21 | 27 |
|
| 28 |
+ // Paths represent the absolute cgroups paths to join. |
|
| 29 |
+ // This takes precedence over Path. |
|
| 30 |
+ Paths map[string]string |
|
| 31 |
+ |
|
| 22 | 32 |
// Resources contains various cgroups settings to apply |
| 23 | 33 |
*Resources |
| 24 | 34 |
} |
| 25 | 35 |
|
| 26 | 36 |
type Resources struct {
|
| 27 | 37 |
// If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list. |
| 28 |
- AllowAllDevices bool `json:"allow_all_devices"` |
|
| 38 |
+ // Deprecated |
|
| 39 |
+ AllowAllDevices bool `json:"allow_all_devices,omitempty"` |
|
| 40 |
+ // Deprecated |
|
| 41 |
+ AllowedDevices []*Device `json:"allowed_devices,omitempty"` |
|
| 42 |
+ // Deprecated |
|
| 43 |
+ DeniedDevices []*Device `json:"denied_devices,omitempty"` |
|
| 29 | 44 |
|
| 30 |
- AllowedDevices []*Device `json:"allowed_devices"` |
|
| 31 |
- |
|
| 32 |
- DeniedDevices []*Device `json:"denied_devices"` |
|
| 45 |
+ Devices []*Device `json:"devices"` |
|
| 33 | 46 |
|
| 34 | 47 |
// Memory limit (in bytes) |
| 35 | 48 |
Memory int64 `json:"memory"` |
| ... | ... |
@@ -37,7 +50,7 @@ type Resources struct {
|
| 37 | 37 |
// Memory reservation or soft_limit (in bytes) |
| 38 | 38 |
MemoryReservation int64 `json:"memory_reservation"` |
| 39 | 39 |
|
| 40 |
- // Total memory usage (memory + swap); set `-1' to disable swap |
|
| 40 |
+ // Total memory usage (memory + swap); set `-1` to enable unlimited swap |
|
| 41 | 41 |
MemorySwap int64 `json:"memory_swap"` |
| 42 | 42 |
|
| 43 | 43 |
// Kernel memory limit (in bytes) |
| ... | ... |
@@ -64,6 +77,9 @@ type Resources struct {
|
| 64 | 64 |
// MEM to use |
| 65 | 65 |
CpusetMems string `json:"cpuset_mems"` |
| 66 | 66 |
|
| 67 |
+ // Process limit; set <= `0` to disable limit. |
|
| 68 |
+ PidsLimit int64 `json:"pids_limit"` |
|
| 69 |
+ |
|
| 67 | 70 |
// Specifies per cgroup weight, range is from 10 to 1000. |
| 68 | 71 |
BlkioWeight uint16 `json:"blkio_weight"` |
| 69 | 72 |
|
| ... | ... |
@@ -171,6 +171,9 @@ type Config struct {
|
| 171 | 171 |
// A default action to be taken if no rules match is also given. |
| 172 | 172 |
Seccomp *Seccomp `json:"seccomp"` |
| 173 | 173 |
|
| 174 |
+ // NoNewPrivileges controls whether processes in the container can gain additional privileges. |
|
| 175 |
+ NoNewPrivileges bool `json:"no_new_privileges"` |
|
| 176 |
+ |
|
| 174 | 177 |
// Hooks are a collection of actions to perform at various container lifecycle events. |
| 175 | 178 |
// Hooks are not able to be marshaled to json but they are also not needed to. |
| 176 | 179 |
Hooks *Hooks `json:"-"` |
| ... | ... |
@@ -82,20 +82,6 @@ var ( |
| 82 | 82 |
Minor: 1, |
| 83 | 83 |
Permissions: "rwm", |
| 84 | 84 |
}, |
| 85 |
- {
|
|
| 86 |
- Path: "/dev/tty0", |
|
| 87 |
- Type: 'c', |
|
| 88 |
- Major: 4, |
|
| 89 |
- Minor: 0, |
|
| 90 |
- Permissions: "rwm", |
|
| 91 |
- }, |
|
| 92 |
- {
|
|
| 93 |
- Path: "/dev/tty1", |
|
| 94 |
- Type: 'c', |
|
| 95 |
- Major: 4, |
|
| 96 |
- Minor: 1, |
|
| 97 |
- Permissions: "rwm", |
|
| 98 |
- }, |
|
| 99 | 85 |
// /dev/pts/ - pts namespaces are "coming soon" |
| 100 | 86 |
{
|
| 101 | 87 |
Path: "", |
| ... | ... |
@@ -6,6 +6,7 @@ package libcontainer |
| 6 | 6 |
|
| 7 | 7 |
import ( |
| 8 | 8 |
"os" |
| 9 |
+ "time" |
|
| 9 | 10 |
|
| 10 | 11 |
"github.com/opencontainers/runc/libcontainer/configs" |
| 11 | 12 |
) |
| ... | ... |
@@ -14,8 +15,11 @@ import ( |
| 14 | 14 |
type Status int |
| 15 | 15 |
|
| 16 | 16 |
const ( |
| 17 |
+ // The container exists but has not been run yet |
|
| 18 |
+ Created Status = iota |
|
| 19 |
+ |
|
| 17 | 20 |
// The container exists and is running. |
| 18 |
- Running Status = iota + 1 |
|
| 21 |
+ Running |
|
| 19 | 22 |
|
| 20 | 23 |
// The container exists, it is in the process of being paused. |
| 21 | 24 |
Pausing |
| ... | ... |
@@ -30,6 +34,25 @@ const ( |
| 30 | 30 |
Destroyed |
| 31 | 31 |
) |
| 32 | 32 |
|
| 33 |
+func (s Status) String() string {
|
|
| 34 |
+ switch s {
|
|
| 35 |
+ case Created: |
|
| 36 |
+ return "created" |
|
| 37 |
+ case Running: |
|
| 38 |
+ return "running" |
|
| 39 |
+ case Pausing: |
|
| 40 |
+ return "pausing" |
|
| 41 |
+ case Paused: |
|
| 42 |
+ return "paused" |
|
| 43 |
+ case Checkpointed: |
|
| 44 |
+ return "checkpointed" |
|
| 45 |
+ case Destroyed: |
|
| 46 |
+ return "destroyed" |
|
| 47 |
+ default: |
|
| 48 |
+ return "unknown" |
|
| 49 |
+ } |
|
| 50 |
+} |
|
| 51 |
+ |
|
| 33 | 52 |
// BaseState represents the platform agnostic pieces relating to a |
| 34 | 53 |
// running container's state |
| 35 | 54 |
type BaseState struct {
|
| ... | ... |
@@ -39,9 +62,12 @@ type BaseState struct {
|
| 39 | 39 |
// InitProcessPid is the init process id in the parent namespace. |
| 40 | 40 |
InitProcessPid int `json:"init_process_pid"` |
| 41 | 41 |
|
| 42 |
- // InitProcessStartTime is the init process start time. |
|
| 42 |
+ // InitProcessStartTime is the init process start time in clock cycles since boot time. |
|
| 43 | 43 |
InitProcessStartTime string `json:"init_process_start"` |
| 44 | 44 |
|
| 45 |
+ // Created is the unix timestamp for the creation time of the container in UTC |
|
| 46 |
+ Created time.Time `json:"created"` |
|
| 47 |
+ |
|
| 45 | 48 |
// Config is the container's configuration. |
| 46 | 49 |
Config configs.Config `json:"config"` |
| 47 | 50 |
} |
| ... | ... |
@@ -15,6 +15,7 @@ import ( |
| 15 | 15 |
"strings" |
| 16 | 16 |
"sync" |
| 17 | 17 |
"syscall" |
| 18 |
+ "time" |
|
| 18 | 19 |
|
| 19 | 20 |
"github.com/Sirupsen/logrus" |
| 20 | 21 |
"github.com/golang/protobuf/proto" |
| ... | ... |
@@ -38,6 +39,8 @@ type linuxContainer struct {
|
| 38 | 38 |
criuPath string |
| 39 | 39 |
m sync.Mutex |
| 40 | 40 |
criuVersion int |
| 41 |
+ state containerState |
|
| 42 |
+ created time.Time |
|
| 41 | 43 |
} |
| 42 | 44 |
|
| 43 | 45 |
// State represents a running container's state |
| ... | ... |
@@ -104,6 +107,12 @@ type Container interface {
|
| 104 | 104 |
// errors: |
| 105 | 105 |
// Systemerror - System error. |
| 106 | 106 |
NotifyOOM() (<-chan struct{}, error)
|
| 107 |
+ |
|
| 108 |
+ // NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level |
|
| 109 |
+ // |
|
| 110 |
+ // errors: |
|
| 111 |
+ // Systemerror - System error. |
|
| 112 |
+ NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
|
|
| 107 | 113 |
} |
| 108 | 114 |
|
| 109 | 115 |
// ID returns the container's unique ID |
| ... | ... |
@@ -129,7 +138,7 @@ func (c *linuxContainer) State() (*State, error) {
|
| 129 | 129 |
} |
| 130 | 130 |
|
| 131 | 131 |
func (c *linuxContainer) Processes() ([]int, error) {
|
| 132 |
- pids, err := c.cgroupManager.GetPids() |
|
| 132 |
+ pids, err := c.cgroupManager.GetAllPids() |
|
| 133 | 133 |
if err != nil {
|
| 134 | 134 |
return nil, newSystemError(err) |
| 135 | 135 |
} |
| ... | ... |
@@ -183,22 +192,30 @@ func (c *linuxContainer) Start(process *Process) error {
|
| 183 | 183 |
} |
| 184 | 184 |
return newSystemError(err) |
| 185 | 185 |
} |
| 186 |
+ // generate a timestamp indicating when the container was started |
|
| 187 |
+ c.created = time.Now().UTC() |
|
| 188 |
+ |
|
| 189 |
+ c.state = &runningState{
|
|
| 190 |
+ c: c, |
|
| 191 |
+ } |
|
| 186 | 192 |
if doInit {
|
| 187 |
- c.updateState(parent) |
|
| 188 |
- } |
|
| 189 |
- if c.config.Hooks != nil {
|
|
| 190 |
- s := configs.HookState{
|
|
| 191 |
- Version: c.config.Version, |
|
| 192 |
- ID: c.id, |
|
| 193 |
- Pid: parent.pid(), |
|
| 194 |
- Root: c.config.Rootfs, |
|
| 193 |
+ if err := c.updateState(parent); err != nil {
|
|
| 194 |
+ return err |
|
| 195 | 195 |
} |
| 196 |
- for _, hook := range c.config.Hooks.Poststart {
|
|
| 197 |
- if err := hook.Run(s); err != nil {
|
|
| 198 |
- if err := parent.terminate(); err != nil {
|
|
| 199 |
- logrus.Warn(err) |
|
| 196 |
+ if c.config.Hooks != nil {
|
|
| 197 |
+ s := configs.HookState{
|
|
| 198 |
+ Version: c.config.Version, |
|
| 199 |
+ ID: c.id, |
|
| 200 |
+ Pid: parent.pid(), |
|
| 201 |
+ Root: c.config.Rootfs, |
|
| 202 |
+ } |
|
| 203 |
+ for _, hook := range c.config.Hooks.Poststart {
|
|
| 204 |
+ if err := hook.Run(s); err != nil {
|
|
| 205 |
+ if err := parent.terminate(); err != nil {
|
|
| 206 |
+ logrus.Warn(err) |
|
| 207 |
+ } |
|
| 208 |
+ return newSystemError(err) |
|
| 200 | 209 |
} |
| 201 |
- return newSystemError(err) |
|
| 202 | 210 |
} |
| 203 | 211 |
} |
| 204 | 212 |
} |
| ... | ... |
@@ -251,7 +268,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. |
| 251 | 251 |
} |
| 252 | 252 |
|
| 253 | 253 |
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
|
| 254 |
- t := "_LIBCONTAINER_INITTYPE=standard" |
|
| 254 |
+ t := "_LIBCONTAINER_INITTYPE=" + string(initStandard) |
|
| 255 | 255 |
cloneFlags := c.config.Namespaces.CloneFlags() |
| 256 | 256 |
if cloneFlags&syscall.CLONE_NEWUSER != 0 {
|
| 257 | 257 |
if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil {
|
| ... | ... |
@@ -278,7 +295,7 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c |
| 278 | 278 |
} |
| 279 | 279 |
|
| 280 | 280 |
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
|
| 281 |
- cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE=setns") |
|
| 281 |
+ cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) |
|
| 282 | 282 |
// for setns process, we dont have to set cloneflags as the process namespaces |
| 283 | 283 |
// will only be set via setns syscall |
| 284 | 284 |
data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath) |
| ... | ... |
@@ -321,54 +338,53 @@ func newPipe() (parent *os.File, child *os.File, err error) {
|
| 321 | 321 |
func (c *linuxContainer) Destroy() error {
|
| 322 | 322 |
c.m.Lock() |
| 323 | 323 |
defer c.m.Unlock() |
| 324 |
+ return c.state.destroy() |
|
| 325 |
+} |
|
| 326 |
+ |
|
| 327 |
+func (c *linuxContainer) Pause() error {
|
|
| 328 |
+ c.m.Lock() |
|
| 329 |
+ defer c.m.Unlock() |
|
| 324 | 330 |
status, err := c.currentStatus() |
| 325 | 331 |
if err != nil {
|
| 326 | 332 |
return err |
| 327 | 333 |
} |
| 328 |
- if status != Destroyed {
|
|
| 329 |
- return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
|
|
| 334 |
+ if status != Running {
|
|
| 335 |
+ return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
|
|
| 330 | 336 |
} |
| 331 |
- if !c.config.Namespaces.Contains(configs.NEWPID) {
|
|
| 332 |
- if err := killCgroupProcesses(c.cgroupManager); err != nil {
|
|
| 333 |
- logrus.Warn(err) |
|
| 334 |
- } |
|
| 335 |
- } |
|
| 336 |
- err = c.cgroupManager.Destroy() |
|
| 337 |
- if rerr := os.RemoveAll(c.root); err == nil {
|
|
| 338 |
- err = rerr |
|
| 339 |
- } |
|
| 340 |
- c.initProcess = nil |
|
| 341 |
- if c.config.Hooks != nil {
|
|
| 342 |
- s := configs.HookState{
|
|
| 343 |
- Version: c.config.Version, |
|
| 344 |
- ID: c.id, |
|
| 345 |
- Root: c.config.Rootfs, |
|
| 346 |
- } |
|
| 347 |
- for _, hook := range c.config.Hooks.Poststop {
|
|
| 348 |
- if err := hook.Run(s); err != nil {
|
|
| 349 |
- return err |
|
| 350 |
- } |
|
| 351 |
- } |
|
| 337 |
+ if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
|
|
| 338 |
+ return err |
|
| 352 | 339 |
} |
| 353 |
- return err |
|
| 354 |
-} |
|
| 355 |
- |
|
| 356 |
-func (c *linuxContainer) Pause() error {
|
|
| 357 |
- c.m.Lock() |
|
| 358 |
- defer c.m.Unlock() |
|
| 359 |
- return c.cgroupManager.Freeze(configs.Frozen) |
|
| 340 |
+ return c.state.transition(&pausedState{
|
|
| 341 |
+ c: c, |
|
| 342 |
+ }) |
|
| 360 | 343 |
} |
| 361 | 344 |
|
| 362 | 345 |
func (c *linuxContainer) Resume() error {
|
| 363 | 346 |
c.m.Lock() |
| 364 | 347 |
defer c.m.Unlock() |
| 365 |
- return c.cgroupManager.Freeze(configs.Thawed) |
|
| 348 |
+ status, err := c.currentStatus() |
|
| 349 |
+ if err != nil {
|
|
| 350 |
+ return err |
|
| 351 |
+ } |
|
| 352 |
+ if status != Paused {
|
|
| 353 |
+ return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
|
|
| 354 |
+ } |
|
| 355 |
+ if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
|
|
| 356 |
+ return err |
|
| 357 |
+ } |
|
| 358 |
+ return c.state.transition(&runningState{
|
|
| 359 |
+ c: c, |
|
| 360 |
+ }) |
|
| 366 | 361 |
} |
| 367 | 362 |
|
| 368 | 363 |
func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
|
| 369 | 364 |
return notifyOnOOM(c.cgroupManager.GetPaths()) |
| 370 | 365 |
} |
| 371 | 366 |
|
| 367 |
+func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
|
|
| 368 |
+ return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) |
|
| 369 |
+} |
|
| 370 |
+ |
|
| 372 | 371 |
// XXX debug support, remove when debugging done. |
| 373 | 372 |
func addArgsFromEnv(evar string, args *[]string) {
|
| 374 | 373 |
if e := os.Getenv(evar); e != "" {
|
| ... | ... |
@@ -460,7 +476,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
|
| 460 | 460 |
} |
| 461 | 461 |
|
| 462 | 462 |
if criuOpts.ImagesDirectory == "" {
|
| 463 |
- criuOpts.ImagesDirectory = filepath.Join(c.root, "criu.image") |
|
| 463 |
+ return fmt.Errorf("invalid directory to save checkpoint")
|
|
| 464 | 464 |
} |
| 465 | 465 |
|
| 466 | 466 |
// Since a container can be C/R'ed multiple times, |
| ... | ... |
@@ -579,11 +595,9 @@ func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mo |
| 579 | 579 |
func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
|
| 580 | 580 |
c.m.Lock() |
| 581 | 581 |
defer c.m.Unlock() |
| 582 |
- |
|
| 583 | 582 |
if err := c.checkCriuVersion("1.5.2"); err != nil {
|
| 584 | 583 |
return err |
| 585 | 584 |
} |
| 586 |
- |
|
| 587 | 585 |
if criuOpts.WorkDirectory == "" {
|
| 588 | 586 |
criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") |
| 589 | 587 |
} |
| ... | ... |
@@ -592,22 +606,19 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
|
| 592 | 592 |
if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) {
|
| 593 | 593 |
return err |
| 594 | 594 |
} |
| 595 |
- |
|
| 596 | 595 |
workDir, err := os.Open(criuOpts.WorkDirectory) |
| 597 | 596 |
if err != nil {
|
| 598 | 597 |
return err |
| 599 | 598 |
} |
| 600 | 599 |
defer workDir.Close() |
| 601 |
- |
|
| 602 | 600 |
if criuOpts.ImagesDirectory == "" {
|
| 603 |
- criuOpts.ImagesDirectory = filepath.Join(c.root, "criu.image") |
|
| 601 |
+ return fmt.Errorf("invalid directory to restore checkpoint")
|
|
| 604 | 602 |
} |
| 605 | 603 |
imageDir, err := os.Open(criuOpts.ImagesDirectory) |
| 606 | 604 |
if err != nil {
|
| 607 | 605 |
return err |
| 608 | 606 |
} |
| 609 | 607 |
defer imageDir.Close() |
| 610 |
- |
|
| 611 | 608 |
// CRIU has a few requirements for a root directory: |
| 612 | 609 |
// * it must be a mount point |
| 613 | 610 |
// * its parent must not be overmounted |
| ... | ... |
@@ -618,18 +629,15 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
|
| 618 | 618 |
return err |
| 619 | 619 |
} |
| 620 | 620 |
defer os.Remove(root) |
| 621 |
- |
|
| 622 | 621 |
root, err = filepath.EvalSymlinks(root) |
| 623 | 622 |
if err != nil {
|
| 624 | 623 |
return err |
| 625 | 624 |
} |
| 626 |
- |
|
| 627 | 625 |
err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "") |
| 628 | 626 |
if err != nil {
|
| 629 | 627 |
return err |
| 630 | 628 |
} |
| 631 | 629 |
defer syscall.Unmount(root, syscall.MNT_DETACH) |
| 632 |
- |
|
| 633 | 630 |
t := criurpc.CriuReqType_RESTORE |
| 634 | 631 |
req := &criurpc.CriuReq{
|
| 635 | 632 |
Type: &t, |
| ... | ... |
@@ -697,15 +705,13 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
|
| 697 | 697 |
fds []string |
| 698 | 698 |
fdJSON []byte |
| 699 | 699 |
) |
| 700 |
- |
|
| 701 | 700 |
if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
|
| 702 | 701 |
return err |
| 703 | 702 |
} |
| 704 | 703 |
|
| 705 |
- if err = json.Unmarshal(fdJSON, &fds); err != nil {
|
|
| 704 |
+ if err := json.Unmarshal(fdJSON, &fds); err != nil {
|
|
| 706 | 705 |
return err |
| 707 | 706 |
} |
| 708 |
- |
|
| 709 | 707 |
for i := range fds {
|
| 710 | 708 |
if s := fds[i]; strings.Contains(s, "pipe:") {
|
| 711 | 709 |
inheritFd := new(criurpc.InheritFd) |
| ... | ... |
@@ -714,12 +720,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
|
| 714 | 714 |
req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) |
| 715 | 715 |
} |
| 716 | 716 |
} |
| 717 |
- |
|
| 718 |
- err = c.criuSwrk(process, req, criuOpts, true) |
|
| 719 |
- if err != nil {
|
|
| 720 |
- return err |
|
| 721 |
- } |
|
| 722 |
- return nil |
|
| 717 |
+ return c.criuSwrk(process, req, criuOpts, true) |
|
| 723 | 718 |
} |
| 724 | 719 |
|
| 725 | 720 |
func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
|
| ... | ... |
@@ -914,46 +915,43 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc |
| 914 | 914 |
if notify == nil {
|
| 915 | 915 |
return fmt.Errorf("invalid response: %s", resp.String())
|
| 916 | 916 |
} |
| 917 |
- |
|
| 918 | 917 |
switch {
|
| 919 | 918 |
case notify.GetScript() == "post-dump": |
| 920 |
- if !opts.LeaveRunning {
|
|
| 921 |
- f, err := os.Create(filepath.Join(c.root, "checkpoint")) |
|
| 922 |
- if err != nil {
|
|
| 923 |
- return err |
|
| 924 |
- } |
|
| 925 |
- f.Close() |
|
| 919 |
+ f, err := os.Create(filepath.Join(c.root, "checkpoint")) |
|
| 920 |
+ if err != nil {
|
|
| 921 |
+ return err |
|
| 926 | 922 |
} |
| 927 |
- break |
|
| 928 |
- |
|
| 923 |
+ f.Close() |
|
| 929 | 924 |
case notify.GetScript() == "network-unlock": |
| 930 | 925 |
if err := unlockNetwork(c.config); err != nil {
|
| 931 | 926 |
return err |
| 932 | 927 |
} |
| 933 |
- break |
|
| 934 |
- |
|
| 935 | 928 |
case notify.GetScript() == "network-lock": |
| 936 | 929 |
if err := lockNetwork(c.config); err != nil {
|
| 937 | 930 |
return err |
| 938 | 931 |
} |
| 939 |
- break |
|
| 940 |
- |
|
| 941 | 932 |
case notify.GetScript() == "post-restore": |
| 942 | 933 |
pid := notify.GetPid() |
| 943 | 934 |
r, err := newRestoredProcess(int(pid), fds) |
| 944 | 935 |
if err != nil {
|
| 945 | 936 |
return err |
| 946 | 937 |
} |
| 947 |
- |
|
| 948 |
- // TODO: crosbymichael restore previous process information by saving the init process information in |
|
| 949 |
- // the container's state file or separate process state files. |
|
| 938 |
+ process.ops = r |
|
| 939 |
+ if err := c.state.transition(&restoredState{
|
|
| 940 |
+ imageDir: opts.ImagesDirectory, |
|
| 941 |
+ c: c, |
|
| 942 |
+ }); err != nil {
|
|
| 943 |
+ return err |
|
| 944 |
+ } |
|
| 950 | 945 |
if err := c.updateState(r); err != nil {
|
| 951 | 946 |
return err |
| 952 | 947 |
} |
| 953 |
- process.ops = r |
|
| 954 |
- break |
|
| 948 |
+ if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
|
|
| 949 |
+ if !os.IsNotExist(err) {
|
|
| 950 |
+ logrus.Error(err) |
|
| 951 |
+ } |
|
| 952 |
+ } |
|
| 955 | 953 |
} |
| 956 |
- |
|
| 957 | 954 |
return nil |
| 958 | 955 |
} |
| 959 | 956 |
|
| ... | ... |
@@ -963,65 +961,108 @@ func (c *linuxContainer) updateState(process parentProcess) error {
|
| 963 | 963 |
if err != nil {
|
| 964 | 964 |
return err |
| 965 | 965 |
} |
| 966 |
+ return c.saveState(state) |
|
| 967 |
+} |
|
| 968 |
+ |
|
| 969 |
+func (c *linuxContainer) saveState(s *State) error {
|
|
| 966 | 970 |
f, err := os.Create(filepath.Join(c.root, stateFilename)) |
| 967 | 971 |
if err != nil {
|
| 968 | 972 |
return err |
| 969 | 973 |
} |
| 970 | 974 |
defer f.Close() |
| 971 |
- os.Remove(filepath.Join(c.root, "checkpoint")) |
|
| 972 |
- return utils.WriteJSON(f, state) |
|
| 975 |
+ return utils.WriteJSON(f, s) |
|
| 976 |
+} |
|
| 977 |
+ |
|
| 978 |
+func (c *linuxContainer) deleteState() error {
|
|
| 979 |
+ return os.Remove(filepath.Join(c.root, stateFilename)) |
|
| 973 | 980 |
} |
| 974 | 981 |
|
| 975 | 982 |
func (c *linuxContainer) currentStatus() (Status, error) {
|
| 976 |
- if _, err := os.Stat(filepath.Join(c.root, "checkpoint")); err == nil {
|
|
| 977 |
- return Checkpointed, nil |
|
| 983 |
+ if err := c.refreshState(); err != nil {
|
|
| 984 |
+ return -1, err |
|
| 978 | 985 |
} |
| 986 |
+ return c.state.status(), nil |
|
| 987 |
+} |
|
| 988 |
+ |
|
| 989 |
+// refreshState needs to be called to verify that the current state on the |
|
| 990 |
+// container is what is true. Because consumers of libcontainer can use it |
|
| 991 |
+// out of process we need to verify the container's status based on runtime |
|
| 992 |
+// information and not rely on our in process info. |
|
| 993 |
+func (c *linuxContainer) refreshState() error {
|
|
| 994 |
+ paused, err := c.isPaused() |
|
| 995 |
+ if err != nil {
|
|
| 996 |
+ return err |
|
| 997 |
+ } |
|
| 998 |
+ if paused {
|
|
| 999 |
+ return c.state.transition(&pausedState{c: c})
|
|
| 1000 |
+ } |
|
| 1001 |
+ running, err := c.isRunning() |
|
| 1002 |
+ if err != nil {
|
|
| 1003 |
+ return err |
|
| 1004 |
+ } |
|
| 1005 |
+ if running {
|
|
| 1006 |
+ return c.state.transition(&runningState{c: c})
|
|
| 1007 |
+ } |
|
| 1008 |
+ return c.state.transition(&stoppedState{c: c})
|
|
| 1009 |
+} |
|
| 1010 |
+ |
|
| 1011 |
+func (c *linuxContainer) isRunning() (bool, error) {
|
|
| 979 | 1012 |
if c.initProcess == nil {
|
| 980 |
- return Destroyed, nil |
|
| 1013 |
+ return false, nil |
|
| 981 | 1014 |
} |
| 982 | 1015 |
// return Running if the init process is alive |
| 983 | 1016 |
if err := syscall.Kill(c.initProcess.pid(), 0); err != nil {
|
| 984 | 1017 |
if err == syscall.ESRCH {
|
| 985 |
- return Destroyed, nil |
|
| 1018 |
+ return false, nil |
|
| 986 | 1019 |
} |
| 987 |
- return 0, newSystemError(err) |
|
| 1020 |
+ return false, newSystemError(err) |
|
| 988 | 1021 |
} |
| 989 |
- if c.config.Cgroups != nil && c.config.Cgroups.Resources != nil && c.config.Cgroups.Resources.Freezer == configs.Frozen {
|
|
| 990 |
- return Paused, nil |
|
| 991 |
- } |
|
| 992 |
- return Running, nil |
|
| 1022 |
+ return true, nil |
|
| 993 | 1023 |
} |
| 994 | 1024 |
|
| 995 |
-func (c *linuxContainer) currentState() (*State, error) {
|
|
| 996 |
- status, err := c.currentStatus() |
|
| 1025 |
+func (c *linuxContainer) isPaused() (bool, error) {
|
|
| 1026 |
+ data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state")) |
|
| 997 | 1027 |
if err != nil {
|
| 998 |
- return nil, err |
|
| 999 |
- } |
|
| 1000 |
- if status == Destroyed {
|
|
| 1001 |
- return nil, newGenericError(fmt.Errorf("container destroyed"), ContainerNotExists)
|
|
| 1028 |
+ if os.IsNotExist(err) {
|
|
| 1029 |
+ return false, nil |
|
| 1030 |
+ } |
|
| 1031 |
+ return false, newSystemError(err) |
|
| 1002 | 1032 |
} |
| 1003 |
- startTime, err := c.initProcess.startTime() |
|
| 1004 |
- if err != nil {
|
|
| 1005 |
- return nil, newSystemError(err) |
|
| 1033 |
+ return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
|
|
| 1034 |
+} |
|
| 1035 |
+ |
|
| 1036 |
+func (c *linuxContainer) currentState() (*State, error) {
|
|
| 1037 |
+ var ( |
|
| 1038 |
+ startTime string |
|
| 1039 |
+ externalDescriptors []string |
|
| 1040 |
+ pid = -1 |
|
| 1041 |
+ ) |
|
| 1042 |
+ if c.initProcess != nil {
|
|
| 1043 |
+ pid = c.initProcess.pid() |
|
| 1044 |
+ startTime, _ = c.initProcess.startTime() |
|
| 1045 |
+ externalDescriptors = c.initProcess.externalDescriptors() |
|
| 1006 | 1046 |
} |
| 1007 | 1047 |
state := &State{
|
| 1008 | 1048 |
BaseState: BaseState{
|
| 1009 | 1049 |
ID: c.ID(), |
| 1010 | 1050 |
Config: *c.config, |
| 1011 |
- InitProcessPid: c.initProcess.pid(), |
|
| 1051 |
+ InitProcessPid: pid, |
|
| 1012 | 1052 |
InitProcessStartTime: startTime, |
| 1053 |
+ Created: c.created, |
|
| 1013 | 1054 |
}, |
| 1014 | 1055 |
CgroupPaths: c.cgroupManager.GetPaths(), |
| 1015 | 1056 |
NamespacePaths: make(map[configs.NamespaceType]string), |
| 1016 |
- ExternalDescriptors: c.initProcess.externalDescriptors(), |
|
| 1057 |
+ ExternalDescriptors: externalDescriptors, |
|
| 1017 | 1058 |
} |
| 1018 |
- for _, ns := range c.config.Namespaces {
|
|
| 1019 |
- state.NamespacePaths[ns.Type] = ns.GetPath(c.initProcess.pid()) |
|
| 1020 |
- } |
|
| 1021 |
- for _, nsType := range configs.NamespaceTypes() {
|
|
| 1022 |
- if _, ok := state.NamespacePaths[nsType]; !ok {
|
|
| 1023 |
- ns := configs.Namespace{Type: nsType}
|
|
| 1024 |
- state.NamespacePaths[ns.Type] = ns.GetPath(c.initProcess.pid()) |
|
| 1059 |
+ if pid > 0 {
|
|
| 1060 |
+ for _, ns := range c.config.Namespaces {
|
|
| 1061 |
+ state.NamespacePaths[ns.Type] = ns.GetPath(pid) |
|
| 1062 |
+ } |
|
| 1063 |
+ for _, nsType := range configs.NamespaceTypes() {
|
|
| 1064 |
+ if _, ok := state.NamespacePaths[nsType]; !ok {
|
|
| 1065 |
+ ns := configs.Namespace{Type: nsType}
|
|
| 1066 |
+ state.NamespacePaths[ns.Type] = ns.GetPath(pid) |
|
| 1067 |
+ } |
|
| 1025 | 1068 |
} |
| 1026 | 1069 |
} |
| 1027 | 1070 |
return state, nil |
| ... | ... |
@@ -16,9 +16,10 @@ const ( |
| 16 | 16 |
ContainerPaused |
| 17 | 17 |
ContainerNotStopped |
| 18 | 18 |
ContainerNotRunning |
| 19 |
+ ContainerNotPaused |
|
| 19 | 20 |
|
| 20 | 21 |
// Process errors |
| 21 |
- ProcessNotExecuted |
|
| 22 |
+ NoProcessOps |
|
| 22 | 23 |
|
| 23 | 24 |
// Common errors |
| 24 | 25 |
ConfigInvalid |
| ... | ... |
@@ -46,6 +47,10 @@ func (c ErrorCode) String() string {
|
| 46 | 46 |
return "Container is not running" |
| 47 | 47 |
case ConsoleExists: |
| 48 | 48 |
return "Console exists for process" |
| 49 |
+ case ContainerNotPaused: |
|
| 50 |
+ return "Container is not paused" |
|
| 51 |
+ case NoProcessOps: |
|
| 52 |
+ return "No process operations" |
|
| 49 | 53 |
default: |
| 50 | 54 |
return "Unknown error" |
| 51 | 55 |
} |
| ... | ... |
@@ -166,7 +166,7 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err |
| 166 | 166 |
if err := os.MkdirAll(containerRoot, 0700); err != nil {
|
| 167 | 167 |
return nil, newGenericError(err, SystemError) |
| 168 | 168 |
} |
| 169 |
- return &linuxContainer{
|
|
| 169 |
+ c := &linuxContainer{
|
|
| 170 | 170 |
id: id, |
| 171 | 171 |
root: containerRoot, |
| 172 | 172 |
config: config, |
| ... | ... |
@@ -174,7 +174,9 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err |
| 174 | 174 |
initArgs: l.InitArgs, |
| 175 | 175 |
criuPath: l.CriuPath, |
| 176 | 176 |
cgroupManager: l.NewCgroupsManager(config.Cgroups, nil), |
| 177 |
- }, nil |
|
| 177 |
+ } |
|
| 178 |
+ c.state = &stoppedState{c: c}
|
|
| 179 |
+ return c, nil |
|
| 178 | 180 |
} |
| 179 | 181 |
|
| 180 | 182 |
func (l *LinuxFactory) Load(id string) (Container, error) {
|
| ... | ... |
@@ -191,7 +193,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
|
| 191 | 191 |
processStartTime: state.InitProcessStartTime, |
| 192 | 192 |
fds: state.ExternalDescriptors, |
| 193 | 193 |
} |
| 194 |
- return &linuxContainer{
|
|
| 194 |
+ c := &linuxContainer{
|
|
| 195 | 195 |
initProcess: r, |
| 196 | 196 |
id: id, |
| 197 | 197 |
config: &state.Config, |
| ... | ... |
@@ -200,7 +202,13 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
|
| 200 | 200 |
criuPath: l.CriuPath, |
| 201 | 201 |
cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths), |
| 202 | 202 |
root: containerRoot, |
| 203 |
- }, nil |
|
| 203 |
+ created: state.Created, |
|
| 204 |
+ } |
|
| 205 |
+ c.state = &createdState{c: c, s: Created}
|
|
| 206 |
+ if err := c.refreshState(); err != nil {
|
|
| 207 |
+ return nil, err |
|
| 208 |
+ } |
|
| 209 |
+ return c, nil |
|
| 204 | 210 |
} |
| 205 | 211 |
|
| 206 | 212 |
func (l *LinuxFactory) Type() string {
|
| ... | ... |
@@ -222,18 +230,25 @@ func (l *LinuxFactory) StartInitialization() (err error) {
|
| 222 | 222 |
// clear the current process's environment to clean any libcontainer |
| 223 | 223 |
// specific env vars. |
| 224 | 224 |
os.Clearenv() |
| 225 |
+ var i initer |
|
| 225 | 226 |
defer func() {
|
| 226 |
- // if we have an error during the initialization of the container's init then send it back to the |
|
| 227 |
- // parent process in the form of an initError. |
|
| 228 |
- if err != nil {
|
|
| 229 |
- if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil {
|
|
| 227 |
+ // We have an error during the initialization of the container's init, |
|
| 228 |
+ // send it back to the parent process in the form of an initError. |
|
| 229 |
+ // If container's init successed, syscall.Exec will not return, hence |
|
| 230 |
+ // this defer function will never be called. |
|
| 231 |
+ if _, ok := i.(*linuxStandardInit); ok {
|
|
| 232 |
+ // Synchronisation only necessary for standard init. |
|
| 233 |
+ if err := utils.WriteJSON(pipe, syncT{procError}); err != nil {
|
|
| 230 | 234 |
panic(err) |
| 231 | 235 |
} |
| 232 | 236 |
} |
| 237 |
+ if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil {
|
|
| 238 |
+ panic(err) |
|
| 239 |
+ } |
|
| 233 | 240 |
// ensure that this pipe is always closed |
| 234 | 241 |
pipe.Close() |
| 235 | 242 |
}() |
| 236 |
- i, err := newContainerInit(it, pipe) |
|
| 243 |
+ i, err = newContainerInit(it, pipe) |
|
| 237 | 244 |
if err != nil {
|
| 238 | 245 |
return err |
| 239 | 246 |
} |
| ... | ... |
@@ -9,6 +9,18 @@ import ( |
| 9 | 9 |
"github.com/opencontainers/runc/libcontainer/stacktrace" |
| 10 | 10 |
) |
| 11 | 11 |
|
| 12 |
+type syncType uint8 |
|
| 13 |
+ |
|
| 14 |
+const ( |
|
| 15 |
+ procReady syncType = iota |
|
| 16 |
+ procError |
|
| 17 |
+ procRun |
|
| 18 |
+) |
|
| 19 |
+ |
|
| 20 |
+type syncT struct {
|
|
| 21 |
+ Type syncType `json:"type"` |
|
| 22 |
+} |
|
| 23 |
+ |
|
| 12 | 24 |
var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
|
| 13 | 25 |
Code: {{.ECode}}
|
| 14 | 26 |
{{if .Message }}
|
| ... | ... |
@@ -5,6 +5,7 @@ package libcontainer |
| 5 | 5 |
import ( |
| 6 | 6 |
"encoding/json" |
| 7 | 7 |
"fmt" |
| 8 |
+ "io" |
|
| 8 | 9 |
"io/ioutil" |
| 9 | 10 |
"net" |
| 10 | 11 |
"os" |
| ... | ... |
@@ -73,6 +74,7 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) {
|
| 73 | 73 |
}, nil |
| 74 | 74 |
case initStandard: |
| 75 | 75 |
return &linuxStandardInit{
|
| 76 |
+ pipe: pipe, |
|
| 76 | 77 |
parentPid: syscall.Getppid(), |
| 77 | 78 |
config: config, |
| 78 | 79 |
}, nil |
| ... | ... |
@@ -140,6 +142,27 @@ func finalizeNamespace(config *initConfig) error {
|
| 140 | 140 |
return nil |
| 141 | 141 |
} |
| 142 | 142 |
|
| 143 |
+// syncParentReady sends to the given pipe a JSON payload which indicates that |
|
| 144 |
+// the init is ready to Exec the child process. It then waits for the parent to |
|
| 145 |
+// indicate that it is cleared to Exec. |
|
| 146 |
+func syncParentReady(pipe io.ReadWriter) error {
|
|
| 147 |
+ // Tell parent. |
|
| 148 |
+ if err := utils.WriteJSON(pipe, syncT{procReady}); err != nil {
|
|
| 149 |
+ return err |
|
| 150 |
+ } |
|
| 151 |
+ // Wait for parent to give the all-clear. |
|
| 152 |
+ var procSync syncT |
|
| 153 |
+ if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
|
|
| 154 |
+ if err == io.EOF {
|
|
| 155 |
+ return fmt.Errorf("parent closed synchronisation channel")
|
|
| 156 |
+ } |
|
| 157 |
+ if procSync.Type != procRun {
|
|
| 158 |
+ return fmt.Errorf("invalid synchronisation flag from parent")
|
|
| 159 |
+ } |
|
| 160 |
+ } |
|
| 161 |
+ return nil |
|
| 162 |
+} |
|
| 163 |
+ |
|
| 143 | 164 |
// joinExistingNamespaces gets all the namespace paths specified for the container and |
| 144 | 165 |
// does a setns on the namespace fd so that the current process joins the namespace. |
| 145 | 166 |
func joinExistingNamespaces(namespaces []configs.Namespace) error {
|
| ... | ... |
@@ -309,7 +332,7 @@ func killCgroupProcesses(m cgroups.Manager) error {
|
| 309 | 309 |
if err := m.Freeze(configs.Frozen); err != nil {
|
| 310 | 310 |
logrus.Warn(err) |
| 311 | 311 |
} |
| 312 |
- pids, err := m.GetPids() |
|
| 312 |
+ pids, err := m.GetAllPids() |
|
| 313 | 313 |
if err != nil {
|
| 314 | 314 |
m.Freeze(configs.Thawed) |
| 315 | 315 |
return err |
| 316 | 316 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,67 @@ |
| 0 |
+// +build linux |
|
| 1 |
+ |
|
| 2 |
+package keyctl |
|
| 3 |
+ |
|
| 4 |
+import ( |
|
| 5 |
+ "fmt" |
|
| 6 |
+ "syscall" |
|
| 7 |
+ "strings" |
|
| 8 |
+ "strconv" |
|
| 9 |
+ "unsafe" |
|
| 10 |
+) |
|
| 11 |
+ |
|
| 12 |
+const KEYCTL_JOIN_SESSION_KEYRING = 1 |
|
| 13 |
+const KEYCTL_SETPERM = 5 |
|
| 14 |
+const KEYCTL_DESCRIBE = 6 |
|
| 15 |
+ |
|
| 16 |
+type KeySerial uint32 |
|
| 17 |
+ |
|
| 18 |
+func JoinSessionKeyring(name string) (KeySerial, error) {
|
|
| 19 |
+ var _name *byte = nil |
|
| 20 |
+ var err error |
|
| 21 |
+ |
|
| 22 |
+ if len(name) > 0 {
|
|
| 23 |
+ _name, err = syscall.BytePtrFromString(name) |
|
| 24 |
+ if err != nil {
|
|
| 25 |
+ return KeySerial(0), err |
|
| 26 |
+ } |
|
| 27 |
+ } |
|
| 28 |
+ |
|
| 29 |
+ sessKeyId, _, errn := syscall.Syscall(syscall.SYS_KEYCTL, KEYCTL_JOIN_SESSION_KEYRING, uintptr(unsafe.Pointer(_name)), 0) |
|
| 30 |
+ if errn != 0 {
|
|
| 31 |
+ return 0, fmt.Errorf("could not create session key: %v", errn)
|
|
| 32 |
+ } |
|
| 33 |
+ return KeySerial(sessKeyId), nil |
|
| 34 |
+} |
|
| 35 |
+ |
|
| 36 |
+// modify permissions on a keyring by reading the current permissions, |
|
| 37 |
+// anding the bits with the given mask (clearing permissions) and setting |
|
| 38 |
+// additional permission bits |
|
| 39 |
+func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
|
|
| 40 |
+ dest := make([]byte, 1024) |
|
| 41 |
+ destBytes := unsafe.Pointer(&dest[0]) |
|
| 42 |
+ |
|
| 43 |
+ if _, _, err := syscall.Syscall6(syscall.SYS_KEYCTL, uintptr(KEYCTL_DESCRIBE), uintptr(ringId), uintptr(destBytes), uintptr(len(dest)), 0, 0); err != 0 {
|
|
| 44 |
+ return err |
|
| 45 |
+ } |
|
| 46 |
+ |
|
| 47 |
+ res := strings.Split(string(dest), ";") |
|
| 48 |
+ if len(res) < 5 {
|
|
| 49 |
+ return fmt.Errorf("Destination buffer for key description is too small")
|
|
| 50 |
+ } |
|
| 51 |
+ |
|
| 52 |
+ // parse permissions |
|
| 53 |
+ perm64, err := strconv.ParseUint(res[3], 16, 32) |
|
| 54 |
+ if err != nil {
|
|
| 55 |
+ return err |
|
| 56 |
+ } |
|
| 57 |
+ |
|
| 58 |
+ perm := (uint32(perm64) & mask) | setbits |
|
| 59 |
+ |
|
| 60 |
+ if _, _, err := syscall.Syscall(syscall.SYS_KEYCTL, uintptr(KEYCTL_SETPERM), uintptr(ringId), uintptr(perm)); err != 0 {
|
|
| 61 |
+ return err |
|
| 62 |
+ } |
|
| 63 |
+ |
|
| 64 |
+ return nil |
|
| 65 |
+} |
|
| 66 |
+ |
| ... | ... |
@@ -12,31 +12,32 @@ import ( |
| 12 | 12 |
|
| 13 | 13 |
const oomCgroupName = "memory" |
| 14 | 14 |
|
| 15 |
-// notifyOnOOM returns channel on which you can expect event about OOM, |
|
| 16 |
-// if process died without OOM this channel will be closed. |
|
| 17 |
-// s is current *libcontainer.State for container. |
|
| 18 |
-func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
|
|
| 19 |
- dir := paths[oomCgroupName] |
|
| 20 |
- if dir == "" {
|
|
| 21 |
- return nil, fmt.Errorf("There is no path for %q in state", oomCgroupName)
|
|
| 22 |
- } |
|
| 23 |
- oomControl, err := os.Open(filepath.Join(dir, "memory.oom_control")) |
|
| 15 |
+type PressureLevel uint |
|
| 16 |
+ |
|
| 17 |
+const ( |
|
| 18 |
+ LowPressure PressureLevel = iota |
|
| 19 |
+ MediumPressure |
|
| 20 |
+ CriticalPressure |
|
| 21 |
+) |
|
| 22 |
+ |
|
| 23 |
+func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct{}, error) {
|
|
| 24 |
+ evFile, err := os.Open(filepath.Join(cgDir, evName)) |
|
| 24 | 25 |
if err != nil {
|
| 25 | 26 |
return nil, err |
| 26 | 27 |
} |
| 27 | 28 |
fd, _, syserr := syscall.RawSyscall(syscall.SYS_EVENTFD2, 0, syscall.FD_CLOEXEC, 0) |
| 28 | 29 |
if syserr != 0 {
|
| 29 |
- oomControl.Close() |
|
| 30 |
+ evFile.Close() |
|
| 30 | 31 |
return nil, syserr |
| 31 | 32 |
} |
| 32 | 33 |
|
| 33 | 34 |
eventfd := os.NewFile(fd, "eventfd") |
| 34 | 35 |
|
| 35 |
- eventControlPath := filepath.Join(dir, "cgroup.event_control") |
|
| 36 |
- data := fmt.Sprintf("%d %d", eventfd.Fd(), oomControl.Fd())
|
|
| 36 |
+ eventControlPath := filepath.Join(cgDir, "cgroup.event_control") |
|
| 37 |
+ data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg)
|
|
| 37 | 38 |
if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil {
|
| 38 | 39 |
eventfd.Close() |
| 39 |
- oomControl.Close() |
|
| 40 |
+ evFile.Close() |
|
| 40 | 41 |
return nil, err |
| 41 | 42 |
} |
| 42 | 43 |
ch := make(chan struct{})
|
| ... | ... |
@@ -44,7 +45,7 @@ func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
|
| 44 | 44 |
defer func() {
|
| 45 | 45 |
close(ch) |
| 46 | 46 |
eventfd.Close() |
| 47 |
- oomControl.Close() |
|
| 47 |
+ evFile.Close() |
|
| 48 | 48 |
}() |
| 49 | 49 |
buf := make([]byte, 8) |
| 50 | 50 |
for {
|
| ... | ... |
@@ -61,3 +62,28 @@ func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
|
| 61 | 61 |
}() |
| 62 | 62 |
return ch, nil |
| 63 | 63 |
} |
| 64 |
+ |
|
| 65 |
+// notifyOnOOM returns channel on which you can expect event about OOM, |
|
| 66 |
+// if process died without OOM this channel will be closed. |
|
| 67 |
+func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
|
|
| 68 |
+ dir := paths[oomCgroupName] |
|
| 69 |
+ if dir == "" {
|
|
| 70 |
+ return nil, fmt.Errorf("path %q missing", oomCgroupName)
|
|
| 71 |
+ } |
|
| 72 |
+ |
|
| 73 |
+ return registerMemoryEvent(dir, "memory.oom_control", "") |
|
| 74 |
+} |
|
| 75 |
+ |
|
| 76 |
+func notifyMemoryPressure(paths map[string]string, level PressureLevel) (<-chan struct{}, error) {
|
|
| 77 |
+ dir := paths[oomCgroupName] |
|
| 78 |
+ if dir == "" {
|
|
| 79 |
+ return nil, fmt.Errorf("path %q missing", oomCgroupName)
|
|
| 80 |
+ } |
|
| 81 |
+ |
|
| 82 |
+ if level > CriticalPressure {
|
|
| 83 |
+ return nil, fmt.Errorf("invalid pressure level %d", level)
|
|
| 84 |
+ } |
|
| 85 |
+ |
|
| 86 |
+ levelStr := []string{"low", "medium", "critical"}[level]
|
|
| 87 |
+ return registerMemoryEvent(dir, "memory.pressure_level", levelStr) |
|
| 88 |
+} |
| ... | ... |
@@ -55,7 +55,7 @@ type Process struct {
|
| 55 | 55 |
// Wait releases any resources associated with the Process |
| 56 | 56 |
func (p Process) Wait() (*os.ProcessState, error) {
|
| 57 | 57 |
if p.ops == nil {
|
| 58 |
- return nil, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
|
|
| 58 |
+ return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
|
|
| 59 | 59 |
} |
| 60 | 60 |
return p.ops.wait() |
| 61 | 61 |
} |
| ... | ... |
@@ -65,7 +65,7 @@ func (p Process) Pid() (int, error) {
|
| 65 | 65 |
// math.MinInt32 is returned here, because it's invalid value |
| 66 | 66 |
// for the kill() system call. |
| 67 | 67 |
if p.ops == nil {
|
| 68 |
- return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
|
|
| 68 |
+ return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
|
|
| 69 | 69 |
} |
| 70 | 70 |
return p.ops.pid(), nil |
| 71 | 71 |
} |
| ... | ... |
@@ -73,7 +73,7 @@ func (p Process) Pid() (int, error) {
|
| 73 | 73 |
// Signal sends a signal to the Process. |
| 74 | 74 |
func (p Process) Signal(sig os.Signal) error {
|
| 75 | 75 |
if p.ops == nil {
|
| 76 |
- return newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
|
|
| 76 |
+ return newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
|
|
| 77 | 77 |
} |
| 78 | 78 |
return p.ops.signal(sig) |
| 79 | 79 |
} |
| ... | ... |
@@ -5,6 +5,7 @@ package libcontainer |
| 5 | 5 |
import ( |
| 6 | 6 |
"encoding/json" |
| 7 | 7 |
"errors" |
| 8 |
+ "fmt" |
|
| 8 | 9 |
"io" |
| 9 | 10 |
"os" |
| 10 | 11 |
"os/exec" |
| ... | ... |
@@ -87,6 +88,7 @@ func (p *setnsProcess) start() (err error) {
|
| 87 | 87 |
if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
|
| 88 | 88 |
return newSystemError(err) |
| 89 | 89 |
} |
| 90 |
+ |
|
| 90 | 91 |
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
|
| 91 | 92 |
return newSystemError(err) |
| 92 | 93 |
} |
| ... | ... |
@@ -96,6 +98,7 @@ func (p *setnsProcess) start() (err error) {
|
| 96 | 96 |
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
|
| 97 | 97 |
return newSystemError(err) |
| 98 | 98 |
} |
| 99 |
+ // Must be done after Shutdown so the child will exit and we can wait for it. |
|
| 99 | 100 |
if ierr != nil {
|
| 100 | 101 |
p.wait() |
| 101 | 102 |
return newSystemError(ierr) |
| ... | ... |
@@ -199,7 +202,6 @@ func (p *initProcess) start() (err error) {
|
| 199 | 199 |
return newSystemError(err) |
| 200 | 200 |
} |
| 201 | 201 |
p.setExternalDescriptors(fds) |
| 202 |
- |
|
| 203 | 202 |
// Do this before syncing with child so that no children |
| 204 | 203 |
// can escape the cgroup |
| 205 | 204 |
if err := p.manager.Apply(p.pid()); err != nil {
|
| ... | ... |
@@ -230,13 +232,54 @@ func (p *initProcess) start() (err error) {
|
| 230 | 230 |
if err := p.sendConfig(); err != nil {
|
| 231 | 231 |
return newSystemError(err) |
| 232 | 232 |
} |
| 233 |
- // wait for the child process to fully complete and receive an error message |
|
| 234 |
- // if one was encountered
|
| 235 |
- var ierr *genericError |
|
| 236 |
- if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
|
|
| 233 |
+ var ( |
|
| 234 |
+ procSync syncT |
|
| 235 |
+ sentRun bool |
|
| 236 |
+ ierr *genericError |
|
| 237 |
+ ) |
|
| 238 |
+ |
|
| 239 |
+loop: |
|
| 240 |
+ for {
|
|
| 241 |
+ if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil {
|
|
| 242 |
+ if err == io.EOF {
|
|
| 243 |
+ break loop |
|
| 244 |
+ } |
|
| 245 |
+ return newSystemError(err) |
|
| 246 |
+ } |
|
| 247 |
+ switch procSync.Type {
|
|
| 248 |
+ case procReady: |
|
| 249 |
+ if err := p.manager.Set(p.config.Config); err != nil {
|
|
| 250 |
+ return newSystemError(err) |
|
| 251 |
+ } |
|
| 252 |
+ // Sync with child. |
|
| 253 |
+ if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
|
|
| 254 |
+ return newSystemError(err) |
|
| 255 |
+ } |
|
| 256 |
+ sentRun = true |
|
| 257 |
+ case procError: |
|
| 258 |
+ // wait for the child process to fully complete and receive an error message |
|
| 259 |
+ // if one was encountered
|
| 260 |
+ if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
|
|
| 261 |
+ return newSystemError(err) |
|
| 262 |
+ } |
|
| 263 |
+ if ierr != nil {
|
|
| 264 |
+ break loop |
|
| 265 |
+ } |
|
| 266 |
+ // Programmer error. |
|
| 267 |
+ panic("No error following JSON procError payload.")
|
|
| 268 |
+ default: |
|
| 269 |
+ return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child"))
|
|
| 270 |
+ } |
|
| 271 |
+ } |
|
| 272 |
+ if !sentRun {
|
|
| 273 |
+ return newSystemError(fmt.Errorf("could not synchronise with container process"))
|
|
| 274 |
+ } |
|
| 275 |
+ if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
|
|
| 237 | 276 |
return newSystemError(err) |
| 238 | 277 |
} |
| 278 |
+ // Must be done after Shutdown so the child will exit and we can wait for it. |
|
| 239 | 279 |
if ierr != nil {
|
| 280 |
+ p.wait() |
|
| 240 | 281 |
return newSystemError(ierr) |
| 241 | 282 |
} |
| 242 | 283 |
return nil |
| ... | ... |
@@ -270,12 +313,10 @@ func (p *initProcess) startTime() (string, error) {
|
| 270 | 270 |
} |
| 271 | 271 |
|
| 272 | 272 |
func (p *initProcess) sendConfig() error {
|
| 273 |
- // send the state to the container's init process then shutdown writes for the parent |
|
| 274 |
- if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
|
|
| 275 |
- return err |
|
| 276 |
- } |
|
| 277 |
- // shutdown writes for the parent side of the pipe |
|
| 278 |
- return syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR) |
|
| 273 |
+ // send the config to the container's init process; we don't use JSON Encode
|
| 274 |
+ // here because there might be a problem in the JSON decoder in some cases, see:
|
| 275 |
+ // https://github.com/docker/docker/issues/14203#issuecomment-174177790 |
|
| 276 |
+ return utils.WriteJSON(p.parentPipe, p.config) |
|
| 279 | 277 |
} |
| 280 | 278 |
|
| 281 | 279 |
func (p *initProcess) createNetworkInterfaces() error {
|
| ... | ... |
@@ -18,6 +18,8 @@ import ( |
| 18 | 18 |
"github.com/opencontainers/runc/libcontainer/cgroups" |
| 19 | 19 |
"github.com/opencontainers/runc/libcontainer/configs" |
| 20 | 20 |
"github.com/opencontainers/runc/libcontainer/label" |
| 21 |
+ "github.com/opencontainers/runc/libcontainer/system" |
|
| 22 |
+ libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" |
|
| 21 | 23 |
) |
| 22 | 24 |
|
| 23 | 25 |
const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV |
| ... | ... |
@@ -293,12 +295,31 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
|
| 293 | 293 |
// checkMountDestination checks to ensure that the mount destination is not over the top of /proc. |
| 294 | 294 |
// dest is required to be an abs path and have any symlinks resolved before calling this function. |
| 295 | 295 |
func checkMountDestination(rootfs, dest string) error {
|
| 296 |
- if filepath.Clean(rootfs) == filepath.Clean(dest) {
|
|
| 296 |
+ if libcontainerUtils.CleanPath(rootfs) == libcontainerUtils.CleanPath(dest) {
|
|
| 297 | 297 |
return fmt.Errorf("mounting into / is prohibited")
|
| 298 | 298 |
} |
| 299 | 299 |
invalidDestinations := []string{
|
| 300 | 300 |
"/proc", |
| 301 | 301 |
} |
| 302 |
+ // Whitelist: these must be subdirectories of the invalid destinations
|
| 303 |
+ validDestinations := []string{
|
|
| 304 |
+ // These entries can be bind mounted by files emulated by fuse, |
|
| 305 |
+ // so that commands like top and free display stats inside the container.
|
| 306 |
+ "/proc/cpuinfo", |
|
| 307 |
+ "/proc/diskstats", |
|
| 308 |
+ "/proc/meminfo", |
|
| 309 |
+ "/proc/stat", |
|
| 310 |
+ "/proc/net/dev", |
|
| 311 |
+ } |
|
| 312 |
+ for _, valid := range validDestinations {
|
|
| 313 |
+ path, err := filepath.Rel(filepath.Join(rootfs, valid), dest) |
|
| 314 |
+ if err != nil {
|
|
| 315 |
+ return err |
|
| 316 |
+ } |
|
| 317 |
+ if path == "." {
|
|
| 318 |
+ return nil |
|
| 319 |
+ } |
|
| 320 |
+ } |
|
| 302 | 321 |
for _, invalid := range invalidDestinations {
|
| 303 | 322 |
path, err := filepath.Rel(filepath.Join(rootfs, invalid), dest) |
| 304 | 323 |
if err != nil {
|
| ... | ... |
@@ -321,7 +342,7 @@ func setupDevSymlinks(rootfs string) error {
|
| 321 | 321 |
// kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink |
| 322 | 322 |
// in /dev if it exists in /proc. |
| 323 | 323 |
if _, err := os.Stat("/proc/kcore"); err == nil {
|
| 324 |
- links = append(links, [2]string{"/proc/kcore", "/dev/kcore"})
|
|
| 324 |
+ links = append(links, [2]string{"/proc/kcore", "/dev/core"})
|
|
| 325 | 325 |
} |
| 326 | 326 |
for _, link := range links {
|
| 327 | 327 |
var ( |
| ... | ... |
@@ -365,11 +386,12 @@ func reOpenDevNull() error {
|
| 365 | 365 |
|
| 366 | 366 |
// Create the device nodes in the container. |
| 367 | 367 |
func createDevices(config *configs.Config) error {
|
| 368 |
+ useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER) |
|
| 368 | 369 |
oldMask := syscall.Umask(0000) |
| 369 | 370 |
for _, node := range config.Devices {
|
| 370 | 371 |
// containers running in a user namespace are not allowed to mknod |
| 371 | 372 |
// devices so we can just bind mount it from the host. |
| 372 |
- if err := createDeviceNode(config.Rootfs, node, config.Namespaces.Contains(configs.NEWUSER)); err != nil {
|
|
| 373 |
+ if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil {
|
|
| 373 | 374 |
syscall.Umask(oldMask) |
| 374 | 375 |
return err |
| 375 | 376 |
} |
| ... | ... |
@@ -231,10 +231,14 @@ func ReserveLabel(scon string) {
|
| 231 | 231 |
} |
| 232 | 232 |
} |
| 233 | 233 |
|
| 234 |
+func selinuxEnforcePath() string {
|
|
| 235 |
+ return fmt.Sprintf("%s/enforce", selinuxPath)
|
|
| 236 |
+} |
|
| 237 |
+ |
|
| 234 | 238 |
func SelinuxGetEnforce() int {
|
| 235 | 239 |
var enforce int |
| 236 | 240 |
|
| 237 |
- enforceS, err := readCon(fmt.Sprintf("%s/enforce", selinuxPath))
|
|
| 241 |
+ enforceS, err := readCon(selinuxEnforcePath()) |
|
| 238 | 242 |
if err != nil {
|
| 239 | 243 |
return -1 |
| 240 | 244 |
} |
| ... | ... |
@@ -246,6 +250,10 @@ func SelinuxGetEnforce() int {
|
| 246 | 246 |
return enforce |
| 247 | 247 |
} |
| 248 | 248 |
|
| 249 |
+func SelinuxSetEnforce(mode int) error {
|
|
| 250 |
+ return writeCon(selinuxEnforcePath(), fmt.Sprintf("%d", mode))
|
|
| 251 |
+} |
|
| 252 |
+ |
|
| 249 | 253 |
func SelinuxGetEnforceMode() int {
|
| 250 | 254 |
switch readConfig(selinuxTag) {
|
| 251 | 255 |
case "enforcing": |
| ... | ... |
@@ -6,6 +6,7 @@ import ( |
| 6 | 6 |
"os" |
| 7 | 7 |
|
| 8 | 8 |
"github.com/opencontainers/runc/libcontainer/apparmor" |
| 9 |
+ "github.com/opencontainers/runc/libcontainer/keys" |
|
| 9 | 10 |
"github.com/opencontainers/runc/libcontainer/label" |
| 10 | 11 |
"github.com/opencontainers/runc/libcontainer/seccomp" |
| 11 | 12 |
"github.com/opencontainers/runc/libcontainer/system" |
| ... | ... |
@@ -18,12 +19,21 @@ type linuxSetnsInit struct {
|
| 18 | 18 |
} |
| 19 | 19 |
|
| 20 | 20 |
func (l *linuxSetnsInit) Init() error {
|
| 21 |
+ // do not inherit the parent's session keyring |
|
| 22 |
+ if _, err := keyctl.JoinSessionKeyring("_ses"); err != nil {
|
|
| 23 |
+ return err |
|
| 24 |
+ } |
|
| 21 | 25 |
if err := setupRlimits(l.config.Config); err != nil {
|
| 22 | 26 |
return err |
| 23 | 27 |
} |
| 24 | 28 |
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
|
| 25 | 29 |
return err |
| 26 | 30 |
} |
| 31 |
+ if l.config.Config.NoNewPrivileges {
|
|
| 32 |
+ if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
|
|
| 33 |
+ return err |
|
| 34 |
+ } |
|
| 35 |
+ } |
|
| 27 | 36 |
if l.config.Config.Seccomp != nil {
|
| 28 | 37 |
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
|
| 29 | 38 |
return err |
| ... | ... |
@@ -3,22 +3,41 @@ |
| 3 | 3 |
package libcontainer |
| 4 | 4 |
|
| 5 | 5 |
import ( |
| 6 |
+ "io" |
|
| 6 | 7 |
"os" |
| 7 | 8 |
"syscall" |
| 8 | 9 |
|
| 9 | 10 |
"github.com/opencontainers/runc/libcontainer/apparmor" |
| 10 | 11 |
"github.com/opencontainers/runc/libcontainer/configs" |
| 12 |
+ "github.com/opencontainers/runc/libcontainer/keys" |
|
| 11 | 13 |
"github.com/opencontainers/runc/libcontainer/label" |
| 12 | 14 |
"github.com/opencontainers/runc/libcontainer/seccomp" |
| 13 | 15 |
"github.com/opencontainers/runc/libcontainer/system" |
| 14 | 16 |
) |
| 15 | 17 |
|
| 16 | 18 |
type linuxStandardInit struct {
|
| 19 |
+ pipe io.ReadWriter |
|
| 17 | 20 |
parentPid int |
| 18 | 21 |
config *initConfig |
| 19 | 22 |
} |
| 20 | 23 |
|
| 24 |
+// PR_SET_NO_NEW_PRIVS isn't exposed in Golang so we define it ourselves copying the value |
|
| 25 |
+// from the kernel
|
| 26 |
+const PR_SET_NO_NEW_PRIVS = 0x26 |
|
| 27 |
+ |
|
| 21 | 28 |
func (l *linuxStandardInit) Init() error {
|
| 29 |
+ // do not inherit the parent's session keyring |
|
| 30 |
+ sessKeyId, err := keyctl.JoinSessionKeyring("")
|
|
| 31 |
+ if err != nil {
|
|
| 32 |
+ return err |
|
| 33 |
+ } |
|
| 34 |
+ // make session keyring searchable
|
| 35 |
+ // without user ns we need 'UID' search permissions |
|
| 36 |
+ // with user ns we need 'other' search permissions |
|
| 37 |
+ if err := keyctl.ModKeyringPerm(sessKeyId, 0xffffffff, 0x080008); err != nil {
|
|
| 38 |
+ return err |
|
| 39 |
+ } |
|
| 40 |
+ |
|
| 22 | 41 |
// join any namespaces via a path to the namespace fd if provided |
| 23 | 42 |
if err := joinExistingNamespaces(l.config.Config.Namespaces); err != nil {
|
| 24 | 43 |
return err |
| ... | ... |
@@ -50,7 +69,6 @@ func (l *linuxStandardInit) Init() error {
|
| 50 | 50 |
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
|
| 51 | 51 |
return err |
| 52 | 52 |
} |
| 53 |
- |
|
| 54 | 53 |
label.Init() |
| 55 | 54 |
// InitializeMountNamespace() can be executed only for a new mount namespace |
| 56 | 55 |
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
|
| ... | ... |
@@ -75,7 +93,6 @@ func (l *linuxStandardInit) Init() error {
|
| 75 | 75 |
return err |
| 76 | 76 |
} |
| 77 | 77 |
} |
| 78 |
- |
|
| 79 | 78 |
for _, path := range l.config.Config.ReadonlyPaths {
|
| 80 | 79 |
if err := remountReadonly(path); err != nil {
|
| 81 | 80 |
return err |
| ... | ... |
@@ -90,6 +107,17 @@ func (l *linuxStandardInit) Init() error {
|
| 90 | 90 |
if err != nil {
|
| 91 | 91 |
return err |
| 92 | 92 |
} |
| 93 |
+ if l.config.Config.NoNewPrivileges {
|
|
| 94 |
+ if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
|
|
| 95 |
+ return err |
|
| 96 |
+ } |
|
| 97 |
+ } |
|
| 98 |
+ // Tell our parent that we're ready to Execv. This must be done before the |
|
| 99 |
+ // Seccomp rules have been applied, because we need to be able to read and |
|
| 100 |
+ // write to a socket. |
|
| 101 |
+ if err := syncParentReady(l.pipe); err != nil {
|
|
| 102 |
+ return err |
|
| 103 |
+ } |
|
| 93 | 104 |
if l.config.Config.Seccomp != nil {
|
| 94 | 105 |
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
|
| 95 | 106 |
return err |
| ... | ... |
@@ -109,5 +137,6 @@ func (l *linuxStandardInit) Init() error {
|
| 109 | 109 |
if syscall.Getppid() != l.parentPid {
|
| 110 | 110 |
return syscall.Kill(syscall.Getpid(), syscall.SIGKILL) |
| 111 | 111 |
} |
| 112 |
+ |
|
| 112 | 113 |
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) |
| 113 | 114 |
} |
| 114 | 115 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,226 @@ |
| 0 |
+// +build linux |
|
| 1 |
+ |
|
| 2 |
+package libcontainer |
|
| 3 |
+ |
|
| 4 |
+import ( |
|
| 5 |
+ "fmt" |
|
| 6 |
+ "os" |
|
| 7 |
+ "path/filepath" |
|
| 8 |
+ |
|
| 9 |
+ "github.com/Sirupsen/logrus" |
|
| 10 |
+ "github.com/opencontainers/runc/libcontainer/configs" |
|
| 11 |
+) |
|
| 12 |
+ |
|
| 13 |
+func newStateTransitionError(from, to containerState) error {
|
|
| 14 |
+ return &stateTransitionError{
|
|
| 15 |
+ From: from.status().String(), |
|
| 16 |
+ To: to.status().String(), |
|
| 17 |
+ } |
|
| 18 |
+} |
|
| 19 |
+ |
|
| 20 |
+// stateTransitionError is returned when an invalid state transition happens from one |
|
| 21 |
+// state to another. |
|
| 22 |
+type stateTransitionError struct {
|
|
| 23 |
+ From string |
|
| 24 |
+ To string |
|
| 25 |
+} |
|
| 26 |
+ |
|
| 27 |
+func (s *stateTransitionError) Error() string {
|
|
| 28 |
+ return fmt.Sprintf("invalid state transition from %s to %s", s.From, s.To)
|
|
| 29 |
+} |
|
| 30 |
+ |
|
| 31 |
+type containerState interface {
|
|
| 32 |
+ transition(containerState) error |
|
| 33 |
+ destroy() error |
|
| 34 |
+ status() Status |
|
| 35 |
+} |
|
| 36 |
+ |
|
| 37 |
+func destroy(c *linuxContainer) error {
|
|
| 38 |
+ if !c.config.Namespaces.Contains(configs.NEWPID) {
|
|
| 39 |
+ if err := killCgroupProcesses(c.cgroupManager); err != nil {
|
|
| 40 |
+ logrus.Warn(err) |
|
| 41 |
+ } |
|
| 42 |
+ } |
|
| 43 |
+ err := c.cgroupManager.Destroy() |
|
| 44 |
+ if rerr := os.RemoveAll(c.root); err == nil {
|
|
| 45 |
+ err = rerr |
|
| 46 |
+ } |
|
| 47 |
+ c.initProcess = nil |
|
| 48 |
+ if herr := runPoststopHooks(c); err == nil {
|
|
| 49 |
+ err = herr |
|
| 50 |
+ } |
|
| 51 |
+ c.state = &stoppedState{c: c}
|
|
| 52 |
+ return err |
|
| 53 |
+} |
|
| 54 |
+ |
|
| 55 |
+func runPoststopHooks(c *linuxContainer) error {
|
|
| 56 |
+ if c.config.Hooks != nil {
|
|
| 57 |
+ s := configs.HookState{
|
|
| 58 |
+ Version: c.config.Version, |
|
| 59 |
+ ID: c.id, |
|
| 60 |
+ Root: c.config.Rootfs, |
|
| 61 |
+ } |
|
| 62 |
+ for _, hook := range c.config.Hooks.Poststop {
|
|
| 63 |
+ if err := hook.Run(s); err != nil {
|
|
| 64 |
+ return err |
|
| 65 |
+ } |
|
| 66 |
+ } |
|
| 67 |
+ } |
|
| 68 |
+ return nil |
|
| 69 |
+} |
|
| 70 |
+ |
|
| 71 |
+// stoppedState represents a container in a stopped/destroyed state.
|
| 72 |
+type stoppedState struct {
|
|
| 73 |
+ c *linuxContainer |
|
| 74 |
+} |
|
| 75 |
+ |
|
| 76 |
+func (b *stoppedState) status() Status {
|
|
| 77 |
+ return Destroyed |
|
| 78 |
+} |
|
| 79 |
+ |
|
| 80 |
+func (b *stoppedState) transition(s containerState) error {
|
|
| 81 |
+ switch s.(type) {
|
|
| 82 |
+ case *runningState: |
|
| 83 |
+ b.c.state = s |
|
| 84 |
+ return nil |
|
| 85 |
+ case *restoredState: |
|
| 86 |
+ b.c.state = s |
|
| 87 |
+ return nil |
|
| 88 |
+ case *stoppedState: |
|
| 89 |
+ return nil |
|
| 90 |
+ } |
|
| 91 |
+ return newStateTransitionError(b, s) |
|
| 92 |
+} |
|
| 93 |
+ |
|
| 94 |
+func (b *stoppedState) destroy() error {
|
|
| 95 |
+ return destroy(b.c) |
|
| 96 |
+} |
|
| 97 |
+ |
|
| 98 |
+// runningState represents a container that is currently running. |
|
| 99 |
+type runningState struct {
|
|
| 100 |
+ c *linuxContainer |
|
| 101 |
+} |
|
| 102 |
+ |
|
| 103 |
+func (r *runningState) status() Status {
|
|
| 104 |
+ return Running |
|
| 105 |
+} |
|
| 106 |
+ |
|
| 107 |
+func (r *runningState) transition(s containerState) error {
|
|
| 108 |
+ switch s.(type) {
|
|
| 109 |
+ case *stoppedState: |
|
| 110 |
+ running, err := r.c.isRunning() |
|
| 111 |
+ if err != nil {
|
|
| 112 |
+ return err |
|
| 113 |
+ } |
|
| 114 |
+ if running {
|
|
| 115 |
+ return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
|
|
| 116 |
+ } |
|
| 117 |
+ r.c.state = s |
|
| 118 |
+ return nil |
|
| 119 |
+ case *pausedState: |
|
| 120 |
+ r.c.state = s |
|
| 121 |
+ return nil |
|
| 122 |
+ case *runningState: |
|
| 123 |
+ return nil |
|
| 124 |
+ } |
|
| 125 |
+ return newStateTransitionError(r, s) |
|
| 126 |
+} |
|
| 127 |
+ |
|
| 128 |
+func (r *runningState) destroy() error {
|
|
| 129 |
+ running, err := r.c.isRunning() |
|
| 130 |
+ if err != nil {
|
|
| 131 |
+ return err |
|
| 132 |
+ } |
|
| 133 |
+ if running {
|
|
| 134 |
+ return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
|
|
| 135 |
+ } |
|
| 136 |
+ return destroy(r.c) |
|
| 137 |
+} |
|
| 138 |
+ |
|
| 139 |
+// pausedState represents a container that is currently paused. It cannot be destroyed in a
|
| 140 |
+// paused state and must transition back to running first. |
|
| 141 |
+type pausedState struct {
|
|
| 142 |
+ c *linuxContainer |
|
| 143 |
+} |
|
| 144 |
+ |
|
| 145 |
+func (p *pausedState) status() Status {
|
|
| 146 |
+ return Paused |
|
| 147 |
+} |
|
| 148 |
+ |
|
| 149 |
+func (p *pausedState) transition(s containerState) error {
|
|
| 150 |
+ switch s.(type) {
|
|
| 151 |
+ case *runningState, *stoppedState: |
|
| 152 |
+ p.c.state = s |
|
| 153 |
+ return nil |
|
| 154 |
+ case *pausedState: |
|
| 155 |
+ return nil |
|
| 156 |
+ } |
|
| 157 |
+ return newStateTransitionError(p, s) |
|
| 158 |
+} |
|
| 159 |
+ |
|
| 160 |
+func (p *pausedState) destroy() error {
|
|
| 161 |
+ isRunning, err := p.c.isRunning() |
|
| 162 |
+ if err != nil {
|
|
| 163 |
+ return err |
|
| 164 |
+ } |
|
| 165 |
+ if !isRunning {
|
|
| 166 |
+ if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
|
|
| 167 |
+ return err |
|
| 168 |
+ } |
|
| 169 |
+ return destroy(p.c) |
|
| 170 |
+ } |
|
| 171 |
+ return newGenericError(fmt.Errorf("container is paused"), ContainerPaused)
|
|
| 172 |
+} |
|
| 173 |
+ |
|
| 174 |
+// restoredState is the same as the running state but also has associated checkpoint
|
| 175 |
+// information that may need to be destroyed when the container is stopped and destroy is called.
|
| 176 |
+type restoredState struct {
|
|
| 177 |
+ imageDir string |
|
| 178 |
+ c *linuxContainer |
|
| 179 |
+} |
|
| 180 |
+ |
|
| 181 |
+func (r *restoredState) status() Status {
|
|
| 182 |
+ return Running |
|
| 183 |
+} |
|
| 184 |
+ |
|
| 185 |
+func (r *restoredState) transition(s containerState) error {
|
|
| 186 |
+ switch s.(type) {
|
|
| 187 |
+ case *stoppedState: |
|
| 188 |
+ return nil |
|
| 189 |
+ case *runningState: |
|
| 190 |
+ return nil |
|
| 191 |
+ } |
|
| 192 |
+ return newStateTransitionError(r, s) |
|
| 193 |
+} |
|
| 194 |
+ |
|
| 195 |
+func (r *restoredState) destroy() error {
|
|
| 196 |
+ if _, err := os.Stat(filepath.Join(r.c.root, "checkpoint")); err != nil {
|
|
| 197 |
+ if !os.IsNotExist(err) {
|
|
| 198 |
+ return err |
|
| 199 |
+ } |
|
| 200 |
+ } |
|
| 201 |
+ return destroy(r.c) |
|
| 202 |
+} |
|
| 203 |
+ |
|
| 204 |
+// createdState is used whenever a container is restored, loaded, or setting additional |
|
| 205 |
+// processes inside and it should not be destroyed when it is exiting. |
|
| 206 |
+type createdState struct {
|
|
| 207 |
+ c *linuxContainer |
|
| 208 |
+ s Status |
|
| 209 |
+} |
|
| 210 |
+ |
|
| 211 |
+func (n *createdState) status() Status {
|
|
| 212 |
+ return n.s |
|
| 213 |
+} |
|
| 214 |
+ |
|
| 215 |
+func (n *createdState) transition(s containerState) error {
|
|
| 216 |
+ n.c.state = s |
|
| 217 |
+ return nil |
|
| 218 |
+} |
|
| 219 |
+ |
|
| 220 |
+func (n *createdState) destroy() error {
|
|
| 221 |
+ if err := n.c.refreshState(); err != nil {
|
|
| 222 |
+ return err |
|
| 223 |
+ } |
|
| 224 |
+ return n.c.state.destroy() |
|
| 225 |
+} |
| ... | ... |
@@ -3,6 +3,9 @@ |
| 3 | 3 |
package system |
| 4 | 4 |
|
| 5 | 5 |
import ( |
| 6 |
+ "bufio" |
|
| 7 |
+ "fmt" |
|
| 8 |
+ "os" |
|
| 6 | 9 |
"os/exec" |
| 7 | 10 |
"syscall" |
| 8 | 11 |
"unsafe" |
| ... | ... |
@@ -75,3 +78,45 @@ func Setctty() error {
|
| 75 | 75 |
} |
| 76 | 76 |
return nil |
| 77 | 77 |
} |
| 78 |
+ |
|
| 79 |
+/* |
|
| 80 |
+ * Detect whether we are currently running in a user namespace. |
|
| 81 |
+ * Copied from github.com/lxc/lxd/shared/util.go |
|
| 82 |
+ */ |
|
| 83 |
+func RunningInUserNS() bool {
|
|
| 84 |
+ file, err := os.Open("/proc/self/uid_map")
|
|
| 85 |
+ if err != nil {
|
|
| 86 |
+ /* |
|
| 87 |
+ * This kernel-provided file only exists if user namespaces are |
|
| 88 |
+ * supported |
|
| 89 |
+ */ |
|
| 90 |
+ return false |
|
| 91 |
+ } |
|
| 92 |
+ defer file.Close() |
|
| 93 |
+ |
|
| 94 |
+ buf := bufio.NewReader(file) |
|
| 95 |
+ l, _, err := buf.ReadLine() |
|
| 96 |
+ if err != nil {
|
|
| 97 |
+ return false |
|
| 98 |
+ } |
|
| 99 |
+ |
|
| 100 |
+ line := string(l) |
|
| 101 |
+ var a, b, c int64 |
|
| 102 |
+ fmt.Sscanf(line, "%d %d %d", &a, &b, &c) |
|
| 103 |
+ /* |
|
| 104 |
+ * We assume we are in the initial user namespace if we have a full |
|
| 105 |
+ * range - 4294967295 uids starting at uid 0. |
|
| 106 |
+ */ |
|
| 107 |
+ if a == 0 && b == 0 && c == 4294967295 {
|
|
| 108 |
+ return false |
|
| 109 |
+ } |
|
| 110 |
+ return true |
|
| 111 |
+} |
|
| 112 |
+ |
|
| 113 |
+func Prctl(option int, arg2, arg3, arg4, arg5 uintptr) (err error) {
|
|
| 114 |
+ _, _, e1 := syscall.Syscall6(syscall.SYS_PRCTL, uintptr(option), arg2, arg3, arg4, arg5, 0) |
|
| 115 |
+ if e1 != 0 {
|
|
| 116 |
+ err = e1 |
|
| 117 |
+ } |
|
| 118 |
+ return |
|
| 119 |
+} |
| ... | ... |
@@ -5,6 +5,7 @@ import ( |
| 5 | 5 |
"encoding/hex" |
| 6 | 6 |
"encoding/json" |
| 7 | 7 |
"io" |
| 8 |
+ "os" |
|
| 8 | 9 |
"path/filepath" |
| 9 | 10 |
"syscall" |
| 10 | 11 |
) |
| ... | ... |
@@ -54,3 +55,32 @@ func WriteJSON(w io.Writer, v interface{}) error {
|
| 54 | 54 |
_, err = w.Write(data) |
| 55 | 55 |
return err |
| 56 | 56 |
} |
| 57 |
+ |
|
| 58 |
+// CleanPath makes a path safe for use with filepath.Join. This is done by not |
|
| 59 |
+// only cleaning the path, but also (if the path is relative) adding a leading |
|
| 60 |
+// '/' and cleaning it (then removing the leading '/'). This ensures that a |
|
| 61 |
+// path resulting from prepending another path will always resolve to lexically |
|
| 62 |
+// be a subdirectory of the prefixed path. This is all done lexically, so paths |
|
| 63 |
+// that include symlinks won't be safe as a result of using CleanPath. |
|
| 64 |
+func CleanPath(path string) string {
|
|
| 65 |
+ // Deal with empty strings nicely. |
|
| 66 |
+ if path == "" {
|
|
| 67 |
+ return "" |
|
| 68 |
+ } |
|
| 69 |
+ |
|
| 70 |
+ // Ensure that all paths are cleaned (especially problematic ones like |
|
| 71 |
+ // "/../../../../../" which can cause lots of issues). |
|
| 72 |
+ path = filepath.Clean(path) |
|
| 73 |
+ |
|
| 74 |
+ // If the path isn't absolute, we need to do more processing to fix paths |
|
| 75 |
+ // such as "../../../../<etc>/some/path". We also shouldn't convert absolute |
|
| 76 |
+ // paths to relative ones. |
|
| 77 |
+ if !filepath.IsAbs(path) {
|
|
| 78 |
+ path = filepath.Clean(string(os.PathSeparator) + path) |
|
| 79 |
+ // This can't fail, as (by definition) all paths are relative to root. |
|
| 80 |
+ path, _ = filepath.Rel(string(os.PathSeparator), path) |
|
| 81 |
+ } |
|
| 82 |
+ |
|
| 83 |
+ // Clean the path again for good measure. |
|
| 84 |
+ return filepath.Clean(path) |
|
| 85 |
+} |