Browse code

vendor: update runc/libcontainer

This includes all of v0.0.8 as well as a few bug fixes that popped up
during vendoring.

Signed-off-by: Aleksa Sarai <asarai@suse.com>

Aleksa Sarai authored on 2016/01/27 08:05:13
Showing 41 changed files
... ...
@@ -59,7 +59,7 @@ clone git github.com/miekg/pkcs11 80f102b5cac759de406949c47f0928b99bd64cdf
59 59
 clone git github.com/docker/go v1.5.1-1-1-gbaf439e
60 60
 clone git github.com/agl/ed25519 d2b94fd789ea21d12fac1a4443dd3a3f79cda72c
61 61
 
62
-clone git github.com/opencontainers/runc 3d8a20bb772defc28c355534d83486416d1719b4 # libcontainer
62
+clone git github.com/opencontainers/runc ce72f86a2b54bc114d6ffb51f6500479b2d42154 # libcontainer
63 63
 clone git github.com/seccomp/libseccomp-golang 1b506fc7c24eec5a3693cdcbed40d9c226cfc6a1
64 64
 # libcontainer deps (see src/github.com/opencontainers/runc/Godeps/Godeps.json)
65 65
 clone git github.com/coreos/go-systemd v4
... ...
@@ -10,80 +10,165 @@ host system and which is (optionally) isolated from other containers in the syst
10 10
 
11 11
 #### Using libcontainer
12 12
 
13
-To create a container you first have to initialize an instance of a factory
14
-that will handle the creation and initialization for a container.
13
+Because containers are spawned in a two step process you will need a binary that
14
+will be executed as the init process for the container. In libcontainer, we use
15
+the current binary (/proc/self/exe) to be executed as the init process, and use
16
+arg "init". We call the first step process "bootstrap", so you always need an "init"
17
+function as the entry of "bootstrap".
15 18
 
16
-Because containers are spawned in a two step process you will need to provide
17
-arguments to a binary that will be executed as the init process for the container.
18
-To use the current binary that is spawning the containers and acting as the parent
19
-you can use `os.Args[0]` and we have a command called `init` setup.
19
+```go
20
+func init() {
21
+	if len(os.Args) > 1 && os.Args[1] == "init" {
22
+		runtime.GOMAXPROCS(1)
23
+		runtime.LockOSThread()
24
+		factory, _ := libcontainer.New("")
25
+		if err := factory.StartInitialization(); err != nil {
26
+			logrus.Fatal(err)
27
+		}
28
+		panic("--this line should have never been executed, congratulations--")
29
+	}
30
+}
31
+```
32
+
33
+Then to create a container you first have to initialize an instance of a factory
34
+that will handle the creation and initialization for a container.
20 35
 
21 36
 ```go
22
-root, err := libcontainer.New("/var/lib/container", libcontainer.InitArgs(os.Args[0], "init"))
37
+factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init"))
23 38
 if err != nil {
24
-    log.Fatal(err)
39
+	logrus.Fatal(err)
40
+	return
25 41
 }
26 42
 ```
27 43
 
28 44
 Once you have an instance of the factory created we can create a configuration
29
-struct describing how the container is to be created.  A sample would look similar to this:
45
+struct describing how the container is to be created. A sample would look similar to this:
30 46
 
31 47
 ```go
48
+defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
32 49
 config := &configs.Config{
33
-    Rootfs: rootfs,
34
-    Capabilities: []string{
35
-        "CAP_CHOWN",
36
-        "CAP_DAC_OVERRIDE",
37
-        "CAP_FSETID",
38
-        "CAP_FOWNER",
39
-        "CAP_MKNOD",
40
-        "CAP_NET_RAW",
41
-        "CAP_SETGID",
42
-        "CAP_SETUID",
43
-        "CAP_SETFCAP",
44
-        "CAP_SETPCAP",
45
-        "CAP_NET_BIND_SERVICE",
46
-        "CAP_SYS_CHROOT",
47
-        "CAP_KILL",
48
-        "CAP_AUDIT_WRITE",
49
-    },
50
-    Namespaces: configs.Namespaces([]configs.Namespace{
51
-        {Type: configs.NEWNS},
52
-        {Type: configs.NEWUTS},
53
-        {Type: configs.NEWIPC},
54
-        {Type: configs.NEWPID},
55
-        {Type: configs.NEWNET},
56
-    }),
57
-    Cgroups: &configs.Cgroup{
58
-        Name:            "test-container",
59
-        Parent:          "system",
60
-        AllowAllDevices: false,
61
-        AllowedDevices:  configs.DefaultAllowedDevices,
62
-    },
63
-
64
-    Devices:  configs.DefaultAutoCreatedDevices,
65
-    Hostname: "testing",
66
-    Networks: []*configs.Network{
67
-        {
68
-            Type:    "loopback",
69
-            Address: "127.0.0.1/0",
70
-            Gateway: "localhost",
71
-        },
72
-    },
73
-    Rlimits: []configs.Rlimit{
74
-        {
75
-            Type: syscall.RLIMIT_NOFILE,
76
-            Hard: uint64(1024),
77
-            Soft: uint64(1024),
78
-        },
79
-    },
50
+	Rootfs: "/your/path/to/rootfs",
51
+	Capabilities: []string{
52
+		"CAP_CHOWN",
53
+		"CAP_DAC_OVERRIDE",
54
+		"CAP_FSETID",
55
+		"CAP_FOWNER",
56
+		"CAP_MKNOD",
57
+		"CAP_NET_RAW",
58
+		"CAP_SETGID",
59
+		"CAP_SETUID",
60
+		"CAP_SETFCAP",
61
+		"CAP_SETPCAP",
62
+		"CAP_NET_BIND_SERVICE",
63
+		"CAP_SYS_CHROOT",
64
+		"CAP_KILL",
65
+		"CAP_AUDIT_WRITE",
66
+	},
67
+	Namespaces: configs.Namespaces([]configs.Namespace{
68
+		{Type: configs.NEWNS},
69
+		{Type: configs.NEWUTS},
70
+		{Type: configs.NEWIPC},
71
+		{Type: configs.NEWPID},
72
+		{Type: configs.NEWUSER},
73
+		{Type: configs.NEWNET},
74
+	}),
75
+	Cgroups: &configs.Cgroup{
76
+		Name:   "test-container",
77
+		Parent: "system",
78
+		Resources: &configs.Resources{
79
+			MemorySwappiness: -1,
80
+			AllowAllDevices:  false,
81
+			AllowedDevices:   configs.DefaultAllowedDevices,
82
+		},
83
+	},
84
+	MaskPaths: []string{
85
+		"/proc/kcore",
86
+	},
87
+	ReadonlyPaths: []string{
88
+		"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
89
+	},
90
+	Devices:  configs.DefaultAutoCreatedDevices,
91
+	Hostname: "testing",
92
+	Mounts: []*configs.Mount{
93
+		{
94
+			Source:      "proc",
95
+			Destination: "/proc",
96
+			Device:      "proc",
97
+			Flags:       defaultMountFlags,
98
+		},
99
+		{
100
+			Source:      "tmpfs",
101
+			Destination: "/dev",
102
+			Device:      "tmpfs",
103
+			Flags:       syscall.MS_NOSUID | syscall.MS_STRICTATIME,
104
+			Data:        "mode=755",
105
+		},
106
+		{
107
+			Source:      "devpts",
108
+			Destination: "/dev/pts",
109
+			Device:      "devpts",
110
+			Flags:       syscall.MS_NOSUID | syscall.MS_NOEXEC,
111
+			Data:        "newinstance,ptmxmode=0666,mode=0620,gid=5",
112
+		},
113
+		{
114
+			Device:      "tmpfs",
115
+			Source:      "shm",
116
+			Destination: "/dev/shm",
117
+			Data:        "mode=1777,size=65536k",
118
+			Flags:       defaultMountFlags,
119
+		},
120
+		{
121
+			Source:      "mqueue",
122
+			Destination: "/dev/mqueue",
123
+			Device:      "mqueue",
124
+			Flags:       defaultMountFlags,
125
+		},
126
+		{
127
+			Source:      "sysfs",
128
+			Destination: "/sys",
129
+			Device:      "sysfs",
130
+			Flags:       defaultMountFlags | syscall.MS_RDONLY,
131
+		},
132
+	},
133
+	UidMappings: []configs.IDMap{
134
+		{
135
+			ContainerID: 0,
136
+			Host: 1000,
137
+			size: 65536,
138
+		},
139
+	},
140
+	GidMappings: []configs.IDMap{
141
+		{
142
+			ContainerID: 0,
143
+			Host: 1000,
144
+			size: 65536,
145
+		},
146
+	},
147
+	Networks: []*configs.Network{
148
+		{
149
+			Type:    "loopback",
150
+			Address: "127.0.0.1/0",
151
+			Gateway: "localhost",
152
+		},
153
+	},
154
+	Rlimits: []configs.Rlimit{
155
+		{
156
+			Type: syscall.RLIMIT_NOFILE,
157
+			Hard: uint64(1025),
158
+			Soft: uint64(1025),
159
+		},
160
+	},
80 161
 }
81 162
 ```
82 163
 
83 164
 Once you have the configuration populated you can create a container:
84 165
 
85 166
 ```go
86
-container, err := root.Create("container-id", config)
167
+container, err := factory.Create("container-id", config)
168
+if err != nil {
169
+	logrus.Fatal(err)
170
+	return
171
+}
87 172
 ```
88 173
 
89 174
 To spawn bash as the initial process inside the container and have the
... ...
@@ -91,23 +176,25 @@ processes pid returned in order to wait, signal, or kill the process:
91 91
 
92 92
 ```go
93 93
 process := &libcontainer.Process{
94
-    Args:   []string{"/bin/bash"},
95
-    Env:    []string{"PATH=/bin"},
96
-    User:   "daemon",
97
-    Stdin:  os.Stdin,
98
-    Stdout: os.Stdout,
99
-    Stderr: os.Stderr,
94
+	Args:   []string{"/bin/bash"},
95
+	Env:    []string{"PATH=/bin"},
96
+	User:   "daemon",
97
+	Stdin:  os.Stdin,
98
+	Stdout: os.Stdout,
99
+	Stderr: os.Stderr,
100 100
 }
101 101
 
102 102
 err := container.Start(process)
103 103
 if err != nil {
104
-    log.Fatal(err)
104
+	logrus.Fatal(err)
105
+	container.Destroy()
106
+	return
105 107
 }
106 108
 
107 109
 // wait for the process to finish.
108
-status, err := process.Wait()
110
+_, err := process.Wait()
109 111
 if err != nil {
110
-    log.Fatal(err)
112
+	logrus.Fatal(err)
111 113
 }
112 114
 
113 115
 // destroy the container.
... ...
@@ -124,7 +211,6 @@ processes, err := container.Processes()
124 124
 // its processes.
125 125
 stats, err := container.Stats()
126 126
 
127
-
128 127
 // pause all processes inside the container.
129 128
 container.Pause()
130 129
 
... ...
@@ -60,7 +60,7 @@ are required to be mounted within the rootfs that the runtime will setup.
60 60
 After a container's filesystems are mounted within the newly created 
61 61
 mount namespace `/dev` will need to be populated with a set of device nodes.
62 62
 It is expected that a rootfs does not need to have any device nodes specified
63
-for `/dev` witin the rootfs as the container will setup the correct devices
63
+for `/dev` within the rootfs as the container will setup the correct devices
64 64
 that are required for executing a container's process.
65 65
 
66 66
 |      Path    | Mode |   Access   |
... ...
@@ -142,6 +142,7 @@ system resources like cpu, memory, and device access.
142 142
 | perf_event | 1       |
143 143
 | freezer    | 1       |
144 144
 | hugetlb    | 1       |
145
+| pids       | 1       |
145 146
 
146 147
 
147 148
 All cgroup subsystems are joined so that statistics can be collected from
... ...
@@ -199,7 +200,7 @@ provide a good default for security and flexibility for the applications.
199 199
 | CAP_SYS_BOOT         | 0       |
200 200
 | CAP_LEASE            | 0       |
201 201
 | CAP_WAKE_ALARM       | 0       |
202
-| CAP_BLOCK_SUSPE      | 0       |
202
+| CAP_BLOCK_SUSPEND    | 0       |
203 203
 
204 204
 
205 205
 Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor)
... ...
@@ -15,6 +15,9 @@ type Manager interface {
15 15
 	// Returns the PIDs inside the cgroup set
16 16
 	GetPids() ([]int, error)
17 17
 
18
+	// Returns the PIDs inside the cgroup set & all sub-cgroups
19
+	GetAllPids() ([]int, error)
20
+
18 21
 	// Returns statistics for the cgroup set
19 22
 	GetStats() (*Stats, error)
20 23
 
... ...
@@ -14,6 +14,7 @@ import (
14 14
 
15 15
 	"github.com/opencontainers/runc/libcontainer/cgroups"
16 16
 	"github.com/opencontainers/runc/libcontainer/configs"
17
+	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
17 18
 )
18 19
 
19 20
 var (
... ...
@@ -23,6 +24,7 @@ var (
23 23
 		&MemoryGroup{},
24 24
 		&CpuGroup{},
25 25
 		&CpuacctGroup{},
26
+		&PidsGroup{},
26 27
 		&BlkioGroup{},
27 28
 		&HugetlbGroup{},
28 29
 		&NetClsGroup{},
... ...
@@ -93,11 +95,10 @@ func getCgroupRoot() (string, error) {
93 93
 }
94 94
 
95 95
 type cgroupData struct {
96
-	root   string
97
-	parent string
98
-	name   string
99
-	config *configs.Cgroup
100
-	pid    int
96
+	root      string
97
+	innerPath string
98
+	config    *configs.Cgroup
99
+	pid       int
101 100
 }
102 101
 
103 102
 func (m *Manager) Apply(pid int) (err error) {
... ...
@@ -112,6 +113,22 @@ func (m *Manager) Apply(pid int) (err error) {
112 112
 		return err
113 113
 	}
114 114
 
115
+	if c.Paths != nil {
116
+		paths := make(map[string]string)
117
+		for name, path := range c.Paths {
118
+			_, err := d.path(name)
119
+			if err != nil {
120
+				if cgroups.IsNotFound(err) {
121
+					continue
122
+				}
123
+				return err
124
+			}
125
+			paths[name] = path
126
+		}
127
+		m.Paths = paths
128
+		return cgroups.EnterPid(m.Paths, pid)
129
+	}
130
+
115 131
 	paths := make(map[string]string)
116 132
 	defer func() {
117 133
 		if err != nil {
... ...
@@ -135,17 +152,13 @@ func (m *Manager) Apply(pid int) (err error) {
135 135
 		paths[sys.Name()] = p
136 136
 	}
137 137
 	m.Paths = paths
138
-
139
-	if paths["cpu"] != "" {
140
-		if err := CheckCpushares(paths["cpu"], c.Resources.CpuShares); err != nil {
141
-			return err
142
-		}
143
-	}
144
-
145 138
 	return nil
146 139
 }
147 140
 
148 141
 func (m *Manager) Destroy() error {
142
+	if m.Cgroups.Paths != nil {
143
+		return nil
144
+	}
149 145
 	m.mu.Lock()
150 146
 	defer m.mu.Unlock()
151 147
 	if err := cgroups.RemovePaths(m.Paths); err != nil {
... ...
@@ -179,15 +192,28 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
179 179
 }
180 180
 
181 181
 func (m *Manager) Set(container *configs.Config) error {
182
-	for name, path := range m.Paths {
183
-		sys, err := subsystems.Get(name)
184
-		if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
185
-			continue
182
+	for _, sys := range subsystems {
183
+		// Generate fake cgroup data.
184
+		d, err := getCgroupData(container.Cgroups, -1)
185
+		if err != nil {
186
+			return err
187
+		}
188
+		// Get the path, but don't error out if the cgroup wasn't found.
189
+		path, err := d.path(sys.Name())
190
+		if err != nil && !cgroups.IsNotFound(err) {
191
+			return err
186 192
 		}
193
+
187 194
 		if err := sys.Set(path, container.Cgroups); err != nil {
188 195
 			return err
189 196
 		}
190 197
 	}
198
+
199
+	if m.Paths["cpu"] != "" {
200
+		if err := CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
201
+			return err
202
+		}
203
+	}
191 204
 	return nil
192 205
 }
193 206
 
... ...
@@ -217,41 +243,28 @@ func (m *Manager) Freeze(state configs.FreezerState) error {
217 217
 }
218 218
 
219 219
 func (m *Manager) GetPids() ([]int, error) {
220
-	d, err := getCgroupData(m.Cgroups, 0)
220
+	dir, err := getCgroupPath(m.Cgroups)
221 221
 	if err != nil {
222 222
 		return nil, err
223 223
 	}
224
+	return cgroups.GetPids(dir)
225
+}
224 226
 
225
-	dir, err := d.path("devices")
227
+func (m *Manager) GetAllPids() ([]int, error) {
228
+	dir, err := getCgroupPath(m.Cgroups)
226 229
 	if err != nil {
227 230
 		return nil, err
228 231
 	}
229
-
230
-	return cgroups.GetPids(dir)
232
+	return cgroups.GetAllPids(dir)
231 233
 }
232 234
 
233
-// pathClean makes a path safe for use with filepath.Join. This is done by not
234
-// only cleaning the path, but also (if the path is relative) adding a leading
235
-// '/' and cleaning it (then removing the leading '/'). This ensures that a
236
-// path resulting from prepending another path will always resolve to lexically
237
-// be a subdirectory of the prefixed path. This is all done lexically, so paths
238
-// that include symlinks won't be safe as a result of using pathClean.
239
-func pathClean(path string) string {
240
-	// Ensure that all paths are cleaned (especially problematic ones like
241
-	// "/../../../../../" which can cause lots of issues).
242
-	path = filepath.Clean(path)
243
-
244
-	// If the path isn't absolute, we need to do more processing to fix paths
245
-	// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
246
-	// paths to relative ones.
247
-	if !filepath.IsAbs(path) {
248
-		path = filepath.Clean(string(os.PathSeparator) + path)
249
-		// This can't fail, as (by definition) all paths are relative to root.
250
-		path, _ = filepath.Rel(string(os.PathSeparator), path)
251
-	}
252
-
253
-	// Clean the path again for good measure.
254
-	return filepath.Clean(path)
235
+func getCgroupPath(c *configs.Cgroup) (string, error) {
236
+	d, err := getCgroupData(c, 0)
237
+	if err != nil {
238
+		return "", err
239
+	}
240
+
241
+	return d.path("devices")
255 242
 }
256 243
 
257 244
 func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
... ...
@@ -260,15 +273,25 @@ func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
260 260
 		return nil, err
261 261
 	}
262 262
 
263
-	// Clean the parent slice path.
264
-	c.Parent = pathClean(c.Parent)
263
+	if (c.Name != "" || c.Parent != "") && c.Path != "" {
264
+		return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used")
265
+	}
266
+
267
+	// XXX: Do not remove this code. Path safety is important! -- cyphar
268
+	cgPath := libcontainerUtils.CleanPath(c.Path)
269
+	cgParent := libcontainerUtils.CleanPath(c.Parent)
270
+	cgName := libcontainerUtils.CleanPath(c.Name)
271
+
272
+	innerPath := cgPath
273
+	if innerPath == "" {
274
+		innerPath = filepath.Join(cgParent, cgName)
275
+	}
265 276
 
266 277
 	return &cgroupData{
267
-		root:   root,
268
-		parent: c.Parent,
269
-		name:   c.Name,
270
-		config: c,
271
-		pid:    pid,
278
+		root:      root,
279
+		innerPath: innerPath,
280
+		config:    c,
281
+		pid:       pid,
272 282
 	}, nil
273 283
 }
274 284
 
... ...
@@ -296,11 +319,10 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
296 296
 		return "", err
297 297
 	}
298 298
 
299
-	cgPath := filepath.Join(raw.parent, raw.name)
300 299
 	// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
301
-	if filepath.IsAbs(cgPath) {
300
+	if filepath.IsAbs(raw.innerPath) {
302 301
 		// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
303
-		return filepath.Join(raw.root, filepath.Base(mnt), cgPath), nil
302
+		return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
304 303
 	}
305 304
 
306 305
 	parentPath, err := raw.parentPath(subsystem, mnt, root)
... ...
@@ -308,7 +330,7 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
308 308
 		return "", err
309 309
 	}
310 310
 
311
-	return filepath.Join(parentPath, cgPath), nil
311
+	return filepath.Join(parentPath, raw.innerPath), nil
312 312
 }
313 313
 
314 314
 func (raw *cgroupData) join(subsystem string) (string, error) {
... ...
@@ -22,15 +22,10 @@ func (s *BlkioGroup) Name() string {
22 22
 }
23 23
 
24 24
 func (s *BlkioGroup) Apply(d *cgroupData) error {
25
-	dir, err := d.join("blkio")
25
+	_, err := d.join("blkio")
26 26
 	if err != nil && !cgroups.IsNotFound(err) {
27 27
 		return err
28 28
 	}
29
-
30
-	if err := s.Set(dir, d.config); err != nil {
31
-		return err
32
-	}
33
-
34 29
 	return nil
35 30
 }
36 31
 
... ...
@@ -22,15 +22,10 @@ func (s *CpuGroup) Name() string {
22 22
 func (s *CpuGroup) Apply(d *cgroupData) error {
23 23
 	// We always want to join the cpu group, to allow fair cpu scheduling
24 24
 	// on a container basis
25
-	dir, err := d.join("cpu")
25
+	_, err := d.join("cpu")
26 26
 	if err != nil && !cgroups.IsNotFound(err) {
27 27
 		return err
28 28
 	}
29
-
30
-	if err := s.Set(dir, d.config); err != nil {
31
-		return err
32
-	}
33
-
34 29
 	return nil
35 30
 }
36 31
 
... ...
@@ -12,6 +12,7 @@ import (
12 12
 
13 13
 	"github.com/opencontainers/runc/libcontainer/cgroups"
14 14
 	"github.com/opencontainers/runc/libcontainer/configs"
15
+	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
15 16
 )
16 17
 
17 18
 type CpusetGroup struct {
... ...
@@ -64,11 +65,6 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
64 64
 	if err := s.ensureParent(dir, root); err != nil {
65 65
 		return err
66 66
 	}
67
-	// the default values inherit from parent cgroup are already set in
68
-	// s.ensureParent, cover these if we have our own
69
-	if err := s.Set(dir, cgroup); err != nil {
70
-		return err
71
-	}
72 67
 	// because we are not using d.join we need to place the pid into the procs file
73 68
 	// unlike the other subsystems
74 69
 	if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil {
... ...
@@ -93,7 +89,7 @@ func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []b
93 93
 // it's parent.
94 94
 func (s *CpusetGroup) ensureParent(current, root string) error {
95 95
 	parent := filepath.Dir(current)
96
-	if filepath.Clean(parent) == root {
96
+	if libcontainerUtils.CleanPath(parent) == root {
97 97
 		return nil
98 98
 	}
99 99
 	// Avoid infinite recursion.
... ...
@@ -15,21 +15,29 @@ func (s *DevicesGroup) Name() string {
15 15
 }
16 16
 
17 17
 func (s *DevicesGroup) Apply(d *cgroupData) error {
18
-	dir, err := d.join("devices")
18
+	_, err := d.join("devices")
19 19
 	if err != nil {
20 20
 		// We will return the error even if it's a `not found` error, devices
21 21
 		// cgroup is hard requirement for container's security.
22 22
 		return err
23 23
 	}
24
-
25
-	if err := s.Set(dir, d.config); err != nil {
26
-		return err
27
-	}
28
-
29 24
 	return nil
30 25
 }
31 26
 
32 27
 func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
28
+	devices := cgroup.Resources.Devices
29
+	if len(devices) > 0 {
30
+		for _, dev := range devices {
31
+			file := "devices.deny"
32
+			if dev.Allow {
33
+				file = "devices.allow"
34
+			}
35
+			if err := writeFile(path, file, dev.CgroupString()); err != nil {
36
+				return err
37
+			}
38
+		}
39
+		return nil
40
+	}
33 41
 	if !cgroup.Resources.AllowAllDevices {
34 42
 		if err := writeFile(path, "devices.deny", "a"); err != nil {
35 43
 			return err
... ...
@@ -19,15 +19,10 @@ func (s *FreezerGroup) Name() string {
19 19
 }
20 20
 
21 21
 func (s *FreezerGroup) Apply(d *cgroupData) error {
22
-	dir, err := d.join("freezer")
22
+	_, err := d.join("freezer")
23 23
 	if err != nil && !cgroups.IsNotFound(err) {
24 24
 		return err
25 25
 	}
26
-
27
-	if err := s.Set(dir, d.config); err != nil {
28
-		return err
29
-	}
30
-
31 26
 	return nil
32 27
 }
33 28
 
... ...
@@ -19,15 +19,10 @@ func (s *HugetlbGroup) Name() string {
19 19
 }
20 20
 
21 21
 func (s *HugetlbGroup) Apply(d *cgroupData) error {
22
-	dir, err := d.join("hugetlb")
22
+	_, err := d.join("hugetlb")
23 23
 	if err != nil && !cgroups.IsNotFound(err) {
24 24
 		return err
25 25
 	}
26
-
27
-	if err := s.Set(dir, d.config); err != nil {
28
-		return err
29
-	}
30
-
31 26
 	return nil
32 27
 }
33 28
 
... ...
@@ -32,8 +32,9 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
32 32
 				return err
33 33
 			}
34 34
 		}
35
-
36
-		if err := s.Set(path, d.config); err != nil {
35
+		// We have to set kernel memory here, as we can't change it once
36
+		// processes have been attached.
37
+		if err := s.SetKernelMemory(path, d.config); err != nil {
37 38
 			return err
38 39
 		}
39 40
 	}
... ...
@@ -50,7 +51,17 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
50 50
 	if err != nil && !cgroups.IsNotFound(err) {
51 51
 		return err
52 52
 	}
53
+	return nil
54
+}
53 55
 
56
+func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error {
57
+	// This has to be done separately because it has special constraints (it
58
+	// can't be done after there are processes attached to the cgroup).
59
+	if cgroup.Resources.KernelMemory > 0 {
60
+		if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil {
61
+			return err
62
+		}
63
+	}
54 64
 	return nil
55 65
 }
56 66
 
... ...
@@ -70,12 +81,6 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
70 70
 			return err
71 71
 		}
72 72
 	}
73
-	if cgroup.Resources.KernelMemory > 0 {
74
-		if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil {
75
-			return err
76
-		}
77
-	}
78
-
79 73
 	if cgroup.Resources.OomKillDisable {
80 74
 		if err := writeFile(path, "memory.oom_control", "1"); err != nil {
81 75
 			return err
... ...
@@ -157,6 +162,7 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
157 157
 	usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".")
158 158
 	maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".")
159 159
 	failcnt := strings.Join([]string{moduleName, "failcnt"}, ".")
160
+	limit := strings.Join([]string{moduleName, "limit_in_bytes"}, ".")
160 161
 
161 162
 	value, err := getCgroupParamUint(path, usage)
162 163
 	if err != nil {
... ...
@@ -182,6 +188,14 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
182 182
 		return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err)
183 183
 	}
184 184
 	memoryData.Failcnt = value
185
+	value, err = getCgroupParamUint(path, limit)
186
+	if err != nil {
187
+		if moduleName != "memory" && os.IsNotExist(err) {
188
+			return cgroups.MemoryData{}, nil
189
+		}
190
+		return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err)
191
+	}
192
+	memoryData.Limit = value
185 193
 
186 194
 	return memoryData, nil
187 195
 }
... ...
@@ -15,15 +15,10 @@ func (s *NetClsGroup) Name() string {
15 15
 }
16 16
 
17 17
 func (s *NetClsGroup) Apply(d *cgroupData) error {
18
-	dir, err := d.join("net_cls")
18
+	_, err := d.join("net_cls")
19 19
 	if err != nil && !cgroups.IsNotFound(err) {
20 20
 		return err
21 21
 	}
22
-
23
-	if err := s.Set(dir, d.config); err != nil {
24
-		return err
25
-	}
26
-
27 22
 	return nil
28 23
 }
29 24
 
... ...
@@ -15,15 +15,10 @@ func (s *NetPrioGroup) Name() string {
15 15
 }
16 16
 
17 17
 func (s *NetPrioGroup) Apply(d *cgroupData) error {
18
-	dir, err := d.join("net_prio")
18
+	_, err := d.join("net_prio")
19 19
 	if err != nil && !cgroups.IsNotFound(err) {
20 20
 		return err
21 21
 	}
22
-
23
-	if err := s.Set(dir, d.config); err != nil {
24
-		return err
25
-	}
26
-
27 22
 	return nil
28 23
 }
29 24
 
30 25
new file mode 100644
... ...
@@ -0,0 +1,57 @@
0
+// +build linux
1
+
2
+package fs
3
+
4
+import (
5
+	"fmt"
6
+	"strconv"
7
+
8
+	"github.com/opencontainers/runc/libcontainer/cgroups"
9
+	"github.com/opencontainers/runc/libcontainer/configs"
10
+)
11
+
12
+type PidsGroup struct {
13
+}
14
+
15
+func (s *PidsGroup) Name() string {
16
+	return "pids"
17
+}
18
+
19
+func (s *PidsGroup) Apply(d *cgroupData) error {
20
+	_, err := d.join("pids")
21
+	if err != nil && !cgroups.IsNotFound(err) {
22
+		return err
23
+	}
24
+	return nil
25
+}
26
+
27
+func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error {
28
+	if cgroup.Resources.PidsLimit != 0 {
29
+		// "max" is the fallback value.
30
+		limit := "max"
31
+
32
+		if cgroup.Resources.PidsLimit > 0 {
33
+			limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10)
34
+		}
35
+
36
+		if err := writeFile(path, "pids.max", limit); err != nil {
37
+			return err
38
+		}
39
+	}
40
+
41
+	return nil
42
+}
43
+
44
+func (s *PidsGroup) Remove(d *cgroupData) error {
45
+	return removePath(d.path("pids"))
46
+}
47
+
48
+func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
49
+	value, err := getCgroupParamUint(path, "pids.current")
50
+	if err != nil {
51
+		return fmt.Errorf("failed to parse pids.current - %s", err)
52
+	}
53
+
54
+	stats.PidsStats.Current = value
55
+	return nil
56
+}
... ...
@@ -36,7 +36,9 @@ type MemoryData struct {
36 36
 	Usage    uint64 `json:"usage,omitempty"`
37 37
 	MaxUsage uint64 `json:"max_usage,omitempty"`
38 38
 	Failcnt  uint64 `json:"failcnt"`
39
+	Limit    uint64 `json:"limit"`
39 40
 }
41
+
40 42
 type MemoryStats struct {
41 43
 	// memory used for cache
42 44
 	Cache uint64 `json:"cache,omitempty"`
... ...
@@ -49,6 +51,11 @@ type MemoryStats struct {
49 49
 	Stats       map[string]uint64 `json:"stats,omitempty"`
50 50
 }
51 51
 
52
+type PidsStats struct {
53
+	// number of pids in the cgroup
54
+	Current uint64 `json:"current,omitempty"`
55
+}
56
+
52 57
 type BlkioStatEntry struct {
53 58
 	Major uint64 `json:"major,omitempty"`
54 59
 	Minor uint64 `json:"minor,omitempty"`
... ...
@@ -80,6 +87,7 @@ type HugetlbStats struct {
80 80
 type Stats struct {
81 81
 	CpuStats    CpuStats    `json:"cpu_stats,omitempty"`
82 82
 	MemoryStats MemoryStats `json:"memory_stats,omitempty"`
83
+	PidsStats   PidsStats   `json:"pids_stats,omitempty"`
83 84
 	BlkioStats  BlkioStats  `json:"blkio_stats,omitempty"`
84 85
 	// the map is in the format "size of hugepage: stats of the hugepage"
85 86
 	HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
... ...
@@ -26,6 +26,10 @@ func (m *Manager) GetPids() ([]int, error) {
26 26
 	return nil, fmt.Errorf("Systemd not supported")
27 27
 }
28 28
 
29
+func (m *Manager) GetAllPids() ([]int, error) {
30
+	return nil, fmt.Errorf("Systemd not supported")
31
+}
32
+
29 33
 func (m *Manager) Destroy() error {
30 34
 	return fmt.Errorf("Systemd not supported")
31 35
 }
... ...
@@ -55,6 +55,7 @@ var subsystems = subsystemSet{
55 55
 	&fs.MemoryGroup{},
56 56
 	&fs.CpuGroup{},
57 57
 	&fs.CpuacctGroup{},
58
+	&fs.PidsGroup{},
58 59
 	&fs.BlkioGroup{},
59 60
 	&fs.HugetlbGroup{},
60 61
 	&fs.PerfEventGroup{},
... ...
@@ -167,6 +168,23 @@ func (m *Manager) Apply(pid int) error {
167 167
 		properties []systemdDbus.Property
168 168
 	)
169 169
 
170
+	if c.Paths != nil {
171
+		paths := make(map[string]string)
172
+		for name, path := range c.Paths {
173
+			_, err := getSubsystemPath(m.Cgroups, name)
174
+			if err != nil {
175
+				// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
176
+				if cgroups.IsNotFound(err) {
177
+					continue
178
+				}
179
+				return err
180
+			}
181
+			paths[name] = path
182
+		}
183
+		m.Paths = paths
184
+		return cgroups.EnterPid(m.Paths, pid)
185
+	}
186
+
170 187
 	if c.Parent != "" {
171 188
 		slice = c.Parent
172 189
 	}
... ...
@@ -233,7 +251,7 @@ func (m *Manager) Apply(pid int) error {
233 233
 		return err
234 234
 	}
235 235
 
236
-	// we need to manually join the freezer, net_cls, net_prio and cpuset cgroup in systemd
236
+	// we need to manually join the freezer, net_cls, net_prio, pids and cpuset cgroup in systemd
237 237
 	// because it does not currently support it via the dbus api.
238 238
 	if err := joinFreezer(c, pid); err != nil {
239 239
 		return err
... ...
@@ -246,6 +264,10 @@ func (m *Manager) Apply(pid int) error {
246 246
 		return err
247 247
 	}
248 248
 
249
+	if err := joinPids(c, pid); err != nil {
250
+		return err
251
+	}
252
+
249 253
 	if err := joinCpuset(c, pid); err != nil {
250 254
 		return err
251 255
 	}
... ...
@@ -277,17 +299,13 @@ func (m *Manager) Apply(pid int) error {
277 277
 		paths[s.Name()] = subsystemPath
278 278
 	}
279 279
 	m.Paths = paths
280
-
281
-	if paths["cpu"] != "" {
282
-		if err := fs.CheckCpushares(paths["cpu"], c.Resources.CpuShares); err != nil {
283
-			return err
284
-		}
285
-	}
286
-
287 280
 	return nil
288 281
 }
289 282
 
290 283
 func (m *Manager) Destroy() error {
284
+	if m.Cgroups.Paths != nil {
285
+		return nil
286
+	}
291 287
 	m.mu.Lock()
292 288
 	defer m.mu.Unlock()
293 289
 	theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil)
... ...
@@ -330,68 +348,74 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
330 330
 }
331 331
 
332 332
 func joinCpu(c *configs.Cgroup, pid int) error {
333
-	path, err := getSubsystemPath(c, "cpu")
333
+	_, err := join(c, "cpu", pid)
334 334
 	if err != nil && !cgroups.IsNotFound(err) {
335 335
 		return err
336 336
 	}
337
-	if c.Resources.CpuQuota != 0 {
338
-		if err = writeFile(path, "cpu.cfs_quota_us", strconv.FormatInt(c.Resources.CpuQuota, 10)); err != nil {
339
-			return err
340
-		}
341
-	}
342
-	if c.Resources.CpuPeriod != 0 {
343
-		if err = writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(c.Resources.CpuPeriod, 10)); err != nil {
344
-			return err
345
-		}
346
-	}
347
-	if c.Resources.CpuRtPeriod != 0 {
348
-		if err = writeFile(path, "cpu.rt_period_us", strconv.FormatInt(c.Resources.CpuRtPeriod, 10)); err != nil {
349
-			return err
350
-		}
351
-	}
352
-	if c.Resources.CpuRtRuntime != 0 {
353
-		if err = writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(c.Resources.CpuRtRuntime, 10)); err != nil {
354
-			return err
355
-		}
356
-	}
357
-
358 337
 	return nil
359 338
 }
360 339
 
361 340
 func joinFreezer(c *configs.Cgroup, pid int) error {
362
-	path, err := join(c, "freezer", pid)
341
+	_, err := join(c, "freezer", pid)
363 342
 	if err != nil && !cgroups.IsNotFound(err) {
364 343
 		return err
365 344
 	}
366
-	freezer, err := subsystems.Get("freezer")
367
-	if err != nil {
368
-		return err
369
-	}
370
-	return freezer.Set(path, c)
345
+	return nil
371 346
 }
372 347
 
373 348
 func joinNetPrio(c *configs.Cgroup, pid int) error {
374
-	path, err := join(c, "net_prio", pid)
349
+	_, err := join(c, "net_prio", pid)
375 350
 	if err != nil && !cgroups.IsNotFound(err) {
376 351
 		return err
377 352
 	}
378
-	netPrio, err := subsystems.Get("net_prio")
379
-	if err != nil {
380
-		return err
381
-	}
382
-	return netPrio.Set(path, c)
353
+	return nil
383 354
 }
384 355
 
385 356
 func joinNetCls(c *configs.Cgroup, pid int) error {
386
-	path, err := join(c, "net_cls", pid)
357
+	_, err := join(c, "net_cls", pid)
387 358
 	if err != nil && !cgroups.IsNotFound(err) {
388 359
 		return err
389 360
 	}
390
-	netcls, err := subsystems.Get("net_cls")
391
-	if err != nil {
361
+	return nil
362
+}
363
+
364
+func joinPids(c *configs.Cgroup, pid int) error {
365
+	_, err := join(c, "pids", pid)
366
+	if err != nil && !cgroups.IsNotFound(err) {
392 367
 		return err
393 368
 	}
394
-	return netcls.Set(path, c)
369
+	return nil
370
+}
371
+
372
+// systemd represents slice hierarchy using `-`, so we need to follow suit when
373
+// generating the path of slice. Essentially, test-a-b.slice becomes
374
+// test.slice/test-a.slice/test-a-b.slice.
375
+func expandSlice(slice string) (string, error) {
376
+	suffix := ".slice"
377
+	// Name has to end with ".slice", but can't be just ".slice".
378
+	if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
379
+		return "", fmt.Errorf("invalid slice name: %s", slice)
380
+	}
381
+
382
+	// Path-separators are not allowed.
383
+	if strings.Contains(slice, "/") {
384
+		return "", fmt.Errorf("invalid slice name: %s", slice)
385
+	}
386
+
387
+	var path, prefix string
388
+	sliceName := strings.TrimSuffix(slice, suffix)
389
+	for _, component := range strings.Split(sliceName, "-") {
390
+		// test--a.slice isn't permitted, nor is -test.slice.
391
+		if component == "" {
392
+			return "", fmt.Errorf("invalid slice name: %s", slice)
393
+		}
394
+
395
+		// Append the component to the path and to the prefix.
396
+		path += prefix + component + suffix + "/"
397
+		prefix += component + "-"
398
+	}
399
+
400
+	return path, nil
395 401
 }
396 402
 
397 403
 func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
... ...
@@ -410,6 +434,11 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
410 410
 		slice = c.Parent
411 411
 	}
412 412
 
413
+	slice, err = expandSlice(slice)
414
+	if err != nil {
415
+		return "", err
416
+	}
417
+
413 418
 	return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil
414 419
 }
415 420
 
... ...
@@ -440,6 +469,14 @@ func (m *Manager) GetPids() ([]int, error) {
440 440
 	return cgroups.GetPids(path)
441 441
 }
442 442
 
443
+func (m *Manager) GetAllPids() ([]int, error) {
444
+	path, err := getSubsystemPath(m.Cgroups, "devices")
445
+	if err != nil {
446
+		return nil, err
447
+	}
448
+	return cgroups.GetAllPids(path)
449
+}
450
+
443 451
 func (m *Manager) GetStats() (*cgroups.Stats, error) {
444 452
 	m.mu.Lock()
445 453
 	defer m.mu.Unlock()
... ...
@@ -458,16 +495,23 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
458 458
 }
459 459
 
460 460
 func (m *Manager) Set(container *configs.Config) error {
461
-	for name, path := range m.Paths {
462
-		sys, err := subsystems.Get(name)
463
-		if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
464
-			continue
461
+	for _, sys := range subsystems {
462
+		// Get the subsystem path, but don't error out for not found cgroups.
463
+		path, err := getSubsystemPath(container.Cgroups, sys.Name())
464
+		if err != nil && !cgroups.IsNotFound(err) {
465
+			return err
465 466
 		}
467
+
466 468
 		if err := sys.Set(path, container.Cgroups); err != nil {
467 469
 			return err
468 470
 		}
469 471
 	}
470 472
 
473
+	if m.Paths["cpu"] != "" {
474
+		if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
475
+			return err
476
+		}
477
+	}
471 478
 	return nil
472 479
 }
473 480
 
... ...
@@ -487,17 +531,13 @@ func getUnitName(c *configs.Cgroup) string {
487 487
 // because systemd will re-write the device settings if it needs to re-apply the cgroup context.
488 488
 // This happens at least for v208 when any sibling unit is started.
489 489
 func joinDevices(c *configs.Cgroup, pid int) error {
490
-	path, err := join(c, "devices", pid)
490
+	_, err := join(c, "devices", pid)
491 491
 	// Even if it's `not found` error, we'll return err because devices cgroup
492 492
 	// is hard requirement for container security.
493 493
 	if err != nil {
494 494
 		return err
495 495
 	}
496
-	devices, err := subsystems.Get("devices")
497
-	if err != nil {
498
-		return err
499
-	}
500
-	return devices.Set(path, c)
496
+	return nil
501 497
 }
502 498
 
503 499
 func setKernelMemory(c *configs.Cgroup) error {
... ...
@@ -510,52 +550,16 @@ func setKernelMemory(c *configs.Cgroup) error {
510 510
 		return err
511 511
 	}
512 512
 
513
-	if c.Resources.KernelMemory > 0 {
514
-		err = writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(c.Resources.KernelMemory, 10))
515
-		if err != nil {
516
-			return err
517
-		}
518
-	}
519
-
520
-	return nil
513
+	// This doesn't get called by manager.Set, so we need to do it here.
514
+	s := &fs.MemoryGroup{}
515
+	return s.SetKernelMemory(path, c)
521 516
 }
522 517
 
523 518
 func joinMemory(c *configs.Cgroup, pid int) error {
524
-	path, err := getSubsystemPath(c, "memory")
519
+	_, err := join(c, "memory", pid)
525 520
 	if err != nil && !cgroups.IsNotFound(err) {
526 521
 		return err
527 522
 	}
528
-
529
-	// -1 disables memoryswap
530
-	if c.Resources.MemorySwap > 0 {
531
-		err = writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Resources.MemorySwap, 10))
532
-		if err != nil {
533
-			return err
534
-		}
535
-	}
536
-	if c.Resources.MemoryReservation > 0 {
537
-		err = writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Resources.MemoryReservation, 10))
538
-		if err != nil {
539
-			return err
540
-		}
541
-	}
542
-	if c.Resources.OomKillDisable {
543
-		if err := writeFile(path, "memory.oom_control", "1"); err != nil {
544
-			return err
545
-		}
546
-	}
547
-
548
-	if c.Resources.MemorySwappiness >= 0 && c.Resources.MemorySwappiness <= 100 {
549
-		err = writeFile(path, "memory.swappiness", strconv.FormatInt(c.Resources.MemorySwappiness, 10))
550
-		if err != nil {
551
-			return err
552
-		}
553
-	} else if c.Resources.MemorySwappiness == -1 {
554
-		return nil
555
-	} else {
556
-		return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", c.Resources.MemorySwappiness)
557
-	}
558
-
559 523
 	return nil
560 524
 }
561 525
 
... ...
@@ -577,68 +581,25 @@ func joinCpuset(c *configs.Cgroup, pid int) error {
577 577
 // expects device path instead of major minor numbers, which is also confusing
578 578
 // for users. So we use fs work around for now.
579 579
 func joinBlkio(c *configs.Cgroup, pid int) error {
580
-	path, err := getSubsystemPath(c, "blkio")
580
+	_, err := join(c, "blkio", pid)
581 581
 	if err != nil {
582 582
 		return err
583 583
 	}
584
-	// systemd doesn't directly support this in the dbus properties
585
-	if c.Resources.BlkioLeafWeight != 0 {
586
-		if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(c.Resources.BlkioLeafWeight), 10)); err != nil {
587
-			return err
588
-		}
589
-	}
590
-	for _, wd := range c.Resources.BlkioWeightDevice {
591
-		if err := writeFile(path, "blkio.weight_device", wd.WeightString()); err != nil {
592
-			return err
593
-		}
594
-		if err := writeFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
595
-			return err
596
-		}
597
-	}
598
-	for _, td := range c.Resources.BlkioThrottleReadBpsDevice {
599
-		if err := writeFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
600
-			return err
601
-		}
602
-	}
603
-	for _, td := range c.Resources.BlkioThrottleWriteBpsDevice {
604
-		if err := writeFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
605
-			return err
606
-		}
607
-	}
608
-	for _, td := range c.Resources.BlkioThrottleReadIOPSDevice {
609
-		if err := writeFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
610
-			return err
611
-		}
612
-	}
613
-	for _, td := range c.Resources.BlkioThrottleWriteIOPSDevice {
614
-		if err := writeFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
615
-			return err
616
-		}
617
-	}
618
-
619 584
 	return nil
620 585
 }
621 586
 
622 587
 func joinHugetlb(c *configs.Cgroup, pid int) error {
623
-	path, err := join(c, "hugetlb", pid)
588
+	_, err := join(c, "hugetlb", pid)
624 589
 	if err != nil && !cgroups.IsNotFound(err) {
625 590
 		return err
626 591
 	}
627
-	hugetlb, err := subsystems.Get("hugetlb")
628
-	if err != nil {
629
-		return err
630
-	}
631
-	return hugetlb.Set(path, c)
592
+	return nil
632 593
 }
633 594
 
634 595
 func joinPerfEvent(c *configs.Cgroup, pid int) error {
635
-	path, err := join(c, "perf_event", pid)
596
+	_, err := join(c, "perf_event", pid)
636 597
 	if err != nil && !cgroups.IsNotFound(err) {
637 598
 		return err
638 599
 	}
639
-	perfEvent, err := subsystems.Get("perf_event")
640
-	if err != nil {
641
-		return err
642
-	}
643
-	return perfEvent.Set(path, c)
600
+	return nil
644 601
 }
... ...
@@ -5,6 +5,7 @@ package cgroups
5 5
 import (
6 6
 	"bufio"
7 7
 	"fmt"
8
+	"io"
8 9
 	"io/ioutil"
9 10
 	"os"
10 11
 	"path/filepath"
... ...
@@ -12,7 +13,6 @@ import (
12 12
 	"strings"
13 13
 	"time"
14 14
 
15
-	"github.com/docker/docker/pkg/mount"
16 15
 	"github.com/docker/go-units"
17 16
 )
18 17
 
... ...
@@ -84,10 +84,19 @@ func FindCgroupMountpointDir() (string, error) {
84 84
 		// Safe as mountinfo encodes mountpoints with spaces as \040.
85 85
 		index := strings.Index(text, " - ")
86 86
 		postSeparatorFields := strings.Fields(text[index+3:])
87
-		if len(postSeparatorFields) < 3 {
88
-			return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
87
+		numPostFields := len(postSeparatorFields)
88
+
89
+		// This is an error as we can't detect if the mount is for "cgroup"
90
+		if numPostFields == 0 {
91
+			return "", fmt.Errorf("Found no fields post '-' in %q", text)
89 92
 		}
93
+
90 94
 		if postSeparatorFields[0] == "cgroup" {
95
+			// Check that the mount is properly formatted.
96
+			if numPostFields < 3 {
97
+				return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
98
+			}
99
+
91 100
 			return filepath.Dir(fields[4]), nil
92 101
 		}
93 102
 	}
... ...
@@ -112,11 +121,45 @@ func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) {
112 112
 	return getControllerPath(m.Subsystems[0], cgroups)
113 113
 }
114 114
 
115
+func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) {
116
+	res := make([]Mount, 0, len(ss))
117
+	scanner := bufio.NewScanner(mi)
118
+	for scanner.Scan() {
119
+		txt := scanner.Text()
120
+		sepIdx := strings.IndexByte(txt, '-')
121
+		if sepIdx == -1 {
122
+			return nil, fmt.Errorf("invalid mountinfo format")
123
+		}
124
+		if txt[sepIdx+2:sepIdx+8] != "cgroup" {
125
+			continue
126
+		}
127
+		fields := strings.Split(txt, " ")
128
+		m := Mount{
129
+			Mountpoint: fields[4],
130
+			Root:       fields[3],
131
+		}
132
+		for _, opt := range strings.Split(fields[len(fields)-1], ",") {
133
+			if strings.HasPrefix(opt, cgroupNamePrefix) {
134
+				m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
135
+			}
136
+			if ss[opt] {
137
+				m.Subsystems = append(m.Subsystems, opt)
138
+			}
139
+		}
140
+		res = append(res, m)
141
+	}
142
+	if err := scanner.Err(); err != nil {
143
+		return nil, err
144
+	}
145
+	return res, nil
146
+}
147
+
115 148
 func GetCgroupMounts() ([]Mount, error) {
116
-	mounts, err := mount.GetMounts()
149
+	f, err := os.Open("/proc/self/mountinfo")
117 150
 	if err != nil {
118 151
 		return nil, err
119 152
 	}
153
+	defer f.Close()
120 154
 
121 155
 	all, err := GetAllSubsystems()
122 156
 	if err != nil {
... ...
@@ -127,24 +170,7 @@ func GetCgroupMounts() ([]Mount, error) {
127 127
 	for _, s := range all {
128 128
 		allMap[s] = true
129 129
 	}
130
-
131
-	res := []Mount{}
132
-	for _, mount := range mounts {
133
-		if mount.Fstype == "cgroup" {
134
-			m := Mount{Mountpoint: mount.Mountpoint, Root: mount.Root}
135
-
136
-			for _, opt := range strings.Split(mount.VfsOpts, ",") {
137
-				if strings.HasPrefix(opt, cgroupNamePrefix) {
138
-					m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
139
-				}
140
-				if allMap[opt] {
141
-					m.Subsystems = append(m.Subsystems, opt)
142
-				}
143
-			}
144
-			res = append(res, m)
145
-		}
146
-	}
147
-	return res, nil
130
+	return getCgroupMountsHelper(allMap, f)
148 131
 }
149 132
 
150 133
 // Returns all the cgroup subsystems supported by the kernel
... ...
@@ -323,9 +349,14 @@ func GetHugePageSize() ([]string, error) {
323 323
 	return pageSizes, nil
324 324
 }
325 325
 
326
-// GetPids returns all pids, that were added to cgroup at path and to all its
327
-// subcgroups.
326
+// GetPids returns all pids, that were added to cgroup at path.
328 327
 func GetPids(path string) ([]int, error) {
328
+	return readProcsFile(path)
329
+}
330
+
331
+// GetAllPids returns all pids, that were added to cgroup at path and to all its
332
+// subcgroups.
333
+func GetAllPids(path string) ([]int, error) {
329 334
 	var pids []int
330 335
 	// collect pids from all sub-cgroups
331 336
 	err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error {
... ...
@@ -11,25 +11,38 @@ const (
11 11
 )
12 12
 
13 13
 type Cgroup struct {
14
-	Name string `json:"name"`
14
+	// Deprecated, use Path instead
15
+	Name string `json:"name,omitempty"`
15 16
 
16
-	// name of parent cgroup or slice
17
-	Parent string `json:"parent"`
17
+	// name of parent of cgroup or slice
18
+	// Deprecated, use Path instead
19
+	Parent string `json:"parent,omitempty"`
20
+
21
+	// Path specifies the path to cgroups that are created and/or joined by the container.
22
+	// The path is assumed to be relative to the host system cgroup mountpoint.
23
+	Path string `json:"path"`
18 24
 
19 25
 	// ScopePrefix decribes prefix for the scope name
20 26
 	ScopePrefix string `json:"scope_prefix"`
21 27
 
28
+	// Paths represent the absolute cgroups paths to join.
29
+	// This takes precedence over Path.
30
+	Paths map[string]string
31
+
22 32
 	// Resources contains various cgroups settings to apply
23 33
 	*Resources
24 34
 }
25 35
 
26 36
 type Resources struct {
27 37
 	// If this is true allow access to any kind of device within the container.  If false, allow access only to devices explicitly listed in the allowed_devices list.
28
-	AllowAllDevices bool `json:"allow_all_devices"`
38
+	// Deprecated
39
+	AllowAllDevices bool `json:"allow_all_devices,omitempty"`
40
+	// Deprecated
41
+	AllowedDevices []*Device `json:"allowed_devices,omitempty"`
42
+	// Deprecated
43
+	DeniedDevices []*Device `json:"denied_devices,omitempty"`
29 44
 
30
-	AllowedDevices []*Device `json:"allowed_devices"`
31
-
32
-	DeniedDevices []*Device `json:"denied_devices"`
45
+	Devices []*Device `json:"devices"`
33 46
 
34 47
 	// Memory limit (in bytes)
35 48
 	Memory int64 `json:"memory"`
... ...
@@ -37,7 +50,7 @@ type Resources struct {
37 37
 	// Memory reservation or soft_limit (in bytes)
38 38
 	MemoryReservation int64 `json:"memory_reservation"`
39 39
 
40
-	// Total memory usage (memory + swap); set `-1' to disable swap
40
+	// Total memory usage (memory + swap); set `-1` to enable unlimited swap
41 41
 	MemorySwap int64 `json:"memory_swap"`
42 42
 
43 43
 	// Kernel memory limit (in bytes)
... ...
@@ -64,6 +77,9 @@ type Resources struct {
64 64
 	// MEM to use
65 65
 	CpusetMems string `json:"cpuset_mems"`
66 66
 
67
+	// Process limit; set <= `0` to disable limit.
68
+	PidsLimit int64 `json:"pids_limit"`
69
+
67 70
 	// Specifies per cgroup weight, range is from 10 to 1000.
68 71
 	BlkioWeight uint16 `json:"blkio_weight"`
69 72
 
... ...
@@ -171,6 +171,9 @@ type Config struct {
171 171
 	// A default action to be taken if no rules match is also given.
172 172
 	Seccomp *Seccomp `json:"seccomp"`
173 173
 
174
+	// NoNewPrivileges controls whether processes in the container can gain additional privileges.
175
+	NoNewPrivileges bool `json:"no_new_privileges"`
176
+
174 177
 	// Hooks are a collection of actions to perform at various container lifecycle events.
175 178
 	// Hooks are not able to be marshaled to json but they are also not needed to.
176 179
 	Hooks *Hooks `json:"-"`
... ...
@@ -35,6 +35,9 @@ type Device struct {
35 35
 
36 36
 	// Gid of the device.
37 37
 	Gid uint32 `json:"gid"`
38
+
39
+	// Write the file to the allowed list
40
+	Allow bool `json:"allow"`
38 41
 }
39 42
 
40 43
 func (d *Device) CgroupString() string {
... ...
@@ -82,20 +82,6 @@ var (
82 82
 			Minor:       1,
83 83
 			Permissions: "rwm",
84 84
 		},
85
-		{
86
-			Path:        "/dev/tty0",
87
-			Type:        'c',
88
-			Major:       4,
89
-			Minor:       0,
90
-			Permissions: "rwm",
91
-		},
92
-		{
93
-			Path:        "/dev/tty1",
94
-			Type:        'c',
95
-			Major:       4,
96
-			Minor:       1,
97
-			Permissions: "rwm",
98
-		},
99 85
 		// /dev/pts/ - pts namespaces are "coming soon"
100 86
 		{
101 87
 			Path:        "",
... ...
@@ -6,6 +6,7 @@ package libcontainer
6 6
 
7 7
 import (
8 8
 	"os"
9
+	"time"
9 10
 
10 11
 	"github.com/opencontainers/runc/libcontainer/configs"
11 12
 )
... ...
@@ -14,8 +15,11 @@ import (
14 14
 type Status int
15 15
 
16 16
 const (
17
+	// The container exists but has not been run yet
18
+	Created Status = iota
19
+
17 20
 	// The container exists and is running.
18
-	Running Status = iota + 1
21
+	Running
19 22
 
20 23
 	// The container exists, it is in the process of being paused.
21 24
 	Pausing
... ...
@@ -30,6 +34,25 @@ const (
30 30
 	Destroyed
31 31
 )
32 32
 
33
+func (s Status) String() string {
34
+	switch s {
35
+	case Created:
36
+		return "created"
37
+	case Running:
38
+		return "running"
39
+	case Pausing:
40
+		return "pausing"
41
+	case Paused:
42
+		return "paused"
43
+	case Checkpointed:
44
+		return "checkpointed"
45
+	case Destroyed:
46
+		return "destroyed"
47
+	default:
48
+		return "unknown"
49
+	}
50
+}
51
+
33 52
 // BaseState represents the platform agnostic pieces relating to a
34 53
 // running container's state
35 54
 type BaseState struct {
... ...
@@ -39,9 +62,12 @@ type BaseState struct {
39 39
 	// InitProcessPid is the init process id in the parent namespace.
40 40
 	InitProcessPid int `json:"init_process_pid"`
41 41
 
42
-	// InitProcessStartTime is the init process start time.
42
+	// InitProcessStartTime is the init process start time in clock cycles since boot time.
43 43
 	InitProcessStartTime string `json:"init_process_start"`
44 44
 
45
+	// Created is the creation time of the container, in UTC.
46
+	Created time.Time `json:"created"`
47
+
45 48
 	// Config is the container's configuration.
46 49
 	Config configs.Config `json:"config"`
47 50
 }
... ...
@@ -15,6 +15,7 @@ import (
15 15
 	"strings"
16 16
 	"sync"
17 17
 	"syscall"
18
+	"time"
18 19
 
19 20
 	"github.com/Sirupsen/logrus"
20 21
 	"github.com/golang/protobuf/proto"
... ...
@@ -38,6 +39,8 @@ type linuxContainer struct {
38 38
 	criuPath      string
39 39
 	m             sync.Mutex
40 40
 	criuVersion   int
41
+	state         containerState
42
+	created       time.Time
41 43
 }
42 44
 
43 45
 // State represents a running container's state
... ...
@@ -104,6 +107,12 @@ type Container interface {
104 104
 	// errors:
105 105
 	// Systemerror - System error.
106 106
 	NotifyOOM() (<-chan struct{}, error)
107
+
108
+	// NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
109
+	//
110
+	// errors:
111
+	// Systemerror - System error.
112
+	NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
107 113
 }
108 114
 
109 115
 // ID returns the container's unique ID
... ...
@@ -129,7 +138,7 @@ func (c *linuxContainer) State() (*State, error) {
129 129
 }
130 130
 
131 131
 func (c *linuxContainer) Processes() ([]int, error) {
132
-	pids, err := c.cgroupManager.GetPids()
132
+	pids, err := c.cgroupManager.GetAllPids()
133 133
 	if err != nil {
134 134
 		return nil, newSystemError(err)
135 135
 	}
... ...
@@ -183,22 +192,30 @@ func (c *linuxContainer) Start(process *Process) error {
183 183
 		}
184 184
 		return newSystemError(err)
185 185
 	}
186
+	// generate a timestamp indicating when the container was started
187
+	c.created = time.Now().UTC()
188
+
189
+	c.state = &runningState{
190
+		c: c,
191
+	}
186 192
 	if doInit {
187
-		c.updateState(parent)
188
-	}
189
-	if c.config.Hooks != nil {
190
-		s := configs.HookState{
191
-			Version: c.config.Version,
192
-			ID:      c.id,
193
-			Pid:     parent.pid(),
194
-			Root:    c.config.Rootfs,
193
+		if err := c.updateState(parent); err != nil {
194
+			return err
195 195
 		}
196
-		for _, hook := range c.config.Hooks.Poststart {
197
-			if err := hook.Run(s); err != nil {
198
-				if err := parent.terminate(); err != nil {
199
-					logrus.Warn(err)
196
+		if c.config.Hooks != nil {
197
+			s := configs.HookState{
198
+				Version: c.config.Version,
199
+				ID:      c.id,
200
+				Pid:     parent.pid(),
201
+				Root:    c.config.Rootfs,
202
+			}
203
+			for _, hook := range c.config.Hooks.Poststart {
204
+				if err := hook.Run(s); err != nil {
205
+					if err := parent.terminate(); err != nil {
206
+						logrus.Warn(err)
207
+					}
208
+					return newSystemError(err)
200 209
 				}
201
-				return newSystemError(err)
202 210
 			}
203 211
 		}
204 212
 	}
... ...
@@ -251,7 +268,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
251 251
 }
252 252
 
253 253
 func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
254
-	t := "_LIBCONTAINER_INITTYPE=standard"
254
+	t := "_LIBCONTAINER_INITTYPE=" + string(initStandard)
255 255
 	cloneFlags := c.config.Namespaces.CloneFlags()
256 256
 	if cloneFlags&syscall.CLONE_NEWUSER != 0 {
257 257
 		if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil {
... ...
@@ -278,7 +295,7 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
278 278
 }
279 279
 
280 280
 func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
281
-	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE=setns")
281
+	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
282 282
 	// for setns process, we dont have to set cloneflags as the process namespaces
283 283
 	// will only be set via setns syscall
284 284
 	data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath)
... ...
@@ -321,54 +338,53 @@ func newPipe() (parent *os.File, child *os.File, err error) {
321 321
 func (c *linuxContainer) Destroy() error {
322 322
 	c.m.Lock()
323 323
 	defer c.m.Unlock()
324
+	return c.state.destroy()
325
+}
326
+
327
+func (c *linuxContainer) Pause() error {
328
+	c.m.Lock()
329
+	defer c.m.Unlock()
324 330
 	status, err := c.currentStatus()
325 331
 	if err != nil {
326 332
 		return err
327 333
 	}
328
-	if status != Destroyed {
329
-		return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
334
+	if status != Running {
335
+		return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
330 336
 	}
331
-	if !c.config.Namespaces.Contains(configs.NEWPID) {
332
-		if err := killCgroupProcesses(c.cgroupManager); err != nil {
333
-			logrus.Warn(err)
334
-		}
335
-	}
336
-	err = c.cgroupManager.Destroy()
337
-	if rerr := os.RemoveAll(c.root); err == nil {
338
-		err = rerr
339
-	}
340
-	c.initProcess = nil
341
-	if c.config.Hooks != nil {
342
-		s := configs.HookState{
343
-			Version: c.config.Version,
344
-			ID:      c.id,
345
-			Root:    c.config.Rootfs,
346
-		}
347
-		for _, hook := range c.config.Hooks.Poststop {
348
-			if err := hook.Run(s); err != nil {
349
-				return err
350
-			}
351
-		}
337
+	if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
338
+		return err
352 339
 	}
353
-	return err
354
-}
355
-
356
-func (c *linuxContainer) Pause() error {
357
-	c.m.Lock()
358
-	defer c.m.Unlock()
359
-	return c.cgroupManager.Freeze(configs.Frozen)
340
+	return c.state.transition(&pausedState{
341
+		c: c,
342
+	})
360 343
 }
361 344
 
362 345
 func (c *linuxContainer) Resume() error {
363 346
 	c.m.Lock()
364 347
 	defer c.m.Unlock()
365
-	return c.cgroupManager.Freeze(configs.Thawed)
348
+	status, err := c.currentStatus()
349
+	if err != nil {
350
+		return err
351
+	}
352
+	if status != Paused {
353
+		return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
354
+	}
355
+	if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
356
+		return err
357
+	}
358
+	return c.state.transition(&runningState{
359
+		c: c,
360
+	})
366 361
 }
367 362
 
368 363
 func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
369 364
 	return notifyOnOOM(c.cgroupManager.GetPaths())
370 365
 }
371 366
 
367
+func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
368
+	return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
369
+}
370
+
372 371
 // XXX debug support, remove when debugging done.
373 372
 func addArgsFromEnv(evar string, args *[]string) {
374 373
 	if e := os.Getenv(evar); e != "" {
... ...
@@ -460,7 +476,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
460 460
 	}
461 461
 
462 462
 	if criuOpts.ImagesDirectory == "" {
463
-		criuOpts.ImagesDirectory = filepath.Join(c.root, "criu.image")
463
+		return fmt.Errorf("invalid directory to save checkpoint")
464 464
 	}
465 465
 
466 466
 	// Since a container can be C/R'ed multiple times,
... ...
@@ -579,11 +595,9 @@ func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mo
579 579
 func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
580 580
 	c.m.Lock()
581 581
 	defer c.m.Unlock()
582
-
583 582
 	if err := c.checkCriuVersion("1.5.2"); err != nil {
584 583
 		return err
585 584
 	}
586
-
587 585
 	if criuOpts.WorkDirectory == "" {
588 586
 		criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
589 587
 	}
... ...
@@ -592,22 +606,19 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
592 592
 	if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) {
593 593
 		return err
594 594
 	}
595
-
596 595
 	workDir, err := os.Open(criuOpts.WorkDirectory)
597 596
 	if err != nil {
598 597
 		return err
599 598
 	}
600 599
 	defer workDir.Close()
601
-
602 600
 	if criuOpts.ImagesDirectory == "" {
603
-		criuOpts.ImagesDirectory = filepath.Join(c.root, "criu.image")
601
+		return fmt.Errorf("invalid directory to restore checkpoint")
604 602
 	}
605 603
 	imageDir, err := os.Open(criuOpts.ImagesDirectory)
606 604
 	if err != nil {
607 605
 		return err
608 606
 	}
609 607
 	defer imageDir.Close()
610
-
611 608
 	// CRIU has a few requirements for a root directory:
612 609
 	// * it must be a mount point
613 610
 	// * its parent must not be overmounted
... ...
@@ -618,18 +629,15 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
618 618
 		return err
619 619
 	}
620 620
 	defer os.Remove(root)
621
-
622 621
 	root, err = filepath.EvalSymlinks(root)
623 622
 	if err != nil {
624 623
 		return err
625 624
 	}
626
-
627 625
 	err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "")
628 626
 	if err != nil {
629 627
 		return err
630 628
 	}
631 629
 	defer syscall.Unmount(root, syscall.MNT_DETACH)
632
-
633 630
 	t := criurpc.CriuReqType_RESTORE
634 631
 	req := &criurpc.CriuReq{
635 632
 		Type: &t,
... ...
@@ -697,15 +705,13 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
697 697
 		fds    []string
698 698
 		fdJSON []byte
699 699
 	)
700
-
701 700
 	if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
702 701
 		return err
703 702
 	}
704 703
 
705
-	if err = json.Unmarshal(fdJSON, &fds); err != nil {
704
+	if err := json.Unmarshal(fdJSON, &fds); err != nil {
706 705
 		return err
707 706
 	}
708
-
709 707
 	for i := range fds {
710 708
 		if s := fds[i]; strings.Contains(s, "pipe:") {
711 709
 			inheritFd := new(criurpc.InheritFd)
... ...
@@ -714,12 +720,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
714 714
 			req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
715 715
 		}
716 716
 	}
717
-
718
-	err = c.criuSwrk(process, req, criuOpts, true)
719
-	if err != nil {
720
-		return err
721
-	}
722
-	return nil
717
+	return c.criuSwrk(process, req, criuOpts, true)
723 718
 }
724 719
 
725 720
 func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
... ...
@@ -914,46 +915,43 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
914 914
 	if notify == nil {
915 915
 		return fmt.Errorf("invalid response: %s", resp.String())
916 916
 	}
917
-
918 917
 	switch {
919 918
 	case notify.GetScript() == "post-dump":
920
-		if !opts.LeaveRunning {
921
-			f, err := os.Create(filepath.Join(c.root, "checkpoint"))
922
-			if err != nil {
923
-				return err
924
-			}
925
-			f.Close()
919
+		f, err := os.Create(filepath.Join(c.root, "checkpoint"))
920
+		if err != nil {
921
+			return err
926 922
 		}
927
-		break
928
-
923
+		f.Close()
929 924
 	case notify.GetScript() == "network-unlock":
930 925
 		if err := unlockNetwork(c.config); err != nil {
931 926
 			return err
932 927
 		}
933
-		break
934
-
935 928
 	case notify.GetScript() == "network-lock":
936 929
 		if err := lockNetwork(c.config); err != nil {
937 930
 			return err
938 931
 		}
939
-		break
940
-
941 932
 	case notify.GetScript() == "post-restore":
942 933
 		pid := notify.GetPid()
943 934
 		r, err := newRestoredProcess(int(pid), fds)
944 935
 		if err != nil {
945 936
 			return err
946 937
 		}
947
-
948
-		// TODO: crosbymichael restore previous process information by saving the init process information in
949
-		// the container's state file or separate process state files.
938
+		process.ops = r
939
+		if err := c.state.transition(&restoredState{
940
+			imageDir: opts.ImagesDirectory,
941
+			c:        c,
942
+		}); err != nil {
943
+			return err
944
+		}
950 945
 		if err := c.updateState(r); err != nil {
951 946
 			return err
952 947
 		}
953
-		process.ops = r
954
-		break
948
+		if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
949
+			if !os.IsNotExist(err) {
950
+				logrus.Error(err)
951
+			}
952
+		}
955 953
 	}
956
-
957 954
 	return nil
958 955
 }
959 956
 
... ...
@@ -963,65 +961,108 @@ func (c *linuxContainer) updateState(process parentProcess) error {
963 963
 	if err != nil {
964 964
 		return err
965 965
 	}
966
+	return c.saveState(state)
967
+}
968
+
969
+func (c *linuxContainer) saveState(s *State) error {
966 970
 	f, err := os.Create(filepath.Join(c.root, stateFilename))
967 971
 	if err != nil {
968 972
 		return err
969 973
 	}
970 974
 	defer f.Close()
971
-	os.Remove(filepath.Join(c.root, "checkpoint"))
972
-	return utils.WriteJSON(f, state)
975
+	return utils.WriteJSON(f, s)
976
+}
977
+
978
+func (c *linuxContainer) deleteState() error {
979
+	return os.Remove(filepath.Join(c.root, stateFilename))
973 980
 }
974 981
 
975 982
 func (c *linuxContainer) currentStatus() (Status, error) {
976
-	if _, err := os.Stat(filepath.Join(c.root, "checkpoint")); err == nil {
977
-		return Checkpointed, nil
983
+	if err := c.refreshState(); err != nil {
984
+		return -1, err
978 985
 	}
986
+	return c.state.status(), nil
987
+}
988
+
989
+// refreshState needs to be called to verify that the current state on the
990
+// container is accurate.  Because consumers of libcontainer can use it
991
+// out of process we need to verify the container's status based on runtime
992
+// information and not rely on our in process info.
993
+func (c *linuxContainer) refreshState() error {
994
+	paused, err := c.isPaused()
995
+	if err != nil {
996
+		return err
997
+	}
998
+	if paused {
999
+		return c.state.transition(&pausedState{c: c})
1000
+	}
1001
+	running, err := c.isRunning()
1002
+	if err != nil {
1003
+		return err
1004
+	}
1005
+	if running {
1006
+		return c.state.transition(&runningState{c: c})
1007
+	}
1008
+	return c.state.transition(&stoppedState{c: c})
1009
+}
1010
+
1011
+func (c *linuxContainer) isRunning() (bool, error) {
979 1012
 	if c.initProcess == nil {
980
-		return Destroyed, nil
1013
+		return false, nil
981 1014
 	}
982 1015
 	// return Running if the init process is alive
983 1016
 	if err := syscall.Kill(c.initProcess.pid(), 0); err != nil {
984 1017
 		if err == syscall.ESRCH {
985
-			return Destroyed, nil
1018
+			return false, nil
986 1019
 		}
987
-		return 0, newSystemError(err)
1020
+		return false, newSystemError(err)
988 1021
 	}
989
-	if c.config.Cgroups != nil && c.config.Cgroups.Resources != nil && c.config.Cgroups.Resources.Freezer == configs.Frozen {
990
-		return Paused, nil
991
-	}
992
-	return Running, nil
1022
+	return true, nil
993 1023
 }
994 1024
 
995
-func (c *linuxContainer) currentState() (*State, error) {
996
-	status, err := c.currentStatus()
1025
+func (c *linuxContainer) isPaused() (bool, error) {
1026
+	data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state"))
997 1027
 	if err != nil {
998
-		return nil, err
999
-	}
1000
-	if status == Destroyed {
1001
-		return nil, newGenericError(fmt.Errorf("container destroyed"), ContainerNotExists)
1028
+		if os.IsNotExist(err) {
1029
+			return false, nil
1030
+		}
1031
+		return false, newSystemError(err)
1002 1032
 	}
1003
-	startTime, err := c.initProcess.startTime()
1004
-	if err != nil {
1005
-		return nil, newSystemError(err)
1033
+	return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
1034
+}
1035
+
1036
+func (c *linuxContainer) currentState() (*State, error) {
1037
+	var (
1038
+		startTime           string
1039
+		externalDescriptors []string
1040
+		pid                 = -1
1041
+	)
1042
+	if c.initProcess != nil {
1043
+		pid = c.initProcess.pid()
1044
+		startTime, _ = c.initProcess.startTime()
1045
+		externalDescriptors = c.initProcess.externalDescriptors()
1006 1046
 	}
1007 1047
 	state := &State{
1008 1048
 		BaseState: BaseState{
1009 1049
 			ID:                   c.ID(),
1010 1050
 			Config:               *c.config,
1011
-			InitProcessPid:       c.initProcess.pid(),
1051
+			InitProcessPid:       pid,
1012 1052
 			InitProcessStartTime: startTime,
1053
+			Created:              c.created,
1013 1054
 		},
1014 1055
 		CgroupPaths:         c.cgroupManager.GetPaths(),
1015 1056
 		NamespacePaths:      make(map[configs.NamespaceType]string),
1016
-		ExternalDescriptors: c.initProcess.externalDescriptors(),
1057
+		ExternalDescriptors: externalDescriptors,
1017 1058
 	}
1018
-	for _, ns := range c.config.Namespaces {
1019
-		state.NamespacePaths[ns.Type] = ns.GetPath(c.initProcess.pid())
1020
-	}
1021
-	for _, nsType := range configs.NamespaceTypes() {
1022
-		if _, ok := state.NamespacePaths[nsType]; !ok {
1023
-			ns := configs.Namespace{Type: nsType}
1024
-			state.NamespacePaths[ns.Type] = ns.GetPath(c.initProcess.pid())
1059
+	if pid > 0 {
1060
+		for _, ns := range c.config.Namespaces {
1061
+			state.NamespacePaths[ns.Type] = ns.GetPath(pid)
1062
+		}
1063
+		for _, nsType := range configs.NamespaceTypes() {
1064
+			if _, ok := state.NamespacePaths[nsType]; !ok {
1065
+				ns := configs.Namespace{Type: nsType}
1066
+				state.NamespacePaths[ns.Type] = ns.GetPath(pid)
1067
+			}
1025 1068
 		}
1026 1069
 	}
1027 1070
 	return state, nil
... ...
@@ -16,9 +16,10 @@ const (
16 16
 	ContainerPaused
17 17
 	ContainerNotStopped
18 18
 	ContainerNotRunning
19
+	ContainerNotPaused
19 20
 
20 21
 	// Process errors
21
-	ProcessNotExecuted
22
+	NoProcessOps
22 23
 
23 24
 	// Common errors
24 25
 	ConfigInvalid
... ...
@@ -46,6 +47,10 @@ func (c ErrorCode) String() string {
46 46
 		return "Container is not running"
47 47
 	case ConsoleExists:
48 48
 		return "Console exists for process"
49
+	case ContainerNotPaused:
50
+		return "Container is not paused"
51
+	case NoProcessOps:
52
+		return "No process operations"
49 53
 	default:
50 54
 		return "Unknown error"
51 55
 	}
... ...
@@ -166,7 +166,7 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
166 166
 	if err := os.MkdirAll(containerRoot, 0700); err != nil {
167 167
 		return nil, newGenericError(err, SystemError)
168 168
 	}
169
-	return &linuxContainer{
169
+	c := &linuxContainer{
170 170
 		id:            id,
171 171
 		root:          containerRoot,
172 172
 		config:        config,
... ...
@@ -174,7 +174,9 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
174 174
 		initArgs:      l.InitArgs,
175 175
 		criuPath:      l.CriuPath,
176 176
 		cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
177
-	}, nil
177
+	}
178
+	c.state = &stoppedState{c: c}
179
+	return c, nil
178 180
 }
179 181
 
180 182
 func (l *LinuxFactory) Load(id string) (Container, error) {
... ...
@@ -191,7 +193,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
191 191
 		processStartTime: state.InitProcessStartTime,
192 192
 		fds:              state.ExternalDescriptors,
193 193
 	}
194
-	return &linuxContainer{
194
+	c := &linuxContainer{
195 195
 		initProcess:   r,
196 196
 		id:            id,
197 197
 		config:        &state.Config,
... ...
@@ -200,7 +202,13 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
200 200
 		criuPath:      l.CriuPath,
201 201
 		cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
202 202
 		root:          containerRoot,
203
-	}, nil
203
+		created:       state.Created,
204
+	}
205
+	c.state = &createdState{c: c, s: Created}
206
+	if err := c.refreshState(); err != nil {
207
+		return nil, err
208
+	}
209
+	return c, nil
204 210
 }
205 211
 
206 212
 func (l *LinuxFactory) Type() string {
... ...
@@ -222,18 +230,25 @@ func (l *LinuxFactory) StartInitialization() (err error) {
222 222
 	// clear the current process's environment to clean any libcontainer
223 223
 	// specific env vars.
224 224
 	os.Clearenv()
225
+	var i initer
225 226
 	defer func() {
226
-		// if we have an error during the initialization of the container's init then send it back to the
227
-		// parent process in the form of an initError.
228
-		if err != nil {
229
-			if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil {
227
+		// We have an error during the initialization of the container's init,
228
+		// send it back to the parent process in the form of an initError.
229
+		// If container's init succeeded, syscall.Exec will not return, hence
230
+		// this defer function will never be called.
231
+		if _, ok := i.(*linuxStandardInit); ok {
232
+			//  Synchronisation only necessary for standard init.
233
+			if err := utils.WriteJSON(pipe, syncT{procError}); err != nil {
230 234
 				panic(err)
231 235
 			}
232 236
 		}
237
+		if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil {
238
+			panic(err)
239
+		}
233 240
 		// ensure that this pipe is always closed
234 241
 		pipe.Close()
235 242
 	}()
236
-	i, err := newContainerInit(it, pipe)
243
+	i, err = newContainerInit(it, pipe)
237 244
 	if err != nil {
238 245
 		return err
239 246
 	}
... ...
@@ -9,6 +9,18 @@ import (
9 9
 	"github.com/opencontainers/runc/libcontainer/stacktrace"
10 10
 )
11 11
 
12
+type syncType uint8
13
+
14
+const (
15
+	procReady syncType = iota
16
+	procError
17
+	procRun
18
+)
19
+
20
+type syncT struct {
21
+	Type syncType `json:"type"`
22
+}
23
+
12 24
 var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
13 25
 Code: {{.ECode}}
14 26
 {{if .Message }}
... ...
@@ -5,6 +5,7 @@ package libcontainer
5 5
 import (
6 6
 	"encoding/json"
7 7
 	"fmt"
8
+	"io"
8 9
 	"io/ioutil"
9 10
 	"net"
10 11
 	"os"
... ...
@@ -73,6 +74,7 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) {
73 73
 		}, nil
74 74
 	case initStandard:
75 75
 		return &linuxStandardInit{
76
+			pipe:      pipe,
76 77
 			parentPid: syscall.Getppid(),
77 78
 			config:    config,
78 79
 		}, nil
... ...
@@ -140,6 +142,27 @@ func finalizeNamespace(config *initConfig) error {
140 140
 	return nil
141 141
 }
142 142
 
143
+// syncParentReady sends to the given pipe a JSON payload which indicates that
144
+// the init is ready to Exec the child process. It then waits for the parent to
145
+// indicate that it is cleared to Exec.
146
+func syncParentReady(pipe io.ReadWriter) error {
147
+	// Tell parent.
148
+	if err := utils.WriteJSON(pipe, syncT{procReady}); err != nil {
149
+		return err
150
+	}
151
+	// Wait for parent to give the all-clear.
152
+	var procSync syncT
153
+	if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
154
+		if err == io.EOF {
155
+			return fmt.Errorf("parent closed synchronisation channel")
156
+		}
157
+		if procSync.Type != procRun {
158
+			return fmt.Errorf("invalid synchronisation flag from parent")
159
+		}
160
+	}
161
+	return nil
162
+}
163
+
143 164
 // joinExistingNamespaces gets all the namespace paths specified for the container and
144 165
 // does a setns on the namespace fd so that the current process joins the namespace.
145 166
 func joinExistingNamespaces(namespaces []configs.Namespace) error {
... ...
@@ -309,7 +332,7 @@ func killCgroupProcesses(m cgroups.Manager) error {
309 309
 	if err := m.Freeze(configs.Frozen); err != nil {
310 310
 		logrus.Warn(err)
311 311
 	}
312
-	pids, err := m.GetPids()
312
+	pids, err := m.GetAllPids()
313 313
 	if err != nil {
314 314
 		m.Freeze(configs.Thawed)
315 315
 		return err
316 316
new file mode 100644
... ...
@@ -0,0 +1,67 @@
0
+// +build linux
1
+
2
+package keyctl
3
+
4
+import (
5
+	"fmt"
6
+	"syscall"
7
+	"strings"
8
+	"strconv"
9
+	"unsafe"
10
+)
11
+
12
+const KEYCTL_JOIN_SESSION_KEYRING = 1
13
+const KEYCTL_SETPERM = 5
14
+const KEYCTL_DESCRIBE = 6
15
+
16
+type KeySerial uint32
17
+
18
+func JoinSessionKeyring(name string) (KeySerial, error) {
19
+	var _name *byte = nil
20
+	var err error
21
+
22
+	if len(name) > 0 {
23
+		_name, err = syscall.BytePtrFromString(name)
24
+		if err != nil {
25
+			return KeySerial(0), err
26
+		}
27
+	}
28
+
29
+	sessKeyId, _, errn := syscall.Syscall(syscall.SYS_KEYCTL, KEYCTL_JOIN_SESSION_KEYRING, uintptr(unsafe.Pointer(_name)), 0)
30
+	if errn != 0 {
31
+		return 0, fmt.Errorf("could not create session key: %v", errn)
32
+	}
33
+	return KeySerial(sessKeyId), nil
34
+}
35
+
36
+// modify permissions on a keyring by reading the current permissions,
37
+// anding the bits with the given mask (clearing permissions) and setting
38
+// additional permission bits
39
+func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
40
+	dest := make([]byte, 1024)
41
+	destBytes := unsafe.Pointer(&dest[0])
42
+
43
+	if _, _, err := syscall.Syscall6(syscall.SYS_KEYCTL, uintptr(KEYCTL_DESCRIBE), uintptr(ringId), uintptr(destBytes), uintptr(len(dest)), 0, 0); err != 0 {
44
+		return err
45
+	}
46
+
47
+	res := strings.Split(string(dest), ";")
48
+	if len(res) < 5 {
49
+		return fmt.Errorf("Destination buffer for key description is too small")
50
+	}
51
+
52
+	// parse permissions
53
+	perm64, err := strconv.ParseUint(res[3], 16, 32)
54
+	if err != nil {
55
+		return err
56
+	}
57
+
58
+	perm := (uint32(perm64) & mask) | setbits
59
+
60
+	if _, _, err := syscall.Syscall(syscall.SYS_KEYCTL, uintptr(KEYCTL_SETPERM), uintptr(ringId), uintptr(perm)); err != 0 {
61
+		return err
62
+	}
63
+
64
+	return nil
65
+}
66
+
... ...
@@ -12,31 +12,32 @@ import (
12 12
 
13 13
 const oomCgroupName = "memory"
14 14
 
15
-// notifyOnOOM returns channel on which you can expect event about OOM,
16
-// if process died without OOM this channel will be closed.
17
-// s is current *libcontainer.State for container.
18
-func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
19
-	dir := paths[oomCgroupName]
20
-	if dir == "" {
21
-		return nil, fmt.Errorf("There is no path for %q in state", oomCgroupName)
22
-	}
23
-	oomControl, err := os.Open(filepath.Join(dir, "memory.oom_control"))
15
+type PressureLevel uint
16
+
17
+const (
18
+	LowPressure PressureLevel = iota
19
+	MediumPressure
20
+	CriticalPressure
21
+)
22
+
23
+func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct{}, error) {
24
+	evFile, err := os.Open(filepath.Join(cgDir, evName))
24 25
 	if err != nil {
25 26
 		return nil, err
26 27
 	}
27 28
 	fd, _, syserr := syscall.RawSyscall(syscall.SYS_EVENTFD2, 0, syscall.FD_CLOEXEC, 0)
28 29
 	if syserr != 0 {
29
-		oomControl.Close()
30
+		evFile.Close()
30 31
 		return nil, syserr
31 32
 	}
32 33
 
33 34
 	eventfd := os.NewFile(fd, "eventfd")
34 35
 
35
-	eventControlPath := filepath.Join(dir, "cgroup.event_control")
36
-	data := fmt.Sprintf("%d %d", eventfd.Fd(), oomControl.Fd())
36
+	eventControlPath := filepath.Join(cgDir, "cgroup.event_control")
37
+	data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg)
37 38
 	if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil {
38 39
 		eventfd.Close()
39
-		oomControl.Close()
40
+		evFile.Close()
40 41
 		return nil, err
41 42
 	}
42 43
 	ch := make(chan struct{})
... ...
@@ -44,7 +45,7 @@ func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
44 44
 		defer func() {
45 45
 			close(ch)
46 46
 			eventfd.Close()
47
-			oomControl.Close()
47
+			evFile.Close()
48 48
 		}()
49 49
 		buf := make([]byte, 8)
50 50
 		for {
... ...
@@ -61,3 +62,28 @@ func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
61 61
 	}()
62 62
 	return ch, nil
63 63
 }
64
+
65
+// notifyOnOOM returns channel on which you can expect event about OOM,
66
+// if process died without OOM this channel will be closed.
67
+func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
68
+	dir := paths[oomCgroupName]
69
+	if dir == "" {
70
+		return nil, fmt.Errorf("path %q missing", oomCgroupName)
71
+	}
72
+
73
+	return registerMemoryEvent(dir, "memory.oom_control", "")
74
+}
75
+
76
+func notifyMemoryPressure(paths map[string]string, level PressureLevel) (<-chan struct{}, error) {
77
+	dir := paths[oomCgroupName]
78
+	if dir == "" {
79
+		return nil, fmt.Errorf("path %q missing", oomCgroupName)
80
+	}
81
+
82
+	if level > CriticalPressure {
83
+		return nil, fmt.Errorf("invalid pressure level %d", level)
84
+	}
85
+
86
+	levelStr := []string{"low", "medium", "critical"}[level]
87
+	return registerMemoryEvent(dir, "memory.pressure_level", levelStr)
88
+}
... ...
@@ -17,6 +17,7 @@
17 17
 #include <sched.h>
18 18
 #include <signal.h>
19 19
 
20
+#include <bits/sockaddr.h>
20 21
 #include <linux/netlink.h>
21 22
 #include <linux/types.h>
22 23
 #include <stdint.h>
... ...
@@ -55,7 +55,7 @@ type Process struct {
55 55
 // Wait releases any resources associated with the Process
56 56
 func (p Process) Wait() (*os.ProcessState, error) {
57 57
 	if p.ops == nil {
58
-		return nil, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
58
+		return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
59 59
 	}
60 60
 	return p.ops.wait()
61 61
 }
... ...
@@ -65,7 +65,7 @@ func (p Process) Pid() (int, error) {
65 65
 	// math.MinInt32 is returned here, because it's invalid value
66 66
 	// for the kill() system call.
67 67
 	if p.ops == nil {
68
-		return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
68
+		return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
69 69
 	}
70 70
 	return p.ops.pid(), nil
71 71
 }
... ...
@@ -73,7 +73,7 @@ func (p Process) Pid() (int, error) {
73 73
 // Signal sends a signal to the Process.
74 74
 func (p Process) Signal(sig os.Signal) error {
75 75
 	if p.ops == nil {
76
-		return newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
76
+		return newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
77 77
 	}
78 78
 	return p.ops.signal(sig)
79 79
 }
... ...
@@ -5,6 +5,7 @@ package libcontainer
5 5
 import (
6 6
 	"encoding/json"
7 7
 	"errors"
8
+	"fmt"
8 9
 	"io"
9 10
 	"os"
10 11
 	"os/exec"
... ...
@@ -87,6 +88,7 @@ func (p *setnsProcess) start() (err error) {
87 87
 	if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
88 88
 		return newSystemError(err)
89 89
 	}
90
+
90 91
 	if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
91 92
 		return newSystemError(err)
92 93
 	}
... ...
@@ -96,6 +98,7 @@ func (p *setnsProcess) start() (err error) {
96 96
 	if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
97 97
 		return newSystemError(err)
98 98
 	}
99
+	// Must be done after Shutdown so the child will exit and we can wait for it.
99 100
 	if ierr != nil {
100 101
 		p.wait()
101 102
 		return newSystemError(ierr)
... ...
@@ -199,7 +202,6 @@ func (p *initProcess) start() (err error) {
199 199
 		return newSystemError(err)
200 200
 	}
201 201
 	p.setExternalDescriptors(fds)
202
-
203 202
 	// Do this before syncing with child so that no children
204 203
 	// can escape the cgroup
205 204
 	if err := p.manager.Apply(p.pid()); err != nil {
... ...
@@ -230,13 +232,54 @@ func (p *initProcess) start() (err error) {
230 230
 	if err := p.sendConfig(); err != nil {
231 231
 		return newSystemError(err)
232 232
 	}
233
-	// wait for the child process to fully complete and receive an error message
234
-	// if one was encoutered
235
-	var ierr *genericError
236
-	if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
233
+	var (
234
+		procSync syncT
235
+		sentRun  bool
236
+		ierr     *genericError
237
+	)
238
+
239
+loop:
240
+	for {
241
+		if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil {
242
+			if err == io.EOF {
243
+				break loop
244
+			}
245
+			return newSystemError(err)
246
+		}
247
+		switch procSync.Type {
248
+		case procReady:
249
+			if err := p.manager.Set(p.config.Config); err != nil {
250
+				return newSystemError(err)
251
+			}
252
+			// Sync with child.
253
+			if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
254
+				return newSystemError(err)
255
+			}
256
+			sentRun = true
257
+		case procError:
258
+			// wait for the child process to fully complete and receive an error message
259
+			// if one was encountered
260
+			if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
261
+				return newSystemError(err)
262
+			}
263
+			if ierr != nil {
264
+				break loop
265
+			}
266
+			// Programmer error.
267
+			panic("No error following JSON procError payload.")
268
+		default:
269
+			return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child"))
270
+		}
271
+	}
272
+	if !sentRun {
273
+		return newSystemError(fmt.Errorf("could not synchronise with container process"))
274
+	}
275
+	if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
237 276
 		return newSystemError(err)
238 277
 	}
278
+	// Must be done after Shutdown so the child will exit and we can wait for it.
239 279
 	if ierr != nil {
280
+		p.wait()
240 281
 		return newSystemError(ierr)
241 282
 	}
242 283
 	return nil
... ...
@@ -270,12 +313,10 @@ func (p *initProcess) startTime() (string, error) {
270 270
 }
271 271
 
272 272
 func (p *initProcess) sendConfig() error {
273
-	// send the state to the container's init process then shutdown writes for the parent
274
-	if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
275
-		return err
276
-	}
277
-	// shutdown writes for the parent side of the pipe
278
-	return syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR)
273
+	// send the config to the container's init process, we don't use JSON Encode
274
+	// here because there might be a problem in JSON decoder in some cases, see:
275
+	// https://github.com/docker/docker/issues/14203#issuecomment-174177790
276
+	return utils.WriteJSON(p.parentPipe, p.config)
279 277
 }
280 278
 
281 279
 func (p *initProcess) createNetworkInterfaces() error {
... ...
@@ -18,6 +18,8 @@ import (
18 18
 	"github.com/opencontainers/runc/libcontainer/cgroups"
19 19
 	"github.com/opencontainers/runc/libcontainer/configs"
20 20
 	"github.com/opencontainers/runc/libcontainer/label"
21
+	"github.com/opencontainers/runc/libcontainer/system"
22
+	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
21 23
 )
22 24
 
23 25
 const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
... ...
@@ -293,12 +295,31 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
293 293
 // checkMountDestination checks to ensure that the mount destination is not over the top of /proc.
294 294
 // dest is required to be an abs path and have any symlinks resolved before calling this function.
295 295
 func checkMountDestination(rootfs, dest string) error {
296
-	if filepath.Clean(rootfs) == filepath.Clean(dest) {
296
+	if libcontainerUtils.CleanPath(rootfs) == libcontainerUtils.CleanPath(dest) {
297 297
 		return fmt.Errorf("mounting into / is prohibited")
298 298
 	}
299 299
 	invalidDestinations := []string{
300 300
 		"/proc",
301 301
 	}
302
+	// White list, it should be sub directories of invalid destinations
303
+	validDestinations := []string{
304
+		// These entries can be bind mounted by files emulated by fuse,
305
+		// so commands like top and free display stats in the container.
306
+		"/proc/cpuinfo",
307
+		"/proc/diskstats",
308
+		"/proc/meminfo",
309
+		"/proc/stat",
310
+		"/proc/net/dev",
311
+	}
312
+	for _, valid := range validDestinations {
313
+		path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
314
+		if err != nil {
315
+			return err
316
+		}
317
+		if path == "." {
318
+			return nil
319
+		}
320
+	}
302 321
 	for _, invalid := range invalidDestinations {
303 322
 		path, err := filepath.Rel(filepath.Join(rootfs, invalid), dest)
304 323
 		if err != nil {
... ...
@@ -321,7 +342,7 @@ func setupDevSymlinks(rootfs string) error {
321 321
 	// kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink
322 322
 	// in /dev if it exists in /proc.
323 323
 	if _, err := os.Stat("/proc/kcore"); err == nil {
324
-		links = append(links, [2]string{"/proc/kcore", "/dev/kcore"})
324
+		links = append(links, [2]string{"/proc/kcore", "/dev/core"})
325 325
 	}
326 326
 	for _, link := range links {
327 327
 		var (
... ...
@@ -365,11 +386,12 @@ func reOpenDevNull() error {
365 365
 
366 366
 // Create the device nodes in the container.
367 367
 func createDevices(config *configs.Config) error {
368
+	useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER)
368 369
 	oldMask := syscall.Umask(0000)
369 370
 	for _, node := range config.Devices {
370 371
 		// containers running in a user namespace are not allowed to mknod
371 372
 		// devices so we can just bind mount it from the host.
372
-		if err := createDeviceNode(config.Rootfs, node, config.Namespaces.Contains(configs.NEWUSER)); err != nil {
373
+		if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil {
373 374
 			syscall.Umask(oldMask)
374 375
 			return err
375 376
 		}
... ...
@@ -231,10 +231,14 @@ func ReserveLabel(scon string) {
231 231
 	}
232 232
 }
233 233
 
234
+func selinuxEnforcePath() string {
235
+	return fmt.Sprintf("%s/enforce", selinuxPath)
236
+}
237
+
234 238
 func SelinuxGetEnforce() int {
235 239
 	var enforce int
236 240
 
237
-	enforceS, err := readCon(fmt.Sprintf("%s/enforce", selinuxPath))
241
+	enforceS, err := readCon(selinuxEnforcePath())
238 242
 	if err != nil {
239 243
 		return -1
240 244
 	}
... ...
@@ -246,6 +250,10 @@ func SelinuxGetEnforce() int {
246 246
 	return enforce
247 247
 }
248 248
 
249
+func SelinuxSetEnforce(mode int) error {
250
+	return writeCon(selinuxEnforcePath(), fmt.Sprintf("%d", mode))
251
+}
252
+
249 253
 func SelinuxGetEnforceMode() int {
250 254
 	switch readConfig(selinuxTag) {
251 255
 	case "enforcing":
... ...
@@ -6,6 +6,7 @@ import (
6 6
 	"os"
7 7
 
8 8
 	"github.com/opencontainers/runc/libcontainer/apparmor"
9
+	"github.com/opencontainers/runc/libcontainer/keys"
9 10
 	"github.com/opencontainers/runc/libcontainer/label"
10 11
 	"github.com/opencontainers/runc/libcontainer/seccomp"
11 12
 	"github.com/opencontainers/runc/libcontainer/system"
... ...
@@ -18,12 +19,21 @@ type linuxSetnsInit struct {
18 18
 }
19 19
 
20 20
 func (l *linuxSetnsInit) Init() error {
21
+	// do not inherit the parent's session keyring
22
+	if _, err := keyctl.JoinSessionKeyring("_ses"); err != nil {
23
+		return err
24
+	}
21 25
 	if err := setupRlimits(l.config.Config); err != nil {
22 26
 		return err
23 27
 	}
24 28
 	if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
25 29
 		return err
26 30
 	}
31
+	if l.config.Config.NoNewPrivileges {
32
+		if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
33
+			return err
34
+		}
35
+	}
27 36
 	if l.config.Config.Seccomp != nil {
28 37
 		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
29 38
 			return err
... ...
@@ -3,22 +3,41 @@
3 3
 package libcontainer
4 4
 
5 5
 import (
6
+	"io"
6 7
 	"os"
7 8
 	"syscall"
8 9
 
9 10
 	"github.com/opencontainers/runc/libcontainer/apparmor"
10 11
 	"github.com/opencontainers/runc/libcontainer/configs"
12
+	"github.com/opencontainers/runc/libcontainer/keys"
11 13
 	"github.com/opencontainers/runc/libcontainer/label"
12 14
 	"github.com/opencontainers/runc/libcontainer/seccomp"
13 15
 	"github.com/opencontainers/runc/libcontainer/system"
14 16
 )
15 17
 
16 18
 type linuxStandardInit struct {
19
+	pipe      io.ReadWriter
17 20
 	parentPid int
18 21
 	config    *initConfig
19 22
 }
20 23
 
24
+// PR_SET_NO_NEW_PRIVS isn't exposed in Golang so we define it ourselves copying the value
25
+// from the kernel
26
+const PR_SET_NO_NEW_PRIVS = 0x26
27
+
21 28
 func (l *linuxStandardInit) Init() error {
29
+	// do not inherit the parent's session keyring
30
+	sessKeyId, err := keyctl.JoinSessionKeyring("")
31
+	if err != nil {
32
+		return err
33
+	}
34
+	// make session keyring searchable
35
+	// without user ns we need 'UID' search permissions
36
+	// with user ns we need 'other' search permissions
37
+	if err := keyctl.ModKeyringPerm(sessKeyId, 0xffffffff, 0x080008); err != nil {
38
+		return err
39
+	}
40
+
22 41
 	// join any namespaces via a path to the namespace fd if provided
23 42
 	if err := joinExistingNamespaces(l.config.Config.Namespaces); err != nil {
24 43
 		return err
... ...
@@ -50,7 +69,6 @@ func (l *linuxStandardInit) Init() error {
50 50
 	if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
51 51
 		return err
52 52
 	}
53
-
54 53
 	label.Init()
55 54
 	// InitializeMountNamespace() can be executed only for a new mount namespace
56 55
 	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
... ...
@@ -75,7 +93,6 @@ func (l *linuxStandardInit) Init() error {
75 75
 			return err
76 76
 		}
77 77
 	}
78
-
79 78
 	for _, path := range l.config.Config.ReadonlyPaths {
80 79
 		if err := remountReadonly(path); err != nil {
81 80
 			return err
... ...
@@ -90,6 +107,17 @@ func (l *linuxStandardInit) Init() error {
90 90
 	if err != nil {
91 91
 		return err
92 92
 	}
93
+	if l.config.Config.NoNewPrivileges {
94
+		if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
95
+			return err
96
+		}
97
+	}
98
+	// Tell our parent that we're ready to Execv. This must be done before the
99
+	// Seccomp rules have been applied, because we need to be able to read and
100
+	// write to a socket.
101
+	if err := syncParentReady(l.pipe); err != nil {
102
+		return err
103
+	}
93 104
 	if l.config.Config.Seccomp != nil {
94 105
 		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
95 106
 			return err
... ...
@@ -109,5 +137,6 @@ func (l *linuxStandardInit) Init() error {
109 109
 	if syscall.Getppid() != l.parentPid {
110 110
 		return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
111 111
 	}
112
+
112 113
 	return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
113 114
 }
114 115
new file mode 100644
... ...
@@ -0,0 +1,226 @@
0
+// +build linux
1
+
2
+package libcontainer
3
+
4
+import (
5
+	"fmt"
6
+	"os"
7
+	"path/filepath"
8
+
9
+	"github.com/Sirupsen/logrus"
10
+	"github.com/opencontainers/runc/libcontainer/configs"
11
+)
12
+
13
+func newStateTransitionError(from, to containerState) error {
14
+	return &stateTransitionError{
15
+		From: from.status().String(),
16
+		To:   to.status().String(),
17
+	}
18
+}
19
+
20
+// stateTransitionError is returned when an invalid state transition happens from one
21
+// state to another.
22
+type stateTransitionError struct {
23
+	From string
24
+	To   string
25
+}
26
+
27
+func (s *stateTransitionError) Error() string {
28
+	return fmt.Sprintf("invalid state transition from %s to %s", s.From, s.To)
29
+}
30
+
31
+type containerState interface {
32
+	transition(containerState) error
33
+	destroy() error
34
+	status() Status
35
+}
36
+
37
+func destroy(c *linuxContainer) error {
38
+	if !c.config.Namespaces.Contains(configs.NEWPID) {
39
+		if err := killCgroupProcesses(c.cgroupManager); err != nil {
40
+			logrus.Warn(err)
41
+		}
42
+	}
43
+	err := c.cgroupManager.Destroy()
44
+	if rerr := os.RemoveAll(c.root); err == nil {
45
+		err = rerr
46
+	}
47
+	c.initProcess = nil
48
+	if herr := runPoststopHooks(c); err == nil {
49
+		err = herr
50
+	}
51
+	c.state = &stoppedState{c: c}
52
+	return err
53
+}
54
+
55
+func runPoststopHooks(c *linuxContainer) error {
56
+	if c.config.Hooks != nil {
57
+		s := configs.HookState{
58
+			Version: c.config.Version,
59
+			ID:      c.id,
60
+			Root:    c.config.Rootfs,
61
+		}
62
+		for _, hook := range c.config.Hooks.Poststop {
63
+			if err := hook.Run(s); err != nil {
64
+				return err
65
+			}
66
+		}
67
+	}
68
+	return nil
69
+}
70
+
71
+// stoppedState represents a container in a stopped/destroyed state.
72
+type stoppedState struct {
73
+	c *linuxContainer
74
+}
75
+
76
+func (b *stoppedState) status() Status {
77
+	return Destroyed
78
+}
79
+
80
+func (b *stoppedState) transition(s containerState) error {
81
+	switch s.(type) {
82
+	case *runningState:
83
+		b.c.state = s
84
+		return nil
85
+	case *restoredState:
86
+		b.c.state = s
87
+		return nil
88
+	case *stoppedState:
89
+		return nil
90
+	}
91
+	return newStateTransitionError(b, s)
92
+}
93
+
94
+func (b *stoppedState) destroy() error {
95
+	return destroy(b.c)
96
+}
97
+
98
+// runningState represents a container that is currently running.
99
+type runningState struct {
100
+	c *linuxContainer
101
+}
102
+
103
+func (r *runningState) status() Status {
104
+	return Running
105
+}
106
+
107
+func (r *runningState) transition(s containerState) error {
108
+	switch s.(type) {
109
+	case *stoppedState:
110
+		running, err := r.c.isRunning()
111
+		if err != nil {
112
+			return err
113
+		}
114
+		if running {
115
+			return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
116
+		}
117
+		r.c.state = s
118
+		return nil
119
+	case *pausedState:
120
+		r.c.state = s
121
+		return nil
122
+	case *runningState:
123
+		return nil
124
+	}
125
+	return newStateTransitionError(r, s)
126
+}
127
+
128
+func (r *runningState) destroy() error {
129
+	running, err := r.c.isRunning()
130
+	if err != nil {
131
+		return err
132
+	}
133
+	if running {
134
+		return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
135
+	}
136
+	return destroy(r.c)
137
+}
138
+
139
+// pausedState represents a container that is currently paused.  It cannot be destroyed in a
140
+// paused state and must transition back to running first.
141
+type pausedState struct {
142
+	c *linuxContainer
143
+}
144
+
145
+func (p *pausedState) status() Status {
146
+	return Paused
147
+}
148
+
149
+func (p *pausedState) transition(s containerState) error {
150
+	switch s.(type) {
151
+	case *runningState, *stoppedState:
152
+		p.c.state = s
153
+		return nil
154
+	case *pausedState:
155
+		return nil
156
+	}
157
+	return newStateTransitionError(p, s)
158
+}
159
+
160
+func (p *pausedState) destroy() error {
161
+	isRunning, err := p.c.isRunning()
162
+	if err != nil {
163
+		return err
164
+	}
165
+	if !isRunning {
166
+		if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
167
+			return err
168
+		}
169
+		return destroy(p.c)
170
+	}
171
+	return newGenericError(fmt.Errorf("container is paused"), ContainerPaused)
172
+}
173
+
174
+// restoredState is the same as the running state but also has associated checkpoint
175
+// information that may need to be destroyed when the container is stopped and destroy is called.
176
+type restoredState struct {
177
+	imageDir string
178
+	c        *linuxContainer
179
+}
180
+
181
+func (r *restoredState) status() Status {
182
+	return Running
183
+}
184
+
185
+func (r *restoredState) transition(s containerState) error {
186
+	switch s.(type) {
187
+	case *stoppedState:
188
+		return nil
189
+	case *runningState:
190
+		return nil
191
+	}
192
+	return newStateTransitionError(r, s)
193
+}
194
+
195
+func (r *restoredState) destroy() error {
196
+	if _, err := os.Stat(filepath.Join(r.c.root, "checkpoint")); err != nil {
197
+		if !os.IsNotExist(err) {
198
+			return err
199
+		}
200
+	}
201
+	return destroy(r.c)
202
+}
203
+
204
+// createdState is used whenever a container is restored, loaded, or setting additional
205
+// processes inside and it should not be destroyed when it is exiting.
206
+type createdState struct {
207
+	c *linuxContainer
208
+	s Status
209
+}
210
+
211
+func (n *createdState) status() Status {
212
+	return n.s
213
+}
214
+
215
+func (n *createdState) transition(s containerState) error {
216
+	n.c.state = s
217
+	return nil
218
+}
219
+
220
+func (n *createdState) destroy() error {
221
+	if err := n.c.refreshState(); err != nil {
222
+		return err
223
+	}
224
+	return n.c.state.destroy()
225
+}
... ...
@@ -3,6 +3,9 @@
3 3
 package system
4 4
 
5 5
 import (
6
+	"bufio"
7
+	"fmt"
8
+	"os"
6 9
 	"os/exec"
7 10
 	"syscall"
8 11
 	"unsafe"
... ...
@@ -75,3 +78,45 @@ func Setctty() error {
75 75
 	}
76 76
 	return nil
77 77
 }
78
+
79
+/*
80
+ * Detect whether we are currently running in a user namespace.
81
+ * Copied from github.com/lxc/lxd/shared/util.go
82
+ */
83
+func RunningInUserNS() bool {
84
+	file, err := os.Open("/proc/self/uid_map")
85
+	if err != nil {
86
+		/*
87
+		 * This kernel-provided file only exists if user namespaces are
88
+		 * supported
89
+		 */
90
+		return false
91
+	}
92
+	defer file.Close()
93
+
94
+	buf := bufio.NewReader(file)
95
+	l, _, err := buf.ReadLine()
96
+	if err != nil {
97
+		return false
98
+	}
99
+
100
+	line := string(l)
101
+	var a, b, c int64
102
+	fmt.Sscanf(line, "%d %d %d", &a, &b, &c)
103
+	/*
104
+	 * We assume we are in the initial user namespace if we have a full
105
+	 * range - 4294967295 uids starting at uid 0.
106
+	 */
107
+	if a == 0 && b == 0 && c == 4294967295 {
108
+		return false
109
+	}
110
+	return true
111
+}
112
+
113
+func Prctl(option int, arg2, arg3, arg4, arg5 uintptr) (err error) {
114
+	_, _, e1 := syscall.Syscall6(syscall.SYS_PRCTL, uintptr(option), arg2, arg3, arg4, arg5, 0)
115
+	if e1 != 0 {
116
+		err = e1
117
+	}
118
+	return
119
+}
... ...
@@ -5,6 +5,7 @@ import (
5 5
 	"encoding/hex"
6 6
 	"encoding/json"
7 7
 	"io"
8
+	"os"
8 9
 	"path/filepath"
9 10
 	"syscall"
10 11
 )
... ...
@@ -54,3 +55,32 @@ func WriteJSON(w io.Writer, v interface{}) error {
54 54
 	_, err = w.Write(data)
55 55
 	return err
56 56
 }
57
+
58
+// CleanPath makes a path safe for use with filepath.Join. This is done by not
59
+// only cleaning the path, but also (if the path is relative) adding a leading
60
+// '/' and cleaning it (then removing the leading '/'). This ensures that a
61
+// path resulting from prepending another path will always resolve to lexically
62
+// be a subdirectory of the prefixed path. This is all done lexically, so paths
63
+// that include symlinks won't be safe as a result of using CleanPath.
64
+func CleanPath(path string) string {
65
+	// Deal with empty strings nicely.
66
+	if path == "" {
67
+		return ""
68
+	}
69
+
70
+	// Ensure that all paths are cleaned (especially problematic ones like
71
+	// "/../../../../../" which can cause lots of issues).
72
+	path = filepath.Clean(path)
73
+
74
+	// If the path isn't absolute, we need to do more processing to fix paths
75
+	// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
76
+	// paths to relative ones.
77
+	if !filepath.IsAbs(path) {
78
+		path = filepath.Clean(string(os.PathSeparator) + path)
79
+		// This can't fail, as (by definition) all paths are relative to root.
80
+		path, _ = filepath.Rel(string(os.PathSeparator), path)
81
+	}
82
+
83
+	// Clean the path again for good measure.
84
+	return filepath.Clean(path)
85
+}