Browse code

Use runc version built without ambient capabilities

Until we can support existing behaviour with `sudo`, disable
ambient capabilities in the runc build.

Add tests that a non-root user cannot use default capabilities,
and that capabilities are working as expected.

Test for #27590

Update runc.

Signed-off-by: Justin Cormack <justin.cormack@docker.com>

Justin Cormack authored on 2016/10/21 21:34:37
Showing 22 changed files
... ...
@@ -6,6 +6,10 @@ WORKDIR /usr/src/
6 6
 
7 7
 RUN gcc -g -Wall -static userns.c -o /usr/bin/userns-test \
8 8
 	&& gcc -g -Wall -static ns.c -o /usr/bin/ns-test \
9
-	&& gcc -g -Wall -static acct.c -o /usr/bin/acct-test
9
+	&& gcc -g -Wall -static acct.c -o /usr/bin/acct-test \
10
+	&& gcc -g -Wall -static setuid.c -o /usr/bin/setuid-test \
11
+	&& gcc -g -Wall -static setgid.c -o /usr/bin/setgid-test \
12
+	&& gcc -g -Wall -static socket.c -o /usr/bin/socket-test \
13
+	&& gcc -g -Wall -static raw.c -o /usr/bin/raw-test
10 14
 
11 15
 RUN [ "$(uname -m)" = "x86_64" ] && gcc -s -m32 -nostdlib exit32.s -o /usr/bin/exit32-test || true
12 16
new file mode 100644
... ...
@@ -0,0 +1,14 @@
0
+#include <stdio.h>
1
+#include <unistd.h>
2
+#include <sys/socket.h>
3
+#include <netinet/ip.h>
4
+#include <netinet/udp.h>
5
+
6
+int main() {
7
+	if (socket(PF_INET, SOCK_RAW, IPPROTO_UDP) == -1) {
8
+		perror("socket");
9
+		return 1;
10
+	}
11
+
12
+	return 0;
13
+}
0 14
new file mode 100644
... ...
@@ -0,0 +1,11 @@
0
+#include <sys/types.h>
1
+#include <unistd.h>
2
+#include <stdio.h>
3
+
4
+int main() {
5
+	if (setgid(1) == -1) {
6
+		perror("setgid");
7
+		return 1;
8
+	}
9
+	return 0;
10
+}
0 11
new file mode 100644
... ...
@@ -0,0 +1,11 @@
0
+#include <sys/types.h>
1
+#include <unistd.h>
2
+#include <stdio.h>
3
+
4
+int main() {
5
+	if (setuid(1) == -1) {
6
+		perror("setuid");
7
+		return 1;
8
+	}
9
+	return 0;
10
+}
0 11
new file mode 100644
... ...
@@ -0,0 +1,30 @@
0
+#include <stdio.h>
1
+#include <unistd.h>
2
+#include <sys/types.h>
3
+#include <sys/socket.h>
4
+#include <netinet/in.h>
5
+#include <arpa/inet.h>
6
+
7
+int main() {
8
+	int s;
9
+	struct sockaddr_in sin;
10
+
11
+	s = socket(AF_INET, SOCK_STREAM, 0);
12
+	if (s == -1) {
13
+		perror("socket");
14
+		return 1;
15
+	}
16
+
17
+	sin.sin_family = AF_INET;
18
+	sin.sin_addr.s_addr = INADDR_ANY;
19
+	sin.sin_port = htons(80);
20
+
21
+	if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) == -1) {
22
+		perror("bind");
23
+		return 1;
24
+	}
25
+
26
+	close(s);
27
+
28
+	return 0;
29
+}
... ...
@@ -3,7 +3,7 @@ set -e
3 3
 set -x
4 4
 
5 5
 TOMLV_COMMIT=9baf8a8a9f2ed20a8e54160840c492f937eeaf9a
6
-RUNC_COMMIT=02f8fa7863dd3f82909a73e2061897828460d52f
6
+RUNC_COMMIT=ac031b5bf1cc92239461125f4c1ffb760522bbf2
7 7
 CONTAINERD_COMMIT=52ef1ceb4b660c42cf4ea9013180a5663968d4c7
8 8
 GRIMES_COMMIT=fe069a03affd2547fdb05e5b8b07202d2e41735b
9 9
 LIBNETWORK_COMMIT=0f534354b813003a754606689722fe253101bc4e
... ...
@@ -20,11 +20,12 @@ else
20 20
 	export GOPATH="$TMP_GOPATH"
21 21
 fi
22 22
 
23
+# Do not build with ambient capabilities support
23 24
 RUNC_BUILDTAGS="${RUNC_BUILDTAGS:-"seccomp apparmor selinux"}"
24 25
 
25 26
 install_runc() {
26 27
 	echo "Install runc version $RUNC_COMMIT"
27
-	git clone https://github.com/opencontainers/runc.git "$GOPATH/src/github.com/opencontainers/runc"
28
+	git clone https://github.com/docker/runc.git "$GOPATH/src/github.com/opencontainers/runc"
28 29
 	cd "$GOPATH/src/github.com/opencontainers/runc"
29 30
 	git checkout -q "$RUNC_COMMIT"
30 31
 	make BUILDTAGS="$RUNC_BUILDTAGS" $1
... ...
@@ -1155,24 +1155,185 @@ func (s *DockerSuite) TestRunNoNewPrivSetuid(c *check.C) {
1155 1155
 	}
1156 1156
 }
1157 1157
 
1158
-func (s *DockerSuite) TestRunAmbientCapabilities(c *check.C) {
1159
-	testRequires(c, DaemonIsLinux, ambientCapabilities)
1158
+func (s *DockerSuite) TestUserNoEffectiveCapabilitiesChown(c *check.C) {
1159
+	testRequires(c, DaemonIsLinux)
1160
+	ensureSyscallTest(c)
1161
+
1162
+	// test that a root user has default capability CAP_CHOWN
1163
+	runCmd := exec.Command(dockerBinary, "run", "busybox", "chown", "100", "/tmp")
1164
+	_, _, err := runCommandWithOutput(runCmd)
1165
+	c.Assert(err, check.IsNil)
1166
+	// test that non root user does not have default capability CAP_CHOWN
1167
+	runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "busybox", "chown", "100", "/tmp")
1168
+	out, _, err := runCommandWithOutput(runCmd)
1169
+	c.Assert(err, checker.NotNil, check.Commentf(out))
1170
+	c.Assert(out, checker.Contains, "Operation not permitted")
1171
+	// test that root user can drop default capability CAP_CHOWN
1172
+	runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "chown", "busybox", "chown", "100", "/tmp")
1173
+	out, _, err = runCommandWithOutput(runCmd)
1174
+	c.Assert(err, checker.NotNil, check.Commentf(out))
1175
+	c.Assert(out, checker.Contains, "Operation not permitted")
1176
+}
1177
+
1178
+func (s *DockerSuite) TestUserNoEffectiveCapabilitiesDacOverride(c *check.C) {
1179
+	testRequires(c, DaemonIsLinux)
1180
+	ensureSyscallTest(c)
1181
+
1182
+	// test that a root user has default capability CAP_DAC_OVERRIDE
1183
+	runCmd := exec.Command(dockerBinary, "run", "busybox", "sh", "-c", "echo test > /etc/passwd")
1184
+	_, _, err := runCommandWithOutput(runCmd)
1185
+	c.Assert(err, check.IsNil)
1186
+	// test that non root user does not have default capability CAP_DAC_OVERRIDE
1187
+	runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "busybox", "sh", "-c", "echo test > /etc/passwd")
1188
+	out, _, err := runCommandWithOutput(runCmd)
1189
+	c.Assert(err, checker.NotNil, check.Commentf(out))
1190
+	c.Assert(out, checker.Contains, "Permission denied")
1191
+	// TODO test that root user can drop default capability CAP_DAC_OVERRIDE
1192
+}
1193
+
1194
+func (s *DockerSuite) TestUserNoEffectiveCapabilitiesFowner(c *check.C) {
1195
+	testRequires(c, DaemonIsLinux)
1196
+	ensureSyscallTest(c)
1197
+
1198
+	// test that a root user has default capability CAP_FOWNER
1199
+	runCmd := exec.Command(dockerBinary, "run", "busybox", "chmod", "777", "/etc/passwd")
1200
+	_, _, err := runCommandWithOutput(runCmd)
1201
+	c.Assert(err, check.IsNil)
1202
+	// test that non root user does not have default capability CAP_FOWNER
1203
+	runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "busybox", "chmod", "777", "/etc/passwd")
1204
+	out, _, err := runCommandWithOutput(runCmd)
1205
+	c.Assert(err, checker.NotNil, check.Commentf(out))
1206
+	c.Assert(out, checker.Contains, "Operation not permitted")
1207
+	// TODO test that root user can drop default capability CAP_FOWNER
1208
+}
1209
+
1210
+// TODO CAP_KILL
1211
+
1212
+func (s *DockerSuite) TestUserNoEffectiveCapabilitiesSetuid(c *check.C) {
1213
+	testRequires(c, DaemonIsLinux)
1214
+	ensureSyscallTest(c)
1215
+
1216
+	// test that a root user has default capability CAP_SETUID
1217
+	runCmd := exec.Command(dockerBinary, "run", "syscall-test", "setuid-test")
1218
+	_, _, err := runCommandWithOutput(runCmd)
1219
+	c.Assert(err, check.IsNil)
1220
+	// test that non root user does not have default capability CAP_SETUID
1221
+	runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "syscall-test", "setuid-test")
1222
+	out, _, err := runCommandWithOutput(runCmd)
1223
+	c.Assert(err, checker.NotNil, check.Commentf(out))
1224
+	c.Assert(out, checker.Contains, "Operation not permitted")
1225
+	// test that root user can drop default capability CAP_SETUID
1226
+	runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "setuid", "syscall-test", "setuid-test")
1227
+	out, _, err = runCommandWithOutput(runCmd)
1228
+	c.Assert(err, checker.NotNil, check.Commentf(out))
1229
+	c.Assert(out, checker.Contains, "Operation not permitted")
1230
+}
1231
+
1232
+func (s *DockerSuite) TestUserNoEffectiveCapabilitiesSetgid(c *check.C) {
1233
+	testRequires(c, DaemonIsLinux)
1234
+	ensureSyscallTest(c)
1235
+
1236
+	// test that a root user has default capability CAP_SETGID
1237
+	runCmd := exec.Command(dockerBinary, "run", "syscall-test", "setgid-test")
1238
+	_, _, err := runCommandWithOutput(runCmd)
1239
+	c.Assert(err, check.IsNil)
1240
+	// test that non root user does not have default capability CAP_SETGID
1241
+	runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "syscall-test", "setgid-test")
1242
+	out, _, err := runCommandWithOutput(runCmd)
1243
+	c.Assert(err, checker.NotNil, check.Commentf(out))
1244
+	c.Assert(out, checker.Contains, "Operation not permitted")
1245
+	// test that root user can drop default capability CAP_SETGID
1246
+	runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "setgid", "syscall-test", "setgid-test")
1247
+	out, _, err = runCommandWithOutput(runCmd)
1248
+	c.Assert(err, checker.NotNil, check.Commentf(out))
1249
+	c.Assert(out, checker.Contains, "Operation not permitted")
1250
+}
1251
+
1252
+// TODO CAP_SETPCAP
1253
+
1254
+func (s *DockerSuite) TestUserNoEffectiveCapabilitiesNetBindService(c *check.C) {
1255
+	testRequires(c, DaemonIsLinux)
1256
+	ensureSyscallTest(c)
1257
+
1258
+	// test that a root user has default capability CAP_NET_BIND_SERVICE
1259
+	runCmd := exec.Command(dockerBinary, "run", "syscall-test", "socket-test")
1260
+	_, _, err := runCommandWithOutput(runCmd)
1261
+	c.Assert(err, check.IsNil)
1262
+	// test that non root user does not have default capability CAP_NET_BIND_SERVICE
1263
+	runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "syscall-test", "socket-test")
1264
+	out, _, err := runCommandWithOutput(runCmd)
1265
+	c.Assert(err, checker.NotNil, check.Commentf(out))
1266
+	c.Assert(out, checker.Contains, "Permission denied")
1267
+	// test that root user can drop default capability CAP_NET_BIND_SERVICE
1268
+	runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "net_bind_service", "syscall-test", "socket-test")
1269
+	out, _, err = runCommandWithOutput(runCmd)
1270
+	c.Assert(err, checker.NotNil, check.Commentf(out))
1271
+	c.Assert(out, checker.Contains, "Permission denied")
1272
+}
1273
+
1274
+func (s *DockerSuite) TestUserNoEffectiveCapabilitiesNetRaw(c *check.C) {
1275
+	testRequires(c, DaemonIsLinux)
1276
+	ensureSyscallTest(c)
1277
+
1278
+	// test that a root user has default capability CAP_NET_RAW
1279
+	runCmd := exec.Command(dockerBinary, "run", "syscall-test", "raw-test")
1280
+	_, _, err := runCommandWithOutput(runCmd)
1281
+	c.Assert(err, check.IsNil)
1282
+	// test that non root user does not have default capability CAP_NET_RAW
1283
+	runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "syscall-test", "raw-test")
1284
+	out, _, err := runCommandWithOutput(runCmd)
1285
+	c.Assert(err, checker.NotNil, check.Commentf(out))
1286
+	c.Assert(out, checker.Contains, "Operation not permitted")
1287
+	// test that root user can drop default capability CAP_NET_RAW
1288
+	runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "net_raw", "syscall-test", "raw-test")
1289
+	out, _, err = runCommandWithOutput(runCmd)
1290
+	c.Assert(err, checker.NotNil, check.Commentf(out))
1291
+	c.Assert(out, checker.Contains, "Operation not permitted")
1292
+}
1160 1293
 
1161
-	// test that a non root user can gain capabilities
1162
-	runCmd := exec.Command(dockerBinary, "run", "--user", "1000", "--cap-add", "chown", "busybox", "chown", "100", "/tmp")
1294
+func (s *DockerSuite) TestUserNoEffectiveCapabilitiesChroot(c *check.C) {
1295
+	testRequires(c, DaemonIsLinux)
1296
+	ensureSyscallTest(c)
1297
+
1298
+	// test that a root user has default capability CAP_SYS_CHROOT
1299
+	runCmd := exec.Command(dockerBinary, "run", "busybox", "chroot", "/", "/bin/true")
1163 1300
 	_, _, err := runCommandWithOutput(runCmd)
1164 1301
 	c.Assert(err, check.IsNil)
1165
-	// test that non root user has default capabilities
1166
-	runCmd = exec.Command(dockerBinary, "run", "--user", "1000", "busybox", "chown", "100", "/tmp")
1167
-	_, _, err = runCommandWithOutput(runCmd)
1302
+	// test that non root user does not have default capability CAP_SYS_CHROOT
1303
+	runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "busybox", "chroot", "/", "/bin/true")
1304
+	out, _, err := runCommandWithOutput(runCmd)
1305
+	c.Assert(err, checker.NotNil, check.Commentf(out))
1306
+	c.Assert(out, checker.Contains, "Operation not permitted")
1307
+	// test that root user can drop default capability CAP_SYS_CHROOT
1308
+	runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "sys_chroot", "busybox", "chroot", "/", "/bin/true")
1309
+	out, _, err = runCommandWithOutput(runCmd)
1310
+	c.Assert(err, checker.NotNil, check.Commentf(out))
1311
+	c.Assert(out, checker.Contains, "Operation not permitted")
1312
+}
1313
+
1314
+func (s *DockerSuite) TestUserNoEffectiveCapabilitiesMknod(c *check.C) {
1315
+	testRequires(c, DaemonIsLinux)
1316
+	ensureSyscallTest(c)
1317
+
1318
+	// test that a root user has default capability CAP_MKNOD
1319
+	runCmd := exec.Command(dockerBinary, "run", "busybox", "mknod", "/tmp/node", "b", "1", "2")
1320
+	_, _, err := runCommandWithOutput(runCmd)
1168 1321
 	c.Assert(err, check.IsNil)
1169
-	// test this fails without cap_chown
1170
-	runCmd = exec.Command(dockerBinary, "run", "--user", "1000", "--cap-drop", "chown", "busybox", "chown", "100", "/tmp")
1322
+	// test that non root user does not have default capability CAP_MKNOD
1323
+	runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "busybox", "mknod", "/tmp/node", "b", "1", "2")
1171 1324
 	out, _, err := runCommandWithOutput(runCmd)
1172 1325
 	c.Assert(err, checker.NotNil, check.Commentf(out))
1173
-	c.Assert(strings.TrimSpace(out), checker.Equals, "chown: /tmp: Operation not permitted")
1326
+	c.Assert(out, checker.Contains, "Operation not permitted")
1327
+	// test that root user can drop default capability CAP_MKNOD
1328
+	runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "mknod", "busybox", "mknod", "/tmp/node", "b", "1", "2")
1329
+	out, _, err = runCommandWithOutput(runCmd)
1330
+	c.Assert(err, checker.NotNil, check.Commentf(out))
1331
+	c.Assert(out, checker.Contains, "Operation not permitted")
1174 1332
 }
1175 1333
 
1334
+// TODO CAP_AUDIT_WRITE
1335
+// TODO CAP_SETFCAP
1336
+
1176 1337
 func (s *DockerSuite) TestRunApparmorProcDirectory(c *check.C) {
1177 1338
 	testRequires(c, SameHostDaemon, Apparmor)
1178 1339
 
... ...
@@ -1,6 +1,7 @@
1 1
 package main
2 2
 
3 3
 import (
4
+	"fmt"
4 5
 	"io/ioutil"
5 6
 	"os"
6 7
 	"os/exec"
... ...
@@ -53,15 +54,14 @@ func ensureSyscallTest(c *check.C) {
53 53
 	gcc, err := exec.LookPath("gcc")
54 54
 	c.Assert(err, checker.IsNil, check.Commentf("could not find gcc"))
55 55
 
56
-	out, err := exec.Command(gcc, "-g", "-Wall", "-static", "../contrib/syscall-test/userns.c", "-o", tmp+"/"+"userns-test").CombinedOutput()
57
-	c.Assert(err, checker.IsNil, check.Commentf(string(out)))
58
-	out, err = exec.Command(gcc, "-g", "-Wall", "-static", "../contrib/syscall-test/ns.c", "-o", tmp+"/"+"ns-test").CombinedOutput()
59
-	c.Assert(err, checker.IsNil, check.Commentf(string(out)))
60
-	out, err = exec.Command(gcc, "-g", "-Wall", "-static", "../contrib/syscall-test/acct.c", "-o", tmp+"/"+"acct-test").CombinedOutput()
61
-	c.Assert(err, checker.IsNil, check.Commentf(string(out)))
56
+	tests := []string{"userns", "ns", "acct", "setuid", "setgid", "socket", "raw"}
57
+	for _, test := range tests {
58
+		out, err := exec.Command(gcc, "-g", "-Wall", "-static", fmt.Sprintf("../contrib/syscall-test/%s.c", test), "-o", fmt.Sprintf("%s/%s-test", tmp, test)).CombinedOutput()
59
+		c.Assert(err, checker.IsNil, check.Commentf(string(out)))
60
+	}
62 61
 
63 62
 	if runtime.GOOS == "linux" && runtime.GOARCH == "amd64" {
64
-		out, err = exec.Command(gcc, "-s", "-m32", "-nostdlib", "../contrib/syscall-test/exit32.s", "-o", tmp+"/"+"exit32-test").CombinedOutput()
63
+		out, err := exec.Command(gcc, "-s", "-m32", "-nostdlib", "../contrib/syscall-test/exit32.s", "-o", tmp+"/"+"exit32-test").CombinedOutput()
65 64
 		c.Assert(err, checker.IsNil, check.Commentf(string(out)))
66 65
 	}
67 66
 
... ...
@@ -59,7 +59,7 @@ github.com/miekg/pkcs11 df8ae6ca730422dba20c768ff38ef7d79077a59f
59 59
 github.com/docker/go v1.5.1-1-1-gbaf439e
60 60
 github.com/agl/ed25519 d2b94fd789ea21d12fac1a4443dd3a3f79cda72c
61 61
 
62
-github.com/opencontainers/runc 02f8fa7863dd3f82909a73e2061897828460d52f # libcontainer
62
+github.com/opencontainers/runc ac031b5bf1cc92239461125f4c1ffb760522bbf2 # libcontainer
63 63
 github.com/opencontainers/runtime-spec 1c7c27d043c2a5e513a44084d2b10d77d1402b8c # specs
64 64
 github.com/seccomp/libseccomp-golang 32f571b70023028bd57d9288c20efbcb237f3ce0
65 65
 # libcontainer deps (see src/github.com/opencontainers/runc/Godeps/Godeps.json)
... ...
@@ -22,7 +22,7 @@ type Cgroup struct {
22 22
 	// The path is assumed to be relative to the host system cgroup mountpoint.
23 23
 	Path string `json:"path"`
24 24
 
25
-	// ScopePrefix decribes prefix for the scope name
25
+	// ScopePrefix describes prefix for the scope name
26 26
 	ScopePrefix string `json:"scope_prefix"`
27 27
 
28 28
 	// Paths represent the absolute cgroups paths to join.
... ...
@@ -95,7 +95,7 @@ type Resources struct {
95 95
 	// IO read rate limit per cgroup per device, bytes per second.
96 96
 	BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"`
97 97
 
98
-	// IO write rate limit per cgroup per divice, bytes per second.
98
+	// IO write rate limit per cgroup per device, bytes per second.
99 99
 	BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"`
100 100
 
101 101
 	// IO read rate limit per cgroup per device, IO per second.
... ...
@@ -85,11 +85,6 @@ type Config struct {
85 85
 	// that the parent process dies.
86 86
 	ParentDeathSignal int `json:"parent_death_signal"`
87 87
 
88
-	// PivotDir allows a custom directory inside the container's root filesystem to be used as pivot, when NoPivotRoot is not set.
89
-	// When a custom PivotDir not set, a temporary dir inside the root filesystem will be used. The pivot dir needs to be writeable.
90
-	// This is required when using read only root filesystems. In these cases, a read/writeable path can be (bind) mounted somewhere inside the root filesystem to act as pivot.
91
-	PivotDir string `json:"pivot_dir"`
92
-
93 88
 	// Path to a directory containing the container's root filesystem.
94 89
 	Rootfs string `json:"rootfs"`
95 90
 
... ...
@@ -1,5 +1,11 @@
1 1
 package configs
2 2
 
3
+const (
4
+	// EXT_COPYUP is a directive to copy up the contents of a directory when
5
+	// a tmpfs is mounted over it.
6
+	EXT_COPYUP = 1 << iota
7
+)
8
+
3 9
 type Mount struct {
4 10
 	// Source path for the mount.
5 11
 	Source string `json:"source"`
... ...
@@ -22,6 +28,9 @@ type Mount struct {
22 22
 	// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
23 23
 	Relabel string `json:"relabel"`
24 24
 
25
+	// Extensions are additional flags that are specific to runc.
26
+	Extensions int `json:"extensions"`
27
+
25 28
 	// Optional Command to be run before Source is mounted.
26 29
 	PremountCmds []Command `json:"premount_cmds"`
27 30
 
... ...
@@ -22,8 +22,8 @@ var (
22 22
 	supportedNamespaces = make(map[NamespaceType]bool)
23 23
 )
24 24
 
25
-// nsToFile converts the namespace type to its filename
26
-func nsToFile(ns NamespaceType) string {
25
+// NsName converts the namespace type to its filename
26
+func NsName(ns NamespaceType) string {
27 27
 	switch ns {
28 28
 	case NEWNET:
29 29
 		return "net"
... ...
@@ -50,7 +50,7 @@ func IsNamespaceSupported(ns NamespaceType) bool {
50 50
 	if ok {
51 51
 		return supported
52 52
 	}
53
-	nsFile := nsToFile(ns)
53
+	nsFile := NsName(ns)
54 54
 	// if the namespace type is unknown, just return false
55 55
 	if nsFile == "" {
56 56
 		return false
... ...
@@ -84,7 +84,7 @@ func (n *Namespace) GetPath(pid int) string {
84 84
 	if n.Path != "" {
85 85
 		return n.Path
86 86
 	}
87
-	return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type))
87
+	return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type))
88 88
 }
89 89
 
90 90
 func (n *Namespaces) Remove(t NamespaceType) bool {
... ...
@@ -9,6 +9,10 @@ func InitLabels(options []string) (string, string, error) {
9 9
 	return "", "", nil
10 10
 }
11 11
 
12
+func GetROMountLabel() string {
13
+	return ""
14
+}
15
+
12 16
 func GenLabels(options string) (string, string, error) {
13 17
 	return "", "", nil
14 18
 }
... ...
@@ -33,15 +33,19 @@ func InitLabels(options []string) (string, string, error) {
33 33
 		pcon := selinux.NewContext(processLabel)
34 34
 		mcon := selinux.NewContext(mountLabel)
35 35
 		for _, opt := range options {
36
-			if opt == "disable" {
37
-				return "", "", nil
36
+			val := strings.SplitN(opt, "=", 2)
37
+			if val[0] != "label" {
38
+				continue
39
+			}
40
+			if len(val) < 2 {
41
+				return "", "", fmt.Errorf("bad label option %q, valid options 'disable' or \n'user, role, level, type' followed by ':' and a value", opt)
38 42
 			}
39
-			if i := strings.Index(opt, ":"); i == -1 {
40
-				return "", "", fmt.Errorf("Bad label option %q, valid options 'disable' or \n'user, role, level, type' followed by ':' and a value", opt)
43
+			if val[1] == "disable" {
44
+				return "", "", nil
41 45
 			}
42
-			con := strings.SplitN(opt, ":", 2)
43
-			if !validOptions[con[0]] {
44
-				return "", "", fmt.Errorf("Bad label option %q, valid options 'disable, user, role, level, type'", con[0])
46
+			con := strings.SplitN(val[1], ":", 2)
47
+			if len(con) < 2 || !validOptions[con[0]] {
48
+				return "", "", fmt.Errorf("bad label option %q, valid options 'disable, user, role, level, type'", con[0])
45 49
 
46 50
 			}
47 51
 			pcon[con[0]] = con[1]
... ...
@@ -55,6 +59,10 @@ func InitLabels(options []string) (string, string, error) {
55 55
 	return processLabel, mountLabel, nil
56 56
 }
57 57
 
58
+func GetROMountLabel() string {
59
+	return selinux.GetROFileLabel()
60
+}
61
+
58 62
 // DEPRECATED: The GenLabels function is only to be used during the transition to the official API.
59 63
 func GenLabels(options string) (string, string, error) {
60 64
 	return InitLabels(strings.Fields(options))
61 65
new file mode 100644
... ...
@@ -0,0 +1,32 @@
0
+#ifndef NSENTER_NAMESPACE_H
1
+#define NSENTER_NAMESPACE_H
2
+
3
+#ifndef _GNU_SOURCE
4
+#	define _GNU_SOURCE
5
+#endif
6
+#include <sched.h>
7
+
8
+/* All of these are taken from include/uapi/linux/sched.h */
9
+#ifndef CLONE_NEWNS
10
+#	define CLONE_NEWNS 0x00020000 /* New mount namespace group */
11
+#endif
12
+#ifndef CLONE_NEWCGROUP
13
+#	define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
14
+#endif
15
+#ifndef CLONE_NEWUTS
16
+#	define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
17
+#endif
18
+#ifndef CLONE_NEWIPC
19
+#	define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
20
+#endif
21
+#ifndef CLONE_NEWUSER
22
+#	define CLONE_NEWUSER 0x10000000 /* New user namespace */
23
+#endif
24
+#ifndef CLONE_NEWPID
25
+#	define CLONE_NEWPID 0x20000000 /* New pid namespace */
26
+#endif
27
+#ifndef CLONE_NEWNET
28
+#	define CLONE_NEWNET 0x40000000 /* New network namespace */
29
+#endif
30
+
31
+#endif /* NSENTER_NAMESPACE_H */
0 32
new file mode 100644
... ...
@@ -0,0 +1,12 @@
0
+// +build linux,!gccgo
1
+
2
+package nsenter
3
+
4
+/*
5
+#cgo CFLAGS: -Wall
6
+extern void nsexec();
7
+void __attribute__((constructor)) init(void) {
8
+	nsexec();
9
+}
10
+*/
11
+import "C"
0 12
new file mode 100644
... ...
@@ -0,0 +1,25 @@
0
+// +build linux,gccgo
1
+
2
+package nsenter
3
+
4
+/*
5
+#cgo CFLAGS: -Wall
6
+extern void nsexec();
7
+void __attribute__((constructor)) init(void) {
8
+	nsexec();
9
+}
10
+*/
11
+import "C"
12
+
13
+// AlwaysFalse is here to stay false
14
+// (and be exported so the compiler doesn't optimize out its reference)
15
+var AlwaysFalse bool
16
+
17
+func init() {
18
+	if AlwaysFalse {
19
+		// by referencing this C init() in a noop test, it will ensure the compiler
20
+		// links in the C function.
21
+		// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65134
22
+		C.init()
23
+	}
24
+}
0 25
new file mode 100644
... ...
@@ -0,0 +1,5 @@
0
+// +build !linux !cgo
1
+
2
+package nsenter
3
+
4
+import "C"
0 5
new file mode 100644
... ...
@@ -0,0 +1,753 @@
0
+#define _GNU_SOURCE
1
+#include <endian.h>
2
+#include <errno.h>
3
+#include <fcntl.h>
4
+#include <grp.h>
5
+#include <sched.h>
6
+#include <setjmp.h>
7
+#include <signal.h>
8
+#include <stdarg.h>
9
+#include <stdbool.h>
10
+#include <stdint.h>
11
+#include <stdio.h>
12
+#include <stdlib.h>
13
+#include <stdbool.h>
14
+#include <string.h>
15
+#include <unistd.h>
16
+
17
+#include <sys/ioctl.h>
18
+#include <sys/prctl.h>
19
+#include <sys/socket.h>
20
+#include <sys/types.h>
21
+
22
+#include <linux/limits.h>
23
+#include <linux/netlink.h>
24
+#include <linux/types.h>
25
+
26
+/* Get all of the CLONE_NEW* flags. */
27
+#include "namespace.h"
28
+
29
+/* Synchronisation values. */
30
+enum sync_t {
31
+	SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
32
+	SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
33
+	SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
34
+	SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
35
+
36
+	/* XXX: This doesn't help with segfaults and other such issues. */
37
+	SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
38
+};
39
+
40
+/* longjmp() arguments. */
41
+#define JUMP_PARENT 0x00
42
+#define JUMP_CHILD  0xA0
43
+#define JUMP_INIT   0xA1
44
+
45
+/* JSON buffer. */
46
+#define JSON_MAX 4096
47
+
48
+/* Assume the stack grows down, so arguments should be above it. */
49
+struct clone_t {
50
+	/*
51
+	 * Reserve some space for clone() to locate arguments
52
+	 * and retcode in this place
53
+	 */
54
+	char stack[4096] __attribute__ ((aligned(16)));
55
+	char stack_ptr[0];
56
+
57
+	/* There's two children. This is used to execute the different code. */
58
+	jmp_buf *env;
59
+	int jmpval;
60
+};
61
+
62
+struct nlconfig_t {
63
+	char *data;
64
+	uint32_t cloneflags;
65
+	char *uidmap;
66
+	size_t uidmap_len;
67
+	char *gidmap;
68
+	size_t gidmap_len;
69
+	char *namespaces;
70
+	size_t namespaces_len;
71
+	uint8_t is_setgroup;
72
+	int consolefd;
73
+};
74
+
75
+/*
76
+ * List of netlink message types sent to us as part of bootstrapping the init.
77
+ * These constants are defined in libcontainer/message_linux.go.
78
+ */
79
+#define INIT_MSG		62000
80
+#define CLONE_FLAGS_ATTR	27281
81
+#define CONSOLE_PATH_ATTR	27282
82
+#define NS_PATHS_ATTR		27283
83
+#define UIDMAP_ATTR		27284
84
+#define GIDMAP_ATTR		27285
85
+#define SETGROUP_ATTR		27286
86
+
87
+/*
88
+ * Use the raw syscall for versions of glibc which don't include a function for
89
+ * it, namely (glibc 2.12).
90
+ */
91
+#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
92
+#	define _GNU_SOURCE
93
+#	include "syscall.h"
94
+#	if !defined(SYS_setns) && defined(__NR_setns)
95
+#		define SYS_setns __NR_setns
96
+#	endif
97
+
98
+#ifndef SYS_setns
99
+#	error "setns(2) syscall not supported by glibc version"
100
+#endif
101
+
102
+int setns(int fd, int nstype)
103
+{
104
+	return syscall(SYS_setns, fd, nstype);
105
+}
106
+#endif
107
+
108
+/* XXX: This is ugly. */
109
+static int syncfd = -1;
110
+
111
+/* TODO(cyphar): Fix this so it correctly deals with syncT. */
112
+#define bail(fmt, ...)								\
113
+	do {									\
114
+		int ret = __COUNTER__ + 1;					\
115
+		fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__);	\
116
+		if (syncfd >= 0) {						\
117
+			enum sync_t s = SYNC_ERR;				\
118
+			if (write(syncfd, &s, sizeof(s)) != sizeof(s))		\
119
+				fprintf(stderr, "nsenter: failed: write(s)");	\
120
+			if (write(syncfd, &ret, sizeof(ret)) != sizeof(ret))	\
121
+				fprintf(stderr, "nsenter: failed: write(ret)");	\
122
+		}								\
123
+		exit(ret);							\
124
+	} while(0)
125
+
126
+static int write_file(char *data, size_t data_len, char *pathfmt, ...)
127
+{
128
+	int fd, len, ret = 0;
129
+	char path[PATH_MAX];
130
+
131
+	va_list ap;
132
+	va_start(ap, pathfmt);
133
+	len = vsnprintf(path, PATH_MAX, pathfmt, ap);
134
+	va_end(ap);
135
+	if (len < 0)
136
+		return -1;
137
+
138
+	fd = open(path, O_RDWR);
139
+	if (fd < 0) {
140
+		ret = -1;
141
+		goto out;
142
+	}
143
+
144
+	len = write(fd, data, data_len);
145
+	if (len != data_len) {
146
+		ret = -1;
147
+		goto out;
148
+	}
149
+
150
+out:
151
+	close(fd);
152
+	return ret;
153
+}
154
+
155
+enum policy_t {
156
+	SETGROUPS_DEFAULT = 0,
157
+	SETGROUPS_ALLOW,
158
+	SETGROUPS_DENY,
159
+};
160
+
161
+/* This *must* be called before we touch gid_map. */
162
+static void update_setgroups(int pid, enum policy_t setgroup)
163
+{
164
+	char *policy;
165
+
166
+	switch (setgroup) {
167
+		case SETGROUPS_ALLOW:
168
+			policy = "allow";
169
+			break;
170
+		case SETGROUPS_DENY:
171
+			policy = "deny";
172
+			break;
173
+		case SETGROUPS_DEFAULT:
174
+			/* Nothing to do. */
175
+			return;
176
+	}
177
+
178
+	if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
179
+		/*
180
+		 * If the kernel is too old to support /proc/pid/setgroups,
181
+		 * open(2) or write(2) will return ENOENT. This is fine.
182
+		 */
183
+		if (errno != ENOENT)
184
+			bail("failed to write '%s' to /proc/%d/setgroups", policy, pid);
185
+	}
186
+}
187
+
188
+static void update_uidmap(int pid, char *map, int map_len)
189
+{
190
+	if (map == NULL || map_len <= 0)
191
+		return;
192
+
193
+	if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0)
194
+		bail("failed to update /proc/%d/uid_map", pid);
195
+}
196
+
197
+static void update_gidmap(int pid, char *map, int map_len)
198
+{
199
+	if (map == NULL || map_len <= 0)
200
+		return;
201
+
202
+	if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0)
203
+		bail("failed to update /proc/%d/gid_map", pid);
204
+}
205
+
206
+/* A dummy function that just jumps to the given jumpval. */
207
+static int child_func(void *arg) __attribute__ ((noinline));
208
+static int child_func(void *arg)
209
+{
210
+	struct clone_t *ca = (struct clone_t *)arg;
211
+	longjmp(*ca->env, ca->jmpval);
212
+}
213
+
214
+static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
215
+static int clone_parent(jmp_buf *env, int jmpval)
216
+{
217
+	struct clone_t ca = {
218
+		.env    = env,
219
+		.jmpval = jmpval,
220
+	};
221
+
222
+	return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
223
+}
224
+
225
+/*
226
+ * Gets the init pipe fd from the environment, which is used to read the
227
+ * bootstrap data and tell the parent what the new pid is after we finish
228
+ * setting up the environment.
229
+ */
230
+static int initpipe(void)
231
+{
232
+	int pipenum;
233
+	char *initpipe, *endptr;
234
+
235
+	initpipe = getenv("_LIBCONTAINER_INITPIPE");
236
+	if (initpipe == NULL || *initpipe == '\0')
237
+		return -1;
238
+
239
+	pipenum = strtol(initpipe, &endptr, 10);
240
+	if (*endptr != '\0')
241
+		bail("unable to parse _LIBCONTAINER_INITPIPE");
242
+
243
+	return pipenum;
244
+}
245
+
246
+/* Returns the clone(2) flag for a namespace, given the name of a namespace. */
247
+static int nsflag(char *name)
248
+{
249
+	if (!strcmp(name, "cgroup"))
250
+		return CLONE_NEWCGROUP;
251
+	else if (!strcmp(name, "ipc"))
252
+		return CLONE_NEWIPC;
253
+	else if (!strcmp(name, "mnt"))
254
+		return CLONE_NEWNS;
255
+	else if (!strcmp(name, "net"))
256
+		return CLONE_NEWNET;
257
+	else if (!strcmp(name, "pid"))
258
+		return CLONE_NEWPID;
259
+	else if (!strcmp(name, "user"))
260
+		return CLONE_NEWUSER;
261
+	else if (!strcmp(name, "uts"))
262
+		return CLONE_NEWUTS;
263
+
264
+	/* If we don't recognise a name, fallback to 0. */
265
+	return 0;
266
+}
267
+
268
+static uint32_t readint32(char *buf)
269
+{
270
+	return *(uint32_t *) buf;
271
+}
272
+
273
+static uint8_t readint8(char *buf)
274
+{
275
+	return *(uint8_t *) buf;
276
+}
277
+
278
+static void nl_parse(int fd, struct nlconfig_t *config)
279
+{
280
+	size_t len, size;
281
+	struct nlmsghdr hdr;
282
+	char *data, *current;
283
+
284
+	/* Retrieve the netlink header. */
285
+	len = read(fd, &hdr, NLMSG_HDRLEN);
286
+	if (len != NLMSG_HDRLEN)
287
+		bail("invalid netlink header length %lu", len);
288
+
289
+	if (hdr.nlmsg_type == NLMSG_ERROR)
290
+		bail("failed to read netlink message");
291
+
292
+	if (hdr.nlmsg_type != INIT_MSG)
293
+		bail("unexpected msg type %d", hdr.nlmsg_type);
294
+
295
+	/* Retrieve data. */
296
+	size = NLMSG_PAYLOAD(&hdr, 0);
297
+	current = data = malloc(size);
298
+	if (!data)
299
+		bail("failed to allocate %zu bytes of memory for nl_payload", size);
300
+
301
+	len = read(fd, data, size);
302
+	if (len != size)
303
+		bail("failed to read netlink payload, %lu != %lu", len, size);
304
+
305
+	/* Parse the netlink payload. */
306
+	config->data = data;
307
+	config->consolefd = -1;
308
+	while (current < data + size) {
309
+		struct nlattr *nlattr = (struct nlattr *)current;
310
+		size_t payload_len = nlattr->nla_len - NLA_HDRLEN;
311
+
312
+		/* Advance to payload. */
313
+		current += NLA_HDRLEN;
314
+
315
+		/* Handle payload. */
316
+		switch (nlattr->nla_type) {
317
+		case CLONE_FLAGS_ATTR:
318
+			config->cloneflags = readint32(current);
319
+			break;
320
+		case CONSOLE_PATH_ATTR:
321
+			/*
322
+			 * We open the console here because we currently evaluate console
323
+			 * paths from the *host* namespaces.
324
+			 */
325
+			config->consolefd = open(current, O_RDWR);
326
+			if (config->consolefd < 0)
327
+				bail("failed to open console %s", current);
328
+			break;
329
+		case NS_PATHS_ATTR:
330
+			config->namespaces = current;
331
+			config->namespaces_len = payload_len;
332
+			break;
333
+		case UIDMAP_ATTR:
334
+			config->uidmap = current;
335
+			config->uidmap_len = payload_len;
336
+			break;
337
+		case GIDMAP_ATTR:
338
+			config->gidmap = current;
339
+			config->gidmap_len = payload_len;
340
+			break;
341
+		case SETGROUP_ATTR:
342
+			config->is_setgroup = readint8(current);
343
+			break;
344
+		default:
345
+			bail("unknown netlink message type %d", nlattr->nla_type);
346
+		}
347
+
348
+		current += NLA_ALIGN(payload_len);
349
+	}
350
+}
351
+
352
+void nl_free(struct nlconfig_t *config)
353
+{
354
+	free(config->data);
355
+}
356
+
357
/*
 * Joins every namespace listed in nslist, a comma-separated list of
 * "type:path" entries (e.g. "net:/proc/1/ns/net,ipc:/proc/1/ns/ipc").
 *
 * nslist is modified in place (it is tokenised with strtok_r() and each
 * ':' is overwritten with a NUL). All paths are opened before the first
 * setns(2) call, then the namespaces are joined in the order given. Any
 * failure is fatal via bail().
 */
void join_namespaces(char *nslist)
{
	int num = 0, i;
	char *saveptr = NULL;
	char *namespace = strtok_r(nslist, ",", &saveptr);
	struct namespace_t {
		int fd;
		int ns;
		char path[PATH_MAX];
	} *namespaces = NULL;

	if (!namespace || !strlen(namespace) || !strlen(nslist))
		bail("ns paths are empty");

	/*
	 * We have to open the file descriptors first, since after
	 * we join the mnt namespace we might no longer be able to
	 * access the paths.
	 */
	do {
		int fd;
		char *path;
		struct namespace_t *ns;

		/* Resize the namespace array. */
		namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
		if (!namespaces)
			bail("failed to reallocate namespace array");
		ns = &namespaces[num - 1];

		/* Split 'ns:path'. */
		path = strchr(namespace, ':');
		if (!path)
			bail("failed to parse %s", namespace);
		*path++ = '\0';

		/* Report the path on failure: that is the thing we could not open. */
		fd = open(path, O_RDONLY);
		if (fd < 0)
			bail("failed to open %s", path);

		ns->fd = fd;
		ns->ns = nsflag(namespace);
		/* strncpy() does not terminate on truncation, so do it explicitly. */
		strncpy(ns->path, path, PATH_MAX - 1);
		ns->path[PATH_MAX - 1] = '\0';
	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);

	/*
	 * The ordering in which we join namespaces is important. We should
	 * always join the user namespace *first*. This is all guaranteed
	 * from the container_linux.go side of this, so we're just going to
	 * follow the order given to us.
	 */

	for (i = 0; i < num; i++) {
		/* Use a pointer: each entry embeds a PATH_MAX buffer, so avoid copying it. */
		struct namespace_t *ns = &namespaces[i];

		if (setns(ns->fd, ns->ns) < 0)
			bail("failed to setns to %s", ns->path);

		close(ns->fd);
	}

	free(namespaces);
}
421
+
422
/*
 * Bootstrap entry point.
 *
 * If initpipe() reports no init pipe (-1) this returns immediately.
 * Otherwise it parses the netlink configuration from the pipe, creates a
 * socketpair used for synchronisation, and then runs a three-stage state
 * machine driven by setjmp()/longjmp() and clone_parent():
 *
 *   stage 0 (JUMP_PARENT): clones stage 1, services its sync requests
 *                          (user mappings, pid hand-off), writes the final
 *                          child pid as JSON to the init pipe and exit(0)s.
 *   stage 1 (JUMP_CHILD):  joins/unshares namespaces, clones stage 2,
 *                          reports its pid to stage 0 and exit(0)s.
 *   stage 2 (JUMP_INIT):   finishes setup and is the only stage that
 *                          returns to the caller.
 *
 * NOTE(review): JUMP_PARENT is presumably the value of the direct setjmp()
 * return (0); the JUMP_* constants are defined outside this view -- confirm.
 */
void nsexec(void)
{
	int pipenum;
	jmp_buf env;
	int syncpipe[2];
	struct nlconfig_t config = {0};

	/*
	 * If we don't have an init pipe, just return to the go routine.
	 * We'll only get an init pipe for start or exec.
	 */
	pipenum = initpipe();
	if (pipenum == -1)
		return;

	/* Parse all of the netlink configuration. */
	nl_parse(pipenum, &config);

	/* Pipe so we can tell the child when we've finished setting up. */
	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncpipe) < 0)
		bail("failed to setup sync pipe between parent and child");

	/* TODO: Currently we aren't dealing with child deaths properly. */

	/*
	 * Okay, so this is quite annoying.
	 *
	 * In order for this unsharing code to be more extensible we need to split
	 * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case
	 * would be if we did clone(CLONE_NEWUSER) and the other namespaces
	 * separately, but because of SELinux issues we cannot really do that. But
	 * we cannot just dump the namespace flags into clone(...) because several
	 * usecases (such as rootless containers) require more granularity around
	 * the namespace setup. In addition, some older kernels had issues where
	 * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot
	 * handle this while also dealing with SELinux so we choose SELinux support
	 * over broken kernel support).
	 *
	 * However, if we unshare(2) the user namespace *before* we clone(2), then
	 * all hell breaks loose.
	 *
	 * The parent no longer has permissions to do many things (unshare(2) drops
	 * all capabilities in your old namespace), and the container cannot be set
	 * up to have more than one {uid,gid} mapping. This is obviously less than
	 * ideal. In order to fix this, we have to first clone(2) and then unshare.
	 *
	 * Unfortunately, it's not as simple as that. We have to fork to enter the
	 * PID namespace (the PID namespace only applies to children). Since we'll
	 * have to double-fork, this clone_parent() call won't be able to get the
	 * PID of the _actual_ init process (without doing more synchronisation than
	 * I can deal with at the moment). So we'll just get the parent to send it
	 * for us, the only job of this process is to update
	 * /proc/pid/{setgroups,uid_map,gid_map}.
	 *
	 * And as a result of the above, we also need to setns(2) in the first child
	 * because if we join a PID namespace in the topmost parent then our child
	 * will be in that namespace (and it will not be able to give us a PID value
	 * that makes sense without resorting to sending things with cmsg).
	 *
	 * This also deals with an older issue caused by dumping cloneflags into
	 * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
	 * we have to unshare(2) before clone(2) in order to do this. This was fixed
	 * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
	 * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
	 * aware, the last mainline kernel which had this bug was Linux 3.12.
	 * However, we cannot comment on which kernels the broken patch was
	 * backported to.
	 *
	 * -- Aleksa "what has my life come to?" Sarai
	 */

	/*
	 * Each stage re-enters this switch: the cloned children longjmp() back
	 * here (via child_func) with the jump value passed to clone_parent().
	 */
	switch (setjmp(env)) {
	/*
	 * Stage 0: We're in the parent. Our job is just to create a new child
	 *          (stage 1: JUMP_CHILD) process and write its uid_map and
	 *          gid_map. That process will go on to create a new process, then
	 *          it will send us its PID which we will send to the bootstrap
	 *          process.
	 */
	case JUMP_PARENT: {
			int len;
			pid_t child;
			char buf[JSON_MAX];

			/* For debugging. */
			prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0);

			/* Start the process of getting a container. */
			child = clone_parent(&env, JUMP_CHILD);
			if (child < 0)
				bail("unable to fork: child_func");

			/* State machine for synchronisation with the children. */
			while (true) {
				enum sync_t s;

				/* This doesn't need to be global, we're in the parent. */
				int syncfd = syncpipe[1];

				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
					bail("failed to sync with child: next state");

				switch (s) {
				case SYNC_ERR: {
						/* We have to mirror the error code of the child. */
						int ret;

						if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
							bail("failed to sync with child: read(error code)");

						exit(ret);
					}
					break;
				case SYNC_USERMAP_PLS:
					/* Enable setgroups(2) if we've been asked to. */
					if (config.is_setgroup)
						update_setgroups(child, SETGROUPS_ALLOW);

					/* Set up mappings. */
					update_uidmap(child, config.uidmap, config.uidmap_len);
					update_gidmap(child, config.gidmap, config.gidmap_len);

					s = SYNC_USERMAP_ACK;
					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
						kill(child, SIGKILL);
						bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
					}
					break;
				case SYNC_USERMAP_ACK:
					/* We should _never_ receive acks. */
					kill(child, SIGKILL);
					bail("failed to sync with child: unexpected SYNC_USERMAP_ACK");
					break;
				case SYNC_RECVPID_PLS: {
						/* Keep the stage 1 pid: `child` is overwritten with the stage 2 pid below. */
						pid_t old = child;

						/* Get the init_func pid. */
						if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
							kill(old, SIGKILL);
							bail("failed to sync with child: read(childpid)");
						}

						/* Send ACK. */
						s = SYNC_RECVPID_ACK;
						if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
							kill(old, SIGKILL);
							kill(child, SIGKILL);
							bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
						}
					}

					/* Leave the loop. */
					goto out;
				case SYNC_RECVPID_ACK:
					/* We should _never_ receive acks. */
					kill(child, SIGKILL);
					bail("failed to sync with child: unexpected SYNC_RECVPID_ACK");
					break;
				}
			}

		out:
			/* Send the init_func pid back to our parent. */
			len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child);
			if (len < 0) {
				kill(child, SIGKILL);
				bail("unable to generate JSON for child pid");
			}
			if (write(pipenum, buf, len) != len) {
				kill(child, SIGKILL);
				bail("unable to send child pid to bootstrapper");
			}

			exit(0);
		}

	/*
	 * Stage 1: We're in the first child process. Our job is to join any
	 *          provided user namespaces in the netlink payload. If we've been
	 *          asked to CLONE_NEWUSER, we will unshare the user namespace and
	 *          ask our parent (stage 0) to set up our user mappings for us.
	 *          Then, we unshare the rest of the requested namespaces and
	 *          create a new child (stage 2: JUMP_INIT).  We then send the
	 *          child's PID to our parent (stage 0).
	 */
	case JUMP_CHILD: {
			pid_t child;
			enum sync_t s;

			/* We're in a child and thus need to tell the parent if we die. */
			/* NOTE(review): syncfd is not declared in this function; presumably a file-scope variable -- confirm. */
			syncfd = syncpipe[0];

			/* For debugging. */
			prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0);

			/*
			 * We need to setns first. We cannot do this earlier (in stage 0)
			 * because of the fact that we forked to get here (the PID of
			 * [stage 2: JUMP_INIT]) would be meaningless). We could send it
			 * using cmsg(3) but that's just annoying.
			 */
			if (config.namespaces)
				join_namespaces(config.namespaces);

			/*
			 * Unshare all of the namespaces. Now, it should be noted that this
			 * ordering might break in the future (especially with rootless
			 * containers). But for now, it's not possible to split this into
			 * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
			 *
			 * Note that we don't merge this with clone() because there were
			 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
			 * was broken, so we'll just do it the long way anyway.
			 */
			if (unshare(config.cloneflags) < 0)
				bail("failed to unshare namespaces");

			/*
			 * Deal with user namespaces first. They are quite special, as they
			 * affect our ability to unshare other namespaces and are used as
			 * context for privilege checks.
			 */
			if (config.cloneflags & CLONE_NEWUSER) {
				/*
				 * We don't have the privileges to do any mapping here (see the
				 * clone_parent rant). So signal our parent to hook us up.
				 */

				s = SYNC_USERMAP_PLS;
				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
					bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");

				/* ... wait for mapping ... */

				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
					bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
				if (s != SYNC_USERMAP_ACK)
					bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
			}

			/* TODO: What about non-namespace clone flags that we're dropping here? */
			child = clone_parent(&env, JUMP_INIT);
			if (child < 0)
				bail("unable to fork: init_func");

			/* Send the child to our parent, which knows what it's doing. */
			s = SYNC_RECVPID_PLS;
			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
				kill(child, SIGKILL);
				bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
			}
			if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
				kill(child, SIGKILL);
				bail("failed to sync with parent: write(childpid)");
			}

			/* ... wait for parent to get the pid ... */

			if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
				kill(child, SIGKILL);
				bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
			}
			if (s != SYNC_RECVPID_ACK) {
				kill(child, SIGKILL);
				bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
			}

			/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
			exit(0);
		}

	/*
	 * Stage 2: We're the final child process, and the only process that will
	 *          actually return to the Go runtime. Our job is to just do the
	 *          final cleanup steps and then return to the Go runtime to allow
	 *          init_linux.go to run.
	 */
	case JUMP_INIT: {
			/*
			 * We're inside the child now, having jumped from the
			 * start_child() code after forking in the parent.
			 */
			int consolefd = config.consolefd;

			/* We're in a child and thus need to tell the parent if we die. */
			syncfd = syncpipe[0];

			/* For debugging. */
			prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0);

			if (setsid() < 0)
				bail("setsid failed");

			if (setuid(0) < 0)
				bail("setuid failed");

			if (setgid(0) < 0)
				bail("setgid failed");

			if (setgroups(0, NULL) < 0)
				bail("setgroups failed");

			/* If a console was opened by nl_parse(), make it the controlling tty and stdio. */
			if (consolefd != -1) {
				if (ioctl(consolefd, TIOCSCTTY, 0) < 0)
					bail("ioctl TIOCSCTTY failed");
				if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO)
					bail("failed to dup stdin");
				if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO)
					bail("failed to dup stdout");
				if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO)
					bail("failed to dup stderr");
			}

			/* Close sync pipes. */
			close(syncpipe[0]);
			close(syncpipe[1]);

			/* Free netlink data. */
			nl_free(&config);

			/* Finish executing, let the Go runtime take over. */
			return;
		}
	default:
		bail("unexpected jump value");
		break;
	}

	/* Should never be reached. */
	bail("should never be reached");
}
... ...
@@ -355,6 +355,12 @@ func FreeLxcContexts(scon string) {
355 355
 	}
356 356
 }
357 357
 
358
// roFileLabel holds the "ro_file" label read by GetLxcContexts; when the
// contexts file has no such entry, GetLxcContexts sets it to the regular
// file label.
var roFileLabel string

// GetROFileLabel returns the current value of the package-level read-only
// file label.
func GetROFileLabel() (fileLabel string) {
	fileLabel = roFileLabel
	return fileLabel
}
363
+
358 364
 func GetLxcContexts() (processLabel string, fileLabel string) {
359 365
 	var (
360 366
 		val, key string
... ...
@@ -399,6 +405,9 @@ func GetLxcContexts() (processLabel string, fileLabel string) {
399 399
 			if key == "file" {
400 400
 				fileLabel = strings.Trim(val, "\"")
401 401
 			}
402
+			if key == "ro_file" {
403
+				roFileLabel = strings.Trim(val, "\"")
404
+			}
402 405
 		}
403 406
 	}
404 407
 
... ...
@@ -406,6 +415,9 @@ func GetLxcContexts() (processLabel string, fileLabel string) {
406 406
 		return "", ""
407 407
 	}
408 408
 
409
+	if roFileLabel == "" {
410
+		roFileLabel = fileLabel
411
+	}
409 412
 exit:
410 413
 	//	mcs := IntToMcs(os.Getpid(), 1024)
411 414
 	mcs := uniqMcs(1024)
... ...
@@ -14,8 +14,10 @@ func GetProcessStartTime(pid int) (string, error) {
14 14
 	if err != nil {
15 15
 		return "", err
16 16
 	}
17
+	return parseStartTime(string(data))
18
+}
17 19
 
18
-	parts := strings.Split(string(data), " ")
20
// statParseError describes a /proc/<pid>/stat line that parseStartTime could
// not decode.
type statParseError string

// Error implements the error interface.
func (e statParseError) Error() string { return string(e) }

// parseStartTime extracts the process start time (field 22) from the
// contents of a /proc/<pid>/stat file.
//
// From the man page:
//
// (22)  The  time the process started after system boot.  In kernels before Linux 2.6, this
// value was expressed in jiffies.  Since Linux 2.6, the value is expressed in  clock  ticks
// (divide by sysconf(_SC_CLK_TCK)).
//
// NOTE:
// pos 2 could contain space and is inside `(` and `)`:
// (2) comm  %s
// The filename of the executable, in parentheses.
// This is visible whether or not the executable is
// swapped out.
//
// the following is an example:
// 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0
//
// An error is returned when the closing `)` of the comm field is missing or
// when fewer than 22 fields are present, instead of panicking on an
// out-of-range index.
func parseStartTime(stat string) (string, error) {
	// Everything after the last `)` is whitespace-separated and starts at
	// field 3, so the comm field cannot confuse the split.
	end := strings.LastIndex(stat, ")")
	if end < 0 {
		return "", statParseError("invalid stat data: missing ')' after comm field")
	}

	const startTimeIdx = 22 - 3 // fields are 1-based and the first two were cut off
	fields := strings.Fields(stat[end+1:])
	if len(fields) <= startTimeIdx {
		return "", statParseError("invalid stat data: too few fields")
	}
	return fields[startTimeIdx], nil
}