Until we can support existing behaviour with `sudo`, disable
ambient capabilities in the runc build.
Add tests that a non-root user cannot use default capabilities,
and that capabilities are working as expected.
Test for #27590
Update runc.
Signed-off-by: Justin Cormack <justin.cormack@docker.com>
| ... | ... |
@@ -6,6 +6,10 @@ WORKDIR /usr/src/ |
| 6 | 6 |
|
| 7 | 7 |
RUN gcc -g -Wall -static userns.c -o /usr/bin/userns-test \ |
| 8 | 8 |
&& gcc -g -Wall -static ns.c -o /usr/bin/ns-test \ |
| 9 |
- && gcc -g -Wall -static acct.c -o /usr/bin/acct-test |
|
| 9 |
+ && gcc -g -Wall -static acct.c -o /usr/bin/acct-test \ |
|
| 10 |
+ && gcc -g -Wall -static setuid.c -o /usr/bin/setuid-test \ |
|
| 11 |
+ && gcc -g -Wall -static setgid.c -o /usr/bin/setgid-test \ |
|
| 12 |
+ && gcc -g -Wall -static socket.c -o /usr/bin/socket-test \ |
|
| 13 |
+ && gcc -g -Wall -static raw.c -o /usr/bin/raw-test |
|
| 10 | 14 |
|
| 11 | 15 |
RUN [ "$(uname -m)" = "x86_64" ] && gcc -s -m32 -nostdlib exit32.s -o /usr/bin/exit32-test || true |
| 12 | 16 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,14 @@ |
| 0 |
+#include <stdio.h> |
|
| 1 |
+#include <unistd.h> |
|
| 2 |
+#include <sys/socket.h> |
|
| 3 |
+#include <netinet/ip.h> |
|
| 4 |
+#include <netinet/udp.h> |
|
| 5 |
+ |
|
| 6 |
+int main() {
|
|
| 7 |
+ if (socket(PF_INET, SOCK_RAW, IPPROTO_UDP) == -1) {
|
|
| 8 |
+ perror("socket");
|
|
| 9 |
+ return 1; |
|
| 10 |
+ } |
|
| 11 |
+ |
|
| 12 |
+ return 0; |
|
| 13 |
+} |
| 0 | 11 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,30 @@ |
| 0 |
+#include <stdio.h> |
|
| 1 |
+#include <unistd.h> |
|
| 2 |
+#include <sys/types.h> |
|
| 3 |
+#include <sys/socket.h> |
|
| 4 |
+#include <netinet/in.h> |
|
| 5 |
+#include <arpa/inet.h> |
|
| 6 |
+ |
|
| 7 |
+int main() {
|
|
| 8 |
+ int s; |
|
| 9 |
+ struct sockaddr_in sin; |
|
| 10 |
+ |
|
| 11 |
+ s = socket(AF_INET, SOCK_STREAM, 0); |
|
| 12 |
+ if (s == -1) {
|
|
| 13 |
+ perror("socket");
|
|
| 14 |
+ return 1; |
|
| 15 |
+ } |
|
| 16 |
+ |
|
| 17 |
+ sin.sin_family = AF_INET; |
|
| 18 |
+ sin.sin_addr.s_addr = INADDR_ANY; |
|
| 19 |
+ sin.sin_port = htons(80); |
|
| 20 |
+ |
|
| 21 |
+ if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) == -1) {
|
|
| 22 |
+ perror("bind");
|
|
| 23 |
+ return 1; |
|
| 24 |
+ } |
|
| 25 |
+ |
|
| 26 |
+ close(s); |
|
| 27 |
+ |
|
| 28 |
+ return 0; |
|
| 29 |
+} |
| ... | ... |
@@ -3,7 +3,7 @@ set -e |
| 3 | 3 |
set -x |
| 4 | 4 |
|
| 5 | 5 |
TOMLV_COMMIT=9baf8a8a9f2ed20a8e54160840c492f937eeaf9a |
| 6 |
-RUNC_COMMIT=02f8fa7863dd3f82909a73e2061897828460d52f |
|
| 6 |
+RUNC_COMMIT=ac031b5bf1cc92239461125f4c1ffb760522bbf2 |
|
| 7 | 7 |
CONTAINERD_COMMIT=52ef1ceb4b660c42cf4ea9013180a5663968d4c7 |
| 8 | 8 |
GRIMES_COMMIT=fe069a03affd2547fdb05e5b8b07202d2e41735b |
| 9 | 9 |
LIBNETWORK_COMMIT=0f534354b813003a754606689722fe253101bc4e |
| ... | ... |
@@ -20,11 +20,12 @@ else |
| 20 | 20 |
export GOPATH="$TMP_GOPATH" |
| 21 | 21 |
fi |
| 22 | 22 |
|
| 23 |
+# Do not build with ambient capabilities support |
|
| 23 | 24 |
RUNC_BUILDTAGS="${RUNC_BUILDTAGS:-"seccomp apparmor selinux"}"
|
| 24 | 25 |
|
| 25 | 26 |
install_runc() {
|
| 26 | 27 |
echo "Install runc version $RUNC_COMMIT" |
| 27 |
- git clone https://github.com/opencontainers/runc.git "$GOPATH/src/github.com/opencontainers/runc" |
|
| 28 |
+ git clone https://github.com/docker/runc.git "$GOPATH/src/github.com/opencontainers/runc" |
|
| 28 | 29 |
cd "$GOPATH/src/github.com/opencontainers/runc" |
| 29 | 30 |
git checkout -q "$RUNC_COMMIT" |
| 30 | 31 |
make BUILDTAGS="$RUNC_BUILDTAGS" $1 |
| ... | ... |
@@ -1155,24 +1155,185 @@ func (s *DockerSuite) TestRunNoNewPrivSetuid(c *check.C) {
|
| 1155 | 1155 |
} |
| 1156 | 1156 |
} |
| 1157 | 1157 |
|
| 1158 |
-func (s *DockerSuite) TestRunAmbientCapabilities(c *check.C) {
|
|
| 1159 |
- testRequires(c, DaemonIsLinux, ambientCapabilities) |
|
| 1158 |
+func (s *DockerSuite) TestUserNoEffectiveCapabilitiesChown(c *check.C) {
|
|
| 1159 |
+ testRequires(c, DaemonIsLinux) |
|
| 1160 |
+ ensureSyscallTest(c) |
|
| 1161 |
+ |
|
| 1162 |
+ // test that a root user has default capability CAP_CHOWN |
|
| 1163 |
+ runCmd := exec.Command(dockerBinary, "run", "busybox", "chown", "100", "/tmp") |
|
| 1164 |
+ _, _, err := runCommandWithOutput(runCmd) |
|
| 1165 |
+ c.Assert(err, check.IsNil) |
|
| 1166 |
+ // test that non root user does not have default capability CAP_CHOWN |
|
| 1167 |
+ runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "busybox", "chown", "100", "/tmp") |
|
| 1168 |
+ out, _, err := runCommandWithOutput(runCmd) |
|
| 1169 |
+ c.Assert(err, checker.NotNil, check.Commentf(out)) |
|
| 1170 |
+ c.Assert(out, checker.Contains, "Operation not permitted") |
|
| 1171 |
+ // test that root user can drop default capability CAP_CHOWN |
|
| 1172 |
+ runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "chown", "busybox", "chown", "100", "/tmp") |
|
| 1173 |
+ out, _, err = runCommandWithOutput(runCmd) |
|
| 1174 |
+ c.Assert(err, checker.NotNil, check.Commentf(out)) |
|
| 1175 |
+ c.Assert(out, checker.Contains, "Operation not permitted") |
|
| 1176 |
+} |
|
| 1177 |
+ |
|
| 1178 |
+func (s *DockerSuite) TestUserNoEffectiveCapabilitiesDacOverride(c *check.C) {
|
|
| 1179 |
+ testRequires(c, DaemonIsLinux) |
|
| 1180 |
+ ensureSyscallTest(c) |
|
| 1181 |
+ |
|
| 1182 |
+ // test that a root user has default capability CAP_DAC_OVERRIDE |
|
| 1183 |
+ runCmd := exec.Command(dockerBinary, "run", "busybox", "sh", "-c", "echo test > /etc/passwd") |
|
| 1184 |
+ _, _, err := runCommandWithOutput(runCmd) |
|
| 1185 |
+ c.Assert(err, check.IsNil) |
|
| 1186 |
+ // test that non root user does not have default capability CAP_DAC_OVERRIDE |
|
| 1187 |
+ runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "busybox", "sh", "-c", "echo test > /etc/passwd") |
|
| 1188 |
+ out, _, err := runCommandWithOutput(runCmd) |
|
| 1189 |
+ c.Assert(err, checker.NotNil, check.Commentf(out)) |
|
| 1190 |
+ c.Assert(out, checker.Contains, "Permission denied") |
|
| 1191 |
+ // TODO test that root user can drop default capability CAP_DAC_OVERRIDE |
|
| 1192 |
+} |
|
| 1193 |
+ |
|
| 1194 |
+func (s *DockerSuite) TestUserNoEffectiveCapabilitiesFowner(c *check.C) {
|
|
| 1195 |
+ testRequires(c, DaemonIsLinux) |
|
| 1196 |
+ ensureSyscallTest(c) |
|
| 1197 |
+ |
|
| 1198 |
+ // test that a root user has default capability CAP_FOWNER |
|
| 1199 |
+ runCmd := exec.Command(dockerBinary, "run", "busybox", "chmod", "777", "/etc/passwd") |
|
| 1200 |
+ _, _, err := runCommandWithOutput(runCmd) |
|
| 1201 |
+ c.Assert(err, check.IsNil) |
|
| 1202 |
+ // test that non root user does not have default capability CAP_FOWNER |
|
| 1203 |
+ runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "busybox", "chmod", "777", "/etc/passwd") |
|
| 1204 |
+ out, _, err := runCommandWithOutput(runCmd) |
|
| 1205 |
+ c.Assert(err, checker.NotNil, check.Commentf(out)) |
|
| 1206 |
+ c.Assert(out, checker.Contains, "Operation not permitted") |
|
| 1207 |
+ // TODO test that root user can drop default capability CAP_FOWNER |
|
| 1208 |
+} |
|
| 1209 |
+ |
|
| 1210 |
+// TODO CAP_KILL |
|
| 1211 |
+ |
|
| 1212 |
+func (s *DockerSuite) TestUserNoEffectiveCapabilitiesSetuid(c *check.C) {
|
|
| 1213 |
+ testRequires(c, DaemonIsLinux) |
|
| 1214 |
+ ensureSyscallTest(c) |
|
| 1215 |
+ |
|
| 1216 |
+ // test that a root user has default capability CAP_SETUID |
|
| 1217 |
+ runCmd := exec.Command(dockerBinary, "run", "syscall-test", "setuid-test") |
|
| 1218 |
+ _, _, err := runCommandWithOutput(runCmd) |
|
| 1219 |
+ c.Assert(err, check.IsNil) |
|
| 1220 |
+ // test that non root user does not have default capability CAP_SETUID |
|
| 1221 |
+ runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "syscall-test", "setuid-test") |
|
| 1222 |
+ out, _, err := runCommandWithOutput(runCmd) |
|
| 1223 |
+ c.Assert(err, checker.NotNil, check.Commentf(out)) |
|
| 1224 |
+ c.Assert(out, checker.Contains, "Operation not permitted") |
|
| 1225 |
+ // test that root user can drop default capability CAP_SETUID |
|
| 1226 |
+ runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "setuid", "syscall-test", "setuid-test") |
|
| 1227 |
+ out, _, err = runCommandWithOutput(runCmd) |
|
| 1228 |
+ c.Assert(err, checker.NotNil, check.Commentf(out)) |
|
| 1229 |
+ c.Assert(out, checker.Contains, "Operation not permitted") |
|
| 1230 |
+} |
|
| 1231 |
+ |
|
| 1232 |
+func (s *DockerSuite) TestUserNoEffectiveCapabilitiesSetgid(c *check.C) {
|
|
| 1233 |
+ testRequires(c, DaemonIsLinux) |
|
| 1234 |
+ ensureSyscallTest(c) |
|
| 1235 |
+ |
|
| 1236 |
+ // test that a root user has default capability CAP_SETGID |
|
| 1237 |
+ runCmd := exec.Command(dockerBinary, "run", "syscall-test", "setgid-test") |
|
| 1238 |
+ _, _, err := runCommandWithOutput(runCmd) |
|
| 1239 |
+ c.Assert(err, check.IsNil) |
|
| 1240 |
+ // test that non root user does not have default capability CAP_SETGID |
|
| 1241 |
+ runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "syscall-test", "setgid-test") |
|
| 1242 |
+ out, _, err := runCommandWithOutput(runCmd) |
|
| 1243 |
+ c.Assert(err, checker.NotNil, check.Commentf(out)) |
|
| 1244 |
+ c.Assert(out, checker.Contains, "Operation not permitted") |
|
| 1245 |
+ // test that root user can drop default capability CAP_SETGID |
|
| 1246 |
+ runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "setgid", "syscall-test", "setgid-test") |
|
| 1247 |
+ out, _, err = runCommandWithOutput(runCmd) |
|
| 1248 |
+ c.Assert(err, checker.NotNil, check.Commentf(out)) |
|
| 1249 |
+ c.Assert(out, checker.Contains, "Operation not permitted") |
|
| 1250 |
+} |
|
| 1251 |
+ |
|
| 1252 |
+// TODO CAP_SETPCAP |
|
| 1253 |
+ |
|
| 1254 |
+func (s *DockerSuite) TestUserNoEffectiveCapabilitiesNetBindService(c *check.C) {
|
|
| 1255 |
+ testRequires(c, DaemonIsLinux) |
|
| 1256 |
+ ensureSyscallTest(c) |
|
| 1257 |
+ |
|
| 1258 |
+ // test that a root user has default capability CAP_NET_BIND_SERVICE |
|
| 1259 |
+ runCmd := exec.Command(dockerBinary, "run", "syscall-test", "socket-test") |
|
| 1260 |
+ _, _, err := runCommandWithOutput(runCmd) |
|
| 1261 |
+ c.Assert(err, check.IsNil) |
|
| 1262 |
+ // test that non root user does not have default capability CAP_NET_BIND_SERVICE |
|
| 1263 |
+ runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "syscall-test", "socket-test") |
|
| 1264 |
+ out, _, err := runCommandWithOutput(runCmd) |
|
| 1265 |
+ c.Assert(err, checker.NotNil, check.Commentf(out)) |
|
| 1266 |
+ c.Assert(out, checker.Contains, "Permission denied") |
|
| 1267 |
+ // test that root user can drop default capability CAP_NET_BIND_SERVICE |
|
| 1268 |
+ runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "net_bind_service", "syscall-test", "socket-test") |
|
| 1269 |
+ out, _, err = runCommandWithOutput(runCmd) |
|
| 1270 |
+ c.Assert(err, checker.NotNil, check.Commentf(out)) |
|
| 1271 |
+ c.Assert(out, checker.Contains, "Permission denied") |
|
| 1272 |
+} |
|
| 1273 |
+ |
|
| 1274 |
+func (s *DockerSuite) TestUserNoEffectiveCapabilitiesNetRaw(c *check.C) {
|
|
| 1275 |
+ testRequires(c, DaemonIsLinux) |
|
| 1276 |
+ ensureSyscallTest(c) |
|
| 1277 |
+ |
|
| 1278 |
+ // test that a root user has default capability CAP_NET_RAW |
|
| 1279 |
+ runCmd := exec.Command(dockerBinary, "run", "syscall-test", "raw-test") |
|
| 1280 |
+ _, _, err := runCommandWithOutput(runCmd) |
|
| 1281 |
+ c.Assert(err, check.IsNil) |
|
| 1282 |
+ // test that non root user does not have default capability CAP_NET_RAW |
|
| 1283 |
+ runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "syscall-test", "raw-test") |
|
| 1284 |
+ out, _, err := runCommandWithOutput(runCmd) |
|
| 1285 |
+ c.Assert(err, checker.NotNil, check.Commentf(out)) |
|
| 1286 |
+ c.Assert(out, checker.Contains, "Operation not permitted") |
|
| 1287 |
+ // test that root user can drop default capability CAP_NET_RAW |
|
| 1288 |
+ runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "net_raw", "syscall-test", "raw-test") |
|
| 1289 |
+ out, _, err = runCommandWithOutput(runCmd) |
|
| 1290 |
+ c.Assert(err, checker.NotNil, check.Commentf(out)) |
|
| 1291 |
+ c.Assert(out, checker.Contains, "Operation not permitted") |
|
| 1292 |
+} |
|
| 1160 | 1293 |
|
| 1161 |
- // test that a non root user can gain capabilities |
|
| 1162 |
- runCmd := exec.Command(dockerBinary, "run", "--user", "1000", "--cap-add", "chown", "busybox", "chown", "100", "/tmp") |
|
| 1294 |
+func (s *DockerSuite) TestUserNoEffectiveCapabilitiesChroot(c *check.C) {
|
|
| 1295 |
+ testRequires(c, DaemonIsLinux) |
|
| 1296 |
+ ensureSyscallTest(c) |
|
| 1297 |
+ |
|
| 1298 |
+ // test that a root user has default capability CAP_SYS_CHROOT |
|
| 1299 |
+ runCmd := exec.Command(dockerBinary, "run", "busybox", "chroot", "/", "/bin/true") |
|
| 1163 | 1300 |
_, _, err := runCommandWithOutput(runCmd) |
| 1164 | 1301 |
c.Assert(err, check.IsNil) |
| 1165 |
- // test that non root user has default capabilities |
|
| 1166 |
- runCmd = exec.Command(dockerBinary, "run", "--user", "1000", "busybox", "chown", "100", "/tmp") |
|
| 1167 |
- _, _, err = runCommandWithOutput(runCmd) |
|
| 1302 |
+ // test that non root user does not have default capability CAP_SYS_CHROOT |
|
| 1303 |
+ runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "busybox", "chroot", "/", "/bin/true") |
|
| 1304 |
+ out, _, err := runCommandWithOutput(runCmd) |
|
| 1305 |
+ c.Assert(err, checker.NotNil, check.Commentf(out)) |
|
| 1306 |
+ c.Assert(out, checker.Contains, "Operation not permitted") |
|
| 1307 |
+ // test that root user can drop default capability CAP_SYS_CHROOT |
|
| 1308 |
+ runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "sys_chroot", "busybox", "chroot", "/", "/bin/true") |
|
| 1309 |
+ out, _, err = runCommandWithOutput(runCmd) |
|
| 1310 |
+ c.Assert(err, checker.NotNil, check.Commentf(out)) |
|
| 1311 |
+ c.Assert(out, checker.Contains, "Operation not permitted") |
|
| 1312 |
+} |
|
| 1313 |
+ |
|
| 1314 |
+func (s *DockerSuite) TestUserNoEffectiveCapabilitiesMknod(c *check.C) {
|
|
| 1315 |
+ testRequires(c, DaemonIsLinux) |
|
| 1316 |
+ ensureSyscallTest(c) |
|
| 1317 |
+ |
|
| 1318 |
+ // test that a root user has default capability CAP_MKNOD |
|
| 1319 |
+ runCmd := exec.Command(dockerBinary, "run", "busybox", "mknod", "/tmp/node", "b", "1", "2") |
|
| 1320 |
+ _, _, err := runCommandWithOutput(runCmd) |
|
| 1168 | 1321 |
c.Assert(err, check.IsNil) |
| 1169 |
- // test this fails without cap_chown |
|
| 1170 |
- runCmd = exec.Command(dockerBinary, "run", "--user", "1000", "--cap-drop", "chown", "busybox", "chown", "100", "/tmp") |
|
| 1322 |
+ // test that non root user does not have default capability CAP_MKNOD |
|
| 1323 |
+ runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "busybox", "mknod", "/tmp/node", "b", "1", "2") |
|
| 1171 | 1324 |
out, _, err := runCommandWithOutput(runCmd) |
| 1172 | 1325 |
c.Assert(err, checker.NotNil, check.Commentf(out)) |
| 1173 |
- c.Assert(strings.TrimSpace(out), checker.Equals, "chown: /tmp: Operation not permitted") |
|
| 1326 |
+ c.Assert(out, checker.Contains, "Operation not permitted") |
|
| 1327 |
+ // test that root user can drop default capability CAP_MKNOD |
|
| 1328 |
+ runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "mknod", "busybox", "mknod", "/tmp/node", "b", "1", "2") |
|
| 1329 |
+ out, _, err = runCommandWithOutput(runCmd) |
|
| 1330 |
+ c.Assert(err, checker.NotNil, check.Commentf(out)) |
|
| 1331 |
+ c.Assert(out, checker.Contains, "Operation not permitted") |
|
| 1174 | 1332 |
} |
| 1175 | 1333 |
|
| 1334 |
+// TODO CAP_AUDIT_WRITE |
|
| 1335 |
+// TODO CAP_SETFCAP |
|
| 1336 |
+ |
|
| 1176 | 1337 |
func (s *DockerSuite) TestRunApparmorProcDirectory(c *check.C) {
|
| 1177 | 1338 |
testRequires(c, SameHostDaemon, Apparmor) |
| 1178 | 1339 |
|
| ... | ... |
@@ -1,6 +1,7 @@ |
| 1 | 1 |
package main |
| 2 | 2 |
|
| 3 | 3 |
import ( |
| 4 |
+ "fmt" |
|
| 4 | 5 |
"io/ioutil" |
| 5 | 6 |
"os" |
| 6 | 7 |
"os/exec" |
| ... | ... |
@@ -53,15 +54,14 @@ func ensureSyscallTest(c *check.C) {
|
| 53 | 53 |
gcc, err := exec.LookPath("gcc")
|
| 54 | 54 |
c.Assert(err, checker.IsNil, check.Commentf("could not find gcc"))
|
| 55 | 55 |
|
| 56 |
- out, err := exec.Command(gcc, "-g", "-Wall", "-static", "../contrib/syscall-test/userns.c", "-o", tmp+"/"+"userns-test").CombinedOutput() |
|
| 57 |
- c.Assert(err, checker.IsNil, check.Commentf(string(out))) |
|
| 58 |
- out, err = exec.Command(gcc, "-g", "-Wall", "-static", "../contrib/syscall-test/ns.c", "-o", tmp+"/"+"ns-test").CombinedOutput() |
|
| 59 |
- c.Assert(err, checker.IsNil, check.Commentf(string(out))) |
|
| 60 |
- out, err = exec.Command(gcc, "-g", "-Wall", "-static", "../contrib/syscall-test/acct.c", "-o", tmp+"/"+"acct-test").CombinedOutput() |
|
| 61 |
- c.Assert(err, checker.IsNil, check.Commentf(string(out))) |
|
| 56 |
+ tests := []string{"userns", "ns", "acct", "setuid", "setgid", "socket", "raw"}
|
|
| 57 |
+ for _, test := range tests {
|
|
| 58 |
+ out, err := exec.Command(gcc, "-g", "-Wall", "-static", fmt.Sprintf("../contrib/syscall-test/%s.c", test), "-o", fmt.Sprintf("%s/%s-test", tmp, test)).CombinedOutput()
|
|
| 59 |
+ c.Assert(err, checker.IsNil, check.Commentf(string(out))) |
|
| 60 |
+ } |
|
| 62 | 61 |
|
| 63 | 62 |
if runtime.GOOS == "linux" && runtime.GOARCH == "amd64" {
|
| 64 |
- out, err = exec.Command(gcc, "-s", "-m32", "-nostdlib", "../contrib/syscall-test/exit32.s", "-o", tmp+"/"+"exit32-test").CombinedOutput() |
|
| 63 |
+ out, err := exec.Command(gcc, "-s", "-m32", "-nostdlib", "../contrib/syscall-test/exit32.s", "-o", tmp+"/"+"exit32-test").CombinedOutput() |
|
| 65 | 64 |
c.Assert(err, checker.IsNil, check.Commentf(string(out))) |
| 66 | 65 |
} |
| 67 | 66 |
|
| ... | ... |
@@ -59,7 +59,7 @@ github.com/miekg/pkcs11 df8ae6ca730422dba20c768ff38ef7d79077a59f |
| 59 | 59 |
github.com/docker/go v1.5.1-1-1-gbaf439e |
| 60 | 60 |
github.com/agl/ed25519 d2b94fd789ea21d12fac1a4443dd3a3f79cda72c |
| 61 | 61 |
|
| 62 |
-github.com/opencontainers/runc 02f8fa7863dd3f82909a73e2061897828460d52f # libcontainer |
|
| 62 |
+github.com/opencontainers/runc ac031b5bf1cc92239461125f4c1ffb760522bbf2 # libcontainer |
|
| 63 | 63 |
github.com/opencontainers/runtime-spec 1c7c27d043c2a5e513a44084d2b10d77d1402b8c # specs |
| 64 | 64 |
github.com/seccomp/libseccomp-golang 32f571b70023028bd57d9288c20efbcb237f3ce0 |
| 65 | 65 |
# libcontainer deps (see src/github.com/opencontainers/runc/Godeps/Godeps.json) |
| ... | ... |
@@ -22,7 +22,7 @@ type Cgroup struct {
|
| 22 | 22 |
// The path is assumed to be relative to the host system cgroup mountpoint. |
| 23 | 23 |
Path string `json:"path"` |
| 24 | 24 |
|
| 25 |
- // ScopePrefix decribes prefix for the scope name |
|
| 25 |
+ // ScopePrefix describes prefix for the scope name |
|
| 26 | 26 |
ScopePrefix string `json:"scope_prefix"` |
| 27 | 27 |
|
| 28 | 28 |
// Paths represent the absolute cgroups paths to join. |
| ... | ... |
@@ -95,7 +95,7 @@ type Resources struct {
|
| 95 | 95 |
// IO read rate limit per cgroup per device, bytes per second. |
| 96 | 96 |
BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"` |
| 97 | 97 |
|
| 98 |
- // IO write rate limit per cgroup per divice, bytes per second. |
|
| 98 |
+ // IO write rate limit per cgroup per device, bytes per second. |
|
| 99 | 99 |
BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"` |
| 100 | 100 |
|
| 101 | 101 |
// IO read rate limit per cgroup per device, IO per second. |
| ... | ... |
@@ -85,11 +85,6 @@ type Config struct {
|
| 85 | 85 |
// that the parent process dies. |
| 86 | 86 |
ParentDeathSignal int `json:"parent_death_signal"` |
| 87 | 87 |
|
| 88 |
- // PivotDir allows a custom directory inside the container's root filesystem to be used as pivot, when NoPivotRoot is not set. |
|
| 89 |
- // When a custom PivotDir not set, a temporary dir inside the root filesystem will be used. The pivot dir needs to be writeable. |
|
| 90 |
- // This is required when using read only root filesystems. In these cases, a read/writeable path can be (bind) mounted somewhere inside the root filesystem to act as pivot. |
|
| 91 |
- PivotDir string `json:"pivot_dir"` |
|
| 92 |
- |
|
| 93 | 88 |
// Path to a directory containing the container's root filesystem. |
| 94 | 89 |
Rootfs string `json:"rootfs"` |
| 95 | 90 |
|
| ... | ... |
@@ -1,5 +1,11 @@ |
| 1 | 1 |
package configs |
| 2 | 2 |
|
| 3 |
+const ( |
|
| 4 |
+ // EXT_COPYUP is a directive to copy up the contents of a directory when |
|
| 5 |
+ // a tmpfs is mounted over it. |
|
| 6 |
+ EXT_COPYUP = 1 << iota |
|
| 7 |
+) |
|
| 8 |
+ |
|
| 3 | 9 |
type Mount struct {
|
| 4 | 10 |
// Source path for the mount. |
| 5 | 11 |
Source string `json:"source"` |
| ... | ... |
@@ -22,6 +28,9 @@ type Mount struct {
|
| 22 | 22 |
// Relabel source if set, "z" indicates shared, "Z" indicates unshared. |
| 23 | 23 |
Relabel string `json:"relabel"` |
| 24 | 24 |
|
| 25 |
+ // Extensions are additional flags that are specific to runc. |
|
| 26 |
+ Extensions int `json:"extensions"` |
|
| 27 |
+ |
|
| 25 | 28 |
// Optional Command to be run before Source is mounted. |
| 26 | 29 |
PremountCmds []Command `json:"premount_cmds"` |
| 27 | 30 |
|
| ... | ... |
@@ -22,8 +22,8 @@ var ( |
| 22 | 22 |
supportedNamespaces = make(map[NamespaceType]bool) |
| 23 | 23 |
) |
| 24 | 24 |
|
| 25 |
-// nsToFile converts the namespace type to its filename |
|
| 26 |
-func nsToFile(ns NamespaceType) string {
|
|
| 25 |
+// NsName converts the namespace type to its filename |
|
| 26 |
+func NsName(ns NamespaceType) string {
|
|
| 27 | 27 |
switch ns {
|
| 28 | 28 |
case NEWNET: |
| 29 | 29 |
return "net" |
| ... | ... |
@@ -50,7 +50,7 @@ func IsNamespaceSupported(ns NamespaceType) bool {
|
| 50 | 50 |
if ok {
|
| 51 | 51 |
return supported |
| 52 | 52 |
} |
| 53 |
- nsFile := nsToFile(ns) |
|
| 53 |
+ nsFile := NsName(ns) |
|
| 54 | 54 |
// if the namespace type is unknown, just return false |
| 55 | 55 |
if nsFile == "" {
|
| 56 | 56 |
return false |
| ... | ... |
@@ -84,7 +84,7 @@ func (n *Namespace) GetPath(pid int) string {
|
| 84 | 84 |
if n.Path != "" {
|
| 85 | 85 |
return n.Path |
| 86 | 86 |
} |
| 87 |
- return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type))
|
|
| 87 |
+ return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type))
|
|
| 88 | 88 |
} |
| 89 | 89 |
|
| 90 | 90 |
func (n *Namespaces) Remove(t NamespaceType) bool {
|
| ... | ... |
@@ -33,15 +33,19 @@ func InitLabels(options []string) (string, string, error) {
|
| 33 | 33 |
pcon := selinux.NewContext(processLabel) |
| 34 | 34 |
mcon := selinux.NewContext(mountLabel) |
| 35 | 35 |
for _, opt := range options {
|
| 36 |
- if opt == "disable" {
|
|
| 37 |
- return "", "", nil |
|
| 36 |
+ val := strings.SplitN(opt, "=", 2) |
|
| 37 |
+ if val[0] != "label" {
|
|
| 38 |
+ continue |
|
| 39 |
+ } |
|
| 40 |
+ if len(val) < 2 {
|
|
| 41 |
+ return "", "", fmt.Errorf("bad label option %q, valid options 'disable' or \n'user, role, level, type' followed by ':' and a value", opt)
|
|
| 38 | 42 |
} |
| 39 |
- if i := strings.Index(opt, ":"); i == -1 {
|
|
| 40 |
- return "", "", fmt.Errorf("Bad label option %q, valid options 'disable' or \n'user, role, level, type' followed by ':' and a value", opt)
|
|
| 43 |
+ if val[1] == "disable" {
|
|
| 44 |
+ return "", "", nil |
|
| 41 | 45 |
} |
| 42 |
- con := strings.SplitN(opt, ":", 2) |
|
| 43 |
- if !validOptions[con[0]] {
|
|
| 44 |
- return "", "", fmt.Errorf("Bad label option %q, valid options 'disable, user, role, level, type'", con[0])
|
|
| 46 |
+ con := strings.SplitN(val[1], ":", 2) |
|
| 47 |
+ if len(con) < 2 || !validOptions[con[0]] {
|
|
| 48 |
+ return "", "", fmt.Errorf("bad label option %q, valid options 'disable, user, role, level, type'", con[0])
|
|
| 45 | 49 |
|
| 46 | 50 |
} |
| 47 | 51 |
pcon[con[0]] = con[1] |
| ... | ... |
@@ -55,6 +59,10 @@ func InitLabels(options []string) (string, string, error) {
|
| 55 | 55 |
return processLabel, mountLabel, nil |
| 56 | 56 |
} |
| 57 | 57 |
|
| 58 |
+func GetROMountLabel() string {
|
|
| 59 |
+ return selinux.GetROFileLabel() |
|
| 60 |
+} |
|
| 61 |
+ |
|
| 58 | 62 |
// DEPRECATED: The GenLabels function is only to be used during the transition to the official API. |
| 59 | 63 |
func GenLabels(options string) (string, string, error) {
|
| 60 | 64 |
return InitLabels(strings.Fields(options)) |
| 61 | 65 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,32 @@ |
| 0 |
+#ifndef NSENTER_NAMESPACE_H |
|
| 1 |
+#define NSENTER_NAMESPACE_H |
|
| 2 |
+ |
|
| 3 |
+#ifndef _GNU_SOURCE |
|
| 4 |
+# define _GNU_SOURCE |
|
| 5 |
+#endif |
|
| 6 |
+#include <sched.h> |
|
| 7 |
+ |
|
| 8 |
+/* All of these are taken from include/uapi/linux/sched.h */ |
|
| 9 |
+#ifndef CLONE_NEWNS |
|
| 10 |
+# define CLONE_NEWNS 0x00020000 /* New mount namespace group */ |
|
| 11 |
+#endif |
|
| 12 |
+#ifndef CLONE_NEWCGROUP |
|
| 13 |
+# define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */ |
|
| 14 |
+#endif |
|
| 15 |
+#ifndef CLONE_NEWUTS |
|
| 16 |
+# define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ |
|
| 17 |
+#endif |
|
| 18 |
+#ifndef CLONE_NEWIPC |
|
| 19 |
+# define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ |
|
| 20 |
+#endif |
|
| 21 |
+#ifndef CLONE_NEWUSER |
|
| 22 |
+# define CLONE_NEWUSER 0x10000000 /* New user namespace */ |
|
| 23 |
+#endif |
|
| 24 |
+#ifndef CLONE_NEWPID |
|
| 25 |
+# define CLONE_NEWPID 0x20000000 /* New pid namespace */ |
|
| 26 |
+#endif |
|
| 27 |
+#ifndef CLONE_NEWNET |
|
| 28 |
+# define CLONE_NEWNET 0x40000000 /* New network namespace */ |
|
| 29 |
+#endif |
|
| 30 |
+ |
|
| 31 |
+#endif /* NSENTER_NAMESPACE_H */ |
| 0 | 12 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,25 @@ |
| 0 |
+// +build linux,gccgo |
|
| 1 |
+ |
|
| 2 |
+package nsenter |
|
| 3 |
+ |
|
| 4 |
+/* |
|
| 5 |
+#cgo CFLAGS: -Wall |
|
| 6 |
+extern void nsexec(); |
|
| 7 |
+void __attribute__((constructor)) init(void) {
|
|
| 8 |
+ nsexec(); |
|
| 9 |
+} |
|
| 10 |
+*/ |
|
| 11 |
+import "C" |
|
| 12 |
+ |
|
| 13 |
+// AlwaysFalse is here to stay false |
|
| 14 |
+// (and be exported so the compiler doesn't optimize out its reference) |
|
| 15 |
+var AlwaysFalse bool |
|
| 16 |
+ |
|
| 17 |
+func init() {
|
|
| 18 |
+ if AlwaysFalse {
|
|
| 19 |
+ // by referencing this C init() in a noop test, it will ensure the compiler |
|
| 20 |
+ // links in the C function. |
|
| 21 |
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65134 |
|
| 22 |
+ C.init() |
|
| 23 |
+ } |
|
| 24 |
+} |
| 0 | 5 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,753 @@ |
| 0 |
+#define _GNU_SOURCE |
|
| 1 |
+#include <endian.h> |
|
| 2 |
+#include <errno.h> |
|
| 3 |
+#include <fcntl.h> |
|
| 4 |
+#include <grp.h> |
|
| 5 |
+#include <sched.h> |
|
| 6 |
+#include <setjmp.h> |
|
| 7 |
+#include <signal.h> |
|
| 8 |
+#include <stdarg.h> |
|
| 9 |
+#include <stdbool.h> |
|
| 10 |
+#include <stdint.h> |
|
| 11 |
+#include <stdio.h> |
|
| 12 |
+#include <stdlib.h> |
|
| 13 |
+#include <stdbool.h> |
|
| 14 |
+#include <string.h> |
|
| 15 |
+#include <unistd.h> |
|
| 16 |
+ |
|
| 17 |
+#include <sys/ioctl.h> |
|
| 18 |
+#include <sys/prctl.h> |
|
| 19 |
+#include <sys/socket.h> |
|
| 20 |
+#include <sys/types.h> |
|
| 21 |
+ |
|
| 22 |
+#include <linux/limits.h> |
|
| 23 |
+#include <linux/netlink.h> |
|
| 24 |
+#include <linux/types.h> |
|
| 25 |
+ |
|
| 26 |
+/* Get all of the CLONE_NEW* flags. */ |
|
| 27 |
+#include "namespace.h" |
|
| 28 |
+ |
|
| 29 |
/*
 * Synchronisation values. These are the message tags exchanged over the sync
 * socketpair between the nsexec() stages: a *_PLS value is a request sent by
 * a child and the matching *_ACK is the parent's reply. SYNC_ERR is written
 * by bail() and is followed by the int exit code of the failing process.
 */
enum sync_t {
	SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
	SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
	SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
	SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */

	/* XXX: This doesn't help with segfaults and other such issues. */
	SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
};
|
| 39 |
+ |
|
| 40 |
/*
 * longjmp() arguments. nsexec() switches on the value returned by setjmp():
 * JUMP_PARENT is 0, i.e. the value of the initial direct setjmp() return,
 * while the other two are passed to longjmp() by child_func() in the cloned
 * processes.
 */
#define JUMP_PARENT 0x00
#define JUMP_CHILD 0xA0
#define JUMP_INIT 0xA1

/* JSON buffer: size of the buffer used to format the {"pid": N} reply. */
#define JSON_MAX 4096
|
| 47 |
+ |
|
| 48 |
/*
 * Argument block handed to clone(2) by clone_parent().
 *
 * Assume the stack grows down, so arguments should be above it: stack_ptr
 * marks the end of the stack array and is what clone_parent() passes as the
 * child's stack pointer.
 */
struct clone_t {
	/*
	 * Reserve some space for clone() to locate arguments
	 * and retcode in this place
	 */
	char stack[4096] __attribute__ ((aligned(16)));
	char stack_ptr[0];

	/* There's two children. This is used to execute the different code. */
	jmp_buf *env;	/* jump buffer that child_func() longjmp()s to */
	int jmpval;	/* value delivered to setjmp() (one of the JUMP_* values) */
};
|
| 61 |
+ |
|
| 62 |
/*
 * Bootstrap configuration decoded by nl_parse() from the init pipe.
 *
 * `data` owns the malloc()ed netlink payload (released by nl_free()). The
 * uidmap, gidmap and namespaces pointers are not separate allocations: they
 * point into `data`, so they become invalid once nl_free() has run.
 */
struct nlconfig_t {
	char *data;		/* raw netlink payload buffer */
	uint32_t cloneflags;	/* CLONE_FLAGS_ATTR: flags passed to unshare(2) */
	char *uidmap;		/* UIDMAP_ATTR: contents for /proc/<pid>/uid_map */
	size_t uidmap_len;
	char *gidmap;		/* GIDMAP_ATTR: contents for /proc/<pid>/gid_map */
	size_t gidmap_len;
	char *namespaces;	/* NS_PATHS_ATTR: "type:path,..." list for join_namespaces() */
	size_t namespaces_len;
	uint8_t is_setgroup;	/* SETGROUP_ATTR: non-zero to write "allow" to setgroups */
	int consolefd;		/* fd opened from CONSOLE_PATH_ATTR, or -1 if none was sent */
};
|
| 74 |
+ |
|
| 75 |
/*
 * List of netlink message types sent to us as part of bootstrapping the init.
 * These constants are defined in libcontainer/message_linux.go.
 *
 * INIT_MSG is the only nlmsg_type nl_parse() accepts; the *_ATTR values are
 * the nla_type of the attributes carried in that message.
 */
#define INIT_MSG 62000
#define CLONE_FLAGS_ATTR 27281	/* uint32: namespace flags to unshare */
#define CONSOLE_PATH_ATTR 27282	/* string: console path, opened O_RDWR */
#define NS_PATHS_ATTR 27283	/* string: namespaces to join */
#define UIDMAP_ATTR 27284	/* bytes: uid mapping */
#define GIDMAP_ATTR 27285	/* bytes: gid mapping */
#define SETGROUP_ATTR 27286	/* uint8: whether to allow setgroups(2) */
|
| 86 |
+ |
|
| 87 |
/*
 * Use the raw syscall for versions of glibc which don't include a function for
 * it, namely (glibc 2.12).
 *
 * The whole block, including the setns() definition below, is only compiled
 * when building against glibc 2.x older than 2.14; otherwise the libc-provided
 * setns() is used.
 */
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
# define _GNU_SOURCE
# include "syscall.h"
/* Fall back to the kernel's syscall number when libc lacks the SYS_ alias. */
# if !defined(SYS_setns) && defined(__NR_setns)
# define SYS_setns __NR_setns
# endif

/* Without a syscall number there is no way to implement the wrapper. */
#ifndef SYS_setns
# error "setns(2) syscall not supported by glibc version"
#endif

/* Thin wrapper with the same signature as the libc setns(2). */
int setns(int fd, int nstype)
{
	return syscall(SYS_setns, fd, nstype);
}
#endif
|
| 107 |
+ |
|
| 108 |
/*
 * XXX: This is ugly.
 *
 * fd that bail() reports fatal errors on; -1 means "no one to report to".
 * nsexec() sets it in the child stages.
 */
static int syncfd = -1;

/*
 * bail(fmt, ...): print "nsenter: <message>: <strerror(errno)>" to stderr
 * (via the glibc %m conversion), notify the peer on syncfd if there is one by
 * writing SYNC_ERR followed by the exit code, then exit() with that code.
 *
 * The exit code is __COUNTER__ + 1, so each expansion of the macro gets its
 * own non-zero code.
 *
 * NOTE: `syncfd` is resolved at the expansion site, so a local variable named
 * syncfd (as in the parent stage of nsexec()) takes precedence over the
 * file-scope one above.
 */
/* TODO(cyphar): Fix this so it correctly deals with syncT. */
#define bail(fmt, ...) \
	do { \
		int ret = __COUNTER__ + 1; \
		fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \
		if (syncfd >= 0) { \
			enum sync_t s = SYNC_ERR; \
			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) \
				fprintf(stderr, "nsenter: failed: write(s)"); \
			if (write(syncfd, &ret, sizeof(ret)) != sizeof(ret)) \
				fprintf(stderr, "nsenter: failed: write(ret)"); \
		} \
		exit(ret); \
	} while(0)
|
| 125 |
+ |
|
| 126 |
+static int write_file(char *data, size_t data_len, char *pathfmt, ...) |
|
| 127 |
+{
|
|
| 128 |
+ int fd, len, ret = 0; |
|
| 129 |
+ char path[PATH_MAX]; |
|
| 130 |
+ |
|
| 131 |
+ va_list ap; |
|
| 132 |
+ va_start(ap, pathfmt); |
|
| 133 |
+ len = vsnprintf(path, PATH_MAX, pathfmt, ap); |
|
| 134 |
+ va_end(ap); |
|
| 135 |
+ if (len < 0) |
|
| 136 |
+ return -1; |
|
| 137 |
+ |
|
| 138 |
+ fd = open(path, O_RDWR); |
|
| 139 |
+ if (fd < 0) {
|
|
| 140 |
+ ret = -1; |
|
| 141 |
+ goto out; |
|
| 142 |
+ } |
|
| 143 |
+ |
|
| 144 |
+ len = write(fd, data, data_len); |
|
| 145 |
+ if (len != data_len) {
|
|
| 146 |
+ ret = -1; |
|
| 147 |
+ goto out; |
|
| 148 |
+ } |
|
| 149 |
+ |
|
| 150 |
+out: |
|
| 151 |
+ close(fd); |
|
| 152 |
+ return ret; |
|
| 153 |
+} |
|
| 154 |
+ |
|
| 155 |
/*
 * Policy to write to /proc/<pid>/setgroups; consumed by update_setgroups().
 */
enum policy_t {
	SETGROUPS_DEFAULT = 0,	/* leave the file untouched */
	SETGROUPS_ALLOW,	/* write "allow" */
	SETGROUPS_DENY,		/* write "deny" */
};
|
| 160 |
+ |
|
| 161 |
+/* This *must* be called before we touch gid_map. */ |
|
| 162 |
+static void update_setgroups(int pid, enum policy_t setgroup) |
|
| 163 |
+{
|
|
| 164 |
+ char *policy; |
|
| 165 |
+ |
|
| 166 |
+ switch (setgroup) {
|
|
| 167 |
+ case SETGROUPS_ALLOW: |
|
| 168 |
+ policy = "allow"; |
|
| 169 |
+ break; |
|
| 170 |
+ case SETGROUPS_DENY: |
|
| 171 |
+ policy = "deny"; |
|
| 172 |
+ break; |
|
| 173 |
+ case SETGROUPS_DEFAULT: |
|
| 174 |
+ /* Nothing to do. */ |
|
| 175 |
+ return; |
|
| 176 |
+ } |
|
| 177 |
+ |
|
| 178 |
+ if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
|
|
| 179 |
+ /* |
|
| 180 |
+ * If the kernel is too old to support /proc/pid/setgroups, |
|
| 181 |
+ * open(2) or write(2) will return ENOENT. This is fine. |
|
| 182 |
+ */ |
|
| 183 |
+ if (errno != ENOENT) |
|
| 184 |
+ bail("failed to write '%s' to /proc/%d/setgroups", policy, pid);
|
|
| 185 |
+ } |
|
| 186 |
+} |
|
| 187 |
+ |
|
| 188 |
/*
 * Write the uid mapping `map` (map_len bytes) to /proc/<pid>/uid_map.
 * A NULL or empty map is silently skipped; a failed write is fatal.
 */
static void update_uidmap(int pid, char *map, int map_len)
{
	if (map != NULL && map_len > 0) {
		if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0)
			bail("failed to update /proc/%d/uid_map", pid);
	}
}
|
| 196 |
+ |
|
| 197 |
/*
 * Write the gid mapping `map` (map_len bytes) to /proc/<pid>/gid_map.
 * A NULL or empty map is silently skipped; a failed write is fatal.
 */
static void update_gidmap(int pid, char *map, int map_len)
{
	if (map != NULL && map_len > 0) {
		if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0)
			bail("failed to update /proc/%d/gid_map", pid);
	}
}
|
| 205 |
+ |
|
| 206 |
+/* A dummy function that just jumps to the given jumpval. */ |
|
| 207 |
+static int child_func(void *arg) __attribute__ ((noinline)); |
|
| 208 |
+static int child_func(void *arg) |
|
| 209 |
+{
|
|
| 210 |
+ struct clone_t *ca = (struct clone_t *)arg; |
|
| 211 |
+ longjmp(*ca->env, ca->jmpval); |
|
| 212 |
+} |
|
| 213 |
+ |
|
| 214 |
+static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline)); |
|
| 215 |
+static int clone_parent(jmp_buf *env, int jmpval) |
|
| 216 |
+{
|
|
| 217 |
+ struct clone_t ca = {
|
|
| 218 |
+ .env = env, |
|
| 219 |
+ .jmpval = jmpval, |
|
| 220 |
+ }; |
|
| 221 |
+ |
|
| 222 |
+ return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); |
|
| 223 |
+} |
|
| 224 |
+ |
|
| 225 |
/*
 * Read the init pipe fd number from the _LIBCONTAINER_INITPIPE environment
 * variable. The pipe is used to read the bootstrap data and to tell the
 * parent what the new pid is once the environment has been set up.
 *
 * Returns the fd number, or -1 when the variable is unset or empty. A value
 * with trailing non-numeric characters is fatal.
 */
static int initpipe(void)
{
	char *end = NULL;
	char *value = getenv("_LIBCONTAINER_INITPIPE");
	int fd;

	if (!value || value[0] == '\0')
		return -1;

	fd = strtol(value, &end, 10);
	if (*end != '\0')
		bail("unable to parse _LIBCONTAINER_INITPIPE");

	return fd;
}
|
| 245 |
+ |
|
| 246 |
/*
 * Map a namespace name ("cgroup", "ipc", "mnt", "net", "pid", "user", "uts")
 * to its clone(2) CLONE_NEW* flag. Unrecognised names map to 0.
 */
static int nsflag(char *name)
{
	static const struct {
		const char *name;
		int flag;
	} known[] = {
		{"cgroup", CLONE_NEWCGROUP},
		{"ipc", CLONE_NEWIPC},
		{"mnt", CLONE_NEWNS},
		{"net", CLONE_NEWNET},
		{"pid", CLONE_NEWPID},
		{"user", CLONE_NEWUSER},
		{"uts", CLONE_NEWUTS},
	};
	size_t i;

	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
		if (strcmp(name, known[i].name) == 0)
			return known[i].flag;
	}

	/* If we don't recognise a name, fallback to 0. */
	return 0;
}
|
| 267 |
+ |
|
| 268 |
/*
 * Read a host-endian uint32_t from the first four bytes of buf.
 *
 * memcpy() is used instead of casting buf to uint32_t *: the cast requires
 * buf to be suitably aligned and breaks the strict-aliasing rules, whereas
 * memcpy() is well-defined for any byte pointer and compiles to the same
 * load on common targets.
 */
static uint32_t readint32(char *buf)
{
	uint32_t value;

	memcpy(&value, buf, sizeof(value));
	return value;
}
|
| 272 |
+ |
|
| 273 |
/* Return the first byte of buf as an unsigned 8-bit value. */
static uint8_t readint8(char *buf)
{
	return (uint8_t) buf[0];
}
|
| 277 |
+ |
|
| 278 |
+static void nl_parse(int fd, struct nlconfig_t *config) |
|
| 279 |
+{
|
|
| 280 |
+ size_t len, size; |
|
| 281 |
+ struct nlmsghdr hdr; |
|
| 282 |
+ char *data, *current; |
|
| 283 |
+ |
|
| 284 |
+ /* Retrieve the netlink header. */ |
|
| 285 |
+ len = read(fd, &hdr, NLMSG_HDRLEN); |
|
| 286 |
+ if (len != NLMSG_HDRLEN) |
|
| 287 |
+ bail("invalid netlink header length %lu", len);
|
|
| 288 |
+ |
|
| 289 |
+ if (hdr.nlmsg_type == NLMSG_ERROR) |
|
| 290 |
+ bail("failed to read netlink message");
|
|
| 291 |
+ |
|
| 292 |
+ if (hdr.nlmsg_type != INIT_MSG) |
|
| 293 |
+ bail("unexpected msg type %d", hdr.nlmsg_type);
|
|
| 294 |
+ |
|
| 295 |
+ /* Retrieve data. */ |
|
| 296 |
+ size = NLMSG_PAYLOAD(&hdr, 0); |
|
| 297 |
+ current = data = malloc(size); |
|
| 298 |
+ if (!data) |
|
| 299 |
+ bail("failed to allocate %zu bytes of memory for nl_payload", size);
|
|
| 300 |
+ |
|
| 301 |
+ len = read(fd, data, size); |
|
| 302 |
+ if (len != size) |
|
| 303 |
+ bail("failed to read netlink payload, %lu != %lu", len, size);
|
|
| 304 |
+ |
|
| 305 |
+ /* Parse the netlink payload. */ |
|
| 306 |
+ config->data = data; |
|
| 307 |
+ config->consolefd = -1; |
|
| 308 |
+ while (current < data + size) {
|
|
| 309 |
+ struct nlattr *nlattr = (struct nlattr *)current; |
|
| 310 |
+ size_t payload_len = nlattr->nla_len - NLA_HDRLEN; |
|
| 311 |
+ |
|
| 312 |
+ /* Advance to payload. */ |
|
| 313 |
+ current += NLA_HDRLEN; |
|
| 314 |
+ |
|
| 315 |
+ /* Handle payload. */ |
|
| 316 |
+ switch (nlattr->nla_type) {
|
|
| 317 |
+ case CLONE_FLAGS_ATTR: |
|
| 318 |
+ config->cloneflags = readint32(current); |
|
| 319 |
+ break; |
|
| 320 |
+ case CONSOLE_PATH_ATTR: |
|
| 321 |
+ /* |
|
| 322 |
+ * We open the console here because we currently evaluate console |
|
| 323 |
+ * paths from the *host* namespaces. |
|
| 324 |
+ */ |
|
| 325 |
+ config->consolefd = open(current, O_RDWR); |
|
| 326 |
+ if (config->consolefd < 0) |
|
| 327 |
+ bail("failed to open console %s", current);
|
|
| 328 |
+ break; |
|
| 329 |
+ case NS_PATHS_ATTR: |
|
| 330 |
+ config->namespaces = current; |
|
| 331 |
+ config->namespaces_len = payload_len; |
|
| 332 |
+ break; |
|
| 333 |
+ case UIDMAP_ATTR: |
|
| 334 |
+ config->uidmap = current; |
|
| 335 |
+ config->uidmap_len = payload_len; |
|
| 336 |
+ break; |
|
| 337 |
+ case GIDMAP_ATTR: |
|
| 338 |
+ config->gidmap = current; |
|
| 339 |
+ config->gidmap_len = payload_len; |
|
| 340 |
+ break; |
|
| 341 |
+ case SETGROUP_ATTR: |
|
| 342 |
+ config->is_setgroup = readint8(current); |
|
| 343 |
+ break; |
|
| 344 |
+ default: |
|
| 345 |
+ bail("unknown netlink message type %d", nlattr->nla_type);
|
|
| 346 |
+ } |
|
| 347 |
+ |
|
| 348 |
+ current += NLA_ALIGN(payload_len); |
|
| 349 |
+ } |
|
| 350 |
+} |
|
| 351 |
+ |
|
| 352 |
+void nl_free(struct nlconfig_t *config) |
|
| 353 |
+{
|
|
| 354 |
+ free(config->data); |
|
| 355 |
+} |
|
| 356 |
+ |
|
| 357 |
/*
 * Join every namespace listed in nslist, a comma-separated list of
 * "type:path" entries (e.g. "net:/proc/123/ns/net,pid:/proc/123/ns/pid").
 *
 * nslist is modified in place (it is tokenised). All paths are opened first
 * and only then joined with setns(2), in the order given. Any failure is
 * fatal via bail().
 */
void join_namespaces(char *nslist)
{
	int num = 0, i;
	char *saveptr = NULL;
	char *namespace = strtok_r(nslist, ",", &saveptr);
	struct namespace_t {
		int fd;
		int ns;
		/* Points into nslist, which outlives this array. */
		char *path;
	} *namespaces = NULL;

	/* strtok_r() never yields an empty token, so NULL means "no entries". */
	if (!namespace || !*namespace)
		bail("ns paths are empty");

	/*
	 * We have to open the file descriptors first, since after
	 * we join the mnt namespace we might no longer be able to
	 * access the paths.
	 */
	do {
		int fd;
		char *path;
		struct namespace_t *ns, *grown;

		/* Resize the namespace array without losing it on failure. */
		grown = realloc(namespaces, (num + 1) * sizeof(*namespaces));
		if (!grown)
			bail("failed to reallocate namespace array");
		namespaces = grown;
		ns = &namespaces[num++];

		/* Split 'ns:path'. */
		path = strchr(namespace, ':');
		if (!path)
			bail("failed to parse %s", namespace);
		*path++ = '\0';

		fd = open(path, O_RDONLY);
		if (fd < 0)
			bail("failed to open %s", path);

		ns->fd = fd;
		ns->ns = nsflag(namespace);
		ns->path = path;
	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);

	/*
	 * The ordering in which we join namespaces is important. We should
	 * always join the user namespace *first*. This is all guaranteed
	 * from the container_linux.go side of this, so we're just going to
	 * follow the order given to us.
	 */

	for (i = 0; i < num; i++) {
		struct namespace_t *ns = &namespaces[i];

		if (setns(ns->fd, ns->ns) < 0)
			bail("failed to setns to %s", ns->path);

		close(ns->fd);
	}

	free(namespaces);
}
|
| 421 |
+ |
|
| 422 |
/*
 * C-side bootstrap, run before the Go runtime starts.
 *
 * If _LIBCONTAINER_INITPIPE is not set this returns immediately. Otherwise it
 * reads the netlink bootstrap message from the init pipe and splits into
 * three processes that share this function body and are told apart by the
 * value setjmp() returns:
 *
 *   stage 0 (JUMP_PARENT): clones stage 1, services its requests (user
 *       mappings, pid hand-over), writes {"pid": N} for the final process
 *       to the init pipe and exits.
 *   stage 1 (JUMP_CHILD):  joins/unshares namespaces, clones stage 2, sends
 *       stage 2's pid to stage 0 and exits.
 *   stage 2 (JUMP_INIT):   becomes session leader, sets uid/gid/groups to 0,
 *       wires up the console if there is one, and is the only process that
 *       returns to the caller.
 *
 * Every failure goes through bail(), which exits the current process.
 */
void nsexec(void)
{
	int pipenum;
	jmp_buf env;
	int syncpipe[2];
	struct nlconfig_t config = {0};

	/*
	 * If we don't have an init pipe, just return to the go routine.
	 * We'll only get an init pipe for start or exec.
	 */
	pipenum = initpipe();
	if (pipenum == -1)
		return;

	/* Parse all of the netlink configuration. */
	nl_parse(pipenum, &config);

	/*
	 * Pipe so we can tell the child when we've finished setting up.
	 * The children use syncpipe[0]; the parent (stage 0) uses syncpipe[1].
	 */
	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncpipe) < 0)
		bail("failed to setup sync pipe between parent and child");

	/* TODO: Currently we aren't dealing with child deaths properly. */

	/*
	 * Okay, so this is quite annoying.
	 *
	 * In order for this unsharing code to be more extensible we need to split
	 * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case
	 * would be if we did clone(CLONE_NEWUSER) and the other namespaces
	 * separately, but because of SELinux issues we cannot really do that. But
	 * we cannot just dump the namespace flags into clone(...) because several
	 * usecases (such as rootless containers) require more granularity around
	 * the namespace setup. In addition, some older kernels had issues where
	 * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot
	 * handle this while also dealing with SELinux so we choose SELinux support
	 * over broken kernel support).
	 *
	 * However, if we unshare(2) the user namespace *before* we clone(2), then
	 * all hell breaks loose.
	 *
	 * The parent no longer has permissions to do many things (unshare(2) drops
	 * all capabilities in your old namespace), and the container cannot be set
	 * up to have more than one {uid,gid} mapping. This is obviously less than
	 * ideal. In order to fix this, we have to first clone(2) and then unshare.
	 *
	 * Unfortunately, it's not as simple as that. We have to fork to enter the
	 * PID namespace (the PID namespace only applies to children). Since we'll
	 * have to double-fork, this clone_parent() call won't be able to get the
	 * PID of the _actual_ init process (without doing more synchronisation than
	 * I can deal with at the moment). So we'll just get the parent to send it
	 * for us, the only job of this process is to update
	 * /proc/pid/{setgroups,uid_map,gid_map}.
	 *
	 * And as a result of the above, we also need to setns(2) in the first child
	 * because if we join a PID namespace in the topmost parent then our child
	 * will be in that namespace (and it will not be able to give us a PID value
	 * that makes sense without resorting to sending things with cmsg).
	 *
	 * This also deals with an older issue caused by dumping cloneflags into
	 * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
	 * we have to unshare(2) before clone(2) in order to do this. This was fixed
	 * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
	 * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
	 * aware, the last mainline kernel which had this bug was Linux 3.12.
	 * However, we cannot comment on which kernels the broken patch was
	 * backported to.
	 *
	 * -- Aleksa "what has my life come to?" Sarai
	 */

	/*
	 * setjmp() returns 0 (JUMP_PARENT) here; the cloned processes re-enter
	 * this switch through child_func()'s longjmp() with JUMP_CHILD/JUMP_INIT.
	 */
	switch (setjmp(env)) {
	/*
	 * Stage 0: We're in the parent. Our job is just to create a new child
	 *          (stage 1: JUMP_CHILD) process and write its uid_map and
	 *          gid_map. That process will go on to create a new process, then
	 *          it will send us its PID which we will send to the bootstrap
	 *          process.
	 */
	case JUMP_PARENT: {
		int len;
		pid_t child;
		char buf[JSON_MAX];

		/* For debugging. */
		prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0);

		/* Start the process of getting a container. */
		child = clone_parent(&env, JUMP_CHILD);
		if (child < 0)
			bail("unable to fork: child_func");

		/* State machine for synchronisation with the children. */
		while (true) {
			enum sync_t s;

			/*
			 * This doesn't need to be global, we're in the parent.
			 * NOTE: this local shadows the file-scope syncfd, so bail()
			 * calls inside this loop report on syncpipe[1].
			 */
			int syncfd = syncpipe[1];

			if (read(syncfd, &s, sizeof(s)) != sizeof(s))
				bail("failed to sync with child: next state");

			switch (s) {
			case SYNC_ERR: {
					/* We have to mirror the error code of the child. */
					int ret;

					if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
						bail("failed to sync with child: read(error code)");

					exit(ret);
				}
				break;
			case SYNC_USERMAP_PLS:
				/* Enable setgroups(2) if we've been asked to. */
				if (config.is_setgroup)
					update_setgroups(child, SETGROUPS_ALLOW);

				/* Set up mappings. */
				update_uidmap(child, config.uidmap, config.uidmap_len);
				update_gidmap(child, config.gidmap, config.gidmap_len);

				s = SYNC_USERMAP_ACK;
				if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
					kill(child, SIGKILL);
					bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
				}
				break;
			case SYNC_USERMAP_ACK:
				/* We should _never_ receive acks. */
				kill(child, SIGKILL);
				bail("failed to sync with child: unexpected SYNC_USERMAP_ACK");
				break;
			case SYNC_RECVPID_PLS: {
					/* Remember stage 1; `child` is overwritten with the stage 2 pid. */
					pid_t old = child;

					/* Get the init_func pid. */
					if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
						kill(old, SIGKILL);
						bail("failed to sync with child: read(childpid)");
					}

					/* Send ACK. */
					s = SYNC_RECVPID_ACK;
					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
						kill(old, SIGKILL);
						kill(child, SIGKILL);
						bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
					}
				}

				/* Leave the loop. */
				goto out;
			case SYNC_RECVPID_ACK:
				/* We should _never_ receive acks. */
				kill(child, SIGKILL);
				bail("failed to sync with child: unexpected SYNC_RECVPID_ACK");
				break;
			}
		}

	out:
		/* Send the init_func pid back to our parent. */
		len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child);
		if (len < 0) {
			kill(child, SIGKILL);
			bail("unable to generate JSON for child pid");
		}
		if (write(pipenum, buf, len) != len) {
			kill(child, SIGKILL);
			bail("unable to send child pid to bootstrapper");
		}

		exit(0);
	}

	/*
	 * Stage 1: We're in the first child process. Our job is to join any
	 *          provided namespaces in the netlink payload. If we've been
	 *          asked to CLONE_NEWUSER, we will unshare the user namespace and
	 *          ask our parent (stage 0) to set up our user mappings for us.
	 *          Then, we unshare the rest of the requested namespaces and
	 *          create a new child (stage 2: JUMP_INIT). We then send the
	 *          child's PID to our parent (stage 0).
	 *
	 *          NOTE(review): as written, a single unshare(config.cloneflags)
	 *          call is made *before* the user-mapping handshake, not two
	 *          separate unshare steps.
	 */
	case JUMP_CHILD: {
		pid_t child;
		enum sync_t s;

		/* We're in a child and thus need to tell the parent if we die. */
		syncfd = syncpipe[0];

		/* For debugging. */
		prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0);

		/*
		 * We need to setns first. We cannot do this earlier (in stage 0)
		 * because of the fact that we forked to get here (the PID of
		 * [stage 2: JUMP_INIT]) would be meaningless). We could send it
		 * using cmsg(3) but that's just annoying.
		 */
		if (config.namespaces)
			join_namespaces(config.namespaces);

		/*
		 * Unshare all of the namespaces. Now, it should be noted that this
		 * ordering might break in the future (especially with rootless
		 * containers). But for now, it's not possible to split this into
		 * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
		 *
		 * Note that we don't merge this with clone() because there were
		 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
		 * was broken, so we'll just do it the long way anyway.
		 */
		if (unshare(config.cloneflags) < 0)
			bail("failed to unshare namespaces");

		/*
		 * Deal with user namespaces first. They are quite special, as they
		 * affect our ability to unshare other namespaces and are used as
		 * context for privilege checks.
		 */
		if (config.cloneflags & CLONE_NEWUSER) {
			/*
			 * We don't have the privileges to do any mapping here (see the
			 * clone_parent rant). So signal our parent to hook us up.
			 */

			s = SYNC_USERMAP_PLS;
			if (write(syncfd, &s, sizeof(s)) != sizeof(s))
				bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");

			/* ... wait for mapping ... */

			if (read(syncfd, &s, sizeof(s)) != sizeof(s))
				bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
			if (s != SYNC_USERMAP_ACK)
				bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
		}

		/* TODO: What about non-namespace clone flags that we're dropping here? */
		child = clone_parent(&env, JUMP_INIT);
		if (child < 0)
			bail("unable to fork: init_func");

		/* Send the child to our parent, which knows what it's doing. */
		s = SYNC_RECVPID_PLS;
		if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
			kill(child, SIGKILL);
			bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
		}
		if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
			kill(child, SIGKILL);
			bail("failed to sync with parent: write(childpid)");
		}

		/* ... wait for parent to get the pid ... */

		if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
			kill(child, SIGKILL);
			bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
		}
		if (s != SYNC_RECVPID_ACK) {
			kill(child, SIGKILL);
			bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
		}

		/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
		exit(0);
	}

	/*
	 * Stage 2: We're the final child process, and the only process that will
	 *          actually return to the Go runtime. Our job is to just do the
	 *          final cleanup steps and then return to the Go runtime to allow
	 *          init_linux.go to run.
	 */
	case JUMP_INIT: {
		/*
		 * We're inside the final child now, having jumped here from
		 * child_func() after stage 1 called clone_parent().
		 */
		int consolefd = config.consolefd;

		/* We're in a child and thus need to tell the parent if we die. */
		syncfd = syncpipe[0];

		/* For debugging. */
		prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0);

		if (setsid() < 0)
			bail("setsid failed");

		if (setuid(0) < 0)
			bail("setuid failed");

		if (setgid(0) < 0)
			bail("setgid failed");

		if (setgroups(0, NULL) < 0)
			bail("setgroups failed");

		/* Make the console our controlling terminal and stdio. */
		if (consolefd != -1) {
			if (ioctl(consolefd, TIOCSCTTY, 0) < 0)
				bail("ioctl TIOCSCTTY failed");
			if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO)
				bail("failed to dup stdin");
			if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO)
				bail("failed to dup stdout");
			if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO)
				bail("failed to dup stderr");
		}

		/* Close sync pipes. */
		close(syncpipe[0]);
		close(syncpipe[1]);

		/* Free netlink data. */
		nl_free(&config);

		/* Finish executing, let the Go runtime take over. */
		return;
	}
	default:
		bail("unexpected jump value");
		break;
	}

	/* Should never be reached. */
	bail("should never be reached");
}
| ... | ... |
@@ -355,6 +355,12 @@ func FreeLxcContexts(scon string) {
|
| 355 | 355 |
} |
| 356 | 356 |
} |
| 357 | 357 |
|
| 358 |
// roFileLabel holds the file label used for read-only content. It is filled
// in by GetLxcContexts from the "ro_file" key and falls back to the regular
// file label when that key is absent.
var roFileLabel string

// GetROFileLabel returns the current read-only file label. The result is the
// empty string until GetLxcContexts has populated it.
func GetROFileLabel() (fileLabel string) {
	fileLabel = roFileLabel
	return fileLabel
}
|
| 363 |
+ |
|
| 358 | 364 |
func GetLxcContexts() (processLabel string, fileLabel string) {
|
| 359 | 365 |
var ( |
| 360 | 366 |
val, key string |
| ... | ... |
@@ -399,6 +405,9 @@ func GetLxcContexts() (processLabel string, fileLabel string) {
|
| 399 | 399 |
if key == "file" {
|
| 400 | 400 |
fileLabel = strings.Trim(val, "\"") |
| 401 | 401 |
} |
| 402 |
+ if key == "ro_file" {
|
|
| 403 |
+ roFileLabel = strings.Trim(val, "\"") |
|
| 404 |
+ } |
|
| 402 | 405 |
} |
| 403 | 406 |
} |
| 404 | 407 |
|
| ... | ... |
@@ -406,6 +415,9 @@ func GetLxcContexts() (processLabel string, fileLabel string) {
|
| 406 | 406 |
return "", "" |
| 407 | 407 |
} |
| 408 | 408 |
|
| 409 |
+ if roFileLabel == "" {
|
|
| 410 |
+ roFileLabel = fileLabel |
|
| 411 |
+ } |
|
| 409 | 412 |
exit: |
| 410 | 413 |
// mcs := IntToMcs(os.Getpid(), 1024) |
| 411 | 414 |
mcs := uniqMcs(1024) |
| ... | ... |
@@ -14,8 +14,10 @@ func GetProcessStartTime(pid int) (string, error) {
|
| 14 | 14 |
if err != nil {
|
| 15 | 15 |
return "", err |
| 16 | 16 |
} |
| 17 |
+ return parseStartTime(string(data)) |
|
| 18 |
+} |
|
| 17 | 19 |
|
| 18 |
- parts := strings.Split(string(data), " ") |
|
| 20 |
// statFormatError reports /proc/<pid>/stat content that parseStartTime cannot
// interpret. It is a local type so that no extra import is needed.
type statFormatError string

// Error implements the error interface.
func (e statFormatError) Error() string {
	return "invalid stat data: " + string(e)
}

// parseStartTime extracts the process start time (field 22 of
// /proc/<pid>/stat) from the raw contents of that file and returns it as the
// string found there.
//
// From the man page:
//
//	(22) starttime: The time the process started after system boot. In
//	kernels before Linux 2.6, this value was expressed in jiffies. Since
//	Linux 2.6, the value is expressed in clock ticks (divide by
//	sysconf(_SC_CLK_TCK)).
//
// Field 2 (comm, the executable name in parentheses) may itself contain
// spaces and ')' characters, for example:
//
//	89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 ...
//
// so the fields are counted from the text after the LAST ')'.
//
// An error is returned, instead of panicking, when the input has no ')' or
// has too few fields after it.
func parseStartTime(stat string) (string, error) {
	// Index of starttime once fields 1 (pid) and 2 (comm) are stripped:
	// the remaining slice starts at field 3.
	const startTimeIdx = 22 - 3

	end := strings.LastIndex(stat, ")")
	if end < 0 {
		return "", statFormatError("missing ')' after the comm field")
	}

	parts := strings.Split(strings.TrimSpace(stat[end+1:]), " ")
	if len(parts) <= startTimeIdx {
		return "", statFormatError("too few fields to contain starttime")
	}
	return parts[startTimeIdx], nil
}