Browse code

daemon: oci: obey CL_UNPRIVILEGED for user namespaced daemon

When runc is bind-mounting a particular path "with options", it has to
do so by first creating a bind-mount and then modifying the options of
said bind-mount via remount. However, in a user namespace, there are
restrictions on which flags you can change with a remount (due to
CL_UNPRIVILEGED being set in this instance). Docker historically has
ignored this, and as a result, internal Docker mounts (such as secrets)
haven't worked with --userns-remap. Fix this by preserving
CL_UNPRIVILEGED mount flags when Docker is spawning containers with user
namespaces enabled.

Ref: https://github.com/opencontainers/runc/pull/1603
Signed-off-by: Aleksa Sarai <asarai@suse.de>

Aleksa Sarai authored on 2017/10/15 15:06:20
Showing 1 changed files
... ...
@@ -26,6 +26,7 @@ import (
26 26
 	"github.com/opencontainers/runc/libcontainer/user"
27 27
 	specs "github.com/opencontainers/runtime-spec/specs-go"
28 28
 	"github.com/sirupsen/logrus"
29
+	"golang.org/x/sys/unix"
29 30
 )
30 31
 
31 32
 // nolint: gosimple
... ...
@@ -469,6 +470,38 @@ func ensureSharedOrSlave(path string) error {
469 469
 	return nil
470 470
 }
471 471
 
472
+// Get the set of mount flags that are set on the mount that contains the given
473
+// path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
474
+// bind-mounting "with options" will not fail with user namespaces, due to
475
+// kernel restrictions that require user namespace mounts to preserve
476
+// CL_UNPRIVILEGED locked flags.
477
+func getUnprivilegedMountFlags(path string) ([]string, error) {
478
+	var statfs unix.Statfs_t
479
+	if err := unix.Statfs(path, &statfs); err != nil {
480
+		return nil, err
481
+	}
482
+
483
+	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
484
+	unprivilegedFlags := map[uint64]string{
485
+		unix.MS_RDONLY:     "ro",
486
+		unix.MS_NODEV:      "nodev",
487
+		unix.MS_NOEXEC:     "noexec",
488
+		unix.MS_NOSUID:     "nosuid",
489
+		unix.MS_NOATIME:    "noatime",
490
+		unix.MS_RELATIME:   "relatime",
491
+		unix.MS_NODIRATIME: "nodiratime",
492
+	}
493
+
494
+	var flags []string
495
+	for mask, flag := range unprivilegedFlags {
496
+		if uint64(statfs.Flags)&mask == mask {
497
+			flags = append(flags, flag)
498
+		}
499
+	}
500
+
501
+	return flags, nil
502
+}
503
+
472 504
 var (
473 505
 	mountPropagationMap = map[string]int{
474 506
 		"private":  mount.PRIVATE,
... ...
@@ -575,6 +608,19 @@ func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []c
575 575
 			opts = append(opts, mountPropagationReverseMap[pFlag])
576 576
 		}
577 577
 
578
+		// If we are using user namespaces, then we must make sure that we
579
+		// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
580
+		// "mount" when we bind-mount. The reason for this is that at the point
581
+		// when runc sets up the root filesystem, it is already inside a user
582
+		// namespace, and thus cannot change any flags that are locked.
583
+		if daemon.configStore.RemappedRoot != "" {
584
+			unprivOpts, err := getUnprivilegedMountFlags(m.Source)
585
+			if err != nil {
586
+				return err
587
+			}
588
+			opts = append(opts, unprivOpts...)
589
+		}
590
+
578 591
 		mt.Options = opts
579 592
 		s.Mounts = append(s.Mounts, mt)
580 593
 	}