When runc is bind-mounting a particular path "with options", it has to
do so by first creating a bind-mount and then modifying the options of
said bind-mount via remount. However, in a user namespace, there are
restrictions on which flags you can change with a remount (due to
CL_UNPRIVILEGED being set in this instance). Docker historically has
ignored this, and as a result, internal Docker mounts (such as secrets)
haven't worked with --userns-remap. Fix this by preserving
CL_UNPRIVILEGED mount flags when Docker is spawning containers with user
namespaces enabled.
Ref: https://github.com/opencontainers/runc/pull/1603
Signed-off-by: Aleksa Sarai <asarai@suse.de>
... | ... |
@@ -26,6 +26,7 @@ import ( |
26 | 26 |
"github.com/opencontainers/runc/libcontainer/user" |
27 | 27 |
specs "github.com/opencontainers/runtime-spec/specs-go" |
28 | 28 |
"github.com/sirupsen/logrus" |
29 |
+ "golang.org/x/sys/unix" |
|
29 | 30 |
) |
30 | 31 |
|
31 | 32 |
// nolint: gosimple |
... | ... |
@@ -469,6 +470,38 @@ func ensureSharedOrSlave(path string) error { |
469 | 469 |
return nil |
470 | 470 |
} |
471 | 471 |
|
472 |
+// Get the set of mount flags that are set on the mount that contains the given |
|
473 |
+// path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that |
|
474 |
+// bind-mounting "with options" will not fail with user namespaces, due to |
|
475 |
+// kernel restrictions that require user namespace mounts to preserve |
|
476 |
+// CL_UNPRIVILEGED locked flags. |
|
477 |
+func getUnprivilegedMountFlags(path string) ([]string, error) { |
|
478 |
+ var statfs unix.Statfs_t |
|
479 |
+ if err := unix.Statfs(path, &statfs); err != nil { |
|
480 |
+ return nil, err |
|
481 |
+ } |
|
482 |
+ |
|
483 |
+ // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048. |
|
484 |
+ unprivilegedFlags := map[uint64]string{ |
|
485 |
+ unix.MS_RDONLY: "ro", |
|
486 |
+ unix.MS_NODEV: "nodev", |
|
487 |
+ unix.MS_NOEXEC: "noexec", |
|
488 |
+ unix.MS_NOSUID: "nosuid", |
|
489 |
+ unix.MS_NOATIME: "noatime", |
|
490 |
+ unix.MS_RELATIME: "relatime", |
|
491 |
+ unix.MS_NODIRATIME: "nodiratime", |
|
492 |
+ } |
|
493 |
+ |
|
494 |
+ var flags []string |
|
495 |
+ for mask, flag := range unprivilegedFlags { |
|
496 |
+ if uint64(statfs.Flags)&mask == mask { |
|
497 |
+ flags = append(flags, flag) |
|
498 |
+ } |
|
499 |
+ } |
|
500 |
+ |
|
501 |
+ return flags, nil |
|
502 |
+} |
|
503 |
+ |
|
472 | 504 |
var ( |
473 | 505 |
mountPropagationMap = map[string]int{ |
474 | 506 |
"private": mount.PRIVATE, |
... | ... |
@@ -575,6 +608,19 @@ func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []c |
575 | 575 |
opts = append(opts, mountPropagationReverseMap[pFlag]) |
576 | 576 |
} |
577 | 577 |
|
578 |
+ // If we are using user namespaces, then we must make sure that we |
|
579 |
+ // don't drop any of the CL_UNPRIVILEGED "locked" flags of the source |
|
580 |
+ // "mount" when we bind-mount. The reason for this is that at the point |
|
581 |
+ // when runc sets up the root filesystem, it is already inside a user |
|
582 |
+ // namespace, and thus cannot change any flags that are locked. |
|
583 |
+ if daemon.configStore.RemappedRoot != "" { |
|
584 |
+ unprivOpts, err := getUnprivilegedMountFlags(m.Source) |
|
585 |
+ if err != nil { |
|
586 |
+ return err |
|
587 |
+ } |
|
588 |
+ opts = append(opts, unprivOpts...) |
|
589 |
+ } |
|
590 |
+ |
|
578 | 591 |
mt.Options = opts |
579 | 592 |
s.Mounts = append(s.Mounts, mt) |
580 | 593 |
} |