When runc is bind-mounting a particular path "with options", it has to
do so by first creating a bind-mount and then modifying the options of
said bind-mount via remount. However, in a user namespace, there are
restrictions on which flags you can change with a remount (due to
CL_UNPRIVILEGED being set in this instance). Docker historically has
ignored this, and as a result, internal Docker mounts (such as secrets)
haven't worked with --userns-remap. Fix this by preserving
CL_UNPRIVILEGED mount flags when Docker is spawning containers with user
namespaces enabled.
Ref: https://github.com/opencontainers/runc/pull/1603
Signed-off-by: Aleksa Sarai <asarai@suse.de>
| ... | ... |
@@ -26,6 +26,7 @@ import ( |
| 26 | 26 |
"github.com/opencontainers/runc/libcontainer/user" |
| 27 | 27 |
specs "github.com/opencontainers/runtime-spec/specs-go" |
| 28 | 28 |
"github.com/sirupsen/logrus" |
| 29 |
+ "golang.org/x/sys/unix" |
|
| 29 | 30 |
) |
| 30 | 31 |
|
| 31 | 32 |
// nolint: gosimple |
| ... | ... |
@@ -469,6 +470,38 @@ func ensureSharedOrSlave(path string) error {
|
| 469 | 469 |
return nil |
| 470 | 470 |
} |
| 471 | 471 |
|
| 472 |
+// Get the set of mount flags that are set on the mount that contains the given |
|
| 473 |
+// path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that |
|
| 474 |
+// bind-mounting "with options" will not fail with user namespaces, due to |
|
| 475 |
+// kernel restrictions that require user namespace mounts to preserve |
|
| 476 |
+// CL_UNPRIVILEGED locked flags. |
|
| 477 |
+func getUnprivilegedMountFlags(path string) ([]string, error) {
|
|
| 478 |
+ var statfs unix.Statfs_t |
|
| 479 |
+ if err := unix.Statfs(path, &statfs); err != nil {
|
|
| 480 |
+ return nil, err |
|
| 481 |
+ } |
|
| 482 |
+ |
|
| 483 |
+ // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048. |
|
| 484 |
+ unprivilegedFlags := map[uint64]string{
|
|
| 485 |
+ unix.MS_RDONLY: "ro", |
|
| 486 |
+ unix.MS_NODEV: "nodev", |
|
| 487 |
+ unix.MS_NOEXEC: "noexec", |
|
| 488 |
+ unix.MS_NOSUID: "nosuid", |
|
| 489 |
+ unix.MS_NOATIME: "noatime", |
|
| 490 |
+ unix.MS_RELATIME: "relatime", |
|
| 491 |
+ unix.MS_NODIRATIME: "nodiratime", |
|
| 492 |
+ } |
|
| 493 |
+ |
|
| 494 |
+ var flags []string |
|
| 495 |
+ for mask, flag := range unprivilegedFlags {
|
|
| 496 |
+ if uint64(statfs.Flags)&mask == mask {
|
|
| 497 |
+ flags = append(flags, flag) |
|
| 498 |
+ } |
|
| 499 |
+ } |
|
| 500 |
+ |
|
| 501 |
+ return flags, nil |
|
| 502 |
+} |
|
| 503 |
+ |
|
| 472 | 504 |
var ( |
| 473 | 505 |
mountPropagationMap = map[string]int{
|
| 474 | 506 |
"private": mount.PRIVATE, |
| ... | ... |
@@ -575,6 +608,19 @@ func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []c |
| 575 | 575 |
opts = append(opts, mountPropagationReverseMap[pFlag]) |
| 576 | 576 |
} |
| 577 | 577 |
|
| 578 |
+ // If we are using user namespaces, then we must make sure that we |
|
| 579 |
+ // don't drop any of the CL_UNPRIVILEGED "locked" flags of the source |
|
| 580 |
+ // "mount" when we bind-mount. The reason for this is that at the point |
|
| 581 |
+ // when runc sets up the root filesystem, it is already inside a user |
|
| 582 |
+ // namespace, and thus cannot change any flags that are locked. |
|
| 583 |
+ if daemon.configStore.RemappedRoot != "" {
|
|
| 584 |
+ unprivOpts, err := getUnprivilegedMountFlags(m.Source) |
|
| 585 |
+ if err != nil {
|
|
| 586 |
+ return err |
|
| 587 |
+ } |
|
| 588 |
+ opts = append(opts, unprivOpts...) |
|
| 589 |
+ } |
|
| 590 |
+ |
|
| 578 | 591 |
mt.Options = opts |
| 579 | 592 |
s.Mounts = append(s.Mounts, mt) |
| 580 | 593 |
} |