package daemon import ( "context" "errors" "fmt" "os" "path/filepath" "runtime" "strconv" "github.com/containerd/log" "github.com/moby/sys/mount" "github.com/moby/sys/symlink" "golang.org/x/sys/unix" containertypes "github.com/moby/moby/api/types/container" "github.com/moby/moby/v2/daemon/container" "github.com/moby/moby/v2/daemon/internal/mounttree" "github.com/moby/moby/v2/daemon/internal/unshare" ) type future struct { fn func() error res chan<- error } // containerFSView allows functions to be run in the context of a container's // filesystem. Inside these functions, the root directory is the container root // for all native OS filesystem APIs, including, but not limited to, the [os] // and [golang.org/x/sys/unix] packages. The view of the container's filesystem // is live and read-write. Each view has its own private set of tmpfs mounts. // Any files written under a tmpfs mount are not visible to processes inside the // container nor any other view of the container's filesystem, and vice versa. // // Each view has its own current working directory which is initialized to the // root of the container filesystem and can be changed with [os.Chdir]. Changes // to the current directory persist across successive [*containerFSView.RunInFS] // and [*containerFSView.GoInFS] calls. // // Multiple views of the same container filesystem can coexist at the same time. // Only one function can be running in a particular filesystem view at any given // time. Calls to [*containerFSView.RunInFS] or [*containerFSView.GoInFS] will // block while another function is running. If more than one call is blocked // concurrently, the order they are unblocked is undefined. type containerFSView struct { d *Daemon ctr *container.Container todo chan future done chan error } // openContainerFS opens a new view of the container's filesystem. func (daemon *Daemon) openContainerFS(ctr *container.Container) (_ *containerFSView, retErr error) { ctx := context.TODO() if err := daemon.Mount(ctr); err != nil { return nil, err } defer func() { if retErr != nil { if err := daemon.Unmount(ctr); err != nil { log.G(ctx).WithError(err).Debug("Failed to unmount container after failure") } } }() mounts, cleanup, err := daemon.setupMounts(ctx, ctr) if err != nil { return nil, err } defer func() { ctx := context.WithoutCancel(ctx) if err := cleanup(ctx); err != nil { log.G(ctx).WithError(err).Debug("Failed to cleanup container mounts") } if retErr != nil { if err := ctr.UnmountVolumes(ctx, daemon.LogVolumeEvent); err != nil { log.G(ctx).WithError(err).Debug("Failed to unmount container volumes after failure") } } }() // Setup in initial mount namespace complete. We're ready to unshare the // mount namespace and bind the volume mounts into that private view of // the container FS. todo := make(chan future) done := make(chan error) err = unshare.Go(unix.CLONE_NEWNS, func() error { if err := mount.MakeRSlave("/"); err != nil { return err } root, err := os.OpenRoot(ctr.BaseFS) if err != nil { return fmt.Errorf("open container root: %w", err) } defer root.Close() // TODO(vvoland): Refactor this after security release. for _, m := range mounts { // Walk m.Destination through the container's symlinks before // passing it to os.Root, which refuses absolute symlinks // (e.g. the common /var/run -> /run). The resolution itself // is lexical; subsequent os.Root operations still enforce // the GHSA-vp62-88p7-qqf5 / GHSA-rg2x-37c3-w2rh protections. resolved, err := ctr.GetResourcePath(m.Destination) if err != nil { return fmt.Errorf("resolve mount destination %q: %w", m.Destination, err) } relDest, err := filepath.Rel(ctr.BaseFS, resolved) if err != nil { return fmt.Errorf("make destination relative: %w", err) } var stat os.FileInfo stat, err = os.Stat(m.Source) if err != nil { return err } if err := createIfNotExists(root, relDest, stat.IsDir()); err != nil { return err } bindMode := "rbind" if m.NonRecursive { bindMode = "bind" } if m.Writable { if m.ReadOnlyNonRecursive { return errors.New("options conflict: Writable && ReadOnlyNonRecursive") } if m.ReadOnlyForceRecursive { return errors.New("options conflict: Writable && ReadOnlyForceRecursive") } } if m.ReadOnlyNonRecursive && m.ReadOnlyForceRecursive { return errors.New("options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive") } // Open the mount target through os.Root so we have a // file descriptor pinning the resolved inode. Using // /proc/self/fd/ as the mount target prevents any // subsequent symlink swap from redirecting the mount. targetFile, err := root.Open(relDest) if err != nil { return fmt.Errorf("open mount target %q: %w", m.Destination, err) } targetPath := "/proc/self/fd/" + strconv.FormatUint(uint64(targetFile.Fd()), 10) // The kernel rejects remount and propagation-change syscalls // when the target is a /proc/self/fd path. Only the initial // bind mount works on such paths, so we perform that via the // fd path for TOCTOU safety and then resolve the real path for // the read-only remount and propagation change. if err := mount.Mount(m.Source, targetPath, "", bindMode); err != nil { targetFile.Close() return err } realPath, err := os.Readlink(targetPath) if err != nil { targetFile.Close() return fmt.Errorf("readlink %s: %w", targetPath, err) } if !m.Writable { if err := mount.Mount("", realPath, "", "ro,remount,bind"); err != nil { targetFile.Close() return err } } // openContainerFS() is called for temporary mounts // outside the container. Soon these will be unmounted // with lazy unmount option and given we have mounted // them rbind, all the submounts will propagate if these // are shared. If daemon is running in host namespace // and has / as shared then these unmounts will // propagate and unmount original mount as well. So make // all these mounts rprivate. Do not use propagation // property of volume as that should apply only when // mounting happens inside the container. if err := mount.MakeRPrivate(realPath); err != nil { targetFile.Close() return err } if !m.Writable && !m.ReadOnlyNonRecursive { if err := makeMountRRO(realPath); err != nil { targetFile.Close() if m.ReadOnlyForceRecursive { return err } log.G(context.TODO()).WithError(err).Debugf("Failed to make %q recursively read-only", m.Destination) } } targetFile.Close() } return mounttree.SwitchRoot(ctr.BaseFS) }, func() { defer close(done) for it := range todo { err := it.fn() if it.res != nil { it.res <- err } } // The thread will terminate when this goroutine returns, taking the // mount namespace and all the volume bind-mounts with it. }, ) if err != nil { return nil, err } vw := &containerFSView{ d: daemon, ctr: ctr, todo: todo, done: done, } runtime.SetFinalizer(vw, (*containerFSView).Close) return vw, nil } // RunInFS synchronously runs fn in the context of the container filesystem and // passes through its return value. // // The container filesystem is only visible to functions called in the same // goroutine as fn. Goroutines started from fn will see the host's filesystem. func (vw *containerFSView) RunInFS(ctx context.Context, fn func() error) error { res := make(chan error) select { case vw.todo <- future{fn: fn, res: res}: case <-ctx.Done(): return ctx.Err() } return <-res } // GoInFS starts fn in the container FS. It blocks until fn is started but does // not wait until fn returns. An error is returned if ctx is canceled before fn // has been started. // // The container filesystem is only visible to functions called in the same // goroutine as fn. Goroutines started from fn will see the host's filesystem. func (vw *containerFSView) GoInFS(ctx context.Context, fn func()) error { select { case vw.todo <- future{fn: func() error { fn(); return nil }}: return nil case <-ctx.Done(): return ctx.Err() } } // Close waits until any in-flight operations complete and frees all // resources associated with vw. func (vw *containerFSView) Close() error { runtime.SetFinalizer(vw, nil) close(vw.todo) var errs []error errs = append(errs, <-vw.done, vw.ctr.UnmountVolumes(context.TODO(), vw.d.LogVolumeEvent), vw.d.Unmount(vw.ctr), ) return errors.Join(errs...) } // Stat returns the metadata for path, relative to the current working directory // of vw inside the container filesystem view. func (vw *containerFSView) Stat(ctx context.Context, path string) (*containertypes.PathStat, error) { var stat *containertypes.PathStat err := vw.RunInFS(ctx, func() error { lstat, err := os.Lstat(path) if err != nil { return err } var target string if lstat.Mode()&os.ModeSymlink != 0 { // Fully evaluate symlinks along path to the ultimate // target, or as much as possible with broken links. target, err = symlink.FollowSymlinkInScope(path, "/") if err != nil { return err } } stat = &containertypes.PathStat{ Name: filepath.Base(path), Size: lstat.Size(), Mode: lstat.Mode(), Mtime: lstat.ModTime(), LinkTarget: target, } return nil }) return stat, err } // createIfNotExists creates a file or a directory only if it does not already exist. // The path is scoped to root using [os.Root] to prevent symlink escape attacks. func createIfNotExists(root *os.Root, unsafePath string, isDir bool) error { if isDir { return root.MkdirAll(unsafePath, 0o755) } parent := filepath.Dir(unsafePath) if parent != "." && parent != "/" { if err := root.MkdirAll(parent, 0o755); err != nil { return err } } f, err := root.OpenFile(unsafePath, os.O_CREATE|os.O_WRONLY, 0o755) if err != nil { return err } return f.Close() } // makeMountRRO makes the mount recursively read-only. func makeMountRRO(dest string) error { attr := &unix.MountAttr{ Attr_set: unix.MOUNT_ATTR_RDONLY, } var err error for { err = unix.MountSetattr(-1, dest, unix.AT_RECURSIVE, attr) if !errors.Is(err, unix.EINTR) { break } } if err != nil { err = fmt.Errorf("failed to apply MOUNT_ATTR_RDONLY with AT_RECURSIVE to %q: %w", dest, err) } return err }