daemon/oci_linux.go
4f0d95fa
 package daemon // import "github.com/docker/docker/daemon"
9c4570a9
 
 import (
cb902f44
 	"context"
9c4570a9
 	"fmt"
 	"io"
 	"os"
ee3ac3aa
 	"os/exec"
9c4570a9
 	"path/filepath"
6a8ea46c
 	"sort"
9c4570a9
 	"strconv"
 	"strings"
 
cb902f44
 	"github.com/containerd/containerd/containers"
 	coci "github.com/containerd/containerd/oci"
91e197d6
 	containertypes "github.com/docker/docker/api/types/container"
9c4570a9
 	"github.com/docker/docker/container"
17b12887
 	daemonconfig "github.com/docker/docker/daemon/config"
9c4570a9
 	"github.com/docker/docker/oci"
80d7bfd5
 	"github.com/docker/docker/oci/caps"
9c4570a9
 	"github.com/docker/docker/pkg/idtools"
 	"github.com/docker/docker/pkg/mount"
d3d724e4
 	"github.com/docker/docker/pkg/stringid"
ec87479b
 	"github.com/docker/docker/rootless/specconv"
6a70fd22
 	volumemounts "github.com/docker/docker/volume/mounts"
9c4570a9
 	"github.com/opencontainers/runc/libcontainer/apparmor"
56f77d5a
 	"github.com/opencontainers/runc/libcontainer/cgroups"
9c4570a9
 	"github.com/opencontainers/runc/libcontainer/devices"
ec87479b
 	rsystem "github.com/opencontainers/runc/libcontainer/system"
9c4570a9
 	"github.com/opencontainers/runc/libcontainer/user"
f23c00d8
 	"github.com/opencontainers/runtime-spec/specs-go"
487c6c7e
 	"github.com/pkg/errors"
1009e6a4
 	"github.com/sirupsen/logrus"
c0f883fd
 	"golang.org/x/sys/unix"
9c4570a9
 )
 
c4785536
// inContainerInitPath is the path inside the container at which the
// docker-init binary is bind-mounted when init is enabled (see
// WithCommonOptions).
const inContainerInitPath = "/sbin/" + daemonconfig.DefaultInitBinary
bcacbf52
 
c4785536
 // WithRlimits sets the container's rlimits along with merging the daemon's rlimits
 func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts {
 	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
 		var rlimits []specs.POSIXRlimit
 
 		// We want to leave the original HostConfig alone so make a copy here
 		hostConfig := *c.HostConfig
 		// Merge with the daemon defaults
 		daemon.mergeUlimits(&hostConfig)
 		for _, ul := range hostConfig.Ulimits {
 			rlimits = append(rlimits, specs.POSIXRlimit{
 				Type: "RLIMIT_" + strings.ToUpper(ul.Name),
 				Soft: uint64(ul.Soft),
 				Hard: uint64(ul.Hard),
 			})
 		}
 
 		s.Process.Rlimits = rlimits
 		return nil
9c4570a9
 	}
c4785536
 }
9c4570a9
 
c4785536
 // WithLibnetwork sets the libnetwork hook
 func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts {
 	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
 		if s.Hooks == nil {
 			s.Hooks = &specs.Hooks{}
 		}
 		for _, ns := range s.Linux.Namespaces {
 			if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
 				target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
d3d724e4
 				shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
c4785536
 				s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
 					Path: target,
 					Args: []string{
 						"libnetwork-setkey",
 						"-exec-root=" + daemon.configStore.GetExecRoot(),
 						c.ID,
d3d724e4
 						shortNetCtlrID,
c4785536
 					},
 				})
 			}
 		}
 		return nil
 	}
 }
 
// WithRootless sets the spec to the rootless configuration
//
// It delegates to specconv.ToRootless, which rewrites the spec for
// execution without root privileges.
func WithRootless(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
	return specconv.ToRootless(s)
}
 
// WithOOMScore sets the oom score
func WithOOMScore(score *int) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// The spec stores a pointer; createSpec passes the address of the
		// container's configured OomScoreAdj.
		s.Process.OOMScoreAdj = score
		return nil
	}
}
 
// WithSelinux sets the selinux labels
//
// Copies the container's SELinux process and mount labels into the spec.
func WithSelinux(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		s.Process.SelinuxLabel = c.GetProcessLabel()
		s.Linux.MountLabel = c.MountLabel
		return nil
	}
}
 
 // WithApparmor sets the apparmor profile
 func WithApparmor(c *container.Container) coci.SpecOpts {
 	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
 		if apparmor.IsEnabled() {
 			var appArmorProfile string
 			if c.AppArmorProfile != "" {
 				appArmorProfile = c.AppArmorProfile
 			} else if c.HostConfig.Privileged {
 				appArmorProfile = "unconfined"
 			} else {
 				appArmorProfile = "docker-default"
 			}
 
 			if appArmorProfile == "docker-default" {
 				// Unattended upgrades and other fun services can unload AppArmor
 				// profiles inadvertently. Since we cannot store our profile in
 				// /etc/apparmor.d, nor can we practically add other ways of
 				// telling the system to keep our profile loaded, in order to make
 				// sure that we keep the default profile enabled we dynamically
 				// reload it if necessary.
 				if err := ensureDefaultAppArmorProfile(); err != nil {
 					return err
 				}
 			}
 			s.Process.ApparmorProfile = appArmorProfile
 		}
 		return nil
 	}
 }
 
 // WithCapabilities sets the container's capabilties
 func WithCapabilities(c *container.Container) coci.SpecOpts {
 	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
 		capabilities, err := caps.TweakCapabilities(
 			oci.DefaultCapabilities(),
 			c.HostConfig.CapAdd,
 			c.HostConfig.CapDrop,
 			c.HostConfig.Capabilities,
 			c.HostConfig.Privileged,
 		)
 		if err != nil {
 			return err
 		}
 		return oci.SetCapabilities(s, capabilities)
 	}
9c4570a9
 }
 
 func readUserFile(c *container.Container, p string) (io.ReadCloser, error) {
7a7357da
 	fp, err := c.GetResourcePath(p)
9c4570a9
 	if err != nil {
 		return nil, err
 	}
83baeafc
 	fh, err := os.Open(fp)
 	if err != nil {
 		// This is needed because a nil *os.File is different to a nil
 		// io.ReadCloser and this causes GetExecUser to not detect that the
 		// container file is missing.
 		return nil, err
 	}
 	return fh, nil
9c4570a9
 }
 
 func getUser(c *container.Container, username string) (uint32, uint32, []uint32, error) {
 	passwdPath, err := user.GetPasswdPath()
 	if err != nil {
 		return 0, 0, nil, err
 	}
 	groupPath, err := user.GetGroupPath()
 	if err != nil {
 		return 0, 0, nil, err
 	}
 	passwdFile, err := readUserFile(c, passwdPath)
 	if err == nil {
 		defer passwdFile.Close()
 	}
 	groupFile, err := readUserFile(c, groupPath)
 	if err == nil {
 		defer groupFile.Close()
 	}
 
 	execUser, err := user.GetExecUser(username, nil, passwdFile, groupFile)
 	if err != nil {
 		return 0, 0, nil, err
 	}
 
 	// todo: fix this double read by a change to libcontainer/user pkg
 	groupFile, err = readUserFile(c, groupPath)
 	if err == nil {
 		defer groupFile.Close()
 	}
 	var addGroups []int
 	if len(c.HostConfig.GroupAdd) > 0 {
 		addGroups, err = user.GetAdditionalGroups(c.HostConfig.GroupAdd, groupFile)
 		if err != nil {
 			return 0, 0, nil, err
 		}
 	}
 	uid := uint32(execUser.Uid)
 	gid := uint32(execUser.Gid)
 	sgids := append(execUser.Sgids, addGroups...)
 	var additionalGids []uint32
 	for _, g := range sgids {
 		additionalGids = append(additionalGids, uint32(g))
 	}
 	return uid, gid, additionalGids, nil
 }
 
005506d3
 func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
9c4570a9
 	for i, n := range s.Linux.Namespaces {
 		if n.Type == ns.Type {
 			s.Linux.Namespaces[i] = ns
 			return
 		}
 	}
 	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
 }
 
c4785536
// WithNamespaces sets the container's namespaces
//
// Configures user, network, ipc, pid and uts namespaces in the spec based
// on the container's HostConfig modes. The user namespace is handled first
// because sharing net/ipc/pid namespaces with another container also
// requires joining that container's user namespace when userns is active.
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		userNS := false
		// user
		if c.HostConfig.UsernsMode.IsPrivate() {
			uidMap := daemon.idMapping.UIDs()
			if uidMap != nil {
				userNS = true
				ns := specs.LinuxNamespace{Type: "user"}
				setNamespace(s, ns)
				s.Linux.UIDMappings = specMapping(uidMap)
				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDs())
			}
		}
		// network
		if !c.Config.NetworkDisabled {
			ns := specs.LinuxNamespace{Type: "network"}
			// NetworkMode may be of the form "container:<name-or-id>".
			parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
			if parts[0] == "container" {
				nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
				if err != nil {
					return err
				}
				ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
				if userNS {
					// to share a net namespace, they must also share a user namespace
					nsUser := specs.LinuxNamespace{Type: "user"}
					nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
					setNamespace(s, nsUser)
				}
			} else if c.HostConfig.NetworkMode.IsHost() {
				ns.Path = c.NetworkSettings.SandboxKey
			}
			setNamespace(s, ns)
		}

		// ipc
		ipcMode := c.HostConfig.IpcMode
		switch {
		case ipcMode.IsContainer():
			ns := specs.LinuxNamespace{Type: "ipc"}
			ic, err := daemon.getIpcContainer(ipcMode.Container())
			if err != nil {
				return err
			}
			ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
			setNamespace(s, ns)
			if userNS {
				// to share an IPC namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{Type: "user"}
				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
				setNamespace(s, nsUser)
			}
		case ipcMode.IsHost():
			oci.RemoveNamespace(s, specs.LinuxNamespaceType("ipc"))
		case ipcMode.IsEmpty():
			// A container was created by an older version of the daemon.
			// The default behavior used to be what is now called "shareable".
			fallthrough
		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
			ns := specs.LinuxNamespace{Type: "ipc"}
			setNamespace(s, ns)
		default:
			return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
		}

		// pid
		if c.HostConfig.PidMode.IsContainer() {
			ns := specs.LinuxNamespace{Type: "pid"}
			pc, err := daemon.getPidContainer(c)
			if err != nil {
				return err
			}
			ns.Path = fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID())
			setNamespace(s, ns)
			if userNS {
				// to share a PID namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{Type: "user"}
				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID())
				setNamespace(s, nsUser)
			}
		} else if c.HostConfig.PidMode.IsHost() {
			oci.RemoveNamespace(s, specs.LinuxNamespaceType("pid"))
		} else {
			ns := specs.LinuxNamespace{Type: "pid"}
			setNamespace(s, ns)
		}
		// uts
		if c.HostConfig.UTSMode.IsHost() {
			// Sharing the host UTS namespace means the hostname cannot be
			// set in the spec.
			oci.RemoveNamespace(s, specs.LinuxNamespaceType("uts"))
			s.Hostname = ""
		}

		return nil
	}
}
 
005506d3
 func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
 	var ids []specs.LinuxIDMapping
9c4570a9
 	for _, item := range s {
005506d3
 		ids = append(ids, specs.LinuxIDMapping{
9c4570a9
 			HostID:      uint32(item.HostID),
 			ContainerID: uint32(item.ContainerID),
 			Size:        uint32(item.Size),
 		})
 	}
 	return ids
 }
 
 // Get the source mount point of directory passed in as argument. Also return
 // optional fields.
 func getSourceMount(source string) (string, string, error) {
 	// Ensure any symlinks are resolved.
 	sourcePath, err := filepath.EvalSymlinks(source)
 	if err != nil {
 		return "", "", err
 	}
 
871c9572
 	mi, err := mount.GetMounts(mount.ParentsFilter(sourcePath))
9c4570a9
 	if err != nil {
 		return "", "", err
 	}
871c9572
 	if len(mi) < 1 {
 		return "", "", fmt.Errorf("Can't find mount point of %s", source)
9c4570a9
 	}
 
871c9572
 	// find the longest mount point
 	var idx, maxlen int
 	for i := range mi {
 		if len(mi[i].Mountpoint) > maxlen {
 			maxlen = len(mi[i].Mountpoint)
 			idx = i
9c4570a9
 		}
 	}
d8fd6137
 	return mi[idx].Mountpoint, mi[idx].Optional, nil
9c4570a9
 }
 
487c6c7e
// Prefixes of mountinfo "optional fields" that indicate a mount's
// propagation mode: "shared:" marks a shared mount, "master:" marks a
// slave mount.
const (
	sharedPropagationOption = "shared:"
	slavePropagationOption  = "master:"
)
 
 // hasMountinfoOption checks if any of the passed any of the given option values
 // are set in the passed in option string.
 func hasMountinfoOption(opts string, vals ...string) bool {
 	for _, opt := range strings.Split(opts, " ") {
 		for _, val := range vals {
 			if strings.HasPrefix(opt, val) {
 				return true
 			}
 		}
 	}
 	return false
 }
 
9c4570a9
 // Ensure mount point on which path is mounted, is shared.
 func ensureShared(path string) error {
 	sourceMount, optionalOpts, err := getSourceMount(path)
 	if err != nil {
 		return err
 	}
 	// Make sure source mount point is shared.
487c6c7e
 	if !hasMountinfoOption(optionalOpts, sharedPropagationOption) {
 		return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
9c4570a9
 	}
 	return nil
 }
 
 // Ensure mount point on which path is mounted, is either shared or slave.
 func ensureSharedOrSlave(path string) error {
 	sourceMount, optionalOpts, err := getSourceMount(path)
 	if err != nil {
 		return err
 	}
 
487c6c7e
 	if !hasMountinfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
 		return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
9c4570a9
 	}
 	return nil
 }
 
c0f883fd
 // Get the set of mount flags that are set on the mount that contains the given
 // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
 // bind-mounting "with options" will not fail with user namespaces, due to
 // kernel restrictions that require user namespace mounts to preserve
 // CL_UNPRIVILEGED locked flags.
 func getUnprivilegedMountFlags(path string) ([]string, error) {
 	var statfs unix.Statfs_t
 	if err := unix.Statfs(path, &statfs); err != nil {
 		return nil, err
 	}
 
 	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
 	unprivilegedFlags := map[uint64]string{
 		unix.MS_RDONLY:     "ro",
 		unix.MS_NODEV:      "nodev",
 		unix.MS_NOEXEC:     "noexec",
 		unix.MS_NOSUID:     "nosuid",
 		unix.MS_NOATIME:    "noatime",
 		unix.MS_RELATIME:   "relatime",
 		unix.MS_NODIRATIME: "nodiratime",
 	}
 
 	var flags []string
 	for mask, flag := range unprivilegedFlags {
 		if uint64(statfs.Flags)&mask == mask {
 			flags = append(flags, flag)
 		}
 	}
 
 	return flags, nil
 }
 
9c4570a9
// mountPropagationMap translates user-facing propagation-mode names to the
// mount package's flag values; mountPropagationReverseMap is its inverse.
var (
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)
 
5c154cfa
 // inSlice tests whether a string is contained in a slice of strings or not.
 // Comparison is case sensitive
 func inSlice(slice []string, s string) bool {
 	for _, ss := range slice {
 		if s == ss {
 			return true
 		}
 	}
 	return false
 }
 
c4785536
// WithMounts sets the container's mounts
//
// Collects all mounts for the container (volumes, ipc, tmpfs, secrets),
// merges them with the spec's default mounts, converts them to OCI mount
// entries, adjusts rootfs propagation to match shared/slave volumes, and
// applies read-only and user-namespace related fix-ups. The error return is
// named so the deferred cleanup below can observe it.
func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
		if err := daemon.setupContainerMountsRoot(c); err != nil {
			return err
		}

		if err := daemon.setupIpcDirs(c); err != nil {
			return err
		}

		// Remove the secret dir again if any later step fails; reads the
		// named return value err.
		defer func() {
			if err != nil {
				daemon.cleanupSecretDir(c)
			}
		}()

		if err := daemon.setupSecretDir(c); err != nil {
			return err
		}

		ms, err := daemon.setupMounts(c)
		if err != nil {
			return err
		}

		if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
			ms = append(ms, c.IpcMounts()...)
		}

		tmpfsMounts, err := c.TmpfsMounts()
		if err != nil {
			return err
		}
		ms = append(ms, tmpfsMounts...)

		secretMounts, err := c.SecretMounts()
		if err != nil {
			return err
		}
		ms = append(ms, secretMounts...)

		sort.Sort(mounts(ms))

		mounts := ms

		// Index the user-supplied mounts by destination for fast lookup.
		userMounts := make(map[string]struct{})
		for _, m := range mounts {
			userMounts[m.Destination] = struct{}{}
		}

		// Copy all mounts from spec to defaultMounts, except for
		//  - mounts overridden by a user supplied mount;
		//  - all mounts under /dev if a user supplied /dev is present;
		//  - /dev/shm, in case IpcMode is none.
		// While at it, also
		//  - set size for /dev/shm from shmsize.
		// Note: s.Mounts[:0] filters in place, reusing the backing array.
		defaultMounts := s.Mounts[:0]
		_, mountDev := userMounts["/dev"]
		for _, m := range s.Mounts {
			if _, ok := userMounts[m.Destination]; ok {
				// filter out mount overridden by a user supplied mount
				continue
			}
			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
				// filter out everything under /dev if /dev is user-mounted
				continue
			}

			if m.Destination == "/dev/shm" {
				if c.HostConfig.IpcMode.IsNone() {
					// filter out /dev/shm for "none" IpcMode
					continue
				}
				// set size for /dev/shm mount from spec
				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
				m.Options = append(m.Options, sizeOpt)
			}

			defaultMounts = append(defaultMounts, m)
		}

		s.Mounts = defaultMounts
		for _, m := range mounts {
			if m.Source == "tmpfs" {
				// Translate tmpfs mounts directly; merge default options
				// with user-supplied mount data.
				data := m.Data
				parser := volumemounts.NewParser("linux")
				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
				if data != "" {
					options = append(options, strings.Split(data, ",")...)
				}

				merged, err := mount.MergeTmpfsOptions(options)
				if err != nil {
					return err
				}

				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
				continue
			}

			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}

			// Determine property of RootPropagation based on volume
			// properties. If a volume is shared, then keep root propagation
			// shared. This should work for slave and private volumes too.
			//
			// For slave volumes, it can be either [r]shared/[r]slave.
			//
			// For private volumes any root propagation value should work.
			pFlag := mountPropagationMap[m.Propagation]
			switch pFlag {
			case mount.SHARED, mount.RSHARED:
				if err := ensureShared(m.Source); err != nil {
					return err
				}
				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
				}
			case mount.SLAVE, mount.RSLAVE:
				var fallback bool
				if err := ensureSharedOrSlave(m.Source); err != nil {
					// For backwards compatibility purposes, treat mounts from the daemon root
					// as special since we automatically add rslave propagation to these mounts
					// when the user did not set anything, so we should fallback to the old
					// behavior which is to use private propagation which is normally the
					// default.
					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
						return err
					}

					cm, ok := c.MountPoints[m.Destination]
					if !ok {
						return err
					}
					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
						// This means the user explicitly set a propagation, do not fallback in that case.
						return err
					}
					fallback = true
					logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
				}
				if !fallback {
					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
					}
				}
			}

			bindMode := "rbind"
			if m.NonRecursive {
				bindMode = "bind"
			}
			opts := []string{bindMode}
			if !m.Writable {
				opts = append(opts, "ro")
			}
			if pFlag != 0 {
				opts = append(opts, mountPropagationReverseMap[pFlag])
			}

			// If we are using user namespaces, then we must make sure that we
			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
			// "mount" when we bind-mount. The reason for this is that at the point
			// when runc sets up the root filesystem, it is already inside a user
			// namespace, and thus cannot change any flags that are locked.
			if daemon.configStore.RemappedRoot != "" {
				unprivOpts, err := getUnprivilegedMountFlags(m.Source)
				if err != nil {
					return err
				}
				opts = append(opts, unprivOpts...)
			}

			mt.Options = opts
			s.Mounts = append(s.Mounts, mt)
		}

		if s.Root.Readonly {
			// Propagate read-only rootfs to non-user mounts, except for a
			// fixed set of special filesystems.
			for i, m := range s.Mounts {
				switch m.Destination {
				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
					continue
				}
				if _, ok := userMounts[m.Destination]; !ok {
					if !inSlice(m.Options, "ro") {
						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
					}
				}
			}
		}

		if c.HostConfig.Privileged {
			// clear readonly for /sys
			for i := range s.Mounts {
				if s.Mounts[i].Destination == "/sys" {
					clearReadOnly(&s.Mounts[i])
				}
			}
			s.Linux.ReadonlyPaths = nil
			s.Linux.MaskedPaths = nil
		}

		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
		if uidMap := daemon.idMapping.UIDs(); uidMap != nil || c.HostConfig.Privileged {
			for i, m := range s.Mounts {
				if m.Type == "cgroup" {
					clearReadOnly(&s.Mounts[i])
				}
			}
		}

		return nil

	}
}
 
// WithCommonOptions sets common docker options
//
// Configures the rootfs, process args (including the optional docker-init
// wrapper), working directory, environment, terminal and hostname.
func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if c.BaseFS == nil {
			return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil")
		}
		linkedEnv, err := daemon.setupLinkedContainers(c)
		if err != nil {
			return err
		}
		s.Root = &specs.Root{
			Path:     c.BaseFS.Path(),
			Readonly: c.HostConfig.ReadonlyRootfs,
		}
		if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
			return err
		}
		cwd := c.Config.WorkingDir
		if len(cwd) == 0 {
			cwd = "/"
		}
		s.Process.Args = append([]string{c.Path}, c.Args...)

		// only add the custom init if it is specified and the container is running in its
		// own private pid namespace.  It does not make sense to add if it is running in the
		// host namespace or another container's pid namespace where we already have an init
		if c.HostConfig.PidMode.IsPrivate() {
			// Per-container Init setting overrides the daemon default.
			if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
				(c.HostConfig.Init == nil && daemon.configStore.Init) {
				s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
				path := daemon.configStore.InitPath
				if path == "" {
					// Fall back to looking up the default init binary on
					// the daemon host's PATH.
					path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
					if err != nil {
						return err
					}
				}
				s.Mounts = append(s.Mounts, specs.Mount{
					Destination: inContainerInitPath,
					Type:        "bind",
					Source:      path,
					Options:     []string{"bind", "ro"},
				})
			}
		}
		s.Process.Cwd = cwd
		s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
		s.Process.Terminal = c.Config.Tty

		s.Hostname = c.Config.Hostname
		setLinuxDomainname(c, s)

		return nil
	}
}
 
c4785536
// WithCgroups sets the container's cgroups
//
// Computes the cgroups path from the configured parent (container-level
// setting wins over the daemon-level one) using either the systemd
// "parent:scope:id" form or a plain filesystem path, then ensures the
// parent cgroup path is initialized.
func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var cgroupsPath string
		scopePrefix := "docker"
		parent := "/docker"
		useSystemd := UsingSystemd(daemon.configStore)
		if useSystemd {
			parent = "system.slice"
		}

		if c.HostConfig.CgroupParent != "" {
			parent = c.HostConfig.CgroupParent
		} else if daemon.configStore.CgroupParent != "" {
			parent = daemon.configStore.CgroupParent
		}

		if useSystemd {
			// systemd expects "slice:prefix:name".
			cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
			logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
		} else {
			cgroupsPath = filepath.Join(parent, c.ID)
		}
		s.Linux.CgroupsPath = cgroupsPath
		p := cgroupsPath
		if useSystemd {
			// Resolve the path relative to the init process's cgroup; the
			// GetOwnCgroup call is kept for its error check only.
			initPath, err := cgroups.GetInitCgroup("cpu")
			if err != nil {
				return err
			}
			_, err = cgroups.GetOwnCgroup("cpu")
			if err != nil {
				return err
			}
			p = filepath.Join(initPath, s.Linux.CgroupsPath)
		}

		// Clean path to guard against things like ../../../BAD
		parentPath := filepath.Dir(p)
		if !filepath.IsAbs(parentPath) {
			parentPath = filepath.Clean("/" + parentPath)
		}

		if err := daemon.initCgroupsPath(parentPath); err != nil {
			return fmt.Errorf("linux init cgroups path: %v", err)
		}
		return nil
	}
}
 
c4785536
// WithDevices sets the container's devices
//
// Privileged containers (outside a user namespace) get every host device
// and unrestricted "rwm" cgroup access; otherwise only the explicitly
// mapped devices and device cgroup rules are applied. Device requests
// (e.g. for device drivers) are handled last.
func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// Build lists of devices allowed and created within the container.
		var devs []specs.LinuxDevice
		devPermissions := s.Linux.Resources.Devices
		if c.HostConfig.Privileged && !rsystem.RunningInUserNS() {
			hostDevices, err := devices.HostDevices()
			if err != nil {
				return err
			}
			for _, d := range hostDevices {
				devs = append(devs, oci.Device(d))
			}
			devPermissions = []specs.LinuxDeviceCgroup{
				{
					Allow:  true,
					Access: "rwm",
				},
			}
		} else {
			for _, deviceMapping := range c.HostConfig.Devices {
				d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
				if err != nil {
					return err
				}
				devs = append(devs, d...)
				devPermissions = append(devPermissions, dPermissions...)
			}

			var err error
			devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
			if err != nil {
				return err
			}
		}

		s.Linux.Devices = append(s.Linux.Devices, devs...)
		s.Linux.Resources.Devices = devPermissions

		for _, req := range c.HostConfig.DeviceRequests {
			if err := daemon.handleDevice(req, s); err != nil {
				return err
			}
		}
		return nil
	}
}
56f77d5a
 
c4785536
// WithResources applies the container resources
//
// Builds the spec's LinuxResources (memory, CPU, block IO weights and
// throttles, pids limit) from the container's HostConfig, preserving any
// device permissions already present in the spec.
func WithResources(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		r := c.HostConfig.Resources
		weightDevices, err := getBlkioWeightDevices(r)
		if err != nil {
			return err
		}
		readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
		if err != nil {
			return err
		}
		writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
		if err != nil {
			return err
		}
		readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
		if err != nil {
			return err
		}
		writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
		if err != nil {
			return err
		}

		memoryRes := getMemoryResources(r)
		cpuRes, err := getCPUResources(r)
		if err != nil {
			return err
		}
		// Copied to a local so the spec holds a pointer to a stable value.
		blkioWeight := r.BlkioWeight

		specResources := &specs.LinuxResources{
			Memory: memoryRes,
			CPU:    cpuRes,
			BlockIO: &specs.LinuxBlockIO{
				Weight:                  &blkioWeight,
				WeightDevice:            weightDevices,
				ThrottleReadBpsDevice:   readBpsDevice,
				ThrottleWriteBpsDevice:  writeBpsDevice,
				ThrottleReadIOPSDevice:  readIOpsDevice,
				ThrottleWriteIOPSDevice: writeIOpsDevice,
			},
			Pids: getPidsLimit(r),
		}

		// Keep device permissions previously set on the spec (e.g. by
		// WithDevices).
		if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
			specResources.Devices = s.Linux.Resources.Devices
		}

		s.Linux.Resources = specResources
		return nil
	}
}
56f77d5a
 
c4785536
 // WithSysctls sets the container's sysctls
 func WithSysctls(c *container.Container) coci.SpecOpts {
cb902f44
 	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
 		// We merge the sysctls injected above with the HostConfig (latter takes
 		// precedence for backwards-compatibility reasons).
 		for k, v := range c.HostConfig.Sysctls {
 			s.Linux.Sysctl[k] = v
 		}
 		return nil
56f77d5a
 	}
cb902f44
 }
56f77d5a
 
c4785536
 // WithUser sets the container's user
 func WithUser(c *container.Container) coci.SpecOpts {
cb902f44
 	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
 		uid, gid, additionalGids, err := getUser(c, c.Config.User)
 		if err != nil {
 			return err
 		}
 		s.Process.User.UID = uid
 		s.Process.User.GID = gid
 		s.Process.User.AdditionalGids = additionalGids
 		return nil
56f77d5a
 	}
cb902f44
 }
 
// createSpec builds the OCI runtime spec for the container by applying the
// With* SpecOpts in order to the default spec. The order matters: e.g.
// WithDevices runs before WithResources, which preserves the device
// permissions WithDevices set.
func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
	var (
		opts []coci.SpecOpts
		s    = oci.DefaultSpec()
	)
	opts = append(opts,
		WithCommonOptions(daemon, c),
		WithCgroups(daemon, c),
		WithResources(c),
		WithSysctls(c),
		WithDevices(daemon, c),
		WithUser(c),
		WithRlimits(daemon, c),
		WithNamespaces(daemon, c),
		WithCapabilities(c),
		WithSeccomp(daemon, c),
		WithMounts(daemon, c),
		WithLibnetwork(daemon, c),
		WithApparmor(c),
		WithSelinux(c),
		WithOOMScore(&c.HostConfig.OomScoreAdj),
	)
	if c.NoNewPrivileges {
		opts = append(opts, coci.WithNoNewPrivileges)
	}

	// Set the masked and readonly paths with regard to the host config options if they are set.
	if c.HostConfig.MaskedPaths != nil {
		opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
	}
	if c.HostConfig.ReadonlyPaths != nil {
		opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
	}
	if daemon.configStore.Rootless {
		opts = append(opts, WithRootless)
	}
	return &s, coci.ApplyOpts(context.Background(), nil, &containers.Container{
		ID: c.ID,
	}, &s, opts...)
}
 
 func clearReadOnly(m *specs.Mount) {
 	var opt []string
 	for _, o := range m.Options {
 		if o != "ro" {
 			opt = append(opt, o)
 		}
 	}
 	m.Options = opt
 }
7d705a73
 
 // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
 func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
 	ulimits := c.Ulimits
 	// Merge ulimits with daemon defaults
 	ulIdx := make(map[string]struct{})
 	for _, ul := range ulimits {
 		ulIdx[ul.Name] = struct{}{}
 	}
 	for name, ul := range daemon.configStore.Ulimits {
 		if _, exists := ulIdx[name]; !exists {
 			ulimits = append(ulimits, ul)
 		}
 	}
 	c.Ulimits = ulimits
 }