daemon/oci_linux.go
9c4570a9
 package daemon
 
 import (
 	"fmt"
 	"io"
 	"os"
ee3ac3aa
 	"os/exec"
9c4570a9
 	"path/filepath"
1756af6f
 	"regexp"
6a8ea46c
 	"sort"
9c4570a9
 	"strconv"
 	"strings"
 
91e197d6
 	containertypes "github.com/docker/docker/api/types/container"
9c4570a9
 	"github.com/docker/docker/container"
 	"github.com/docker/docker/daemon/caps"
17b12887
 	daemonconfig "github.com/docker/docker/daemon/config"
9c4570a9
 	"github.com/docker/docker/oci"
 	"github.com/docker/docker/pkg/idtools"
 	"github.com/docker/docker/pkg/mount"
 	"github.com/docker/docker/volume"
 	"github.com/opencontainers/runc/libcontainer/apparmor"
56f77d5a
 	"github.com/opencontainers/runc/libcontainer/cgroups"
9c4570a9
 	"github.com/opencontainers/runc/libcontainer/devices"
 	"github.com/opencontainers/runc/libcontainer/user"
56f77d5a
 	specs "github.com/opencontainers/runtime-spec/specs-go"
1009e6a4
 	"github.com/sirupsen/logrus"
c0f883fd
 	"golang.org/x/sys/unix"
9c4570a9
 )
 
f7f101d5
 // nolint: gosimple
1756af6f
 // deviceCgroupRuleRegex matches a device cgroup rule of the form
 // "<type> <major>:<minor> <access>", where type is one of a, c or b, major
 // and minor are each a decimal number or "*", and access is one to three of
 // the letters r, w and m. The four fields are captured as groups 1-4.
 var deviceCgroupRuleRegex = regexp.MustCompile("^([acb]) ([0-9]+|\\*):([0-9]+|\\*) ([rwm]{1,3})$")
 
9c4570a9
 func setResources(s *specs.Spec, r containertypes.Resources) error {
 	weightDevices, err := getBlkioWeightDevices(r)
 	if err != nil {
 		return err
 	}
668f0a2a
 	readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
9c4570a9
 	if err != nil {
 		return err
 	}
668f0a2a
 	writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
9c4570a9
 	if err != nil {
 		return err
 	}
668f0a2a
 	readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
9c4570a9
 	if err != nil {
 		return err
 	}
668f0a2a
 	writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
9c4570a9
 	if err != nil {
 		return err
 	}
 
 	memoryRes := getMemoryResources(r)
005506d3
 	cpuRes, err := getCPUResources(r)
 	if err != nil {
 		return err
 	}
9c4570a9
 	blkioWeight := r.BlkioWeight
 
005506d3
 	specResources := &specs.LinuxResources{
9c4570a9
 		Memory: memoryRes,
 		CPU:    cpuRes,
005506d3
 		BlockIO: &specs.LinuxBlockIO{
9c4570a9
 			Weight:                  &blkioWeight,
 			WeightDevice:            weightDevices,
 			ThrottleReadBpsDevice:   readBpsDevice,
 			ThrottleWriteBpsDevice:  writeBpsDevice,
 			ThrottleReadIOPSDevice:  readIOpsDevice,
 			ThrottleWriteIOPSDevice: writeIOpsDevice,
 		},
005506d3
 		Pids: &specs.LinuxPids{
 			Limit: r.PidsLimit,
9c4570a9
 		},
 	}
 
 	if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
 		specResources.Devices = s.Linux.Resources.Devices
 	}
 
 	s.Linux.Resources = specResources
 	return nil
 }
 
 func setDevices(s *specs.Spec, c *container.Container) error {
 	// Build lists of devices allowed and created within the container.
005506d3
 	var devs []specs.LinuxDevice
ee612358
 	devPermissions := s.Linux.Resources.Devices
9c4570a9
 	if c.HostConfig.Privileged {
 		hostDevices, err := devices.HostDevices()
 		if err != nil {
 			return err
 		}
 		for _, d := range hostDevices {
53b9b99e
 			devs = append(devs, oci.Device(d))
9c4570a9
 		}
005506d3
 		devPermissions = []specs.LinuxDeviceCgroup{
ee612358
 			{
 				Allow:  true,
005506d3
 				Access: "rwm",
ee612358
 			},
 		}
9c4570a9
 	} else {
 		for _, deviceMapping := range c.HostConfig.Devices {
53b9b99e
 			d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
9c4570a9
 			if err != nil {
 				return err
 			}
 			devs = append(devs, d...)
ee612358
 			devPermissions = append(devPermissions, dPermissions...)
9c4570a9
 		}
1756af6f
 
 		for _, deviceCgroupRule := range c.HostConfig.DeviceCgroupRules {
 			ss := deviceCgroupRuleRegex.FindAllStringSubmatch(deviceCgroupRule, -1)
 			if len(ss[0]) != 5 {
 				return fmt.Errorf("invalid device cgroup rule format: '%s'", deviceCgroupRule)
 			}
 			matches := ss[0]
 
005506d3
 			dPermissions := specs.LinuxDeviceCgroup{
1756af6f
 				Allow:  true,
005506d3
 				Type:   matches[1],
 				Access: matches[4],
1756af6f
 			}
 			if matches[2] == "*" {
 				major := int64(-1)
 				dPermissions.Major = &major
 			} else {
 				major, err := strconv.ParseInt(matches[2], 10, 64)
 				if err != nil {
 					return fmt.Errorf("invalid major value in device cgroup rule format: '%s'", deviceCgroupRule)
 				}
 				dPermissions.Major = &major
 			}
 			if matches[3] == "*" {
 				minor := int64(-1)
 				dPermissions.Minor = &minor
 			} else {
 				minor, err := strconv.ParseInt(matches[3], 10, 64)
 				if err != nil {
 					return fmt.Errorf("invalid minor value in device cgroup rule format: '%s'", deviceCgroupRule)
 				}
 				dPermissions.Minor = &minor
 			}
 			devPermissions = append(devPermissions, dPermissions)
 		}
9c4570a9
 	}
 
 	s.Linux.Devices = append(s.Linux.Devices, devs...)
ee612358
 	s.Linux.Resources.Devices = devPermissions
9c4570a9
 	return nil
 }
 
ddae20c0
 func (daemon *Daemon) setRlimits(s *specs.Spec, c *container.Container) error {
45d85c99
 	var rlimits []specs.POSIXRlimit
9c4570a9
 
7d705a73
 	// We want to leave the original HostConfig alone so make a copy here
 	hostConfig := *c.HostConfig
 	// Merge with the daemon defaults
 	daemon.mergeUlimits(&hostConfig)
 	for _, ul := range hostConfig.Ulimits {
45d85c99
 		rlimits = append(rlimits, specs.POSIXRlimit{
9c4570a9
 			Type: "RLIMIT_" + strings.ToUpper(ul.Name),
 			Soft: uint64(ul.Soft),
 			Hard: uint64(ul.Hard),
 		})
 	}
 
 	s.Process.Rlimits = rlimits
 	return nil
 }
 
 // setUser resolves the user configured for the container via getUser and
 // stores the resulting uid, gid and additional gids in s.Process.User.
 // The spec is left untouched when the lookup fails.
 func setUser(s *specs.Spec, c *container.Container) error {
 	uid, gid, extraGids, err := getUser(c, c.Config.User)
 	if err != nil {
 		return err
 	}
 
 	procUser := &s.Process.User
 	procUser.UID, procUser.GID, procUser.AdditionalGids = uid, gid, extraGids
 	return nil
 }
 
 func readUserFile(c *container.Container, p string) (io.ReadCloser, error) {
7a7357da
 	fp, err := c.GetResourcePath(p)
9c4570a9
 	if err != nil {
 		return nil, err
 	}
 	return os.Open(fp)
 }
 
 // getUser resolves username against the passwd and group files of
 // container c and returns the uid, gid and additional gids to run as. The
 // additional gids are the user's supplementary groups followed by the groups
 // listed in HostConfig.GroupAdd.
 //
 // A passwd or group file that cannot be opened is passed to the lookup
 // functions as nil rather than as an unusable handle; it is not an error by
 // itself. Errors from locating the file paths or from the lookups themselves
 // are returned.
 func getUser(c *container.Container, username string) (uint32, uint32, []uint32, error) {
 	passwdPath, err := user.GetPasswdPath()
 	if err != nil {
 		return 0, 0, nil, err
 	}
 	groupPath, err := user.GetGroupPath()
 	if err != nil {
 		return 0, 0, nil, err
 	}
 
 	// Opening is best effort. On failure force a nil interface value so the
 	// lookup helpers see "no file" instead of a reader that wraps a nil
 	// *os.File.
 	passwdFile, err := readUserFile(c, passwdPath)
 	if err != nil {
 		passwdFile = nil
 	} else {
 		defer passwdFile.Close()
 	}
 	groupFile, err := readUserFile(c, groupPath)
 	if err != nil {
 		groupFile = nil
 	} else {
 		defer groupFile.Close()
 	}
 
 	execUser, err := user.GetExecUser(username, nil, passwdFile, groupFile)
 	if err != nil {
 		return 0, 0, nil, err
 	}
 
 	var addGroups []int
 	if len(c.HostConfig.GroupAdd) > 0 {
 		// todo: fix this double read by a change to libcontainer/user pkg
 		// The group file is reopened because the handle above has already
 		// been handed to GetExecUser; it is only needed when extra groups
 		// were requested.
 		groupFile, err = readUserFile(c, groupPath)
 		if err != nil {
 			groupFile = nil
 		} else {
 			defer groupFile.Close()
 		}
 		addGroups, err = user.GetAdditionalGroups(c.HostConfig.GroupAdd, groupFile)
 		if err != nil {
 			return 0, 0, nil, err
 		}
 	}
 
 	uid := uint32(execUser.Uid)
 	gid := uint32(execUser.Gid)
 	sgids := append(execUser.Sgids, addGroups...)
 	var additionalGids []uint32
 	for _, g := range sgids {
 		additionalGids = append(additionalGids, uint32(g))
 	}
 	return uid, gid, additionalGids, nil
 }
 
005506d3
 func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
9c4570a9
 	for i, n := range s.Linux.Namespaces {
 		if n.Type == ns.Type {
 			s.Linux.Namespaces[i] = ns
 			return
 		}
 	}
 	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
 }
 
 func setCapabilities(s *specs.Spec, c *container.Container) error {
 	var caplist []string
 	var err error
 	if c.HostConfig.Privileged {
 		caplist = caps.GetAllCapabilities()
 	} else {
005506d3
 		caplist, err = caps.TweakCapabilities(s.Process.Capabilities.Effective, c.HostConfig.CapAdd, c.HostConfig.CapDrop)
9c4570a9
 		if err != nil {
 			return err
 		}
 	}
005506d3
 	s.Process.Capabilities.Effective = caplist
 	s.Process.Capabilities.Bounding = caplist
 	s.Process.Capabilities.Permitted = caplist
 	s.Process.Capabilities.Inheritable = caplist
9c4570a9
 	return nil
 }
 
 func setNamespaces(daemon *Daemon, s *specs.Spec, c *container.Container) error {
2b278f48
 	userNS := false
 	// user
 	if c.HostConfig.UsernsMode.IsPrivate() {
09cd96c5
 		uidMap := daemon.idMappings.UIDs()
2b278f48
 		if uidMap != nil {
 			userNS = true
005506d3
 			ns := specs.LinuxNamespace{Type: "user"}
2b278f48
 			setNamespace(s, ns)
 			s.Linux.UIDMappings = specMapping(uidMap)
09cd96c5
 			s.Linux.GIDMappings = specMapping(daemon.idMappings.GIDs())
2b278f48
 		}
 	}
9c4570a9
 	// network
 	if !c.Config.NetworkDisabled {
005506d3
 		ns := specs.LinuxNamespace{Type: "network"}
9c4570a9
 		parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
 		if parts[0] == "container" {
 			nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
 			if err != nil {
 				return err
 			}
 			ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
2b278f48
 			if userNS {
 				// to share a net namespace, they must also share a user namespace
005506d3
 				nsUser := specs.LinuxNamespace{Type: "user"}
2b278f48
 				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
 				setNamespace(s, nsUser)
 			}
9c4570a9
 		} else if c.HostConfig.NetworkMode.IsHost() {
 			ns.Path = c.NetworkSettings.SandboxKey
 		}
 		setNamespace(s, ns)
 	}
7120976d
 
9c4570a9
 	// ipc
7120976d
 	ipcMode := c.HostConfig.IpcMode
 	switch {
 	case ipcMode.IsContainer():
005506d3
 		ns := specs.LinuxNamespace{Type: "ipc"}
7120976d
 		ic, err := daemon.getIpcContainer(ipcMode.Container())
9c4570a9
 		if err != nil {
 			return err
 		}
 		ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
 		setNamespace(s, ns)
2b278f48
 		if userNS {
 			// to share an IPC namespace, they must also share a user namespace
005506d3
 			nsUser := specs.LinuxNamespace{Type: "user"}
2b278f48
 			nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
 			setNamespace(s, nsUser)
 		}
7120976d
 	case ipcMode.IsHost():
005506d3
 		oci.RemoveNamespace(s, specs.LinuxNamespaceType("ipc"))
7120976d
 	case ipcMode.IsEmpty():
 		// A container was created by an older version of the daemon.
 		// The default behavior used to be what is now called "shareable".
 		fallthrough
 	case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
005506d3
 		ns := specs.LinuxNamespace{Type: "ipc"}
9c4570a9
 		setNamespace(s, ns)
7120976d
 	default:
 		return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
9c4570a9
 	}
7120976d
 
9c4570a9
 	// pid
fb43ef64
 	if c.HostConfig.PidMode.IsContainer() {
005506d3
 		ns := specs.LinuxNamespace{Type: "pid"}
fb43ef64
 		pc, err := daemon.getPidContainer(c)
 		if err != nil {
 			return err
 		}
 		ns.Path = fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID())
 		setNamespace(s, ns)
 		if userNS {
c1be45fa
 			// to share a PID namespace, they must also share a user namespace
005506d3
 			nsUser := specs.LinuxNamespace{Type: "user"}
fb43ef64
 			nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID())
 			setNamespace(s, nsUser)
 		}
 	} else if c.HostConfig.PidMode.IsHost() {
005506d3
 		oci.RemoveNamespace(s, specs.LinuxNamespaceType("pid"))
fb43ef64
 	} else {
005506d3
 		ns := specs.LinuxNamespace{Type: "pid"}
fb43ef64
 		setNamespace(s, ns)
9c4570a9
 	}
 	// uts
 	if c.HostConfig.UTSMode.IsHost() {
005506d3
 		oci.RemoveNamespace(s, specs.LinuxNamespaceType("uts"))
9c4570a9
 		s.Hostname = ""
 	}
 
 	return nil
 }
 
005506d3
 func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
 	var ids []specs.LinuxIDMapping
9c4570a9
 	for _, item := range s {
005506d3
 		ids = append(ids, specs.LinuxIDMapping{
9c4570a9
 			HostID:      uint32(item.HostID),
 			ContainerID: uint32(item.ContainerID),
 			Size:        uint32(item.Size),
 		})
 	}
 	return ids
 }
 
 // getMountInfo returns the first entry of mountinfo whose mount point is
 // exactly dir, or nil when there is none.
 func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info {
 	for i := range mountinfo {
 		if candidate := mountinfo[i]; candidate.Mountpoint == dir {
 			return candidate
 		}
 	}
 	return nil
 }
 
 // Get the source mount point of directory passed in as argument. Also return
 // optional fields.
 //
 // Symlinks in source are resolved first. The resolved path and then each of
 // its parent directories is looked up in the current mount table, and the
 // first directory that is a mount point is returned with that mount's
 // optional fields. An error is returned if the path cannot be resolved, the
 // mount table cannot be read, or no enclosing mount point is found.
 func getSourceMount(source string) (string, string, error) {
 	// Ensure any symlinks are resolved.
 	sourcePath, err := filepath.EvalSymlinks(source)
 	if err != nil {
 		return "", "", err
 	}
 
 	mountinfos, err := mount.GetMounts()
 	if err != nil {
 		return "", "", err
 	}
 
 	for path := sourcePath; ; {
 		if mountinfo := getMountInfo(mountinfos, path); mountinfo != nil {
 			return path, mountinfo.Optional, nil
 		}
 
 		// filepath.Dir reaches a fixed point at "/" for absolute paths and
 		// at "." for relative ones. Stopping at the fixed point ends the
 		// walk in both cases; waiting for "/" alone would never terminate
 		// on a relative path.
 		parent := filepath.Dir(path)
 		if parent == path {
 			break
 		}
 		path = parent
 	}
 
 	// If we are here, we did not find parent mount. Something is wrong.
 	return "", "", fmt.Errorf("Could not find source mount of %s", source)
 }
 
 // Ensure mount point on which path is mounted, is shared.
 func ensureShared(path string) error {
 	sharedMount := false
 
 	sourceMount, optionalOpts, err := getSourceMount(path)
 	if err != nil {
 		return err
 	}
 	// Make sure source mount point is shared.
 	optsSplit := strings.Split(optionalOpts, " ")
 	for _, opt := range optsSplit {
 		if strings.HasPrefix(opt, "shared:") {
 			sharedMount = true
 			break
 		}
 	}
 
 	if !sharedMount {
9b47b7b1
 		return fmt.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
9c4570a9
 	}
 	return nil
 }
 
 // Ensure mount point on which path is mounted, is either shared or slave.
 func ensureSharedOrSlave(path string) error {
 	sharedMount := false
 	slaveMount := false
 
 	sourceMount, optionalOpts, err := getSourceMount(path)
 	if err != nil {
 		return err
 	}
 	// Make sure source mount point is shared.
 	optsSplit := strings.Split(optionalOpts, " ")
 	for _, opt := range optsSplit {
 		if strings.HasPrefix(opt, "shared:") {
 			sharedMount = true
 			break
 		} else if strings.HasPrefix(opt, "master:") {
 			slaveMount = true
 			break
 		}
 	}
 
 	if !sharedMount && !slaveMount {
9b47b7b1
 		return fmt.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
9c4570a9
 	}
 	return nil
 }
 
c0f883fd
 // Get the set of mount flags that are set on the mount that contains the given
 // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
 // bind-mounting "with options" will not fail with user namespaces, due to
 // kernel restrictions that require user namespace mounts to preserve
 // CL_UNPRIVILEGED locked flags.
 //
 // The flags are read with statfs(2) and returned as mount option names, in
 // the fixed order of the table below. A nil slice is returned when none of
 // them is set; the statfs error is returned if the call fails.
 func getUnprivilegedMountFlags(path string) ([]string, error) {
 	var statfs unix.Statfs_t
 	if err := unix.Statfs(path, &statfs); err != nil {
 		return nil, err
 	}
 
 	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
 	// An ordered table is used instead of a map so that the resulting option
 	// list is the same on every call.
 	// NOTE(review): statfs(2) reports relatime as ST_RELATIME, whose value
 	// appears to differ from MS_RELATIME - confirm whether the relatime entry
 	// can ever match.
 	unprivilegedFlags := []struct {
 		mask uint64
 		name string
 	}{
 		{unix.MS_RDONLY, "ro"},
 		{unix.MS_NODEV, "nodev"},
 		{unix.MS_NOEXEC, "noexec"},
 		{unix.MS_NOSUID, "nosuid"},
 		{unix.MS_NOATIME, "noatime"},
 		{unix.MS_RELATIME, "relatime"},
 		{unix.MS_NODIRATIME, "nodiratime"},
 	}
 
 	var flags []string
 	for _, f := range unprivilegedFlags {
 		if uint64(statfs.Flags)&f.mask == f.mask {
 			flags = append(flags, f.name)
 		}
 	}
 
 	return flags, nil
 }
 
9c4570a9
 // mountPropagationMap translates a mount propagation mode name into the
 // corresponding flag constant of the mount package.
 var mountPropagationMap = map[string]int{
 	"shared":   mount.SHARED,
 	"rshared":  mount.RSHARED,
 	"slave":    mount.SLAVE,
 	"rslave":   mount.RSLAVE,
 	"private":  mount.PRIVATE,
 	"rprivate": mount.RPRIVATE,
 }
 
 // mountPropagationReverseMap is the inverse of mountPropagationMap: it
 // translates a propagation flag constant back into its mode name.
 var mountPropagationReverseMap = map[int]string{
 	mount.SHARED:   "shared",
 	mount.RSHARED:  "rshared",
 	mount.SLAVE:    "slave",
 	mount.RSLAVE:   "rslave",
 	mount.PRIVATE:  "private",
 	mount.RPRIVATE: "rprivate",
 }
 
5c154cfa
 // inSlice reports whether s occurs in slice. Elements are compared with ==,
 // so the match is exact and case sensitive. A nil or empty slice contains
 // nothing.
 func inSlice(slice []string, s string) bool {
 	for i := 0; i < len(slice); i++ {
 		if slice[i] == s {
 			return true
 		}
 	}
 	return false
 }
 
9c4570a9
 // setMounts merges the given mounts into the spec's mount list.
 //
 // The spec's existing (default) mounts are filtered first, then every entry
 // of mounts is appended either as a tmpfs mount or as a bind mount. While
 // doing so the root filesystem propagation is adjusted for shared and slave
 // bind mounts. Afterwards the read-only root, privileged and user-namespace
 // adjustments are applied to the resulting list.
 //
 // An error is returned when two of the given mounts share a destination,
 // when tmpfs options cannot be merged, when a propagation requirement on the
 // source is not met, or when the source's mount flags cannot be read.
 func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []container.Mount) error {
 	// Destinations claimed by the caller-supplied mounts.
 	userMounts := make(map[string]struct{})
 	for _, m := range mounts {
 		userMounts[m.Destination] = struct{}{}
 	}
 
31d30a98
 	// Copy all mounts from spec to defaultMounts, except for
 	//  - mounts overridden by a user supplied mount;
 	//  - all mounts under /dev if a user supplied /dev is present;
 	//  - /dev/shm, in case IpcMode is none.
 	// While at it, also
 	//  - set size for /dev/shm from shmsize.
9c4570a9
 	var defaultMounts []specs.Mount
 	_, mountDev := userMounts["/dev"]
 	for _, m := range s.Mounts {
31d30a98
 		if _, ok := userMounts[m.Destination]; ok {
 			// filter out mount overridden by a user supplied mount
7120976d
 			continue
 		}
31d30a98
 		if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
 			// filter out everything under /dev if /dev is user-mounted
 			continue
 		}
 
 		if m.Destination == "/dev/shm" {
 			if c.HostConfig.IpcMode.IsNone() {
 				// filter out /dev/shm for "none" IpcMode
9c4570a9
 				continue
 			}
31d30a98
 			// set size for /dev/shm mount from spec
 			sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
 			m.Options = append(m.Options, sizeOpt)
9c4570a9
 		}
31d30a98
 
 		defaultMounts = append(defaultMounts, m)
9c4570a9
 	}
 
 	s.Mounts = defaultMounts
 	for _, m := range mounts {
 		// s.Mounts grows as this loop appends, so this check also catches
 		// two entries of mounts that target the same destination.
 		for _, cm := range s.Mounts {
 			if cm.Destination == m.Destination {
ebcb7d6b
 				return duplicateMountPointError(m.Destination)
9c4570a9
 			}
 		}
 
 		if m.Source == "tmpfs" {
18768fdc
 			data := m.Data
e89b6e8c
 			parser := volume.NewParser("linux")
 			// Base options for every tmpfs mount; options from m.Data are
 			// appended and the combined list is merged below.
 			options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
756f6cef
 			if data != "" {
 				options = append(options, strings.Split(data, ",")...)
9c4570a9
 			}
 
397a6fef
 			merged, err := mount.MergeTmpfsOptions(options)
 			if err != nil {
 				return err
 			}
 
 			s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
9c4570a9
 			continue
 		}
 
 		// Everything that is not tmpfs is added as a bind mount.
 		mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
 
 		// Determine property of RootPropagation based on volume
 		// properties. If a volume is shared, then keep root propagation
 		// shared. This should work for slave and private volumes too.
 		//
 		// For slave volumes, it can be either [r]shared/[r]slave.
 		//
 		// For private volumes any root propagation value should work.
 		pFlag := mountPropagationMap[m.Propagation]
 		if pFlag == mount.SHARED || pFlag == mount.RSHARED {
 			if err := ensureShared(m.Source); err != nil {
 				return err
 			}
 			rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
 			if rootpg != mount.SHARED && rootpg != mount.RSHARED {
 				s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
 			}
 		} else if pFlag == mount.SLAVE || pFlag == mount.RSLAVE {
 			if err := ensureSharedOrSlave(m.Source); err != nil {
 				return err
 			}
 			rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
 			if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
 				s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
 			}
 		}
 
 		opts := []string{"rbind"}
 		if !m.Writable {
 			opts = append(opts, "ro")
 		}
 		// A zero pFlag means m.Propagation was not a key of
 		// mountPropagationMap; no propagation option is added then.
 		if pFlag != 0 {
 			opts = append(opts, mountPropagationReverseMap[pFlag])
 		}
 
c0f883fd
 		// If we are using user namespaces, then we must make sure that we
 		// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
 		// "mount" when we bind-mount. The reason for this is that at the point
 		// when runc sets up the root filesystem, it is already inside a user
 		// namespace, and thus cannot change any flags that are locked.
 		if daemon.configStore.RemappedRoot != "" {
 			unprivOpts, err := getUnprivilegedMountFlags(m.Source)
 			if err != nil {
 				return err
 			}
 			opts = append(opts, unprivOpts...)
 		}
 
9c4570a9
 		mt.Options = opts
 		s.Mounts = append(s.Mounts, mt)
 	}
 
 	// With a read-only root filesystem, mark the remaining default mounts
 	// read-only too, except the listed destinations and user-supplied mounts.
 	if s.Root.Readonly {
 		for i, m := range s.Mounts {
 			switch m.Destination {
5f3bd247
 			case "/proc", "/dev/pts", "/dev/mqueue", "/dev":
9c4570a9
 				continue
 			}
 			if _, ok := userMounts[m.Destination]; !ok {
5c154cfa
 				if !inSlice(m.Options, "ro") {
9c4570a9
 					s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
 				}
 			}
 		}
 	}
 
 	// Privileged containers get a writable /sys (unless the root filesystem
 	// is read-only) and no read-only or masked paths.
 	if c.HostConfig.Privileged {
 		if !s.Root.Readonly {
 			// clear readonly for /sys
 			for i := range s.Mounts {
 				if s.Mounts[i].Destination == "/sys" {
 					clearReadOnly(&s.Mounts[i])
 				}
 			}
 		}
3f81b493
 		s.Linux.ReadonlyPaths = nil
 		s.Linux.MaskedPaths = nil
9c4570a9
 	}
 
 	// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
 	// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
09cd96c5
 	if uidMap := daemon.idMappings.UIDs(); uidMap != nil || c.HostConfig.Privileged {
9c4570a9
 		for i, m := range s.Mounts {
 			if m.Type == "cgroup" {
 				clearReadOnly(&s.Mounts[i])
 			}
 		}
 	}
 
 	return nil
 }
 
 func (daemon *Daemon) populateCommonSpec(s *specs.Spec, c *container.Container) error {
 	linkedEnv, err := daemon.setupLinkedContainers(c)
 	if err != nil {
 		return err
 	}
45d85c99
 	s.Root = &specs.Root{
7a7357da
 		Path:     c.BaseFS.Path(),
9c4570a9
 		Readonly: c.HostConfig.ReadonlyRootfs,
 	}
93fbdb69
 	if err := c.SetupWorkingDirectory(daemon.idMappings.RootPair()); err != nil {
9c4570a9
 		return err
 	}
 	cwd := c.Config.WorkingDir
 	if len(cwd) == 0 {
 		cwd = "/"
 	}
 	s.Process.Args = append([]string{c.Path}, c.Args...)
ee3ac3aa
 
 	// only add the custom init if it is specified and the container is running in its
 	// own private pid namespace.  It does not make sense to add if it is running in the
 	// host namespace or another container's pid namespace where we already have an init
 	if c.HostConfig.PidMode.IsPrivate() {
 		if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
 			(c.HostConfig.Init == nil && daemon.configStore.Init) {
d7df7315
 			s.Process.Args = append([]string{"/dev/init", "--", c.Path}, c.Args...)
6a12685b
 			var path string
a18d103b
 			if daemon.configStore.InitPath == "" {
17b12887
 				path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
6a12685b
 				if err != nil {
 					return err
 				}
 			}
 			if daemon.configStore.InitPath != "" {
 				path = daemon.configStore.InitPath
 			}
ee3ac3aa
 			s.Mounts = append(s.Mounts, specs.Mount{
 				Destination: "/dev/init",
 				Type:        "bind",
 				Source:      path,
 				Options:     []string{"bind", "ro"},
 			})
 		}
 	}
9c4570a9
 	s.Process.Cwd = cwd
e9814596
 	s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
9c4570a9
 	s.Process.Terminal = c.Config.Tty
 	s.Hostname = c.FullHostname()
 
 	return nil
 }
 
02309170
 // createSpec builds the OCI runtime spec used to start container c.
 //
 // It starts from oci.DefaultSpec and then applies, in order: the common
 // settings, the cgroups path and resources, sysctls, devices, rlimits, user,
 // namespaces, capabilities, seccomp, the container's mounts, the network
 // prestart hook, and finally the AppArmor, SELinux and remaining process
 // settings. The first step that fails aborts the build and its error is
 // returned, in most cases prefixed with the name of the step.
 func (daemon *Daemon) createSpec(c *container.Container) (*specs.Spec, error) {
9c4570a9
 	s := oci.DefaultSpec()
 	if err := daemon.populateCommonSpec(&s, c); err != nil {
 		return nil, err
 	}
 
 	var cgroupsPath string
7ed3d265
 	// Built-in defaults for the cgroup parent; "system.slice" is used when
 	// the daemon manages cgroups through systemd.
 	scopePrefix := "docker"
 	parent := "/docker"
 	useSystemd := UsingSystemd(daemon.configStore)
 	if useSystemd {
 		parent = "system.slice"
 	}
 
9c4570a9
 	// A parent set on the container wins over one set on the daemon, which
 	// wins over the built-in default.
 	if c.HostConfig.CgroupParent != "" {
7ed3d265
 		parent = c.HostConfig.CgroupParent
 	} else if daemon.configStore.CgroupParent != "" {
 		parent = daemon.configStore.CgroupParent
 	}
 
 	// With systemd the path has the form "<parent>:<prefix>:<id>"; otherwise
 	// it is the filesystem-style path "<parent>/<id>".
 	if useSystemd {
 		cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
 		logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
9c4570a9
 	} else {
7ed3d265
 		cgroupsPath = filepath.Join(parent, c.ID)
9c4570a9
 	}
005506d3
 	s.Linux.CgroupsPath = cgroupsPath
9c4570a9
 
 	if err := setResources(&s, c.HostConfig.Resources); err != nil {
 		return nil, fmt.Errorf("linux runtime spec resources: %v", err)
 	}
9caf7aee
 	s.Linux.Sysctl = c.HostConfig.Sysctls
56f77d5a
 
005506d3
 	// p is the path whose parent directory is passed to initCgroupsPath
 	// below. With systemd it is rooted at the init process's "cpu" cgroup.
 	p := s.Linux.CgroupsPath
56f77d5a
 	if useSystemd {
005506d3
 		initPath, err := cgroups.GetInitCgroup("cpu")
56f77d5a
 		if err != nil {
 			return nil, err
 		}
ddae20c0
 		// Only the error of this lookup is used; its result is discarded.
 		_, err = cgroups.GetOwnCgroup("cpu")
56f77d5a
 		if err != nil {
 			return nil, err
 		}
ddae20c0
 		p = filepath.Join(initPath, s.Linux.CgroupsPath)
56f77d5a
 	}
 
 	// Clean path to guard against things like ../../../BAD
 	parentPath := filepath.Dir(p)
 	if !filepath.IsAbs(parentPath) {
 		parentPath = filepath.Clean("/" + parentPath)
 	}
 
 	if err := daemon.initCgroupsPath(parentPath); err != nil {
 		return nil, fmt.Errorf("linux init cgroups path: %v", err)
 	}
9c4570a9
 	if err := setDevices(&s, c); err != nil {
 		return nil, fmt.Errorf("linux runtime spec devices: %v", err)
 	}
ddae20c0
 	if err := daemon.setRlimits(&s, c); err != nil {
9c4570a9
 		return nil, fmt.Errorf("linux runtime spec rlimits: %v", err)
 	}
 	if err := setUser(&s, c); err != nil {
 		return nil, fmt.Errorf("linux spec user: %v", err)
 	}
 	if err := setNamespaces(daemon, &s, c); err != nil {
 		return nil, fmt.Errorf("linux spec namespaces: %v", err)
 	}
 	if err := setCapabilities(&s, c); err != nil {
 		return nil, fmt.Errorf("linux spec capabilities: %v", err)
 	}
 	if err := setSeccomp(daemon, &s, c); err != nil {
 		return nil, fmt.Errorf("linux seccomp: %v", err)
 	}
 
eaa51928
 	if err := daemon.setupContainerMountsRoot(c); err != nil {
 		return nil, err
 	}
 
9c4570a9
 	if err := daemon.setupIpcDirs(c); err != nil {
 		return nil, err
 	}
 
3716ec25
 	if err := daemon.setupSecretDir(c); err != nil {
 		return nil, err
 	}
 
9e9fc7b5
 	if err := daemon.setupConfigDir(c); err != nil {
 		return nil, err
 	}
 
6a8ea46c
 	// Collect every mount of the container into ms: volume mounts first,
 	// then IPC, tmpfs, secret and config mounts.
 	ms, err := daemon.setupMounts(c)
9c4570a9
 	if err != nil {
 		return nil, err
 	}
189f8930
 
7120976d
 	// IPC mounts are added for every IPC mode except private and empty.
 	if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
 		ms = append(ms, c.IpcMounts()...)
 	}
3716ec25
 
18768fdc
 	tmpfsMounts, err := c.TmpfsMounts()
 	if err != nil {
 		return nil, err
 	}
 	ms = append(ms, tmpfsMounts...)
3716ec25
 
eaa51928
 	secretMounts, err := c.SecretMounts()
 	if err != nil {
 		return nil, err
189f8930
 	}
eaa51928
 	ms = append(ms, secretMounts...)
857e60c2
 
eaa51928
 	configMounts, err := c.ConfigMounts()
 	if err != nil {
 		return nil, err
 	}
 	ms = append(ms, configMounts...)
9e9fc7b5
 
6a8ea46c
 	// The combined list is sorted before it is merged into the spec.
 	sort.Sort(mounts(ms))
 	if err := setMounts(daemon, &s, c, ms); err != nil {
9c4570a9
 		return nil, fmt.Errorf("linux mounts: %v", err)
 	}
 
 	// A network namespace without a path, with networking enabled, gets a
 	// prestart hook that re-executes the daemon binary (resolved through
 	// /proc/<pid>/exe) with the "libnetwork-setkey" arguments.
 	for _, ns := range s.Linux.Namespaces {
 		if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
 			target, err := os.Readlink(filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe"))
 			if err != nil {
 				return nil, err
 			}
 
005506d3
 			s.Hooks = &specs.Hooks{
9c4570a9
 				Prestart: []specs.Hook{{
 					Path: target, // FIXME: cross-platform
 					Args: []string{"libnetwork-setkey", c.ID, daemon.netController.ID()},
 				}},
 			}
 		}
 	}
 
 	if apparmor.IsEnabled() {
567ef8e7
 		// Profile precedence: the container's own profile, then "unconfined"
 		// for privileged containers, then "docker-default".
 		var appArmorProfile string
 		if c.AppArmorProfile != "" {
9c4570a9
 			appArmorProfile = c.AppArmorProfile
3f81b493
 		} else if c.HostConfig.Privileged {
 			appArmorProfile = "unconfined"
567ef8e7
 		} else {
 			appArmorProfile = "docker-default"
 		}
 
 		if appArmorProfile == "docker-default" {
 			// Unattended upgrades and other fun services can unload AppArmor
 			// profiles inadvertently. Since we cannot store our profile in
 			// /etc/apparmor.d, nor can we practically add other ways of
 			// telling the system to keep our profile loaded, in order to make
 			// sure that we keep the default profile enabled we dynamically
 			// reload it if necessary.
 			if err := ensureDefaultAppArmorProfile(); err != nil {
 				return nil, err
 			}
9c4570a9
 		}
567ef8e7
 
9c4570a9
 		s.Process.ApparmorProfile = appArmorProfile
 	}
 	s.Process.SelinuxLabel = c.GetProcessLabel()
 	s.Process.NoNewPrivileges = c.NoNewPrivileges
45d85c99
 	// The spec points at the container's HostConfig field rather than a copy.
 	s.Process.OOMScoreAdj = &c.HostConfig.OomScoreAdj
e0f98c69
 	s.Linux.MountLabel = c.MountLabel
9c4570a9
 
2f5f0af3
 	return &s, nil
9c4570a9
 }
 
 // clearReadOnly removes every "ro" entry from the options of m, keeping the
 // other options in their original order. The options become nil when
 // nothing is left.
 func clearReadOnly(m *specs.Mount) {
 	var kept []string
 	for i := range m.Options {
 		if m.Options[i] == "ro" {
 			continue
 		}
 		kept = append(kept, m.Options[i])
 	}
 	m.Options = kept
 }
7d705a73
 
 // mergeUlimits adds the daemon's default ulimits to c.Ulimits. A default is
 // added only if c does not already define a ulimit with the same name, so
 // values set on the HostConfig always take precedence. c is updated in
 // place.
 func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
 	merged := c.Ulimits
 
 	// Names that the HostConfig already defines.
 	defined := make(map[string]struct{})
 	for _, ul := range merged {
 		defined[ul.Name] = struct{}{}
 	}
 
 	for name, def := range daemon.configStore.Ulimits {
 		if _, ok := defined[name]; ok {
 			continue
 		}
 		merged = append(merged, def)
 	}
 	c.Ulimits = merged
 }