// +build linux package overlay2 import ( "bufio" "errors" "fmt" "io" "io/ioutil" "os" "os/exec" "path" "path/filepath" "strconv" "strings" "sync" "github.com/Sirupsen/logrus" "github.com/docker/docker/daemon/graphdriver" "github.com/docker/docker/daemon/graphdriver/overlayutils" "github.com/docker/docker/daemon/graphdriver/quota" "github.com/docker/docker/pkg/archive" "github.com/docker/docker/pkg/chrootarchive" "github.com/docker/docker/pkg/directory" "github.com/docker/docker/pkg/fsutils" "github.com/docker/docker/pkg/idtools" "github.com/docker/docker/pkg/locker" "github.com/docker/docker/pkg/mount" "github.com/docker/docker/pkg/parsers" "github.com/docker/docker/pkg/parsers/kernel" "github.com/docker/docker/pkg/system" units "github.com/docker/go-units" "github.com/opencontainers/selinux/go-selinux/label" "golang.org/x/sys/unix" ) var ( // untar defines the untar method untar = chrootarchive.UntarUncompressed ) // This backend uses the overlay union filesystem for containers // with diff directories for each layer. // This version of the overlay driver requires at least kernel // 4.0.0 in order to support mounting multiple diff directories. // Each container/image has at least a "diff" directory and "link" file. // If there is also a "lower" file when there are diff layers // below as well as "merged" and "work" directories. The "diff" directory // has the upper layer of the overlay and is used to capture any // changes to the layer. The "lower" file contains all the lower layer // mounts separated by ":" and ordered from uppermost to lowermost // layers. The overlay itself is mounted in the "merged" directory, // and the "work" dir is needed for overlay to work. // The "link" file for each layer contains a unique string for the layer. // Under the "l" directory at the root there will be a symbolic link // with that unique string pointing the "diff" directory for the layer. // The symbolic links are used to reference lower layers in the "lower" // file and on mount. The links are used to shorten the total length // of a layer reference without requiring changes to the layer identifier // or root directory. Mounts are always done relative to root and // referencing the symbolic links in order to ensure the number of // lower directories can fit in a single page for making the mount // syscall. A hard upper limit of 128 lower layers is enforced to ensure // that mounts do not fail due to length. const ( driverName = "overlay2" linkDir = "l" lowerFile = "lower" maxDepth = 128 // idLength represents the number of random characters // which can be used to create the unique link identifer // for every layer. If this value is too long then the // page size limit for the mount command may be exceeded. // The idLength should be selected such that following equation // is true (512 is a buffer for label metadata). // ((idLength + len(linkDir) + 1) * maxDepth) <= (pageSize - 512) idLength = 26 ) type overlayOptions struct { overrideKernelCheck bool quota quota.Quota } // Driver contains information about the home directory and the list of active mounts that are created using this driver. type Driver struct { home string uidMaps []idtools.IDMap gidMaps []idtools.IDMap ctr *graphdriver.RefCounter quotaCtl *quota.Control options overlayOptions naiveDiff graphdriver.DiffDriver supportsDType bool locker *locker.Locker } var ( backingFs = "" projectQuotaSupported = false useNaiveDiffLock sync.Once useNaiveDiffOnly bool ) func init() { graphdriver.Register(driverName, Init) } // Init returns the a native diff driver for overlay filesystem. // If overlay filesystem is not supported on the host, graphdriver.ErrNotSupported is returned as error. // If an overlay filesystem is not supported over an existing filesystem then error graphdriver.ErrIncompatibleFS is returned. func Init(home string, options []string, uidMaps, gidMaps []idtools.IDMap) (graphdriver.Driver, error) { opts, err := parseOptions(options) if err != nil { return nil, err } if err := supportsOverlay(); err != nil { return nil, graphdriver.ErrNotSupported } // require kernel 4.0.0 to ensure multiple lower dirs are supported v, err := kernel.GetKernelVersion() if err != nil { return nil, err } if kernel.CompareKernelVersion(*v, kernel.VersionInfo{Kernel: 4, Major: 0, Minor: 0}) < 0 { if !opts.overrideKernelCheck { return nil, graphdriver.ErrNotSupported } logrus.Warn("Using pre-4.0.0 kernel for overlay2, mount failures may require kernel update") } fsMagic, err := graphdriver.GetFSMagic(home) if err != nil { return nil, err } if fsName, ok := graphdriver.FsNames[fsMagic]; ok { backingFs = fsName } // check if they are running over btrfs, aufs, zfs, overlay, or ecryptfs switch fsMagic { case graphdriver.FsMagicAufs, graphdriver.FsMagicZfs, graphdriver.FsMagicOverlay, graphdriver.FsMagicEcryptfs: logrus.Errorf("'overlay2' is not supported over %s", backingFs) return nil, graphdriver.ErrIncompatibleFS case graphdriver.FsMagicBtrfs: // Support for OverlayFS on BTRFS was added in kernel 4.7 // See https://btrfs.wiki.kernel.org/index.php/Changelog if kernel.CompareKernelVersion(*v, kernel.VersionInfo{Kernel: 4, Major: 7, Minor: 0}) < 0 { if !opts.overrideKernelCheck { logrus.Errorf("'overlay2' requires kernel 4.7 to use on %s", backingFs) return nil, graphdriver.ErrIncompatibleFS } logrus.Warn("Using pre-4.7.0 kernel for overlay2 on btrfs, may require kernel update") } } rootUID, rootGID, err := idtools.GetRootUIDGID(uidMaps, gidMaps) if err != nil { return nil, err } // Create the driver home dir if err := idtools.MkdirAllAs(path.Join(home, linkDir), 0700, rootUID, rootGID); err != nil && !os.IsExist(err) { return nil, err } if err := mount.MakePrivate(home); err != nil { return nil, err } supportsDType, err := fsutils.SupportsDType(home) if err != nil { return nil, err } if !supportsDType { // not a fatal error until v17.12 (#27443) logrus.Warn(overlayutils.ErrDTypeNotSupported("overlay2", backingFs)) } d := &Driver{ home: home, uidMaps: uidMaps, gidMaps: gidMaps, ctr: graphdriver.NewRefCounter(graphdriver.NewFsChecker(graphdriver.FsMagicOverlay)), supportsDType: supportsDType, locker: locker.New(), options: *opts, } d.naiveDiff = graphdriver.NewNaiveDiffDriver(d, uidMaps, gidMaps) if backingFs == "xfs" { // Try to enable project quota support over xfs. if d.quotaCtl, err = quota.NewControl(home); err == nil { projectQuotaSupported = true } else if opts.quota.Size > 0 { return nil, fmt.Errorf("Storage option overlay2.size not supported. Filesystem does not support Project Quota: %v", err) } } else if opts.quota.Size > 0 { // if xfs is not the backing fs then error out if the storage-opt overlay2.size is used. return nil, fmt.Errorf("Storage Option overlay2.size only supported for backingFS XFS. Found %v", backingFs) } logrus.Debugf("backingFs=%s, projectQuotaSupported=%v", backingFs, projectQuotaSupported) return d, nil } func parseOptions(options []string) (*overlayOptions, error) { o := &overlayOptions{} for _, option := range options { key, val, err := parsers.ParseKeyValueOpt(option) if err != nil { return nil, err } key = strings.ToLower(key) switch key { case "overlay2.override_kernel_check": o.overrideKernelCheck, err = strconv.ParseBool(val) if err != nil { return nil, err } case "overlay2.size": size, err := units.RAMInBytes(val) if err != nil { return nil, err } o.quota.Size = uint64(size) default: return nil, fmt.Errorf("overlay2: unknown option %s", key) } } return o, nil } func supportsOverlay() error { // We can try to modprobe overlay first before looking at // proc/filesystems for when overlay is supported exec.Command("modprobe", "overlay").Run() f, err := os.Open("/proc/filesystems") if err != nil { return err } defer f.Close() s := bufio.NewScanner(f) for s.Scan() { if s.Text() == "nodev\toverlay" { return nil } } logrus.Error("'overlay' not found as a supported filesystem on this host. Please ensure kernel is new enough and has overlay support loaded.") return graphdriver.ErrNotSupported } func useNaiveDiff(home string) bool { useNaiveDiffLock.Do(func() { if err := hasOpaqueCopyUpBug(home); err != nil { logrus.Warnf("Not using native diff for overlay2: %v", err) useNaiveDiffOnly = true } }) return useNaiveDiffOnly } func (d *Driver) String() string { return driverName } // Status returns current driver information in a two dimensional string array. // Output contains "Backing Filesystem" used in this implementation. func (d *Driver) Status() [][2]string { return [][2]string{ {"Backing Filesystem", backingFs}, {"Supports d_type", strconv.FormatBool(d.supportsDType)}, {"Native Overlay Diff", strconv.FormatBool(!useNaiveDiff(d.home))}, } } // GetMetadata returns meta data about the overlay driver such as // LowerDir, UpperDir, WorkDir and MergeDir used to store data. func (d *Driver) GetMetadata(id string) (map[string]string, error) { dir := d.dir(id) if _, err := os.Stat(dir); err != nil { return nil, err } metadata := map[string]string{ "WorkDir": path.Join(dir, "work"), "MergedDir": path.Join(dir, "merged"), "UpperDir": path.Join(dir, "diff"), } lowerDirs, err := d.getLowerDirs(id) if err != nil { return nil, err } if len(lowerDirs) > 0 { metadata["LowerDir"] = strings.Join(lowerDirs, ":") } return metadata, nil } // Cleanup any state created by overlay which should be cleaned when daemon // is being shutdown. For now, we just have to unmount the bind mounted // we had created. func (d *Driver) Cleanup() error { return mount.Unmount(d.home) } // CreateReadWrite creates a layer that is writable for use as a container // file system. func (d *Driver) CreateReadWrite(id, parent string, opts *graphdriver.CreateOpts) error { if opts != nil && len(opts.StorageOpt) != 0 && !projectQuotaSupported { return fmt.Errorf("--storage-opt is supported only for overlay over xfs with 'pquota' mount option") } if opts == nil { opts = &graphdriver.CreateOpts{ StorageOpt: map[string]string{}, } } if _, ok := opts.StorageOpt["size"]; !ok { if opts.StorageOpt == nil { opts.StorageOpt = map[string]string{} } opts.StorageOpt["size"] = strconv.FormatUint(d.options.quota.Size, 10) } return d.create(id, parent, opts) } // Create is used to create the upper, lower, and merge directories required for overlay fs for a given id. // The parent filesystem is used to configure these directories for the overlay. func (d *Driver) Create(id, parent string, opts *graphdriver.CreateOpts) (retErr error) { if opts != nil && len(opts.StorageOpt) != 0 { if _, ok := opts.StorageOpt["size"]; ok { return fmt.Errorf("--storage-opt size is only supported for ReadWrite Layers") } } return d.create(id, parent, opts) } func (d *Driver) create(id, parent string, opts *graphdriver.CreateOpts) (retErr error) { dir := d.dir(id) rootUID, rootGID, err := idtools.GetRootUIDGID(d.uidMaps, d.gidMaps) if err != nil { return err } if err := idtools.MkdirAllAs(path.Dir(dir), 0700, rootUID, rootGID); err != nil { return err } if err := idtools.MkdirAs(dir, 0700, rootUID, rootGID); err != nil { return err } defer func() { // Clean up on failure if retErr != nil { os.RemoveAll(dir) } }() if opts != nil && len(opts.StorageOpt) > 0 { driver := &Driver{} if err := d.parseStorageOpt(opts.StorageOpt, driver); err != nil { return err } if driver.options.quota.Size > 0 { // Set container disk quota limit if err := d.quotaCtl.SetQuota(dir, driver.options.quota); err != nil { return err } } } if err := idtools.MkdirAs(path.Join(dir, "diff"), 0755, rootUID, rootGID); err != nil { return err } lid := generateID(idLength) if err := os.Symlink(path.Join("..", id, "diff"), path.Join(d.home, linkDir, lid)); err != nil { return err } // Write link id to link file if err := ioutil.WriteFile(path.Join(dir, "link"), []byte(lid), 0644); err != nil { return err } // if no parent directory, done if parent == "" { return nil } if err := idtools.MkdirAs(path.Join(dir, "work"), 0700, rootUID, rootGID); err != nil { return err } if err := idtools.MkdirAs(path.Join(dir, "merged"), 0700, rootUID, rootGID); err != nil { return err } lower, err := d.getLower(parent) if err != nil { return err } if lower != "" { if err := ioutil.WriteFile(path.Join(dir, lowerFile), []byte(lower), 0666); err != nil { return err } } return nil } // Parse overlay storage options func (d *Driver) parseStorageOpt(storageOpt map[string]string, driver *Driver) error { // Read size to set the disk project quota per container for key, val := range storageOpt { key := strings.ToLower(key) switch key { case "size": size, err := units.RAMInBytes(val) if err != nil { return err } driver.options.quota.Size = uint64(size) default: return fmt.Errorf("Unknown option %s", key) } } return nil } func (d *Driver) getLower(parent string) (string, error) { parentDir := d.dir(parent) // Ensure parent exists if _, err := os.Lstat(parentDir); err != nil { return "", err } // Read Parent link fileA parentLink, err := ioutil.ReadFile(path.Join(parentDir, "link")) if err != nil { return "", err } lowers := []string{path.Join(linkDir, string(parentLink))} parentLower, err := ioutil.ReadFile(path.Join(parentDir, lowerFile)) if err == nil { parentLowers := strings.Split(string(parentLower), ":") lowers = append(lowers, parentLowers...) } if len(lowers) > maxDepth { return "", errors.New("max depth exceeded") } return strings.Join(lowers, ":"), nil } func (d *Driver) dir(id string) string { return path.Join(d.home, id) } func (d *Driver) getLowerDirs(id string) ([]string, error) { var lowersArray []string lowers, err := ioutil.ReadFile(path.Join(d.dir(id), lowerFile)) if err == nil { for _, s := range strings.Split(string(lowers), ":") { lp, err := os.Readlink(path.Join(d.home, s)) if err != nil { return nil, err } lowersArray = append(lowersArray, path.Clean(path.Join(d.home, linkDir, lp))) } } else if !os.IsNotExist(err) { return nil, err } return lowersArray, nil } // Remove cleans the directories that are created for this id. func (d *Driver) Remove(id string) error { d.locker.Lock(id) defer d.locker.Unlock(id) dir := d.dir(id) lid, err := ioutil.ReadFile(path.Join(dir, "link")) if err == nil { if err := os.RemoveAll(path.Join(d.home, linkDir, string(lid))); err != nil { logrus.Debugf("Failed to remove link: %v", err) } } if err := system.EnsureRemoveAll(dir); err != nil && !os.IsNotExist(err) { return err } return nil } // Get creates and mounts the required file system for the given id and returns the mount path. func (d *Driver) Get(id string, mountLabel string) (s string, err error) { d.locker.Lock(id) defer d.locker.Unlock(id) dir := d.dir(id) if _, err := os.Stat(dir); err != nil { return "", err } diffDir := path.Join(dir, "diff") lowers, err := ioutil.ReadFile(path.Join(dir, lowerFile)) if err != nil { // If no lower, just return diff directory if os.IsNotExist(err) { return diffDir, nil } return "", err } mergedDir := path.Join(dir, "merged") if count := d.ctr.Increment(mergedDir); count > 1 { return mergedDir, nil } defer func() { if err != nil { if c := d.ctr.Decrement(mergedDir); c <= 0 { unix.Unmount(mergedDir, 0) } } }() workDir := path.Join(dir, "work") splitLowers := strings.Split(string(lowers), ":") absLowers := make([]string, len(splitLowers)) for i, s := range splitLowers { absLowers[i] = path.Join(d.home, s) } opts := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", strings.Join(absLowers, ":"), path.Join(dir, "diff"), path.Join(dir, "work")) mountData := label.FormatMountLabel(opts, mountLabel) mount := unix.Mount mountTarget := mergedDir pageSize := unix.Getpagesize() // Go can return a larger page size than supported by the system // as of go 1.7. This will be fixed in 1.8 and this block can be // removed when building with 1.8. // See https://github.com/golang/go/commit/1b9499b06989d2831e5b156161d6c07642926ee1 // See https://github.com/docker/docker/issues/27384 if pageSize > 4096 { pageSize = 4096 } // Use relative paths and mountFrom when the mount data has exceeded // the page size. The mount syscall fails if the mount data cannot // fit within a page and relative links make the mount data much // smaller at the expense of requiring a fork exec to chroot. if len(mountData) > pageSize { opts = fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", string(lowers), path.Join(id, "diff"), path.Join(id, "work")) mountData = label.FormatMountLabel(opts, mountLabel) if len(mountData) > pageSize { return "", fmt.Errorf("cannot mount layer, mount label too large %d", len(mountData)) } mount = func(source string, target string, mType string, flags uintptr, label string) error { return mountFrom(d.home, source, target, mType, flags, label) } mountTarget = path.Join(id, "merged") } if err := mount("overlay", mountTarget, "overlay", 0, mountData); err != nil { return "", fmt.Errorf("error creating overlay mount to %s: %v", mergedDir, err) } // chown "workdir/work" to the remapped root UID/GID. Overlay fs inside a // user namespace requires this to move a directory from lower to upper. rootUID, rootGID, err := idtools.GetRootUIDGID(d.uidMaps, d.gidMaps) if err != nil { return "", err } if err := os.Chown(path.Join(workDir, "work"), rootUID, rootGID); err != nil { return "", err } return mergedDir, nil } // Put unmounts the mount path created for the give id. func (d *Driver) Put(id string) error { d.locker.Lock(id) defer d.locker.Unlock(id) dir := d.dir(id) _, err := ioutil.ReadFile(path.Join(dir, lowerFile)) if err != nil { // If no lower, no mount happened and just return directly if os.IsNotExist(err) { return nil } return err } mountpoint := path.Join(dir, "merged") if count := d.ctr.Decrement(mountpoint); count > 0 { return nil } if err := unix.Unmount(mountpoint, unix.MNT_DETACH); err != nil { logrus.Debugf("Failed to unmount %s overlay: %s - %v", id, mountpoint, err) } return nil } // Exists checks to see if the id is already mounted. func (d *Driver) Exists(id string) bool { _, err := os.Stat(d.dir(id)) return err == nil } // isParent returns if the passed in parent is the direct parent of the passed in layer func (d *Driver) isParent(id, parent string) bool { lowers, err := d.getLowerDirs(id) if err != nil { return false } if parent == "" && len(lowers) > 0 { return false } parentDir := d.dir(parent) var ld string if len(lowers) > 0 { ld = filepath.Dir(lowers[0]) } if ld == "" && parent == "" { return true } return ld == parentDir } // ApplyDiff applies the new layer into a root func (d *Driver) ApplyDiff(id string, parent string, diff io.Reader) (size int64, err error) { if !d.isParent(id, parent) { return d.naiveDiff.ApplyDiff(id, parent, diff) } applyDir := d.getDiffPath(id) logrus.Debugf("Applying tar in %s", applyDir) // Overlay doesn't need the parent id to apply the diff if err := untar(diff, applyDir, &archive.TarOptions{ UIDMaps: d.uidMaps, GIDMaps: d.gidMaps, WhiteoutFormat: archive.OverlayWhiteoutFormat, }); err != nil { return 0, err } return directory.Size(applyDir) } func (d *Driver) getDiffPath(id string) string { dir := d.dir(id) return path.Join(dir, "diff") } // DiffSize calculates the changes between the specified id // and its parent and returns the size in bytes of the changes // relative to its base filesystem directory. func (d *Driver) DiffSize(id, parent string) (size int64, err error) { if useNaiveDiff(d.home) || !d.isParent(id, parent) { return d.naiveDiff.DiffSize(id, parent) } return directory.Size(d.getDiffPath(id)) } // Diff produces an archive of the changes between the specified // layer and its parent layer which may be "". func (d *Driver) Diff(id, parent string) (io.ReadCloser, error) { if useNaiveDiff(d.home) || !d.isParent(id, parent) { return d.naiveDiff.Diff(id, parent) } diffPath := d.getDiffPath(id) logrus.Debugf("Tar with options on %s", diffPath) return archive.TarWithOptions(diffPath, &archive.TarOptions{ Compression: archive.Uncompressed, UIDMaps: d.uidMaps, GIDMaps: d.gidMaps, WhiteoutFormat: archive.OverlayWhiteoutFormat, }) } // Changes produces a list of changes between the specified layer // and its parent layer. If parent is "", then all changes will be ADD changes. func (d *Driver) Changes(id, parent string) ([]archive.Change, error) { if useNaiveDiff(d.home) || !d.isParent(id, parent) { return d.naiveDiff.Changes(id, parent) } // Overlay doesn't have snapshots, so we need to get changes from all parent // layers. diffPath := d.getDiffPath(id) layers, err := d.getLowerDirs(id) if err != nil { return nil, err } return archive.OverlayChanges(layers, diffPath) }