Browse code

daemon: switch to semaphore-gated WaitGroup for startup tasks

Many startup tasks have to run for each container, and thus using a
WaitGroup (which doesn't have a limit to the number of parallel tasks)
can result in Docker exceeding the NOFILE limit quite trivially. A more
optimal solution is to have a parallelism limit by using a semaphore.

In addition, several startup tasks were not parallelised previously
which resulted in very long startup times. According to my testing, 20K
dead containers resulted in ~6 minute startup times (during which time
Docker is completely unusable).

This patch fixes both issues, and the parallel startup limit factor chosen
(128 * NumCPU) is based on my own significant testing of the 20K
container case. This patch (on my machines) reduces the startup time
from 6 minutes to less than a minute (ideally this could be further
reduced by removing the need to scan all dead containers on startup --
but that's beyond the scope of this patchset).

In order to avoid the NOFILE limit problem, we also detect this
on-startup and if NOFILE < 2*128*NumCPU we will reduce the parallelism
factor to avoid hitting NOFILE limits (but also emit a warning since
this is almost certainly a mis-configuration).

Signed-off-by: Aleksa Sarai <asarai@suse.de>

Aleksa Sarai authored on 2018/12/05 01:44:45
Showing 3 changed files
... ...
@@ -67,6 +67,7 @@ import (
67 67
 	"github.com/docker/libnetwork/cluster"
68 68
 	nwconfig "github.com/docker/libnetwork/config"
69 69
 	"github.com/pkg/errors"
70
+	"golang.org/x/sync/semaphore"
70 71
 )
71 72
 
72 73
 // ContainersNamespace is the name of the namespace used for users containers
... ...
@@ -198,6 +199,7 @@ func (daemon *Daemon) NewResolveOptionsFunc() resolver.ResolveOptionsFunc {
198 198
 }
199 199
 
200 200
 func (daemon *Daemon) restore() error {
201
+	var mapLock sync.Mutex
201 202
 	containers := make(map[string]*container.Container)
202 203
 
203 204
 	logrus.Info("Loading containers: start.")
... ...
@@ -207,68 +209,99 @@ func (daemon *Daemon) restore() error {
207 207
 		return err
208 208
 	}
209 209
 
210
+	// parallelLimit is the maximum number of parallel startup jobs that we
211
+	// allow (this is the limit used for all startup semaphores). The multiplier
212
+	// (128) was chosen after some fairly significant benchmarking -- don't change
213
+	// it unless you've tested it significantly (this value is adjusted if
214
+	// RLIMIT_NOFILE is small to avoid EMFILE).
215
+	parallelLimit := adjustParallelLimit(len(dir), 128*runtime.NumCPU())
216
+
217
+	// Re-used for all parallel startup jobs.
218
+	var group sync.WaitGroup
219
+	sem := semaphore.NewWeighted(int64(parallelLimit))
220
+
210 221
 	for _, v := range dir {
211
-		id := v.Name()
212
-		container, err := daemon.load(id)
213
-		if err != nil {
214
-			logrus.Errorf("Failed to load container %v: %v", id, err)
215
-			continue
216
-		}
217
-		if !system.IsOSSupported(container.OS) {
218
-			logrus.Errorf("Failed to load container %v: %s (%q)", id, system.ErrNotSupportedOperatingSystem, container.OS)
219
-			continue
220
-		}
221
-		// Ignore the container if it does not support the current driver being used by the graph
222
-		currentDriverForContainerOS := daemon.graphDrivers[container.OS]
223
-		if (container.Driver == "" && currentDriverForContainerOS == "aufs") || container.Driver == currentDriverForContainerOS {
224
-			rwlayer, err := daemon.imageService.GetLayerByID(container.ID, container.OS)
222
+		group.Add(1)
223
+		go func(id string) {
224
+			defer group.Done()
225
+			_ = sem.Acquire(context.Background(), 1)
226
+			defer sem.Release(1)
227
+
228
+			container, err := daemon.load(id)
225 229
 			if err != nil {
226
-				logrus.Errorf("Failed to load container mount %v: %v", id, err)
227
-				continue
230
+				logrus.Errorf("Failed to load container %v: %v", id, err)
231
+				return
228 232
 			}
229
-			container.RWLayer = rwlayer
230
-			logrus.Debugf("Loaded container %v, isRunning: %v", container.ID, container.IsRunning())
233
+			if !system.IsOSSupported(container.OS) {
234
+				logrus.Errorf("Failed to load container %v: %s (%q)", id, system.ErrNotSupportedOperatingSystem, container.OS)
235
+				return
236
+			}
237
+			// Ignore the container if it does not support the current driver being used by the graph
238
+			currentDriverForContainerOS := daemon.graphDrivers[container.OS]
239
+			if (container.Driver == "" && currentDriverForContainerOS == "aufs") || container.Driver == currentDriverForContainerOS {
240
+				rwlayer, err := daemon.imageService.GetLayerByID(container.ID, container.OS)
241
+				if err != nil {
242
+					logrus.Errorf("Failed to load container mount %v: %v", id, err)
243
+					return
244
+				}
245
+				container.RWLayer = rwlayer
246
+				logrus.Debugf("Loaded container %v, isRunning: %v", container.ID, container.IsRunning())
231 247
 
232
-			containers[container.ID] = container
233
-		} else {
234
-			logrus.Debugf("Cannot load container %s because it was created with another graph driver.", container.ID)
235
-		}
248
+				mapLock.Lock()
249
+				containers[container.ID] = container
250
+				mapLock.Unlock()
251
+			} else {
252
+				logrus.Debugf("Cannot load container %s because it was created with another graph driver.", container.ID)
253
+			}
254
+		}(v.Name())
236 255
 	}
256
+	group.Wait()
237 257
 
238 258
 	removeContainers := make(map[string]*container.Container)
239 259
 	restartContainers := make(map[*container.Container]chan struct{})
240 260
 	activeSandboxes := make(map[string]interface{})
261
+
241 262
 	for id, c := range containers {
242
-		if err := daemon.registerName(c); err != nil {
243
-			logrus.Errorf("Failed to register container name %s: %s", c.ID, err)
244
-			delete(containers, id)
245
-			continue
246
-		}
247
-		if err := daemon.Register(c); err != nil {
248
-			logrus.Errorf("Failed to register container %s: %s", c.ID, err)
249
-			delete(containers, id)
250
-			continue
251
-		}
263
+		group.Add(1)
264
+		go func(c *container.Container) {
265
+			defer group.Done()
266
+			_ = sem.Acquire(context.Background(), 1)
267
+			defer sem.Release(1)
252 268
 
253
-		// The LogConfig.Type is empty if the container was created before docker 1.12 with default log driver.
254
-		// We should rewrite it to use the daemon defaults.
255
-		// Fixes https://github.com/docker/docker/issues/22536
256
-		if c.HostConfig.LogConfig.Type == "" {
257
-			if err := daemon.mergeAndVerifyLogConfig(&c.HostConfig.LogConfig); err != nil {
258
-				logrus.Errorf("Failed to verify log config for container %s: %q", c.ID, err)
259
-				continue
269
+			if err := daemon.registerName(c); err != nil {
270
+				logrus.Errorf("Failed to register container name %s: %s", c.ID, err)
271
+				mapLock.Lock()
272
+				delete(containers, id)
273
+				mapLock.Unlock()
274
+				return
260 275
 			}
261
-		}
276
+			if err := daemon.Register(c); err != nil {
277
+				logrus.Errorf("Failed to register container %s: %s", c.ID, err)
278
+				mapLock.Lock()
279
+				delete(containers, id)
280
+				mapLock.Unlock()
281
+				return
282
+			}
283
+
284
+			// The LogConfig.Type is empty if the container was created before docker 1.12 with default log driver.
285
+			// We should rewrite it to use the daemon defaults.
286
+			// Fixes https://github.com/docker/docker/issues/22536
287
+			if c.HostConfig.LogConfig.Type == "" {
288
+				if err := daemon.mergeAndVerifyLogConfig(&c.HostConfig.LogConfig); err != nil {
289
+					logrus.Errorf("Failed to verify log config for container %s: %q", c.ID, err)
290
+				}
291
+			}
292
+		}(c)
262 293
 	}
294
+	group.Wait()
263 295
 
264
-	var (
265
-		wg      sync.WaitGroup
266
-		mapLock sync.Mutex
267
-	)
268 296
 	for _, c := range containers {
269
-		wg.Add(1)
297
+		group.Add(1)
270 298
 		go func(c *container.Container) {
271
-			defer wg.Done()
299
+			defer group.Done()
300
+			_ = sem.Acquire(context.Background(), 1)
301
+			defer sem.Release(1)
302
+
272 303
 			daemon.backportMountSpec(c)
273 304
 			if err := daemon.checkpointAndSave(c); err != nil {
274 305
 				logrus.WithError(err).WithField("container", c.ID).Error("error saving backported mountspec to disk")
... ...
@@ -415,7 +448,8 @@ func (daemon *Daemon) restore() error {
415 415
 			c.Unlock()
416 416
 		}(c)
417 417
 	}
418
-	wg.Wait()
418
+	group.Wait()
419
+
419 420
 	daemon.netController, err = daemon.initNetworkController(daemon.configStore, activeSandboxes)
420 421
 	if err != nil {
421 422
 		return fmt.Errorf("Error initializing network controller: %v", err)
... ...
@@ -423,18 +457,24 @@ func (daemon *Daemon) restore() error {
423 423
 
424 424
 	// Now that all the containers are registered, register the links
425 425
 	for _, c := range containers {
426
-		if err := daemon.registerLinks(c, c.HostConfig); err != nil {
427
-			logrus.Errorf("failed to register link for container %s: %v", c.ID, err)
428
-		}
426
+		group.Add(1)
427
+		go func(c *container.Container) {
428
+			_ = sem.Acquire(context.Background(), 1)
429
+
430
+			if err := daemon.registerLinks(c, c.HostConfig); err != nil {
431
+				logrus.Errorf("failed to register link for container %s: %v", c.ID, err)
432
+			}
433
+
434
+			sem.Release(1)
435
+			group.Done()
436
+		}(c)
429 437
 	}
438
+	group.Wait()
430 439
 
431
-	group := sync.WaitGroup{}
432 440
 	for c, notifier := range restartContainers {
433 441
 		group.Add(1)
434
-
435 442
 		go func(c *container.Container, chNotify chan struct{}) {
436
-			defer group.Done()
437
-
443
+			_ = sem.Acquire(context.Background(), 1)
438 444
 			logrus.Debugf("Starting container %s", c.ID)
439 445
 
440 446
 			// ignore errors here as this is a best effort to wait for children to be
... ...
@@ -456,22 +496,27 @@ func (daemon *Daemon) restore() error {
456 456
 				logrus.Errorf("Failed to start container %s: %s", c.ID, err)
457 457
 			}
458 458
 			close(chNotify)
459
-		}(c, notifier)
460 459
 
460
+			sem.Release(1)
461
+			group.Done()
462
+		}(c, notifier)
461 463
 	}
462 464
 	group.Wait()
463 465
 
464
-	removeGroup := sync.WaitGroup{}
465 466
 	for id := range removeContainers {
466
-		removeGroup.Add(1)
467
+		group.Add(1)
467 468
 		go func(cid string) {
469
+			_ = sem.Acquire(context.Background(), 1)
470
+
468 471
 			if err := daemon.ContainerRm(cid, &types.ContainerRmConfig{ForceRemove: true, RemoveVolume: true}); err != nil {
469 472
 				logrus.Errorf("Failed to remove container %s: %s", cid, err)
470 473
 			}
471
-			removeGroup.Done()
474
+
475
+			sem.Release(1)
476
+			group.Done()
472 477
 		}(id)
473 478
 	}
474
-	removeGroup.Wait()
479
+	group.Wait()
475 480
 
476 481
 	// any containers that were started above would already have had this done,
477 482
 	// however we need to now prepare the mountpoints for the rest of the containers as well.
... ...
@@ -492,13 +537,16 @@ func (daemon *Daemon) restore() error {
492 492
 
493 493
 		group.Add(1)
494 494
 		go func(c *container.Container) {
495
-			defer group.Done()
495
+			_ = sem.Acquire(context.Background(), 1)
496
+
496 497
 			if err := daemon.prepareMountPoints(c); err != nil {
497 498
 				logrus.Error(err)
498 499
 			}
500
+
501
+			sem.Release(1)
502
+			group.Done()
499 503
 		}(c)
500 504
 	}
501
-
502 505
 	group.Wait()
503 506
 
504 507
 	logrus.Info("Loading containers: done.")
... ...
@@ -509,7 +557,18 @@ func (daemon *Daemon) restore() error {
509 509
 // RestartSwarmContainers restarts any autostart container which has a
510 510
 // swarm endpoint.
511 511
 func (daemon *Daemon) RestartSwarmContainers() {
512
-	group := sync.WaitGroup{}
512
+	ctx := context.Background()
513
+
514
+	// parallelLimit is the maximum number of parallel startup jobs that we
515
+	// allow (this is the limit used for all startup semaphores). The multiplier
516
+	// (128) was chosen after some fairly significant benchmarking -- don't change
517
+	// it unless you've tested it significantly (this value is adjusted if
518
+	// RLIMIT_NOFILE is small to avoid EMFILE).
519
+	parallelLimit := adjustParallelLimit(len(daemon.List()), 128*runtime.NumCPU())
520
+
521
+	var group sync.WaitGroup
522
+	sem := semaphore.NewWeighted(int64(parallelLimit))
523
+
513 524
 	for _, c := range daemon.List() {
514 525
 		if !c.IsRunning() && !c.IsPaused() {
515 526
 			// Autostart all the containers which has a
... ...
@@ -518,14 +577,21 @@ func (daemon *Daemon) RestartSwarmContainers() {
518 518
 			if daemon.configStore.AutoRestart && c.ShouldRestart() && c.NetworkSettings.HasSwarmEndpoint && c.HasBeenStartedBefore {
519 519
 				group.Add(1)
520 520
 				go func(c *container.Container) {
521
-					defer group.Done()
521
+					if err := sem.Acquire(ctx, 1); err != nil {
522
+						// ctx is done.
523
+						group.Done()
524
+						return
525
+					}
526
+
522 527
 					if err := daemon.containerStart(c, "", "", true); err != nil {
523 528
 						logrus.Error(err)
524 529
 					}
530
+
531
+					sem.Release(1)
532
+					group.Done()
525 533
 				}(c)
526 534
 			}
527 535
 		}
528
-
529 536
 	}
530 537
 	group.Wait()
531 538
 }
... ...
@@ -257,6 +257,41 @@ func getBlkioThrottleDevices(devs []*blkiodev.ThrottleDevice) ([]specs.LinuxThro
257 257
 	return throttleDevices, nil
258 258
 }
259 259
 
260
+// adjustParallelLimit takes a number of objects and a proposed limit and
261
+// figures out if it's reasonable (and adjusts it accordingly). This is only
262
+// used for daemon startup, which does a lot of parallel loading of containers
263
+// (and if we exceed RLIMIT_NOFILE then we're in trouble).
264
+func adjustParallelLimit(n int, limit int) int {
265
+	// Rule-of-thumb overhead factor (how many files will each goroutine open
266
+	// simultaneously). Yes, this is ugly but to be frank this whole thing is
267
+	// ugly.
268
+	const overhead = 2
269
+
270
+	// On Linux, we need to ensure that parallelStartupJobs doesn't cause us to
271
+	// exceed RLIMIT_NOFILE. If parallelStartupJobs is too large, we reduce it
272
+	// and give a warning (since in theory the user should increase their
273
+	// ulimits to the largest possible value for dockerd).
274
+	var rlim unix.Rlimit
275
+	if err := unix.Getrlimit(unix.RLIMIT_NOFILE, &rlim); err != nil {
276
+		logrus.Warnf("Couldn't find dockerd's RLIMIT_NOFILE to double-check startup parallelism factor: %v", err)
277
+		return limit
278
+	}
279
+	softRlimit := int(rlim.Cur)
280
+
281
+	// Much fewer containers than RLIMIT_NOFILE. No need to adjust anything.
282
+	if softRlimit > overhead*n {
283
+		return limit
284
+	}
285
+
286
+	// RLIMIT_NOFILE big enough, no need to adjust anything.
287
+	if softRlimit > overhead*limit {
288
+		return limit
289
+	}
290
+
291
+	logrus.Warnf("Found dockerd's open file ulimit (%v) is far too small -- consider increasing it significantly (at least %v)", softRlimit, overhead*limit)
292
+	return softRlimit / overhead
293
+}
294
+
260 295
 func checkKernel() error {
261 296
 	// Check for unsupported kernel versions
262 297
 	// FIXME: it would be cleaner to not test for specific versions, but rather
... ...
@@ -40,6 +40,11 @@ const (
40 40
 	windowsMaxCPUPercent = 100
41 41
 )
42 42
 
43
+// Windows doesn't really have rlimits.
44
+func adjustParallelLimit(n int, limit int) int {
45
+	return limit
46
+}
47
+
43 48
 // Windows has no concept of an execution state directory. So use config.Root here.
44 49
 func getPluginExecRoot(root string) string {
45 50
 	return filepath.Join(root, "plugins")