
cluster: Allow reentrant calls to methods during shutdown

The agent sometimes calls into libnetwork code that in turn calls
(*Cluster).IsAgent and (*Cluster).IsManager. These can cause the
node shutdown process to time out, since they wait for a lock that is
held by Cleanup.

It turns out c.mu doesn't need to be held while calling Stop. Holding
controlMutex is sufficient. Also, (*nodeRunner).Stop must release
nodeRunner's mu during the node shutdown process, otherwise the same
call into Cluster would be blocked on this lock instead.

Signed-off-by: Aaron Lehmann <aaron.lehmann@docker.com>

Aaron Lehmann authored on 2017/04/08 10:27:35
Showing 3 changed files
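
To make the locking issue concrete, here is a minimal, self-contained sketch of the pattern this patch moves to. The names (cluster, isAgent, cleanup) are simplified stand-ins rather than the real moby types; the point is only the lock choreography: read state under the mutex, release it before the blocking stop work, and re-take it afterwards to clear the field, so callbacks from the shutting-down component can still acquire the lock.

package main

import (
	"fmt"
	"sync"
	"time"
)

// cluster is a simplified stand-in for Cluster; mu plays the role of c.mu.
type cluster struct {
	mu      sync.Mutex
	running bool
}

// isAgent stands in for (*Cluster).IsAgent / IsManager: a query that needs
// the mutex and may be reached from the shutdown path via libnetwork.
func (c *cluster) isAgent() bool {
	c.mu.Lock()
	defer c.mu.Unlock()
	return c.running
}

// cleanup mirrors the fixed locking order in (*Cluster).Cleanup: the mutex is
// held only to read and to update state, never across the blocking stop work.
func (c *cluster) cleanup() {
	c.mu.Lock()
	wasRunning := c.running
	c.mu.Unlock() // released before the blocking call, as in the patch

	if wasRunning {
		// Stand-in for node.Stop(): shutdown work that calls back into the
		// cluster. If cleanup still held c.mu here, isAgent would block and
		// the shutdown would time out -- the bug described above.
		done := make(chan struct{})
		go func() {
			fmt.Println("reentrant call during shutdown, isAgent =", c.isAgent())
			close(done)
		}()
		select {
		case <-done:
		case <-time.After(time.Second):
			fmt.Println("shutdown timed out waiting for reentrant call")
		}
	}

	c.mu.Lock()
	c.running = false
	c.mu.Unlock()
}

func main() {
	c := &cluster{running: true}
	c.cleanup()
	fmt.Println("shutdown complete, isAgent =", c.isAgent())
}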
... ...
@@ -334,8 +334,9 @@ func (c *Cluster) Cleanup() {
 		c.mu.Unlock()
 		return
 	}
-	defer c.mu.Unlock()
 	state := c.currentNodeState()
+	c.mu.Unlock()
+
 	if state.IsActiveManager() {
 		active, reachable, unreachable, err := managerStats(state.controlClient, state.NodeID())
 		if err == nil {
... ...
@@ -345,11 +346,15 @@ func (c *Cluster) Cleanup() {
 			}
 		}
 	}
+
 	if err := node.Stop(); err != nil {
 		logrus.Errorf("failed to shut down cluster node: %v", err)
 		signal.DumpStacks("")
 	}
+
+	c.mu.Lock()
 	c.nr = nil
+	c.mu.Unlock()
 }
 
 func managerStats(client swarmapi.ControlClient, currentNodeID string) (current bool, reachable int, unreachable int, err error) {
... ...
@@ -210,11 +210,10 @@ func (n *nodeRunner) Stop() error {
 	n.stopping = true
 	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
 	defer cancel()
+	n.mu.Unlock()
 	if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
-		n.mu.Unlock()
 		return err
 	}
-	n.mu.Unlock()
 	<-n.done
 	return nil
 }
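
The same idea, one level down: a rough, self-contained sketch of the flag-then-unlock shape used by (*nodeRunner).Stop above. The runner type and its fields here are simplified stand-ins; the real method also cancels reconnection and talks to the swarm node, which is elided.

package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// runner is a simplified stand-in for nodeRunner; mu plays the role of n.mu.
type runner struct {
	mu       sync.Mutex
	stopping bool
	done     chan struct{}
}

// isStopping stands in for the read paths that take the runner's mutex and
// can be reached while the node is shutting down.
func (r *runner) isStopping() bool {
	r.mu.Lock()
	defer r.mu.Unlock()
	return r.stopping
}

// stop mirrors the patched (*nodeRunner).Stop: mark stopping under the mutex,
// then release it before the blocking shutdown so reentrant reads such as
// isStopping are not wedged behind it.
func (r *runner) stop() error {
	r.mu.Lock()
	if r.stopping {
		r.mu.Unlock()
		return nil
	}
	r.stopping = true
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	r.mu.Unlock() // released here, as in the patched code

	// Stand-in for swarmNode.Stop(ctx): blocking shutdown work that may call
	// back into isStopping; it can proceed because the mutex is free.
	if err := r.shutdown(ctx); err != nil {
		return err
	}

	<-r.done // wait for the run loop to finish, as the real Stop does
	return nil
}

func (r *runner) shutdown(ctx context.Context) error {
	fmt.Println("reentrant isStopping() during shutdown =", r.isStopping())
	select {
	case <-ctx.Done():
		return ctx.Err()
	default:
	}
	close(r.done) // in the real code, the run goroutine closes done
	return nil
}

func main() {
	r := &runner{done: make(chan struct{})}
	if err := r.stop(); err != nil {
		fmt.Println("stop failed:", err)
	}
}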
... ...
@@ -25,19 +25,20 @@ import (
 func (c *Cluster) Init(req types.InitRequest) (string, error) {
 	c.controlMutex.Lock()
 	defer c.controlMutex.Unlock()
-	c.mu.Lock()
 	if c.nr != nil {
 		if req.ForceNewCluster {
+			// Take c.mu temporarily to wait for presently running
+			// API handlers to finish before shutting down the node.
+			c.mu.Lock()
+			c.mu.Unlock()
+
 			if err := c.nr.Stop(); err != nil {
-				c.mu.Unlock()
 				return "", err
 			}
 		} else {
-			c.mu.Unlock()
 			return "", errSwarmExists
 		}
 	}
-	c.mu.Unlock()
 
 	if err := validateAndSanitizeInitRequest(&req); err != nil {
 		return "", apierrors.NewBadRequestError(err)
... ...
@@ -325,9 +326,10 @@ func (c *Cluster) Leave(force bool) error {
 
 	state := c.currentNodeState()
 
+	c.mu.Unlock()
+
 	if errors.Cause(state.err) == errSwarmLocked && !force {
 		// leave a locked swarm without --force is not allowed
-		c.mu.Unlock()
 		return errors.New("Swarm is encrypted and locked. Please unlock it first or use `--force` to ignore this message.")
 	}
 
... ...
@@ -339,7 +341,6 @@ func (c *Cluster) Leave(force bool) error {
 				if active && removingManagerCausesLossOfQuorum(reachable, unreachable) {
 					if isLastManager(reachable, unreachable) {
 						msg += "Removing the last manager erases all current state of the swarm. Use `--force` to ignore this message. "
-						c.mu.Unlock()
 						return errors.New(msg)
 					}
 					msg += fmt.Sprintf("Removing this node leaves %v managers out of %v. Without a Raft quorum your swarm will be inaccessible. ", reachable-1, reachable+unreachable)
... ...
@@ -350,18 +351,19 @@ func (c *Cluster) Leave(force bool) error {
 		}
 
 		msg += "The only way to restore a swarm that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to suppress this message."
-		c.mu.Unlock()
 		return errors.New(msg)
 	}
 	// release readers in here
 	if err := nr.Stop(); err != nil {
 		logrus.Errorf("failed to shut down cluster node: %v", err)
 		signal.DumpStacks("")
-		c.mu.Unlock()
 		return err
 	}
+
+	c.mu.Lock()
 	c.nr = nil
 	c.mu.Unlock()
+
 	if nodeID := state.NodeID(); nodeID != "" {
 		nodeContainers, err := c.listContainerForNode(nodeID)
 		if err != nil {