
cluster: Allow reentrant calls to methods during shutdown

The agent sometimes calls into libnetwork code that in turn calls
(*Cluster).IsAgent and (*Cluster).IsManager. These can cause the
node shutdown process to time out, since they wait for a lock that is
held by Cleanup.

It turns out c.mu doesn't need to be held while calling Stop. Holding
controlMutex is sufficient. Also, (*nodeRunner).Stop must release
nodeRunner's mu during the node shutdown process, otherwise the same
call into Cluster would be blocked on this lock instead.

Signed-off-by: Aaron Lehmann <aaron.lehmann@docker.com>

Aaron Lehmann authored on 2017/04/08 10:27:35
Showing 3 changed files
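
To make the locking issue concrete, here is a minimal, self-contained sketch of the pattern this patch moves to. The names (cluster, isAgent, cleanup) are simplified stand-ins rather than the real moby types; the point is only the lock choreography: read state under the mutex, release it before the blocking stop work, and re-take it afterwards to clear the field, so callbacks from the shutting-down component can still acquire the lock.

package main

import (
	"fmt"
	"sync"
	"time"
)

// cluster is a simplified stand-in for Cluster; mu plays the role of c.mu.
type cluster struct {
	mu      sync.Mutex
	running bool
}

// isAgent stands in for (*Cluster).IsAgent / IsManager: a query that needs
// the mutex and may be reached from the shutdown path via libnetwork.
func (c *cluster) isAgent() bool {
	c.mu.Lock()
	defer c.mu.Unlock()
	return c.running
}

// cleanup mirrors the fixed locking order in (*Cluster).Cleanup: the mutex is
// held only to read and to update state, never across the blocking stop work.
func (c *cluster) cleanup() {
	c.mu.Lock()
	wasRunning := c.running
	c.mu.Unlock() // released before the blocking call, as in the patch

	if wasRunning {
		// Stand-in for node.Stop(): shutdown work that calls back into the
		// cluster. If cleanup still held c.mu here, isAgent would block and
		// the shutdown would time out -- the bug described above.
		done := make(chan struct{})
		go func() {
			fmt.Println("reentrant call during shutdown, isAgent =", c.isAgent())
			close(done)
		}()
		select {
		case <-done:
		case <-time.After(time.Second):
			fmt.Println("shutdown timed out waiting for reentrant call")
		}
	}

	c.mu.Lock()
	c.running = false
	c.mu.Unlock()
}

func main() {
	c := &cluster{running: true}
	c.cleanup()
	fmt.Println("shutdown complete, isAgent =", c.isAgent())
}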
... ...
@@ -334,8 +334,9 @@ func (c *Cluster) Cleanup() {
 		c.mu.Unlock()
 		return
 	}
-	defer c.mu.Unlock()
 	state := c.currentNodeState()
+	c.mu.Unlock()
+
 	if state.IsActiveManager() {
 		active, reachable, unreachable, err := managerStats(state.controlClient, state.NodeID())
 		if err == nil {
... ...
@@ -345,11 +346,15 @@ func (c *Cluster) Cleanup() {
 			}
 		}
 	}
+
 	if err := node.Stop(); err != nil {
 		logrus.Errorf("failed to shut down cluster node: %v", err)
 		signal.DumpStacks("")
 	}
+
+	c.mu.Lock()
 	c.nr = nil
+	c.mu.Unlock()
 }
 
 func managerStats(client swarmapi.ControlClient, currentNodeID string) (current bool, reachable int, unreachable int, err error) {
... ...
@@ -210,11 +210,10 @@ func (n *nodeRunner) Stop() error {
 	n.stopping = true
 	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
 	defer cancel()
+	n.mu.Unlock()
 	if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
-		n.mu.Unlock()
 		return err
 	}
-	n.mu.Unlock()
 	<-n.done
 	return nil
 }
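
The same idea, one level down: a rough, self-contained sketch of the flag-then-unlock shape used by (*nodeRunner).Stop above. The runner type and its fields here are simplified stand-ins; the real method also cancels reconnection and talks to the swarm node, which is elided.

package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// runner is a simplified stand-in for nodeRunner; mu plays the role of n.mu.
type runner struct {
	mu       sync.Mutex
	stopping bool
	done     chan struct{}
}

// isStopping stands in for the read paths that take the runner's mutex and
// can be reached while the node is shutting down.
func (r *runner) isStopping() bool {
	r.mu.Lock()
	defer r.mu.Unlock()
	return r.stopping
}

// stop mirrors the patched (*nodeRunner).Stop: mark stopping under the mutex,
// then release it before the blocking shutdown so reentrant reads such as
// isStopping are not wedged behind it.
func (r *runner) stop() error {
	r.mu.Lock()
	if r.stopping {
		r.mu.Unlock()
		return nil
	}
	r.stopping = true
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	r.mu.Unlock() // released here, as in the patched code

	// Stand-in for swarmNode.Stop(ctx): blocking shutdown work that may call
	// back into isStopping; it can proceed because the mutex is free.
	if err := r.shutdown(ctx); err != nil {
		return err
	}

	<-r.done // wait for the run loop to finish, as the real Stop does
	return nil
}

func (r *runner) shutdown(ctx context.Context) error {
	fmt.Println("reentrant isStopping() during shutdown =", r.isStopping())
	select {
	case <-ctx.Done():
		return ctx.Err()
	default:
	}
	close(r.done) // in the real code, the run goroutine closes done
	return nil
}

func main() {
	r := &runner{done: make(chan struct{})}
	if err := r.stop(); err != nil {
		fmt.Println("stop failed:", err)
	}
}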
... ...
@@ -25,19 +25,20 @@ import (
 func (c *Cluster) Init(req types.InitRequest) (string, error) {
 	c.controlMutex.Lock()
 	defer c.controlMutex.Unlock()
-	c.mu.Lock()
 	if c.nr != nil {
 		if req.ForceNewCluster {
+			// Take c.mu temporarily to wait for presently running
+			// API handlers to finish before shutting down the node.
+			c.mu.Lock()
+			c.mu.Unlock()
+
 			if err := c.nr.Stop(); err != nil {
-				c.mu.Unlock()
 				return "", err
 			}
 		} else {
-			c.mu.Unlock()
 			return "", errSwarmExists
 		}
 	}
-	c.mu.Unlock()
 
 	if err := validateAndSanitizeInitRequest(&req); err != nil {
 		return "", apierrors.NewBadRequestError(err)
... ...
@@ -325,9 +326,10 @@ func (c *Cluster) Leave(force bool) error {
 
 	state := c.currentNodeState()
 
+	c.mu.Unlock()
+
 	if errors.Cause(state.err) == errSwarmLocked && !force {
 		// leave a locked swarm without --force is not allowed
-		c.mu.Unlock()
 		return errors.New("Swarm is encrypted and locked. Please unlock it first or use `--force` to ignore this message.")
 	}
 
... ...
@@ -339,7 +341,6 @@ func (c *Cluster) Leave(force bool) error {
 				if active && removingManagerCausesLossOfQuorum(reachable, unreachable) {
 					if isLastManager(reachable, unreachable) {
 						msg += "Removing the last manager erases all current state of the swarm. Use `--force` to ignore this message. "
-						c.mu.Unlock()
 						return errors.New(msg)
 					}
 					msg += fmt.Sprintf("Removing this node leaves %v managers out of %v. Without a Raft quorum your swarm will be inaccessible. ", reachable-1, reachable+unreachable)
... ...
@@ -350,18 +351,19 @@ func (c *Cluster) Leave(force bool) error {
 		}
 
 		msg += "The only way to restore a swarm that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to suppress this message."
-		c.mu.Unlock()
 		return errors.New(msg)
 	}
 	// release readers in here
 	if err := nr.Stop(); err != nil {
 		logrus.Errorf("failed to shut down cluster node: %v", err)
 		signal.DumpStacks("")
-		c.mu.Unlock()
 		return err
 	}
+
+	c.mu.Lock()
 	c.nr = nil
 	c.mu.Unlock()
+
 	if nodeID := state.NodeID(); nodeID != "" {
 		nodeContainers, err := c.listContainerForNode(nodeID)
 		if err != nil {