The agent sometimes calls into libnetwork code that in turn calls
(*Cluster).IsAgent and (*Cluster).IsManager. These can cause the
node shutdown process to time out, since they wait for a lock that is
held by Cleanup.
It turns out c.mu doesn't need to be held while calling Stop. Holding
controlMutex is sufficient. Also, (*nodeRunner).Stop must release
nodeRunner's mu during the node shutdown process, otherwise the same
call into Cluster would be blocked on this lock instead.
Signed-off-by: Aaron Lehmann <aaron.lehmann@docker.com>
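For context, here is a minimal, self-contained Go sketch of the deadlock pattern described above. The names (cluster, isManager, stopNode) are illustrative stand-ins rather than the actual moby code: while a goroutine holds c.mu for the whole shutdown, the IsManager-style call made from the shutdown path cannot take the lock, so stopping times out; releasing c.mu before stopping avoids this.

package main

import (
	"fmt"
	"sync"
	"time"
)

// cluster is a stand-in for (*Cluster); names are illustrative only.
type cluster struct {
	mu sync.RWMutex
}

// isManager mirrors the shape of (*Cluster).IsManager: it takes the
// read lock before inspecting state.
func (c *cluster) isManager() bool {
	c.mu.RLock()
	defer c.mu.RUnlock()
	return true
}

// stopNode stands in for node.Stop(): during shutdown, libnetwork-style
// callbacks re-enter the cluster and ask whether this node is a manager.
func (c *cluster) stopNode() {
	done := make(chan struct{})
	go func() {
		c.isManager() // blocks as long as the caller holds c.mu
		close(done)
	}()
	select {
	case <-done:
		fmt.Println("node stopped cleanly")
	case <-time.After(2 * time.Second):
		fmt.Println("shutdown timed out waiting for c.mu")
	}
}

func main() {
	c := &cluster{}

	// Before the fix: Cleanup held c.mu across the whole shutdown.
	c.mu.Lock()
	c.stopNode() // times out, because isManager waits for c.mu
	c.mu.Unlock()

	// After the fix: c.mu is released before stopping the node.
	c.stopNode() // completes immediately
}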
@@ -334,8 +334,9 @@ func (c *Cluster) Cleanup() {
 		c.mu.Unlock()
 		return
 	}
-	defer c.mu.Unlock()
 	state := c.currentNodeState()
+	c.mu.Unlock()
+
 	if state.IsActiveManager() {
 		active, reachable, unreachable, err := managerStats(state.controlClient, state.NodeID())
 		if err == nil {
@@ -345,11 +346,15 @@ func (c *Cluster) Cleanup() {
 			}
 		}
 	}
+
 	if err := node.Stop(); err != nil {
 		logrus.Errorf("failed to shut down cluster node: %v", err)
 		signal.DumpStacks("")
 	}
+
+	c.mu.Lock()
 	c.nr = nil
+	c.mu.Unlock()
 }
 
 func managerStats(client swarmapi.ControlClient, currentNodeID string) (current bool, reachable int, unreachable int, err error) {
@@ -210,11 +210,10 @@ func (n *nodeRunner) Stop() error {
 	n.stopping = true
 	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
 	defer cancel()
+	n.mu.Unlock()
 	if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
-		n.mu.Unlock()
 		return err
 	}
-	n.mu.Unlock()
 	<-n.done
 	return nil
 }
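The nodeRunner change follows the same principle: n.mu now only guards the stopping flag and is released before the blocking swarmNode.Stop call, so re-entrant callers that need the lock are not held up. A hedged sketch of that shape, using hypothetical names (runner, status, shutdown) rather than the real nodeRunner API:

package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// runner loosely mirrors nodeRunner; fields and methods are illustrative.
type runner struct {
	mu       sync.RWMutex
	stopping bool
	done     chan struct{}
}

// status is the kind of re-entrant caller that must not be blocked on
// mu while stop is waiting for the node to shut down.
func (r *runner) status() bool {
	r.mu.RLock()
	defer r.mu.RUnlock()
	return r.stopping
}

// shutdown stands in for swarmNode.Stop; here it simply re-enters status.
func (r *runner) shutdown(ctx context.Context) error {
	fmt.Println("shutting down; stopping flag is", r.status())
	close(r.done)
	return nil
}

func (r *runner) stop() error {
	r.mu.Lock()
	r.stopping = true
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	// Release mu before the blocking shutdown so callers such as
	// status() can still make progress while we wait.
	r.mu.Unlock()

	if err := r.shutdown(ctx); err != nil {
		return err
	}
	<-r.done
	return nil
}

func main() {
	r := &runner{done: make(chan struct{})}
	if err := r.stop(); err != nil {
		fmt.Println("stop failed:", err)
	}
	fmt.Println("stopped")
}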
@@ -25,19 +25,20 @@ import (
 func (c *Cluster) Init(req types.InitRequest) (string, error) {
 	c.controlMutex.Lock()
 	defer c.controlMutex.Unlock()
-	c.mu.Lock()
 	if c.nr != nil {
 		if req.ForceNewCluster {
+			// Take c.mu temporarily to wait for presently running
+			// API handlers to finish before shutting down the node.
+			c.mu.Lock()
+			c.mu.Unlock()
+
 			if err := c.nr.Stop(); err != nil {
-				c.mu.Unlock()
 				return "", err
 			}
 		} else {
-			c.mu.Unlock()
 			return "", errSwarmExists
 		}
 	}
-	c.mu.Unlock()
 
 	if err := validateAndSanitizeInitRequest(&req); err != nil {
 		return "", apierrors.NewBadRequestError(err)
@@ -325,9 +326,10 @@ func (c *Cluster) Leave(force bool) error {
 
 	state := c.currentNodeState()
 
+	c.mu.Unlock()
+
 	if errors.Cause(state.err) == errSwarmLocked && !force {
 		// leave a locked swarm without --force is not allowed
-		c.mu.Unlock()
 		return errors.New("Swarm is encrypted and locked. Please unlock it first or use `--force` to ignore this message.")
 	}
 
@@ -339,7 +341,6 @@ func (c *Cluster) Leave(force bool) error {
 				if active && removingManagerCausesLossOfQuorum(reachable, unreachable) {
 					if isLastManager(reachable, unreachable) {
 						msg += "Removing the last manager erases all current state of the swarm. Use `--force` to ignore this message. "
-						c.mu.Unlock()
 						return errors.New(msg)
 					}
 					msg += fmt.Sprintf("Removing this node leaves %v managers out of %v. Without a Raft quorum your swarm will be inaccessible. ", reachable-1, reachable+unreachable)
@@ -350,18 +351,19 @@ func (c *Cluster) Leave(force bool) error {
 		}
 
 		msg += "The only way to restore a swarm that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to suppress this message."
-		c.mu.Unlock()
 		return errors.New(msg)
 	}
 	// release readers in here
 	if err := nr.Stop(); err != nil {
 		logrus.Errorf("failed to shut down cluster node: %v", err)
 		signal.DumpStacks("")
-		c.mu.Unlock()
 		return err
 	}
+
+	c.mu.Lock()
 	c.nr = nil
 	c.mu.Unlock()
+
 	if nodeID := state.NodeID(); nodeID != "" {
 		nodeContainers, err := c.listContainerForNode(nodeID)
 		if err != nil {
|