Browse code

Return membership status on join without timeout

Signed-off-by: Tonis Tiigi <tonistiigi@gmail.com>

Tonis Tiigi authored on 2016/06/17 01:42:22
Showing 3 changed files
... ...
@@ -28,25 +28,28 @@ import (
28 28
 
29 29
 const swarmDirName = "swarm"
30 30
 const controlSocket = "control.sock"
31
-const swarmConnectTimeout = 10 * time.Second
31
+const swarmConnectTimeout = 20 * time.Second
32 32
 const stateFile = "docker-state.json"
33 33
 
34 34
 const (
35 35
 	initialReconnectDelay = 100 * time.Millisecond
36
-	maxReconnectDelay     = 10 * time.Second
36
+	maxReconnectDelay     = 30 * time.Second
37 37
 )
38 38
 
39 39
 // ErrNoManager is returned when a manager-only function is called on a non-manager node.
40
-var ErrNoManager = fmt.Errorf("this node is not participating as a Swarm manager")
40
+var ErrNoManager = fmt.Errorf("This node is not participating as a Swarm manager")
41 41
 
42 42
 // ErrNoSwarm is returned on leaving a cluster that was never initialized
43
-var ErrNoSwarm = fmt.Errorf("this node is not part of Swarm")
43
+var ErrNoSwarm = fmt.Errorf("This node is not part of Swarm")
44 44
 
45 45
 // ErrSwarmExists is returned on initialize or join request for a cluster that has already been activated
46
-var ErrSwarmExists = fmt.Errorf("this node is already part of a Swarm")
46
+var ErrSwarmExists = fmt.Errorf("This node is already part of a Swarm cluster. Use \"docker swarm leave\" to leave this cluster and join another one.")
47
+
48
+// ErrPendingSwarmExists is returned on initialize or join request for a cluster that is already processing a similar request but has not succeeded yet.
49
+var ErrPendingSwarmExists = fmt.Errorf("This node is processing an existing join request that has not succeeded yet. Use \"docker swarm leave\" to cancel the current request.")
47 50
 
48 51
 // ErrSwarmJoinTimeoutReached is returned when cluster join could not complete before timeout was reached.
49
-var ErrSwarmJoinTimeoutReached = fmt.Errorf("timeout reached before node was joined")
52
+var ErrSwarmJoinTimeoutReached = fmt.Errorf("Timeout was reached before node was joined. Attempt to join the cluster will continue in the background. Use \"docker info\" command to see the current Swarm status of your node.")
50 53
 
51 54
 type state struct {
52 55
 	ListenAddr string
... ...
@@ -249,13 +252,14 @@ func (c *Cluster) startNewNode(forceNewCluster bool, listenAddr, joinAddr, secre
249 249
 // Init initializes new cluster from user provided request.
250 250
 func (c *Cluster) Init(req types.InitRequest) (string, error) {
251 251
 	c.Lock()
252
-	if c.node != nil {
252
+	if node := c.node; node != nil {
253 253
 		c.Unlock()
254 254
 		if !req.ForceNewCluster {
255
-			return "", ErrSwarmExists
255
+			return "", errSwarmExists(node)
256 256
 		}
257 257
 		ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
258 258
 		defer cancel()
259
+		c.cancelReconnect()
259 260
 		if err := c.node.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
260 261
 			return "", err
261 262
 		}
... ...
@@ -297,9 +301,9 @@ func (c *Cluster) Init(req types.InitRequest) (string, error) {
297 297
 // Join makes current Cluster part of an existing swarm cluster.
298 298
 func (c *Cluster) Join(req types.JoinRequest) error {
299 299
 	c.Lock()
300
-	if c.node != nil {
300
+	if node := c.node; node != nil {
301 301
 		c.Unlock()
302
-		return ErrSwarmExists
302
+		return errSwarmExists(node)
303 303
 	}
304 304
 	// TODO: check whether current state already exists
305 305
 	if len(req.RemoteAddrs) == 0 {
... ...
@@ -312,23 +316,29 @@ func (c *Cluster) Join(req types.JoinRequest) error {
312 312
 	}
313 313
 	c.Unlock()
314 314
 
315
-	select {
316
-	case <-time.After(swarmConnectTimeout):
317
-		go c.reconnectOnFailure(ctx)
318
-		if nodeid := n.NodeID(); nodeid != "" {
319
-			return fmt.Errorf("Timeout reached before node was joined. Your cluster settings may be preventing this node from automatically joining. To accept this node into cluster run `docker node accept %v` in an existing cluster manager", nodeid)
320
-		}
321
-		return ErrSwarmJoinTimeoutReached
322
-	case <-n.Ready():
323
-		go c.reconnectOnFailure(ctx)
324
-		return nil
325
-	case <-ctx.Done():
326
-		c.RLock()
327
-		defer c.RUnlock()
328
-		if c.err != nil {
329
-			return c.err
315
+	certificateRequested := n.CertificateRequested()
316
+	for {
317
+		select {
318
+		case <-certificateRequested:
319
+			if n.NodeMembership() == swarmapi.NodeMembershipPending {
320
+				return fmt.Errorf("Your node is in the process of joining the cluster but needs to be accepted by existing cluster member.\nTo accept this node into cluster run \"docker node accept %v\" in an existing cluster manager. Use \"docker info\" command to see the current Swarm status of your node.", n.NodeID())
321
+			}
322
+			certificateRequested = nil
323
+		case <-time.After(swarmConnectTimeout):
324
+			// The attempt to connect will continue in the background, including reconnection attempts.
325
+			go c.reconnectOnFailure(ctx)
326
+			return ErrSwarmJoinTimeoutReached
327
+		case <-n.Ready():
328
+			go c.reconnectOnFailure(ctx)
329
+			return nil
330
+		case <-ctx.Done():
331
+			c.RLock()
332
+			defer c.RUnlock()
333
+			if c.err != nil {
334
+				return c.err
335
+			}
336
+			return ctx.Err()
330 337
 		}
331
-		return ctx.Err()
332 338
 	}
333 339
 }
334 340
 
... ...
@@ -1004,6 +1014,13 @@ func (c *Cluster) managerStats() (current bool, reachable int, unreachable int,
1004 1004
 	return
1005 1005
 }
1006 1006
 
1007
+func errSwarmExists(node *swarmagent.Node) error {
1008
+	if node.NodeMembership() != swarmapi.NodeMembershipAccepted {
1009
+		return ErrPendingSwarmExists
1010
+	}
1011
+	return ErrSwarmExists
1012
+}
1013
+
1007 1014
 func initAcceptancePolicy(node *swarmagent.Node, acceptancePolicy types.AcceptancePolicy) error {
1008 1015
 	ctx, _ := context.WithTimeout(context.Background(), 5*time.Second)
1009 1016
 	for conn := range node.ListenControlSocket(ctx) {
... ...
@@ -5,6 +5,7 @@ import (
5 5
 	"fmt"
6 6
 	"net/http"
7 7
 	"strings"
8
+	"time"
8 9
 
9 10
 	"github.com/docker/docker/pkg/integration/checker"
10 11
 	"github.com/docker/engine-api/types"
... ...
@@ -167,14 +168,22 @@ func (d *SwarmDaemon) getNode(c *check.C, id string) *swarm.Node {
167 167
 	return &node
168 168
 }
169 169
 
170
-func (d *SwarmDaemon) updateNode(c *check.C, node *swarm.Node, f ...nodeConstructor) {
171
-	for _, fn := range f {
172
-		fn(node)
170
+func (d *SwarmDaemon) updateNode(c *check.C, id string, f ...nodeConstructor) {
171
+	for i := 0; ; i++ {
172
+		node := d.getNode(c, id)
173
+		for _, fn := range f {
174
+			fn(node)
175
+		}
176
+		url := fmt.Sprintf("/nodes/%s/update?version=%d", node.ID, node.Version.Index)
177
+		status, out, err := d.SockRequest("POST", url, node.Spec)
178
+		if i < 10 && strings.Contains(string(out), "update out of sequence") {
179
+			time.Sleep(100 * time.Millisecond)
180
+			continue
181
+		}
182
+		c.Assert(err, checker.IsNil)
183
+		c.Assert(status, checker.Equals, http.StatusOK, check.Commentf("output: %q", string(out)))
184
+		return
173 185
 	}
174
-	url := fmt.Sprintf("/nodes/%s/update?version=%d", node.ID, node.Version.Index)
175
-	status, out, err := d.SockRequest("POST", url, node.Spec)
176
-	c.Assert(err, checker.IsNil)
177
-	c.Assert(status, checker.Equals, http.StatusOK, check.Commentf("output: %q", string(out)))
178 186
 }
179 187
 
180 188
 func (d *SwarmDaemon) listNodes(c *check.C) []swarm.Node {
... ...
@@ -82,7 +82,7 @@ func (s *DockerSwarmSuite) testAPISwarmManualAcceptance(c *check.C, secret strin
82 82
 	err := d2.Join(d1.listenAddr, "", "", false)
83 83
 	c.Assert(err, checker.NotNil)
84 84
 	if secret == "" {
85
-		c.Assert(err.Error(), checker.Contains, "Timeout reached")
85
+		c.Assert(err.Error(), checker.Contains, "needs to be accepted")
86 86
 		info, err := d2.info()
87 87
 		c.Assert(err, checker.IsNil)
88 88
 		c.Assert(info.LocalNodeState, checker.Equals, swarm.LocalNodeStatePending)
... ...
@@ -97,23 +97,25 @@ func (s *DockerSwarmSuite) testAPISwarmManualAcceptance(c *check.C, secret strin
97 97
 		c.Assert(info.LocalNodeState, checker.Equals, swarm.LocalNodeStateInactive)
98 98
 	}
99 99
 	d3 := s.AddDaemon(c, false, false)
100
-	go func() {
101
-		for i := 0; ; i++ {
102
-			info, err := d3.info()
103
-			c.Assert(err, checker.IsNil)
104
-			if info.NodeID != "" {
105
-				d1.updateNode(c, d1.getNode(c, info.NodeID), func(n *swarm.Node) {
106
-					n.Spec.Membership = swarm.NodeMembershipAccepted
107
-				})
108
-				return
109
-			}
110
-			if i >= 10 {
111
-				c.Errorf("could not find nodeID")
112
-			}
113
-			time.Sleep(300 * time.Millisecond)
100
+	c.Assert(d3.Join(d1.listenAddr, secret, "", false), checker.NotNil)
101
+	info, err := d3.info()
102
+	c.Assert(err, checker.IsNil)
103
+	c.Assert(info.LocalNodeState, checker.Equals, swarm.LocalNodeStatePending)
104
+	c.Assert(len(info.NodeID), checker.GreaterThan, 5)
105
+	d1.updateNode(c, info.NodeID, func(n *swarm.Node) {
106
+		n.Spec.Membership = swarm.NodeMembershipAccepted
107
+	})
108
+	for i := 0; ; i++ {
109
+		info, err := d3.info()
110
+		c.Assert(err, checker.IsNil)
111
+		if info.LocalNodeState == swarm.LocalNodeStateActive {
112
+			break
114 113
 		}
115
-	}()
116
-	c.Assert(d3.Join(d1.listenAddr, secret, "", false), checker.IsNil)
114
+		if i > 10 {
115
+			c.Errorf("node did not become active")
116
+		}
117
+		time.Sleep(200 * time.Millisecond)
118
+	}
117 119
 }
118 120
 
119 121
 func (s *DockerSwarmSuite) TestApiSwarmSecretAcceptance(c *check.C) {
... ...
@@ -236,7 +238,7 @@ func (s *DockerSwarmSuite) TestApiSwarmPromoteDemote(c *check.C) {
236 236
 	c.Assert(info.ControlAvailable, checker.Equals, false)
237 237
 	c.Assert(info.LocalNodeState, checker.Equals, swarm.LocalNodeStateActive)
238 238
 
239
-	d1.updateNode(c, d1.getNode(c, d2.NodeID), func(n *swarm.Node) {
239
+	d1.updateNode(c, d2.NodeID, func(n *swarm.Node) {
240 240
 		n.Spec.Role = swarm.NodeRoleManager
241 241
 	})
242 242
 
... ...
@@ -255,7 +257,7 @@ func (s *DockerSwarmSuite) TestApiSwarmPromoteDemote(c *check.C) {
255 255
 		time.Sleep(100 * time.Millisecond)
256 256
 	}
257 257
 
258
-	d1.updateNode(c, d1.getNode(c, d2.NodeID), func(n *swarm.Node) {
258
+	d1.updateNode(c, d2.NodeID, func(n *swarm.Node) {
259 259
 		n.Spec.Role = swarm.NodeRoleWorker
260 260
 	})
261 261
 
... ...
@@ -466,7 +468,7 @@ func (s *DockerSwarmSuite) TestApiSwarmNodeUpdate(c *check.C) {
466 466
 
467 467
 	nodes := d.listNodes(c)
468 468
 
469
-	d.updateNode(c, d.getNode(c, nodes[0].ID), func(n *swarm.Node) {
469
+	d.updateNode(c, nodes[0].ID, func(n *swarm.Node) {
470 470
 		n.Spec.Availability = swarm.NodeAvailabilityPause
471 471
 	})
472 472
 
... ...
@@ -489,14 +491,14 @@ func (s *DockerSwarmSuite) TestApiSwarmNodeDrainPause(c *check.C) {
489 489
 	waitAndAssert(c, defaultReconciliationTimeout, reducedCheck(sumAsIntegers, d1.checkActiveContainerCount, d2.checkActiveContainerCount), checker.Equals, instances)
490 490
 
491 491
 	// drain d2, all containers should move to d1
492
-	d1.updateNode(c, d1.getNode(c, d2.NodeID), func(n *swarm.Node) {
492
+	d1.updateNode(c, d2.NodeID, func(n *swarm.Node) {
493 493
 		n.Spec.Availability = swarm.NodeAvailabilityDrain
494 494
 	})
495 495
 	waitAndAssert(c, defaultReconciliationTimeout, d1.checkActiveContainerCount, checker.Equals, instances)
496 496
 	waitAndAssert(c, defaultReconciliationTimeout, d2.checkActiveContainerCount, checker.Equals, 0)
497 497
 
498 498
 	// set d2 back to active
499
-	d1.updateNode(c, d1.getNode(c, d2.NodeID), func(n *swarm.Node) {
499
+	d1.updateNode(c, d2.NodeID, func(n *swarm.Node) {
500 500
 		n.Spec.Availability = swarm.NodeAvailabilityActive
501 501
 	})
502 502
 
... ...
@@ -516,7 +518,7 @@ func (s *DockerSwarmSuite) TestApiSwarmNodeDrainPause(c *check.C) {
516 516
 	d2ContainerCount := len(d2.activeContainers())
517 517
 
518 518
 	// set d2 to paused, scale service up, only d1 gets new tasks
519
-	d1.updateNode(c, d1.getNode(c, d2.NodeID), func(n *swarm.Node) {
519
+	d1.updateNode(c, d2.NodeID, func(n *swarm.Node) {
520 520
 		n.Spec.Availability = swarm.NodeAvailabilityPause
521 521
 	})
522 522