Signed-off-by: Tonis Tiigi <tonistiigi@gmail.com>
| ... | ... |
@@ -28,25 +28,28 @@ import ( |
| 28 | 28 |
|
| 29 | 29 |
const swarmDirName = "swarm" |
| 30 | 30 |
const controlSocket = "control.sock" |
| 31 |
-const swarmConnectTimeout = 10 * time.Second |
|
| 31 |
+const swarmConnectTimeout = 20 * time.Second |
|
| 32 | 32 |
const stateFile = "docker-state.json" |
| 33 | 33 |
|
| 34 | 34 |
const ( |
| 35 | 35 |
initialReconnectDelay = 100 * time.Millisecond |
| 36 |
- maxReconnectDelay = 10 * time.Second |
|
| 36 |
+ maxReconnectDelay = 30 * time.Second |
|
| 37 | 37 |
) |
| 38 | 38 |
|
| 39 | 39 |
// ErrNoManager is returned when a manager-only function is called on a non-manager |
| 40 |
-var ErrNoManager = fmt.Errorf("this node is not participating as a Swarm manager")
|
|
| 40 |
+var ErrNoManager = fmt.Errorf("This node is not participating as a Swarm manager")
|
|
| 41 | 41 |
|
| 42 | 42 |
// ErrNoSwarm is returned on leaving a cluster that was never initialized |
| 43 |
-var ErrNoSwarm = fmt.Errorf("this node is not part of Swarm")
|
|
| 43 |
+var ErrNoSwarm = fmt.Errorf("This node is not part of Swarm")
|
|
| 44 | 44 |
|
| 45 | 45 |
// ErrSwarmExists is returned on initialize or join request for a cluster that has already been activated |
| 46 |
-var ErrSwarmExists = fmt.Errorf("this node is already part of a Swarm")
|
|
| 46 |
+var ErrSwarmExists = fmt.Errorf("This node is already part of a Swarm cluster. Use \"docker swarm leave\" to leave this cluster and join another one.")
|
|
| 47 |
+ |
|
| 48 |
+// ErrPendingSwarmExists is returned on initialize or join request for a cluster that is already processing a similar request but has not succeeded yet. |
|
| 49 |
+var ErrPendingSwarmExists = fmt.Errorf("This node is processing an existing join request that has not succeeded yet. Use \"docker swarm leave\" to cancel the current request.")
|
|
| 47 | 50 |
|
| 48 | 51 |
// ErrSwarmJoinTimeoutReached is returned when cluster join could not complete before timeout was reached. |
| 49 |
-var ErrSwarmJoinTimeoutReached = fmt.Errorf("timeout reached before node was joined")
|
|
| 52 |
+var ErrSwarmJoinTimeoutReached = fmt.Errorf("Timeout was reached before node was joined. Attempt to join the cluster will continue in the background. Use \"docker info\" command to see the current Swarm status of your node.")
|
|
| 50 | 53 |
|
| 51 | 54 |
type state struct {
|
| 52 | 55 |
ListenAddr string |
| ... | ... |
@@ -249,13 +252,14 @@ func (c *Cluster) startNewNode(forceNewCluster bool, listenAddr, joinAddr, secre |
| 249 | 249 |
// Init initializes new cluster from user provided request. |
| 250 | 250 |
func (c *Cluster) Init(req types.InitRequest) (string, error) {
|
| 251 | 251 |
c.Lock() |
| 252 |
- if c.node != nil {
|
|
| 252 |
+ if node := c.node; node != nil {
|
|
| 253 | 253 |
c.Unlock() |
| 254 | 254 |
if !req.ForceNewCluster {
|
| 255 |
- return "", ErrSwarmExists |
|
| 255 |
+ return "", errSwarmExists(node) |
|
| 256 | 256 |
} |
| 257 | 257 |
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) |
| 258 | 258 |
defer cancel() |
| 259 |
+ c.cancelReconnect() |
|
| 259 | 260 |
if err := c.node.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
|
| 260 | 261 |
return "", err |
| 261 | 262 |
} |
| ... | ... |
@@ -297,9 +301,9 @@ func (c *Cluster) Init(req types.InitRequest) (string, error) {
|
| 297 | 297 |
// Join makes current Cluster part of an existing swarm cluster. |
| 298 | 298 |
func (c *Cluster) Join(req types.JoinRequest) error {
|
| 299 | 299 |
c.Lock() |
| 300 |
- if c.node != nil {
|
|
| 300 |
+ if node := c.node; node != nil {
|
|
| 301 | 301 |
c.Unlock() |
| 302 |
- return ErrSwarmExists |
|
| 302 |
+ return errSwarmExists(node) |
|
| 303 | 303 |
} |
| 304 | 304 |
// todo: check current state existing |
| 305 | 305 |
if len(req.RemoteAddrs) == 0 {
|
| ... | ... |
@@ -312,23 +316,29 @@ func (c *Cluster) Join(req types.JoinRequest) error {
|
| 312 | 312 |
} |
| 313 | 313 |
c.Unlock() |
| 314 | 314 |
|
| 315 |
- select {
|
|
| 316 |
- case <-time.After(swarmConnectTimeout): |
|
| 317 |
- go c.reconnectOnFailure(ctx) |
|
| 318 |
- if nodeid := n.NodeID(); nodeid != "" {
|
|
| 319 |
- return fmt.Errorf("Timeout reached before node was joined. Your cluster settings may be preventing this node from automatically joining. To accept this node into cluster run `docker node accept %v` in an existing cluster manager", nodeid)
|
|
| 320 |
- } |
|
| 321 |
- return ErrSwarmJoinTimeoutReached |
|
| 322 |
- case <-n.Ready(): |
|
| 323 |
- go c.reconnectOnFailure(ctx) |
|
| 324 |
- return nil |
|
| 325 |
- case <-ctx.Done(): |
|
| 326 |
- c.RLock() |
|
| 327 |
- defer c.RUnlock() |
|
| 328 |
- if c.err != nil {
|
|
| 329 |
- return c.err |
|
| 315 |
+ certificateRequested := n.CertificateRequested() |
|
| 316 |
+ for {
|
|
| 317 |
+ select {
|
|
| 318 |
+ case <-certificateRequested: |
|
| 319 |
+ if n.NodeMembership() == swarmapi.NodeMembershipPending {
|
|
| 320 |
+ return fmt.Errorf("Your node is in the process of joining the cluster but needs to be accepted by existing cluster member.\nTo accept this node into cluster run \"docker node accept %v\" in an existing cluster manager. Use \"docker info\" command to see the current Swarm status of your node.", n.NodeID())
|
|
| 321 |
+ } |
|
| 322 |
+ certificateRequested = nil |
|
| 323 |
+ case <-time.After(swarmConnectTimeout): |
|
| 324 |
+ // attempt to connect will continue in background, also reconnecting |
|
| 325 |
+ go c.reconnectOnFailure(ctx) |
|
| 326 |
+ return ErrSwarmJoinTimeoutReached |
|
| 327 |
+ case <-n.Ready(): |
|
| 328 |
+ go c.reconnectOnFailure(ctx) |
|
| 329 |
+ return nil |
|
| 330 |
+ case <-ctx.Done(): |
|
| 331 |
+ c.RLock() |
|
| 332 |
+ defer c.RUnlock() |
|
| 333 |
+ if c.err != nil {
|
|
| 334 |
+ return c.err |
|
| 335 |
+ } |
|
| 336 |
+ return ctx.Err() |
|
| 330 | 337 |
} |
| 331 |
- return ctx.Err() |
|
| 332 | 338 |
} |
| 333 | 339 |
} |
| 334 | 340 |
|
| ... | ... |
@@ -1004,6 +1014,13 @@ func (c *Cluster) managerStats() (current bool, reachable int, unreachable int, |
| 1004 | 1004 |
return |
| 1005 | 1005 |
} |
| 1006 | 1006 |
|
| 1007 |
+func errSwarmExists(node *swarmagent.Node) error {
|
|
| 1008 |
+ if node.NodeMembership() != swarmapi.NodeMembershipAccepted {
|
|
| 1009 |
+ return ErrPendingSwarmExists |
|
| 1010 |
+ } |
|
| 1011 |
+ return ErrSwarmExists |
|
| 1012 |
+} |
|
| 1013 |
+ |
|
| 1007 | 1014 |
func initAcceptancePolicy(node *swarmagent.Node, acceptancePolicy types.AcceptancePolicy) error {
|
| 1008 | 1015 |
ctx, _ := context.WithTimeout(context.Background(), 5*time.Second) |
| 1009 | 1016 |
for conn := range node.ListenControlSocket(ctx) {
|
| ... | ... |
@@ -5,6 +5,7 @@ import ( |
| 5 | 5 |
"fmt" |
| 6 | 6 |
"net/http" |
| 7 | 7 |
"strings" |
| 8 |
+ "time" |
|
| 8 | 9 |
|
| 9 | 10 |
"github.com/docker/docker/pkg/integration/checker" |
| 10 | 11 |
"github.com/docker/engine-api/types" |
| ... | ... |
@@ -167,14 +168,22 @@ func (d *SwarmDaemon) getNode(c *check.C, id string) *swarm.Node {
|
| 167 | 167 |
return &node |
| 168 | 168 |
} |
| 169 | 169 |
|
| 170 |
-func (d *SwarmDaemon) updateNode(c *check.C, node *swarm.Node, f ...nodeConstructor) {
|
|
| 171 |
- for _, fn := range f {
|
|
| 172 |
- fn(node) |
|
| 170 |
+func (d *SwarmDaemon) updateNode(c *check.C, id string, f ...nodeConstructor) {
|
|
| 171 |
+ for i := 0; ; i++ {
|
|
| 172 |
+ node := d.getNode(c, id) |
|
| 173 |
+ for _, fn := range f {
|
|
| 174 |
+ fn(node) |
|
| 175 |
+ } |
|
| 176 |
+ url := fmt.Sprintf("/nodes/%s/update?version=%d", node.ID, node.Version.Index)
|
|
| 177 |
+ status, out, err := d.SockRequest("POST", url, node.Spec)
|
|
| 178 |
+ if i < 10 && strings.Contains(string(out), "update out of sequence") {
|
|
| 179 |
+ time.Sleep(100 * time.Millisecond) |
|
| 180 |
+ continue |
|
| 181 |
+ } |
|
| 182 |
+ c.Assert(err, checker.IsNil) |
|
| 183 |
+ c.Assert(status, checker.Equals, http.StatusOK, check.Commentf("output: %q", string(out)))
|
|
| 184 |
+ return |
|
| 173 | 185 |
} |
| 174 |
- url := fmt.Sprintf("/nodes/%s/update?version=%d", node.ID, node.Version.Index)
|
|
| 175 |
- status, out, err := d.SockRequest("POST", url, node.Spec)
|
|
| 176 |
- c.Assert(err, checker.IsNil) |
|
| 177 |
- c.Assert(status, checker.Equals, http.StatusOK, check.Commentf("output: %q", string(out)))
|
|
| 178 | 186 |
} |
| 179 | 187 |
|
| 180 | 188 |
func (d *SwarmDaemon) listNodes(c *check.C) []swarm.Node {
|
| ... | ... |
@@ -82,7 +82,7 @@ func (s *DockerSwarmSuite) testAPISwarmManualAcceptance(c *check.C, secret strin |
| 82 | 82 |
err := d2.Join(d1.listenAddr, "", "", false) |
| 83 | 83 |
c.Assert(err, checker.NotNil) |
| 84 | 84 |
if secret == "" {
|
| 85 |
- c.Assert(err.Error(), checker.Contains, "Timeout reached") |
|
| 85 |
+ c.Assert(err.Error(), checker.Contains, "needs to be accepted") |
|
| 86 | 86 |
info, err := d2.info() |
| 87 | 87 |
c.Assert(err, checker.IsNil) |
| 88 | 88 |
c.Assert(info.LocalNodeState, checker.Equals, swarm.LocalNodeStatePending) |
| ... | ... |
@@ -97,23 +97,25 @@ func (s *DockerSwarmSuite) testAPISwarmManualAcceptance(c *check.C, secret strin |
| 97 | 97 |
c.Assert(info.LocalNodeState, checker.Equals, swarm.LocalNodeStateInactive) |
| 98 | 98 |
} |
| 99 | 99 |
d3 := s.AddDaemon(c, false, false) |
| 100 |
- go func() {
|
|
| 101 |
- for i := 0; ; i++ {
|
|
| 102 |
- info, err := d3.info() |
|
| 103 |
- c.Assert(err, checker.IsNil) |
|
| 104 |
- if info.NodeID != "" {
|
|
| 105 |
- d1.updateNode(c, d1.getNode(c, info.NodeID), func(n *swarm.Node) {
|
|
| 106 |
- n.Spec.Membership = swarm.NodeMembershipAccepted |
|
| 107 |
- }) |
|
| 108 |
- return |
|
| 109 |
- } |
|
| 110 |
- if i >= 10 {
|
|
| 111 |
- c.Errorf("could not find nodeID")
|
|
| 112 |
- } |
|
| 113 |
- time.Sleep(300 * time.Millisecond) |
|
| 100 |
+ c.Assert(d3.Join(d1.listenAddr, secret, "", false), checker.NotNil) |
|
| 101 |
+ info, err := d3.info() |
|
| 102 |
+ c.Assert(err, checker.IsNil) |
|
| 103 |
+ c.Assert(info.LocalNodeState, checker.Equals, swarm.LocalNodeStatePending) |
|
| 104 |
+ c.Assert(len(info.NodeID), checker.GreaterThan, 5) |
|
| 105 |
+ d1.updateNode(c, info.NodeID, func(n *swarm.Node) {
|
|
| 106 |
+ n.Spec.Membership = swarm.NodeMembershipAccepted |
|
| 107 |
+ }) |
|
| 108 |
+ for i := 0; ; i++ {
|
|
| 109 |
+ info, err := d3.info() |
|
| 110 |
+ c.Assert(err, checker.IsNil) |
|
| 111 |
+ if info.LocalNodeState == swarm.LocalNodeStateActive {
|
|
| 112 |
+ break |
|
| 114 | 113 |
} |
| 115 |
- }() |
|
| 116 |
- c.Assert(d3.Join(d1.listenAddr, secret, "", false), checker.IsNil) |
|
| 114 |
+ if i > 10 {
|
|
| 115 |
+ c.Errorf("node did not become active")
|
|
| 116 |
+ } |
|
| 117 |
+ time.Sleep(200 * time.Millisecond) |
|
| 118 |
+ } |
|
| 117 | 119 |
} |
| 118 | 120 |
|
| 119 | 121 |
func (s *DockerSwarmSuite) TestApiSwarmSecretAcceptance(c *check.C) {
|
| ... | ... |
@@ -236,7 +238,7 @@ func (s *DockerSwarmSuite) TestApiSwarmPromoteDemote(c *check.C) {
|
| 236 | 236 |
c.Assert(info.ControlAvailable, checker.Equals, false) |
| 237 | 237 |
c.Assert(info.LocalNodeState, checker.Equals, swarm.LocalNodeStateActive) |
| 238 | 238 |
|
| 239 |
- d1.updateNode(c, d1.getNode(c, d2.NodeID), func(n *swarm.Node) {
|
|
| 239 |
+ d1.updateNode(c, d2.NodeID, func(n *swarm.Node) {
|
|
| 240 | 240 |
n.Spec.Role = swarm.NodeRoleManager |
| 241 | 241 |
}) |
| 242 | 242 |
|
| ... | ... |
@@ -255,7 +257,7 @@ func (s *DockerSwarmSuite) TestApiSwarmPromoteDemote(c *check.C) {
|
| 255 | 255 |
time.Sleep(100 * time.Millisecond) |
| 256 | 256 |
} |
| 257 | 257 |
|
| 258 |
- d1.updateNode(c, d1.getNode(c, d2.NodeID), func(n *swarm.Node) {
|
|
| 258 |
+ d1.updateNode(c, d2.NodeID, func(n *swarm.Node) {
|
|
| 259 | 259 |
n.Spec.Role = swarm.NodeRoleWorker |
| 260 | 260 |
}) |
| 261 | 261 |
|
| ... | ... |
@@ -466,7 +468,7 @@ func (s *DockerSwarmSuite) TestApiSwarmNodeUpdate(c *check.C) {
|
| 466 | 466 |
|
| 467 | 467 |
nodes := d.listNodes(c) |
| 468 | 468 |
|
| 469 |
- d.updateNode(c, d.getNode(c, nodes[0].ID), func(n *swarm.Node) {
|
|
| 469 |
+ d.updateNode(c, nodes[0].ID, func(n *swarm.Node) {
|
|
| 470 | 470 |
n.Spec.Availability = swarm.NodeAvailabilityPause |
| 471 | 471 |
}) |
| 472 | 472 |
|
| ... | ... |
@@ -489,14 +491,14 @@ func (s *DockerSwarmSuite) TestApiSwarmNodeDrainPause(c *check.C) {
|
| 489 | 489 |
waitAndAssert(c, defaultReconciliationTimeout, reducedCheck(sumAsIntegers, d1.checkActiveContainerCount, d2.checkActiveContainerCount), checker.Equals, instances) |
| 490 | 490 |
|
| 491 | 491 |
// drain d2, all containers should move to d1 |
| 492 |
- d1.updateNode(c, d1.getNode(c, d2.NodeID), func(n *swarm.Node) {
|
|
| 492 |
+ d1.updateNode(c, d2.NodeID, func(n *swarm.Node) {
|
|
| 493 | 493 |
n.Spec.Availability = swarm.NodeAvailabilityDrain |
| 494 | 494 |
}) |
| 495 | 495 |
waitAndAssert(c, defaultReconciliationTimeout, d1.checkActiveContainerCount, checker.Equals, instances) |
| 496 | 496 |
waitAndAssert(c, defaultReconciliationTimeout, d2.checkActiveContainerCount, checker.Equals, 0) |
| 497 | 497 |
|
| 498 | 498 |
// set d2 back to active |
| 499 |
- d1.updateNode(c, d1.getNode(c, d2.NodeID), func(n *swarm.Node) {
|
|
| 499 |
+ d1.updateNode(c, d2.NodeID, func(n *swarm.Node) {
|
|
| 500 | 500 |
n.Spec.Availability = swarm.NodeAvailabilityActive |
| 501 | 501 |
}) |
| 502 | 502 |
|
| ... | ... |
@@ -516,7 +518,7 @@ func (s *DockerSwarmSuite) TestApiSwarmNodeDrainPause(c *check.C) {
|
| 516 | 516 |
d2ContainerCount := len(d2.activeContainers()) |
| 517 | 517 |
|
| 518 | 518 |
// set d2 to paused, scale service up, only d1 gets new tasks |
| 519 |
- d1.updateNode(c, d1.getNode(c, d2.NodeID), func(n *swarm.Node) {
|
|
| 519 |
+ d1.updateNode(c, d2.NodeID, func(n *swarm.Node) {
|
|
| 520 | 520 |
n.Spec.Availability = swarm.NodeAvailabilityPause |
| 521 | 521 |
}) |
| 522 | 522 |
|