Browse code

Fix race in `TestApiSwarmRestartCluster`

`TestApiSwarmRestartCluster` calls `checkClusterHealth`, which in turn
calls `d.info()`. `d.info()` returns an error if there is no cluster
leader, but `checkClusterHealth` asserts a nil error immediately,
without giving the cluster any time to elect a leader.

This moves the `d.info()` call into a `waitAndAssert` using the default
reconciliation timeout.

It also moves some other checks into a `waitAndAssert` to give the
cluster enough time to come back up.

Signed-off-by: Brian Goff <cpuguy83@gmail.com>

Brian Goff authored on 2016/07/27 00:44:46
Showing 1 changed file
... ...
@@ -823,19 +823,49 @@ func setGlobalMode(s *swarm.Service) {
823 823
 
824 824
 func checkClusterHealth(c *check.C, cl []*SwarmDaemon, managerCount, workerCount int) {
825 825
 	var totalMCount, totalWCount int
826
+
826 827
 	for _, d := range cl {
827
-		info, err := d.info()
828
-		c.Assert(err, check.IsNil)
828
+		var (
829
+			info swarm.Info
830
+			err  error
831
+		)
832
+
833
+		// check info in a waitAndAssert, because if the cluster doesn't have a leader, `info` will return an error
834
+		checkInfo := func(c *check.C) (interface{}, check.CommentInterface) {
835
+			info, err = d.info()
836
+			return err, check.Commentf("cluster not ready in time")
837
+		}
838
+		waitAndAssert(c, defaultReconciliationTimeout, checkInfo, checker.IsNil)
829 839
 		if !info.ControlAvailable {
830 840
 			totalWCount++
831 841
 			continue
832 842
 		}
843
+
833 844
 		var leaderFound bool
834 845
 		totalMCount++
835 846
 		var mCount, wCount int
847
+
836 848
 		for _, n := range d.listNodes(c) {
837
-			c.Assert(n.Status.State, checker.Equals, swarm.NodeStateReady, check.Commentf("state of node %s, reported by %s", n.ID, d.Info.NodeID))
838
-			c.Assert(n.Spec.Availability, checker.Equals, swarm.NodeAvailabilityActive, check.Commentf("availability of node %s, reported by %s", n.ID, d.Info.NodeID))
849
+			waitReady := func(c *check.C) (interface{}, check.CommentInterface) {
850
+				if n.Status.State == swarm.NodeStateReady {
851
+					return true, nil
852
+				}
853
+				nn := d.getNode(c, n.ID)
854
+				n = *nn
855
+				return n.Status.State == swarm.NodeStateReady, check.Commentf("state of node %s, reported by %s", n.ID, d.Info.NodeID)
856
+			}
857
+			waitAndAssert(c, defaultReconciliationTimeout, waitReady, checker.True)
858
+
859
+			waitActive := func(c *check.C) (interface{}, check.CommentInterface) {
860
+				if n.Spec.Availability == swarm.NodeAvailabilityActive {
861
+					return true, nil
862
+				}
863
+				nn := d.getNode(c, n.ID)
864
+				n = *nn
865
+				return n.Spec.Availability == swarm.NodeAvailabilityActive, check.Commentf("availability of node %s, reported by %s", n.ID, d.Info.NodeID)
866
+			}
867
+			waitAndAssert(c, defaultReconciliationTimeout, waitActive, checker.True)
868
+
839 869
 			if n.Spec.Role == swarm.NodeRoleManager {
840 870
 				c.Assert(n.ManagerStatus, checker.NotNil, check.Commentf("manager status of node %s (manager), reported by %s", n.ID, d.Info.NodeID))
841 871
 				if n.ManagerStatus.Leader {