Browse code

libnetwork: make rejoin intervals configurable

This allows the rejoin intervals to be chosen according to the context
within which the component is used and, in particular, allows lower
intervals to be used within the TestNetworkDBIslands test.

Signed-off-by: Roman Volosatovs <roman.volosatovs@docker.com>

Roman Volosatovs authored on 2021/07/12 18:12:56
Showing 3 changed files
... ...
@@ -18,12 +18,10 @@ import (
18 18
 )
19 19
 
20 20
 const (
21
-	reapPeriod            = 5 * time.Second
22
-	rejoinClusterDuration = 10 * time.Second
23
-	rejoinInterval        = 60 * time.Second
24
-	retryInterval         = 1 * time.Second
25
-	nodeReapInterval      = 24 * time.Hour
26
-	nodeReapPeriod        = 2 * time.Hour
21
+	reapPeriod       = 5 * time.Second
22
+	retryInterval    = 1 * time.Second
23
+	nodeReapInterval = 24 * time.Hour
24
+	nodeReapPeriod   = 2 * time.Hour
27 25
 	// considering a cluster with > 20 nodes and a drain speed of 100 msg/s
28 26
 	// the following is roughly 1 minute
29 27
 	maxQueueLenBroadcastOnSync = 500
... ...
@@ -172,7 +170,7 @@ func (nDB *NetworkDB) clusterInit() error {
172 172
 		{config.PushPullInterval, nDB.bulkSyncTables},
173 173
 		{retryInterval, nDB.reconnectNode},
174 174
 		{nodeReapPeriod, nDB.reapDeadNode},
175
-		{rejoinInterval, nDB.rejoinClusterBootStrap},
175
+		{nDB.config.rejoinClusterInterval, nDB.rejoinClusterBootStrap},
176 176
 	} {
177 177
 		t := time.NewTicker(trigger.interval)
178 178
 		go nDB.triggerFunc(trigger.interval, t.C, trigger.fn)
... ...
@@ -210,7 +208,8 @@ func (nDB *NetworkDB) clusterJoin(members []string) error {
210 210
 
211 211
 	if _, err := mlist.Join(members); err != nil {
212 212
 		// In case of failure, we no longer need to explicitly call retryJoin.
213
-		// rejoinClusterBootStrap, which runs every minute, will retryJoin for 10sec
213
+		// rejoinClusterBootStrap, which runs every nDB.config.rejoinClusterInterval,
214
+		// will retryJoin for nDB.config.rejoinClusterDuration.
214 215
 		return fmt.Errorf("could not join node to memberlist: %v", err)
215 216
 	}
216 217
 
... ...
@@ -324,7 +323,7 @@ func (nDB *NetworkDB) rejoinClusterBootStrap() {
324 324
 	}
325 325
 	// None of the bootStrap nodes are in the cluster, call memberlist join
326 326
 	logrus.Debugf("rejoinClusterBootStrap, calling cluster join with bootStrap %v", bootStrapIPs)
327
-	ctx, cancel := context.WithTimeout(nDB.ctx, rejoinClusterDuration)
327
+	ctx, cancel := context.WithTimeout(nDB.ctx, nDB.config.rejoinClusterDuration)
328 328
 	defer cancel()
329 329
 	nDB.retryJoin(ctx, bootStrapIPs)
330 330
 }
... ...
@@ -192,6 +192,14 @@ type Config struct {
192 192
 	// NOTE this MUST always be higher than reapEntryInterval
193 193
 	reapNetworkInterval time.Duration
194 194
 
195
+	// rejoinClusterDuration represents retryJoin timeout used by rejoinClusterBootStrap.
196
+	// Default is 10sec.
197
+	rejoinClusterDuration time.Duration
198
+
199
+	// rejoinClusterInterval represents interval on which rejoinClusterBootStrap runs.
200
+	// Default is 60sec.
201
+	rejoinClusterInterval time.Duration
202
+
195 203
 	// StatsPrintPeriod the period to use to print queue stats
196 204
 	// Default is 5min
197 205
 	StatsPrintPeriod time.Duration
... ...
@@ -225,13 +233,15 @@ type entry struct {
225 225
 func DefaultConfig() *Config {
226 226
 	hostname, _ := os.Hostname()
227 227
 	return &Config{
228
-		NodeID:            stringid.TruncateID(stringid.GenerateRandomID()),
229
-		Hostname:          hostname,
230
-		BindAddr:          "0.0.0.0",
231
-		PacketBufferSize:  1400,
232
-		StatsPrintPeriod:  5 * time.Minute,
233
-		HealthPrintPeriod: 1 * time.Minute,
234
-		reapEntryInterval: 30 * time.Minute,
228
+		NodeID:                stringid.TruncateID(stringid.GenerateRandomID()),
229
+		Hostname:              hostname,
230
+		BindAddr:              "0.0.0.0",
231
+		PacketBufferSize:      1400,
232
+		StatsPrintPeriod:      5 * time.Minute,
233
+		HealthPrintPeriod:     1 * time.Minute,
234
+		reapEntryInterval:     30 * time.Minute,
235
+		rejoinClusterDuration: 10 * time.Second,
236
+		rejoinClusterInterval: 60 * time.Second,
235 237
 	}
236 238
 }
237 239
 
... ...
@@ -819,8 +819,24 @@ func TestParallelDelete(t *testing.T) {
819 819
 }
820 820
 
821 821
 func TestNetworkDBIslands(t *testing.T) {
822
+	pollTimeout := func() time.Duration {
823
+		const defaultTimeout = 120 * time.Second
824
+		dl, ok := t.Deadline()
825
+		if !ok {
826
+			return defaultTimeout
827
+		}
828
+		if d := time.Until(dl); d <= defaultTimeout {
829
+			return d
830
+		}
831
+		return defaultTimeout
832
+	}
833
+
822 834
 	logrus.SetLevel(logrus.DebugLevel)
823
-	dbs := createNetworkDBInstances(t, 5, "node", DefaultConfig())
835
+	conf := DefaultConfig()
836
+	// Shorten durations to speed up test execution.
837
+	conf.rejoinClusterDuration = conf.rejoinClusterDuration / 10
838
+	conf.rejoinClusterInterval = conf.rejoinClusterInterval / 10
839
+	dbs := createNetworkDBInstances(t, 5, "node", conf)
824 840
 
825 841
 	// Get the node IP used currently
826 842
 	node := dbs[0].nodes[dbs[0].config.NodeID]
... ...
@@ -868,7 +884,7 @@ func TestNetworkDBIslands(t *testing.T) {
868 868
 		}
869 869
 		return poll.Success()
870 870
 	}
871
-	poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(120*time.Second))
871
+	poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(pollTimeout()))
872 872
 
873 873
 	// Spawn again the first 3 nodes with different names but same IP:port
874 874
 	for i := 0; i < 3; i++ {
... ...
@@ -877,7 +893,7 @@ func TestNetworkDBIslands(t *testing.T) {
877 877
 		dbs[i] = launchNode(t, *dbs[i].config)
878 878
 	}
879 879
 
880
-	// Give some time for the reconnect routine to run, it runs every 60s
880
+	// Give some time for the reconnect routine to run, it runs every 6s.
881 881
 	check = func(t poll.LogT) poll.Result {
882 882
 		// Verify that the cluster is again all connected. Note that the 3 previous nodes did not do any join
883 883
 		for i := 0; i < 5; i++ {
... ...
@@ -908,6 +924,6 @@ func TestNetworkDBIslands(t *testing.T) {
908 908
 		}
909 909
 		return poll.Success()
910 910
 	}
911
-	poll.WaitOn(t, check, poll.WithDelay(10*time.Second), poll.WithTimeout(120*time.Second))
911
+	poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(pollTimeout()))
912 912
 	closeNetworkDBInstances(t, dbs)
913 913
 }