GitList

Browse code

libnetwork: make rejoin intervals configurable

This allows the rejoin intervals to be chosen according to the context
in which the component is used and, in particular, allows lower
intervals to be used in the TestNetworkDBIslands test.

Signed-off-by: Roman Volosatovs <roman.volosatovs@docker.com>

Roman Volosatovs authored on 2021/07/12 18:12:56
Showing 3 changed files

libnetwork/networkdb/cluster.go index b388cae..6a633df 100644
libnetwork/networkdb/networkdb.go index 7655f83..bc78e48 100644
libnetwork/networkdb/networkdb_test.go index 2dc4f88..6691f01 100644

libnetwork/networkdb/cluster.go

History View file @ d7a2635

@@ -18,12 +18,10 @@ import (
+                     )
                      const (
                     -	reapPeriod            = 5 * time.Second
                     -	rejoinClusterDuration = 10 * time.Second
                     -	rejoinInterval        = 60 * time.Second
                     -	retryInterval         = 1 * time.Second
                     -	nodeReapInterval      = 24 * time.Hour
                     -	nodeReapPeriod        = 2 * time.Hour
                     +	reapPeriod       = 5 * time.Second
                     +	retryInterval    = 1 * time.Second
                     +	nodeReapInterval = 24 * time.Hour
                     +	nodeReapPeriod   = 2 * time.Hour
                      	// considering a cluster with > 20 nodes and a drain speed of 100 msg/s
                      	// the following is roughly 1 minute
                      	maxQueueLenBroadcastOnSync = 500
@@ -172,7 +170,7 @@ func (nDB *NetworkDB) clusterInit() error {
                      		{config.PushPullInterval, nDB.bulkSyncTables},
                      		{retryInterval, nDB.reconnectNode},
                      		{nodeReapPeriod, nDB.reapDeadNode},
                     -		{rejoinInterval, nDB.rejoinClusterBootStrap},
                     +		{nDB.config.rejoinClusterInterval, nDB.rejoinClusterBootStrap},
                      	} {
                      		t := time.NewTicker(trigger.interval)
                      		go nDB.triggerFunc(trigger.interval, t.C, trigger.fn)
@@ -210,7 +208,8 @@ func (nDB *NetworkDB) clusterJoin(members []string) error {
                      	if _, err := mlist.Join(members); err != nil {
                      		// In case of failure, we no longer need to explicitly call retryJoin.
                     -		// rejoinClusterBootStrap, which runs every minute, will retryJoin for 10sec
                     +		// rejoinClusterBootStrap, which runs every nDB.config.rejoinClusterInterval,
                     +		// will retryJoin for nDB.config.rejoinClusterDuration.
                      		return fmt.Errorf("could not join node to memberlist: %v", err)
+                     	}
@@ -324,7 +323,7 @@ func (nDB *NetworkDB) rejoinClusterBootStrap() {
+                     	}
                      	// None of the bootStrap nodes are in the cluster, call memberlist join
                      	logrus.Debugf("rejoinClusterBootStrap, calling cluster join with bootStrap %v", bootStrapIPs)
                     -	ctx, cancel := context.WithTimeout(nDB.ctx, rejoinClusterDuration)
                     +	ctx, cancel := context.WithTimeout(nDB.ctx, nDB.config.rejoinClusterDuration)
                      	defer cancel()
                      	nDB.retryJoin(ctx, bootStrapIPs)
+                     }

libnetwork/networkdb/networkdb.go

History View file @ d7a2635

@@ -192,6 +192,14 @@ type Config struct {
                      	// NOTE this MUST always be higher than reapEntryInterval
                      	reapNetworkInterval time.Duration
                     +	// rejoinClusterDuration represents retryJoin timeout used by rejoinClusterBootStrap.
                     +	// Default is 10sec.
                     +	rejoinClusterDuration time.Duration
+                    +
                     +	// rejoinClusterInterval represents interval on which rejoinClusterBootStrap runs.
                     +	// Default is 60sec.
                     +	rejoinClusterInterval time.Duration
+                    +
                      	// StatsPrintPeriod the period to use to print queue stats
                      	// Default is 5min
                      	StatsPrintPeriod time.Duration
@@ -225,13 +233,15 @@ type entry struct {
                      func DefaultConfig() *Config {
                      	hostname, _ := os.Hostname()
                      	return &Config{
                     -		NodeID:            stringid.TruncateID(stringid.GenerateRandomID()),
                     -		Hostname:          hostname,
                     -		BindAddr:          "0.0.0.0",
                     -		PacketBufferSize:  1400,
                     -		StatsPrintPeriod:  5 * time.Minute,
                     -		HealthPrintPeriod: 1 * time.Minute,
                     -		reapEntryInterval: 30 * time.Minute,
                     +		NodeID:                stringid.TruncateID(stringid.GenerateRandomID()),
                     +		Hostname:              hostname,
                     +		BindAddr:              "0.0.0.0",
                     +		PacketBufferSize:      1400,
                     +		StatsPrintPeriod:      5 * time.Minute,
                     +		HealthPrintPeriod:     1 * time.Minute,
                     +		reapEntryInterval:     30 * time.Minute,
                     +		rejoinClusterDuration: 10 * time.Second,
                     +		rejoinClusterInterval: 60 * time.Second,
+                     	}
+                     }

libnetwork/networkdb/networkdb_test.go

History View file @ d7a2635

@@ -819,8 +819,24 @@ func TestParallelDelete(t *testing.T) {
+                     }
                      func TestNetworkDBIslands(t *testing.T) {
                     +	pollTimeout := func() time.Duration {
                     +		const defaultTimeout = 120 * time.Second
                     +		dl, ok := t.Deadline()
                     +		if !ok {
                     +			return defaultTimeout
                     +		}
                     +		if d := time.Until(dl); d <= defaultTimeout {
                     +			return d
                     +		}
                     +		return defaultTimeout
                     +	}
+                    +
                      	logrus.SetLevel(logrus.DebugLevel)
                     -	dbs := createNetworkDBInstances(t, 5, "node", DefaultConfig())
                     +	conf := DefaultConfig()
                     +	// Shorten durations to speed up test execution.
                     +	conf.rejoinClusterDuration = conf.rejoinClusterDuration / 10
                     +	conf.rejoinClusterInterval = conf.rejoinClusterInterval / 10
                     +	dbs := createNetworkDBInstances(t, 5, "node", conf)
                      	// Get the node IP used currently
                      	node := dbs[0].nodes[dbs[0].config.NodeID]
@@ -868,7 +884,7 @@ func TestNetworkDBIslands(t *testing.T) {
+                     		}
                      		return poll.Success()
+                     	}
                     -	poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(120*time.Second))
                     +	poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(pollTimeout()))
                      	// Spawn again the first 3 nodes with different names but same IP:port
                      	for i := 0; i < 3; i++ {
@@ -877,7 +893,7 @@ func TestNetworkDBIslands(t *testing.T) {
                      		dbs[i] = launchNode(t, *dbs[i].config)
+                     	}
                     -	// Give some time for the reconnect routine to run, it runs every 60s
                     +	// Give some time for the reconnect routine to run, it runs every 6s.
                      	check = func(t poll.LogT) poll.Result {
                      		// Verify that the cluster is again all connected. Note that the 3 previous nodes did not do any join
                      		for i := 0; i < 5; i++ {
@@ -908,6 +924,6 @@ func TestNetworkDBIslands(t *testing.T) {
+                     		}
                      		return poll.Success()
+                     	}
                     -	poll.WaitOn(t, check, poll.WithDelay(10*time.Second), poll.WithTimeout(120*time.Second))
                     +	poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(pollTimeout()))
                      	closeNetworkDBInstances(t, dbs)
+                     }