Browse code

Avoid alignment of reapNetwork and tableEntries

Make sure that the network is garbage collected after
its entries. Deleting the entries requires that the network
is still present.

Signed-off-by: Flavio Crisciani <flavio.crisciani@docker.com>

Flavio Crisciani authored on 2017/09/23 02:23:07
Showing 3 changed files
... ...
@@ -17,11 +17,15 @@ import (
17 17
 )
18 18
 
19 19
 const (
20
-	reapInterval     = 30 * time.Minute
21
-	reapPeriod       = 5 * time.Second
22
-	retryInterval    = 1 * time.Second
23
-	nodeReapInterval = 24 * time.Hour
24
-	nodeReapPeriod   = 2 * time.Hour
20
+	// The garbage collection logic for entries leverages the presence of the network.
21
+	// For this reason the expiration time of the network is put slightly higher than the entry expiration so that
22
+	// there are at least 5 extra cycles to make sure that all the entries are properly deleted before deleting the network.
23
+	reapEntryInterval   = 30 * time.Minute
24
+	reapNetworkInterval = reapEntryInterval + 5*reapPeriod
25
+	reapPeriod          = 5 * time.Second
26
+	retryInterval       = 1 * time.Second
27
+	nodeReapInterval    = 24 * time.Hour
28
+	nodeReapPeriod      = 2 * time.Hour
25 29
 )
26 30
 
27 31
 type logWriter struct{}
... ...
@@ -300,8 +304,9 @@ func (nDB *NetworkDB) reconnectNode() {
300 300
 // the reaper runs. NOTE nDB.reapTableEntries updates the reapTime with a readlock. This
301 301
 // is safe as long as no other concurrent path touches the reapTime field.
302 302
 func (nDB *NetworkDB) reapState() {
303
-	nDB.reapNetworks()
303
+	// reapTableEntries leverages the presence of the network, so garbage collect the entries first
304 304
 	nDB.reapTableEntries()
305
+	nDB.reapNetworks()
305 306
 }
306 307
 
307 308
 func (nDB *NetworkDB) reapNetworks() {
... ...
@@ -414,8 +419,8 @@ func (nDB *NetworkDB) gossip() {
414 414
 		// Collect stats and print the queue info, note this code is here also to have a view of the queues empty
415 415
 		network.qMessagesSent += len(msgs)
416 416
 		if printStats {
417
-			logrus.Infof("NetworkDB stats - net:%s Entries:%d Queue qLen:%d netPeers:%d netMsg/s:%d",
418
-				nid, network.entriesNumber, broadcastQ.NumQueued(), broadcastQ.NumNodes(),
417
+			logrus.Infof("NetworkDB stats - netID:%s leaving:%t netPeers:%d entries:%d Queue qLen:%d netMsg/s:%d",
418
+				nid, network.leaving, broadcastQ.NumNodes(), network.entriesNumber, broadcastQ.NumQueued(),
419 419
 				network.qMessagesSent/int((nDB.config.StatsPrintPeriod/time.Second)))
420 420
 			network.qMessagesSent = 0
421 421
 		}
... ...
@@ -165,7 +165,7 @@ func (nDB *NetworkDB) handleNetworkEvent(nEvent *NetworkEvent) bool {
165 165
 		n.ltime = nEvent.LTime
166 166
 		n.leaving = nEvent.Type == NetworkEventTypeLeave
167 167
 		if n.leaving {
168
-			n.reapTime = reapInterval
168
+			n.reapTime = reapNetworkInterval
169 169
 
170 170
 			// The remote node is leaving the network, but not the gossip cluster.
171 171
 			// Mark all its entries in deleted state, this will guarantee that
... ...
@@ -242,7 +242,7 @@ func (nDB *NetworkDB) handleTableEvent(tEvent *TableEvent) bool {
242 242
 	// field. If that is not the case, this can be a BUG
243 243
 	if e.deleting && e.reapTime == 0 {
244 244
 		logrus.Warnf("handleTableEvent object %+v has a 0 reapTime, is the cluster running the same docker engine version?", tEvent)
245
-		e.reapTime = reapInterval
245
+		e.reapTime = reapEntryInterval
246 246
 	}
247 247
 
248 248
 	nDB.Lock()
... ...
@@ -405,7 +405,7 @@ func (nDB *NetworkDB) DeleteEntry(tname, nid, key string) error {
405 405
 		node:     nDB.config.NodeName,
406 406
 		value:    value,
407 407
 		deleting: true,
408
-		reapTime: reapInterval,
408
+		reapTime: reapEntryInterval,
409 409
 	}
410 410
 
411 411
 	if err := nDB.sendTableEvent(TableEventTypeDelete, nid, tname, key, entry); err != nil {
... ...
@@ -478,7 +478,7 @@ func (nDB *NetworkDB) deleteNodeNetworkEntries(nid, node string) {
478 478
 				node:     oldEntry.node,
479 479
 				value:    oldEntry.value,
480 480
 				deleting: true,
481
-				reapTime: reapInterval,
481
+				reapTime: reapEntryInterval,
482 482
 			}
483 483
 
484 484
 			// we arrived at this point in 2 cases:
... ...
@@ -619,8 +619,9 @@ func (nDB *NetworkDB) LeaveNetwork(nid string) error {
619 619
 		return fmt.Errorf("could not find network %s while trying to leave", nid)
620 620
 	}
621 621
 
622
+	logrus.Debugf("%s: leaving network %s", nDB.config.NodeName, nid)
622 623
 	n.ltime = ltime
623
-	n.reapTime = reapInterval
624
+	n.reapTime = reapNetworkInterval
624 625
 	n.leaving = true
625 626
 	return nil
626 627
 }