Browse code

Add test to confirm garbage collection

- Create a test to verify that a node that joins
in an async way is not going to extend the life
of a already deleted object

Signed-off-by: Flavio Crisciani <flavio.crisciani@docker.com>

Flavio Crisciani authored on 2017/09/27 02:29:33
Showing 4 changed files
... ...
@@ -17,15 +17,10 @@ import (
17 17
 )
18 18
 
19 19
 const (
20
-	// The garbage collection logic for entries leverage the presence of the network.
21
-	// For this reason the expiration time of the network is put slightly higher than the entry expiration so that
22
-	// there is at least 5 extra cycle to make sure that all the entries are properly deleted before deleting the network.
23
-	reapEntryInterval   = 30 * time.Minute
24
-	reapNetworkInterval = reapEntryInterval + 5*reapPeriod
25
-	reapPeriod          = 5 * time.Second
26
-	retryInterval       = 1 * time.Second
27
-	nodeReapInterval    = 24 * time.Hour
28
-	nodeReapPeriod      = 2 * time.Hour
20
+	reapPeriod       = 5 * time.Second
21
+	retryInterval    = 1 * time.Second
22
+	nodeReapInterval = 24 * time.Hour
23
+	nodeReapPeriod   = 2 * time.Hour
29 24
 )
30 25
 
31 26
 type logWriter struct{}
... ...
@@ -93,14 +93,14 @@ func (nDB *NetworkDB) handleNodeEvent(nEvent *NodeEvent) bool {
93 93
 		nDB.nodes[n.Name] = n
94 94
 		nDB.Unlock()
95 95
 		if !found {
96
-			logrus.Infof("Node join event for %s/%s", n.Name, n.Addr)
96
+			logrus.Infof("%v(%v): Node join event for %s/%s", nDB.config.Hostname, nDB.config.NodeID, n.Name, n.Addr)
97 97
 		}
98 98
 		return true
99 99
 	case NodeEventTypeLeave:
100 100
 		nDB.Lock()
101 101
 		nDB.leftNodes[n.Name] = n
102 102
 		nDB.Unlock()
103
-		logrus.Infof("Node leave event for %s/%s", n.Name, n.Addr)
103
+		logrus.Infof("%v(%v): Node leave event for %s/%s", nDB.config.Hostname, nDB.config.NodeID, n.Name, n.Addr)
104 104
 		return true
105 105
 	}
106 106
 
... ...
@@ -140,7 +140,7 @@ func (nDB *NetworkDB) handleNetworkEvent(nEvent *NetworkEvent) bool {
140 140
 		n.ltime = nEvent.LTime
141 141
 		n.leaving = nEvent.Type == NetworkEventTypeLeave
142 142
 		if n.leaving {
143
-			n.reapTime = reapNetworkInterval
143
+			n.reapTime = nDB.config.reapNetworkInterval
144 144
 
145 145
 			// The remote node is leaving the network, but not the gossip cluster.
146 146
 			// Mark all its entries in deleted state, this will guarantee that
... ...
@@ -216,8 +216,9 @@ func (nDB *NetworkDB) handleTableEvent(tEvent *TableEvent) bool {
216 216
 	// This case can happen if the cluster is running different versions of the engine where the old version does not have the
217 217
 	// field. If that is not the case, this can be a BUG
218 218
 	if e.deleting && e.reapTime == 0 {
219
-		logrus.Warnf("handleTableEvent object %+v has a 0 reapTime, is the cluster running the same docker engine version?", tEvent)
220
-		e.reapTime = reapEntryInterval
219
+		logrus.Warnf("%v(%v) handleTableEvent object %+v has a 0 reapTime, is the cluster running the same docker engine version?",
220
+			nDB.config.Hostname, nDB.config.NodeID, tEvent)
221
+		e.reapTime = nDB.config.reapEntryInterval
221 222
 	}
222 223
 
223 224
 	nDB.Lock()
... ...
@@ -229,7 +230,7 @@ func (nDB *NetworkDB) handleTableEvent(tEvent *TableEvent) bool {
229 229
 		// If the residual reapTime is lower or equal to 1/6 of the total reapTime don't bother broadcasting it around
230 230
 		// most likely the cluster is already aware of it, if not who will sync with this node will catch the state too.
231 231
 		// This also avoids that deletion of entries close to their garbage collection ends up circuling around forever
232
-		return e.reapTime > reapEntryInterval/6
232
+		return e.reapTime > nDB.config.reapEntryInterval/6
233 233
 	}
234 234
 
235 235
 	var op opType
... ...
@@ -181,6 +181,13 @@ type Config struct {
181 181
 	// be able to increase this to get more content into each gossip packet.
182 182
 	PacketBufferSize int
183 183
 
184
+	// reapEntryInterval duration of a deleted entry before being garbage collected
185
+	reapEntryInterval time.Duration
186
+
187
+	// reapNetworkInterval duration of a delted network before being garbage collected
188
+	// NOTE this MUST always be higher than reapEntryInterval
189
+	reapNetworkInterval time.Duration
190
+
184 191
 	// StatsPrintPeriod the period to use to print queue stats
185 192
 	// Default is 5min
186 193
 	StatsPrintPeriod time.Duration
... ...
@@ -220,12 +227,18 @@ func DefaultConfig() *Config {
220 220
 		PacketBufferSize:  1400,
221 221
 		StatsPrintPeriod:  5 * time.Minute,
222 222
 		HealthPrintPeriod: 1 * time.Minute,
223
+		reapEntryInterval: 30 * time.Minute,
223 224
 	}
224 225
 }
225 226
 
226 227
 // New creates a new instance of NetworkDB using the Config passed by
227 228
 // the caller.
228 229
 func New(c *Config) (*NetworkDB, error) {
230
+	// The garbage collection logic for entries leverage the presence of the network.
231
+	// For this reason the expiration time of the network is put slightly higher than the entry expiration so that
232
+	// there is at least 5 extra cycle to make sure that all the entries are properly deleted before deleting the network.
233
+	c.reapNetworkInterval = c.reapEntryInterval + 5*reapPeriod
234
+
229 235
 	nDB := &NetworkDB{
230 236
 		config:         c,
231 237
 		indexes:        make(map[int]*radix.Tree),
... ...
@@ -241,7 +254,7 @@ func New(c *Config) (*NetworkDB, error) {
241 241
 	nDB.indexes[byTable] = radix.New()
242 242
 	nDB.indexes[byNetwork] = radix.New()
243 243
 
244
-	logrus.Debugf("New memberlist node - Node:%v will use memberlist nodeID:%v", c.Hostname, c.NodeID)
244
+	logrus.Infof("New memberlist node - Node:%v will use memberlist nodeID:%v with config:%+v", c.Hostname, c.NodeID, c)
245 245
 	if err := nDB.clusterInit(); err != nil {
246 246
 		return nil, err
247 247
 	}
... ...
@@ -414,7 +427,7 @@ func (nDB *NetworkDB) DeleteEntry(tname, nid, key string) error {
414 414
 		node:     nDB.config.NodeID,
415 415
 		value:    value,
416 416
 		deleting: true,
417
-		reapTime: reapEntryInterval,
417
+		reapTime: nDB.config.reapEntryInterval,
418 418
 	}
419 419
 
420 420
 	if err := nDB.sendTableEvent(TableEventTypeDelete, nid, tname, key, entry); err != nil {
... ...
@@ -487,7 +500,7 @@ func (nDB *NetworkDB) deleteNodeNetworkEntries(nid, node string) {
487 487
 				node:     oldEntry.node,
488 488
 				value:    oldEntry.value,
489 489
 				deleting: true,
490
-				reapTime: reapEntryInterval,
490
+				reapTime: nDB.config.reapEntryInterval,
491 491
 			}
492 492
 
493 493
 			// we arrived at this point in 2 cases:
... ...
@@ -632,7 +645,7 @@ func (nDB *NetworkDB) LeaveNetwork(nid string) error {
632 632
 
633 633
 	logrus.Debugf("%v(%v): leaving network %s", nDB.config.Hostname, nDB.config.NodeID, nid)
634 634
 	n.ltime = ltime
635
-	n.reapTime = reapNetworkInterval
635
+	n.reapTime = nDB.config.reapNetworkInterval
636 636
 	n.leaving = true
637 637
 	return nil
638 638
 }
... ...
@@ -10,6 +10,7 @@ import (
10 10
 	"testing"
11 11
 	"time"
12 12
 
13
+	"github.com/docker/docker/pkg/stringid"
13 14
 	"github.com/docker/go-events"
14 15
 	"github.com/sirupsen/logrus"
15 16
 	"github.com/stretchr/testify/assert"
... ...
@@ -27,13 +28,14 @@ func TestMain(m *testing.M) {
27 27
 	os.Exit(m.Run())
28 28
 }
29 29
 
30
-func createNetworkDBInstances(t *testing.T, num int, namePrefix string) []*NetworkDB {
30
+func createNetworkDBInstances(t *testing.T, num int, namePrefix string, conf *Config) []*NetworkDB {
31 31
 	var dbs []*NetworkDB
32 32
 	for i := 0; i < num; i++ {
33
-		conf := DefaultConfig()
34
-		conf.Hostname = fmt.Sprintf("%s%d", namePrefix, i+1)
35
-		conf.BindPort = int(atomic.AddInt32(&dbPort, 1))
36
-		db, err := New(conf)
33
+		localConfig := *conf
34
+		localConfig.Hostname = fmt.Sprintf("%s%d", namePrefix, i+1)
35
+		localConfig.NodeID = stringid.TruncateID(stringid.GenerateRandomID())
36
+		localConfig.BindPort = int(atomic.AddInt32(&dbPort, 1))
37
+		db, err := New(&localConfig)
37 38
 		require.NoError(t, err)
38 39
 
39 40
 		if i != 0 {
... ...
@@ -44,10 +46,19 @@ func createNetworkDBInstances(t *testing.T, num int, namePrefix string) []*Netwo
44 44
 		dbs = append(dbs, db)
45 45
 	}
46 46
 
47
+	// Check that the cluster is properly created
48
+	for i := 0; i < num; i++ {
49
+		if num != len(dbs[i].ClusterPeers()) {
50
+			t.Fatalf("Number of nodes for %s into the cluster does not match %d != %d",
51
+				dbs[i].config.Hostname, num, len(dbs[i].ClusterPeers()))
52
+		}
53
+	}
54
+
47 55
 	return dbs
48 56
 }
49 57
 
50 58
 func closeNetworkDBInstances(dbs []*NetworkDB) {
59
+	log.Print("Closing DB instances...")
51 60
 	for _, db := range dbs {
52 61
 		db.Close()
53 62
 	}
... ...
@@ -147,12 +158,12 @@ func testWatch(t *testing.T, ch chan events.Event, ev interface{}, tname, nid, k
147 147
 }
148 148
 
149 149
 func TestNetworkDBSimple(t *testing.T) {
150
-	dbs := createNetworkDBInstances(t, 2, "node")
150
+	dbs := createNetworkDBInstances(t, 2, "node", DefaultConfig())
151 151
 	closeNetworkDBInstances(dbs)
152 152
 }
153 153
 
154 154
 func TestNetworkDBJoinLeaveNetwork(t *testing.T) {
155
-	dbs := createNetworkDBInstances(t, 2, "node")
155
+	dbs := createNetworkDBInstances(t, 2, "node", DefaultConfig())
156 156
 
157 157
 	err := dbs[0].JoinNetwork("network1")
158 158
 	assert.NoError(t, err)
... ...
@@ -167,7 +178,7 @@ func TestNetworkDBJoinLeaveNetwork(t *testing.T) {
167 167
 }
168 168
 
169 169
 func TestNetworkDBJoinLeaveNetworks(t *testing.T) {
170
-	dbs := createNetworkDBInstances(t, 2, "node")
170
+	dbs := createNetworkDBInstances(t, 2, "node", DefaultConfig())
171 171
 
172 172
 	n := 10
173 173
 	for i := 1; i <= n; i++ {
... ...
@@ -210,7 +221,7 @@ func TestNetworkDBJoinLeaveNetworks(t *testing.T) {
210 210
 }
211 211
 
212 212
 func TestNetworkDBCRUDTableEntry(t *testing.T) {
213
-	dbs := createNetworkDBInstances(t, 3, "node")
213
+	dbs := createNetworkDBInstances(t, 3, "node", DefaultConfig())
214 214
 
215 215
 	err := dbs[0].JoinNetwork("network1")
216 216
 	assert.NoError(t, err)
... ...
@@ -240,7 +251,7 @@ func TestNetworkDBCRUDTableEntry(t *testing.T) {
240 240
 }
241 241
 
242 242
 func TestNetworkDBCRUDTableEntries(t *testing.T) {
243
-	dbs := createNetworkDBInstances(t, 2, "node")
243
+	dbs := createNetworkDBInstances(t, 2, "node", DefaultConfig())
244 244
 
245 245
 	err := dbs[0].JoinNetwork("network1")
246 246
 	assert.NoError(t, err)
... ...
@@ -308,7 +319,7 @@ func TestNetworkDBCRUDTableEntries(t *testing.T) {
308 308
 }
309 309
 
310 310
 func TestNetworkDBNodeLeave(t *testing.T) {
311
-	dbs := createNetworkDBInstances(t, 2, "node")
311
+	dbs := createNetworkDBInstances(t, 2, "node", DefaultConfig())
312 312
 
313 313
 	err := dbs[0].JoinNetwork("network1")
314 314
 	assert.NoError(t, err)
... ...
@@ -327,7 +338,7 @@ func TestNetworkDBNodeLeave(t *testing.T) {
327 327
 }
328 328
 
329 329
 func TestNetworkDBWatch(t *testing.T) {
330
-	dbs := createNetworkDBInstances(t, 2, "node")
330
+	dbs := createNetworkDBInstances(t, 2, "node", DefaultConfig())
331 331
 	err := dbs[0].JoinNetwork("network1")
332 332
 	assert.NoError(t, err)
333 333
 
... ...
@@ -356,7 +367,7 @@ func TestNetworkDBWatch(t *testing.T) {
356 356
 }
357 357
 
358 358
 func TestNetworkDBBulkSync(t *testing.T) {
359
-	dbs := createNetworkDBInstances(t, 2, "node")
359
+	dbs := createNetworkDBInstances(t, 2, "node", DefaultConfig())
360 360
 
361 361
 	err := dbs[0].JoinNetwork("network1")
362 362
 	assert.NoError(t, err)
... ...
@@ -389,7 +400,7 @@ func TestNetworkDBBulkSync(t *testing.T) {
389 389
 func TestNetworkDBCRUDMediumCluster(t *testing.T) {
390 390
 	n := 5
391 391
 
392
-	dbs := createNetworkDBInstances(t, n, "node")
392
+	dbs := createNetworkDBInstances(t, n, "node", DefaultConfig())
393 393
 
394 394
 	for i := 0; i < n; i++ {
395 395
 		for j := 0; j < n; j++ {
... ...
@@ -433,13 +444,12 @@ func TestNetworkDBCRUDMediumCluster(t *testing.T) {
433 433
 		dbs[i].verifyEntryExistence(t, "test_table", "network1", "test_key", "", false)
434 434
 	}
435 435
 
436
-	log.Print("Closing DB instances...")
437 436
 	closeNetworkDBInstances(dbs)
438 437
 }
439 438
 
440 439
 func TestNetworkDBNodeJoinLeaveIteration(t *testing.T) {
441 440
 	maxRetry := 5
442
-	dbs := createNetworkDBInstances(t, 2, "node")
441
+	dbs := createNetworkDBInstances(t, 2, "node", DefaultConfig())
443 442
 
444 443
 	// Single node Join/Leave
445 444
 	err := dbs[0].JoinNetwork("network1")
... ...
@@ -517,6 +527,56 @@ func TestNetworkDBNodeJoinLeaveIteration(t *testing.T) {
517 517
 		t.Fatalf("The networkNodes list has to have be 2 instead of %d - %v", len(dbs[1].networkNodes["network1"]), dbs[1].networkNodes["network1"])
518 518
 	}
519 519
 
520
-	dbs[0].Close()
521
-	dbs[1].Close()
520
+	closeNetworkDBInstances(dbs)
521
+}
522
+
523
+func TestNetworkDBGarbageCollection(t *testing.T) {
524
+	keysWriteDelete := 5
525
+	config := DefaultConfig()
526
+	config.reapEntryInterval = 30 * time.Second
527
+	config.StatsPrintPeriod = 15 * time.Second
528
+
529
+	dbs := createNetworkDBInstances(t, 3, "node", config)
530
+
531
+	// 2 Nodes join network
532
+	err := dbs[0].JoinNetwork("network1")
533
+	assert.NoError(t, err)
534
+
535
+	err = dbs[1].JoinNetwork("network1")
536
+	assert.NoError(t, err)
537
+
538
+	for i := 0; i < keysWriteDelete; i++ {
539
+		err = dbs[i%2].CreateEntry("testTable", "network1", "key-"+string(i), []byte("value"))
540
+		assert.NoError(t, err)
541
+	}
542
+	time.Sleep(time.Second)
543
+	for i := 0; i < keysWriteDelete; i++ {
544
+		err = dbs[i%2].DeleteEntry("testTable", "network1", "key-"+string(i))
545
+		assert.NoError(t, err)
546
+	}
547
+	for i := 0; i < 2; i++ {
548
+		assert.Equal(t, keysWriteDelete, dbs[i].networks[dbs[i].config.NodeID]["network1"].entriesNumber, "entries number should match")
549
+	}
550
+
551
+	// from this point the timer for the garbage collection started, wait 5 seconds and then join a new node
552
+	time.Sleep(5 * time.Second)
553
+
554
+	err = dbs[2].JoinNetwork("network1")
555
+	assert.NoError(t, err)
556
+	for i := 0; i < 3; i++ {
557
+		assert.Equal(t, keysWriteDelete, dbs[i].networks[dbs[i].config.NodeID]["network1"].entriesNumber, "entries number should match")
558
+	}
559
+	// at this point the entries should had been all deleted
560
+	time.Sleep(30 * time.Second)
561
+	for i := 0; i < 3; i++ {
562
+		assert.Equal(t, 0, dbs[i].networks[dbs[i].config.NodeID]["network1"].entriesNumber, "entries should had been garbage collected")
563
+	}
564
+
565
+	// make sure that entries are not coming back
566
+	time.Sleep(15 * time.Second)
567
+	for i := 0; i < 3; i++ {
568
+		assert.Equal(t, 0, dbs[i].networks[dbs[i].config.NodeID]["network1"].entriesNumber, "entries should had been garbage collected")
569
+	}
570
+
571
+	closeNetworkDBInstances(dbs)
522 572
 }