- Create a test to verify that a node that joins
in an async way is not going to extend the life
of a already deleted object
Signed-off-by: Flavio Crisciani <flavio.crisciani@docker.com>
| ... | ... |
@@ -17,15 +17,10 @@ import ( |
| 17 | 17 |
) |
| 18 | 18 |
|
| 19 | 19 |
const ( |
| 20 |
- // The garbage collection logic for entries leverage the presence of the network. |
|
| 21 |
- // For this reason the expiration time of the network is put slightly higher than the entry expiration so that |
|
| 22 |
- // there is at least 5 extra cycle to make sure that all the entries are properly deleted before deleting the network. |
|
| 23 |
- reapEntryInterval = 30 * time.Minute |
|
| 24 |
- reapNetworkInterval = reapEntryInterval + 5*reapPeriod |
|
| 25 |
- reapPeriod = 5 * time.Second |
|
| 26 |
- retryInterval = 1 * time.Second |
|
| 27 |
- nodeReapInterval = 24 * time.Hour |
|
| 28 |
- nodeReapPeriod = 2 * time.Hour |
|
| 20 |
+ reapPeriod = 5 * time.Second |
|
| 21 |
+ retryInterval = 1 * time.Second |
|
| 22 |
+ nodeReapInterval = 24 * time.Hour |
|
| 23 |
+ nodeReapPeriod = 2 * time.Hour |
|
| 29 | 24 |
) |
| 30 | 25 |
|
| 31 | 26 |
type logWriter struct{}
|
| ... | ... |
@@ -93,14 +93,14 @@ func (nDB *NetworkDB) handleNodeEvent(nEvent *NodeEvent) bool {
|
| 93 | 93 |
nDB.nodes[n.Name] = n |
| 94 | 94 |
nDB.Unlock() |
| 95 | 95 |
if !found {
|
| 96 |
- logrus.Infof("Node join event for %s/%s", n.Name, n.Addr)
|
|
| 96 |
+ logrus.Infof("%v(%v): Node join event for %s/%s", nDB.config.Hostname, nDB.config.NodeID, n.Name, n.Addr)
|
|
| 97 | 97 |
} |
| 98 | 98 |
return true |
| 99 | 99 |
case NodeEventTypeLeave: |
| 100 | 100 |
nDB.Lock() |
| 101 | 101 |
nDB.leftNodes[n.Name] = n |
| 102 | 102 |
nDB.Unlock() |
| 103 |
- logrus.Infof("Node leave event for %s/%s", n.Name, n.Addr)
|
|
| 103 |
+ logrus.Infof("%v(%v): Node leave event for %s/%s", nDB.config.Hostname, nDB.config.NodeID, n.Name, n.Addr)
|
|
| 104 | 104 |
return true |
| 105 | 105 |
} |
| 106 | 106 |
|
| ... | ... |
@@ -140,7 +140,7 @@ func (nDB *NetworkDB) handleNetworkEvent(nEvent *NetworkEvent) bool {
|
| 140 | 140 |
n.ltime = nEvent.LTime |
| 141 | 141 |
n.leaving = nEvent.Type == NetworkEventTypeLeave |
| 142 | 142 |
if n.leaving {
|
| 143 |
- n.reapTime = reapNetworkInterval |
|
| 143 |
+ n.reapTime = nDB.config.reapNetworkInterval |
|
| 144 | 144 |
|
| 145 | 145 |
// The remote node is leaving the network, but not the gossip cluster. |
| 146 | 146 |
// Mark all its entries in deleted state, this will guarantee that |
| ... | ... |
@@ -216,8 +216,9 @@ func (nDB *NetworkDB) handleTableEvent(tEvent *TableEvent) bool {
|
| 216 | 216 |
// This case can happen if the cluster is running different versions of the engine where the old version does not have the |
| 217 | 217 |
// field. If that is not the case, this can be a BUG |
| 218 | 218 |
if e.deleting && e.reapTime == 0 {
|
| 219 |
- logrus.Warnf("handleTableEvent object %+v has a 0 reapTime, is the cluster running the same docker engine version?", tEvent)
|
|
| 220 |
- e.reapTime = reapEntryInterval |
|
| 219 |
+ logrus.Warnf("%v(%v) handleTableEvent object %+v has a 0 reapTime, is the cluster running the same docker engine version?",
|
|
| 220 |
+ nDB.config.Hostname, nDB.config.NodeID, tEvent) |
|
| 221 |
+ e.reapTime = nDB.config.reapEntryInterval |
|
| 221 | 222 |
} |
| 222 | 223 |
|
| 223 | 224 |
nDB.Lock() |
| ... | ... |
@@ -229,7 +230,7 @@ func (nDB *NetworkDB) handleTableEvent(tEvent *TableEvent) bool {
|
| 229 | 229 |
// If the residual reapTime is lower or equal to 1/6 of the total reapTime don't bother broadcasting it around |
| 230 | 230 |
// most likely the cluster is already aware of it, if not who will sync with this node will catch the state too. |
| 231 | 231 |
// This also avoids that deletion of entries close to their garbage collection ends up circuling around forever |
| 232 |
- return e.reapTime > reapEntryInterval/6 |
|
| 232 |
+ return e.reapTime > nDB.config.reapEntryInterval/6 |
|
| 233 | 233 |
} |
| 234 | 234 |
|
| 235 | 235 |
var op opType |
| ... | ... |
@@ -181,6 +181,13 @@ type Config struct {
|
| 181 | 181 |
// be able to increase this to get more content into each gossip packet. |
| 182 | 182 |
PacketBufferSize int |
| 183 | 183 |
|
| 184 |
+ // reapEntryInterval duration of a deleted entry before being garbage collected |
|
| 185 |
+ reapEntryInterval time.Duration |
|
| 186 |
+ |
|
| 187 |
+ // reapNetworkInterval duration of a delted network before being garbage collected |
|
| 188 |
+ // NOTE this MUST always be higher than reapEntryInterval |
|
| 189 |
+ reapNetworkInterval time.Duration |
|
| 190 |
+ |
|
| 184 | 191 |
// StatsPrintPeriod the period to use to print queue stats |
| 185 | 192 |
// Default is 5min |
| 186 | 193 |
StatsPrintPeriod time.Duration |
| ... | ... |
@@ -220,12 +227,18 @@ func DefaultConfig() *Config {
|
| 220 | 220 |
PacketBufferSize: 1400, |
| 221 | 221 |
StatsPrintPeriod: 5 * time.Minute, |
| 222 | 222 |
HealthPrintPeriod: 1 * time.Minute, |
| 223 |
+ reapEntryInterval: 30 * time.Minute, |
|
| 223 | 224 |
} |
| 224 | 225 |
} |
| 225 | 226 |
|
| 226 | 227 |
// New creates a new instance of NetworkDB using the Config passed by |
| 227 | 228 |
// the caller. |
| 228 | 229 |
func New(c *Config) (*NetworkDB, error) {
|
| 230 |
+ // The garbage collection logic for entries leverage the presence of the network. |
|
| 231 |
+ // For this reason the expiration time of the network is put slightly higher than the entry expiration so that |
|
| 232 |
+ // there is at least 5 extra cycle to make sure that all the entries are properly deleted before deleting the network. |
|
| 233 |
+ c.reapNetworkInterval = c.reapEntryInterval + 5*reapPeriod |
|
| 234 |
+ |
|
| 229 | 235 |
nDB := &NetworkDB{
|
| 230 | 236 |
config: c, |
| 231 | 237 |
indexes: make(map[int]*radix.Tree), |
| ... | ... |
@@ -241,7 +254,7 @@ func New(c *Config) (*NetworkDB, error) {
|
| 241 | 241 |
nDB.indexes[byTable] = radix.New() |
| 242 | 242 |
nDB.indexes[byNetwork] = radix.New() |
| 243 | 243 |
|
| 244 |
- logrus.Debugf("New memberlist node - Node:%v will use memberlist nodeID:%v", c.Hostname, c.NodeID)
|
|
| 244 |
+ logrus.Infof("New memberlist node - Node:%v will use memberlist nodeID:%v with config:%+v", c.Hostname, c.NodeID, c)
|
|
| 245 | 245 |
if err := nDB.clusterInit(); err != nil {
|
| 246 | 246 |
return nil, err |
| 247 | 247 |
} |
| ... | ... |
@@ -414,7 +427,7 @@ func (nDB *NetworkDB) DeleteEntry(tname, nid, key string) error {
|
| 414 | 414 |
node: nDB.config.NodeID, |
| 415 | 415 |
value: value, |
| 416 | 416 |
deleting: true, |
| 417 |
- reapTime: reapEntryInterval, |
|
| 417 |
+ reapTime: nDB.config.reapEntryInterval, |
|
| 418 | 418 |
} |
| 419 | 419 |
|
| 420 | 420 |
if err := nDB.sendTableEvent(TableEventTypeDelete, nid, tname, key, entry); err != nil {
|
| ... | ... |
@@ -487,7 +500,7 @@ func (nDB *NetworkDB) deleteNodeNetworkEntries(nid, node string) {
|
| 487 | 487 |
node: oldEntry.node, |
| 488 | 488 |
value: oldEntry.value, |
| 489 | 489 |
deleting: true, |
| 490 |
- reapTime: reapEntryInterval, |
|
| 490 |
+ reapTime: nDB.config.reapEntryInterval, |
|
| 491 | 491 |
} |
| 492 | 492 |
|
| 493 | 493 |
// we arrived at this point in 2 cases: |
| ... | ... |
@@ -632,7 +645,7 @@ func (nDB *NetworkDB) LeaveNetwork(nid string) error {
|
| 632 | 632 |
|
| 633 | 633 |
logrus.Debugf("%v(%v): leaving network %s", nDB.config.Hostname, nDB.config.NodeID, nid)
|
| 634 | 634 |
n.ltime = ltime |
| 635 |
- n.reapTime = reapNetworkInterval |
|
| 635 |
+ n.reapTime = nDB.config.reapNetworkInterval |
|
| 636 | 636 |
n.leaving = true |
| 637 | 637 |
return nil |
| 638 | 638 |
} |
| ... | ... |
@@ -10,6 +10,7 @@ import ( |
| 10 | 10 |
"testing" |
| 11 | 11 |
"time" |
| 12 | 12 |
|
| 13 |
+ "github.com/docker/docker/pkg/stringid" |
|
| 13 | 14 |
"github.com/docker/go-events" |
| 14 | 15 |
"github.com/sirupsen/logrus" |
| 15 | 16 |
"github.com/stretchr/testify/assert" |
| ... | ... |
@@ -27,13 +28,14 @@ func TestMain(m *testing.M) {
|
| 27 | 27 |
os.Exit(m.Run()) |
| 28 | 28 |
} |
| 29 | 29 |
|
| 30 |
-func createNetworkDBInstances(t *testing.T, num int, namePrefix string) []*NetworkDB {
|
|
| 30 |
+func createNetworkDBInstances(t *testing.T, num int, namePrefix string, conf *Config) []*NetworkDB {
|
|
| 31 | 31 |
var dbs []*NetworkDB |
| 32 | 32 |
for i := 0; i < num; i++ {
|
| 33 |
- conf := DefaultConfig() |
|
| 34 |
- conf.Hostname = fmt.Sprintf("%s%d", namePrefix, i+1)
|
|
| 35 |
- conf.BindPort = int(atomic.AddInt32(&dbPort, 1)) |
|
| 36 |
- db, err := New(conf) |
|
| 33 |
+ localConfig := *conf |
|
| 34 |
+ localConfig.Hostname = fmt.Sprintf("%s%d", namePrefix, i+1)
|
|
| 35 |
+ localConfig.NodeID = stringid.TruncateID(stringid.GenerateRandomID()) |
|
| 36 |
+ localConfig.BindPort = int(atomic.AddInt32(&dbPort, 1)) |
|
| 37 |
+ db, err := New(&localConfig) |
|
| 37 | 38 |
require.NoError(t, err) |
| 38 | 39 |
|
| 39 | 40 |
if i != 0 {
|
| ... | ... |
@@ -44,10 +46,19 @@ func createNetworkDBInstances(t *testing.T, num int, namePrefix string) []*Netwo |
| 44 | 44 |
dbs = append(dbs, db) |
| 45 | 45 |
} |
| 46 | 46 |
|
| 47 |
+ // Check that the cluster is properly created |
|
| 48 |
+ for i := 0; i < num; i++ {
|
|
| 49 |
+ if num != len(dbs[i].ClusterPeers()) {
|
|
| 50 |
+ t.Fatalf("Number of nodes for %s into the cluster does not match %d != %d",
|
|
| 51 |
+ dbs[i].config.Hostname, num, len(dbs[i].ClusterPeers())) |
|
| 52 |
+ } |
|
| 53 |
+ } |
|
| 54 |
+ |
|
| 47 | 55 |
return dbs |
| 48 | 56 |
} |
| 49 | 57 |
|
| 50 | 58 |
func closeNetworkDBInstances(dbs []*NetworkDB) {
|
| 59 |
+ log.Print("Closing DB instances...")
|
|
| 51 | 60 |
for _, db := range dbs {
|
| 52 | 61 |
db.Close() |
| 53 | 62 |
} |
| ... | ... |
@@ -147,12 +158,12 @@ func testWatch(t *testing.T, ch chan events.Event, ev interface{}, tname, nid, k
|
| 147 | 147 |
} |
| 148 | 148 |
|
| 149 | 149 |
func TestNetworkDBSimple(t *testing.T) {
|
| 150 |
- dbs := createNetworkDBInstances(t, 2, "node") |
|
| 150 |
+ dbs := createNetworkDBInstances(t, 2, "node", DefaultConfig()) |
|
| 151 | 151 |
closeNetworkDBInstances(dbs) |
| 152 | 152 |
} |
| 153 | 153 |
|
| 154 | 154 |
func TestNetworkDBJoinLeaveNetwork(t *testing.T) {
|
| 155 |
- dbs := createNetworkDBInstances(t, 2, "node") |
|
| 155 |
+ dbs := createNetworkDBInstances(t, 2, "node", DefaultConfig()) |
|
| 156 | 156 |
|
| 157 | 157 |
err := dbs[0].JoinNetwork("network1")
|
| 158 | 158 |
assert.NoError(t, err) |
| ... | ... |
@@ -167,7 +178,7 @@ func TestNetworkDBJoinLeaveNetwork(t *testing.T) {
|
| 167 | 167 |
} |
| 168 | 168 |
|
| 169 | 169 |
func TestNetworkDBJoinLeaveNetworks(t *testing.T) {
|
| 170 |
- dbs := createNetworkDBInstances(t, 2, "node") |
|
| 170 |
+ dbs := createNetworkDBInstances(t, 2, "node", DefaultConfig()) |
|
| 171 | 171 |
|
| 172 | 172 |
n := 10 |
| 173 | 173 |
for i := 1; i <= n; i++ {
|
| ... | ... |
@@ -210,7 +221,7 @@ func TestNetworkDBJoinLeaveNetworks(t *testing.T) {
|
| 210 | 210 |
} |
| 211 | 211 |
|
| 212 | 212 |
func TestNetworkDBCRUDTableEntry(t *testing.T) {
|
| 213 |
- dbs := createNetworkDBInstances(t, 3, "node") |
|
| 213 |
+ dbs := createNetworkDBInstances(t, 3, "node", DefaultConfig()) |
|
| 214 | 214 |
|
| 215 | 215 |
err := dbs[0].JoinNetwork("network1")
|
| 216 | 216 |
assert.NoError(t, err) |
| ... | ... |
@@ -240,7 +251,7 @@ func TestNetworkDBCRUDTableEntry(t *testing.T) {
|
| 240 | 240 |
} |
| 241 | 241 |
|
| 242 | 242 |
func TestNetworkDBCRUDTableEntries(t *testing.T) {
|
| 243 |
- dbs := createNetworkDBInstances(t, 2, "node") |
|
| 243 |
+ dbs := createNetworkDBInstances(t, 2, "node", DefaultConfig()) |
|
| 244 | 244 |
|
| 245 | 245 |
err := dbs[0].JoinNetwork("network1")
|
| 246 | 246 |
assert.NoError(t, err) |
| ... | ... |
@@ -308,7 +319,7 @@ func TestNetworkDBCRUDTableEntries(t *testing.T) {
|
| 308 | 308 |
} |
| 309 | 309 |
|
| 310 | 310 |
func TestNetworkDBNodeLeave(t *testing.T) {
|
| 311 |
- dbs := createNetworkDBInstances(t, 2, "node") |
|
| 311 |
+ dbs := createNetworkDBInstances(t, 2, "node", DefaultConfig()) |
|
| 312 | 312 |
|
| 313 | 313 |
err := dbs[0].JoinNetwork("network1")
|
| 314 | 314 |
assert.NoError(t, err) |
| ... | ... |
@@ -327,7 +338,7 @@ func TestNetworkDBNodeLeave(t *testing.T) {
|
| 327 | 327 |
} |
| 328 | 328 |
|
| 329 | 329 |
func TestNetworkDBWatch(t *testing.T) {
|
| 330 |
- dbs := createNetworkDBInstances(t, 2, "node") |
|
| 330 |
+ dbs := createNetworkDBInstances(t, 2, "node", DefaultConfig()) |
|
| 331 | 331 |
err := dbs[0].JoinNetwork("network1")
|
| 332 | 332 |
assert.NoError(t, err) |
| 333 | 333 |
|
| ... | ... |
@@ -356,7 +367,7 @@ func TestNetworkDBWatch(t *testing.T) {
|
| 356 | 356 |
} |
| 357 | 357 |
|
| 358 | 358 |
func TestNetworkDBBulkSync(t *testing.T) {
|
| 359 |
- dbs := createNetworkDBInstances(t, 2, "node") |
|
| 359 |
+ dbs := createNetworkDBInstances(t, 2, "node", DefaultConfig()) |
|
| 360 | 360 |
|
| 361 | 361 |
err := dbs[0].JoinNetwork("network1")
|
| 362 | 362 |
assert.NoError(t, err) |
| ... | ... |
@@ -389,7 +400,7 @@ func TestNetworkDBBulkSync(t *testing.T) {
|
| 389 | 389 |
func TestNetworkDBCRUDMediumCluster(t *testing.T) {
|
| 390 | 390 |
n := 5 |
| 391 | 391 |
|
| 392 |
- dbs := createNetworkDBInstances(t, n, "node") |
|
| 392 |
+ dbs := createNetworkDBInstances(t, n, "node", DefaultConfig()) |
|
| 393 | 393 |
|
| 394 | 394 |
for i := 0; i < n; i++ {
|
| 395 | 395 |
for j := 0; j < n; j++ {
|
| ... | ... |
@@ -433,13 +444,12 @@ func TestNetworkDBCRUDMediumCluster(t *testing.T) {
|
| 433 | 433 |
dbs[i].verifyEntryExistence(t, "test_table", "network1", "test_key", "", false) |
| 434 | 434 |
} |
| 435 | 435 |
|
| 436 |
- log.Print("Closing DB instances...")
|
|
| 437 | 436 |
closeNetworkDBInstances(dbs) |
| 438 | 437 |
} |
| 439 | 438 |
|
| 440 | 439 |
func TestNetworkDBNodeJoinLeaveIteration(t *testing.T) {
|
| 441 | 440 |
maxRetry := 5 |
| 442 |
- dbs := createNetworkDBInstances(t, 2, "node") |
|
| 441 |
+ dbs := createNetworkDBInstances(t, 2, "node", DefaultConfig()) |
|
| 443 | 442 |
|
| 444 | 443 |
// Single node Join/Leave |
| 445 | 444 |
err := dbs[0].JoinNetwork("network1")
|
| ... | ... |
@@ -517,6 +527,56 @@ func TestNetworkDBNodeJoinLeaveIteration(t *testing.T) {
|
| 517 | 517 |
t.Fatalf("The networkNodes list has to have be 2 instead of %d - %v", len(dbs[1].networkNodes["network1"]), dbs[1].networkNodes["network1"])
|
| 518 | 518 |
} |
| 519 | 519 |
|
| 520 |
- dbs[0].Close() |
|
| 521 |
- dbs[1].Close() |
|
| 520 |
+ closeNetworkDBInstances(dbs) |
|
| 521 |
+} |
|
| 522 |
+ |
|
| 523 |
+func TestNetworkDBGarbageCollection(t *testing.T) {
|
|
| 524 |
+ keysWriteDelete := 5 |
|
| 525 |
+ config := DefaultConfig() |
|
| 526 |
+ config.reapEntryInterval = 30 * time.Second |
|
| 527 |
+ config.StatsPrintPeriod = 15 * time.Second |
|
| 528 |
+ |
|
| 529 |
+ dbs := createNetworkDBInstances(t, 3, "node", config) |
|
| 530 |
+ |
|
| 531 |
+ // 2 Nodes join network |
|
| 532 |
+ err := dbs[0].JoinNetwork("network1")
|
|
| 533 |
+ assert.NoError(t, err) |
|
| 534 |
+ |
|
| 535 |
+ err = dbs[1].JoinNetwork("network1")
|
|
| 536 |
+ assert.NoError(t, err) |
|
| 537 |
+ |
|
| 538 |
+ for i := 0; i < keysWriteDelete; i++ {
|
|
| 539 |
+ err = dbs[i%2].CreateEntry("testTable", "network1", "key-"+string(i), []byte("value"))
|
|
| 540 |
+ assert.NoError(t, err) |
|
| 541 |
+ } |
|
| 542 |
+ time.Sleep(time.Second) |
|
| 543 |
+ for i := 0; i < keysWriteDelete; i++ {
|
|
| 544 |
+ err = dbs[i%2].DeleteEntry("testTable", "network1", "key-"+string(i))
|
|
| 545 |
+ assert.NoError(t, err) |
|
| 546 |
+ } |
|
| 547 |
+ for i := 0; i < 2; i++ {
|
|
| 548 |
+ assert.Equal(t, keysWriteDelete, dbs[i].networks[dbs[i].config.NodeID]["network1"].entriesNumber, "entries number should match") |
|
| 549 |
+ } |
|
| 550 |
+ |
|
| 551 |
+ // from this point the timer for the garbage collection started, wait 5 seconds and then join a new node |
|
| 552 |
+ time.Sleep(5 * time.Second) |
|
| 553 |
+ |
|
| 554 |
+ err = dbs[2].JoinNetwork("network1")
|
|
| 555 |
+ assert.NoError(t, err) |
|
| 556 |
+ for i := 0; i < 3; i++ {
|
|
| 557 |
+ assert.Equal(t, keysWriteDelete, dbs[i].networks[dbs[i].config.NodeID]["network1"].entriesNumber, "entries number should match") |
|
| 558 |
+ } |
|
| 559 |
+ // at this point the entries should had been all deleted |
|
| 560 |
+ time.Sleep(30 * time.Second) |
|
| 561 |
+ for i := 0; i < 3; i++ {
|
|
| 562 |
+ assert.Equal(t, 0, dbs[i].networks[dbs[i].config.NodeID]["network1"].entriesNumber, "entries should had been garbage collected") |
|
| 563 |
+ } |
|
| 564 |
+ |
|
| 565 |
+ // make sure that entries are not coming back |
|
| 566 |
+ time.Sleep(15 * time.Second) |
|
| 567 |
+ for i := 0; i < 3; i++ {
|
|
| 568 |
+ assert.Equal(t, 0, dbs[i].networks[dbs[i].config.NodeID]["network1"].entriesNumber, "entries should had been garbage collected") |
|
| 569 |
+ } |
|
| 570 |
+ |
|
| 571 |
+ closeNetworkDBInstances(dbs) |
|
| 522 | 572 |
} |