Browse code

Merge pull request #2134 from dani-docker/esc-532

Adding a recovery mechanism for a split gossip cluster

Flavio Crisciani authored on 2018/04/24 05:14:27
Showing 3 changed files
... ...
@@ -2,6 +2,7 @@ package networkdb
2 2
 
3 3
 import (
4 4
 	"bytes"
5
+	"context"
5 6
 	"crypto/rand"
6 7
 	"encoding/hex"
7 8
 	"fmt"
... ...
@@ -17,10 +18,12 @@ import (
17 17
 )
18 18
 
19 19
 const (
20
-	reapPeriod       = 5 * time.Second
21
-	retryInterval    = 1 * time.Second
22
-	nodeReapInterval = 24 * time.Hour
23
-	nodeReapPeriod   = 2 * time.Hour
20
+	reapPeriod            = 5 * time.Second
21
+	rejoinClusterDuration = 10 * time.Second
22
+	rejoinInterval        = 60 * time.Second
23
+	retryInterval         = 1 * time.Second
24
+	nodeReapInterval      = 24 * time.Hour
25
+	nodeReapPeriod        = 2 * time.Hour
24 26
 )
25 27
 
26 28
 type logWriter struct{}
... ...
@@ -154,7 +157,7 @@ func (nDB *NetworkDB) clusterInit() error {
154 154
 		return fmt.Errorf("failed to create memberlist: %v", err)
155 155
 	}
156 156
 
157
-	nDB.stopCh = make(chan struct{})
157
+	nDB.ctx, nDB.cancelCtx = context.WithCancel(context.Background())
158 158
 	nDB.memberlist = mlist
159 159
 
160 160
 	for _, trigger := range []struct {
... ...
@@ -166,16 +169,17 @@ func (nDB *NetworkDB) clusterInit() error {
166 166
 		{config.PushPullInterval, nDB.bulkSyncTables},
167 167
 		{retryInterval, nDB.reconnectNode},
168 168
 		{nodeReapPeriod, nDB.reapDeadNode},
169
+		{rejoinInterval, nDB.rejoinClusterBootStrap},
169 170
 	} {
170 171
 		t := time.NewTicker(trigger.interval)
171
-		go nDB.triggerFunc(trigger.interval, t.C, nDB.stopCh, trigger.fn)
172
+		go nDB.triggerFunc(trigger.interval, t.C, trigger.fn)
172 173
 		nDB.tickers = append(nDB.tickers, t)
173 174
 	}
174 175
 
175 176
 	return nil
176 177
 }
177 178
 
178
-func (nDB *NetworkDB) retryJoin(members []string, stop <-chan struct{}) {
179
+func (nDB *NetworkDB) retryJoin(ctx context.Context, members []string) {
179 180
 	t := time.NewTicker(retryInterval)
180 181
 	defer t.Stop()
181 182
 
... ...
@@ -191,7 +195,7 @@ func (nDB *NetworkDB) retryJoin(members []string, stop <-chan struct{}) {
191 191
 				continue
192 192
 			}
193 193
 			return
194
-		case <-stop:
194
+		case <-ctx.Done():
195 195
 			return
196 196
 		}
197 197
 	}
... ...
@@ -202,8 +206,8 @@ func (nDB *NetworkDB) clusterJoin(members []string) error {
202 202
 	mlist := nDB.memberlist
203 203
 
204 204
 	if _, err := mlist.Join(members); err != nil {
205
-		// In case of failure, keep retrying join until it succeeds or the cluster is shutdown.
206
-		go nDB.retryJoin(members, nDB.stopCh)
205
+		// In case of failure, we no longer need to explicitly call retryJoin.
206
+		// rejoinClusterBootStrap, which runs every minute, will retryJoin for 10sec
207 207
 		return fmt.Errorf("could not join node to memberlist: %v", err)
208 208
 	}
209 209
 
... ...
@@ -225,7 +229,8 @@ func (nDB *NetworkDB) clusterLeave() error {
225 225
 		return err
226 226
 	}
227 227
 
228
-	close(nDB.stopCh)
228
+	// cancel the context
229
+	nDB.cancelCtx()
229 230
 
230 231
 	for _, t := range nDB.tickers {
231 232
 		t.Stop()
... ...
@@ -234,19 +239,19 @@ func (nDB *NetworkDB) clusterLeave() error {
234 234
 	return mlist.Shutdown()
235 235
 }
236 236
 
237
-func (nDB *NetworkDB) triggerFunc(stagger time.Duration, C <-chan time.Time, stop <-chan struct{}, f func()) {
237
+func (nDB *NetworkDB) triggerFunc(stagger time.Duration, C <-chan time.Time, f func()) {
238 238
 	// Use a random stagger to avoid synchronizing
239 239
 	randStagger := time.Duration(uint64(rnd.Int63()) % uint64(stagger))
240 240
 	select {
241 241
 	case <-time.After(randStagger):
242
-	case <-stop:
242
+	case <-nDB.ctx.Done():
243 243
 		return
244 244
 	}
245 245
 	for {
246 246
 		select {
247 247
 		case <-C:
248 248
 			f()
249
-		case <-stop:
249
+		case <-nDB.ctx.Done():
250 250
 			return
251 251
 		}
252 252
 	}
... ...
@@ -270,6 +275,35 @@ func (nDB *NetworkDB) reapDeadNode() {
270 270
 	}
271 271
 }
272 272
 
273
+// rejoinClusterBootStrap is called periodically to check if all bootStrap nodes are active in the cluster,
274
+// if not, call the cluster join to merge 2 separate clusters that are formed when all managers
275
+// stopped/started at the same time
276
+func (nDB *NetworkDB) rejoinClusterBootStrap() {
277
+	nDB.RLock()
278
+	if len(nDB.bootStrapIP) == 0 {
279
+		nDB.RUnlock()
280
+		return
281
+	}
282
+
283
+	bootStrapIPs := make([]string, 0, len(nDB.bootStrapIP))
284
+	for _, bootIP := range nDB.bootStrapIP {
285
+		for _, node := range nDB.nodes {
286
+			if node.Addr.Equal(bootIP) {
287
+				// One of the bootstrap nodes is part of the cluster, return
288
+				nDB.RUnlock()
289
+				return
290
+			}
291
+		}
292
+		bootStrapIPs = append(bootStrapIPs, bootIP.String())
293
+	}
294
+	nDB.RUnlock()
295
+	// None of the bootStrap nodes are in the cluster, call memberlist join
296
+	logrus.Debugf("rejoinClusterBootStrap, calling cluster join with bootStrap %v", bootStrapIPs)
297
+	ctx, cancel := context.WithTimeout(nDB.ctx, rejoinClusterDuration)
298
+	defer cancel()
299
+	nDB.retryJoin(ctx, bootStrapIPs)
300
+}
301
+
273 302
 func (nDB *NetworkDB) reconnectNode() {
274 303
 	nDB.RLock()
275 304
 	if len(nDB.failedNodes) == 0 {
... ...
@@ -38,16 +38,11 @@ func (nDB *NetworkDB) handleNodeEvent(nEvent *NodeEvent) bool {
38 38
 	// If we are here it means that the event is fresher and the node is known. Update the Lamport time
39 39
 	n.ltime = nEvent.LTime
40 40
 
41
-	// If it is a node leave event for a manager and this is the only manager we
42
-	// know of we want the reconnect logic to kick in. In a single manager
43
-	// cluster manager's gossip can't be bootstrapped unless some other node
44
-	// connects to it.
45
-	if len(nDB.bootStrapIP) == 1 && nEvent.Type == NodeEventTypeLeave {
46
-		for _, ip := range nDB.bootStrapIP {
47
-			if ip.Equal(n.Addr) {
48
-				return true
49
-			}
50
-		}
41
+	// If the node is not known to memberlist we cannot save any state for it; otherwise, if it actually
42
+	// dies, we won't receive any notification and we will remain stuck with it
43
+	if _, ok := nDB.nodes[nEvent.NodeName]; !ok {
44
+		logrus.Error("node: %s is unknown to memberlist", nEvent.NodeName)
45
+		return false
51 46
 	}
52 47
 
53 48
 	switch nEvent.Type {
... ...
@@ -3,6 +3,7 @@ package networkdb
3 3
 //go:generate protoc -I.:../vendor/github.com/gogo/protobuf --gogo_out=import_path=github.com/docker/libnetwork/networkdb,Mgogoproto/gogo.proto=github.com/gogo/protobuf/gogoproto:. networkdb.proto
4 4
 
5 5
 import (
6
+	"context"
6 7
 	"fmt"
7 8
 	"net"
8 9
 	"os"
... ...
@@ -77,9 +78,10 @@ type NetworkDB struct {
77 77
 	// Broadcast queue for node event gossip.
78 78
 	nodeBroadcasts *memberlist.TransmitLimitedQueue
79 79
 
80
-	// A central stop channel to stop all go routines running on
80
+	// A central context to stop all go routines running on
81 81
 	// behalf of the NetworkDB instance.
82
-	stopCh chan struct{}
82
+	ctx       context.Context
83
+	cancelCtx context.CancelFunc
83 84
 
84 85
 	// A central broadcaster for all local watchers watching table
85 86
 	// events.