
Vendoring libnetwork and swarmkit

Signed-off-by: Abhinandan Prativadi <abhi@docker.com>

Abhinandan Prativadi authored on 2017/10/06 10:20:25
Showing 30 changed files
... ...
@@ -1546,7 +1546,7 @@ func (s *DockerSwarmSuite) TestSwarmNetworkIPAMOptions(c *check.C) {
1546 1546
 	out, err = d.Cmd("network", "inspect", "--format", "{{.IPAM.Options}}", "foo")
1547 1547
 	c.Assert(err, checker.IsNil, check.Commentf(out))
1548 1548
 	c.Assert(strings.TrimSpace(out), checker.Contains, "foo:bar")
1549
-        c.Assert(strings.TrimSpace(out), checker.Contains, "com.docker.network.ipam.serial:true")
1549
+	c.Assert(strings.TrimSpace(out), checker.Contains, "com.docker.network.ipam.serial:true")
1550 1550
 
1551 1551
 	out, err = d.Cmd("service", "create", "--detach", "--no-resolve-image", "--network=foo", "--name", "top", "busybox", "top")
1552 1552
 	c.Assert(err, checker.IsNil, check.Commentf(out))
... ...
@@ -1557,7 +1557,7 @@ func (s *DockerSwarmSuite) TestSwarmNetworkIPAMOptions(c *check.C) {
1557 1557
 	out, err = d.Cmd("network", "inspect", "--format", "{{.IPAM.Options}}", "foo")
1558 1558
 	c.Assert(err, checker.IsNil, check.Commentf(out))
1559 1559
 	c.Assert(strings.TrimSpace(out), checker.Contains, "foo:bar")
1560
-        c.Assert(strings.TrimSpace(out), checker.Contains, "com.docker.network.ipam.serial:true")
1560
+	c.Assert(strings.TrimSpace(out), checker.Contains, "com.docker.network.ipam.serial:true")
1561 1561
 }
1562 1562
 
1563 1563
 func (s *DockerTrustedSwarmSuite) TestTrustedServiceCreate(c *check.C) {
... ...
@@ -31,7 +31,7 @@ github.com/moby/buildkit aaff9d591ef128560018433fe61beb802e149de8
31 31
 github.com/tonistiigi/fsutil dea3a0da73aee887fc02142d995be764106ac5e2
32 32
 
33 33
 #get libnetwork packages
34
-github.com/docker/libnetwork 0f08d31bf0e640e0cdc6d5161227f87602d605c5
34
+github.com/docker/libnetwork 68f1039f172434709a4550fe92e3e058406c74ce
35 35
 github.com/docker/go-events 9461782956ad83b30282bf90e31fa6a70c255ba9
36 36
 github.com/armon/go-radix e39d623f12e8e41c7b5529e9a9dd67a1e2261f80
37 37
 github.com/armon/go-metrics eb0af217e5e9747e41dd5303755356b62d28e3ec
... ...
@@ -109,7 +109,7 @@ github.com/containerd/containerd 06b9cb35161009dcb7123345749fef02f7cea8e0
109 109
 github.com/tonistiigi/fifo 1405643975692217d6720f8b54aeee1bf2cd5cf4
110 110
 
111 111
 # cluster
112
-github.com/docker/swarmkit 941a01844b89c56aa61086fecb167ab3af1de22b
112
+github.com/docker/swarmkit 872861d2ae46958af7ead1d5fffb092c73afbaf0
113 113
 github.com/gogo/protobuf v0.4
114 114
 github.com/cloudflare/cfssl 7fb22c8cba7ecaf98e4082d22d65800cf45e042a
115 115
 github.com/google/certificate-transparency d90e65c3a07988180c5b1ece71791c0b6506826e
... ...
@@ -6,11 +6,9 @@ import (
6 6
 	"encoding/json"
7 7
 	"fmt"
8 8
 	"net"
9
-	"os"
10 9
 	"sort"
11 10
 	"sync"
12 11
 
13
-	"github.com/docker/docker/pkg/stringid"
14 12
 	"github.com/docker/go-events"
15 13
 	"github.com/docker/libnetwork/cluster"
16 14
 	"github.com/docker/libnetwork/datastore"
... ...
@@ -282,12 +280,8 @@ func (c *controller) agentInit(listenAddr, bindAddrOrInterface, advertiseAddr, d
282 282
 	}
283 283
 
284 284
 	keys, _ := c.getKeys(subsysGossip)
285
-	hostname, _ := os.Hostname()
286
-	nodeName := hostname + "-" + stringid.TruncateID(stringid.GenerateRandomID())
287
-	logrus.Info("Gossip cluster hostname ", nodeName)
288 285
 
289 286
 	netDBConf := networkdb.DefaultConfig()
290
-	netDBConf.NodeName = nodeName
291 287
 	netDBConf.BindAddr = listenAddr
292 288
 	netDBConf.AdvertiseAddr = advertiseAddr
293 289
 	netDBConf.Keys = keys
... ...
@@ -41,6 +41,7 @@ type Handle struct {
41 41
 	id         string
42 42
 	dbIndex    uint64
43 43
 	dbExists   bool
44
+	curr       uint64
44 45
 	store      datastore.DataStore
45 46
 	sync.Mutex
46 47
 }
... ...
@@ -193,26 +194,27 @@ func (h *Handle) getCopy() *Handle {
193 193
 		dbIndex:    h.dbIndex,
194 194
 		dbExists:   h.dbExists,
195 195
 		store:      h.store,
196
+		curr:       h.curr,
196 197
 	}
197 198
 }
198 199
 
199 200
 // SetAnyInRange atomically sets the first unset bit in the specified range in the sequence and returns the corresponding ordinal
200
-func (h *Handle) SetAnyInRange(start, end uint64) (uint64, error) {
201
+func (h *Handle) SetAnyInRange(start, end uint64, serial bool) (uint64, error) {
201 202
 	if end < start || end >= h.bits {
202 203
 		return invalidPos, fmt.Errorf("invalid bit range [%d, %d]", start, end)
203 204
 	}
204 205
 	if h.Unselected() == 0 {
205 206
 		return invalidPos, ErrNoBitAvailable
206 207
 	}
207
-	return h.set(0, start, end, true, false)
208
+	return h.set(0, start, end, true, false, serial)
208 209
 }
209 210
 
210 211
 // SetAny atomically sets the first unset bit in the sequence and returns the corresponding ordinal
211
-func (h *Handle) SetAny() (uint64, error) {
212
+func (h *Handle) SetAny(serial bool) (uint64, error) {
212 213
 	if h.Unselected() == 0 {
213 214
 		return invalidPos, ErrNoBitAvailable
214 215
 	}
215
-	return h.set(0, 0, h.bits-1, true, false)
216
+	return h.set(0, 0, h.bits-1, true, false, serial)
216 217
 }
217 218
 
218 219
 // Set atomically sets the corresponding bit in the sequence
... ...
@@ -220,7 +222,7 @@ func (h *Handle) Set(ordinal uint64) error {
220 220
 	if err := h.validateOrdinal(ordinal); err != nil {
221 221
 		return err
222 222
 	}
223
-	_, err := h.set(ordinal, 0, 0, false, false)
223
+	_, err := h.set(ordinal, 0, 0, false, false, false)
224 224
 	return err
225 225
 }
226 226
 
... ...
@@ -229,7 +231,7 @@ func (h *Handle) Unset(ordinal uint64) error {
229 229
 	if err := h.validateOrdinal(ordinal); err != nil {
230 230
 		return err
231 231
 	}
232
-	_, err := h.set(ordinal, 0, 0, false, true)
232
+	_, err := h.set(ordinal, 0, 0, false, true, false)
233 233
 	return err
234 234
 }
235 235
 
... ...
@@ -298,7 +300,7 @@ func (h *Handle) CheckConsistency() error {
298 298
 }
299 299
 
300 300
 // set/reset the bit
301
-func (h *Handle) set(ordinal, start, end uint64, any bool, release bool) (uint64, error) {
301
+func (h *Handle) set(ordinal, start, end uint64, any bool, release bool, serial bool) (uint64, error) {
302 302
 	var (
303 303
 		bitPos  uint64
304 304
 		bytePos uint64
... ...
@@ -308,6 +310,7 @@ func (h *Handle) set(ordinal, start, end uint64, any bool, release bool) (uint64
308 308
 
309 309
 	for {
310 310
 		var store datastore.DataStore
311
+		curr := uint64(0)
311 312
 		h.Lock()
312 313
 		store = h.store
313 314
 		h.Unlock()
... ...
@@ -318,15 +321,18 @@ func (h *Handle) set(ordinal, start, end uint64, any bool, release bool) (uint64
318 318
 		}
319 319
 
320 320
 		h.Lock()
321
+		if serial {
322
+			curr = h.curr
323
+		}
321 324
 		// Get position if available
322 325
 		if release {
323 326
 			bytePos, bitPos = ordinalToPos(ordinal)
324 327
 		} else {
325 328
 			if any {
326
-				bytePos, bitPos, err = getFirstAvailable(h.head, start)
329
+				bytePos, bitPos, err = getAvailableFromCurrent(h.head, start, curr, end)
327 330
 				ret = posToOrdinal(bytePos, bitPos)
328
-				if end < ret {
329
-					err = ErrNoBitAvailable
331
+				if err == nil {
332
+					h.curr = ret + 1
330 333
 				}
331 334
 			} else {
332 335
 				bytePos, bitPos, err = checkIfAvailable(h.head, ordinal)
... ...
@@ -515,6 +521,29 @@ func getFirstAvailable(head *sequence, start uint64) (uint64, uint64, error) {
515 515
 	return invalidPos, invalidPos, ErrNoBitAvailable
516 516
 }
517 517
 
518
+// getAvailableFromCurrent looks for an available ordinal starting from the current ordinal.
519
+// If none is found, it loops back to the start to check for an available bit.
520
+// This can be further optimized to check only from start to curr in case of a rollover.
521
+func getAvailableFromCurrent(head *sequence, start, curr, end uint64) (uint64, uint64, error) {
522
+	var bytePos, bitPos uint64
523
+	if curr != 0 && curr > start {
524
+		bytePos, bitPos, _ = getFirstAvailable(head, curr)
525
+		ret := posToOrdinal(bytePos, bitPos)
526
+		if end < ret {
527
+			goto begin
528
+		}
529
+		return bytePos, bitPos, nil
530
+	}
531
+
532
+begin:
533
+	bytePos, bitPos, _ = getFirstAvailable(head, start)
534
+	ret := posToOrdinal(bytePos, bitPos)
535
+	if end < ret {
536
+		return invalidPos, invalidPos, ErrNoBitAvailable
537
+	}
538
+	return bytePos, bitPos, nil
539
+}
540
+
518 541
 // checkIfAvailable checks if the bit correspondent to the specified ordinal is unset
519 542
 // If the ordinal is beyond the sequence limits, a negative response is returned
520 543
 func checkIfAvailable(head *sequence, ordinal uint64) (uint64, uint64, error) {
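
For illustration, a minimal sketch of how a caller might use the new serial mode (not part of the vendored change; it assumes only the SetAny/Unset signatures shown in this hunk and the github.com/docker/libnetwork/bitseq import path): when serial is true the scan resumes from the last allocated ordinal (h.curr) and wraps back to the start of the range only after the tail is exhausted.

package sketch

import "github.com/docker/libnetwork/bitseq"

// allocateThree allocates three ordinals serially and releases the first one.
func allocateThree(h *bitseq.Handle) ([]uint64, error) {
	ordinals := make([]uint64, 0, 3)
	for i := 0; i < 3; i++ {
		o, err := h.SetAny(true) // serial: resumes after the last allocated ordinal
		if err != nil {
			return nil, err
		}
		ordinals = append(ordinals, o)
	}
	// Unset does not rewind h.curr, so the freed ordinal is not handed out
	// again until the serial scan wraps around.
	if err := h.Unset(ordinals[0]); err != nil {
		return nil, err
	}
	return ordinals, nil
}
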
... ...
@@ -87,6 +87,7 @@ func (h *Handle) CopyTo(o datastore.KVObject) error {
87 87
 	dstH.dbIndex = h.dbIndex
88 88
 	dstH.dbExists = h.dbExists
89 89
 	dstH.store = h.store
90
+	dstH.curr = h.curr
90 91
 	dstH.Unlock()
91 92
 
92 93
 	return nil
... ...
@@ -21,7 +21,6 @@ import (
21 21
 
22 22
 const (
23 23
 	r            = 0xD0C4E3
24
-	timeout      = 30
25 24
 	pktExpansion = 26 // SPI(4) + SeqN(4) + IV(8) + PadLength(1) + NextHeader(1) + ICV(8)
26 25
 )
27 26
 
... ...
@@ -68,7 +68,7 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
68 68
 
69 69
 	ep.ifName = containerIfName
70 70
 
71
-	if err := d.writeEndpointToStore(ep); err != nil {
71
+	if err = d.writeEndpointToStore(ep); err != nil {
72 72
 		return fmt.Errorf("failed to update overlay endpoint %s to local data store: %v", ep.id[0:7], err)
73 73
 	}
74 74
 
... ...
@@ -86,7 +86,7 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
86 86
 		return err
87 87
 	}
88 88
 
89
-	if err := sbox.AddInterface(overlayIfName, "veth",
89
+	if err = sbox.AddInterface(overlayIfName, "veth",
90 90
 		sbox.InterfaceOptions().Master(s.brName)); err != nil {
91 91
 		return fmt.Errorf("could not add veth pair inside the network sandbox: %v", err)
92 92
 	}
... ...
@@ -100,7 +100,7 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
100 100
 		return err
101 101
 	}
102 102
 
103
-	if err := nlh.LinkSetHardwareAddr(veth, ep.mac); err != nil {
103
+	if err = nlh.LinkSetHardwareAddr(veth, ep.mac); err != nil {
104 104
 		return fmt.Errorf("could not set mac address (%v) to the container interface: %v", ep.mac, err)
105 105
 	}
106 106
 
... ...
@@ -108,7 +108,7 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
108 108
 		if sub == s {
109 109
 			continue
110 110
 		}
111
-		if err := jinfo.AddStaticRoute(sub.subnetIP, types.NEXTHOP, s.gwIP.IP); err != nil {
111
+		if err = jinfo.AddStaticRoute(sub.subnetIP, types.NEXTHOP, s.gwIP.IP); err != nil {
112 112
 			logrus.Errorf("Adding subnet %s static route in network %q failed\n", s.subnetIP, n.id)
113 113
 		}
114 114
 	}
... ...
@@ -122,7 +122,7 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
122 122
 
123 123
 	d.peerAdd(nid, eid, ep.addr.IP, ep.addr.Mask, ep.mac, net.ParseIP(d.advertiseAddress), false, false, true)
124 124
 
125
-	if err := d.checkEncryption(nid, nil, n.vxlanID(s), true, true); err != nil {
125
+	if err = d.checkEncryption(nid, nil, n.vxlanID(s), true, true); err != nil {
126 126
 		logrus.Warn(err)
127 127
 	}
128 128
 
... ...
@@ -200,7 +200,7 @@ func (d *driver) EventNotify(etype driverapi.EventType, nid, tableName, key stri
200 200
 	}
201 201
 
202 202
 	if etype == driverapi.Delete {
203
-		d.peerDelete(nid, eid, addr.IP, addr.Mask, mac, vtep)
203
+		d.peerDelete(nid, eid, addr.IP, addr.Mask, mac, vtep, false)
204 204
 		return
205 205
 	}
206 206
 
... ...
@@ -232,11 +232,9 @@ func (d *driver) Leave(nid, eid string) error {
232 232
 		}
233 233
 	}
234 234
 
235
-	n.leaveSandbox()
235
+	d.peerDelete(nid, eid, ep.addr.IP, ep.addr.Mask, ep.mac, net.ParseIP(d.advertiseAddress), true)
236 236
 
237
-	if err := d.checkEncryption(nid, nil, 0, true, false); err != nil {
238
-		logrus.Warn(err)
239
-	}
237
+	n.leaveSandbox()
240 238
 
241 239
 	return nil
242 240
 }
... ...
@@ -119,7 +119,7 @@ func setDefaultVlan() {
119 119
 	data := []byte{'0', '\n'}
120 120
 
121 121
 	if err = ioutil.WriteFile(path, data, 0644); err != nil {
122
-		logrus.Errorf("endbling default vlan on bridge %s failed %v", brName, err)
122
+		logrus.Errorf("enabling default vlan on bridge %s failed %v", brName, err)
123 123
 		os.Exit(1)
124 124
 	}
125 125
 	os.Exit(0)
... ...
@@ -251,8 +251,9 @@ func (d *driver) DeleteNetwork(nid string) error {
251 251
 		if err := d.deleteEndpointFromStore(ep); err != nil {
252 252
 			logrus.Warnf("Failed to delete overlay endpoint %s from local store: %v", ep.id[0:7], err)
253 253
 		}
254
-
255 254
 	}
255
+	// flush the peerDB entries
256
+	d.peerFlush(nid)
256 257
 	d.deleteNetwork(nid)
257 258
 
258 259
 	vnis, err := n.releaseVxlanID()
... ...
@@ -505,11 +506,7 @@ func (n *network) restoreSubnetSandbox(s *subnet, brName, vxlanName string) erro
505 505
 	vxlanIfaceOption := make([]osl.IfaceOption, 1)
506 506
 	vxlanIfaceOption = append(vxlanIfaceOption, sbox.InterfaceOptions().Master(brName))
507 507
 	Ifaces[vxlanName+"+vxlan"] = vxlanIfaceOption
508
-	err = sbox.Restore(Ifaces, nil, nil, nil)
509
-	if err != nil {
510
-		return err
511
-	}
512
-	return nil
508
+	return sbox.Restore(Ifaces, nil, nil, nil)
513 509
 }
514 510
 
515 511
 func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error {
... ...
@@ -760,58 +757,38 @@ func (n *network) watchMiss(nlSock *nl.NetlinkSocket) {
760 760
 				continue
761 761
 			}
762 762
 
763
-			logrus.Debugf("miss notification: dest IP %v, dest MAC %v", ip, mac)
764
-
765 763
 			if neigh.State&(netlink.NUD_STALE|netlink.NUD_INCOMPLETE) == 0 {
766 764
 				continue
767 765
 			}
768 766
 
769 767
 			if n.driver.isSerfAlive() {
768
+				logrus.Debugf("miss notification: dest IP %v, dest MAC %v", ip, mac)
770 769
 				mac, IPmask, vtep, err := n.driver.resolvePeer(n.id, ip)
771 770
 				if err != nil {
772 771
 					logrus.Errorf("could not resolve peer %q: %v", ip, err)
773 772
 					continue
774 773
 				}
775 774
 				n.driver.peerAdd(n.id, "dummy", ip, IPmask, mac, vtep, l2Miss, l3Miss, false)
776
-			} else {
777
-				// If the gc_thresh values are lower kernel might knock off the neighor entries.
778
-				// When we get a L3 miss check if its a valid peer and reprogram the neighbor
779
-				// entry again. Rate limit it to once attempt every 500ms, just in case a faulty
780
-				// container sends a flood of packets to invalid peers
781
-				if !l3Miss {
782
-					continue
783
-				}
784
-				if time.Since(t) > 500*time.Millisecond {
775
+			} else if l3Miss && time.Since(t) > time.Second {
776
+				// Every local peer triggers a miss notification, but that one is expected and the local container replies
777
+				// to the ARP request autonomously.
778
+				// If the gc_thresh3 value is low, the kernel might reject new entries during peerAdd. That will trigger the
779
+				// extra logs below, which point at the possible issue.
780
+				// Entries created here are not deleted; see the documentation http://man7.org/linux/man-pages/man7/arp.7.html:
781
+				// Entries which are marked as permanent are never deleted by the garbage-collector.
782
+				// The time limit here guarantees that the dbSearch is not
783
+				// done too frequently, which would stall the peerDB operations.
784
+				pKey, pEntry, err := n.driver.peerDbSearch(n.id, ip)
785
+				if err == nil && !pEntry.isLocal {
785 786
 					t = time.Now()
786
-					n.programNeighbor(ip)
787
+					logrus.Warnf("miss notification for peer:%+v l3Miss:%t l2Miss:%t, if the problem persist check the gc_thresh on the host pKey:%+v pEntry:%+v err:%v",
788
+						neigh, l3Miss, l2Miss, *pKey, *pEntry, err)
787 789
 				}
788 790
 			}
789 791
 		}
790 792
 	}
791 793
 }
792 794
 
793
-func (n *network) programNeighbor(ip net.IP) {
794
-	peerMac, _, _, err := n.driver.peerDbSearch(n.id, ip)
795
-	if err != nil {
796
-		logrus.Errorf("Reprogramming on L3 miss failed for %s, no peer entry", ip)
797
-		return
798
-	}
799
-	s := n.getSubnetforIPAddr(ip)
800
-	if s == nil {
801
-		logrus.Errorf("Reprogramming on L3 miss failed for %s, not a valid subnet", ip)
802
-		return
803
-	}
804
-	sbox := n.sandbox()
805
-	if sbox == nil {
806
-		logrus.Errorf("Reprogramming on L3 miss failed for %s, overlay sandbox missing", ip)
807
-		return
808
-	}
809
-	if err := sbox.AddNeighbor(ip, peerMac, true, sbox.NeighborOptions().LinkName(s.vxlanName)); err != nil {
810
-		logrus.Errorf("Reprogramming on L3 miss failed for %s: %v", ip, err)
811
-		return
812
-	}
813
-}
814
-
815 795
 func (d *driver) addNetwork(n *network) {
816 796
 	d.Lock()
817 797
 	d.networks[n.id] = n
... ...
@@ -1058,7 +1035,7 @@ func (n *network) obtainVxlanID(s *subnet) error {
1058 1058
 		}
1059 1059
 
1060 1060
 		if s.vni == 0 {
1061
-			vxlanID, err := n.driver.vxlanIdm.GetID()
1061
+			vxlanID, err := n.driver.vxlanIdm.GetID(true)
1062 1062
 			if err != nil {
1063 1063
 				return fmt.Errorf("failed to allocate vxlan id: %v", err)
1064 1064
 			}
... ...
@@ -1090,15 +1067,6 @@ func (n *network) contains(ip net.IP) bool {
1090 1090
 	return false
1091 1091
 }
1092 1092
 
1093
-func (n *network) getSubnetforIPAddr(ip net.IP) *subnet {
1094
-	for _, s := range n.subnets {
1095
-		if s.subnetIP.Contains(ip) {
1096
-			return s
1097
-		}
1098
-	}
1099
-	return nil
1100
-}
1101
-
1102 1093
 // getSubnetforIP returns the subnet to which the given IP belongs
1103 1094
 func (n *network) getSubnetforIP(ip *net.IPNet) *subnet {
1104 1095
 	for _, s := range n.subnets {
... ...
@@ -122,7 +122,7 @@ func (d *driver) processEvent(u serf.UserEvent) {
122 122
 	case "join":
123 123
 		d.peerAdd(nid, eid, net.ParseIP(ipStr), net.IPMask(net.ParseIP(maskStr).To4()), mac, net.ParseIP(vtepStr), false, false, false)
124 124
 	case "leave":
125
-		d.peerDelete(nid, eid, net.ParseIP(ipStr), net.IPMask(net.ParseIP(maskStr).To4()), mac, net.ParseIP(vtepStr))
125
+		d.peerDelete(nid, eid, net.ParseIP(ipStr), net.IPMask(net.ParseIP(maskStr).To4()), mac, net.ParseIP(vtepStr), false)
126 126
 	}
127 127
 }
128 128
 
... ...
@@ -135,13 +135,13 @@ func (d *driver) processQuery(q *serf.Query) {
135 135
 		fmt.Printf("Failed to scan query payload string: %v\n", err)
136 136
 	}
137 137
 
138
-	peerMac, peerIPMask, vtep, err := d.peerDbSearch(nid, net.ParseIP(ipStr))
138
+	pKey, pEntry, err := d.peerDbSearch(nid, net.ParseIP(ipStr))
139 139
 	if err != nil {
140 140
 		return
141 141
 	}
142 142
 
143
-	logrus.Debugf("Sending peer query resp mac %s, mask %s, vtep %s", peerMac, net.IP(peerIPMask), vtep)
144
-	q.Respond([]byte(fmt.Sprintf("%s %s %s", peerMac.String(), net.IP(peerIPMask).String(), vtep.String())))
143
+	logrus.Debugf("Sending peer query resp mac %v, mask %s, vtep %s", pKey.peerMac, net.IP(pEntry.peerIPMask).String(), pEntry.vtep)
144
+	q.Respond([]byte(fmt.Sprintf("%s %s %s", pKey.peerMac.String(), net.IP(pEntry.peerIPMask).String(), pEntry.vtep.String())))
145 145
 }
146 146
 
147 147
 func (d *driver) resolvePeer(nid string, peerIP net.IP) (net.HardwareAddr, net.IPMask, net.IP, error) {
... ...
@@ -262,7 +262,7 @@ func (d *driver) nodeJoin(advertiseAddress, bindAddress string, self bool) {
262 262
 		d.Unlock()
263 263
 
264 264
 		// If containers are already running on this network update the
265
-		// advertiseaddress in the peerDB
265
+		// advertise address in the peerDB
266 266
 		d.localJoinOnce.Do(func() {
267 267
 			d.peerDBUpdateSelf()
268 268
 		})
... ...
@@ -24,4 +24,4 @@ message PeerRecord {
24 24
 	// which this container is running and can be reached by
25 25
 	// building a tunnel to that host IP.
26 26
 	string tunnel_endpoint_ip = 3 [(gogoproto.customname) = "TunnelEndpointIP"];
27
-}
28 27
\ No newline at end of file
28
+}
... ...
@@ -165,7 +165,7 @@ func (n *network) obtainVxlanID(s *subnet) error {
165 165
 	n.Unlock()
166 166
 
167 167
 	if vni == 0 {
168
-		vni, err = n.driver.vxlanIdm.GetIDInRange(vxlanIDStart, vxlanIDEnd)
168
+		vni, err = n.driver.vxlanIdm.GetIDInRange(vxlanIDStart, vxlanIDEnd, true)
169 169
 		if err != nil {
170 170
 			return err
171 171
 		}
... ...
@@ -8,6 +8,7 @@ import (
8 8
 	"syscall"
9 9
 
10 10
 	"github.com/docker/libnetwork/common"
11
+	"github.com/docker/libnetwork/osl"
11 12
 	"github.com/sirupsen/logrus"
12 13
 )
13 14
 
... ...
@@ -22,16 +23,48 @@ type peerEntry struct {
22 22
 	eid        string
23 23
 	vtep       net.IP
24 24
 	peerIPMask net.IPMask
25
-	inSandbox  bool
26 25
 	isLocal    bool
27 26
 }
28 27
 
28
+func (p *peerEntry) MarshalDB() peerEntryDB {
29
+	ones, bits := p.peerIPMask.Size()
30
+	return peerEntryDB{
31
+		eid:            p.eid,
32
+		vtep:           p.vtep.String(),
33
+		peerIPMaskOnes: ones,
34
+		peerIPMaskBits: bits,
35
+		isLocal:        p.isLocal,
36
+	}
37
+}
38
+
39
+// This is the structure saved into the set (SetMatrix). Because of how the set is implemented,
40
+// the value inserted into it has to be hashable, so the []byte fields are converted into
41
+// strings.
42
+type peerEntryDB struct {
43
+	eid            string
44
+	vtep           string
45
+	peerIPMaskOnes int
46
+	peerIPMaskBits int
47
+	isLocal        bool
48
+}
49
+
50
+func (p *peerEntryDB) UnMarshalDB() peerEntry {
51
+	return peerEntry{
52
+		eid:        p.eid,
53
+		vtep:       net.ParseIP(p.vtep),
54
+		peerIPMask: net.CIDRMask(p.peerIPMaskOnes, p.peerIPMaskBits),
55
+		isLocal:    p.isLocal,
56
+	}
57
+}
58
+
29 59
 type peerMap struct {
30
-	mp map[string]peerEntry
60
+	// set of peerEntry; entries have to be stored by value, not by pointer, so that equality checks behave correctly
61
+	mp common.SetMatrix
31 62
 	sync.Mutex
32 63
 }
33 64
 
34 65
 type peerNetworkMap struct {
66
+	// map with key peerKey
35 67
 	mp map[string]*peerMap
36 68
 	sync.Mutex
37 69
 }
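
For illustration, a test-style sketch of the round trip (not part of the vendored change; it would have to live in the overlay package next to these unexported types and assumes only fmt and net): peerEntry carries net.IP/net.IPMask byte slices, which are not comparable, so MarshalDB flattens them into a comparable peerEntryDB that the SetMatrix can store and deduplicate, and UnMarshalDB rebuilds the originals.

package overlay

import (
	"fmt"
	"net"
)

// demoPeerEntryRoundTrip shows that the DB form is comparable and that the
// mask and addresses survive the MarshalDB/UnMarshalDB round trip.
func demoPeerEntryRoundTrip() {
	_, ipNet, _ := net.ParseCIDR("10.0.0.2/24")
	pe := peerEntry{
		eid:        "ep1",
		vtep:       net.ParseIP("192.168.1.10"),
		peerIPMask: ipNet.Mask,
	}

	a, b := pe.MarshalDB(), pe.MarshalDB()
	fmt.Println(a == b) // true: peerEntryDB contains only comparable fields

	back := a.UnMarshalDB()
	fmt.Println(back.vtep, net.IP(back.peerIPMask)) // 192.168.1.10 255.255.255.0
}
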
... ...
@@ -54,11 +87,7 @@ func (pKey *peerKey) Scan(state fmt.ScanState, verb rune) error {
54 54
 	}
55 55
 
56 56
 	pKey.peerMac, err = net.ParseMAC(string(macB))
57
-	if err != nil {
58
-		return err
59
-	}
60
-
61
-	return nil
57
+	return err
62 58
 }
63 59
 
64 60
 func (d *driver) peerDbWalk(f func(string, *peerKey, *peerEntry) bool) error {
... ...
@@ -87,10 +116,13 @@ func (d *driver) peerDbNetworkWalk(nid string, f func(*peerKey, *peerEntry) bool
87 87
 	}
88 88
 
89 89
 	mp := map[string]peerEntry{}
90
-
91 90
 	pMap.Lock()
92
-	for pKeyStr, pEntry := range pMap.mp {
93
-		mp[pKeyStr] = pEntry
91
+	for _, pKeyStr := range pMap.mp.Keys() {
92
+		entryDBList, ok := pMap.mp.Get(pKeyStr)
93
+		if ok {
94
+			peerEntryDB := entryDBList[0].(peerEntryDB)
95
+			mp[pKeyStr] = peerEntryDB.UnMarshalDB()
96
+		}
94 97
 	}
95 98
 	pMap.Unlock()
96 99
 
... ...
@@ -107,45 +139,38 @@ func (d *driver) peerDbNetworkWalk(nid string, f func(*peerKey, *peerEntry) bool
107 107
 	return nil
108 108
 }
109 109
 
110
-func (d *driver) peerDbSearch(nid string, peerIP net.IP) (net.HardwareAddr, net.IPMask, net.IP, error) {
111
-	var (
112
-		peerMac    net.HardwareAddr
113
-		vtep       net.IP
114
-		peerIPMask net.IPMask
115
-		found      bool
116
-	)
117
-
110
+func (d *driver) peerDbSearch(nid string, peerIP net.IP) (*peerKey, *peerEntry, error) {
111
+	var pKeyMatched *peerKey
112
+	var pEntryMatched *peerEntry
118 113
 	err := d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool {
119 114
 		if pKey.peerIP.Equal(peerIP) {
120
-			peerMac = pKey.peerMac
121
-			peerIPMask = pEntry.peerIPMask
122
-			vtep = pEntry.vtep
123
-			found = true
124
-			return found
115
+			pKeyMatched = pKey
116
+			pEntryMatched = pEntry
117
+			return true
125 118
 		}
126 119
 
127
-		return found
120
+		return false
128 121
 	})
129 122
 
130 123
 	if err != nil {
131
-		return nil, nil, nil, fmt.Errorf("peerdb search for peer ip %q failed: %v", peerIP, err)
124
+		return nil, nil, fmt.Errorf("peerdb search for peer ip %q failed: %v", peerIP, err)
132 125
 	}
133 126
 
134
-	if !found {
135
-		return nil, nil, nil, fmt.Errorf("peer ip %q not found in peerdb", peerIP)
127
+	if pKeyMatched == nil || pEntryMatched == nil {
128
+		return nil, nil, fmt.Errorf("peer ip %q not found in peerdb", peerIP)
136 129
 	}
137 130
 
138
-	return peerMac, peerIPMask, vtep, nil
131
+	return pKeyMatched, pEntryMatched, nil
139 132
 }
140 133
 
141 134
 func (d *driver) peerDbAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
142
-	peerMac net.HardwareAddr, vtep net.IP, isLocal bool) {
135
+	peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) {
143 136
 
144 137
 	d.peerDb.Lock()
145 138
 	pMap, ok := d.peerDb.mp[nid]
146 139
 	if !ok {
147 140
 		d.peerDb.mp[nid] = &peerMap{
148
-			mp: make(map[string]peerEntry),
141
+			mp: common.NewSetMatrix(),
149 142
 		}
150 143
 
151 144
 		pMap = d.peerDb.mp[nid]
... ...
@@ -165,18 +190,24 @@ func (d *driver) peerDbAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask
165 165
 	}
166 166
 
167 167
 	pMap.Lock()
168
-	pMap.mp[pKey.String()] = pEntry
169
-	pMap.Unlock()
168
+	defer pMap.Unlock()
169
+	b, i := pMap.mp.Insert(pKey.String(), pEntry.MarshalDB())
170
+	if i != 1 {
171
+		// Transient case, there is more than one endpoint that is using the same IP,MAC pair
172
+		s, _ := pMap.mp.String(pKey.String())
173
+		logrus.Warnf("peerDbAdd transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s)
174
+	}
175
+	return b, i
170 176
 }
171 177
 
172 178
 func (d *driver) peerDbDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
173
-	peerMac net.HardwareAddr, vtep net.IP) peerEntry {
179
+	peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) {
174 180
 
175 181
 	d.peerDb.Lock()
176 182
 	pMap, ok := d.peerDb.mp[nid]
177 183
 	if !ok {
178 184
 		d.peerDb.Unlock()
179
-		return peerEntry{}
185
+		return false, 0
180 186
 	}
181 187
 	d.peerDb.Unlock()
182 188
 
... ...
@@ -185,22 +216,22 @@ func (d *driver) peerDbDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPM
185 185
 		peerMac: peerMac,
186 186
 	}
187 187
 
188
-	pMap.Lock()
189
-
190
-	pEntry, ok := pMap.mp[pKey.String()]
191
-	if ok {
192
-		// Mismatched endpoint ID(possibly outdated). Do not
193
-		// delete peerdb
194
-		if pEntry.eid != eid {
195
-			pMap.Unlock()
196
-			return pEntry
197
-		}
188
+	pEntry := peerEntry{
189
+		eid:        eid,
190
+		vtep:       vtep,
191
+		peerIPMask: peerIPMask,
192
+		isLocal:    isLocal,
198 193
 	}
199 194
 
200
-	delete(pMap.mp, pKey.String())
201
-	pMap.Unlock()
202
-
203
-	return pEntry
195
+	pMap.Lock()
196
+	defer pMap.Unlock()
197
+	b, i := pMap.mp.Remove(pKey.String(), pEntry.MarshalDB())
198
+	if i != 0 {
199
+		// Transient case, there is more than one endpoint that is using the same IP,MAC pair
200
+		s, _ := pMap.mp.String(pKey.String())
201
+		logrus.Warnf("peerDbDelete transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s)
202
+	}
203
+	return b, i
204 204
 }
205 205
 
206 206
 // The overlay uses a lazy initialization approach, this means that when a network is created
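
For illustration, the Insert/Remove contract that peerDbAdd and peerDbDelete now rely on, shown in isolation (not part of the vendored change; it uses only the common.SetMatrix calls visible in this diff, and the key string is a made-up <IP,MAC> pair): the boolean reports whether the element was actually inserted or removed, and the integer is the resulting cardinality for that key, which drives the transient-condition warnings and the restore path.

package main

import (
	"fmt"

	"github.com/docker/libnetwork/common"
)

func main() {
	sm := common.NewSetMatrix()

	// Two endpoints transiently claim the same <IP,MAC> key: cardinality
	// reaches 2, which peerDbAdd logs as the transient condition.
	_, n := sm.Insert("10.0.0.2-02:42:0a:00:00:02", "ep1")
	fmt.Println(n) // 1
	_, n = sm.Insert("10.0.0.2-02:42:0a:00:00:02", "ep2")
	fmt.Println(n) // 2

	// Removing one leaves cardinality 1, which is how peerDeleteOp knows a
	// configuration is still available in the database to restore.
	removed, n := sm.Remove("10.0.0.2-02:42:0a:00:00:02", "ep1")
	fmt.Println(removed, n) // true 1
}
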
... ...
@@ -224,6 +255,7 @@ const (
224 224
 	peerOperationINIT peerOperationType = iota
225 225
 	peerOperationADD
226 226
 	peerOperationDELETE
227
+	peerOperationFLUSH
227 228
 )
228 229
 
229 230
 type peerOperation struct {
... ...
@@ -253,7 +285,9 @@ func (d *driver) peerOpRoutine(ctx context.Context, ch chan *peerOperation) {
253 253
 			case peerOperationADD:
254 254
 				err = d.peerAddOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.l2Miss, op.l3Miss, true, op.localPeer)
255 255
 			case peerOperationDELETE:
256
-				err = d.peerDeleteOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP)
256
+				err = d.peerDeleteOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.localPeer)
257
+			case peerOperationFLUSH:
258
+				err = d.peerFlushOp(op.networkID)
257 259
 			}
258 260
 			if err != nil {
259 261
 				logrus.Warnf("Peer operation failed:%s op:%v", err, op)
... ...
@@ -286,7 +320,6 @@ func (d *driver) peerInitOp(nid string) error {
286 286
 
287 287
 func (d *driver) peerAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
288 288
 	peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, localPeer bool) {
289
-	callerName := common.CallerName(1)
290 289
 	d.peerOpCh <- &peerOperation{
291 290
 		opType:     peerOperationADD,
292 291
 		networkID:  nid,
... ...
@@ -298,24 +331,32 @@ func (d *driver) peerAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
298 298
 		l2Miss:     l2Miss,
299 299
 		l3Miss:     l3Miss,
300 300
 		localPeer:  localPeer,
301
-		callerName: callerName,
301
+		callerName: common.CallerName(1),
302 302
 	}
303 303
 }
304 304
 
305 305
 func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
306
-	peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, updateDB, updateOnlyDB bool) error {
306
+	peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, updateDB, localPeer bool) error {
307 307
 
308 308
 	if err := validateID(nid, eid); err != nil {
309 309
 		return err
310 310
 	}
311 311
 
312
+	var dbEntries int
313
+	var inserted bool
312 314
 	if updateDB {
313
-		d.peerDbAdd(nid, eid, peerIP, peerIPMask, peerMac, vtep, false)
314
-		if updateOnlyDB {
315
-			return nil
315
+		inserted, dbEntries = d.peerDbAdd(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer)
316
+		if !inserted {
317
+			logrus.Warnf("Entry already present in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v",
318
+				nid, eid, peerIP, peerMac, localPeer, vtep)
316 319
 		}
317 320
 	}
318 321
 
322
+	// Local peers do not need any further configuration
323
+	if localPeer {
324
+		return nil
325
+	}
326
+
319 327
 	n := d.network(nid)
320 328
 	if n == nil {
321 329
 		return nil
... ...
@@ -353,21 +394,26 @@ func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask
353 353
 
354 354
 	// Add neighbor entry for the peer IP
355 355
 	if err := sbox.AddNeighbor(peerIP, peerMac, l3Miss, sbox.NeighborOptions().LinkName(s.vxlanName)); err != nil {
356
-		return fmt.Errorf("could not add neighbor entry into the sandbox: %v", err)
356
+		if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 1 {
357
+			// We are in the transient case so only the first configuration is programmed into the kernel
358
+			// Upon deletion if the active configuration is deleted the next one from the database will be restored
359
+			// Note we are skipping also the next configuration
360
+			return nil
361
+		}
362
+		return fmt.Errorf("could not add neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
357 363
 	}
358 364
 
359 365
 	// Add fdb entry to the bridge for the peer mac
360 366
 	if err := sbox.AddNeighbor(vtep, peerMac, l2Miss, sbox.NeighborOptions().LinkName(s.vxlanName),
361 367
 		sbox.NeighborOptions().Family(syscall.AF_BRIDGE)); err != nil {
362
-		return fmt.Errorf("could not add fdb entry into the sandbox: %v", err)
368
+		return fmt.Errorf("could not add fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
363 369
 	}
364 370
 
365 371
 	return nil
366 372
 }
367 373
 
368 374
 func (d *driver) peerDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
369
-	peerMac net.HardwareAddr, vtep net.IP) {
370
-	callerName := common.CallerName(1)
375
+	peerMac net.HardwareAddr, vtep net.IP, localPeer bool) {
371 376
 	d.peerOpCh <- &peerOperation{
372 377
 		opType:     peerOperationDELETE,
373 378
 		networkID:  nid,
... ...
@@ -376,18 +422,23 @@ func (d *driver) peerDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMas
376 376
 		peerIPMask: peerIPMask,
377 377
 		peerMac:    peerMac,
378 378
 		vtepIP:     vtep,
379
-		callerName: callerName,
379
+		callerName: common.CallerName(1),
380
+		localPeer:  localPeer,
380 381
 	}
381 382
 }
382 383
 
383 384
 func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
384
-	peerMac net.HardwareAddr, vtep net.IP) error {
385
+	peerMac net.HardwareAddr, vtep net.IP, localPeer bool) error {
385 386
 
386 387
 	if err := validateID(nid, eid); err != nil {
387 388
 		return err
388 389
 	}
389 390
 
390
-	pEntry := d.peerDbDelete(nid, eid, peerIP, peerIPMask, peerMac, vtep)
391
+	deleted, dbEntries := d.peerDbDelete(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer)
392
+	if !deleted {
393
+		logrus.Warnf("Entry was not in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v",
394
+			nid, eid, peerIP, peerMac, localPeer, vtep)
395
+	}
391 396
 
392 397
 	n := d.network(nid)
393 398
 	if n == nil {
... ...
@@ -399,30 +450,59 @@ func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPM
399 399
 		return nil
400 400
 	}
401 401
 
402
-	// Delete fdb entry to the bridge for the peer mac only if the
403
-	// entry existed in local peerdb. If it is a stale delete
404
-	// request, still call DeleteNeighbor but only to cleanup any
405
-	// leftover sandbox neighbor cache and not actually delete the
406
-	// kernel state.
407
-	if (eid == pEntry.eid && vtep.Equal(pEntry.vtep)) ||
408
-		(eid != pEntry.eid && !vtep.Equal(pEntry.vtep)) {
409
-		if err := sbox.DeleteNeighbor(vtep, peerMac,
410
-			eid == pEntry.eid && vtep.Equal(pEntry.vtep)); err != nil {
411
-			return fmt.Errorf("could not delete fdb entry into the sandbox: %v", err)
412
-		}
402
+	if err := d.checkEncryption(nid, vtep, 0, localPeer, false); err != nil {
403
+		logrus.Warn(err)
413 404
 	}
414 405
 
415
-	// Delete neighbor entry for the peer IP
416
-	if eid == pEntry.eid {
406
+	// Local peers do not have any local configuration to delete
407
+	if !localPeer {
408
+		// Remove fdb entry to the bridge for the peer mac
409
+		if err := sbox.DeleteNeighbor(vtep, peerMac, true); err != nil {
410
+			if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 0 {
411
+				// We fall in here if there is a transient state and the neighbor that is being deleted
412
+				// was never configured into the kernel (we allow only 1 configuration at a time per <ip,mac> mapping)
413
+				return nil
414
+			}
415
+			return fmt.Errorf("could not delete fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
416
+		}
417
+
418
+		// Delete neighbor entry for the peer IP
417 419
 		if err := sbox.DeleteNeighbor(peerIP, peerMac, true); err != nil {
418
-			return fmt.Errorf("could not delete neighbor entry into the sandbox: %v", err)
420
+			return fmt.Errorf("could not delete neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
419 421
 		}
420 422
 	}
421 423
 
422
-	if err := d.checkEncryption(nid, vtep, 0, false, false); err != nil {
423
-		logrus.Warn(err)
424
+	if dbEntries == 0 {
425
+		return nil
426
+	}
427
+
428
+	// If there is still an entry in the database and the deletion went through without errors, it means that there is now no
429
+	// configuration active in the kernel.
430
+	// Restore one configuration for the <ip,mac> pair directly from the database; note that it is guaranteed that there is one
431
+	peerKey, peerEntry, err := d.peerDbSearch(nid, peerIP)
432
+	if err != nil {
433
+		logrus.Errorf("peerDeleteOp unable to restore a configuration for nid:%s ip:%v mac:%v err:%s", nid, peerIP, peerMac, err)
434
+		return err
424 435
 	}
436
+	return d.peerAddOp(nid, peerEntry.eid, peerIP, peerEntry.peerIPMask, peerKey.peerMac, peerEntry.vtep, false, false, false, peerEntry.isLocal)
437
+}
425 438
 
439
+func (d *driver) peerFlush(nid string) {
440
+	d.peerOpCh <- &peerOperation{
441
+		opType:     peerOperationFLUSH,
442
+		networkID:  nid,
443
+		callerName: common.CallerName(1),
444
+	}
445
+}
446
+
447
+func (d *driver) peerFlushOp(nid string) error {
448
+	d.peerDb.Lock()
449
+	defer d.peerDb.Unlock()
450
+	_, ok := d.peerDb.mp[nid]
451
+	if !ok {
452
+		return fmt.Errorf("Unable to find the peerDB for nid:%s", nid)
453
+	}
454
+	delete(d.peerDb.mp, nid)
426 455
 	return nil
427 456
 }
428 457
 
... ...
@@ -718,7 +718,7 @@ func (n *network) obtainVxlanID(s *subnet) error {
718 718
 		}
719 719
 
720 720
 		if s.vni == 0 {
721
-			vxlanID, err := n.driver.vxlanIdm.GetID()
721
+			vxlanID, err := n.driver.vxlanIdm.GetID(true)
722 722
 			if err != nil {
723 723
 				return fmt.Errorf("failed to allocate vxlan id: %v", err)
724 724
 			}
... ...
@@ -62,10 +62,17 @@ type EndpointConnectivity struct {
62 62
 }
63 63
 
64 64
 type hnsEndpoint struct {
65
-	id             string
66
-	nid            string
67
-	profileID      string
68
-	Type           string
65
+	id        string
66
+	nid       string
67
+	profileID string
68
+	Type      string
69
+	// Note: currently, the sandboxID is the same as the containerID since Windows does
70
+	// not expose the sandboxID.
71
+	// In the future, Windows will support a proper sandboxID that is different
72
+	// from the containerID.
73
+	// Therefore, we are using sandboxID now, so that we won't have to change this code
74
+	// when Windows properly supports a sandboxID.
75
+	sandboxID      string
69 76
 	macAddress     net.HardwareAddr
70 77
 	epOption       *endpointOption       // User specified parameters
71 78
 	epConnectivity *EndpointConnectivity // User specified parameters
... ...
@@ -730,7 +737,15 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
730 730
 		return err
731 731
 	}
732 732
 
733
-	// This is just a stub for now
733
+	endpoint.sandboxID = sboxKey
734
+
735
+	err = hcsshim.HotAttachEndpoint(endpoint.sandboxID, endpoint.profileID)
736
+	if err != nil {
737
+		// If the container doesn't exist in HCS, do not return an error for hot add/remove
738
+		if err != hcsshim.ErrComputeSystemDoesNotExist {
739
+			return err
740
+		}
741
+	}
734 742
 
735 743
 	jinfo.DisableGatewayService()
736 744
 	return nil
... ...
@@ -744,13 +759,18 @@ func (d *driver) Leave(nid, eid string) error {
744 744
 	}
745 745
 
746 746
 	// Ensure that the endpoint exists
747
-	_, err = network.getEndpoint(eid)
747
+	endpoint, err := network.getEndpoint(eid)
748 748
 	if err != nil {
749 749
 		return err
750 750
 	}
751 751
 
752
-	// This is just a stub for now
753
-
752
+	err = hcsshim.HotDetachEndpoint(endpoint.sandboxID, endpoint.profileID)
753
+	if err != nil {
754
+		// If the container doesn't exist in HCS, do not return an error for hot add/remove
755
+		if err != hcsshim.ErrComputeSystemDoesNotExist {
756
+			return err
757
+		}
758
+	}
754 759
 	return nil
755 760
 }
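
For illustration, the tolerance pattern used by Join and Leave above, factored into a hypothetical helper (not part of the vendored change; it relies only on the hcsshim calls already used in this file): a missing compute system is treated as success, so hot add/remove does not fail for a container that is not running yet.

package sketch

import "github.com/Microsoft/hcsshim"

// hotAttach is a hypothetical helper mirroring the Join logic above: attach
// the endpoint, but treat a missing compute system as success.
func hotAttach(sandboxID, profileID string) error {
	err := hcsshim.HotAttachEndpoint(sandboxID, profileID)
	if err != nil && err != hcsshim.ErrComputeSystemDoesNotExist {
		return err
	}
	return nil
}
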
756 761
 
... ...
@@ -34,11 +34,11 @@ func New(ds datastore.DataStore, id string, start, end uint64) (*Idm, error) {
34 34
 }
35 35
 
36 36
 // GetID returns the first available id in the set
37
-func (i *Idm) GetID() (uint64, error) {
37
+func (i *Idm) GetID(serial bool) (uint64, error) {
38 38
 	if i.handle == nil {
39 39
 		return 0, errors.New("ID set is not initialized")
40 40
 	}
41
-	ordinal, err := i.handle.SetAny()
41
+	ordinal, err := i.handle.SetAny(serial)
42 42
 	return i.start + ordinal, err
43 43
 }
44 44
 
... ...
@@ -56,7 +56,7 @@ func (i *Idm) GetSpecificID(id uint64) error {
56 56
 }
57 57
 
58 58
 // GetIDInRange returns the first available id in the set within a [start,end] range
59
-func (i *Idm) GetIDInRange(start, end uint64) (uint64, error) {
59
+func (i *Idm) GetIDInRange(start, end uint64, serial bool) (uint64, error) {
60 60
 	if i.handle == nil {
61 61
 		return 0, errors.New("ID set is not initialized")
62 62
 	}
... ...
@@ -65,7 +65,7 @@ func (i *Idm) GetIDInRange(start, end uint64) (uint64, error) {
65 65
 		return 0, errors.New("Requested range does not belong to the set")
66 66
 	}
67 67
 
68
-	ordinal, err := i.handle.SetAnyInRange(start-i.start, end-i.start)
68
+	ordinal, err := i.handle.SetAnyInRange(start-i.start, end-i.start, serial)
69 69
 
70 70
 	return i.start + ordinal, err
71 71
 }
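
For illustration, a minimal sketch of the updated Idm call sites, which simply forward the serial flag to the underlying bitseq Handle (not part of the vendored change; it assumes that passing a nil datastore to idm.New keeps the id set purely in memory):

package main

import (
	"fmt"

	"github.com/docker/libnetwork/idm"
)

func main() {
	// Assumption: a nil datastore keeps the id set purely in memory.
	vnis, err := idm.New(nil, "vni", 256, 1000)
	if err != nil {
		panic(err)
	}
	a, _ := vnis.GetID(true)                  // serial: resumes after the last id handed out
	b, _ := vnis.GetIDInRange(300, 400, true) // serial, restricted to [300, 400]
	c, _ := vnis.GetID(false)                 // previous first-available behaviour
	fmt.Println(a, b, c)
}
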
... ...
@@ -457,7 +457,15 @@ func (a *Allocator) RequestAddress(poolID string, prefAddress net.IP, opts map[s
457 457
 		return nil, nil, types.InternalErrorf("could not find bitmask in datastore for %s on address %v request from pool %s: %v",
458 458
 			k.String(), prefAddress, poolID, err)
459 459
 	}
460
-	ip, err := a.getAddress(p.Pool, bm, prefAddress, p.Range)
460
+	// To request serial IP address allocation, callers can pass in an option selecting either serial
461
+	// allocation or the first available IP in the subnet
462
+	var serial bool
463
+	if opts != nil {
464
+		if val, ok := opts[ipamapi.AllocSerialPrefix]; ok {
465
+			serial = (val == "true")
466
+		}
467
+	}
468
+	ip, err := a.getAddress(p.Pool, bm, prefAddress, p.Range, serial)
461 469
 	if err != nil {
462 470
 		return nil, nil, err
463 471
 	}
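
For illustration, a minimal sketch of opting a single request into serial ordering (not part of the vendored change; requestSerial is a hypothetical helper, and the Allocator and poolID are assumed to come from ipam.NewAllocator and RequestPool): any value other than "true", or omitting the option entirely, keeps the previous first-available behaviour.

package sketch

import (
	"net"

	"github.com/docker/libnetwork/ipam"
	"github.com/docker/libnetwork/ipamapi"
)

// requestSerial asks the allocator for the next address in serial order.
func requestSerial(a *ipam.Allocator, poolID string) (*net.IPNet, error) {
	opts := map[string]string{
		ipamapi.AllocSerialPrefix: "true", // "com.docker.network.ipam.serial"
	}
	ip, _, err := a.RequestAddress(poolID, nil, opts)
	return ip, err
}
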
... ...
@@ -522,7 +530,7 @@ func (a *Allocator) ReleaseAddress(poolID string, address net.IP) error {
522 522
 	return bm.Unset(ipToUint64(h))
523 523
 }
524 524
 
525
-func (a *Allocator) getAddress(nw *net.IPNet, bitmask *bitseq.Handle, prefAddress net.IP, ipr *AddressRange) (net.IP, error) {
525
+func (a *Allocator) getAddress(nw *net.IPNet, bitmask *bitseq.Handle, prefAddress net.IP, ipr *AddressRange, serial bool) (net.IP, error) {
526 526
 	var (
527 527
 		ordinal uint64
528 528
 		err     error
... ...
@@ -535,7 +543,7 @@ func (a *Allocator) getAddress(nw *net.IPNet, bitmask *bitseq.Handle, prefAddres
535 535
 		return nil, ipamapi.ErrNoAvailableIPs
536 536
 	}
537 537
 	if ipr == nil && prefAddress == nil {
538
-		ordinal, err = bitmask.SetAny()
538
+		ordinal, err = bitmask.SetAny(serial)
539 539
 	} else if prefAddress != nil {
540 540
 		hostPart, e := types.GetHostPartIP(prefAddress, base.Mask)
541 541
 		if e != nil {
... ...
@@ -544,7 +552,7 @@ func (a *Allocator) getAddress(nw *net.IPNet, bitmask *bitseq.Handle, prefAddres
544 544
 		ordinal = ipToUint64(types.GetMinimalIP(hostPart))
545 545
 		err = bitmask.Set(ordinal)
546 546
 	} else {
547
-		ordinal, err = bitmask.SetAnyInRange(ipr.Start, ipr.End)
547
+		ordinal, err = bitmask.SetAnyInRange(ipr.Start, ipr.End, serial)
548 548
 	}
549 549
 
550 550
 	switch err {
551 551
new file mode 100644
... ...
@@ -0,0 +1,10 @@
0
+package ipamapi
1
+
2
+const (
3
+	// Prefix constant marks the reserved label space for libnetwork
4
+	Prefix = "com.docker.network"
5
+
6
+	// AllocSerialPrefix constant marks the reserved label space for libnetwork ipam
7
+	// allocation ordering (serial/first available).
8
+	AllocSerialPrefix = Prefix + ".ipam.serial"
9
+)
... ...
@@ -456,7 +456,7 @@ func RawCombinedOutputNative(args ...string) error {
456 456
 
457 457
 // ExistChain checks if a chain exists
458 458
 func ExistChain(chain string, table Table) bool {
459
-	if _, err := Raw("-t", string(table), "-L", chain); err == nil {
459
+	if _, err := Raw("-t", string(table), "-nL", chain); err == nil {
460 460
 		return true
461 461
 	}
462 462
 	return false
... ...
@@ -32,7 +32,7 @@ func (nDB *NetworkDB) sendNetworkEvent(nid string, event NetworkEvent_Type, ltim
32 32
 	nEvent := NetworkEvent{
33 33
 		Type:      event,
34 34
 		LTime:     ltime,
35
-		NodeName:  nDB.config.NodeName,
35
+		NodeName:  nDB.config.NodeID,
36 36
 		NetworkID: nid,
37 37
 	}
38 38
 
... ...
@@ -44,7 +44,7 @@ func (nDB *NetworkDB) sendNetworkEvent(nid string, event NetworkEvent_Type, ltim
44 44
 	nDB.networkBroadcasts.QueueBroadcast(&networkEventMessage{
45 45
 		msg:  raw,
46 46
 		id:   nid,
47
-		node: nDB.config.NodeName,
47
+		node: nDB.config.NodeID,
48 48
 	})
49 49
 	return nil
50 50
 }
... ...
@@ -72,7 +72,7 @@ func (nDB *NetworkDB) sendNodeEvent(event NodeEvent_Type) error {
72 72
 	nEvent := NodeEvent{
73 73
 		Type:     event,
74 74
 		LTime:    nDB.networkClock.Increment(),
75
-		NodeName: nDB.config.NodeName,
75
+		NodeName: nDB.config.NodeID,
76 76
 	}
77 77
 
78 78
 	raw, err := encodeMessage(MessageTypeNodeEvent, &nEvent)
... ...
@@ -129,7 +129,7 @@ func (nDB *NetworkDB) sendTableEvent(event TableEvent_Type, nid string, tname st
129 129
 	tEvent := TableEvent{
130 130
 		Type:      event,
131 131
 		LTime:     entry.ltime,
132
-		NodeName:  nDB.config.NodeName,
132
+		NodeName:  nDB.config.NodeID,
133 133
 		NetworkID: nid,
134 134
 		TableName: tname,
135 135
 		Key:       key,
... ...
@@ -145,7 +145,7 @@ func (nDB *NetworkDB) sendTableEvent(event TableEvent_Type, nid string, tname st
145 145
 
146 146
 	var broadcastQ *memberlist.TransmitLimitedQueue
147 147
 	nDB.RLock()
148
-	thisNodeNetworks, ok := nDB.networks[nDB.config.NodeName]
148
+	thisNodeNetworks, ok := nDB.networks[nDB.config.NodeID]
149 149
 	if ok {
150 150
 		// The network may have been removed
151 151
 		network, networkOk := thisNodeNetworks[nid]
... ...
@@ -168,7 +168,7 @@ func (nDB *NetworkDB) sendTableEvent(event TableEvent_Type, nid string, tname st
168 168
 		id:    nid,
169 169
 		tname: tname,
170 170
 		key:   key,
171
-		node:  nDB.config.NodeName,
171
+		node:  nDB.config.NodeID,
172 172
 	})
173 173
 	return nil
174 174
 }
... ...
@@ -106,7 +106,7 @@ func (nDB *NetworkDB) clusterInit() error {
106 106
 	nDB.lastHealthTimestamp = nDB.lastStatsTimestamp
107 107
 
108 108
 	config := memberlist.DefaultLANConfig()
109
-	config.Name = nDB.config.NodeName
109
+	config.Name = nDB.config.NodeID
110 110
 	config.BindAddr = nDB.config.BindAddr
111 111
 	config.AdvertiseAddr = nDB.config.AdvertiseAddr
112 112
 	config.UDPBufferSize = nDB.config.PacketBufferSize
... ...
@@ -329,7 +329,7 @@ func (nDB *NetworkDB) reapTableEntries() {
329 329
 	var nodeNetworks []string
330 330
 	// This is best effort, if the list of network changes will be picked up in the next cycle
331 331
 	nDB.RLock()
332
-	for nid := range nDB.networks[nDB.config.NodeName] {
332
+	for nid := range nDB.networks[nDB.config.NodeID] {
333 333
 		nodeNetworks = append(nodeNetworks, nid)
334 334
 	}
335 335
 	nDB.RUnlock()
... ...
@@ -376,7 +376,7 @@ func (nDB *NetworkDB) reapTableEntries() {
376 376
 func (nDB *NetworkDB) gossip() {
377 377
 	networkNodes := make(map[string][]string)
378 378
 	nDB.RLock()
379
-	thisNodeNetworks := nDB.networks[nDB.config.NodeName]
379
+	thisNodeNetworks := nDB.networks[nDB.config.NodeID]
380 380
 	for nid := range thisNodeNetworks {
381 381
 		networkNodes[nid] = nDB.networkNodes[nid]
382 382
 
... ...
@@ -388,7 +388,7 @@ func (nDB *NetworkDB) gossip() {
388 388
 	if printHealth {
389 389
 		healthScore := nDB.memberlist.GetHealthScore()
390 390
 		if healthScore != 0 {
391
-			logrus.Warnf("NetworkDB stats - healthscore:%d (connectivity issues)", healthScore)
391
+			logrus.Warnf("NetworkDB stats %v(%v) - healthscore:%d (connectivity issues)", nDB.config.Hostname, nDB.config.NodeID, healthScore)
392 392
 		}
393 393
 		nDB.lastHealthTimestamp = time.Now()
394 394
 	}
... ...
@@ -419,7 +419,8 @@ func (nDB *NetworkDB) gossip() {
419 419
 		// Collect stats and print the queue info, note this code is here also to have a view of the queues empty
420 420
 		network.qMessagesSent += len(msgs)
421 421
 		if printStats {
422
-			logrus.Infof("NetworkDB stats - netID:%s leaving:%t netPeers:%d entries:%d Queue qLen:%d netMsg/s:%d",
422
+			logrus.Infof("NetworkDB stats %v(%v) - netID:%s leaving:%t netPeers:%d entries:%d Queue qLen:%d netMsg/s:%d",
423
+				nDB.config.Hostname, nDB.config.NodeID,
423 424
 				nid, network.leaving, broadcastQ.NumNodes(), network.entriesNumber, broadcastQ.NumQueued(),
424 425
 				network.qMessagesSent/int((nDB.config.StatsPrintPeriod/time.Second)))
425 426
 			network.qMessagesSent = 0
... ...
@@ -456,7 +457,7 @@ func (nDB *NetworkDB) gossip() {
456 456
 func (nDB *NetworkDB) bulkSyncTables() {
457 457
 	var networks []string
458 458
 	nDB.RLock()
459
-	for nid, network := range nDB.networks[nDB.config.NodeName] {
459
+	for nid, network := range nDB.networks[nDB.config.NodeID] {
460 460
 		if network.leaving {
461 461
 			continue
462 462
 		}
... ...
@@ -522,10 +523,10 @@ func (nDB *NetworkDB) bulkSync(nodes []string, all bool) ([]string, error) {
522 522
 	var err error
523 523
 	var networks []string
524 524
 	for _, node := range nodes {
525
-		if node == nDB.config.NodeName {
525
+		if node == nDB.config.NodeID {
526 526
 			continue
527 527
 		}
528
-		logrus.Debugf("%s: Initiating bulk sync with node %v", nDB.config.NodeName, node)
528
+		logrus.Debugf("%v(%v): Initiating bulk sync with node %v", nDB.config.Hostname, nDB.config.NodeID, node)
529 529
 		networks = nDB.findCommonNetworks(node)
530 530
 		err = nDB.bulkSyncNode(networks, node, true)
531 531
 		// if its periodic bulksync stop after the first successful sync
... ...
@@ -556,7 +557,8 @@ func (nDB *NetworkDB) bulkSyncNode(networks []string, node string, unsolicited b
556 556
 		unsolMsg = "unsolicited"
557 557
 	}
558 558
 
559
-	logrus.Debugf("%s: Initiating %s bulk sync for networks %v with node %s", nDB.config.NodeName, unsolMsg, networks, node)
559
+	logrus.Debugf("%v(%v): Initiating %s bulk sync for networks %v with node %s",
560
+		nDB.config.Hostname, nDB.config.NodeID, unsolMsg, networks, node)
560 561
 
561 562
 	nDB.RLock()
562 563
 	mnode := nDB.nodes[node]
... ...
@@ -608,7 +610,7 @@ func (nDB *NetworkDB) bulkSyncNode(networks []string, node string, unsolicited b
608 608
 	bsm := BulkSyncMessage{
609 609
 		LTime:       nDB.tableClock.Time(),
610 610
 		Unsolicited: unsolicited,
611
-		NodeName:    nDB.config.NodeName,
611
+		NodeName:    nDB.config.NodeID,
612 612
 		Networks:    networks,
613 613
 		Payload:     compound,
614 614
 	}
... ...
@@ -640,7 +642,7 @@ func (nDB *NetworkDB) bulkSyncNode(networks []string, node string, unsolicited b
640 640
 		case <-t.C:
641 641
 			logrus.Errorf("Bulk sync to node %s timed out", node)
642 642
 		case <-ch:
643
-			logrus.Debugf("%s: Bulk sync to node %s took %s", nDB.config.NodeName, node, time.Since(startTime))
643
+			logrus.Debugf("%v(%v): Bulk sync to node %s took %s", nDB.config.Hostname, nDB.config.NodeID, node, time.Since(startTime))
644 644
 		}
645 645
 		t.Stop()
646 646
 	}
... ...
@@ -677,7 +679,7 @@ OUTER:
677 677
 		idx := randomOffset(n)
678 678
 		node := nodes[idx]
679 679
 
680
-		if node == nDB.config.NodeName {
680
+		if node == nDB.config.NodeID {
681 681
 			continue
682 682
 		}
683 683
 
... ...
@@ -2,7 +2,6 @@ package networkdb
2 2
 
3 3
 import (
4 4
 	"net"
5
-	"strings"
6 5
 	"time"
7 6
 
8 7
 	"github.com/gogo/protobuf/proto"
... ...
@@ -58,29 +57,6 @@ func (nDB *NetworkDB) checkAndGetNode(nEvent *NodeEvent) *node {
58 58
 	return nil
59 59
 }
60 60
 
61
-func (nDB *NetworkDB) purgeSameNode(n *node) {
62
-	nDB.Lock()
63
-	defer nDB.Unlock()
64
-
65
-	prefix := strings.Split(n.Name, "-")[0]
66
-	for _, nodes := range []map[string]*node{
67
-		nDB.failedNodes,
68
-		nDB.leftNodes,
69
-		nDB.nodes,
70
-	} {
71
-		var nodeNames []string
72
-		for name, node := range nodes {
73
-			if strings.HasPrefix(name, prefix) && n.Addr.Equal(node.Addr) {
74
-				nodeNames = append(nodeNames, name)
75
-			}
76
-		}
77
-
78
-		for _, name := range nodeNames {
79
-			delete(nodes, name)
80
-		}
81
-	}
82
-}
83
-
84 61
 func (nDB *NetworkDB) handleNodeEvent(nEvent *NodeEvent) bool {
85 62
 	// Update our local clock if the received messages has newer
86 63
 	// time.
... ...
@@ -108,7 +84,6 @@ func (nDB *NetworkDB) handleNodeEvent(nEvent *NodeEvent) bool {
108 108
 		return false
109 109
 	}
110 110
 
111
-	nDB.purgeSameNode(n)
112 111
 	n.ltime = nEvent.LTime
113 112
 
114 113
 	switch nEvent.Type {
... ...
@@ -140,7 +115,7 @@ func (nDB *NetworkDB) handleNetworkEvent(nEvent *NetworkEvent) bool {
140 140
 	nDB.Lock()
141 141
 	defer nDB.Unlock()
142 142
 
143
-	if nEvent.NodeName == nDB.config.NodeName {
143
+	if nEvent.NodeName == nDB.config.NodeID {
144 144
 		return false
145 145
 	}
146 146
 
... ...
@@ -203,7 +178,7 @@ func (nDB *NetworkDB) handleTableEvent(tEvent *TableEvent) bool {
203 203
 
204 204
 	// Ignore the table events for networks that are in the process of going away
205 205
 	nDB.RLock()
206
-	networks := nDB.networks[nDB.config.NodeName]
206
+	networks := nDB.networks[nDB.config.NodeID]
207 207
 	network, ok := networks[tEvent.NetworkID]
208 208
 	// Check if the owner of the event is still part of the network
209 209
 	nodes := nDB.networkNodes[tEvent.NetworkID]
... ...
@@ -253,7 +228,8 @@ func (nDB *NetworkDB) handleTableEvent(tEvent *TableEvent) bool {
253 253
 		// If it is a delete event and we did not have a state for it, don't propagate to the application
254 254
 		// If the residual reapTime is lower or equal to 1/6 of the total reapTime don't bother broadcasting it around
255 255
 		// most likely the cluster is already aware of it, if not who will sync with this node will catch the state too.
256
-		return e.reapTime > reapPeriod/6
256
+		// This also prevents deletions of entries that are close to garbage collection from circulating forever
257
+		return e.reapTime > reapEntryInterval/6
257 258
 	}
258 259
 
259 260
 	var op opType
... ...
@@ -292,7 +268,7 @@ func (nDB *NetworkDB) handleTableMessage(buf []byte, isBulkSync bool) {
292 292
 	}
293 293
 
294 294
 	// Ignore messages that this node generated.
295
-	if tEvent.NodeName == nDB.config.NodeName {
295
+	if tEvent.NodeName == nDB.config.NodeID {
296 296
 		return
297 297
 	}
298 298
 
... ...
@@ -305,7 +281,7 @@ func (nDB *NetworkDB) handleTableMessage(buf []byte, isBulkSync bool) {
305 305
 		}
306 306
 
307 307
 		nDB.RLock()
308
-		n, ok := nDB.networks[nDB.config.NodeName][tEvent.NetworkID]
308
+		n, ok := nDB.networks[nDB.config.NodeID][tEvent.NetworkID]
309 309
 		nDB.RUnlock()
310 310
 
311 311
 		// if the network is not there anymore, OR we are leaving the network OR the broadcast queue is not present
... ...
@@ -424,7 +400,7 @@ func (nDB *NetworkDB) handleMessage(buf []byte, isBulkSync bool) {
424 424
 	case MessageTypeCompound:
425 425
 		nDB.handleCompound(data, isBulkSync)
426 426
 	default:
427
-		logrus.Errorf("%s: unknown message type %d", nDB.config.NodeName, mType)
427
+		logrus.Errorf("%v(%v): unknown message type %d", nDB.config.Hostname, nDB.config.NodeID, mType)
428 428
 	}
429 429
 }
430 430
 
... ...
@@ -457,7 +433,7 @@ func (d *delegate) LocalState(join bool) []byte {
457 457
 
458 458
 	pp := NetworkPushPull{
459 459
 		LTime:    d.nDB.networkClock.Time(),
460
-		NodeName: d.nDB.config.NodeName,
460
+		NodeName: d.nDB.config.NodeID,
461 461
 	}
462 462
 
463 463
 	for name, nn := range d.nDB.networks {
... ...
@@ -11,6 +11,7 @@ import (
11 11
 	"time"
12 12
 
13 13
 	"github.com/armon/go-radix"
14
+	"github.com/docker/docker/pkg/stringid"
14 15
 	"github.com/docker/go-events"
15 16
 	"github.com/docker/libnetwork/types"
16 17
 	"github.com/hashicorp/memberlist"
... ...
@@ -151,8 +152,11 @@ type network struct {
151 151
 // Config represents the configuration of the networdb instance and
152 152
 // can be passed by the caller.
153 153
 type Config struct {
154
-	// NodeName is the cluster wide unique name for this node.
155
-	NodeName string
154
+	// NodeID is the unique identifier of the node when it is part of the cluster
155
+	NodeID string
156
+
157
+	// Hostname is the node hostname.
158
+	Hostname string
156 159
 
157 160
 	// BindAddr is the IP on which networkdb listens. It can be
158 161
 	// 0.0.0.0 to listen on all addresses on the host.
... ...
@@ -210,7 +214,8 @@ type entry struct {
210 210
 func DefaultConfig() *Config {
211 211
 	hostname, _ := os.Hostname()
212 212
 	return &Config{
213
-		NodeName:          hostname,
213
+		NodeID:            stringid.TruncateID(stringid.GenerateRandomID()),
214
+		Hostname:          hostname,
214 215
 		BindAddr:          "0.0.0.0",
215 216
 		PacketBufferSize:  1400,
216 217
 		StatsPrintPeriod:  5 * time.Minute,
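
For illustration, a minimal sketch of building a NetworkDB instance with the new fields (not part of the vendored change; the addresses are placeholders): DefaultConfig already generates a random NodeID, which is used as the memberlist name and as the key into nDB.networks, while Hostname is only added to log messages.

package sketch

import "github.com/docker/libnetwork/networkdb"

// newClusterDB builds a NetworkDB instance using the split NodeID/Hostname config.
func newClusterDB() (*networkdb.NetworkDB, error) {
	conf := networkdb.DefaultConfig() // NodeID is already a random truncated ID
	conf.Hostname = "node-1"          // only used to make log lines readable
	conf.BindAddr = "0.0.0.0"
	conf.AdvertiseAddr = "192.0.2.10" // placeholder advertise address
	return networkdb.New(conf)
}
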
... ...
@@ -236,6 +241,7 @@ func New(c *Config) (*NetworkDB, error) {
236 236
 	nDB.indexes[byTable] = radix.New()
237 237
 	nDB.indexes[byNetwork] = radix.New()
238 238
 
239
+	logrus.Debugf("New memberlist node - Node:%v will use memberlist nodeID:%v", c.Hostname, c.NodeID)
239 240
 	if err := nDB.clusterInit(); err != nil {
240 241
 		return nil, err
241 242
 	}
... ...
@@ -259,8 +265,11 @@ func (nDB *NetworkDB) Join(members []string) error {
259 259
 // stopping timers, canceling goroutines etc.
260 260
 func (nDB *NetworkDB) Close() {
261 261
 	if err := nDB.clusterLeave(); err != nil {
262
-		logrus.Errorf("Could not close DB %s: %v", nDB.config.NodeName, err)
262
+		logrus.Errorf("%v(%v) Could not close DB: %v", nDB.config.Hostname, nDB.config.NodeID, err)
263 263
 	}
264
+
265
+	//Avoid (*Broadcaster).run goroutine leak
266
+	nDB.broadcaster.Close()
264 267
 }
265 268
 
266 269
 // ClusterPeers returns all the gossip cluster peers.
... ...
@@ -334,7 +343,7 @@ func (nDB *NetworkDB) CreateEntry(tname, nid, key string, value []byte) error {
334 334
 
335 335
 	entry := &entry{
336 336
 		ltime: nDB.tableClock.Increment(),
337
-		node:  nDB.config.NodeName,
337
+		node:  nDB.config.NodeID,
338 338
 		value: value,
339 339
 	}
340 340
 
... ...
@@ -360,7 +369,7 @@ func (nDB *NetworkDB) UpdateEntry(tname, nid, key string, value []byte) error {
360 360
 
361 361
 	entry := &entry{
362 362
 		ltime: nDB.tableClock.Increment(),
363
-		node:  nDB.config.NodeName,
363
+		node:  nDB.config.NodeID,
364 364
 		value: value,
365 365
 	}
366 366
 
... ...
@@ -402,7 +411,7 @@ func (nDB *NetworkDB) DeleteEntry(tname, nid, key string) error {
402 402
 
403 403
 	entry := &entry{
404 404
 		ltime:    nDB.tableClock.Increment(),
405
-		node:     nDB.config.NodeName,
405
+		node:     nDB.config.NodeID,
406 406
 		value:    value,
407 407
 		deleting: true,
408 408
 		reapTime: reapEntryInterval,
... ...
@@ -451,7 +460,7 @@ func (nDB *NetworkDB) deleteNetworkEntriesForNode(deletedNode string) {
451 451
 //		  entries owned by remote nodes, we will accept them and we notify the application
452 452
 func (nDB *NetworkDB) deleteNodeNetworkEntries(nid, node string) {
453 453
 	// Indicates if the delete is triggered for the local node
454
-	isNodeLocal := node == nDB.config.NodeName
454
+	isNodeLocal := node == nDB.config.NodeID
455 455
 
456 456
 	nDB.indexes[byNetwork].WalkPrefix(fmt.Sprintf("/%s", nid),
457 457
 		func(path string, v interface{}) bool {
... ...
@@ -496,7 +505,10 @@ func (nDB *NetworkDB) deleteNodeNetworkEntries(nid, node string) {
496 496
 				nDB.deleteEntry(nid, tname, key)
497 497
 			}
498 498
 
499
-			nDB.broadcaster.Write(makeEvent(opDelete, tname, nid, key, entry.value))
499
+			// Notify the upper layer only of entries not already marked for deletion
500
+			if !oldEntry.deleting {
501
+				nDB.broadcaster.Write(makeEvent(opDelete, tname, nid, key, entry.value))
502
+			}
500 503
 			return false
501 504
 		})
502 505
 }
... ...
@@ -552,10 +564,10 @@ func (nDB *NetworkDB) JoinNetwork(nid string) error {
552 552
 	ltime := nDB.networkClock.Increment()
553 553
 
554 554
 	nDB.Lock()
555
-	nodeNetworks, ok := nDB.networks[nDB.config.NodeName]
555
+	nodeNetworks, ok := nDB.networks[nDB.config.NodeID]
556 556
 	if !ok {
557 557
 		nodeNetworks = make(map[string]*network)
558
-		nDB.networks[nDB.config.NodeName] = nodeNetworks
558
+		nDB.networks[nDB.config.NodeID] = nodeNetworks
559 559
 	}
560 560
 	n, ok := nodeNetworks[nid]
561 561
 	var entries int
... ...
@@ -571,8 +583,7 @@ func (nDB *NetworkDB) JoinNetwork(nid string) error {
571 571
 		},
572 572
 		RetransmitMult: 4,
573 573
 	}
574
-
575
-	nDB.addNetworkNode(nid, nDB.config.NodeName)
574
+	nDB.addNetworkNode(nid, nDB.config.NodeID)
576 575
 	networkNodes := nDB.networkNodes[nid]
577 576
 	nDB.Unlock()
578 577
 
... ...
@@ -580,7 +591,7 @@ func (nDB *NetworkDB) JoinNetwork(nid string) error {
580 580
 		return fmt.Errorf("failed to send leave network event for %s: %v", nid, err)
581 581
 	}
582 582
 
583
-	logrus.Debugf("%s: joined network %s", nDB.config.NodeName, nid)
583
+	logrus.Debugf("%v(%v): joined network %s", nDB.config.Hostname, nDB.config.NodeID, nid)
584 584
 	if _, err := nDB.bulkSync(networkNodes, true); err != nil {
585 585
 		logrus.Errorf("Error bulk syncing while joining network %s: %v", nid, err)
586 586
 	}
... ...
@@ -604,12 +615,12 @@ func (nDB *NetworkDB) LeaveNetwork(nid string) error {
604 604
 	defer nDB.Unlock()
605 605
 
606 606
 	// Remove myself from the list of the nodes participating to the network
607
-	nDB.deleteNetworkNode(nid, nDB.config.NodeName)
607
+	nDB.deleteNetworkNode(nid, nDB.config.NodeID)
608 608
 
609 609
 	// Update all the local entries marking them for deletion and delete all the remote entries
610
-	nDB.deleteNodeNetworkEntries(nid, nDB.config.NodeName)
610
+	nDB.deleteNodeNetworkEntries(nid, nDB.config.NodeID)
611 611
 
612
-	nodeNetworks, ok := nDB.networks[nDB.config.NodeName]
612
+	nodeNetworks, ok := nDB.networks[nDB.config.NodeID]
613 613
 	if !ok {
614 614
 		return fmt.Errorf("could not find self node for network %s while trying to leave", nid)
615 615
 	}
... ...
@@ -619,7 +630,7 @@ func (nDB *NetworkDB) LeaveNetwork(nid string) error {
619 619
 		return fmt.Errorf("could not find network %s while trying to leave", nid)
620 620
 	}
621 621
 
622
-	logrus.Debugf("%s: leaving network %s", nDB.config.NodeName, nid)
622
+	logrus.Debugf("%v(%v): leaving network %s", nDB.config.Hostname, nDB.config.NodeID, nid)
623 623
 	n.ltime = ltime
624 624
 	n.reapTime = reapNetworkInterval
625 625
 	n.leaving = true
... ...
@@ -665,7 +676,7 @@ func (nDB *NetworkDB) findCommonNetworks(nodeName string) []string {
665 665
 	defer nDB.RUnlock()
666 666
 
667 667
 	var networks []string
668
-	for nid := range nDB.networks[nDB.config.NodeName] {
668
+	for nid := range nDB.networks[nDB.config.NodeID] {
669 669
 		if n, ok := nDB.networks[nodeName][nid]; ok {
670 670
 			if !n.leaving {
671 671
 				networks = append(networks, nid)
... ...
@@ -681,7 +692,7 @@ func (nDB *NetworkDB) updateLocalNetworkTime() {
681 681
 	defer nDB.Unlock()
682 682
 
683 683
 	ltime := nDB.networkClock.Increment()
684
-	for _, n := range nDB.networks[nDB.config.NodeName] {
684
+	for _, n := range nDB.networks[nDB.config.NodeID] {
685 685
 		n.ltime = ltime
686 686
 	}
687 687
 }
... ...
@@ -693,7 +704,7 @@ func (nDB *NetworkDB) createOrUpdateEntry(nid, tname, key string, entry interfac
693 693
 	_, okNetwork := nDB.indexes[byNetwork].Insert(fmt.Sprintf("/%s/%s/%s", nid, tname, key), entry)
694 694
 	if !okNetwork {
695 695
 		// Add only if it is an insert not an update
696
-		n, ok := nDB.networks[nDB.config.NodeName][nid]
696
+		n, ok := nDB.networks[nDB.config.NodeID][nid]
697 697
 		if ok {
698 698
 			n.entriesNumber++
699 699
 		}
... ...
@@ -708,7 +719,7 @@ func (nDB *NetworkDB) deleteEntry(nid, tname, key string) (bool, bool) {
708 708
 	_, okNetwork := nDB.indexes[byNetwork].Delete(fmt.Sprintf("/%s/%s/%s", nid, tname, key))
709 709
 	if okNetwork {
710 710
 		// Remove only if the delete is successful
711
-		n, ok := nDB.networks[nDB.config.NodeName][nid]
711
+		n, ok := nDB.networks[nDB.config.NodeID][nid]
712 712
 		if ok {
713 713
 			n.entriesNumber--
714 714
 		}
... ...
@@ -5,12 +5,7 @@ import "testing"
5 5
 // GenerateKey generates a sandbox key based on the passed
6 6
 // container id.
7 7
 func GenerateKey(containerID string) string {
8
-	maxLen := 12
9
-	if len(containerID) < maxLen {
10
-		maxLen = len(containerID)
11
-	}
12
-
13
-	return containerID[:maxLen]
8
+	return containerID
14 9
 }
15 10
 
16 11
 // NewSandbox provides a new sandbox instance created in an os specific way
... ...
@@ -9,6 +9,17 @@ import (
9 9
 	"github.com/vishvananda/netlink"
10 10
 )
11 11
 
12
+// NeighborSearchError indicates that a neighbor entry was unexpectedly present or missing; present reports whether it exists
13
+type NeighborSearchError struct {
14
+	ip      net.IP
15
+	mac     net.HardwareAddr
16
+	present bool
17
+}
18
+
19
+func (n NeighborSearchError) Error() string {
20
+	return fmt.Sprintf("Search neighbor failed for IP %v, mac %v, present in db:%t", n.ip, n.mac, n.present)
21
+}
22
+
12 23
 // NeighOption is a function option type to set interface options
13 24
 type NeighOption func(nh *neigh)
14 25
 
... ...
@@ -41,7 +52,7 @@ func (n *networkNamespace) DeleteNeighbor(dstIP net.IP, dstMac net.HardwareAddr,
41 41
 
42 42
 	nh := n.findNeighbor(dstIP, dstMac)
43 43
 	if nh == nil {
44
-		return fmt.Errorf("could not find the neighbor entry to delete")
44
+		return NeighborSearchError{dstIP, dstMac, false}
45 45
 	}
46 46
 
47 47
 	if osDelete {
... ...
@@ -103,26 +114,27 @@ func (n *networkNamespace) DeleteNeighbor(dstIP net.IP, dstMac net.HardwareAddr,
103 103
 		}
104 104
 	}
105 105
 	n.Unlock()
106
-	logrus.Debugf("Neighbor entry deleted for IP %v, mac %v", dstIP, dstMac)
106
+	logrus.Debugf("Neighbor entry deleted for IP %v, mac %v osDelete:%t", dstIP, dstMac, osDelete)
107 107
 
108 108
 	return nil
109 109
 }
110 110
 
111 111
 func (n *networkNamespace) AddNeighbor(dstIP net.IP, dstMac net.HardwareAddr, force bool, options ...NeighOption) error {
112 112
 	var (
113
-		iface netlink.Link
114
-		err   error
113
+		iface                  netlink.Link
114
+		err                    error
115
+		neighborAlreadyPresent bool
115 116
 	)
116 117
 
117 118
 	// If the namespace already has the neighbor entry but the AddNeighbor is called
118 119
 	// because of a miss notification (force flag) program the kernel anyway.
119 120
 	nh := n.findNeighbor(dstIP, dstMac)
120 121
 	if nh != nil {
122
+		neighborAlreadyPresent = true
123
+		logrus.Warnf("Neighbor entry already present for IP %v, mac %v neighbor:%+v forceUpdate:%t", dstIP, dstMac, nh, force)
121 124
 		if !force {
122
-			logrus.Warnf("Neighbor entry already present for IP %v, mac %v", dstIP, dstMac)
123
-			return nil
125
+			return NeighborSearchError{dstIP, dstMac, true}
124 126
 		}
125
-		logrus.Warnf("Force kernel update, Neighbor entry already present for IP %v, mac %v", dstIP, dstMac)
126 127
 	}
127 128
 
128 129
 	nh = &neigh{
... ...
@@ -146,8 +158,7 @@ func (n *networkNamespace) AddNeighbor(dstIP net.IP, dstMac net.HardwareAddr, fo
146 146
 	if nh.linkDst != "" {
147 147
 		iface, err = nlh.LinkByName(nh.linkDst)
148 148
 		if err != nil {
149
-			return fmt.Errorf("could not find interface with destination name %s: %v",
150
-				nh.linkDst, err)
149
+			return fmt.Errorf("could not find interface with destination name %s: %v", nh.linkDst, err)
151 150
 		}
152 151
 	}
153 152
 
... ...
@@ -167,13 +178,17 @@ func (n *networkNamespace) AddNeighbor(dstIP net.IP, dstMac net.HardwareAddr, fo
167 167
 	}
168 168
 
169 169
 	if err := nlh.NeighSet(nlnh); err != nil {
170
-		return fmt.Errorf("could not add neighbor entry: %v", err)
170
+		return fmt.Errorf("could not add neighbor entry:%+v error:%v", nlnh, err)
171
+	}
172
+
173
+	if neighborAlreadyPresent {
174
+		return nil
171 175
 	}
172 176
 
173 177
 	n.Lock()
174 178
 	n.neighbors = append(n.neighbors, nh)
175 179
 	n.Unlock()
176
-	logrus.Debugf("Neighbor entry added for IP %v, mac %v", dstIP, dstMac)
180
+	logrus.Debugf("Neighbor entry added for IP:%v, mac:%v on ifc:%s", dstIP, dstMac, nh.linkName)
177 181
 
178 182
 	return nil
179 183
 }
... ...
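With the AddNeighbor/DeleteNeighbor hunks above, lookup mismatches now surface as a typed NeighborSearchError instead of a bare error string or a silent return, so callers can treat "already present" and "not found" as soft failures. A hedged caller-side sketch, assuming the exported osl.Sandbox interface carries the same AddNeighbor signature shown here (the wrapper function and package name are illustrative, not part of this change):

package peerprog

import (
	"net"

	"github.com/docker/libnetwork/osl"
)

// programNeighbor is an illustrative helper: it treats "entry already present"
// as a no-op and only propagates real programming errors.
func programNeighbor(sbox osl.Sandbox, ip net.IP, mac net.HardwareAddr) error {
	err := sbox.AddNeighbor(ip, mac, false)
	if _, ok := err.(osl.NeighborSearchError); ok {
		// The neighbor is already tracked in the sandbox's in-memory table.
		return nil
	}
	return err
}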
@@ -574,6 +574,7 @@ func (na *cnmNetworkAllocator) releaseEndpoints(networks []*api.NetworkAttachmen
574 574
 
575 575
 // allocate virtual IP for a single endpoint attachment of the service.
576 576
 func (na *cnmNetworkAllocator) allocateVIP(vip *api.Endpoint_VirtualIP) error {
577
+	var opts map[string]string
577 578
 	localNet := na.getNetwork(vip.NetworkID)
578 579
 	if localNet == nil {
579 580
 		return errors.New("networkallocator: could not find local network state")
... ...
@@ -603,9 +604,13 @@ func (na *cnmNetworkAllocator) allocateVIP(vip *api.Endpoint_VirtualIP) error {
603 603
 			return err
604 604
 		}
605 605
 	}
606
+	if localNet.nw.IPAM != nil && localNet.nw.IPAM.Driver != nil {
607
+		// set ipam allocation method to serial
608
+		opts = setIPAMSerialAlloc(localNet.nw.IPAM.Driver.Options)
609
+	}
606 610
 
607 611
 	for _, poolID := range localNet.pools {
608
-		ip, _, err := ipam.RequestAddress(poolID, addr, nil)
612
+		ip, _, err := ipam.RequestAddress(poolID, addr, opts)
609 613
 		if err != nil && err != ipamapi.ErrNoAvailableIPs && err != ipamapi.ErrIPOutOfRange {
610 614
 			return errors.Wrap(err, "could not allocate VIP from IPAM")
611 615
 		}
... ...
@@ -657,6 +662,7 @@ func (na *cnmNetworkAllocator) deallocateVIP(vip *api.Endpoint_VirtualIP) error
657 657
 // allocate the IP addresses for a single network attachment of the task.
658 658
 func (na *cnmNetworkAllocator) allocateNetworkIPs(nAttach *api.NetworkAttachment) error {
659 659
 	var ip *net.IPNet
660
+	var opts map[string]string
660 661
 
661 662
 	ipam, _, _, err := na.resolveIPAM(nAttach.Network)
662 663
 	if err != nil {
... ...
@@ -686,11 +692,16 @@ func (na *cnmNetworkAllocator) allocateNetworkIPs(nAttach *api.NetworkAttachment
686 686
 				}
687 687
 			}
688 688
 		}
689
+		// Set the ipam options if the network has an ipam driver.
690
+		if localNet.nw.IPAM != nil && localNet.nw.IPAM.Driver != nil {
691
+			// set ipam allocation method to serial
692
+			opts = setIPAMSerialAlloc(localNet.nw.IPAM.Driver.Options)
693
+		}
689 694
 
690 695
 		for _, poolID := range localNet.pools {
691 696
 			var err error
692 697
 
693
-			ip, _, err = ipam.RequestAddress(poolID, addr, nil)
698
+			ip, _, err = ipam.RequestAddress(poolID, addr, opts)
694 699
 			if err != nil && err != ipamapi.ErrNoAvailableIPs && err != ipamapi.ErrIPOutOfRange {
695 700
 				return errors.Wrap(err, "could not allocate IP from IPAM")
696 701
 			}
... ...
@@ -918,8 +929,16 @@ func (na *cnmNetworkAllocator) allocatePools(n *api.Network) (map[string]string,
918 918
 			}
919 919
 			gwIP.IP = ip
920 920
 		}
921
+		if dOptions == nil {
922
+			dOptions = make(map[string]string)
923
+		}
924
+		dOptions[ipamapi.RequestAddressType] = netlabel.Gateway
925
+		// set ipam allocation method to serial
926
+		dOptions = setIPAMSerialAlloc(dOptions)
927
+		defer delete(dOptions, ipamapi.RequestAddressType)
928
+
921 929
 		if ic.Gateway != "" || gwIP == nil {
922
-			gwIP, _, err = ipam.RequestAddress(poolID, net.ParseIP(ic.Gateway), map[string]string{ipamapi.RequestAddressType: netlabel.Gateway})
930
+			gwIP, _, err = ipam.RequestAddress(poolID, net.ParseIP(ic.Gateway), dOptions)
923 931
 			if err != nil {
924 932
 				// Rollback by releasing all the resources allocated so far.
925 933
 				releasePools(ipam, ipamConfigs[:i], pools)
... ...
@@ -980,3 +999,14 @@ func IsBuiltInDriver(name string) bool {
980 980
 	}
981 981
 	return false
982 982
 }
983
+
984
+// setIPAMSerialAlloc sets the ipam allocation method to serial
985
+func setIPAMSerialAlloc(opts map[string]string) map[string]string {
986
+	if opts == nil {
987
+		opts = make(map[string]string)
988
+	}
989
+	if _, ok := opts[ipamapi.AllocSerialPrefix]; !ok {
990
+		opts[ipamapi.AllocSerialPrefix] = "true"
991
+	}
992
+	return opts
993
+}
... ...
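The setIPAMSerialAlloc helper is deliberately non-destructive: it only injects the serial-allocation label when the caller has not already set it, which is why the gateway path above can layer it on top of RequestAddressType. A hypothetical check of that behavior, assuming it sits in the same package as the allocator code (the test name and package clause are illustrative):

package networkallocator

import (
	"testing"

	"github.com/docker/libnetwork/ipamapi"
)

// TestSetIPAMSerialAlloc is not part of this change; it only illustrates the
// helper's behavior with and without a caller-supplied value.
func TestSetIPAMSerialAlloc(t *testing.T) {
	opts := setIPAMSerialAlloc(nil)
	if opts[ipamapi.AllocSerialPrefix] != "true" {
		t.Fatalf("expected serial allocation to default to true, got %q", opts[ipamapi.AllocSerialPrefix])
	}

	opts = setIPAMSerialAlloc(map[string]string{ipamapi.AllocSerialPrefix: "false"})
	if opts[ipamapi.AllocSerialPrefix] != "false" {
		t.Fatal("an explicit caller-supplied value must not be overwritten")
	}
}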
@@ -382,7 +382,7 @@ func (ps *portSpace) allocate(p *api.PortConfig) (err error) {
382 382
 	}
383 383
 
384 384
 	// Check out an arbitrary port from dynamic port space.
385
-	swarmPort, err := ps.dynamicPortSpace.GetID()
385
+	swarmPort, err := ps.dynamicPortSpace.GetID(true)
386 386
 	if err != nil {
387 387
 		return
388 388
 	}
... ...
@@ -542,6 +542,7 @@ func (n *Node) Run(ctx context.Context) error {
542 542
 		n.done()
543 543
 	}()
544 544
 
545
+	// Flag that indicates if this manager node is *currently* the raft leader.
545 546
 	wasLeader := false
546 547
 	transferLeadershipLimit := rate.NewLimiter(rate.Every(time.Minute), 1)
547 548
 
... ...
@@ -563,10 +564,13 @@ func (n *Node) Run(ctx context.Context) error {
563 563
 				return errors.Wrap(err, "failed to save entries to storage")
564 564
 			}
565 565
 
566
+			// If the memory store lock has been held for too long,
567
+			// transferring leadership is an easy way to break out of it.
566 568
 			if wasLeader &&
567 569
 				(rd.SoftState == nil || rd.SoftState.RaftState == raft.StateLeader) &&
568 570
 				n.memoryStore.Wedged() &&
569 571
 				transferLeadershipLimit.Allow() {
572
+				log.G(ctx).Error("Attempting to transfer leadership")
570 573
 				if !n.opts.DisableStackDump {
571 574
 					signal.DumpStacks("")
572 575
 				}
... ...
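The wedged-store escape hatch above leans on a token-bucket limiter so a stuck leader does not thrash through repeated transfers. A minimal standalone sketch of the same rate-limiting pattern from golang.org/x/time/rate (the loop and messages are illustrative):

package main

import (
	"fmt"
	"time"

	"golang.org/x/time/rate"
)

func main() {
	// Same construction as in Run(): a burst of 1, refilled once per minute.
	transferLeadershipLimit := rate.NewLimiter(rate.Every(time.Minute), 1)

	for i := 0; i < 3; i++ {
		if transferLeadershipLimit.Allow() {
			fmt.Println("would attempt a leadership transfer")
		} else {
			fmt.Println("transfer suppressed: limiter not yet refilled")
		}
	}
}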
@@ -612,6 +616,8 @@ func (n *Node) Run(ctx context.Context) error {
612 612
 			if rd.SoftState != nil {
613 613
 				if wasLeader && rd.SoftState.RaftState != raft.StateLeader {
614 614
 					wasLeader = false
615
+					log.G(ctx).Error("soft state changed, node no longer a leader, resetting and cancelling all waits")
616
+
615 617
 					if atomic.LoadUint32(&n.signalledLeadership) == 1 {
616 618
 						atomic.StoreUint32(&n.signalledLeadership, 0)
617 619
 						n.leadershipBroadcast.Publish(IsFollower)
... ...
@@ -630,6 +636,7 @@ func (n *Node) Run(ctx context.Context) error {
630 630
 					// cancelAll, or by its own check of signalledLeadership.
631 631
 					n.wait.cancelAll()
632 632
 				} else if !wasLeader && rd.SoftState.RaftState == raft.StateLeader {
633
+					// Node just became a leader.
633 634
 					wasLeader = true
634 635
 				}
635 636
 			}
... ...
@@ -1478,7 +1485,7 @@ func (n *Node) registerNode(node *api.RaftMember) error {
1478 1478
 	return nil
1479 1479
 }
1480 1480
 
1481
-// ProposeValue calls Propose on the raft and waits
1481
+// ProposeValue calls Propose on the underlying raft library (etcd/raft) and waits
1482 1482
 // on the commit log action before returning a result
1483 1483
 func (n *Node) ProposeValue(ctx context.Context, storeAction []api.StoreAction, cb func()) error {
1484 1484
 	ctx, cancel := n.WithContext(ctx)
... ...
@@ -1654,11 +1661,14 @@ func (n *Node) saveToStorage(
1654 1654
 	return nil
1655 1655
 }
1656 1656
 
1657
-// processInternalRaftRequest sends a message to nodes participating
1658
-// in the raft to apply a log entry and then waits for it to be applied
1659
-// on the server. It will block until the update is performed, there is
1660
-// an error or until the raft node finalizes all the proposals on node
1661
-// shutdown.
1657
+// processInternalRaftRequest proposes a value to be appended to the raft log.
1658
+// It calls Propose() on etcd/raft, which calls back into the raft FSM,
1659
+// which then sends a message to each of the participating nodes
1660
+// in the raft group to apply a log entry and then waits for it to be applied
1661
+// on this node. It will block until:
1662
+// 1. This node gets the necessary replies back from the participating nodes and also performs the commit itself, or
1663
+// 2. There is an error, or
1664
+// 3. The raft node finalizes all the proposals on node shutdown.
1662 1665
 func (n *Node) processInternalRaftRequest(ctx context.Context, r *api.InternalRaftRequest, cb func()) (proto.Message, error) {
1663 1666
 	n.stopMu.RLock()
1664 1667
 	if !n.IsMember() {
... ...
@@ -1679,6 +1689,7 @@ func (n *Node) processInternalRaftRequest(ctx context.Context, r *api.InternalRa
1679 1679
 
1680 1680
 	// Do this check after calling register to avoid a race.
1681 1681
 	if atomic.LoadUint32(&n.signalledLeadership) != 1 {
1682
+		log.G(ctx).Error("node is no longer leader, aborting propose")
1682 1683
 		n.wait.cancel(r.ID)
1683 1684
 		return nil, ErrLostLeadership
1684 1685
 	}
... ...
@@ -1703,14 +1714,23 @@ func (n *Node) processInternalRaftRequest(ctx context.Context, r *api.InternalRa
1703 1703
 	select {
1704 1704
 	case x, ok := <-ch:
1705 1705
 		if !ok {
1706
+			// Wait notification channel was closed. This should only happen if the wait was cancelled.
1707
+			log.G(ctx).Error("wait cancelled")
1708
+			if atomic.LoadUint32(&n.signalledLeadership) == 1 {
1709
+				log.G(ctx).Error("wait cancelled but node is still a leader")
1710
+			}
1706 1711
 			return nil, ErrLostLeadership
1707 1712
 		}
1708 1713
 		return x.(proto.Message), nil
1709 1714
 	case <-waitCtx.Done():
1710 1715
 		n.wait.cancel(r.ID)
1711
-		// if channel is closed, wait item was canceled, otherwise it was triggered
1716
+		// If we can read from the channel, the wait item was triggered. Otherwise it was cancelled.
1712 1717
 		x, ok := <-ch
1713 1718
 		if !ok {
1719
+			log.G(ctx).WithError(waitCtx.Err()).Error("wait context cancelled")
1720
+			if atomic.LoadUint32(&n.signalledLeadership) == 1 {
1721
+				log.G(ctx).Error("wait context cancelled but node is still a leader")
1722
+			}
1714 1723
 			return nil, ErrLostLeadership
1715 1724
 		}
1716 1725
 		return x.(proto.Message), nil
... ...
@@ -1779,21 +1799,26 @@ func (n *Node) processEntry(ctx context.Context, entry raftpb.Entry) error {
1779 1779
 	}
1780 1780
 
1781 1781
 	if !n.wait.trigger(r.ID, r) {
1782
+		log.G(ctx).Errorf("wait not found for raft request id %x", r.ID)
1783
+
1782 1784
 		// There was no wait on this ID, meaning we don't have a
1783 1785
 		// transaction in progress that would be committed to the
1784 1786
 		// memory store by the "trigger" call. Either a different node
1785 1787
 		// wrote this to raft, or we wrote it before losing the leader
1786
-		// position and cancelling the transaction. Create a new
1787
-		// transaction to commit the data.
1788
+		// position and cancelling the transaction. This entry still needs
1789
+		// to be committed since other nodes have already committed it.
1790
+		// Create a new transaction to commit this entry.
1788 1791
 
1789 1792
 		// It should not be possible for processInternalRaftRequest
1790 1793
 		// to be running in this situation, but out of caution we
1791 1794
 		// cancel any current invocations to avoid a deadlock.
1795
+		// TODO(anshul) This call is likely redundant, remove after consideration.
1792 1796
 		n.wait.cancelAll()
1793 1797
 
1794 1798
 		err := n.memoryStore.ApplyStoreActions(r.Action)
1795 1799
 		if err != nil {
1796 1800
 			log.G(ctx).WithError(err).Error("failed to apply actions from raft")
1801
+			// TODO(anshul) return err here ?
1797 1802
 		}
1798 1803
 	}
1799 1804
 	return nil
... ...
@@ -83,8 +83,7 @@ func register(os ObjectStoreConfig) {
83 83
 	schema.Tables[os.Table.Name] = os.Table
84 84
 }
85 85
 
86
-// timedMutex wraps a sync.Mutex, and keeps track of how long it has been
87
-// locked.
86
+// timedMutex wraps a sync.Mutex, and keeps track of when it was locked.
88 87
 type timedMutex struct {
89 88
 	sync.Mutex
90 89
 	lockedAt atomic.Value
... ...
@@ -24,7 +24,7 @@ github.com/docker/go-connections 3ede32e2033de7505e6500d6c868c2b9ed9f169d
24 24
 github.com/docker/go-events 9461782956ad83b30282bf90e31fa6a70c255ba9
25 25
 github.com/docker/go-units 954fed01cc617c55d838fa2230073f2cb17386c8
26 26
 github.com/docker/libkv 9fd56606e928ff1f309808f5d5a0b7a2ef73f9a8
27
-github.com/docker/libnetwork 19ac3ea7f52bb46e0eb10669756cdae0c441a5b1 
27
+github.com/docker/libnetwork 21544598c53fa36a3c771a8725c643dd2340f845 
28 28
 github.com/docker/libtrust 9cbd2a1374f46905c68a4eb3694a130610adc62a
29 29
 github.com/opencontainers/runc d40db12e72a40109dfcf28539f5ee0930d2f0277
30 30
 github.com/opencontainers/go-digest 21dfd564fd89c944783d00d069f33e3e7123c448