Browse code

Vendor libnetwork and github.com/vishvananda/netlink

Signed-off-by: Aaron Lehmann <aaron.lehmann@docker.com>

Aaron Lehmann authored on 2016/07/22 06:27:47
Showing 33 changed files
... ...
@@ -65,7 +65,7 @@ clone git github.com/RackSec/srslog 259aed10dfa74ea2961eddd1d9847619f6e98837
65 65
 clone git github.com/imdario/mergo 0.2.1
66 66
 
67 67
 #get libnetwork packages
68
-clone git github.com/docker/libnetwork 905d374c096ca1f3a9b75529e52518b7540179f3
68
+clone git github.com/docker/libnetwork 83ab4deaa2da3deb32cb5e64ceec43801dc17370
69 69
 clone git github.com/docker/go-events afb2b9f2c23f33ada1a22b03651775fdc65a5089
70 70
 clone git github.com/armon/go-radix e39d623f12e8e41c7b5529e9a9dd67a1e2261f80
71 71
 clone git github.com/armon/go-metrics eb0af217e5e9747e41dd5303755356b62d28e3ec
... ...
@@ -75,7 +75,7 @@ clone git github.com/hashicorp/go-multierror fcdddc395df1ddf4247c69bd436e84cfa07
75 75
 clone git github.com/hashicorp/serf 598c54895cc5a7b1a24a398d635e8c0ea0959870
76 76
 clone git github.com/docker/libkv v0.2.1
77 77
 clone git github.com/vishvananda/netns 604eaf189ee867d8c147fafc28def2394e878d25
78
-clone git github.com/vishvananda/netlink 734d02c3e202f682c74b71314b2c61eec0170fd4
78
+clone git github.com/vishvananda/netlink e73bad418fd727ed3a02830b1af1ad0283a1de6c
79 79
 clone git github.com/BurntSushi/toml f706d00e3de6abe700c994cdd545a1a4915af060
80 80
 clone git github.com/samuel/go-zookeeper d0e0d8e11f318e000a8cc434616d69e329edc374
81 81
 clone git github.com/deckarep/golang-set ef32fa3046d9f249d399f98ebaf9be944430fd1d
... ...
@@ -35,6 +35,7 @@ func (b ByTime) Less(i, j int) bool { return b[i].LamportTime < b[j].LamportTime
35 35
 type agent struct {
36 36
 	networkDB         *networkdb.NetworkDB
37 37
 	bindAddr          string
38
+	advertiseAddr     string
38 39
 	epTblCancel       func()
39 40
 	driverCancelFuncs map[string][]func()
40 41
 }
... ...
@@ -236,25 +237,14 @@ func (c *controller) handleKeyChangeV1(keys []*types.EncryptionKey) error {
236 236
 func (c *controller) agentSetup() error {
237 237
 	clusterProvider := c.cfg.Daemon.ClusterProvider
238 238
 
239
-	bindAddr, _, _ := net.SplitHostPort(clusterProvider.GetListenAddress())
239
+	bindAddr := clusterProvider.GetLocalAddress()
240
+	advAddr := clusterProvider.GetAdvertiseAddress()
240 241
 	remote := clusterProvider.GetRemoteAddress()
241 242
 	remoteAddr, _, _ := net.SplitHostPort(remote)
242 243
 
243
-	// Determine the BindAddress from RemoteAddress or through best-effort routing
244
-	if !isValidClusteringIP(bindAddr) {
245
-		if !isValidClusteringIP(remoteAddr) {
246
-			remote = "8.8.8.8:53"
247
-		}
248
-		conn, err := net.Dial("udp", remote)
249
-		if err == nil {
250
-			bindHostPort := conn.LocalAddr().String()
251
-			bindAddr, _, _ = net.SplitHostPort(bindHostPort)
252
-			conn.Close()
253
-		}
254
-	}
255
-
256
-	if bindAddr != "" && c.agent == nil {
257
-		if err := c.agentInit(bindAddr); err != nil {
244
+	logrus.Infof("Initializing Libnetwork Agent Local-addr=%s Adv-addr=%s Remote-addr =%s", bindAddr, advAddr, remoteAddr)
245
+	if advAddr != "" && c.agent == nil {
246
+		if err := c.agentInit(bindAddr, advAddr); err != nil {
258 247
 			logrus.Errorf("Error in agentInit : %v", err)
259 248
 		} else {
260 249
 			c.drvRegistry.WalkDrivers(func(name string, driver driverapi.Driver, capability driverapi.Capability) bool {
... ...
@@ -312,7 +302,7 @@ func (c *controller) getPrimaryKeyTag(subsys string) ([]byte, uint64) {
312 312
 	return keys[1].Key, keys[1].LamportTime
313 313
 }
314 314
 
315
-func (c *controller) agentInit(bindAddrOrInterface string) error {
315
+func (c *controller) agentInit(bindAddrOrInterface, advertiseAddr string) error {
316 316
 	if !c.isAgent() {
317 317
 		return nil
318 318
 	}
... ...
@@ -325,9 +315,9 @@ func (c *controller) agentInit(bindAddrOrInterface string) error {
325 325
 	keys, tags := c.getKeys(subsysGossip)
326 326
 	hostname, _ := os.Hostname()
327 327
 	nDB, err := networkdb.New(&networkdb.Config{
328
-		BindAddr: bindAddr,
329
-		NodeName: hostname,
330
-		Keys:     keys,
328
+		AdvertiseAddr: advertiseAddr,
329
+		NodeName:      hostname,
330
+		Keys:          keys,
331 331
 	})
332 332
 
333 333
 	if err != nil {
... ...
@@ -339,6 +329,7 @@ func (c *controller) agentInit(bindAddrOrInterface string) error {
339 339
 	c.agent = &agent{
340 340
 		networkDB:         nDB,
341 341
 		bindAddr:          bindAddr,
342
+		advertiseAddr:     advertiseAddr,
342 343
 		epTblCancel:       cancel,
343 344
 		driverCancelFuncs: make(map[string][]func()),
344 345
 	}
... ...
@@ -377,8 +368,9 @@ func (c *controller) agentDriverNotify(d driverapi.Driver) {
377 377
 	}
378 378
 
379 379
 	d.DiscoverNew(discoverapi.NodeDiscovery, discoverapi.NodeDiscoveryData{
380
-		Address: c.agent.bindAddr,
381
-		Self:    true,
380
+		Address:     c.agent.advertiseAddr,
381
+		BindAddress: c.agent.bindAddr,
382
+		Self:        true,
382 383
 	})
383 384
 
384 385
 	drvEnc := discoverapi.DriverEncryptionConfig{}
... ...
@@ -4,7 +4,8 @@ package cluster
4 4
 type Provider interface {
5 5
 	IsManager() bool
6 6
 	IsAgent() bool
7
-	GetListenAddress() string
7
+	GetLocalAddress() string
8
+	GetAdvertiseAddress() string
8 9
 	GetRemoteAddress() string
9 10
 	ListenClusterEvents() <-chan struct{}
10 11
 }
... ...
@@ -11,6 +11,7 @@ import (
11 11
 	"github.com/docker/libnetwork/cluster"
12 12
 	"github.com/docker/libnetwork/datastore"
13 13
 	"github.com/docker/libnetwork/netlabel"
14
+	"github.com/docker/libnetwork/osl"
14 15
 )
15 16
 
16 17
 // Config encapsulates configurations of various Libnetwork components
... ...
@@ -197,6 +198,13 @@ func OptionDataDir(dataDir string) Option {
197 197
 	}
198 198
 }
199 199
 
200
+// OptionExecRoot function returns an option setter for exec root folder
201
+func OptionExecRoot(execRoot string) Option {
202
+	return func(c *Config) {
203
+		osl.SetBasePath(execRoot)
204
+	}
205
+}
206
+
200 207
 // ProcessOptions processes options and stores it in config
201 208
 func (c *Config) ProcessOptions(options ...Option) {
202 209
 	for _, opt := range options {
... ...
@@ -378,6 +378,10 @@ func (c *controller) ReloadConfiguration(cfgOptions ...config.Option) error {
378 378
 		return nil
379 379
 	}
380 380
 
381
+	c.Lock()
382
+	c.cfg = cfg
383
+	c.Unlock()
384
+
381 385
 	var dsConfig *discoverapi.DatastoreConfigData
382 386
 	for scope, sCfg := range cfg.Scopes {
383 387
 		if scope == datastore.LocalScope || !sCfg.IsValid() {
... ...
@@ -26,8 +26,9 @@ const (
26 26
 
27 27
 // NodeDiscoveryData represents the structure backing the node discovery data json string
28 28
 type NodeDiscoveryData struct {
29
-	Address string
30
-	Self    bool
29
+	Address     string
30
+	BindAddress string
31
+	Self        bool
31 32
 }
32 33
 
33 34
 // DatastoreConfigData is the data for the datastore update event message
... ...
@@ -83,9 +83,9 @@ func (d *driver) populateEndpoints() error {
83 83
 		n, ok := d.networks[ep.nid]
84 84
 		if !ok {
85 85
 			logrus.Debugf("Network (%s) not found for restored bridge endpoint (%s)", ep.nid[0:7], ep.id[0:7])
86
-			logrus.Debugf("Deleting stale bridge endpoint (%s) from store", ep.nid[0:7])
86
+			logrus.Debugf("Deleting stale bridge endpoint (%s) from store", ep.id[0:7])
87 87
 			if err := d.storeDelete(ep); err != nil {
88
-				logrus.Debugf("Failed to delete stale bridge endpoint (%s) from store", ep.nid[0:7])
88
+				logrus.Debugf("Failed to delete stale bridge endpoint (%s) from store", ep.id[0:7])
89 89
 			}
90 90
 			continue
91 91
 		}
... ...
@@ -82,6 +82,6 @@ func (d *driver) DeleteEndpoint(nid, eid string) error {
82 82
 	if err := d.storeDelete(ep); err != nil {
83 83
 		logrus.Warnf("Failed to remove ipvlan endpoint %s from store: %v", ep.id[0:7], err)
84 84
 	}
85
-
85
+	n.deleteEndpoint(ep.id)
86 86
 	return nil
87 87
 }
... ...
@@ -96,9 +96,9 @@ func (d *driver) populateEndpoints() error {
96 96
 		n, ok := d.networks[ep.nid]
97 97
 		if !ok {
98 98
 			logrus.Debugf("Network (%s) not found for restored ipvlan endpoint (%s)", ep.nid[0:7], ep.id[0:7])
99
-			logrus.Debugf("Deleting stale ipvlan endpoint (%s) from store", ep.nid[0:7])
99
+			logrus.Debugf("Deleting stale ipvlan endpoint (%s) from store", ep.id[0:7])
100 100
 			if err := d.storeDelete(ep); err != nil {
101
-				logrus.Debugf("Failed to delete stale ipvlan endpoint (%s) from store", ep.nid[0:7])
101
+				logrus.Debugf("Failed to delete stale ipvlan endpoint (%s) from store", ep.id[0:7])
102 102
 			}
103 103
 			continue
104 104
 		}
... ...
@@ -96,9 +96,9 @@ func (d *driver) populateEndpoints() error {
96 96
 		n, ok := d.networks[ep.nid]
97 97
 		if !ok {
98 98
 			logrus.Debugf("Network (%s) not found for restored macvlan endpoint (%s)", ep.nid[0:7], ep.id[0:7])
99
-			logrus.Debugf("Deleting stale macvlan endpoint (%s) from store", ep.nid[0:7])
99
+			logrus.Debugf("Deleting stale macvlan endpoint (%s) from store", ep.id[0:7])
100 100
 			if err := d.storeDelete(ep); err != nil {
101
-				logrus.Debugf("Failed to delete stale macvlan endpoint (%s) from store", ep.nid[0:7])
101
+				logrus.Debugf("Failed to delete stale macvlan endpoint (%s) from store", ep.id[0:7])
102 102
 			}
103 103
 			continue
104 104
 		}
... ...
@@ -2,23 +2,27 @@ package overlay
2 2
 
3 3
 import (
4 4
 	"bytes"
5
+	"encoding/binary"
5 6
 	"encoding/hex"
6 7
 	"fmt"
8
+	"hash/fnv"
7 9
 	"net"
8 10
 	"sync"
9 11
 	"syscall"
10 12
 
13
+	"strconv"
14
+
11 15
 	log "github.com/Sirupsen/logrus"
12 16
 	"github.com/docker/libnetwork/iptables"
13 17
 	"github.com/docker/libnetwork/ns"
14 18
 	"github.com/docker/libnetwork/types"
15 19
 	"github.com/vishvananda/netlink"
16
-	"strconv"
17 20
 )
18 21
 
19 22
 const (
20
-	mark    = uint32(0xD0C4E3)
21
-	timeout = 30
23
+	mark         = uint32(0xD0C4E3)
24
+	timeout      = 30
25
+	pktExpansion = 26 // SPI(4) + SeqN(4) + IV(8) + PadLength(1) + NextHeader(1) + ICV(8)
22 26
 )
23 27
 
24 28
 const (
... ...
@@ -85,6 +89,7 @@ func (d *driver) checkEncryption(nid string, rIP net.IP, vxlanID uint32, isLocal
85 85
 	}
86 86
 
87 87
 	lIP := types.GetMinimalIP(net.ParseIP(d.bindAddress))
88
+	aIP := types.GetMinimalIP(net.ParseIP(d.advertiseAddress))
88 89
 	nodes := map[string]net.IP{}
89 90
 
90 91
 	switch {
... ...
@@ -107,7 +112,7 @@ func (d *driver) checkEncryption(nid string, rIP net.IP, vxlanID uint32, isLocal
107 107
 
108 108
 	if add {
109 109
 		for _, rIP := range nodes {
110
-			if err := setupEncryption(lIP, rIP, vxlanID, d.secMap, d.keys); err != nil {
110
+			if err := setupEncryption(lIP, aIP, rIP, vxlanID, d.secMap, d.keys); err != nil {
111 111
 				log.Warnf("Failed to program network encryption between %s and %s: %v", lIP, rIP, err)
112 112
 			}
113 113
 		}
... ...
@@ -122,7 +127,7 @@ func (d *driver) checkEncryption(nid string, rIP net.IP, vxlanID uint32, isLocal
122 122
 	return nil
123 123
 }
124 124
 
125
-func setupEncryption(localIP, remoteIP net.IP, vni uint32, em *encrMap, keys []*key) error {
125
+func setupEncryption(localIP, advIP, remoteIP net.IP, vni uint32, em *encrMap, keys []*key) error {
126 126
 	log.Debugf("Programming encryption for vxlan %d between %s and %s", vni, localIP, remoteIP)
127 127
 	rIPs := remoteIP.String()
128 128
 
... ...
@@ -134,7 +139,7 @@ func setupEncryption(localIP, remoteIP net.IP, vni uint32, em *encrMap, keys []*
134 134
 	}
135 135
 
136 136
 	for i, k := range keys {
137
-		spis := &spi{buildSPI(localIP, remoteIP, k.tag), buildSPI(remoteIP, localIP, k.tag)}
137
+		spis := &spi{buildSPI(advIP, remoteIP, k.tag), buildSPI(remoteIP, advIP, k.tag)}
138 138
 		dir := reverse
139 139
 		if i == 0 {
140 140
 			dir = bidir
... ...
@@ -216,7 +221,6 @@ func programMangle(vni uint32, add bool) (err error) {
216 216
 
217 217
 func programSA(localIP, remoteIP net.IP, spi *spi, k *key, dir int, add bool) (fSA *netlink.XfrmState, rSA *netlink.XfrmState, err error) {
218 218
 	var (
219
-		crypt       *netlink.XfrmStateAlgo
220 219
 		action      = "Removing"
221 220
 		xfrmProgram = ns.NlHandle().XfrmStateDel
222 221
 	)
... ...
@@ -224,7 +228,6 @@ func programSA(localIP, remoteIP net.IP, spi *spi, k *key, dir int, add bool) (f
224 224
 	if add {
225 225
 		action = "Adding"
226 226
 		xfrmProgram = ns.NlHandle().XfrmStateAdd
227
-		crypt = &netlink.XfrmStateAlgo{Name: "cbc(aes)", Key: k.value}
228 227
 	}
229 228
 
230 229
 	if dir&reverse > 0 {
... ...
@@ -236,7 +239,7 @@ func programSA(localIP, remoteIP net.IP, spi *spi, k *key, dir int, add bool) (f
236 236
 			Mode:  netlink.XFRM_MODE_TRANSPORT,
237 237
 		}
238 238
 		if add {
239
-			rSA.Crypt = crypt
239
+			rSA.Aead = buildAeadAlgo(k, spi.reverse)
240 240
 		}
241 241
 
242 242
 		exists, err := saExists(rSA)
... ...
@@ -261,7 +264,7 @@ func programSA(localIP, remoteIP net.IP, spi *spi, k *key, dir int, add bool) (f
261 261
 			Mode:  netlink.XFRM_MODE_TRANSPORT,
262 262
 		}
263 263
 		if add {
264
-			fSA.Crypt = crypt
264
+			fSA.Aead = buildAeadAlgo(k, spi.forward)
265 265
 		}
266 266
 
267 267
 		exists, err := saExists(fSA)
... ...
@@ -354,13 +357,23 @@ func spExists(sp *netlink.XfrmPolicy) (bool, error) {
354 354
 }
355 355
 
356 356
 func buildSPI(src, dst net.IP, st uint32) int {
357
-	spi := int(st)
358
-	f := src[len(src)-4:]
359
-	t := dst[len(dst)-4:]
360
-	for i := 0; i < 4; i++ {
361
-		spi = spi ^ (int(f[i])^int(t[3-i]))<<uint32(8*i)
357
+	b := make([]byte, 4)
358
+	binary.BigEndian.PutUint32(b, st)
359
+	h := fnv.New32a()
360
+	h.Write(src)
361
+	h.Write(b)
362
+	h.Write(dst)
363
+	return int(binary.BigEndian.Uint32(h.Sum(nil)))
364
+}
365
+
366
+func buildAeadAlgo(k *key, s int) *netlink.XfrmStateAlgo {
367
+	salt := make([]byte, 4)
368
+	binary.BigEndian.PutUint32(salt, uint32(s))
369
+	return &netlink.XfrmStateAlgo{
370
+		Name:   "rfc4106(gcm(aes))",
371
+		Key:    append(k.value, salt...),
372
+		ICVLen: 64,
362 373
 	}
363
-	return spi
364 374
 }
365 375
 
366 376
 func (d *driver) secMapWalk(f func(string, []*spi) ([]*spi, bool)) error {
... ...
@@ -560,3 +573,14 @@ func updateNodeKey(lIP, rIP net.IP, idxs []*spi, curKeys []*key, newIdx, priIdx,
560 560
 
561 561
 	return spis
562 562
 }
563
+
564
+func (n *network) maxMTU() int {
565
+	mtu := vxlanVethMTU
566
+	if n.secure {
567
+		// In case of encryption account for the
568
+		// esp packet expansion and padding
569
+		mtu -= pktExpansion
570
+		mtu -= (mtu % 4)
571
+	}
572
+	return mtu
573
+}
... ...
@@ -75,11 +75,13 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
75 75
 	// Set the container interface and its peer MTU to 1450 to allow
76 76
 	// for 50 bytes vxlan encap (inner eth header(14) + outer IP(20) +
77 77
 	// outer UDP(8) + vxlan header(8))
78
+	mtu := n.maxMTU()
79
+
78 80
 	veth, err := nlh.LinkByName(overlayIfName)
79 81
 	if err != nil {
80 82
 		return fmt.Errorf("cound not find link by name %s: %v", overlayIfName, err)
81 83
 	}
82
-	err = nlh.LinkSetMTU(veth, vxlanVethMTU)
84
+	err = nlh.LinkSetMTU(veth, mtu)
83 85
 	if err != nil {
84 86
 		return err
85 87
 	}
... ...
@@ -93,7 +95,7 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
93 93
 	if err != nil {
94 94
 		return fmt.Errorf("could not find link by name %s: %v", containerIfName, err)
95 95
 	}
96
-	err = nlh.LinkSetMTU(veth, vxlanVethMTU)
96
+	err = nlh.LinkSetMTU(veth, mtu)
97 97
 	if err != nil {
98 98
 		return err
99 99
 	}
... ...
@@ -119,7 +121,7 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
119 119
 	}
120 120
 
121 121
 	d.peerDbAdd(nid, eid, ep.addr.IP, ep.addr.Mask, ep.mac,
122
-		net.ParseIP(d.bindAddress), true)
122
+		net.ParseIP(d.advertiseAddress), true)
123 123
 
124 124
 	if err := d.checkEncryption(nid, nil, n.vxlanID(s), true, true); err != nil {
125 125
 		log.Warn(err)
... ...
@@ -128,7 +130,7 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
128 128
 	buf, err := proto.Marshal(&PeerRecord{
129 129
 		EndpointIP:       ep.addr.String(),
130 130
 		EndpointMAC:      ep.mac.String(),
131
-		TunnelEndpointIP: d.bindAddress,
131
+		TunnelEndpointIP: d.advertiseAddress,
132 132
 	})
133 133
 	if err != nil {
134 134
 		return err
... ...
@@ -159,7 +161,7 @@ func (d *driver) EventNotify(etype driverapi.EventType, nid, tableName, key stri
159 159
 
160 160
 	// Ignore local peers. We already know about them and they
161 161
 	// should not be added to vxlan fdb.
162
-	if peer.TunnelEndpointIP == d.bindAddress {
162
+	if peer.TunnelEndpointIP == d.advertiseAddress {
163 163
 		return
164 164
 	}
165 165
 
... ...
@@ -40,7 +40,7 @@ func (d *driver) serfInit() error {
40 40
 
41 41
 	config := serf.DefaultConfig()
42 42
 	config.Init()
43
-	config.MemberlistConfig.BindAddr = d.bindAddress
43
+	config.MemberlistConfig.BindAddr = d.advertiseAddress
44 44
 
45 45
 	d.eventCh = make(chan serf.Event, 4)
46 46
 	config.EventCh = d.eventCh
... ...
@@ -31,22 +31,23 @@ const (
31 31
 var initVxlanIdm = make(chan (bool), 1)
32 32
 
33 33
 type driver struct {
34
-	eventCh      chan serf.Event
35
-	notifyCh     chan ovNotify
36
-	exitCh       chan chan struct{}
37
-	bindAddress  string
38
-	neighIP      string
39
-	config       map[string]interface{}
40
-	peerDb       peerNetworkMap
41
-	secMap       *encrMap
42
-	serfInstance *serf.Serf
43
-	networks     networkTable
44
-	store        datastore.DataStore
45
-	localStore   datastore.DataStore
46
-	vxlanIdm     *idm.Idm
47
-	once         sync.Once
48
-	joinOnce     sync.Once
49
-	keys         []*key
34
+	eventCh          chan serf.Event
35
+	notifyCh         chan ovNotify
36
+	exitCh           chan chan struct{}
37
+	bindAddress      string
38
+	advertiseAddress string
39
+	neighIP          string
40
+	config           map[string]interface{}
41
+	peerDb           peerNetworkMap
42
+	secMap           *encrMap
43
+	serfInstance     *serf.Serf
44
+	networks         networkTable
45
+	store            datastore.DataStore
46
+	localStore       datastore.DataStore
47
+	vxlanIdm         *idm.Idm
48
+	once             sync.Once
49
+	joinOnce         sync.Once
50
+	keys             []*key
50 51
 	sync.Mutex
51 52
 }
52 53
 
... ...
@@ -111,7 +112,11 @@ func (d *driver) restoreEndpoints() error {
111 111
 		ep := kvo.(*endpoint)
112 112
 		n := d.network(ep.nid)
113 113
 		if n == nil {
114
-			logrus.Debugf("Network (%s) not found for restored endpoint (%s)", ep.nid, ep.id)
114
+			logrus.Debugf("Network (%s) not found for restored endpoint (%s)", ep.nid[0:7], ep.id[0:7])
115
+			logrus.Debugf("Deleting stale overlay endpoint (%s) from store", ep.id[0:7])
116
+			if err := d.deleteEndpointFromStore(ep); err != nil {
117
+				logrus.Debugf("Failed to delete stale overlay endpoint (%s) from store", ep.id[0:7])
118
+			}
115 119
 			continue
116 120
 		}
117 121
 		n.addEndpoint(ep)
... ...
@@ -140,7 +145,7 @@ func (d *driver) restoreEndpoints() error {
140 140
 		}
141 141
 
142 142
 		n.incEndpointCount()
143
-		d.peerDbAdd(ep.nid, ep.id, ep.addr.IP, ep.addr.Mask, ep.mac, net.ParseIP(d.bindAddress), true)
143
+		d.peerDbAdd(ep.nid, ep.id, ep.addr.IP, ep.addr.Mask, ep.mac, net.ParseIP(d.advertiseAddress), true)
144 144
 	}
145 145
 	return nil
146 146
 }
... ...
@@ -211,20 +216,25 @@ func validateSelf(node string) error {
211 211
 	return fmt.Errorf("Multi-Host overlay networking requires cluster-advertise(%s) to be configured with a local ip-address that is reachable within the cluster", advIP.String())
212 212
 }
213 213
 
214
-func (d *driver) nodeJoin(node string, self bool) {
214
+func (d *driver) nodeJoin(advertiseAddress, bindAddress string, self bool) {
215 215
 	if self && !d.isSerfAlive() {
216
-		if err := validateSelf(node); err != nil {
217
-			logrus.Errorf("%s", err.Error())
218
-		}
219 216
 		d.Lock()
220
-		d.bindAddress = node
217
+		d.advertiseAddress = advertiseAddress
218
+		d.bindAddress = bindAddress
221 219
 		d.Unlock()
222 220
 
223 221
 		// If there is no cluster store there is no need to start serf.
224 222
 		if d.store != nil {
223
+			if err := validateSelf(advertiseAddress); err != nil {
224
+				logrus.Warnf("%s", err.Error())
225
+			}
225 226
 			err := d.serfInit()
226 227
 			if err != nil {
227 228
 				logrus.Errorf("initializing serf instance failed: %v", err)
229
+				d.Lock()
230
+				d.advertiseAddress = ""
231
+				d.bindAddress = ""
232
+				d.Unlock()
228 233
 				return
229 234
 			}
230 235
 		}
... ...
@@ -232,7 +242,7 @@ func (d *driver) nodeJoin(node string, self bool) {
232 232
 
233 233
 	d.Lock()
234 234
 	if !self {
235
-		d.neighIP = node
235
+		d.neighIP = advertiseAddress
236 236
 	}
237 237
 	neighIP := d.neighIP
238 238
 	d.Unlock()
... ...
@@ -246,7 +256,7 @@ func (d *driver) nodeJoin(node string, self bool) {
246 246
 			}
247 247
 		})
248 248
 		if err != nil {
249
-			logrus.Errorf("joining serf neighbor %s failed: %v", node, err)
249
+			logrus.Errorf("joining serf neighbor %s failed: %v", advertiseAddress, err)
250 250
 			d.Lock()
251 251
 			d.joinOnce = sync.Once{}
252 252
 			d.Unlock()
... ...
@@ -286,7 +296,7 @@ func (d *driver) DiscoverNew(dType discoverapi.DiscoveryType, data interface{})
286 286
 		if !ok || nodeData.Address == "" {
287 287
 			return fmt.Errorf("invalid discovery data")
288 288
 		}
289
-		d.nodeJoin(nodeData.Address, nodeData.Self)
289
+		d.nodeJoin(nodeData.Address, nodeData.BindAddress, nodeData.Self)
290 290
 	case discoverapi.DatastoreConfig:
291 291
 		if d.store != nil {
292 292
 			return types.ForbiddenErrorf("cannot accept datastore configuration: Overlay driver has a datastore configured already")
... ...
@@ -113,6 +113,9 @@ func (ec *endpointCnt) updateStore() error {
113 113
 	if store == nil {
114 114
 		return fmt.Errorf("store not found for scope %s on endpoint count update", ec.DataScope())
115 115
 	}
116
+	// make a copy of count and n to avoid being overwritten by store.GetObject
117
+	count := ec.EndpointCnt()
118
+	n := ec.n
116 119
 	for {
117 120
 		if err := ec.n.getController().updateToStore(ec); err == nil || err != datastore.ErrKeyModified {
118 121
 			return err
... ...
@@ -120,6 +123,10 @@ func (ec *endpointCnt) updateStore() error {
120 120
 		if err := store.GetObject(datastore.Key(ec.Key()...), ec); err != nil {
121 121
 			return fmt.Errorf("could not update the kvobject to latest on endpoint count update: %v", err)
122 122
 		}
123
+		ec.Lock()
124
+		ec.Count = count
125
+		ec.n = n
126
+		ec.Unlock()
123 127
 	}
124 128
 }
125 129
 
... ...
@@ -136,7 +143,9 @@ retry:
136 136
 	if inc {
137 137
 		ec.Count++
138 138
 	} else {
139
-		ec.Count--
139
+		if ec.Count > 0 {
140
+			ec.Count--
141
+		}
140 142
 	}
141 143
 	ec.Unlock()
142 144
 
... ...
@@ -1105,9 +1105,13 @@ func (n *network) getSvcRecords(ep *endpoint) []etchosts.Record {
1105 1105
 	}
1106 1106
 
1107 1107
 	var recs []etchosts.Record
1108
-	sr, _ := n.ctrlr.svcRecords[n.id]
1108
+
1109 1109
 	epName := ep.Name()
1110 1110
 
1111
+	n.ctrlr.Lock()
1112
+	sr, _ := n.ctrlr.svcRecords[n.id]
1113
+	n.ctrlr.Unlock()
1114
+
1111 1115
 	for h, ip := range sr.svcMap {
1112 1116
 		if strings.Split(h, ".")[0] == epName {
1113 1117
 			continue
... ...
@@ -81,7 +81,7 @@ func (nDB *NetworkDB) RemoveKey(key []byte) {
81 81
 func (nDB *NetworkDB) clusterInit() error {
82 82
 	config := memberlist.DefaultLANConfig()
83 83
 	config.Name = nDB.config.NodeName
84
-	config.BindAddr = nDB.config.BindAddr
84
+	config.AdvertiseAddr = nDB.config.AdvertiseAddr
85 85
 
86 86
 	if nDB.config.BindPort != 0 {
87 87
 		config.BindPort = nDB.config.BindPort
... ...
@@ -107,9 +107,9 @@ type Config struct {
107 107
 	// NodeName is the cluster wide unique name for this node.
108 108
 	NodeName string
109 109
 
110
-	// BindAddr is the local node's IP address that we bind to for
110
+	// AdvertiseAddr is the node's IP address that we advertise for
111 111
 	// cluster communication.
112
-	BindAddr string
112
+	AdvertiseAddr string
113 113
 
114 114
 	// BindPort is the local node's port to which we bind to for
115 115
 	// cluster communication.
... ...
@@ -303,6 +303,7 @@ func (n *networkNamespace) AddInterface(srcName, dstPrefix string, options ...If
303 303
 	for err = nlh.LinkSetUp(iface); err != nil && cnt < 3; cnt++ {
304 304
 		log.Debugf("retrying link setup because of: %v", err)
305 305
 		time.Sleep(10 * time.Millisecond)
306
+		err = nlh.LinkSetUp(iface)
306 307
 	}
307 308
 	if err != nil {
308 309
 		return fmt.Errorf("failed to set link up: %v", err)
... ...
@@ -6,6 +6,7 @@ import (
6 6
 	"net"
7 7
 	"os"
8 8
 	"os/exec"
9
+	"path/filepath"
9 10
 	"runtime"
10 11
 	"strconv"
11 12
 	"strings"
... ...
@@ -21,7 +22,7 @@ import (
21 21
 	"github.com/vishvananda/netns"
22 22
 )
23 23
 
24
-const prefix = "/var/run/docker/netns"
24
+const defaultPrefix = "/var/run/docker"
25 25
 
26 26
 var (
27 27
 	once             sync.Once
... ...
@@ -30,6 +31,7 @@ var (
30 30
 	gpmWg            sync.WaitGroup
31 31
 	gpmCleanupPeriod = 60 * time.Second
32 32
 	gpmChan          = make(chan chan struct{})
33
+	prefix           = defaultPrefix
33 34
 )
34 35
 
35 36
 // The networkNamespace type is the linux implementation of the Sandbox
... ...
@@ -48,12 +50,21 @@ type networkNamespace struct {
48 48
 	sync.Mutex
49 49
 }
50 50
 
51
+// SetBasePath sets the base directory prefix for the ns path
52
+func SetBasePath(path string) {
53
+	prefix = path
54
+}
55
+
51 56
 func init() {
52 57
 	reexec.Register("netns-create", reexecCreateNamespace)
53 58
 }
54 59
 
60
+func basePath() string {
61
+	return filepath.Join(prefix, "netns")
62
+}
63
+
55 64
 func createBasePath() {
56
-	err := os.MkdirAll(prefix, 0755)
65
+	err := os.MkdirAll(basePath(), 0755)
57 66
 	if err != nil {
58 67
 		panic("Could not create net namespace path directory")
59 68
 	}
... ...
@@ -142,7 +153,7 @@ func GenerateKey(containerID string) string {
142 142
 			indexStr string
143 143
 			tmpkey   string
144 144
 		)
145
-		dir, err := ioutil.ReadDir(prefix)
145
+		dir, err := ioutil.ReadDir(basePath())
146 146
 		if err != nil {
147 147
 			return ""
148 148
 		}
... ...
@@ -172,7 +183,7 @@ func GenerateKey(containerID string) string {
172 172
 		maxLen = len(containerID)
173 173
 	}
174 174
 
175
-	return prefix + "/" + containerID[:maxLen]
175
+	return basePath() + "/" + containerID[:maxLen]
176 176
 }
177 177
 
178 178
 // NewSandbox provides a new sandbox instance created in an os specific way
... ...
@@ -10,3 +10,7 @@ func GC() {
10 10
 func GetSandboxForExternalKey(path string, key string) (Sandbox, error) {
11 11
 	return nil, nil
12 12
 }
13
+
14
+// SetBasePath sets the base directory prefix for the ns path
15
+func SetBasePath(path string) {
16
+}
... ...
@@ -37,3 +37,7 @@ func InitOSContext() func() {
37 37
 func SetupTestOSContext(t *testing.T) func() {
38 38
 	return func() {}
39 39
 }
40
+
41
+// SetBasePath sets the base directory prefix for the ns path
42
+func SetBasePath(path string) {
43
+}
... ...
@@ -38,3 +38,7 @@ func InitOSContext() func() {
38 38
 func SetupTestOSContext(t *testing.T) func() {
39 39
 	return func() {}
40 40
 }
41
+
42
+// SetBasePath sets the base directory prefix for the ns path
43
+func SetBasePath(path string) {
44
+}
... ...
@@ -413,7 +413,12 @@ func (sb *sandbox) ResolveIP(ip string) string {
413 413
 	for _, ep := range sb.getConnectedEndpoints() {
414 414
 		n := ep.getNetwork()
415 415
 
416
-		sr, ok := n.getController().svcRecords[n.ID()]
416
+		c := n.getController()
417
+
418
+		c.Lock()
419
+		sr, ok := c.svcRecords[n.ID()]
420
+		c.Unlock()
421
+
417 422
 		if !ok {
418 423
 			continue
419 424
 		}
... ...
@@ -454,7 +459,12 @@ func (sb *sandbox) ResolveService(name string) ([]*net.SRV, []net.IP, error) {
454 454
 	for _, ep := range sb.getConnectedEndpoints() {
455 455
 		n := ep.getNetwork()
456 456
 
457
-		sr, ok := n.getController().svcRecords[n.ID()]
457
+		c := n.getController()
458
+
459
+		c.Lock()
460
+		sr, ok := c.svcRecords[n.ID()]
461
+		c.Unlock()
462
+
458 463
 		if !ok {
459 464
 			continue
460 465
 		}
... ...
@@ -575,7 +585,11 @@ func (sb *sandbox) resolveName(req string, networkName string, epList []*endpoin
575 575
 			ep.Unlock()
576 576
 		}
577 577
 
578
-		sr, ok := n.getController().svcRecords[n.ID()]
578
+		c := n.getController()
579
+		c.Lock()
580
+		sr, ok := c.svcRecords[n.ID()]
581
+		c.Unlock()
582
+
579 583
 		if !ok {
580 584
 			continue
581 585
 		}
... ...
@@ -15,7 +15,7 @@ import (
15 15
 	"github.com/opencontainers/runc/libcontainer/configs"
16 16
 )
17 17
 
18
-const udsBase = "/var/lib/docker/network/files/"
18
+const udsBase = "/run/docker/libnetwork/"
19 19
 const success = "success"
20 20
 
21 21
 // processSetKeyReexec is a private function that must be called only on an reexec path
... ...
@@ -8,6 +8,7 @@ import (
8 8
 	"syscall"
9 9
 
10 10
 	"github.com/vishvananda/netlink/nl"
11
+	"github.com/vishvananda/netns"
11 12
 )
12 13
 
13 14
 // IFA_FLAGS is a u32 attribute.
... ...
@@ -192,7 +193,17 @@ type AddrUpdate struct {
192 192
 // AddrSubscribe takes a chan down which notifications will be sent
193 193
 // when addresses change.  Close the 'done' chan to stop subscription.
194 194
 func AddrSubscribe(ch chan<- AddrUpdate, done <-chan struct{}) error {
195
-	s, err := nl.Subscribe(syscall.NETLINK_ROUTE, syscall.RTNLGRP_IPV4_IFADDR, syscall.RTNLGRP_IPV6_IFADDR)
195
+	return addrSubscribe(netns.None(), netns.None(), ch, done)
196
+}
197
+
198
+// AddrSubscribeAt works like AddrSubscribe plus it allows the caller
199
+// to choose the network namespace in which to subscribe (ns).
200
+func AddrSubscribeAt(ns netns.NsHandle, ch chan<- AddrUpdate, done <-chan struct{}) error {
201
+	return addrSubscribe(ns, netns.None(), ch, done)
202
+}
203
+
204
+func addrSubscribe(newNs, curNs netns.NsHandle, ch chan<- AddrUpdate, done <-chan struct{}) error {
205
+	s, err := nl.SubscribeAt(newNs, curNs, syscall.NETLINK_ROUTE, syscall.RTNLGRP_IPV4_IFADDR, syscall.RTNLGRP_IPV6_IFADDR)
196 206
 	if err != nil {
197 207
 		return err
198 208
 	}
... ...
@@ -143,7 +143,7 @@ func (h *Handle) FilterAdd(filter Filter) error {
143 143
 		if u32.RedirIndex != 0 {
144 144
 			u32.Actions = append([]Action{NewMirredAction(u32.RedirIndex)}, u32.Actions...)
145 145
 		}
146
-		if err := encodeActions(actionsAttr, u32.Actions); err != nil {
146
+		if err := EncodeActions(actionsAttr, u32.Actions); err != nil {
147 147
 			return err
148 148
 		}
149 149
 	} else if fw, ok := filter.(*Fw); ok {
... ...
@@ -309,7 +309,7 @@ func toAttrs(tcgen *nl.TcGen, attrs *ActionAttrs) {
309 309
 	attrs.Bindcnt = int(tcgen.Bindcnt)
310 310
 }
311 311
 
312
-func encodeActions(attr *nl.RtAttr, actions []Action) error {
312
+func EncodeActions(attr *nl.RtAttr, actions []Action) error {
313 313
 	tabIndex := int(nl.TCA_ACT_TAB)
314 314
 
315 315
 	for _, action := range actions {
... ...
@@ -10,6 +10,7 @@ import (
10 10
 	"unsafe"
11 11
 
12 12
 	"github.com/vishvananda/netlink/nl"
13
+	"github.com/vishvananda/netns"
13 14
 )
14 15
 
15 16
 const SizeofLinkStats = 0x5c
... ...
@@ -425,7 +426,7 @@ func addVxlanAttrs(vxlan *Vxlan, linkInfo *nl.RtAttr) {
425 425
 		nl.NewRtAttrChild(data, nl.IFLA_VXLAN_UDP_CSUM, boolAttr(vxlan.UDPCSum))
426 426
 	}
427 427
 	if vxlan.GBP {
428
-		nl.NewRtAttrChild(data, nl.IFLA_VXLAN_GBP, boolAttr(vxlan.GBP))
428
+		nl.NewRtAttrChild(data, nl.IFLA_VXLAN_GBP, []byte{})
429 429
 	}
430 430
 	if vxlan.NoAge {
431 431
 		nl.NewRtAttrChild(data, nl.IFLA_VXLAN_AGEING, nl.Uint32Attr(0))
... ...
@@ -1011,7 +1012,17 @@ type LinkUpdate struct {
1011 1011
 // LinkSubscribe takes a chan down which notifications will be sent
1012 1012
 // when links change.  Close the 'done' chan to stop subscription.
1013 1013
 func LinkSubscribe(ch chan<- LinkUpdate, done <-chan struct{}) error {
1014
-	s, err := nl.Subscribe(syscall.NETLINK_ROUTE, syscall.RTNLGRP_LINK)
1014
+	return linkSubscribe(netns.None(), netns.None(), ch, done)
1015
+}
1016
+
1017
+// LinkSubscribeAt works like LinkSubscribe plus it allows the caller
1018
+// to choose the network namespace in which to subscribe (ns).
1019
+func LinkSubscribeAt(ns netns.NsHandle, ch chan<- LinkUpdate, done <-chan struct{}) error {
1020
+	return linkSubscribe(ns, netns.None(), ch, done)
1021
+}
1022
+
1023
+func linkSubscribe(newNs, curNs netns.NsHandle, ch chan<- LinkUpdate, done <-chan struct{}) error {
1024
+	s, err := nl.SubscribeAt(newNs, curNs, syscall.NETLINK_ROUTE, syscall.RTNLGRP_LINK)
1015 1025
 	if err != nil {
1016 1026
 		return err
1017 1027
 	}
... ...
@@ -1152,7 +1163,7 @@ func parseVxlanData(link Link, data []syscall.NetlinkRouteAttr) {
1152 1152
 		case nl.IFLA_VXLAN_UDP_CSUM:
1153 1153
 			vxlan.UDPCSum = int8(datum.Value[0]) != 0
1154 1154
 		case nl.IFLA_VXLAN_GBP:
1155
-			vxlan.GBP = int8(datum.Value[0]) != 0
1155
+			vxlan.GBP = true
1156 1156
 		case nl.IFLA_VXLAN_AGEING:
1157 1157
 			vxlan.Age = int(native.Uint32(datum.Value[0:4]))
1158 1158
 			vxlan.NoAge = vxlan.Age == 0
... ...
@@ -331,24 +331,63 @@ func getNetlinkSocket(protocol int) (*NetlinkSocket, error) {
331 331
 // moves back into it when done. If newNs is closed, the socket will be opened
332 332
 // in the current network namespace.
333 333
 func GetNetlinkSocketAt(newNs, curNs netns.NsHandle, protocol int) (*NetlinkSocket, error) {
334
-	var err error
334
+	c, err := executeInNetns(newNs, curNs)
335
+	if err != nil {
336
+		return nil, err
337
+	}
338
+	defer c()
339
+	return getNetlinkSocket(protocol)
340
+}
335 341
 
342
+// executeInNetns sets execution of the code following this call to the
343
+// network namespace newNs, then moves the thread back to curNs if open,
344
+// otherwise to the current netns at the time the function was invoked
345
+// In case of success, the caller is expected to execute the returned function
346
+// at the end of the code that needs to be executed in the network namespace.
347
+// Example:
348
+// func jobAt(...) error {
349
+//      d, err := executeInNetns(...)
350
+//      if err != nil { return err}
351
+//      defer d()
352
+//      < code which needs to be executed in specific netns>
353
+//  }
354
+// TODO: his function probably belongs to netns pkg.
355
+func executeInNetns(newNs, curNs netns.NsHandle) (func(), error) {
356
+	var (
357
+		err       error
358
+		moveBack  func(netns.NsHandle) error
359
+		closeNs   func() error
360
+		unlockThd func()
361
+	)
362
+	restore := func() {
363
+		// order matters
364
+		if moveBack != nil {
365
+			moveBack(curNs)
366
+		}
367
+		if closeNs != nil {
368
+			closeNs()
369
+		}
370
+		if unlockThd != nil {
371
+			unlockThd()
372
+		}
373
+	}
336 374
 	if newNs.IsOpen() {
337 375
 		runtime.LockOSThread()
338
-		defer runtime.UnlockOSThread()
376
+		unlockThd = runtime.UnlockOSThread
339 377
 		if !curNs.IsOpen() {
340 378
 			if curNs, err = netns.Get(); err != nil {
379
+				restore()
341 380
 				return nil, fmt.Errorf("could not get current namespace while creating netlink socket: %v", err)
342 381
 			}
343
-			defer curNs.Close()
382
+			closeNs = curNs.Close
344 383
 		}
345 384
 		if err := netns.Set(newNs); err != nil {
385
+			restore()
346 386
 			return nil, fmt.Errorf("failed to set into network namespace %d while creating netlink socket: %v", newNs, err)
347 387
 		}
348
-		defer netns.Set(curNs)
388
+		moveBack = netns.Set
349 389
 	}
350
-
351
-	return getNetlinkSocket(protocol)
390
+	return restore, nil
352 391
 }
353 392
 
354 393
 // Create a netlink socket with a given protocol (e.g. NETLINK_ROUTE)
... ...
@@ -377,6 +416,18 @@ func Subscribe(protocol int, groups ...uint) (*NetlinkSocket, error) {
377 377
 	return s, nil
378 378
 }
379 379
 
380
+// SubscribeAt works like Subscribe plus let's the caller choose the network
381
+// namespace in which the socket would be opened (newNs). Then control goes back
382
+// to curNs if open, otherwise to the netns at the time this function was called.
383
+func SubscribeAt(newNs, curNs netns.NsHandle, protocol int, groups ...uint) (*NetlinkSocket, error) {
384
+	c, err := executeInNetns(newNs, curNs)
385
+	if err != nil {
386
+		return nil, err
387
+	}
388
+	defer c()
389
+	return Subscribe(protocol, groups...)
390
+}
391
+
380 392
 func (s *NetlinkSocket) Close() {
381 393
 	syscall.Close(s.fd)
382 394
 	s.fd = -1
... ...
@@ -10,6 +10,7 @@ const (
10 10
 	SizeofXfrmUsersaInfo  = 0xe0
11 11
 	SizeofXfrmAlgo        = 0x44
12 12
 	SizeofXfrmAlgoAuth    = 0x48
13
+	SizeofXfrmAlgoAEAD    = 0x48
13 14
 	SizeofXfrmEncapTmpl   = 0x18
14 15
 	SizeofXfrmUsersaFlush = 0x8
15 16
 )
... ...
@@ -194,6 +195,35 @@ func (msg *XfrmAlgoAuth) Serialize() []byte {
194 194
 //   char    alg_key[0];
195 195
 // }
196 196
 
197
+type XfrmAlgoAEAD struct {
198
+	AlgName   [64]byte
199
+	AlgKeyLen uint32
200
+	AlgICVLen uint32
201
+	AlgKey    []byte
202
+}
203
+
204
+func (msg *XfrmAlgoAEAD) Len() int {
205
+	return SizeofXfrmAlgoAEAD + int(msg.AlgKeyLen/8)
206
+}
207
+
208
+func DeserializeXfrmAlgoAEAD(b []byte) *XfrmAlgoAEAD {
209
+	ret := XfrmAlgoAEAD{}
210
+	copy(ret.AlgName[:], b[0:64])
211
+	ret.AlgKeyLen = *(*uint32)(unsafe.Pointer(&b[64]))
212
+	ret.AlgICVLen = *(*uint32)(unsafe.Pointer(&b[68]))
213
+	ret.AlgKey = b[72:ret.Len()]
214
+	return &ret
215
+}
216
+
217
+func (msg *XfrmAlgoAEAD) Serialize() []byte {
218
+	b := make([]byte, msg.Len())
219
+	copy(b[0:64], msg.AlgName[:])
220
+	copy(b[64:68], (*(*[4]byte)(unsafe.Pointer(&msg.AlgKeyLen)))[:])
221
+	copy(b[68:72], (*(*[4]byte)(unsafe.Pointer(&msg.AlgICVLen)))[:])
222
+	copy(b[72:msg.Len()], msg.AlgKey[:])
223
+	return b
224
+}
225
+
197 226
 // struct xfrm_encap_tmpl {
198 227
 //   __u16   encap_type;
199 228
 //   __be16    encap_sport;
... ...
@@ -6,6 +6,7 @@ import (
6 6
 	"syscall"
7 7
 
8 8
 	"github.com/vishvananda/netlink/nl"
9
+	"github.com/vishvananda/netns"
9 10
 )
10 11
 
11 12
 // RtAttr is shared so it is in netlink_linux.go
... ...
@@ -421,7 +422,17 @@ func (h *Handle) RouteGet(destination net.IP) ([]Route, error) {
421 421
 // RouteSubscribe takes a chan down which notifications will be sent
422 422
 // when routes are added or deleted. Close the 'done' chan to stop subscription.
423 423
 func RouteSubscribe(ch chan<- RouteUpdate, done <-chan struct{}) error {
424
-	s, err := nl.Subscribe(syscall.NETLINK_ROUTE, syscall.RTNLGRP_IPV4_ROUTE, syscall.RTNLGRP_IPV6_ROUTE)
424
+	return routeSubscribeAt(netns.None(), netns.None(), ch, done)
425
+}
426
+
427
+// RouteSubscribeAt works like RouteSubscribe plus it allows the caller
428
+// to choose the network namespace in which to subscribe (ns).
429
+func RouteSubscribeAt(ns netns.NsHandle, ch chan<- RouteUpdate, done <-chan struct{}) error {
430
+	return routeSubscribeAt(ns, netns.None(), ch, done)
431
+}
432
+
433
+func routeSubscribeAt(newNs, curNs netns.NsHandle, ch chan<- RouteUpdate, done <-chan struct{}) error {
434
+	s, err := nl.SubscribeAt(newNs, curNs, syscall.NETLINK_ROUTE, syscall.RTNLGRP_IPV4_ROUTE, syscall.RTNLGRP_IPV6_ROUTE)
425 435
 	if err != nil {
426 436
 		return err
427 437
 	}
... ...
@@ -10,10 +10,18 @@ type XfrmStateAlgo struct {
10 10
 	Name        string
11 11
 	Key         []byte
12 12
 	TruncateLen int // Auth only
13
+	ICVLen      int // AEAD only
13 14
 }
14 15
 
15 16
 func (a XfrmStateAlgo) String() string {
16
-	return fmt.Sprintf("{Name: %s, Key: 0x%x, TruncateLen: %d}", a.Name, a.Key, a.TruncateLen)
17
+	base := fmt.Sprintf("{Name: %s, Key: 0x%x", a.Name, a.Key)
18
+	if a.TruncateLen != 0 {
19
+		base = fmt.Sprintf("%s, Truncate length: %d", base, a.TruncateLen)
20
+	}
21
+	if a.ICVLen != 0 {
22
+		base = fmt.Sprintf("%s, ICV length: %d", base, a.ICVLen)
23
+	}
24
+	return fmt.Sprintf("%s}", base)
17 25
 }
18 26
 
19 27
 // EncapType is an enum representing the optional packet encapsulation.
... ...
@@ -73,12 +81,13 @@ type XfrmState struct {
73 73
 	Mark         *XfrmMark
74 74
 	Auth         *XfrmStateAlgo
75 75
 	Crypt        *XfrmStateAlgo
76
+	Aead         *XfrmStateAlgo
76 77
 	Encap        *XfrmStateEncap
77 78
 }
78 79
 
79 80
 func (sa XfrmState) String() string {
80
-	return fmt.Sprintf("Dst: %v, Src: %v, Proto: %s, Mode: %s, SPI: 0x%x, ReqID: 0x%x, ReplayWindow: %d, Mark: %v, Auth: %v, Crypt: %v, Encap: %v",
81
-		sa.Dst, sa.Src, sa.Proto, sa.Mode, sa.Spi, sa.Reqid, sa.ReplayWindow, sa.Mark, sa.Auth, sa.Crypt, sa.Encap)
81
+	return fmt.Sprintf("Dst: %v, Src: %v, Proto: %s, Mode: %s, SPI: 0x%x, ReqID: 0x%x, ReplayWindow: %d, Mark: %v, Auth: %v, Crypt: %v, Aead: %v,Encap: %v",
82
+		sa.Dst, sa.Src, sa.Proto, sa.Mode, sa.Spi, sa.Reqid, sa.ReplayWindow, sa.Mark, sa.Auth, sa.Crypt, sa.Aead, sa.Encap)
82 83
 }
83 84
 func (sa XfrmState) Print(stats bool) string {
84 85
 	if !stats {
... ...
@@ -35,6 +35,20 @@ func writeStateAlgoAuth(a *XfrmStateAlgo) []byte {
35 35
 	return algo.Serialize()
36 36
 }
37 37
 
38
+func writeStateAlgoAead(a *XfrmStateAlgo) []byte {
39
+	algo := nl.XfrmAlgoAEAD{
40
+		AlgKeyLen: uint32(len(a.Key) * 8),
41
+		AlgICVLen: uint32(a.ICVLen),
42
+		AlgKey:    a.Key,
43
+	}
44
+	end := len(a.Name)
45
+	if end > 64 {
46
+		end = 64
47
+	}
48
+	copy(algo.AlgName[:end], a.Name)
49
+	return algo.Serialize()
50
+}
51
+
38 52
 func writeMark(m *XfrmMark) []byte {
39 53
 	mark := &nl.XfrmMark{
40 54
 		Value: m.Value,
... ...
@@ -97,6 +111,10 @@ func (h *Handle) xfrmStateAddOrUpdate(state *XfrmState, nlProto int) error {
97 97
 		out := nl.NewRtAttr(nl.XFRMA_ALG_CRYPT, writeStateAlgo(state.Crypt))
98 98
 		req.AddData(out)
99 99
 	}
100
+	if state.Aead != nil {
101
+		out := nl.NewRtAttr(nl.XFRMA_ALG_AEAD, writeStateAlgoAead(state.Aead))
102
+		req.AddData(out)
103
+	}
100 104
 	if state.Encap != nil {
101 105
 		encapData := make([]byte, nl.SizeofXfrmEncapTmpl)
102 106
 		encap := nl.DeserializeXfrmEncapTmpl(encapData)
... ...
@@ -271,6 +289,12 @@ func parseXfrmState(m []byte, family int) (*XfrmState, error) {
271 271
 			state.Auth.Name = nl.BytesToString(algo.AlgName[:])
272 272
 			state.Auth.Key = algo.AlgKey
273 273
 			state.Auth.TruncateLen = int(algo.AlgTruncLen)
274
+		case nl.XFRMA_ALG_AEAD:
275
+			state.Aead = new(XfrmStateAlgo)
276
+			algo := nl.DeserializeXfrmAlgoAEAD(attr.Value[:])
277
+			state.Aead.Name = nl.BytesToString(algo.AlgName[:])
278
+			state.Aead.Key = algo.AlgKey
279
+			state.Aead.ICVLen = int(algo.AlgICVLen)
274 280
 		case nl.XFRMA_ENCAP:
275 281
 			encap := nl.DeserializeXfrmEncapTmpl(attr.Value[:])
276 282
 			state.Encap = new(XfrmStateEncap)