Browse code

libn/d/overlay: add nftables support

Port the firewall ruleset for encrypted overlay networks to nftables.
Maximize compatibility across distros by using only nftables
features that are widely available. Use the deprecated 'meta secpath
exists' expression instead of the more modern 'meta ipsec exists'.
Extract the VNI from VXLAN packets using the more widely available '@th'
raw payload expressions instead of '@ih' or 'vxlan vni' expressions.

Signed-off-by: Cory Snider <csnider@mirantis.com>

Cory Snider authored on 2026/06/10 05:47:15
Showing 5 changed files
... ...
@@ -18,6 +18,7 @@ import (
18 18
 	"github.com/containerd/log"
19 19
 	"github.com/moby/moby/v2/daemon/libnetwork/discoverapi"
20 20
 	"github.com/moby/moby/v2/daemon/libnetwork/drivers/overlay/overlayutils"
21
+	"github.com/moby/moby/v2/daemon/libnetwork/internal/nftables"
21 22
 	"github.com/moby/moby/v2/daemon/libnetwork/iptables"
22 23
 	"github.com/moby/moby/v2/daemon/libnetwork/ns"
23 24
 	"github.com/moby/moby/v2/daemon/libnetwork/types"
... ...
@@ -277,6 +278,22 @@ func (d *driver) programInput(vni uint32, add bool) error {
277 277
 	return nil
278 278
 }
279 279
 
280
// programOverlayEncryptionFirewall installs (encrypted == true) or removes
// (encrypted == false) the firewall rules that enforce encryption for the
// overlay network identified by vni.
//
// When nftables.Enabled() reports true the work is delegated to
// programOverlayEncVNINft; otherwise the rules are programmed through
// programMangle and programInput.
//
// On install, a programMangle failure is returned immediately, and a
// programInput failure is returned joined with the result of undoing the
// mangle rule. On removal both steps are always attempted and their errors
// are joined into the returned error.
+func (d *driver) programOverlayEncryptionFirewall(ctx context.Context, vni uint32, encrypted bool) error {
281
+	if nftables.Enabled() {
282
+		return d.programOverlayEncVNINft(ctx, vni, encrypted)
283
+	}
284
+
285
+	mangleErr := d.programMangle(vni, encrypted)
286
	// When installing, stop at the first failure; when removing, keep going so
	// that the input rule is still cleaned up.
+	if mangleErr != nil && encrypted {
287
+		return mangleErr
288
+	}
289
+	err := d.programInput(vni, encrypted)
290
+	if err != nil && encrypted {
291
		// Roll back the mangle rule that was just installed and report both
		// the original failure and any rollback failure.
+		return errors.Join(err, d.programMangle(vni, false))
292
+	}
293
	// errors.Join returns nil when both errors are nil.
+	return errors.Join(mangleErr, err)
294
+}
295
+
280 296
 func programSA(localIP, remoteIP net.IP, spi spi, k *key, dir int, add bool) (fSA *netlink.XfrmState, rSA *netlink.XfrmState, lastErr error) {
281 297
 	var (
282 298
 		action      = "Removing"
283 299
new file mode 100644
... ...
@@ -0,0 +1,113 @@
0
+//go:build linux
1
+
2
+package overlay
3
+
4
+import (
5
+	"context"
6
+	"fmt"
7
+	"strconv"
8
+
9
+	"github.com/moby/moby/v2/daemon/libnetwork/drivers/overlay/overlayutils"
10
+	"github.com/moby/moby/v2/daemon/libnetwork/internal/nftables"
11
+)
12
+
13
// Names of the nftables objects, and the packet-matching expression, used to
// enforce encryption on overlay networks.
+const (
14
	// nftOverlayTable is the name of the table holding the set and chains below.
+	nftOverlayTable    = "docker-overlay"
15
	// nftEncOutChainName is the base chain attached to the output hook.
+	nftEncOutChainName = "enc-out"
16
	// nftEncInChainName is the base chain attached to the input hook.
+	nftEncInChainName  = "enc-in"
17
	// nftEncVNSetName is the set of VNIs for which encryption is enforced.
+	nftEncVNSetName    = "encrypted-vnis"
18
	// nftEncVNIExpr is a raw payload expression selecting the 24 bits that
	// start 96 bits into the transport header. With an 8-byte UDP header
	// followed by the VXLAN header (RFC 7348), that is the VNI field.
+	nftEncVNIExpr      = "@th,96,24"
19
+)
20
+
21
+// ensureOverlayEncNftTable returns the overlay encryption nft table, running one-time setup on first use.
//
// The driver's overlayEncNftInitMu is held for the whole call. The table is
// cached in d.overlayEncNftTable only after the ruleset has been applied
// successfully, so a failed setup is attempted again on the next call.
22
+func (d *driver) ensureOverlayEncNftTable(ctx context.Context) (nftables.Table, error) {
23
+	d.overlayEncNftInitMu.Lock()
24
+	defer d.overlayEncNftInitMu.Unlock()
25
	// Fast path: an earlier call already completed the setup.
+	if d.overlayEncNftTable.IsValid() {
26
+		return d.overlayEncNftTable, nil
27
+	}
28
+
29
	// Pick the table family from isIPv6Transport: IPv6 when it reports true,
	// IPv4 otherwise.
+	v6, err := d.isIPv6Transport()
30
+	if err != nil {
31
+		return nftables.Table{}, err
32
+	}
33
+	fam := nftables.IPv4
34
+	if v6 {
35
+		fam = nftables.IPv6
36
+	}
37
+	t, err := nftables.NewTable(fam, nftOverlayTable)
38
+	if err != nil {
39
+		return nftables.Table{}, err
40
+	}
41
+
42
	// Queue the whole ruleset in one Modifier: the set first, then the two
	// base chains, then the rules that reference them.
+	tm := nftables.Modifier{}
43
	// Set of VNIs with encryption enforced. Its element type is derived from
	// the VNI payload expression that the rules below match against it.
+	tm.Create(nftables.Set{
44
+		Name:        nftEncVNSetName,
45
+		ElementType: nftables.Typeof(nftEncVNIExpr),
46
+	})
47
	// Base chain on the output hook: route type, mangle priority, accept policy.
+	tm.Create(nftables.BaseChain{
48
+		Name:      nftEncOutChainName,
49
+		ChainType: nftables.BaseChainTypeRoute,
50
+		Hook:      nftables.BaseChainHookOutput,
51
+		Priority:  nftables.BaseChainPriorityMangle,
52
+		Policy:    nftables.BaseChainPolicyAccept,
53
+	})
54
	// Base chain on the input hook: filter type, raw priority, accept policy.
+	tm.Create(nftables.BaseChain{
55
+		Name:      nftEncInChainName,
56
+		ChainType: nftables.BaseChainTypeFilter,
57
+		Hook:      nftables.BaseChainHookInput,
58
+		Priority:  nftables.BaseChainPriorityRaw,
59
+		Policy:    nftables.BaseChainPolicyAccept,
60
+	})
61
+
62
+	port := strconv.FormatUint(uint64(overlayutils.VXLANUDPPort()), 10)
63
	// Outgoing packets to the VXLAN UDP port whose VNI is in the set are
	// counted and get their packet mark set to mark (declared elsewhere in
	// this package).
	// NOTE(review): presumably the mark selects these packets for IPsec
	// encryption — confirm against the other uses of mark.
+	tm.Create(nftables.Rule{
64
+		Chain: nftEncOutChainName,
65
+		Rule: []string{
66
+			"udp dport", port,
67
+			nftEncVNIExpr,
68
+			"@" + nftEncVNSetName,
69
+			"counter",
70
+			"meta mark set", fmt.Sprintf("0x%x", mark),
71
+		},
72
+	})
73
	// Incoming packets to the VXLAN UDP port whose VNI is in the set, and
	// which have no secpath, are counted and dropped.
+	tm.Create(nftables.Rule{
74
+		Chain: nftEncInChainName,
75
+		Rule: []string{
76
+			"meta secpath missing",
77
+			"udp dport", port,
78
+			nftEncVNIExpr,
79
+			"@" + nftEncVNSetName,
80
+			"counter",
81
+			"drop",
82
+		},
83
+	})
84
+
85
+	if err := t.Apply(ctx, tm); err != nil {
86
		// Release the table handle; the Close error is deliberately ignored
		// in favour of reporting the Apply failure.
+		_ = t.Close()
87
+		return nftables.Table{}, err
88
+	}
89
+
90
+	d.overlayEncNftTable = t
91
+	return t, nil
92
+}
93
+
94
// programOverlayEncVNINft adds vni to (encrypted == true) or removes it from
// (encrypted == false) the nftEncVNSetName set, obtaining the table through
// ensureOverlayEncNftTable.
//
// The element is marked Idempotent, so adding a VNI that is already present
// or removing one that is absent is not reported as an error by the
// set-element operation. Errors from table setup or from applying the change
// are returned as-is.
+func (d *driver) programOverlayEncVNINft(ctx context.Context, vni uint32, encrypted bool) error {
95
+	t, err := d.ensureOverlayEncNftTable(ctx)
96
+	if err != nil {
97
+		return err
98
+	}
99
+
100
+	tm := nftables.Modifier{}
101
+	se := nftables.SetElement{
102
+		SetName:    nftEncVNSetName,
103
		// Only the low 24 bits of vni are used, written as a six-digit hex
		// literal to match the 24-bit VNI payload expression.
+		Element:    fmt.Sprintf("0x%06x", vni&0xffffff),
104
+		Idempotent: true,
105
+	}
106
+	if encrypted {
107
+		tm.Create(se)
108
+	} else {
109
+		tm.Delete(se)
110
+	}
111
+	return t.Apply(ctx, tm)
112
+}
... ...
@@ -182,8 +182,7 @@ func (d *driver) CreateNetwork(ctx context.Context, id string, option map[string
182 182
 	// Make sure no rule is on the way from any stale secure network
183 183
 	if !n.secure {
184 184
 		for _, vni := range vnis {
185
-			d.programMangle(vni, false)
186
-			d.programInput(vni, false)
185
+			_ = d.programOverlayEncryptionFirewall(ctx, vni, false)
187 186
 		}
188 187
 	}
189 188
 
... ...
@@ -227,19 +226,12 @@ func (d *driver) DeleteNetwork(nid string) error {
227 227
 
228 228
 	if n.secure {
229 229
 		for _, s := range n.subnets {
230
-			if err := d.programMangle(s.vni, false); err != nil {
230
+			if err := d.programOverlayEncryptionFirewall(context.TODO(), s.vni, false); err != nil {
231 231
 				log.G(context.TODO()).WithFields(log.Fields{
232 232
 					"error":      err,
233 233
 					"network_id": n.id,
234 234
 					"subnet":     s.subnetIP,
235
-				}).Warn("Failed to clean up iptables rules during overlay network deletion")
236
-			}
237
-			if err := d.programInput(s.vni, false); err != nil {
238
-				log.G(context.TODO()).WithFields(log.Fields{
239
-					"error":      err,
240
-					"network_id": n.id,
241
-					"subnet":     s.subnetIP,
242
-				}).Warn("Failed to clean up iptables rules during overlay network deletion")
235
+				}).Warn("Failed to clean up overlay encryption firewall rules during overlay network deletion")
243 236
 			}
244 237
 		}
245 238
 	}
... ...
@@ -529,14 +521,9 @@ func (n *network) initSubnetSandbox(s *subnet) error {
529 529
 	// Program iptables rules for mandatory encryption of the secure
530 530
 	// network, or clean up leftover rules for a stale secure network which
531 531
 	// was previously assigned the same VNI.
532
-	if err := n.driver.programMangle(s.vni, n.secure); err != nil {
532
+	if err := n.driver.programOverlayEncryptionFirewall(context.TODO(), s.vni, n.secure); err != nil {
533 533
 		return err
534 534
 	}
535
-	if err := n.driver.programInput(s.vni, n.secure); err != nil {
536
-		if n.secure {
537
-			return errors.Join(err, n.driver.programMangle(s.vni, false))
538
-		}
539
-	}
540 535
 
541 536
 	if err := n.setupSubnetSandbox(s, brName, vxlanName); err != nil {
542 537
 		return err
... ...
@@ -13,6 +13,7 @@ import (
13 13
 
14 14
 	"github.com/moby/moby/v2/daemon/libnetwork/discoverapi"
15 15
 	"github.com/moby/moby/v2/daemon/libnetwork/driverapi"
16
+	"github.com/moby/moby/v2/daemon/libnetwork/internal/nftables"
16 17
 	"github.com/moby/moby/v2/daemon/libnetwork/scope"
17 18
 )
18 19
 
... ...
@@ -43,6 +44,9 @@ type driver struct {
43 43
 	secMap encrMap
44 44
 	keys   []*key
45 45
 
46
+	overlayEncNftInitMu sync.Mutex
47
+	overlayEncNftTable  nftables.Table
48
+
46 49
 	// mu must be held when accessing the fields which follow it
47 50
 	// in the struct definition.
48 51
 	//
... ...
@@ -1015,6 +1015,9 @@ func (sd Set) delete(ctx context.Context, t *table) (bool, error) {
1015 1015
 type SetElement struct {
1016 1016
	// SetName is the name of the set the element is added to or deleted from.
 	SetName string
1017 1017
	// Element is the element's value; it is the key under which the element
	// is tracked in the set's Elements map.
 	Element string
1018
+	// If true, deleting an element that does not exist or creating an
1019
+	// element that already exists will succeed.
1020
+	Idempotent bool
1018 1021
 }
1019 1022
 
1020 1023
 func (se SetElement) create(ctx context.Context, t *table) (bool, error) {
... ...
@@ -1026,6 +1029,9 @@ func (se SetElement) create(ctx context.Context, t *table) (bool, error) {
1026 1026
 		return false, fmt.Errorf("cannot add to set '%s', element not specified", se.SetName)
1027 1027
 	}
1028 1028
 	if _, ok := s.Elements[se.Element]; ok {
1029
+		if se.Idempotent {
1030
+			return false, nil
1031
+		}
1029 1032
 		return false, fmt.Errorf("set '%s' already contains element '%s'", s.Name, se.Element)
1030 1033
 	}
1031 1034
 	s.Elements[se.Element] = struct{}{}
... ...
@@ -1046,6 +1052,9 @@ func (se SetElement) delete(ctx context.Context, t *table) (bool, error) {
1046 1046
 		return false, fmt.Errorf("cannot delete from set '%s', it does not exist", se.SetName)
1047 1047
 	}
1048 1048
 	if _, ok := s.Elements[se.Element]; !ok {
1049
+		if se.Idempotent {
1050
+			return false, nil
1051
+		}
1049 1052
 		return false, fmt.Errorf("cannot delete '%s' from set '%s', it does not exist", se.Element, s.Name)
1050 1053
 	}
1051 1054
 	delete(s.Elements, se.Element)