Browse code

libnet/i/defaultipam: introduce a linear allocator

The previous allocator was subnetting address pools eagerly
when the daemon started, and would then just iterate over that
list whenever RequestPool was called. This was leading to high
memory usage whenever IPv6 pools were configured with a target
subnet size too different from the pool's prefix size.

For instance: pool = fd00::/8, target size = /64 -- 2 ^ (64-8)
subnets would be generated upfront. At 128 bits per prefix address,
this would take approx. 9 * 10^18 bits (2^56 * 128 = 2^63) -- way
too much for any computer in 2024.

Another noteworthy issue: the previous implementation was allocating
a subnet, and then, in another layer, was checking whether the
allocation conflicted with some 'reserved networks'. If so,
the allocation would be retried, and so on. To make it worse, 'reserved
networks' would be recomputed on every iteration. This is totally
inefficient as there could be 'reserved networks' that fully overlap
a given address pool (or many!).

To fix this issue, a new field `Exclude` is added to `RequestPool`.
It's up to each driver to take it into account. Since we don't know
whether this retry loop is useful for some remote IPAM driver, it's
reimplemented bug-for-bug directly in the remote driver.

The new allocator uses a linear-search algorithm. It takes advantage
of all lists (predefined pools, allocated subnets and reserved
networks) being sorted and logically combines 'allocated' and
'reserved' through a 'double cursor' to iterate on both lists at the
same time while preserving the total order. At the same time, it
iterates over 'predefined' pools and looks for the first empty space
that would be a good fit.

Currently, the size of the allocated subnet is still dictated by
each 'predefined' pool. We should consider hardcoding that size
instead, and let users specify what subnet size they want. This
wasn't possible before as the subnets were generated upfront. This
new allocator should be able to deal with this easily.

The method used for static allocation has been updated to make sure
the ascending order of 'allocated' is preserved. It's bug-for-bug
compatible with the previous implementation.

One consequence of this new algorithm is that we don't keep track
of where the last allocation happened, we just allocate the first
free subnet we find.

Before:

- Allocate: 10.0.1.0/24, 10.0.2.0/24 ; Deallocate: 10.0.1.0/24 ;
Allocate: 10.0.3.0/24.

Now, with the same sequence, the 3rd allocation would yield
10.0.1.0/24 once again (the first free subnet).

As it doesn't change the semantics of the allocator, there's no
reason to worry about that.

Finally, about 'reserved networks'. The heuristics we use are
now properly documented. It was discovered that we don't check
routes for IPv6 allocations -- this can't be changed because
there's no such thing as on-link routes for IPv6.

(Kudos to Rob Murray for coming up with the linear-search idea.)

Signed-off-by: Albin Kerouanton <albinker@gmail.com>

Albin Kerouanton authored on 2024/04/23 05:36:03
Showing 29 changed files
... ...
@@ -2,6 +2,7 @@ package config // import "github.com/docker/docker/daemon/config"
2 2
 
3 3
 import (
4 4
 	"encoding/json"
5
+	"net/netip"
5 6
 	"os"
6 7
 	"path/filepath"
7 8
 	"reflect"
... ...
@@ -157,7 +158,7 @@ func TestDaemonConfigurationMergeDefaultAddressPools(t *testing.T) {
157 157
 	emptyConfigFile := makeConfigFile(t, `{}`)
158 158
 	configFile := makeConfigFile(t, `{"default-address-pools":[{"base": "10.123.0.0/16", "size": 24 }]}`)
159 159
 
160
-	expected := []*ipamutils.NetworkToSplit{{Base: "10.123.0.0/16", Size: 24}}
160
+	expected := []*ipamutils.NetworkToSplit{{Base: netip.MustParsePrefix("10.123.0.0/16"), Size: 24}}
161 161
 
162 162
 	t.Run("empty config file", func(t *testing.T) {
163 163
 		conf := Config{}
... ...
@@ -167,7 +168,7 @@ func TestDaemonConfigurationMergeDefaultAddressPools(t *testing.T) {
167 167
 
168 168
 		config, err := MergeDaemonConfigurations(&conf, flags, emptyConfigFile)
169 169
 		assert.NilError(t, err)
170
-		assert.DeepEqual(t, config.DefaultAddressPools.Value(), expected)
170
+		assert.DeepEqual(t, config.DefaultAddressPools.Value(), expected, cmpopts.EquateComparable(netip.Prefix{}))
171 171
 	})
172 172
 
173 173
 	t.Run("config file", func(t *testing.T) {
... ...
@@ -177,7 +178,7 @@ func TestDaemonConfigurationMergeDefaultAddressPools(t *testing.T) {
177 177
 
178 178
 		config, err := MergeDaemonConfigurations(&conf, flags, configFile)
179 179
 		assert.NilError(t, err)
180
-		assert.DeepEqual(t, config.DefaultAddressPools.Value(), expected)
180
+		assert.DeepEqual(t, config.DefaultAddressPools.Value(), expected, cmpopts.EquateComparable(netip.Prefix{}))
181 181
 	})
182 182
 
183 183
 	t.Run("with conflicting options", func(t *testing.T) {
... ...
@@ -258,7 +258,7 @@ func (daemon *Daemon) fillDefaultAddressPools(ctx context.Context, v *system.Inf
258 258
 	defer span.End()
259 259
 	for _, pool := range cfg.DefaultAddressPools.Value() {
260 260
 		v.DefaultAddressPools = append(v.DefaultAddressPools, system.NetworkAddressPool{
261
-			Base: pool.Base,
261
+			Base: pool.Base.String(),
262 262
 			Size: pool.Size,
263 263
 		})
264 264
 	}
... ...
@@ -2,6 +2,8 @@ package cnmallocator
2 2
 
3 3
 import (
4 4
 	"context"
5
+	"fmt"
6
+	"net/netip"
5 7
 	"strconv"
6 8
 	"strings"
7 9
 
... ...
@@ -22,8 +24,12 @@ func initIPAMDrivers(r ipamapi.Registerer, netConfig *networkallocator.Config) e
22 22
 	// happens with default address pool option
23 23
 	if netConfig != nil {
24 24
 		for _, p := range netConfig.DefaultAddrPool {
25
+			base, err := netip.ParsePrefix(p)
26
+			if err != nil {
27
+				return fmt.Errorf("invalid prefix %q: %w", p, err)
28
+			}
25 29
 			addressPool = append(addressPool, &ipamutils.NetworkToSplit{
26
-				Base: p,
30
+				Base: base,
27 31
 				Size: int(netConfig.SubnetSize),
28 32
 			})
29 33
 			str.WriteString(p + ",")
... ...
@@ -12,6 +12,9 @@ import (
12 12
 
13 13
 	"github.com/docker/docker/internal/testutils/netnsutils"
14 14
 	"github.com/docker/docker/libnetwork/driverapi"
15
+	"github.com/docker/docker/libnetwork/internal/netiputil"
16
+	"github.com/docker/docker/libnetwork/ipamapi"
17
+	"github.com/docker/docker/libnetwork/ipams/defaultipam"
15 18
 	"github.com/docker/docker/libnetwork/ipamutils"
16 19
 	"github.com/docker/docker/libnetwork/iptables"
17 20
 	"github.com/docker/docker/libnetwork/netlabel"
... ...
@@ -195,16 +198,19 @@ func compareBindings(a, b []types.PortBinding) bool {
195 195
 }
196 196
 
197 197
 func getIPv4Data(t *testing.T) []driverapi.IPAMData {
198
-	ipd := driverapi.IPAMData{AddressSpace: "full"}
199
-	nw, err := netutils.FindAvailableNetwork(ipamutils.GetLocalScopeDefaultNetworks())
200
-	if err != nil {
201
-		t.Fatal(err)
202
-	}
203
-	ipd.Pool = nw
204
-	// Set network gateway to X.X.X.1
205
-	ipd.Gateway = types.GetIPNetCopy(nw)
206
-	ipd.Gateway.IP[len(ipd.Gateway.IP)-1] = 1
207
-	return []driverapi.IPAMData{ipd}
198
+	t.Helper()
199
+
200
+	a, _ := defaultipam.NewAllocator(ipamutils.GetLocalScopeDefaultNetworks(), nil)
201
+	alloc, err := a.RequestPool(ipamapi.PoolRequest{
202
+		AddressSpace: "LocalDefault",
203
+		Exclude:      netutils.InferReservedNetworks(false),
204
+	})
205
+	assert.NilError(t, err)
206
+
207
+	gw, _, err := a.RequestAddress(alloc.PoolID, nil, nil)
208
+	assert.NilError(t, err)
209
+
210
+	return []driverapi.IPAMData{{AddressSpace: "LocalDefault", Pool: netiputil.ToIPNet(alloc.Pool), Gateway: gw}}
208 211
 }
209 212
 
210 213
 func getIPv6Data(t *testing.T) []driverapi.IPAMData {
... ...
@@ -59,3 +59,32 @@ func AddrPortFromNet(addr net.Addr) netip.AddrPort {
59 59
 	}
60 60
 	return netip.AddrPort{}
61 61
 }
62
+
63
+// LastAddr returns the last address of prefix 'p'.
64
+func LastAddr(p netip.Prefix) netip.Addr {
65
+	return ipbits.Add(p.Addr().Prev(), 1, uint(p.Addr().BitLen()-p.Bits()))
66
+}
67
+
68
+// PrefixCompare compares two prefixes and returns a negative, 0, or a positive
69
+// integer as required by [slices.SortFunc]. When two prefixes with the same
70
+// address are provided, the shortest one will be sorted first.
71
+func PrefixCompare(a, b netip.Prefix) int {
72
+	cmp := a.Addr().Compare(b.Addr())
73
+	if cmp != 0 {
74
+		return cmp
75
+	}
76
+	return a.Bits() - b.Bits()
77
+}
78
+
79
+// PrefixAfter returns the prefix of size 'sz' right after 'prev'.
80
+func PrefixAfter(prev netip.Prefix, sz int) netip.Prefix {
81
+	s := sz
82
+	if prev.Bits() < sz {
83
+		s = prev.Bits()
84
+	}
85
+	addr := ipbits.Add(prev.Addr(), 1, uint(prev.Addr().BitLen()-s))
86
+	if addr.IsUnspecified() {
87
+		return netip.Prefix{}
88
+	}
89
+	return netip.PrefixFrom(addr, sz).Masked()
90
+}
62 91
new file mode 100644
... ...
@@ -0,0 +1,46 @@
0
+package netiputil
1
+
2
+import (
3
+	"net/netip"
4
+	"testing"
5
+
6
+	"gotest.tools/v3/assert"
7
+)
8
+
9
+func TestLastAddr(t *testing.T) {
10
+	testcases := []struct {
11
+		p    netip.Prefix
12
+		want netip.Addr
13
+	}{
14
+		{netip.MustParsePrefix("10.0.0.0/24"), netip.MustParseAddr("10.0.0.255")},
15
+		{netip.MustParsePrefix("10.0.0.0/8"), netip.MustParseAddr("10.255.255.255")},
16
+		{netip.MustParsePrefix("fd00::/64"), netip.MustParseAddr("fd00::ffff:ffff:ffff:ffff")},
17
+		{netip.MustParsePrefix("fd00::/16"), netip.MustParseAddr("fd00:ffff:ffff:ffff:ffff:ffff:ffff:ffff")},
18
+		{netip.MustParsePrefix("ffff::/16"), netip.MustParseAddr("ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff")},
19
+	}
20
+
21
+	for _, tc := range testcases {
22
+		last := LastAddr(tc.p)
23
+		assert.Check(t, last == tc.want, "LastAddr(%q) = %s; want: %s", tc.p, last, tc.want)
24
+	}
25
+}
26
+
27
+func TestPrefixAfter(t *testing.T) {
28
+	testcases := []struct {
29
+		prev netip.Prefix
30
+		sz   int
31
+		want netip.Prefix
32
+	}{
33
+		{netip.MustParsePrefix("10.0.10.0/24"), 24, netip.MustParsePrefix("10.0.11.0/24")},
34
+		{netip.MustParsePrefix("10.0.10.0/24"), 16, netip.MustParsePrefix("10.1.0.0/16")},
35
+		{netip.MustParsePrefix("10.10.0.0/16"), 24, netip.MustParsePrefix("10.11.0.0/24")},
36
+		{netip.MustParsePrefix("2001:db8:feed:cafe:b000:dead::/96"), 16, netip.MustParsePrefix("2002::/16")},
37
+		{netip.MustParsePrefix("ffff::/16"), 16, netip.Prefix{}},
38
+		{netip.MustParsePrefix("2001:db8:1::/48"), 64, netip.MustParsePrefix("2001:db8:2::/64")},
39
+	}
40
+
41
+	for _, tc := range testcases {
42
+		next := PrefixAfter(tc.prev, tc.sz)
43
+		assert.Check(t, next == tc.want, "PrefixAfter(%q, %d) = %s; want: %s", tc.prev, tc.sz, next, tc.want)
44
+	}
45
+}
... ...
@@ -35,6 +35,7 @@ var (
35 35
 	ErrIPOutOfRange        = types.InvalidParameterErrorf("requested address is out of range")
36 36
 	ErrPoolOverlap         = types.ForbiddenErrorf("Pool overlaps with other one on this address space")
37 37
 	ErrBadPool             = types.InvalidParameterErrorf("address space does not contain specified address pool")
38
+	ErrNoMoreSubnets       = types.InvalidParameterErrorf("all predefined address pools have been fully subnetted")
38 39
 )
39 40
 
40 41
 // Ipam represents the interface the IPAM service plugins must implement
... ...
@@ -73,6 +74,10 @@ type PoolRequest struct {
73 73
 	// Options is a map of opaque k/v passed to the driver. It's non-mandatory.
74 74
 	// Drivers are free to ignore it.
75 75
 	Options map[string]string
76
+	// Exclude is a list of prefixes the requester wishes not to be dynamically
77
+	// allocated (ie. when Pool isn't specified). It's up to the IPAM driver to
78
+	// take it into account, or totally ignore it. It's required to be sorted.
79
+	Exclude []netip.Prefix
76 80
 	// V6 indicates which address family should be used to dynamically allocate
77 81
 	// a prefix (ie. when Pool isn't specified).
78 82
 	V6 bool
... ...
@@ -2,36 +2,74 @@ package defaultipam
2 2
 
3 3
 import (
4 4
 	"context"
5
-	"fmt"
5
+	"errors"
6 6
 	"net/netip"
7
+	"slices"
7 8
 	"sync"
8 9
 
9 10
 	"github.com/containerd/log"
10 11
 	"github.com/docker/docker/libnetwork/internal/netiputil"
11 12
 	"github.com/docker/docker/libnetwork/ipamapi"
13
+	"github.com/docker/docker/libnetwork/ipamutils"
14
+	"github.com/docker/docker/libnetwork/ipbits"
12 15
 	"github.com/docker/docker/libnetwork/types"
13 16
 )
14 17
 
15 18
 // addrSpace contains the pool configurations for the address space
16 19
 type addrSpace struct {
17
-	// Master subnet pools, indexed by the value's stringified PoolData.Pool field.
20
+	// Ordered list of allocated subnets. This field is used for dynamic subnet
21
+	// allocations.
22
+	allocated []netip.Prefix
23
+	// Allocated subnets, indexed by their prefix. Values track address
24
+	// allocations.
18 25
 	subnets map[netip.Prefix]*PoolData
19 26
 
20
-	// Predefined pool for the address space
21
-	predefined           []netip.Prefix
22
-	predefinedStartIndex int
27
+	// predefined pools for the address space
28
+	predefined []*ipamutils.NetworkToSplit
23 29
 
24 30
 	mu sync.Mutex
25 31
 }
26 32
 
27
-func newAddrSpace(predefined []netip.Prefix) (*addrSpace, error) {
33
+func newAddrSpace(predefined []*ipamutils.NetworkToSplit) (*addrSpace, error) {
34
+	for i, p := range predefined {
35
+		if !p.Base.IsValid() {
36
+			return nil, errors.New("newAddrSpace: prefix zero found")
37
+		}
38
+
39
+		predefined[i].Base = p.Base.Masked()
40
+	}
41
+
42
+	slices.SortFunc(predefined, func(a, b *ipamutils.NetworkToSplit) int {
43
+		return netiputil.PrefixCompare(a.Base, b.Base)
44
+	})
45
+
46
+	// We need to discard longer overlapping prefixes (sorted after the shorter
47
+	// one), otherwise the dynamic allocator might consider a predefined
48
+	// network is fully overlapped, go to the next one, which is a subnet of
49
+	// the previous, and allocate from it.
50
+	j := 0
51
+	for i := 1; i < len(predefined); i++ {
52
+		if predefined[j].Overlaps(predefined[i].Base) {
53
+			continue
54
+		}
55
+		j++
56
+		predefined[j] = predefined[i]
57
+	}
58
+
59
+	if len(predefined) > j {
60
+		j++
61
+	}
62
+	clear(predefined[j:])
63
+
28 64
 	return &addrSpace{
29 65
 		subnets:    map[netip.Prefix]*PoolData{},
30
-		predefined: predefined,
66
+		predefined: predefined[:j],
31 67
 	}, nil
32 68
 }
33 69
 
34
-// allocateSubnet adds the subnet k to the address space.
70
+// allocateSubnet makes a static allocation for subnets 'nw' and 'sub'.
71
+//
72
+// This method is safe for concurrent use.
35 73
 func (aSpace *addrSpace) allocateSubnet(nw, sub netip.Prefix) error {
36 74
 	aSpace.mu.Lock()
37 75
 	defer aSpace.mu.Unlock()
... ...
@@ -52,99 +90,205 @@ func (aSpace *addrSpace) allocateSubnet(nw, sub netip.Prefix) error {
52 52
 	return aSpace.allocateSubnetL(nw, sub)
53 53
 }
54 54
 
55
+// allocateSubnetL takes a 'nw' parent prefix and a 'sub' prefix. These are
56
+// '--subnet' and '--ip-range' on the CLI.
57
+//
58
+// If 'sub' prefix is specified, we don't check if 'parent' overlaps with
59
+// existing allocations. However, if no 'sub' prefix is specified, we do check
60
+// for overlaps. This behavior is weird and leads to the inconsistencies
61
+// documented in https://github.com/moby/moby/issues/46756.
55 62
 func (aSpace *addrSpace) allocateSubnetL(nw, sub netip.Prefix) error {
56 63
 	// If master pool, check for overlap
57 64
 	if sub == (netip.Prefix{}) {
58 65
 		if aSpace.overlaps(nw) {
59 66
 			return ipamapi.ErrPoolOverlap
60 67
 		}
61
-		// This is a new master pool, add it along with corresponding bitmask
62
-		aSpace.subnets[nw] = newPoolData(nw)
63
-		return nil
64
-	}
65
-
66
-	// This is a new non-master pool (subPool)
67
-	if nw.Addr().BitLen() != sub.Addr().BitLen() {
68
-		return fmt.Errorf("pool and subpool are of incompatible address families")
68
+		return aSpace.allocatePool(nw)
69 69
 	}
70 70
 
71 71
 	// Look for parent pool
72
-	pp, ok := aSpace.subnets[nw]
72
+	_, ok := aSpace.subnets[nw]
73 73
 	if !ok {
74
-		// Parent pool does not exist, add it along with corresponding bitmask
75
-		pp = newPoolData(nw)
76
-		pp.autoRelease = true
77
-		aSpace.subnets[nw] = pp
74
+		if err := aSpace.allocatePool(nw); err != nil {
75
+			return err
76
+		}
77
+		aSpace.subnets[nw].autoRelease = true
78 78
 	}
79
-	pp.children[sub] = struct{}{}
79
+	aSpace.subnets[nw].children[sub] = struct{}{}
80 80
 	return nil
81 81
 }
82 82
 
83 83
 // overlaps reports whether nw contains any IP addresses in common with any of
84 84
 // the existing subnets in this address space.
85 85
 func (aSpace *addrSpace) overlaps(nw netip.Prefix) bool {
86
-	for pool := range aSpace.subnets {
87
-		if pool.Overlaps(nw) {
86
+	for _, allocated := range aSpace.allocated {
87
+		if allocated.Overlaps(nw) {
88 88
 			return true
89 89
 		}
90 90
 	}
91 91
 	return false
92 92
 }
93 93
 
94
-// getPredefineds returns the predefined subnets for the address space.
95
-//
96
-// It should not be called concurrently with any other method on the addrSpace.
97
-func (aSpace *addrSpace) getPredefineds() []netip.Prefix {
98
-	i := aSpace.predefinedStartIndex
99
-	// defensive in case the list changed since last update
100
-	if i >= len(aSpace.predefined) {
101
-		i = 0
102
-	}
103
-	return append(aSpace.predefined[i:], aSpace.predefined[:i]...)
94
+func (aSpace *addrSpace) allocatePool(nw netip.Prefix) error {
95
+	n, _ := slices.BinarySearchFunc(aSpace.allocated, nw, netiputil.PrefixCompare)
96
+	aSpace.allocated = slices.Insert(aSpace.allocated, n, nw)
97
+	aSpace.subnets[nw] = newPoolData(nw)
98
+	return nil
104 99
 }
105 100
 
106
-// updatePredefinedStartIndex rotates the predefined subnet list by amt.
101
+// allocatePredefinedPool dynamically allocates a subnet that doesn't overlap
102
+// with existing allocations and 'reserved' prefixes.
107 103
 //
108
-// It should not be called concurrently with any other method on the addrSpace.
109
-func (aSpace *addrSpace) updatePredefinedStartIndex(amt int) {
110
-	i := aSpace.predefinedStartIndex + amt
111
-	if i < 0 || i >= len(aSpace.predefined) {
112
-		i = 0
113
-	}
114
-	aSpace.predefinedStartIndex = i
115
-}
116
-
117
-func (aSpace *addrSpace) allocatePredefinedPool(ipV6 bool) (netip.Prefix, error) {
104
+// This method is safe for concurrent use.
105
+func (aSpace *addrSpace) allocatePredefinedPool(reserved []netip.Prefix) (netip.Prefix, error) {
118 106
 	aSpace.mu.Lock()
119 107
 	defer aSpace.mu.Unlock()
120 108
 
121
-	for i, nw := range aSpace.getPredefineds() {
122
-		if ipV6 != nw.Addr().Is6() {
123
-			continue
109
+	var pdfID int
110
+	var partialOverlap bool
111
+	var prevAlloc netip.Prefix
112
+
113
+	it := newMergeIter(aSpace.allocated, reserved, netiputil.PrefixCompare)
114
+
115
+	makeAlloc := func(subnet netip.Prefix) netip.Prefix {
116
+		// it.ia tracks the position of the mergeIter within aSpace.allocated.
117
+		aSpace.allocated = slices.Insert(aSpace.allocated, it.ia, subnet)
118
+		aSpace.subnets[subnet] = newPoolData(subnet)
119
+		return subnet
120
+	}
121
+
122
+	for {
123
+		allocated := it.Get()
124
+		if allocated == (netip.Prefix{}) {
125
+			// We reached the end of both 'aSpace.allocated' and 'reserved'.
126
+			break
124 127
 		}
125
-		// Checks whether pool has already been allocated
126
-		if _, ok := aSpace.subnets[nw]; ok {
128
+
129
+		if pdfID >= len(aSpace.predefined) {
130
+			return netip.Prefix{}, ipamapi.ErrNoMoreSubnets
131
+		}
132
+		pdf := aSpace.predefined[pdfID]
133
+
134
+		if allocated.Overlaps(pdf.Base) {
135
+			it.Inc()
136
+
137
+			if allocated.Bits() <= pdf.Base.Bits() {
138
+				// The current 'allocated' prefix is at least as big as the 'pdf'
139
+				// network, thus the block is fully overlapped.
140
+				partialOverlap = false
141
+				prevAlloc = netip.Prefix{}
142
+				pdfID++
143
+				continue
144
+			}
145
+
146
+			// If no previous 'allocated' was found to partially overlap 'pdf',
147
+			// we need to test whether there's enough space available at the
148
+			// beginning of 'pdf'.
149
+			if !partialOverlap && ipbits.SubnetsBetween(pdf.FirstPrefix().Addr(), allocated.Addr(), pdf.Size) >= 1 {
150
+				// Okay, so there's at least a whole subnet available between
151
+				// the start of 'pdf' and 'allocated'.
152
+				next := pdf.FirstPrefix()
153
+				return makeAlloc(next), nil
154
+			}
155
+
156
+			// If the network 'pdf' was already found to be partially
157
+			// overlapped, we need to test whether there's enough space between
158
+			// the end of 'prevAlloc' and current 'allocated'.
159
+			afterPrev := netiputil.PrefixAfter(prevAlloc, pdf.Size)
160
+			if partialOverlap && ipbits.SubnetsBetween(afterPrev.Addr(), allocated.Addr(), pdf.Size) >= 1 {
161
+				// Okay, so there's at least a whole subnet available after
162
+				// 'prevAlloc' and before 'allocated'.
163
+				return makeAlloc(afterPrev), nil
164
+			}
165
+
166
+			if netiputil.LastAddr(allocated) == netiputil.LastAddr(pdf.Base) {
167
+				// The last address of the current 'allocated' prefix is the
168
+				// same as the last address of the 'pdf' network, it's fully
169
+				// overlapped.
170
+				partialOverlap = false
171
+				prevAlloc = netip.Prefix{}
172
+				pdfID++
173
+				continue
174
+			}
175
+
176
+			// This 'pdf' network is partially overlapped.
177
+			partialOverlap = true
178
+			prevAlloc = allocated
127 179
 			continue
128 180
 		}
129
-		// Shouldn't be necessary, but check prevents IP collisions should
130
-		// predefined pools overlap for any reason.
131
-		if !aSpace.overlaps(nw) {
132
-			aSpace.updatePredefinedStartIndex(i + 1)
133
-			err := aSpace.allocateSubnetL(nw, netip.Prefix{})
134
-			if err != nil {
135
-				return netip.Prefix{}, err
181
+
182
+		// Okay, so previous 'allocated' overlapped and current doesn't. Now
183
+		// the question is: is there enough space left between previous
184
+		// 'allocated' and the end of the 'pdf' network?
185
+		if partialOverlap {
186
+			partialOverlap = false
187
+
188
+			if next := netiputil.PrefixAfter(prevAlloc, pdf.Size); pdf.Overlaps(next) {
189
+				return makeAlloc(next), nil
136 190
 			}
137
-			return nw, nil
191
+
192
+			// No luck, PrefixAfter yielded an invalid prefix. There's not
193
+			// enough space left to subnet it once more.
194
+			pdfID++
195
+
196
+			// 'it' is not incremented here, we need to re-test the current
197
+			// 'allocated' against the next 'pdf' network.
198
+			continue
199
+		}
200
+
201
+		// If the network 'pdf' doesn't overlap and is sorted before the
202
+		// current 'allocated', we found the right spot.
203
+		if pdf.Base.Addr().Less(allocated.Addr()) {
204
+			next := netip.PrefixFrom(pdf.Base.Addr(), pdf.Size)
205
+			return makeAlloc(next), nil
206
+		}
207
+
208
+		it.Inc()
209
+		prevAlloc = allocated
210
+	}
211
+
212
+	if pdfID >= len(aSpace.predefined) {
213
+		return netip.Prefix{}, ipamapi.ErrNoMoreSubnets
214
+	}
215
+
216
+	// We reached the end of 'allocated', but not the end of predefined
217
+	// networks. Let's try two more times (once on the current 'pdf', and once
218
+	// on the next network if any).
219
+	if partialOverlap {
220
+		pdf := aSpace.predefined[pdfID]
221
+
222
+		if next := netiputil.PrefixAfter(prevAlloc, pdf.Size); pdf.Overlaps(next) {
223
+			return makeAlloc(next), nil
138 224
 		}
225
+
226
+		// No luck -- PrefixAfter yielded an invalid prefix. There's not enough
227
+		// space left.
228
+		pdfID++
139 229
 	}
140 230
 
141
-	v := 4
142
-	if ipV6 {
143
-		v = 6
231
+	// One last chance. Here we don't increment pdfID since the last iteration
232
+	// on 'it' found either:
233
+	//
234
+	// - A full overlap, and incremented 'pdfID'.
235
+	// - A partial overlap, and the previous 'if' incremented 'pdfID'.
236
+	// - The current 'pdfID' comes after the last 'allocated' -- it's not
237
+	//   overlapped at all.
238
+	//
239
+	// Hence, we're sure 'pdfID' has never been subnetted yet.
240
+	if pdfID < len(aSpace.predefined) {
241
+		pdf := aSpace.predefined[pdfID]
242
+
243
+		next := pdf.FirstPrefix()
244
+		return makeAlloc(next), nil
144 245
 	}
145
-	return netip.Prefix{}, types.NotFoundErrorf("could not find an available, non-overlapping IPv%d address pool among the defaults to assign to the network", v)
246
+
247
+	return netip.Prefix{}, ipamapi.ErrNoMoreSubnets
146 248
 }
147 249
 
250
+// releaseSubnet deallocates prefixes nw and sub. It returns an error if no
251
+// matching allocations could be found.
252
+//
253
+// This method is safe for concurrent use.
148 254
 func (aSpace *addrSpace) releaseSubnet(nw, sub netip.Prefix) error {
149 255
 	aSpace.mu.Lock()
150 256
 	defer aSpace.mu.Unlock()
... ...
@@ -164,12 +308,20 @@ func (aSpace *addrSpace) releaseSubnet(nw, sub netip.Prefix) error {
164 164
 	}
165 165
 
166 166
 	if len(p.children) == 0 && p.autoRelease {
167
-		delete(aSpace.subnets, nw)
167
+		aSpace.deallocate(nw)
168 168
 	}
169 169
 
170 170
 	return nil
171 171
 }
172 172
 
173
+// deallocate removes 'nw' from the list of allocations.
174
+func (aSpace *addrSpace) deallocate(nw netip.Prefix) {
175
+	if i, ok := slices.BinarySearchFunc(aSpace.allocated, nw, netiputil.PrefixCompare); ok {
176
+		aSpace.allocated = slices.Delete(aSpace.allocated, i, i+1)
177
+		delete(aSpace.subnets, nw)
178
+	}
179
+}
180
+
173 181
 func (aSpace *addrSpace) requestAddress(nw, sub netip.Prefix, prefAddress netip.Addr, opts map[string]string) (netip.Addr, error) {
174 182
 	aSpace.mu.Lock()
175 183
 	defer aSpace.mu.Unlock()
176 184
new file mode 100644
... ...
@@ -0,0 +1,389 @@
0
+package defaultipam
1
+
2
+import (
3
+	"net/netip"
4
+	"testing"
5
+
6
+	"github.com/docker/docker/libnetwork/ipamapi"
7
+	"github.com/docker/docker/libnetwork/ipamutils"
8
+	"github.com/google/go-cmp/cmp/cmpopts"
9
+	"gotest.tools/v3/assert"
10
+	is "gotest.tools/v3/assert/cmp"
11
+)
12
+
13
+func TestNewAddrSpaceDedup(t *testing.T) {
14
+	as, err := newAddrSpace([]*ipamutils.NetworkToSplit{
15
+		{Base: netip.MustParsePrefix("10.0.0.0/8"), Size: 16},
16
+		{Base: netip.MustParsePrefix("10.0.0.0/8"), Size: 24},
17
+		{Base: netip.MustParsePrefix("10.10.0.0/8"), Size: 24},
18
+		{Base: netip.MustParsePrefix("10.0.100.0/8"), Size: 24},
19
+		{Base: netip.MustParsePrefix("192.168.0.0/24"), Size: 24},
20
+	})
21
+	assert.NilError(t, err)
22
+
23
+	assert.DeepEqual(t, as.predefined, []*ipamutils.NetworkToSplit{
24
+		{Base: netip.MustParsePrefix("10.0.0.0/8"), Size: 16},
25
+		{Base: netip.MustParsePrefix("192.168.0.0/24"), Size: 24},
26
+	}, cmpopts.EquateComparable(ipamutils.NetworkToSplit{}))
27
+}
28
+
29
+func TestDynamicPoolAllocation(t *testing.T) {
30
+	testcases := []struct {
31
+		name       string
32
+		predefined []*ipamutils.NetworkToSplit
33
+		allocated  []netip.Prefix
34
+		reserved   []netip.Prefix
35
+		expPrefix  netip.Prefix
36
+		expErr     error
37
+	}{
38
+		{
39
+			name: "First allocated overlaps at the end of first pool",
40
+			predefined: []*ipamutils.NetworkToSplit{
41
+				{Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24},
42
+			},
43
+			allocated: []netip.Prefix{
44
+				// Partial overlap with enough space remaining
45
+				netip.MustParsePrefix("192.168.255.0/24"),
46
+			},
47
+			expPrefix: netip.MustParsePrefix("192.168.0.0/24"),
48
+		},
49
+		{
50
+			name: "First reserved bigger than first allocated",
51
+			predefined: []*ipamutils.NetworkToSplit{
52
+				{Base: netip.MustParsePrefix("10.0.0.0/8"), Size: 24},
53
+				{Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24},
54
+			},
55
+			allocated: []netip.Prefix{
56
+				// Full overlap of the first pool (also covered by the reserved /7)
57
+				netip.MustParsePrefix("10.0.0.0/8"),
58
+			},
59
+			reserved: []netip.Prefix{
60
+				netip.MustParsePrefix("10.0.0.0/7"),
61
+			},
62
+			expPrefix: netip.MustParsePrefix("192.168.0.0/24"),
63
+		},
64
+		{
65
+			name: "First pool fully overlapped by bigger allocated, next overlapped in the middle",
66
+			predefined: []*ipamutils.NetworkToSplit{
67
+				{Base: netip.MustParsePrefix("10.20.0.0/16"), Size: 24},
68
+				{Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24},
69
+			},
70
+			allocated: []netip.Prefix{
71
+				netip.MustParsePrefix("10.0.0.0/8"),
72
+				// Partial overlap with enough space remaining
73
+				netip.MustParsePrefix("192.168.128.0/24"),
74
+			},
75
+			expPrefix: netip.MustParsePrefix("192.168.0.0/24"),
76
+		},
77
+		{
78
+			name: "First pool fully overlapped by bigger allocated, next overlapped at the beginning and in the middle",
79
+			predefined: []*ipamutils.NetworkToSplit{
80
+				{Base: netip.MustParsePrefix("10.20.0.0/16"), Size: 24},
81
+				{Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24},
82
+			},
83
+			allocated: []netip.Prefix{
84
+				netip.MustParsePrefix("10.0.0.0/8"),
85
+				// Partial overlap with enough space remaining
86
+				netip.MustParsePrefix("192.168.0.0/24"),
87
+				netip.MustParsePrefix("192.168.128.0/24"),
88
+			},
89
+			expPrefix: netip.MustParsePrefix("192.168.1.0/24"),
90
+		},
91
+		{
92
+			name: "First pool fully overlapped by smaller prefixes, next overlapped in the middle",
93
+			predefined: []*ipamutils.NetworkToSplit{
94
+				{Base: netip.MustParsePrefix("10.20.0.0/22"), Size: 24},
95
+				{Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24},
96
+			},
97
+			allocated: []netip.Prefix{
98
+				netip.MustParsePrefix("10.20.0.0/24"),
99
+				netip.MustParsePrefix("10.20.1.0/24"),
100
+				netip.MustParsePrefix("10.20.2.0/24"),
101
+				netip.MustParsePrefix("192.168.128.0/24"),
102
+			},
103
+			reserved: []netip.Prefix{
104
+				netip.MustParsePrefix("10.20.3.0/24"),
105
+			},
106
+			expPrefix: netip.MustParsePrefix("192.168.0.0/24"),
107
+		},
108
+		{
109
+			name: "First pool fully overlapped by smaller prefix, next predefined before reserved",
110
+			predefined: []*ipamutils.NetworkToSplit{
111
+				{Base: netip.MustParsePrefix("10.20.0.0/16"), Size: 24},
112
+				{Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24},
113
+			},
114
+			allocated: []netip.Prefix{
115
+				netip.MustParsePrefix("10.20.0.0/17"),
116
+				netip.MustParsePrefix("10.20.128.0/17"),
117
+			},
118
+			reserved: []netip.Prefix{
119
+				netip.MustParsePrefix("200.1.2.0/24"),
120
+			},
121
+			expPrefix: netip.MustParsePrefix("192.168.0.0/24"),
122
+		},
123
+		{
124
+			name: "First pool fully overlapped by smaller prefix, reserved is the same as the last allocated subnet",
125
+			predefined: []*ipamutils.NetworkToSplit{
126
+				{Base: netip.MustParsePrefix("10.10.0.0/22"), Size: 24},
127
+				{Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24},
128
+			},
129
+			allocated: []netip.Prefix{
130
+				netip.MustParsePrefix("10.10.0.0/24"),
131
+				netip.MustParsePrefix("10.10.1.0/24"),
132
+				netip.MustParsePrefix("10.10.2.0/24"),
133
+				netip.MustParsePrefix("10.10.3.0/24"),
134
+			},
135
+			reserved: []netip.Prefix{
136
+				netip.MustParsePrefix("10.10.3.0/24"),
137
+			},
138
+			expPrefix: netip.MustParsePrefix("192.168.0.0/24"),
139
+		},
140
+		{
141
+			name: "Partial overlap by allocated of different sizes",
142
+			predefined: []*ipamutils.NetworkToSplit{
143
+				{Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24},
144
+			},
145
+			allocated: []netip.Prefix{
146
+				// Partial overlap with enough space remaining
147
+				netip.MustParsePrefix("192.168.0.0/24"),
148
+				netip.MustParsePrefix("192.168.1.0/24"),
149
+				netip.MustParsePrefix("192.168.2.0/23"),
150
+				netip.MustParsePrefix("192.168.4.3/30"),
151
+			},
152
+			expPrefix: netip.MustParsePrefix("192.168.5.0/24"),
153
+		},
154
+		{
155
+			name: "Partial overlap at the start, not enough space left",
156
+			predefined: []*ipamutils.NetworkToSplit{
157
+				{Base: netip.MustParsePrefix("10.0.0.0/31"), Size: 31},
158
+				{Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24},
159
+			},
160
+			allocated: []netip.Prefix{
161
+				// Partial overlap, not enough space left
162
+				netip.MustParsePrefix("10.0.0.0/32"),
163
+				netip.MustParsePrefix("100.0.0.0/32"),
164
+				netip.MustParsePrefix("200.0.0.0/32"),
165
+			},
166
+			expPrefix: netip.MustParsePrefix("192.168.0.0/24"),
167
+		},
168
+		{
169
+			name: "Partial overlap by allocations and reserved of different sizes",
170
+			predefined: []*ipamutils.NetworkToSplit{
171
+				{Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24},
172
+			},
173
+			allocated: []netip.Prefix{
174
+				// Partial overlap with enough space remaining
175
+				netip.MustParsePrefix("192.168.0.0/24"),
176
+				netip.MustParsePrefix("192.168.1.0/24"),
177
+				netip.MustParsePrefix("192.168.2.3/30"),
178
+			},
179
+			reserved: []netip.Prefix{
180
+				netip.MustParsePrefix("192.168.2.4/30"),
181
+				netip.MustParsePrefix("192.168.3.0/30"),
182
+				netip.MustParsePrefix("192.168.4.0/23"),
183
+			},
184
+			expPrefix: netip.MustParsePrefix("192.168.6.0/24"),
185
+		},
186
+		{
187
+			name: "Partial overlap, same prefix in allocated and reserved",
188
+			predefined: []*ipamutils.NetworkToSplit{
189
+				{Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24},
190
+			},
191
+			allocated: []netip.Prefix{
192
+				// Partial overlap with enough space remaining
193
+				netip.MustParsePrefix("192.168.0.0/24"),
194
+			},
195
+			reserved: []netip.Prefix{
196
+				netip.MustParsePrefix("192.168.0.0/24"),
197
+			},
198
+			expPrefix: netip.MustParsePrefix("192.168.1.0/24"),
199
+		},
200
+		{
201
+			name: "Partial overlap, two predefined",
202
+			predefined: []*ipamutils.NetworkToSplit{
203
+				{Base: netip.MustParsePrefix("10.0.0.0/8"), Size: 24},
204
+				{Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24},
205
+			},
206
+			allocated: []netip.Prefix{
207
+				netip.MustParsePrefix("10.0.0.0/24"),
208
+			},
209
+			reserved: []netip.Prefix{
210
+				netip.MustParsePrefix("192.168.0.0/24"),
211
+			},
212
+			expPrefix: netip.MustParsePrefix("10.0.1.0/24"),
213
+		},
214
+		{
215
+			name: "Predefined with overlapping prefixes, longer prefixes discarded",
216
+			predefined: []*ipamutils.NetworkToSplit{
217
+				{Base: netip.MustParsePrefix("10.0.0.0/8"), Size: 24},
218
+				// This predefined will be discarded.
219
+				{Base: netip.MustParsePrefix("10.0.0.0/16"), Size: 24},
220
+				// This predefined will be discarded.
221
+				{Base: netip.MustParsePrefix("10.10.0.0/16"), Size: 24},
222
+				{Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24},
223
+			},
224
+			reserved:  []netip.Prefix{netip.MustParsePrefix("10.0.0.0/8")},
225
+			expPrefix: netip.MustParsePrefix("192.168.0.0/24"),
226
+		},
227
+		{
228
+			name: "Partial overlap at the beginning, single predefined",
229
+			predefined: []*ipamutils.NetworkToSplit{
230
+				{Base: netip.MustParsePrefix("172.16.0.0/15"), Size: 16},
231
+			},
232
+			allocated: []netip.Prefix{
233
+				netip.MustParsePrefix("172.16.0.0/16"),
234
+			},
235
+			expPrefix: netip.MustParsePrefix("172.17.0.0/16"),
236
+		},
237
+		{
238
+			name: "Partial overlap, no space left at the end, next pool not subnetted yet",
239
+			predefined: []*ipamutils.NetworkToSplit{
240
+				{Base: netip.MustParsePrefix("172.16.0.0/15"), Size: 16},
241
+				{Base: netip.MustParsePrefix("192.168.0.0/16"), Size: 24},
242
+			},
243
+			allocated: []netip.Prefix{
244
+				netip.MustParsePrefix("172.16.0.0/16"),
245
+				netip.MustParsePrefix("172.17.0.0/17"),
246
+			},
247
+			expPrefix: netip.MustParsePrefix("192.168.0.0/24"),
248
+		},
249
+		{
250
+			name: "Partial overlap, no space left at the end, no more predefined",
251
+			predefined: []*ipamutils.NetworkToSplit{
252
+				{Base: netip.MustParsePrefix("172.16.0.0/15"), Size: 16},
253
+			},
254
+			allocated: []netip.Prefix{
255
+				netip.MustParsePrefix("172.16.0.0/16"),
256
+				netip.MustParsePrefix("172.17.0.0/17"),
257
+			},
258
+			expErr: ipamapi.ErrNoMoreSubnets,
259
+		},
260
+		{
261
+			name: "Extra allocated, no pool left",
262
+			predefined: []*ipamutils.NetworkToSplit{
263
+				{Base: netip.MustParsePrefix("172.16.0.0/15"), Size: 16},
264
+			},
265
+			allocated: []netip.Prefix{
266
+				netip.MustParsePrefix("172.16.0.0/16"),
267
+				netip.MustParsePrefix("172.17.0.0/16"),
268
+				netip.MustParsePrefix("192.168.0.0/24"),
269
+			},
270
+			expErr: ipamapi.ErrNoMoreSubnets,
271
+		},
272
+		{
273
+			name: "Extra reserved, no pool left",
274
+			predefined: []*ipamutils.NetworkToSplit{
275
+				{Base: netip.MustParsePrefix("172.16.0.0/15"), Size: 16},
276
+			},
277
+			allocated: []netip.Prefix{
278
+				netip.MustParsePrefix("172.16.0.0/16"),
279
+				netip.MustParsePrefix("172.17.0.0/16"),
280
+			},
281
+			reserved: []netip.Prefix{
282
+				netip.MustParsePrefix("192.168.0.0/24"),
283
+			},
284
+			expErr: ipamapi.ErrNoMoreSubnets,
285
+		},
286
+		{
287
+			name: "Predefined fully allocated",
288
+			predefined: []*ipamutils.NetworkToSplit{
289
+				{Base: netip.MustParsePrefix("172.16.0.0/15"), Size: 16},
290
+				{Base: netip.MustParsePrefix("192.168.0.0/23"), Size: 24},
291
+			},
292
+			allocated: []netip.Prefix{
293
+				netip.MustParsePrefix("172.16.0.0/16"),
294
+				netip.MustParsePrefix("172.17.0.0/16"),
295
+				netip.MustParsePrefix("192.168.0.0/24"),
296
+				netip.MustParsePrefix("192.168.1.0/24"),
297
+			},
298
+			expErr: ipamapi.ErrNoMoreSubnets,
299
+		},
300
+		{
301
+			name: "Partial overlap, not enough space left",
302
+			predefined: []*ipamutils.NetworkToSplit{
303
+				{Base: netip.MustParsePrefix("172.16.0.0/15"), Size: 16},
304
+				{Base: netip.MustParsePrefix("192.168.0.0/23"), Size: 24},
305
+			},
306
+			allocated: []netip.Prefix{
307
+				netip.MustParsePrefix("172.16.0.0/16"),
308
+				netip.MustParsePrefix("172.17.128.0/17"),
309
+				netip.MustParsePrefix("192.168.0.1/32"),
310
+				netip.MustParsePrefix("192.168.1.0/24"),
311
+			},
312
+			expErr: ipamapi.ErrNoMoreSubnets,
313
+		},
314
+		{
315
+			name: "Duplicate 'allocated' at the end of a predefined",
316
+			predefined: []*ipamutils.NetworkToSplit{
317
+				{Base: netip.MustParsePrefix("172.16.0.0/15"), Size: 16},
318
+				{Base: netip.MustParsePrefix("192.168.0.0/23"), Size: 24},
319
+			},
320
+			allocated: []netip.Prefix{
321
+				netip.MustParsePrefix("172.16.0.0/16"),
322
+				netip.MustParsePrefix("172.17.128.0/17"),
323
+				netip.MustParsePrefix("172.17.128.0/17"),
324
+				netip.MustParsePrefix("172.17.128.0/17"),
325
+			},
326
+			expPrefix: netip.MustParsePrefix("192.168.0.0/24"),
327
+		},
328
+		{
329
+			name: "Duplicate 'allocated'",
330
+			predefined: []*ipamutils.NetworkToSplit{
331
+				{Base: netip.MustParsePrefix("172.16.0.0/15"), Size: 16},
332
+				{Base: netip.MustParsePrefix("192.168.0.0/23"), Size: 24},
333
+			},
334
+			allocated: []netip.Prefix{
335
+				netip.MustParsePrefix("172.16.0.0/16"),
336
+				netip.MustParsePrefix("172.16.120.0/24"),
337
+				netip.MustParsePrefix("172.17.128.0/17"),
338
+				netip.MustParsePrefix("172.17.128.0/17"),
339
+				netip.MustParsePrefix("172.17.128.0/24"),
340
+				netip.MustParsePrefix("172.17.128.0/17"),
341
+			},
342
+			expPrefix: netip.MustParsePrefix("192.168.0.0/24"),
343
+		},
344
+	}
345
+
346
+	for _, tc := range testcases {
347
+		t.Run(tc.name, func(t *testing.T) {
348
+			as, err := newAddrSpace(tc.predefined)
349
+			assert.NilError(t, err)
350
+			as.allocated = tc.allocated
351
+
352
+			p, err := as.allocatePredefinedPool(tc.reserved)
353
+
354
+			assert.Check(t, is.ErrorIs(err, tc.expErr))
355
+			assert.Check(t, is.Equal(p, tc.expPrefix))
356
+		})
357
+	}
358
+}
359
+
360
+func TestStaticAllocation(t *testing.T) {
361
+	as, err := newAddrSpace([]*ipamutils.NetworkToSplit{
362
+		{Base: netip.MustParsePrefix("10.0.0.0/8"), Size: 24},
363
+	})
364
+	assert.NilError(t, err)
365
+
366
+	for _, alloc := range []netip.Prefix{
367
+		netip.MustParsePrefix("192.168.0.0/16"),
368
+		netip.MustParsePrefix("10.0.0.0/24"),
369
+		netip.MustParsePrefix("10.0.1.0/24"),
370
+		netip.MustParsePrefix("10.1.0.0/16"),
371
+		netip.MustParsePrefix("10.0.0.0/31"),
372
+		netip.MustParsePrefix("10.0.0.0/8"),
373
+		netip.MustParsePrefix("192.168.3.0/24"),
374
+	} {
375
+		err := as.allocatePool(alloc)
376
+		assert.NilError(t, err)
377
+	}
378
+
379
+	assert.Check(t, is.DeepEqual(as.allocated, []netip.Prefix{
380
+		netip.MustParsePrefix("10.0.0.0/8"),
381
+		netip.MustParsePrefix("10.0.0.0/24"),
382
+		netip.MustParsePrefix("10.0.0.0/31"),
383
+		netip.MustParsePrefix("10.0.1.0/24"),
384
+		netip.MustParsePrefix("10.1.0.0/16"),
385
+		netip.MustParsePrefix("192.168.0.0/16"),
386
+		netip.MustParsePrefix("192.168.3.0/24"),
387
+	}, cmpopts.EquateComparable(netip.Prefix{})))
388
+}
... ...
@@ -27,25 +27,15 @@ const (
27 27
 // two optional address pools respectively containing the list of user-defined
28 28
 // address pools for 'local' and 'global' address spaces.
29 29
 func Register(ic ipamapi.Registerer, lAddrPools, gAddrPools []*ipamutils.NetworkToSplit) error {
30
-	localAddressPools := ipamutils.GetLocalScopeDefaultNetworks()
31
-	if len(lAddrPools) > 0 {
32
-		var err error
33
-		localAddressPools, err = ipamutils.SplitNetworks(lAddrPools)
34
-		if err != nil {
35
-			return err
36
-		}
30
+	if len(lAddrPools) == 0 {
31
+		lAddrPools = ipamutils.GetLocalScopeDefaultNetworks()
37 32
 	}
38 33
 
39
-	globalAddressPools := ipamutils.GetGlobalScopeDefaultNetworks()
40
-	if len(gAddrPools) > 0 {
41
-		var err error
42
-		globalAddressPools, err = ipamutils.SplitNetworks(gAddrPools)
43
-		if err != nil {
44
-			return err
45
-		}
34
+	if len(gAddrPools) == 0 {
35
+		gAddrPools = ipamutils.GetGlobalScopeDefaultNetworks()
46 36
 	}
47 37
 
48
-	a, err := NewAllocator(localAddressPools, globalAddressPools)
38
+	a, err := NewAllocator(lAddrPools, gAddrPools)
49 39
 	if err != nil {
50 40
 		return err
51 41
 	}
... ...
@@ -55,18 +45,18 @@ func Register(ic ipamapi.Registerer, lAddrPools, gAddrPools []*ipamutils.Network
55 55
 	return ic.RegisterIpamDriverWithCapabilities(DriverName, a, cps)
56 56
 }
57 57
 
58
-// Allocator provides per address space ipv4/ipv6 book keeping
58
+// Allocator provides per address space ipv4/ipv6 bookkeeping
59 59
 type Allocator struct {
60 60
 	// The address spaces
61 61
 	local4, local6, global4, global6 *addrSpace
62 62
 }
63 63
 
64 64
 // NewAllocator returns an instance of libnetwork ipam
65
-func NewAllocator(lcAs, glAs []*net.IPNet) (*Allocator, error) {
65
+func NewAllocator(lcAs, glAs []*ipamutils.NetworkToSplit) (*Allocator, error) {
66 66
 	var (
67 67
 		a                          Allocator
68 68
 		err                        error
69
-		lcAs4, lcAs6, glAs4, glAs6 []netip.Prefix
69
+		lcAs4, lcAs6, glAs4, glAs6 []*ipamutils.NetworkToSplit
70 70
 	)
71 71
 
72 72
 	lcAs4, lcAs6, err = splitByIPFamily(lcAs)
... ...
@@ -98,19 +88,18 @@ func NewAllocator(lcAs, glAs []*net.IPNet) (*Allocator, error) {
98 98
 	return &a, nil
99 99
 }
100 100
 
101
-func splitByIPFamily(s []*net.IPNet) ([]netip.Prefix, []netip.Prefix, error) {
102
-	var v4, v6 []netip.Prefix
101
+func splitByIPFamily(s []*ipamutils.NetworkToSplit) ([]*ipamutils.NetworkToSplit, []*ipamutils.NetworkToSplit, error) {
102
+	var v4, v6 []*ipamutils.NetworkToSplit
103 103
 
104 104
 	for i, n := range s {
105
-		p, ok := netiputil.ToPrefix(n)
106
-		if !ok {
107
-			return []netip.Prefix{}, []netip.Prefix{}, fmt.Errorf("network at index %d (%v) is not in canonical form", i, n)
105
+		if !n.Base.IsValid() || n.Size == 0 {
106
+			return []*ipamutils.NetworkToSplit{}, []*ipamutils.NetworkToSplit{}, fmt.Errorf("network at index %d (%v) is not in canonical form", i, n)
108 107
 		}
109 108
 
110
-		if p.Addr().Is4() {
111
-			v4 = append(v4, p)
109
+		if n.Base.Addr().Is4() {
110
+			v4 = append(v4, n)
112 111
 		} else {
113
-			v6 = append(v6, p)
112
+			v6 = append(v6, n)
114 113
 		}
115 114
 	}
116 115
 
... ...
@@ -147,7 +136,7 @@ func (a *Allocator) RequestPool(req ipamapi.PoolRequest) (ipamapi.AllocatedPool,
147 147
 
148 148
 	k := PoolID{AddressSpace: req.AddressSpace}
149 149
 	if req.Pool == "" {
150
-		if k.Subnet, err = aSpace.allocatePredefinedPool(req.V6); err != nil {
150
+		if k.Subnet, err = aSpace.allocatePredefinedPool(req.Exclude); err != nil {
151 151
 			return ipamapi.AllocatedPool{}, err
152 152
 		}
153 153
 		return ipamapi.AllocatedPool{PoolID: k.String(), Pool: k.Subnet}, nil
... ...
@@ -163,6 +152,11 @@ func (a *Allocator) RequestPool(req ipamapi.PoolRequest) (ipamapi.AllocatedPool,
163 163
 		}
164 164
 	}
165 165
 
166
+	// This is a new non-master pool (subPool)
167
+	if k.Subnet.IsValid() && k.ChildSubnet.IsValid() && k.Subnet.Addr().BitLen() != k.ChildSubnet.Addr().BitLen() {
168
+		return ipamapi.AllocatedPool{}, types.InvalidParameterErrorf("pool and subpool are of incompatible address families")
169
+	}
170
+
166 171
 	k.Subnet, k.ChildSubnet = k.Subnet.Masked(), k.ChildSubnet.Masked()
167 172
 	// Prior to https://github.com/moby/moby/pull/44968, libnetwork would happily accept a ChildSubnet with a bigger
168 173
 	// mask than its parent subnet. In such case, it was producing IP addresses based on the parent subnet, and the
... ...
@@ -227,7 +221,7 @@ func newPoolData(pool netip.Prefix) *PoolData {
227 227
 	h := bitmap.New(numAddresses)
228 228
 
229 229
 	// Pre-reserve the network address on IPv4 networks large
230
-	// enough to have one (i.e., anything bigger than a /31.
230
+	// enough to have one (i.e., anything bigger than a /31).
231 231
 	if !(pool.Addr().Is4() && numAddresses <= 2) {
232 232
 		h.Set(0)
233 233
 	}
... ...
@@ -336,40 +336,39 @@ func TestGetSameAddress(t *testing.T) {
336 336
 	}
337 337
 }
338 338
 
339
-func TestPoolAllocationReuse(t *testing.T) {
339
+// TestRequestFromSamePool verifies that the allocator implements the validation
340
+// inconsistencies described in https://github.com/moby/moby/issues/46756.
341
+func TestRequestFromSamePool(t *testing.T) {
340 342
 	a, err := NewAllocator(ipamutils.GetLocalScopeDefaultNetworks(), ipamutils.GetGlobalScopeDefaultNetworks())
341 343
 	assert.NilError(t, err)
342 344
 
343
-	// First get all pools until they are exhausted to
344
-	allocs := []ipamapi.AllocatedPool{}
345
-	alloc, err := a.RequestPool(ipamapi.PoolRequest{AddressSpace: localAddressSpace})
346
-	for err == nil {
347
-		allocs = append(allocs, alloc)
348
-		alloc, err = a.RequestPool(ipamapi.PoolRequest{AddressSpace: localAddressSpace})
349
-	}
350
-	for _, alloc := range allocs {
351
-		if err := a.ReleasePool(alloc.PoolID); err != nil {
352
-			t.Fatal(err)
353
-		}
354
-	}
345
+	_, err = a.RequestPool(ipamapi.PoolRequest{
346
+		AddressSpace: localAddressSpace,
347
+		Pool:         "10.0.0.0/8",
348
+		SubPool:      "10.10.0.0/16",
349
+	})
350
+	assert.NilError(t, err)
355 351
 
356
-	// Now try to allocate then free nPool pools sequentially.
357
-	// Verify that we don't see any repeat networks even though
358
-	// we have freed them.
359
-	seen := map[string]bool{}
360
-	for range allocs {
361
-		alloc, err := a.RequestPool(ipamapi.PoolRequest{AddressSpace: localAddressSpace})
362
-		if err != nil {
363
-			t.Fatal(err)
364
-		}
365
-		if _, ok := seen[alloc.Pool.String()]; ok {
366
-			t.Fatalf("Network %s was reused before exhausing the pool list", alloc.Pool.String())
367
-		}
368
-		seen[alloc.Pool.String()] = true
369
-		if err := a.ReleasePool(alloc.PoolID); err != nil {
370
-			t.Fatal(err)
371
-		}
372
-	}
352
+	_, err = a.RequestPool(ipamapi.PoolRequest{
353
+		AddressSpace: localAddressSpace,
354
+		Pool:         "10.0.0.0/8",
355
+		SubPool:      "10.10.0.0/16",
356
+	})
357
+	assert.ErrorContains(t, err, "invalid pool request")
358
+
359
+	_, err = a.RequestPool(ipamapi.PoolRequest{
360
+		AddressSpace: localAddressSpace,
361
+		Pool:         "10.0.0.0/8",
362
+		SubPool:      "10.10.0.0/17",
363
+	})
364
+	assert.NilError(t, err)
365
+
366
+	_, err = a.RequestPool(ipamapi.PoolRequest{
367
+		AddressSpace: localAddressSpace,
368
+		Pool:         "10.0.0.0/8",
369
+		SubPool:      "10.11.0.0/16",
370
+	})
371
+	assert.NilError(t, err)
373 372
 }
374 373
 
375 374
 func TestGetAddressSubPoolEqualPool(t *testing.T) {
... ...
@@ -3,13 +3,11 @@ package defaultipam
3 3
 import (
4 4
 	"context"
5 5
 	"fmt"
6
-	"math/rand"
7 6
 	"net"
8
-	"sort"
7
+	"net/netip"
8
+	"slices"
9 9
 	"sync"
10
-	"sync/atomic"
11 10
 	"testing"
12
-	"time"
13 11
 
14 12
 	"github.com/docker/docker/libnetwork/ipamapi"
15 13
 	"github.com/docker/docker/libnetwork/ipamutils"
... ...
@@ -66,77 +64,46 @@ func TestDebug(t *testing.T) {
66 66
 	tctx.a.RequestAddress(tctx.pid, nil, map[string]string{ipamapi.AllocSerialPrefix: "true"})
67 67
 }
68 68
 
69
-type op struct {
70
-	id   int32
71
-	add  bool
72
-	name string
73
-}
69
+func TestRequestPoolParallel(t *testing.T) {
70
+	a, err := NewAllocator([]*ipamutils.NetworkToSplit{
71
+		{Base: netip.MustParsePrefix("10.0.0.0/10"), Size: 24},
72
+	}, nil)
73
+	assert.NilError(t, err)
74 74
 
75
-func (o *op) String() string {
76
-	return fmt.Sprintf("%+v", *o)
77
-}
75
+	var expected []string
76
+	var eg errgroup.Group
77
+	imax := 1 << (a.local4.predefined[0].Size - a.local4.predefined[0].Base.Bits())
78
+	allocCh := make(chan string, imax)
78 79
 
79
-func TestRequestPoolParallel(t *testing.T) {
80
-	a, err := NewAllocator(ipamutils.GetLocalScopeDefaultNetworks(), ipamutils.GetGlobalScopeDefaultNetworks())
81
-	if err != nil {
82
-		t.Fatal(err)
83
-	}
84
-	var operationIndex int32
85
-	ch := make(chan *op, 240)
80
+	for i := 0; i < imax; i++ {
81
+		expected = append(expected, fmt.Sprintf("10.%d.%d.0/24", uint(i/256), i%256))
86 82
 
87
-	group := new(errgroup.Group)
88
-	defer func() {
89
-		if err := group.Wait(); err != nil {
90
-			t.Fatal(err)
91
-		}
92
-	}()
83
+		eg.Go(func() error {
84
+			t.Helper()
93 85
 
94
-	for i := 0; i < 120; i++ {
95
-		group.Go(func() error {
96
-			alloc, err := a.RequestPool(ipamapi.PoolRequest{AddressSpace: "GlobalDefault"})
97
-			if err != nil {
98
-				t.Log(err) // log so we can see the error in real time rather than at the end when we actually call "Wait".
99
-				return fmt.Errorf("request error %v", err)
100
-			}
101
-			idx := atomic.AddInt32(&operationIndex, 1)
102
-			ch <- &op{idx, true, alloc.PoolID}
103
-			time.Sleep(time.Duration(rand.Intn(100)) * time.Millisecond)
104
-			idx = atomic.AddInt32(&operationIndex, 1)
105
-			err = a.ReleasePool(alloc.PoolID)
86
+			alloc, err := a.RequestPool(ipamapi.PoolRequest{AddressSpace: localAddressSpace})
106 87
 			if err != nil {
107
-				t.Log(err) // log so we can see the error in real time rather than at the end when we actually call "Wait".
108
-				return fmt.Errorf("release error %v", err)
88
+				return err
109 89
 			}
110
-			ch <- &op{idx, false, alloc.PoolID}
90
+
91
+			allocCh <- alloc.Pool.String()
92
+
111 93
 			return nil
112 94
 		})
113 95
 	}
114 96
 
115
-	// map of events
116
-	m := make(map[string][]*op)
117
-	for i := 0; i < 240; i++ {
118
-		x := <-ch
119
-		ops, ok := m[x.name]
120
-		if !ok {
121
-			ops = make([]*op, 0, 10)
122
-		}
123
-		ops = append(ops, x)
124
-		m[x.name] = ops
97
+	assert.NilError(t, eg.Wait())
98
+	close(allocCh)
99
+
100
+	var allocated []string
101
+	for alloc := range allocCh {
102
+		allocated = append(allocated, alloc)
125 103
 	}
126 104
 
127
-	// Post processing to avoid event reordering on the channel
128
-	for pool, ops := range m {
129
-		sort.Slice(ops[:], func(i, j int) bool {
130
-			return ops[i].id < ops[j].id
131
-		})
132
-		expected := true
133
-		for _, op := range ops {
134
-			if op.add != expected {
135
-				t.Fatalf("Operations for %v not valid %v, operations %v", pool, op, ops)
136
-			}
137
-			expected = !expected
138
-		}
105
+	for _, exp := range expected {
106
+		assert.Check(t, slices.Contains(allocated, exp) == true)
139 107
 	}
108
+	assert.Equal(t, len(allocated), len(expected))
140 109
 }
141 110
 
142 111
 func TestFullAllocateRelease(t *testing.T) {
... ...
@@ -72,3 +72,56 @@ func (s *PoolID) String() string {
72 72
 func (p *PoolData) String() string {
73 73
 	return fmt.Sprintf("PoolData[Children: %d]", len(p.children))
74 74
 }
75
+
76
+// mergeIter is used to iterate on both 'a' and 'b' at the same time while
77
+// maintaining the total order that would arise if both were merged and then
78
+// sorted. Both 'a' and 'b' have to be sorted beforehand.
79
+type mergeIter struct {
80
+	a, b   []netip.Prefix
81
+	ia, ib int
82
+	cmp    func(a, b netip.Prefix) int
83
+	lastA  bool
84
+}
85
+
86
+func newMergeIter(a, b []netip.Prefix, cmp func(a, b netip.Prefix) int) *mergeIter {
87
+	iter := &mergeIter{
88
+		a:   a,
89
+		b:   b,
90
+		cmp: cmp,
91
+	}
92
+	iter.lastA = iter.nextA()
93
+
94
+	return iter
95
+}
96
+
97
+func (it *mergeIter) Get() netip.Prefix {
98
+	if it.ia+it.ib >= len(it.a)+len(it.b) {
99
+		return netip.Prefix{}
100
+	}
101
+
102
+	if it.lastA {
103
+		return it.a[it.ia]
104
+	}
105
+
106
+	return it.b[it.ib]
107
+}
108
+
109
+func (it *mergeIter) Inc() {
110
+	if it.lastA {
111
+		it.ia++
112
+	} else {
113
+		it.ib++
114
+	}
115
+
116
+	it.lastA = it.nextA()
117
+}
118
+
119
+func (it *mergeIter) nextA() bool {
120
+	if it.ia < len(it.a) && it.ib < len(it.b) && it.cmp(it.a[it.ia], it.b[it.ib]) <= 0 {
121
+		return true
122
+	} else if it.ia < len(it.a) && it.ib >= len(it.b) {
123
+		return true
124
+	}
125
+
126
+	return false
127
+}
75 128
new file mode 100644
... ...
@@ -0,0 +1,32 @@
0
+package defaultipam
1
+
2
+import (
3
+	"net/netip"
4
+	"testing"
5
+
6
+	"github.com/docker/docker/libnetwork/internal/netiputil"
7
+	"gotest.tools/v3/assert"
8
+)
9
+
10
+func TestMergeIter(t *testing.T) {
11
+	allocated := []netip.Prefix{
12
+		netip.MustParsePrefix("172.16.0.0/24"),
13
+		netip.MustParsePrefix("172.17.0.0/24"),
14
+		netip.MustParsePrefix("172.18.0.0/24"),
15
+	}
16
+	reserved := []netip.Prefix{
17
+		netip.MustParsePrefix("172.16.0.0/24"),
18
+	}
19
+	it := newMergeIter(allocated, reserved, netiputil.PrefixCompare)
20
+
21
+	for _, exp := range []netip.Prefix{
22
+		allocated[0],
23
+		reserved[0],
24
+		allocated[1],
25
+		allocated[2],
26
+		{},
27
+	} {
28
+		assert.Equal(t, it.Get(), exp)
29
+		it.Inc()
30
+	}
31
+}
... ...
@@ -116,8 +116,26 @@ func (a *allocator) GetDefaultAddressSpaces() (string, string, error) {
116 116
 	return res.LocalDefaultAddressSpace, res.GlobalDefaultAddressSpace, nil
117 117
 }
118 118
 
119
-// RequestPool requests an address pool in the specified address space
119
+// RequestPool requests an address pool in the specified address space.
120
+//
121
+// This is a bug-for-bug re-implementation of the logic originally found in
122
+// requestPoolHelper prior to v27. See https://github.com/moby/moby/blob/faf84d7f0a1f2e6badff6f720a3e1e559c356fff/libnetwork/network.go#L1518-L1570
120 123
 func (a *allocator) RequestPool(req ipamapi.PoolRequest) (ipamapi.AllocatedPool, error) {
124
+	var tmpPoolLeases []string
125
+	defer func() {
126
+		// Release all pools we held on to.
127
+		for _, pID := range tmpPoolLeases {
128
+			if err := a.ReleasePool(pID); err != nil {
129
+				log.G(context.TODO()).Warnf("Failed to release overlapping pool")
130
+			}
131
+		}
132
+	}()
133
+
134
+	_, globalSpace, err := a.GetDefaultAddressSpaces()
135
+	if err != nil {
136
+		return ipamapi.AllocatedPool{}, err
137
+	}
138
+
121 139
 	remoteReq := &api.RequestPoolRequest{
122 140
 		AddressSpace: req.AddressSpace,
123 141
 		Pool:         req.Pool,
... ...
@@ -125,8 +143,51 @@ func (a *allocator) RequestPool(req ipamapi.PoolRequest) (ipamapi.AllocatedPool,
125 125
 		Options:      req.Options,
126 126
 		V6:           req.V6,
127 127
 	}
128
+
129
+	for {
130
+		alloc, err := a.requestPool(remoteReq)
131
+		if err != nil {
132
+			return alloc, err
133
+		}
134
+
135
+		// If the network pool was explicitly chosen, the network belongs to
136
+		// global address space, or it is invalid ("0.0.0.0/0"), then we don't
137
+		// perform check for overlaps.
138
+		//
139
+		// FIXME(thaJeztah): why are we ignoring invalid pools here?
140
+		//
141
+		// The "invalid" conditions was added in [libnetwork#1095][1], which
142
+		// moved code to reduce os-specific dependencies in the ipam package,
143
+		// but also introduced a types.IsIPNetValid() function, which considers
144
+		// "0.0.0.0/0" invalid, and added it to the conditions below.
145
+		//
146
+		// Unfortunately review does not mention this change, so there's no
147
+		// context why. Possibly this was done to prevent errors further down
148
+		// the line (when checking for overlaps), but returning an error here
149
+		// instead would likely have avoided that as well, so we can only guess.
150
+		//
151
+		// [1]: https://github.com/moby/libnetwork/commit/5ca79d6b87873264516323a7b76f0af7d0298492#diff-bdcd879439d041827d334846f9aba01de6e3683ed8fdd01e63917dae6df23846
152
+		if req.Pool != "" || req.AddressSpace == globalSpace || alloc.Pool.String() == "0.0.0.0/0" {
153
+			return alloc, nil
154
+		}
155
+
156
+		// Check for overlap and if none found, we have found the right pool.
157
+		if !checkOverlaps(alloc, req.Exclude) {
158
+			return alloc, nil
159
+		}
160
+
161
+		// Pool obtained in this iteration is overlapping. Hold onto the pool
162
+		// and don't release it yet, because we don't want IPAM to give us back
163
+		// the same pool over again. But make sure we still do a deferred release
164
+		// when we have either obtained a non-overlapping pool or ran out of
165
+		// pre-defined pools.
166
+		tmpPoolLeases = append(tmpPoolLeases, alloc.PoolID)
167
+	}
168
+}
169
+
170
+func (a *allocator) requestPool(req *api.RequestPoolRequest) (ipamapi.AllocatedPool, error) {
128 171
 	res := &api.RequestPoolResponse{}
129
-	if err := a.call("RequestPool", remoteReq, res); err != nil {
172
+	if err := a.call("RequestPool", req, res); err != nil {
130 173
 		return ipamapi.AllocatedPool{}, err
131 174
 	}
132 175
 
... ...
@@ -138,6 +199,16 @@ func (a *allocator) RequestPool(req ipamapi.PoolRequest) (ipamapi.AllocatedPool,
138 138
 	}, err
139 139
 }
140 140
 
141
+// checkOverlaps returns true if the 'pool' overlaps with some prefix in 'reserved'.
142
+func checkOverlaps(pool ipamapi.AllocatedPool, reserved []netip.Prefix) bool {
143
+	for _, r := range reserved {
144
+		if r.Overlaps(pool.Pool) {
145
+			return true
146
+		}
147
+	}
148
+	return false
149
+}
150
+
141 151
 // ReleasePool removes an address pool from the specified address space
142 152
 func (a *allocator) ReleasePool(poolID string) error {
143 153
 	req := &api.ReleasePoolRequest{PoolID: poolID}
... ...
@@ -2,27 +2,23 @@
2 2
 package ipamutils
3 3
 
4 4
 import (
5
-	"fmt"
6
-	"net"
5
+	"net/netip"
6
+	"slices"
7 7
 )
8 8
 
9 9
 var (
10
-	// predefinedLocalScopeDefaultNetworks contains a list of 31 IPv4 private networks with host size 16 and 12
11
-	// (172.17-31.x.x/16, 192.168.x.x/20) which do not overlap with the networks in `PredefinedGlobalScopeDefaultNetworks`
12
-	predefinedLocalScopeDefaultNetworks []*net.IPNet
13
-	// predefinedGlobalScopeDefaultNetworks contains a list of 64K IPv4 private networks with host size 8
14
-	// (10.x.x.x/24) which do not overlap with the networks in `PredefinedLocalScopeDefaultNetworks`
15
-	predefinedGlobalScopeDefaultNetworks []*net.IPNet
16
-	localScopeDefaultNetworks            = []*NetworkToSplit{
17
-		{"172.17.0.0/16", 16},
18
-		{"172.18.0.0/16", 16},
19
-		{"172.19.0.0/16", 16},
20
-		{"172.20.0.0/14", 16},
21
-		{"172.24.0.0/14", 16},
22
-		{"172.28.0.0/14", 16},
23
-		{"192.168.0.0/16", 20},
10
+	localScopeDefaultNetworks = []*NetworkToSplit{
11
+		{netip.MustParsePrefix("172.17.0.0/16"), 16},
12
+		{netip.MustParsePrefix("172.18.0.0/16"), 16},
13
+		{netip.MustParsePrefix("172.19.0.0/16"), 16},
14
+		{netip.MustParsePrefix("172.20.0.0/14"), 16},
15
+		{netip.MustParsePrefix("172.24.0.0/14"), 16},
16
+		{netip.MustParsePrefix("172.28.0.0/14"), 16},
17
+		{netip.MustParsePrefix("192.168.0.0/16"), 20},
18
+	}
19
+	globalScopeDefaultNetworks = []*NetworkToSplit{
20
+		{netip.MustParsePrefix("10.0.0.0/8"), 24},
24 21
 	}
25
-	globalScopeDefaultNetworks = []*NetworkToSplit{{"10.0.0.0/8", 24}}
26 22
 )
27 23
 
28 24
 // NetworkToSplit represents a network that has to be split in chunks with mask length Size.
... ...
@@ -31,73 +27,26 @@ var (
31 31
 // Example: a Base "10.10.0.0/16 with Size 24 will define the set of 256
32 32
 // 10.10.[0-255].0/24 address pools
33 33
 type NetworkToSplit struct {
34
-	Base string `json:"base"`
35
-	Size int    `json:"size"`
34
+	Base netip.Prefix `json:"base"`
35
+	Size int          `json:"size"`
36 36
 }
37 37
 
38
-func init() {
39
-	var err error
40
-	if predefinedGlobalScopeDefaultNetworks, err = SplitNetworks(globalScopeDefaultNetworks); err != nil {
41
-		panic("failed to initialize the global scope default address pool: " + err.Error())
42
-	}
38
+// FirstPrefix returns the first prefix available in NetworkToSplit.
39
+func (n NetworkToSplit) FirstPrefix() netip.Prefix {
40
+	return netip.PrefixFrom(n.Base.Addr(), n.Size)
41
+}
43 42
 
44
-	if predefinedLocalScopeDefaultNetworks, err = SplitNetworks(localScopeDefaultNetworks); err != nil {
45
-		panic("failed to initialize the local scope default address pool: " + err.Error())
46
-	}
43
+// Overlaps is a util function checking whether 'p' overlaps with 'n'.
44
+func (n NetworkToSplit) Overlaps(p netip.Prefix) bool {
45
+	return n.Base.Overlaps(p)
47 46
 }
48 47
 
49 48
 // GetGlobalScopeDefaultNetworks returns a copy of the global-scope network list.
50
-func GetGlobalScopeDefaultNetworks() []*net.IPNet {
51
-	return append([]*net.IPNet(nil), predefinedGlobalScopeDefaultNetworks...)
49
+func GetGlobalScopeDefaultNetworks() []*NetworkToSplit {
50
+	return slices.Clone(globalScopeDefaultNetworks)
52 51
 }
53 52
 
54 53
 // GetLocalScopeDefaultNetworks returns a copy of the default local-scope network list.
55
-func GetLocalScopeDefaultNetworks() []*net.IPNet {
56
-	return append([]*net.IPNet(nil), predefinedLocalScopeDefaultNetworks...)
57
-}
58
-
59
-// SplitNetworks takes a slice of networks, split them accordingly and returns them
60
-func SplitNetworks(list []*NetworkToSplit) ([]*net.IPNet, error) {
61
-	localPools := make([]*net.IPNet, 0, len(list))
62
-
63
-	for _, p := range list {
64
-		_, b, err := net.ParseCIDR(p.Base)
65
-		if err != nil {
66
-			return nil, fmt.Errorf("invalid base pool %q: %v", p.Base, err)
67
-		}
68
-		ones, _ := b.Mask.Size()
69
-		if p.Size <= 0 || p.Size < ones {
70
-			return nil, fmt.Errorf("invalid pools size: %d", p.Size)
71
-		}
72
-		localPools = append(localPools, splitNetwork(p.Size, b)...)
73
-	}
74
-	return localPools, nil
75
-}
76
-
77
-func splitNetwork(size int, base *net.IPNet) []*net.IPNet {
78
-	one, bits := base.Mask.Size()
79
-	mask := net.CIDRMask(size, bits)
80
-	n := 1 << uint(size-one)
81
-	s := uint(bits - size)
82
-	list := make([]*net.IPNet, 0, n)
83
-
84
-	for i := 0; i < n; i++ {
85
-		ip := copyIP(base.IP)
86
-		addIntToIP(ip, uint(i<<s))
87
-		list = append(list, &net.IPNet{IP: ip, Mask: mask})
88
-	}
89
-	return list
90
-}
91
-
92
-func copyIP(from net.IP) net.IP {
93
-	ip := make([]byte, len(from))
94
-	copy(ip, from)
95
-	return ip
96
-}
97
-
98
-func addIntToIP(array net.IP, ordinal uint) {
99
-	for i := len(array) - 1; i >= 0; i-- {
100
-		array[i] |= (byte)(ordinal & 0xff)
101
-		ordinal >>= 8
102
-	}
54
+func GetLocalScopeDefaultNetworks() []*NetworkToSplit {
55
+	return slices.Clone(localScopeDefaultNetworks)
103 56
 }
104 57
deleted file mode 100644
... ...
@@ -1,74 +0,0 @@
1
-package ipamutils
2
-
3
-import (
4
-	"net"
5
-	"testing"
6
-
7
-	"gotest.tools/v3/assert"
8
-	is "gotest.tools/v3/assert/cmp"
9
-)
10
-
11
-func initBroadPredefinedNetworks() []*net.IPNet {
12
-	pl := make([]*net.IPNet, 0, 31)
13
-	mask := []byte{255, 255, 0, 0}
14
-	for i := 17; i < 32; i++ {
15
-		pl = append(pl, &net.IPNet{IP: []byte{172, byte(i), 0, 0}, Mask: mask})
16
-	}
17
-	mask20 := []byte{255, 255, 240, 0}
18
-	for i := 0; i < 16; i++ {
19
-		pl = append(pl, &net.IPNet{IP: []byte{192, 168, byte(i << 4), 0}, Mask: mask20})
20
-	}
21
-	return pl
22
-}
23
-
24
-func initGranularPredefinedNetworks() []*net.IPNet {
25
-	pl := make([]*net.IPNet, 0, 256*256)
26
-	mask := []byte{255, 255, 255, 0}
27
-	for i := 0; i < 256; i++ {
28
-		for j := 0; j < 256; j++ {
29
-			pl = append(pl, &net.IPNet{IP: []byte{10, byte(i), byte(j), 0}, Mask: mask})
30
-		}
31
-	}
32
-	return pl
33
-}
34
-
35
-func TestDefaultNetwork(t *testing.T) {
36
-	for _, nw := range GetGlobalScopeDefaultNetworks() {
37
-		if ones, bits := nw.Mask.Size(); bits != 32 || ones != 24 {
38
-			t.Fatalf("Unexpected size for network in granular list: %v", nw)
39
-		}
40
-	}
41
-
42
-	for _, nw := range GetLocalScopeDefaultNetworks() {
43
-		if ones, bits := nw.Mask.Size(); bits != 32 || (ones != 20 && ones != 16) {
44
-			t.Fatalf("Unexpected size for network in broad list: %v", nw)
45
-		}
46
-	}
47
-
48
-	originalBroadNets := initBroadPredefinedNetworks()
49
-	m := make(map[string]bool)
50
-	for _, v := range originalBroadNets {
51
-		m[v.String()] = true
52
-	}
53
-	for _, nw := range GetLocalScopeDefaultNetworks() {
54
-		_, ok := m[nw.String()]
55
-		assert.Check(t, ok)
56
-		delete(m, nw.String())
57
-	}
58
-
59
-	assert.Check(t, is.Len(m, 0))
60
-
61
-	originalGranularNets := initGranularPredefinedNetworks()
62
-
63
-	m = make(map[string]bool)
64
-	for _, v := range originalGranularNets {
65
-		m[v.String()] = true
66
-	}
67
-	for _, nw := range GetGlobalScopeDefaultNetworks() {
68
-		_, ok := m[nw.String()]
69
-		assert.Check(t, ok)
70
-		delete(m, nw.String())
71
-	}
72
-
73
-	assert.Check(t, is.Len(m, 0))
74
-}
... ...
@@ -24,6 +24,27 @@ func Add(ip netip.Addr, x uint64, shift uint) netip.Addr {
24 24
 	}
25 25
 }
26 26
 
27
+// SubnetsBetween computes the number of subnets of size 'sz' available between 'a1'
28
+// and 'a2'. The result is capped at [math.MaxUint64]. It returns 0 when one of
29
+// 'a1' or 'a2' is invalid, if both aren't of the same family, or when 'a2' is
30
+// less than 'a1'.
31
+func SubnetsBetween(a1 netip.Addr, a2 netip.Addr, sz int) uint64 {
32
+	if !a1.IsValid() || !a2.IsValid() || a1.Is4() != a2.Is4() || a2.Less(a1) {
33
+		return 0
34
+	}
35
+
36
+	p1, _ := a1.Prefix(sz)
37
+	p2, _ := a2.Prefix(sz)
38
+
39
+	return subAddr(p2.Addr(), p1.Addr()).rsh(uint(a1.BitLen() - sz)).uint64()
40
+}
41
+
42
+// subAddr returns 'ip1 - ip2'. Both netip.Addr have to be of the same address
43
+// family. 'ip1' has to be greater than or equal to 'ip2'.
44
+func subAddr(ip1 netip.Addr, ip2 netip.Addr) uint128 {
45
+	return uint128From16(ip1.As16()).sub(uint128From16(ip2.As16()))
46
+}
47
+
27 48
 // Field returns the value of the bitfield [u, v] in ip as an integer,
28 49
 // where bit 0 is the most-significant bit of ip.
29 50
 //
... ...
@@ -3,6 +3,8 @@ package ipbits
3 3
 import (
4 4
 	"net/netip"
5 5
 	"testing"
6
+
7
+	"gotest.tools/v3/assert"
6 8
 )
7 9
 
8 10
 func TestAdd(t *testing.T) {
... ...
@@ -68,3 +70,30 @@ func TestField(t *testing.T) {
68 68
 		}
69 69
 	}
70 70
 }
71
+func TestSubnetsBetween(t *testing.T) {
72
+	tests := []struct {
73
+		a1, a2 netip.Addr
74
+		sz     int
75
+		want   uint64
76
+	}{
77
+		{netip.MustParseAddr("10.0.0.0"), netip.MustParseAddr("10.0.0.0"), 8, 0},
78
+		{netip.MustParseAddr("10.0.0.0"), netip.MustParseAddr("10.0.10.0"), 8, 0},
79
+		{netip.MustParseAddr("10.0.0.0"), netip.MustParseAddr("10.1.0.0"), 24, 256},
80
+		{netip.MustParseAddr("10.0.0.0"), netip.MustParseAddr("10.10.0.0"), 16, 10},
81
+		{netip.MustParseAddr("10.20.0.0"), netip.MustParseAddr("10.20.128.0"), 24, 128},
82
+		{netip.MustParseAddr("10.0.0.0"), netip.MustParseAddr("10.0.10.0"), 24, 10},
83
+
84
+		{netip.MustParseAddr("fc00::"), netip.MustParseAddr("fc00::"), 8, 0x0},
85
+		{netip.MustParseAddr("fc00::"), netip.MustParseAddr("fc00:1000::"), 16, 0x0},
86
+		{netip.MustParseAddr("fc00::"), netip.MustParseAddr("fc01::"), 24, 0x100},
87
+		{netip.MustParseAddr("fc00::"), netip.MustParseAddr("fc01::"), 16, 0x1},
88
+		{netip.MustParseAddr("fc00::"), netip.MustParseAddr("fc00:1000::"), 24, 0x10},
89
+		{netip.MustParseAddr("fc00::"), netip.MustParseAddr("fc00:1000::"), 24, 0x10},
90
+		{netip.MustParseAddr("fc00::"), netip.MustParseAddr("fd00::"), 64, 0x100_0000_0000_0000},
91
+	}
92
+
93
+	for _, tt := range tests {
94
+		d := SubnetsBetween(tt.a1, tt.a2, tt.sz)
95
+		assert.Check(t, d == tt.want, "SubnetsBetween(%q, %q, %d) = 0x%x; want: 0x%x", tt.a1, tt.a2, tt.sz, d, tt.want)
96
+	}
97
+}
... ...
@@ -24,6 +24,12 @@ func (x uint128) add(y uint128) uint128 {
24 24
 	return uint128{hi: hi, lo: lo}
25 25
 }
26 26
 
27
+func (x uint128) sub(y uint128) uint128 {
28
+	lo, carry := bits.Sub64(x.lo, y.lo, 0)
29
+	hi, _ := bits.Sub64(x.hi, y.hi, carry)
30
+	return uint128{hi: hi, lo: lo}
31
+}
32
+
27 33
 func (x uint128) lsh(n uint) uint128 {
28 34
 	if n > 64 {
29 35
 		return uint128{hi: x.lo << (n - 64)}
... ...
@@ -6,7 +6,6 @@ import (
6 6
 	"context"
7 7
 	"crypto/rand"
8 8
 	"encoding/hex"
9
-	"errors"
10 9
 	"fmt"
11 10
 	"io"
12 11
 	"net"
... ...
@@ -16,34 +15,6 @@ import (
16 16
 	"github.com/containerd/log"
17 17
 )
18 18
 
19
-var (
20
-	// ErrNetworkOverlapsWithNameservers preformatted error
21
-	ErrNetworkOverlapsWithNameservers = errors.New("requested network overlaps with nameserver")
22
-	// ErrNetworkOverlaps preformatted error
23
-	ErrNetworkOverlaps = errors.New("requested network overlaps with existing network")
24
-)
25
-
26
-// CheckNameserverOverlaps checks whether the passed network overlaps with any of the nameservers
27
-func CheckNameserverOverlaps(nameservers []string, toCheck *net.IPNet) error {
28
-	if len(nameservers) > 0 {
29
-		for _, ns := range nameservers {
30
-			_, nsNetwork, err := net.ParseCIDR(ns)
31
-			if err != nil {
32
-				return err
33
-			}
34
-			if NetworkOverlaps(toCheck, nsNetwork) {
35
-				return ErrNetworkOverlapsWithNameservers
36
-			}
37
-		}
38
-	}
39
-	return nil
40
-}
41
-
42
-// NetworkOverlaps detects overlap between one IPNet and another
43
-func NetworkOverlaps(netX *net.IPNet, netY *net.IPNet) bool {
44
-	return netX.Contains(netY.IP) || netY.Contains(netX.IP)
45
-}
46
-
47 19
 func genMAC(ip net.IP) net.HardwareAddr {
48 20
 	hw := make(net.HardwareAddr, 6)
49 21
 	// The first byte of the MAC address has to comply with these rules:
... ...
@@ -1,13 +1,8 @@
1 1
 package netutils
2 2
 
3
-import (
4
-	"net"
5
-
6
-	"github.com/docker/docker/libnetwork/types"
7
-)
3
+import "net/netip"
8 4
 
9
-// FindAvailableNetwork returns a network from the passed list which does not
10
-// overlap with existing interfaces in the system
11
-func FindAvailableNetwork(list []*net.IPNet) (*net.IPNet, error) {
12
-	return nil, types.NotImplementedErrorf("not supported on freebsd")
5
+// InferReservedNetworks returns an empty list on FreeBSD; v6 is ignored.
6
+func InferReservedNetworks(v6 bool) []netip.Prefix {
7
+	return []netip.Prefix{}
13 8
 }
... ...
@@ -5,9 +5,11 @@
5 5
 package netutils
6 6
 
7 7
 import (
8
-	"net"
8
+	"net/netip"
9 9
 	"os"
10
+	"slices"
10 11
 
12
+	"github.com/docker/docker/libnetwork/internal/netiputil"
11 13
 	"github.com/docker/docker/libnetwork/ns"
12 14
 	"github.com/docker/docker/libnetwork/resolvconf"
13 15
 	"github.com/docker/docker/libnetwork/types"
... ...
@@ -15,24 +17,84 @@ import (
15 15
 	"github.com/vishvananda/netlink"
16 16
 )
17 17
 
18
-var networkGetRoutesFct func(netlink.Link, int) ([]netlink.Route, error)
18
+// InferReservedNetworks returns a list of network prefixes that seem to be
19
+// used by the system and that would likely break it if they were assigned to
20
+// some Docker networks. It uses two heuristics to build that list:
21
+//
22
+// 1. Nameservers configured in /etc/resolv.conf ;
23
+// 2. On-link routes ;
24
+//
25
+// That 2nd heuristic was originally not limited to on-links -- all non-default
26
+// routes were checked (see [1]). This proved to be not ideal at best and
27
+// highly problematic at worst:
28
+//
29
+//   - VPN software and appliances doing split tunneling might push a small set
30
+//     of routes for large, aggregated prefixes to avoid maintenance and
31
+//     potential issues whenever a new subnet comes into use on internal
32
+//     network. However, not all subnets from these aggregates might be in use.
33
+//   - For full tunneling, especially when implemented with OpenVPN, the
34
+//     situation is even worse as the host might end up with the two following
35
+//     routes: 0.0.0.0/1 and 128.0.0.0/1. They are functionally
36
+//     indistinguishable from a default route, yet the Engine was treating them
37
+//     differently. With those routes, there was no way to use dynamic subnet
38
+//     allocation at all. (see 'def1' on [2])
39
+//   - A subnet covered by the default route can be used, or not. Same for
40
+//     non-default and non-on-link routes. The type of route says little about
41
+//     the availability of subnets it covers, except for on-link routes as they
42
+//     specifically define what subnet the current host is part of.
43
+//
44
+// The 2nd heuristic was modified to be limited to on-link routes in PR #42598
45
+// (first released in v23.0, see [3]).
46
+//
47
+// If these heuristics don't detect an overlap, users should change their daemon
48
+// config to remove that overlapping prefix from `default-address-pools`. If a
49
+// prefix is found to overlap but users care enough about it being associated
50
+// with a Docker network, they can still rely on static allocation.
51
+//
52
+// For IPv6, the 2nd heuristic isn't applied as there's no such thing as
53
+// on-link routes for IPv6.
54
+//
55
+// [1]: https://github.com/moby/libnetwork/commit/56832d6d89bf0f9d5280849026ee25ae4ae5f22e
56
+// [2]: https://community.openvpn.net/openvpn/wiki/Openvpn23ManPage
57
+// [3]: https://github.com/moby/moby/pull/42598
58
+func InferReservedNetworks(v6 bool) []netip.Prefix {
59
+	var reserved []netip.Prefix
19 60
 
20
-// CheckRouteOverlaps checks whether the passed network overlaps with any existing routes
21
-func CheckRouteOverlaps(toCheck *net.IPNet) error {
22
-	networkGetRoutesFct := networkGetRoutesFct
23
-	if networkGetRoutesFct == nil {
24
-		networkGetRoutesFct = ns.NlHandle().RouteList
61
+	// We don't really care if os.ReadFile fails here. It either doesn't exist,
62
+	// or we can't read it for some reason.
63
+	if rc, err := os.ReadFile(resolvconf.Path()); err == nil {
64
+		reserved = slices.DeleteFunc(resolvconf.GetNameserversAsPrefix(rc), func(p netip.Prefix) bool {
65
+			return p.Addr().Is6() != v6
66
+		})
67
+	}
68
+
69
+	if !v6 {
70
+		reserved = append(reserved, queryOnLinkRoutes()...)
25 71
 	}
26
-	networks, err := networkGetRoutesFct(nil, netlink.FAMILY_V4)
72
+
73
+	slices.SortFunc(reserved, netiputil.PrefixCompare)
74
+	return reserved
75
+}
76
+
77
+// queryOnLinkRoutes returns a list of on-link routes available on the host.
78
+// Only IPv4 prefixes are returned as there's no such thing as on-link
79
+// routes for IPv6.
80
+func queryOnLinkRoutes() []netip.Prefix {
81
+	routes, err := ns.NlHandle().RouteList(nil, netlink.FAMILY_V4)
27 82
 	if err != nil {
28
-		return err
83
+		return nil
29 84
 	}
30
-	for _, network := range networks {
31
-		if network.Dst != nil && network.Scope == netlink.SCOPE_LINK && NetworkOverlaps(toCheck, network.Dst) {
32
-			return ErrNetworkOverlaps
85
+
86
+	var prefixes []netip.Prefix
87
+	for _, route := range routes {
88
+		if route.Dst != nil && route.Scope == netlink.SCOPE_LINK {
89
+			if p, ok := netiputil.ToPrefix(route.Dst); ok {
90
+				prefixes = append(prefixes, p)
91
+			}
33 92
 		}
34 93
 	}
35
-	return nil
94
+
95
+	return prefixes
36 96
 }
37 97
 
38 98
 // GenerateIfaceName returns an interface name using the passed in
... ...
@@ -58,23 +120,3 @@ func GenerateIfaceName(nlh *netlink.Handle, prefix string, len int) (string, err
58 58
 	}
59 59
 	return "", types.InternalErrorf("could not generate interface name")
60 60
 }
61
-
62
-// FindAvailableNetwork returns a network from the passed list which does not
63
-// overlap with existing interfaces in the system
64
-func FindAvailableNetwork(list []*net.IPNet) (*net.IPNet, error) {
65
-	// We don't check for an error here, because we don't really care if we
66
-	// can't read /etc/resolv.conf. So instead we skip the append if resolvConf
67
-	// is nil. It either doesn't exist, or we can't read it for some reason.
68
-	var nameservers []string
69
-	if rc, err := os.ReadFile(resolvconf.Path()); err == nil {
70
-		nameservers = resolvconf.GetNameserversAsCIDR(rc)
71
-	}
72
-	for _, nw := range list {
73
-		if err := CheckNameserverOverlaps(nameservers, nw); err == nil {
74
-			if err := CheckRouteOverlaps(nw); err == nil {
75
-				return nw, nil
76
-			}
77
-		}
78
-	}
79
-	return nil, errors.New("no available network")
80
-}
... ...
@@ -3,128 +3,18 @@ package netutils
3 3
 import (
4 4
 	"bytes"
5 5
 	"fmt"
6
-	"net"
6
+	"net/netip"
7
+	"slices"
7 8
 	"strings"
8 9
 	"testing"
9 10
 
10 11
 	"github.com/docker/docker/internal/testutils/netnsutils"
11
-	"github.com/docker/docker/libnetwork/ipamutils"
12
-	"github.com/docker/docker/libnetwork/types"
12
+	"github.com/docker/docker/libnetwork/internal/netiputil"
13 13
 	"github.com/vishvananda/netlink"
14 14
 	"gotest.tools/v3/assert"
15 15
 	is "gotest.tools/v3/assert/cmp"
16 16
 )
17 17
 
18
-func TestNonOverlappingNameservers(t *testing.T) {
19
-	network := &net.IPNet{
20
-		IP:   []byte{192, 168, 0, 1},
21
-		Mask: []byte{255, 255, 255, 0},
22
-	}
23
-	nameservers := []string{
24
-		"127.0.0.1/32",
25
-	}
26
-
27
-	if err := CheckNameserverOverlaps(nameservers, network); err != nil {
28
-		t.Fatal(err)
29
-	}
30
-}
31
-
32
-func TestOverlappingNameservers(t *testing.T) {
33
-	network := &net.IPNet{
34
-		IP:   []byte{192, 168, 0, 1},
35
-		Mask: []byte{255, 255, 255, 0},
36
-	}
37
-	nameservers := []string{
38
-		"192.168.0.1/32",
39
-	}
40
-
41
-	if err := CheckNameserverOverlaps(nameservers, network); err == nil {
42
-		t.Fatalf("Expected error %s got %s", ErrNetworkOverlapsWithNameservers, err)
43
-	}
44
-}
45
-
46
-func TestCheckRouteOverlaps(t *testing.T) {
47
-	networkGetRoutesFct = func(netlink.Link, int) ([]netlink.Route, error) {
48
-		routesData := []string{"10.0.2.0/32", "10.0.3.0/24", "10.0.42.0/24", "172.16.42.0/24", "192.168.142.0/24"}
49
-		routes := []netlink.Route{}
50
-		for _, addr := range routesData {
51
-			_, netX, _ := net.ParseCIDR(addr)
52
-			routes = append(routes, netlink.Route{Dst: netX, Scope: netlink.SCOPE_LINK})
53
-		}
54
-		// Add a route with a scope which should not overlap
55
-		_, netX, _ := net.ParseCIDR("10.0.5.0/24")
56
-		routes = append(routes, netlink.Route{Dst: netX, Scope: netlink.SCOPE_UNIVERSE})
57
-		return routes, nil
58
-	}
59
-	defer func() { networkGetRoutesFct = nil }()
60
-
61
-	_, netX, _ := net.ParseCIDR("172.16.0.1/24")
62
-	if err := CheckRouteOverlaps(netX); err != nil {
63
-		t.Fatal(err)
64
-	}
65
-
66
-	_, netX, _ = net.ParseCIDR("10.0.2.0/24")
67
-	if err := CheckRouteOverlaps(netX); err == nil {
68
-		t.Fatal("10.0.2.0/24 and 10.0.2.0 should overlap but it doesn't")
69
-	}
70
-
71
-	_, netX, _ = net.ParseCIDR("10.0.5.0/24")
72
-	if err := CheckRouteOverlaps(netX); err != nil {
73
-		t.Fatal("10.0.5.0/24 and 10.0.5.0 with scope UNIVERSE should not overlap but it does")
74
-	}
75
-}
76
-
77
-func TestCheckNameserverOverlaps(t *testing.T) {
78
-	nameservers := []string{"10.0.2.3/32", "192.168.102.1/32"}
79
-
80
-	_, netX, _ := net.ParseCIDR("10.0.2.3/32")
81
-
82
-	if err := CheckNameserverOverlaps(nameservers, netX); err == nil {
83
-		t.Fatalf("%s should overlap 10.0.2.3/32 but doesn't", netX)
84
-	}
85
-
86
-	_, netX, _ = net.ParseCIDR("192.168.102.2/32")
87
-
88
-	if err := CheckNameserverOverlaps(nameservers, netX); err != nil {
89
-		t.Fatalf("%s should not overlap %v but it does", netX, nameservers)
90
-	}
91
-}
92
-
93
-func AssertOverlap(CIDRx string, CIDRy string, t *testing.T) {
94
-	_, netX, _ := net.ParseCIDR(CIDRx)
95
-	_, netY, _ := net.ParseCIDR(CIDRy)
96
-	if !NetworkOverlaps(netX, netY) {
97
-		t.Errorf("%v and %v should overlap", netX, netY)
98
-	}
99
-}
100
-
101
-func AssertNoOverlap(CIDRx string, CIDRy string, t *testing.T) {
102
-	_, netX, _ := net.ParseCIDR(CIDRx)
103
-	_, netY, _ := net.ParseCIDR(CIDRy)
104
-	if NetworkOverlaps(netX, netY) {
105
-		t.Errorf("%v and %v should not overlap", netX, netY)
106
-	}
107
-}
108
-
109
-func TestNetworkOverlaps(t *testing.T) {
110
-	// netY starts at same IP and ends within netX
111
-	AssertOverlap("172.16.0.1/24", "172.16.0.1/25", t)
112
-	// netY starts within netX and ends at same IP
113
-	AssertOverlap("172.16.0.1/24", "172.16.0.128/25", t)
114
-	// netY starts and ends within netX
115
-	AssertOverlap("172.16.0.1/24", "172.16.0.64/25", t)
116
-	// netY starts at same IP and ends outside of netX
117
-	AssertOverlap("172.16.0.1/24", "172.16.0.1/23", t)
118
-	// netY starts before and ends at same IP of netX
119
-	AssertOverlap("172.16.1.1/24", "172.16.0.1/23", t)
120
-	// netY starts before and ends outside of netX
121
-	AssertOverlap("172.16.1.1/24", "172.16.0.1/22", t)
122
-	// netY starts and ends before netX
123
-	AssertNoOverlap("172.16.1.1/25", "172.16.0.1/24", t)
124
-	// netX starts and ends before netY
125
-	AssertNoOverlap("172.16.1.1/25", "172.16.2.1/24", t)
126
-}
127
-
128 18
 // Test veth name generation "veth"+rand (e.g.veth0f60e2c)
129 19
 func TestGenerateRandomName(t *testing.T) {
130 20
 	const vethPrefix = "veth"
... ...
@@ -184,83 +74,53 @@ func TestUtilGenerateRandomMAC(t *testing.T) {
184 184
 	}
185 185
 }
186 186
 
187
-func TestNetworkRequest(t *testing.T) {
187
+func TestInferReservedNetworksV4(t *testing.T) {
188 188
 	defer netnsutils.SetupTestOSContext(t)()
189 189
 
190
-	nw, err := FindAvailableNetwork(ipamutils.GetLocalScopeDefaultNetworks())
191
-	if err != nil {
192
-		t.Fatal(err)
193
-	}
194
-
195
-	var found bool
196
-	for _, exp := range ipamutils.GetLocalScopeDefaultNetworks() {
197
-		if types.CompareIPNet(exp, nw) {
198
-			found = true
199
-			break
200
-		}
201
-	}
202
-
203
-	if !found {
204
-		t.Fatalf("Found unexpected broad network %s", nw)
205
-	}
206
-
207
-	nw, err = FindAvailableNetwork(ipamutils.GetGlobalScopeDefaultNetworks())
208
-	if err != nil {
209
-		t.Fatal(err)
210
-	}
211
-
212
-	found = false
213
-	for _, exp := range ipamutils.GetGlobalScopeDefaultNetworks() {
214
-		if types.CompareIPNet(exp, nw) {
215
-			found = true
216
-			break
217
-		}
218
-	}
219
-
220
-	if !found {
221
-		t.Fatalf("Found unexpected granular network %s", nw)
222
-	}
223
-
224
-	// Add iface and ssert returned address on request
225
-	createInterface(t, "test", "172.17.42.1/16")
226
-
227
-	_, exp, err := net.ParseCIDR("172.18.0.0/16")
228
-	if err != nil {
229
-		t.Fatal(err)
230
-	}
231
-	nw, err = FindAvailableNetwork(ipamutils.GetLocalScopeDefaultNetworks())
232
-	if err != nil {
233
-		t.Fatal(err)
234
-	}
235
-	if !types.CompareIPNet(exp, nw) {
236
-		t.Fatalf("expected %s. got %s", exp, nw)
237
-	}
190
+	ifaceID := createInterface(t, "foobar")
191
+	addRoute(t, ifaceID, netlink.SCOPE_LINK, netip.MustParsePrefix("100.0.0.0/24"))
192
+	addRoute(t, ifaceID, netlink.SCOPE_LINK, netip.MustParsePrefix("10.0.0.0/8"))
193
+	addRoute(t, ifaceID, netlink.SCOPE_UNIVERSE, netip.MustParsePrefix("20.0.0.0/8"))
194
+
195
+	reserved := InferReservedNetworks(false)
196
+	t.Logf("reserved: %+v", reserved)
197
+
198
+	// We don't check the size of 'reserved' here because it also includes
199
+	// nameservers set in /etc/resolv.conf. This file might change from one test
200
+	// env to another, and it'd be unnecessarily complex to set up a mount
201
+	// namespace just to check that. Current implementation uses a function
202
+	// which is properly tested, so everything should be good.
203
+	assert.Check(t, slices.Contains(reserved, netip.MustParsePrefix("100.0.0.0/24")))
204
+	assert.Check(t, slices.Contains(reserved, netip.MustParsePrefix("10.0.0.0/8")))
205
+	assert.Check(t, !slices.Contains(reserved, netip.MustParsePrefix("20.0.0.0/8")))
238 206
 }
239 207
 
240
-func createInterface(t *testing.T, name string, nws ...string) {
241
-	// Add interface
242
-	link := &netlink.Bridge{
208
+func createInterface(t *testing.T, name string) int {
209
+	t.Helper()
210
+
211
+	link := &netlink.Dummy{
243 212
 		LinkAttrs: netlink.LinkAttrs{
244
-			Name: "test",
213
+			Name: name,
245 214
 		},
246 215
 	}
247
-	bips := []*net.IPNet{}
248
-	for _, nw := range nws {
249
-		bip, err := types.ParseCIDR(nw)
250
-		if err != nil {
251
-			t.Fatal(err)
252
-		}
253
-		bips = append(bips, bip)
254
-	}
255 216
 	if err := netlink.LinkAdd(link); err != nil {
256
-		t.Fatalf("Failed to create interface via netlink: %v", err)
257
-	}
258
-	for _, bip := range bips {
259
-		if err := netlink.AddrAdd(link, &netlink.Addr{IPNet: bip}); err != nil {
260
-			t.Fatal(err)
261
-		}
217
+		t.Fatalf("failed to create interface %s: %v", name, err)
262 218
 	}
263 219
 	if err := netlink.LinkSetUp(link); err != nil {
264 220
 		t.Fatal(err)
265 221
 	}
222
+
223
+	return link.Attrs().Index
224
+}
225
+
226
+func addRoute(t *testing.T, linkID int, scope netlink.Scope, prefix netip.Prefix) {
227
+	t.Helper()
228
+
229
+	if err := netlink.RouteAdd(&netlink.Route{
230
+		Scope:     scope,
231
+		LinkIndex: linkID,
232
+		Dst:       netiputil.ToIPNet(prefix),
233
+	}); err != nil {
234
+		t.Fatalf("failed to add on-link route %s: %v", prefix, err)
235
+	}
266 236
 }
... ...
@@ -1,13 +1,8 @@
1 1
 package netutils
2 2
 
3
-import (
4
-	"net"
5
-)
3
+import "net/netip"
6 4
 
7
-// FindAvailableNetwork returns a network from the passed list which does not
8
-// overlap with existing interfaces in the system
9
-//
10
-// TODO : Use appropriate windows APIs to identify non-overlapping subnets
11
-func FindAvailableNetwork(list []*net.IPNet) (*net.IPNet, error) {
12
-	return nil, nil
5
+// InferReservedNetworks returns an empty list on Windows.
6
+func InferReservedNetworks(v6 bool) []netip.Prefix {
7
+	return []netip.Prefix{}
13 8
 }
... ...
@@ -8,6 +8,7 @@ import (
8 8
 	"encoding/json"
9 9
 	"fmt"
10 10
 	"net"
11
+	"net/netip"
11 12
 	"runtime"
12 13
 	"strings"
13 14
 	"sync"
... ...
@@ -1517,66 +1518,6 @@ func (n *Network) ipamAllocate() error {
1517 1517
 	return err
1518 1518
 }
1519 1519
 
1520
-func (n *Network) requestPoolHelper(ipam ipamapi.Ipam, addressSpace, requestedPool, requestedSubPool string, options map[string]string, v6 bool) (string, *net.IPNet, map[string]string, error) {
1521
-	var tmpPoolLeases []string
1522
-	defer func() {
1523
-		// Prevent repeated lock/unlock in the loop.
1524
-		nwName := n.Name()
1525
-		// Release all pools we held on to.
1526
-		for _, pID := range tmpPoolLeases {
1527
-			if err := ipam.ReleasePool(pID); err != nil {
1528
-				log.G(context.TODO()).Warnf("Failed to release overlapping pool while returning from pool request helper for network %s", nwName)
1529
-			}
1530
-		}
1531
-	}()
1532
-
1533
-	for {
1534
-		alloc, err := ipam.RequestPool(ipamapi.PoolRequest{
1535
-			AddressSpace: addressSpace,
1536
-			Pool:         requestedPool,
1537
-			SubPool:      requestedSubPool,
1538
-			Options:      options,
1539
-			V6:           v6,
1540
-		})
1541
-		if err != nil {
1542
-			return "", nil, nil, err
1543
-		}
1544
-
1545
-		// If the network pool was explicitly chosen, the network belongs to
1546
-		// global scope, or it is invalid ("0.0.0.0/0"), then we don't perform
1547
-		// check for overlaps.
1548
-		//
1549
-		// FIXME(thaJeztah): why are we ignoring invalid pools here?
1550
-		//
1551
-		// The "invalid" conditions was added in [libnetwork#1095][1], which
1552
-		// moved code to reduce os-specific dependencies in the ipam package,
1553
-		// but also introduced a types.IsIPNetValid() function, which considers
1554
-		// "0.0.0.0/0" invalid, and added it to the conditions below.
1555
-		//
1556
-		// Unfortunately review does not mention this change, so there's no
1557
-		// context why. Possibly this was done to prevent errors further down
1558
-		// the line (when checking for overlaps), but returning an error here
1559
-		// instead would likely have avoided that as well, so we can only guess.
1560
-		//
1561
-		// [1]: https://github.com/moby/libnetwork/commit/5ca79d6b87873264516323a7b76f0af7d0298492#diff-bdcd879439d041827d334846f9aba01de6e3683ed8fdd01e63917dae6df23846
1562
-		if requestedPool != "" || n.Scope() == scope.Global || alloc.Pool.String() == "0.0.0.0/0" {
1563
-			return alloc.PoolID, netiputil.ToIPNet(alloc.Pool), alloc.Meta, nil
1564
-		}
1565
-
1566
-		// Check for overlap and if none found, we have found the right pool.
1567
-		if _, err := netutils.FindAvailableNetwork([]*net.IPNet{netiputil.ToIPNet(alloc.Pool)}); err == nil {
1568
-			return alloc.PoolID, netiputil.ToIPNet(alloc.Pool), alloc.Meta, nil
1569
-		}
1570
-
1571
-		// Pool obtained in this iteration is overlapping. Hold onto the pool
1572
-		// and don't release it yet, because we don't want IPAM to give us back
1573
-		// the same pool over again. But make sure we still do a deferred release
1574
-		// when we have either obtained a non-overlapping pool or ran out of
1575
-		// pre-defined pools.
1576
-		tmpPoolLeases = append(tmpPoolLeases, alloc.PoolID)
1577
-	}
1578
-}
1579
-
1580 1520
 func (n *Network) ipamAllocateVersion(ipVer int, ipam ipamapi.Ipam) error {
1581 1521
 	var (
1582 1522
 		cfgList  *[]*IpamConf
... ...
@@ -1611,11 +1552,28 @@ func (n *Network) ipamAllocateVersion(ipVer int, ipam ipamapi.Ipam) error {
1611 1611
 		(*infoList)[i] = d
1612 1612
 
1613 1613
 		d.AddressSpace = n.addrSpace
1614
-		d.PoolID, d.Pool, d.Meta, err = n.requestPoolHelper(ipam, n.addrSpace, cfg.PreferredPool, cfg.SubPool, n.ipamOptions, ipVer == 6)
1614
+
1615
+		var reserved []netip.Prefix
1616
+		if n.Scope() != scope.Global {
1617
+			reserved = netutils.InferReservedNetworks(ipVer == 6)
1618
+		}
1619
+
1620
+		alloc, err := ipam.RequestPool(ipamapi.PoolRequest{
1621
+			AddressSpace: n.addrSpace,
1622
+			Pool:         cfg.PreferredPool,
1623
+			SubPool:      cfg.SubPool,
1624
+			Options:      n.ipamOptions,
1625
+			Exclude:      reserved,
1626
+			V6:           ipVer == 6,
1627
+		})
1615 1628
 		if err != nil {
1616 1629
 			return err
1617 1630
 		}
1618 1631
 
1632
+		d.PoolID = alloc.PoolID
1633
+		d.Pool = netiputil.ToIPNet(alloc.Pool)
1634
+		d.Meta = alloc.Meta
1635
+
1619 1636
 		defer func() {
1620 1637
 			if err != nil {
1621 1638
 				if err := ipam.ReleasePool(d.PoolID); err != nil {
... ...
@@ -3,7 +3,7 @@ package resolvconf
3 3
 
4 4
 import (
5 5
 	"bytes"
6
-	"fmt"
6
+	"net/netip"
7 7
 	"os"
8 8
 	"strings"
9 9
 
... ...
@@ -83,19 +83,17 @@ func GetNameservers(resolvConf []byte, kind int) []string {
83 83
 	return nameservers
84 84
 }
85 85
 
86
-// GetNameserversAsCIDR returns nameservers (if any) listed in
86
+// GetNameserversAsPrefix returns nameservers (if any) listed in
87 87
 // /etc/resolv.conf as CIDR blocks (e.g., "1.2.3.4/32")
88
-// This function's output is intended for net.ParseCIDR
89
-func GetNameserversAsCIDR(resolvConf []byte) []string {
88
+func GetNameserversAsPrefix(resolvConf []byte) []netip.Prefix {
90 89
 	rc, err := resolvconf.Parse(bytes.NewBuffer(resolvConf), "")
91 90
 	if err != nil {
92 91
 		return nil
93 92
 	}
94 93
 	nsAddrs := rc.NameServers()
95
-	nameservers := make([]string, 0, len(nsAddrs))
94
+	nameservers := make([]netip.Prefix, 0, len(nsAddrs))
96 95
 	for _, addr := range nsAddrs {
97
-		str := fmt.Sprintf("%s/%d", addr.WithZone("").String(), addr.BitLen())
98
-		nameservers = append(nameservers, str)
96
+		nameservers = append(nameservers, netip.PrefixFrom(addr, addr.BitLen()))
99 97
 	}
100 98
 	return nameservers
101 99
 }
... ...
@@ -4,10 +4,12 @@ package resolvconf
4 4
 
5 5
 import (
6 6
 	"bytes"
7
+	"net/netip"
7 8
 	"os"
8 9
 	"strings"
9 10
 	"testing"
10 11
 
12
+	"github.com/google/go-cmp/cmp/cmpopts"
11 13
 	"github.com/opencontainers/go-digest"
12 14
 	"gotest.tools/v3/assert"
13 15
 	is "gotest.tools/v3/assert/cmp"
... ...
@@ -78,58 +80,58 @@ nameserver 1.2.3.4 # not 4.3.2.1`,
78 78
 	}
79 79
 }
80 80
 
81
-func TestGetNameserversAsCIDR(t *testing.T) {
81
+func TestGetNameserversAsPrefix(t *testing.T) {
82 82
 	for _, tc := range []struct {
83 83
 		input  string
84
-		result []string
84
+		result []netip.Prefix
85 85
 	}{
86 86
 		{
87
-			input: ``,
87
+			input:  ``,
88
+			result: []netip.Prefix{},
88 89
 		},
89 90
 		{
90
-			input: `search example.com`,
91
+			input:  `search example.com`,
92
+			result: []netip.Prefix{},
91 93
 		},
92 94
 		{
93 95
 			input:  `  nameserver 1.2.3.4   `,
94
-			result: []string{"1.2.3.4/32"},
96
+			result: []netip.Prefix{netip.MustParsePrefix("1.2.3.4/32")},
95 97
 		},
96 98
 		{
97 99
 			input: `
98 100
 nameserver 1.2.3.4
99 101
 nameserver 40.3.200.10
100 102
 search example.com`,
101
-			result: []string{"1.2.3.4/32", "40.3.200.10/32"},
103
+			result: []netip.Prefix{netip.MustParsePrefix("1.2.3.4/32"), netip.MustParsePrefix("40.3.200.10/32")},
102 104
 		},
103 105
 		{
104 106
 			input: `nameserver 1.2.3.4
105 107
 search example.com
106 108
 nameserver 4.30.20.100`,
107
-			result: []string{"1.2.3.4/32", "4.30.20.100/32"},
109
+			result: []netip.Prefix{netip.MustParsePrefix("1.2.3.4/32"), netip.MustParsePrefix("4.30.20.100/32")},
108 110
 		},
109 111
 		{
110 112
 			input: `search example.com
111 113
 nameserver 1.2.3.4
112 114
 #nameserver 4.3.2.1`,
113
-			result: []string{"1.2.3.4/32"},
115
+			result: []netip.Prefix{netip.MustParsePrefix("1.2.3.4/32")},
114 116
 		},
115 117
 		{
116 118
 			input: `search example.com
117 119
 nameserver 1.2.3.4 # not 4.3.2.1`,
118
-			result: []string{"1.2.3.4/32"},
120
+			result: []netip.Prefix{netip.MustParsePrefix("1.2.3.4/32")},
119 121
 		},
120 122
 		{
121 123
 			input:  `nameserver fd6f:c490:ec68::1`,
122
-			result: []string{"fd6f:c490:ec68::1/128"},
124
+			result: []netip.Prefix{netip.MustParsePrefix("fd6f:c490:ec68::1/128")},
123 125
 		},
124 126
 		{
125 127
 			input:  `nameserver fe80::1234%eth0`,
126
-			result: []string{"fe80::1234/128"},
128
+			result: []netip.Prefix{netip.MustParsePrefix("fe80::1234/128")},
127 129
 		},
128 130
 	} {
129
-		test := GetNameserversAsCIDR([]byte(tc.input))
130
-		if !strSlicesEqual(test, tc.result) {
131
-			t.Errorf("Wrong nameserver string {%s} should be %v. Input: %s", test, tc.result, tc.input)
132
-		}
131
+		test := GetNameserversAsPrefix([]byte(tc.input))
132
+		assert.DeepEqual(t, test, tc.result, cmpopts.EquateComparable(netip.Prefix{}))
133 133
 	}
134 134
 }
135 135
 
... ...
@@ -4,6 +4,7 @@ import (
4 4
 	"encoding/csv"
5 5
 	"encoding/json"
6 6
 	"fmt"
7
+	"net/netip"
7 8
 	"strconv"
8 9
 	"strings"
9 10
 
... ...
@@ -39,7 +40,11 @@ func (p *PoolsOpt) Set(value string) error {
39 39
 
40 40
 		switch key {
41 41
 		case "base":
42
-			poolsDef.Base = val
42
+			base, err := netip.ParsePrefix(val)
43
+			if err != nil {
44
+				return fmt.Errorf("invalid base prefix %q: %w", val, err)
45
+			}
46
+			poolsDef.Base = base
43 47
 		case "size":
44 48
 			size, err := strconv.Atoi(val)
45 49
 			if err != nil {