Browse code

Merge pull request #47103 from AkihiroSuda/detach-netns

rootless: properly support `--net=host` and localhost registries

Akihiro Suda authored on 2026/04/26 00:42:30
Showing 20 changed files
... ...
@@ -312,7 +312,12 @@ cmd_entrypoint_check() {
312 312
 cmd_entrypoint_nsenter() {
313 313
 	# No need to call init()
314 314
 	pid=$(cat "$XDG_RUNTIME_DIR/dockerd-rootless/child_pid")
315
-	exec nsenter --no-fork --wd="$(pwd)" --preserve-credentials -m -n -U -t "$pid" -- "$@"
315
+	n=""
316
+	# If RootlessKit is running with `--detach-netns` mode, we do NOT enter the detached netns here
317
+	if [ ! -e "$XDG_RUNTIME_DIR/dockerd-rootless/netns" ]; then
318
+		n="-n"
319
+	fi
320
+	exec nsenter --no-fork --wd="$(pwd)" --preserve-credentials -m $n -U -t "$pid" -- "$@"
316 321
 }
317 322
 
318 323
 show_systemd_error() {
... ...
@@ -22,6 +22,9 @@
22 22
 #   * Defaults to "auto".
23 23
 # * DOCKERD_ROOTLESS_ROOTLESSKIT_DISABLE_HOST_LOOPBACK=(true|false): prohibit connections to 127.0.0.1 on the host (including via 10.0.2.2, in the case of slirp4netns).
24 24
 #   * Defaults to "true".
25
+# * DOCKERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS=(true|false): whether to launch rootlesskit with the "detach-netns" mode.
26
+#   The "detach-netns" mode accelerates `docker (pull|push|build)` and enables `docker run --net=host`.
27
+#   Defaults to "true". Set this to false only when facing a compatibility issue.
25 28
 
26 29
 # To apply an environment variable via systemd, create ~/.config/systemd/user/docker.service.d/override.conf as follows,
27 30
 # and run `systemctl --user daemon-reload && systemctl --user restart docker`:
... ...
@@ -114,6 +117,7 @@ fi
114 114
 : "${DOCKERD_ROOTLESS_ROOTLESSKIT_SLIRP4NETNS_SANDBOX:=auto}"
115 115
 : "${DOCKERD_ROOTLESS_ROOTLESSKIT_SLIRP4NETNS_SECCOMP:=auto}"
116 116
 : "${DOCKERD_ROOTLESS_ROOTLESSKIT_DISABLE_HOST_LOOPBACK:=}"
117
+: "${DOCKERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS:=true}"
117 118
 net=$DOCKERD_ROOTLESS_ROOTLESSKIT_NET
118 119
 port_driver=$DOCKERD_ROOTLESS_ROOTLESSKIT_PORT_DRIVER
119 120
 mtu=$DOCKERD_ROOTLESS_ROOTLESSKIT_MTU
... ...
@@ -182,6 +186,20 @@ if [ -z "$_DOCKERD_ROOTLESS_CHILD" ]; then
182 182
 		_DOCKERD_ROOTLESS_SELINUX=1
183 183
 		export _DOCKERD_ROOTLESS_SELINUX
184 184
 	fi
185
+
186
+	case "$DOCKERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS" in
187
+		1 | true)
188
+			DOCKERD_ROOTLESS_ROOTLESSKIT_FLAGS="--detach-netns $DOCKERD_ROOTLESS_ROOTLESSKIT_FLAGS"
189
+			;;
190
+		0 | false)
191
+			# NOP
192
+			;;
193
+		*)
194
+			echo "Unknown DOCKERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS value: $DOCKERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS"
195
+			exit 1
196
+			;;
197
+	esac
198
+
185 199
 	# Re-exec the script via RootlessKit, so as to create unprivileged {user,mount,network} namespaces.
186 200
 	#
187 201
 	# --copy-up allows removing/creating files in the directories by creating tmpfs and symlinks
... ...
@@ -219,13 +237,19 @@ else
219 219
 		mount_directory /etc/ssl "--rbind"
220 220
 	fi
221 221
 
222
+	netns="/proc/self/ns/net"
223
+	case "$DOCKERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS" in
224
+		1 | true)
225
+			netns="$ROOTLESSKIT_STATE_DIR/netns"
226
+			;;
227
+	esac
222 228
 	# When running with --firewall-backend=nftables, IP forwarding needs to be enabled
223 229
 	# because the daemon won't enable it. IP forwarding is harmless in the rootless
224 230
 	# netns, there's only a single external interface and only Docker uses the netns.
225 231
 	# So, always enable IPv4 and IPv6 forwarding. But ignore failure to enable IPv6
226 232
 	# forwarding, for hosts with IPv6 disabled.
227
-	sysctl -w net.ipv4.ip_forward=1
228
-	sysctl -w net.ipv6.conf.all.forwarding=1 || true
233
+	nsenter -n"$netns" sysctl -w net.ipv4.ip_forward=1
234
+	nsenter -n"$netns" sysctl -w net.ipv6.conf.all.forwarding=1 || true
229 235
 
230 236
 	exec "$dockerd" "$@"
231 237
 fi
... ...
@@ -6,6 +6,7 @@ import (
6 6
 	"fmt"
7 7
 
8 8
 	"github.com/containerd/containerd/v2/pkg/apparmor"
9
+	"github.com/moby/moby/v2/daemon/internal/rootless"
9 10
 	aaprofile "github.com/moby/profiles/apparmor"
10 11
 )
11 12
 
... ...
@@ -18,13 +19,25 @@ const (
18 18
 // DefaultApparmorProfile returns the name of the default apparmor profile
19 19
 func DefaultApparmorProfile() string {
20 20
 	if apparmor.HostSupports() {
21
+		if detachedNetNS, _ := rootless.DetachedNetNS(); detachedNetNS != "" {
22
+			// AppArmor is inaccessible with detached-netns because sysfs is netns-scoped.
23
+			return ""
24
+		}
21 25
 		return defaultAppArmorProfile
22 26
 	}
23 27
 	return ""
24 28
 }
25 29
 
26 30
 func ensureDefaultAppArmorProfile() error {
27
-	if apparmor.HostSupports() {
31
+	hostSupports := apparmor.HostSupports()
32
+	if hostSupports {
33
+		if detachedNetNS, _ := rootless.DetachedNetNS(); detachedNetNS != "" {
34
+			// "open /sys/kernel/security/apparmor/profiles: permission denied"
35
+			// (because sysfs is netns-scoped)
36
+			hostSupports = false
37
+		}
38
+	}
39
+	if hostSupports {
28 40
 		loaded, err := aaprofile.IsLoaded(defaultAppArmorProfile)
29 41
 		if err != nil {
30 42
 			return fmt.Errorf("Could not check if %s AppArmor profile was loaded: %s", defaultAppArmorProfile, err)
... ...
@@ -453,7 +453,9 @@ func (daemon *Daemon) initializeNetworking(ctx context.Context, cfg *config.Conf
453 453
 	}
454 454
 
455 455
 	// Cleanup any stale sandbox left over due to ungraceful daemon shutdown
456
-	if err := daemon.netController.SandboxDestroy(ctx, ctr.ID); err != nil {
456
+	if err := daemon.runInNetNS(func() error {
457
+		return daemon.netController.SandboxDestroy(ctx, ctr.ID)
458
+	}); err != nil {
457 459
 		log.G(ctx).WithError(err).Errorf("failed to cleanup up stale network sandbox for container %s", ctr.ID)
458 460
 	}
459 461
 
... ...
@@ -497,7 +499,9 @@ func (daemon *Daemon) initializeNetworking(ctx context.Context, cfg *config.Conf
497 497
 
498 498
 	defer func() {
499 499
 		if retErr != nil {
500
-			if err := sb.Delete(ctx); err != nil {
500
+			if err := daemon.runInNetNS(func() error {
501
+				return sb.Delete(ctx)
502
+			}); err != nil {
501 503
 				log.G(ctx).WithFields(log.Fields{
502 504
 					"error":     err,
503 505
 					"container": ctr.ID,
... ...
@@ -1027,7 +1031,9 @@ func (daemon *Daemon) releaseNetwork(ctx context.Context, ctr *container.Contain
1027 1027
 		return
1028 1028
 	}
1029 1029
 
1030
-	if err := sb.Delete(ctx); err != nil {
1030
+	if err := daemon.runInNetNS(func() error {
1031
+		return sb.Delete(ctx)
1032
+	}); err != nil {
1031 1033
 		log.G(ctx).Errorf("Error deleting sandbox id %s for container %s: %v", sid, ctr.ID, err)
1032 1034
 	}
1033 1035
 
... ...
@@ -1056,7 +1062,9 @@ func (daemon *Daemon) ConnectToNetwork(ctx context.Context, ctr *container.Conta
1056 1056
 
1057 1057
 		n, err := daemon.FindNetwork(idOrName)
1058 1058
 		if err == nil && n != nil {
1059
-			if err := daemon.updateNetworkConfig(ctr, n, endpointConfig); err != nil {
1059
+			if err := daemon.runInNetNS(func() error {
1060
+				return daemon.updateNetworkConfig(ctr, n, endpointConfig)
1061
+			}); err != nil {
1060 1062
 				return err
1061 1063
 			}
1062 1064
 		} else {
... ...
@@ -1070,7 +1078,9 @@ func (daemon *Daemon) ConnectToNetwork(ctx context.Context, ctr *container.Conta
1070 1070
 			EndpointSettings:  endpointConfig,
1071 1071
 			DesiredMacAddress: endpointConfig.MacAddress,
1072 1072
 		}
1073
-		if err := daemon.connectToNetwork(ctx, &daemon.config().Config, ctr, idOrName, epc); err != nil {
1073
+		if err := daemon.runInNetNS(func() error {
1074
+			return daemon.connectToNetwork(ctx, &daemon.config().Config, ctr, idOrName, epc)
1075
+		}); err != nil {
1074 1076
 			return err
1075 1077
 		}
1076 1078
 	}
... ...
@@ -1102,7 +1112,9 @@ func (daemon *Daemon) DisconnectFromNetwork(ctx context.Context, ctr *container.
1102 1102
 			return cerrdefs.ErrInvalidArgument.WithMessage("cannot disconnect container from host network - container was created in host network mode")
1103 1103
 		}
1104 1104
 
1105
-		if err := daemon.disconnectFromNetwork(ctx, ctr, n, false); err != nil {
1105
+		if err := daemon.runInNetNS(func() error {
1106
+			return daemon.disconnectFromNetwork(ctx, ctr, n, false)
1107
+		}); err != nil {
1106 1108
 			return err
1107 1109
 		}
1108 1110
 	} else {
... ...
@@ -13,6 +13,7 @@ import (
13 13
 
14 14
 	"github.com/containerd/log"
15 15
 	"github.com/moby/moby/v2/daemon/config"
16
+	"github.com/moby/moby/v2/daemon/internal/rootless"
16 17
 	"github.com/moby/moby/v2/daemon/libnetwork/ns"
17 18
 	"github.com/moby/moby/v2/daemon/libnetwork/resolvconf"
18 19
 	"github.com/moby/sys/mount"
... ...
@@ -237,3 +238,14 @@ func supportsRecursivelyReadOnly(cfg *configStore, runtime string) error {
237 237
 	}
238 238
 	return fmt.Errorf("rro is not supported by runtime %q", runtime)
239 239
 }
240
+
241
+func (daemon *Daemon) runInNetNS(f func() error) error {
242
+	if rootless.RunningWithRootlessKit() {
243
+		if detachedNetNS, err := rootless.DetachedNetNS(); err != nil {
244
+			return err
245
+		} else if detachedNetNS != "" {
246
+			return rootless.RunInNetNS(detachedNetNS, f)
247
+		}
248
+	}
249
+	return f()
250
+}
... ...
@@ -837,7 +837,9 @@ func (daemon *Daemon) initNetworkController(cfg *config.Config, activeSandboxes
837 837
 
838 838
 	if len(activeSandboxes) > 0 {
839 839
 		log.G(ctx).Info("there are running containers, updated network configuration will not take affect")
840
-	} else if err := configureNetworking(ctx, daemon.netController, cfg); err != nil {
840
+	} else if err := daemon.runInNetNS(func() error {
841
+		return configureNetworking(ctx, daemon.netController, cfg)
842
+	}); err != nil {
841 843
 		return err
842 844
 	}
843 845
 
... ...
@@ -17,3 +17,7 @@ func setupResolvConf(_ *any) {}
17 17
 func getSysInfo(_ *Daemon) *sysinfo.SysInfo {
18 18
 	return sysinfo.New()
19 19
 }
20
+
21
+func (daemon *Daemon) runInNetNS(f func() error) error {
22
+	return f()
23
+}
... ...
@@ -570,3 +570,7 @@ func setupResolvConf(config *config.Config) {}
570 570
 func getSysInfo(*config.Config) *sysinfo.SysInfo {
571 571
 	return sysinfo.New()
572 572
 }
573
+
574
+func (daemon *Daemon) runInNetNS(f func() error) error {
575
+	return f()
576
+}
573 577
new file mode 100644
... ...
@@ -0,0 +1,127 @@
0
+// Portions from https://github.com/containerd/nerdctl/pull/2723
1
+/*
2
+   Copyright The containerd Authors.
3
+
4
+   Licensed under the Apache License, Version 2.0 (the "License");
5
+   you may not use this file except in compliance with the License.
6
+   You may obtain a copy of the License at
7
+
8
+       http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+   Unless required by applicable law or agreed to in writing, software
11
+   distributed under the License is distributed on an "AS IS" BASIS,
12
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+   See the License for the specific language governing permissions and
14
+   limitations under the License.
15
+*/
16
+
17
+package rootless
18
+
19
+import (
20
+	"errors"
21
+	"fmt"
22
+	"os"
23
+	"path/filepath"
24
+	"runtime"
25
+	"sync"
26
+	"syscall"
27
+
28
+	"github.com/vishvananda/netns"
29
+)
30
+
31
+// DetachedNetNS returns non-empty netns path if RootlessKit is running with --detach-netns mode.
32
+// Otherwise returns "" without an error.
33
+var DetachedNetNS = sync.OnceValues(detachedNetNS)
34
+
35
+func detachedNetNS() (string, error) {
36
+	stateDir := os.Getenv("ROOTLESSKIT_STATE_DIR")
37
+	if stateDir == "" {
38
+		return "", nil
39
+	}
40
+	p := filepath.Join(stateDir, "netns")
41
+	if _, err := os.Stat(p); err != nil {
42
+		if errors.Is(err, os.ErrNotExist) {
43
+			return "", nil
44
+		}
45
+		return "", err
46
+	}
47
+	return p, nil
48
+}
49
+
50
+// RunInNetNS runs f in the network namespace at nsPath, or runs f directly
51
+// when nsPath is empty. f is executed on a dedicated, locked OS thread. If
52
+// that thread cannot be switched back afterwards (setns to the host netns
53
+// fails with EPERM in rootless mode, as it is owned by the initial user
54
+// namespace where the daemon lacks CAP_SYS_ADMIN), the thread is discarded.
55
+func RunInNetNS(nsPath string, f func() error) error {
56
+	if nsPath == "" {
57
+		return f()
58
+	}
59
+
60
+	ch := make(chan error, 1)
61
+	go func() {
62
+		runtime.LockOSThread()
63
+
64
+		ns, err := netns.GetFromPath(nsPath)
65
+		if err != nil {
66
+			runtime.UnlockOSThread()
67
+			ch <- fmt.Errorf("failed to open detached network namespace %s: %w", nsPath, err)
68
+			return
69
+		}
70
+		defer ns.Close()
71
+
72
+		origNS, err := netns.Get()
73
+		if err != nil {
74
+			runtime.UnlockOSThread()
75
+			ch <- fmt.Errorf("failed to get current network namespace: %w", err)
76
+			return
77
+		}
78
+		defer origNS.Close()
79
+
80
+		if err := netns.Set(ns); err != nil {
81
+			runtime.UnlockOSThread()
82
+			ch <- fmt.Errorf("failed to enter detached network namespace: %w", err)
83
+			return
84
+		}
85
+
86
+		ch <- f()
87
+
88
+		if err := netns.Set(origNS); err != nil {
89
+			// Cannot restore the thread's network namespace. Keep the
90
+			// goroutine locked so the Go runtime terminates the thread
91
+			// instead of returning a tainted thread to the pool.
92
+			return
93
+		}
94
+		runtime.UnlockOSThread()
95
+	}()
96
+	return <-ch
97
+}
98
+
99
+// sandboxNSThreads tracks OS threads that are currently executing in a
100
+// container sandbox network namespace (via InvokeFunc). This is used to
101
+// prevent iptables/nftables wrappers from adding nsenter to the detached
102
+// netns when they are already running in the correct (container) namespace.
103
+var sandboxNSThreads sync.Map // key: int (tid)
104
+
105
+// MarkInSandboxNS marks the current OS thread as being inside a container
106
+// sandbox network namespace. The caller must have locked the OS thread with
107
+// runtime.LockOSThread before calling this function.
108
+func MarkInSandboxNS() {
109
+	sandboxNSThreads.Store(syscall.Gettid(), struct{}{})
110
+}
111
+
112
+// UnmarkInSandboxNS removes the sandbox namespace mark from the current OS
113
+// thread. Must be called on the same locked thread that called MarkInSandboxNS.
114
+func UnmarkInSandboxNS() {
115
+	sandboxNSThreads.Delete(syscall.Gettid())
116
+}
117
+
118
+// InSandboxNS reports whether the current OS thread has been marked as
119
+// executing inside a container sandbox network namespace. When true,
120
+// iptables/nftables commands should NOT be wrapped with nsenter to the
121
+// detached netns, because the thread is already in the target (container)
122
+// namespace.
123
+func InSandboxNS() bool {
124
+	_, ok := sandboxNSThreads.Load(syscall.Gettid())
125
+	return ok
126
+}
... ...
@@ -10,6 +10,7 @@ import (
10 10
 	"strings"
11 11
 
12 12
 	"github.com/containerd/log"
13
+	"github.com/moby/moby/v2/daemon/internal/rootless"
13 14
 	"github.com/opencontainers/runtime-spec/specs-go"
14 15
 )
15 16
 
... ...
@@ -32,6 +33,7 @@ func ToRootfulInRootless(spec *specs.Spec) {
32 32
 // * Fix up OOMScoreAdj
33 33
 // * Fix up /proc if --pid=host
34 34
 // * Fix up /dev/shm and /dev/mqueue if --ipc=host
35
+// * Fix up /sys if --net=host (with detach-netns)
35 36
 //
36 37
 // v2Controllers should be non-nil only if running with v2 and systemd.
37 38
 func ToRootless(spec *specs.Spec, v2Controllers []string) error {
... ...
@@ -120,6 +122,31 @@ func toRootless(spec *specs.Spec, v2Controllers []string, currentOOMScoreAdj int
120 120
 		}
121 121
 	}
122 122
 
123
+	// Fix up /sys if --net=host (with detach-netns)
124
+	detachedNetNS, err := rootless.DetachedNetNS()
125
+	if err != nil {
126
+		return err
127
+	}
128
+	if detachedNetNS != "" {
129
+		netHost, err := isHostNS(spec, specs.NetworkNamespace)
130
+		if err != nil {
131
+			return err
132
+		}
133
+		if netHost {
134
+			// For rootless + host netns, we can't mount sysfs.
135
+			// We can't (non-recursively) bind mount /sys, either.
136
+			//
137
+			// TODO: consider just rbind-mounting /sys from the host with rro,
138
+			// when rro is available (kernel >= 5.12, runc >= 1.1).
139
+			//
140
+			// Relevant: https://github.com/moby/buildkit/blob/v0.12.4/util/rootless/specconv/specconv_linux.go#L15-L34
141
+			// https://github.com/containerd/nerdctl/pull/2723
142
+			if err := removeSysfs(spec); err != nil {
143
+				return err
144
+			}
145
+		}
146
+	}
147
+
123 148
 	return nil
124 149
 }
125 150
 
... ...
@@ -139,7 +166,11 @@ func isHostNS(spec *specs.Spec, nsType specs.LinuxNamespaceType) (bool, error) {
139 139
 			if err != nil {
140 140
 				return false, err
141 141
 			}
142
-			selfNS, err := os.Readlink(filepath.Join("/proc/self/ns", string(nsType)))
142
+			procfsNSType := string(nsType)
143
+			if nsType == specs.NetworkNamespace {
144
+				procfsNSType = "net"
145
+			}
146
+			selfNS, err := os.Readlink(filepath.Join("/proc/self/ns", procfsNSType))
143 147
 			if err != nil {
144 148
 				return false, err
145 149
 			}
... ...
@@ -197,3 +228,26 @@ func bindMountHostIPC(s *specs.Spec) error {
197 197
 	}
198 198
 	return nil
199 199
 }
200
+
201
+func removeSysfs(s *specs.Spec) error {
202
+	var hasSysfs bool
203
+	for _, mount := range s.Mounts {
204
+		if mount.Type == "sysfs" {
205
+			hasSysfs = true
206
+			break
207
+		}
208
+	}
209
+	if !hasSysfs {
210
+		// NOP, as the user has specified a custom /sys mount
211
+		return nil
212
+	}
213
+	var mounts []specs.Mount // nolint: prealloc
214
+	for _, mount := range s.Mounts {
215
+		if strings.HasPrefix(mount.Destination, "/sys") {
216
+			continue
217
+		}
218
+		mounts = append(mounts, mount)
219
+	}
220
+	s.Mounts = mounts
221
+	return nil
222
+}
... ...
@@ -15,6 +15,7 @@ import (
15 15
 	"github.com/containerd/log"
16 16
 	"github.com/moby/moby/v2/daemon/internal/netiputil"
17 17
 	"github.com/moby/moby/v2/daemon/internal/otelutil"
18
+	"github.com/moby/moby/v2/daemon/internal/rootless"
18 19
 	"github.com/moby/moby/v2/daemon/internal/stringid"
19 20
 	"github.com/moby/moby/v2/daemon/libnetwork/datastore"
20 21
 	"github.com/moby/moby/v2/daemon/libnetwork/driverapi"
... ...
@@ -210,10 +211,25 @@ func newDriver(store *datastore.Store, config Configuration, pms *drvregistry.Po
210 210
 	return d, nil
211 211
 }
212 212
 
213
+func runInNetNS(f func() error) error {
214
+	if rootless.RunningWithRootlessKit() {
215
+		if detachedNetNS, err := rootless.DetachedNetNS(); err != nil {
216
+			return err
217
+		} else if detachedNetNS != "" {
218
+			return rootless.RunInNetNS(detachedNetNS, f)
219
+		}
220
+	}
221
+	return f()
222
+}
223
+
213 224
 // Register registers a new instance of bridge driver.
214 225
 func Register(r driverapi.Registerer, store *datastore.Store, pms *drvregistry.PortMappers, config Configuration) error {
215
-	d, err := newDriver(store, config, pms)
216
-	if err != nil {
226
+	var d *driver
227
+	if err := runInNetNS(func() error {
228
+		var err error
229
+		d, err = newDriver(store, config, pms)
230
+		return err
231
+	}); err != nil {
217 232
 		return err
218 233
 	}
219 234
 	return r.RegisterDriver(NetworkType, d, driverapi.Capability{
... ...
@@ -8,18 +8,38 @@ import (
8 8
 	"io"
9 9
 	"os/exec"
10 10
 	"strings"
11
+	"sync"
11 12
 
12 13
 	"github.com/containerd/log"
14
+	"github.com/moby/moby/v2/daemon/internal/rootless"
13 15
 	"go.opentelemetry.io/otel"
14 16
 )
15 17
 
16 18
 type nftHandle = struct{}
17 19
 
20
+var lookPathNSEnter = sync.OnceValues(func() (string, error) {
21
+	return exec.LookPath("nsenter")
22
+})
23
+
18 24
 func (t *table) nftApply(ctx context.Context, nftCmd []byte) error {
19 25
 	ctx, span := otel.Tracer("").Start(ctx, spanPrefix+".nftApply.exec")
20 26
 	defer span.End()
21 27
 
22
-	cmd := exec.Command(nftPath, "-f", "-")
28
+	cmdPath := nftPath
29
+	cmdArgs := []string{nftPath, "-f", "-"}
30
+	detachedNetNS, err := rootless.DetachedNetNS()
31
+	if err != nil {
32
+		return fmt.Errorf("could not check for detached netns: %w", err)
33
+	}
34
+	if detachedNetNS != "" && !rootless.InSandboxNS() {
35
+		nsenterPath, err := lookPathNSEnter()
36
+		if err != nil {
37
+			return fmt.Errorf("nsenter not found: %w", err)
38
+		}
39
+		cmdPath = nsenterPath
40
+		cmdArgs = append([]string{nsenterPath, "-n" + detachedNetNS, "-F", "--"}, cmdArgs...)
41
+	}
42
+	cmd := exec.CommandContext(ctx, cmdPath, cmdArgs[1:]...)
23 43
 	stdinPipe, err := cmd.StdinPipe()
24 44
 	if err != nil {
25 45
 		return fmt.Errorf("getting stdin pipe for nft: %w", err)
... ...
@@ -364,7 +364,20 @@ func (iptable IPTable) raw(args ...string) ([]byte, error) {
364 364
 	log.G(context.TODO()).Debugf("%s, %v", path, args)
365 365
 
366 366
 	startTime := time.Now()
367
-	output, err := exec.Command(path, args...).CombinedOutput()
367
+	cmd := exec.CommandContext(context.TODO(), path, args...)
368
+	detachedNetNS, err := rootless.DetachedNetNS()
369
+	if err != nil {
370
+		return nil, fmt.Errorf("could not check for detached netns: %w", err)
371
+	}
372
+	if detachedNetNS != "" && !rootless.InSandboxNS() {
373
+		nsenterPath, err := exec.LookPath("nsenter")
374
+		if err != nil {
375
+			return nil, fmt.Errorf("nsenter not found: %w", err)
376
+		}
377
+		cmd.Args = append([]string{nsenterPath, "-n" + detachedNetNS, "-F", "--"}, cmd.Args...)
378
+		cmd.Path = nsenterPath
379
+	}
380
+	output, err := cmd.CombinedOutput()
368 381
 	if err != nil {
369 382
 		return nil, fmt.Errorf("iptables failed: %s %v: %s (%s)", commandName, strings.Join(args, " "), output, err)
370 383
 	}
... ...
@@ -20,6 +20,8 @@ package nlwrap
20 20
 
21 21
 import (
22 22
 	"context"
23
+	"fmt"
24
+	"runtime"
23 25
 
24 26
 	"github.com/containerd/log"
25 27
 	"github.com/pkg/errors"
... ...
@@ -42,12 +44,71 @@ func NewHandle(nlFamilies ...int) (Handle, error) {
42 42
 	return Handle{nlh}, nil
43 43
 }
44 44
 
45
+// NewHandleAt creates a new netlink handle in the specified network namespace.
46
+//
47
+// Unlike netlink.NewHandleAt, this function properly manages thread lifecycle
48
+// when the calling thread's network namespace cannot be restored after switching
49
+// (e.g. in rootless mode where setns back to the host netns fails with EPERM).
50
+// The upstream netlink library silently ignores setns restoration errors and
51
+// returns the tainted thread to the Go runtime's thread pool, which causes
52
+// goroutines scheduled on those threads to operate in the wrong network
53
+// namespace.
45 54
 func NewHandleAt(ns netns.NsHandle, nlFamilies ...int) (Handle, error) {
46
-	nlh, err := netlink.NewHandleAt(ns, nlFamilies...)
47
-	if err != nil {
48
-		return Handle{}, err
55
+	if !ns.IsOpen() {
56
+		// No target namespace; same as NewHandle.
57
+		return NewHandle(nlFamilies...)
49 58
 	}
50
-	return Handle{nlh}, nil
59
+
60
+	type result struct {
61
+		handle *netlink.Handle
62
+		err    error
63
+	}
64
+	ch := make(chan result, 1)
65
+
66
+	go func() {
67
+		runtime.LockOSThread()
68
+
69
+		origNS, err := netns.Get()
70
+		if err != nil {
71
+			runtime.UnlockOSThread()
72
+			ch <- result{err: fmt.Errorf("could not get current network namespace: %w", err)}
73
+			return
74
+		}
75
+		defer origNS.Close()
76
+
77
+		if err := netns.Set(ns); err != nil {
78
+			runtime.UnlockOSThread()
79
+			ch <- result{err: fmt.Errorf("failed to enter network namespace: %w", err)}
80
+			return
81
+		}
82
+
83
+		// Create netlink sockets in the target namespace.
84
+		// NewHandle with no ns args does not do any namespace switching.
85
+		nlh, err := netlink.NewHandle(nlFamilies...)
86
+		if err != nil {
87
+			// Best-effort restore before reporting the error.
88
+			netns.Set(origNS) //nolint:errcheck
89
+			runtime.UnlockOSThread()
90
+			ch <- result{err: err}
91
+			return
92
+		}
93
+
94
+		if err := netns.Set(origNS); err != nil {
95
+			// Cannot restore the thread's network namespace. Keep the
96
+			// goroutine locked to this thread so the Go runtime terminates
97
+			// it instead of returning a tainted thread to the pool.
98
+			ch <- result{handle: nlh}
99
+			return
100
+		}
101
+		runtime.UnlockOSThread()
102
+		ch <- result{handle: nlh}
103
+	}()
104
+
105
+	r := <-ch
106
+	if r.err != nil {
107
+		return Handle{}, r.err
108
+	}
109
+	return Handle{r.handle}, nil
51 110
 }
52 111
 
53 112
 func (nlh Handle) Close() {
... ...
@@ -160,10 +221,56 @@ func LinkList() (links []netlink.Link, err error) {
160 160
 	return links, discardErrDumpInterrupted(err)
161 161
 }
162 162
 
163
-// LinkSubscribeWithOptions calls netlink.LinkSubscribeWithOptions, retrying if necessary.
164
-// Close the done channel when done (rather than just sending on it), so that goroutines
165
-// started by the netlink package are all stopped.
163
+// LinkSubscribeWithOptions calls netlink.LinkSubscribeWithOptions, retrying if
164
+// necessary. Close the done channel when done (rather than just sending on it),
165
+// so that goroutines started by the netlink package are all stopped.
166
+//
167
+// When a target namespace is specified, the subscribe socket is created on a
168
+// dedicated OS thread to avoid the same executeInNetns thread contamination
169
+// issue described in [NewHandleAt].
166 170
 func LinkSubscribeWithOptions(ch chan<- netlink.LinkUpdate, done <-chan struct{}, options netlink.LinkSubscribeOptions) (err error) {
171
+	if options.Namespace != nil && options.Namespace.IsOpen() {
172
+		ns := *options.Namespace
173
+		// Clear the namespace option so the netlink library does not do
174
+		// its own namespace switching (via executeInNetns). We handle it.
175
+		options.Namespace = nil
176
+		errCh := make(chan error, 1)
177
+		go func() {
178
+			runtime.LockOSThread()
179
+
180
+			origNS, nserr := netns.Get()
181
+			if nserr != nil {
182
+				runtime.UnlockOSThread()
183
+				errCh <- fmt.Errorf("could not get current network namespace: %w", nserr)
184
+				return
185
+			}
186
+			defer origNS.Close()
187
+
188
+			if nserr := netns.Set(ns); nserr != nil {
189
+				runtime.UnlockOSThread()
190
+				errCh <- fmt.Errorf("failed to enter network namespace: %w", nserr)
191
+				return
192
+			}
193
+
194
+			// Create the subscribe socket in the target namespace.
195
+			// With Namespace cleared, the netlink library will not
196
+			// attempt any namespace switching internally.
197
+			retryOnIntr(func() error {
198
+				err = netlink.LinkSubscribeWithOptions(ch, done, options) //nolint:forbidigo
199
+				return err
200
+			})
201
+			errCh <- err
202
+
203
+			if nserr := netns.Set(origNS); nserr != nil {
204
+				// Cannot restore: keep locked so the runtime kills
205
+				// this thread instead of returning it to the pool.
206
+				return
207
+			}
208
+			runtime.UnlockOSThread()
209
+		}()
210
+		return <-errCh
211
+	}
212
+
167 213
 	retryOnIntr(func() error {
168 214
 		err = netlink.LinkSubscribeWithOptions(ch, done, options) //nolint:forbidigo
169 215
 		return err
... ...
@@ -9,6 +9,7 @@ import (
9 9
 	"time"
10 10
 
11 11
 	"github.com/containerd/log"
12
+	"github.com/moby/moby/v2/daemon/internal/rootless"
12 13
 	"github.com/moby/moby/v2/daemon/libnetwork/internal/modprobe"
13 14
 	"github.com/moby/moby/v2/daemon/libnetwork/nlwrap"
14 15
 	"github.com/vishvananda/netns"
... ...
@@ -22,12 +23,30 @@ var initNamespace = sync.OnceValues(initHandles)
22 22
 
23 23
 // initHandles initializes a new network namespace
24 24
 func initHandles() (netns.NsHandle, nlwrap.Handle) {
25
-	initNs, err := netns.Get()
25
+	var (
26
+		initNs netns.NsHandle
27
+		initNl nlwrap.Handle
28
+		err    error
29
+	)
30
+	detachedNetNS, err := rootless.DetachedNetNS()
26 31
 	if err != nil {
27
-		log.G(context.Background()).WithError(err).Error("could not get initial namespace: falling back to using netns.None")
28
-		initNs = netns.None()
32
+		log.G(context.Background()).WithError(err).Error("could not check for detached netns")
33
+	}
34
+	if detachedNetNS != "" {
35
+		initNs, err = netns.GetFromPath(detachedNetNS)
36
+		if err != nil {
37
+			log.G(context.Background()).WithError(err).Errorf("could not get detached network namespace %s", detachedNetNS)
38
+			return initNs, initNl
39
+		}
40
+		initNl, err = nlwrap.NewHandleAt(initNs, getSupportedNlFamilies()...)
41
+	} else {
42
+		initNs, err = netns.Get()
43
+		if err != nil {
44
+			log.G(context.Background()).WithError(err).Error("could not get initial namespace: falling back to using netns.None")
45
+			initNs = netns.None()
46
+		}
47
+		initNl, err = nlwrap.NewHandle(getSupportedNlFamilies()...)
29 48
 	}
30
-	initNl, err := nlwrap.NewHandle(getSupportedNlFamilies()...)
31 49
 	if err != nil {
32 50
 		// Fail fast to keep the invariant: NlHandle must be a valid handle
33 51
 		panic(fmt.Errorf("could not create netlink handle on initial (host) namespace: %w", err))
... ...
@@ -14,6 +14,7 @@ import (
14 14
 	"syscall"
15 15
 
16 16
 	"github.com/containerd/log"
17
+	"github.com/moby/moby/v2/daemon/internal/rootless"
17 18
 	"github.com/moby/moby/v2/daemon/internal/unshare"
18 19
 	"github.com/moby/moby/v2/daemon/libnetwork/nlwrap"
19 20
 	"github.com/moby/moby/v2/daemon/libnetwork/ns"
... ...
@@ -115,6 +116,15 @@ func NewSandbox(key string, osCreate, isRestore bool) (*Namespace, error) {
115 115
 
116 116
 	n := &Namespace{path: key, isDefault: !osCreate}
117 117
 
118
+	detachedNetNS, err := rootless.DetachedNetNS()
119
+	if err != nil {
120
+		return nil, err
121
+	}
122
+	if detachedNetNS != "" && !osCreate {
123
+		// n refers to the host netns, and we do not have permission to perform netlink operations on it
124
+		return n, nil
125
+	}
126
+
118 127
 	sboxNs, err := netns.GetFromPath(n.path)
119 128
 	if err != nil {
120 129
 		return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
... ...
@@ -193,6 +203,7 @@ func createNetworkNamespace(path string, osCreate bool) error {
193 193
 	if osCreate {
194 194
 		return unshare.Go(unix.CLONE_NEWNET, do, nil)
195 195
 	}
196
+	// use host netns
196 197
 	return do()
197 198
 }
198 199
 
... ...
@@ -355,6 +366,7 @@ func (n *Namespace) InvokeFunc(f func()) error {
355 355
 		}
356 356
 		defer func() {
357 357
 			close(done)
358
+			rootless.UnmarkInSandboxNS()
358 359
 			if err := netns.Set(origNS); err != nil {
359 360
 				log.G(context.TODO()).WithError(err).Warn("failed to restore thread's network namespace")
360 361
 				// Recover from the error by leaving this goroutine locked to
... ...
@@ -364,6 +376,7 @@ func (n *Namespace) InvokeFunc(f func()) error {
364 364
 				runtime.UnlockOSThread()
365 365
 			}
366 366
 		}()
367
+		rootless.MarkInSandboxNS()
367 368
 		f()
368 369
 	}()
369 370
 	return <-done
... ...
@@ -14,6 +14,7 @@ import (
14 14
 	"time"
15 15
 
16 16
 	"github.com/containerd/log"
17
+	"github.com/moby/moby/v2/daemon/internal/rootless"
17 18
 	"github.com/moby/moby/v2/daemon/libnetwork/types"
18 19
 )
19 20
 
... ...
@@ -58,6 +59,18 @@ func StartProxy(pb types.PortBinding,
58 58
 		cmd.ExtraFiles = append(cmd.ExtraFiles, listenSock)
59 59
 	}
60 60
 
61
+	detachedNetNS, err := rootless.DetachedNetNS()
62
+	if err != nil {
63
+		return nil, err
64
+	}
65
+	if detachedNetNS != "" {
66
+		cmd.Path, err = exec.LookPath("nsenter")
67
+		if err != nil {
68
+			return nil, err
69
+		}
70
+		cmd.Args = append([]string{cmd.Path, "-n" + detachedNetNS, "-F", "--"}, cmd.Args...)
71
+	}
72
+
61 73
 	wait := make(chan error, 1)
62 74
 
63 75
 	// As p.cmd.SysProcAttr.Pdeathsig is set, the signal will be sent to the
... ...
@@ -280,13 +280,21 @@ func (daemon *Daemon) WaitForDetachment(ctx context.Context, networkName, networ
280 280
 
281 281
 // CreateManagedNetwork creates an agent network.
282 282
 func (daemon *Daemon) CreateManagedNetwork(create clustertypes.NetworkCreateRequest) error {
283
-	_, err := daemon.createNetwork(context.TODO(), &daemon.config().Config, create.CreateRequest, create.ID, true)
284
-	return err
283
+	return daemon.runInNetNS(func() error {
284
+		_, err := daemon.createNetwork(context.TODO(), &daemon.config().Config, create.CreateRequest, create.ID, true)
285
+		return err
286
+	})
285 287
 }
286 288
 
287 289
 // CreateNetwork creates a network with the given name, driver and other optional parameters
288 290
 func (daemon *Daemon) CreateNetwork(ctx context.Context, create networktypes.CreateRequest) (*networktypes.CreateResponse, error) {
289
-	return daemon.createNetwork(ctx, &daemon.config().Config, create, "", false)
291
+	var resp *networktypes.CreateResponse
292
+	err := daemon.runInNetNS(func() error {
293
+		var err error
294
+		resp, err = daemon.createNetwork(ctx, &daemon.config().Config, create, "", false)
295
+		return err
296
+	})
297
+	return resp, err
290 298
 }
291 299
 
292 300
 func (daemon *Daemon) createNetwork(ctx context.Context, cfg *config.Config, create networktypes.CreateRequest, id string, agent bool) (*networktypes.CreateResponse, error) {
... ...
@@ -623,20 +631,24 @@ func (daemon *Daemon) GetNetworkDriverList(ctx context.Context) []string {
623 623
 // DeleteManagedNetwork deletes an agent network.
624 624
 // The requirement of networkID is enforced.
625 625
 func (daemon *Daemon) DeleteManagedNetwork(networkID string) error {
626
-	n, err := daemon.GetNetworkByID(networkID)
627
-	if err != nil {
628
-		return err
629
-	}
630
-	return daemon.deleteNetwork(n, true)
626
+	return daemon.runInNetNS(func() error {
627
+		n, err := daemon.GetNetworkByID(networkID)
628
+		if err != nil {
629
+			return err
630
+		}
631
+		return daemon.deleteNetwork(n, true)
632
+	})
631 633
 }
632 634
 
633 635
 // DeleteNetwork destroys a network unless it's one of docker's predefined networks.
634 636
 func (daemon *Daemon) DeleteNetwork(networkID string) error {
635
-	n, err := daemon.GetNetworkByID(networkID)
636
-	if err != nil {
637
-		return fmt.Errorf("could not find network by ID: %w", err)
638
-	}
639
-	return daemon.deleteNetwork(n, false)
637
+	return daemon.runInNetNS(func() error {
638
+		n, err := daemon.GetNetworkByID(networkID)
639
+		if err != nil {
640
+			return fmt.Errorf("could not find network by ID: %w", err)
641
+		}
642
+		return daemon.deleteNetwork(n, false)
643
+	})
640 644
 }
641 645
 
642 646
 func (daemon *Daemon) deleteNetwork(nw *libnetwork.Network, dynamic bool) error {
... ...
@@ -18,6 +18,7 @@ import (
18 18
 	containertypes "github.com/moby/moby/api/types/container"
19 19
 	dconfig "github.com/moby/moby/v2/daemon/config"
20 20
 	"github.com/moby/moby/v2/daemon/container"
21
+	"github.com/moby/moby/v2/daemon/internal/rootless"
21 22
 	"github.com/moby/moby/v2/daemon/internal/rootless/mountopts"
22 23
 	"github.com/moby/moby/v2/daemon/internal/rootless/specconv"
23 24
 	"github.com/moby/moby/v2/daemon/pkg/oci"
... ...
@@ -126,6 +127,10 @@ func WithSelinux(c *container.Container) coci.SpecOpts {
126 126
 func WithApparmor(c *container.Container) coci.SpecOpts {
127 127
 	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
128 128
 		if apparmor.HostSupports() {
129
+			// AppArmor is inaccessible with detached-netns because sysfs is netns-scoped.
130
+			if detachedNetNS, _ := rootless.DetachedNetNS(); detachedNetNS != "" {
131
+				return nil
132
+			}
129 133
 			var appArmorProfile string
130 134
 			if c.AppArmorProfile != "" {
131 135
 				appArmorProfile = c.AppArmorProfile
... ...
@@ -34,7 +34,9 @@ func (daemon *Daemon) initializeCreatedTask(
34 34
 			return errdefs.System(err)
35 35
 		}
36 36
 	}
37
-	if err := daemon.allocateNetwork(ctx, cfg, ctr); err != nil {
37
+	if err := daemon.runInNetNS(func() error {
38
+		return daemon.allocateNetwork(ctx, cfg, ctr)
39
+	}); err != nil {
38 40
 		return fmt.Errorf("%s: %w", errSetupNetworking, err)
39 41
 	}
40 42
 	return nil