Browse code

vendor: github.com/opencontainers/runc v1.2.0

Removes the dependency on various libcontainer packages:

- github.com/opencontainers/runc/libcontainer/system
- github.com/opencontainers/runc/libcontainer/user
- github.com/opencontainers/runc/libcontainer/userns

full diff: https://github.com/opencontainers/runc/compare/v1.1.14..v1.2.0

Signed-off-by: Sebastiaan van Stijn <github@gone.nl>

Sebastiaan van Stijn authored on 2024/04/03 21:25:31
Showing 42 changed files
... ...
@@ -81,7 +81,7 @@ require (
81 81
 	github.com/morikuni/aec v1.0.0
82 82
 	github.com/opencontainers/go-digest v1.0.0
83 83
 	github.com/opencontainers/image-spec v1.1.0
84
-	github.com/opencontainers/runc v1.1.14
84
+	github.com/opencontainers/runc v1.2.0
85 85
 	github.com/opencontainers/runtime-spec v1.2.0
86 86
 	github.com/opencontainers/selinux v1.11.1
87 87
 	github.com/pelletier/go-toml v1.9.5
... ...
@@ -422,8 +422,8 @@ github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8
422 422
 github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
423 423
 github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug=
424 424
 github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM=
425
-github.com/opencontainers/runc v1.1.14 h1:rgSuzbmgz5DUJjeSnw337TxDbRuqjs6iqQck/2weR6w=
426
-github.com/opencontainers/runc v1.1.14/go.mod h1:E4C2z+7BxR7GHXp0hAY53mek+x49X1LjPNeMTfRGvOA=
425
+github.com/opencontainers/runc v1.2.0 h1:qke7ZVCmJcKrJVY2iHJVC+0kql9uYdkusOPsQOOeBw4=
426
+github.com/opencontainers/runc v1.2.0/go.mod h1:/PXzF0h531HTMsYQnmxXkBD7YaGShm/2zcRB79dksUc=
427 427
 github.com/opencontainers/runtime-spec v1.0.3-0.20220825212826-86290f6a00fb/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
428 428
 github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk=
429 429
 github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
... ...
@@ -8,9 +8,9 @@ The following is courtesy of our legal counsel:
8 8
 
9 9
 
10 10
 Use and transfer of Docker may be subject to certain restrictions by the
11
-United States and other governments.  
11
+United States and other governments.
12 12
 It is your responsibility to ensure that your use and/or transfer does not
13
-violate applicable laws. 
13
+violate applicable laws.
14 14
 
15 15
 For more information, please see http://www.bis.doc.gov
16 16
 
... ...
@@ -1,9 +1,30 @@
1 1
 package cgroups
2 2
 
3 3
 import (
4
+	"errors"
5
+
4 6
 	"github.com/opencontainers/runc/libcontainer/configs"
5 7
 )
6 8
 
9
+var (
10
+	// ErrDevicesUnsupported is an error returned when a cgroup manager
11
+	// is not configured to set device rules.
12
+	ErrDevicesUnsupported = errors.New("cgroup manager is not configured to set device rules")
13
+
14
+	// ErrRootless is returned by [Manager.Apply] when there is an error
15
+	// creating cgroup directory, and cgroup.Rootless is set. In general,
16
+	// this error is to be ignored.
17
+	ErrRootless = errors.New("cgroup manager can not access cgroup (rootless container)")
18
+
19
+	// DevicesSetV1 and DevicesSetV2 are functions to set devices for
20
+	// cgroup v1 and v2, respectively. Unless
21
+	// [github.com/opencontainers/runc/libcontainer/cgroups/devices]
22
+	// package is imported, it is set to nil, so cgroup managers can't
23
+	// manage devices.
24
+	DevicesSetV1 func(path string, r *configs.Resources) error
25
+	DevicesSetV2 func(path string, r *configs.Resources) error
26
+)
27
+
7 28
 type Manager interface {
8 29
 	// Apply creates a cgroup, if not yet created, and adds a process
9 30
 	// with the specified pid into that cgroup.  A special value of -1
... ...
@@ -50,22 +50,45 @@ func WriteFile(dir, file, data string) error {
50 50
 		return err
51 51
 	}
52 52
 	defer fd.Close()
53
-	if err := retryingWriteFile(fd, data); err != nil {
53
+	if _, err := fd.WriteString(data); err != nil {
54 54
 		// Having data in the error message helps in debugging.
55 55
 		return fmt.Errorf("failed to write %q: %w", data, err)
56 56
 	}
57 57
 	return nil
58 58
 }
59 59
 
60
-func retryingWriteFile(fd *os.File, data string) error {
60
+// WriteFileByLine is the same as WriteFile, except if data contains newlines,
61
+// it is written line by line.
62
+func WriteFileByLine(dir, file, data string) error {
63
+	i := strings.Index(data, "\n")
64
+	if i == -1 {
65
+		return WriteFile(dir, file, data)
66
+	}
67
+
68
+	fd, err := OpenFile(dir, file, unix.O_WRONLY)
69
+	if err != nil {
70
+		return err
71
+	}
72
+	defer fd.Close()
73
+	start := 0
61 74
 	for {
62
-		_, err := fd.Write([]byte(data))
63
-		if errors.Is(err, unix.EINTR) {
64
-			logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
65
-			continue
75
+		var line string
76
+		if i == -1 {
77
+			line = data[start:]
78
+		} else {
79
+			line = data[start : start+i+1]
66 80
 		}
67
-		return err
81
+		_, err := fd.WriteString(line)
82
+		if err != nil {
83
+			return fmt.Errorf("failed to write %q: %w", line, err)
84
+		}
85
+		if i == -1 {
86
+			break
87
+		}
88
+		start += i + 1
89
+		i = strings.Index(data[start:], "\n")
68 90
 	}
91
+	return nil
69 92
 }
70 93
 
71 94
 const (
... ...
@@ -90,7 +113,7 @@ func prepareOpenat2() error {
90 90
 		})
91 91
 		if err != nil {
92 92
 			prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
93
-			if err != unix.ENOSYS { //nolint:errorlint // unix errors are bare
93
+			if err != unix.ENOSYS {
94 94
 				logrus.Warnf("falling back to securejoin: %s", prepErr)
95 95
 			} else {
96 96
 				logrus.Debug("openat2 not available, falling back to securejoin")
... ...
@@ -148,8 +171,9 @@ func openFile(dir, file string, flags int) (*os.File, error) {
148 148
 		//
149 149
 		// TODO: if such usage will ever be common, amend this
150 150
 		// to reopen cgroupRootHandle and retry openat2.
151
-		fdStr := strconv.Itoa(int(cgroupRootHandle.Fd()))
152
-		fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr)
151
+		fdPath, closer := utils.ProcThreadSelf("fd/" + strconv.Itoa(int(cgroupRootHandle.Fd())))
152
+		defer closer()
153
+		fdDest, _ := os.Readlink(fdPath)
153 154
 		if fdDest != cgroupfsDir {
154 155
 			// Wrap the error so it is clear that cgroupRootHandle
155 156
 			// is opened to an unexpected/wrong directory.
... ...
@@ -32,9 +32,22 @@ type CpuUsage struct {
32 32
 	UsageInUsermode uint64 `json:"usage_in_usermode"`
33 33
 }
34 34
 
35
+type PSIData struct {
36
+	Avg10  float64 `json:"avg10"`
37
+	Avg60  float64 `json:"avg60"`
38
+	Avg300 float64 `json:"avg300"`
39
+	Total  uint64  `json:"total"`
40
+}
41
+
42
+type PSIStats struct {
43
+	Some PSIData `json:"some,omitempty"`
44
+	Full PSIData `json:"full,omitempty"`
45
+}
46
+
35 47
 type CpuStats struct {
36 48
 	CpuUsage       CpuUsage       `json:"cpu_usage,omitempty"`
37 49
 	ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
50
+	PSI            *PSIStats      `json:"psi,omitempty"`
38 51
 }
39 52
 
40 53
 type CPUSetStats struct {
... ...
@@ -91,6 +104,7 @@ type MemoryStats struct {
91 91
 	UseHierarchy bool `json:"use_hierarchy"`
92 92
 
93 93
 	Stats map[string]uint64 `json:"stats,omitempty"`
94
+	PSI   *PSIStats         `json:"psi,omitempty"`
94 95
 }
95 96
 
96 97
 type PageUsageByNUMA struct {
... ...
@@ -135,6 +149,7 @@ type BlkioStats struct {
135 135
 	IoMergedRecursive       []BlkioStatEntry `json:"io_merged_recursive,omitempty"`
136 136
 	IoTimeRecursive         []BlkioStatEntry `json:"io_time_recursive,omitempty"`
137 137
 	SectorsRecursive        []BlkioStatEntry `json:"sectors_recursive,omitempty"`
138
+	PSI                     *PSIStats        `json:"psi,omitempty"`
138 139
 }
139 140
 
140 141
 type HugetlbStats struct {
... ...
@@ -157,6 +172,13 @@ type RdmaStats struct {
157 157
 	RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"`
158 158
 }
159 159
 
160
+type MiscStats struct {
161
+	// current resource usage for a key in misc
162
+	Usage uint64 `json:"usage,omitempty"`
163
+	// number of times the resource usage was about to go over the max boundary
164
+	Events uint64 `json:"events,omitempty"`
165
+}
166
+
160 167
 type Stats struct {
161 168
 	CpuStats    CpuStats    `json:"cpu_stats,omitempty"`
162 169
 	CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"`
... ...
@@ -166,10 +188,13 @@ type Stats struct {
166 166
 	// the map is in the format "size of hugepage: stats of the hugepage"
167 167
 	HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
168 168
 	RdmaStats    RdmaStats               `json:"rdma_stats,omitempty"`
169
+	// the map is in the format "misc resource name: stats of the key"
170
+	MiscStats map[string]MiscStats `json:"misc_stats,omitempty"`
169 171
 }
170 172
 
171 173
 func NewStats() *Stats {
172 174
 	memoryStats := MemoryStats{Stats: make(map[string]uint64)}
173 175
 	hugetlbStats := make(map[string]HugetlbStats)
174
-	return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats}
176
+	miscStats := make(map[string]MiscStats)
177
+	return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats, MiscStats: miscStats}
175 178
 }
... ...
@@ -12,7 +12,7 @@ import (
12 12
 	"sync"
13 13
 	"time"
14 14
 
15
-	"github.com/opencontainers/runc/libcontainer/userns"
15
+	"github.com/moby/sys/userns"
16 16
 	"github.com/sirupsen/logrus"
17 17
 	"golang.org/x/sys/unix"
18 18
 )
... ...
@@ -36,13 +36,13 @@ func IsCgroup2UnifiedMode() bool {
36 36
 		var st unix.Statfs_t
37 37
 		err := unix.Statfs(unifiedMountpoint, &st)
38 38
 		if err != nil {
39
+			level := logrus.WarnLevel
39 40
 			if os.IsNotExist(err) && userns.RunningInUserNS() {
40
-				// ignore the "not found" error if running in userns
41
-				logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
42
-				isUnified = false
43
-				return
41
+				// For rootless containers, sweep it under the rug.
42
+				level = logrus.DebugLevel
44 43
 			}
45
-			panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
44
+			logrus.StandardLogger().Logf(level,
45
+				"statfs %s: %v; assuming cgroup v1", unifiedMountpoint, err)
46 46
 		}
47 47
 		isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
48 48
 	})
... ...
@@ -136,18 +136,18 @@ func GetAllSubsystems() ([]string, error) {
136 136
 	return subsystems, nil
137 137
 }
138 138
 
139
-func readProcsFile(dir string) ([]int, error) {
140
-	f, err := OpenFile(dir, CgroupProcesses, os.O_RDONLY)
139
+func readProcsFile(dir string) (out []int, _ error) {
140
+	file := CgroupProcesses
141
+	retry := true
142
+
143
+again:
144
+	f, err := OpenFile(dir, file, os.O_RDONLY)
141 145
 	if err != nil {
142 146
 		return nil, err
143 147
 	}
144 148
 	defer f.Close()
145 149
 
146
-	var (
147
-		s   = bufio.NewScanner(f)
148
-		out = []int{}
149
-	)
150
-
150
+	s := bufio.NewScanner(f)
151 151
 	for s.Scan() {
152 152
 		if t := s.Text(); t != "" {
153 153
 			pid, err := strconv.Atoi(t)
... ...
@@ -157,6 +157,13 @@ func readProcsFile(dir string) ([]int, error) {
157 157
 			out = append(out, pid)
158 158
 		}
159 159
 	}
160
+	if errors.Is(s.Err(), unix.ENOTSUP) && retry {
161
+		// For a threaded cgroup, read returns ENOTSUP, and we should
162
+		// read from cgroup.threads instead.
163
+		file = "cgroup.threads"
164
+		retry = false
165
+		goto again
166
+	}
160 167
 	return out, s.Err()
161 168
 }
162 169
 
... ...
@@ -217,21 +224,26 @@ func PathExists(path string) bool {
217 217
 	return true
218 218
 }
219 219
 
220
-func EnterPid(cgroupPaths map[string]string, pid int) error {
221
-	for _, path := range cgroupPaths {
222
-		if PathExists(path) {
223
-			if err := WriteCgroupProc(path, pid); err != nil {
224
-				return err
225
-			}
226
-		}
227
-	}
228
-	return nil
229
-}
220
+// rmdir tries to remove a directory, optionally retrying on EBUSY.
221
+func rmdir(path string, retry bool) error {
222
+	delay := time.Millisecond
223
+	tries := 10
230 224
 
231
-func rmdir(path string) error {
225
+again:
232 226
 	err := unix.Rmdir(path)
233
-	if err == nil || err == unix.ENOENT { //nolint:errorlint // unix errors are bare
227
+	switch err { // nolint:errorlint // unix errors are bare
228
+	case nil, unix.ENOENT:
234 229
 		return nil
230
+	case unix.EINTR:
231
+		goto again
232
+	case unix.EBUSY:
233
+		if retry && tries > 0 {
234
+			time.Sleep(delay)
235
+			delay *= 2
236
+			tries--
237
+			goto again
238
+
239
+		}
235 240
 	}
236 241
 	return &os.PathError{Op: "rmdir", Path: path, Err: err}
237 242
 }
... ...
@@ -239,68 +251,40 @@ func rmdir(path string) error {
239 239
 // RemovePath aims to remove cgroup path. It does so recursively,
240 240
 // by removing any subdirectories (sub-cgroups) first.
241 241
 func RemovePath(path string) error {
242
-	// try the fast path first
243
-	if err := rmdir(path); err == nil {
242
+	// Try the fast path first.
243
+	if err := rmdir(path, false); err == nil {
244 244
 		return nil
245 245
 	}
246 246
 
247 247
 	infos, err := os.ReadDir(path)
248
-	if err != nil {
249
-		if os.IsNotExist(err) {
250
-			err = nil
251
-		}
248
+	if err != nil && !os.IsNotExist(err) {
252 249
 		return err
253 250
 	}
254 251
 	for _, info := range infos {
255 252
 		if info.IsDir() {
256
-			// We should remove subcgroups dir first
253
+			// We should remove subcgroup first.
257 254
 			if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
258 255
 				break
259 256
 			}
260 257
 		}
261 258
 	}
262 259
 	if err == nil {
263
-		err = rmdir(path)
260
+		err = rmdir(path, true)
264 261
 	}
265 262
 	return err
266 263
 }
267 264
 
268 265
 // RemovePaths iterates over the provided paths removing them.
269
-// We trying to remove all paths five times with increasing delay between tries.
270
-// If after all there are not removed cgroups - appropriate error will be
271
-// returned.
272 266
 func RemovePaths(paths map[string]string) (err error) {
273
-	const retries = 5
274
-	delay := 10 * time.Millisecond
275
-	for i := 0; i < retries; i++ {
276
-		if i != 0 {
277
-			time.Sleep(delay)
278
-			delay *= 2
279
-		}
280
-		for s, p := range paths {
281
-			if err := RemovePath(p); err != nil {
282
-				// do not log intermediate iterations
283
-				switch i {
284
-				case 0:
285
-					logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
286
-				case retries - 1:
287
-					logrus.WithError(err).Error("Failed to remove cgroup")
288
-				}
289
-			}
290
-			_, err := os.Stat(p)
291
-			// We need this strange way of checking cgroups existence because
292
-			// RemoveAll almost always returns error, even on already removed
293
-			// cgroups
294
-			if os.IsNotExist(err) {
295
-				delete(paths, s)
296
-			}
297
-		}
298
-		if len(paths) == 0 {
299
-			//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
300
-			paths = make(map[string]string)
301
-			return nil
267
+	for s, p := range paths {
268
+		if err := RemovePath(p); err == nil {
269
+			delete(paths, s)
302 270
 		}
303 271
 	}
272
+	if len(paths) == 0 {
273
+		clear(paths)
274
+		return nil
275
+	}
304 276
 	return fmt.Errorf("Failed to remove paths: %v", paths)
305 277
 }
306 278
 
... ...
@@ -99,11 +99,12 @@ func tryDefaultPath(cgroupPath, subsystem string) string {
99 99
 // expensive), so it is assumed that cgroup mounts are not being changed.
100 100
 func readCgroupMountinfo() ([]*mountinfo.Info, error) {
101 101
 	readMountinfoOnce.Do(func() {
102
+		// mountinfo.GetMounts uses /proc/thread-self, so we can use it without
103
+		// issues.
102 104
 		cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts(
103 105
 			mountinfo.FSTypeFilter("cgroup"),
104 106
 		)
105 107
 	})
106
-
107 108
 	return cgroupMountinfo, readMountinfoErr
108 109
 }
109 110
 
... ...
@@ -196,6 +197,9 @@ func getCgroupMountsV1(all bool) ([]Mount, error) {
196 196
 		return nil, err
197 197
 	}
198 198
 
199
+	// We don't need to use /proc/thread-self here because runc always runs
200
+	// with every thread in the same cgroup. This lets us avoid having to do
201
+	// runtime.LockOSThread.
199 202
 	allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
200 203
 	if err != nil {
201 204
 		return nil, err
... ...
@@ -214,6 +218,10 @@ func GetOwnCgroup(subsystem string) (string, error) {
214 214
 	if IsCgroup2UnifiedMode() {
215 215
 		return "", errUnified
216 216
 	}
217
+
218
+	// We don't need to use /proc/thread-self here because runc always runs
219
+	// with every thread in the same cgroup. This lets us avoid having to do
220
+	// runtime.LockOSThread.
217 221
 	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
218 222
 	if err != nil {
219 223
 		return "", err
... ...
@@ -236,27 +244,6 @@ func GetOwnCgroupPath(subsystem string) (string, error) {
236 236
 	return getCgroupPathHelper(subsystem, cgroup)
237 237
 }
238 238
 
239
-func GetInitCgroup(subsystem string) (string, error) {
240
-	if IsCgroup2UnifiedMode() {
241
-		return "", errUnified
242
-	}
243
-	cgroups, err := ParseCgroupFile("/proc/1/cgroup")
244
-	if err != nil {
245
-		return "", err
246
-	}
247
-
248
-	return getControllerPath(subsystem, cgroups)
249
-}
250
-
251
-func GetInitCgroupPath(subsystem string) (string, error) {
252
-	cgroup, err := GetInitCgroup(subsystem)
253
-	if err != nil {
254
-		return "", err
255
-	}
256
-
257
-	return getCgroupPathHelper(subsystem, cgroup)
258
-}
259
-
260 239
 func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
261 240
 	mnt, root, err := FindCgroupMountpointAndRoot("", subsystem)
262 241
 	if err != nil {
... ...
@@ -2,8 +2,8 @@ package configs
2 2
 
3 3
 import "fmt"
4 4
 
5
-// blockIODevice holds major:minor format supported in blkio cgroup
6
-type blockIODevice struct {
5
+// BlockIODevice holds major:minor format supported in blkio cgroup.
6
+type BlockIODevice struct {
7 7
 	// Major is the device's major number
8 8
 	Major int64 `json:"major"`
9 9
 	// Minor is the device's minor number
... ...
@@ -12,7 +12,7 @@ type blockIODevice struct {
12 12
 
13 13
 // WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair
14 14
 type WeightDevice struct {
15
-	blockIODevice
15
+	BlockIODevice
16 16
 	// Weight is the bandwidth rate for the device, range is from 10 to 1000
17 17
 	Weight uint16 `json:"weight"`
18 18
 	// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
... ...
@@ -41,7 +41,7 @@ func (wd *WeightDevice) LeafWeightString() string {
41 41
 
42 42
 // ThrottleDevice struct holds a `major:minor rate_per_second` pair
43 43
 type ThrottleDevice struct {
44
-	blockIODevice
44
+	BlockIODevice
45 45
 	// Rate is the IO rate limit per cgroup per device
46 46
 	Rate uint64 `json:"rate"`
47 47
 }
... ...
@@ -69,6 +69,9 @@ type Resources struct {
69 69
 	// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
70 70
 	CpuQuota int64 `json:"cpu_quota"`
71 71
 
72
+	// CPU hardcap burst limit (in usecs). Allowed accumulated cpu time additionally for burst in a given period.
73
+	CpuBurst *uint64 `json:"cpu_burst"` //nolint:revive
74
+
72 75
 	// CPU period to be used for hardcapping (in usecs). 0 to use system default.
73 76
 	CpuPeriod uint64 `json:"cpu_period"`
74 77
 
... ...
@@ -84,6 +87,9 @@ type Resources struct {
84 84
 	// MEM to use
85 85
 	CpusetMems string `json:"cpuset_mems"`
86 86
 
87
+	// cgroup SCHED_IDLE
88
+	CPUIdle *int64 `json:"cpu_idle,omitempty"`
89
+
87 90
 	// Process limit; set <= `0' to disable limit.
88 91
 	PidsLimit int64 `json:"pids_limit"`
89 92
 
... ...
@@ -155,4 +161,9 @@ type Resources struct {
155 155
 	// during Set() to figure out whether the freeze is required. Those
156 156
 	// methods may be relatively slow, thus this flag.
157 157
 	SkipFreezeOnSet bool `json:"-"`
158
+
159
+	// MemoryCheckBeforeUpdate is a flag for cgroup v2 managers to check
160
+	// if the new memory limits (Memory and MemorySwap) being set are lower
161
+	// than the current memory usage, and reject if so.
162
+	MemoryCheckBeforeUpdate bool `json:"memory_check_before_update"`
158 163
 }
... ...
@@ -1,5 +1,4 @@
1 1
 //go:build !linux
2
-// +build !linux
3 2
 
4 3
 package configs
5 4
 
... ...
@@ -8,6 +8,7 @@ import (
8 8
 	"time"
9 9
 
10 10
 	"github.com/sirupsen/logrus"
11
+	"golang.org/x/sys/unix"
11 12
 
12 13
 	"github.com/opencontainers/runc/libcontainer/devices"
13 14
 	"github.com/opencontainers/runtime-spec/specs-go"
... ...
@@ -31,12 +32,13 @@ type IDMap struct {
31 31
 // for syscalls. Additional architectures can be added by specifying them in
32 32
 // Architectures.
33 33
 type Seccomp struct {
34
-	DefaultAction    Action     `json:"default_action"`
35
-	Architectures    []string   `json:"architectures"`
36
-	Syscalls         []*Syscall `json:"syscalls"`
37
-	DefaultErrnoRet  *uint      `json:"default_errno_ret"`
38
-	ListenerPath     string     `json:"listener_path,omitempty"`
39
-	ListenerMetadata string     `json:"listener_metadata,omitempty"`
34
+	DefaultAction    Action                   `json:"default_action"`
35
+	Architectures    []string                 `json:"architectures"`
36
+	Flags            []specs.LinuxSeccompFlag `json:"flags"`
37
+	Syscalls         []*Syscall               `json:"syscalls"`
38
+	DefaultErrnoRet  *uint                    `json:"default_errno_ret"`
39
+	ListenerPath     string                   `json:"listener_path,omitempty"`
40
+	ListenerMetadata string                   `json:"listener_metadata,omitempty"`
40 41
 }
41 42
 
42 43
 // Action is taken upon rule match in Seccomp
... ...
@@ -83,9 +85,6 @@ type Syscall struct {
83 83
 	Args     []*Arg `json:"args"`
84 84
 }
85 85
 
86
-// TODO Windows. Many of these fields should be factored out into those parts
87
-// which are common across platforms, and those which are platform specific.
88
-
89 86
 // Config defines configuration options for executing a process inside a contained environment.
90 87
 type Config struct {
91 88
 	// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs
... ...
@@ -121,6 +120,9 @@ type Config struct {
121 121
 	// Hostname optionally sets the container's hostname if provided
122 122
 	Hostname string `json:"hostname"`
123 123
 
124
+	// Domainname optionally sets the container's domainname if provided
125
+	Domainname string `json:"domainname"`
126
+
124 127
 	// Namespaces specifies the container's namespaces that it should setup when cloning the init process
125 128
 	// If a namespace is not provided that namespace is shared from the container's parent process
126 129
 	Namespaces Namespaces `json:"namespaces"`
... ...
@@ -158,11 +160,11 @@ type Config struct {
158 158
 	// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
159 159
 	OomScoreAdj *int `json:"oom_score_adj,omitempty"`
160 160
 
161
-	// UidMappings is an array of User ID mappings for User Namespaces
162
-	UidMappings []IDMap `json:"uid_mappings"`
161
+	// UIDMappings is an array of User ID mappings for User Namespaces
162
+	UIDMappings []IDMap `json:"uid_mappings"`
163 163
 
164
-	// GidMappings is an array of Group ID mappings for User Namespaces
165
-	GidMappings []IDMap `json:"gid_mappings"`
164
+	// GIDMappings is an array of Group ID mappings for User Namespaces
165
+	GIDMappings []IDMap `json:"gid_mappings"`
166 166
 
167 167
 	// MaskPaths specifies paths within the container's rootfs to mask over with a bind
168 168
 	// mount pointing to /dev/null as to prevent reads of the file.
... ...
@@ -211,8 +213,87 @@ type Config struct {
211 211
 	// RootlessCgroups is set when unlikely to have the full access to cgroups.
212 212
 	// When RootlessCgroups is set, cgroups errors are ignored.
213 213
 	RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
214
+
215
+	// TimeOffsets specifies the offset for supporting time namespaces.
216
+	TimeOffsets map[string]specs.LinuxTimeOffset `json:"time_offsets,omitempty"`
217
+
218
+	// Scheduler represents the scheduling attributes for a process.
219
+	Scheduler *Scheduler `json:"scheduler,omitempty"`
220
+
221
+	// Personality contains configuration for the Linux personality syscall.
222
+	Personality *LinuxPersonality `json:"personality,omitempty"`
223
+
224
+	// IOPriority is the container's I/O priority.
225
+	IOPriority *IOPriority `json:"io_priority,omitempty"`
214 226
 }
215 227
 
228
+// Scheduler is based on the Linux sched_setattr(2) syscall.
229
+type Scheduler = specs.Scheduler
230
+
231
+// ToSchedAttr is to convert *configs.Scheduler to *unix.SchedAttr.
232
+func ToSchedAttr(scheduler *Scheduler) (*unix.SchedAttr, error) {
233
+	var policy uint32
234
+	switch scheduler.Policy {
235
+	case specs.SchedOther:
236
+		policy = 0
237
+	case specs.SchedFIFO:
238
+		policy = 1
239
+	case specs.SchedRR:
240
+		policy = 2
241
+	case specs.SchedBatch:
242
+		policy = 3
243
+	case specs.SchedISO:
244
+		policy = 4
245
+	case specs.SchedIdle:
246
+		policy = 5
247
+	case specs.SchedDeadline:
248
+		policy = 6
249
+	default:
250
+		return nil, fmt.Errorf("invalid scheduler policy: %s", scheduler.Policy)
251
+	}
252
+
253
+	var flags uint64
254
+	for _, flag := range scheduler.Flags {
255
+		switch flag {
256
+		case specs.SchedFlagResetOnFork:
257
+			flags |= 0x01
258
+		case specs.SchedFlagReclaim:
259
+			flags |= 0x02
260
+		case specs.SchedFlagDLOverrun:
261
+			flags |= 0x04
262
+		case specs.SchedFlagKeepPolicy:
263
+			flags |= 0x08
264
+		case specs.SchedFlagKeepParams:
265
+			flags |= 0x10
266
+		case specs.SchedFlagUtilClampMin:
267
+			flags |= 0x20
268
+		case specs.SchedFlagUtilClampMax:
269
+			flags |= 0x40
270
+		default:
271
+			return nil, fmt.Errorf("invalid scheduler flag: %s", flag)
272
+		}
273
+	}
274
+
275
+	return &unix.SchedAttr{
276
+		Size:     unix.SizeofSchedAttr,
277
+		Policy:   policy,
278
+		Flags:    flags,
279
+		Nice:     scheduler.Nice,
280
+		Priority: uint32(scheduler.Priority),
281
+		Runtime:  scheduler.Runtime,
282
+		Deadline: scheduler.Deadline,
283
+		Period:   scheduler.Period,
284
+	}, nil
285
+}
286
+
287
+var IOPrioClassMapping = map[specs.IOPriorityClass]int{
288
+	specs.IOPRIO_CLASS_RT:   1,
289
+	specs.IOPRIO_CLASS_BE:   2,
290
+	specs.IOPRIO_CLASS_IDLE: 3,
291
+}
292
+
293
+type IOPriority = specs.LinuxIOPriority
294
+
216 295
 type (
217 296
 	HookName string
218 297
 	HookList []Hook
... ...
@@ -277,6 +358,7 @@ type Capabilities struct {
277 277
 	Ambient []string
278 278
 }
279 279
 
280
+// Deprecated: use (Hooks).Run instead.
280 281
 func (hooks HookList) RunHooks(state *specs.State) error {
281 282
 	for i, h := range hooks {
282 283
 		if err := h.Run(state); err != nil {
... ...
@@ -333,6 +415,18 @@ func (hooks *Hooks) MarshalJSON() ([]byte, error) {
333 333
 	})
334 334
 }
335 335
 
336
+// Run executes all hooks for the given hook name.
337
+func (hooks Hooks) Run(name HookName, state *specs.State) error {
338
+	list := hooks[name]
339
+	for i, h := range list {
340
+		if err := h.Run(state); err != nil {
341
+			return fmt.Errorf("error running %s hook #%d: %w", name, i, err)
342
+		}
343
+	}
344
+
345
+	return nil
346
+}
347
+
336 348
 type Hook interface {
337 349
 	// Run executes the hook with the provided state.
338 350
 	Run(*specs.State) error
... ...
@@ -393,7 +487,7 @@ func (c Command) Run(s *specs.State) error {
393 393
 	go func() {
394 394
 		err := cmd.Wait()
395 395
 		if err != nil {
396
-			err = fmt.Errorf("error running hook: %w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
396
+			err = fmt.Errorf("%w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
397 397
 		}
398 398
 		errC <- err
399 399
 	}()
... ...
@@ -7,22 +7,33 @@ import (
7 7
 )
8 8
 
9 9
 var (
10
-	errNoUIDMap   = errors.New("User namespaces enabled, but no uid mappings found.")
11
-	errNoUserMap  = errors.New("User namespaces enabled, but no user mapping found.")
12
-	errNoGIDMap   = errors.New("User namespaces enabled, but no gid mappings found.")
13
-	errNoGroupMap = errors.New("User namespaces enabled, but no group mapping found.")
10
+	errNoUIDMap = errors.New("user namespaces enabled, but no uid mappings found")
11
+	errNoGIDMap = errors.New("user namespaces enabled, but no gid mappings found")
14 12
 )
15 13
 
14
+// Please check https://man7.org/linux/man-pages/man2/personality.2.html for const details.
15
+// https://raw.githubusercontent.com/torvalds/linux/master/include/uapi/linux/personality.h
16
+const (
17
+	PerLinux   = 0x0000
18
+	PerLinux32 = 0x0008
19
+)
20
+
21
+type LinuxPersonality struct {
22
+	// Domain for the personality
23
+	// can only contain values "LINUX" and "LINUX32"
24
+	Domain int `json:"domain"`
25
+}
26
+
16 27
 // HostUID gets the translated uid for the process on host which could be
17 28
 // different when user namespaces are enabled.
18 29
 func (c Config) HostUID(containerId int) (int, error) {
19 30
 	if c.Namespaces.Contains(NEWUSER) {
20
-		if c.UidMappings == nil {
31
+		if len(c.UIDMappings) == 0 {
21 32
 			return -1, errNoUIDMap
22 33
 		}
23
-		id, found := c.hostIDFromMapping(int64(containerId), c.UidMappings)
34
+		id, found := c.hostIDFromMapping(int64(containerId), c.UIDMappings)
24 35
 		if !found {
25
-			return -1, errNoUserMap
36
+			return -1, fmt.Errorf("user namespaces enabled, but no mapping found for uid %d", containerId)
26 37
 		}
27 38
 		// If we are a 32-bit binary running on a 64-bit system, it's possible
28 39
 		// the mapped user is too large to store in an int, which means we
... ...
@@ -47,12 +58,12 @@ func (c Config) HostRootUID() (int, error) {
47 47
 // different when user namespaces are enabled.
48 48
 func (c Config) HostGID(containerId int) (int, error) {
49 49
 	if c.Namespaces.Contains(NEWUSER) {
50
-		if c.GidMappings == nil {
50
+		if len(c.GIDMappings) == 0 {
51 51
 			return -1, errNoGIDMap
52 52
 		}
53
-		id, found := c.hostIDFromMapping(int64(containerId), c.GidMappings)
53
+		id, found := c.hostIDFromMapping(int64(containerId), c.GIDMappings)
54 54
 		if !found {
55
-			return -1, errNoGroupMap
55
+			return -1, fmt.Errorf("user namespaces enabled, but no mapping found for gid %d", containerId)
56 56
 		}
57 57
 		// If we are a 32-bit binary running on a 64-bit system, it's possible
58 58
 		// the mapped user is too large to store in an int, which means we
... ...
@@ -1,5 +1,4 @@
1 1
 //go:build gofuzz
2
-// +build gofuzz
3 2
 
4 3
 package configs
5 4
 
... ...
@@ -1,48 +1,7 @@
1 1
 package configs
2 2
 
3
-import "golang.org/x/sys/unix"
4
-
5 3
 const (
6 4
 	// EXT_COPYUP is a directive to copy up the contents of a directory when
7 5
 	// a tmpfs is mounted over it.
8
-	EXT_COPYUP = 1 << iota //nolint:golint // ignore "don't use ALL_CAPS" warning
6
+	EXT_COPYUP = 1 << iota //nolint:golint,revive // ignore "don't use ALL_CAPS" warning
9 7
 )
10
-
11
-type Mount struct {
12
-	// Source path for the mount.
13
-	Source string `json:"source"`
14
-
15
-	// Destination path for the mount inside the container.
16
-	Destination string `json:"destination"`
17
-
18
-	// Device the mount is for.
19
-	Device string `json:"device"`
20
-
21
-	// Mount flags.
22
-	Flags int `json:"flags"`
23
-
24
-	// Propagation Flags
25
-	PropagationFlags []int `json:"propagation_flags"`
26
-
27
-	// Mount data applied to the mount.
28
-	Data string `json:"data"`
29
-
30
-	// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
31
-	Relabel string `json:"relabel"`
32
-
33
-	// RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2).
34
-	RecAttr *unix.MountAttr `json:"rec_attr"`
35
-
36
-	// Extensions are additional flags that are specific to runc.
37
-	Extensions int `json:"extensions"`
38
-
39
-	// Optional Command to be run before Source is mounted.
40
-	PremountCmds []Command `json:"premount_cmds"`
41
-
42
-	// Optional Command to be run after Source is mounted.
43
-	PostmountCmds []Command `json:"postmount_cmds"`
44
-}
45
-
46
-func (m *Mount) IsBind() bool {
47
-	return m.Flags&unix.MS_BIND != 0
48
-}
49 8
new file mode 100644
... ...
@@ -0,0 +1,66 @@
0
+package configs
1
+
2
+import "golang.org/x/sys/unix"
3
+
4
+type MountIDMapping struct {
5
+	// Recursive indicates if the mapping needs to be recursive.
6
+	Recursive bool `json:"recursive"`
7
+
8
+	// UserNSPath is a path to a user namespace that indicates the necessary
9
+	// id-mappings for MOUNT_ATTR_IDMAP. If set to non-"", UIDMappings and
10
+	// GIDMappings must be set to nil.
11
+	UserNSPath string `json:"userns_path,omitempty"`
12
+
13
+	// UIDMappings is the uid mapping set for this mount, to be used with
14
+	// MOUNT_ATTR_IDMAP.
15
+	UIDMappings []IDMap `json:"uid_mappings,omitempty"`
16
+
17
+	// GIDMappings is the gid mapping set for this mount, to be used with
18
+	// MOUNT_ATTR_IDMAP.
19
+	GIDMappings []IDMap `json:"gid_mappings,omitempty"`
20
+}
21
+
22
+type Mount struct {
23
+	// Source path for the mount.
24
+	Source string `json:"source"`
25
+
26
+	// Destination path for the mount inside the container.
27
+	Destination string `json:"destination"`
28
+
29
+	// Device the mount is for.
30
+	Device string `json:"device"`
31
+
32
+	// Mount flags.
33
+	Flags int `json:"flags"`
34
+
35
+	// Mount flags that were explicitly cleared in the configuration (meaning
36
+	// the user explicitly requested that these flags *not* be set).
37
+	ClearedFlags int `json:"cleared_flags"`
38
+
39
+	// Propagation Flags
40
+	PropagationFlags []int `json:"propagation_flags"`
41
+
42
+	// Mount data applied to the mount.
43
+	Data string `json:"data"`
44
+
45
+	// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
46
+	Relabel string `json:"relabel"`
47
+
48
+	// RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2).
49
+	RecAttr *unix.MountAttr `json:"rec_attr"`
50
+
51
+	// Extensions are additional flags that are specific to runc.
52
+	Extensions int `json:"extensions"`
53
+
54
+	// IDMapping is the MOUNT_ATTR_IDMAP configuration for the mount. If non-nil,
55
+	// the mount is configured to use MOUNT_ATTR_IDMAP-style id mappings.
56
+	IDMapping *MountIDMapping `json:"id_mapping,omitempty"`
57
+}
58
+
59
+func (m *Mount) IsBind() bool {
60
+	return m.Flags&unix.MS_BIND != 0
61
+}
62
+
63
+func (m *Mount) IsIDMapped() bool {
64
+	return m.IDMapping != nil
65
+}
0 66
new file mode 100644
... ...
@@ -0,0 +1,9 @@
0
+//go:build !linux
1
+
2
+package configs
3
+
4
+type Mount struct{}
5
+
6
+func (m *Mount) IsBind() bool {
7
+	return false
8
+}
... ...
@@ -14,6 +14,7 @@ const (
14 14
 	NEWIPC    NamespaceType = "NEWIPC"
15 15
 	NEWUSER   NamespaceType = "NEWUSER"
16 16
 	NEWCGROUP NamespaceType = "NEWCGROUP"
17
+	NEWTIME   NamespaceType = "NEWTIME"
17 18
 )
18 19
 
19 20
 var (
... ...
@@ -38,6 +39,8 @@ func NsName(ns NamespaceType) string {
38 38
 		return "uts"
39 39
 	case NEWCGROUP:
40 40
 		return "cgroup"
41
+	case NEWTIME:
42
+		return "time"
41 43
 	}
42 44
 	return ""
43 45
 }
... ...
@@ -56,6 +59,9 @@ func IsNamespaceSupported(ns NamespaceType) bool {
56 56
 	if nsFile == "" {
57 57
 		return false
58 58
 	}
59
+	// We don't need to use /proc/thread-self here because the list of
60
+	// namespace types is unrelated to the thread. This lets us avoid having to
61
+	// do runtime.LockOSThread.
59 62
 	_, err := os.Stat("/proc/self/ns/" + nsFile)
60 63
 	// a namespace is supported if it exists and we have permissions to read it
61 64
 	supported = err == nil
... ...
@@ -72,6 +78,7 @@ func NamespaceTypes() []NamespaceType {
72 72
 		NEWPID,
73 73
 		NEWNS,
74 74
 		NEWCGROUP,
75
+		NEWTIME,
75 76
 	}
76 77
 }
77 78
 
... ...
@@ -1,5 +1,4 @@
1 1
 //go:build linux
2
-// +build linux
3 2
 
4 3
 package configs
5 4
 
... ...
@@ -17,6 +16,7 @@ var namespaceInfo = map[NamespaceType]int{
17 17
 	NEWUTS:    unix.CLONE_NEWUTS,
18 18
 	NEWPID:    unix.CLONE_NEWPID,
19 19
 	NEWCGROUP: unix.CLONE_NEWCGROUP,
20
+	NEWTIME:   unix.CLONE_NEWTIME,
20 21
 }
21 22
 
22 23
 // CloneFlags parses the container's Namespaces options to set the correct
... ...
@@ -31,3 +31,15 @@ func (n *Namespaces) CloneFlags() uintptr {
31 31
 	}
32 32
 	return uintptr(flag)
33 33
 }
34
+
35
+// IsPrivate tells whether the namespace of type t is configured as private
36
+// (i.e. it exists and is not shared).
37
+func (n Namespaces) IsPrivate(t NamespaceType) bool {
38
+	for _, v := range n {
39
+		if v.Type == t {
40
+			return v.Path == ""
41
+		}
42
+	}
43
+	// Not found, so implicitly sharing a parent namespace.
44
+	return false
45
+}
... ...
@@ -1,5 +1,4 @@
1 1
 //go:build !linux && !windows
2
-// +build !linux,!windows
3 2
 
4 3
 package configs
5 4
 
... ...
@@ -1,5 +1,4 @@
1 1
 //go:build !linux
2
-// +build !linux
3 2
 
4 3
 package configs
5 4
 
... ...
@@ -1,5 +1,4 @@
1 1
 //go:build !windows
2
-// +build !windows
3 2
 
4 3
 package devices
5 4
 
6 5
deleted file mode 100644
... ...
@@ -1,145 +0,0 @@
1
-//go:build linux
2
-// +build linux
3
-
4
-package system
5
-
6
-import (
7
-	"os"
8
-	"os/exec"
9
-	"runtime"
10
-	"strings"
11
-	"unsafe"
12
-
13
-	"golang.org/x/sys/unix"
14
-)
15
-
16
-type ParentDeathSignal int
17
-
18
-func (p ParentDeathSignal) Restore() error {
19
-	if p == 0 {
20
-		return nil
21
-	}
22
-	current, err := GetParentDeathSignal()
23
-	if err != nil {
24
-		return err
25
-	}
26
-	if p == current {
27
-		return nil
28
-	}
29
-	return p.Set()
30
-}
31
-
32
-func (p ParentDeathSignal) Set() error {
33
-	return SetParentDeathSignal(uintptr(p))
34
-}
35
-
36
-// Deprecated: Execv is not used in runc anymore, it will be removed in v1.2.0.
37
-func Execv(cmd string, args []string, env []string) error {
38
-	name, err := exec.LookPath(cmd)
39
-	if err != nil {
40
-		return err
41
-	}
42
-	return Exec(name, args, env)
43
-}
44
-
45
-func Exec(cmd string, args []string, env []string) error {
46
-	for {
47
-		err := unix.Exec(cmd, args, env)
48
-		if err != unix.EINTR { //nolint:errorlint // unix errors are bare
49
-			return &os.PathError{Op: "exec", Path: cmd, Err: err}
50
-		}
51
-	}
52
-}
53
-
54
-func SetParentDeathSignal(sig uintptr) error {
55
-	if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil {
56
-		return err
57
-	}
58
-	return nil
59
-}
60
-
61
-func GetParentDeathSignal() (ParentDeathSignal, error) {
62
-	var sig int
63
-	if err := unix.Prctl(unix.PR_GET_PDEATHSIG, uintptr(unsafe.Pointer(&sig)), 0, 0, 0); err != nil {
64
-		return -1, err
65
-	}
66
-	return ParentDeathSignal(sig), nil
67
-}
68
-
69
-func SetKeepCaps() error {
70
-	if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 1, 0, 0, 0); err != nil {
71
-		return err
72
-	}
73
-
74
-	return nil
75
-}
76
-
77
-func ClearKeepCaps() error {
78
-	if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 0, 0, 0, 0); err != nil {
79
-		return err
80
-	}
81
-
82
-	return nil
83
-}
84
-
85
-func Setctty() error {
86
-	if err := unix.IoctlSetInt(0, unix.TIOCSCTTY, 0); err != nil {
87
-		return err
88
-	}
89
-	return nil
90
-}
91
-
92
-// SetSubreaper sets the value i as the subreaper setting for the calling process
93
-func SetSubreaper(i int) error {
94
-	return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0)
95
-}
96
-
97
-// GetSubreaper returns the subreaper setting for the calling process
98
-func GetSubreaper() (int, error) {
99
-	var i uintptr
100
-
101
-	if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil {
102
-		return -1, err
103
-	}
104
-
105
-	return int(i), nil
106
-}
107
-
108
-func prepareAt(dir *os.File, path string) (int, string) {
109
-	if dir == nil {
110
-		return unix.AT_FDCWD, path
111
-	}
112
-
113
-	// Rather than just filepath.Join-ing path here, do it manually so the
114
-	// error and handle correctly indicate cases like path=".." as being
115
-	// relative to the correct directory. The handle.Name() might end up being
116
-	// wrong but because this is (currently) only used in MkdirAllInRoot, that
117
-	// isn't a problem.
118
-	dirName := dir.Name()
119
-	if !strings.HasSuffix(dirName, "/") {
120
-		dirName += "/"
121
-	}
122
-	fullPath := dirName + path
123
-
124
-	return int(dir.Fd()), fullPath
125
-}
126
-
127
-func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) {
128
-	dirFd, fullPath := prepareAt(dir, path)
129
-	fd, err := unix.Openat(dirFd, path, flags, mode)
130
-	if err != nil {
131
-		return nil, &os.PathError{Op: "openat", Path: fullPath, Err: err}
132
-	}
133
-	runtime.KeepAlive(dir)
134
-	return os.NewFile(uintptr(fd), fullPath), nil
135
-}
136
-
137
-func Mkdirat(dir *os.File, path string, mode uint32) error {
138
-	dirFd, fullPath := prepareAt(dir, path)
139
-	err := unix.Mkdirat(dirFd, path, mode)
140
-	if err != nil {
141
-		err = &os.PathError{Op: "mkdirat", Path: fullPath, Err: err}
142
-	}
143
-	runtime.KeepAlive(dir)
144
-	return err
145
-}
146 1
deleted file mode 100644
... ...
@@ -1,127 +0,0 @@
1
-package system
2
-
3
-import (
4
-	"fmt"
5
-	"os"
6
-	"path/filepath"
7
-	"strconv"
8
-	"strings"
9
-)
10
-
11
-// State is the status of a process.
12
-type State rune
13
-
14
-const ( // Only values for Linux 3.14 and later are listed here
15
-	Dead        State = 'X'
16
-	DiskSleep   State = 'D'
17
-	Running     State = 'R'
18
-	Sleeping    State = 'S'
19
-	Stopped     State = 'T'
20
-	TracingStop State = 't'
21
-	Zombie      State = 'Z'
22
-	Parked      State = 'P'
23
-	Idle        State = 'I'
24
-)
25
-
26
-// String forms of the state from proc(5)'s documentation for
27
-// /proc/[pid]/status' "State" field.
28
-func (s State) String() string {
29
-	switch s {
30
-	case Dead:
31
-		return "dead"
32
-	case DiskSleep:
33
-		return "disk sleep"
34
-	case Running:
35
-		return "running"
36
-	case Sleeping:
37
-		return "sleeping"
38
-	case Stopped:
39
-		return "stopped"
40
-	case TracingStop:
41
-		return "tracing stop"
42
-	case Zombie:
43
-		return "zombie"
44
-	case Parked:
45
-		return "parked"
46
-	case Idle:
47
-		return "idle" // kernel thread
48
-	default:
49
-		return fmt.Sprintf("unknown (%c)", s)
50
-	}
51
-}
52
-
53
-// Stat_t represents the information from /proc/[pid]/stat, as
54
-// described in proc(5) with names based on the /proc/[pid]/status
55
-// fields.
56
-type Stat_t struct {
57
-	// Name is the command run by the process.
58
-	Name string
59
-
60
-	// State is the state of the process.
61
-	State State
62
-
63
-	// StartTime is the number of clock ticks after system boot (since
64
-	// Linux 2.6).
65
-	StartTime uint64
66
-}
67
-
68
-// Stat returns a Stat_t instance for the specified process.
69
-func Stat(pid int) (stat Stat_t, err error) {
70
-	bytes, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat"))
71
-	if err != nil {
72
-		return stat, err
73
-	}
74
-	return parseStat(string(bytes))
75
-}
76
-
77
-func parseStat(data string) (stat Stat_t, err error) {
78
-	// Example:
79
-	// 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0
80
-	// The fields are space-separated, see full description in proc(5).
81
-	//
82
-	// We are only interested in:
83
-	//  * field 2: process name. It is the only field enclosed into
84
-	//    parentheses, as it can contain spaces (and parentheses) inside.
85
-	//  * field 3: process state, a single character (%c)
86
-	//  * field 22: process start time, a long unsigned integer (%llu).
87
-
88
-	// 1. Look for the first '(' and the last ')' first, what's in between is Name.
89
-	//    We expect at least 20 fields and a space after the last one.
90
-
91
-	const minAfterName = 20*2 + 1 // the min field is '0 '.
92
-
93
-	first := strings.IndexByte(data, '(')
94
-	if first < 0 || first+minAfterName >= len(data) {
95
-		return stat, fmt.Errorf("invalid stat data (no comm or too short): %q", data)
96
-	}
97
-
98
-	last := strings.LastIndexByte(data, ')')
99
-	if last <= first || last+minAfterName >= len(data) {
100
-		return stat, fmt.Errorf("invalid stat data (no comm or too short): %q", data)
101
-	}
102
-
103
-	stat.Name = data[first+1 : last]
104
-
105
-	// 2. Remove fields 1 and 2 and a space after. State is right after.
106
-	data = data[last+2:]
107
-	stat.State = State(data[0])
108
-
109
-	// 3. StartTime is field 22, data is at field 3 now, so we need to skip 19 spaces.
110
-	skipSpaces := 22 - 3
111
-	for first = 0; skipSpaces > 0 && first < len(data); first++ {
112
-		if data[first] == ' ' {
113
-			skipSpaces--
114
-		}
115
-	}
116
-	// Now first points to StartTime; look for space right after.
117
-	i := strings.IndexByte(data[first:], ' ')
118
-	if i < 0 {
119
-		return stat, fmt.Errorf("invalid stat data (too short): %q", data)
120
-	}
121
-	stat.StartTime, err = strconv.ParseUint(data[first:first+i], 10, 64)
122
-	if err != nil {
123
-		return stat, fmt.Errorf("invalid stat data (bad start time): %w", err)
124
-	}
125
-
126
-	return stat, nil
127
-}
128 1
deleted file mode 100644
... ...
@@ -1,15 +0,0 @@
1
-//go:build go1.23
2
-
3
-package system
4
-
5
-import (
6
-	"syscall"
7
-)
8
-
9
-// ClearRlimitNofileCache clears go runtime's nofile rlimit cache. The argument
10
-// is process RLIMIT_NOFILE values. Relies on go.dev/cl/588076.
11
-func ClearRlimitNofileCache(lim *syscall.Rlimit) {
12
-	// Ignore the return values since we only need to clean the cache,
13
-	// the limit is going to be set via unix.Prlimit elsewhere.
14
-	_ = syscall.Setrlimit(syscall.RLIMIT_NOFILE, lim)
15
-}
16 1
deleted file mode 100644
... ...
@@ -1,27 +0,0 @@
1
-//go:build go1.19 && !go1.23
2
-
3
-// TODO: remove this file once go 1.22 is no longer supported.
4
-
5
-package system
6
-
7
-import (
8
-	"sync/atomic"
9
-	"syscall"
10
-	_ "unsafe" // Needed for go:linkname to work.
11
-)
12
-
13
-//go:linkname syscallOrigRlimitNofile syscall.origRlimitNofile
14
-var syscallOrigRlimitNofile atomic.Pointer[syscall.Rlimit]
15
-
16
-// ClearRlimitNofileCache clears go runtime's nofile rlimit cache.
17
-// The argument is process RLIMIT_NOFILE values.
18
-func ClearRlimitNofileCache(_ *syscall.Rlimit) {
19
-	// As reported in issue #4195, the new version of go runtime(since 1.19)
20
-	// will cache rlimit-nofile. Before executing execve, the rlimit-nofile
21
-	// of the process will be restored with the cache. In runc, this will
22
-	// cause the rlimit-nofile setting by the parent process for the container
23
-	// to become invalid. It can be solved by clearing this cache. But
24
-	// unfortunately, go stdlib doesn't provide such function, so we need to
25
-	// link to the private var `origRlimitNofile` in package syscall to hack.
26
-	syscallOrigRlimitNofile.Store(nil)
27
-}
28 1
deleted file mode 100644
... ...
@@ -1,7 +0,0 @@
1
-//go:build !go1.19
2
-
3
-package system
4
-
5
-import "syscall"
6
-
7
-func ClearRlimitNofileCache(_ *syscall.Rlimit) {}
8 1
deleted file mode 100644
... ...
@@ -1,27 +0,0 @@
1
-//go:build linux && (386 || arm)
2
-// +build linux
3
-// +build 386 arm
4
-
5
-package system
6
-
7
-import (
8
-	"golang.org/x/sys/unix"
9
-)
10
-
11
-// Setuid sets the uid of the calling thread to the specified uid.
12
-func Setuid(uid int) (err error) {
13
-	_, _, e1 := unix.RawSyscall(unix.SYS_SETUID32, uintptr(uid), 0, 0)
14
-	if e1 != 0 {
15
-		err = e1
16
-	}
17
-	return
18
-}
19
-
20
-// Setgid sets the gid of the calling thread to the specified gid.
21
-func Setgid(gid int) (err error) {
22
-	_, _, e1 := unix.RawSyscall(unix.SYS_SETGID32, uintptr(gid), 0, 0)
23
-	if e1 != 0 {
24
-		err = e1
25
-	}
26
-	return
27
-}
28 1
deleted file mode 100644
... ...
@@ -1,27 +0,0 @@
1
-//go:build linux && (arm64 || amd64 || mips || mipsle || mips64 || mips64le || ppc || ppc64 || ppc64le || riscv64 || s390x)
2
-// +build linux
3
-// +build arm64 amd64 mips mipsle mips64 mips64le ppc ppc64 ppc64le riscv64 s390x
4
-
5
-package system
6
-
7
-import (
8
-	"golang.org/x/sys/unix"
9
-)
10
-
11
-// Setuid sets the uid of the calling thread to the specified uid.
12
-func Setuid(uid int) (err error) {
13
-	_, _, e1 := unix.RawSyscall(unix.SYS_SETUID, uintptr(uid), 0, 0)
14
-	if e1 != 0 {
15
-		err = e1
16
-	}
17
-	return
18
-}
19
-
20
-// Setgid sets the gid of the calling thread to the specified gid.
21
-func Setgid(gid int) (err error) {
22
-	_, _, e1 := unix.RawSyscall(unix.SYS_SETGID, uintptr(gid), 0, 0)
23
-	if e1 != 0 {
24
-		err = e1
25
-	}
26
-	return
27
-}
28 1
deleted file mode 100644
... ...
@@ -1,157 +0,0 @@
1
-//go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris
2
-// +build darwin dragonfly freebsd linux netbsd openbsd solaris
3
-
4
-package user
5
-
6
-import (
7
-	"io"
8
-	"os"
9
-	"strconv"
10
-
11
-	"golang.org/x/sys/unix"
12
-)
13
-
14
-// Unix-specific path to the passwd and group formatted files.
15
-const (
16
-	unixPasswdPath = "/etc/passwd"
17
-	unixGroupPath  = "/etc/group"
18
-)
19
-
20
-// LookupUser looks up a user by their username in /etc/passwd. If the user
21
-// cannot be found (or there is no /etc/passwd file on the filesystem), then
22
-// LookupUser returns an error.
23
-func LookupUser(username string) (User, error) {
24
-	return lookupUserFunc(func(u User) bool {
25
-		return u.Name == username
26
-	})
27
-}
28
-
29
-// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
30
-// be found (or there is no /etc/passwd file on the filesystem), then LookupUid
31
-// returns an error.
32
-func LookupUid(uid int) (User, error) {
33
-	return lookupUserFunc(func(u User) bool {
34
-		return u.Uid == uid
35
-	})
36
-}
37
-
38
-func lookupUserFunc(filter func(u User) bool) (User, error) {
39
-	// Get operating system-specific passwd reader-closer.
40
-	passwd, err := GetPasswd()
41
-	if err != nil {
42
-		return User{}, err
43
-	}
44
-	defer passwd.Close()
45
-
46
-	// Get the users.
47
-	users, err := ParsePasswdFilter(passwd, filter)
48
-	if err != nil {
49
-		return User{}, err
50
-	}
51
-
52
-	// No user entries found.
53
-	if len(users) == 0 {
54
-		return User{}, ErrNoPasswdEntries
55
-	}
56
-
57
-	// Assume the first entry is the "correct" one.
58
-	return users[0], nil
59
-}
60
-
61
-// LookupGroup looks up a group by its name in /etc/group. If the group cannot
62
-// be found (or there is no /etc/group file on the filesystem), then LookupGroup
63
-// returns an error.
64
-func LookupGroup(groupname string) (Group, error) {
65
-	return lookupGroupFunc(func(g Group) bool {
66
-		return g.Name == groupname
67
-	})
68
-}
69
-
70
-// LookupGid looks up a group by its group id in /etc/group. If the group cannot
71
-// be found (or there is no /etc/group file on the filesystem), then LookupGid
72
-// returns an error.
73
-func LookupGid(gid int) (Group, error) {
74
-	return lookupGroupFunc(func(g Group) bool {
75
-		return g.Gid == gid
76
-	})
77
-}
78
-
79
-func lookupGroupFunc(filter func(g Group) bool) (Group, error) {
80
-	// Get operating system-specific group reader-closer.
81
-	group, err := GetGroup()
82
-	if err != nil {
83
-		return Group{}, err
84
-	}
85
-	defer group.Close()
86
-
87
-	// Get the groups.
88
-	groups, err := ParseGroupFilter(group, filter)
89
-	if err != nil {
90
-		return Group{}, err
91
-	}
92
-
93
-	// No group entries found.
94
-	if len(groups) == 0 {
95
-		return Group{}, ErrNoGroupEntries
96
-	}
97
-
98
-	// Assume the first entry is the "correct" one.
99
-	return groups[0], nil
100
-}
101
-
102
-func GetPasswdPath() (string, error) {
103
-	return unixPasswdPath, nil
104
-}
105
-
106
-func GetPasswd() (io.ReadCloser, error) {
107
-	return os.Open(unixPasswdPath)
108
-}
109
-
110
-func GetGroupPath() (string, error) {
111
-	return unixGroupPath, nil
112
-}
113
-
114
-func GetGroup() (io.ReadCloser, error) {
115
-	return os.Open(unixGroupPath)
116
-}
117
-
118
-// CurrentUser looks up the current user by their user id in /etc/passwd. If the
119
-// user cannot be found (or there is no /etc/passwd file on the filesystem),
120
-// then CurrentUser returns an error.
121
-func CurrentUser() (User, error) {
122
-	return LookupUid(unix.Getuid())
123
-}
124
-
125
-// CurrentGroup looks up the current user's group by their primary group id's
126
-// entry in /etc/group. If the group cannot be found (or there is no
127
-// /etc/group file on the filesystem), then CurrentGroup returns an error.
128
-func CurrentGroup() (Group, error) {
129
-	return LookupGid(unix.Getgid())
130
-}
131
-
132
-func currentUserSubIDs(fileName string) ([]SubID, error) {
133
-	u, err := CurrentUser()
134
-	if err != nil {
135
-		return nil, err
136
-	}
137
-	filter := func(entry SubID) bool {
138
-		return entry.Name == u.Name || entry.Name == strconv.Itoa(u.Uid)
139
-	}
140
-	return ParseSubIDFileFilter(fileName, filter)
141
-}
142
-
143
-func CurrentUserSubUIDs() ([]SubID, error) {
144
-	return currentUserSubIDs("/etc/subuid")
145
-}
146
-
147
-func CurrentUserSubGIDs() ([]SubID, error) {
148
-	return currentUserSubIDs("/etc/subgid")
149
-}
150
-
151
-func CurrentProcessUIDMap() ([]IDMap, error) {
152
-	return ParseIDMapFile("/proc/self/uid_map")
153
-}
154
-
155
-func CurrentProcessGIDMap() ([]IDMap, error) {
156
-	return ParseIDMapFile("/proc/self/gid_map")
157
-}
158 1
deleted file mode 100644
... ...
@@ -1,604 +0,0 @@
1
-package user
2
-
3
-import (
4
-	"bufio"
5
-	"bytes"
6
-	"errors"
7
-	"fmt"
8
-	"io"
9
-	"os"
10
-	"strconv"
11
-	"strings"
12
-)
13
-
14
-const (
15
-	minID = 0
16
-	maxID = 1<<31 - 1 // for 32-bit systems compatibility
17
-)
18
-
19
-var (
20
-	// ErrNoPasswdEntries is returned if no matching entries were found in /etc/passwd.
21
-	ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
22
-	// ErrNoGroupEntries is returned if no matching entries were found in /etc/group.
23
-	ErrNoGroupEntries = errors.New("no matching entries in group file")
24
-	// ErrRange is returned if a UID or GID is outside of the valid range.
25
-	ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minID, maxID)
26
-)
27
-
28
-type User struct {
29
-	Name  string
30
-	Pass  string
31
-	Uid   int
32
-	Gid   int
33
-	Gecos string
34
-	Home  string
35
-	Shell string
36
-}
37
-
38
-type Group struct {
39
-	Name string
40
-	Pass string
41
-	Gid  int
42
-	List []string
43
-}
44
-
45
-// SubID represents an entry in /etc/sub{u,g}id
46
-type SubID struct {
47
-	Name  string
48
-	SubID int64
49
-	Count int64
50
-}
51
-
52
-// IDMap represents an entry in /proc/PID/{u,g}id_map
53
-type IDMap struct {
54
-	ID       int64
55
-	ParentID int64
56
-	Count    int64
57
-}
58
-
59
-func parseLine(line []byte, v ...interface{}) {
60
-	parseParts(bytes.Split(line, []byte(":")), v...)
61
-}
62
-
63
-func parseParts(parts [][]byte, v ...interface{}) {
64
-	if len(parts) == 0 {
65
-		return
66
-	}
67
-
68
-	for i, p := range parts {
69
-		// Ignore cases where we don't have enough fields to populate the arguments.
70
-		// Some configuration files like to misbehave.
71
-		if len(v) <= i {
72
-			break
73
-		}
74
-
75
-		// Use the type of the argument to figure out how to parse it, scanf() style.
76
-		// This is legit.
77
-		switch e := v[i].(type) {
78
-		case *string:
79
-			*e = string(p)
80
-		case *int:
81
-			// "numbers", with conversion errors ignored because of some misbehaving configuration files.
82
-			*e, _ = strconv.Atoi(string(p))
83
-		case *int64:
84
-			*e, _ = strconv.ParseInt(string(p), 10, 64)
85
-		case *[]string:
86
-			// Comma-separated lists.
87
-			if len(p) != 0 {
88
-				*e = strings.Split(string(p), ",")
89
-			} else {
90
-				*e = []string{}
91
-			}
92
-		default:
93
-			// Someone goof'd when writing code using this function. Scream so they can hear us.
94
-			panic(fmt.Sprintf("parseLine only accepts {*string, *int, *int64, *[]string} as arguments! %#v is not a pointer!", e))
95
-		}
96
-	}
97
-}
98
-
99
-func ParsePasswdFile(path string) ([]User, error) {
100
-	passwd, err := os.Open(path)
101
-	if err != nil {
102
-		return nil, err
103
-	}
104
-	defer passwd.Close()
105
-	return ParsePasswd(passwd)
106
-}
107
-
108
-func ParsePasswd(passwd io.Reader) ([]User, error) {
109
-	return ParsePasswdFilter(passwd, nil)
110
-}
111
-
112
-func ParsePasswdFileFilter(path string, filter func(User) bool) ([]User, error) {
113
-	passwd, err := os.Open(path)
114
-	if err != nil {
115
-		return nil, err
116
-	}
117
-	defer passwd.Close()
118
-	return ParsePasswdFilter(passwd, filter)
119
-}
120
-
121
-func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) {
122
-	if r == nil {
123
-		return nil, errors.New("nil source for passwd-formatted data")
124
-	}
125
-
126
-	var (
127
-		s   = bufio.NewScanner(r)
128
-		out = []User{}
129
-	)
130
-
131
-	for s.Scan() {
132
-		line := bytes.TrimSpace(s.Bytes())
133
-		if len(line) == 0 {
134
-			continue
135
-		}
136
-
137
-		// see: man 5 passwd
138
-		//  name:password:UID:GID:GECOS:directory:shell
139
-		// Name:Pass:Uid:Gid:Gecos:Home:Shell
140
-		//  root:x:0:0:root:/root:/bin/bash
141
-		//  adm:x:3:4:adm:/var/adm:/bin/false
142
-		p := User{}
143
-		parseLine(line, &p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell)
144
-
145
-		if filter == nil || filter(p) {
146
-			out = append(out, p)
147
-		}
148
-	}
149
-	if err := s.Err(); err != nil {
150
-		return nil, err
151
-	}
152
-
153
-	return out, nil
154
-}
155
-
156
-func ParseGroupFile(path string) ([]Group, error) {
157
-	group, err := os.Open(path)
158
-	if err != nil {
159
-		return nil, err
160
-	}
161
-
162
-	defer group.Close()
163
-	return ParseGroup(group)
164
-}
165
-
166
-func ParseGroup(group io.Reader) ([]Group, error) {
167
-	return ParseGroupFilter(group, nil)
168
-}
169
-
170
-func ParseGroupFileFilter(path string, filter func(Group) bool) ([]Group, error) {
171
-	group, err := os.Open(path)
172
-	if err != nil {
173
-		return nil, err
174
-	}
175
-	defer group.Close()
176
-	return ParseGroupFilter(group, filter)
177
-}
178
-
179
-func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) {
180
-	if r == nil {
181
-		return nil, errors.New("nil source for group-formatted data")
182
-	}
183
-	rd := bufio.NewReader(r)
184
-	out := []Group{}
185
-
186
-	// Read the file line-by-line.
187
-	for {
188
-		var (
189
-			isPrefix  bool
190
-			wholeLine []byte
191
-			err       error
192
-		)
193
-
194
-		// Read the next line. We do so in chunks (as much as reader's
195
-		// buffer is able to keep), check if we read enough columns
196
-		// already on each step and store final result in wholeLine.
197
-		for {
198
-			var line []byte
199
-			line, isPrefix, err = rd.ReadLine()
200
-			if err != nil {
201
-				// We should return no error if EOF is reached
202
-				// without a match.
203
-				if err == io.EOF {
204
-					err = nil
205
-				}
206
-				return out, err
207
-			}
208
-
209
-			// Simple common case: line is short enough to fit in a
210
-			// single reader's buffer.
211
-			if !isPrefix && len(wholeLine) == 0 {
212
-				wholeLine = line
213
-				break
214
-			}
215
-
216
-			wholeLine = append(wholeLine, line...)
217
-
218
-			// Check if we read the whole line already.
219
-			if !isPrefix {
220
-				break
221
-			}
222
-		}
223
-
224
-		// There's no spec for /etc/passwd or /etc/group, but we try to follow
225
-		// the same rules as the glibc parser, which allows comments and blank
226
-		// space at the beginning of a line.
227
-		wholeLine = bytes.TrimSpace(wholeLine)
228
-		if len(wholeLine) == 0 || wholeLine[0] == '#' {
229
-			continue
230
-		}
231
-
232
-		// see: man 5 group
233
-		//  group_name:password:GID:user_list
234
-		// Name:Pass:Gid:List
235
-		//  root:x:0:root
236
-		//  adm:x:4:root,adm,daemon
237
-		p := Group{}
238
-		parseLine(wholeLine, &p.Name, &p.Pass, &p.Gid, &p.List)
239
-
240
-		if filter == nil || filter(p) {
241
-			out = append(out, p)
242
-		}
243
-	}
244
-}
245
-
246
-type ExecUser struct {
247
-	Uid   int
248
-	Gid   int
249
-	Sgids []int
250
-	Home  string
251
-}
252
-
253
-// GetExecUserPath is a wrapper for GetExecUser. It reads data from each of the
254
-// given file paths and uses that data as the arguments to GetExecUser. If the
255
-// files cannot be opened for any reason, the error is ignored and a nil
256
-// io.Reader is passed instead.
257
-func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath string) (*ExecUser, error) {
258
-	var passwd, group io.Reader
259
-
260
-	if passwdFile, err := os.Open(passwdPath); err == nil {
261
-		passwd = passwdFile
262
-		defer passwdFile.Close()
263
-	}
264
-
265
-	if groupFile, err := os.Open(groupPath); err == nil {
266
-		group = groupFile
267
-		defer groupFile.Close()
268
-	}
269
-
270
-	return GetExecUser(userSpec, defaults, passwd, group)
271
-}
272
-
273
-// GetExecUser parses a user specification string (using the passwd and group
274
-// readers as sources for /etc/passwd and /etc/group data, respectively). In
275
-// the case of blank fields or missing data from the sources, the values in
276
-// defaults are used.
277
-//
278
-// GetExecUser will return an error if a user or group literal could not be
279
-// found in any entry in passwd and group respectively.
280
-//
281
-// Examples of valid user specifications are:
282
-//   - ""
283
-//   - "user"
284
-//   - "uid"
285
-//   - "user:group"
286
-//   - "uid:gid"
287
-//   - "user:gid"
288
-//   - "uid:group"
289
-//
290
-// It should be noted that if you specify a numeric user or group id, they will
291
-// not be evaluated as usernames (only the metadata will be filled). So attempting
292
-// to parse a user with user.Name = "1337" will produce the user with a UID of
293
-// 1337.
294
-func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (*ExecUser, error) {
295
-	if defaults == nil {
296
-		defaults = new(ExecUser)
297
-	}
298
-
299
-	// Copy over defaults.
300
-	user := &ExecUser{
301
-		Uid:   defaults.Uid,
302
-		Gid:   defaults.Gid,
303
-		Sgids: defaults.Sgids,
304
-		Home:  defaults.Home,
305
-	}
306
-
307
-	// Sgids slice *cannot* be nil.
308
-	if user.Sgids == nil {
309
-		user.Sgids = []int{}
310
-	}
311
-
312
-	// Allow for userArg to have either "user" syntax, or optionally "user:group" syntax
313
-	var userArg, groupArg string
314
-	parseLine([]byte(userSpec), &userArg, &groupArg)
315
-
316
-	// Convert userArg and groupArg to be numeric, so we don't have to execute
317
-	// Atoi *twice* for each iteration over lines.
318
-	uidArg, uidErr := strconv.Atoi(userArg)
319
-	gidArg, gidErr := strconv.Atoi(groupArg)
320
-
321
-	// Find the matching user.
322
-	users, err := ParsePasswdFilter(passwd, func(u User) bool {
323
-		if userArg == "" {
324
-			// Default to current state of the user.
325
-			return u.Uid == user.Uid
326
-		}
327
-
328
-		if uidErr == nil {
329
-			// If the userArg is numeric, always treat it as a UID.
330
-			return uidArg == u.Uid
331
-		}
332
-
333
-		return u.Name == userArg
334
-	})
335
-
336
-	// If we can't find the user, we have to bail.
337
-	if err != nil && passwd != nil {
338
-		if userArg == "" {
339
-			userArg = strconv.Itoa(user.Uid)
340
-		}
341
-		return nil, fmt.Errorf("unable to find user %s: %w", userArg, err)
342
-	}
343
-
344
-	var matchedUserName string
345
-	if len(users) > 0 {
346
-		// First match wins, even if there's more than one matching entry.
347
-		matchedUserName = users[0].Name
348
-		user.Uid = users[0].Uid
349
-		user.Gid = users[0].Gid
350
-		user.Home = users[0].Home
351
-	} else if userArg != "" {
352
-		// If we can't find a user with the given username, the only other valid
353
-		// option is if it's a numeric username with no associated entry in passwd.
354
-
355
-		if uidErr != nil {
356
-			// Not numeric.
357
-			return nil, fmt.Errorf("unable to find user %s: %w", userArg, ErrNoPasswdEntries)
358
-		}
359
-		user.Uid = uidArg
360
-
361
-		// Must be inside valid uid range.
362
-		if user.Uid < minID || user.Uid > maxID {
363
-			return nil, ErrRange
364
-		}
365
-
366
-		// Okay, so it's numeric. We can just roll with this.
367
-	}
368
-
369
-	// On to the groups. If we matched a username, we need to do this because of
370
-	// the supplementary group IDs.
371
-	if groupArg != "" || matchedUserName != "" {
372
-		groups, err := ParseGroupFilter(group, func(g Group) bool {
373
-			// If the group argument isn't explicit, we'll just search for it.
374
-			if groupArg == "" {
375
-				// Check if user is a member of this group.
376
-				for _, u := range g.List {
377
-					if u == matchedUserName {
378
-						return true
379
-					}
380
-				}
381
-				return false
382
-			}
383
-
384
-			if gidErr == nil {
385
-				// If the groupArg is numeric, always treat it as a GID.
386
-				return gidArg == g.Gid
387
-			}
388
-
389
-			return g.Name == groupArg
390
-		})
391
-		if err != nil && group != nil {
392
-			return nil, fmt.Errorf("unable to find groups for spec %v: %w", matchedUserName, err)
393
-		}
394
-
395
-		// Only start modifying user.Gid if it is in explicit form.
396
-		if groupArg != "" {
397
-			if len(groups) > 0 {
398
-				// First match wins, even if there's more than one matching entry.
399
-				user.Gid = groups[0].Gid
400
-			} else {
401
-				// If we can't find a group with the given name, the only other valid
402
-				// option is if it's a numeric group name with no associated entry in group.
403
-
404
-				if gidErr != nil {
405
-					// Not numeric.
406
-					return nil, fmt.Errorf("unable to find group %s: %w", groupArg, ErrNoGroupEntries)
407
-				}
408
-				user.Gid = gidArg
409
-
410
-				// Must be inside valid gid range.
411
-				if user.Gid < minID || user.Gid > maxID {
412
-					return nil, ErrRange
413
-				}
414
-
415
-				// Okay, so it's numeric. We can just roll with this.
416
-			}
417
-		} else if len(groups) > 0 {
418
-			// Supplementary group ids only make sense if in the implicit form.
419
-			user.Sgids = make([]int, len(groups))
420
-			for i, group := range groups {
421
-				user.Sgids[i] = group.Gid
422
-			}
423
-		}
424
-	}
425
-
426
-	return user, nil
427
-}
428
-
429
-// GetAdditionalGroups looks up a list of groups by name or group id
430
-// against the given /etc/group formatted data. If a group name cannot
431
-// be found, an error will be returned. If a group id cannot be found,
432
-// or the given group data is nil, the id will be returned as-is
433
-// provided it is in the legal range.
434
-func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, error) {
435
-	groups := []Group{}
436
-	if group != nil {
437
-		var err error
438
-		groups, err = ParseGroupFilter(group, func(g Group) bool {
439
-			for _, ag := range additionalGroups {
440
-				if g.Name == ag || strconv.Itoa(g.Gid) == ag {
441
-					return true
442
-				}
443
-			}
444
-			return false
445
-		})
446
-		if err != nil {
447
-			return nil, fmt.Errorf("Unable to find additional groups %v: %w", additionalGroups, err)
448
-		}
449
-	}
450
-
451
-	gidMap := make(map[int]struct{})
452
-	for _, ag := range additionalGroups {
453
-		var found bool
454
-		for _, g := range groups {
455
-			// if we found a matched group either by name or gid, take the
456
-			// first matched as correct
457
-			if g.Name == ag || strconv.Itoa(g.Gid) == ag {
458
-				if _, ok := gidMap[g.Gid]; !ok {
459
-					gidMap[g.Gid] = struct{}{}
460
-					found = true
461
-					break
462
-				}
463
-			}
464
-		}
465
-		// we asked for a group but didn't find it. let's check to see
466
-		// if we wanted a numeric group
467
-		if !found {
468
-			gid, err := strconv.ParseInt(ag, 10, 64)
469
-			if err != nil {
470
-				// Not a numeric ID either.
471
-				return nil, fmt.Errorf("Unable to find group %s: %w", ag, ErrNoGroupEntries)
472
-			}
473
-			// Ensure gid is inside gid range.
474
-			if gid < minID || gid > maxID {
475
-				return nil, ErrRange
476
-			}
477
-			gidMap[int(gid)] = struct{}{}
478
-		}
479
-	}
480
-	gids := []int{}
481
-	for gid := range gidMap {
482
-		gids = append(gids, gid)
483
-	}
484
-	return gids, nil
485
-}
486
-
487
-// GetAdditionalGroupsPath is a wrapper around GetAdditionalGroups
488
-// that opens the groupPath given and gives it as an argument to
489
-// GetAdditionalGroups.
490
-func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int, error) {
491
-	var group io.Reader
492
-
493
-	if groupFile, err := os.Open(groupPath); err == nil {
494
-		group = groupFile
495
-		defer groupFile.Close()
496
-	}
497
-	return GetAdditionalGroups(additionalGroups, group)
498
-}
499
-
500
-func ParseSubIDFile(path string) ([]SubID, error) {
501
-	subid, err := os.Open(path)
502
-	if err != nil {
503
-		return nil, err
504
-	}
505
-	defer subid.Close()
506
-	return ParseSubID(subid)
507
-}
508
-
509
-func ParseSubID(subid io.Reader) ([]SubID, error) {
510
-	return ParseSubIDFilter(subid, nil)
511
-}
512
-
513
-func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) {
514
-	subid, err := os.Open(path)
515
-	if err != nil {
516
-		return nil, err
517
-	}
518
-	defer subid.Close()
519
-	return ParseSubIDFilter(subid, filter)
520
-}
521
-
522
-func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) {
523
-	if r == nil {
524
-		return nil, errors.New("nil source for subid-formatted data")
525
-	}
526
-
527
-	var (
528
-		s   = bufio.NewScanner(r)
529
-		out = []SubID{}
530
-	)
531
-
532
-	for s.Scan() {
533
-		line := bytes.TrimSpace(s.Bytes())
534
-		if len(line) == 0 {
535
-			continue
536
-		}
537
-
538
-		// see: man 5 subuid
539
-		p := SubID{}
540
-		parseLine(line, &p.Name, &p.SubID, &p.Count)
541
-
542
-		if filter == nil || filter(p) {
543
-			out = append(out, p)
544
-		}
545
-	}
546
-	if err := s.Err(); err != nil {
547
-		return nil, err
548
-	}
549
-
550
-	return out, nil
551
-}
552
-
553
-func ParseIDMapFile(path string) ([]IDMap, error) {
554
-	r, err := os.Open(path)
555
-	if err != nil {
556
-		return nil, err
557
-	}
558
-	defer r.Close()
559
-	return ParseIDMap(r)
560
-}
561
-
562
-func ParseIDMap(r io.Reader) ([]IDMap, error) {
563
-	return ParseIDMapFilter(r, nil)
564
-}
565
-
566
-func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) {
567
-	r, err := os.Open(path)
568
-	if err != nil {
569
-		return nil, err
570
-	}
571
-	defer r.Close()
572
-	return ParseIDMapFilter(r, filter)
573
-}
574
-
575
-func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) {
576
-	if r == nil {
577
-		return nil, errors.New("nil source for idmap-formatted data")
578
-	}
579
-
580
-	var (
581
-		s   = bufio.NewScanner(r)
582
-		out = []IDMap{}
583
-	)
584
-
585
-	for s.Scan() {
586
-		line := bytes.TrimSpace(s.Bytes())
587
-		if len(line) == 0 {
588
-			continue
589
-		}
590
-
591
-		// see: man 7 user_namespaces
592
-		p := IDMap{}
593
-		parseParts(bytes.Fields(line), &p.ID, &p.ParentID, &p.Count)
594
-
595
-		if filter == nil || filter(p) {
596
-			out = append(out, p)
597
-		}
598
-	}
599
-	if err := s.Err(); err != nil {
600
-		return nil, err
601
-	}
602
-
603
-	return out, nil
604
-}
605 1
deleted file mode 100644
... ...
@@ -1,43 +0,0 @@
1
-//go:build gofuzz
2
-// +build gofuzz
3
-
4
-package user
5
-
6
-import (
7
-	"io"
8
-	"strings"
9
-)
10
-
11
-func IsDivisbleBy(n int, divisibleby int) bool {
12
-	return (n % divisibleby) == 0
13
-}
14
-
15
-func FuzzUser(data []byte) int {
16
-	if len(data) == 0 {
17
-		return -1
18
-	}
19
-	if !IsDivisbleBy(len(data), 5) {
20
-		return -1
21
-	}
22
-
23
-	var divided [][]byte
24
-
25
-	chunkSize := len(data) / 5
26
-
27
-	for i := 0; i < len(data); i += chunkSize {
28
-		end := i + chunkSize
29
-
30
-		divided = append(divided, data[i:end])
31
-	}
32
-
33
-	_, _ = ParsePasswdFilter(strings.NewReader(string(divided[0])), nil)
34
-
35
-	var passwd, group io.Reader
36
-
37
-	group = strings.NewReader(string(divided[1]))
38
-	_, _ = GetAdditionalGroups([]string{string(divided[2])}, group)
39
-
40
-	passwd = strings.NewReader(string(divided[3]))
41
-	_, _ = GetExecUser(string(divided[4]), nil, passwd, group)
42
-	return 1
43
-}
44 1
deleted file mode 100644
... ...
@@ -1,5 +0,0 @@
1
-package userns
2
-
3
-// RunningInUserNS detects whether we are currently running in a user namespace.
4
-// Originally copied from github.com/lxc/lxd/shared/util.go
5
-var RunningInUserNS = runningInUserNS
6 1
deleted file mode 100644
... ...
@@ -1,16 +0,0 @@
1
-//go:build gofuzz
2
-// +build gofuzz
3
-
4
-package userns
5
-
6
-import (
7
-	"strings"
8
-
9
-	"github.com/opencontainers/runc/libcontainer/user"
10
-)
11
-
12
-func FuzzUIDMap(data []byte) int {
13
-	uidmap, _ := user.ParseIDMap(strings.NewReader(string(data)))
14
-	_ = uidMapInUserNS(uidmap)
15
-	return 1
16
-}
17 1
deleted file mode 100644
... ...
@@ -1,37 +0,0 @@
1
-package userns
2
-
3
-import (
4
-	"sync"
5
-
6
-	"github.com/opencontainers/runc/libcontainer/user"
7
-)
8
-
9
-var (
10
-	inUserNS bool
11
-	nsOnce   sync.Once
12
-)
13
-
14
-// runningInUserNS detects whether we are currently running in a user namespace.
15
-// Originally copied from github.com/lxc/lxd/shared/util.go
16
-func runningInUserNS() bool {
17
-	nsOnce.Do(func() {
18
-		uidmap, err := user.CurrentProcessUIDMap()
19
-		if err != nil {
20
-			// This kernel-provided file only exists if user namespaces are supported
21
-			return
22
-		}
23
-		inUserNS = uidMapInUserNS(uidmap)
24
-	})
25
-	return inUserNS
26
-}
27
-
28
-func uidMapInUserNS(uidmap []user.IDMap) bool {
29
-	/*
30
-	 * We assume we are in the initial user namespace if we have a full
31
-	 * range - 4294967295 uids starting at uid 0.
32
-	 */
33
-	if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
34
-		return false
35
-	}
36
-	return true
37
-}
38 1
deleted file mode 100644
... ...
@@ -1,79 +0,0 @@
1
-#define _GNU_SOURCE
2
-#include <fcntl.h>
3
-#include <sched.h>
4
-#include <stdio.h>
5
-#include <unistd.h>
6
-#include <stdarg.h>
7
-#include <stdlib.h>
8
-
9
-/*
10
- * All of the code here is run inside an async-signal-safe context, so we need
11
- * to be careful to not call any functions that could cause issues. In theory,
12
- * since we are a Go program, there are fewer restrictions, but in practice it's
13
- * better to be safe than sorry.
14
- *
15
- * The only exception is exit, which we need to call to make sure we don't
16
- * return into runc.
17
- */
18
-
19
-void bail(int pipefd, const char *fmt, ...)
20
-{
21
-	va_list args;
22
-
23
-	va_start(args, fmt);
24
-	vdprintf(pipefd, fmt, args);
25
-	va_end(args);
26
-
27
-	exit(1);
28
-}
29
-
30
-int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd)
31
-{
32
-	char buffer[4096] = { 0 };
33
-
34
-	pid_t child = fork();
35
-	if (child != 0)
36
-		return child;
37
-	/* in child */
38
-
39
-	/* Join the target userns. */
40
-	int nsfd = open(userns_path, O_RDONLY);
41
-	if (nsfd < 0)
42
-		bail(errfd, "open userns path %s failed: %m", userns_path);
43
-
44
-	int err = setns(nsfd, CLONE_NEWUSER);
45
-	if (err < 0)
46
-		bail(errfd, "setns %s failed: %m", userns_path);
47
-
48
-	close(nsfd);
49
-
50
-	/* Pipe the requested file contents. */
51
-	int fd = open(path, O_RDONLY);
52
-	if (fd < 0)
53
-		bail(errfd, "open %s in userns %s failed: %m", path, userns_path);
54
-
55
-	int nread, ntotal = 0;
56
-	while ((nread = read(fd, buffer, sizeof(buffer))) != 0) {
57
-		if (nread < 0)
58
-			bail(errfd, "read bytes from %s failed (after %d total bytes read): %m", path, ntotal);
59
-		ntotal += nread;
60
-
61
-		int nwritten = 0;
62
-		while (nwritten < nread) {
63
-			int n = write(outfd, buffer, nread - nwritten);
64
-			if (n < 0)
65
-				bail(errfd, "write %d bytes from %s failed (after %d bytes written): %m",
66
-				     nread - nwritten, path, nwritten);
67
-			nwritten += n;
68
-		}
69
-		if (nread != nwritten)
70
-			bail(errfd, "mismatch for bytes read and written: %d read != %d written", nread, nwritten);
71
-	}
72
-
73
-	close(fd);
74
-	close(outfd);
75
-	close(errfd);
76
-
77
-	/* We must exit here, otherwise we would return into a forked runc. */
78
-	exit(0);
79
-}
80 1
deleted file mode 100644
... ...
@@ -1,186 +0,0 @@
1
-//go:build linux
2
-
3
-package userns
4
-
5
-import (
6
-	"bufio"
7
-	"bytes"
8
-	"fmt"
9
-	"io"
10
-	"os"
11
-	"unsafe"
12
-
13
-	"github.com/opencontainers/runc/libcontainer/configs"
14
-	"github.com/sirupsen/logrus"
15
-)
16
-
17
-/*
18
-#include <stdlib.h>
19
-extern int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd);
20
-*/
21
-import "C"
22
-
23
-func parseIdmapData(data []byte) (ms []configs.IDMap, err error) {
24
-	scanner := bufio.NewScanner(bytes.NewReader(data))
25
-	for scanner.Scan() {
26
-		var m configs.IDMap
27
-		line := scanner.Text()
28
-		if _, err := fmt.Sscanf(line, "%d %d %d", &m.ContainerID, &m.HostID, &m.Size); err != nil {
29
-			return nil, fmt.Errorf("parsing id map failed: invalid format in line %q: %w", line, err)
30
-		}
31
-		ms = append(ms, m)
32
-	}
33
-	if err := scanner.Err(); err != nil {
34
-		return nil, fmt.Errorf("parsing id map failed: %w", err)
35
-	}
36
-	return ms, nil
37
-}
38
-
39
-// Do something equivalent to nsenter --user=<nsPath> cat <path>, but more
40
-// efficiently. Returns the contents of the requested file from within the user
41
-// namespace.
42
-func spawnUserNamespaceCat(nsPath string, path string) ([]byte, error) {
43
-	rdr, wtr, err := os.Pipe()
44
-	if err != nil {
45
-		return nil, fmt.Errorf("create pipe for userns spawn failed: %w", err)
46
-	}
47
-	defer rdr.Close()
48
-	defer wtr.Close()
49
-
50
-	errRdr, errWtr, err := os.Pipe()
51
-	if err != nil {
52
-		return nil, fmt.Errorf("create error pipe for userns spawn failed: %w", err)
53
-	}
54
-	defer errRdr.Close()
55
-	defer errWtr.Close()
56
-
57
-	cNsPath := C.CString(nsPath)
58
-	defer C.free(unsafe.Pointer(cNsPath))
59
-	cPath := C.CString(path)
60
-	defer C.free(unsafe.Pointer(cPath))
61
-
62
-	childPid := C.spawn_userns_cat(cNsPath, cPath, C.int(wtr.Fd()), C.int(errWtr.Fd()))
63
-
64
-	if childPid < 0 {
65
-		return nil, fmt.Errorf("failed to spawn fork for userns")
66
-	} else if childPid == 0 {
67
-		// this should never happen
68
-		panic("runc executing inside fork child -- unsafe state!")
69
-	}
70
-
71
-	// We are in the parent -- close the write end of the pipe before reading.
72
-	wtr.Close()
73
-	output, err := io.ReadAll(rdr)
74
-	rdr.Close()
75
-	if err != nil {
76
-		return nil, fmt.Errorf("reading from userns spawn failed: %w", err)
77
-	}
78
-
79
-	// Ditto for the error pipe.
80
-	errWtr.Close()
81
-	errOutput, err := io.ReadAll(errRdr)
82
-	errRdr.Close()
83
-	if err != nil {
84
-		return nil, fmt.Errorf("reading from userns spawn error pipe failed: %w", err)
85
-	}
86
-	errOutput = bytes.TrimSpace(errOutput)
87
-
88
-	// Clean up the child.
89
-	child, err := os.FindProcess(int(childPid))
90
-	if err != nil {
91
-		return nil, fmt.Errorf("could not find userns spawn process: %w", err)
92
-	}
93
-	state, err := child.Wait()
94
-	if err != nil {
95
-		return nil, fmt.Errorf("failed to wait for userns spawn process: %w", err)
96
-	}
97
-	if !state.Success() {
98
-		errStr := string(errOutput)
99
-		if errStr == "" {
100
-			errStr = fmt.Sprintf("unknown error (status code %d)", state.ExitCode())
101
-		}
102
-		return nil, fmt.Errorf("userns spawn: %s", errStr)
103
-	} else if len(errOutput) > 0 {
104
-		// We can just ignore weird output in the error pipe if the process
105
-		// didn't bail(), but for completeness output for debugging.
106
-		logrus.Debugf("userns spawn succeeded but unexpected error message found: %s", string(errOutput))
107
-	}
108
-	// The subprocess succeeded, return whatever it wrote to the pipe.
109
-	return output, nil
110
-}
111
-
112
-func GetUserNamespaceMappings(nsPath string) (uidMap, gidMap []configs.IDMap, err error) {
113
-	var (
114
-		pid         int
115
-		extra       rune
116
-		tryFastPath bool
117
-	)
118
-
119
-	// nsPath is usually of the form /proc/<pid>/ns/user, which means that we
120
-	// already have a pid that is part of the user namespace and thus we can
121
-	// just use the pid to read from /proc/<pid>/*id_map.
122
-	//
123
-	// Note that Sscanf doesn't consume the whole input, so we check for any
124
-	// trailing data with %c. That way, we can be sure the pattern matched
125
-	// /proc/$pid/ns/user _exactly_ iff n == 1.
126
-	if n, _ := fmt.Sscanf(nsPath, "/proc/%d/ns/user%c", &pid, &extra); n == 1 {
127
-		tryFastPath = pid > 0
128
-	}
129
-
130
-	for _, mapType := range []struct {
131
-		name  string
132
-		idMap *[]configs.IDMap
133
-	}{
134
-		{"uid_map", &uidMap},
135
-		{"gid_map", &gidMap},
136
-	} {
137
-		var mapData []byte
138
-
139
-		if tryFastPath {
140
-			path := fmt.Sprintf("/proc/%d/%s", pid, mapType.name)
141
-			data, err := os.ReadFile(path)
142
-			if err != nil {
143
-				// Do not error out here -- we need to try the slow path if the
144
-				// fast path failed.
145
-				logrus.Debugf("failed to use fast path to read %s from userns %s (error: %s), falling back to slow userns-join path", mapType.name, nsPath, err)
146
-			} else {
147
-				mapData = data
148
-			}
149
-		} else {
150
-			logrus.Debugf("cannot use fast path to read %s from userns %s, falling back to slow userns-join path", mapType.name, nsPath)
151
-		}
152
-
153
-		if mapData == nil {
154
-			// We have to actually join the namespace if we cannot take the
155
-			// fast path. The path is resolved with respect to the child
156
-			// process, so just use /proc/self.
157
-			data, err := spawnUserNamespaceCat(nsPath, "/proc/self/"+mapType.name)
158
-			if err != nil {
159
-				return nil, nil, err
160
-			}
161
-			mapData = data
162
-		}
163
-		idMap, err := parseIdmapData(mapData)
164
-		if err != nil {
165
-			return nil, nil, fmt.Errorf("failed to parse %s of userns %s: %w", mapType.name, nsPath, err)
166
-		}
167
-		*mapType.idMap = idMap
168
-	}
169
-
170
-	return uidMap, gidMap, nil
171
-}
172
-
173
-// IsSameMapping returns whether or not the two id mappings are the same. Note
174
-// that if the order of the mappings is different, or a mapping has been split,
175
-// the mappings will be considered different.
176
-func IsSameMapping(a, b []configs.IDMap) bool {
177
-	if len(a) != len(b) {
178
-		return false
179
-	}
180
-	for idx := range a {
181
-		if a[idx] != b[idx] {
182
-			return false
183
-		}
184
-	}
185
-	return true
186
-}
187 1
deleted file mode 100644
... ...
@@ -1,18 +0,0 @@
1
-//go:build !linux
2
-// +build !linux
3
-
4
-package userns
5
-
6
-import "github.com/opencontainers/runc/libcontainer/user"
7
-
8
-// runningInUserNS is a stub for non-Linux systems
9
-// Always returns false
10
-func runningInUserNS() bool {
11
-	return false
12
-}
13
-
14
-// uidMapInUserNS is a stub for non-Linux systems
15
-// Always returns false
16
-func uidMapInUserNS(uidmap []user.IDMap) bool {
17
-	return false
18
-}
... ...
@@ -19,13 +19,14 @@ package utils
19 19
 import (
20 20
 	"fmt"
21 21
 	"os"
22
+	"runtime"
22 23
 
23 24
 	"golang.org/x/sys/unix"
24 25
 )
25 26
 
26
-// MaxSendfdLen is the maximum length of the name of a file descriptor being
27
-// sent using SendFd. The name of the file handle returned by RecvFd will never
28
-// be larger than this value.
27
+// MaxNameLen is the maximum length of the name of a file descriptor being sent
28
+// using SendFile. The name of the file handle returned by RecvFile will never be
29
+// larger than this value.
29 30
 const MaxNameLen = 4096
30 31
 
31 32
 // oobSpace is the size of the oob slice required to store a single FD. Note
... ...
@@ -33,26 +34,21 @@ const MaxNameLen = 4096
33 33
 // so sizeof(fd) = 4.
34 34
 var oobSpace = unix.CmsgSpace(4)
35 35
 
36
-// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
36
+// RecvFile waits for a file descriptor to be sent over the given AF_UNIX
37 37
 // socket. The file name of the remote file descriptor will be recreated
38 38
 // locally (it is sent as non-auxiliary data in the same payload).
39
-func RecvFd(socket *os.File) (*os.File, error) {
40
-	// For some reason, unix.Recvmsg uses the length rather than the capacity
41
-	// when passing the msg_controllen and other attributes to recvmsg.  So we
42
-	// have to actually set the length.
39
+func RecvFile(socket *os.File) (_ *os.File, Err error) {
43 40
 	name := make([]byte, MaxNameLen)
44 41
 	oob := make([]byte, oobSpace)
45 42
 
46 43
 	sockfd := socket.Fd()
47
-	n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0)
44
+	n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, unix.MSG_CMSG_CLOEXEC)
48 45
 	if err != nil {
49 46
 		return nil, err
50 47
 	}
51
-
52 48
 	if n >= MaxNameLen || oobn != oobSpace {
53
-		return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
49
+		return nil, fmt.Errorf("recvfile: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
54 50
 	}
55
-
56 51
 	// Truncate.
57 52
 	name = name[:n]
58 53
 	oob = oob[:oobn]
... ...
@@ -61,36 +57,63 @@ func RecvFd(socket *os.File) (*os.File, error) {
61 61
 	if err != nil {
62 62
 		return nil, err
63 63
 	}
64
-	if len(scms) != 1 {
65
-		return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
64
+
65
+	// We cannot control how many SCM_RIGHTS we receive, and upon receiving
66
+	// them all of the descriptors are installed in our fd table, so we need to
67
+	// parse all of the SCM_RIGHTS we received in order to close all of the
68
+	// descriptors on error.
69
+	var fds []int
70
+	defer func() {
71
+		for i, fd := range fds {
72
+			if i == 0 && Err == nil {
73
+				// Only close the first one on error.
74
+				continue
75
+			}
76
+			// Always close extra ones.
77
+			_ = unix.Close(fd)
78
+		}
79
+	}()
80
+	var lastErr error
81
+	for _, scm := range scms {
82
+		if scm.Header.Type == unix.SCM_RIGHTS {
83
+			scmFds, err := unix.ParseUnixRights(&scm)
84
+			if err != nil {
85
+				lastErr = err
86
+			} else {
87
+				fds = append(fds, scmFds...)
88
+			}
89
+		}
90
+	}
91
+	if lastErr != nil {
92
+		return nil, lastErr
66 93
 	}
67
-	scm := scms[0]
68 94
 
69
-	fds, err := unix.ParseUnixRights(&scm)
70
-	if err != nil {
71
-		return nil, err
95
+	// We do this after collecting the fds to make sure we close them all when
96
+	// returning an error here.
97
+	if len(scms) != 1 {
98
+		return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
72 99
 	}
73 100
 	if len(fds) != 1 {
74 101
 		return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
75 102
 	}
76
-	fd := uintptr(fds[0])
77
-
78
-	return os.NewFile(fd, string(name)), nil
103
+	return os.NewFile(uintptr(fds[0]), string(name)), nil
79 104
 }
80 105
 
81
-// SendFd sends a file descriptor over the given AF_UNIX socket. In
82
-// addition, the file.Name() of the given file will also be sent as
83
-// non-auxiliary data in the same payload (allowing to send contextual
84
-// information for a file descriptor).
85
-func SendFd(socket *os.File, name string, fd uintptr) error {
106
+// SendFile sends a file over the given AF_UNIX socket. file.Name() is also
107
+// included so that if the other end uses RecvFile, the file will have the same
108
+// name information.
109
+func SendFile(socket *os.File, file *os.File) error {
110
+	name := file.Name()
86 111
 	if len(name) >= MaxNameLen {
87 112
 		return fmt.Errorf("sendfd: filename too long: %s", name)
88 113
 	}
89
-	return SendFds(socket, []byte(name), int(fd))
114
+	err := SendRawFd(socket, name, file.Fd())
115
+	runtime.KeepAlive(file)
116
+	return err
90 117
 }
91 118
 
92
-// SendFds sends a list of file descriptors and msg over the given AF_UNIX socket.
93
-func SendFds(socket *os.File, msg []byte, fds ...int) error {
94
-	oob := unix.UnixRights(fds...)
95
-	return unix.Sendmsg(int(socket.Fd()), msg, oob, nil, 0)
119
+// SendRawFd sends a specific file descriptor over the given AF_UNIX socket.
120
+func SendRawFd(socket *os.File, msg string, fd uintptr) error {
121
+	oob := unix.UnixRights(int(fd))
122
+	return unix.Sendmsg(int(socket.Fd()), []byte(msg), oob, nil, 0)
96 123
 }
... ...
@@ -1,17 +1,12 @@
1 1
 package utils
2 2
 
3 3
 import (
4
-	"encoding/binary"
5 4
 	"encoding/json"
6
-	"fmt"
7 5
 	"io"
8 6
 	"os"
9 7
 	"path/filepath"
10
-	"strconv"
11 8
 	"strings"
12
-	"unsafe"
13 9
 
14
-	securejoin "github.com/cyphar/filepath-securejoin"
15 10
 	"golang.org/x/sys/unix"
16 11
 )
17 12
 
... ...
@@ -19,20 +14,6 @@ const (
19 19
 	exitSignalOffset = 128
20 20
 )
21 21
 
22
-// NativeEndian is the native byte order of the host system.
23
-var NativeEndian binary.ByteOrder
24
-
25
-func init() {
26
-	// Copied from <golang.org/x/net/internal/socket/sys.go>.
27
-	i := uint32(1)
28
-	b := (*[4]byte)(unsafe.Pointer(&i))
29
-	if b[0] == 1 {
30
-		NativeEndian = binary.LittleEndian
31
-	} else {
32
-		NativeEndian = binary.BigEndian
33
-	}
34
-}
35
-
36 22
 // ExitStatus returns the correct exit status for a process based on if it
37 23
 // was signaled or exited cleanly
38 24
 func ExitStatus(status unix.WaitStatus) int {
... ...
@@ -43,6 +24,9 @@ func ExitStatus(status unix.WaitStatus) int {
43 43
 }
44 44
 
45 45
 // WriteJSON writes the provided struct v to w using standard json marshaling
46
+// without a trailing newline. This is used instead of json.Encoder because
47
+// there might be a problem in json decoder in some cases, see:
48
+// https://github.com/docker/docker/issues/14203#issuecomment-174177790
46 49
 func WriteJSON(w io.Writer, v interface{}) error {
47 50
 	data, err := json.Marshal(v)
48 51
 	if err != nil {
... ...
@@ -99,52 +83,16 @@ func stripRoot(root, path string) string {
99 99
 	return CleanPath("/" + path)
100 100
 }
101 101
 
102
-// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
103
-// corresponding to the unsafePath resolved within the root. Before passing the
104
-// fd, this path is verified to have been inside the root -- so operating on it
105
-// through the passed fdpath should be safe. Do not access this path through
106
-// the original path strings, and do not attempt to use the pathname outside of
107
-// the passed closure (the file handle will be freed once the closure returns).
108
-func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
109
-	// Remove the root then forcefully resolve inside the root.
110
-	unsafePath = stripRoot(root, unsafePath)
111
-	path, err := securejoin.SecureJoin(root, unsafePath)
112
-	if err != nil {
113
-		return fmt.Errorf("resolving path inside rootfs failed: %w", err)
114
-	}
115
-
116
-	// Open the target path.
117
-	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
118
-	if err != nil {
119
-		return fmt.Errorf("open o_path procfd: %w", err)
120
-	}
121
-	defer fh.Close()
122
-
123
-	// Double-check the path is the one we expected.
124
-	procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd()))
125
-	if realpath, err := os.Readlink(procfd); err != nil {
126
-		return fmt.Errorf("procfd verification failed: %w", err)
127
-	} else if realpath != path {
128
-		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
129
-	}
130
-
131
-	// Run the closure.
132
-	return fn(procfd)
133
-}
134
-
135
-// SearchLabels searches a list of key-value pairs for the provided key and
136
-// returns the corresponding value. The pairs must be separated with '='.
137
-func SearchLabels(labels []string, query string) string {
138
-	for _, l := range labels {
139
-		parts := strings.SplitN(l, "=", 2)
140
-		if len(parts) < 2 {
141
-			continue
142
-		}
143
-		if parts[0] == query {
144
-			return parts[1]
102
+// SearchLabels searches through a list of key=value pairs for a given key,
103
+// returning its value, and a boolean flag telling whether the key exists.
104
+func SearchLabels(labels []string, key string) (string, bool) {
105
+	key += "="
106
+	for _, s := range labels {
107
+		if strings.HasPrefix(s, key) {
108
+			return s[len(key):], true
145 109
 		}
146 110
 	}
147
-	return ""
111
+	return "", false
148 112
 }
149 113
 
150 114
 // Annotations returns the bundle path and user defined annotations from the
... ...
@@ -153,14 +101,14 @@ func SearchLabels(labels []string, query string) string {
153 153
 func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
154 154
 	userAnnotations = make(map[string]string)
155 155
 	for _, l := range labels {
156
-		parts := strings.SplitN(l, "=", 2)
157
-		if len(parts) < 2 {
156
+		name, value, ok := strings.Cut(l, "=")
157
+		if !ok {
158 158
 			continue
159 159
 		}
160
-		if parts[0] == "bundle" {
161
-			bundle = parts[1]
160
+		if name == "bundle" {
161
+			bundle = value
162 162
 		} else {
163
-			userAnnotations[parts[0]] = parts[1]
163
+			userAnnotations[name] = value
164 164
 		}
165 165
 	}
166 166
 	return
... ...
@@ -1,20 +1,20 @@
1 1
 //go:build !windows
2
-// +build !windows
3 2
 
4 3
 package utils
5 4
 
6 5
 import (
7
-	"errors"
8 6
 	"fmt"
7
+	"math"
9 8
 	"os"
10 9
 	"path/filepath"
10
+	"runtime"
11 11
 	"strconv"
12 12
 	"strings"
13
+	"sync"
13 14
 	_ "unsafe" // for go:linkname
14 15
 
15
-	"github.com/opencontainers/runc/libcontainer/system"
16
-
17 16
 	securejoin "github.com/cyphar/filepath-securejoin"
17
+	"github.com/sirupsen/logrus"
18 18
 	"golang.org/x/sys/unix"
19 19
 )
20 20
 
... ...
@@ -30,12 +30,39 @@ func EnsureProcHandle(fh *os.File) error {
30 30
 	return nil
31 31
 }
32 32
 
33
+var (
34
+	haveCloseRangeCloexecBool bool
35
+	haveCloseRangeCloexecOnce sync.Once
36
+)
37
+
38
+func haveCloseRangeCloexec() bool {
39
+	haveCloseRangeCloexecOnce.Do(func() {
40
+		// Make sure we're not closing a random file descriptor.
41
+		tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0)
42
+		if err != nil {
43
+			return
44
+		}
45
+		defer unix.Close(tmpFd)
46
+
47
+		err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC)
48
+		// Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC).
49
+		// -ENOSYS and -EINVAL ultimately mean we don't have support, but any
50
+		// other potential error would imply that even the most basic close
51
+		// operation wouldn't work.
52
+		haveCloseRangeCloexecBool = err == nil
53
+	})
54
+	return haveCloseRangeCloexecBool
55
+}
56
+
33 57
 type fdFunc func(fd int)
34 58
 
35 59
 // fdRangeFrom calls the passed fdFunc for each file descriptor that is open in
36 60
 // the current process.
37 61
 func fdRangeFrom(minFd int, fn fdFunc) error {
38
-	fdDir, err := os.Open("/proc/self/fd")
62
+	procSelfFd, closer := ProcThreadSelf("fd")
63
+	defer closer()
64
+
65
+	fdDir, err := os.Open(procSelfFd)
39 66
 	if err != nil {
40 67
 		return err
41 68
 	}
... ...
@@ -73,6 +100,12 @@ func fdRangeFrom(minFd int, fn fdFunc) error {
73 73
 // CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or
74 74
 // equal to minFd in the current process.
75 75
 func CloseExecFrom(minFd int) error {
76
+	// Use close_range(CLOSE_RANGE_CLOEXEC) if possible.
77
+	if haveCloseRangeCloexec() {
78
+		err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC)
79
+		return os.NewSyscallError("close_range", err)
80
+	}
81
+	// Otherwise, fall back to the standard loop.
76 82
 	return fdRangeFrom(minFd, unix.CloseOnExec)
77 83
 }
78 84
 
... ...
@@ -95,7 +128,8 @@ func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive
95 95
 // *os.File operations would apply to the wrong file). This function is only
96 96
 // intended to be called from the last stage of runc init.
97 97
 func UnsafeCloseFrom(minFd int) error {
98
-	// We must not close some file descriptors.
98
+	// We cannot use close_range(2) even if it is available, because we must
99
+	// not close some file descriptors.
99 100
 	return fdRangeFrom(minFd, func(fd int) {
100 101
 		if runtime_IsPollDescriptor(uintptr(fd)) {
101 102
 			// These are the Go runtime's internal netpoll file descriptors.
... ...
@@ -113,8 +147,8 @@ func UnsafeCloseFrom(minFd int) error {
113 113
 	})
114 114
 }
115 115
 
116
-// NewSockPair returns a new unix socket pair
117
-func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
116
+// NewSockPair returns a new SOCK_STREAM unix socket pair.
117
+func NewSockPair(name string) (parent, child *os.File, err error) {
118 118
 	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
119 119
 	if err != nil {
120 120
 		return nil, nil, err
... ...
@@ -122,6 +156,112 @@ func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
122 122
 	return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
123 123
 }
124 124
 
125
+// WithProcfd runs the passed closure with a procfd path (normally /proc/thread-self/fd/...)
126
+// corresponding to the unsafePath resolved within the root. Before passing the
127
+// fd, this path is verified to have been inside the root -- so operating on it
128
+// through the passed fdpath should be safe. Do not access this path through
129
+// the original path strings, and do not attempt to use the pathname outside of
130
+// the passed closure (the file handle will be freed once the closure returns).
131
+func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
132
+	// Remove the root then forcefully resolve inside the root.
133
+	unsafePath = stripRoot(root, unsafePath)
134
+	path, err := securejoin.SecureJoin(root, unsafePath)
135
+	if err != nil {
136
+		return fmt.Errorf("resolving path inside rootfs failed: %w", err)
137
+	}
138
+
139
+	procSelfFd, closer := ProcThreadSelf("fd/")
140
+	defer closer()
141
+
142
+	// Open the target path.
143
+	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
144
+	if err != nil {
145
+		return fmt.Errorf("open o_path procfd: %w", err)
146
+	}
147
+	defer fh.Close()
148
+
149
+	procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd())))
150
+	// Double-check the path is the one we expected.
151
+	if realpath, err := os.Readlink(procfd); err != nil {
152
+		return fmt.Errorf("procfd verification failed: %w", err)
153
+	} else if realpath != path {
154
+		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
155
+	}
156
+
157
+	return fn(procfd)
158
+}
159
+
160
+type ProcThreadSelfCloser func()
161
+
162
+var (
163
+	haveProcThreadSelf     bool
164
+	haveProcThreadSelfOnce sync.Once
165
+)
166
+
167
+// ProcThreadSelf returns a string that is equivalent to
168
+// /proc/thread-self/<subpath>, with a graceful fallback on older kernels where
169
+// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin,
170
+// meaning that the passed string needs to be trusted. The caller _must_ call
171
+// the returned ProcThreadSelfCloser function (which is runtime.UnlockOSThread)
172
+// *only once* after it has finished using the returned path string.
173
+func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) {
174
+	haveProcThreadSelfOnce.Do(func() {
175
+		if _, err := os.Stat("/proc/thread-self/"); err == nil {
176
+			haveProcThreadSelf = true
177
+		} else {
178
+			logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/<tid>", err)
179
+		}
180
+	})
181
+
182
+	// We need to lock our thread until the caller is done with the path string
183
+	// because any non-atomic operation on the path (such as opening a file,
184
+	// then reading it) could be interrupted by the Go runtime where the
185
+	// underlying thread is swapped out and the original thread is killed,
186
+	// resulting in pull-your-hair-out-hard-to-debug issues in the caller. In
187
+	// addition, the pre-3.17 fallback makes everything non-atomic because the
188
+	// same thing could happen between unix.Gettid() and the path operations.
189
+	//
190
+	// In theory, we don't need to lock in the atomic user case when using
191
+	// /proc/thread-self/, but it's better to be safe than sorry (and there are
192
+	// only one or two truly atomic users of /proc/thread-self/).
193
+	runtime.LockOSThread()
194
+
195
+	threadSelf := "/proc/thread-self/"
196
+	if !haveProcThreadSelf {
197
+		// Pre-3.17 kernels did not have /proc/thread-self, so do it manually.
198
+		threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/"
199
+		if _, err := os.Stat(threadSelf); err != nil {
200
+			// Unfortunately, this code is called from rootfs_linux.go where we
201
+			// are running inside the pid namespace of the container but /proc
202
+			// is the host's procfs. Unfortunately there is no real way to get
203
+			// the correct tid to use here (the kernel age means we cannot do
204
+			// things like set up a private fsopen("proc") -- even scanning
205
+			// NSpid in all of the tasks in /proc/self/task/*/status requires
206
+			// Linux 4.1).
207
+			//
208
+			// So, we just have to assume that /proc/self is acceptable in this
209
+			// one specific case.
210
+			if os.Getpid() == 1 {
211
+				logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err)
212
+			} else {
213
+				// This should never happen, but the fallback should work in most cases...
214
+				logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err)
215
+			}
216
+			threadSelf = "/proc/self/"
217
+		}
218
+	}
219
+	return threadSelf + subpath, runtime.UnlockOSThread
220
+}
221
+
222
+// ProcThreadSelfFd is a small wrapper around ProcThreadSelf to make it easier
223
+// to create a /proc/thread-self handle for a given file descriptor.
224
+//
225
+// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but
226
+// without using fmt.Sprintf to avoid unneeded overhead.
227
+func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) {
228
+	return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10))
229
+}
230
+
125 231
 // IsLexicallyInRoot is shorthand for strings.HasPrefix(path+"/", root+"/"),
126 232
 // but properly handling the case where path or root are "/".
127 233
 //
... ...
@@ -156,83 +296,45 @@ func IsLexicallyInRoot(root, path string) bool {
156 156
 // This means that the path also must not contain ".." elements, otherwise an
157 157
 // error will occur.
158 158
 //
159
-// This is a somewhat less safe alternative to
160
-// <https://github.com/cyphar/filepath-securejoin/pull/13>, but it should
161
-// detect attempts to trick us into creating directories outside of the root.
162
-// We should migrate to securejoin.MkdirAll once it is merged.
159
+// This uses securejoin.MkdirAllHandle under the hood, but it has special
160
+// handling if unsafePath has already been scoped within the rootfs (this is
161
+// needed for a lot of runc callers and fixing this would require reworking a
162
+// lot of path logic).
163 163
 func MkdirAllInRootOpen(root, unsafePath string, mode uint32) (_ *os.File, Err error) {
164
-	// If the path is already "within" the root, use it verbatim.
165
-	fullPath := unsafePath
166
-	if !IsLexicallyInRoot(root, unsafePath) {
167
-		var err error
168
-		fullPath, err = securejoin.SecureJoin(root, unsafePath)
164
+	// If the path is already "within" the root, get the path relative to the
165
+	// root and use that as the unsafe path. This is necessary because a lot of
166
+	// MkdirAllInRootOpen callers have already done SecureJoin, and refactoring
167
+	// all of them to stop using these SecureJoin'd paths would require a fair
168
+	// amount of work.
169
+	// TODO(cyphar): Do the refactor to libpathrs once it's ready.
170
+	if IsLexicallyInRoot(root, unsafePath) {
171
+		subPath, err := filepath.Rel(root, unsafePath)
169 172
 		if err != nil {
170 173
 			return nil, err
171 174
 		}
172
-	}
173
-	subPath, err := filepath.Rel(root, fullPath)
174
-	if err != nil {
175
-		return nil, err
175
+		unsafePath = subPath
176 176
 	}
177 177
 
178 178
 	// Check for any silly mode bits.
179 179
 	if mode&^0o7777 != 0 {
180 180
 		return nil, fmt.Errorf("tried to include non-mode bits in MkdirAll mode: 0o%.3o", mode)
181 181
 	}
182
+	// Linux (and thus os.MkdirAll) silently ignores the suid and sgid bits if
183
+	// passed. While it would make sense to return an error in that case (since
184
+	// the user has asked for a mode that won't be applied), for compatibility
185
+	// reasons we have to ignore these bits.
186
+	if ignoredBits := mode &^ 0o1777; ignoredBits != 0 {
187
+		logrus.Warnf("MkdirAll called with no-op mode bits that are ignored by Linux: 0o%.3o", ignoredBits)
188
+		mode &= 0o1777
189
+	}
182 190
 
183
-	currentDir, err := os.OpenFile(root, unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
191
+	rootDir, err := os.OpenFile(root, unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
184 192
 	if err != nil {
185 193
 		return nil, fmt.Errorf("open root handle: %w", err)
186 194
 	}
187
-	defer func() {
188
-		if Err != nil {
189
-			currentDir.Close()
190
-		}
191
-	}()
192
-
193
-	for _, part := range strings.Split(subPath, string(filepath.Separator)) {
194
-		switch part {
195
-		case "", ".":
196
-			// Skip over no-op components.
197
-			continue
198
-		case "..":
199
-			return nil, fmt.Errorf("possible breakout detected: found %q component in SecureJoin subpath %s", part, subPath)
200
-		}
195
+	defer rootDir.Close()
201 196
 
202
-		nextDir, err := system.Openat(currentDir, part, unix.O_DIRECTORY|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
203
-		switch {
204
-		case err == nil:
205
-			// Update the currentDir.
206
-			_ = currentDir.Close()
207
-			currentDir = nextDir
208
-
209
-		case errors.Is(err, unix.ENOTDIR):
210
-			// This might be a symlink or some other random file. Either way,
211
-			// error out.
212
-			return nil, fmt.Errorf("cannot mkdir in %s/%s: %w", currentDir.Name(), part, unix.ENOTDIR)
213
-
214
-		case errors.Is(err, os.ErrNotExist):
215
-			// Luckily, mkdirat will not follow trailing symlinks, so this is
216
-			// safe to do as-is.
217
-			if err := system.Mkdirat(currentDir, part, mode); err != nil {
218
-				return nil, err
219
-			}
220
-			// Open the new directory. There is a race here where an attacker
221
-			// could swap the directory with a different directory, but
222
-			// MkdirAll's fuzzy semantics mean we don't care about that.
223
-			nextDir, err := system.Openat(currentDir, part, unix.O_DIRECTORY|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
224
-			if err != nil {
225
-				return nil, fmt.Errorf("open newly created directory: %w", err)
226
-			}
227
-			// Update the currentDir.
228
-			_ = currentDir.Close()
229
-			currentDir = nextDir
230
-
231
-		default:
232
-			return nil, err
233
-		}
234
-	}
235
-	return currentDir, nil
197
+	return securejoin.MkdirAllHandle(rootDir, unsafePath, int(mode))
236 198
 }
237 199
 
238 200
 // MkdirAllInRoot is a wrapper around MkdirAllInRootOpen which closes the
... ...
@@ -244,3 +346,18 @@ func MkdirAllInRoot(root, unsafePath string, mode uint32) error {
244 244
 	}
245 245
 	return err
246 246
 }
247
+
248
+// Openat is a Go-friendly openat(2) wrapper. It opens path relative to dir
249
+// (or to the current working directory if dir is nil), always adding
250
+// O_CLOEXEC to flags, and returns the new descriptor wrapped in an *os.File.
251
+// Failures are reported as an *os.PathError with Op "openat".
252
+func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) {
253
+	// With a nil dir we resolve against AT_FDCWD, so the returned file is
254
+	// simply named after path; calling dir.Name() on a nil dir would panic.
255
+	dirFd, name := unix.AT_FDCWD, path
256
+	if dir != nil {
257
+		dirFd, name = int(dir.Fd()), dir.Name()+"/"+path
258
+	}
259
+	flags |= unix.O_CLOEXEC
260
+
261
+	fd, err := unix.Openat(dirFd, path, flags, mode)
262
+	if err != nil {
263
+		return nil, &os.PathError{Op: "openat", Path: path, Err: err}
264
+	}
265
+	return os.NewFile(uintptr(fd), name), nil
266
+}
... ...
@@ -986,14 +986,11 @@ github.com/opencontainers/go-digest/digestset
986 986
 github.com/opencontainers/image-spec/identity
987 987
 github.com/opencontainers/image-spec/specs-go
988 988
 github.com/opencontainers/image-spec/specs-go/v1
989
-# github.com/opencontainers/runc v1.1.14
990
-## explicit; go 1.18
989
+# github.com/opencontainers/runc v1.2.0
990
+## explicit; go 1.22
991 991
 github.com/opencontainers/runc/libcontainer/cgroups
992 992
 github.com/opencontainers/runc/libcontainer/configs
993 993
 github.com/opencontainers/runc/libcontainer/devices
994
-github.com/opencontainers/runc/libcontainer/system
995
-github.com/opencontainers/runc/libcontainer/user
996
-github.com/opencontainers/runc/libcontainer/userns
997 994
 github.com/opencontainers/runc/libcontainer/utils
998 995
 # github.com/opencontainers/runtime-spec v1.2.0
999 996
 ## explicit