Browse code

seccomp: add support for "clone3" syscall in default policy

This is a backport of 9f6b562dd12ef7b1f9e2f8e6f2ab6477790a6594, adapted to avoid the refactoring that happened in d92739713c633c155c0f3d8065c8278b1d8a44e7.

Original commit message is as follows:

> If no seccomp policy is requested, then the built-in default policy in
> dockerd applies. This has no rule for "clone3" defined, nor any default
> errno defined. So when runc receives the config it attempts to determine
> a default errno, using logic defined in its commit:
>
> opencontainers/runc@7a8d716
>
> As explained in the above commit message, runc uses a heuristic to
> decide which errno to return by default:
>
> [quote]
> The solution applied here is to prepend a "stub" filter which returns
> -ENOSYS if the requested syscall has a larger syscall number than any
> syscall mentioned in the filter. The reason for this specific rule is
> that syscall numbers are (roughly) allocated sequentially and thus newer
> syscalls will (usually) have a larger syscall number -- thus causing our
> filters to produce -ENOSYS if the filter was written before the syscall
> existed.
> [/quote]
>
> Unfortunately clone3 appears to be one of the edge cases that does not
> result in use of ENOSYS, instead ending up with the historical EPERM
> errno.
>
> Latest glibc (2.33.9000, in Fedora 35 rawhide) will attempt to use
> clone3 by default. If it sees ENOSYS then it will automatically
> fall back to using clone. Any other errno is treated as a fatal
> error. Thus when docker seccomp policy triggers EPERM from clone3,
> no fallback occurs and programs are thus unable to spawn threads.
>
> The clone3 syscall is much more complicated than clone, most notably its
> flags are not exposed as a direct argument any more. Instead they are
> hidden inside a struct. This means that seccomp filters are unable to
> apply policy based on values seen in flags. Thus we can't directly
> replicate the current "clone" filtering for "clone3". We can at least
> ensure "clone3" returns ENOSYS errno, to trigger fallback to "clone"
> at which point we can filter on flags.

Signed-off-by: Tianon Gravi <admwiggin@gmail.com>
Co-authored-by: Daniel P. Berrangé <berrange@redhat.com>

Tianon Gravi authored on 2021/09/10 03:31:30
Showing 4 changed files
... ...
@@ -591,6 +591,7 @@
591 591
 			"names": [
592 592
 				"bpf",
593 593
 				"clone",
594
+				"clone3",
594 595
 				"fanotify_init",
595 596
 				"fsconfig",
596 597
 				"fsmount",
... ...
@@ -672,6 +673,21 @@
672 672
 		},
673 673
 		{
674 674
 			"names": [
675
+				"clone3"
676
+			],
677
+			"action": "SCMP_ACT_ERRNO",
678
+			"errnoRet": 38,
679
+			"args": [],
680
+			"comment": "",
681
+			"includes": {},
682
+			"excludes": {
683
+				"caps": [
684
+					"CAP_SYS_ADMIN"
685
+				]
686
+			}
687
+		},
688
+		{
689
+			"names": [
675 690
 				"reboot"
676 691
 			],
677 692
 			"action": "SCMP_ACT_ALLOW",
... ...
@@ -42,6 +42,7 @@ func arches() []Architecture {
42 42
 
43 43
 // DefaultProfile defines the allowed syscalls for the default seccomp profile.
44 44
 func DefaultProfile() *Seccomp {
45
+	nosys := uint(unix.ENOSYS)
45 46
 	syscalls := []*Syscall{
46 47
 		{
47 48
 			Names: []string{
... ...
@@ -522,6 +523,7 @@ func DefaultProfile() *Seccomp {
522 522
 			Names: []string{
523 523
 				"bpf",
524 524
 				"clone",
525
+				"clone3",
525 526
 				"fanotify_init",
526 527
 				"fsconfig",
527 528
 				"fsmount",
... ...
@@ -589,6 +591,17 @@ func DefaultProfile() *Seccomp {
589 589
 		},
590 590
 		{
591 591
 			Names: []string{
592
+				"clone3",
593
+			},
594
+			Action:   specs.ActErrno,
595
+			ErrnoRet: &nosys,
596
+			Args:     []*specs.LinuxSeccompArg{},
597
+			Excludes: Filter{
598
+				Caps: []string{"CAP_SYS_ADMIN"},
599
+			},
600
+		},
601
+		{
602
+			Names: []string{
592 603
 				"reboot",
593 604
 			},
594 605
 			Action: specs.ActAllow,
... ...
@@ -45,6 +45,7 @@ type Syscall struct {
45 45
 	Name     string                   `json:"name,omitempty"`
46 46
 	Names    []string                 `json:"names,omitempty"`
47 47
 	Action   specs.LinuxSeccompAction `json:"action"`
48
+	ErrnoRet *uint                    `json:"errnoRet,omitempty"`
48 49
 	Args     []*specs.LinuxSeccompArg `json:"args"`
49 50
 	Comment  string                   `json:"comment"`
50 51
 	Includes Filter                   `json:"includes"`
... ...
@@ -150,29 +150,25 @@ Loop:
150 150
 			}
151 151
 		}
152 152
 
153
+		newCall := specs.LinuxSyscall{
154
+			Action:   call.Action,
155
+			ErrnoRet: call.ErrnoRet,
156
+		}
153 157
 		if call.Name != "" && len(call.Names) != 0 {
154 158
 			return nil, errors.New("'name' and 'names' were specified in the seccomp profile, use either 'name' or 'names'")
155 159
 		}
156
-
157 160
 		if call.Name != "" {
158
-			newConfig.Syscalls = append(newConfig.Syscalls, createSpecsSyscall([]string{call.Name}, call.Action, call.Args))
161
+			newCall.Names = []string{call.Name}
159 162
 		} else {
160
-			newConfig.Syscalls = append(newConfig.Syscalls, createSpecsSyscall(call.Names, call.Action, call.Args))
163
+			newCall.Names = call.Names
164
+		}
165
+		// Loop through all the arguments of the syscall and convert them
166
+		for _, arg := range call.Args {
167
+			newCall.Args = append(newCall.Args, *arg)
161 168
 		}
162
-	}
163
-
164
-	return newConfig, nil
165
-}
166 169
 
167
-func createSpecsSyscall(names []string, action specs.LinuxSeccompAction, args []*specs.LinuxSeccompArg) specs.LinuxSyscall {
168
-	newCall := specs.LinuxSyscall{
169
-		Names:  names,
170
-		Action: action,
170
+		newConfig.Syscalls = append(newConfig.Syscalls, newCall)
171 171
 	}
172 172
 
173
-	// Loop through all the arguments of the syscall and convert them
174
-	for _, arg := range args {
175
-		newCall.Args = append(newCall.Args, *arg)
176
-	}
177
-	return newCall
173
+	return newConfig, nil
178 174
 }