This is a backport of 9f6b562dd12ef7b1f9e2f8e6f2ab6477790a6594, adapted to avoid the refactoring that happened in d92739713c633c155c0f3d8065c8278b1d8a44e7.
Original commit message is as follows:
> If no seccomp policy is requested, then the built-in default policy in
> dockerd applies. This has no rule for "clone3" defined, nor any default
> errno defined. So when runc receives the config it attempts to determine
> a default errno, using logic defined in its commit:
>
> opencontainers/runc@7a8d716
>
> As explained in the above commit message, runc uses a heuristic to
> decide which errno to return by default:
>
> [quote]
> The solution applied here is to prepend a "stub" filter which returns
> -ENOSYS if the requested syscall has a larger syscall number than any
> syscall mentioned in the filter. The reason for this specific rule is
> that syscall numbers are (roughly) allocated sequentially and thus newer
> syscalls will (usually) have a larger syscall number -- thus causing our
> filters to produce -ENOSYS if the filter was written before the syscall
> existed.
> [/quote]
>
> Unfortunately clone3 appears to be one of the edge cases that does not
> result in use of ENOSYS, instead ending up with the historical EPERM
> errno.
>
> Latest glibc (2.33.9000, in Fedora 35 rawhide) will attempt to use
> clone3 by default. If it sees ENOSYS then it will automatically
> fall back to using clone. Any other errno is treated as a fatal
> error. Thus when docker seccomp policy triggers EPERM from clone3,
> no fallback occurs and programs are thus unable to spawn threads.
>
> The clone3 syscall is much more complicated than clone, most notably its
> flags are not exposed as a direct argument any more. Instead they are
> hidden inside a struct. This means that seccomp filters are unable to
> apply policy based on values seen in flags. Thus we can't directly
> replicate the current "clone" filtering for "clone3". We can at least
> ensure "clone3" returns ENOSYS errno, to trigger fallback to "clone"
> at which point we can filter on flags.
Signed-off-by: Tianon Gravi <admwiggin@gmail.com>
Co-authored-by: Daniel P. Berrangé <berrange@redhat.com>
| ... | ... |
@@ -591,6 +591,7 @@ |
| 591 | 591 |
"names": [ |
| 592 | 592 |
"bpf", |
| 593 | 593 |
"clone", |
| 594 |
+ "clone3", |
|
| 594 | 595 |
"fanotify_init", |
| 595 | 596 |
"fsconfig", |
| 596 | 597 |
"fsmount", |
| ... | ... |
@@ -672,6 +673,21 @@ |
| 672 | 672 |
}, |
| 673 | 673 |
{
|
| 674 | 674 |
"names": [ |
| 675 |
+ "clone3" |
|
| 676 |
+ ], |
|
| 677 |
+ "action": "SCMP_ACT_ERRNO", |
|
| 678 |
+ "errnoRet": 38, |
|
| 679 |
+ "args": [], |
|
| 680 |
+ "comment": "", |
|
| 681 |
+ "includes": {},
|
|
| 682 |
+ "excludes": {
|
|
| 683 |
+ "caps": [ |
|
| 684 |
+ "CAP_SYS_ADMIN" |
|
| 685 |
+ ] |
|
| 686 |
+ } |
|
| 687 |
+ }, |
|
| 688 |
+ {
|
|
| 689 |
+ "names": [ |
|
| 675 | 690 |
"reboot" |
| 676 | 691 |
], |
| 677 | 692 |
"action": "SCMP_ACT_ALLOW", |
| ... | ... |
@@ -42,6 +42,7 @@ func arches() []Architecture {
|
| 42 | 42 |
|
| 43 | 43 |
// DefaultProfile defines the allowed syscalls for the default seccomp profile. |
| 44 | 44 |
func DefaultProfile() *Seccomp {
|
| 45 |
+ nosys := uint(unix.ENOSYS) |
|
| 45 | 46 |
syscalls := []*Syscall{
|
| 46 | 47 |
{
|
| 47 | 48 |
Names: []string{
|
| ... | ... |
@@ -522,6 +523,7 @@ func DefaultProfile() *Seccomp {
|
| 522 | 522 |
Names: []string{
|
| 523 | 523 |
"bpf", |
| 524 | 524 |
"clone", |
| 525 |
+ "clone3", |
|
| 525 | 526 |
"fanotify_init", |
| 526 | 527 |
"fsconfig", |
| 527 | 528 |
"fsmount", |
| ... | ... |
@@ -589,6 +591,17 @@ func DefaultProfile() *Seccomp {
|
| 589 | 589 |
}, |
| 590 | 590 |
{
|
| 591 | 591 |
Names: []string{
|
| 592 |
+ "clone3", |
|
| 593 |
+ }, |
|
| 594 |
+ Action: specs.ActErrno, |
|
| 595 |
+ ErrnoRet: &nosys, |
|
| 596 |
+ Args: []*specs.LinuxSeccompArg{},
|
|
| 597 |
+ Excludes: Filter{
|
|
| 598 |
+ Caps: []string{"CAP_SYS_ADMIN"},
|
|
| 599 |
+ }, |
|
| 600 |
+ }, |
|
| 601 |
+ {
|
|
| 602 |
+ Names: []string{
|
|
| 592 | 603 |
"reboot", |
| 593 | 604 |
}, |
| 594 | 605 |
Action: specs.ActAllow, |
| ... | ... |
@@ -45,6 +45,7 @@ type Syscall struct {
|
| 45 | 45 |
Name string `json:"name,omitempty"` |
| 46 | 46 |
Names []string `json:"names,omitempty"` |
| 47 | 47 |
Action specs.LinuxSeccompAction `json:"action"` |
| 48 |
+ ErrnoRet *uint `json:"errnoRet,omitempty"` |
|
| 48 | 49 |
Args []*specs.LinuxSeccompArg `json:"args"` |
| 49 | 50 |
Comment string `json:"comment"` |
| 50 | 51 |
Includes Filter `json:"includes"` |
| ... | ... |
@@ -150,29 +150,25 @@ Loop: |
| 150 | 150 |
} |
| 151 | 151 |
} |
| 152 | 152 |
|
| 153 |
+ newCall := specs.LinuxSyscall{
|
|
| 154 |
+ Action: call.Action, |
|
| 155 |
+ ErrnoRet: call.ErrnoRet, |
|
| 156 |
+ } |
|
| 153 | 157 |
if call.Name != "" && len(call.Names) != 0 {
|
| 154 | 158 |
return nil, errors.New("'name' and 'names' were specified in the seccomp profile, use either 'name' or 'names'")
|
| 155 | 159 |
} |
| 156 |
- |
|
| 157 | 160 |
if call.Name != "" {
|
| 158 |
- newConfig.Syscalls = append(newConfig.Syscalls, createSpecsSyscall([]string{call.Name}, call.Action, call.Args))
|
|
| 161 |
+ newCall.Names = []string{call.Name}
|
|
| 159 | 162 |
} else {
|
| 160 |
- newConfig.Syscalls = append(newConfig.Syscalls, createSpecsSyscall(call.Names, call.Action, call.Args)) |
|
| 163 |
+ newCall.Names = call.Names |
|
| 164 |
+ } |
|
| 165 |
+ // Loop through all the arguments of the syscall and convert them |
|
| 166 |
+ for _, arg := range call.Args {
|
|
| 167 |
+ newCall.Args = append(newCall.Args, *arg) |
|
| 161 | 168 |
} |
| 162 |
- } |
|
| 163 |
- |
|
| 164 |
- return newConfig, nil |
|
| 165 |
-} |
|
| 166 | 169 |
|
| 167 |
-func createSpecsSyscall(names []string, action specs.LinuxSeccompAction, args []*specs.LinuxSeccompArg) specs.LinuxSyscall {
|
|
| 168 |
- newCall := specs.LinuxSyscall{
|
|
| 169 |
- Names: names, |
|
| 170 |
- Action: action, |
|
| 170 |
+ newConfig.Syscalls = append(newConfig.Syscalls, newCall) |
|
| 171 | 171 |
} |
| 172 | 172 |
|
| 173 |
- // Loop through all the arguments of the syscall and convert them |
|
| 174 |
- for _, arg := range args {
|
|
| 175 |
- newCall.Args = append(newCall.Args, *arg) |
|
| 176 |
- } |
|
| 177 |
- return newCall |
|
| 173 |
+ return newConfig, nil |
|
| 178 | 174 |
} |