Noteworthy changes:
- Add Prestart/Poststop hook support
- Fix bug finding cgroup mount directory
- Add OomScoreAdj as a container configuration option
- Ensure the cleanup jobs in the deferrer are executed on error
- Don't make modifications to /dev when it is bind mounted
Other changes in runc:
https://github.com/opencontainers/runc/compare/v0.0.3...v0.0.4
Signed-off-by: David Calavera <david.calavera@gmail.com>
| ... | ... |
@@ -1112,12 +1112,9 @@ func (container *Container) unmountVolumes(forceSyscall bool) error {
|
| 1112 | 1112 |
|
| 1113 | 1113 |
func (container *Container) networkMounts() []execdriver.Mount {
|
| 1114 | 1114 |
var mounts []execdriver.Mount |
| 1115 |
- mode := "Z" |
|
| 1116 |
- if container.hostConfig.NetworkMode.IsContainer() {
|
|
| 1117 |
- mode = "z" |
|
| 1118 |
- } |
|
| 1115 |
+ shared := container.hostConfig.NetworkMode.IsContainer() |
|
| 1119 | 1116 |
if container.ResolvConfPath != "" {
|
| 1120 |
- label.Relabel(container.ResolvConfPath, container.MountLabel, mode) |
|
| 1117 |
+ label.Relabel(container.ResolvConfPath, container.MountLabel, shared) |
|
| 1121 | 1118 |
writable := !container.hostConfig.ReadonlyRootfs |
| 1122 | 1119 |
if m, exists := container.MountPoints["/etc/resolv.conf"]; exists {
|
| 1123 | 1120 |
writable = m.RW |
| ... | ... |
@@ -1130,7 +1127,7 @@ func (container *Container) networkMounts() []execdriver.Mount {
|
| 1130 | 1130 |
}) |
| 1131 | 1131 |
} |
| 1132 | 1132 |
if container.HostnamePath != "" {
|
| 1133 |
- label.Relabel(container.HostnamePath, container.MountLabel, mode) |
|
| 1133 |
+ label.Relabel(container.HostnamePath, container.MountLabel, shared) |
|
| 1134 | 1134 |
writable := !container.hostConfig.ReadonlyRootfs |
| 1135 | 1135 |
if m, exists := container.MountPoints["/etc/hostname"]; exists {
|
| 1136 | 1136 |
writable = m.RW |
| ... | ... |
@@ -1143,7 +1140,7 @@ func (container *Container) networkMounts() []execdriver.Mount {
|
| 1143 | 1143 |
}) |
| 1144 | 1144 |
} |
| 1145 | 1145 |
if container.HostsPath != "" {
|
| 1146 |
- label.Relabel(container.HostsPath, container.MountLabel, mode) |
|
| 1146 |
+ label.Relabel(container.HostsPath, container.MountLabel, shared) |
|
| 1147 | 1147 |
writable := !container.hostConfig.ReadonlyRootfs |
| 1148 | 1148 |
if m, exists := container.MountPoints["/etc/hosts"]; exists {
|
| 1149 | 1149 |
writable = m.RW |
| ... | ... |
@@ -59,7 +59,7 @@ func createContainerPlatformSpecificSettings(container *Container, config *runco |
| 59 | 59 |
return err |
| 60 | 60 |
} |
| 61 | 61 |
|
| 62 |
- if err := label.Relabel(v.Path(), container.MountLabel, "z"); err != nil {
|
|
| 62 |
+ if err := label.Relabel(v.Path(), container.MountLabel, true); err != nil {
|
|
| 63 | 63 |
return err |
| 64 | 64 |
} |
| 65 | 65 |
|
| ... | ... |
@@ -355,7 +355,8 @@ func (daemon *Daemon) registerMountPoints(container *Container, hostConfig *runc |
| 355 | 355 |
} |
| 356 | 356 |
} |
| 357 | 357 |
|
| 358 |
- if err := label.Relabel(bind.Source, container.MountLabel, bind.Mode); err != nil {
|
|
| 358 |
+ shared := label.IsShared(bind.Mode) |
|
| 359 |
+ if err := label.Relabel(bind.Source, container.MountLabel, shared); err != nil {
|
|
| 359 | 360 |
return err |
| 360 | 361 |
} |
| 361 | 362 |
binds[bind.Destination] = true |
| ... | ... |
@@ -42,7 +42,7 @@ clone git github.com/endophage/gotuf 9bcdad0308e34a49f38448b8ad436ad8860825ce |
| 42 | 42 |
clone git github.com/jfrazelle/go 6e461eb70cb4187b41a84e9a567d7137bdbe0f16 |
| 43 | 43 |
clone git github.com/agl/ed25519 d2b94fd789ea21d12fac1a4443dd3a3f79cda72c |
| 44 | 44 |
|
| 45 |
-clone git github.com/opencontainers/runc v0.0.3 # libcontainer |
|
| 45 |
+clone git github.com/opencontainers/runc v0.0.4 # libcontainer |
|
| 46 | 46 |
# libcontainer deps (see src/github.com/docker/libcontainer/update-vendor.sh) |
| 47 | 47 |
clone git github.com/coreos/go-systemd v3 |
| 48 | 48 |
clone git github.com/godbus/dbus v2 |
| ... | ... |
@@ -83,7 +83,7 @@ type data struct {
|
| 83 | 83 |
pid int |
| 84 | 84 |
} |
| 85 | 85 |
|
| 86 |
-func (m *Manager) Apply(pid int) error {
|
|
| 86 |
+func (m *Manager) Apply(pid int) (err error) {
|
|
| 87 | 87 |
if m.Cgroups == nil {
|
| 88 | 88 |
return nil |
| 89 | 89 |
} |
| ... | ... |
@@ -235,12 +235,12 @@ func getCgroupData(c *configs.Cgroup, pid int) (*data, error) {
|
| 235 | 235 |
}, nil |
| 236 | 236 |
} |
| 237 | 237 |
|
| 238 |
-func (raw *data) parent(subsystem, mountpoint, src string) (string, error) {
|
|
| 239 |
- initPath, err := cgroups.GetInitCgroupDir(subsystem) |
|
| 238 |
+func (raw *data) parent(subsystem, mountpoint, root string) (string, error) {
|
|
| 239 |
+ initPath, err := cgroups.GetThisCgroupDir(subsystem) |
|
| 240 | 240 |
if err != nil {
|
| 241 | 241 |
return "", err |
| 242 | 242 |
} |
| 243 |
- relDir, err := filepath.Rel(src, initPath) |
|
| 243 |
+ relDir, err := filepath.Rel(root, initPath) |
|
| 244 | 244 |
if err != nil {
|
| 245 | 245 |
return "", err |
| 246 | 246 |
} |
| ... | ... |
@@ -248,7 +248,7 @@ func (raw *data) parent(subsystem, mountpoint, src string) (string, error) {
|
| 248 | 248 |
} |
| 249 | 249 |
|
| 250 | 250 |
func (raw *data) path(subsystem string) (string, error) {
|
| 251 |
- mnt, src, err := cgroups.FindCgroupMountpointAndSource(subsystem) |
|
| 251 |
+ mnt, root, err := cgroups.FindCgroupMountpointAndRoot(subsystem) |
|
| 252 | 252 |
// If we didn't mount the subsystem, there is no point we make the path. |
| 253 | 253 |
if err != nil {
|
| 254 | 254 |
return "", err |
| ... | ... |
@@ -259,7 +259,7 @@ func (raw *data) path(subsystem string) (string, error) {
|
| 259 | 259 |
return filepath.Join(raw.root, filepath.Base(mnt), raw.cgroup), nil |
| 260 | 260 |
} |
| 261 | 261 |
|
| 262 |
- parent, err := raw.parent(subsystem, mnt, src) |
|
| 262 |
+ parent, err := raw.parent(subsystem, mnt, root) |
|
| 263 | 263 |
if err != nil {
|
| 264 | 264 |
return "", err |
| 265 | 265 |
} |
| ... | ... |
@@ -17,7 +17,7 @@ import ( |
| 17 | 17 |
type MemoryGroup struct {
|
| 18 | 18 |
} |
| 19 | 19 |
|
| 20 |
-func (s *MemoryGroup) Apply(d *data) error {
|
|
| 20 |
+func (s *MemoryGroup) Apply(d *data) (err error) {
|
|
| 21 | 21 |
path, err := d.path("memory")
|
| 22 | 22 |
if err != nil {
|
| 23 | 23 |
if cgroups.IsNotFound(err) {
|
| ... | ... |
@@ -28,21 +28,22 @@ func (s *MemoryGroup) Apply(d *data) error {
|
| 28 | 28 |
if err := os.MkdirAll(path, 0755); err != nil {
|
| 29 | 29 |
return err |
| 30 | 30 |
} |
| 31 |
+ |
|
| 32 |
+ defer func() {
|
|
| 33 |
+ if err != nil {
|
|
| 34 |
+ os.RemoveAll(path) |
|
| 35 |
+ } |
|
| 36 |
+ }() |
|
| 37 |
+ |
|
| 31 | 38 |
if err := s.Set(path, d.c); err != nil {
|
| 32 | 39 |
return err |
| 33 | 40 |
} |
| 34 | 41 |
|
| 35 | 42 |
// We need to join memory cgroup after set memory limits, because |
| 36 | 43 |
// kmem.limit_in_bytes can only be set when the cgroup is empty. |
| 37 |
- _, err = d.join("memory")
|
|
| 38 |
- if err != nil {
|
|
| 44 |
+ if _, err = d.join("memory"); err != nil {
|
|
| 39 | 45 |
return err |
| 40 | 46 |
} |
| 41 |
- defer func() {
|
|
| 42 |
- if err != nil {
|
|
| 43 |
- os.RemoveAll(path) |
|
| 44 |
- } |
|
| 45 |
- }() |
|
| 46 | 47 |
|
| 47 | 48 |
return nil |
| 48 | 49 |
} |
| ... | ... |
@@ -21,6 +21,9 @@ const cgroupNamePrefix = "name=" |
| 21 | 21 |
|
| 22 | 22 |
// https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt |
| 23 | 23 |
func FindCgroupMountpoint(subsystem string) (string, error) {
|
| 24 |
+ // We are not using mount.GetMounts() because it's super-inefficient, |
|
| 25 |
+ // parsing mountinfo directly was about 10 times faster because it avoids Sscanf. |
|
| 26 |
+ // It was one of two major performance drawbacks in container start. |
|
| 24 | 27 |
f, err := os.Open("/proc/self/mountinfo")
|
| 25 | 28 |
if err != nil {
|
| 26 | 29 |
return "", err |
| ... | ... |
@@ -44,7 +47,7 @@ func FindCgroupMountpoint(subsystem string) (string, error) {
|
| 44 | 44 |
return "", NewNotFoundError(subsystem) |
| 45 | 45 |
} |
| 46 | 46 |
|
| 47 |
-func FindCgroupMountpointAndSource(subsystem string) (string, string, error) {
|
|
| 47 |
+func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
|
|
| 48 | 48 |
f, err := os.Open("/proc/self/mountinfo")
|
| 49 | 49 |
if err != nil {
|
| 50 | 50 |
return "", "", err |
| ... | ... |
@@ -69,16 +72,29 @@ func FindCgroupMountpointAndSource(subsystem string) (string, string, error) {
|
| 69 | 69 |
} |
| 70 | 70 |
|
| 71 | 71 |
func FindCgroupMountpointDir() (string, error) {
|
| 72 |
- mounts, err := mount.GetMounts() |
|
| 72 |
+ f, err := os.Open("/proc/self/mountinfo")
|
|
| 73 | 73 |
if err != nil {
|
| 74 | 74 |
return "", err |
| 75 | 75 |
} |
| 76 |
+ defer f.Close() |
|
| 76 | 77 |
|
| 77 |
- for _, mount := range mounts {
|
|
| 78 |
- if mount.Fstype == "cgroup" {
|
|
| 79 |
- return filepath.Dir(mount.Mountpoint), nil |
|
| 78 |
+ scanner := bufio.NewScanner(f) |
|
| 79 |
+ for scanner.Scan() {
|
|
| 80 |
+ text := scanner.Text() |
|
| 81 |
+ fields := strings.Split(text, " ") |
|
| 82 |
+ // Safe as mountinfo encodes mountpoints with spaces as \040. |
|
| 83 |
+ index := strings.Index(text, " - ") |
|
| 84 |
+ postSeparatorFields := strings.Fields(text[index+3:]) |
|
| 85 |
+ if len(postSeparatorFields) < 3 {
|
|
| 86 |
+ return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
|
|
| 87 |
+ } |
|
| 88 |
+ if postSeparatorFields[0] == "cgroup" {
|
|
| 89 |
+ return filepath.Dir(fields[4]), nil |
|
| 80 | 90 |
} |
| 81 | 91 |
} |
| 92 |
+ if err := scanner.Err(); err != nil {
|
|
| 93 |
+ return "", err |
|
| 94 |
+ } |
|
| 82 | 95 |
|
| 83 | 96 |
return "", NewNotFoundError("cgroup")
|
| 84 | 97 |
} |
| ... | ... |
@@ -1,5 +1,11 @@ |
| 1 | 1 |
package configs |
| 2 | 2 |
|
| 3 |
+import ( |
|
| 4 |
+ "bytes" |
|
| 5 |
+ "encoding/json" |
|
| 6 |
+ "os/exec" |
|
| 7 |
+) |
|
| 8 |
+ |
|
| 3 | 9 |
type Rlimit struct {
|
| 4 | 10 |
Type int `json:"type"` |
| 5 | 11 |
Hard uint64 `json:"hard"` |
| ... | ... |
@@ -13,36 +19,46 @@ type IDMap struct {
|
| 13 | 13 |
Size int `json:"size"` |
| 14 | 14 |
} |
| 15 | 15 |
|
| 16 |
+// Seccomp represents syscall restrictions |
|
| 16 | 17 |
type Seccomp struct {
|
| 17 |
- Syscalls []*Syscall `json:"syscalls"` |
|
| 18 |
+ DefaultAction Action `json:"default_action"` |
|
| 19 |
+ Syscalls []*Syscall `json:"syscalls"` |
|
| 18 | 20 |
} |
| 19 | 21 |
|
| 22 |
+// An action to be taken upon rule match in Seccomp |
|
| 20 | 23 |
type Action int |
| 21 | 24 |
|
| 22 | 25 |
const ( |
| 23 |
- Kill Action = iota - 3 |
|
| 26 |
+ Kill Action = iota - 4 |
|
| 27 |
+ Errno |
|
| 24 | 28 |
Trap |
| 25 | 29 |
Allow |
| 26 | 30 |
) |
| 27 | 31 |
|
| 32 |
+// A comparison operator to be used when matching syscall arguments in Seccomp |
|
| 28 | 33 |
type Operator int |
| 29 | 34 |
|
| 30 | 35 |
const ( |
| 31 | 36 |
EqualTo Operator = iota |
| 32 | 37 |
NotEqualTo |
| 33 |
- GreatherThan |
|
| 38 |
+ GreaterThan |
|
| 39 |
+ GreaterThanOrEqualTo |
|
| 34 | 40 |
LessThan |
| 41 |
+ LessThanOrEqualTo |
|
| 35 | 42 |
MaskEqualTo |
| 36 | 43 |
) |
| 37 | 44 |
|
| 45 |
+// A rule to match a specific syscall argument in Seccomp |
|
| 38 | 46 |
type Arg struct {
|
| 39 |
- Index int `json:"index"` |
|
| 40 |
- Value uint32 `json:"value"` |
|
| 41 |
- Op Operator `json:"op"` |
|
| 47 |
+ Index uint `json:"index"` |
|
| 48 |
+ Value uint64 `json:"value"` |
|
| 49 |
+ ValueTwo uint64 `json:"value_two"` |
|
| 50 |
+ Op Operator `json:"op"` |
|
| 42 | 51 |
} |
| 43 | 52 |
|
| 53 |
+// A rule to match a syscall in Seccomp |
|
| 44 | 54 |
type Syscall struct {
|
| 45 |
- Value int `json:"value"` |
|
| 55 |
+ Name string `json:"name"` |
|
| 46 | 56 |
Action Action `json:"action"` |
| 47 | 57 |
Args []*Arg `json:"args"` |
| 48 | 58 |
} |
| ... | ... |
@@ -117,6 +133,12 @@ type Config struct {
|
| 117 | 117 |
// If Rlimits are not set, the container will inherit rlimits from the parent process |
| 118 | 118 |
Rlimits []Rlimit `json:"rlimits"` |
| 119 | 119 |
|
| 120 |
+ // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores |
|
| 121 |
+ // for a process. Valid values are in the range [-1000, 1000], where processes with |
|
| 122 |
+ // higher scores are preferred for being killed. |
|
| 123 |
+ // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ |
|
| 124 |
+ OomScoreAdj int `json:"oom_score_adj"` |
|
| 125 |
+ |
|
| 120 | 126 |
// AdditionalGroups specifies the gids that should be added to supplementary groups |
| 121 | 127 |
// in addition to those that the user belongs to. |
| 122 | 128 |
AdditionalGroups []string `json:"additional_groups"` |
| ... | ... |
@@ -140,7 +162,79 @@ type Config struct {
|
| 140 | 140 |
Sysctl map[string]string `json:"sysctl"` |
| 141 | 141 |
|
| 142 | 142 |
// Seccomp allows actions to be taken whenever a syscall is made within the container. |
| 143 |
- // By default, all syscalls are allowed with actions to allow, trap, kill, or return an errno |
|
| 144 |
- // can be specified on a per syscall basis. |
|
| 143 |
+ // A number of rules are given, each having an action to be taken if a syscall matches it. |
|
| 144 |
+ // A default action to be taken if no rules match is also given. |
|
| 145 | 145 |
Seccomp *Seccomp `json:"seccomp"` |
| 146 |
+ |
|
| 147 |
+ // Hooks are a collection of actions to perform at various container lifecycle events. |
|
| 148 |
+ // Hooks cannot be marshaled to JSON, but they do not need to be. |
|
| 149 |
+ Hooks *Hooks `json:"-"` |
|
| 150 |
+} |
|
| 151 |
+ |
|
| 152 |
+type Hooks struct {
|
|
| 153 |
+ // Prestart commands are executed after the container namespaces are created, |
|
| 154 |
+ // but before the user supplied command is executed from init. |
|
| 155 |
+ Prestart []Hook |
|
| 156 |
+ |
|
| 157 |
+ // Poststop commands are executed after the container init process exits. |
|
| 158 |
+ Poststop []Hook |
|
| 159 |
+} |
|
| 160 |
+ |
|
| 161 |
+// HookState is the payload provided to a hook on execution. |
|
| 162 |
+type HookState struct {
|
|
| 163 |
+ ID string `json:"id"` |
|
| 164 |
+ Pid int `json:"pid"` |
|
| 165 |
+ Root string `json:"root"` |
|
| 166 |
+} |
|
| 167 |
+ |
|
| 168 |
+type Hook interface {
|
|
| 169 |
+ // Run executes the hook with the provided state. |
|
| 170 |
+ Run(HookState) error |
|
| 171 |
+} |
|
| 172 |
+ |
|
| 173 |
+// NewFunctionHook will call the provided function when the hook is run. |
|
| 174 |
+func NewFunctionHook(f func(HookState) error) FuncHook {
|
|
| 175 |
+ return FuncHook{
|
|
| 176 |
+ run: f, |
|
| 177 |
+ } |
|
| 178 |
+} |
|
| 179 |
+ |
|
| 180 |
+type FuncHook struct {
|
|
| 181 |
+ run func(HookState) error |
|
| 182 |
+} |
|
| 183 |
+ |
|
| 184 |
+func (f FuncHook) Run(s HookState) error {
|
|
| 185 |
+ return f.run(s) |
|
| 186 |
+} |
|
| 187 |
+ |
|
| 188 |
+type Command struct {
|
|
| 189 |
+ Path string `json:"path"` |
|
| 190 |
+ Args []string `json:"args"` |
|
| 191 |
+ Env []string `json:"env"` |
|
| 192 |
+ Dir string `json:"dir"` |
|
| 193 |
+} |
|
| 194 |
+ |
|
| 195 |
+// NewCommandHook will execute the provided command when the hook is run. |
|
| 196 |
+func NewCommandHook(cmd Command) CommandHook {
|
|
| 197 |
+ return CommandHook{
|
|
| 198 |
+ Command: cmd, |
|
| 199 |
+ } |
|
| 200 |
+} |
|
| 201 |
+ |
|
| 202 |
+type CommandHook struct {
|
|
| 203 |
+ Command |
|
| 204 |
+} |
|
| 205 |
+ |
|
| 206 |
+func (c Command) Run(s HookState) error {
|
|
| 207 |
+ b, err := json.Marshal(s) |
|
| 208 |
+ if err != nil {
|
|
| 209 |
+ return err |
|
| 210 |
+ } |
|
| 211 |
+ cmd := exec.Cmd{
|
|
| 212 |
+ Path: c.Path, |
|
| 213 |
+ Args: c.Args, |
|
| 214 |
+ Env: c.Env, |
|
| 215 |
+ Stdin: bytes.NewReader(b), |
|
| 216 |
+ } |
|
| 217 |
+ return cmd.Run() |
|
| 146 | 218 |
} |
| ... | ... |
@@ -25,10 +25,3 @@ type Mount struct {
|
| 25 | 25 |
// Optional Command to be run after Source is mounted. |
| 26 | 26 |
PostmountCmds []Command `json:"postmount_cmds"` |
| 27 | 27 |
} |
| 28 |
- |
|
| 29 |
-type Command struct {
|
|
| 30 |
- Path string `json:"path"` |
|
| 31 |
- Args []string `json:"args"` |
|
| 32 |
- Env []string `json:"env"` |
|
| 33 |
- Dir string `json:"dir"` |
|
| 34 |
-} |
| ... | ... |
@@ -185,6 +185,7 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c |
| 185 | 185 |
parentPipe: parentPipe, |
| 186 | 186 |
manager: c.cgroupManager, |
| 187 | 187 |
config: c.newInitConfig(p), |
| 188 |
+ container: c, |
|
| 188 | 189 |
}, nil |
| 189 | 190 |
} |
| 190 | 191 |
|
| ... | ... |
@@ -247,6 +248,17 @@ func (c *linuxContainer) Destroy() error {
|
| 247 | 247 |
err = rerr |
| 248 | 248 |
} |
| 249 | 249 |
c.initProcess = nil |
| 250 |
+ if c.config.Hooks != nil {
|
|
| 251 |
+ s := configs.HookState{
|
|
| 252 |
+ ID: c.id, |
|
| 253 |
+ Root: c.config.Rootfs, |
|
| 254 |
+ } |
|
| 255 |
+ for _, hook := range c.config.Hooks.Poststop {
|
|
| 256 |
+ if err := hook.Run(s); err != nil {
|
|
| 257 |
+ return err |
|
| 258 |
+ } |
|
| 259 |
+ } |
|
| 260 |
+ } |
|
| 250 | 261 |
return err |
| 251 | 262 |
} |
| 252 | 263 |
|
| ... | ... |
@@ -299,7 +311,7 @@ func (c *linuxContainer) checkCriuVersion() error {
|
| 299 | 299 |
return nil |
| 300 | 300 |
} |
| 301 | 301 |
|
| 302 |
-const descriptors_filename = "descriptors.json" |
|
| 302 |
+const descriptorsFilename = "descriptors.json" |
|
| 303 | 303 |
|
| 304 | 304 |
func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
|
| 305 | 305 |
mountDest := m.Destination |
| ... | ... |
@@ -406,7 +418,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
|
| 406 | 406 |
return err |
| 407 | 407 |
} |
| 408 | 408 |
|
| 409 |
- err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptors_filename), fdsJSON, 0655) |
|
| 409 |
+ err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0655) |
|
| 410 | 410 |
if err != nil {
|
| 411 | 411 |
return err |
| 412 | 412 |
} |
| ... | ... |
@@ -532,13 +544,19 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
|
| 532 | 532 |
break |
| 533 | 533 |
} |
| 534 | 534 |
} |
| 535 |
+ for _, i := range criuOpts.VethPairs {
|
|
| 536 |
+ veth := new(criurpc.CriuVethPair) |
|
| 537 |
+ veth.IfOut = proto.String(i.HostInterfaceName) |
|
| 538 |
+ veth.IfIn = proto.String(i.ContainerInterfaceName) |
|
| 539 |
+ req.Opts.Veths = append(req.Opts.Veths, veth) |
|
| 540 |
+ } |
|
| 535 | 541 |
|
| 536 | 542 |
var ( |
| 537 | 543 |
fds []string |
| 538 | 544 |
fdJSON []byte |
| 539 | 545 |
) |
| 540 | 546 |
|
| 541 |
- if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptors_filename)); err != nil {
|
|
| 547 |
+ if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
|
|
| 542 | 548 |
return err |
| 543 | 549 |
} |
| 544 | 550 |
|
| ... | ... |
@@ -568,6 +586,7 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts * |
| 568 | 568 |
return err |
| 569 | 569 |
} |
| 570 | 570 |
|
| 571 |
+ logPath := filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile()) |
|
| 571 | 572 |
criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client") |
| 572 | 573 |
criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server") |
| 573 | 574 |
defer criuClient.Close() |
| ... | ... |
@@ -631,7 +650,8 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts * |
| 631 | 631 |
return err |
| 632 | 632 |
} |
| 633 | 633 |
if !resp.GetSuccess() {
|
| 634 |
- return fmt.Errorf("criu failed: type %s errno %d", req.GetType().String(), resp.GetCrErrno())
|
|
| 634 |
+ typeString := req.GetType().String() |
|
| 635 |
+ return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
|
|
| 635 | 636 |
} |
| 636 | 637 |
|
| 637 | 638 |
t := resp.GetType() |
| ... | ... |
@@ -671,7 +691,7 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts * |
| 671 | 671 |
return err |
| 672 | 672 |
} |
| 673 | 673 |
if !st.Success() {
|
| 674 |
- return fmt.Errorf("criu failed: %s", st.String())
|
|
| 674 |
+ return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath)
|
|
| 675 | 675 |
} |
| 676 | 676 |
return nil |
| 677 | 677 |
} |
| ... | ... |
@@ -5,6 +5,11 @@ type CriuPageServerInfo struct {
|
| 5 | 5 |
Port int32 // port number of CRIU page server |
| 6 | 6 |
} |
| 7 | 7 |
|
| 8 |
+type VethPairName struct {
|
|
| 9 |
+ ContainerInterfaceName string |
|
| 10 |
+ HostInterfaceName string |
|
| 11 |
+} |
|
| 12 |
+ |
|
| 8 | 13 |
type CriuOpts struct {
|
| 9 | 14 |
ImagesDirectory string // directory for storing image files |
| 10 | 15 |
WorkDirectory string // directory to cd and write logs/pidfiles/stats to |
| ... | ... |
@@ -14,4 +19,5 @@ type CriuOpts struct {
|
| 14 | 14 |
ShellJob bool // allow to dump and restore shell jobs |
| 15 | 15 |
FileLocks bool // handle file locks, for safety |
| 16 | 16 |
PageServer CriuPageServerInfo // allow to dump to criu page server |
| 17 |
+ VethPairs []VethPairName // veth pairs to pass to criu on restore |
|
| 17 | 18 |
} |
| ... | ... |
@@ -5,7 +5,9 @@ package libcontainer |
| 5 | 5 |
import ( |
| 6 | 6 |
"encoding/json" |
| 7 | 7 |
"fmt" |
| 8 |
+ "io/ioutil" |
|
| 8 | 9 |
"os" |
| 10 |
+ "strconv" |
|
| 9 | 11 |
"strings" |
| 10 | 12 |
"syscall" |
| 11 | 13 |
|
| ... | ... |
@@ -13,7 +15,6 @@ import ( |
| 13 | 13 |
"github.com/opencontainers/runc/libcontainer/cgroups" |
| 14 | 14 |
"github.com/opencontainers/runc/libcontainer/configs" |
| 15 | 15 |
"github.com/opencontainers/runc/libcontainer/netlink" |
| 16 |
- "github.com/opencontainers/runc/libcontainer/seccomp" |
|
| 17 | 16 |
"github.com/opencontainers/runc/libcontainer/system" |
| 18 | 17 |
"github.com/opencontainers/runc/libcontainer/user" |
| 19 | 18 |
"github.com/opencontainers/runc/libcontainer/utils" |
| ... | ... |
@@ -239,6 +240,11 @@ func setupRlimits(config *configs.Config) error {
|
| 239 | 239 |
return nil |
| 240 | 240 |
} |
| 241 | 241 |
|
| 242 |
+func setOomScoreAdj(oomScoreAdj int) error {
|
|
| 243 |
+ path := "/proc/self/oom_score_adj" |
|
| 244 |
+ return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0700) |
|
| 245 |
+} |
|
| 246 |
+ |
|
| 242 | 247 |
// killCgroupProcesses freezes then iterates over all the processes inside the |
| 243 | 248 |
// manager's cgroups sending a SIGKILL to each process then waiting for them to |
| 244 | 249 |
// exit. |
| ... | ... |
@@ -270,61 +276,3 @@ func killCgroupProcesses(m cgroups.Manager) error {
|
| 270 | 270 |
} |
| 271 | 271 |
return nil |
| 272 | 272 |
} |
| 273 |
- |
|
| 274 |
-func finalizeSeccomp(config *initConfig) error {
|
|
| 275 |
- if config.Config.Seccomp == nil {
|
|
| 276 |
- return nil |
|
| 277 |
- } |
|
| 278 |
- context := seccomp.New() |
|
| 279 |
- for _, s := range config.Config.Seccomp.Syscalls {
|
|
| 280 |
- ss := &seccomp.Syscall{
|
|
| 281 |
- Value: uint32(s.Value), |
|
| 282 |
- Action: seccompAction(s.Action), |
|
| 283 |
- } |
|
| 284 |
- if len(s.Args) > 0 {
|
|
| 285 |
- ss.Args = seccompArgs(s.Args) |
|
| 286 |
- } |
|
| 287 |
- context.Add(ss) |
|
| 288 |
- } |
|
| 289 |
- return context.Load() |
|
| 290 |
-} |
|
| 291 |
- |
|
| 292 |
-func seccompAction(a configs.Action) seccomp.Action {
|
|
| 293 |
- switch a {
|
|
| 294 |
- case configs.Kill: |
|
| 295 |
- return seccomp.Kill |
|
| 296 |
- case configs.Trap: |
|
| 297 |
- return seccomp.Trap |
|
| 298 |
- case configs.Allow: |
|
| 299 |
- return seccomp.Allow |
|
| 300 |
- } |
|
| 301 |
- return seccomp.Error(syscall.Errno(int(a))) |
|
| 302 |
-} |
|
| 303 |
- |
|
| 304 |
-func seccompArgs(args []*configs.Arg) seccomp.Args {
|
|
| 305 |
- var sa []seccomp.Arg |
|
| 306 |
- for _, a := range args {
|
|
| 307 |
- sa = append(sa, seccomp.Arg{
|
|
| 308 |
- Index: uint32(a.Index), |
|
| 309 |
- Op: seccompOperator(a.Op), |
|
| 310 |
- Value: uint(a.Value), |
|
| 311 |
- }) |
|
| 312 |
- } |
|
| 313 |
- return seccomp.Args{sa}
|
|
| 314 |
-} |
|
| 315 |
- |
|
| 316 |
-func seccompOperator(o configs.Operator) seccomp.Operator {
|
|
| 317 |
- switch o {
|
|
| 318 |
- case configs.EqualTo: |
|
| 319 |
- return seccomp.EqualTo |
|
| 320 |
- case configs.NotEqualTo: |
|
| 321 |
- return seccomp.NotEqualTo |
|
| 322 |
- case configs.GreatherThan: |
|
| 323 |
- return seccomp.GreatherThan |
|
| 324 |
- case configs.LessThan: |
|
| 325 |
- return seccomp.LessThan |
|
| 326 |
- case configs.MaskEqualTo: |
|
| 327 |
- return seccomp.MaskEqualTo |
|
| 328 |
- } |
|
| 329 |
- return 0 |
|
| 330 |
-} |
| ... | ... |
@@ -29,7 +29,7 @@ func SetFileCreateLabel(fileLabel string) error {
|
| 29 | 29 |
return nil |
| 30 | 30 |
} |
| 31 | 31 |
|
| 32 |
-func Relabel(path string, fileLabel string, relabel string) error {
|
|
| 32 |
+func Relabel(path string, fileLabel string, shared bool) error {
|
|
| 33 | 33 |
return nil |
| 34 | 34 |
} |
| 35 | 35 |
|
| ... | ... |
@@ -59,3 +59,13 @@ func DupSecOpt(src string) []string {
|
| 59 | 59 |
func DisableSecOpt() []string {
|
| 60 | 60 |
return nil |
| 61 | 61 |
} |
| 62 |
+ |
|
| 63 |
+// Validate checks that the label does not include unexpected options |
|
| 64 |
+func Validate(label string) error {
|
|
| 65 |
+ return nil |
|
| 66 |
+} |
|
| 67 |
+ |
|
| 68 |
+// IsShared checks that the label includes a "shared" mark |
|
| 69 |
+func IsShared(label string) bool {
|
|
| 70 |
+ return false |
|
| 71 |
+} |
| ... | ... |
@@ -9,6 +9,8 @@ import ( |
| 9 | 9 |
"github.com/opencontainers/runc/libcontainer/selinux" |
| 10 | 10 |
) |
| 11 | 11 |
|
| 12 |
+var ErrIncompatibleLabel = fmt.Errorf("Bad SELinux option z and Z can not be used together")
|
|
| 13 |
+ |
|
| 12 | 14 |
// InitLabels returns the process label and file labels to be used within |
| 13 | 15 |
// the container. A list of options can be passed into this function to alter |
| 14 | 16 |
// the labels. The labels returned will include a random MCS String, that is |
| ... | ... |
@@ -95,28 +97,24 @@ func SetFileCreateLabel(fileLabel string) error {
|
| 95 | 95 |
return nil |
| 96 | 96 |
} |
| 97 | 97 |
|
| 98 |
-// Change the label of path to the filelabel string. If the relabel string |
|
| 99 |
-// is "z", relabel will change the MCS label to s0. This will allow all |
|
| 100 |
-// containers to share the content. If the relabel string is a "Z" then |
|
| 101 |
-// the MCS label should continue to be used. SELinux will use this field |
|
| 102 |
-// to make sure the content can not be shared by other containes. |
|
| 103 |
-func Relabel(path string, fileLabel string, relabel string) error {
|
|
| 104 |
- exclude_path := []string{"/", "/usr", "/etc"}
|
|
| 105 |
- if fileLabel == "" {
|
|
| 98 |
+// Change the label of path to the filelabel string. |
|
| 99 |
+// It changes the MCS label to s0 if shared is true. |
|
| 100 |
+// This will allow all containers to share the content. |
|
| 101 |
+func Relabel(path string, fileLabel string, shared bool) error {
|
|
| 102 |
+ if !selinux.SelinuxEnabled() {
|
|
| 106 | 103 |
return nil |
| 107 | 104 |
} |
| 108 |
- if !strings.ContainsAny(relabel, "zZ") {
|
|
| 105 |
+ |
|
| 106 |
+ if fileLabel == "" {
|
|
| 109 | 107 |
return nil |
| 110 | 108 |
} |
| 111 |
- for _, p := range exclude_path {
|
|
| 112 |
- if path == p {
|
|
| 113 |
- return fmt.Errorf("Relabeling of %s is not allowed", path)
|
|
| 114 |
- } |
|
| 115 |
- } |
|
| 116 |
- if strings.Contains(relabel, "z") && strings.Contains(relabel, "Z") {
|
|
| 117 |
- return fmt.Errorf("Bad SELinux option z and Z can not be used together")
|
|
| 109 |
+ |
|
| 110 |
+ exclude_paths := map[string]bool{"/": true, "/usr": true, "/etc": true}
|
|
| 111 |
+ if exclude_paths[path] {
|
|
| 112 |
+ return fmt.Errorf("Relabeling of %s is not allowed", path)
|
|
| 118 | 113 |
} |
| 119 |
- if strings.Contains(relabel, "z") {
|
|
| 114 |
+ |
|
| 115 |
+ if shared {
|
|
| 120 | 116 |
c := selinux.NewContext(fileLabel) |
| 121 | 117 |
c["level"] = "s0" |
| 122 | 118 |
fileLabel = c.Get() |
| ... | ... |
@@ -161,3 +159,16 @@ func DupSecOpt(src string) []string {
|
| 161 | 161 |
func DisableSecOpt() []string {
|
| 162 | 162 |
return selinux.DisableSecOpt() |
| 163 | 163 |
} |
| 164 |
+ |
|
| 165 |
+// Validate checks that the label does not include unexpected options |
|
| 166 |
+func Validate(label string) error {
|
|
| 167 |
+ if strings.Contains(label, "z") && strings.Contains(label, "Z") {
|
|
| 168 |
+ return ErrIncompatibleLabel |
|
| 169 |
+ } |
|
| 170 |
+ return nil |
|
| 171 |
+} |
|
| 172 |
+ |
|
| 173 |
+// IsShared checks that the label includes a "shared" mark |
|
| 174 |
+func IsShared(label string) bool {
|
|
| 175 |
+ return strings.Contains(label, "z") |
|
| 176 |
+} |
| ... | ... |
@@ -13,6 +13,7 @@ import ( |
| 13 | 13 |
"syscall" |
| 14 | 14 |
|
| 15 | 15 |
"github.com/opencontainers/runc/libcontainer/cgroups" |
| 16 |
+ "github.com/opencontainers/runc/libcontainer/configs" |
|
| 16 | 17 |
"github.com/opencontainers/runc/libcontainer/system" |
| 17 | 18 |
) |
| 18 | 19 |
|
| ... | ... |
@@ -138,11 +139,9 @@ func (p *setnsProcess) terminate() error {
|
| 138 | 138 |
|
| 139 | 139 |
func (p *setnsProcess) wait() (*os.ProcessState, error) {
|
| 140 | 140 |
err := p.cmd.Wait() |
| 141 |
- if err != nil {
|
|
| 142 |
- return p.cmd.ProcessState, err |
|
| 143 |
- } |
|
| 144 | 141 |
|
| 145 |
- return p.cmd.ProcessState, nil |
|
| 142 |
+ // Return actual ProcessState even on Wait error |
|
| 143 |
+ return p.cmd.ProcessState, err |
|
| 146 | 144 |
} |
| 147 | 145 |
|
| 148 | 146 |
func (p *setnsProcess) pid() int {
|
| ... | ... |
@@ -175,9 +174,9 @@ func (p *initProcess) externalDescriptors() []string {
|
| 175 | 175 |
return p.fds |
| 176 | 176 |
} |
| 177 | 177 |
|
| 178 |
-func (p *initProcess) start() error {
|
|
| 178 |
+func (p *initProcess) start() (err error) {
|
|
| 179 | 179 |
defer p.parentPipe.Close() |
| 180 |
- err := p.cmd.Start() |
|
| 180 |
+ err = p.cmd.Start() |
|
| 181 | 181 |
p.childPipe.Close() |
| 182 | 182 |
if err != nil {
|
| 183 | 183 |
return newSystemError(err) |
| ... | ... |
@@ -202,6 +201,18 @@ func (p *initProcess) start() error {
|
| 202 | 202 |
p.manager.Destroy() |
| 203 | 203 |
} |
| 204 | 204 |
}() |
| 205 |
+ if p.config.Config.Hooks != nil {
|
|
| 206 |
+ s := configs.HookState{
|
|
| 207 |
+ ID: p.container.id, |
|
| 208 |
+ Pid: p.pid(), |
|
| 209 |
+ Root: p.config.Config.Rootfs, |
|
| 210 |
+ } |
|
| 211 |
+ for _, hook := range p.config.Config.Hooks.Prestart {
|
|
| 212 |
+ if err := hook.Run(s); err != nil {
|
|
| 213 |
+ return newSystemError(err) |
|
| 214 |
+ } |
|
| 215 |
+ } |
|
| 216 |
+ } |
|
| 205 | 217 |
if err := p.createNetworkInterfaces(); err != nil {
|
| 206 | 218 |
return newSystemError(err) |
| 207 | 219 |
} |
| ... | ... |
@@ -286,9 +297,7 @@ func (p *initProcess) setExternalDescriptors(newFds []string) {
|
| 286 | 286 |
} |
| 287 | 287 |
|
| 288 | 288 |
func getPipeFds(pid int) ([]string, error) {
|
| 289 |
- var fds []string |
|
| 290 |
- |
|
| 291 |
- fds = make([]string, 3) |
|
| 289 |
+ fds := make([]string, 3) |
|
| 292 | 290 |
|
| 293 | 291 |
dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
|
| 294 | 292 |
for i := 0; i < 3; i++ {
|
| ... | ... |
@@ -27,6 +27,8 @@ func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
|
| 27 | 27 |
if err := prepareRoot(config); err != nil {
|
| 28 | 28 |
return newSystemError(err) |
| 29 | 29 |
} |
| 30 |
+ |
|
| 31 |
+ setupDev := len(config.Devices) == 0 |
|
| 30 | 32 |
for _, m := range config.Mounts {
|
| 31 | 33 |
for _, precmd := range m.PremountCmds {
|
| 32 | 34 |
if err := mountCmd(precmd); err != nil {
|
| ... | ... |
@@ -43,14 +45,16 @@ func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
|
| 43 | 43 |
} |
| 44 | 44 |
} |
| 45 | 45 |
} |
| 46 |
- if err := createDevices(config); err != nil {
|
|
| 47 |
- return newSystemError(err) |
|
| 48 |
- } |
|
| 49 |
- if err := setupPtmx(config, console); err != nil {
|
|
| 50 |
- return newSystemError(err) |
|
| 51 |
- } |
|
| 52 |
- if err := setupDevSymlinks(config.Rootfs); err != nil {
|
|
| 53 |
- return newSystemError(err) |
|
| 46 |
+ if !setupDev {
|
|
| 47 |
+ if err := createDevices(config); err != nil {
|
|
| 48 |
+ return newSystemError(err) |
|
| 49 |
+ } |
|
| 50 |
+ if err := setupPtmx(config, console); err != nil {
|
|
| 51 |
+ return newSystemError(err) |
|
| 52 |
+ } |
|
| 53 |
+ if err := setupDevSymlinks(config.Rootfs); err != nil {
|
|
| 54 |
+ return newSystemError(err) |
|
| 55 |
+ } |
|
| 54 | 56 |
} |
| 55 | 57 |
if err := syscall.Chdir(config.Rootfs); err != nil {
|
| 56 | 58 |
return newSystemError(err) |
| ... | ... |
@@ -63,8 +67,10 @@ func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
|
| 63 | 63 |
if err != nil {
|
| 64 | 64 |
return newSystemError(err) |
| 65 | 65 |
} |
| 66 |
- if err := reOpenDevNull(config.Rootfs); err != nil {
|
|
| 67 |
- return newSystemError(err) |
|
| 66 |
+ if !setupDev {
|
|
| 67 |
+ if err := reOpenDevNull(config.Rootfs); err != nil {
|
|
| 68 |
+ return newSystemError(err) |
|
| 69 |
+ } |
|
| 68 | 70 |
} |
| 69 | 71 |
if config.Readonlyfs {
|
| 70 | 72 |
if err := setReadonly(); err != nil {
|
| ... | ... |
@@ -131,6 +137,11 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
|
| 131 | 131 |
return err |
| 132 | 132 |
} |
| 133 | 133 |
return syscall.Mount(m.Source, dest, m.Device, uintptr(m.Flags), data) |
| 134 |
+ case "securityfs": |
|
| 135 |
+ if err := os.MkdirAll(dest, 0755); err != nil {
|
|
| 136 |
+ return err |
|
| 137 |
+ } |
|
| 138 |
+ return syscall.Mount(m.Source, dest, m.Device, uintptr(m.Flags), data) |
|
| 134 | 139 |
case "bind": |
| 135 | 140 |
stat, err := os.Stat(m.Source) |
| 136 | 141 |
if err != nil {
|
| ... | ... |
@@ -160,7 +171,11 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
|
| 160 | 160 |
} |
| 161 | 161 |
} |
| 162 | 162 |
if m.Relabel != "" {
|
| 163 |
- if err := label.Relabel(m.Source, mountLabel, m.Relabel); err != nil {
|
|
| 163 |
+ if err := label.Validate(m.Relabel); err != nil {
|
|
| 164 |
+ return err |
|
| 165 |
+ } |
|
| 166 |
+ shared := label.IsShared(m.Relabel) |
|
| 167 |
+ if err := label.Relabel(m.Source, mountLabel, shared); err != nil {
|
|
| 164 | 168 |
return err |
| 165 | 169 |
} |
| 166 | 170 |
} |
| 167 | 171 |
deleted file mode 100644 |
| ... | ... |
@@ -1,34 +0,0 @@ |
| 1 |
-// +build linux |
|
| 2 |
- |
|
| 3 |
-package seccomp |
|
| 4 |
- |
|
| 5 |
-import "strings" |
|
| 6 |
- |
|
| 7 |
-type bpfLabel struct {
|
|
| 8 |
- label string |
|
| 9 |
- location uint32 |
|
| 10 |
-} |
|
| 11 |
- |
|
| 12 |
-type bpfLabels []bpfLabel |
|
| 13 |
- |
|
| 14 |
-// labelIndex returns the index for the label if it exists in the slice. |
|
| 15 |
-// if it does not exist in the slice it appends the label lb to the end |
|
| 16 |
-// of the slice and returns the index. |
|
| 17 |
-func labelIndex(labels *bpfLabels, lb string) uint32 {
|
|
| 18 |
- var id uint32 |
|
| 19 |
- for id = 0; id < uint32(len(*labels)); id++ {
|
|
| 20 |
- if strings.EqualFold(lb, (*labels)[id].label) {
|
|
| 21 |
- return id |
|
| 22 |
- } |
|
| 23 |
- } |
|
| 24 |
- *labels = append(*labels, bpfLabel{lb, 0xffffffff})
|
|
| 25 |
- return id |
|
| 26 |
-} |
|
| 27 |
- |
|
| 28 |
-func scmpBpfStmt(code uint16, k uint32) sockFilter {
|
|
| 29 |
- return sockFilter{code, 0, 0, k}
|
|
| 30 |
-} |
|
| 31 |
- |
|
| 32 |
-func scmpBpfJump(code uint16, k uint32, jt, jf uint8) sockFilter {
|
|
| 33 |
- return sockFilter{code, jt, jf, k}
|
|
| 34 |
-} |
| 35 | 1 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,53 @@ |
| 0 |
+package seccomp |
|
| 1 |
+ |
|
| 2 |
+import ( |
|
| 3 |
+ "fmt" |
|
| 4 |
+ |
|
| 5 |
+ "github.com/opencontainers/runc/libcontainer/configs" |
|
| 6 |
+) |
|
| 7 |
+ |
|
| 8 |
+// ConvertStringToOperator converts a string into a Seccomp comparison operator. |
|
| 9 |
+// Comparison operators use the names they are assigned by Libseccomp's header. |
|
| 10 |
+// Attempting to convert a string that is not a valid operator results in an |
|
| 11 |
+// error. |
|
| 12 |
+func ConvertStringToOperator(in string) (configs.Operator, error) {
|
|
| 13 |
+ switch in {
|
|
| 14 |
+ case "SCMP_CMP_NE": |
|
| 15 |
+ return configs.NotEqualTo, nil |
|
| 16 |
+ case "SCMP_CMP_LT": |
|
| 17 |
+ return configs.LessThan, nil |
|
| 18 |
+ case "SCMP_CMP_LE": |
|
| 19 |
+ return configs.LessThanOrEqualTo, nil |
|
| 20 |
+ case "SCMP_CMP_EQ": |
|
| 21 |
+ return configs.EqualTo, nil |
|
| 22 |
+ case "SCMP_CMP_GE": |
|
| 23 |
+ return configs.GreaterThan, nil |
|
| 24 |
+ case "SCMP_CMP_GT": |
|
| 25 |
+ return configs.GreaterThanOrEqualTo, nil |
|
| 26 |
+ case "SCMP_CMP_MASKED_EQ": |
|
| 27 |
+ return configs.MaskEqualTo, nil |
|
| 28 |
+ default: |
|
| 29 |
+ return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in)
|
|
| 30 |
+ } |
|
| 31 |
+} |
|
| 32 |
+ |
|
| 33 |
+// ConvertStringToAction converts a string into a Seccomp rule match action. |
|
| 34 |
+// Actions use the names they are assigned in Libseccomp's header, though some |
|
| 35 |
+// (notably, SCMP_ACT_TRACE) are not available in this implementation and will |
|
| 36 |
+// return errors. |
|
| 37 |
+// Attempting to convert a string that is not a valid action results in an |
|
| 38 |
+// error. |
|
| 39 |
+func ConvertStringToAction(in string) (configs.Action, error) {
|
|
| 40 |
+ switch in {
|
|
| 41 |
+ case "SCMP_ACT_KILL": |
|
| 42 |
+ return configs.Kill, nil |
|
| 43 |
+ case "SCMP_ACT_ERRNO": |
|
| 44 |
+ return configs.Errno, nil |
|
| 45 |
+ case "SCMP_ACT_TRAP": |
|
| 46 |
+ return configs.Trap, nil |
|
| 47 |
+ case "SCMP_ACT_ALLOW": |
|
| 48 |
+ return configs.Allow, nil |
|
| 49 |
+ default: |
|
| 50 |
+ return 0, fmt.Errorf("string %s is not a valid action for seccomp", in)
|
|
| 51 |
+ } |
|
| 52 |
+} |
| 0 | 53 |
deleted file mode 100644 |
| ... | ... |
@@ -1,146 +0,0 @@ |
| 1 |
-// +build linux |
|
| 2 |
- |
|
| 3 |
-package seccomp |
|
| 4 |
- |
|
| 5 |
-import ( |
|
| 6 |
- "errors" |
|
| 7 |
- "syscall" |
|
| 8 |
-) |
|
| 9 |
- |
|
| 10 |
-const labelTemplate = "lb-%d-%d" |
|
| 11 |
- |
|
| 12 |
-// Action is the type of action that will be taken when a |
|
| 13 |
-// syscall is performed. |
|
| 14 |
-type Action int |
|
| 15 |
- |
|
| 16 |
-const ( |
|
| 17 |
- Kill Action = iota - 3 // Kill the calling process of the syscall. |
|
| 18 |
- Trap // Trap and coredump the calling process of the syscall. |
|
| 19 |
- Allow // Allow the syscall to be completed. |
|
| 20 |
-) |
|
| 21 |
- |
|
| 22 |
-// Syscall is the specified syscall, action, and any type of arguments |
|
| 23 |
-// to filter on. |
|
| 24 |
-type Syscall struct {
|
|
| 25 |
- // Value is the syscall number. |
|
| 26 |
- Value uint32 |
|
| 27 |
- // Action is the action to perform when the specified syscall is made. |
|
| 28 |
- Action Action |
|
| 29 |
- // Args are filters that can be specified on the arguments to the syscall. |
|
| 30 |
- Args Args |
|
| 31 |
-} |
|
| 32 |
- |
|
| 33 |
-func (s *Syscall) scmpAction() uint32 {
|
|
| 34 |
- switch s.Action {
|
|
| 35 |
- case Allow: |
|
| 36 |
- return retAllow |
|
| 37 |
- case Trap: |
|
| 38 |
- return retTrap |
|
| 39 |
- case Kill: |
|
| 40 |
- return retKill |
|
| 41 |
- } |
|
| 42 |
- return actionErrno(uint32(s.Action)) |
|
| 43 |
-} |
|
| 44 |
- |
|
| 45 |
-// Arg represents an argument to the syscall with the argument's index, |
|
| 46 |
-// the operator to apply when matching, and the argument's value at that time. |
|
| 47 |
-type Arg struct {
|
|
| 48 |
- Index uint32 // index of args which start from zero |
|
| 49 |
- Op Operator // operation, such as EQ/NE/GE/LE |
|
| 50 |
- Value uint // the value of arg |
|
| 51 |
-} |
|
| 52 |
- |
|
| 53 |
-type Args [][]Arg |
|
| 54 |
- |
|
| 55 |
-var ( |
|
| 56 |
- ErrUnresolvedLabel = errors.New("seccomp: unresolved label")
|
|
| 57 |
- ErrDuplicateLabel = errors.New("seccomp: duplicate label use")
|
|
| 58 |
- ErrUnsupportedOperation = errors.New("seccomp: unsupported operation for argument")
|
|
| 59 |
-) |
|
| 60 |
- |
|
| 61 |
-// Error returns an Action that will be used to send the calling |
|
| 62 |
-// process the specified errno when the syscall is made. |
|
| 63 |
-func Error(code syscall.Errno) Action {
|
|
| 64 |
- return Action(code) |
|
| 65 |
-} |
|
| 66 |
- |
|
| 67 |
-// New returns a new syscall context for use. |
|
| 68 |
-func New() *Context {
|
|
| 69 |
- return &Context{
|
|
| 70 |
- syscalls: make(map[uint32]*Syscall), |
|
| 71 |
- } |
|
| 72 |
-} |
|
| 73 |
- |
|
| 74 |
-// Context holds syscalls for the current process to limit the type of |
|
| 75 |
-// actions the calling process can make. |
|
| 76 |
-type Context struct {
|
|
| 77 |
- syscalls map[uint32]*Syscall |
|
| 78 |
-} |
|
| 79 |
- |
|
| 80 |
-// Add will add the specified syscall, action, and arguments to the seccomp |
|
| 81 |
-// Context. |
|
| 82 |
-func (c *Context) Add(s *Syscall) {
|
|
| 83 |
- c.syscalls[s.Value] = s |
|
| 84 |
-} |
|
| 85 |
- |
|
| 86 |
-// Remove removes the specified syscall configuration from the Context. |
|
| 87 |
-func (c *Context) Remove(call uint32) {
|
|
| 88 |
- delete(c.syscalls, call) |
|
| 89 |
-} |
|
| 90 |
- |
|
| 91 |
-// Load will apply the Context to the calling process makeing any secccomp process changes |
|
| 92 |
-// apply after the context is loaded. |
|
| 93 |
-func (c *Context) Load() error {
|
|
| 94 |
- filter, err := c.newFilter() |
|
| 95 |
- if err != nil {
|
|
| 96 |
- return err |
|
| 97 |
- } |
|
| 98 |
- if err := prctl(prSetNoNewPrivileges, 1, 0, 0, 0); err != nil {
|
|
| 99 |
- return err |
|
| 100 |
- } |
|
| 101 |
- prog := newSockFprog(filter) |
|
| 102 |
- return prog.set() |
|
| 103 |
-} |
|
| 104 |
- |
|
| 105 |
-func (c *Context) newFilter() ([]sockFilter, error) {
|
|
| 106 |
- var ( |
|
| 107 |
- labels bpfLabels |
|
| 108 |
- f = newFilter() |
|
| 109 |
- ) |
|
| 110 |
- for _, s := range c.syscalls {
|
|
| 111 |
- f.addSyscall(s, &labels) |
|
| 112 |
- } |
|
| 113 |
- f.allow() |
|
| 114 |
- // process args for the syscalls |
|
| 115 |
- for _, s := range c.syscalls {
|
|
| 116 |
- if err := f.addArguments(s, &labels); err != nil {
|
|
| 117 |
- return nil, err |
|
| 118 |
- } |
|
| 119 |
- } |
|
| 120 |
- // apply labels for arguments |
|
| 121 |
- idx := int32(len(*f) - 1) |
|
| 122 |
- for ; idx >= 0; idx-- {
|
|
| 123 |
- lf := &(*f)[idx] |
|
| 124 |
- if lf.code != (syscall.BPF_JMP + syscall.BPF_JA) {
|
|
| 125 |
- continue |
|
| 126 |
- } |
|
| 127 |
- rel := int32(lf.jt)<<8 | int32(lf.jf) |
|
| 128 |
- if ((jumpJT << 8) | jumpJF) == rel {
|
|
| 129 |
- if labels[lf.k].location == 0xffffffff {
|
|
| 130 |
- return nil, ErrUnresolvedLabel |
|
| 131 |
- } |
|
| 132 |
- lf.k = labels[lf.k].location - uint32(idx+1) |
|
| 133 |
- lf.jt = 0 |
|
| 134 |
- lf.jf = 0 |
|
| 135 |
- } else if ((labelJT << 8) | labelJF) == rel {
|
|
| 136 |
- if labels[lf.k].location != 0xffffffff {
|
|
| 137 |
- return nil, ErrDuplicateLabel |
|
| 138 |
- } |
|
| 139 |
- labels[lf.k].location = uint32(idx) |
|
| 140 |
- lf.k = 0 |
|
| 141 |
- lf.jt = 0 |
|
| 142 |
- lf.jf = 0 |
|
| 143 |
- } |
|
| 144 |
- } |
|
| 145 |
- return *f, nil |
|
| 146 |
-} |
| 147 | 1 |
deleted file mode 100644 |
| ... | ... |
@@ -1,118 +0,0 @@ |
| 1 |
-// +build linux |
|
| 2 |
- |
|
| 3 |
-package seccomp |
|
| 4 |
- |
|
| 5 |
-import ( |
|
| 6 |
- "fmt" |
|
| 7 |
- "syscall" |
|
| 8 |
- "unsafe" |
|
| 9 |
-) |
|
| 10 |
- |
|
| 11 |
-type sockFilter struct {
|
|
| 12 |
- code uint16 |
|
| 13 |
- jt uint8 |
|
| 14 |
- jf uint8 |
|
| 15 |
- k uint32 |
|
| 16 |
-} |
|
| 17 |
- |
|
| 18 |
-func newFilter() *filter {
|
|
| 19 |
- var f filter |
|
| 20 |
- f = append(f, sockFilter{
|
|
| 21 |
- pfLD + syscall.BPF_W + syscall.BPF_ABS, |
|
| 22 |
- 0, |
|
| 23 |
- 0, |
|
| 24 |
- uint32(unsafe.Offsetof(secData.nr)), |
|
| 25 |
- }) |
|
| 26 |
- return &f |
|
| 27 |
-} |
|
| 28 |
- |
|
| 29 |
-type filter []sockFilter |
|
| 30 |
- |
|
| 31 |
-func (f *filter) addSyscall(s *Syscall, labels *bpfLabels) {
|
|
| 32 |
- if len(s.Args) == 0 {
|
|
| 33 |
- f.call(s.Value, scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, s.scmpAction())) |
|
| 34 |
- } else {
|
|
| 35 |
- if len(s.Args[0]) > 0 {
|
|
| 36 |
- lb := fmt.Sprintf(labelTemplate, s.Value, s.Args[0][0].Index) |
|
| 37 |
- f.call(s.Value, |
|
| 38 |
- scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA, labelIndex(labels, lb), |
|
| 39 |
- jumpJT, jumpJF)) |
|
| 40 |
- } |
|
| 41 |
- } |
|
| 42 |
-} |
|
| 43 |
- |
|
| 44 |
-func (f *filter) addArguments(s *Syscall, labels *bpfLabels) error {
|
|
| 45 |
- for i := 0; len(s.Args) > i; i++ {
|
|
| 46 |
- if len(s.Args[i]) > 0 {
|
|
| 47 |
- lb := fmt.Sprintf(labelTemplate, s.Value, s.Args[i][0].Index) |
|
| 48 |
- f.label(labels, lb) |
|
| 49 |
- f.arg(s.Args[i][0].Index) |
|
| 50 |
- } |
|
| 51 |
- for j := 0; j < len(s.Args[i]); j++ {
|
|
| 52 |
- var jf sockFilter |
|
| 53 |
- if len(s.Args)-1 > i && len(s.Args[i+1]) > 0 {
|
|
| 54 |
- lbj := fmt.Sprintf(labelTemplate, s.Value, s.Args[i+1][0].Index) |
|
| 55 |
- jf = scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA, |
|
| 56 |
- labelIndex(labels, lbj), jumpJT, jumpJF) |
|
| 57 |
- } else {
|
|
| 58 |
- jf = scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, s.scmpAction()) |
|
| 59 |
- } |
|
| 60 |
- if err := f.op(s.Args[i][j].Op, s.Args[i][j].Value, jf); err != nil {
|
|
| 61 |
- return err |
|
| 62 |
- } |
|
| 63 |
- } |
|
| 64 |
- f.allow() |
|
| 65 |
- } |
|
| 66 |
- return nil |
|
| 67 |
-} |
|
| 68 |
- |
|
| 69 |
-func (f *filter) label(labels *bpfLabels, lb string) {
|
|
| 70 |
- *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA, labelIndex(labels, lb), labelJT, labelJF)) |
|
| 71 |
-} |
|
| 72 |
- |
|
| 73 |
-func (f *filter) call(nr uint32, jt sockFilter) {
|
|
| 74 |
- *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, nr, 0, 1)) |
|
| 75 |
- *f = append(*f, jt) |
|
| 76 |
-} |
|
| 77 |
- |
|
| 78 |
-func (f *filter) allow() {
|
|
| 79 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, retAllow)) |
|
| 80 |
-} |
|
| 81 |
- |
|
| 82 |
-func (f *filter) deny() {
|
|
| 83 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, retTrap)) |
|
| 84 |
-} |
|
| 85 |
- |
|
| 86 |
-func (f *filter) arg(index uint32) {
|
|
| 87 |
- arg(f, index) |
|
| 88 |
-} |
|
| 89 |
- |
|
| 90 |
-func (f *filter) op(operation Operator, v uint, jf sockFilter) error {
|
|
| 91 |
- switch operation {
|
|
| 92 |
- case EqualTo: |
|
| 93 |
- jumpEqualTo(f, v, jf) |
|
| 94 |
- case NotEqualTo: |
|
| 95 |
- jumpNotEqualTo(f, v, jf) |
|
| 96 |
- case GreatherThan: |
|
| 97 |
- jumpGreaterThan(f, v, jf) |
|
| 98 |
- case LessThan: |
|
| 99 |
- jumpLessThan(f, v, jf) |
|
| 100 |
- case MaskEqualTo: |
|
| 101 |
- jumpMaskEqualTo(f, v, jf) |
|
| 102 |
- default: |
|
| 103 |
- return ErrUnsupportedOperation |
|
| 104 |
- } |
|
| 105 |
- return nil |
|
| 106 |
-} |
|
| 107 |
- |
|
| 108 |
-func arg(f *filter, idx uint32) {
|
|
| 109 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_W+syscall.BPF_ABS, endian.low(idx))) |
|
| 110 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_ST, 0)) |
|
| 111 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_W+syscall.BPF_ABS, endian.hi(idx))) |
|
| 112 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_ST, 1)) |
|
| 113 |
-} |
|
| 114 |
- |
|
| 115 |
-func jump(f *filter, labels *bpfLabels, lb string) {
|
|
| 116 |
- *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA, labelIndex(labels, lb), |
|
| 117 |
- jumpJT, jumpJF)) |
|
| 118 |
-} |
| 119 | 1 |
deleted file mode 100644 |
| ... | ... |
@@ -1,68 +0,0 @@ |
| 1 |
-// +build linux,amd64 |
|
| 2 |
- |
|
| 3 |
-package seccomp |
|
| 4 |
- |
|
| 5 |
-// Using BPF filters |
|
| 6 |
-// |
|
| 7 |
-// ref: http://www.gsp.com/cgi-bin/man.cgi?topic=bpf |
|
| 8 |
-import "syscall" |
|
| 9 |
- |
|
| 10 |
-func jumpGreaterThan(f *filter, v uint, jt sockFilter) {
|
|
| 11 |
- lo := uint32(uint64(v) % 0x100000000) |
|
| 12 |
- hi := uint32(uint64(v) / 0x100000000) |
|
| 13 |
- *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGT+syscall.BPF_K, (hi), 4, 0)) |
|
| 14 |
- *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (hi), 0, 5)) |
|
| 15 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0)) |
|
| 16 |
- *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGE+syscall.BPF_K, (lo), 0, 2)) |
|
| 17 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) |
|
| 18 |
- *f = append(*f, jt) |
|
| 19 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) |
|
| 20 |
-} |
|
| 21 |
- |
|
| 22 |
-func jumpEqualTo(f *filter, v uint, jt sockFilter) {
|
|
| 23 |
- lo := uint32(uint64(v) % 0x100000000) |
|
| 24 |
- hi := uint32(uint64(v) / 0x100000000) |
|
| 25 |
- *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (hi), 0, 5)) |
|
| 26 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0)) |
|
| 27 |
- *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (lo), 0, 2)) |
|
| 28 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) |
|
| 29 |
- *f = append(*f, jt) |
|
| 30 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) |
|
| 31 |
-} |
|
| 32 |
- |
|
| 33 |
-func jumpLessThan(f *filter, v uint, jt sockFilter) {
|
|
| 34 |
- lo := uint32(uint64(v) % 0x100000000) |
|
| 35 |
- hi := uint32(uint64(v) / 0x100000000) |
|
| 36 |
- *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGT+syscall.BPF_K, (hi), 6, 0)) |
|
| 37 |
- *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (hi), 0, 3)) |
|
| 38 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0)) |
|
| 39 |
- *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGT+syscall.BPF_K, (lo), 2, 0)) |
|
| 40 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) |
|
| 41 |
- *f = append(*f, jt) |
|
| 42 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) |
|
| 43 |
-} |
|
| 44 |
- |
|
| 45 |
-func jumpNotEqualTo(f *filter, v uint, jt sockFilter) {
|
|
| 46 |
- lo := uint32(uint64(v) % 0x100000000) |
|
| 47 |
- hi := uint32(uint64(v) / 0x100000000) |
|
| 48 |
- *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, hi, 5, 0)) |
|
| 49 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0)) |
|
| 50 |
- *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, lo, 2, 0)) |
|
| 51 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) |
|
| 52 |
- *f = append(*f, jt) |
|
| 53 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) |
|
| 54 |
-} |
|
| 55 |
- |
|
| 56 |
-// this checks for a value inside a mask. The evalusation is equal to doing |
|
| 57 |
-// CLONE_NEWUSER & syscallMask == CLONE_NEWUSER |
|
| 58 |
-func jumpMaskEqualTo(f *filter, v uint, jt sockFilter) {
|
|
| 59 |
- lo := uint32(uint64(v) % 0x100000000) |
|
| 60 |
- hi := uint32(uint64(v) / 0x100000000) |
|
| 61 |
- *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, hi, 0, 6)) |
|
| 62 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0)) |
|
| 63 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_ALU+syscall.BPF_AND, uint32(v))) |
|
| 64 |
- *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, lo, 0, 2)) |
|
| 65 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) |
|
| 66 |
- *f = append(*f, jt) |
|
| 67 |
- *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) |
|
| 68 |
-} |
| 69 | 1 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,165 @@ |
| 0 |
+// +build linux,cgo,seccomp |
|
| 1 |
+ |
|
| 2 |
+package seccomp |
|
| 3 |
+ |
|
| 4 |
+import ( |
|
| 5 |
+ "fmt" |
|
| 6 |
+ "log" |
|
| 7 |
+ "syscall" |
|
| 8 |
+ |
|
| 9 |
+ "github.com/opencontainers/runc/libcontainer/configs" |
|
| 10 |
+ libseccomp "github.com/seccomp/libseccomp-golang" |
|
| 11 |
+) |
|
| 12 |
+ |
|
| 13 |
+var ( |
|
| 14 |
+ actAllow = libseccomp.ActAllow |
|
| 15 |
+ actTrap = libseccomp.ActTrap |
|
| 16 |
+ actKill = libseccomp.ActKill |
|
| 17 |
+ actErrno = libseccomp.ActErrno.SetReturnCode(int16(syscall.EPERM)) |
|
| 18 |
+) |
|
| 19 |
+ |
|
| 20 |
+// InitSeccomp filters the given syscalls in a container, preventing them from being used. |
|
| 21 |
+// The filter is started in the container init process and carried over to all child processes. |
|
| 22 |
+// Setns calls, however, require a separate invocation, as they are not children |
|
| 23 |
+// of the init until they join the namespace |
|
| 24 |
+func InitSeccomp(config *configs.Seccomp) error {
|
|
| 25 |
+ if config == nil {
|
|
| 26 |
+ return fmt.Errorf("cannot initialize Seccomp - nil config passed")
|
|
| 27 |
+ } |
|
| 28 |
+ |
|
| 29 |
+ defaultAction, err := getAction(config.DefaultAction) |
|
| 30 |
+ if err != nil {
|
|
| 31 |
+ return fmt.Errorf("error initializing seccomp - invalid default action")
|
|
| 32 |
+ } |
|
| 33 |
+ |
|
| 34 |
+ filter, err := libseccomp.NewFilter(defaultAction) |
|
| 35 |
+ if err != nil {
|
|
| 36 |
+ return fmt.Errorf("error creating filter: %s", err)
|
|
| 37 |
+ } |
|
| 38 |
+ |
|
| 39 |
+ // Unset no new privs bit |
|
| 40 |
+ if err := filter.SetNoNewPrivsBit(false); err != nil {
|
|
| 41 |
+ return fmt.Errorf("error setting no new privileges: %s", err)
|
|
| 42 |
+ } |
|
| 43 |
+ |
|
| 44 |
+ // Add a rule for each syscall |
|
| 45 |
+ for _, call := range config.Syscalls {
|
|
| 46 |
+ if call == nil {
|
|
| 47 |
+ return fmt.Errorf("encountered nil syscall while initializing Seccomp")
|
|
| 48 |
+ } |
|
| 49 |
+ |
|
| 50 |
+ if err = matchCall(filter, call); err != nil {
|
|
| 51 |
+ return err |
|
| 52 |
+ } |
|
| 53 |
+ } |
|
| 54 |
+ |
|
| 55 |
+ if err = filter.Load(); err != nil {
|
|
| 56 |
+ return fmt.Errorf("error loading seccomp filter into kernel: %s", err)
|
|
| 57 |
+ } |
|
| 58 |
+ |
|
| 59 |
+ return nil |
|
| 60 |
+} |
|
| 61 |
+ |
|
| 62 |
+// Convert Libcontainer Action to Libseccomp ScmpAction |
|
| 63 |
+func getAction(act configs.Action) (libseccomp.ScmpAction, error) {
|
|
| 64 |
+ switch act {
|
|
| 65 |
+ case configs.Kill: |
|
| 66 |
+ return actKill, nil |
|
| 67 |
+ case configs.Errno: |
|
| 68 |
+ return actErrno, nil |
|
| 69 |
+ case configs.Trap: |
|
| 70 |
+ return actTrap, nil |
|
| 71 |
+ case configs.Allow: |
|
| 72 |
+ return actAllow, nil |
|
| 73 |
+ default: |
|
| 74 |
+ return libseccomp.ActInvalid, fmt.Errorf("invalid action, cannot use in rule")
|
|
| 75 |
+ } |
|
| 76 |
+} |
|
| 77 |
+ |
|
| 78 |
+// Convert Libcontainer Operator to Libseccomp ScmpCompareOp |
|
| 79 |
+func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) {
|
|
| 80 |
+ switch op {
|
|
| 81 |
+ case configs.EqualTo: |
|
| 82 |
+ return libseccomp.CompareEqual, nil |
|
| 83 |
+ case configs.NotEqualTo: |
|
| 84 |
+ return libseccomp.CompareNotEqual, nil |
|
| 85 |
+ case configs.GreaterThan: |
|
| 86 |
+ return libseccomp.CompareGreater, nil |
|
| 87 |
+ case configs.GreaterThanOrEqualTo: |
|
| 88 |
+ return libseccomp.CompareGreaterEqual, nil |
|
| 89 |
+ case configs.LessThan: |
|
| 90 |
+ return libseccomp.CompareLess, nil |
|
| 91 |
+ case configs.LessThanOrEqualTo: |
|
| 92 |
+ return libseccomp.CompareLessOrEqual, nil |
|
| 93 |
+ case configs.MaskEqualTo: |
|
| 94 |
+ return libseccomp.CompareMaskedEqual, nil |
|
| 95 |
+ default: |
|
| 96 |
+ return libseccomp.CompareInvalid, fmt.Errorf("invalid operator, cannot use in rule")
|
|
| 97 |
+ } |
|
| 98 |
+} |
|
| 99 |
+ |
|
| 100 |
+// Convert Libcontainer Arg to Libseccomp ScmpCondition |
|
| 101 |
+func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) {
|
|
| 102 |
+ cond := libseccomp.ScmpCondition{}
|
|
| 103 |
+ |
|
| 104 |
+ if arg == nil {
|
|
| 105 |
+ return cond, fmt.Errorf("cannot convert nil to syscall condition")
|
|
| 106 |
+ } |
|
| 107 |
+ |
|
| 108 |
+ op, err := getOperator(arg.Op) |
|
| 109 |
+ if err != nil {
|
|
| 110 |
+ return cond, err |
|
| 111 |
+ } |
|
| 112 |
+ |
|
| 113 |
+ return libseccomp.MakeCondition(arg.Index, op, arg.Value, arg.ValueTwo) |
|
| 114 |
+} |
|
| 115 |
+ |
|
| 116 |
+// Add a rule to match a single syscall |
|
| 117 |
+func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
|
|
| 118 |
+ if call == nil || filter == nil {
|
|
| 119 |
+ return fmt.Errorf("cannot use nil as syscall to block")
|
|
| 120 |
+ } |
|
| 121 |
+ |
|
| 122 |
+ if len(call.Name) == 0 {
|
|
| 123 |
+ return fmt.Errorf("empty string is not a valid syscall")
|
|
| 124 |
+ } |
|
| 125 |
+ |
|
| 126 |
+ // If we can't resolve the syscall, assume it's not supported on this kernel |
|
| 127 |
+ // Ignore it, don't error out |
|
| 128 |
+ callNum, err := libseccomp.GetSyscallFromName(call.Name) |
|
| 129 |
+ if err != nil {
|
|
| 130 |
+ log.Printf("Error resolving syscall name %s: %s - ignoring syscall.", call.Name, err)
|
|
| 131 |
+ return nil |
|
| 132 |
+ } |
|
| 133 |
+ |
|
| 134 |
+ // Convert the call's action to the libseccomp equivalent |
|
| 135 |
+ callAct, err := getAction(call.Action) |
|
| 136 |
+ if err != nil {
|
|
| 137 |
+ return err |
|
| 138 |
+ } |
|
| 139 |
+ |
|
| 140 |
+ // Unconditional match - just add the rule |
|
| 141 |
+ if len(call.Args) == 0 {
|
|
| 142 |
+ if err = filter.AddRule(callNum, callAct); err != nil {
|
|
| 143 |
+ return err |
|
| 144 |
+ } |
|
| 145 |
+ } else {
|
|
| 146 |
+ // Conditional match - convert the per-arg rules into library format |
|
| 147 |
+ conditions := []libseccomp.ScmpCondition{}
|
|
| 148 |
+ |
|
| 149 |
+ for _, cond := range call.Args {
|
|
| 150 |
+ newCond, err := getCondition(cond) |
|
| 151 |
+ if err != nil {
|
|
| 152 |
+ return err |
|
| 153 |
+ } |
|
| 154 |
+ |
|
| 155 |
+ conditions = append(conditions, newCond) |
|
| 156 |
+ } |
|
| 157 |
+ |
|
| 158 |
+ if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil {
|
|
| 159 |
+ return err |
|
| 160 |
+ } |
|
| 161 |
+ } |
|
| 162 |
+ |
|
| 163 |
+ return nil |
|
| 164 |
+} |
| 0 | 165 |
deleted file mode 100644 |
| ... | ... |
@@ -1,124 +0,0 @@ |
| 1 |
-// +build linux |
|
| 2 |
- |
|
| 3 |
-// Package seccomp provides native seccomp ( https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt ) support for go. |
|
| 4 |
-package seccomp |
|
| 5 |
- |
|
| 6 |
-import ( |
|
| 7 |
- "syscall" |
|
| 8 |
- "unsafe" |
|
| 9 |
-) |
|
| 10 |
- |
|
| 11 |
-// Operator that is used for argument comparison. |
|
| 12 |
-type Operator int |
|
| 13 |
- |
|
| 14 |
-const ( |
|
| 15 |
- EqualTo Operator = iota |
|
| 16 |
- NotEqualTo |
|
| 17 |
- GreatherThan |
|
| 18 |
- LessThan |
|
| 19 |
- MaskEqualTo |
|
| 20 |
-) |
|
| 21 |
- |
|
| 22 |
-const ( |
|
| 23 |
- jumpJT = 0xff |
|
| 24 |
- jumpJF = 0xff |
|
| 25 |
- labelJT = 0xfe |
|
| 26 |
- labelJF = 0xfe |
|
| 27 |
-) |
|
| 28 |
- |
|
| 29 |
-const ( |
|
| 30 |
- pfLD = 0x0 |
|
| 31 |
- retKill = 0x00000000 |
|
| 32 |
- retTrap = 0x00030000 |
|
| 33 |
- retAllow = 0x7fff0000 |
|
| 34 |
- modeFilter = 0x2 |
|
| 35 |
- prSetNoNewPrivileges = 0x26 |
|
| 36 |
-) |
|
| 37 |
- |
|
| 38 |
-func actionErrno(errno uint32) uint32 {
|
|
| 39 |
- return 0x00050000 | (errno & 0x0000ffff) |
|
| 40 |
-} |
|
| 41 |
- |
|
| 42 |
-var ( |
|
| 43 |
- secData = struct {
|
|
| 44 |
- nr int32 |
|
| 45 |
- arch uint32 |
|
| 46 |
- insPointer uint64 |
|
| 47 |
- args [6]uint64 |
|
| 48 |
- }{0, 0, 0, [6]uint64{0, 0, 0, 0, 0, 0}}
|
|
| 49 |
-) |
|
| 50 |
- |
|
| 51 |
-var isLittle = func() bool {
|
|
| 52 |
- var ( |
|
| 53 |
- x = 0x1234 |
|
| 54 |
- p = unsafe.Pointer(&x) |
|
| 55 |
- p2 = (*[unsafe.Sizeof(0)]byte)(p) |
|
| 56 |
- ) |
|
| 57 |
- if p2[0] == 0 {
|
|
| 58 |
- return false |
|
| 59 |
- } |
|
| 60 |
- return true |
|
| 61 |
-}() |
|
| 62 |
- |
|
| 63 |
-var endian endianSupport |
|
| 64 |
- |
|
| 65 |
-type endianSupport struct {
|
|
| 66 |
-} |
|
| 67 |
- |
|
| 68 |
-func (e endianSupport) hi(i uint32) uint32 {
|
|
| 69 |
- if isLittle {
|
|
| 70 |
- return e.little(i) |
|
| 71 |
- } |
|
| 72 |
- return e.big(i) |
|
| 73 |
-} |
|
| 74 |
- |
|
| 75 |
-func (e endianSupport) low(i uint32) uint32 {
|
|
| 76 |
- if isLittle {
|
|
| 77 |
- return e.big(i) |
|
| 78 |
- } |
|
| 79 |
- return e.little(i) |
|
| 80 |
-} |
|
| 81 |
- |
|
| 82 |
-func (endianSupport) big(idx uint32) uint32 {
|
|
| 83 |
- if idx >= 6 {
|
|
| 84 |
- return 0 |
|
| 85 |
- } |
|
| 86 |
- return uint32(unsafe.Offsetof(secData.args)) + 8*idx |
|
| 87 |
-} |
|
| 88 |
- |
|
| 89 |
-func (endianSupport) little(idx uint32) uint32 {
|
|
| 90 |
- if idx < 0 || idx >= 6 {
|
|
| 91 |
- return 0 |
|
| 92 |
- } |
|
| 93 |
- return uint32(unsafe.Offsetof(secData.args)) + |
|
| 94 |
- uint32(unsafe.Alignof(secData.args[0]))*idx + uint32(unsafe.Sizeof(secData.arch)) |
|
| 95 |
-} |
|
| 96 |
- |
|
| 97 |
-func prctl(option int, arg2, arg3, arg4, arg5 uintptr) error {
|
|
| 98 |
- _, _, err := syscall.Syscall6(syscall.SYS_PRCTL, uintptr(option), arg2, arg3, arg4, arg5, 0) |
|
| 99 |
- if err != 0 {
|
|
| 100 |
- return err |
|
| 101 |
- } |
|
| 102 |
- return nil |
|
| 103 |
-} |
|
| 104 |
- |
|
| 105 |
-func newSockFprog(filter []sockFilter) *sockFprog {
|
|
| 106 |
- return &sockFprog{
|
|
| 107 |
- len: uint16(len(filter)), |
|
| 108 |
- filt: filter, |
|
| 109 |
- } |
|
| 110 |
-} |
|
| 111 |
- |
|
| 112 |
-type sockFprog struct {
|
|
| 113 |
- len uint16 |
|
| 114 |
- filt []sockFilter |
|
| 115 |
-} |
|
| 116 |
- |
|
| 117 |
-func (s *sockFprog) set() error {
|
|
| 118 |
- _, _, err := syscall.Syscall(syscall.SYS_PRCTL, uintptr(syscall.PR_SET_SECCOMP), |
|
| 119 |
- uintptr(modeFilter), uintptr(unsafe.Pointer(s))) |
|
| 120 |
- if err != 0 {
|
|
| 121 |
- return err |
|
| 122 |
- } |
|
| 123 |
- return nil |
|
| 124 |
-} |
| ... | ... |
@@ -1,3 +1,19 @@ |
| 1 |
-// +build !linux |
|
| 1 |
+// +build !linux !cgo !seccomp |
|
| 2 | 2 |
|
| 3 | 3 |
package seccomp |
| 4 |
+ |
|
| 5 |
+import ( |
|
| 6 |
+ "errors" |
|
| 7 |
+ |
|
| 8 |
+ "github.com/opencontainers/runc/libcontainer/configs" |
|
| 9 |
+) |
|
| 10 |
+ |
|
| 11 |
+var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
|
|
| 12 |
+ |
|
| 13 |
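+// InitSeccomp is the stub for builds without seccomp support: it returns ErrSeccompNotEnabled if a config is provided and does nothing otherwise. |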
|
| 14 |
+func InitSeccomp(config *configs.Seccomp) error {
|
|
| 15 |
+ if config != nil {
|
|
| 16 |
+ return ErrSeccompNotEnabled |
|
| 17 |
+ } |
|
| 18 |
+ return nil |
|
| 19 |
+} |
| ... | ... |
@@ -7,6 +7,7 @@ import ( |
| 7 | 7 |
|
| 8 | 8 |
"github.com/opencontainers/runc/libcontainer/apparmor" |
| 9 | 9 |
"github.com/opencontainers/runc/libcontainer/label" |
| 10 |
+ "github.com/opencontainers/runc/libcontainer/seccomp" |
|
| 10 | 11 |
"github.com/opencontainers/runc/libcontainer/system" |
| 11 | 12 |
) |
| 12 | 13 |
|
| ... | ... |
@@ -20,6 +21,14 @@ func (l *linuxSetnsInit) Init() error {
|
| 20 | 20 |
if err := setupRlimits(l.config.Config); err != nil {
|
| 21 | 21 |
return err |
| 22 | 22 |
} |
| 23 |
+ if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
|
|
| 24 |
+ return err |
|
| 25 |
+ } |
|
| 26 |
+ if l.config.Config.Seccomp != nil {
|
|
| 27 |
+ if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
|
|
| 28 |
+ return err |
|
| 29 |
+ } |
|
| 30 |
+ } |
|
| 23 | 31 |
if err := finalizeNamespace(l.config); err != nil {
|
| 24 | 32 |
return err |
| 25 | 33 |
} |
| ... | ... |
@@ -9,6 +9,7 @@ import ( |
| 9 | 9 |
"github.com/opencontainers/runc/libcontainer/apparmor" |
| 10 | 10 |
"github.com/opencontainers/runc/libcontainer/configs" |
| 11 | 11 |
"github.com/opencontainers/runc/libcontainer/label" |
| 12 |
+ "github.com/opencontainers/runc/libcontainer/seccomp" |
|
| 12 | 13 |
"github.com/opencontainers/runc/libcontainer/system" |
| 13 | 14 |
) |
| 14 | 15 |
|
| ... | ... |
@@ -46,6 +47,10 @@ func (l *linuxStandardInit) Init() error {
|
| 46 | 46 |
if err := setupRlimits(l.config.Config); err != nil {
|
| 47 | 47 |
return err |
| 48 | 48 |
} |
| 49 |
+ if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
|
|
| 50 |
+ return err |
|
| 51 |
+ } |
|
| 52 |
+ |
|
| 49 | 53 |
label.Init() |
| 50 | 54 |
// InitializeMountNamespace() can be executed only for a new mount namespace |
| 51 | 55 |
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
|
| ... | ... |
@@ -85,6 +90,11 @@ func (l *linuxStandardInit) Init() error {
|
| 85 | 85 |
if err != nil {
|
| 86 | 86 |
return err |
| 87 | 87 |
} |
| 88 |
+ if l.config.Config.Seccomp != nil {
|
|
| 89 |
+ if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
|
|
| 90 |
+ return err |
|
| 91 |
+ } |
|
| 92 |
+ } |
|
| 88 | 93 |
if err := finalizeNamespace(l.config); err != nil {
|
| 89 | 94 |
return err |
| 90 | 95 |
} |
| ... | ... |
@@ -99,8 +109,5 @@ func (l *linuxStandardInit) Init() error {
|
| 99 | 99 |
if syscall.Getppid() != l.parentPid {
|
| 100 | 100 |
return syscall.Kill(syscall.Getpid(), syscall.SIGKILL) |
| 101 | 101 |
} |
| 102 |
- if err := finalizeSeccomp(l.config); err != nil {
|
|
| 103 |
- return err |
|
| 104 |
- } |
|
| 105 | 102 |
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) |
| 106 | 103 |
} |