Browse code

Vendor libcontainer v0.0.4

Noteworthy changes:

- Add Prestart/Poststop hook support
- Fix bug finding cgroup mount directory
- Add OomScoreAdj as a container configuration option
- Ensure the cleanup jobs in the deferrer are executed on error
- Don't make modifications to /dev when it is bind mounted

Other changes in runc:

https://github.com/opencontainers/runc/compare/v0.0.3...v0.0.4

Signed-off-by: David Calavera <david.calavera@gmail.com>

David Calavera authored on 2015/09/12 05:17:59
Showing 28 changed files
... ...
@@ -1112,12 +1112,9 @@ func (container *Container) unmountVolumes(forceSyscall bool) error {
1112 1112
 
1113 1113
 func (container *Container) networkMounts() []execdriver.Mount {
1114 1114
 	var mounts []execdriver.Mount
1115
-	mode := "Z"
1116
-	if container.hostConfig.NetworkMode.IsContainer() {
1117
-		mode = "z"
1118
-	}
1115
+	shared := container.hostConfig.NetworkMode.IsContainer()
1119 1116
 	if container.ResolvConfPath != "" {
1120
-		label.Relabel(container.ResolvConfPath, container.MountLabel, mode)
1117
+		label.Relabel(container.ResolvConfPath, container.MountLabel, shared)
1121 1118
 		writable := !container.hostConfig.ReadonlyRootfs
1122 1119
 		if m, exists := container.MountPoints["/etc/resolv.conf"]; exists {
1123 1120
 			writable = m.RW
... ...
@@ -1130,7 +1127,7 @@ func (container *Container) networkMounts() []execdriver.Mount {
1130 1130
 		})
1131 1131
 	}
1132 1132
 	if container.HostnamePath != "" {
1133
-		label.Relabel(container.HostnamePath, container.MountLabel, mode)
1133
+		label.Relabel(container.HostnamePath, container.MountLabel, shared)
1134 1134
 		writable := !container.hostConfig.ReadonlyRootfs
1135 1135
 		if m, exists := container.MountPoints["/etc/hostname"]; exists {
1136 1136
 			writable = m.RW
... ...
@@ -1143,7 +1140,7 @@ func (container *Container) networkMounts() []execdriver.Mount {
1143 1143
 		})
1144 1144
 	}
1145 1145
 	if container.HostsPath != "" {
1146
-		label.Relabel(container.HostsPath, container.MountLabel, mode)
1146
+		label.Relabel(container.HostsPath, container.MountLabel, shared)
1147 1147
 		writable := !container.hostConfig.ReadonlyRootfs
1148 1148
 		if m, exists := container.MountPoints["/etc/hosts"]; exists {
1149 1149
 			writable = m.RW
... ...
@@ -59,7 +59,7 @@ func createContainerPlatformSpecificSettings(container *Container, config *runco
59 59
 			return err
60 60
 		}
61 61
 
62
-		if err := label.Relabel(v.Path(), container.MountLabel, "z"); err != nil {
62
+		if err := label.Relabel(v.Path(), container.MountLabel, true); err != nil {
63 63
 			return err
64 64
 		}
65 65
 
... ...
@@ -355,7 +355,8 @@ func (daemon *Daemon) registerMountPoints(container *Container, hostConfig *runc
355 355
 			}
356 356
 		}
357 357
 
358
-		if err := label.Relabel(bind.Source, container.MountLabel, bind.Mode); err != nil {
358
+		shared := label.IsShared(bind.Mode)
359
+		if err := label.Relabel(bind.Source, container.MountLabel, shared); err != nil {
359 360
 			return err
360 361
 		}
361 362
 		binds[bind.Destination] = true
... ...
@@ -42,7 +42,7 @@ clone git github.com/endophage/gotuf 9bcdad0308e34a49f38448b8ad436ad8860825ce
42 42
 clone git github.com/jfrazelle/go 6e461eb70cb4187b41a84e9a567d7137bdbe0f16
43 43
 clone git github.com/agl/ed25519 d2b94fd789ea21d12fac1a4443dd3a3f79cda72c
44 44
 
45
-clone git github.com/opencontainers/runc v0.0.3 # libcontainer
45
+clone git github.com/opencontainers/runc v0.0.4 # libcontainer
46 46
 # libcontainer deps (see src/github.com/docker/libcontainer/update-vendor.sh)
47 47
 clone git github.com/coreos/go-systemd v3
48 48
 clone git github.com/godbus/dbus v2
... ...
@@ -83,7 +83,7 @@ type data struct {
83 83
 	pid    int
84 84
 }
85 85
 
86
-func (m *Manager) Apply(pid int) error {
86
+func (m *Manager) Apply(pid int) (err error) {
87 87
 	if m.Cgroups == nil {
88 88
 		return nil
89 89
 	}
... ...
@@ -235,12 +235,12 @@ func getCgroupData(c *configs.Cgroup, pid int) (*data, error) {
235 235
 	}, nil
236 236
 }
237 237
 
238
-func (raw *data) parent(subsystem, mountpoint, src string) (string, error) {
239
-	initPath, err := cgroups.GetInitCgroupDir(subsystem)
238
+func (raw *data) parent(subsystem, mountpoint, root string) (string, error) {
239
+	initPath, err := cgroups.GetThisCgroupDir(subsystem)
240 240
 	if err != nil {
241 241
 		return "", err
242 242
 	}
243
-	relDir, err := filepath.Rel(src, initPath)
243
+	relDir, err := filepath.Rel(root, initPath)
244 244
 	if err != nil {
245 245
 		return "", err
246 246
 	}
... ...
@@ -248,7 +248,7 @@ func (raw *data) parent(subsystem, mountpoint, src string) (string, error) {
248 248
 }
249 249
 
250 250
 func (raw *data) path(subsystem string) (string, error) {
251
-	mnt, src, err := cgroups.FindCgroupMountpointAndSource(subsystem)
251
+	mnt, root, err := cgroups.FindCgroupMountpointAndRoot(subsystem)
252 252
 	// If we didn't mount the subsystem, there is no point we make the path.
253 253
 	if err != nil {
254 254
 		return "", err
... ...
@@ -259,7 +259,7 @@ func (raw *data) path(subsystem string) (string, error) {
259 259
 		return filepath.Join(raw.root, filepath.Base(mnt), raw.cgroup), nil
260 260
 	}
261 261
 
262
-	parent, err := raw.parent(subsystem, mnt, src)
262
+	parent, err := raw.parent(subsystem, mnt, root)
263 263
 	if err != nil {
264 264
 		return "", err
265 265
 	}
... ...
@@ -17,7 +17,7 @@ import (
17 17
 type MemoryGroup struct {
18 18
 }
19 19
 
20
-func (s *MemoryGroup) Apply(d *data) error {
20
+func (s *MemoryGroup) Apply(d *data) (err error) {
21 21
 	path, err := d.path("memory")
22 22
 	if err != nil {
23 23
 		if cgroups.IsNotFound(err) {
... ...
@@ -28,21 +28,22 @@ func (s *MemoryGroup) Apply(d *data) error {
28 28
 	if err := os.MkdirAll(path, 0755); err != nil {
29 29
 		return err
30 30
 	}
31
+
32
+	defer func() {
33
+		if err != nil {
34
+			os.RemoveAll(path)
35
+		}
36
+	}()
37
+
31 38
 	if err := s.Set(path, d.c); err != nil {
32 39
 		return err
33 40
 	}
34 41
 
35 42
 	// We need to join memory cgroup after set memory limits, because
36 43
 	// kmem.limit_in_bytes can only be set when the cgroup is empty.
37
-	_, err = d.join("memory")
38
-	if err != nil {
44
+	if _, err = d.join("memory"); err != nil {
39 45
 		return err
40 46
 	}
41
-	defer func() {
42
-		if err != nil {
43
-			os.RemoveAll(path)
44
-		}
45
-	}()
46 47
 
47 48
 	return nil
48 49
 }
... ...
@@ -21,6 +21,9 @@ const cgroupNamePrefix = "name="
21 21
 
22 22
 // https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt
23 23
 func FindCgroupMountpoint(subsystem string) (string, error) {
24
+	// We are not using mount.GetMounts() because it's super-inefficient,
25
+	// parsing the file directly is about 10x faster because it avoids Sscanf.
26
+	// It was one of two major performance drawbacks in container start.
24 27
 	f, err := os.Open("/proc/self/mountinfo")
25 28
 	if err != nil {
26 29
 		return "", err
... ...
@@ -44,7 +47,7 @@ func FindCgroupMountpoint(subsystem string) (string, error) {
44 44
 	return "", NewNotFoundError(subsystem)
45 45
 }
46 46
 
47
-func FindCgroupMountpointAndSource(subsystem string) (string, string, error) {
47
+func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
48 48
 	f, err := os.Open("/proc/self/mountinfo")
49 49
 	if err != nil {
50 50
 		return "", "", err
... ...
@@ -69,16 +72,29 @@ func FindCgroupMountpointAndSource(subsystem string) (string, string, error) {
69 69
 }
70 70
 
71 71
 func FindCgroupMountpointDir() (string, error) {
72
-	mounts, err := mount.GetMounts()
72
+	f, err := os.Open("/proc/self/mountinfo")
73 73
 	if err != nil {
74 74
 		return "", err
75 75
 	}
76
+	defer f.Close()
76 77
 
77
-	for _, mount := range mounts {
78
-		if mount.Fstype == "cgroup" {
79
-			return filepath.Dir(mount.Mountpoint), nil
78
+	scanner := bufio.NewScanner(f)
79
+	for scanner.Scan() {
80
+		text := scanner.Text()
81
+		fields := strings.Split(text, " ")
82
+		// Safe as mountinfo encodes mountpoints with spaces as \040.
83
+		index := strings.Index(text, " - ")
84
+		postSeparatorFields := strings.Fields(text[index+3:])
85
+		if len(postSeparatorFields) < 3 {
86
+			return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
87
+		}
88
+		if postSeparatorFields[0] == "cgroup" {
89
+			return filepath.Dir(fields[4]), nil
80 90
 		}
81 91
 	}
92
+	if err := scanner.Err(); err != nil {
93
+		return "", err
94
+	}
82 95
 
83 96
 	return "", NewNotFoundError("cgroup")
84 97
 }
... ...
@@ -1,5 +1,11 @@
1 1
 package configs
2 2
 
3
+import (
4
+	"bytes"
5
+	"encoding/json"
6
+	"os/exec"
7
+)
8
+
3 9
 type Rlimit struct {
4 10
 	Type int    `json:"type"`
5 11
 	Hard uint64 `json:"hard"`
... ...
@@ -13,36 +19,46 @@ type IDMap struct {
13 13
 	Size        int `json:"size"`
14 14
 }
15 15
 
16
+// Seccomp represents syscall restrictions
16 17
 type Seccomp struct {
17
-	Syscalls []*Syscall `json:"syscalls"`
18
+	DefaultAction Action     `json:"default_action"`
19
+	Syscalls      []*Syscall `json:"syscalls"`
18 20
 }
19 21
 
22
+// An action to be taken upon rule match in Seccomp
20 23
 type Action int
21 24
 
22 25
 const (
23
-	Kill Action = iota - 3
26
+	Kill Action = iota - 4
27
+	Errno
24 28
 	Trap
25 29
 	Allow
26 30
 )
27 31
 
32
+// A comparison operator to be used when matching syscall arguments in Seccomp
28 33
 type Operator int
29 34
 
30 35
 const (
31 36
 	EqualTo Operator = iota
32 37
 	NotEqualTo
33
-	GreatherThan
38
+	GreaterThan
39
+	GreaterThanOrEqualTo
34 40
 	LessThan
41
+	LessThanOrEqualTo
35 42
 	MaskEqualTo
36 43
 )
37 44
 
45
+// A rule to match a specific syscall argument in Seccomp
38 46
 type Arg struct {
39
-	Index int      `json:"index"`
40
-	Value uint32   `json:"value"`
41
-	Op    Operator `json:"op"`
47
+	Index    uint     `json:"index"`
48
+	Value    uint64   `json:"value"`
49
+	ValueTwo uint64   `json:"value_two"`
50
+	Op       Operator `json:"op"`
42 51
 }
43 52
 
53
+// A rule to match a syscall in Seccomp
44 54
 type Syscall struct {
45
-	Value  int    `json:"value"`
55
+	Name   string `json:"name"`
46 56
 	Action Action `json:"action"`
47 57
 	Args   []*Arg `json:"args"`
48 58
 }
... ...
@@ -117,6 +133,12 @@ type Config struct {
117 117
 	// If Rlimits are not set, the container will inherit rlimits from the parent process
118 118
 	Rlimits []Rlimit `json:"rlimits"`
119 119
 
120
+	// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
121
+	// for a process. Valid values are in the range [-1000, 1000], where processes with
122
+	// higher scores are preferred for being killed.
123
+	// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
124
+	OomScoreAdj int `json:"oom_score_adj"`
125
+
120 126
 	// AdditionalGroups specifies the gids that should be added to supplementary groups
121 127
 	// in addition to those that the user belongs to.
122 128
 	AdditionalGroups []string `json:"additional_groups"`
... ...
@@ -140,7 +162,79 @@ type Config struct {
140 140
 	Sysctl map[string]string `json:"sysctl"`
141 141
 
142 142
 	// Seccomp allows actions to be taken whenever a syscall is made within the container.
143
-	// By default, all syscalls are allowed with actions to allow, trap, kill, or return an errno
144
-	// can be specified on a per syscall basis.
143
+	// A number of rules are given, each having an action to be taken if a syscall matches it.
144
+	// A default action to be taken if no rules match is also given.
145 145
 	Seccomp *Seccomp `json:"seccomp"`
146
+
147
+	// Hooks are a collection of actions to perform at various container lifecycle events.
148
+	// Hooks cannot be marshaled to JSON, but they do not need to be.
149
+	Hooks *Hooks `json:"-"`
150
+}
151
+
152
+type Hooks struct {
153
+	// Prestart commands are executed after the container namespaces are created,
154
+	// but before the user supplied command is executed from init.
155
+	Prestart []Hook
156
+
157
+	// Poststop commands are executed after the container init process exits.
158
+	Poststop []Hook
159
+}
160
+
161
+// HookState is the payload provided to a hook on execution.
162
+type HookState struct {
163
+	ID   string `json:"id"`
164
+	Pid  int    `json:"pid"`
165
+	Root string `json:"root"`
166
+}
167
+
168
+type Hook interface {
169
+	// Run executes the hook with the provided state.
170
+	Run(HookState) error
171
+}
172
+
173
+// NewFunctionHook will call the provided function when the hook is run.
174
+func NewFunctionHook(f func(HookState) error) FuncHook {
175
+	return FuncHook{
176
+		run: f,
177
+	}
178
+}
179
+
180
+type FuncHook struct {
181
+	run func(HookState) error
182
+}
183
+
184
+func (f FuncHook) Run(s HookState) error {
185
+	return f.run(s)
186
+}
187
+
188
+type Command struct {
189
+	Path string   `json:"path"`
190
+	Args []string `json:"args"`
191
+	Env  []string `json:"env"`
192
+	Dir  string   `json:"dir"`
193
+}
194
+
195
+// NewCommandHook will execute the provided command when the hook is run.
196
+func NewCommandHook(cmd Command) CommandHook {
197
+	return CommandHook{
198
+		Command: cmd,
199
+	}
200
+}
201
+
202
+type CommandHook struct {
203
+	Command
204
+}
205
+
206
+func (c Command) Run(s HookState) error {
207
+	b, err := json.Marshal(s)
208
+	if err != nil {
209
+		return err
210
+	}
211
+	cmd := exec.Cmd{
212
+		Path:  c.Path,
213
+		Args:  c.Args,
214
+		Env:   c.Env,
215
+		Stdin: bytes.NewReader(b),
216
+	}
217
+	return cmd.Run()
146 218
 }
... ...
@@ -25,10 +25,3 @@ type Mount struct {
25 25
 	// Optional Command to be run after Source is mounted.
26 26
 	PostmountCmds []Command `json:"postmount_cmds"`
27 27
 }
28
-
29
-type Command struct {
30
-	Path string   `json:"path"`
31
-	Args []string `json:"args"`
32
-	Env  []string `json:"env"`
33
-	Dir  string   `json:"dir"`
34
-}
... ...
@@ -185,6 +185,7 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
185 185
 		parentPipe: parentPipe,
186 186
 		manager:    c.cgroupManager,
187 187
 		config:     c.newInitConfig(p),
188
+		container:  c,
188 189
 	}, nil
189 190
 }
190 191
 
... ...
@@ -247,6 +248,17 @@ func (c *linuxContainer) Destroy() error {
247 247
 		err = rerr
248 248
 	}
249 249
 	c.initProcess = nil
250
+	if c.config.Hooks != nil {
251
+		s := configs.HookState{
252
+			ID:   c.id,
253
+			Root: c.config.Rootfs,
254
+		}
255
+		for _, hook := range c.config.Hooks.Poststop {
256
+			if err := hook.Run(s); err != nil {
257
+				return err
258
+			}
259
+		}
260
+	}
250 261
 	return err
251 262
 }
252 263
 
... ...
@@ -299,7 +311,7 @@ func (c *linuxContainer) checkCriuVersion() error {
299 299
 	return nil
300 300
 }
301 301
 
302
-const descriptors_filename = "descriptors.json"
302
+const descriptorsFilename = "descriptors.json"
303 303
 
304 304
 func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
305 305
 	mountDest := m.Destination
... ...
@@ -406,7 +418,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
406 406
 		return err
407 407
 	}
408 408
 
409
-	err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptors_filename), fdsJSON, 0655)
409
+	err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0655)
410 410
 	if err != nil {
411 411
 		return err
412 412
 	}
... ...
@@ -532,13 +544,19 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
532 532
 			break
533 533
 		}
534 534
 	}
535
+	for _, i := range criuOpts.VethPairs {
536
+		veth := new(criurpc.CriuVethPair)
537
+		veth.IfOut = proto.String(i.HostInterfaceName)
538
+		veth.IfIn = proto.String(i.ContainerInterfaceName)
539
+		req.Opts.Veths = append(req.Opts.Veths, veth)
540
+	}
535 541
 
536 542
 	var (
537 543
 		fds    []string
538 544
 		fdJSON []byte
539 545
 	)
540 546
 
541
-	if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptors_filename)); err != nil {
547
+	if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
542 548
 		return err
543 549
 	}
544 550
 
... ...
@@ -568,6 +586,7 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
568 568
 		return err
569 569
 	}
570 570
 
571
+	logPath := filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile())
571 572
 	criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
572 573
 	criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
573 574
 	defer criuClient.Close()
... ...
@@ -631,7 +650,8 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
631 631
 			return err
632 632
 		}
633 633
 		if !resp.GetSuccess() {
634
-			return fmt.Errorf("criu failed: type %s errno %d", req.GetType().String(), resp.GetCrErrno())
634
+			typeString := req.GetType().String()
635
+			return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
635 636
 		}
636 637
 
637 638
 		t := resp.GetType()
... ...
@@ -671,7 +691,7 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
671 671
 		return err
672 672
 	}
673 673
 	if !st.Success() {
674
-		return fmt.Errorf("criu failed: %s", st.String())
674
+		return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath)
675 675
 	}
676 676
 	return nil
677 677
 }
... ...
@@ -5,6 +5,11 @@ type CriuPageServerInfo struct {
5 5
 	Port    int32  // port number of CRIU page server
6 6
 }
7 7
 
8
+type VethPairName struct {
9
+	ContainerInterfaceName string
10
+	HostInterfaceName      string
11
+}
12
+
8 13
 type CriuOpts struct {
9 14
 	ImagesDirectory         string             // directory for storing image files
10 15
 	WorkDirectory           string             // directory to cd and write logs/pidfiles/stats to
... ...
@@ -14,4 +19,5 @@ type CriuOpts struct {
14 14
 	ShellJob                bool               // allow to dump and restore shell jobs
15 15
 	FileLocks               bool               // handle file locks, for safety
16 16
 	PageServer              CriuPageServerInfo // allow to dump to criu page server
17
+	VethPairs               []VethPairName     // pass the veth to criu when restore
17 18
 }
... ...
@@ -5,7 +5,9 @@ package libcontainer
5 5
 import (
6 6
 	"encoding/json"
7 7
 	"fmt"
8
+	"io/ioutil"
8 9
 	"os"
10
+	"strconv"
9 11
 	"strings"
10 12
 	"syscall"
11 13
 
... ...
@@ -13,7 +15,6 @@ import (
13 13
 	"github.com/opencontainers/runc/libcontainer/cgroups"
14 14
 	"github.com/opencontainers/runc/libcontainer/configs"
15 15
 	"github.com/opencontainers/runc/libcontainer/netlink"
16
-	"github.com/opencontainers/runc/libcontainer/seccomp"
17 16
 	"github.com/opencontainers/runc/libcontainer/system"
18 17
 	"github.com/opencontainers/runc/libcontainer/user"
19 18
 	"github.com/opencontainers/runc/libcontainer/utils"
... ...
@@ -239,6 +240,11 @@ func setupRlimits(config *configs.Config) error {
239 239
 	return nil
240 240
 }
241 241
 
242
+func setOomScoreAdj(oomScoreAdj int) error {
243
+	path := "/proc/self/oom_score_adj"
244
+	return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0700)
245
+}
246
+
242 247
 // killCgroupProcesses freezes then iterates over all the processes inside the
243 248
 // manager's cgroups sending a SIGKILL to each process then waiting for them to
244 249
 // exit.
... ...
@@ -270,61 +276,3 @@ func killCgroupProcesses(m cgroups.Manager) error {
270 270
 	}
271 271
 	return nil
272 272
 }
273
-
274
-func finalizeSeccomp(config *initConfig) error {
275
-	if config.Config.Seccomp == nil {
276
-		return nil
277
-	}
278
-	context := seccomp.New()
279
-	for _, s := range config.Config.Seccomp.Syscalls {
280
-		ss := &seccomp.Syscall{
281
-			Value:  uint32(s.Value),
282
-			Action: seccompAction(s.Action),
283
-		}
284
-		if len(s.Args) > 0 {
285
-			ss.Args = seccompArgs(s.Args)
286
-		}
287
-		context.Add(ss)
288
-	}
289
-	return context.Load()
290
-}
291
-
292
-func seccompAction(a configs.Action) seccomp.Action {
293
-	switch a {
294
-	case configs.Kill:
295
-		return seccomp.Kill
296
-	case configs.Trap:
297
-		return seccomp.Trap
298
-	case configs.Allow:
299
-		return seccomp.Allow
300
-	}
301
-	return seccomp.Error(syscall.Errno(int(a)))
302
-}
303
-
304
-func seccompArgs(args []*configs.Arg) seccomp.Args {
305
-	var sa []seccomp.Arg
306
-	for _, a := range args {
307
-		sa = append(sa, seccomp.Arg{
308
-			Index: uint32(a.Index),
309
-			Op:    seccompOperator(a.Op),
310
-			Value: uint(a.Value),
311
-		})
312
-	}
313
-	return seccomp.Args{sa}
314
-}
315
-
316
-func seccompOperator(o configs.Operator) seccomp.Operator {
317
-	switch o {
318
-	case configs.EqualTo:
319
-		return seccomp.EqualTo
320
-	case configs.NotEqualTo:
321
-		return seccomp.NotEqualTo
322
-	case configs.GreatherThan:
323
-		return seccomp.GreatherThan
324
-	case configs.LessThan:
325
-		return seccomp.LessThan
326
-	case configs.MaskEqualTo:
327
-		return seccomp.MaskEqualTo
328
-	}
329
-	return 0
330
-}
... ...
@@ -29,7 +29,7 @@ func SetFileCreateLabel(fileLabel string) error {
29 29
 	return nil
30 30
 }
31 31
 
32
-func Relabel(path string, fileLabel string, relabel string) error {
32
+func Relabel(path string, fileLabel string, shared bool) error {
33 33
 	return nil
34 34
 }
35 35
 
... ...
@@ -59,3 +59,13 @@ func DupSecOpt(src string) []string {
59 59
 func DisableSecOpt() []string {
60 60
 	return nil
61 61
 }
62
+
63
+// Validate checks that the label does not include unexpected options
64
+func Validate(label string) error {
65
+	return nil
66
+}
67
+
68
+// IsShared checks that the label includes a "shared" mark
69
+func IsShared(label string) bool {
70
+	return false
71
+}
... ...
@@ -9,6 +9,8 @@ import (
9 9
 	"github.com/opencontainers/runc/libcontainer/selinux"
10 10
 )
11 11
 
12
+var ErrIncompatibleLabel = fmt.Errorf("Bad SELinux option z and Z can not be used together")
13
+
12 14
 // InitLabels returns the process label and file labels to be used within
13 15
 // the container.  A list of options can be passed into this function to alter
14 16
 // the labels.  The labels returned will include a random MCS String, that is
... ...
@@ -95,28 +97,24 @@ func SetFileCreateLabel(fileLabel string) error {
95 95
 	return nil
96 96
 }
97 97
 
98
-// Change the label of path to the filelabel string.  If the relabel string
99
-// is "z", relabel will change the MCS label to s0.  This will allow all
100
-// containers to share the content.  If the relabel string is a "Z" then
101
-// the MCS label should continue to be used.  SELinux will use this field
102
-// to make sure the content can not be shared by other containes.
103
-func Relabel(path string, fileLabel string, relabel string) error {
104
-	exclude_path := []string{"/", "/usr", "/etc"}
105
-	if fileLabel == "" {
98
+// Change the label of path to the filelabel string.
99
+// It changes the MCS label to s0 if shared is true.
100
+// This will allow all containers to share the content.
101
+func Relabel(path string, fileLabel string, shared bool) error {
102
+	if !selinux.SelinuxEnabled() {
106 103
 		return nil
107 104
 	}
108
-	if !strings.ContainsAny(relabel, "zZ") {
105
+
106
+	if fileLabel == "" {
109 107
 		return nil
110 108
 	}
111
-	for _, p := range exclude_path {
112
-		if path == p {
113
-			return fmt.Errorf("Relabeling of %s is not allowed", path)
114
-		}
115
-	}
116
-	if strings.Contains(relabel, "z") && strings.Contains(relabel, "Z") {
117
-		return fmt.Errorf("Bad SELinux option z and Z can not be used together")
109
+
110
+	exclude_paths := map[string]bool{"/": true, "/usr": true, "/etc": true}
111
+	if exclude_paths[path] {
112
+		return fmt.Errorf("Relabeling of %s is not allowed", path)
118 113
 	}
119
-	if strings.Contains(relabel, "z") {
114
+
115
+	if shared {
120 116
 		c := selinux.NewContext(fileLabel)
121 117
 		c["level"] = "s0"
122 118
 		fileLabel = c.Get()
... ...
@@ -161,3 +159,16 @@ func DupSecOpt(src string) []string {
161 161
 func DisableSecOpt() []string {
162 162
 	return selinux.DisableSecOpt()
163 163
 }
164
+
165
+// Validate checks that the label does not include unexpected options
166
+func Validate(label string) error {
167
+	if strings.Contains(label, "z") && strings.Contains(label, "Z") {
168
+		return ErrIncompatibleLabel
169
+	}
170
+	return nil
171
+}
172
+
173
+// IsShared checks that the label includes a "shared" mark
174
+func IsShared(label string) bool {
175
+	return strings.Contains(label, "z")
176
+}
... ...
@@ -1,4 +1,4 @@
1
-// +build arm ppc64
1
+// +build arm ppc64 ppc64le
2 2
 
3 3
 package netlink
4 4
 
... ...
@@ -1,4 +1,4 @@
1
-// +build !arm,!ppc64
1
+// +build !arm,!ppc64,!ppc64le
2 2
 
3 3
 package netlink
4 4
 
... ...
@@ -13,6 +13,7 @@ import (
13 13
 	"syscall"
14 14
 
15 15
 	"github.com/opencontainers/runc/libcontainer/cgroups"
16
+	"github.com/opencontainers/runc/libcontainer/configs"
16 17
 	"github.com/opencontainers/runc/libcontainer/system"
17 18
 )
18 19
 
... ...
@@ -138,11 +139,9 @@ func (p *setnsProcess) terminate() error {
138 138
 
139 139
 func (p *setnsProcess) wait() (*os.ProcessState, error) {
140 140
 	err := p.cmd.Wait()
141
-	if err != nil {
142
-		return p.cmd.ProcessState, err
143
-	}
144 141
 
145
-	return p.cmd.ProcessState, nil
142
+	// Return actual ProcessState even on Wait error
143
+	return p.cmd.ProcessState, err
146 144
 }
147 145
 
148 146
 func (p *setnsProcess) pid() int {
... ...
@@ -175,9 +174,9 @@ func (p *initProcess) externalDescriptors() []string {
175 175
 	return p.fds
176 176
 }
177 177
 
178
-func (p *initProcess) start() error {
178
+func (p *initProcess) start() (err error) {
179 179
 	defer p.parentPipe.Close()
180
-	err := p.cmd.Start()
180
+	err = p.cmd.Start()
181 181
 	p.childPipe.Close()
182 182
 	if err != nil {
183 183
 		return newSystemError(err)
... ...
@@ -202,6 +201,18 @@ func (p *initProcess) start() error {
202 202
 			p.manager.Destroy()
203 203
 		}
204 204
 	}()
205
+	if p.config.Config.Hooks != nil {
206
+		s := configs.HookState{
207
+			ID:   p.container.id,
208
+			Pid:  p.pid(),
209
+			Root: p.config.Config.Rootfs,
210
+		}
211
+		for _, hook := range p.config.Config.Hooks.Prestart {
212
+			if err := hook.Run(s); err != nil {
213
+				return newSystemError(err)
214
+			}
215
+		}
216
+	}
205 217
 	if err := p.createNetworkInterfaces(); err != nil {
206 218
 		return newSystemError(err)
207 219
 	}
... ...
@@ -286,9 +297,7 @@ func (p *initProcess) setExternalDescriptors(newFds []string) {
286 286
 }
287 287
 
288 288
 func getPipeFds(pid int) ([]string, error) {
289
-	var fds []string
290
-
291
-	fds = make([]string, 3)
289
+	fds := make([]string, 3)
292 290
 
293 291
 	dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
294 292
 	for i := 0; i < 3; i++ {
... ...
@@ -27,6 +27,8 @@ func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
27 27
 	if err := prepareRoot(config); err != nil {
28 28
 		return newSystemError(err)
29 29
 	}
30
+
31
+	setupDev := len(config.Devices) == 0
30 32
 	for _, m := range config.Mounts {
31 33
 		for _, precmd := range m.PremountCmds {
32 34
 			if err := mountCmd(precmd); err != nil {
... ...
@@ -43,14 +45,16 @@ func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
43 43
 			}
44 44
 		}
45 45
 	}
46
-	if err := createDevices(config); err != nil {
47
-		return newSystemError(err)
48
-	}
49
-	if err := setupPtmx(config, console); err != nil {
50
-		return newSystemError(err)
51
-	}
52
-	if err := setupDevSymlinks(config.Rootfs); err != nil {
53
-		return newSystemError(err)
46
+	if !setupDev {
47
+		if err := createDevices(config); err != nil {
48
+			return newSystemError(err)
49
+		}
50
+		if err := setupPtmx(config, console); err != nil {
51
+			return newSystemError(err)
52
+		}
53
+		if err := setupDevSymlinks(config.Rootfs); err != nil {
54
+			return newSystemError(err)
55
+		}
54 56
 	}
55 57
 	if err := syscall.Chdir(config.Rootfs); err != nil {
56 58
 		return newSystemError(err)
... ...
@@ -63,8 +67,10 @@ func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
63 63
 	if err != nil {
64 64
 		return newSystemError(err)
65 65
 	}
66
-	if err := reOpenDevNull(config.Rootfs); err != nil {
67
-		return newSystemError(err)
66
+	if !setupDev {
67
+		if err := reOpenDevNull(config.Rootfs); err != nil {
68
+			return newSystemError(err)
69
+		}
68 70
 	}
69 71
 	if config.Readonlyfs {
70 72
 		if err := setReadonly(); err != nil {
... ...
@@ -131,6 +137,11 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
131 131
 			return err
132 132
 		}
133 133
 		return syscall.Mount(m.Source, dest, m.Device, uintptr(m.Flags), data)
134
+	case "securityfs":
135
+		if err := os.MkdirAll(dest, 0755); err != nil {
136
+			return err
137
+		}
138
+		return syscall.Mount(m.Source, dest, m.Device, uintptr(m.Flags), data)
134 139
 	case "bind":
135 140
 		stat, err := os.Stat(m.Source)
136 141
 		if err != nil {
... ...
@@ -160,7 +171,11 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
160 160
 			}
161 161
 		}
162 162
 		if m.Relabel != "" {
163
-			if err := label.Relabel(m.Source, mountLabel, m.Relabel); err != nil {
163
+			if err := label.Validate(m.Relabel); err != nil {
164
+				return err
165
+			}
166
+			shared := label.IsShared(m.Relabel)
167
+			if err := label.Relabel(m.Source, mountLabel, shared); err != nil {
164 168
 				return err
165 169
 			}
166 170
 		}
167 171
deleted file mode 100644
... ...
@@ -1,34 +0,0 @@
1
-// +build linux
2
-
3
-package seccomp
4
-
5
-import "strings"
6
-
7
-type bpfLabel struct {
8
-	label    string
9
-	location uint32
10
-}
11
-
12
-type bpfLabels []bpfLabel
13
-
14
-// labelIndex returns the index for the label if it exists in the slice.
15
-// if it does not exist in the slice it appends the label lb to the end
16
-// of the slice and returns the index.
17
-func labelIndex(labels *bpfLabels, lb string) uint32 {
18
-	var id uint32
19
-	for id = 0; id < uint32(len(*labels)); id++ {
20
-		if strings.EqualFold(lb, (*labels)[id].label) {
21
-			return id
22
-		}
23
-	}
24
-	*labels = append(*labels, bpfLabel{lb, 0xffffffff})
25
-	return id
26
-}
27
-
28
-func scmpBpfStmt(code uint16, k uint32) sockFilter {
29
-	return sockFilter{code, 0, 0, k}
30
-}
31
-
32
-func scmpBpfJump(code uint16, k uint32, jt, jf uint8) sockFilter {
33
-	return sockFilter{code, jt, jf, k}
34
-}
35 1
new file mode 100644
... ...
@@ -0,0 +1,53 @@
0
+package seccomp
1
+
2
+import (
3
+	"fmt"
4
+
5
+	"github.com/opencontainers/runc/libcontainer/configs"
6
+)
7
+
8
+// ConvertStringToOperator converts a string into a Seccomp comparison operator.
9
+// Comparison operators use the names they are assigned by Libseccomp's header.
10
+// Attempting to convert a string that is not a valid operator results in an
11
+// error.
12
+func ConvertStringToOperator(in string) (configs.Operator, error) {
13
+	switch in {
14
+	case "SCMP_CMP_NE":
15
+		return configs.NotEqualTo, nil
16
+	case "SCMP_CMP_LT":
17
+		return configs.LessThan, nil
18
+	case "SCMP_CMP_LE":
19
+		return configs.LessThanOrEqualTo, nil
20
+	case "SCMP_CMP_EQ":
21
+		return configs.EqualTo, nil
22
+	case "SCMP_CMP_GE":
23
+		return configs.GreaterThan, nil
24
+	case "SCMP_CMP_GT":
25
+		return configs.GreaterThanOrEqualTo, nil
26
+	case "SCMP_CMP_MASKED_EQ":
27
+		return configs.MaskEqualTo, nil
28
+	default:
29
+		return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in)
30
+	}
31
+}
32
+
33
+// ConvertStringToAction converts a string into a Seccomp rule match action.
34
+// Actions use the names they are assigned in Libseccomp's header, though some
35
+// (notably, SCMP_ACT_TRACE) are not available in this implementation and will
36
+// return errors.
37
+// Attempting to convert a string that is not a valid action results in an
38
+// error.
39
+func ConvertStringToAction(in string) (configs.Action, error) {
40
+	switch in {
41
+	case "SCMP_ACT_KILL":
42
+		return configs.Kill, nil
43
+	case "SCMP_ACT_ERRNO":
44
+		return configs.Errno, nil
45
+	case "SCMP_ACT_TRAP":
46
+		return configs.Trap, nil
47
+	case "SCMP_ACT_ALLOW":
48
+		return configs.Allow, nil
49
+	default:
50
+		return 0, fmt.Errorf("string %s is not a valid action for seccomp", in)
51
+	}
52
+}
0 53
deleted file mode 100644
... ...
@@ -1,146 +0,0 @@
1
-// +build linux
2
-
3
-package seccomp
4
-
5
-import (
6
-	"errors"
7
-	"syscall"
8
-)
9
-
10
-const labelTemplate = "lb-%d-%d"
11
-
12
-// Action is the type of action that will be taken when a
13
-// syscall is performed.
14
-type Action int
15
-
16
-const (
17
-	Kill  Action = iota - 3 // Kill the calling process of the syscall.
18
-	Trap                    // Trap and coredump the calling process of the syscall.
19
-	Allow                   // Allow the syscall to be completed.
20
-)
21
-
22
-// Syscall is the specified syscall, action, and any type of arguments
23
-// to filter on.
24
-type Syscall struct {
25
-	// Value is the syscall number.
26
-	Value uint32
27
-	// Action is the action to perform when the specified syscall is made.
28
-	Action Action
29
-	// Args are filters that can be specified on the arguments to the syscall.
30
-	Args Args
31
-}
32
-
33
-func (s *Syscall) scmpAction() uint32 {
34
-	switch s.Action {
35
-	case Allow:
36
-		return retAllow
37
-	case Trap:
38
-		return retTrap
39
-	case Kill:
40
-		return retKill
41
-	}
42
-	return actionErrno(uint32(s.Action))
43
-}
44
-
45
-// Arg represents an argument to the syscall with the argument's index,
46
-// the operator to apply when matching, and the argument's value at that time.
47
-type Arg struct {
48
-	Index uint32   // index of args which start from zero
49
-	Op    Operator // operation, such as EQ/NE/GE/LE
50
-	Value uint     // the value of arg
51
-}
52
-
53
-type Args [][]Arg
54
-
55
-var (
56
-	ErrUnresolvedLabel      = errors.New("seccomp: unresolved label")
57
-	ErrDuplicateLabel       = errors.New("seccomp: duplicate label use")
58
-	ErrUnsupportedOperation = errors.New("seccomp: unsupported operation for argument")
59
-)
60
-
61
-// Error returns an Action that will be used to send the calling
62
-// process the specified errno when the syscall is made.
63
-func Error(code syscall.Errno) Action {
64
-	return Action(code)
65
-}
66
-
67
-// New returns a new syscall context for use.
68
-func New() *Context {
69
-	return &Context{
70
-		syscalls: make(map[uint32]*Syscall),
71
-	}
72
-}
73
-
74
-// Context holds syscalls for the current process to limit the type of
75
-// actions the calling process can make.
76
-type Context struct {
77
-	syscalls map[uint32]*Syscall
78
-}
79
-
80
-// Add will add the specified syscall, action, and arguments to the seccomp
81
-// Context.
82
-func (c *Context) Add(s *Syscall) {
83
-	c.syscalls[s.Value] = s
84
-}
85
-
86
-// Remove removes the specified syscall configuration from the Context.
87
-func (c *Context) Remove(call uint32) {
88
-	delete(c.syscalls, call)
89
-}
90
-
91
-// Load will apply the Context to the calling process makeing any secccomp process changes
92
-// apply after the context is loaded.
93
-func (c *Context) Load() error {
94
-	filter, err := c.newFilter()
95
-	if err != nil {
96
-		return err
97
-	}
98
-	if err := prctl(prSetNoNewPrivileges, 1, 0, 0, 0); err != nil {
99
-		return err
100
-	}
101
-	prog := newSockFprog(filter)
102
-	return prog.set()
103
-}
104
-
105
-func (c *Context) newFilter() ([]sockFilter, error) {
106
-	var (
107
-		labels bpfLabels
108
-		f      = newFilter()
109
-	)
110
-	for _, s := range c.syscalls {
111
-		f.addSyscall(s, &labels)
112
-	}
113
-	f.allow()
114
-	// process args for the syscalls
115
-	for _, s := range c.syscalls {
116
-		if err := f.addArguments(s, &labels); err != nil {
117
-			return nil, err
118
-		}
119
-	}
120
-	// apply labels for arguments
121
-	idx := int32(len(*f) - 1)
122
-	for ; idx >= 0; idx-- {
123
-		lf := &(*f)[idx]
124
-		if lf.code != (syscall.BPF_JMP + syscall.BPF_JA) {
125
-			continue
126
-		}
127
-		rel := int32(lf.jt)<<8 | int32(lf.jf)
128
-		if ((jumpJT << 8) | jumpJF) == rel {
129
-			if labels[lf.k].location == 0xffffffff {
130
-				return nil, ErrUnresolvedLabel
131
-			}
132
-			lf.k = labels[lf.k].location - uint32(idx+1)
133
-			lf.jt = 0
134
-			lf.jf = 0
135
-		} else if ((labelJT << 8) | labelJF) == rel {
136
-			if labels[lf.k].location != 0xffffffff {
137
-				return nil, ErrDuplicateLabel
138
-			}
139
-			labels[lf.k].location = uint32(idx)
140
-			lf.k = 0
141
-			lf.jt = 0
142
-			lf.jf = 0
143
-		}
144
-	}
145
-	return *f, nil
146
-}
147 1
deleted file mode 100644
... ...
@@ -1,118 +0,0 @@
1
-// +build linux
2
-
3
-package seccomp
4
-
5
-import (
6
-	"fmt"
7
-	"syscall"
8
-	"unsafe"
9
-)
10
-
11
-type sockFilter struct {
12
-	code uint16
13
-	jt   uint8
14
-	jf   uint8
15
-	k    uint32
16
-}
17
-
18
-func newFilter() *filter {
19
-	var f filter
20
-	f = append(f, sockFilter{
21
-		pfLD + syscall.BPF_W + syscall.BPF_ABS,
22
-		0,
23
-		0,
24
-		uint32(unsafe.Offsetof(secData.nr)),
25
-	})
26
-	return &f
27
-}
28
-
29
-type filter []sockFilter
30
-
31
-func (f *filter) addSyscall(s *Syscall, labels *bpfLabels) {
32
-	if len(s.Args) == 0 {
33
-		f.call(s.Value, scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, s.scmpAction()))
34
-	} else {
35
-		if len(s.Args[0]) > 0 {
36
-			lb := fmt.Sprintf(labelTemplate, s.Value, s.Args[0][0].Index)
37
-			f.call(s.Value,
38
-				scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA, labelIndex(labels, lb),
39
-					jumpJT, jumpJF))
40
-		}
41
-	}
42
-}
43
-
44
-func (f *filter) addArguments(s *Syscall, labels *bpfLabels) error {
45
-	for i := 0; len(s.Args) > i; i++ {
46
-		if len(s.Args[i]) > 0 {
47
-			lb := fmt.Sprintf(labelTemplate, s.Value, s.Args[i][0].Index)
48
-			f.label(labels, lb)
49
-			f.arg(s.Args[i][0].Index)
50
-		}
51
-		for j := 0; j < len(s.Args[i]); j++ {
52
-			var jf sockFilter
53
-			if len(s.Args)-1 > i && len(s.Args[i+1]) > 0 {
54
-				lbj := fmt.Sprintf(labelTemplate, s.Value, s.Args[i+1][0].Index)
55
-				jf = scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA,
56
-					labelIndex(labels, lbj), jumpJT, jumpJF)
57
-			} else {
58
-				jf = scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, s.scmpAction())
59
-			}
60
-			if err := f.op(s.Args[i][j].Op, s.Args[i][j].Value, jf); err != nil {
61
-				return err
62
-			}
63
-		}
64
-		f.allow()
65
-	}
66
-	return nil
67
-}
68
-
69
-func (f *filter) label(labels *bpfLabels, lb string) {
70
-	*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA, labelIndex(labels, lb), labelJT, labelJF))
71
-}
72
-
73
-func (f *filter) call(nr uint32, jt sockFilter) {
74
-	*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, nr, 0, 1))
75
-	*f = append(*f, jt)
76
-}
77
-
78
-func (f *filter) allow() {
79
-	*f = append(*f, scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, retAllow))
80
-}
81
-
82
-func (f *filter) deny() {
83
-	*f = append(*f, scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, retTrap))
84
-}
85
-
86
-func (f *filter) arg(index uint32) {
87
-	arg(f, index)
88
-}
89
-
90
-func (f *filter) op(operation Operator, v uint, jf sockFilter) error {
91
-	switch operation {
92
-	case EqualTo:
93
-		jumpEqualTo(f, v, jf)
94
-	case NotEqualTo:
95
-		jumpNotEqualTo(f, v, jf)
96
-	case GreatherThan:
97
-		jumpGreaterThan(f, v, jf)
98
-	case LessThan:
99
-		jumpLessThan(f, v, jf)
100
-	case MaskEqualTo:
101
-		jumpMaskEqualTo(f, v, jf)
102
-	default:
103
-		return ErrUnsupportedOperation
104
-	}
105
-	return nil
106
-}
107
-
108
-func arg(f *filter, idx uint32) {
109
-	*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_W+syscall.BPF_ABS, endian.low(idx)))
110
-	*f = append(*f, scmpBpfStmt(syscall.BPF_ST, 0))
111
-	*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_W+syscall.BPF_ABS, endian.hi(idx)))
112
-	*f = append(*f, scmpBpfStmt(syscall.BPF_ST, 1))
113
-}
114
-
115
-func jump(f *filter, labels *bpfLabels, lb string) {
116
-	*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA, labelIndex(labels, lb),
117
-		jumpJT, jumpJF))
118
-}
119 1
deleted file mode 100644
... ...
@@ -1,68 +0,0 @@
1
-// +build linux,amd64
2
-
3
-package seccomp
4
-
5
-// Using BPF filters
6
-//
7
-// ref: http://www.gsp.com/cgi-bin/man.cgi?topic=bpf
8
-import "syscall"
9
-
10
-func jumpGreaterThan(f *filter, v uint, jt sockFilter) {
11
-	lo := uint32(uint64(v) % 0x100000000)
12
-	hi := uint32(uint64(v) / 0x100000000)
13
-	*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGT+syscall.BPF_K, (hi), 4, 0))
14
-	*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (hi), 0, 5))
15
-	*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0))
16
-	*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGE+syscall.BPF_K, (lo), 0, 2))
17
-	*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
18
-	*f = append(*f, jt)
19
-	*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
20
-}
21
-
22
-func jumpEqualTo(f *filter, v uint, jt sockFilter) {
23
-	lo := uint32(uint64(v) % 0x100000000)
24
-	hi := uint32(uint64(v) / 0x100000000)
25
-	*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (hi), 0, 5))
26
-	*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0))
27
-	*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (lo), 0, 2))
28
-	*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
29
-	*f = append(*f, jt)
30
-	*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
31
-}
32
-
33
-func jumpLessThan(f *filter, v uint, jt sockFilter) {
34
-	lo := uint32(uint64(v) % 0x100000000)
35
-	hi := uint32(uint64(v) / 0x100000000)
36
-	*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGT+syscall.BPF_K, (hi), 6, 0))
37
-	*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (hi), 0, 3))
38
-	*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0))
39
-	*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGT+syscall.BPF_K, (lo), 2, 0))
40
-	*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
41
-	*f = append(*f, jt)
42
-	*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
43
-}
44
-
45
-func jumpNotEqualTo(f *filter, v uint, jt sockFilter) {
46
-	lo := uint32(uint64(v) % 0x100000000)
47
-	hi := uint32(uint64(v) / 0x100000000)
48
-	*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, hi, 5, 0))
49
-	*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0))
50
-	*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, lo, 2, 0))
51
-	*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
52
-	*f = append(*f, jt)
53
-	*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
54
-}
55
-
56
-// this checks for a value inside a mask. The evalusation is equal to doing
57
-// CLONE_NEWUSER & syscallMask == CLONE_NEWUSER
58
-func jumpMaskEqualTo(f *filter, v uint, jt sockFilter) {
59
-	lo := uint32(uint64(v) % 0x100000000)
60
-	hi := uint32(uint64(v) / 0x100000000)
61
-	*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, hi, 0, 6))
62
-	*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0))
63
-	*f = append(*f, scmpBpfStmt(syscall.BPF_ALU+syscall.BPF_AND, uint32(v)))
64
-	*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, lo, 0, 2))
65
-	*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
66
-	*f = append(*f, jt)
67
-	*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
68
-}
69 1
new file mode 100644
... ...
@@ -0,0 +1,165 @@
0
+// +build linux,cgo,seccomp
1
+
2
+package seccomp
3
+
4
+import (
5
+	"fmt"
6
+	"log"
7
+	"syscall"
8
+
9
+	"github.com/opencontainers/runc/libcontainer/configs"
10
+	libseccomp "github.com/seccomp/libseccomp-golang"
11
+)
12
+
13
+var (
14
+	actAllow = libseccomp.ActAllow
15
+	actTrap  = libseccomp.ActTrap
16
+	actKill  = libseccomp.ActKill
17
+	actErrno = libseccomp.ActErrno.SetReturnCode(int16(syscall.EPERM))
18
+)
19
+
20
+// Filters given syscalls in a container, preventing them from being used
21
+// Started in the container init process, and carried over to all child processes
22
+// Setns calls, however, require a separate invocation, as they are not children
23
+// of the init until they join the namespace
24
+func InitSeccomp(config *configs.Seccomp) error {
25
+	if config == nil {
26
+		return fmt.Errorf("cannot initialize Seccomp - nil config passed")
27
+	}
28
+
29
+	defaultAction, err := getAction(config.DefaultAction)
30
+	if err != nil {
31
+		return fmt.Errorf("error initializing seccomp - invalid default action")
32
+	}
33
+
34
+	filter, err := libseccomp.NewFilter(defaultAction)
35
+	if err != nil {
36
+		return fmt.Errorf("error creating filter: %s", err)
37
+	}
38
+
39
+	// Unset no new privs bit
40
+	if err := filter.SetNoNewPrivsBit(false); err != nil {
41
+		return fmt.Errorf("error setting no new privileges: %s", err)
42
+	}
43
+
44
+	// Add a rule for each syscall
45
+	for _, call := range config.Syscalls {
46
+		if call == nil {
47
+			return fmt.Errorf("encountered nil syscall while initializing Seccomp")
48
+		}
49
+
50
+		if err = matchCall(filter, call); err != nil {
51
+			return err
52
+		}
53
+	}
54
+
55
+	if err = filter.Load(); err != nil {
56
+		return fmt.Errorf("error loading seccomp filter into kernel: %s", err)
57
+	}
58
+
59
+	return nil
60
+}
61
+
62
+// Convert Libcontainer Action to Libseccomp ScmpAction
63
+func getAction(act configs.Action) (libseccomp.ScmpAction, error) {
64
+	switch act {
65
+	case configs.Kill:
66
+		return actKill, nil
67
+	case configs.Errno:
68
+		return actErrno, nil
69
+	case configs.Trap:
70
+		return actTrap, nil
71
+	case configs.Allow:
72
+		return actAllow, nil
73
+	default:
74
+		return libseccomp.ActInvalid, fmt.Errorf("invalid action, cannot use in rule")
75
+	}
76
+}
77
+
78
+// Convert Libcontainer Operator to Libseccomp ScmpCompareOp
79
+func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) {
80
+	switch op {
81
+	case configs.EqualTo:
82
+		return libseccomp.CompareEqual, nil
83
+	case configs.NotEqualTo:
84
+		return libseccomp.CompareNotEqual, nil
85
+	case configs.GreaterThan:
86
+		return libseccomp.CompareGreater, nil
87
+	case configs.GreaterThanOrEqualTo:
88
+		return libseccomp.CompareGreaterEqual, nil
89
+	case configs.LessThan:
90
+		return libseccomp.CompareLess, nil
91
+	case configs.LessThanOrEqualTo:
92
+		return libseccomp.CompareLessOrEqual, nil
93
+	case configs.MaskEqualTo:
94
+		return libseccomp.CompareMaskedEqual, nil
95
+	default:
96
+		return libseccomp.CompareInvalid, fmt.Errorf("invalid operator, cannot use in rule")
97
+	}
98
+}
99
+
100
+// Convert Libcontainer Arg to Libseccomp ScmpCondition
101
+func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) {
102
+	cond := libseccomp.ScmpCondition{}
103
+
104
+	if arg == nil {
105
+		return cond, fmt.Errorf("cannot convert nil to syscall condition")
106
+	}
107
+
108
+	op, err := getOperator(arg.Op)
109
+	if err != nil {
110
+		return cond, err
111
+	}
112
+
113
+	return libseccomp.MakeCondition(arg.Index, op, arg.Value, arg.ValueTwo)
114
+}
115
+
116
+// Add a rule to match a single syscall
117
+func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
118
+	if call == nil || filter == nil {
119
+		return fmt.Errorf("cannot use nil as syscall to block")
120
+	}
121
+
122
+	if len(call.Name) == 0 {
123
+		return fmt.Errorf("empty string is not a valid syscall")
124
+	}
125
+
126
+	// If we can't resolve the syscall, assume it's not supported on this kernel
127
+	// Ignore it, don't error out
128
+	callNum, err := libseccomp.GetSyscallFromName(call.Name)
129
+	if err != nil {
130
+		log.Printf("Error resolving syscall name %s: %s - ignoring syscall.", call.Name, err)
131
+		return nil
132
+	}
133
+
134
+	// Convert the call's action to the libseccomp equivalent
135
+	callAct, err := getAction(call.Action)
136
+	if err != nil {
137
+		return err
138
+	}
139
+
140
+	// Unconditional match - just add the rule
141
+	if len(call.Args) == 0 {
142
+		if err = filter.AddRule(callNum, callAct); err != nil {
143
+			return err
144
+		}
145
+	} else {
146
+		// Conditional match - convert the per-arg rules into library format
147
+		conditions := []libseccomp.ScmpCondition{}
148
+
149
+		for _, cond := range call.Args {
150
+			newCond, err := getCondition(cond)
151
+			if err != nil {
152
+				return err
153
+			}
154
+
155
+			conditions = append(conditions, newCond)
156
+		}
157
+
158
+		if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil {
159
+			return err
160
+		}
161
+	}
162
+
163
+	return nil
164
+}
0 165
deleted file mode 100644
... ...
@@ -1,124 +0,0 @@
1
-// +build linux
2
-
3
-// Package seccomp provides native seccomp ( https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt ) support for go.
4
-package seccomp
5
-
6
-import (
7
-	"syscall"
8
-	"unsafe"
9
-)
10
-
11
-// Operator that is used for argument comparison.
12
-type Operator int
13
-
14
-const (
15
-	EqualTo Operator = iota
16
-	NotEqualTo
17
-	GreatherThan
18
-	LessThan
19
-	MaskEqualTo
20
-)
21
-
22
-const (
23
-	jumpJT  = 0xff
24
-	jumpJF  = 0xff
25
-	labelJT = 0xfe
26
-	labelJF = 0xfe
27
-)
28
-
29
-const (
30
-	pfLD                 = 0x0
31
-	retKill              = 0x00000000
32
-	retTrap              = 0x00030000
33
-	retAllow             = 0x7fff0000
34
-	modeFilter           = 0x2
35
-	prSetNoNewPrivileges = 0x26
36
-)
37
-
38
-func actionErrno(errno uint32) uint32 {
39
-	return 0x00050000 | (errno & 0x0000ffff)
40
-}
41
-
42
-var (
43
-	secData = struct {
44
-		nr         int32
45
-		arch       uint32
46
-		insPointer uint64
47
-		args       [6]uint64
48
-	}{0, 0, 0, [6]uint64{0, 0, 0, 0, 0, 0}}
49
-)
50
-
51
-var isLittle = func() bool {
52
-	var (
53
-		x  = 0x1234
54
-		p  = unsafe.Pointer(&x)
55
-		p2 = (*[unsafe.Sizeof(0)]byte)(p)
56
-	)
57
-	if p2[0] == 0 {
58
-		return false
59
-	}
60
-	return true
61
-}()
62
-
63
-var endian endianSupport
64
-
65
-type endianSupport struct {
66
-}
67
-
68
-func (e endianSupport) hi(i uint32) uint32 {
69
-	if isLittle {
70
-		return e.little(i)
71
-	}
72
-	return e.big(i)
73
-}
74
-
75
-func (e endianSupport) low(i uint32) uint32 {
76
-	if isLittle {
77
-		return e.big(i)
78
-	}
79
-	return e.little(i)
80
-}
81
-
82
-func (endianSupport) big(idx uint32) uint32 {
83
-	if idx >= 6 {
84
-		return 0
85
-	}
86
-	return uint32(unsafe.Offsetof(secData.args)) + 8*idx
87
-}
88
-
89
-func (endianSupport) little(idx uint32) uint32 {
90
-	if idx < 0 || idx >= 6 {
91
-		return 0
92
-	}
93
-	return uint32(unsafe.Offsetof(secData.args)) +
94
-		uint32(unsafe.Alignof(secData.args[0]))*idx + uint32(unsafe.Sizeof(secData.arch))
95
-}
96
-
97
-func prctl(option int, arg2, arg3, arg4, arg5 uintptr) error {
98
-	_, _, err := syscall.Syscall6(syscall.SYS_PRCTL, uintptr(option), arg2, arg3, arg4, arg5, 0)
99
-	if err != 0 {
100
-		return err
101
-	}
102
-	return nil
103
-}
104
-
105
-func newSockFprog(filter []sockFilter) *sockFprog {
106
-	return &sockFprog{
107
-		len:  uint16(len(filter)),
108
-		filt: filter,
109
-	}
110
-}
111
-
112
-type sockFprog struct {
113
-	len  uint16
114
-	filt []sockFilter
115
-}
116
-
117
-func (s *sockFprog) set() error {
118
-	_, _, err := syscall.Syscall(syscall.SYS_PRCTL, uintptr(syscall.PR_SET_SECCOMP),
119
-		uintptr(modeFilter), uintptr(unsafe.Pointer(s)))
120
-	if err != 0 {
121
-		return err
122
-	}
123
-	return nil
124
-}
... ...
@@ -1,3 +1,19 @@
1
-// +build !linux
1
+// +build !linux !cgo !seccomp
2 2
 
3 3
 package seccomp
4
+
5
+import (
6
+	"errors"
7
+
8
+	"github.com/opencontainers/runc/libcontainer/configs"
9
+)
10
+
11
+var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
12
+
13
+// Seccomp not supported, do nothing
14
+func InitSeccomp(config *configs.Seccomp) error {
15
+	if config != nil {
16
+		return ErrSeccompNotEnabled
17
+	}
18
+	return nil
19
+}
... ...
@@ -7,6 +7,7 @@ import (
7 7
 
8 8
 	"github.com/opencontainers/runc/libcontainer/apparmor"
9 9
 	"github.com/opencontainers/runc/libcontainer/label"
10
+	"github.com/opencontainers/runc/libcontainer/seccomp"
10 11
 	"github.com/opencontainers/runc/libcontainer/system"
11 12
 )
12 13
 
... ...
@@ -20,6 +21,14 @@ func (l *linuxSetnsInit) Init() error {
20 20
 	if err := setupRlimits(l.config.Config); err != nil {
21 21
 		return err
22 22
 	}
23
+	if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
24
+		return err
25
+	}
26
+	if l.config.Config.Seccomp != nil {
27
+		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
28
+			return err
29
+		}
30
+	}
23 31
 	if err := finalizeNamespace(l.config); err != nil {
24 32
 		return err
25 33
 	}
... ...
@@ -9,6 +9,7 @@ import (
9 9
 	"github.com/opencontainers/runc/libcontainer/apparmor"
10 10
 	"github.com/opencontainers/runc/libcontainer/configs"
11 11
 	"github.com/opencontainers/runc/libcontainer/label"
12
+	"github.com/opencontainers/runc/libcontainer/seccomp"
12 13
 	"github.com/opencontainers/runc/libcontainer/system"
13 14
 )
14 15
 
... ...
@@ -46,6 +47,10 @@ func (l *linuxStandardInit) Init() error {
46 46
 	if err := setupRlimits(l.config.Config); err != nil {
47 47
 		return err
48 48
 	}
49
+	if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
50
+		return err
51
+	}
52
+
49 53
 	label.Init()
50 54
 	// InitializeMountNamespace() can be executed only for a new mount namespace
51 55
 	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
... ...
@@ -85,6 +90,11 @@ func (l *linuxStandardInit) Init() error {
85 85
 	if err != nil {
86 86
 		return err
87 87
 	}
88
+	if l.config.Config.Seccomp != nil {
89
+		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
90
+			return err
91
+		}
92
+	}
88 93
 	if err := finalizeNamespace(l.config); err != nil {
89 94
 		return err
90 95
 	}
... ...
@@ -99,8 +109,5 @@ func (l *linuxStandardInit) Init() error {
99 99
 	if syscall.Getppid() != l.parentPid {
100 100
 		return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
101 101
 	}
102
-	if err := finalizeSeccomp(l.config); err != nil {
103
-		return err
104
-	}
105 102
 	return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
106 103
 }