Browse code

Merge pull request #40662 from AkihiroSuda/cgroup2-dockerinfo

cgroup2: implement `docker info`

Sebastiaan van Stijn authored on 2020/04/30 05:57:00
Showing 66 changed files
... ...
@@ -4047,6 +4047,13 @@ definitions:
4047 4047
         enum: ["cgroupfs", "systemd", "none"]
4048 4048
         default: "cgroupfs"
4049 4049
         example: "cgroupfs"
4050
+      CgroupVersion:
4051
+        description: |
4052
+          The version of the cgroup.
4053
+        type: "string"
4054
+        enum: ["1", "2"]
4055
+        default: "1"
4056
+        example: "1"
4050 4057
       NEventsListener:
4051 4058
         description: "Number of event listeners subscribed."
4052 4059
         type: "integer"
... ...
@@ -175,6 +175,7 @@ type Info struct {
175 175
 	SystemTime         string
176 176
 	LoggingDriver      string
177 177
 	CgroupDriver       string
178
+	CgroupVersion      string `json:",omitempty"`
178 179
 	NEventsListener    int
179 180
 	KernelVersion      string
180 181
 	OperatingSystem    string
... ...
@@ -44,6 +44,7 @@ import (
44 44
 	"github.com/docker/docker/pkg/pidfile"
45 45
 	"github.com/docker/docker/pkg/plugingetter"
46 46
 	"github.com/docker/docker/pkg/signal"
47
+	"github.com/docker/docker/pkg/sysinfo"
47 48
 	"github.com/docker/docker/pkg/system"
48 49
 	"github.com/docker/docker/plugin"
49 50
 	"github.com/docker/docker/rootless"
... ...
@@ -452,7 +453,11 @@ func warnOnDeprecatedConfigOptions(config *config.Config) {
452 452
 }
453 453
 
454 454
 func initRouter(opts routerOptions) {
455
-	decoder := runconfig.ContainerDecoder{}
455
+	decoder := runconfig.ContainerDecoder{
456
+		GetSysInfo: func() *sysinfo.SysInfo {
457
+			return opts.daemon.RawSysInfo(true)
458
+		},
459
+	}
456 460
 
457 461
 	routers := []router.Router{
458 462
 		// we need to add the checkpoint router before the container router or the DELETE gets masked
... ...
@@ -42,6 +42,7 @@ import (
42 42
 	"github.com/docker/docker/errdefs"
43 43
 	bkconfig "github.com/moby/buildkit/cmd/buildkitd/config"
44 44
 	"github.com/moby/buildkit/util/resolver"
45
+	rsystem "github.com/opencontainers/runc/libcontainer/system"
45 46
 	"github.com/sirupsen/logrus"
46 47
 
47 48
 	// register graph drivers
... ...
@@ -56,7 +57,6 @@ import (
56 56
 	"github.com/docker/docker/pkg/idtools"
57 57
 	"github.com/docker/docker/pkg/locker"
58 58
 	"github.com/docker/docker/pkg/plugingetter"
59
-	"github.com/docker/docker/pkg/sysinfo"
60 59
 	"github.com/docker/docker/pkg/system"
61 60
 	"github.com/docker/docker/pkg/truncindex"
62 61
 	"github.com/docker/docker/plugin"
... ...
@@ -1026,10 +1026,10 @@ func NewDaemon(ctx context.Context, config *config.Config, pluginStore *plugin.S
1026 1026
 		return nil, err
1027 1027
 	}
1028 1028
 
1029
-	sysInfo := sysinfo.New(false)
1029
+	sysInfo := d.RawSysInfo(false)
1030 1030
 	// Check if Devices cgroup is mounted, it is hard requirement for container security,
1031 1031
 	// on Linux.
1032
-	if runtime.GOOS == "linux" && !sysInfo.CgroupDevicesEnabled {
1032
+	if runtime.GOOS == "linux" && !sysInfo.CgroupDevicesEnabled && !rsystem.RunningInUserNS() {
1033 1033
 		return nil, errors.New("Devices cgroup isn't mounted")
1034 1034
 	}
1035 1035
 
... ...
@@ -644,7 +644,7 @@ func verifyPlatformContainerSettings(daemon *Daemon, hostConfig *containertypes.
644 644
 	if hostConfig == nil {
645 645
 		return nil, nil
646 646
 	}
647
-	sysInfo := sysinfo.New(true)
647
+	sysInfo := daemon.RawSysInfo(true)
648 648
 
649 649
 	w, err := verifyPlatformContainerResources(&hostConfig.Resources, sysInfo, update)
650 650
 
... ...
@@ -1745,7 +1745,7 @@ func (daemon *Daemon) initCgroupsPath(path string) error {
1745 1745
 	}
1746 1746
 
1747 1747
 	path = filepath.Join(mnt, root, path)
1748
-	sysInfo := sysinfo.New(true)
1748
+	sysInfo := daemon.RawSysInfo(true)
1749 1749
 	if err := maybeCreateCPURealTimeFile(sysInfo.CPURealtimePeriod, daemon.configStore.CPURealtimePeriod, "cpu.rt_period_us", path); err != nil {
1750 1750
 		return err
1751 1751
 	}
... ...
@@ -1779,3 +1779,16 @@ func (daemon *Daemon) setupSeccompProfile() error {
1779 1779
 func (daemon *Daemon) useShimV2() bool {
1780 1780
 	return cgroups.IsCgroup2UnifiedMode()
1781 1781
 }
1782
+
1783
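+// RawSysInfo returns a *sysinfo.SysInfo; with the systemd cgroup driver and ROOTLESSKIT_PARENT_EUID set, the parent user's slice is passed as the cgroup v2 group path.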
1784
+func (daemon *Daemon) RawSysInfo(quiet bool) *sysinfo.SysInfo {
1785
+	var opts []sysinfo.Opt
1786
+	if daemon.getCgroupDriver() == cgroupSystemdDriver {
1787
+		rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
1788
+		if rootlesskitParentEUID != "" {
1789
+			groupPath := fmt.Sprintf("/user.slice/user-%s.slice", rootlesskitParentEUID)
1790
+			opts = append(opts, sysinfo.WithCgroup2GroupPath(groupPath))
1791
+		}
1792
+	}
1793
+	return sysinfo.New(quiet, opts...)
1794
+}
... ...
@@ -1,9 +1,18 @@
1 1
 // +build !linux,!freebsd,!windows
2 2
 
3 3
 package daemon // import "github.com/docker/docker/daemon"
4
-import "github.com/docker/docker/daemon/config"
4
+
5
+import (
6
+	"github.com/docker/docker/daemon/config"
7
+	"github.com/docker/docker/pkg/sysinfo"
8
+)
5 9
 
6 10
 const platformSupported = false
7 11
 
8 12
 func setupResolvConf(config *config.Config) {
9 13
 }
14
+
15
+// RawSysInfo returns the *sysinfo.SysInfo created by sysinfo.New(quiet).
16
+func (daemon *Daemon) RawSysInfo(quiet bool) *sysinfo.SysInfo {
17
+	return sysinfo.New(quiet)
18
+}
... ...
@@ -657,3 +657,8 @@ func setupResolvConf(config *config.Config) {
657 657
 func (daemon *Daemon) useShimV2() bool {
658 658
 	return true
659 659
 }
660
+
661
+// RawSysInfo returns the *sysinfo.SysInfo created by sysinfo.New(quiet).
662
+func (daemon *Daemon) RawSysInfo(quiet bool) *sysinfo.SysInfo {
663
+	return sysinfo.New(quiet)
664
+}
... ...
@@ -28,7 +28,7 @@ import (
28 28
 func (daemon *Daemon) SystemInfo() *types.Info {
29 29
 	defer metrics.StartTimer(hostInfoFunctions.WithValues("system_info"))()
30 30
 
31
-	sysInfo := sysinfo.New(true)
31
+	sysInfo := daemon.RawSysInfo(true)
32 32
 	cRunning, cPaused, cStopped := stateCtr.get()
33 33
 
34 34
 	v := &types.Info{
... ...
@@ -47,7 +47,6 @@ func (daemon *Daemon) SystemInfo() *types.Info {
47 47
 		NGoroutines:        runtime.NumGoroutine(),
48 48
 		SystemTime:         time.Now().Format(time.RFC3339Nano),
49 49
 		LoggingDriver:      daemon.defaultLogConfig.Type,
50
-		CgroupDriver:       daemon.getCgroupDriver(),
51 50
 		NEventsListener:    daemon.EventsService.SubscribersCount(),
52 51
 		KernelVersion:      kernelVersion(),
53 52
 		OperatingSystem:    operatingSystem(),
... ...
@@ -19,6 +19,12 @@ import (
19 19
 
20 20
 // fillPlatformInfo fills the platform related info.
21 21
 func (daemon *Daemon) fillPlatformInfo(v *types.Info, sysInfo *sysinfo.SysInfo) {
22
+	v.CgroupDriver = daemon.getCgroupDriver()
23
+	v.CgroupVersion = "1"
24
+	if sysInfo.CgroupUnified {
25
+		v.CgroupVersion = "2"
26
+	}
27
+
22 28
 	v.MemoryLimit = sysInfo.MemoryLimit
23 29
 	v.SwapLimit = sysInfo.SwapLimit
24 30
 	v.KernelMemory = sysInfo.KernelMemory
... ...
@@ -81,32 +87,43 @@ func (daemon *Daemon) fillPlatformInfo(v *types.Info, sysInfo *sysinfo.SysInfo)
81 81
 		v.InitCommit.ID = "N/A"
82 82
 	}
83 83
 
84
-	if !v.MemoryLimit {
85
-		v.Warnings = append(v.Warnings, "WARNING: No memory limit support")
86
-	}
87
-	if !v.SwapLimit {
88
-		v.Warnings = append(v.Warnings, "WARNING: No swap limit support")
89
-	}
90
-	if !v.KernelMemory {
91
-		v.Warnings = append(v.Warnings, "WARNING: No kernel memory limit support")
92
-	}
93
-	if !v.KernelMemoryTCP {
94
-		v.Warnings = append(v.Warnings, "WARNING: No kernel memory TCP limit support")
95
-	}
96
-	if !v.OomKillDisable {
97
-		v.Warnings = append(v.Warnings, "WARNING: No oom kill disable support")
98
-	}
99
-	if !v.CPUCfsQuota {
100
-		v.Warnings = append(v.Warnings, "WARNING: No cpu cfs quota support")
101
-	}
102
-	if !v.CPUCfsPeriod {
103
-		v.Warnings = append(v.Warnings, "WARNING: No cpu cfs period support")
104
-	}
105
-	if !v.CPUShares {
106
-		v.Warnings = append(v.Warnings, "WARNING: No cpu shares support")
107
-	}
108
-	if !v.CPUSet {
109
-		v.Warnings = append(v.Warnings, "WARNING: No cpuset support")
84
+	if v.CgroupDriver == cgroupNoneDriver {
85
+		if v.CgroupVersion == "2" {
86
+			v.Warnings = append(v.Warnings, "WARNING: Running in rootless-mode without cgroup. To enable cgroup in rootless-mode, you need to set exec-opt \"native.cgroupdriver=systemd\".")
87
+		} else {
88
+			v.Warnings = append(v.Warnings, "WARNING: Running in rootless-mode without cgroup. To enable cgroup in rootless-mode, you need to boot the system in cgroup v2 mode and set exec-opt \"native.cgroupdriver=systemd\".")
89
+		}
90
+	} else {
91
+		if !v.MemoryLimit {
92
+			v.Warnings = append(v.Warnings, "WARNING: No memory limit support")
93
+		}
94
+		if !v.SwapLimit {
95
+			v.Warnings = append(v.Warnings, "WARNING: No swap limit support")
96
+		}
97
+		if !v.KernelMemory {
98
+			v.Warnings = append(v.Warnings, "WARNING: No kernel memory limit support")
99
+		}
100
+		if !v.KernelMemoryTCP {
101
+			v.Warnings = append(v.Warnings, "WARNING: No kernel memory TCP limit support")
102
+		}
103
+		if !v.OomKillDisable {
104
+			v.Warnings = append(v.Warnings, "WARNING: No oom kill disable support")
105
+		}
106
+		if !v.CPUCfsQuota {
107
+			v.Warnings = append(v.Warnings, "WARNING: No cpu cfs quota support")
108
+		}
109
+		if !v.CPUCfsPeriod {
110
+			v.Warnings = append(v.Warnings, "WARNING: No cpu cfs period support")
111
+		}
112
+		if !v.CPUShares {
113
+			v.Warnings = append(v.Warnings, "WARNING: No cpu shares support")
114
+		}
115
+		if !v.CPUSet {
116
+			v.Warnings = append(v.Warnings, "WARNING: No cpuset support")
117
+		}
118
+		if v.CgroupVersion == "2" {
119
+			v.Warnings = append(v.Warnings, "WARNING: Support for cgroup v2 is experimental")
120
+		}
110 121
 	}
111 122
 	if !v.IPv4Forwarding {
112 123
 		v.Warnings = append(v.Warnings, "WARNING: IPv4 forwarding is disabled")
... ...
@@ -17,6 +17,7 @@ keywords: "API, Docker, rcli, REST, documentation"
17 17
 
18 18
 [Docker Engine API v1.41](https://docs.docker.com/engine/api/v1.41/) documentation
19 19
 
20
+* `GET /info` now returns a `CgroupVersion` field, containing the cgroup version.
20 21
 * `POST /services/create` and `POST /services/{id}/update` now supports `BindOptions.NonRecursive`.
21 22
 * The `ClusterStore` and `ClusterAdvertise` fields in `GET /info` are deprecated
22 23
   and are now omitted if they contain an empty value. This change is not versioned,
23 24
new file mode 100644
... ...
@@ -0,0 +1,151 @@
0
+package sysinfo // import "github.com/docker/docker/pkg/sysinfo"
1
+
2
+import (
3
+	"io/ioutil"
4
+	"path"
5
+	"strings"
6
+
7
+	cgroupsV2 "github.com/containerd/cgroups/v2"
8
+	rsystem "github.com/opencontainers/runc/libcontainer/system"
9
+	"github.com/sirupsen/logrus"
10
+)
11
+
12
+type infoCollectorV2 func(info *SysInfo, controllers map[string]struct{}, dirPath string) (warnings []string)
13
+
14
+func newV2(quiet bool, opts *opts) *SysInfo {
15
+	var warnings []string
16
+	sysInfo := &SysInfo{
17
+		CgroupUnified: true,
18
+	}
19
+	g := opts.cg2GroupPath
20
+	if g == "" {
21
+		g = "/"
22
+	}
23
+	m, err := cgroupsV2.LoadManager("/sys/fs/cgroup", g)
24
+	if err != nil {
25
+		logrus.Warn(err)
26
+	} else {
27
+		controllersM := make(map[string]struct{})
28
+		controllers, err := m.Controllers()
29
+		if err != nil {
30
+			logrus.Warn(err)
31
+		}
32
+		for _, c := range controllers {
33
+			controllersM[c] = struct{}{}
34
+		}
35
+		opsV2 := []infoCollectorV2{
36
+			applyMemoryCgroupInfoV2,
37
+			applyCPUCgroupInfoV2,
38
+			applyIOCgroupInfoV2,
39
+			applyCPUSetCgroupInfoV2,
40
+			applyPIDSCgroupInfoV2,
41
+			applyDevicesCgroupInfoV2,
42
+		}
43
+		dirPath := path.Join("/sys/fs/cgroup", path.Clean(g))
44
+		for _, o := range opsV2 {
45
+			w := o(sysInfo, controllersM, dirPath)
46
+			warnings = append(warnings, w...)
47
+		}
48
+	}
49
+
50
+	ops := []infoCollector{
51
+		applyNetworkingInfo,
52
+		applyAppArmorInfo,
53
+		applySeccompInfo,
54
+		applyCgroupNsInfo,
55
+	}
56
+	for _, o := range ops {
57
+		w := o(sysInfo, nil)
58
+		warnings = append(warnings, w...)
59
+	}
60
+	if !quiet {
61
+		for _, w := range warnings {
62
+			logrus.Warn(w)
63
+		}
64
+	}
65
+	return sysInfo
66
+}
67
+
68
+func applyMemoryCgroupInfoV2(info *SysInfo, controllers map[string]struct{}, _ string) []string {
69
+	var warnings []string
70
+	if _, ok := controllers["memory"]; !ok {
71
+		warnings = append(warnings, "Unable to find memory controller")
72
+		return warnings
73
+	}
74
+
75
+	info.MemoryLimit = true
76
+	info.SwapLimit = true
77
+	info.MemoryReservation = true
78
+	info.OomKillDisable = false
79
+	info.MemorySwappiness = false
80
+	info.KernelMemory = false
81
+	info.KernelMemoryTCP = false
82
+	return warnings
83
+}
84
+
85
+func applyCPUCgroupInfoV2(info *SysInfo, controllers map[string]struct{}, _ string) []string {
86
+	var warnings []string
87
+	if _, ok := controllers["cpu"]; !ok {
88
+		warnings = append(warnings, "Unable to find cpu controller")
89
+		return warnings
90
+	}
91
+	info.CPUShares = true
92
+	info.CPUCfsPeriod = true
93
+	info.CPUCfsQuota = true
94
+	info.CPURealtimePeriod = false
95
+	info.CPURealtimeRuntime = false
96
+	return warnings
97
+}
98
+
99
+func applyIOCgroupInfoV2(info *SysInfo, controllers map[string]struct{}, _ string) []string {
100
+	var warnings []string
101
+	if _, ok := controllers["io"]; !ok {
102
+		warnings = append(warnings, "Unable to find io controller")
103
+		return warnings
104
+	}
105
+
106
+	info.BlkioWeight = true
107
+	info.BlkioWeightDevice = true
108
+	info.BlkioReadBpsDevice = true
109
+	info.BlkioWriteBpsDevice = true
110
+	info.BlkioReadIOpsDevice = true
111
+	info.BlkioWriteIOpsDevice = true
112
+	return warnings
113
+}
114
+
115
+func applyCPUSetCgroupInfoV2(info *SysInfo, controllers map[string]struct{}, dirPath string) []string {
116
+	var warnings []string
117
+	if _, ok := controllers["cpuset"]; !ok {
118
+		warnings = append(warnings, "Unable to find cpuset controller")
119
+		return warnings
120
+	}
121
+	info.Cpuset = true
122
+
123
+	cpus, err := ioutil.ReadFile(path.Join(dirPath, "cpuset.cpus.effective"))
124
+	if err != nil {
125
+		return warnings
126
+	}
127
+	info.Cpus = strings.TrimSpace(string(cpus))
128
+
129
+	mems, err := ioutil.ReadFile(path.Join(dirPath, "cpuset.mems.effective"))
130
+	if err != nil {
131
+		return warnings
132
+	}
133
+	info.Mems = strings.TrimSpace(string(mems))
134
+	return warnings
135
+}
136
+
137
+func applyPIDSCgroupInfoV2(info *SysInfo, controllers map[string]struct{}, _ string) []string {
138
+	var warnings []string
139
+	if _, ok := controllers["pids"]; !ok {
140
+		warnings = append(warnings, "Unable to find pids controller")
141
+		return warnings
142
+	}
143
+	info.PidsLimit = true
144
+	return warnings
145
+}
146
+
147
+func applyDevicesCgroupInfoV2(info *SysInfo, controllers map[string]struct{}, _ string) []string {
148
+	info.CgroupDevicesEnabled = !rsystem.RunningInUserNS()
149
+	return nil
150
+}
... ...
@@ -30,6 +30,9 @@ type SysInfo struct {
30 30
 
31 31
 	// Whether the cgroup has the mountpoint of "devices" or not
32 32
 	CgroupDevicesEnabled bool
33
+
34
+	// Whether the cgroup is in unified mode (v2).
35
+	CgroupUnified bool
33 36
 }
34 37
 
35 38
 type cgroupMemInfo struct {
... ...
@@ -28,10 +28,37 @@ func findCgroupMountpoints() (map[string]string, error) {
28 28
 
29 29
 type infoCollector func(info *SysInfo, cgMounts map[string]string) (warnings []string)
30 30
 
31
+type opts struct {
32
+	cg2GroupPath string
33
+}
34
+
35
+// Opt for New().
36
+type Opt func(*opts)
37
+
38
+// WithCgroup2GroupPath specifies the cgroup v2 group path to inspect availability
39
+// of the controllers.
40
+//
41
+// WithCgroup2GroupPath is expected to be used for rootless mode with systemd driver.
42
+//
43
+// e.g. g = "/user.slice/user-1000.slice/user@1000.service"
44
+func WithCgroup2GroupPath(g string) Opt {
45
+	return func(o *opts) {
46
+		o.cg2GroupPath = path.Clean(g)
47
+	}
48
+}
49
+
31 50
 // New returns a new SysInfo, using the filesystem to detect which features
32 51
 // the kernel supports. If `quiet` is `false` warnings are printed in logs
33 52
 // whenever an error occurs or misconfigurations are present.
34
-func New(quiet bool) *SysInfo {
53
+func New(quiet bool, options ...Opt) *SysInfo {
54
+	var opts opts
55
+	for _, o := range options {
56
+		o(&opts)
57
+	}
58
+	if cgroups.IsCgroup2UnifiedMode() {
59
+		return newV2(quiet, &opts)
60
+	}
61
+
35 62
 	var ops []infoCollector
36 63
 	var warnings []string
37 64
 	sysInfo := &SysInfo{}
... ...
@@ -60,9 +87,6 @@ func New(quiet bool) *SysInfo {
60 60
 		w := o(sysInfo, cgMounts)
61 61
 		warnings = append(warnings, w...)
62 62
 	}
63
-	if cgroups.IsCgroup2UnifiedMode() {
64
-		warnings = append(warnings, "Your system is running cgroup v2 (unsupported)")
65
-	}
66 63
 	if !quiet {
67 64
 		for _, w := range warnings {
68 65
 			logrus.Warn(w)
... ...
@@ -73,15 +97,6 @@ func New(quiet bool) *SysInfo {
73 73
 
74 74
 // applyMemoryCgroupInfo reads the memory information from the memory cgroup mount point.
75 75
 func applyMemoryCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
76
-	if cgroups.IsCgroup2UnifiedMode() {
77
-		// TODO: check cgroup2 info correctly
78
-		info.MemoryLimit = true
79
-		info.SwapLimit = true
80
-		info.MemoryReservation = true
81
-		info.OomKillDisable = true
82
-		info.MemorySwappiness = true
83
-		return nil
84
-	}
85 76
 	var warnings []string
86 77
 	mountPoint, ok := cgMounts["memory"]
87 78
 	if !ok {
... ...
@@ -120,15 +135,6 @@ func applyMemoryCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
120 120
 
121 121
 // applyCPUCgroupInfo reads the cpu information from the cpu cgroup mount point.
122 122
 func applyCPUCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
123
-	if cgroups.IsCgroup2UnifiedMode() {
124
-		// TODO: check cgroup2 info correctly
125
-		info.CPUShares = true
126
-		info.CPUCfsPeriod = true
127
-		info.CPUCfsQuota = true
128
-		info.CPURealtimePeriod = true
129
-		info.CPURealtimeRuntime = true
130
-		return nil
131
-	}
132 123
 	var warnings []string
133 124
 	mountPoint, ok := cgMounts["cpu"]
134 125
 	if !ok {
... ...
@@ -166,15 +172,6 @@ func applyCPUCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
166 166
 
167 167
 // applyBlkioCgroupInfo reads the blkio information from the blkio cgroup mount point.
168 168
 func applyBlkioCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
169
-	if cgroups.IsCgroup2UnifiedMode() {
170
-		// TODO: check cgroup2 info correctly
171
-		info.BlkioWeight = true
172
-		info.BlkioReadBpsDevice = true
173
-		info.BlkioWriteBpsDevice = true
174
-		info.BlkioReadIOpsDevice = true
175
-		info.BlkioWriteIOpsDevice = true
176
-		return nil
177
-	}
178 169
 	var warnings []string
179 170
 	mountPoint, ok := cgMounts["blkio"]
180 171
 	if !ok {
... ...
@@ -216,11 +213,6 @@ func applyBlkioCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
216 216
 
217 217
 // applyCPUSetCgroupInfo reads the cpuset information from the cpuset cgroup mount point.
218 218
 func applyCPUSetCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
219
-	if cgroups.IsCgroup2UnifiedMode() {
220
-		// TODO: check cgroup2 info correctly
221
-		info.Cpuset = true
222
-		return nil
223
-	}
224 219
 	var warnings []string
225 220
 	mountPoint, ok := cgMounts["cpuset"]
226 221
 	if !ok {
... ...
@@ -248,11 +240,6 @@ func applyCPUSetCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
248 248
 
249 249
 // applyPIDSCgroupInfo reads the pids information from the pids cgroup mount point.
250 250
 func applyPIDSCgroupInfo(info *SysInfo, _ map[string]string) []string {
251
-	if cgroups.IsCgroup2UnifiedMode() {
252
-		// TODO: check cgroup2 info correctly
253
-		info.PidsLimit = true
254
-		return nil
255
-	}
256 251
 	var warnings []string
257 252
 	_, err := cgroups.FindCgroupMountpoint("", "pids")
258 253
 	if err != nil {
... ...
@@ -265,11 +252,6 @@ func applyPIDSCgroupInfo(info *SysInfo, _ map[string]string) []string {
265 265
 
266 266
 // applyDevicesCgroupInfo reads the devices information from the devices cgroup mount point.
267 267
 func applyDevicesCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
268
-	if cgroups.IsCgroup2UnifiedMode() {
269
-		// TODO: check cgroup2 info correctly
270
-		info.CgroupDevicesEnabled = true
271
-		return nil
272
-	}
273 268
 	var warnings []string
274 269
 	_, ok := cgMounts["devices"]
275 270
 	info.CgroupDevicesEnabled = ok
... ...
@@ -2,8 +2,13 @@
2 2
 
3 3
 package sysinfo // import "github.com/docker/docker/pkg/sysinfo"
4 4
 
5
+type opts struct{}
6
+
7
+// Opt for New().
8
+type Opt func(*opts)
9
+
5 10
 // New returns an empty SysInfo for non linux for now.
6
-func New(quiet bool) *SysInfo {
11
+func New(quiet bool, options ...Opt) *SysInfo {
7 12
 	sysInfo := &SysInfo{}
8 13
 	return sysInfo
9 14
 }
... ...
@@ -1,7 +1,12 @@
1 1
 package sysinfo // import "github.com/docker/docker/pkg/sysinfo"
2 2
 
3
+type opts struct{}
4
+
5
+// Opt for New().
6
+type Opt func(*opts)
7
+
3 8
 // New returns an empty SysInfo for windows for now.
4
-func New(quiet bool) *SysInfo {
9
+func New(quiet bool, options ...Opt) *SysInfo {
5 10
 	sysInfo := &SysInfo{}
6 11
 	return sysInfo
7 12
 }
... ...
@@ -11,11 +11,20 @@ import (
11 11
 
12 12
 // ContainerDecoder implements httputils.ContainerDecoder
13 13
 // calling DecodeContainerConfig.
14
-type ContainerDecoder struct{}
14
+type ContainerDecoder struct {
15
+	GetSysInfo func() *sysinfo.SysInfo
16
+}
15 17
 
16 18
 // DecodeConfig makes ContainerDecoder implement httputils.ContainerDecoder
17 19
 func (r ContainerDecoder) DecodeConfig(src io.Reader) (*container.Config, *container.HostConfig, *networktypes.NetworkingConfig, error) {
18
-	return decodeContainerConfig(src)
20
+	var si *sysinfo.SysInfo
21
+	if r.GetSysInfo != nil {
22
+		si = r.GetSysInfo()
23
+	} else {
24
+		si = sysinfo.New(true)
25
+	}
26
+
27
+	return decodeContainerConfig(src, si)
19 28
 }
20 29
 
21 30
 // DecodeHostConfig makes ContainerDecoder implement httputils.ContainerDecoder
... ...
@@ -27,7 +36,7 @@ func (r ContainerDecoder) DecodeHostConfig(src io.Reader) (*container.HostConfig
27 27
 // struct and returns both a Config and a HostConfig struct
28 28
 // Be aware that this function does not check whether the resulting structs are nil;
29 29
 // it is the caller's responsibility to do so.
30
-func decodeContainerConfig(src io.Reader) (*container.Config, *container.HostConfig, *networktypes.NetworkingConfig, error) {
30
+func decodeContainerConfig(src io.Reader, si *sysinfo.SysInfo) (*container.Config, *container.HostConfig, *networktypes.NetworkingConfig, error) {
31 31
 	var w ContainerConfigWrapper
32 32
 
33 33
 	decoder := json.NewDecoder(src)
... ...
@@ -63,7 +72,7 @@ func decodeContainerConfig(src io.Reader) (*container.Config, *container.HostCon
63 63
 	}
64 64
 
65 65
 	// Validate Resources
66
-	if err := validateResources(hc, sysinfo.New(true)); err != nil {
66
+	if err := validateResources(hc, si); err != nil {
67 67
 		return nil, nil, nil, err
68 68
 	}
69 69
 
... ...
@@ -12,6 +12,7 @@ import (
12 12
 	"github.com/docker/docker/api/types/container"
13 13
 	networktypes "github.com/docker/docker/api/types/network"
14 14
 	"github.com/docker/docker/api/types/strslice"
15
+	"github.com/docker/docker/pkg/sysinfo"
15 16
 )
16 17
 
17 18
 type f struct {
... ...
@@ -46,7 +47,7 @@ func TestDecodeContainerConfig(t *testing.T) {
46 46
 			t.Fatal(err)
47 47
 		}
48 48
 
49
-		c, h, _, err := decodeContainerConfig(bytes.NewReader(b))
49
+		c, h, _, err := decodeContainerConfig(bytes.NewReader(b), sysinfo.New(true))
50 50
 		if err != nil {
51 51
 			t.Fatal(fmt.Errorf("Error parsing %s: %v", f, err))
52 52
 		}
... ...
@@ -130,5 +131,5 @@ func callDecodeContainerConfigIsolation(isolation string) (*container.Config, *c
130 130
 	if b, err = json.Marshal(w); err != nil {
131 131
 		return nil, nil, nil, fmt.Errorf("Error on marshal %s", err.Error())
132 132
 	}
133
-	return decodeContainerConfig(bytes.NewReader(b))
133
+	return decodeContainerConfig(bytes.NewReader(b), sysinfo.New(true))
134 134
 }
... ...
@@ -130,6 +130,7 @@ github.com/containerd/go-runc                       7016d3ce2328dd2cb1192b2076eb
130 130
 github.com/containerd/typeurl                       b45ef1f1f737e10bd45b25b669df25f0da8b9ba0
131 131
 github.com/containerd/ttrpc                         0be804eadb152bc3b3c20c5edc314c4633833398
132 132
 github.com/gogo/googleapis                          01e0f9cca9b92166042241267ee2a5cdf5cff46c # v1.3.2
133
+github.com/cilium/ebpf                              60c3aa43f488292fe2ee50fb8b833b383ca8ebbb
133 134
 
134 135
 # cluster
135 136
 github.com/docker/swarmkit                          ebe39a32e3ed4c3a3783a02c11cccf388818694c
136 137
new file mode 100644
... ...
@@ -0,0 +1,23 @@
0
+MIT License
1
+
2
+Copyright (c) 2017 Nathan Sweet
3
+Copyright (c) 2018, 2019 Cloudflare
4
+Copyright (c) 2019 Authors of Cilium
5
+
6
+Permission is hereby granted, free of charge, to any person obtaining a copy
7
+of this software and associated documentation files (the "Software"), to deal
8
+in the Software without restriction, including without limitation the rights
9
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+copies of the Software, and to permit persons to whom the Software is
11
+furnished to do so, subject to the following conditions:
12
+
13
+The above copyright notice and this permission notice shall be included in all
14
+copies or substantial portions of the Software.
15
+
16
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+SOFTWARE.
0 23
new file mode 100644
... ...
@@ -0,0 +1,203 @@
0
+package ebpf
1
+
2
+import (
3
+	"bufio"
4
+	"bytes"
5
+	"fmt"
6
+	"io"
7
+	"os"
8
+	"syscall"
9
+
10
+	"github.com/cilium/ebpf/internal"
11
+	"github.com/pkg/errors"
12
+)
13
+
14
+// MapABI are the attributes of a Map which are available across all supported kernels.
15
+type MapABI struct {
16
+	Type       MapType
17
+	KeySize    uint32
18
+	ValueSize  uint32
19
+	MaxEntries uint32
20
+	Flags      uint32
21
+}
22
+
23
+func newMapABIFromSpec(spec *MapSpec) *MapABI {
24
+	return &MapABI{
25
+		spec.Type,
26
+		spec.KeySize,
27
+		spec.ValueSize,
28
+		spec.MaxEntries,
29
+		spec.Flags,
30
+	}
31
+}
32
+
33
+func newMapABIFromFd(fd *bpfFD) (string, *MapABI, error) {
34
+	info, err := bpfGetMapInfoByFD(fd)
35
+	if err != nil {
36
+		if errors.Cause(err) == syscall.EINVAL {
37
+			abi, err := newMapABIFromProc(fd)
38
+			return "", abi, err
39
+		}
40
+		return "", nil, err
41
+	}
42
+
43
+	return "", &MapABI{
44
+		MapType(info.mapType),
45
+		info.keySize,
46
+		info.valueSize,
47
+		info.maxEntries,
48
+		info.flags,
49
+	}, nil
50
+}
51
+
52
+func newMapABIFromProc(fd *bpfFD) (*MapABI, error) {
53
+	var abi MapABI
54
+	err := scanFdInfo(fd, map[string]interface{}{
55
+		"map_type":    &abi.Type,
56
+		"key_size":    &abi.KeySize,
57
+		"value_size":  &abi.ValueSize,
58
+		"max_entries": &abi.MaxEntries,
59
+		"map_flags":   &abi.Flags,
60
+	})
61
+	if err != nil {
62
+		return nil, err
63
+	}
64
+	return &abi, nil
65
+}
66
+
67
+// Equal returns true if two ABIs have the same values.
68
+func (abi *MapABI) Equal(other *MapABI) bool {
69
+	switch {
70
+	case abi.Type != other.Type:
71
+		return false
72
+	case abi.KeySize != other.KeySize:
73
+		return false
74
+	case abi.ValueSize != other.ValueSize:
75
+		return false
76
+	case abi.MaxEntries != other.MaxEntries:
77
+		return false
78
+	case abi.Flags != other.Flags:
79
+		return false
80
+	default:
81
+		return true
82
+	}
83
+}
84
+
85
+// ProgramABI are the attributes of a Program which are available across all supported kernels.
86
+type ProgramABI struct {
87
+	Type ProgramType
88
+}
89
+
90
+func newProgramABIFromSpec(spec *ProgramSpec) *ProgramABI {
91
+	return &ProgramABI{
92
+		spec.Type,
93
+	}
94
+}
95
+
96
+func newProgramABIFromFd(fd *bpfFD) (string, *ProgramABI, error) {
97
+	info, err := bpfGetProgInfoByFD(fd)
98
+	if err != nil {
99
+		if errors.Cause(err) == syscall.EINVAL {
100
+			return newProgramABIFromProc(fd)
101
+		}
102
+
103
+		return "", nil, err
104
+	}
105
+
106
+	var name string
107
+	if bpfName := convertCString(info.name[:]); bpfName != "" {
108
+		name = bpfName
109
+	} else {
110
+		name = convertCString(info.tag[:])
111
+	}
112
+
113
+	return name, &ProgramABI{
114
+		Type: ProgramType(info.progType),
115
+	}, nil
116
+}
117
+
118
+func newProgramABIFromProc(fd *bpfFD) (string, *ProgramABI, error) {
119
+	var (
120
+		abi  ProgramABI
121
+		name string
122
+	)
123
+
124
+	err := scanFdInfo(fd, map[string]interface{}{
125
+		"prog_type": &abi.Type,
126
+		"prog_tag":  &name,
127
+	})
128
+	if errors.Cause(err) == errMissingFields {
129
+		return "", nil, &internal.UnsupportedFeatureError{
130
+			Name:           "reading ABI from /proc/self/fdinfo",
131
+			MinimumVersion: internal.Version{4, 11, 0},
132
+		}
133
+	}
134
+	if err != nil {
135
+		return "", nil, err
136
+	}
137
+
138
+	return name, &abi, nil
139
+}
140
+
141
+func scanFdInfo(fd *bpfFD, fields map[string]interface{}) error {
142
+	raw, err := fd.value()
143
+	if err != nil {
144
+		return err
145
+	}
146
+
147
+	fh, err := os.Open(fmt.Sprintf("/proc/self/fdinfo/%d", raw))
148
+	if err != nil {
149
+		return err
150
+	}
151
+	defer fh.Close()
152
+
153
+	return errors.Wrap(scanFdInfoReader(fh, fields), fh.Name())
154
+}
155
+
156
+var errMissingFields = errors.New("missing fields")
157
+
158
+func scanFdInfoReader(r io.Reader, fields map[string]interface{}) error {
159
+	var (
160
+		scanner = bufio.NewScanner(r)
161
+		scanned int
162
+	)
163
+
164
+	for scanner.Scan() {
165
+		parts := bytes.SplitN(scanner.Bytes(), []byte("\t"), 2)
166
+		if len(parts) != 2 {
167
+			continue
168
+		}
169
+
170
+		name := bytes.TrimSuffix(parts[0], []byte(":"))
171
+		field, ok := fields[string(name)]
172
+		if !ok {
173
+			continue
174
+		}
175
+
176
+		if n, err := fmt.Fscanln(bytes.NewReader(parts[1]), field); err != nil || n != 1 {
177
+			return errors.Wrapf(err, "can't parse field %s", name)
178
+		}
179
+
180
+		scanned++
181
+	}
182
+
183
+	if err := scanner.Err(); err != nil {
184
+		return err
185
+	}
186
+
187
+	if scanned != len(fields) {
188
+		return errMissingFields
189
+	}
190
+
191
+	return nil
192
+}
193
+
194
+// Equal returns true if two ABIs have the same values.
195
+func (abi *ProgramABI) Equal(other *ProgramABI) bool {
196
+	switch {
197
+	case abi.Type != other.Type:
198
+		return false
199
+	default:
200
+		return true
201
+	}
202
+}
0 203
new file mode 100644
... ...
@@ -0,0 +1,149 @@
0
+package asm
1
+
2
+//go:generate stringer -output alu_string.go -type=Source,Endianness,ALUOp
3
+
4
+// Source of ALU / ALU64 / Branch operations
5
+//
6
+//    msb      lsb
7
+//    +----+-+---+
8
+//    |op  |S|cls|
9
+//    +----+-+---+
10
+type Source uint8
11
+
12
+const sourceMask OpCode = 0x08
13
+
14
+// Source bitmask
15
+const (
16
+	// InvalidSource is returned by getters when invoked
17
+	// on non ALU / branch OpCodes.
18
+	InvalidSource Source = 0xff
19
+	// ImmSource src is from constant
20
+	ImmSource Source = 0x00
21
+	// RegSource src is from register
22
+	RegSource Source = 0x08
23
+)
24
+
25
+// The Endianness of a byte swap instruction.
26
+type Endianness uint8
27
+
28
+const endianMask = sourceMask
29
+
30
+// Endian flags
31
+const (
32
+	InvalidEndian Endianness = 0xff
33
+	// Convert to little endian
34
+	LE Endianness = 0x00
35
+	// Convert to big endian
36
+	BE Endianness = 0x08
37
+)
38
+
39
+// ALUOp are ALU / ALU64 operations
40
+//
41
+//    msb      lsb
42
+//    +----+-+---+
43
+//    |OP  |s|cls|
44
+//    +----+-+---+
45
+type ALUOp uint8
46
+
47
+const aluMask OpCode = 0xf0
48
+
49
+const (
50
+	// InvalidALUOp is returned by getters when invoked
51
+	// on non ALU OpCodes
52
+	InvalidALUOp ALUOp = 0xff
53
+	// Add - addition
54
+	Add ALUOp = 0x00
55
+	// Sub - subtraction
56
+	Sub ALUOp = 0x10
57
+	// Mul - multiplication
58
+	Mul ALUOp = 0x20
59
+	// Div - division
60
+	Div ALUOp = 0x30
61
+	// Or - bitwise or
62
+	Or ALUOp = 0x40
63
+	// And - bitwise and
64
+	And ALUOp = 0x50
65
+	// LSh - bitwise shift left
66
+	LSh ALUOp = 0x60
67
+	// RSh - bitwise shift right
68
+	RSh ALUOp = 0x70
69
+	// Neg - sign/unsign signing bit
70
+	Neg ALUOp = 0x80
71
+	// Mod - modulo
72
+	Mod ALUOp = 0x90
73
+	// Xor - bitwise xor
74
+	Xor ALUOp = 0xa0
75
+	// Mov - move value from one place to another
76
+	Mov ALUOp = 0xb0
77
+	// ArSh - arithmetic shift
78
+	ArSh ALUOp = 0xc0
79
+	// Swap - endian conversions
80
+	Swap ALUOp = 0xd0
81
+)
82
+
83
+// HostTo converts from host to another endianness.
84
+func HostTo(endian Endianness, dst Register, size Size) Instruction {
85
+	var imm int64
86
+	switch size {
87
+	case Half:
88
+		imm = 16
89
+	case Word:
90
+		imm = 32
91
+	case DWord:
92
+		imm = 64
93
+	default:
94
+		return Instruction{OpCode: InvalidOpCode}
95
+	}
96
+
97
+	return Instruction{
98
+		OpCode:   OpCode(ALUClass).SetALUOp(Swap).SetSource(Source(endian)),
99
+		Dst:      dst,
100
+		Constant: imm,
101
+	}
102
+}
103
+
104
+// Op returns the OpCode for an ALU operation with a given source.
105
+func (op ALUOp) Op(source Source) OpCode {
106
+	return OpCode(ALU64Class).SetALUOp(op).SetSource(source)
107
+}
108
+
109
+// Reg emits `dst (op) src`.
110
+func (op ALUOp) Reg(dst, src Register) Instruction {
111
+	return Instruction{
112
+		OpCode: op.Op(RegSource),
113
+		Dst:    dst,
114
+		Src:    src,
115
+	}
116
+}
117
+
118
+// Imm emits `dst (op) value`.
119
+func (op ALUOp) Imm(dst Register, value int32) Instruction {
120
+	return Instruction{
121
+		OpCode:   op.Op(ImmSource),
122
+		Dst:      dst,
123
+		Constant: int64(value),
124
+	}
125
+}
126
+
127
+// Op32 returns the OpCode for a 32-bit ALU operation with a given source.
128
+func (op ALUOp) Op32(source Source) OpCode {
129
+	return OpCode(ALUClass).SetALUOp(op).SetSource(source)
130
+}
131
+
132
+// Reg32 emits `dst (op) src`, zeroing the upper 32 bit of dst.
133
+func (op ALUOp) Reg32(dst, src Register) Instruction {
134
+	return Instruction{
135
+		OpCode: op.Op32(RegSource),
136
+		Dst:    dst,
137
+		Src:    src,
138
+	}
139
+}
140
+
141
+// Imm32 emits `dst (op) value`, zeroing the upper 32 bit of dst.
142
+func (op ALUOp) Imm32(dst Register, value int32) Instruction {
143
+	return Instruction{
144
+		OpCode:   op.Op32(ImmSource),
145
+		Dst:      dst,
146
+		Constant: int64(value),
147
+	}
148
+}
0 149
new file mode 100644
... ...
@@ -0,0 +1,107 @@
0
+// Code generated by "stringer -output alu_string.go -type=Source,Endianness,ALUOp"; DO NOT EDIT.
1
+
2
+package asm
3
+
4
+import "strconv"
5
+
6
+func _() {
7
+	// An "invalid array index" compiler error signifies that the constant values have changed.
8
+	// Re-run the stringer command to generate them again.
9
+	var x [1]struct{}
10
+	_ = x[InvalidSource-255]
11
+	_ = x[ImmSource-0]
12
+	_ = x[RegSource-8]
13
+}
14
+
15
+const (
16
+	_Source_name_0 = "ImmSource"
17
+	_Source_name_1 = "RegSource"
18
+	_Source_name_2 = "InvalidSource"
19
+)
20
+
21
+func (i Source) String() string {
22
+	switch {
23
+	case i == 0:
24
+		return _Source_name_0
25
+	case i == 8:
26
+		return _Source_name_1
27
+	case i == 255:
28
+		return _Source_name_2
29
+	default:
30
+		return "Source(" + strconv.FormatInt(int64(i), 10) + ")"
31
+	}
32
+}
33
+func _() {
34
+	// An "invalid array index" compiler error signifies that the constant values have changed.
35
+	// Re-run the stringer command to generate them again.
36
+	var x [1]struct{}
37
+	_ = x[InvalidEndian-255]
38
+	_ = x[LE-0]
39
+	_ = x[BE-8]
40
+}
41
+
42
+const (
43
+	_Endianness_name_0 = "LE"
44
+	_Endianness_name_1 = "BE"
45
+	_Endianness_name_2 = "InvalidEndian"
46
+)
47
+
48
+func (i Endianness) String() string {
49
+	switch {
50
+	case i == 0:
51
+		return _Endianness_name_0
52
+	case i == 8:
53
+		return _Endianness_name_1
54
+	case i == 255:
55
+		return _Endianness_name_2
56
+	default:
57
+		return "Endianness(" + strconv.FormatInt(int64(i), 10) + ")"
58
+	}
59
+}
60
+func _() {
61
+	// An "invalid array index" compiler error signifies that the constant values have changed.
62
+	// Re-run the stringer command to generate them again.
63
+	var x [1]struct{}
64
+	_ = x[InvalidALUOp-255]
65
+	_ = x[Add-0]
66
+	_ = x[Sub-16]
67
+	_ = x[Mul-32]
68
+	_ = x[Div-48]
69
+	_ = x[Or-64]
70
+	_ = x[And-80]
71
+	_ = x[LSh-96]
72
+	_ = x[RSh-112]
73
+	_ = x[Neg-128]
74
+	_ = x[Mod-144]
75
+	_ = x[Xor-160]
76
+	_ = x[Mov-176]
77
+	_ = x[ArSh-192]
78
+	_ = x[Swap-208]
79
+}
80
+
81
+const _ALUOp_name = "AddSubMulDivOrAndLShRShNegModXorMovArShSwapInvalidALUOp"
82
+
83
+var _ALUOp_map = map[ALUOp]string{
84
+	0:   _ALUOp_name[0:3],
85
+	16:  _ALUOp_name[3:6],
86
+	32:  _ALUOp_name[6:9],
87
+	48:  _ALUOp_name[9:12],
88
+	64:  _ALUOp_name[12:14],
89
+	80:  _ALUOp_name[14:17],
90
+	96:  _ALUOp_name[17:20],
91
+	112: _ALUOp_name[20:23],
92
+	128: _ALUOp_name[23:26],
93
+	144: _ALUOp_name[26:29],
94
+	160: _ALUOp_name[29:32],
95
+	176: _ALUOp_name[32:35],
96
+	192: _ALUOp_name[35:39],
97
+	208: _ALUOp_name[39:43],
98
+	255: _ALUOp_name[43:55],
99
+}
100
+
101
+func (i ALUOp) String() string {
102
+	if str, ok := _ALUOp_map[i]; ok {
103
+		return str
104
+	}
105
+	return "ALUOp(" + strconv.FormatInt(int64(i), 10) + ")"
106
+}
0 107
new file mode 100644
... ...
@@ -0,0 +1,2 @@
0
+// Package asm is an assembler for eBPF bytecode.
1
+package asm
0 2
new file mode 100644
... ...
@@ -0,0 +1,143 @@
0
+package asm
1
+
2
+//go:generate stringer -output func_string.go -type=BuiltinFunc
3
+
4
+// BuiltinFunc is a built-in eBPF function.
5
+type BuiltinFunc int32
6
+
7
+// eBPF built-in functions
8
+//
9
+// You can regenerate this list using the following gawk script:
10
+//
11
+//    /FN\(.+\),/ {
12
+//      match($1, /\((.+)\)/, r)
13
+//      split(r[1], p, "_")
14
+//      printf "Fn"
15
+//      for (i in p) {
16
+//        printf "%s%s", toupper(substr(p[i], 1, 1)), substr(p[i], 2)
17
+//      }
18
+//      print ""
19
+//    }
20
+//
21
+// The script expects include/uapi/linux/bpf.h as its input.
22
+const (
23
+	FnUnspec BuiltinFunc = iota
24
+	FnMapLookupElem
25
+	FnMapUpdateElem
26
+	FnMapDeleteElem
27
+	FnProbeRead
28
+	FnKtimeGetNs
29
+	FnTracePrintk
30
+	FnGetPrandomU32
31
+	FnGetSmpProcessorId
32
+	FnSkbStoreBytes
33
+	FnL3CsumReplace
34
+	FnL4CsumReplace
35
+	FnTailCall
36
+	FnCloneRedirect
37
+	FnGetCurrentPidTgid
38
+	FnGetCurrentUidGid
39
+	FnGetCurrentComm
40
+	FnGetCgroupClassid
41
+	FnSkbVlanPush
42
+	FnSkbVlanPop
43
+	FnSkbGetTunnelKey
44
+	FnSkbSetTunnelKey
45
+	FnPerfEventRead
46
+	FnRedirect
47
+	FnGetRouteRealm
48
+	FnPerfEventOutput
49
+	FnSkbLoadBytes
50
+	FnGetStackid
51
+	FnCsumDiff
52
+	FnSkbGetTunnelOpt
53
+	FnSkbSetTunnelOpt
54
+	FnSkbChangeProto
55
+	FnSkbChangeType
56
+	FnSkbUnderCgroup
57
+	FnGetHashRecalc
58
+	FnGetCurrentTask
59
+	FnProbeWriteUser
60
+	FnCurrentTaskUnderCgroup
61
+	FnSkbChangeTail
62
+	FnSkbPullData
63
+	FnCsumUpdate
64
+	FnSetHashInvalid
65
+	FnGetNumaNodeId
66
+	FnSkbChangeHead
67
+	FnXdpAdjustHead
68
+	FnProbeReadStr
69
+	FnGetSocketCookie
70
+	FnGetSocketUid
71
+	FnSetHash
72
+	FnSetsockopt
73
+	FnSkbAdjustRoom
74
+	FnRedirectMap
75
+	FnSkRedirectMap
76
+	FnSockMapUpdate
77
+	FnXdpAdjustMeta
78
+	FnPerfEventReadValue
79
+	FnPerfProgReadValue
80
+	FnGetsockopt
81
+	FnOverrideReturn
82
+	FnSockOpsCbFlagsSet
83
+	FnMsgRedirectMap
84
+	FnMsgApplyBytes
85
+	FnMsgCorkBytes
86
+	FnMsgPullData
87
+	FnBind
88
+	FnXdpAdjustTail
89
+	FnSkbGetXfrmState
90
+	FnGetStack
91
+	FnSkbLoadBytesRelative
92
+	FnFibLookup
93
+	FnSockHashUpdate
94
+	FnMsgRedirectHash
95
+	FnSkRedirectHash
96
+	FnLwtPushEncap
97
+	FnLwtSeg6StoreBytes
98
+	FnLwtSeg6AdjustSrh
99
+	FnLwtSeg6Action
100
+	FnRcRepeat
101
+	FnRcKeydown
102
+	FnSkbCgroupId
103
+	FnGetCurrentCgroupId
104
+	FnGetLocalStorage
105
+	FnSkSelectReuseport
106
+	FnSkbAncestorCgroupId
107
+	FnSkLookupTcp
108
+	FnSkLookupUdp
109
+	FnSkRelease
110
+	FnMapPushElem
111
+	FnMapPopElem
112
+	FnMapPeekElem
113
+	FnMsgPushData
114
+	FnMsgPopData
115
+	FnRcPointerRel
116
+	FnSpinLock
117
+	FnSpinUnlock
118
+	FnSkFullsock
119
+	FnTcpSock
120
+	FnSkbEcnSetCe
121
+	FnGetListenerSock
122
+	FnSkcLookupTcp
123
+	FnTcpCheckSyncookie
124
+	FnSysctlGetName
125
+	FnSysctlGetCurrentValue
126
+	FnSysctlGetNewValue
127
+	FnSysctlSetNewValue
128
+	FnStrtol
129
+	FnStrtoul
130
+	FnSkStorageGet
131
+	FnSkStorageDelete
132
+	FnSendSignal
133
+	FnTcpGenSyncookie
134
+)
135
+
136
+// Call emits a function call.
137
+func (fn BuiltinFunc) Call() Instruction {
138
+	return Instruction{
139
+		OpCode:   OpCode(JumpClass).SetJumpOp(Call),
140
+		Constant: int64(fn),
141
+	}
142
+}
0 143
new file mode 100644
... ...
@@ -0,0 +1,133 @@
0
+// Code generated by "stringer -output func_string.go -type=BuiltinFunc"; DO NOT EDIT.
1
+
2
+package asm
3
+
4
+import "strconv"
5
+
6
+func _() {
7
+	// An "invalid array index" compiler error signifies that the constant values have changed.
8
+	// Re-run the stringer command to generate them again.
9
+	var x [1]struct{}
10
+	_ = x[FnUnspec-0]
11
+	_ = x[FnMapLookupElem-1]
12
+	_ = x[FnMapUpdateElem-2]
13
+	_ = x[FnMapDeleteElem-3]
14
+	_ = x[FnProbeRead-4]
15
+	_ = x[FnKtimeGetNs-5]
16
+	_ = x[FnTracePrintk-6]
17
+	_ = x[FnGetPrandomU32-7]
18
+	_ = x[FnGetSmpProcessorId-8]
19
+	_ = x[FnSkbStoreBytes-9]
20
+	_ = x[FnL3CsumReplace-10]
21
+	_ = x[FnL4CsumReplace-11]
22
+	_ = x[FnTailCall-12]
23
+	_ = x[FnCloneRedirect-13]
24
+	_ = x[FnGetCurrentPidTgid-14]
25
+	_ = x[FnGetCurrentUidGid-15]
26
+	_ = x[FnGetCurrentComm-16]
27
+	_ = x[FnGetCgroupClassid-17]
28
+	_ = x[FnSkbVlanPush-18]
29
+	_ = x[FnSkbVlanPop-19]
30
+	_ = x[FnSkbGetTunnelKey-20]
31
+	_ = x[FnSkbSetTunnelKey-21]
32
+	_ = x[FnPerfEventRead-22]
33
+	_ = x[FnRedirect-23]
34
+	_ = x[FnGetRouteRealm-24]
35
+	_ = x[FnPerfEventOutput-25]
36
+	_ = x[FnSkbLoadBytes-26]
37
+	_ = x[FnGetStackid-27]
38
+	_ = x[FnCsumDiff-28]
39
+	_ = x[FnSkbGetTunnelOpt-29]
40
+	_ = x[FnSkbSetTunnelOpt-30]
41
+	_ = x[FnSkbChangeProto-31]
42
+	_ = x[FnSkbChangeType-32]
43
+	_ = x[FnSkbUnderCgroup-33]
44
+	_ = x[FnGetHashRecalc-34]
45
+	_ = x[FnGetCurrentTask-35]
46
+	_ = x[FnProbeWriteUser-36]
47
+	_ = x[FnCurrentTaskUnderCgroup-37]
48
+	_ = x[FnSkbChangeTail-38]
49
+	_ = x[FnSkbPullData-39]
50
+	_ = x[FnCsumUpdate-40]
51
+	_ = x[FnSetHashInvalid-41]
52
+	_ = x[FnGetNumaNodeId-42]
53
+	_ = x[FnSkbChangeHead-43]
54
+	_ = x[FnXdpAdjustHead-44]
55
+	_ = x[FnProbeReadStr-45]
56
+	_ = x[FnGetSocketCookie-46]
57
+	_ = x[FnGetSocketUid-47]
58
+	_ = x[FnSetHash-48]
59
+	_ = x[FnSetsockopt-49]
60
+	_ = x[FnSkbAdjustRoom-50]
61
+	_ = x[FnRedirectMap-51]
62
+	_ = x[FnSkRedirectMap-52]
63
+	_ = x[FnSockMapUpdate-53]
64
+	_ = x[FnXdpAdjustMeta-54]
65
+	_ = x[FnPerfEventReadValue-55]
66
+	_ = x[FnPerfProgReadValue-56]
67
+	_ = x[FnGetsockopt-57]
68
+	_ = x[FnOverrideReturn-58]
69
+	_ = x[FnSockOpsCbFlagsSet-59]
70
+	_ = x[FnMsgRedirectMap-60]
71
+	_ = x[FnMsgApplyBytes-61]
72
+	_ = x[FnMsgCorkBytes-62]
73
+	_ = x[FnMsgPullData-63]
74
+	_ = x[FnBind-64]
75
+	_ = x[FnXdpAdjustTail-65]
76
+	_ = x[FnSkbGetXfrmState-66]
77
+	_ = x[FnGetStack-67]
78
+	_ = x[FnSkbLoadBytesRelative-68]
79
+	_ = x[FnFibLookup-69]
80
+	_ = x[FnSockHashUpdate-70]
81
+	_ = x[FnMsgRedirectHash-71]
82
+	_ = x[FnSkRedirectHash-72]
83
+	_ = x[FnLwtPushEncap-73]
84
+	_ = x[FnLwtSeg6StoreBytes-74]
85
+	_ = x[FnLwtSeg6AdjustSrh-75]
86
+	_ = x[FnLwtSeg6Action-76]
87
+	_ = x[FnRcRepeat-77]
88
+	_ = x[FnRcKeydown-78]
89
+	_ = x[FnSkbCgroupId-79]
90
+	_ = x[FnGetCurrentCgroupId-80]
91
+	_ = x[FnGetLocalStorage-81]
92
+	_ = x[FnSkSelectReuseport-82]
93
+	_ = x[FnSkbAncestorCgroupId-83]
94
+	_ = x[FnSkLookupTcp-84]
95
+	_ = x[FnSkLookupUdp-85]
96
+	_ = x[FnSkRelease-86]
97
+	_ = x[FnMapPushElem-87]
98
+	_ = x[FnMapPopElem-88]
99
+	_ = x[FnMapPeekElem-89]
100
+	_ = x[FnMsgPushData-90]
101
+	_ = x[FnMsgPopData-91]
102
+	_ = x[FnRcPointerRel-92]
103
+	_ = x[FnSpinLock-93]
104
+	_ = x[FnSpinUnlock-94]
105
+	_ = x[FnSkFullsock-95]
106
+	_ = x[FnTcpSock-96]
107
+	_ = x[FnSkbEcnSetCe-97]
108
+	_ = x[FnGetListenerSock-98]
109
+	_ = x[FnSkcLookupTcp-99]
110
+	_ = x[FnTcpCheckSyncookie-100]
111
+	_ = x[FnSysctlGetName-101]
112
+	_ = x[FnSysctlGetCurrentValue-102]
113
+	_ = x[FnSysctlGetNewValue-103]
114
+	_ = x[FnSysctlSetNewValue-104]
115
+	_ = x[FnStrtol-105]
116
+	_ = x[FnStrtoul-106]
117
+	_ = x[FnSkStorageGet-107]
118
+	_ = x[FnSkStorageDelete-108]
119
+	_ = x[FnSendSignal-109]
120
+	_ = x[FnTcpGenSyncookie-110]
121
+}
122
+
123
+const _BuiltinFunc_name = "FnUnspecFnMapLookupElemFnMapUpdateElemFnMapDeleteElemFnProbeReadFnKtimeGetNsFnTracePrintkFnGetPrandomU32FnGetSmpProcessorIdFnSkbStoreBytesFnL3CsumReplaceFnL4CsumReplaceFnTailCallFnCloneRedirectFnGetCurrentPidTgidFnGetCurrentUidGidFnGetCurrentCommFnGetCgroupClassidFnSkbVlanPushFnSkbVlanPopFnSkbGetTunnelKeyFnSkbSetTunnelKeyFnPerfEventReadFnRedirectFnGetRouteRealmFnPerfEventOutputFnSkbLoadBytesFnGetStackidFnCsumDiffFnSkbGetTunnelOptFnSkbSetTunnelOptFnSkbChangeProtoFnSkbChangeTypeFnSkbUnderCgroupFnGetHashRecalcFnGetCurrentTaskFnProbeWriteUserFnCurrentTaskUnderCgroupFnSkbChangeTailFnSkbPullDataFnCsumUpdateFnSetHashInvalidFnGetNumaNodeIdFnSkbChangeHeadFnXdpAdjustHeadFnProbeReadStrFnGetSocketCookieFnGetSocketUidFnSetHashFnSetsockoptFnSkbAdjustRoomFnRedirectMapFnSkRedirectMapFnSockMapUpdateFnXdpAdjustMetaFnPerfEventReadValueFnPerfProgReadValueFnGetsockoptFnOverrideReturnFnSockOpsCbFlagsSetFnMsgRedirectMapFnMsgApplyBytesFnMsgCorkBytesFnMsgPullDataFnBindFnXdpAdjustTailFnSkbGetXfrmStateFnGetStackFnSkbLoadBytesRelativeFnFibLookupFnSockHashUpdateFnMsgRedirectHashFnSkRedirectHashFnLwtPushEncapFnLwtSeg6StoreBytesFnLwtSeg6AdjustSrhFnLwtSeg6ActionFnRcRepeatFnRcKeydownFnSkbCgroupIdFnGetCurrentCgroupIdFnGetLocalStorageFnSkSelectReuseportFnSkbAncestorCgroupIdFnSkLookupTcpFnSkLookupUdpFnSkReleaseFnMapPushElemFnMapPopElemFnMapPeekElemFnMsgPushDataFnMsgPopDataFnRcPointerRelFnSpinLockFnSpinUnlockFnSkFullsockFnTcpSockFnSkbEcnSetCeFnGetListenerSockFnSkcLookupTcpFnTcpCheckSyncookieFnSysctlGetNameFnSysctlGetCurrentValueFnSysctlGetNewValueFnSysctlSetNewValueFnStrtolFnStrtoulFnSkStorageGetFnSkStorageDeleteFnSendSignalFnTcpGenSyncookie"
124
+
125
+var _BuiltinFunc_index = [...]uint16{0, 8, 23, 38, 53, 64, 76, 89, 104, 123, 138, 153, 168, 178, 193, 212, 230, 246, 264, 277, 289, 306, 323, 338, 348, 363, 380, 394, 406, 416, 433, 450, 466, 481, 497, 512, 528, 544, 568, 583, 596, 608, 624, 639, 654, 669, 683, 700, 714, 723, 735, 750, 763, 778, 793, 808, 828, 847, 859, 875, 894, 910, 925, 939, 952, 958, 973, 990, 1000, 1022, 1033, 1049, 1066, 1082, 1096, 1115, 1133, 1148, 1158, 1169, 1182, 1202, 1219, 1238, 1259, 1272, 1285, 1296, 1309, 1321, 1334, 1347, 1359, 1373, 1383, 1395, 1407, 1416, 1429, 1446, 1460, 1479, 1494, 1517, 1536, 1555, 1563, 1572, 1586, 1603, 1615, 1632}
126
+
127
+func (i BuiltinFunc) String() string {
128
+	if i < 0 || i >= BuiltinFunc(len(_BuiltinFunc_index)-1) {
129
+		return "BuiltinFunc(" + strconv.FormatInt(int64(i), 10) + ")"
130
+	}
131
+	return _BuiltinFunc_name[_BuiltinFunc_index[i]:_BuiltinFunc_index[i+1]]
132
+}
0 133
new file mode 100644
... ...
@@ -0,0 +1,416 @@
0
+package asm
1
+
2
+import (
3
+	"encoding/binary"
4
+	"fmt"
5
+	"io"
6
+	"math"
7
+	"strings"
8
+
9
+	"github.com/pkg/errors"
10
+)
11
+
12
+// InstructionSize is the size of a BPF instruction in bytes
13
+const InstructionSize = 8
14
+
15
+// Instruction is a single eBPF instruction.
16
+type Instruction struct {
17
+	OpCode    OpCode
18
+	Dst       Register
19
+	Src       Register
20
+	Offset    int16
21
+	Constant  int64
22
+	Reference string
23
+	Symbol    string
24
+}
25
+
26
+// Sym creates a symbol.
27
+func (ins Instruction) Sym(name string) Instruction {
28
+	ins.Symbol = name
29
+	return ins
30
+}
31
+
32
+// Unmarshal decodes a BPF instruction.
33
+func (ins *Instruction) Unmarshal(r io.Reader, bo binary.ByteOrder) (uint64, error) {
34
+	var bi bpfInstruction
35
+	err := binary.Read(r, bo, &bi)
36
+	if err != nil {
37
+		return 0, err
38
+	}
39
+
40
+	ins.OpCode = bi.OpCode
41
+	ins.Dst = bi.Registers.Dst()
42
+	ins.Src = bi.Registers.Src()
43
+	ins.Offset = bi.Offset
44
+	ins.Constant = int64(bi.Constant)
45
+
46
+	if !bi.OpCode.isDWordLoad() {
47
+		return InstructionSize, nil
48
+	}
49
+
50
+	var bi2 bpfInstruction
51
+	if err := binary.Read(r, bo, &bi2); err != nil {
52
+		// No Wrap, to avoid io.EOF clash
53
+		return 0, errors.New("64bit immediate is missing second half")
54
+	}
55
+	if bi2.OpCode != 0 || bi2.Offset != 0 || bi2.Registers != 0 {
56
+		return 0, errors.New("64bit immediate has non-zero fields")
57
+	}
58
+	ins.Constant = int64(uint64(uint32(bi2.Constant))<<32 | uint64(uint32(bi.Constant)))
59
+
60
+	return 2 * InstructionSize, nil
61
+}
62
+
63
+// Marshal encodes a BPF instruction.
64
+func (ins Instruction) Marshal(w io.Writer, bo binary.ByteOrder) (uint64, error) {
65
+	if ins.OpCode == InvalidOpCode {
66
+		return 0, errors.New("invalid opcode")
67
+	}
68
+
69
+	isDWordLoad := ins.OpCode.isDWordLoad()
70
+
71
+	cons := int32(ins.Constant)
72
+	if isDWordLoad {
73
+		// Encode least significant 32bit first for 64bit operations.
74
+		cons = int32(uint32(ins.Constant))
75
+	}
76
+
77
+	bpfi := bpfInstruction{
78
+		ins.OpCode,
79
+		newBPFRegisters(ins.Dst, ins.Src),
80
+		ins.Offset,
81
+		cons,
82
+	}
83
+
84
+	if err := binary.Write(w, bo, &bpfi); err != nil {
85
+		return 0, err
86
+	}
87
+
88
+	if !isDWordLoad {
89
+		return InstructionSize, nil
90
+	}
91
+
92
+	bpfi = bpfInstruction{
93
+		Constant: int32(ins.Constant >> 32),
94
+	}
95
+
96
+	if err := binary.Write(w, bo, &bpfi); err != nil {
97
+		return 0, err
98
+	}
99
+
100
+	return 2 * InstructionSize, nil
101
+}
102
+
103
+// RewriteMapPtr changes an instruction to use a new map fd.
104
+//
105
+// Returns an error if the fd is invalid, or the instruction
106
+// is incorrect.
107
+func (ins *Instruction) RewriteMapPtr(fd int) error {
108
+	if !ins.OpCode.isDWordLoad() {
109
+		return errors.Errorf("%s is not a 64 bit load", ins.OpCode)
110
+	}
111
+
112
+	if fd < 0 {
113
+		return errors.New("invalid fd")
114
+	}
115
+
116
+	ins.Src = R1
117
+	ins.Constant = int64(fd)
118
+	return nil
119
+}
120
+
121
+// Format implements fmt.Formatter.
122
+func (ins Instruction) Format(f fmt.State, c rune) {
123
+	if c != 'v' {
124
+		fmt.Fprintf(f, "{UNRECOGNIZED: %c}", c)
125
+		return
126
+	}
127
+
128
+	op := ins.OpCode
129
+
130
+	if op == InvalidOpCode {
131
+		fmt.Fprint(f, "INVALID")
132
+		return
133
+	}
134
+
135
+	// Omit trailing space for Exit
136
+	if op.JumpOp() == Exit {
137
+		fmt.Fprint(f, op)
138
+		return
139
+	}
140
+
141
+	fmt.Fprintf(f, "%v ", op)
142
+	switch cls := op.Class(); cls {
143
+	case LdClass, LdXClass, StClass, StXClass:
144
+		switch op.Mode() {
145
+		case ImmMode:
146
+			fmt.Fprintf(f, "dst: %s imm: %d", ins.Dst, ins.Constant)
147
+		case AbsMode:
148
+			fmt.Fprintf(f, "imm: %d", ins.Constant)
149
+		case IndMode:
150
+			fmt.Fprintf(f, "dst: %s src: %s imm: %d", ins.Dst, ins.Src, ins.Constant)
151
+		case MemMode:
152
+			fmt.Fprintf(f, "dst: %s src: %s off: %d imm: %d", ins.Dst, ins.Src, ins.Offset, ins.Constant)
153
+		case XAddMode:
154
+			fmt.Fprintf(f, "dst: %s src: %s", ins.Dst, ins.Src)
155
+		}
156
+
157
+	case ALU64Class, ALUClass:
158
+		fmt.Fprintf(f, "dst: %s ", ins.Dst)
159
+		if op.ALUOp() == Swap || op.Source() == ImmSource {
160
+			fmt.Fprintf(f, "imm: %d", ins.Constant)
161
+		} else {
162
+			fmt.Fprintf(f, "src: %s", ins.Src)
163
+		}
164
+
165
+	case JumpClass:
166
+		switch jop := op.JumpOp(); jop {
167
+		case Call:
168
+			if ins.Src == R1 {
169
+				// bpf-to-bpf call
170
+				fmt.Fprint(f, ins.Constant)
171
+			} else {
172
+				fmt.Fprint(f, BuiltinFunc(ins.Constant))
173
+			}
174
+
175
+		default:
176
+			fmt.Fprintf(f, "dst: %s off: %d ", ins.Dst, ins.Offset)
177
+			if op.Source() == ImmSource {
178
+				fmt.Fprintf(f, "imm: %d", ins.Constant)
179
+			} else {
180
+				fmt.Fprintf(f, "src: %s", ins.Src)
181
+			}
182
+		}
183
+	}
184
+
185
+	if ins.Reference != "" {
186
+		fmt.Fprintf(f, " <%s>", ins.Reference)
187
+	}
188
+}
189
+
190
+// Instructions is an eBPF program.
191
+type Instructions []Instruction
192
+
193
+func (insns Instructions) String() string {
194
+	return fmt.Sprint(insns)
195
+}
196
+
197
+// RewriteMapPtr rewrites all loads of a specific map pointer to a new fd.
198
+//
199
+// Returns an error if the symbol isn't used, see IsUnreferencedSymbol.
200
+func (insns Instructions) RewriteMapPtr(symbol string, fd int) error {
201
+	if symbol == "" {
202
+		return errors.New("empty symbol")
203
+	}
204
+
205
+	found := false
206
+	for i := range insns {
207
+		ins := &insns[i]
208
+		if ins.Reference != symbol {
209
+			continue
210
+		}
211
+
212
+		if err := ins.RewriteMapPtr(fd); err != nil {
213
+			return err
214
+		}
215
+
216
+		found = true
217
+	}
218
+
219
+	if !found {
220
+		return &unreferencedSymbolError{symbol}
221
+	}
222
+
223
+	return nil
224
+}
225
+
226
+// SymbolOffsets returns the set of symbols and their offset in
227
+// the instructions.
228
+func (insns Instructions) SymbolOffsets() (map[string]int, error) {
229
+	offsets := make(map[string]int)
230
+
231
+	for i, ins := range insns {
232
+		if ins.Symbol == "" {
233
+			continue
234
+		}
235
+
236
+		if _, ok := offsets[ins.Symbol]; ok {
237
+			return nil, errors.Errorf("duplicate symbol %s", ins.Symbol)
238
+		}
239
+
240
+		offsets[ins.Symbol] = i
241
+	}
242
+
243
+	return offsets, nil
244
+}
245
+
246
+// ReferenceOffsets returns the set of references and their offset in
247
+// the instructions.
248
+func (insns Instructions) ReferenceOffsets() map[string][]int {
249
+	offsets := make(map[string][]int)
250
+
251
+	for i, ins := range insns {
252
+		if ins.Reference == "" {
253
+			continue
254
+		}
255
+
256
+		offsets[ins.Reference] = append(offsets[ins.Reference], i)
257
+	}
258
+
259
+	return offsets
260
+}
261
+
262
+func (insns Instructions) marshalledOffsets() (map[string]int, error) {
263
+	symbols := make(map[string]int)
264
+
265
+	marshalledPos := 0
266
+	for _, ins := range insns {
267
+		currentPos := marshalledPos
268
+		marshalledPos += ins.OpCode.marshalledInstructions()
269
+
270
+		if ins.Symbol == "" {
271
+			continue
272
+		}
273
+
274
+		if _, ok := symbols[ins.Symbol]; ok {
275
+			return nil, errors.Errorf("duplicate symbol %s", ins.Symbol)
276
+		}
277
+
278
+		symbols[ins.Symbol] = currentPos
279
+	}
280
+
281
+	return symbols, nil
282
+}
283
+
284
+// Format implements fmt.Formatter.
285
+//
286
+// You can control indentation of symbols by
287
+// specifying a width. Setting a precision controls the indentation of
288
+// instructions.
289
+// The default character is a tab, which can be overriden by specifying
290
+// the ' ' space flag.
291
+func (insns Instructions) Format(f fmt.State, c rune) {
292
+	if c != 's' && c != 'v' {
293
+		fmt.Fprintf(f, "{UNKNOWN FORMAT '%c'}", c)
294
+		return
295
+	}
296
+
297
+	// Precision is better in this case, because it allows
298
+	// specifying 0 padding easily.
299
+	padding, ok := f.Precision()
300
+	if !ok {
301
+		padding = 1
302
+	}
303
+
304
+	indent := strings.Repeat("\t", padding)
305
+	if f.Flag(' ') {
306
+		indent = strings.Repeat(" ", padding)
307
+	}
308
+
309
+	symPadding, ok := f.Width()
310
+	if !ok {
311
+		symPadding = padding - 1
312
+	}
313
+	if symPadding < 0 {
314
+		symPadding = 0
315
+	}
316
+
317
+	symIndent := strings.Repeat("\t", symPadding)
318
+	if f.Flag(' ') {
319
+		symIndent = strings.Repeat(" ", symPadding)
320
+	}
321
+
322
+	// Figure out how many digits we need to represent the highest
323
+	// offset.
324
+	highestOffset := 0
325
+	for _, ins := range insns {
326
+		highestOffset += ins.OpCode.marshalledInstructions()
327
+	}
328
+	offsetWidth := int(math.Ceil(math.Log10(float64(highestOffset))))
329
+
330
+	offset := 0
331
+	for _, ins := range insns {
332
+		if ins.Symbol != "" {
333
+			fmt.Fprintf(f, "%s%s:\n", symIndent, ins.Symbol)
334
+		}
335
+		fmt.Fprintf(f, "%s%*d: %v\n", indent, offsetWidth, offset, ins)
336
+		offset += ins.OpCode.marshalledInstructions()
337
+	}
338
+
339
+	return
340
+}
341
+
342
+// Marshal encodes a BPF program into the kernel format.
343
+func (insns Instructions) Marshal(w io.Writer, bo binary.ByteOrder) error {
344
+	absoluteOffsets, err := insns.marshalledOffsets()
345
+	if err != nil {
346
+		return err
347
+	}
348
+
349
+	num := 0
350
+	for i, ins := range insns {
351
+		switch {
352
+		case ins.OpCode.JumpOp() == Call && ins.Constant == -1:
353
+			// Rewrite bpf to bpf call
354
+			offset, ok := absoluteOffsets[ins.Reference]
355
+			if !ok {
356
+				return errors.Errorf("instruction %d: reference to missing symbol %s", i, ins.Reference)
357
+			}
358
+
359
+			ins.Constant = int64(offset - num - 1)
360
+
361
+		case ins.OpCode.Class() == JumpClass && ins.Offset == -1:
362
+			// Rewrite jump to label
363
+			offset, ok := absoluteOffsets[ins.Reference]
364
+			if !ok {
365
+				return errors.Errorf("instruction %d: reference to missing symbol %s", i, ins.Reference)
366
+			}
367
+
368
+			ins.Offset = int16(offset - num - 1)
369
+		}
370
+
371
+		n, err := ins.Marshal(w, bo)
372
+		if err != nil {
373
+			return errors.Wrapf(err, "instruction %d", i)
374
+		}
375
+
376
+		num += int(n / InstructionSize)
377
+	}
378
+	return nil
379
+}
380
+
381
+type bpfInstruction struct {
382
+	OpCode    OpCode
383
+	Registers bpfRegisters
384
+	Offset    int16
385
+	Constant  int32
386
+}
387
+
388
+type bpfRegisters uint8
389
+
390
+func newBPFRegisters(dst, src Register) bpfRegisters {
391
+	return bpfRegisters((src << 4) | (dst & 0xF))
392
+}
393
+
394
+func (r bpfRegisters) Dst() Register {
395
+	return Register(r & 0xF)
396
+}
397
+
398
+func (r bpfRegisters) Src() Register {
399
+	return Register(r >> 4)
400
+}
401
+
402
// unreferencedSymbolError is returned when no instruction references
// the given symbol.
type unreferencedSymbolError struct {
	symbol string
}

// Error implements the error interface.
func (use *unreferencedSymbolError) Error() string {
	return "unreferenced symbol " + use.symbol
}

// IsUnreferencedSymbol returns true if err was caused by
// an unreferenced symbol.
//
// The error is checked as is, wrapped errors are not unpacked.
func IsUnreferencedSymbol(err error) bool {
	switch err.(type) {
	case *unreferencedSymbolError:
		return true
	default:
		return false
	}
}
0 416
new file mode 100644
... ...
@@ -0,0 +1,109 @@
0
+package asm
1
+
2
+//go:generate stringer -output jump_string.go -type=JumpOp
3
+
4
+// JumpOp affect control flow.
5
+//
6
+//    msb      lsb
7
+//    +----+-+---+
8
+//    |OP  |s|cls|
9
+//    +----+-+---+
10
+type JumpOp uint8
11
+
12
+const jumpMask OpCode = aluMask
13
+
14
+const (
15
+	// InvalidJumpOp is returned by getters when invoked
16
+	// on non branch OpCodes
17
+	InvalidJumpOp JumpOp = 0xff
18
+	// Ja jumps by offset unconditionally
19
+	Ja JumpOp = 0x00
20
+	// JEq jumps by offset if r == imm
21
+	JEq JumpOp = 0x10
22
+	// JGT jumps by offset if r > imm
23
+	JGT JumpOp = 0x20
24
+	// JGE jumps by offset if r >= imm
25
+	JGE JumpOp = 0x30
26
+	// JSet jumps by offset if r & imm
27
+	JSet JumpOp = 0x40
28
+	// JNE jumps by offset if r != imm
29
+	JNE JumpOp = 0x50
30
+	// JSGT jumps by offset if signed r > signed imm
31
+	JSGT JumpOp = 0x60
32
+	// JSGE jumps by offset if signed r >= signed imm
33
+	JSGE JumpOp = 0x70
34
+	// Call builtin or user defined function from imm
35
+	Call JumpOp = 0x80
36
+	// Exit ends execution, with value in r0
37
+	Exit JumpOp = 0x90
38
+	// JLT jumps by offset if r < imm
39
+	JLT JumpOp = 0xa0
40
+	// JLE jumps by offset if r <= imm
41
+	JLE JumpOp = 0xb0
42
+	// JSLT jumps by offset if signed r < signed imm
43
+	JSLT JumpOp = 0xc0
44
+	// JSLE jumps by offset if signed r <= signed imm
45
+	JSLE JumpOp = 0xd0
46
+)
47
+
48
+// Return emits an exit instruction.
49
+//
50
+// Requires a return value in R0.
51
+func Return() Instruction {
52
+	return Instruction{
53
+		OpCode: OpCode(JumpClass).SetJumpOp(Exit),
54
+	}
55
+}
56
+
57
+// Op returns the OpCode for a given jump source.
58
+func (op JumpOp) Op(source Source) OpCode {
59
+	return OpCode(JumpClass).SetJumpOp(op).SetSource(source)
60
+}
61
+
62
+// Imm compares dst to value, and adjusts PC by offset if the condition is fulfilled.
63
+func (op JumpOp) Imm(dst Register, value int32, label string) Instruction {
64
+	if op == Exit || op == Call || op == Ja {
65
+		return Instruction{OpCode: InvalidOpCode}
66
+	}
67
+
68
+	return Instruction{
69
+		OpCode:    OpCode(JumpClass).SetJumpOp(op).SetSource(ImmSource),
70
+		Dst:       dst,
71
+		Offset:    -1,
72
+		Constant:  int64(value),
73
+		Reference: label,
74
+	}
75
+}
76
+
77
+// Reg compares dst to src, and adjusts PC by offset if the condition is fulfilled.
78
+func (op JumpOp) Reg(dst, src Register, label string) Instruction {
79
+	if op == Exit || op == Call || op == Ja {
80
+		return Instruction{OpCode: InvalidOpCode}
81
+	}
82
+
83
+	return Instruction{
84
+		OpCode:    OpCode(JumpClass).SetJumpOp(op).SetSource(RegSource),
85
+		Dst:       dst,
86
+		Src:       src,
87
+		Offset:    -1,
88
+		Reference: label,
89
+	}
90
+}
91
+
92
+// Label adjusts PC to the address of the label.
93
+func (op JumpOp) Label(label string) Instruction {
94
+	if op == Call {
95
+		return Instruction{
96
+			OpCode:    OpCode(JumpClass).SetJumpOp(Call),
97
+			Src:       R1,
98
+			Constant:  -1,
99
+			Reference: label,
100
+		}
101
+	}
102
+
103
+	return Instruction{
104
+		OpCode:    OpCode(JumpClass).SetJumpOp(op),
105
+		Offset:    -1,
106
+		Reference: label,
107
+	}
108
+}
0 109
new file mode 100644
... ...
@@ -0,0 +1,53 @@
0
+// Code generated by "stringer -output jump_string.go -type=JumpOp"; DO NOT EDIT.
1
+
2
+package asm
3
+
4
+import "strconv"
5
+
6
+func _() {
7
+	// An "invalid array index" compiler error signifies that the constant values have changed.
8
+	// Re-run the stringer command to generate them again.
9
+	var x [1]struct{}
10
+	_ = x[InvalidJumpOp-255]
11
+	_ = x[Ja-0]
12
+	_ = x[JEq-16]
13
+	_ = x[JGT-32]
14
+	_ = x[JGE-48]
15
+	_ = x[JSet-64]
16
+	_ = x[JNE-80]
17
+	_ = x[JSGT-96]
18
+	_ = x[JSGE-112]
19
+	_ = x[Call-128]
20
+	_ = x[Exit-144]
21
+	_ = x[JLT-160]
22
+	_ = x[JLE-176]
23
+	_ = x[JSLT-192]
24
+	_ = x[JSLE-208]
25
+}
26
+
27
+const _JumpOp_name = "JaJEqJGTJGEJSetJNEJSGTJSGECallExitJLTJLEJSLTJSLEInvalidJumpOp"
28
+
29
+var _JumpOp_map = map[JumpOp]string{
30
+	0:   _JumpOp_name[0:2],
31
+	16:  _JumpOp_name[2:5],
32
+	32:  _JumpOp_name[5:8],
33
+	48:  _JumpOp_name[8:11],
34
+	64:  _JumpOp_name[11:15],
35
+	80:  _JumpOp_name[15:18],
36
+	96:  _JumpOp_name[18:22],
37
+	112: _JumpOp_name[22:26],
38
+	128: _JumpOp_name[26:30],
39
+	144: _JumpOp_name[30:34],
40
+	160: _JumpOp_name[34:37],
41
+	176: _JumpOp_name[37:40],
42
+	192: _JumpOp_name[40:44],
43
+	208: _JumpOp_name[44:48],
44
+	255: _JumpOp_name[48:61],
45
+}
46
+
47
+func (i JumpOp) String() string {
48
+	if str, ok := _JumpOp_map[i]; ok {
49
+		return str
50
+	}
51
+	return "JumpOp(" + strconv.FormatInt(int64(i), 10) + ")"
52
+}
0 53
new file mode 100644
... ...
@@ -0,0 +1,189 @@
0
+package asm
1
+
2
+//go:generate stringer -output load_store_string.go -type=Mode,Size
3
+
4
+// Mode for load and store operations
5
+//
6
+//    msb      lsb
7
+//    +---+--+---+
8
+//    |MDE|sz|cls|
9
+//    +---+--+---+
10
+type Mode uint8
11
+
12
+const modeMask OpCode = 0xe0
13
+
14
+const (
15
+	// InvalidMode is returned by getters when invoked
16
+	// on non load / store OpCodes
17
+	InvalidMode Mode = 0xff
18
+	// ImmMode - immediate value
19
+	ImmMode Mode = 0x00
20
+	// AbsMode - immediate value + offset
21
+	AbsMode Mode = 0x20
22
+	// IndMode - indirect (imm+src)
23
+	IndMode Mode = 0x40
24
+	// MemMode - load from memory
25
+	MemMode Mode = 0x60
26
+	// XAddMode - add atomically across processors.
27
+	XAddMode Mode = 0xc0
28
+)
29
+
30
+// Size of load and store operations
31
+//
32
+//    msb      lsb
33
+//    +---+--+---+
34
+//    |mde|SZ|cls|
35
+//    +---+--+---+
36
+type Size uint8
37
+
38
+const sizeMask OpCode = 0x18
39
+
40
+const (
41
+	// InvalidSize is returned by getters when invoked
42
+	// on non load / store OpCodes
43
+	InvalidSize Size = 0xff
44
+	// DWord - double word; 64 bits
45
+	DWord Size = 0x18
46
+	// Word - word; 32 bits
47
+	Word Size = 0x00
48
+	// Half - half-word; 16 bits
49
+	Half Size = 0x08
50
+	// Byte - byte; 8 bits
51
+	Byte Size = 0x10
52
+)
53
+
54
+// Sizeof returns the size in bytes.
55
+func (s Size) Sizeof() int {
56
+	switch s {
57
+	case DWord:
58
+		return 8
59
+	case Word:
60
+		return 4
61
+	case Half:
62
+		return 2
63
+	case Byte:
64
+		return 1
65
+	default:
66
+		return -1
67
+	}
68
+}
69
+
70
+// LoadMemOp returns the OpCode to load a value of given size from memory.
71
+func LoadMemOp(size Size) OpCode {
72
+	return OpCode(LdXClass).SetMode(MemMode).SetSize(size)
73
+}
74
+
75
+// LoadMem emits `dst = *(size *)(src + offset)`.
76
+func LoadMem(dst, src Register, offset int16, size Size) Instruction {
77
+	return Instruction{
78
+		OpCode: LoadMemOp(size),
79
+		Dst:    dst,
80
+		Src:    src,
81
+		Offset: offset,
82
+	}
83
+}
84
+
85
+// LoadImmOp returns the OpCode to load an immediate of given size.
86
+//
87
+// As of kernel 4.20, only DWord size is accepted.
88
+func LoadImmOp(size Size) OpCode {
89
+	return OpCode(LdClass).SetMode(ImmMode).SetSize(size)
90
+}
91
+
92
+// LoadImm emits `dst = (size)value`.
93
+//
94
+// As of kernel 4.20, only DWord size is accepted.
95
+func LoadImm(dst Register, value int64, size Size) Instruction {
96
+	return Instruction{
97
+		OpCode:   LoadImmOp(size),
98
+		Dst:      dst,
99
+		Constant: value,
100
+	}
101
+}
102
+
103
+// LoadMapPtr stores a pointer to a map in dst.
104
+func LoadMapPtr(dst Register, fd int) Instruction {
105
+	if fd < 0 {
106
+		return Instruction{OpCode: InvalidOpCode}
107
+	}
108
+
109
+	return Instruction{
110
+		OpCode:   LoadImmOp(DWord),
111
+		Dst:      dst,
112
+		Src:      R1,
113
+		Constant: int64(fd),
114
+	}
115
+}
116
+
117
+// LoadIndOp returns the OpCode for loading a value of given size from an sk_buff.
118
+func LoadIndOp(size Size) OpCode {
119
+	return OpCode(LdClass).SetMode(IndMode).SetSize(size)
120
+}
121
+
122
+// LoadInd emits `dst = ntoh(*(size *)(((sk_buff *)R6)->data + src + offset))`.
123
+func LoadInd(dst, src Register, offset int32, size Size) Instruction {
124
+	return Instruction{
125
+		OpCode:   LoadIndOp(size),
126
+		Dst:      dst,
127
+		Src:      src,
128
+		Constant: int64(offset),
129
+	}
130
+}
131
+
132
+// LoadAbsOp returns the OpCode for loading a value of given size from an sk_buff.
133
+func LoadAbsOp(size Size) OpCode {
134
+	return OpCode(LdClass).SetMode(AbsMode).SetSize(size)
135
+}
136
+
137
+// LoadAbs emits `r0 = ntoh(*(size *)(((sk_buff *)R6)->data + offset))`.
138
+func LoadAbs(offset int32, size Size) Instruction {
139
+	return Instruction{
140
+		OpCode:   LoadAbsOp(size),
141
+		Dst:      R0,
142
+		Constant: int64(offset),
143
+	}
144
+}
145
+
146
+// StoreMemOp returns the OpCode for storing a register of given size in memory.
147
+func StoreMemOp(size Size) OpCode {
148
+	return OpCode(StXClass).SetMode(MemMode).SetSize(size)
149
+}
150
+
151
+// StoreMem emits `*(size *)(dst + offset) = src`
152
+func StoreMem(dst Register, offset int16, src Register, size Size) Instruction {
153
+	return Instruction{
154
+		OpCode: StoreMemOp(size),
155
+		Dst:    dst,
156
+		Src:    src,
157
+		Offset: offset,
158
+	}
159
+}
160
+
161
+// StoreImmOp returns the OpCode for storing an immediate of given size in memory.
162
+func StoreImmOp(size Size) OpCode {
163
+	return OpCode(StClass).SetMode(MemMode).SetSize(size)
164
+}
165
+
166
+// StoreImm emits `*(size *)(dst + offset) = value`.
167
+func StoreImm(dst Register, offset int16, value int64, size Size) Instruction {
168
+	return Instruction{
169
+		OpCode:   StoreImmOp(size),
170
+		Dst:      dst,
171
+		Offset:   offset,
172
+		Constant: value,
173
+	}
174
+}
175
+
176
+// StoreXAddOp returns the OpCode to atomically add a register to a value in memory.
177
+func StoreXAddOp(size Size) OpCode {
178
+	return OpCode(StXClass).SetMode(XAddMode).SetSize(size)
179
+}
180
+
181
+// StoreXAdd atomically adds src to *dst.
182
+func StoreXAdd(dst, src Register, size Size) Instruction {
183
+	return Instruction{
184
+		OpCode: StoreXAddOp(size),
185
+		Dst:    dst,
186
+		Src:    src,
187
+	}
188
+}
0 189
new file mode 100644
... ...
@@ -0,0 +1,80 @@
0
+// Code generated by "stringer -output load_store_string.go -type=Mode,Size"; DO NOT EDIT.
1
+
2
+package asm
3
+
4
+import "strconv"
5
+
6
+func _() {
7
+	// An "invalid array index" compiler error signifies that the constant values have changed.
8
+	// Re-run the stringer command to generate them again.
9
+	var x [1]struct{}
10
+	_ = x[InvalidMode-255]
11
+	_ = x[ImmMode-0]
12
+	_ = x[AbsMode-32]
13
+	_ = x[IndMode-64]
14
+	_ = x[MemMode-96]
15
+	_ = x[XAddMode-192]
16
+}
17
+
18
+const (
19
+	_Mode_name_0 = "ImmMode"
20
+	_Mode_name_1 = "AbsMode"
21
+	_Mode_name_2 = "IndMode"
22
+	_Mode_name_3 = "MemMode"
23
+	_Mode_name_4 = "XAddMode"
24
+	_Mode_name_5 = "InvalidMode"
25
+)
26
+
27
+func (i Mode) String() string {
28
+	switch {
29
+	case i == 0:
30
+		return _Mode_name_0
31
+	case i == 32:
32
+		return _Mode_name_1
33
+	case i == 64:
34
+		return _Mode_name_2
35
+	case i == 96:
36
+		return _Mode_name_3
37
+	case i == 192:
38
+		return _Mode_name_4
39
+	case i == 255:
40
+		return _Mode_name_5
41
+	default:
42
+		return "Mode(" + strconv.FormatInt(int64(i), 10) + ")"
43
+	}
44
+}
45
+func _() {
46
+	// An "invalid array index" compiler error signifies that the constant values have changed.
47
+	// Re-run the stringer command to generate them again.
48
+	var x [1]struct{}
49
+	_ = x[InvalidSize-255]
50
+	_ = x[DWord-24]
51
+	_ = x[Word-0]
52
+	_ = x[Half-8]
53
+	_ = x[Byte-16]
54
+}
55
+
56
+const (
57
+	_Size_name_0 = "Word"
58
+	_Size_name_1 = "Half"
59
+	_Size_name_2 = "Byte"
60
+	_Size_name_3 = "DWord"
61
+	_Size_name_4 = "InvalidSize"
62
+)
63
+
64
+func (i Size) String() string {
65
+	switch {
66
+	case i == 0:
67
+		return _Size_name_0
68
+	case i == 8:
69
+		return _Size_name_1
70
+	case i == 16:
71
+		return _Size_name_2
72
+	case i == 24:
73
+		return _Size_name_3
74
+	case i == 255:
75
+		return _Size_name_4
76
+	default:
77
+		return "Size(" + strconv.FormatInt(int64(i), 10) + ")"
78
+	}
79
+}
0 80
new file mode 100644
... ...
@@ -0,0 +1,237 @@
0
+package asm
1
+
2
+import (
3
+	"fmt"
4
+	"strings"
5
+)
6
+
7
+//go:generate stringer -output opcode_string.go -type=Class
8
+
9
+type encoding int
10
+
11
+const (
12
+	unknownEncoding encoding = iota
13
+	loadOrStore
14
+	jumpOrALU
15
+)
16
+
17
+// Class of operations
18
+//
19
+//    msb      lsb
20
+//    +---+--+---+
21
+//    |  ??  |CLS|
22
+//    +---+--+---+
23
+type Class uint8
24
+
25
+const classMask OpCode = 0x07
26
+
27
+const (
28
+	// LdClass loads an immediate value or sk_buff data into a register
29
+	LdClass Class = 0x00
30
+	// LdXClass loads a value from memory into a register
31
+	LdXClass Class = 0x01
32
+	// StClass stores an immediate value in memory
33
+	StClass Class = 0x02
34
+	// StXClass stores the value of a register in memory
35
+	StXClass Class = 0x03
36
+	// ALUClass arithmetic operators
37
+	ALUClass Class = 0x04
38
+	// JumpClass jump operators
39
+	JumpClass Class = 0x05
40
+	// ALU64Class arithmetic in 64 bit mode
41
+	ALU64Class Class = 0x07
42
+)
43
+
44
+func (cls Class) encoding() encoding {
45
+	switch cls {
46
+	case LdClass, LdXClass, StClass, StXClass:
47
+		return loadOrStore
48
+	case ALU64Class, ALUClass, JumpClass:
49
+		return jumpOrALU
50
+	default:
51
+		return unknownEncoding
52
+	}
53
+}
54
+
55
+// OpCode is a packed eBPF opcode.
56
+//
57
+// Its encoding is defined by a Class value:
58
+//
59
+//    msb      lsb
60
+//    +----+-+---+
61
+//    | ???? |CLS|
62
+//    +----+-+---+
63
+type OpCode uint8
64
+
65
+// InvalidOpCode is returned by setters on OpCode
66
+const InvalidOpCode OpCode = 0xff
67
+
68
+// marshalledInstructions returns the number of BPF instructions required
69
+// to encode this opcode.
70
+func (op OpCode) marshalledInstructions() int {
71
+	if op == LoadImmOp(DWord) {
72
+		return 2
73
+	}
74
+	return 1
75
+}
76
+
77
+func (op OpCode) isDWordLoad() bool {
78
+	return op == LoadImmOp(DWord)
79
+}
80
+
81
+// Class returns the class of operation.
82
+func (op OpCode) Class() Class {
83
+	return Class(op & classMask)
84
+}
85
+
86
+// Mode returns the mode for load and store operations.
87
+func (op OpCode) Mode() Mode {
88
+	if op.Class().encoding() != loadOrStore {
89
+		return InvalidMode
90
+	}
91
+	return Mode(op & modeMask)
92
+}
93
+
94
+// Size returns the size for load and store operations.
95
+func (op OpCode) Size() Size {
96
+	if op.Class().encoding() != loadOrStore {
97
+		return InvalidSize
98
+	}
99
+	return Size(op & sizeMask)
100
+}
101
+
102
+// Source returns the source for branch and ALU operations.
103
+func (op OpCode) Source() Source {
104
+	if op.Class().encoding() != jumpOrALU || op.ALUOp() == Swap {
105
+		return InvalidSource
106
+	}
107
+	return Source(op & sourceMask)
108
+}
109
+
110
+// ALUOp returns the ALUOp.
111
+func (op OpCode) ALUOp() ALUOp {
112
+	if op.Class().encoding() != jumpOrALU {
113
+		return InvalidALUOp
114
+	}
115
+	return ALUOp(op & aluMask)
116
+}
117
+
118
+// Endianness returns the Endianness for a byte swap instruction.
119
+func (op OpCode) Endianness() Endianness {
120
+	if op.ALUOp() != Swap {
121
+		return InvalidEndian
122
+	}
123
+	return Endianness(op & endianMask)
124
+}
125
+
126
+// JumpOp returns the JumpOp.
127
+func (op OpCode) JumpOp() JumpOp {
128
+	if op.Class().encoding() != jumpOrALU {
129
+		return InvalidJumpOp
130
+	}
131
+	return JumpOp(op & jumpMask)
132
+}
133
+
134
+// SetMode sets the mode on load and store operations.
135
+//
136
+// Returns InvalidOpCode if op is of the wrong class.
137
+func (op OpCode) SetMode(mode Mode) OpCode {
138
+	if op.Class().encoding() != loadOrStore || !valid(OpCode(mode), modeMask) {
139
+		return InvalidOpCode
140
+	}
141
+	return (op & ^modeMask) | OpCode(mode)
142
+}
143
+
144
+// SetSize sets the size on load and store operations.
145
+//
146
+// Returns InvalidOpCode if op is of the wrong class.
147
+func (op OpCode) SetSize(size Size) OpCode {
148
+	if op.Class().encoding() != loadOrStore || !valid(OpCode(size), sizeMask) {
149
+		return InvalidOpCode
150
+	}
151
+	return (op & ^sizeMask) | OpCode(size)
152
+}
153
+
154
+// SetSource sets the source on jump and ALU operations.
155
+//
156
+// Returns InvalidOpCode if op is of the wrong class.
157
+func (op OpCode) SetSource(source Source) OpCode {
158
+	if op.Class().encoding() != jumpOrALU || !valid(OpCode(source), sourceMask) {
159
+		return InvalidOpCode
160
+	}
161
+	return (op & ^sourceMask) | OpCode(source)
162
+}
163
+
164
+// SetALUOp sets the ALUOp on ALU operations.
165
+//
166
+// Returns InvalidOpCode if op is of the wrong class.
167
+func (op OpCode) SetALUOp(alu ALUOp) OpCode {
168
+	class := op.Class()
169
+	if (class != ALUClass && class != ALU64Class) || !valid(OpCode(alu), aluMask) {
170
+		return InvalidOpCode
171
+	}
172
+	return (op & ^aluMask) | OpCode(alu)
173
+}
174
+
175
+// SetJumpOp sets the JumpOp on jump operations.
176
+//
177
+// Returns InvalidOpCode if op is of the wrong class.
178
+func (op OpCode) SetJumpOp(jump JumpOp) OpCode {
179
+	if op.Class() != JumpClass || !valid(OpCode(jump), jumpMask) {
180
+		return InvalidOpCode
181
+	}
182
+	return (op & ^jumpMask) | OpCode(jump)
183
+}
184
+
185
+func (op OpCode) String() string {
186
+	var f strings.Builder
187
+
188
+	switch class := op.Class(); class {
189
+	case LdClass, LdXClass, StClass, StXClass:
190
+		f.WriteString(strings.TrimSuffix(class.String(), "Class"))
191
+
192
+		mode := op.Mode()
193
+		f.WriteString(strings.TrimSuffix(mode.String(), "Mode"))
194
+
195
+		switch op.Size() {
196
+		case DWord:
197
+			f.WriteString("DW")
198
+		case Word:
199
+			f.WriteString("W")
200
+		case Half:
201
+			f.WriteString("H")
202
+		case Byte:
203
+			f.WriteString("B")
204
+		}
205
+
206
+	case ALU64Class, ALUClass:
207
+		f.WriteString(op.ALUOp().String())
208
+
209
+		if op.ALUOp() == Swap {
210
+			// Width for Endian is controlled by Constant
211
+			f.WriteString(op.Endianness().String())
212
+		} else {
213
+			if class == ALUClass {
214
+				f.WriteString("32")
215
+			}
216
+
217
+			f.WriteString(strings.TrimSuffix(op.Source().String(), "Source"))
218
+		}
219
+
220
+	case JumpClass:
221
+		f.WriteString(op.JumpOp().String())
222
+		if jop := op.JumpOp(); jop != Exit && jop != Call {
223
+			f.WriteString(strings.TrimSuffix(op.Source().String(), "Source"))
224
+		}
225
+
226
+	default:
227
+		fmt.Fprintf(&f, "%#x", op)
228
+	}
229
+
230
+	return f.String()
231
+}
232
+
233
+// valid returns true if all bits in value are covered by mask.
234
+func valid(value, mask OpCode) bool {
235
+	return value & ^mask == 0
236
+}
0 237
new file mode 100644
... ...
@@ -0,0 +1,38 @@
0
+// Code generated by "stringer -output opcode_string.go -type=Class"; DO NOT EDIT.
1
+
2
+package asm
3
+
4
+import "strconv"
5
+
6
+func _() {
7
+	// An "invalid array index" compiler error signifies that the constant values have changed.
8
+	// Re-run the stringer command to generate them again.
9
+	var x [1]struct{}
10
+	_ = x[LdClass-0]
11
+	_ = x[LdXClass-1]
12
+	_ = x[StClass-2]
13
+	_ = x[StXClass-3]
14
+	_ = x[ALUClass-4]
15
+	_ = x[JumpClass-5]
16
+	_ = x[ALU64Class-7]
17
+}
18
+
19
+const (
20
+	_Class_name_0 = "LdClassLdXClassStClassStXClassALUClassJumpClass"
21
+	_Class_name_1 = "ALU64Class"
22
+)
23
+
24
+var (
25
+	_Class_index_0 = [...]uint8{0, 7, 15, 22, 30, 38, 47}
26
+)
27
+
28
+func (i Class) String() string {
29
+	switch {
30
+	case 0 <= i && i <= 5:
31
+		return _Class_name_0[_Class_index_0[i]:_Class_index_0[i+1]]
32
+	case i == 7:
33
+		return _Class_name_1
34
+	default:
35
+		return "Class(" + strconv.FormatInt(int64(i), 10) + ")"
36
+	}
37
+}
0 38
new file mode 100644
... ...
@@ -0,0 +1,42 @@
0
+package asm
1
+
2
+import (
3
+	"fmt"
4
+)
5
+
6
+// Register is the source or destination of most operations.
7
+type Register uint8
8
+
9
+// R0 contains return values.
10
+const R0 Register = 0
11
+
12
+// Registers for function arguments.
13
+const (
14
+	R1 Register = R0 + 1 + iota
15
+	R2
16
+	R3
17
+	R4
18
+	R5
19
+)
20
+
21
+// Callee saved registers preserved by function calls.
22
+const (
23
+	R6 Register = R5 + 1 + iota
24
+	R7
25
+	R8
26
+	R9
27
+)
28
+
29
+// Read-only frame pointer to access stack.
30
+const (
31
+	R10 Register = R9 + 1
32
+	RFP          = R10
33
+)
34
+
35
+func (r Register) String() string {
36
+	v := uint8(r)
37
+	if v == 10 {
38
+		return "rfp"
39
+	}
40
+	return fmt.Sprintf("r%d", v)
41
+}
0 42
new file mode 100644
... ...
@@ -0,0 +1,148 @@
0
+package ebpf
1
+
2
+import (
3
+	"github.com/cilium/ebpf/asm"
4
+	"github.com/pkg/errors"
5
+)
6
+
7
+// CollectionOptions control loading a collection into the kernel.
8
+type CollectionOptions struct {
9
+	Programs ProgramOptions
10
+}
11
+
12
+// CollectionSpec describes a collection.
13
+type CollectionSpec struct {
14
+	Maps     map[string]*MapSpec
15
+	Programs map[string]*ProgramSpec
16
+}
17
+
18
+// Copy returns a recursive copy of the spec.
19
+func (cs *CollectionSpec) Copy() *CollectionSpec {
20
+	if cs == nil {
21
+		return nil
22
+	}
23
+
24
+	cpy := CollectionSpec{
25
+		Maps:     make(map[string]*MapSpec, len(cs.Maps)),
26
+		Programs: make(map[string]*ProgramSpec, len(cs.Programs)),
27
+	}
28
+
29
+	for name, spec := range cs.Maps {
30
+		cpy.Maps[name] = spec.Copy()
31
+	}
32
+
33
+	for name, spec := range cs.Programs {
34
+		cpy.Programs[name] = spec.Copy()
35
+	}
36
+
37
+	return &cpy
38
+}
39
+
40
+// Collection is a collection of Programs and Maps associated
41
+// with their symbols
42
+type Collection struct {
43
+	Programs map[string]*Program
44
+	Maps     map[string]*Map
45
+}
46
+
47
+// NewCollection creates a Collection from a specification.
48
+//
49
+// Every map in the spec is created, whether or not a program references it.
50
+func NewCollection(spec *CollectionSpec) (*Collection, error) {
51
+	return NewCollectionWithOptions(spec, CollectionOptions{})
52
+}
53
+
54
+// NewCollectionWithOptions creates a Collection from a specification.
55
+//
56
+// Only maps referenced by at least one of the programs are initialized.
57
+func NewCollectionWithOptions(spec *CollectionSpec, opts CollectionOptions) (*Collection, error) {
58
+	maps := make(map[string]*Map)
59
+	for mapName, mapSpec := range spec.Maps {
60
+		m, err := NewMap(mapSpec)
61
+		if err != nil {
62
+			return nil, errors.Wrapf(err, "map %s", mapName)
63
+		}
64
+		maps[mapName] = m
65
+	}
66
+
67
+	progs := make(map[string]*Program)
68
+	for progName, origProgSpec := range spec.Programs {
69
+		progSpec := origProgSpec.Copy()
70
+
71
+		// Rewrite any reference to a valid map.
72
+		for i := range progSpec.Instructions {
73
+			var (
74
+				ins = &progSpec.Instructions[i]
75
+				m   = maps[ins.Reference]
76
+			)
77
+
78
+			if ins.Reference == "" || m == nil {
79
+				continue
80
+			}
81
+
82
+			if ins.Src == asm.R1 {
83
+				// Don't overwrite maps already rewritten, users can
84
+				// rewrite programs in the spec themselves
85
+				continue
86
+			}
87
+
88
+			if err := ins.RewriteMapPtr(m.FD()); err != nil {
89
+				return nil, errors.Wrapf(err, "progam %s: map %s", progName, ins.Reference)
90
+			}
91
+		}
92
+
93
+		prog, err := NewProgramWithOptions(progSpec, opts.Programs)
94
+		if err != nil {
95
+			return nil, errors.Wrapf(err, "program %s", progName)
96
+		}
97
+		progs[progName] = prog
98
+	}
99
+
100
+	return &Collection{
101
+		progs,
102
+		maps,
103
+	}, nil
104
+}
105
+
106
+// LoadCollection parses an object file and converts it to a collection.
107
+func LoadCollection(file string) (*Collection, error) {
108
+	spec, err := LoadCollectionSpec(file)
109
+	if err != nil {
110
+		return nil, err
111
+	}
112
+	return NewCollection(spec)
113
+}
114
+
115
+// Close frees all maps and programs associated with the collection.
116
+//
117
+// The collection mustn't be used afterwards.
118
+func (coll *Collection) Close() {
119
+	for _, prog := range coll.Programs {
120
+		prog.Close()
121
+	}
122
+	for _, m := range coll.Maps {
123
+		m.Close()
124
+	}
125
+}
126
+
127
+// DetachMap removes the named map from the Collection.
128
+//
129
+// This means that a later call to Close() will not affect this map.
130
+//
131
+// Returns nil if no map of that name exists.
132
+func (coll *Collection) DetachMap(name string) *Map {
133
+	m := coll.Maps[name]
134
+	delete(coll.Maps, name)
135
+	return m
136
+}
137
+
138
+// DetachProgram removes the named program from the Collection.
139
+//
140
+// This means that a later call to Close() will not affect this program.
141
+//
142
+// Returns nil if no program of that name exists.
143
+func (coll *Collection) DetachProgram(name string) *Program {
144
+	p := coll.Programs[name]
145
+	delete(coll.Programs, name)
146
+	return p
147
+}
0 148
new file mode 100644
... ...
@@ -0,0 +1,17 @@
0
+// Package ebpf is a toolkit for working with eBPF programs.
1
+//
2
+// eBPF programs are small snippets of code which are executed directly
3
+// in a VM in the Linux kernel, which makes them very fast and flexible.
4
+// Many Linux subsystems now accept eBPF programs. This makes it possible
5
+// to implement highly application specific logic inside the kernel,
6
+// without having to modify the actual kernel itself.
7
+//
8
+// This package is designed for long-running processes which
9
+// want to use eBPF to implement part of their application logic. It has no
10
+// run-time dependencies outside of the library and the Linux kernel itself.
11
+// eBPF code should be compiled ahead of time using clang, and shipped with
12
+// your application as any other resource.
13
+//
14
+// This package doesn't include code required to attach eBPF to Linux
15
+// subsystems, since this varies per subsystem.
16
+package ebpf
0 17
new file mode 100644
... ...
@@ -0,0 +1,392 @@
0
+package ebpf
1
+
2
+import (
3
+	"bytes"
4
+	"debug/elf"
5
+	"encoding/binary"
6
+	"fmt"
7
+	"io"
8
+	"os"
9
+	"strings"
10
+
11
+	"github.com/cilium/ebpf/asm"
12
+
13
+	"github.com/pkg/errors"
14
+)
15
+
16
+type elfCode struct {
17
+	*elf.File
18
+	symbols           []elf.Symbol
19
+	symbolsPerSection map[elf.SectionIndex]map[uint64]string
20
+}
21
+
22
+// LoadCollectionSpec parses an ELF file into a CollectionSpec.
23
+func LoadCollectionSpec(file string) (*CollectionSpec, error) {
24
+	f, err := os.Open(file)
25
+	if err != nil {
26
+		return nil, err
27
+	}
28
+	defer f.Close()
29
+
30
+	spec, err := LoadCollectionSpecFromReader(f)
31
+	return spec, errors.Wrapf(err, "file %s", file)
32
+}
33
+
34
+// LoadCollectionSpecFromReader parses an ELF file into a CollectionSpec.
35
+func LoadCollectionSpecFromReader(code io.ReaderAt) (*CollectionSpec, error) {
36
+	f, err := elf.NewFile(code)
37
+	if err != nil {
38
+		return nil, err
39
+	}
40
+	defer f.Close()
41
+
42
+	symbols, err := f.Symbols()
43
+	if err != nil {
44
+		return nil, errors.Wrap(err, "load symbols")
45
+	}
46
+
47
+	ec := &elfCode{f, symbols, symbolsPerSection(symbols)}
48
+
49
+	var licenseSection, versionSection *elf.Section
50
+	progSections := make(map[elf.SectionIndex]*elf.Section)
51
+	relSections := make(map[elf.SectionIndex]*elf.Section)
52
+	mapSections := make(map[elf.SectionIndex]*elf.Section)
53
+	for i, sec := range ec.Sections {
54
+		switch {
55
+		case strings.HasPrefix(sec.Name, "license"):
56
+			licenseSection = sec
57
+		case strings.HasPrefix(sec.Name, "version"):
58
+			versionSection = sec
59
+		case strings.HasPrefix(sec.Name, "maps"):
60
+			mapSections[elf.SectionIndex(i)] = sec
61
+		case sec.Type == elf.SHT_REL:
62
+			if int(sec.Info) >= len(ec.Sections) {
63
+				return nil, errors.Errorf("found relocation section %v for missing section %v", i, sec.Info)
64
+			}
65
+
66
+			// Store relocations under the section index of the target
67
+			idx := elf.SectionIndex(sec.Info)
68
+			if relSections[idx] != nil {
69
+				return nil, errors.Errorf("section %d has multiple relocation sections", idx)
70
+			}
71
+			relSections[idx] = sec
72
+		case sec.Type == elf.SHT_PROGBITS && (sec.Flags&elf.SHF_EXECINSTR) != 0 && sec.Size > 0:
73
+			progSections[elf.SectionIndex(i)] = sec
74
+		}
75
+	}
76
+
77
+	license, err := loadLicense(licenseSection)
78
+	if err != nil {
79
+		return nil, errors.Wrap(err, "load license")
80
+	}
81
+
82
+	version, err := loadVersion(versionSection, ec.ByteOrder)
83
+	if err != nil {
84
+		return nil, errors.Wrap(err, "load version")
85
+	}
86
+
87
+	maps, err := ec.loadMaps(mapSections)
88
+	if err != nil {
89
+		return nil, errors.Wrap(err, "load maps")
90
+	}
91
+
92
+	progs, libs, err := ec.loadPrograms(progSections, relSections, license, version)
93
+	if err != nil {
94
+		return nil, errors.Wrap(err, "load programs")
95
+	}
96
+
97
+	if len(libs) > 0 {
98
+		for name, prog := range progs {
99
+			prog.Instructions, err = link(prog.Instructions, libs...)
100
+			if err != nil {
101
+				return nil, errors.Wrapf(err, "program %s", name)
102
+			}
103
+		}
104
+	}
105
+
106
+	return &CollectionSpec{maps, progs}, nil
107
+}
108
+
109
+func loadLicense(sec *elf.Section) (string, error) {
110
+	if sec == nil {
111
+		return "", errors.Errorf("missing license section")
112
+	}
113
+	data, err := sec.Data()
114
+	if err != nil {
115
+		return "", errors.Wrapf(err, "section %s", sec.Name)
116
+	}
117
+	return string(bytes.TrimRight(data, "\000")), nil
118
+}
119
+
120
+func loadVersion(sec *elf.Section, bo binary.ByteOrder) (uint32, error) {
121
+	if sec == nil {
122
+		return 0, nil
123
+	}
124
+
125
+	var version uint32
126
+	err := binary.Read(sec.Open(), bo, &version)
127
+	return version, errors.Wrapf(err, "section %s", sec.Name)
128
+}
129
+
130
+func (ec *elfCode) loadPrograms(progSections, relSections map[elf.SectionIndex]*elf.Section, license string, version uint32) (map[string]*ProgramSpec, []asm.Instructions, error) {
131
+	var (
132
+		progs = make(map[string]*ProgramSpec)
133
+		libs  []asm.Instructions
134
+	)
135
+	for idx, prog := range progSections {
136
+		syms := ec.symbolsPerSection[idx]
137
+		if len(syms) == 0 {
138
+			return nil, nil, errors.Errorf("section %v: missing symbols", prog.Name)
139
+		}
140
+
141
+		funcSym := syms[0]
142
+		if funcSym == "" {
143
+			return nil, nil, errors.Errorf("section %v: no label at start", prog.Name)
144
+		}
145
+
146
+		rels, err := ec.loadRelocations(relSections[idx])
147
+		if err != nil {
148
+			return nil, nil, errors.Wrapf(err, "program %s: can't load relocations", funcSym)
149
+		}
150
+
151
+		insns, err := ec.loadInstructions(prog, syms, rels)
152
+		if err != nil {
153
+			return nil, nil, errors.Wrapf(err, "program %s: can't unmarshal instructions", funcSym)
154
+		}
155
+
156
+		if progType, attachType := getProgType(prog.Name); progType == UnspecifiedProgram {
157
+			// There is no single name we can use for "library" sections,
158
+			// since they may contain multiple functions. We'll decode the
159
+			// labels they contain later on, and then link sections that way.
160
+			libs = append(libs, insns)
161
+		} else {
162
+			progs[funcSym] = &ProgramSpec{
163
+				Name:          funcSym,
164
+				Type:          progType,
165
+				AttachType:    attachType,
166
+				License:       license,
167
+				KernelVersion: version,
168
+				Instructions:  insns,
169
+			}
170
+		}
171
+	}
172
+	return progs, libs, nil
173
+}
174
+
175
+func (ec *elfCode) loadInstructions(section *elf.Section, symbols, relocations map[uint64]string) (asm.Instructions, error) {
176
+	var (
177
+		r      = section.Open()
178
+		insns  asm.Instructions
179
+		ins    asm.Instruction
180
+		offset uint64
181
+	)
182
+	for {
183
+		n, err := ins.Unmarshal(r, ec.ByteOrder)
184
+		if err == io.EOF {
185
+			return insns, nil
186
+		}
187
+		if err != nil {
188
+			return nil, errors.Wrapf(err, "offset %d", offset)
189
+		}
190
+
191
+		ins.Symbol = symbols[offset]
192
+		ins.Reference = relocations[offset]
193
+
194
+		insns = append(insns, ins)
195
+		offset += n
196
+	}
197
+}
198
+
199
+func (ec *elfCode) loadMaps(mapSections map[elf.SectionIndex]*elf.Section) (map[string]*MapSpec, error) {
200
+	var (
201
+		maps = make(map[string]*MapSpec)
202
+		b    = make([]byte, 1)
203
+	)
204
+	for idx, sec := range mapSections {
205
+		syms := ec.symbolsPerSection[idx]
206
+		if len(syms) == 0 {
207
+			return nil, errors.Errorf("section %v: no symbols", sec.Name)
208
+		}
209
+
210
+		if sec.Size%uint64(len(syms)) != 0 {
211
+			return nil, errors.Errorf("section %v: map descriptors are not of equal size", sec.Name)
212
+		}
213
+
214
+		var (
215
+			r    = sec.Open()
216
+			size = sec.Size / uint64(len(syms))
217
+		)
218
+		for i, offset := 0, uint64(0); i < len(syms); i, offset = i+1, offset+size {
219
+			mapSym := syms[offset]
220
+			if mapSym == "" {
221
+				fmt.Println(syms)
222
+				return nil, errors.Errorf("section %s: missing symbol for map at offset %d", sec.Name, offset)
223
+			}
224
+
225
+			if maps[mapSym] != nil {
226
+				return nil, errors.Errorf("section %v: map %v already exists", sec.Name, mapSym)
227
+			}
228
+
229
+			lr := io.LimitReader(r, int64(size))
230
+
231
+			var spec MapSpec
232
+			switch {
233
+			case binary.Read(lr, ec.ByteOrder, &spec.Type) != nil:
234
+				return nil, errors.Errorf("map %v: missing type", mapSym)
235
+			case binary.Read(lr, ec.ByteOrder, &spec.KeySize) != nil:
236
+				return nil, errors.Errorf("map %v: missing key size", mapSym)
237
+			case binary.Read(lr, ec.ByteOrder, &spec.ValueSize) != nil:
238
+				return nil, errors.Errorf("map %v: missing value size", mapSym)
239
+			case binary.Read(lr, ec.ByteOrder, &spec.MaxEntries) != nil:
240
+				return nil, errors.Errorf("map %v: missing max entries", mapSym)
241
+			case binary.Read(lr, ec.ByteOrder, &spec.Flags) != nil:
242
+				return nil, errors.Errorf("map %v: missing flags", mapSym)
243
+			}
244
+
245
+			for {
246
+				_, err := lr.Read(b)
247
+				if err == io.EOF {
248
+					break
249
+				}
250
+				if err != nil {
251
+					return nil, err
252
+				}
253
+				if b[0] != 0 {
254
+					return nil, errors.Errorf("map %v: unknown and non-zero fields in definition", mapSym)
255
+				}
256
+			}
257
+
258
+			maps[mapSym] = &spec
259
+		}
260
+	}
261
+	return maps, nil
262
+}
263
+
264
+func getProgType(v string) (ProgramType, AttachType) {
265
+	types := map[string]ProgramType{
266
+		// From https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/lib/bpf/libbpf.c#n3568
267
+		"socket":         SocketFilter,
268
+		"seccomp":        SocketFilter,
269
+		"kprobe/":        Kprobe,
270
+		"kretprobe/":     Kprobe,
271
+		"tracepoint/":    TracePoint,
272
+		"xdp":            XDP,
273
+		"perf_event":     PerfEvent,
274
+		"sockops":        SockOps,
275
+		"sk_skb":         SkSKB,
276
+		"sk_msg":         SkMsg,
277
+		"lirc_mode2":     LircMode2,
278
+		"flow_dissector": FlowDissector,
279
+
280
+		"cgroup_skb/":       CGroupSKB,
281
+		"cgroup/dev":        CGroupDevice,
282
+		"cgroup/skb":        CGroupSKB,
283
+		"cgroup/sock":       CGroupSock,
284
+		"cgroup/post_bind":  CGroupSock,
285
+		"cgroup/bind":       CGroupSockAddr,
286
+		"cgroup/connect":    CGroupSockAddr,
287
+		"cgroup/sendmsg":    CGroupSockAddr,
288
+		"cgroup/recvmsg":    CGroupSockAddr,
289
+		"cgroup/sysctl":     CGroupSysctl,
290
+		"cgroup/getsockopt": CGroupSockopt,
291
+		"cgroup/setsockopt": CGroupSockopt,
292
+		"classifier":        SchedCLS,
293
+		"action":            SchedACT,
294
+	}
295
+	attachTypes := map[string]AttachType{
296
+		"cgroup_skb/ingress":    AttachCGroupInetIngress,
297
+		"cgroup_skb/egress":     AttachCGroupInetEgress,
298
+		"cgroup/sock":           AttachCGroupInetSockCreate,
299
+		"cgroup/post_bind4":     AttachCGroupInet4PostBind,
300
+		"cgroup/post_bind6":     AttachCGroupInet6PostBind,
301
+		"cgroup/dev":            AttachCGroupDevice,
302
+		"sockops":               AttachCGroupSockOps,
303
+		"sk_skb/stream_parser":  AttachSkSKBStreamParser,
304
+		"sk_skb/stream_verdict": AttachSkSKBStreamVerdict,
305
+		"sk_msg":                AttachSkSKBStreamVerdict,
306
+		"lirc_mode2":            AttachLircMode2,
307
+		"flow_dissector":        AttachFlowDissector,
308
+		"cgroup/bind4":          AttachCGroupInet4Bind,
309
+		"cgroup/bind6":          AttachCGroupInet6Bind,
310
+		"cgroup/connect4":       AttachCGroupInet4Connect,
311
+		"cgroup/connect6":       AttachCGroupInet6Connect,
312
+		"cgroup/sendmsg4":       AttachCGroupUDP4Sendmsg,
313
+		"cgroup/sendmsg6":       AttachCGroupUDP6Sendmsg,
314
+		"cgroup/recvmsg4":       AttachCGroupUDP4Recvmsg,
315
+		"cgroup/recvmsg6":       AttachCGroupUDP6Recvmsg,
316
+		"cgroup/sysctl":         AttachCGroupSysctl,
317
+		"cgroup/getsockopt":     AttachCGroupGetsockopt,
318
+		"cgroup/setsockopt":     AttachCGroupSetsockopt,
319
+	}
320
+	attachType := AttachNone
321
+	for k, t := range attachTypes {
322
+		if strings.HasPrefix(v, k) {
323
+			attachType = t
324
+		}
325
+	}
326
+
327
+	for k, t := range types {
328
+		if strings.HasPrefix(v, k) {
329
+			return t, attachType
330
+		}
331
+	}
332
+	return UnspecifiedProgram, AttachNone
333
+}
334
+
335
+func (ec *elfCode) loadRelocations(sec *elf.Section) (map[uint64]string, error) {
336
+	rels := make(map[uint64]string)
337
+	if sec == nil {
338
+		return rels, nil
339
+	}
340
+
341
+	if sec.Entsize < 16 {
342
+		return nil, errors.New("rels are less than 16 bytes")
343
+	}
344
+
345
+	r := sec.Open()
346
+	for off := uint64(0); off < sec.Size; off += sec.Entsize {
347
+		ent := io.LimitReader(r, int64(sec.Entsize))
348
+
349
+		var rel elf.Rel64
350
+		if binary.Read(ent, ec.ByteOrder, &rel) != nil {
351
+			return nil, errors.Errorf("can't parse relocation at offset %v", off)
352
+		}
353
+
354
+		symNo := int(elf.R_SYM64(rel.Info) - 1)
355
+		if symNo >= len(ec.symbols) {
356
+			return nil, errors.Errorf("relocation at offset %d: symbol %v doesnt exist", off, symNo)
357
+		}
358
+
359
+		rels[rel.Off] = ec.symbols[symNo].Name
360
+	}
361
+	return rels, nil
362
+}
363
+
364
// symbolsPerSection groups symbol names by the section they belong to.
//
// The result maps a section index to the symbols of that section, keyed by
// the symbol's Value. Only named symbols of type STT_NOTYPE, STT_OBJECT or
// STT_FUNC are included. If several symbols of a section share a Value,
// the last one in symbols wins.
func symbolsPerSection(symbols []elf.Symbol) map[elf.SectionIndex]map[uint64]string {
	result := make(map[elf.SectionIndex]map[uint64]string)
	for _, sym := range symbols {
		switch elf.ST_TYPE(sym.Info) {
		case elf.STT_NOTYPE, elf.STT_OBJECT, elf.STT_FUNC:
			// STT_NOTYPE is accepted since older versions of LLVM
			// don't tag symbols correctly.
		default:
			continue
		}

		if sym.Name == "" {
			continue
		}

		perSection, ok := result[sym.Section]
		if !ok {
			perSection = make(map[uint64]string)
			result[sym.Section] = perSection
		}
		perSection[sym.Value] = sym.Name
	}
	return result
}
0 392
new file mode 100644
... ...
@@ -0,0 +1,8 @@
0
+module github.com/cilium/ebpf
1
+
2
+go 1.12
3
+
4
+require (
5
+	github.com/pkg/errors v0.8.1
6
+	golang.org/x/sys v0.0.0-20191022100944-742c48ecaeb7
7
+)
0 8
new file mode 100644
... ...
@@ -0,0 +1,64 @@
0
+package internal
1
+
2
+import (
3
+	"fmt"
4
+	"os"
5
+	"sync"
6
+
7
+	"github.com/pkg/errors"
8
+)
9
+
10
+var sysCPU struct {
11
+	once sync.Once
12
+	err  error
13
+	num  int
14
+}
15
+
16
+// PossibleCPUs returns the max number of CPUs a system may possibly have
17
+// Logical CPU numbers must be of the form 0-n
18
+func PossibleCPUs() (int, error) {
19
+	sysCPU.once.Do(func() {
20
+		sysCPU.num, sysCPU.err = parseCPUs("/sys/devices/system/cpu/possible")
21
+	})
22
+
23
+	return sysCPU.num, sysCPU.err
24
+}
25
+
26
+var onlineCPU struct {
27
+	once sync.Once
28
+	err  error
29
+	num  int
30
+}
31
+
32
+// OnlineCPUs returns the number of currently online CPUs
33
+// Logical CPU numbers must be of the form 0-n
34
+func OnlineCPUs() (int, error) {
35
+	onlineCPU.once.Do(func() {
36
+		onlineCPU.num, onlineCPU.err = parseCPUs("/sys/devices/system/cpu/online")
37
+	})
38
+
39
+	return onlineCPU.num, onlineCPU.err
40
+}
41
+
42
// parseCPUs parses the number of cpus from sysfs,
// in the format of "/sys/devices/system/cpu/{possible,online,..}".
// Logical CPU numbers must be of the form 0-n, or a lone 0.
//
// It returns n+1, or an error if the file can't be opened or its contents
// don't start with a range beginning at CPU 0.
func parseCPUs(path string) (int, error) {
	file, err := os.Open(path)
	if err != nil {
		return 0, err
	}
	defer file.Close()

	var low, high int
	// Fscanf reports an error whenever fewer than two numbers are present,
	// which is legitimate on single-CPU systems ("0"), so the number of
	// parsed items decides validity.
	n, scanErr := fmt.Fscanf(file, "%d-%d", &low, &high)
	if n < 1 {
		// Build the error explicitly: wrapping the (nil) os.Open error here
		// would produce a nil error and silently report zero CPUs.
		return 0, errors.New(fmt.Sprintf("%s has unknown format: %v", path, scanErr))
	}
	if low != 0 {
		return 0, errors.New(fmt.Sprintf("%s has unknown format", path))
	}
	if n == 1 {
		high = low
	}

	// cpus is 0 indexed
	return high + 1, nil
}
0 64
new file mode 100644
... ...
@@ -0,0 +1,24 @@
0
+package internal
1
+
2
+import (
3
+	"encoding/binary"
4
+	"unsafe"
5
+)
6
+
7
+// NativeEndian is set to either binary.BigEndian or binary.LittleEndian,
8
+// depending on the host's endianness.
9
+var NativeEndian binary.ByteOrder
10
+
11
+func init() {
12
+	if isBigEndian() {
13
+		NativeEndian = binary.BigEndian
14
+	} else {
15
+		NativeEndian = binary.LittleEndian
16
+	}
17
+}
18
+
19
// isBigEndian reports whether the host stores the most significant byte of
// an integer at the lowest address.
func isBigEndian() (ret bool) {
	probe := int(0x1)
	// The byte at the lowest address of probe is 1 on little-endian hosts
	// and 0 on big-endian ones.
	lowest := *(*byte)(unsafe.Pointer(&probe))
	ret = lowest == 0
	return ret
}
0 24
new file mode 100644
... ...
@@ -0,0 +1,85 @@
0
+package internal
1
+
2
+import (
3
+	"fmt"
4
+	"sync"
5
+
6
+	"github.com/pkg/errors"
7
+)
8
+
9
+// UnsupportedFeatureError is returned by the functions FeatureTest creates when the tested feature is unavailable.
10
+type UnsupportedFeatureError struct {
11
+	// The minimum Linux mainline version required for this feature.
12
+	// Used for the error string, and for sanity checking during testing.
13
+	MinimumVersion Version
14
+
15
+	// The name of the feature that isn't supported; it prefixes the error string.
16
+	Name string
17
+}
18
+
19
+func (ufe *UnsupportedFeatureError) Error() string {
20
+	return fmt.Sprintf("%s not supported (requires >= %s)", ufe.Name, ufe.MinimumVersion)
21
+}
22
+
23
+// FeatureTest wraps a function so that it is run at most once.
24
+//
25
+// name should identify the tested feature, while version must be in the
26
+// form Major.Minor[.Patch].
27
+//
28
+// Returns a descriptive UnsupportedFeatureError if the feature is not available.
29
+func FeatureTest(name, version string, fn func() bool) func() error {
30
+	v, err := NewVersion(version)
31
+	if err != nil {
32
+		return func() error { return err }
33
+	}
34
+
35
+	var (
36
+		once   sync.Once
37
+		result error
38
+	)
39
+
40
+	return func() error {
41
+		once.Do(func() {
42
+			if !fn() {
43
+				result = &UnsupportedFeatureError{
44
+					MinimumVersion: v,
45
+					Name:           name,
46
+				}
47
+			}
48
+		})
49
+		return result
50
+	}
51
+}
52
+
53
// A Version in the form Major.Minor.Patch.
type Version [3]uint16

// NewVersion creates a version from a string like "Major.Minor.Patch".
//
// Patch is optional and defaults to zero. An error is returned when fewer
// than two components can be parsed.
func NewVersion(ver string) (Version, error) {
	var parsed Version
	// The scan error is ignored on purpose: it is also set when the optional
	// patch component is missing. Only the number of parsed fields matters.
	fields, _ := fmt.Sscanf(ver, "%d.%d.%d", &parsed[0], &parsed[1], &parsed[2])
	if fields < 2 {
		return Version{}, errors.New(fmt.Sprintf("invalid version: %s", ver))
	}
	return parsed, nil
}

// String formats the version as vMajor.Minor, appending .Patch only when
// the patch component is non-zero.
func (v Version) String() string {
	major, minor, patch := v[0], v[1], v[2]
	if patch != 0 {
		return fmt.Sprintf("v%d.%d.%d", major, minor, patch)
	}
	return fmt.Sprintf("v%d.%d", major, minor)
}

// Less returns true if the version is less than another version, comparing
// major, then minor, then patch.
func (v Version) Less(other Version) bool {
	for i := range v {
		if v[i] != other[i] {
			return v[i] < other[i]
		}
	}
	return false
}
0 85
new file mode 100644
... ...
@@ -0,0 +1,127 @@
0
+// +build linux
1
+
2
+package unix
3
+
4
+import (
5
+	"syscall"
6
+
7
+	linux "golang.org/x/sys/unix"
8
+)
9
+
10
+const (
11
+	ENOENT                   = linux.ENOENT
12
+	EAGAIN                   = linux.EAGAIN
13
+	ENOSPC                   = linux.ENOSPC
14
+	EINVAL                   = linux.EINVAL
15
+	EPOLLIN                  = linux.EPOLLIN
16
+	BPF_OBJ_NAME_LEN         = linux.BPF_OBJ_NAME_LEN
17
+	BPF_TAG_SIZE             = linux.BPF_TAG_SIZE
18
+	SYS_BPF                  = linux.SYS_BPF
19
+	F_DUPFD_CLOEXEC          = linux.F_DUPFD_CLOEXEC
20
+	EPOLL_CTL_ADD            = linux.EPOLL_CTL_ADD
21
+	EPOLL_CLOEXEC            = linux.EPOLL_CLOEXEC
22
+	O_CLOEXEC                = linux.O_CLOEXEC
23
+	O_NONBLOCK               = linux.O_NONBLOCK
24
+	PROT_READ                = linux.PROT_READ
25
+	PROT_WRITE               = linux.PROT_WRITE
26
+	MAP_SHARED               = linux.MAP_SHARED
27
+	PERF_TYPE_SOFTWARE       = linux.PERF_TYPE_SOFTWARE
28
+	PERF_COUNT_SW_BPF_OUTPUT = linux.PERF_COUNT_SW_BPF_OUTPUT
29
+	PerfBitWatermark         = linux.PerfBitWatermark
30
+	PERF_SAMPLE_RAW          = linux.PERF_SAMPLE_RAW
31
+	PERF_FLAG_FD_CLOEXEC     = linux.PERF_FLAG_FD_CLOEXEC
32
+	RLIM_INFINITY            = linux.RLIM_INFINITY
33
+)
34
+
35
+// Statfs_t is an alias for golang.org/x/sys/unix.Statfs_t.
36
+type Statfs_t = linux.Statfs_t
37
+
38
+// Rlimit is an alias for golang.org/x/sys/unix.Rlimit.
39
+type Rlimit = linux.Rlimit
40
+
41
+// Setrlimit forwards its arguments to golang.org/x/sys/unix.Setrlimit and returns its result.
42
+func Setrlimit(resource int, rlim *Rlimit) (err error) {
43
+	return linux.Setrlimit(resource, rlim)
44
+}
45
+
46
+// Syscall forwards its arguments to golang.org/x/sys/unix.Syscall and returns its results.
47
+func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) {
48
+	return linux.Syscall(trap, a1, a2, a3)
49
+}
50
+
51
+// FcntlInt forwards its arguments to golang.org/x/sys/unix.FcntlInt and returns its results.
52
+func FcntlInt(fd uintptr, cmd, arg int) (int, error) {
53
+	return linux.FcntlInt(fd, cmd, arg)
54
+}
55
+
56
+// Statfs forwards its arguments to golang.org/x/sys/unix.Statfs and returns its result.
57
+func Statfs(path string, buf *Statfs_t) (err error) {
58
+	return linux.Statfs(path, buf)
59
+}
60
+
61
+// Close forwards fd to golang.org/x/sys/unix.Close and returns its result.
62
+func Close(fd int) (err error) {
63
+	return linux.Close(fd)
64
+}
65
+
66
+// EpollEvent is an alias for golang.org/x/sys/unix.EpollEvent.
67
+type EpollEvent = linux.EpollEvent
68
+
69
+// EpollWait forwards its arguments to golang.org/x/sys/unix.EpollWait and returns its results.
70
+func EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) {
71
+	return linux.EpollWait(epfd, events, msec)
72
+}
73
+
74
+// EpollCtl forwards its arguments to golang.org/x/sys/unix.EpollCtl and returns its result.
75
+func EpollCtl(epfd int, op int, fd int, event *EpollEvent) (err error) {
76
+	return linux.EpollCtl(epfd, op, fd, event)
77
+}
78
+
79
+// Eventfd forwards its arguments to golang.org/x/sys/unix.Eventfd and returns its results.
80
+func Eventfd(initval uint, flags int) (fd int, err error) {
81
+	return linux.Eventfd(initval, flags)
82
+}
83
+
84
+// Write forwards its arguments to golang.org/x/sys/unix.Write and returns its results.
85
+func Write(fd int, p []byte) (n int, err error) {
86
+	return linux.Write(fd, p)
87
+}
88
+
89
+// EpollCreate1 forwards flag to golang.org/x/sys/unix.EpollCreate1 and returns its results.
90
+func EpollCreate1(flag int) (fd int, err error) {
91
+	return linux.EpollCreate1(flag)
92
+}
93
+
94
+// PerfEventMmapPage is a defined type (not an alias) whose underlying type is golang.org/x/sys/unix.PerfEventMmapPage.
95
+type PerfEventMmapPage linux.PerfEventMmapPage
96
+
97
+// SetNonblock forwards its arguments to golang.org/x/sys/unix.SetNonblock and returns its result.
98
+func SetNonblock(fd int, nonblocking bool) (err error) {
99
+	return linux.SetNonblock(fd, nonblocking)
100
+}
101
+
102
+// Mmap forwards its arguments to golang.org/x/sys/unix.Mmap and returns its results.
103
+func Mmap(fd int, offset int64, length int, prot int, flags int) (data []byte, err error) {
104
+	return linux.Mmap(fd, offset, length, prot, flags)
105
+}
106
+
107
+// Munmap forwards b to golang.org/x/sys/unix.Munmap and returns its result.
108
+func Munmap(b []byte) (err error) {
109
+	return linux.Munmap(b)
110
+}
111
+
112
+// PerfEventAttr is an alias for golang.org/x/sys/unix.PerfEventAttr.
113
+type PerfEventAttr = linux.PerfEventAttr
114
+
115
+// PerfEventOpen forwards its arguments to golang.org/x/sys/unix.PerfEventOpen and returns its results.
116
+func PerfEventOpen(attr *PerfEventAttr, pid int, cpu int, groupFd int, flags int) (fd int, err error) {
117
+	return linux.PerfEventOpen(attr, pid, cpu, groupFd, flags)
118
+}
119
+
120
+// Utsname is an alias for golang.org/x/sys/unix.Utsname.
121
+type Utsname = linux.Utsname
122
+
123
+// Uname forwards buf to golang.org/x/sys/unix.Uname and returns its result.
124
+func Uname(buf *Utsname) (err error) {
125
+	return linux.Uname(buf)
126
+}
0 127
new file mode 100644
... ...
@@ -0,0 +1,193 @@
0
+// +build !linux
1
+
2
+package unix
3
+
4
+import (
5
+	"fmt"
6
+	"runtime"
7
+	"syscall"
8
+)
9
+
10
+// errNonLinux is the error returned by the stubs in this file; its message names runtime.GOOS and runtime.GOARCH.
+var errNonLinux = fmt.Errorf("unsupported platform %s/%s", runtime.GOOS, runtime.GOARCH)
11
+
12
+const (
13
+	ENOENT                   = syscall.ENOENT
14
+	EAGAIN                   = syscall.EAGAIN
15
+	ENOSPC                   = syscall.ENOSPC
16
+	EINVAL                   = syscall.EINVAL
17
+	BPF_OBJ_NAME_LEN         = 0x10
18
+	BPF_TAG_SIZE             = 0x8
19
+	SYS_BPF                  = 321
20
+	F_DUPFD_CLOEXEC          = 0x406
21
+	EPOLLIN                  = 0x1
22
+	EPOLL_CTL_ADD            = 0x1
23
+	EPOLL_CLOEXEC            = 0x80000
24
+	O_CLOEXEC                = 0x80000
25
+	O_NONBLOCK               = 0x800
26
+	PROT_READ                = 0x1
27
+	PROT_WRITE               = 0x2
28
+	MAP_SHARED               = 0x1
29
+	PERF_TYPE_SOFTWARE       = 0x1
30
+	PERF_COUNT_SW_BPF_OUTPUT = 0xa
31
+	PerfBitWatermark         = 0x4000
32
+	PERF_SAMPLE_RAW          = 0x400
33
+	PERF_FLAG_FD_CLOEXEC     = 0x8
34
+)
35
+
36
+// Statfs_t is the struct declared for non-Linux builds, where this file replaces the golang.org/x/sys/unix alias.
37
+type Statfs_t struct {
38
+	Type    int64
39
+	Bsize   int64
40
+	Blocks  uint64
41
+	Bfree   uint64
42
+	Bavail  uint64
43
+	Files   uint64
44
+	Ffree   uint64
45
+	Fsid    [2]int32
46
+	Namelen int64
47
+	Frsize  int64
48
+	Flags   int64
49
+	Spare   [4]int64
50
+}
51
+
52
+// Rlimit is the struct declared for non-Linux builds, where this file replaces the golang.org/x/sys/unix alias.
53
+type Rlimit struct {
54
+	Cur uint64
55
+	Max uint64
56
+}
57
+
58
+// Setrlimit is the non-Linux stub: it ignores its arguments and returns errNonLinux.
59
+func Setrlimit(resource int, rlim *Rlimit) (err error) {
60
+	return errNonLinux
61
+}
62
+
63
+// Syscall is the non-Linux stub: it ignores its arguments and returns 0, 0 and syscall.Errno(1).
64
+func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) {
65
+	return 0, 0, syscall.Errno(1)
66
+}
67
+
68
+// FcntlInt is the non-Linux stub: it ignores its arguments and returns -1 and errNonLinux.
69
+func FcntlInt(fd uintptr, cmd, arg int) (int, error) {
70
+	return -1, errNonLinux
71
+}
72
+
73
+// Statfs is the non-Linux stub: it leaves buf untouched and returns errNonLinux.
74
+func Statfs(path string, buf *Statfs_t) error {
75
+	return errNonLinux
76
+}
77
+
78
+// Close is the non-Linux stub: it does not close fd and returns errNonLinux.
79
+func Close(fd int) (err error) {
80
+	return errNonLinux
81
+}
82
+
83
+// EpollEvent is the struct declared for non-Linux builds, where this file replaces the golang.org/x/sys/unix alias.
84
+type EpollEvent struct {
85
+	Events uint32
86
+	Fd     int32
87
+	Pad    int32
88
+}
89
+
90
+// EpollWait is the non-Linux stub: it ignores its arguments and returns 0 and errNonLinux.
91
+func EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) {
92
+	return 0, errNonLinux
93
+}
94
+
95
+// EpollCtl is the non-Linux stub: it ignores its arguments and returns errNonLinux.
96
+func EpollCtl(epfd int, op int, fd int, event *EpollEvent) (err error) {
97
+	return errNonLinux
98
+}
99
+
100
+// Eventfd is the non-Linux stub: it ignores its arguments and returns 0 and errNonLinux.
101
+func Eventfd(initval uint, flags int) (fd int, err error) {
102
+	return 0, errNonLinux
103
+}
104
+
105
+// Write is the non-Linux stub: it writes nothing and returns 0 and errNonLinux.
106
+func Write(fd int, p []byte) (n int, err error) {
107
+	return 0, errNonLinux
108
+}
109
+
110
+// EpollCreate1 is the non-Linux stub: it ignores flag and returns 0 and errNonLinux.
111
+func EpollCreate1(flag int) (fd int, err error) {
112
+	return 0, errNonLinux
113
+}
114
+
115
+// PerfEventMmapPage is the struct declared for non-Linux builds, where this file replaces the golang.org/x/sys/unix based type.
116
+type PerfEventMmapPage struct {
117
+	Version        uint32
118
+	Compat_version uint32
119
+	Lock           uint32
120
+	Index          uint32
121
+	Offset         int64
122
+	Time_enabled   uint64
123
+	Time_running   uint64
124
+	Capabilities   uint64
125
+	Pmc_width      uint16
126
+	Time_shift     uint16
127
+	Time_mult      uint32
128
+	Time_offset    uint64
129
+	Time_zero      uint64
130
+	Size           uint32
131
+
132
+	Data_head   uint64
133
+	Data_tail   uint64
134
+	Data_offset uint64
135
+	Data_size   uint64
136
+	Aux_head    uint64
137
+	Aux_tail    uint64
138
+	Aux_offset  uint64
139
+	Aux_size    uint64
140
+}
141
+
142
+// SetNonblock is the non-Linux stub: it ignores its arguments and returns errNonLinux.
143
+func SetNonblock(fd int, nonblocking bool) (err error) {
144
+	return errNonLinux
145
+}
146
+
147
+// Mmap is the non-Linux stub: it maps nothing and returns an empty slice and errNonLinux.
148
+func Mmap(fd int, offset int64, length int, prot int, flags int) (data []byte, err error) {
149
+	return []byte{}, errNonLinux
150
+}
151
+
152
+// Munmap is the non-Linux stub: it ignores b and returns errNonLinux.
153
+func Munmap(b []byte) (err error) {
154
+	return errNonLinux
155
+}
156
+
157
+// PerfEventAttr is the struct declared for non-Linux builds, where this file replaces the golang.org/x/sys/unix alias.
158
+type PerfEventAttr struct {
159
+	Type               uint32
160
+	Size               uint32
161
+	Config             uint64
162
+	Sample             uint64
163
+	Sample_type        uint64
164
+	Read_format        uint64
165
+	Bits               uint64
166
+	Wakeup             uint32
167
+	Bp_type            uint32
168
+	Ext1               uint64
169
+	Ext2               uint64
170
+	Branch_sample_type uint64
171
+	Sample_regs_user   uint64
172
+	Sample_stack_user  uint32
173
+	Clockid            int32
174
+	Sample_regs_intr   uint64
175
+	Aux_watermark      uint32
176
+	Sample_max_stack   uint16
177
+}
178
+
179
+// PerfEventOpen is the non-Linux stub: it ignores its arguments and returns 0 and errNonLinux.
180
+func PerfEventOpen(attr *PerfEventAttr, pid int, cpu int, groupFd int, flags int) (fd int, err error) {
181
+	return 0, errNonLinux
182
+}
183
+
184
// Utsname is the struct declared for non-Linux builds. It only carries the
// Release field, a fixed 65-byte buffer.
type Utsname struct {
	Release [65]byte
}
188
+
189
+// Uname is the non-Linux stub: it leaves buf untouched and returns errNonLinux.
190
+func Uname(buf *Utsname) (err error) {
191
+	return errNonLinux
192
+}
0 193
new file mode 100644
... ...
@@ -0,0 +1,58 @@
0
+package ebpf
1
+
2
+import (
3
+	"github.com/cilium/ebpf/asm"
4
+)
5
+
6
+// link resolves bpf-to-bpf calls.
7
+//
8
+// Each section may contain multiple functions / labels, and is only linked
9
+// if the program being edited references one of these functions.
10
+//
11
+// Sections must not require linking themselves.
12
+func link(insns asm.Instructions, sections ...asm.Instructions) (asm.Instructions, error) {
13
+	for _, section := range sections {
14
+		var err error
15
+		insns, err = linkSection(insns, section)
16
+		if err != nil {
17
+			return nil, err
18
+		}
19
+	}
20
+	return insns, nil
21
+}
22
+
23
+func linkSection(insns, section asm.Instructions) (asm.Instructions, error) {
24
+	// A map of symbols to the libraries which contain them.
25
+	symbols, err := section.SymbolOffsets()
26
+	if err != nil {
27
+		return nil, err
28
+	}
29
+
30
+	for _, ins := range insns {
31
+		if ins.Reference == "" {
32
+			continue
33
+		}
34
+
35
+		if ins.OpCode.JumpOp() != asm.Call || ins.Src != asm.R1 {
36
+			continue
37
+		}
38
+
39
+		if ins.Constant != -1 {
40
+			// This is already a valid call, no need to link again.
41
+			continue
42
+		}
43
+
44
+		if _, ok := symbols[ins.Reference]; !ok {
45
+			// Symbol isn't available in this section
46
+			continue
47
+		}
48
+
49
+		// At this point we know that at least one function in the
50
+		// library is called from insns. Merge the two sections.
51
+		// The rewrite of ins.Constant happens in asm.Instruction.Marshal.
52
+		return append(insns, section...), nil
53
+	}
54
+
55
+	// None of the functions in the section are called. Do nothing.
56
+	return insns, nil
57
+}
0 58
new file mode 100644
... ...
@@ -0,0 +1,604 @@
0
+package ebpf
1
+
2
+import (
3
+	"fmt"
4
+	"unsafe"
5
+
6
+	"github.com/cilium/ebpf/internal"
7
+	"github.com/cilium/ebpf/internal/unix"
8
+
9
+	"github.com/pkg/errors"
10
+)
11
+
12
+// MapSpec describes the parameters NewMap uses to create a Map.
13
+type MapSpec struct {
14
+	// Name is passed to the kernel as a debug aid. Must only contain
15
+	// alphanumeric and '_' characters.
16
+	Name       string
17
+	Type       MapType
18
+	KeySize    uint32
19
+	ValueSize  uint32
20
+	MaxEntries uint32
21
+	Flags      uint32
22
+	// InnerMap is used as a template for ArrayOfMaps and HashOfMaps; NewMap requires it for those types.
23
+	InnerMap *MapSpec
24
+}
25
+
26
+func (ms *MapSpec) String() string {
27
+	return fmt.Sprintf("%s(keySize=%d, valueSize=%d, maxEntries=%d, flags=%d)", ms.Type, ms.KeySize, ms.ValueSize, ms.MaxEntries, ms.Flags)
28
+}
29
+
30
+// Copy returns a copy of the spec.
31
+func (ms *MapSpec) Copy() *MapSpec {
32
+	if ms == nil {
33
+		return nil
34
+	}
35
+
36
+	cpy := *ms
37
+	cpy.InnerMap = ms.InnerMap.Copy()
38
+	return &cpy
39
+}
40
+
41
+// Map represents a Map file descriptor.
42
+//
43
+// It is not safe to close a map which is used by other goroutines.
44
+//
45
+// Methods which take interface{} arguments by default encode
46
+// them using binary.Read/Write in the machine's native endianness.
47
+//
48
+// Implement encoding.BinaryMarshaler or encoding.BinaryUnmarshaler
49
+// if you require custom encoding.
50
+type Map struct {
51
+	name string
52
+	fd   *bpfFD
53
+	abi  MapABI
54
+	// Size in bytes of the buffer used for lookups; per CPU maps return values larger than the size in the spec (see newMap).
55
+	fullValueSize int
56
+}
57
+
58
+// NewMapFromFD creates a map from a raw fd.
59
+//
60
+// You should not use fd after calling this function.
61
+func NewMapFromFD(fd int) (*Map, error) {
62
+	if fd < 0 {
63
+		return nil, errors.New("invalid fd")
64
+	}
65
+	bpfFd := newBPFFD(uint32(fd))
66
+
67
+	name, abi, err := newMapABIFromFd(bpfFd)
68
+	if err != nil {
69
+		bpfFd.forget()
70
+		return nil, err
71
+	}
72
+	return newMap(bpfFd, name, abi)
73
+}
74
+
75
+// NewMap creates a new Map.
76
+//
77
+// Creating a map for the first time will perform feature detection
78
+// by creating small, temporary maps.
79
+func NewMap(spec *MapSpec) (*Map, error) {
80
+	if spec.Type != ArrayOfMaps && spec.Type != HashOfMaps {
81
+		return createMap(spec, nil)
82
+	}
83
+
84
+	if spec.InnerMap == nil {
85
+		return nil, errors.Errorf("%s requires InnerMap", spec.Type)
86
+	}
87
+
88
+	template, err := createMap(spec.InnerMap, nil)
89
+	if err != nil {
90
+		return nil, err
91
+	}
92
+	defer template.Close()
93
+
94
+	return createMap(spec, template.fd)
95
+}
96
+
97
+func createMap(spec *MapSpec, inner *bpfFD) (*Map, error) {
98
+	spec = spec.Copy()
99
+
100
+	switch spec.Type {
101
+	case ArrayOfMaps:
102
+		fallthrough
103
+	case HashOfMaps:
104
+		if err := haveNestedMaps(); err != nil {
105
+			return nil, err
106
+		}
107
+
108
+		if spec.ValueSize != 0 && spec.ValueSize != 4 {
109
+			return nil, errors.Errorf("ValueSize must be zero or four for map of map")
110
+		}
111
+		spec.ValueSize = 4
112
+
113
+	case PerfEventArray:
114
+		if spec.KeySize != 0 {
115
+			return nil, errors.Errorf("KeySize must be zero for perf event array")
116
+		}
117
+		if spec.ValueSize != 0 {
118
+			return nil, errors.Errorf("ValueSize must be zero for perf event array")
119
+		}
120
+		if spec.MaxEntries == 0 {
121
+			n, err := internal.OnlineCPUs()
122
+			if err != nil {
123
+				return nil, errors.Wrap(err, "perf event array")
124
+			}
125
+			spec.MaxEntries = uint32(n)
126
+		}
127
+
128
+		spec.KeySize = 4
129
+		spec.ValueSize = 4
130
+	}
131
+
132
+	attr := bpfMapCreateAttr{
133
+		mapType:    spec.Type,
134
+		keySize:    spec.KeySize,
135
+		valueSize:  spec.ValueSize,
136
+		maxEntries: spec.MaxEntries,
137
+		flags:      spec.Flags,
138
+	}
139
+
140
+	if inner != nil {
141
+		var err error
142
+		attr.innerMapFd, err = inner.value()
143
+		if err != nil {
144
+			return nil, errors.Wrap(err, "map create")
145
+		}
146
+	}
147
+
148
+	name, err := newBPFObjName(spec.Name)
149
+	if err != nil {
150
+		return nil, errors.Wrap(err, "map create")
151
+	}
152
+
153
+	if haveObjName() == nil {
154
+		attr.mapName = name
155
+	}
156
+
157
+	fd, err := bpfMapCreate(&attr)
158
+	if err != nil {
159
+		return nil, errors.Wrap(err, "map create")
160
+	}
161
+
162
+	return newMap(fd, spec.Name, newMapABIFromSpec(spec))
163
+}
164
+
165
+func newMap(fd *bpfFD, name string, abi *MapABI) (*Map, error) {
166
+	m := &Map{
167
+		name,
168
+		fd,
169
+		*abi,
170
+		int(abi.ValueSize),
171
+	}
172
+
173
+	if !abi.Type.hasPerCPUValue() {
174
+		return m, nil
175
+	}
176
+
177
+	possibleCPUs, err := internal.PossibleCPUs()
178
+	if err != nil {
179
+		return nil, err
180
+	}
181
+
182
+	m.fullValueSize = align(int(abi.ValueSize), 8) * possibleCPUs
183
+	return m, nil
184
+}
185
+
186
+func (m *Map) String() string {
187
+	if m.name != "" {
188
+		return fmt.Sprintf("%s(%s)#%v", m.abi.Type, m.name, m.fd)
189
+	}
190
+	return fmt.Sprintf("%s#%v", m.abi.Type, m.fd)
191
+}
192
+
193
+// ABI returns a copy of the ABI description of the Map.
194
+func (m *Map) ABI() MapABI {
195
+	return m.abi
196
+}
197
+
198
+// Lookup retrieves a value from a Map.
199
+//
200
+// Calls Close() on valueOut if it is of type **Map or **Program,
201
+// and *valueOut is not nil.
202
+//
203
+// Returns an error if the key doesn't exist, see IsNotExist.
204
+func (m *Map) Lookup(key, valueOut interface{}) error {
205
+	valuePtr, valueBytes := makeBuffer(valueOut, m.fullValueSize)
206
+
207
+	if err := m.lookup(key, valuePtr); err != nil {
208
+		return err
209
+	}
210
+
211
+	if valueBytes == nil {
212
+		return nil
213
+	}
214
+
215
+	if m.abi.Type.hasPerCPUValue() {
216
+		return unmarshalPerCPUValue(valueOut, int(m.abi.ValueSize), valueBytes)
217
+	}
218
+
219
+	switch value := valueOut.(type) {
220
+	case **Map:
221
+		m, err := unmarshalMap(valueBytes)
222
+		if err != nil {
223
+			return err
224
+		}
225
+
226
+		(*value).Close()
227
+		*value = m
228
+		return nil
229
+	case *Map:
230
+		return errors.Errorf("can't unmarshal into %T, need %T", value, (**Map)(nil))
231
+	case Map:
232
+		return errors.Errorf("can't unmarshal into %T, need %T", value, (**Map)(nil))
233
+
234
+	case **Program:
235
+		p, err := unmarshalProgram(valueBytes)
236
+		if err != nil {
237
+			return err
238
+		}
239
+
240
+		(*value).Close()
241
+		*value = p
242
+		return nil
243
+	case *Program:
244
+		return errors.Errorf("can't unmarshal into %T, need %T", value, (**Program)(nil))
245
+	case Program:
246
+		return errors.Errorf("can't unmarshal into %T, need %T", value, (**Program)(nil))
247
+
248
+	default:
249
+		return unmarshalBytes(valueOut, valueBytes)
250
+	}
251
+}
252
+
253
+// LookupBytes gets a value from Map.
254
+//
255
+// Returns a nil value if a key doesn't exist.
256
+func (m *Map) LookupBytes(key interface{}) ([]byte, error) {
257
+	valueBytes := make([]byte, m.fullValueSize)
258
+	valuePtr := newPtr(unsafe.Pointer(&valueBytes[0]))
259
+
260
+	err := m.lookup(key, valuePtr)
261
+	if IsNotExist(err) {
262
+		return nil, nil
263
+	}
264
+
265
+	return valueBytes, err
266
+}
267
+
268
+func (m *Map) lookup(key interface{}, valueOut syscallPtr) error {
269
+	keyPtr, err := marshalPtr(key, int(m.abi.KeySize))
270
+	if err != nil {
271
+		return errors.WithMessage(err, "can't marshal key")
272
+	}
273
+
274
+	err = bpfMapLookupElem(m.fd, keyPtr, valueOut)
275
+	return errors.WithMessage(err, "lookup failed")
276
+}
277
+
278
+// MapUpdateFlags controls the behaviour of the Map.Update call.
279
+//
280
+// The exact semantics depend on the specific MapType.
281
+type MapUpdateFlags uint64
282
+
283
+const (
284
+	// UpdateAny creates a new element or updates an existing one (value 0).
285
+	UpdateAny MapUpdateFlags = iota
286
+	// UpdateNoExist creates a new element (value 1).
287
+	UpdateNoExist MapUpdateFlags = 1 << (iota - 1)
288
+	// UpdateExist updates an existing element (value 2).
289
+	UpdateExist
290
+)
291
+
292
+// Put replaces or creates a value in the map.
293
+//
294
+// It is equivalent to calling Update with UpdateAny and returns Update's error.
295
+func (m *Map) Put(key, value interface{}) error {
296
+	return m.Update(key, value, UpdateAny)
297
+}
298
+
299
+// Update changes the value of a key.
300
+func (m *Map) Update(key, value interface{}, flags MapUpdateFlags) error {
301
+	keyPtr, err := marshalPtr(key, int(m.abi.KeySize))
302
+	if err != nil {
303
+		return errors.WithMessage(err, "can't marshal key")
304
+	}
305
+
306
+	var valuePtr syscallPtr
307
+	if m.abi.Type.hasPerCPUValue() {
308
+		valuePtr, err = marshalPerCPUValue(value, int(m.abi.ValueSize))
309
+	} else {
310
+		valuePtr, err = marshalPtr(value, int(m.abi.ValueSize))
311
+	}
312
+	if err != nil {
313
+		return errors.WithMessage(err, "can't marshal value")
314
+	}
315
+
316
+	return bpfMapUpdateElem(m.fd, keyPtr, valuePtr, uint64(flags))
317
+}
318
+
319
+// Delete removes a value.
320
+//
321
+// Returns an error if the key does not exist, see IsNotExist.
322
+func (m *Map) Delete(key interface{}) error {
323
+	keyPtr, err := marshalPtr(key, int(m.abi.KeySize))
324
+	if err != nil {
325
+		return errors.WithMessage(err, "can't marshal key")
326
+	}
327
+
328
+	err = bpfMapDeleteElem(m.fd, keyPtr)
329
+	return errors.WithMessage(err, "can't delete key")
330
+}
331
+
332
+// NextKey finds the key following an initial key.
333
+//
334
+// See NextKeyBytes for details.
335
+func (m *Map) NextKey(key, nextKeyOut interface{}) error {
336
+	nextKeyPtr, nextKeyBytes := makeBuffer(nextKeyOut, int(m.abi.KeySize))
337
+
338
+	if err := m.nextKey(key, nextKeyPtr); err != nil {
339
+		return err
340
+	}
341
+
342
+	if nextKeyBytes == nil {
343
+		return nil
344
+	}
345
+
346
+	err := unmarshalBytes(nextKeyOut, nextKeyBytes)
347
+	return errors.WithMessage(err, "can't unmarshal next key")
348
+}
349
+
350
+// NextKeyBytes returns the key following an initial key as a byte slice.
351
+//
352
+// Passing nil will return the first key.
353
+//
354
+// Use Iterate if you want to traverse all entries in the map.
355
+func (m *Map) NextKeyBytes(key interface{}) ([]byte, error) {
356
+	nextKey := make([]byte, m.abi.KeySize)
357
+	nextKeyPtr := newPtr(unsafe.Pointer(&nextKey[0]))
358
+
359
+	err := m.nextKey(key, nextKeyPtr)
360
+	if IsNotExist(err) {
361
+		return nil, nil
362
+	}
363
+
364
+	return nextKey, err
365
+}
366
+
367
+func (m *Map) nextKey(key interface{}, nextKeyOut syscallPtr) error {
368
+	var (
369
+		keyPtr syscallPtr
370
+		err    error
371
+	)
372
+
373
+	if key != nil {
374
+		keyPtr, err = marshalPtr(key, int(m.abi.KeySize))
375
+		if err != nil {
376
+			return errors.WithMessage(err, "can't marshal key")
377
+		}
378
+	}
379
+
380
+	err = bpfMapGetNextKey(m.fd, keyPtr, nextKeyOut)
381
+	return errors.WithMessage(err, "can't get next key")
382
+}
383
+
384
+// Iterate returns a new MapIterator that traverses the map.
385
+//
386
+// It's safe to create multiple iterators at the same time.
387
+//
388
+// It's not possible to guarantee that all keys in a map will be
389
+// returned if there are concurrent modifications to the map.
390
+func (m *Map) Iterate() *MapIterator {
391
+	return newMapIterator(m)
392
+}
393
+
394
+// Close removes a Map
395
+func (m *Map) Close() error {
396
+	if m == nil {
397
+		// This makes it easier to clean up when iterating maps
398
+		// of maps / programs.
399
+		return nil
400
+	}
401
+
402
+	return m.fd.close()
403
+}
404
+
405
+// FD gets the file descriptor of the Map.
406
+//
407
+// Calling this function is invalid after Close has been called.
408
+func (m *Map) FD() int {
409
+	fd, err := m.fd.value()
410
+	if err != nil {
411
+		// Best effort: -1 is the number most likely to be an
412
+		// invalid file descriptor.
413
+		return -1
414
+	}
415
+
416
+	return int(fd)
417
+}
418
+
419
+// Clone creates a duplicate of the Map.
420
+//
421
+// Closing the duplicate does not affect the original, and vice versa.
422
+// Changes made to the map are reflected by both instances however.
423
+//
424
+// Cloning a nil Map returns nil.
425
+func (m *Map) Clone() (*Map, error) {
426
+	if m == nil {
427
+		return nil, nil
428
+	}
429
+
430
+	dup, err := m.fd.dup()
431
+	if err != nil {
432
+		return nil, errors.Wrap(err, "can't clone map")
433
+	}
434
+
435
+	return newMap(dup, m.name, &m.abi)
436
+}
437
+
438
+// Pin persists the map past the lifetime of the process that created it by pinning its fd at fileName.
439
+//
440
+// This requires bpffs to be mounted above fileName. See http://cilium.readthedocs.io/en/doc-1.0/kubernetes/install/#mounting-the-bpf-fs-optional
441
+func (m *Map) Pin(fileName string) error {
442
+	return bpfPinObject(fileName, m.fd)
443
+}
444
+
445
+// LoadPinnedMap load a Map from a BPF file.
446
+//
447
+// The function is not compatible with nested maps.
448
+// Use LoadPinnedMapExplicit in these situations.
449
+func LoadPinnedMap(fileName string) (*Map, error) {
450
+	fd, err := bpfGetObject(fileName)
451
+	if err != nil {
452
+		return nil, err
453
+	}
454
+	name, abi, err := newMapABIFromFd(fd)
455
+	if err != nil {
456
+		_ = fd.close()
457
+		return nil, err
458
+	}
459
+	return newMap(fd, name, abi)
460
+}
461
+
462
+// LoadPinnedMapExplicit loads a map with explicit parameters.
463
+func LoadPinnedMapExplicit(fileName string, abi *MapABI) (*Map, error) {
464
+	fd, err := bpfGetObject(fileName)
465
+	if err != nil {
466
+		return nil, err
467
+	}
468
+	return newMap(fd, "", abi)
469
+}
470
+
471
+func unmarshalMap(buf []byte) (*Map, error) {
472
+	if len(buf) != 4 {
473
+		return nil, errors.New("map id requires 4 byte value")
474
+	}
475
+
476
+	// Looking up an entry in a nested map or prog array returns an id,
477
+	// not an fd.
478
+	id := internal.NativeEndian.Uint32(buf)
479
+	fd, err := bpfGetMapFDByID(id)
480
+	if err != nil {
481
+		return nil, err
482
+	}
483
+
484
+	name, abi, err := newMapABIFromFd(fd)
485
+	if err != nil {
486
+		_ = fd.close()
487
+		return nil, err
488
+	}
489
+
490
+	return newMap(fd, name, abi)
491
+}
492
+
493
+// MarshalBinary implements BinaryMarshaler.
494
+func (m *Map) MarshalBinary() ([]byte, error) {
495
+	fd, err := m.fd.value()
496
+	if err != nil {
497
+		return nil, err
498
+	}
499
+
500
+	buf := make([]byte, 4)
501
+	internal.NativeEndian.PutUint32(buf, fd)
502
+	return buf, nil
503
+}
504
+
505
+// MapIterator iterates over the keys and values of a Map.
506
+//
507
+// See Map.Iterate.
508
+type MapIterator struct {
509
+	target            *Map
510
+	prevKey           interface{}
511
+	prevBytes         []byte
512
+	count, maxEntries uint32
513
+	done              bool
514
+	err               error
515
+}
516
+
517
+func newMapIterator(target *Map) *MapIterator {
518
+	return &MapIterator{
519
+		target:     target,
520
+		maxEntries: target.abi.MaxEntries,
521
+		prevBytes:  make([]byte, int(target.abi.KeySize)),
522
+	}
523
+}
524
+
525
+var errIterationAborted = errors.New("iteration aborted")
526
+
527
+// Next decodes the next key and value.
528
+//
529
+// Iterating a hash map from which keys are being deleted is not
530
+// safe. You may see the same key multiple times. Iteration may
531
+// also abort with an error, see IsIterationAborted.
532
+//
533
+// Returns false if there are no more entries. You must check
534
+// the result of Err afterwards.
535
+//
536
+// See Map.Get for further caveats around valueOut.
537
+func (mi *MapIterator) Next(keyOut, valueOut interface{}) bool {
538
+	if mi.err != nil || mi.done {
539
+		return false
540
+	}
541
+
542
+	for ; mi.count < mi.maxEntries; mi.count++ {
543
+		var nextBytes []byte
544
+		nextBytes, mi.err = mi.target.NextKeyBytes(mi.prevKey)
545
+		if mi.err != nil {
546
+			return false
547
+		}
548
+
549
+		if nextBytes == nil {
550
+			mi.done = true
551
+			return false
552
+		}
553
+
554
+		// The user can get access to nextBytes since unmarshalBytes
555
+		// does not copy when unmarshaling into a []byte.
556
+		// Make a copy to prevent accidental corruption of
557
+		// iterator state.
558
+		copy(mi.prevBytes, nextBytes)
559
+		mi.prevKey = mi.prevBytes
560
+
561
+		mi.err = mi.target.Lookup(nextBytes, valueOut)
562
+		if IsNotExist(mi.err) {
563
+			// Even though the key should be valid, we couldn't look up
564
+			// its value. If we're iterating a hash map this is probably
565
+			// because a concurrent delete removed the value before we
566
+			// could get it. This means that the next call to NextKeyBytes
567
+			// is very likely to restart iteration.
568
+			// If we're iterating one of the fd maps like
569
+			// ProgramArray it means that a given slot doesn't have
570
+			// a valid fd associated. It's OK to continue to the next slot.
571
+			continue
572
+		}
573
+		if mi.err != nil {
574
+			return false
575
+		}
576
+
577
+		mi.err = unmarshalBytes(keyOut, nextBytes)
578
+		return mi.err == nil
579
+	}
580
+
581
+	mi.err = errIterationAborted
582
+	return false
583
+}
584
+
585
+// Err returns any encountered error.
586
+//
587
+// The method must be called after Next returns false.
588
+func (mi *MapIterator) Err() error {
589
+	return mi.err
590
+}
591
+
592
+// IsNotExist returns true if the error indicates that a
593
+// key doesn't exist.
594
+func IsNotExist(err error) bool {
595
+	return errors.Cause(err) == unix.ENOENT
596
+}
597
+
598
+// IsIterationAborted returns true if the iteration was aborted.
599
+//
600
+// This occurs when keys are deleted from a hash map during iteration.
601
+func IsIterationAborted(err error) bool {
602
+	return errors.Cause(err) == errIterationAborted
603
+}
0 604
new file mode 100644
... ...
@@ -0,0 +1,192 @@
0
+package ebpf
1
+
2
+import (
3
+	"bytes"
4
+	"encoding"
5
+	"encoding/binary"
6
+	"reflect"
7
+	"runtime"
8
+	"unsafe"
9
+
10
+	"github.com/cilium/ebpf/internal"
11
+
12
+	"github.com/pkg/errors"
13
+)
14
+
15
+func marshalPtr(data interface{}, length int) (syscallPtr, error) {
16
+	if ptr, ok := data.(unsafe.Pointer); ok {
17
+		return newPtr(ptr), nil
18
+	}
19
+
20
+	buf, err := marshalBytes(data, length)
21
+	if err != nil {
22
+		return syscallPtr{}, err
23
+	}
24
+
25
+	return newPtr(unsafe.Pointer(&buf[0])), nil
26
+}
27
+
28
+func marshalBytes(data interface{}, length int) (buf []byte, err error) {
29
+	switch value := data.(type) {
30
+	case encoding.BinaryMarshaler:
31
+		buf, err = value.MarshalBinary()
32
+	case string:
33
+		buf = []byte(value)
34
+	case []byte:
35
+		buf = value
36
+	case unsafe.Pointer:
37
+		err = errors.New("can't marshal from unsafe.Pointer")
38
+	default:
39
+		var wr bytes.Buffer
40
+		err = binary.Write(&wr, internal.NativeEndian, value)
41
+		err = errors.Wrapf(err, "encoding %T", value)
42
+		buf = wr.Bytes()
43
+	}
44
+	if err != nil {
45
+		return nil, err
46
+	}
47
+
48
+	if len(buf) != length {
49
+		return nil, errors.Errorf("%T doesn't marshal to %d bytes", data, length)
50
+	}
51
+	return buf, nil
52
+}
53
+
54
+func makeBuffer(dst interface{}, length int) (syscallPtr, []byte) {
55
+	if ptr, ok := dst.(unsafe.Pointer); ok {
56
+		return newPtr(ptr), nil
57
+	}
58
+
59
+	buf := make([]byte, length)
60
+	return newPtr(unsafe.Pointer(&buf[0])), buf
61
+}
62
+
63
// unmarshalBytes decodes buf into data, dispatching on the dynamic type
// of data:
//
//   - unsafe.Pointer: len(buf) bytes are copied to the memory it points to.
//   - encoding.BinaryUnmarshaler: UnmarshalBinary is invoked with buf.
//   - *string: receives a string copy of buf.
//   - *[]byte: receives buf itself, without copying.
//   - string and []byte values are rejected, a pointer is required.
//   - anything else is decoded via binary.Read in native byte order.
func unmarshalBytes(data interface{}, buf []byte) error {
	switch value := data.(type) {
	case unsafe.Pointer:
		// Build a byte slice that aliases the destination memory so that
		// copy can be used to transfer the bytes.
		sh := &reflect.SliceHeader{
			Data: uintptr(value),
			Len:  len(buf),
			Cap:  len(buf),
		}

		dst := *(*[]byte)(unsafe.Pointer(sh))
		copy(dst, buf)
		runtime.KeepAlive(value)
		return nil
	case encoding.BinaryUnmarshaler:
		return value.UnmarshalBinary(buf)
	case *string:
		*value = string(buf)
		return nil
	case *[]byte:
		// No copy is made: the caller ends up sharing buf.
		*value = buf
		return nil
	case string:
		return errors.New("require pointer to string")
	case []byte:
		return errors.New("require pointer to []byte")
	default:
		rd := bytes.NewReader(buf)
		err := binary.Read(rd, internal.NativeEndian, value)
		return errors.Wrapf(err, "decoding %T", value)
	}
}
94
+
95
+// marshalPerCPUValue encodes a slice containing one value per
96
+// possible CPU into a buffer of bytes.
97
+//
98
+// Values are initialized to zero if the slice has less elements than CPUs.
99
+//
100
+// slice must have a type like []elementType.
101
+func marshalPerCPUValue(slice interface{}, elemLength int) (syscallPtr, error) {
102
+	sliceType := reflect.TypeOf(slice)
103
+	if sliceType.Kind() != reflect.Slice {
104
+		return syscallPtr{}, errors.New("per-CPU value requires slice")
105
+	}
106
+
107
+	possibleCPUs, err := internal.PossibleCPUs()
108
+	if err != nil {
109
+		return syscallPtr{}, err
110
+	}
111
+
112
+	sliceValue := reflect.ValueOf(slice)
113
+	sliceLen := sliceValue.Len()
114
+	if sliceLen > possibleCPUs {
115
+		return syscallPtr{}, errors.Errorf("per-CPU value exceeds number of CPUs")
116
+	}
117
+
118
+	alignedElemLength := align(elemLength, 8)
119
+	buf := make([]byte, alignedElemLength*possibleCPUs)
120
+
121
+	for i := 0; i < sliceLen; i++ {
122
+		elem := sliceValue.Index(i).Interface()
123
+		elemBytes, err := marshalBytes(elem, elemLength)
124
+		if err != nil {
125
+			return syscallPtr{}, err
126
+		}
127
+
128
+		offset := i * alignedElemLength
129
+		copy(buf[offset:offset+elemLength], elemBytes)
130
+	}
131
+
132
+	return newPtr(unsafe.Pointer(&buf[0])), nil
133
+}
134
+
135
+// unmarshalPerCPUValue decodes a buffer into a slice containing one value per
136
+// possible CPU.
137
+//
138
+// valueOut must have a type like *[]elementType
139
+func unmarshalPerCPUValue(slicePtr interface{}, elemLength int, buf []byte) error {
140
+	slicePtrType := reflect.TypeOf(slicePtr)
141
+	if slicePtrType.Kind() != reflect.Ptr || slicePtrType.Elem().Kind() != reflect.Slice {
142
+		return errors.Errorf("per-cpu value requires pointer to slice")
143
+	}
144
+
145
+	possibleCPUs, err := internal.PossibleCPUs()
146
+	if err != nil {
147
+		return err
148
+	}
149
+
150
+	sliceType := slicePtrType.Elem()
151
+	slice := reflect.MakeSlice(sliceType, possibleCPUs, possibleCPUs)
152
+
153
+	sliceElemType := sliceType.Elem()
154
+	sliceElemIsPointer := sliceElemType.Kind() == reflect.Ptr
155
+	if sliceElemIsPointer {
156
+		sliceElemType = sliceElemType.Elem()
157
+	}
158
+
159
+	step := len(buf) / possibleCPUs
160
+	if step < elemLength {
161
+		return errors.Errorf("per-cpu element length is larger than available data")
162
+	}
163
+	for i := 0; i < possibleCPUs; i++ {
164
+		var elem interface{}
165
+		if sliceElemIsPointer {
166
+			newElem := reflect.New(sliceElemType)
167
+			slice.Index(i).Set(newElem)
168
+			elem = newElem.Interface()
169
+		} else {
170
+			elem = slice.Index(i).Addr().Interface()
171
+		}
172
+
173
+		// Make a copy, since unmarshal can hold on to itemBytes
174
+		elemBytes := make([]byte, elemLength)
175
+		copy(elemBytes, buf[:elemLength])
176
+
177
+		err := unmarshalBytes(elem, elemBytes)
178
+		if err != nil {
179
+			return errors.Wrapf(err, "cpu %d", i)
180
+		}
181
+
182
+		buf = buf[step:]
183
+	}
184
+
185
+	reflect.ValueOf(slicePtr).Elem().Set(slice)
186
+	return nil
187
+}
188
+
189
// align rounds n up to the nearest multiple of alignment.
//
// alignment must not be zero. The result is only a round-up for
// non-negative n.
func align(n, alignment int) int {
	return (n + alignment - 1) / alignment * alignment
}
0 192
new file mode 100644
... ...
@@ -0,0 +1,504 @@
0
+package ebpf
1
+
2
+import (
3
+	"bytes"
4
+	"fmt"
5
+	"math"
6
+	"strings"
7
+	"time"
8
+	"unsafe"
9
+
10
+	"github.com/cilium/ebpf/asm"
11
+	"github.com/cilium/ebpf/internal"
12
+	"github.com/cilium/ebpf/internal/unix"
13
+
14
+	"github.com/pkg/errors"
15
+)
16
+
17
+const (
18
+	// Number of bytes to pad the output buffer for BPF_PROG_TEST_RUN.
19
+	// This is currently the maximum of spare space allocated for SKB
20
+	// and XDP programs, and equal to XDP_PACKET_HEADROOM + NET_IP_ALIGN.
21
+	outputPad = 256 + 2
22
+)
23
+
24
+// DefaultVerifierLogSize is the default number of bytes allocated for the
25
+// verifier log.
26
+const DefaultVerifierLogSize = 64 * 1024
27
+
28
+// ProgramOptions control loading a program into the kernel.
29
+type ProgramOptions struct {
30
+	// Controls the detail emitted by the kernel verifier. Set to non-zero
31
+	// to enable logging.
32
+	LogLevel uint32
33
+	// Controls the output buffer size for the verifier. Defaults to
34
+	// DefaultVerifierLogSize.
35
+	LogSize int
36
+}
37
+
38
// ProgramSpec defines a Program.
type ProgramSpec struct {
	// Name is passed to the kernel as a debug aid. Must only contain
	// alphanumeric and '_' characters.
	Name          string
	Type          ProgramType
	AttachType    AttachType
	Instructions  asm.Instructions
	License       string
	KernelVersion uint32
}
49
+
50
+// Copy returns a copy of the spec.
51
+func (ps *ProgramSpec) Copy() *ProgramSpec {
52
+	if ps == nil {
53
+		return nil
54
+	}
55
+
56
+	cpy := *ps
57
+	cpy.Instructions = make(asm.Instructions, len(ps.Instructions))
58
+	copy(cpy.Instructions, ps.Instructions)
59
+	return &cpy
60
+}
61
+
62
+// Program represents BPF program loaded into the kernel.
63
+//
64
+// It is not safe to close a Program which is used by other goroutines.
65
+type Program struct {
66
+	// Contains the output of the kernel verifier if enabled,
67
+	// otherwise it is empty.
68
+	VerifierLog string
69
+
70
+	fd   *bpfFD
71
+	name string
72
+	abi  ProgramABI
73
+}
74
+
75
+// NewProgram creates a new Program.
76
+//
77
+// Loading a program for the first time will perform
78
+// feature detection by loading small, temporary programs.
79
+func NewProgram(spec *ProgramSpec) (*Program, error) {
80
+	return NewProgramWithOptions(spec, ProgramOptions{})
81
+}
82
+
83
+// NewProgramWithOptions creates a new Program.
84
+//
85
+// Loading a program for the first time will perform
86
+// feature detection by loading small, temporary programs.
87
+func NewProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, error) {
88
+	attr, err := convertProgramSpec(spec)
89
+	if err != nil {
90
+		return nil, err
91
+	}
92
+
93
+	logSize := DefaultVerifierLogSize
94
+	if opts.LogSize > 0 {
95
+		logSize = opts.LogSize
96
+	}
97
+
98
+	var logBuf []byte
99
+	if opts.LogLevel > 0 {
100
+		logBuf = make([]byte, logSize)
101
+		attr.logLevel = opts.LogLevel
102
+		attr.logSize = uint32(len(logBuf))
103
+		attr.logBuf = newPtr(unsafe.Pointer(&logBuf[0]))
104
+	}
105
+
106
+	fd, err := bpfProgLoad(attr)
107
+	if err == nil {
108
+		prog := newProgram(fd, spec.Name, &ProgramABI{spec.Type})
109
+		prog.VerifierLog = convertCString(logBuf)
110
+		return prog, nil
111
+	}
112
+
113
+	truncated := errors.Cause(err) == unix.ENOSPC
114
+	if opts.LogLevel == 0 {
115
+		// Re-run with the verifier enabled to get better error messages.
116
+		logBuf = make([]byte, logSize)
117
+		attr.logLevel = 1
118
+		attr.logSize = uint32(len(logBuf))
119
+		attr.logBuf = newPtr(unsafe.Pointer(&logBuf[0]))
120
+
121
+		_, nerr := bpfProgLoad(attr)
122
+		truncated = errors.Cause(nerr) == unix.ENOSPC
123
+	}
124
+
125
+	logs := convertCString(logBuf)
126
+	if truncated {
127
+		logs += "\n(truncated...)"
128
+	}
129
+
130
+	return nil, &loadError{err, logs}
131
+}
132
+
133
// NewProgramFromFD creates a program from a raw fd.
//
// You should not use fd after calling this function.
//
// Negative descriptors are rejected. If the program's name and ABI can't
// be obtained from the descriptor an error is returned.
//
// Requires at least Linux 4.11.
func NewProgramFromFD(fd int) (*Program, error) {
	if fd < 0 {
		return nil, errors.New("invalid fd")
	}
	bpfFd := newBPFFD(uint32(fd))

	name, abi, err := newProgramABIFromFd(bpfFd)
	if err != nil {
		// Drop the finalizer that newBPFFD installed instead of closing
		// the descriptor.
		bpfFd.forget()
		return nil, err
	}

	return newProgram(bpfFd, name, abi), nil
}
152
+
153
+func newProgram(fd *bpfFD, name string, abi *ProgramABI) *Program {
154
+	return &Program{
155
+		name: name,
156
+		fd:   fd,
157
+		abi:  *abi,
158
+	}
159
+}
160
+
161
+func convertProgramSpec(spec *ProgramSpec) (*bpfProgLoadAttr, error) {
162
+	if len(spec.Instructions) == 0 {
163
+		return nil, errors.New("Instructions cannot be empty")
164
+	}
165
+
166
+	if len(spec.License) == 0 {
167
+		return nil, errors.New("License cannot be empty")
168
+	}
169
+
170
+	buf := bytes.NewBuffer(make([]byte, 0, len(spec.Instructions)*asm.InstructionSize))
171
+	err := spec.Instructions.Marshal(buf, internal.NativeEndian)
172
+	if err != nil {
173
+		return nil, err
174
+	}
175
+
176
+	bytecode := buf.Bytes()
177
+	insCount := uint32(len(bytecode) / asm.InstructionSize)
178
+	lic := []byte(spec.License)
179
+	attr := &bpfProgLoadAttr{
180
+		progType:           spec.Type,
181
+		expectedAttachType: spec.AttachType,
182
+		insCount:           insCount,
183
+		instructions:       newPtr(unsafe.Pointer(&bytecode[0])),
184
+		license:            newPtr(unsafe.Pointer(&lic[0])),
185
+	}
186
+
187
+	name, err := newBPFObjName(spec.Name)
188
+	if err != nil {
189
+		return nil, err
190
+	}
191
+
192
+	if haveObjName() == nil {
193
+		attr.progName = name
194
+	}
195
+
196
+	return attr, nil
197
+}
198
+
199
+func (p *Program) String() string {
200
+	if p.name != "" {
201
+		return fmt.Sprintf("%s(%s)#%v", p.abi.Type, p.name, p.fd)
202
+	}
203
+	return fmt.Sprintf("%s#%v", p.abi.Type, p.fd)
204
+}
205
+
206
+// ABI gets the ABI of the Program
207
+func (p *Program) ABI() ProgramABI {
208
+	return p.abi
209
+}
210
+
211
+// FD gets the file descriptor of the Program.
212
+//
213
+// It is invalid to call this function after Close has been called.
214
+func (p *Program) FD() int {
215
+	fd, err := p.fd.value()
216
+	if err != nil {
217
+		// Best effort: -1 is the number most likely to be an
218
+		// invalid file descriptor.
219
+		return -1
220
+	}
221
+
222
+	return int(fd)
223
+}
224
+
225
+// Clone creates a duplicate of the Program.
226
+//
227
+// Closing the duplicate does not affect the original, and vice versa.
228
+//
229
+// Cloning a nil Program returns nil.
230
+func (p *Program) Clone() (*Program, error) {
231
+	if p == nil {
232
+		return nil, nil
233
+	}
234
+
235
+	dup, err := p.fd.dup()
236
+	if err != nil {
237
+		return nil, errors.Wrap(err, "can't clone program")
238
+	}
239
+
240
+	return newProgram(dup, p.name, &p.abi), nil
241
+}
242
+
243
+// Pin persists the Program past the lifetime of the process that created it
244
+//
245
+// This requires bpffs to be mounted above fileName. See http://cilium.readthedocs.io/en/doc-1.0/kubernetes/install/#mounting-the-bpf-fs-optional
246
+func (p *Program) Pin(fileName string) error {
247
+	return errors.Wrap(bpfPinObject(fileName, p.fd), "can't pin program")
248
+}
249
+
250
+// Close unloads the program from the kernel.
251
+func (p *Program) Close() error {
252
+	if p == nil {
253
+		return nil
254
+	}
255
+
256
+	return p.fd.close()
257
+}
258
+
259
// Test runs the Program in the kernel with the given input and returns the
// value returned by the eBPF program together with the output data.
// The program is executed exactly once.
//
// Note: the kernel expects at least 14 bytes input for an ethernet header for
// XDP and SKB programs.
//
// This function requires at least Linux 4.12.
func (p *Program) Test(in []byte) (uint32, []byte, error) {
	ret, out, _, err := p.testRun(in, 1)
	return ret, out, errors.Wrap(err, "can't test program")
}
270
+
271
+// Benchmark runs the Program with the given input for a number of times
272
+// and returns the time taken per iteration.
273
+//
274
+// The returned value is the return value of the last execution of
275
+// the program.
276
+//
277
+// This function requires at least Linux 4.12.
278
+func (p *Program) Benchmark(in []byte, repeat int) (uint32, time.Duration, error) {
279
+	ret, _, total, err := p.testRun(in, repeat)
280
+	return ret, total, errors.Wrap(err, "can't benchmark program")
281
+}
282
+
283
// haveProgTestRun probes for BPF_PROG_TEST_RUN support by loading a
// trivial socket filter and issuing a single test run against it.
//
// The probe reports false if the program can't be loaded or if the test
// run fails with EINVAL; any other outcome counts as supported.
var haveProgTestRun = internal.FeatureTest("BPF_PROG_TEST_RUN", "4.12", func() bool {
	prog, err := NewProgram(&ProgramSpec{
		Type: SocketFilter,
		Instructions: asm.Instructions{
			asm.LoadImm(asm.R0, 0, asm.DWord),
			asm.Return(),
		},
		License: "MIT",
	})
	if err != nil {
		// This may be because we lack sufficient permissions, etc.
		return false
	}
	defer prog.Close()

	fd, err := prog.fd.value()
	if err != nil {
		return false
	}

	// Programs require at least 14 bytes input
	in := make([]byte, 14)
	attr := bpfProgTestRunAttr{
		fd:         fd,
		dataSizeIn: uint32(len(in)),
		dataIn:     newPtr(unsafe.Pointer(&in[0])),
	}

	_, err = bpfCall(_ProgTestRun, unsafe.Pointer(&attr), unsafe.Sizeof(attr))

	// Check for EINVAL specifically, rather than err != nil since we
	// otherwise misdetect due to insufficient permissions.
	return errors.Cause(err) != unix.EINVAL
})
317
+
318
// testRun executes the program through the _ProgTestRun command, using
// in as input data and repeating the run repeat times.
//
// It returns the program's return value, the output data truncated to the
// size reported by the kernel, and the duration reported by the kernel
// interpreted as nanoseconds. An empty input, an input or repeat count
// that does not fit into 32 bits, and a failing feature test are reported
// as errors before the syscall is made.
func (p *Program) testRun(in []byte, repeat int) (uint32, []byte, time.Duration, error) {
	if uint(repeat) > math.MaxUint32 {
		return 0, nil, 0, fmt.Errorf("repeat is too high")
	}

	if len(in) == 0 {
		return 0, nil, 0, fmt.Errorf("missing input")
	}

	if uint(len(in)) > math.MaxUint32 {
		return 0, nil, 0, fmt.Errorf("input is too long")
	}

	if err := haveProgTestRun(); err != nil {
		return 0, nil, 0, err
	}

	// Older kernels ignore the dataSizeOut argument when copying to user space.
	// Combined with things like bpf_xdp_adjust_head() we don't really know what the final
	// size will be. Hence we allocate an output buffer which we hope will always be large
	// enough, and panic if the kernel wrote past the end of the allocation.
	// See https://patchwork.ozlabs.org/cover/1006822/
	out := make([]byte, len(in)+outputPad)

	fd, err := p.fd.value()
	if err != nil {
		return 0, nil, 0, err
	}

	attr := bpfProgTestRunAttr{
		fd:          fd,
		dataSizeIn:  uint32(len(in)),
		dataSizeOut: uint32(len(out)),
		dataIn:      newPtr(unsafe.Pointer(&in[0])),
		dataOut:     newPtr(unsafe.Pointer(&out[0])),
		repeat:      uint32(repeat),
	}

	_, err = bpfCall(_ProgTestRun, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
	if err != nil {
		return 0, nil, 0, errors.Wrap(err, "can't run test")
	}

	if int(attr.dataSizeOut) > cap(out) {
		// Houston, we have a problem. The program created more data than we allocated,
		// and the kernel wrote past the end of our buffer.
		panic("kernel wrote past end of output buffer")
	}
	// The kernel updates dataSizeOut to the amount of data it produced.
	out = out[:int(attr.dataSizeOut)]

	total := time.Duration(attr.duration) * time.Nanosecond
	return attr.retval, out, total, nil
}
371
+
372
// unmarshalProgram turns a 4 byte program id, encoded in native byte
// order, into a Program.
//
// The id is resolved to a new file descriptor, from which the program's
// name and ABI are read. The descriptor is closed again if that fails.
func unmarshalProgram(buf []byte) (*Program, error) {
	if len(buf) != 4 {
		return nil, errors.New("program id requires 4 byte value")
	}

	// Looking up an entry in a nested map or prog array returns an id,
	// not an fd.
	id := internal.NativeEndian.Uint32(buf)
	fd, err := bpfGetProgramFDByID(id)
	if err != nil {
		return nil, err
	}

	name, abi, err := newProgramABIFromFd(fd)
	if err != nil {
		// The close error is deliberately ignored, the ABI error is
		// the one worth reporting.
		_ = fd.close()
		return nil, err
	}

	return newProgram(fd, name, abi), nil
}
393
+
394
+// MarshalBinary implements BinaryMarshaler.
395
+func (p *Program) MarshalBinary() ([]byte, error) {
396
+	value, err := p.fd.value()
397
+	if err != nil {
398
+		return nil, err
399
+	}
400
+
401
+	buf := make([]byte, 4)
402
+	internal.NativeEndian.PutUint32(buf, value)
403
+	return buf, nil
404
+}
405
+
406
+// Attach a Program to a container object fd
407
+func (p *Program) Attach(fd int, typ AttachType, flags AttachFlags) error {
408
+	if fd < 0 {
409
+		return errors.New("invalid fd")
410
+	}
411
+
412
+	pfd, err := p.fd.value()
413
+	if err != nil {
414
+		return err
415
+	}
416
+
417
+	attr := bpfProgAlterAttr{
418
+		targetFd:    uint32(fd),
419
+		attachBpfFd: pfd,
420
+		attachType:  uint32(typ),
421
+		attachFlags: uint32(flags),
422
+	}
423
+
424
+	return bpfProgAlter(_ProgAttach, &attr)
425
+}
426
+
427
+// Detach a Program from a container object fd
428
+func (p *Program) Detach(fd int, typ AttachType, flags AttachFlags) error {
429
+	if fd < 0 {
430
+		return errors.New("invalid fd")
431
+	}
432
+
433
+	pfd, err := p.fd.value()
434
+	if err != nil {
435
+		return err
436
+	}
437
+
438
+	attr := bpfProgAlterAttr{
439
+		targetFd:    uint32(fd),
440
+		attachBpfFd: pfd,
441
+		attachType:  uint32(typ),
442
+		attachFlags: uint32(flags),
443
+	}
444
+
445
+	return bpfProgAlter(_ProgDetach, &attr)
446
+}
447
+
448
// LoadPinnedProgram loads a Program from a BPF file.
//
// The object at fileName is opened and its name and ABI are read from the
// resulting descriptor. The descriptor is closed again if that fails.
//
// Requires at least Linux 4.11.
func LoadPinnedProgram(fileName string) (*Program, error) {
	fd, err := bpfGetObject(fileName)
	if err != nil {
		return nil, err
	}

	name, abi, err := newProgramABIFromFd(fd)
	if err != nil {
		// Ignore the close error in favour of the ABI error.
		_ = fd.close()
		return nil, errors.Wrapf(err, "can't get ABI for %s", fileName)
	}

	return newProgram(fd, name, abi), nil
}
465
+
466
+// SanitizeName replaces all invalid characters in name.
467
+//
468
+// Use this to automatically generate valid names for maps and
469
+// programs at run time.
470
+//
471
+// Passing a negative value for replacement will delete characters
472
+// instead of replacing them.
473
+func SanitizeName(name string, replacement rune) string {
474
+	return strings.Map(func(char rune) rune {
475
+		if invalidBPFObjNameChar(char) {
476
+			return replacement
477
+		}
478
+		return char
479
+	}, name)
480
+}
481
+
482
// loadError is returned when loading a program fails. It keeps the
// underlying error together with the verifier output, if there is any.
type loadError struct {
	cause       error
	verifierLog string
}

// Error describes the failure and appends the verifier log when it is
// not empty.
func (le *loadError) Error() string {
	msg := fmt.Sprintf("failed to load program: %s", le.cause)
	if le.verifierLog != "" {
		msg += ": " + le.verifierLog
	}
	return msg
}

// Cause returns the underlying error.
func (le *loadError) Cause() error {
	return le.cause
}
497
+
498
+// IsNotSupported returns true if an error occurred because
499
+// the kernel does not have support for a specific feature.
500
+func IsNotSupported(err error) bool {
501
+	_, notSupported := errors.Cause(err).(*internal.UnsupportedFeatureError)
502
+	return notSupported
503
+}
0 504
new file mode 100644
... ...
@@ -0,0 +1,14 @@
0
+// +build armbe mips mips64p32
1
+
2
+package ebpf
3
+
4
+import (
5
+	"unsafe"
6
+)
7
+
8
// syscallPtr wraps an unsafe.Pointer to be 64bit to
// conform to the syscall specification.
//
// In this variant the padding precedes the pointer.
type syscallPtr struct {
	pad uint32
	ptr unsafe.Pointer
}
0 14
new file mode 100644
... ...
@@ -0,0 +1,14 @@
0
+// +build 386 amd64p32 arm mipsle mips64p32le
1
+
2
+package ebpf
3
+
4
+import (
5
+	"unsafe"
6
+)
7
+
8
// syscallPtr wraps an unsafe.Pointer to be 64bit to
// conform to the syscall specification.
//
// In this variant the padding follows the pointer.
type syscallPtr struct {
	ptr unsafe.Pointer
	pad uint32
}
0 14
new file mode 100644
... ...
@@ -0,0 +1,14 @@
0
+// +build !386,!amd64p32,!arm,!mipsle,!mips64p32le
1
+// +build !armbe,!mips,!mips64p32
2
+
3
+package ebpf
4
+
5
+import (
6
+	"unsafe"
7
+)
8
+
9
// syscallPtr wraps an unsafe.Pointer to be 64bit to
// conform to the syscall specification.
//
// This variant carries no padding.
type syscallPtr struct {
	ptr unsafe.Pointer
}
0 14
new file mode 100644
... ...
@@ -0,0 +1,20 @@
0
+eBPF
1
+-------
2
+[![](https://godoc.org/github.com/cilium/ebpf?status.svg)](https://godoc.org/github.com/cilium/ebpf)
3
+
4
+eBPF is a pure Go library that provides utilities for loading, compiling, and debugging eBPF programs. It has minimal external dependencies and is intended to be used in long-running processes.
5
+
6
+[ebpf/asm](https://godoc.org/github.com/cilium/ebpf/asm) contains a basic assembler.
7
+
8
+The library is maintained by [Cloudflare](https://www.cloudflare.com) and [Cilium](https://www.cilium.io). Feel free to [join](https://cilium.herokuapp.com/) the [libbpf-go](https://cilium.slack.com/messages/libbpf-go) channel on Slack.
9
+
10
+## Current status
11
+
12
+The package is production ready, but **the API is explicitly unstable
13
+right now**. Expect to update your code if you want to follow along.
14
+
15
+## Useful resources
16
+
17
+* [Cilium eBPF documentation](https://cilium.readthedocs.io/en/latest/bpf/#bpf-guide) (recommended)
18
+* [Linux documentation on BPF](http://elixir.free-electrons.com/linux/latest/source/Documentation/networking/filter.txt)
19
+* [eBPF features by Linux version](https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md)
0 20
new file mode 100644
... ...
@@ -0,0 +1,447 @@
0
+package ebpf
1
+
2
+import (
3
+	"bytes"
4
+	"path/filepath"
5
+	"runtime"
6
+	"strconv"
7
+	"strings"
8
+	"unsafe"
9
+
10
+	"github.com/cilium/ebpf/internal"
11
+	"github.com/cilium/ebpf/internal/unix"
12
+
13
+	"github.com/pkg/errors"
14
+)
15
+
16
+var errClosedFd = errors.New("use of closed file descriptor")
17
+
18
+type bpfFD struct {
19
+	raw int64
20
+}
21
+
22
+func newBPFFD(value uint32) *bpfFD {
23
+	fd := &bpfFD{int64(value)}
24
+	runtime.SetFinalizer(fd, (*bpfFD).close)
25
+	return fd
26
+}
27
+
28
+func (fd *bpfFD) String() string {
29
+	return strconv.FormatInt(fd.raw, 10)
30
+}
31
+
32
+func (fd *bpfFD) value() (uint32, error) {
33
+	if fd.raw < 0 {
34
+		return 0, errClosedFd
35
+	}
36
+
37
+	return uint32(fd.raw), nil
38
+}
39
+
40
// close closes the underlying file descriptor.
//
// Closing an already closed descriptor is a no-op and returns nil.
func (fd *bpfFD) close() error {
	if fd.raw < 0 {
		return nil
	}

	// Mark the descriptor as closed before the actual close so that
	// later calls to value, dup and close see it as unusable.
	value := int(fd.raw)
	fd.raw = -1

	// The finalizer is not needed anymore.
	fd.forget()
	return unix.Close(value)
}
51
+
52
+func (fd *bpfFD) forget() {
53
+	runtime.SetFinalizer(fd, nil)
54
+}
55
+
56
+func (fd *bpfFD) dup() (*bpfFD, error) {
57
+	if fd.raw < 0 {
58
+		return nil, errClosedFd
59
+	}
60
+
61
+	dup, err := unix.FcntlInt(uintptr(fd.raw), unix.F_DUPFD_CLOEXEC, 0)
62
+	if err != nil {
63
+		return nil, errors.Wrap(err, "can't dup fd")
64
+	}
65
+
66
+	return newBPFFD(uint32(dup)), nil
67
+}
68
+
69
+// bpfObjName is a null-terminated string made up of
70
+// 'A-Za-z0-9_' characters.
71
+type bpfObjName [unix.BPF_OBJ_NAME_LEN]byte
72
+
73
+// newBPFObjName truncates the result if it is too long.
74
+func newBPFObjName(name string) (bpfObjName, error) {
75
+	idx := strings.IndexFunc(name, invalidBPFObjNameChar)
76
+	if idx != -1 {
77
+		return bpfObjName{}, errors.Errorf("invalid character '%c' in name '%s'", name[idx], name)
78
+	}
79
+
80
+	var result bpfObjName
81
+	copy(result[:unix.BPF_OBJ_NAME_LEN-1], name)
82
+	return result, nil
83
+}
84
+
85
// invalidBPFObjNameChar reports whether char may not appear in an object
// name. Only the ASCII letters, the ASCII digits and '_' are accepted.
func invalidBPFObjNameChar(char rune) bool {
	switch {
	case char >= 'A' && char <= 'Z',
		char >= 'a' && char <= 'z',
		char >= '0' && char <= '9',
		char == '_':
		return false
	default:
		return true
	}
}
99
+
100
+type bpfMapCreateAttr struct {
101
+	mapType    MapType
102
+	keySize    uint32
103
+	valueSize  uint32
104
+	maxEntries uint32
105
+	flags      uint32
106
+	innerMapFd uint32     // since 4.12 56f668dfe00d
107
+	numaNode   uint32     // since 4.14 96eabe7a40aa
108
+	mapName    bpfObjName // since 4.15 ad5b177bd73f
109
+}
110
+
111
+type bpfMapOpAttr struct {
112
+	mapFd   uint32
113
+	padding uint32
114
+	key     syscallPtr
115
+	value   syscallPtr
116
+	flags   uint64
117
+}
118
+
119
+type bpfMapInfo struct {
120
+	mapType    uint32
121
+	id         uint32
122
+	keySize    uint32
123
+	valueSize  uint32
124
+	maxEntries uint32
125
+	flags      uint32
126
+	mapName    bpfObjName // since 4.15 ad5b177bd73f
127
+}
128
+
129
+type bpfPinObjAttr struct {
130
+	fileName syscallPtr
131
+	fd       uint32
132
+	padding  uint32
133
+}
134
+
135
+type bpfProgLoadAttr struct {
136
+	progType           ProgramType
137
+	insCount           uint32
138
+	instructions       syscallPtr
139
+	license            syscallPtr
140
+	logLevel           uint32
141
+	logSize            uint32
142
+	logBuf             syscallPtr
143
+	kernelVersion      uint32     // since 4.1  2541517c32be
144
+	progFlags          uint32     // since 4.11 e07b98d9bffe
145
+	progName           bpfObjName // since 4.15 067cae47771c
146
+	progIfIndex        uint32     // since 4.15 1f6f4cb7ba21
147
+	expectedAttachType AttachType // since 4.17 5e43f899b03a
148
+}
149
+
150
+type bpfProgInfo struct {
151
+	progType     uint32
152
+	id           uint32
153
+	tag          [unix.BPF_TAG_SIZE]byte
154
+	jitedLen     uint32
155
+	xlatedLen    uint32
156
+	jited        syscallPtr
157
+	xlated       syscallPtr
158
+	loadTime     uint64 // since 4.15 cb4d2b3f03d8
159
+	createdByUID uint32
160
+	nrMapIDs     uint32
161
+	mapIds       syscallPtr
162
+	name         bpfObjName
163
+}
164
+
165
+type bpfProgTestRunAttr struct {
166
+	fd          uint32
167
+	retval      uint32
168
+	dataSizeIn  uint32
169
+	dataSizeOut uint32
170
+	dataIn      syscallPtr
171
+	dataOut     syscallPtr
172
+	repeat      uint32
173
+	duration    uint32
174
+}
175
+
176
+type bpfProgAlterAttr struct {
177
+	targetFd    uint32
178
+	attachBpfFd uint32
179
+	attachType  uint32
180
+	attachFlags uint32
181
+}
182
+
183
+type bpfObjGetInfoByFDAttr struct {
184
+	fd      uint32
185
+	infoLen uint32
186
+	info    syscallPtr // May be either bpfMapInfo or bpfProgInfo
187
+}
188
+
189
+type bpfGetFDByIDAttr struct {
190
+	id   uint32
191
+	next uint32
192
+}
193
+
194
+func newPtr(ptr unsafe.Pointer) syscallPtr {
195
+	return syscallPtr{ptr: ptr}
196
+}
197
+
198
// bpfProgLoad issues the _ProgLoad command with attr and wraps the
// resulting descriptor in a bpfFD.
//
// The call is retried for as long as it fails with EAGAIN; any other
// error is returned as is.
func bpfProgLoad(attr *bpfProgLoadAttr) (*bpfFD, error) {
	for {
		fd, err := bpfCall(_ProgLoad, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
		// As of ~4.20 the verifier can be interrupted by a signal,
		// and returns EAGAIN in that case.
		if err == unix.EAGAIN {
			continue
		}

		if err != nil {
			return nil, err
		}

		return newBPFFD(uint32(fd)), nil
	}
}
214
+
215
+func bpfProgAlter(cmd int, attr *bpfProgAlterAttr) error {
216
+	_, err := bpfCall(cmd, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
217
+	return err
218
+}
219
+
220
+func bpfMapCreate(attr *bpfMapCreateAttr) (*bpfFD, error) {
221
+	fd, err := bpfCall(_MapCreate, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
222
+	if err != nil {
223
+		return nil, err
224
+	}
225
+
226
+	return newBPFFD(uint32(fd)), nil
227
+}
228
+
229
// haveNestedMaps probes for nested map support by creating a small array
// map and then an ArrayOfMaps that uses it as its inner map. The probe
// succeeds only if both maps can be created.
var haveNestedMaps = internal.FeatureTest("nested maps", "4.12", func() bool {
	inner, err := bpfMapCreate(&bpfMapCreateAttr{
		mapType:    Array,
		keySize:    4,
		valueSize:  4,
		maxEntries: 1,
	})
	if err != nil {
		return false
	}
	defer inner.close()

	// inner was created just above and hasn't been closed, so value
	// can't fail here.
	innerFd, _ := inner.value()
	nested, err := bpfMapCreate(&bpfMapCreateAttr{
		mapType:    ArrayOfMaps,
		keySize:    4,
		valueSize:  4,
		maxEntries: 1,
		innerMapFd: innerFd,
	})
	if err != nil {
		return false
	}

	_ = nested.close()
	return true
})
256
+
257
+func bpfMapLookupElem(m *bpfFD, key, valueOut syscallPtr) error {
258
+	fd, err := m.value()
259
+	if err != nil {
260
+		return err
261
+	}
262
+
263
+	attr := bpfMapOpAttr{
264
+		mapFd: fd,
265
+		key:   key,
266
+		value: valueOut,
267
+	}
268
+	_, err = bpfCall(_MapLookupElem, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
269
+	return err
270
+}
271
+
272
+func bpfMapUpdateElem(m *bpfFD, key, valueOut syscallPtr, flags uint64) error {
273
+	fd, err := m.value()
274
+	if err != nil {
275
+		return err
276
+	}
277
+
278
+	attr := bpfMapOpAttr{
279
+		mapFd: fd,
280
+		key:   key,
281
+		value: valueOut,
282
+		flags: flags,
283
+	}
284
+	_, err = bpfCall(_MapUpdateElem, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
285
+	return err
286
+}
287
+
288
+func bpfMapDeleteElem(m *bpfFD, key syscallPtr) error {
289
+	fd, err := m.value()
290
+	if err != nil {
291
+		return err
292
+	}
293
+
294
+	attr := bpfMapOpAttr{
295
+		mapFd: fd,
296
+		key:   key,
297
+	}
298
+	_, err = bpfCall(_MapDeleteElem, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
299
+	return err
300
+}
301
+
302
+func bpfMapGetNextKey(m *bpfFD, key, nextKeyOut syscallPtr) error {
303
+	fd, err := m.value()
304
+	if err != nil {
305
+		return err
306
+	}
307
+
308
+	attr := bpfMapOpAttr{
309
+		mapFd: fd,
310
+		key:   key,
311
+		value: nextKeyOut,
312
+	}
313
+	_, err = bpfCall(_MapGetNextKey, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
314
+	return err
315
+}
316
+
317
+const bpfFSType = 0xcafe4a11
318
+
319
+func bpfPinObject(fileName string, fd *bpfFD) error {
320
+	dirName := filepath.Dir(fileName)
321
+	var statfs unix.Statfs_t
322
+	if err := unix.Statfs(dirName, &statfs); err != nil {
323
+		return err
324
+	}
325
+	if uint64(statfs.Type) != bpfFSType {
326
+		return errors.Errorf("%s is not on a bpf filesystem", fileName)
327
+	}
328
+
329
+	value, err := fd.value()
330
+	if err != nil {
331
+		return err
332
+	}
333
+
334
+	_, err = bpfCall(_ObjPin, unsafe.Pointer(&bpfPinObjAttr{
335
+		fileName: newPtr(unsafe.Pointer(&[]byte(fileName)[0])),
336
+		fd:       value,
337
+	}), 16)
338
+	return errors.Wrapf(err, "pin object %s", fileName)
339
+}
340
+
341
+func bpfGetObject(fileName string) (*bpfFD, error) {
342
+	ptr, err := bpfCall(_ObjGet, unsafe.Pointer(&bpfPinObjAttr{
343
+		fileName: newPtr(unsafe.Pointer(&[]byte(fileName)[0])),
344
+	}), 16)
345
+	if err != nil {
346
+		return nil, errors.Wrapf(err, "get object %s", fileName)
347
+	}
348
+	return newBPFFD(uint32(ptr)), nil
349
+}
350
+
351
// bpfGetObjectInfoByFD issues the _ObjGetInfoByFD command for fd.
//
// info points to the structure to be filled in and size is its length in
// bytes. Errors are annotated with the descriptor number.
func bpfGetObjectInfoByFD(fd *bpfFD, info unsafe.Pointer, size uintptr) error {
	value, err := fd.value()
	if err != nil {
		return err
	}

	// available from 4.13
	attr := bpfObjGetInfoByFDAttr{
		fd:      value,
		infoLen: uint32(size),
		info:    newPtr(info),
	}
	_, err = bpfCall(_ObjGetInfoByFD, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
	return errors.Wrapf(err, "fd %d", value)
}
366
+
367
+func bpfGetProgInfoByFD(fd *bpfFD) (*bpfProgInfo, error) {
368
+	var info bpfProgInfo
369
+	err := bpfGetObjectInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info))
370
+	return &info, errors.Wrap(err, "can't get program info")
371
+}
372
+
373
+func bpfGetMapInfoByFD(fd *bpfFD) (*bpfMapInfo, error) {
374
+	var info bpfMapInfo
375
+	err := bpfGetObjectInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info))
376
+	return &info, errors.Wrap(err, "can't get map info")
377
+}
378
+
379
// haveObjName probes for object name support by creating a small array
// map that carries a name. The probe succeeds if the map can be created.
var haveObjName = internal.FeatureTest("object names", "4.15", func() bool {
	name, err := newBPFObjName("feature_test")
	if err != nil {
		// This really is a fatal error, but it should be caught
		// by the unit tests not working.
		return false
	}

	attr := bpfMapCreateAttr{
		mapType:    Array,
		keySize:    4,
		valueSize:  4,
		maxEntries: 1,
		mapName:    name,
	}

	fd, err := bpfMapCreate(&attr)
	if err != nil {
		return false
	}

	// The map only exists for the sake of the probe.
	_ = fd.close()
	return true
})
403
+
404
+func bpfGetMapFDByID(id uint32) (*bpfFD, error) {
405
+	// available from 4.13
406
+	attr := bpfGetFDByIDAttr{
407
+		id: id,
408
+	}
409
+	ptr, err := bpfCall(_MapGetFDByID, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
410
+	if err != nil {
411
+		return nil, errors.Wrapf(err, "can't get fd for map id %d", id)
412
+	}
413
+	return newBPFFD(uint32(ptr)), nil
414
+}
415
+
416
+func bpfGetProgramFDByID(id uint32) (*bpfFD, error) {
417
+	// available from 4.13
418
+	attr := bpfGetFDByIDAttr{
419
+		id: id,
420
+	}
421
+	ptr, err := bpfCall(_ProgGetFDByID, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
422
+	if err != nil {
423
+		return nil, errors.Wrapf(err, "can't get fd for program id %d", id)
424
+	}
425
+	return newBPFFD(uint32(ptr)), nil
426
+}
427
+
428
// bpfCall invokes the bpf syscall with the given command.
//
// attr points to the command's attribute structure and size is the length
// of that structure in bytes. The first result register is returned as
// is. A non-zero errno is returned as the error, without any wrapping.
func bpfCall(cmd int, attr unsafe.Pointer, size uintptr) (uintptr, error) {
	r1, _, errNo := unix.Syscall(unix.SYS_BPF, uintptr(cmd), uintptr(attr), size)
	runtime.KeepAlive(attr)

	// Only convert to the error interface if there actually is an
	// error, so that callers can compare against nil.
	var err error
	if errNo != 0 {
		err = errNo
	}

	return r1, err
}
439
+
440
// convertCString converts a NUL-terminated byte buffer into a string.
//
// Everything from the first NUL byte onwards is discarded. A buffer that
// contains no NUL byte is converted in full rather than being dropped.
func convertCString(in []byte) string {
	if end := bytes.IndexByte(in, 0); end != -1 {
		return string(in[:end])
	}
	return string(in)
}
0 447
new file mode 100644
... ...
@@ -0,0 +1,189 @@
0
+package ebpf
1
+
2
+//go:generate stringer -output types_string.go -type=MapType,ProgramType
3
+
4
+// MapType indicates the type map structure
5
+// that will be initialized in the kernel.
6
+type MapType uint32
7
+
8
+// All the various map types that can be created
9
+const (
10
+	UnspecifiedMap MapType = iota
11
+	// Hash is a hash map
12
+	Hash
13
+	// Array is an array map
14
+	Array
15
+	// ProgramArray - A program array map is a special kind of array map whose map
16
+	// values contain only file descriptors referring to other eBPF
17
+	// programs.  Thus, both the key_size and value_size must be
18
+	// exactly four bytes.  This map is used in conjunction with the
19
+	// TailCall helper.
20
+	ProgramArray
21
+	// PerfEventArray - A perf event array is used in conjunction with PerfEventRead
22
+	// and PerfEventOutput calls, to read the raw bpf_perf_data from the registers.
23
+	PerfEventArray
24
+	// PerCPUHash - This data structure is useful for people who have high performance
25
+	// network needs and can reconcile adds at the end of some cycle, so that
26
+	// hashes can be lock free without the use of XAdd, which can be costly.
27
+	PerCPUHash
28
+	// PerCPUArray - This data structure is useful for people who have high performance
29
+	// network needs and can reconcile adds at the end of some cycle, so that
30
+	// hashes can be lock free without the use of XAdd, which can be costly.
31
+	// Each CPU gets a copy of this hash, the contents of all of which can be reconciled
32
+	// later.
33
+	PerCPUArray
34
+	// StackTrace - This holds whole user and kernel stack traces, it can be retrieved with
35
+	// GetStackID
36
+	StackTrace
37
+	// CGroupArray - This is a very niche structure used to help SKBInCGroup determine
38
+	// if an skb is from a socket belonging to a specific cgroup
39
+	CGroupArray
40
+	// LRUHash - This allows you to create a small hash structure that will purge the
41
+	// least recently used items rather than throw an error when you run out of memory
42
+	LRUHash
43
+	// LRUCPUHash - This is NOT like PerCPUHash, this structure is shared among the CPUs,
44
+	// it has more to do with including the CPU id with the LRU calculation so that if a
45
+	// particular CPU is using a value over-and-over again, then it will be saved, but if
46
+	// a value is being retrieved a lot but sparsely across CPUs it is not as important, basically
47
+	// giving weight to CPU locality over overall usage.
48
+	LRUCPUHash
49
+	// LPMTrie - This is an implementation of Longest-Prefix-Match Trie structure. It is useful,
50
+	// for storing things like IP addresses which can be bit masked allowing for keys of differing
51
+	// values to refer to the same reference based on their masks. See wikipedia for more details.
52
+	LPMTrie
53
+	// ArrayOfMaps - Each item in the array is another map. The inner map mustn't be a map of maps
54
+	// itself.
55
+	ArrayOfMaps
56
+	// HashOfMaps - Each item in the hash map is another map. The inner map mustn't be a map of maps
57
+	// itself.
58
+	HashOfMaps
59
+)
60
+
61
+// hasPerCPUValue returns true if the Map stores a value per CPU.
62
+func (mt MapType) hasPerCPUValue() bool {
63
+	if mt == PerCPUHash || mt == PerCPUArray {
64
+		return true
65
+	}
66
+	return false
67
+}
68
+
69
+const (
70
+	_MapCreate = iota
71
+	_MapLookupElem
72
+	_MapUpdateElem
73
+	_MapDeleteElem
74
+	_MapGetNextKey
75
+	_ProgLoad
76
+	_ObjPin
77
+	_ObjGet
78
+	_ProgAttach
79
+	_ProgDetach
80
+	_ProgTestRun
81
+	_ProgGetNextID
82
+	_MapGetNextID
83
+	_ProgGetFDByID
84
+	_MapGetFDByID
85
+	_ObjGetInfoByFD
86
+)
87
+
88
+const (
89
+	_Any = iota
90
+	_NoExist
91
+	_Exist
92
+)
93
+
94
+// ProgramType of the eBPF program
95
+type ProgramType uint32
96
+
97
+// eBPF program types
98
+const (
99
+	// Unrecognized program type
100
+	UnspecifiedProgram ProgramType = iota
101
+	// SocketFilter socket or seccomp filter
102
+	SocketFilter
103
+	// Kprobe program
104
+	Kprobe
105
+	// SchedCLS traffic control shaper
106
+	SchedCLS
107
+	// SchedACT routing control shaper
108
+	SchedACT
109
+	// TracePoint program
110
+	TracePoint
111
+	// XDP program
112
+	XDP
113
+	// PerfEvent program
114
+	PerfEvent
115
+	// CGroupSKB program
116
+	CGroupSKB
117
+	// CGroupSock program
118
+	CGroupSock
119
+	// LWTIn program
120
+	LWTIn
121
+	// LWTOut program
122
+	LWTOut
123
+	// LWTXmit program
124
+	LWTXmit
125
+	// SockOps program
126
+	SockOps
127
+	// SkSKB program
128
+	SkSKB
129
+	// CGroupDevice program
130
+	CGroupDevice
131
+	// SkMsg program
132
+	SkMsg
133
+	// RawTracepoint program
134
+	RawTracepoint
135
+	// CGroupSockAddr program
136
+	CGroupSockAddr
137
+	// LWTSeg6Local program
138
+	LWTSeg6Local
139
+	// LircMode2 program
140
+	LircMode2
141
+	// SkReuseport program
142
+	SkReuseport
143
+	// FlowDissector program
144
+	FlowDissector
145
+	// CGroupSysctl program
146
+	CGroupSysctl
147
+	// RawTracepointWritable program
148
+	RawTracepointWritable
149
+	// CGroupSockopt program
150
+	CGroupSockopt
151
+)
152
+
153
+// AttachType of the eBPF program, needed to differentiate allowed context accesses in
154
+// some newer program types like CGroupSockAddr. Should be set to AttachNone if not required.
155
+// Will cause invalid argument (EINVAL) at program load time if set incorrectly.
156
+type AttachType uint32
157
+
158
+// AttachNone is an alias for AttachCGroupInetIngress for readability reasons
159
+const AttachNone AttachType = 0
160
+
161
+const (
162
+	AttachCGroupInetIngress AttachType = iota
163
+	AttachCGroupInetEgress
164
+	AttachCGroupInetSockCreate
165
+	AttachCGroupSockOps
166
+	AttachSkSKBStreamParser
167
+	AttachSkSKBStreamVerdict
168
+	AttachCGroupDevice
169
+	AttachSkMsgVerdict
170
+	AttachCGroupInet4Bind
171
+	AttachCGroupInet6Bind
172
+	AttachCGroupInet4Connect
173
+	AttachCGroupInet6Connect
174
+	AttachCGroupInet4PostBind
175
+	AttachCGroupInet6PostBind
176
+	AttachCGroupUDP4Sendmsg
177
+	AttachCGroupUDP6Sendmsg
178
+	AttachLircMode2
179
+	AttachFlowDissector
180
+	AttachCGroupSysctl
181
+	AttachCGroupUDP4Recvmsg
182
+	AttachCGroupUDP6Recvmsg
183
+	AttachCGroupGetsockopt
184
+	AttachCGroupSetsockopt
185
+)
186
+
187
+// AttachFlags of the eBPF program used in BPF_PROG_ATTACH command
188
+type AttachFlags uint32
0 189
new file mode 100644
... ...
@@ -0,0 +1,78 @@
0
+// Code generated by "stringer -output types_string.go -type=MapType,ProgramType"; DO NOT EDIT.
1
+
2
+package ebpf
3
+
4
+import "strconv"
5
+
6
+func _() {
7
+	// An "invalid array index" compiler error signifies that the constant values have changed.
8
+	// Re-run the stringer command to generate them again.
9
+	var x [1]struct{}
10
+	_ = x[UnspecifiedMap-0]
11
+	_ = x[Hash-1]
12
+	_ = x[Array-2]
13
+	_ = x[ProgramArray-3]
14
+	_ = x[PerfEventArray-4]
15
+	_ = x[PerCPUHash-5]
16
+	_ = x[PerCPUArray-6]
17
+	_ = x[StackTrace-7]
18
+	_ = x[CGroupArray-8]
19
+	_ = x[LRUHash-9]
20
+	_ = x[LRUCPUHash-10]
21
+	_ = x[LPMTrie-11]
22
+	_ = x[ArrayOfMaps-12]
23
+	_ = x[HashOfMaps-13]
24
+}
25
+
26
+const _MapType_name = "UnspecifiedMapHashArrayProgramArrayPerfEventArrayPerCPUHashPerCPUArrayStackTraceCGroupArrayLRUHashLRUCPUHashLPMTrieArrayOfMapsHashOfMaps"
27
+
28
+var _MapType_index = [...]uint8{0, 14, 18, 23, 35, 49, 59, 70, 80, 91, 98, 108, 115, 126, 136}
29
+
30
+func (i MapType) String() string {
31
+	if i >= MapType(len(_MapType_index)-1) {
32
+		return "MapType(" + strconv.FormatInt(int64(i), 10) + ")"
33
+	}
34
+	return _MapType_name[_MapType_index[i]:_MapType_index[i+1]]
35
+}
36
+func _() {
37
+	// An "invalid array index" compiler error signifies that the constant values have changed.
38
+	// Re-run the stringer command to generate them again.
39
+	var x [1]struct{}
40
+	_ = x[UnspecifiedProgram-0]
41
+	_ = x[SocketFilter-1]
42
+	_ = x[Kprobe-2]
43
+	_ = x[SchedCLS-3]
44
+	_ = x[SchedACT-4]
45
+	_ = x[TracePoint-5]
46
+	_ = x[XDP-6]
47
+	_ = x[PerfEvent-7]
48
+	_ = x[CGroupSKB-8]
49
+	_ = x[CGroupSock-9]
50
+	_ = x[LWTIn-10]
51
+	_ = x[LWTOut-11]
52
+	_ = x[LWTXmit-12]
53
+	_ = x[SockOps-13]
54
+	_ = x[SkSKB-14]
55
+	_ = x[CGroupDevice-15]
56
+	_ = x[SkMsg-16]
57
+	_ = x[RawTracepoint-17]
58
+	_ = x[CGroupSockAddr-18]
59
+	_ = x[LWTSeg6Local-19]
60
+	_ = x[LircMode2-20]
61
+	_ = x[SkReuseport-21]
62
+	_ = x[FlowDissector-22]
63
+	_ = x[CGroupSysctl-23]
64
+	_ = x[RawTracepointWritable-24]
65
+	_ = x[CGroupSockopt-25]
66
+}
67
+
68
+const _ProgramType_name = "UnspecifiedProgramSocketFilterKprobeSchedCLSSchedACTTracePointXDPPerfEventCGroupSKBCGroupSockLWTInLWTOutLWTXmitSockOpsSkSKBCGroupDeviceSkMsgRawTracepointCGroupSockAddrLWTSeg6LocalLircMode2SkReuseportFlowDissectorCGroupSysctlRawTracepointWritableCGroupSockopt"
69
+
70
+var _ProgramType_index = [...]uint16{0, 18, 30, 36, 44, 52, 62, 65, 74, 83, 93, 98, 104, 111, 118, 123, 135, 140, 153, 167, 179, 188, 199, 212, 224, 245, 258}
71
+
72
+func (i ProgramType) String() string {
73
+	if i >= ProgramType(len(_ProgramType_index)-1) {
74
+		return "ProgramType(" + strconv.FormatInt(int64(i), 10) + ")"
75
+	}
76
+	return _ProgramType_name[_ProgramType_index[i]:_ProgramType_index[i+1]]
77
+}
0 78
new file mode 100644
... ...
@@ -0,0 +1,83 @@
0
+/*
1
+   Copyright The containerd Authors.
2
+
3
+   Licensed under the Apache License, Version 2.0 (the "License");
4
+   you may not use this file except in compliance with the License.
5
+   You may obtain a copy of the License at
6
+
7
+       http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+   Unless required by applicable law or agreed to in writing, software
10
+   distributed under the License is distributed on an "AS IS" BASIS,
11
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+   See the License for the specific language governing permissions and
13
+   limitations under the License.
14
+*/
15
+
16
+package v2
17
+
18
+import (
19
+	"math"
20
+	"strconv"
21
+	"strings"
22
+)
23
+
24
// CPUMax is the textual value of the "cpu.max" cgroup file, formatted as
// "<quota> <period>" where the quota may be the literal "max".
type CPUMax string

// NewCPUMax builds a CPUMax from an optional quota and an optional period.
//
// A nil quota is rendered as "max". A nil period is rendered as 100000, the
// period the kernel documents as the cpu.max default, instead of being
// dereferenced (which previously panicked).
func NewCPUMax(quota *int64, period *uint64) CPUMax {
	const defaultPeriod uint64 = 100000

	max := "max"
	if quota != nil {
		max = strconv.FormatInt(*quota, 10)
	}
	p := defaultPeriod
	if period != nil {
		p = *period
	}
	return CPUMax(strings.Join([]string{max, strconv.FormatUint(p, 10)}, " "))
}
33
+
34
+type CPU struct {
35
+	Weight *uint64
36
+	Max    CPUMax
37
+	Cpus   string
38
+	Mems   string
39
+}
40
+
41
+func (c CPUMax) extractQuotaAndPeriod() (int64, uint64) {
42
+	var (
43
+		quota  int64
44
+		period uint64
45
+	)
46
+	values := strings.Split(string(c), " ")
47
+	if values[0] == "max" {
48
+		quota = math.MaxInt64
49
+	} else {
50
+		quota, _ = strconv.ParseInt(values[0], 10, 64)
51
+	}
52
+	period, _ = strconv.ParseUint(values[1], 10, 64)
53
+	return quota, period
54
+}
55
+
56
+func (r *CPU) Values() (o []Value) {
57
+	if r.Weight != nil {
58
+		o = append(o, Value{
59
+			filename: "cpu.weight",
60
+			value:    *r.Weight,
61
+		})
62
+	}
63
+	if r.Max != "" {
64
+		o = append(o, Value{
65
+			filename: "cpu.max",
66
+			value:    r.Max,
67
+		})
68
+	}
69
+	if r.Cpus != "" {
70
+		o = append(o, Value{
71
+			filename: "cpuset.cpus",
72
+			value:    r.Cpus,
73
+		})
74
+	}
75
+	if r.Mems != "" {
76
+		o = append(o, Value{
77
+			filename: "cpuset.mems",
78
+			value:    r.Mems,
79
+		})
80
+	}
81
+	return o
82
+}
0 83
new file mode 100644
... ...
@@ -0,0 +1,199 @@
0
+/*
1
+   Copyright The containerd Authors.
2
+
3
+   Licensed under the Apache License, Version 2.0 (the "License");
4
+   you may not use this file except in compliance with the License.
5
+   You may obtain a copy of the License at
6
+
7
+       http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+   Unless required by applicable law or agreed to in writing, software
10
+   distributed under the License is distributed on an "AS IS" BASIS,
11
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+   See the License for the specific language governing permissions and
13
+   limitations under the License.
14
+*/
15
+
16
+// Devicefilter contains the eBPF device filter program
17
+//
18
+// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
19
+//
20
+// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano)
21
+// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397
22
+//
23
+// This particular Go implementation based on runc version
24
+// https://github.com/opencontainers/runc/blob/master/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
25
+package v2
26
+
27
+import (
28
+	"fmt"
29
+	"math"
30
+
31
+	"github.com/cilium/ebpf/asm"
32
+	"github.com/opencontainers/runtime-spec/specs-go"
33
+	"github.com/pkg/errors"
34
+	"golang.org/x/sys/unix"
35
+)
36
+
37
+const (
38
+	// license string format is same as kernel MODULE_LICENSE macro
39
+	license = "Apache"
40
+)
41
+
42
+// DeviceFilter returns eBPF device filter program and its license string
43
+func DeviceFilter(devices []specs.LinuxDeviceCgroup) (asm.Instructions, string, error) {
44
+	p := &program{}
45
+	p.init()
46
+	for i := len(devices) - 1; i >= 0; i-- {
47
+		if err := p.appendDevice(devices[i]); err != nil {
48
+			return nil, "", err
49
+		}
50
+	}
51
+	insts, err := p.finalize()
52
+	return insts, license, err
53
+}
54
+
55
+type program struct {
56
+	insts       asm.Instructions
57
+	hasWildCard bool
58
+	blockID     int
59
+}
60
+
61
+func (p *program) init() {
62
+	// struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423
63
+	/*
64
+		u32 access_type
65
+		u32 major
66
+		u32 minor
67
+	*/
68
+	// R2 <- type (lower 16 bit of u32 access_type at R1[0])
69
+	p.insts = append(p.insts,
70
+		asm.LoadMem(asm.R2, asm.R1, 0, asm.Half))
71
+
72
+	// R3 <- access (upper 16 bit of u32 access_type at R1[0])
73
+	p.insts = append(p.insts,
74
+		asm.LoadMem(asm.R3, asm.R1, 0, asm.Word),
75
+		// RSh: bitwise shift right
76
+		asm.RSh.Imm32(asm.R3, 16))
77
+
78
+	// R4 <- major (u32 major at R1[4])
79
+	p.insts = append(p.insts,
80
+		asm.LoadMem(asm.R4, asm.R1, 4, asm.Word))
81
+
82
+	// R5 <- minor (u32 minor at R1[8])
83
+	p.insts = append(p.insts,
84
+		asm.LoadMem(asm.R5, asm.R1, 8, asm.Word))
85
+}
86
+
87
+// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element.
88
+func (p *program) appendDevice(dev specs.LinuxDeviceCgroup) error {
89
+	if p.blockID < 0 {
90
+		return errors.New("the program is finalized")
91
+	}
92
+	if p.hasWildCard {
93
+		// All entries after wildcard entry are ignored
94
+		return nil
95
+	}
96
+
97
+	bpfType := int32(-1)
98
+	hasType := true
99
+	switch dev.Type {
100
+	case string('c'):
101
+		bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
102
+	case string('b'):
103
+		bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
104
+	case string('a'):
105
+		hasType = false
106
+	default:
107
+		// if not specified in OCI json, typ is set to DeviceTypeAll
108
+		return errors.Errorf("invalid DeviceType %q", dev.Type)
109
+	}
110
+	if *dev.Major > math.MaxUint32 {
111
+		return errors.Errorf("invalid major %d", *dev.Major)
112
+	}
113
+	if *dev.Minor > math.MaxUint32 {
114
+		return errors.Errorf("invalid minor %d", *dev.Major)
115
+	}
116
+	hasMajor := *dev.Major >= 0 // if not specified in OCI json, major is set to -1
117
+	hasMinor := *dev.Minor >= 0
118
+	bpfAccess := int32(0)
119
+	for _, r := range dev.Access {
120
+		switch r {
121
+		case 'r':
122
+			bpfAccess |= unix.BPF_DEVCG_ACC_READ
123
+		case 'w':
124
+			bpfAccess |= unix.BPF_DEVCG_ACC_WRITE
125
+		case 'm':
126
+			bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
127
+		default:
128
+			return errors.Errorf("unknown device access %v", r)
129
+		}
130
+	}
131
+	// If the access is rwm, skip the check.
132
+	hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)
133
+
134
+	blockSym := fmt.Sprintf("block-%d", p.blockID)
135
+	nextBlockSym := fmt.Sprintf("block-%d", p.blockID+1)
136
+	prevBlockLastIdx := len(p.insts) - 1
137
+	if hasType {
138
+		p.insts = append(p.insts,
139
+			// if (R2 != bpfType) goto next
140
+			asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
141
+		)
142
+	}
143
+	if hasAccess {
144
+		p.insts = append(p.insts,
145
+			// if (R3 & bpfAccess == 0 /* use R1 as a temp var */) goto next
146
+			asm.Mov.Reg32(asm.R1, asm.R3),
147
+			asm.And.Imm32(asm.R1, bpfAccess),
148
+			asm.JEq.Imm(asm.R1, 0, nextBlockSym),
149
+		)
150
+	}
151
+	if hasMajor {
152
+		p.insts = append(p.insts,
153
+			// if (R4 != major) goto next
154
+			asm.JNE.Imm(asm.R4, int32(*dev.Major), nextBlockSym),
155
+		)
156
+	}
157
+	if hasMinor {
158
+		p.insts = append(p.insts,
159
+			// if (R5 != minor) goto next
160
+			asm.JNE.Imm(asm.R5, int32(*dev.Minor), nextBlockSym),
161
+		)
162
+	}
163
+	if !hasType && !hasAccess && !hasMajor && !hasMinor {
164
+		p.hasWildCard = true
165
+	}
166
+	p.insts = append(p.insts, acceptBlock(dev.Allow)...)
167
+	// set blockSym to the first instruction we added in this iteration
168
+	p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym)
169
+	p.blockID++
170
+	return nil
171
+}
172
+
173
+func (p *program) finalize() (asm.Instructions, error) {
174
+	if p.hasWildCard {
175
+		// acceptBlock with asm.Return() is already inserted
176
+		return p.insts, nil
177
+	}
178
+	blockSym := fmt.Sprintf("block-%d", p.blockID)
179
+	p.insts = append(p.insts,
180
+		// R0 <- 0
181
+		asm.Mov.Imm32(asm.R0, 0).Sym(blockSym),
182
+		asm.Return(),
183
+	)
184
+	p.blockID = -1
185
+	return p.insts, nil
186
+}
187
+
188
+func acceptBlock(accept bool) asm.Instructions {
189
+	v := int32(0)
190
+	if accept {
191
+		v = 1
192
+	}
193
+	return []asm.Instruction{
194
+		// R0 <- v
195
+		asm.Mov.Imm32(asm.R0, v),
196
+		asm.Return(),
197
+	}
198
+}
0 199
new file mode 100644
... ...
@@ -0,0 +1,83 @@
0
+/*
1
+   Copyright The containerd Authors.
2
+
3
+   Licensed under the Apache License, Version 2.0 (the "License");
4
+   you may not use this file except in compliance with the License.
5
+   You may obtain a copy of the License at
6
+
7
+       http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+   Unless required by applicable law or agreed to in writing, software
10
+   distributed under the License is distributed on an "AS IS" BASIS,
11
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+   See the License for the specific language governing permissions and
13
+   limitations under the License.
14
+*/
15
+
16
+package v2
17
+
18
+import (
19
+	"github.com/cilium/ebpf"
20
+	"github.com/cilium/ebpf/asm"
21
+	"github.com/opencontainers/runtime-spec/specs-go"
22
+	"github.com/pkg/errors"
23
+	"golang.org/x/sys/unix"
24
+)
25
+
26
+// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
27
+//
28
+// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
29
+//
30
+// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
31
+func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFD int) (func() error, error) {
32
+	nilCloser := func() error {
33
+		return nil
34
+	}
35
+	spec := &ebpf.ProgramSpec{
36
+		Type:         ebpf.CGroupDevice,
37
+		Instructions: insts,
38
+		License:      license,
39
+	}
40
+	prog, err := ebpf.NewProgram(spec)
41
+	if err != nil {
42
+		return nilCloser, err
43
+	}
44
+	if err := prog.Attach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil {
45
+		return nilCloser, errors.Wrap(err, "failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)")
46
+	}
47
+	closer := func() error {
48
+		if err := prog.Detach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil {
49
+			return errors.Wrap(err, "failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)")
50
+		}
51
+		return nil
52
+	}
53
+	return closer, nil
54
+}
55
+
56
// isRWM reports whether cgroupPermissions contains all three of the
// characters 'r', 'w' and 'm'. Order, repetition and any other characters
// are irrelevant.
func isRWM(cgroupPermissions string) bool {
	const (
		read  = 1 << 0
		write = 1 << 1
		mknod = 1 << 2
		all   = read | write | mknod
	)
	seen := 0
	for _, ch := range cgroupPermissions {
		switch ch {
		case 'r':
			seen |= read
		case 'w':
			seen |= write
		case 'm':
			seen |= mknod
		}
	}
	return seen == all
}
72
+
73
+// the logic is from runc
74
+// https://github.com/opencontainers/runc/blob/master/libcontainer/cgroups/fs/devices_v2.go#L44
75
+func canSkipEBPFError(devices []specs.LinuxDeviceCgroup) bool {
76
+	for _, dev := range devices {
77
+		if dev.Allow || !isRWM(dev.Access) {
78
+			return false
79
+		}
80
+	}
81
+	return true
82
+}
0 83
new file mode 100644
... ...
@@ -0,0 +1,50 @@
0
+/*
1
+   Copyright The containerd Authors.
2
+
3
+   Licensed under the Apache License, Version 2.0 (the "License");
4
+   you may not use this file except in compliance with the License.
5
+   You may obtain a copy of the License at
6
+
7
+       http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+   Unless required by applicable law or agreed to in writing, software
10
+   distributed under the License is distributed on an "AS IS" BASIS,
11
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+   See the License for the specific language governing permissions and
13
+   limitations under the License.
14
+*/
15
+
16
+package v2
17
+
18
+import (
19
+	"errors"
20
+	"os"
21
+)
22
+
23
+var (
24
+	ErrInvalidPid               = errors.New("cgroups: pid must be greater than 0")
25
+	ErrMountPointNotExist       = errors.New("cgroups: cgroup mountpoint does not exist")
26
+	ErrInvalidFormat            = errors.New("cgroups: parsing file with invalid format failed")
27
+	ErrFreezerNotSupported      = errors.New("cgroups: freezer cgroup (v2) not supported on this system")
28
+	ErrMemoryNotSupported       = errors.New("cgroups: memory cgroup (v2) not supported on this system")
29
+	ErrPidsNotSupported         = errors.New("cgroups: pids cgroup (v2) not supported on this system")
30
+	ErrCPUNotSupported          = errors.New("cgroups: cpu cgroup (v2) not supported on this system")
31
+	ErrCgroupDeleted            = errors.New("cgroups: cgroup deleted")
32
+	ErrNoCgroupMountDestination = errors.New("cgroups: cannot find cgroup mount destination")
33
+	ErrInvalidGroupPath         = errors.New("cgroups: invalid group path")
34
+)
35
+
36
+// ErrorHandler is a function that handles and acts on errors
37
+type ErrorHandler func(err error) error
38
+
39
// IgnoreNotExist ignores any errors that are for not existing files: it
// returns nil when os.IsNotExist reports true for err, and err unchanged
// otherwise.
func IgnoreNotExist(err error) error {
	if !os.IsNotExist(err) {
		return err
	}
	return nil
}
46
+
47
+func errPassthrough(err error) error {
48
+	return err
49
+}
0 50
new file mode 100644
... ...
@@ -0,0 +1,37 @@
0
+/*
1
+   Copyright The containerd Authors.
2
+
3
+   Licensed under the Apache License, Version 2.0 (the "License");
4
+   you may not use this file except in compliance with the License.
5
+   You may obtain a copy of the License at
6
+
7
+       http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+   Unless required by applicable law or agreed to in writing, software
10
+   distributed under the License is distributed on an "AS IS" BASIS,
11
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+   See the License for the specific language governing permissions and
13
+   limitations under the License.
14
+*/
15
+
16
+package v2
17
+
18
+import "strings"
19
+
20
+type HugeTlb []HugeTlbEntry
21
+
22
+type HugeTlbEntry struct {
23
+	HugePageSize string
24
+	Limit        uint64
25
+}
26
+
27
+func (r *HugeTlb) Values() (o []Value) {
28
+	for _, e := range *r {
29
+		o = append(o, Value{
30
+			filename: strings.Join([]string{"hugetlb", e.HugePageSize, "max"}, "."),
31
+			value:    e.Limit,
32
+		})
33
+	}
34
+
35
+	return o
36
+}
0 37
new file mode 100644
... ...
@@ -0,0 +1,64 @@
0
+/*
1
+   Copyright The containerd Authors.
2
+
3
+   Licensed under the Apache License, Version 2.0 (the "License");
4
+   you may not use this file except in compliance with the License.
5
+   You may obtain a copy of the License at
6
+
7
+       http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+   Unless required by applicable law or agreed to in writing, software
10
+   distributed under the License is distributed on an "AS IS" BASIS,
11
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+   See the License for the specific language governing permissions and
13
+   limitations under the License.
14
+*/
15
+
16
+package v2
17
+
18
+import "fmt"
19
+
20
+type IOType string
21
+
22
+const (
23
+	ReadBPS   IOType = "rbps"
24
+	WriteBPS  IOType = "wbps"
25
+	ReadIOPS  IOType = "riops"
26
+	WriteIOPS IOType = "wiops"
27
+)
28
+
29
+type BFQ struct {
30
+	Weight uint16
31
+}
32
+
33
+type Entry struct {
34
+	Type  IOType
35
+	Major int64
36
+	Minor int64
37
+	Rate  uint64
38
+}
39
+
40
+func (e Entry) String() string {
41
+	return fmt.Sprintf("%d:%d %s=%d", e.Major, e.Minor, e.Type, e.Rate)
42
+}
43
+
44
+type IO struct {
45
+	BFQ BFQ
46
+	Max []Entry
47
+}
48
+
49
+func (i *IO) Values() (o []Value) {
50
+	if i.BFQ.Weight != 0 {
51
+		o = append(o, Value{
52
+			filename: "io.bfq.weight",
53
+			value:    i.BFQ.Weight,
54
+		})
55
+	}
56
+	for _, e := range i.Max {
57
+		o = append(o, Value{
58
+			filename: "io.max",
59
+			value:    e.String(),
60
+		})
61
+	}
62
+	return o
63
+}
0 64
new file mode 100644
... ...
@@ -0,0 +1,739 @@
0
+/*
1
+   Copyright The containerd Authors.
2
+
3
+   Licensed under the Apache License, Version 2.0 (the "License");
4
+   you may not use this file except in compliance with the License.
5
+   You may obtain a copy of the License at
6
+
7
+       http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+   Unless required by applicable law or agreed to in writing, software
10
+   distributed under the License is distributed on an "AS IS" BASIS,
11
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+   See the License for the specific language governing permissions and
13
+   limitations under the License.
14
+*/
15
+
16
+package v2
17
+
18
+import (
19
+	"bufio"
20
+	"fmt"
21
+	"io/ioutil"
22
+	"math"
23
+	"os"
24
+	"path/filepath"
25
+	"strconv"
26
+	"strings"
27
+	"sync"
28
+	"syscall"
29
+	"time"
30
+
31
+	"golang.org/x/sys/unix"
32
+
33
+	"github.com/containerd/cgroups/v2/stats"
34
+	"github.com/godbus/dbus/v5"
35
+	"github.com/opencontainers/runtime-spec/specs-go"
36
+	"github.com/pkg/errors"
37
+	"github.com/sirupsen/logrus"
38
+
39
+	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
40
+)
41
+
42
+const (
43
+	subtreeControl     = "cgroup.subtree_control"
44
+	controllersFile    = "cgroup.controllers"
45
+	defaultCgroup2Path = "/sys/fs/cgroup"
46
+	defaultSlice       = "system.slice"
47
+)
48
+
49
+var (
50
+	canDelegate bool
51
+	once        sync.Once
52
+)
53
+
54
+type cgValuer interface {
55
+	Values() []Value
56
+}
57
+
58
+type Event struct {
59
+	Low     uint64
60
+	High    uint64
61
+	Max     uint64
62
+	OOM     uint64
63
+	OOMKill uint64
64
+}
65
+
66
+// Resources for a cgroups v2 unified hierarchy
67
+type Resources struct {
68
+	CPU     *CPU
69
+	Memory  *Memory
70
+	Pids    *Pids
71
+	IO      *IO
72
+	RDMA    *RDMA
73
+	HugeTlb *HugeTlb
74
+	// When len(Devices) is zero, devices are not controlled
75
+	Devices []specs.LinuxDeviceCgroup
76
+}
77
+
78
+// Values returns the raw filenames and values that
79
+// can be written to the unified hierarchy
80
+func (r *Resources) Values() (o []Value) {
81
+	if r.CPU != nil {
82
+		o = append(o, r.CPU.Values()...)
83
+	}
84
+	if r.Memory != nil {
85
+		o = append(o, r.Memory.Values()...)
86
+	}
87
+	if r.Pids != nil {
88
+		o = append(o, r.Pids.Values()...)
89
+	}
90
+	if r.IO != nil {
91
+		o = append(o, r.IO.Values()...)
92
+	}
93
+	if r.RDMA != nil {
94
+		o = append(o, r.RDMA.Values()...)
95
+	}
96
+	if r.HugeTlb != nil {
97
+		o = append(o, r.HugeTlb.Values()...)
98
+	}
99
+	return o
100
+}
101
+
102
+// EnabledControllers returns the list of all not nil resource controllers
103
+func (r *Resources) EnabledControllers() (c []string) {
104
+	if r.CPU != nil {
105
+		c = append(c, "cpu")
106
+		c = append(c, "cpuset")
107
+	}
108
+	if r.Memory != nil {
109
+		c = append(c, "memory")
110
+	}
111
+	if r.Pids != nil {
112
+		c = append(c, "pids")
113
+	}
114
+	if r.IO != nil {
115
+		c = append(c, "io")
116
+	}
117
+	if r.RDMA != nil {
118
+		c = append(c, "rdma")
119
+	}
120
+	if r.HugeTlb != nil {
121
+		c = append(c, "hugetlb")
122
+	}
123
+	return
124
+}
125
+
126
+// Value of a cgroup setting
127
+type Value struct {
128
+	filename string
129
+	value    interface{}
130
+}
131
+
132
+// write the value to the full, absolute path, of a unified hierarchy
133
+func (c *Value) write(path string, perm os.FileMode) error {
134
+	var data []byte
135
+	switch t := c.value.(type) {
136
+	case uint64:
137
+		data = []byte(strconv.FormatUint(t, 10))
138
+	case uint16:
139
+		data = []byte(strconv.FormatUint(uint64(t), 10))
140
+	case int64:
141
+		data = []byte(strconv.FormatInt(t, 10))
142
+	case []byte:
143
+		data = t
144
+	case string:
145
+		data = []byte(t)
146
+	case CPUMax:
147
+		data = []byte(t)
148
+	default:
149
+		return ErrInvalidFormat
150
+	}
151
+	return ioutil.WriteFile(
152
+		filepath.Join(path, c.filename),
153
+		data,
154
+		perm,
155
+	)
156
+}
157
+
158
+func writeValues(path string, values []Value) error {
159
+	for _, o := range values {
160
+		if err := o.write(path, defaultFilePerm); err != nil {
161
+			return err
162
+		}
163
+	}
164
+	return nil
165
+}
166
+
167
+func NewManager(mountpoint string, group string, resources *Resources) (*Manager, error) {
168
+	if err := VerifyGroupPath(group); err != nil {
169
+		return nil, err
170
+	}
171
+	path := filepath.Join(mountpoint, group)
172
+	if err := os.MkdirAll(path, defaultDirPerm); err != nil {
173
+		return nil, err
174
+	}
175
+	m := Manager{
176
+		unifiedMountpoint: mountpoint,
177
+		path:              path,
178
+	}
179
+	if err := m.ToggleControllers(resources.EnabledControllers(), Enable); err != nil {
180
+		// clean up cgroup dir on failure
181
+		os.Remove(path)
182
+		return nil, err
183
+	}
184
+	if err := setResources(path, resources); err != nil {
185
+		os.Remove(path)
186
+		return nil, err
187
+	}
188
+	return &m, nil
189
+}
190
+
191
+func LoadManager(mountpoint string, group string) (*Manager, error) {
192
+	if err := VerifyGroupPath(group); err != nil {
193
+		return nil, err
194
+	}
195
+	path := filepath.Join(mountpoint, group)
196
+	return &Manager{
197
+		unifiedMountpoint: mountpoint,
198
+		path:              path,
199
+	}, nil
200
+}
201
+
202
+type Manager struct {
203
+	unifiedMountpoint string
204
+	path              string
205
+}
206
+
207
+func setResources(path string, resources *Resources) error {
208
+	if resources != nil {
209
+		if err := writeValues(path, resources.Values()); err != nil {
210
+			return err
211
+		}
212
+		if err := setDevices(path, resources.Devices); err != nil {
213
+			return err
214
+		}
215
+	}
216
+	return nil
217
+}
218
+
219
+func (c *Manager) RootControllers() ([]string, error) {
220
+	b, err := ioutil.ReadFile(filepath.Join(c.unifiedMountpoint, controllersFile))
221
+	if err != nil {
222
+		return nil, err
223
+	}
224
+	return strings.Fields(string(b)), nil
225
+}
226
+
227
+func (c *Manager) Controllers() ([]string, error) {
228
+	b, err := ioutil.ReadFile(filepath.Join(c.path, controllersFile))
229
+	if err != nil {
230
+		return nil, err
231
+	}
232
+	return strings.Fields(string(b)), nil
233
+}
234
+
235
+type ControllerToggle int
236
+
237
+const (
238
+	Enable ControllerToggle = iota + 1
239
+	Disable
240
+)
241
+
242
// toggleFunc returns a new slice in which every controller name is preceded
// by prefix. The input slice is not modified, and the result is never nil.
func toggleFunc(controllers []string, prefix string) []string {
	prefixed := make([]string, 0, len(controllers))
	for _, name := range controllers {
		prefixed = append(prefixed, prefix+name)
	}
	return prefixed
}
249
+
250
+func (c *Manager) ToggleControllers(controllers []string, t ControllerToggle) error {
251
+	// when c.path is like /foo/bar/baz, the following files need to be written:
252
+	// * /sys/fs/cgroup/cgroup.subtree_control
253
+	// * /sys/fs/cgroup/foo/cgroup.subtree_control
254
+	// * /sys/fs/cgroup/foo/bar/cgroup.subtree_control
255
+	// Note that /sys/fs/cgroup/foo/bar/baz/cgroup.subtree_control does not need to be written.
256
+	split := strings.Split(c.path, "/")
257
+	var lastErr error
258
+	for i, _ := range split {
259
+		f := strings.Join(split[:i], "/")
260
+		if !strings.HasPrefix(f, c.unifiedMountpoint) || f == c.path {
261
+			continue
262
+		}
263
+		filePath := filepath.Join(f, subtreeControl)
264
+		if err := c.writeSubtreeControl(filePath, controllers, t); err != nil {
265
+			// When running as rootless, the user may face EPERM on parent groups, but it is neglible when the
266
+			// controller is already written.
267
+			// So we only return the last error.
268
+			lastErr = errors.Wrapf(err, "failed to write subtree controllers %+v to %q", controllers, filePath)
269
+		}
270
+	}
271
+	return lastErr
272
+}
273
+
274
+func (c *Manager) writeSubtreeControl(filePath string, controllers []string, t ControllerToggle) error {
275
+	f, err := os.OpenFile(filePath, os.O_WRONLY, 0)
276
+	if err != nil {
277
+		return err
278
+	}
279
+	defer f.Close()
280
+	switch t {
281
+	case Enable:
282
+		controllers = toggleFunc(controllers, "+")
283
+	case Disable:
284
+		controllers = toggleFunc(controllers, "-")
285
+	}
286
+	_, err = f.WriteString(strings.Join(controllers, " "))
287
+	return err
288
+}
289
+
290
+func (c *Manager) NewChild(name string, resources *Resources) (*Manager, error) {
291
+	if strings.HasPrefix(name, "/") {
292
+		return nil, errors.New("name must be relative")
293
+	}
294
+	path := filepath.Join(c.path, name)
295
+	if err := os.MkdirAll(path, defaultDirPerm); err != nil {
296
+		return nil, err
297
+	}
298
+	if err := setResources(path, resources); err != nil {
299
+		// clean up cgroup dir on failure
300
+		os.Remove(path)
301
+		return nil, err
302
+	}
303
+	return &Manager{
304
+		unifiedMountpoint: c.unifiedMountpoint,
305
+		path:              path,
306
+	}, nil
307
+}
308
+
309
+func (c *Manager) AddProc(pid uint64) error {
310
+	v := Value{
311
+		filename: cgroupProcs,
312
+		value:    pid,
313
+	}
314
+	return writeValues(c.path, []Value{v})
315
+}
316
+
317
+func (c *Manager) Delete() error {
318
+	return remove(c.path)
319
+}
320
+
321
+func (c *Manager) Procs(recursive bool) ([]uint64, error) {
322
+	var processes []uint64
323
+	err := filepath.Walk(c.path, func(p string, info os.FileInfo, err error) error {
324
+		if err != nil {
325
+			return err
326
+		}
327
+		if !recursive && info.IsDir() {
328
+			if p == c.path {
329
+				return nil
330
+			}
331
+			return filepath.SkipDir
332
+		}
333
+		_, name := filepath.Split(p)
334
+		if name != cgroupProcs {
335
+			return nil
336
+		}
337
+		procs, err := parseCgroupProcsFile(p)
338
+		if err != nil {
339
+			return err
340
+		}
341
+		processes = append(processes, procs...)
342
+		return nil
343
+	})
344
+	return processes, err
345
+}
346
+
347
// singleValueFiles names the files that Stat reads with readSingleFile.
var singleValueFiles = []string{"pids.current", "pids.max"}
351
+
352
+func (c *Manager) Stat() (*stats.Metrics, error) {
353
+	controllers, err := c.Controllers()
354
+	if err != nil {
355
+		return nil, err
356
+	}
357
+	out := make(map[string]interface{})
358
+	for _, controller := range controllers {
359
+		switch controller {
360
+		case "cpu", "memory":
361
+			filename := fmt.Sprintf("%s.stat", controller)
362
+			if err := readKVStatsFile(c.path, filename, out); err != nil {
363
+				if os.IsNotExist(err) {
364
+					continue
365
+				}
366
+				return nil, err
367
+			}
368
+		}
369
+	}
370
+	for _, name := range singleValueFiles {
371
+		if err := readSingleFile(c.path, name, out); err != nil {
372
+			if os.IsNotExist(err) {
373
+				continue
374
+			}
375
+			return nil, err
376
+		}
377
+	}
378
+	var metrics stats.Metrics
379
+
380
+	metrics.Pids = &stats.PidsStat{
381
+		Current: getPidValue("pids.current", out),
382
+		Limit:   getPidValue("pids.max", out),
383
+	}
384
+	metrics.CPU = &stats.CPUStat{
385
+		UsageUsec:     getUint64Value("usage_usec", out),
386
+		UserUsec:      getUint64Value("user_usec", out),
387
+		SystemUsec:    getUint64Value("system_usec", out),
388
+		NrPeriods:     getUint64Value("nr_periods", out),
389
+		NrThrottled:   getUint64Value("nr_throttled", out),
390
+		ThrottledUsec: getUint64Value("throttled_usec", out),
391
+	}
392
+	metrics.Memory = &stats.MemoryStat{
393
+		Anon:                  getUint64Value("anon", out),
394
+		File:                  getUint64Value("file", out),
395
+		KernelStack:           getUint64Value("kernel_stack", out),
396
+		Slab:                  getUint64Value("slab", out),
397
+		Sock:                  getUint64Value("sock", out),
398
+		Shmem:                 getUint64Value("shmem", out),
399
+		FileMapped:            getUint64Value("file_mapped", out),
400
+		FileDirty:             getUint64Value("file_dirty", out),
401
+		FileWriteback:         getUint64Value("file_writeback", out),
402
+		AnonThp:               getUint64Value("anon_thp", out),
403
+		InactiveAnon:          getUint64Value("inactive_anon", out),
404
+		ActiveAnon:            getUint64Value("active_anon", out),
405
+		InactiveFile:          getUint64Value("inactive_file", out),
406
+		ActiveFile:            getUint64Value("active_file", out),
407
+		Unevictable:           getUint64Value("unevictable", out),
408
+		SlabReclaimable:       getUint64Value("slab_reclaimable", out),
409
+		SlabUnreclaimable:     getUint64Value("slab_unreclaimable", out),
410
+		Pgfault:               getUint64Value("pgfault", out),
411
+		Pgmajfault:            getUint64Value("pgmajfault", out),
412
+		WorkingsetRefault:     getUint64Value("workingset_refault", out),
413
+		WorkingsetActivate:    getUint64Value("workingset_activate", out),
414
+		WorkingsetNodereclaim: getUint64Value("workingset_nodereclaim", out),
415
+		Pgrefill:              getUint64Value("pgrefill", out),
416
+		Pgscan:                getUint64Value("pgscan", out),
417
+		Pgsteal:               getUint64Value("pgsteal", out),
418
+		Pgactivate:            getUint64Value("pgactivate", out),
419
+		Pgdeactivate:          getUint64Value("pgdeactivate", out),
420
+		Pglazyfree:            getUint64Value("pglazyfree", out),
421
+		Pglazyfreed:           getUint64Value("pglazyfreed", out),
422
+		ThpFaultAlloc:         getUint64Value("thp_fault_alloc", out),
423
+		ThpCollapseAlloc:      getUint64Value("thp_collapse_alloc", out),
424
+		Usage:                 getStatFileContentUint64(filepath.Join(c.path, "memory.current")),
425
+		UsageLimit:            getStatFileContentUint64(filepath.Join(c.path, "memory.max")),
426
+		SwapUsage:             getStatFileContentUint64(filepath.Join(c.path, "memory.swap.current")),
427
+		SwapLimit:             getStatFileContentUint64(filepath.Join(c.path, "memory.swap.max")),
428
+	}
429
+
430
+	metrics.Io = &stats.IOStat{Usage: readIoStats(c.path)}
431
+	metrics.Rdma = &stats.RdmaStat{
432
+		Current: rdmaStats(filepath.Join(c.path, "rdma.current")),
433
+		Limit:   rdmaStats(filepath.Join(c.path, "rdma.max")),
434
+	}
435
+	metrics.Hugetlb = readHugeTlbStats(c.path)
436
+
437
+	return &metrics, nil
438
+}
439
+
440
// getUint64Value returns out[key] when it is present and holds a uint64.
// A missing key or a value of any other type yields 0.
func getUint64Value(key string, out map[string]interface{}) uint64 {
	if n, ok := out[key].(uint64); ok {
		return n
	}
	return 0
}
451
+
452
// getPidValue returns out[key] when it holds a uint64, math.MaxUint64 when it
// holds the string "max", and 0 in every other case (including a missing key).
func getPidValue(key string, out map[string]interface{}) uint64 {
	switch v := out[key].(type) {
	case uint64:
		return v
	case string:
		if v == "max" {
			return math.MaxUint64
		}
	}
	return 0
}
467
+
468
+func readSingleFile(path string, file string, out map[string]interface{}) error {
469
+	f, err := os.Open(filepath.Join(path, file))
470
+	if err != nil {
471
+		return err
472
+	}
473
+	defer f.Close()
474
+	data, err := ioutil.ReadAll(f)
475
+	if err != nil {
476
+		return err
477
+	}
478
+	s := strings.TrimSpace(string(data))
479
+	v, err := parseUint(s, 10, 64)
480
+	if err != nil {
481
+		// if we cannot parse as a uint, parse as a string
482
+		out[file] = s
483
+		return nil
484
+	}
485
+	out[file] = v
486
+	return nil
487
+}
488
+
489
+func readKVStatsFile(path string, file string, out map[string]interface{}) error {
490
+	f, err := os.Open(filepath.Join(path, file))
491
+	if err != nil {
492
+		return err
493
+	}
494
+	defer f.Close()
495
+
496
+	s := bufio.NewScanner(f)
497
+	for s.Scan() {
498
+		if err := s.Err(); err != nil {
499
+			return err
500
+		}
501
+		name, value, err := parseKV(s.Text())
502
+		if err != nil {
503
+			return errors.Wrapf(err, "error while parsing %s (line=%q)", filepath.Join(path, file), s.Text())
504
+		}
505
+		out[name] = value
506
+	}
507
+	return nil
508
+}
509
+
510
+func (c *Manager) Freeze() error {
511
+	return c.freeze(c.path, Frozen)
512
+}
513
+
514
+func (c *Manager) Thaw() error {
515
+	return c.freeze(c.path, Thawed)
516
+}
517
+
518
+func (c *Manager) freeze(path string, state State) error {
519
+	values := state.Values()
520
+	for {
521
+		if err := writeValues(path, values); err != nil {
522
+			return err
523
+		}
524
+		current, err := fetchState(path)
525
+		if err != nil {
526
+			return err
527
+		}
528
+		if current == state {
529
+			return nil
530
+		}
531
+		time.Sleep(1 * time.Millisecond)
532
+	}
533
+}
534
+
535
+// MemoryEventFD returns inotify file descriptor and 'memory.events' inotify watch descriptor
536
+func (c *Manager) MemoryEventFD() (int, uint32, error) {
537
+	fpath := filepath.Join(c.path, "memory.events")
538
+	fd, err := syscall.InotifyInit()
539
+	if err != nil {
540
+		return 0, 0, errors.Errorf("Failed to create inotify fd")
541
+	}
542
+	wd, err := syscall.InotifyAddWatch(fd, fpath, unix.IN_MODIFY)
543
+	if wd < 0 {
544
+		syscall.Close(fd)
545
+		return 0, 0, errors.Errorf("Failed to add inotify watch for %q", fpath)
546
+	}
547
+
548
+	return fd, uint32(wd), nil
549
+}
550
+
551
+func (c *Manager) EventChan() (<-chan Event, <-chan error) {
552
+	ec := make(chan Event)
553
+	errCh := make(chan error)
554
+	go c.waitForEvents(ec, errCh)
555
+
556
+	return ec, nil
557
+}
558
+
559
+func (c *Manager) waitForEvents(ec chan<- Event, errCh chan<- error) {
560
+	fd, wd, err := c.MemoryEventFD()
561
+
562
+	defer syscall.InotifyRmWatch(fd, wd)
563
+	defer syscall.Close(fd)
564
+
565
+	if err != nil {
566
+		errCh <- err
567
+		return
568
+	}
569
+
570
+	for {
571
+		buffer := make([]byte, syscall.SizeofInotifyEvent*10)
572
+		bytesRead, err := syscall.Read(fd, buffer)
573
+		if err != nil {
574
+			errCh <- err
575
+			return
576
+		}
577
+		var out map[string]interface{}
578
+		if bytesRead >= syscall.SizeofInotifyEvent {
579
+			if err := readKVStatsFile(c.path, "memory.events", out); err != nil {
580
+				e := Event{
581
+					High:    out["high"].(uint64),
582
+					Low:     out["low"].(uint64),
583
+					Max:     out["max"].(uint64),
584
+					OOM:     out["oom"].(uint64),
585
+					OOMKill: out["oom_kill"].(uint64),
586
+				}
587
+				ec <- e
588
+			} else {
589
+				errCh <- err
590
+				return
591
+			}
592
+		}
593
+	}
594
+}
595
+
596
+func setDevices(path string, devices []specs.LinuxDeviceCgroup) error {
597
+	if len(devices) == 0 {
598
+		return nil
599
+	}
600
+	insts, license, err := DeviceFilter(devices)
601
+	if err != nil {
602
+		return err
603
+	}
604
+	dirFD, err := unix.Open(path, unix.O_DIRECTORY|unix.O_RDONLY, 0600)
605
+	if err != nil {
606
+		return errors.Errorf("cannot get dir FD for %s", path)
607
+	}
608
+	defer unix.Close(dirFD)
609
+	if _, err := LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
610
+		if !canSkipEBPFError(devices) {
611
+			return err
612
+		}
613
+	}
614
+	return nil
615
+}
616
+
617
+func NewSystemd(slice, group string, pid int, resources *Resources) (*Manager, error) {
618
+	if slice == "" {
619
+		slice = defaultSlice
620
+	}
621
+	path := filepath.Join(defaultCgroup2Path, slice, group)
622
+	conn, err := systemdDbus.New()
623
+	if err != nil {
624
+		return &Manager{}, err
625
+	}
626
+	defer conn.Close()
627
+
628
+	properties := []systemdDbus.Property{
629
+		systemdDbus.PropDescription(fmt.Sprintf("cgroup %s", group)),
630
+		newSystemdProperty("DefaultDependencies", false),
631
+		newSystemdProperty("MemoryAccounting", true),
632
+		newSystemdProperty("CPUAccounting", true),
633
+		newSystemdProperty("IOAccounting", true),
634
+	}
635
+
636
+	// if we create a slice, the parent is defined via a Wants=
637
+	if strings.HasSuffix(group, ".slice") {
638
+		properties = append(properties, systemdDbus.PropWants(defaultSlice))
639
+	} else {
640
+		// otherwise, we use Slice=
641
+		properties = append(properties, systemdDbus.PropSlice(defaultSlice))
642
+	}
643
+
644
+	// only add pid if its valid, -1 is used w/ general slice creation.
645
+	if pid != -1 {
646
+		properties = append(properties, newSystemdProperty("PIDs", []uint32{uint32(pid)}))
647
+	}
648
+
649
+	if resources.Memory != nil && *resources.Memory.Max != 0 {
650
+		properties = append(properties,
651
+			newSystemdProperty("MemoryMax", uint64(*resources.Memory.Max)))
652
+	}
653
+
654
+	if resources.CPU != nil && *resources.CPU.Weight != 0 {
655
+		properties = append(properties,
656
+			newSystemdProperty("CPUWeight", *resources.CPU.Weight))
657
+	}
658
+
659
+	if resources.CPU != nil && resources.CPU.Max != "" {
660
+		quota, period := resources.CPU.Max.extractQuotaAndPeriod()
661
+		// cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
662
+		// corresponds to USEC_INFINITY in systemd
663
+		// if USEC_INFINITY is provided, CPUQuota is left unbound by systemd
664
+		// always setting a property value ensures we can apply a quota and remove it later
665
+		cpuQuotaPerSecUSec := uint64(math.MaxUint64)
666
+		if quota > 0 {
667
+			// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
668
+			// (integer percentage of CPU) internally.  This means that if a fractional percent of
669
+			// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
670
+			// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
671
+			cpuQuotaPerSecUSec = uint64(quota*1000000) / period
672
+			if cpuQuotaPerSecUSec%10000 != 0 {
673
+				cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
674
+			}
675
+		}
676
+		properties = append(properties,
677
+			newSystemdProperty("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
678
+	}
679
+
680
+	// If we can delegate, we add the property back in
681
+	if canDelegate {
682
+		properties = append(properties, newSystemdProperty("Delegate", true))
683
+	}
684
+
685
+	if resources.Pids != nil && resources.Pids.Max > 0 {
686
+		properties = append(properties,
687
+			newSystemdProperty("TasksAccounting", true),
688
+			newSystemdProperty("TasksMax", uint64(resources.Pids.Max)))
689
+	}
690
+
691
+	statusChan := make(chan string, 1)
692
+	if _, err := conn.StartTransientUnit(group, "replace", properties, statusChan); err == nil {
693
+		select {
694
+		case <-statusChan:
695
+		case <-time.After(time.Second):
696
+			logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", group)
697
+		}
698
+	} else if !isUnitExists(err) {
699
+		return &Manager{}, err
700
+	}
701
+
702
+	return &Manager{
703
+		path: path,
704
+	}, nil
705
+}
706
+
707
+func LoadSystemd(slice, group string) (*Manager, error) {
708
+	if slice == "" {
709
+		slice = defaultSlice
710
+	}
711
+	group = filepath.Join(defaultCgroup2Path, slice, group)
712
+	return &Manager{
713
+		path: group,
714
+	}, nil
715
+}
716
+
717
+func (c *Manager) DeleteSystemd() error {
718
+	conn, err := systemdDbus.New()
719
+	if err != nil {
720
+		return err
721
+	}
722
+	defer conn.Close()
723
+	group := systemdUnitFromPath(c.path)
724
+	ch := make(chan string)
725
+	_, err = conn.StopUnit(group, "replace", ch)
726
+	if err != nil {
727
+		return err
728
+	}
729
+	<-ch
730
+	return nil
731
+}
732
+
733
+func newSystemdProperty(name string, units interface{}) systemdDbus.Property {
734
+	return systemdDbus.Property{
735
+		Name:  name,
736
+		Value: dbus.MakeVariant(units),
737
+	}
738
+}
0 739
new file mode 100644
... ...
@@ -0,0 +1,52 @@
0
+/*
1
+   Copyright The containerd Authors.
2
+
3
+   Licensed under the Apache License, Version 2.0 (the "License");
4
+   you may not use this file except in compliance with the License.
5
+   You may obtain a copy of the License at
6
+
7
+       http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+   Unless required by applicable law or agreed to in writing, software
10
+   distributed under the License is distributed on an "AS IS" BASIS,
11
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+   See the License for the specific language governing permissions and
13
+   limitations under the License.
14
+*/
15
+
16
+package v2
17
+
18
+type Memory struct {
19
+	Swap *int64
20
+	Max  *int64
21
+	Low  *int64
22
+	High *int64
23
+}
24
+
25
+func (r *Memory) Values() (o []Value) {
26
+	if r.Swap != nil {
27
+		o = append(o, Value{
28
+			filename: "memory.swap.max",
29
+			value:    *r.Swap,
30
+		})
31
+	}
32
+	if r.Max != nil {
33
+		o = append(o, Value{
34
+			filename: "memory.max",
35
+			value:    *r.Max,
36
+		})
37
+	}
38
+	if r.Low != nil {
39
+		o = append(o, Value{
40
+			filename: "memory.low",
41
+			value:    *r.Low,
42
+		})
43
+	}
44
+	if r.High != nil {
45
+		o = append(o, Value{
46
+			filename: "memory.high",
47
+			value:    *r.High,
48
+		})
49
+	}
50
+	return o
51
+}
0 52
new file mode 100644
... ...
@@ -0,0 +1,60 @@
0
+/*
1
+   Copyright The containerd Authors.
2
+
3
+   Licensed under the Apache License, Version 2.0 (the "License");
4
+   you may not use this file except in compliance with the License.
5
+   You may obtain a copy of the License at
6
+
7
+       http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+   Unless required by applicable law or agreed to in writing, software
10
+   distributed under the License is distributed on an "AS IS" BASIS,
11
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+   See the License for the specific language governing permissions and
13
+   limitations under the License.
14
+*/
15
+
16
+package v2
17
+
18
+import (
19
+	"fmt"
20
+	"path/filepath"
21
+	"strings"
22
+)
23
+
24
+// NestedGroupPath will nest the cgroups based on the calling processes cgroup
25
+// placing its child processes inside its own path
26
+func NestedGroupPath(suffix string) (string, error) {
27
+	path, err := parseCgroupFile("/proc/self/cgroup")
28
+	if err != nil {
29
+		return "", err
30
+	}
31
+	return filepath.Join(string(path), suffix), nil
32
+}
33
+
34
+// PidGroupPath will return the correct cgroup paths for an existing process running inside a cgroup
35
+// This is commonly used for the Load function to restore an existing container
36
+func PidGroupPath(pid int) (string, error) {
37
+	p := fmt.Sprintf("/proc/%d/cgroup", pid)
38
+	return parseCgroupFile(p)
39
+}
40
+
41
+// VerifyGroupPath verifies the format of group path string g.
42
+// The format is same as the third field in /proc/PID/cgroup.
43
+// e.g. "/user.slice/user-1001.slice/session-1.scope"
44
+//
45
+// g must be a "clean" absolute path starts with "/", and must not contain "/sys/fs/cgroup" prefix.
46
+//
47
+// VerifyGroupPath doesn't verify whether g actually exists on the system.
48
+func VerifyGroupPath(g string) error {
49
+	if !strings.HasPrefix(g, "/") {
50
+		return ErrInvalidGroupPath
51
+	}
52
+	if filepath.Clean(g) != g {
53
+		return ErrInvalidGroupPath
54
+	}
55
+	if strings.HasPrefix(g, "/sys/fs/cgroup") {
56
+		return ErrInvalidGroupPath
57
+	}
58
+	return nil
59
+}
0 60
new file mode 100644
... ...
@@ -0,0 +1,37 @@
0
+/*
1
+   Copyright The containerd Authors.
2
+
3
+   Licensed under the Apache License, Version 2.0 (the "License");
4
+   you may not use this file except in compliance with the License.
5
+   You may obtain a copy of the License at
6
+
7
+       http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+   Unless required by applicable law or agreed to in writing, software
10
+   distributed under the License is distributed on an "AS IS" BASIS,
11
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+   See the License for the specific language governing permissions and
13
+   limitations under the License.
14
+*/
15
+
16
+package v2
17
+
18
+import "strconv"
19
+
20
+type Pids struct {
21
+	Max int64
22
+}
23
+
24
+func (r *Pids) Values() (o []Value) {
25
+	if r.Max != 0 {
26
+		limit := "max"
27
+		if r.Max > 0 {
28
+			limit = strconv.FormatInt(r.Max, 10)
29
+		}
30
+		o = append(o, Value{
31
+			filename: "pids.max",
32
+			value:    limit,
33
+		})
34
+	}
35
+	return o
36
+}
0 37
new file mode 100644
... ...
@@ -0,0 +1,46 @@
0
+/*
1
+   Copyright The containerd Authors.
2
+
3
+   Licensed under the Apache License, Version 2.0 (the "License");
4
+   you may not use this file except in compliance with the License.
5
+   You may obtain a copy of the License at
6
+
7
+       http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+   Unless required by applicable law or agreed to in writing, software
10
+   distributed under the License is distributed on an "AS IS" BASIS,
11
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+   See the License for the specific language governing permissions and
13
+   limitations under the License.
14
+*/
15
+
16
+package v2
17
+
18
+import (
19
+	"fmt"
20
+)
21
+
22
// RDMA holds the RDMA limits of a cgroup, one entry per device.
type RDMA struct {
	Limit []RDMAEntry
}

// RDMAEntry is the limit for a single RDMA device.
type RDMAEntry struct {
	// Device is the device name that starts the formatted line.
	Device string
	// HcaHandles is formatted as hca_handle=<n>.
	HcaHandles uint32
	// HcaObjects is formatted as hca_object=<n>.
	HcaObjects uint32
}

// String formats the entry as "<device> hca_handle=<n> hca_object=<n>".
func (r RDMAEntry) String() string {
	const layout = "%s hca_handle=%d hca_object=%d"
	return fmt.Sprintf(layout, r.Device, r.HcaHandles, r.HcaObjects)
}
35
+
36
+func (r *RDMA) Values() (o []Value) {
37
+	for _, e := range r.Limit {
38
+		o = append(o, Value{
39
+			filename: "rdma.max",
40
+			value:    e.String(),
41
+		})
42
+	}
43
+
44
+	return o
45
+}
0 46
new file mode 100644
... ...
@@ -0,0 +1,65 @@
0
+/*
1
+   Copyright The containerd Authors.
2
+
3
+   Licensed under the Apache License, Version 2.0 (the "License");
4
+   you may not use this file except in compliance with the License.
5
+   You may obtain a copy of the License at
6
+
7
+       http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+   Unless required by applicable law or agreed to in writing, software
10
+   distributed under the License is distributed on an "AS IS" BASIS,
11
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+   See the License for the specific language governing permissions and
13
+   limitations under the License.
14
+*/
15
+
16
+package v2
17
+
18
+import (
19
+	"io/ioutil"
20
+	"path/filepath"
21
+	"strings"
22
+)
23
+
24
// State describes the freezer state of a cgroup.
type State string

const (
	// Unknown is the zero State; fetchState reports it for unrecognised content.
	Unknown State = ""
	// Thawed corresponds to "0" in the cgroup.freeze file.
	Thawed State = "thawed"
	// Frozen corresponds to "1" in the cgroup.freeze file.
	Frozen State = "frozen"
	// Deleted is declared alongside the freezer states; nothing in this file sets it.
	Deleted State = "deleted"

	// cgroupFreeze is the name of the file used to read and change the state.
	cgroupFreeze = "cgroup.freeze"
)
35
+
36
+func (s State) Values() []Value {
37
+	v := Value{
38
+		filename: cgroupFreeze,
39
+	}
40
+	switch s {
41
+	case Frozen:
42
+		v.value = "1"
43
+	case Thawed:
44
+		v.value = "0"
45
+	}
46
+	return []Value{
47
+		v,
48
+	}
49
+}
50
+
51
+func fetchState(path string) (State, error) {
52
+	current, err := ioutil.ReadFile(filepath.Join(path, cgroupFreeze))
53
+	if err != nil {
54
+		return Unknown, err
55
+	}
56
+	switch strings.TrimSpace(string(current)) {
57
+	case "1":
58
+		return Frozen, nil
59
+	case "0":
60
+		return Thawed, nil
61
+	default:
62
+		return Unknown, nil
63
+	}
64
+}
0 65
new file mode 100644
... ...
@@ -0,0 +1,442 @@
0
+/*
1
+   Copyright The containerd Authors.
2
+
3
+   Licensed under the Apache License, Version 2.0 (the "License");
4
+   you may not use this file except in compliance with the License.
5
+   You may obtain a copy of the License at
6
+
7
+       http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+   Unless required by applicable law or agreed to in writing, software
10
+   distributed under the License is distributed on an "AS IS" BASIS,
11
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+   See the License for the specific language governing permissions and
13
+   limitations under the License.
14
+*/
15
+
16
+package v2
17
+
18
+import (
19
+	"bufio"
20
+	"fmt"
21
+	"io"
22
+	"io/ioutil"
23
+	"math"
24
+	"os"
25
+	"path/filepath"
26
+	"strconv"
27
+	"strings"
28
+	"time"
29
+
30
+	"github.com/godbus/dbus/v5"
31
+
32
+	"github.com/containerd/cgroups/v2/stats"
33
+	"github.com/opencontainers/runtime-spec/specs-go"
34
+	"github.com/pkg/errors"
35
+	"github.com/sirupsen/logrus"
36
+)
37
+
38
const (
	// cgroupProcs is the name of the file listing the PIDs of a cgroup.
	cgroupProcs = "cgroup.procs"
	// defaultDirPerm is the mode used when creating cgroup directories.
	defaultDirPerm = 0755
)
42
+
43
// defaultFilePerm is the file mode for files created by this package. It is a
// variable rather than a constant so that the test code can override it: on a
// real cgroup file system files such as "cgroup.procs" already exist when
// they are written, whereas in the tests they do not exist beforehand and
// have to be created. The test code sets it to a non-zero value.
var defaultFilePerm os.FileMode = 0
49
+
50
+// remove will remove a cgroup path handling EAGAIN and EBUSY errors and
51
+// retrying the remove after a exp timeout
52
+func remove(path string) error {
53
+	var err error
54
+	delay := 10 * time.Millisecond
55
+	for i := 0; i < 5; i++ {
56
+		if i != 0 {
57
+			time.Sleep(delay)
58
+			delay *= 2
59
+		}
60
+		if err = os.RemoveAll(path); err == nil {
61
+			return nil
62
+		}
63
+	}
64
+	return errors.Wrapf(err, "cgroups: unable to remove path %q", path)
65
+}
66
+
67
// parseCgroupProcsFile parses /sys/fs/cgroup/$GROUPPATH/cgroup.procs and
// returns the PIDs it lists, one decimal number per line. Empty lines are
// ignored.
//
// An error is returned when the file cannot be opened or read, or when a
// non-empty line is not an unsigned decimal number.
func parseCgroupProcsFile(path string) ([]uint64, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	var out []uint64
	s := bufio.NewScanner(f)
	for s.Scan() {
		t := s.Text()
		if t == "" {
			continue
		}
		pid, err := strconv.ParseUint(t, 10, 0)
		if err != nil {
			return nil, err
		}
		out = append(out, pid)
	}
	// Scan returning false also covers read failures; without this check a
	// failed read would be reported as an empty (or truncated) PID list.
	if err := s.Err(); err != nil {
		return nil, err
	}
	return out, nil
}
89
+
90
+func parseKV(raw string) (string, interface{}, error) {
91
+	parts := strings.Fields(raw)
92
+	switch len(parts) {
93
+	case 2:
94
+		v, err := parseUint(parts[1], 10, 64)
95
+		if err != nil {
96
+			// if we cannot parse as a uint, parse as a string
97
+			return parts[0], parts[1], nil
98
+		}
99
+		return parts[0], v, nil
100
+	default:
101
+		return "", 0, ErrInvalidFormat
102
+	}
103
+}
104
+
105
+func readUint(path string) (uint64, error) {
106
+	v, err := ioutil.ReadFile(path)
107
+	if err != nil {
108
+		return 0, err
109
+	}
110
+	return parseUint(strings.TrimSpace(string(v)), 10, 64)
111
+}
112
+
113
// parseUint parses s as an unsigned integer, clamping negative numbers to 0
// instead of failing. This covers negative values that fit the signed range
// as well as those below it. For all other unparsable input the error from
// strconv.ParseUint is returned.
func parseUint(s string, base, bitSize int) (uint64, error) {
	v, err := strconv.ParseUint(s, base, bitSize)
	if err == nil {
		return v, nil
	}
	intValue, intErr := strconv.ParseInt(s, base, bitSize)
	if intValue < 0 {
		// A plain negative number.
		if intErr == nil {
			return 0, nil
		}
		// A negative number below the signed minimum: ParseInt reports a
		// range error together with the (negative) minimum value.
		if intErr.(*strconv.NumError).Err == strconv.ErrRange {
			return 0, nil
		}
	}
	return 0, err
}
130
+
131
+// parseCgroupFile parses /proc/PID/cgroup file and return string
132
+func parseCgroupFile(path string) (string, error) {
133
+	f, err := os.Open(path)
134
+	if err != nil {
135
+		return "", err
136
+	}
137
+	defer f.Close()
138
+	return parseCgroupFromReader(f)
139
+}
140
+
141
// parseCgroupFromReader scans r, which must be in /proc/PID/cgroup format
// ("hierarchy-ID:controller-list:path" per line), and returns the path of the
// first entry whose hierarchy ID is "0" and whose controller list is empty.
//
// It returns an error when a line has fewer than three fields, when reading
// from r fails, or when no matching entry exists.
func parseCgroupFromReader(r io.Reader) (string, error) {
	s := bufio.NewScanner(r)
	for s.Scan() {
		var (
			text  = s.Text()
			parts = strings.SplitN(text, ":", 3)
		)
		if len(parts) < 3 {
			return "", fmt.Errorf("invalid cgroup entry: %q", text)
		}
		// text is like "0::/user.slice/user-1001.slice/session-1.scope"
		if parts[0] == "0" && parts[1] == "" {
			return parts[2], nil
		}
	}
	// A read failure ends the loop just like EOF does; report it as such
	// instead of claiming that the entry does not exist.
	if err := s.Err(); err != nil {
		return "", err
	}
	return "", fmt.Errorf("cgroup path not found")
}
163
+
164
+// ToResources converts the oci LinuxResources struct into a
165
+// v2 Resources type for use with this package.
166
+//
167
+// converting cgroups configuration from v1 to v2
168
+// ref: https://github.com/containers/crun/blob/master/crun.1.md#cgroup-v2
169
+func ToResources(spec *specs.LinuxResources) *Resources {
170
+	var resources Resources
171
+	if cpu := spec.CPU; cpu != nil {
172
+		resources.CPU = &CPU{
173
+			Cpus: cpu.Cpus,
174
+			Mems: cpu.Mems,
175
+		}
176
+		if shares := cpu.Shares; shares != nil {
177
+			convertedWeight := (1 + ((*shares-2)*9999)/262142)
178
+			resources.CPU.Weight = &convertedWeight
179
+		}
180
+		if period := cpu.Period; period != nil {
181
+			resources.CPU.Max = NewCPUMax(cpu.Quota, period)
182
+		}
183
+	}
184
+	if mem := spec.Memory; mem != nil {
185
+		resources.Memory = &Memory{}
186
+		if swap := mem.Swap; swap != nil {
187
+			resources.Memory.Swap = swap
188
+		}
189
+		if l := mem.Limit; l != nil {
190
+			resources.Memory.Max = l
191
+		}
192
+		if l := mem.Reservation; l != nil {
193
+			resources.Memory.Low = l
194
+		}
195
+	}
196
+	if hugetlbs := spec.HugepageLimits; hugetlbs != nil {
197
+		hugeTlbUsage := HugeTlb{}
198
+		for _, hugetlb := range hugetlbs {
199
+			hugeTlbUsage = append(hugeTlbUsage, HugeTlbEntry{
200
+				HugePageSize: hugetlb.Pagesize,
201
+				Limit:        hugetlb.Limit,
202
+			})
203
+		}
204
+		resources.HugeTlb = &hugeTlbUsage
205
+	}
206
+	if pids := spec.Pids; pids != nil {
207
+		resources.Pids = &Pids{
208
+			Max: pids.Limit,
209
+		}
210
+	}
211
+	if i := spec.BlockIO; i != nil {
212
+		resources.IO = &IO{}
213
+		if i.Weight != nil {
214
+			resources.IO.BFQ.Weight = 1 + (*i.Weight-10)*9999/990
215
+		}
216
+		for t, devices := range map[IOType][]specs.LinuxThrottleDevice{
217
+			ReadBPS:   i.ThrottleReadBpsDevice,
218
+			WriteBPS:  i.ThrottleWriteBpsDevice,
219
+			ReadIOPS:  i.ThrottleReadIOPSDevice,
220
+			WriteIOPS: i.ThrottleWriteIOPSDevice,
221
+		} {
222
+			for _, d := range devices {
223
+				resources.IO.Max = append(resources.IO.Max, Entry{
224
+					Type:  t,
225
+					Major: d.Major,
226
+					Minor: d.Minor,
227
+					Rate:  d.Rate,
228
+				})
229
+			}
230
+		}
231
+	}
232
+	if i := spec.Rdma; i != nil {
233
+		resources.RDMA = &RDMA{}
234
+		for device, value := range spec.Rdma {
235
+			if device != "" && (value.HcaHandles != nil || value.HcaObjects != nil) {
236
+				resources.RDMA.Limit = append(resources.RDMA.Limit, RDMAEntry{
237
+					Device:     device,
238
+					HcaHandles: *value.HcaHandles,
239
+					HcaObjects: *value.HcaObjects,
240
+				})
241
+			}
242
+		}
243
+	}
244
+
245
+	return &resources
246
+}
247
+
248
+// Gets uint64 parsed content of single value cgroup stat file
249
+func getStatFileContentUint64(filePath string) uint64 {
250
+	contents, err := ioutil.ReadFile(filePath)
251
+	if err != nil {
252
+		return 0
253
+	}
254
+	trimmed := strings.TrimSpace(string(contents))
255
+	if trimmed == "max" {
256
+		return math.MaxUint64
257
+	}
258
+
259
+	res, err := parseUint(trimmed, 10, 64)
260
+	if err != nil {
261
+		logrus.Errorf("unable to parse %q as a uint from Cgroup file %q", string(contents), filePath)
262
+		return res
263
+	}
264
+
265
+	return res
266
+}
267
+
268
+func readIoStats(path string) []*stats.IOEntry {
269
+	// more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt
270
+	var usage []*stats.IOEntry
271
+	fpath := filepath.Join(path, "io.stat")
272
+	currentData, err := ioutil.ReadFile(fpath)
273
+	if err != nil {
274
+		return usage
275
+	}
276
+	entries := strings.Split(string(currentData), "\n")
277
+
278
+	for _, entry := range entries {
279
+		parts := strings.Split(entry, " ")
280
+		if len(parts) < 2 {
281
+			continue
282
+		}
283
+		majmin := strings.Split(parts[0], ":")
284
+		if len(majmin) != 2 {
285
+			continue
286
+		}
287
+		major, err := strconv.ParseUint(majmin[0], 10, 0)
288
+		if err != nil {
289
+			return usage
290
+		}
291
+		minor, err := strconv.ParseUint(majmin[1], 10, 0)
292
+		if err != nil {
293
+			return usage
294
+		}
295
+		parts = parts[1:]
296
+		ioEntry := stats.IOEntry{
297
+			Major: major,
298
+			Minor: minor,
299
+		}
300
+		for _, stats := range parts {
301
+			keyPairValue := strings.Split(stats, "=")
302
+			if len(keyPairValue) != 2 {
303
+				continue
304
+			}
305
+			v, err := strconv.ParseUint(keyPairValue[1], 10, 0)
306
+			if err != nil {
307
+				continue
308
+			}
309
+			switch keyPairValue[0] {
310
+			case "rbytes":
311
+				ioEntry.Rbytes = v
312
+			case "wbytes":
313
+				ioEntry.Wbytes = v
314
+			case "rios":
315
+				ioEntry.Rios = v
316
+			case "wios":
317
+				ioEntry.Wios = v
318
+			}
319
+		}
320
+		usage = append(usage, &ioEntry)
321
+	}
322
+	return usage
323
+}
324
+
325
+func rdmaStats(filepath string) []*stats.RdmaEntry {
326
+	currentData, err := ioutil.ReadFile(filepath)
327
+	if err != nil {
328
+		return []*stats.RdmaEntry{}
329
+	}
330
+	return toRdmaEntry(strings.Split(string(currentData), "\n"))
331
+}
332
+
333
+func parseRdmaKV(raw string, entry *stats.RdmaEntry) {
334
+	var value uint64
335
+	var err error
336
+
337
+	parts := strings.Split(raw, "=")
338
+	switch len(parts) {
339
+	case 2:
340
+		if parts[1] == "max" {
341
+			value = math.MaxUint32
342
+		} else {
343
+			value, err = parseUint(parts[1], 10, 32)
344
+			if err != nil {
345
+				return
346
+			}
347
+		}
348
+		if parts[0] == "hca_handle" {
349
+			entry.HcaHandles = uint32(value)
350
+		} else if parts[0] == "hca_object" {
351
+			entry.HcaObjects = uint32(value)
352
+		}
353
+	}
354
+}
355
+
356
+func toRdmaEntry(strEntries []string) []*stats.RdmaEntry {
357
+	var rdmaEntries []*stats.RdmaEntry
358
+	for i := range strEntries {
359
+		parts := strings.Fields(strEntries[i])
360
+		switch len(parts) {
361
+		case 3:
362
+			entry := new(stats.RdmaEntry)
363
+			entry.Device = parts[0]
364
+			parseRdmaKV(parts[1], entry)
365
+			parseRdmaKV(parts[2], entry)
366
+
367
+			rdmaEntries = append(rdmaEntries, entry)
368
+		default:
369
+			continue
370
+		}
371
+	}
372
+	return rdmaEntries
373
+}
374
+
375
+// isUnitExists returns true if the error is that a systemd unit already exists.
376
+func isUnitExists(err error) bool {
377
+	if err != nil {
378
+		if dbusError, ok := err.(dbus.Error); ok {
379
+			return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists")
380
+		}
381
+	}
382
+	return false
383
+}
384
+
385
// systemdUnitFromPath returns the final element of path, i.e. everything
// after the last path separator. A path ending in a separator yields "".
func systemdUnitFromPath(path string) string {
	// filepath.Split guarantees path == dir+file, so dropping the directory
	// prefix leaves exactly the file component.
	dir, _ := filepath.Split(path)
	return path[len(dir):]
}
389
+
390
+func readHugeTlbStats(path string) []*stats.HugeTlbStat {
391
+	var usage = []*stats.HugeTlbStat{}
392
+	var keyUsage = make(map[string]*stats.HugeTlbStat)
393
+	f, err := os.Open(path)
394
+	if err != nil {
395
+		return usage
396
+	}
397
+	files, err := f.Readdir(-1)
398
+	f.Close()
399
+	if err != nil {
400
+		return usage
401
+	}
402
+
403
+	for _, file := range files {
404
+		if strings.Contains(file.Name(), "hugetlb") &&
405
+			(strings.HasSuffix(file.Name(), "max") || strings.HasSuffix(file.Name(), "current")) {
406
+			var hugeTlb *stats.HugeTlbStat
407
+			var ok bool
408
+			fileName := strings.Split(file.Name(), ".")
409
+			pageSize := fileName[1]
410
+			if hugeTlb, ok = keyUsage[pageSize]; !ok {
411
+				hugeTlb = &stats.HugeTlbStat{}
412
+			}
413
+			hugeTlb.Pagesize = pageSize
414
+			out, err := ioutil.ReadFile(filepath.Join(path, file.Name()))
415
+			if err != nil {
416
+				continue
417
+			}
418
+			var value uint64
419
+			stringVal := strings.TrimSpace(string(out))
420
+			if stringVal == "max" {
421
+				value = math.MaxUint64
422
+			} else {
423
+				value, err = strconv.ParseUint(stringVal, 10, 64)
424
+			}
425
+			if err != nil {
426
+				continue
427
+			}
428
+			switch fileName[2] {
429
+			case "max":
430
+				hugeTlb.Max = value
431
+			case "current":
432
+				hugeTlb.Current = value
433
+			}
434
+			keyUsage[pageSize] = hugeTlb
435
+		}
436
+	}
437
+	for _, entry := range keyUsage {
438
+		usage = append(usage, entry)
439
+	}
440
+	return usage
441
+}