Browse code

switch to github.com/opencontainers/cgroups

The runc libcontainer/cgroups package was moved to a separate
module; switch our use of the runc module to use the new
location.

Signed-off-by: Sebastiaan van Stijn <github@gone.nl>

Sebastiaan van Stijn authored on 2025/02/28 23:29:51
Showing 58 changed files
... ...
@@ -27,7 +27,7 @@ import (
27 27
 	"github.com/moby/sys/mountinfo"
28 28
 	"github.com/moby/sys/user"
29 29
 	"github.com/moby/sys/userns"
30
-	"github.com/opencontainers/runc/libcontainer/cgroups"
30
+	"github.com/opencontainers/cgroups"
31 31
 	"github.com/opencontainers/runtime-spec/specs-go"
32 32
 	"github.com/pkg/errors"
33 33
 )
... ...
@@ -80,9 +80,9 @@ require (
80 80
 	github.com/moby/sys/userns v0.1.0
81 81
 	github.com/moby/term v0.5.2
82 82
 	github.com/morikuni/aec v1.0.0
83
+	github.com/opencontainers/cgroups v0.0.1
83 84
 	github.com/opencontainers/go-digest v1.0.0
84 85
 	github.com/opencontainers/image-spec v1.1.0
85
-	github.com/opencontainers/runc v1.2.6
86 86
 	github.com/opencontainers/runtime-spec v1.2.0
87 87
 	github.com/opencontainers/selinux v1.11.1
88 88
 	github.com/pelletier/go-toml v1.9.5
... ...
@@ -435,12 +435,12 @@ github.com/onsi/ginkgo/v2 v2.22.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C
435 435
 github.com/onsi/gomega v1.36.0 h1:Pb12RlruUtj4XUuPUqeEWc6j5DkVVVA49Uf6YLfC95Y=
436 436
 github.com/onsi/gomega v1.36.0/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog=
437 437
 github.com/op/go-logging v0.0.0-20160315200505-970db520ece7/go.mod h1:HzydrMdWErDVzsI23lYNej1Htcns9BCg93Dk0bBINWk=
438
+github.com/opencontainers/cgroups v0.0.1 h1:MXjMkkFpKv6kpuirUa4USFBas573sSAY082B4CiHEVA=
439
+github.com/opencontainers/cgroups v0.0.1/go.mod h1:s8lktyhlGUqM7OSRL5P7eAW6Wb+kWPNvt4qvVfzA5vs=
438 440
 github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
439 441
 github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
440 442
 github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug=
441 443
 github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM=
442
-github.com/opencontainers/runc v1.2.6 h1:P7Hqg40bsMvQGCS4S7DJYhUZOISMLJOB2iGX5COWiPk=
443
-github.com/opencontainers/runc v1.2.6/go.mod h1:dOQeFo29xZKBNeRBI0B19mJtfHv68YgCTh1X+YphA+4=
444 444
 github.com/opencontainers/runtime-spec v1.0.3-0.20220825212826-86290f6a00fb/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
445 445
 github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk=
446 446
 github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
447 447
new file mode 100644
... ...
@@ -0,0 +1 @@
0
+* @maintainer1 @maintainer2 @maintainer3
0 1
new file mode 100644
... ...
@@ -0,0 +1,150 @@
0
+# Contribution Guidelines
1
+
2
+Development happens on GitHub.
3
+Issues are used for bugs and actionable items and longer discussions can happen on the [mailing list](#mailing-list).
4
+
5
+The content of this repository is licensed under the [Apache License, Version 2.0](LICENSE).
6
+
7
+## Code of Conduct
8
+
9
+Participation in the Open Container community is governed by [Open Container Code of Conduct][code-of-conduct].
10
+
11
+## Meetings
12
+
13
+The contributors and maintainers of all OCI projects have monthly meetings at 2:00 PM (USA Pacific) on the first Wednesday of every month.
14
+There is an [iCalendar][rfc5545] format for the meetings [here][meeting.ics].
15
+Everyone is welcome to participate via [UberConference web][UberConference] or audio-only: +1 415 968 0849 (no PIN needed).
16
+An initial agenda will be posted to the [mailing list](#mailing-list) in the week before each meeting, and everyone is welcome to propose additional topics or suggest other agenda alterations there.
17
+Minutes from past meetings are archived [here][minutes].
18
+
19
+## Mailing list
20
+
21
+You can subscribe and browse the mailing list on [Google Groups][mailing-list].
22
+
23
+## IRC
24
+
25
+OCI discussion happens on #opencontainers on [Freenode][] ([logs][irc-logs]).
26
+
27
+## Git
28
+
29
+### Security issues
30
+
31
+If you are reporting a security issue, do not create an issue or file a pull
32
+request on GitHub. Instead, disclose the issue responsibly by sending an email
33
+to security@opencontainers.org (which is inhabited only by the maintainers of
34
+the various OCI projects).
35
+
36
+### Pull requests are always welcome
37
+
38
+We are always thrilled to receive pull requests, and do our best to
39
+process them as fast as possible. Not sure if that typo is worth a pull
40
+request? Do it! We will appreciate it.
41
+
42
+If your pull request is not accepted on the first try, don't be
43
+discouraged! If there's a problem with the implementation, hopefully you
44
+received feedback on what to improve.
45
+
46
+We're trying very hard to keep the project lean and focused. We don't want it
47
+to do everything for everybody. This means that we might decide against
48
+incorporating a new feature.
49
+
50
+### Conventions
51
+
52
+Fork the repo and make changes on your fork in a feature branch.
53
+For larger bugs and enhancements, consider filing a leader issue or mailing-list thread for discussion that is independent of the implementation.
54
+Small changes or changes that have been discussed on the [project mailing list](#mailing-list) may be submitted without a leader issue.
55
+
56
+If the project has a test suite, submit unit tests for your changes. Take a
57
+look at existing tests for inspiration. Run the full test suite on your branch
58
+before submitting a pull request.
59
+
60
+Update the documentation when creating or modifying features. Test
61
+your documentation changes for clarity, concision, and correctness, as
62
+well as a clean documentation build.
63
+
64
+Pull request descriptions should be as clear as possible and include a
65
+reference to all the issues that they address.
66
+
67
+Commit messages must start with a capitalized and short summary
68
+written in the imperative, followed by an optional, more detailed
69
+explanatory text which is separated from the summary by an empty line.
70
+
71
+Code review comments may be added to your pull request. Discuss, then make the
72
+suggested modifications and push additional commits to your feature branch. Be
73
+sure to post a comment after pushing. The new commits will show up in the pull
74
+request automatically, but the reviewers will not be notified unless you
75
+comment.
76
+
77
+Before the pull request is merged, make sure that you squash your commits into
78
+logical units of work using `git rebase -i` and `git push -f`. After every
79
+commit the test suite (if any) should be passing. Include documentation changes
80
+in the same commit so that a revert would remove all traces of the feature or
81
+fix.
82
+
83
+Commits that fix or close an issue should include a reference like `Closes #XXX`
84
+or `Fixes #XXX`, which will automatically close the issue when merged.
85
+
86
+### Sign your work
87
+
88
+The sign-off is a simple line at the end of the explanation for the
89
+patch, which certifies that you wrote it or otherwise have the right to
90
+pass it on as an open-source patch.  The rules are pretty simple: if you
91
+can certify the below (from [developercertificate.org][]):
92
+
93
+```
94
+Developer Certificate of Origin
95
+Version 1.1
96
+
97
+Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
98
+1 Letterman Drive
99
+Suite D4700
100
+San Francisco, CA, 94129
101
+
102
+Everyone is permitted to copy and distribute verbatim copies of this
103
+license document, but changing it is not allowed.
104
+
105
+
106
+Developer's Certificate of Origin 1.1
107
+
108
+By making a contribution to this project, I certify that:
109
+
110
+(a) The contribution was created in whole or in part by me and I
111
+    have the right to submit it under the open source license
112
+    indicated in the file; or
113
+
114
+(b) The contribution is based upon previous work that, to the best
115
+    of my knowledge, is covered under an appropriate open source
116
+    license and I have the right under that license to submit that
117
+    work with modifications, whether created in whole or in part
118
+    by me, under the same open source license (unless I am
119
+    permitted to submit under a different license), as indicated
120
+    in the file; or
121
+
122
+(c) The contribution was provided directly to me by some other
123
+    person who certified (a), (b) or (c) and I have not modified
124
+    it.
125
+
126
+(d) I understand and agree that this project and the contribution
127
+    are public and that a record of the contribution (including all
128
+    personal information I submit with it, including my sign-off) is
129
+    maintained indefinitely and may be redistributed consistent with
130
+    this project or the open source license(s) involved.
131
+```
132
+
133
+then you just add a line to every git commit message:
134
+
135
+    Signed-off-by: Joe Smith <joe@gmail.com>
136
+
137
+using your real name (sorry, no pseudonyms or anonymous contributions).
138
+
139
+You can add the sign off when creating the git commit via `git commit -s`.
140
+
141
+[code-of-conduct]: https://github.com/opencontainers/tob/blob/d2f9d68c1332870e40693fe077d311e0742bc73d/code-of-conduct.md
142
+[developercertificate.org]: http://developercertificate.org/
143
+[Freenode]: https://freenode.net/
144
+[irc-logs]: http://ircbot.wl.linuxfoundation.org/eavesdrop/%23opencontainers/
145
+[mailing-list]: https://groups.google.com/a/opencontainers.org/forum/#!forum/dev
146
+[meeting.ics]: https://github.com/opencontainers/runtime-spec/blob/master/meeting.ics
147
+[minutes]: http://ircbot.wl.linuxfoundation.org/meetings/opencontainers/
148
+[rfc5545]: https://tools.ietf.org/html/rfc5545
149
+[UberConference]: https://www.uberconference.com/opencontainers
0 150
new file mode 100644
... ...
@@ -0,0 +1,63 @@
0
+# Project governance
1
+
2
+The [OCI charter][charter] §5.b.viii tasks an OCI Project's maintainers (listed in the repository's MAINTAINERS file and sometimes referred to as "the TDC", [§5.e][charter]) with:
3
+
4
+> Creating, maintaining and enforcing governance guidelines for the TDC, approved by the maintainers, and which shall be posted visibly for the TDC.
5
+
6
+This section describes generic rules and procedures for fulfilling that mandate.
7
+
8
+## Proposing a motion
9
+
10
+A maintainer SHOULD propose a motion on the dev@opencontainers.org mailing list (except [security issues](#security-issues)) with another maintainer as a co-sponsor.
11
+
12
+## Voting
13
+
14
+Voting on a proposed motion SHOULD happen on the dev@opencontainers.org mailing list (except [security issues](#security-issues)) with maintainers posting LGTM or REJECT.
15
+Maintainers MAY also explicitly not vote by posting ABSTAIN (which is useful to revert a previous vote).
16
+Maintainers MAY post multiple times (e.g. as they revise their position based on feedback), but only their final post counts in the tally.
17
+A proposed motion is adopted if two-thirds of votes cast, a quorum having voted, are in favor of the motion.
18
+
19
+Voting SHOULD remain open for a week to collect feedback from the wider community and allow the maintainers to digest the proposed motion.
20
+Under exceptional conditions (e.g. non-major security fix releases) proposals which reach quorum with unanimous support MAY be adopted earlier.
21
+
22
+A maintainer MAY choose to reply with REJECT.
23
+A maintainer posting a REJECT MUST include a list of concerns or links to written documentation for those concerns (e.g. GitHub issues or mailing-list threads).
24
+The maintainers SHOULD try to resolve the concerns and wait for the rejecting maintainer to change their opinion to LGTM.
25
+However, a motion MAY be adopted with REJECTs, as outlined in the previous paragraphs.
26
+
27
+## Quorum
28
+
29
+A quorum is established when at least two-thirds of maintainers have voted.
30
+
31
+For projects that are not specifications, a [motion to release](#release-approval) MAY be adopted if the tally is at least three LGTMs and no REJECTs, even if three votes does not meet the usual two-thirds quorum.
32
+
33
+## Amendments
34
+
35
+The [project governance](#project-governance) rules and procedures MAY be amended or replaced using the procedures themselves.
36
+The MAINTAINERS of this project governance document is the total set of MAINTAINERS from all Open Containers projects (go-digest, image-spec, image-tools, runC, runtime-spec, runtime-tools, and selinux).
37
+
38
+## Subject templates
39
+
40
+Maintainers are busy and get lots of email.
41
+To make project proposals recognizable, proposed motions SHOULD use the following subject templates.
42
+
43
+### Proposing a motion
44
+
45
+> [{project} VOTE]: {motion description} (closes {end of voting window})
46
+
47
+For example:
48
+
49
+> [runtime-spec VOTE]: Tag 0647920 as 1.0.0-rc (closes 2016-06-03 20:00 UTC)
50
+
51
+### Tallying results
52
+
53
+After voting closes, a maintainer SHOULD post a tally to the motion thread with a subject template like:
54
+
55
+> [{project} {status}]: {motion description} (+{LGTMs} -{REJECTs} #{ABSTAINs})
56
+
57
+Where `{status}` is either `adopted` or `rejected`.
58
+For example:
59
+
60
+> [runtime-spec adopted]: Tag 0647920 as 1.0.0-rc (+6 -0 #3)
61
+
62
+[charter]: https://www.opencontainers.org/about/governance
0 63
new file mode 100644
... ...
@@ -0,0 +1,201 @@
0
+                                 Apache License
1
+                           Version 2.0, January 2004
2
+                        http://www.apache.org/licenses/
3
+
4
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
5
+
6
+   1. Definitions.
7
+
8
+      "License" shall mean the terms and conditions for use, reproduction,
9
+      and distribution as defined by Sections 1 through 9 of this document.
10
+
11
+      "Licensor" shall mean the copyright owner or entity authorized by
12
+      the copyright owner that is granting the License.
13
+
14
+      "Legal Entity" shall mean the union of the acting entity and all
15
+      other entities that control, are controlled by, or are under common
16
+      control with that entity. For the purposes of this definition,
17
+      "control" means (i) the power, direct or indirect, to cause the
18
+      direction or management of such entity, whether by contract or
19
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
20
+      outstanding shares, or (iii) beneficial ownership of such entity.
21
+
22
+      "You" (or "Your") shall mean an individual or Legal Entity
23
+      exercising permissions granted by this License.
24
+
25
+      "Source" form shall mean the preferred form for making modifications,
26
+      including but not limited to software source code, documentation
27
+      source, and configuration files.
28
+
29
+      "Object" form shall mean any form resulting from mechanical
30
+      transformation or translation of a Source form, including but
31
+      not limited to compiled object code, generated documentation,
32
+      and conversions to other media types.
33
+
34
+      "Work" shall mean the work of authorship, whether in Source or
35
+      Object form, made available under the License, as indicated by a
36
+      copyright notice that is included in or attached to the work
37
+      (an example is provided in the Appendix below).
38
+
39
+      "Derivative Works" shall mean any work, whether in Source or Object
40
+      form, that is based on (or derived from) the Work and for which the
41
+      editorial revisions, annotations, elaborations, or other modifications
42
+      represent, as a whole, an original work of authorship. For the purposes
43
+      of this License, Derivative Works shall not include works that remain
44
+      separable from, or merely link (or bind by name) to the interfaces of,
45
+      the Work and Derivative Works thereof.
46
+
47
+      "Contribution" shall mean any work of authorship, including
48
+      the original version of the Work and any modifications or additions
49
+      to that Work or Derivative Works thereof, that is intentionally
50
+      submitted to Licensor for inclusion in the Work by the copyright owner
51
+      or by an individual or Legal Entity authorized to submit on behalf of
52
+      the copyright owner. For the purposes of this definition, "submitted"
53
+      means any form of electronic, verbal, or written communication sent
54
+      to the Licensor or its representatives, including but not limited to
55
+      communication on electronic mailing lists, source code control systems,
56
+      and issue tracking systems that are managed by, or on behalf of, the
57
+      Licensor for the purpose of discussing and improving the Work, but
58
+      excluding communication that is conspicuously marked or otherwise
59
+      designated in writing by the copyright owner as "Not a Contribution."
60
+
61
+      "Contributor" shall mean Licensor and any individual or Legal Entity
62
+      on behalf of whom a Contribution has been received by Licensor and
63
+      subsequently incorporated within the Work.
64
+
65
+   2. Grant of Copyright License. Subject to the terms and conditions of
66
+      this License, each Contributor hereby grants to You a perpetual,
67
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
68
+      copyright license to reproduce, prepare Derivative Works of,
69
+      publicly display, publicly perform, sublicense, and distribute the
70
+      Work and such Derivative Works in Source or Object form.
71
+
72
+   3. Grant of Patent License. Subject to the terms and conditions of
73
+      this License, each Contributor hereby grants to You a perpetual,
74
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
75
+      (except as stated in this section) patent license to make, have made,
76
+      use, offer to sell, sell, import, and otherwise transfer the Work,
77
+      where such license applies only to those patent claims licensable
78
+      by such Contributor that are necessarily infringed by their
79
+      Contribution(s) alone or by combination of their Contribution(s)
80
+      with the Work to which such Contribution(s) was submitted. If You
81
+      institute patent litigation against any entity (including a
82
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
83
+      or a Contribution incorporated within the Work constitutes direct
84
+      or contributory patent infringement, then any patent licenses
85
+      granted to You under this License for that Work shall terminate
86
+      as of the date such litigation is filed.
87
+
88
+   4. Redistribution. You may reproduce and distribute copies of the
89
+      Work or Derivative Works thereof in any medium, with or without
90
+      modifications, and in Source or Object form, provided that You
91
+      meet the following conditions:
92
+
93
+      (a) You must give any other recipients of the Work or
94
+          Derivative Works a copy of this License; and
95
+
96
+      (b) You must cause any modified files to carry prominent notices
97
+          stating that You changed the files; and
98
+
99
+      (c) You must retain, in the Source form of any Derivative Works
100
+          that You distribute, all copyright, patent, trademark, and
101
+          attribution notices from the Source form of the Work,
102
+          excluding those notices that do not pertain to any part of
103
+          the Derivative Works; and
104
+
105
+      (d) If the Work includes a "NOTICE" text file as part of its
106
+          distribution, then any Derivative Works that You distribute must
107
+          include a readable copy of the attribution notices contained
108
+          within such NOTICE file, excluding those notices that do not
109
+          pertain to any part of the Derivative Works, in at least one
110
+          of the following places: within a NOTICE text file distributed
111
+          as part of the Derivative Works; within the Source form or
112
+          documentation, if provided along with the Derivative Works; or,
113
+          within a display generated by the Derivative Works, if and
114
+          wherever such third-party notices normally appear. The contents
115
+          of the NOTICE file are for informational purposes only and
116
+          do not modify the License. You may add Your own attribution
117
+          notices within Derivative Works that You distribute, alongside
118
+          or as an addendum to the NOTICE text from the Work, provided
119
+          that such additional attribution notices cannot be construed
120
+          as modifying the License.
121
+
122
+      You may add Your own copyright statement to Your modifications and
123
+      may provide additional or different license terms and conditions
124
+      for use, reproduction, or distribution of Your modifications, or
125
+      for any such Derivative Works as a whole, provided Your use,
126
+      reproduction, and distribution of the Work otherwise complies with
127
+      the conditions stated in this License.
128
+
129
+   5. Submission of Contributions. Unless You explicitly state otherwise,
130
+      any Contribution intentionally submitted for inclusion in the Work
131
+      by You to the Licensor shall be under the terms and conditions of
132
+      this License, without any additional terms or conditions.
133
+      Notwithstanding the above, nothing herein shall supersede or modify
134
+      the terms of any separate license agreement you may have executed
135
+      with Licensor regarding such Contributions.
136
+
137
+   6. Trademarks. This License does not grant permission to use the trade
138
+      names, trademarks, service marks, or product names of the Licensor,
139
+      except as required for reasonable and customary use in describing the
140
+      origin of the Work and reproducing the content of the NOTICE file.
141
+
142
+   7. Disclaimer of Warranty. Unless required by applicable law or
143
+      agreed to in writing, Licensor provides the Work (and each
144
+      Contributor provides its Contributions) on an "AS IS" BASIS,
145
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
146
+      implied, including, without limitation, any warranties or conditions
147
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
148
+      PARTICULAR PURPOSE. You are solely responsible for determining the
149
+      appropriateness of using or redistributing the Work and assume any
150
+      risks associated with Your exercise of permissions under this License.
151
+
152
+   8. Limitation of Liability. In no event and under no legal theory,
153
+      whether in tort (including negligence), contract, or otherwise,
154
+      unless required by applicable law (such as deliberate and grossly
155
+      negligent acts) or agreed to in writing, shall any Contributor be
156
+      liable to You for damages, including any direct, indirect, special,
157
+      incidental, or consequential damages of any character arising as a
158
+      result of this License or out of the use or inability to use the
159
+      Work (including but not limited to damages for loss of goodwill,
160
+      work stoppage, computer failure or malfunction, or any and all
161
+      other commercial damages or losses), even if such Contributor
162
+      has been advised of the possibility of such damages.
163
+
164
+   9. Accepting Warranty or Additional Liability. While redistributing
165
+      the Work or Derivative Works thereof, You may choose to offer,
166
+      and charge a fee for, acceptance of support, warranty, indemnity,
167
+      or other liability obligations and/or rights consistent with this
168
+      License. However, in accepting such obligations, You may act only
169
+      on Your own behalf and on Your sole responsibility, not on behalf
170
+      of any other Contributor, and only if You agree to indemnify,
171
+      defend, and hold each Contributor harmless for any liability
172
+      incurred by, or claims asserted against, such Contributor by reason
173
+      of your accepting any such warranty or additional liability.
174
+
175
+   END OF TERMS AND CONDITIONS
176
+
177
+   APPENDIX: How to apply the Apache License to your work.
178
+
179
+      To apply the Apache License to your work, attach the following
180
+      boilerplate notice, with the fields enclosed by brackets "{}"
181
+      replaced with your own identifying information. (Don't include
182
+      the brackets!)  The text should be enclosed in the appropriate
183
+      comment syntax for the file format. We also recommend that a
184
+      file or class name and description of purpose be included on the
185
+      same "printed page" as the copyright notice for easier
186
+      identification within third-party archives.
187
+
188
+   Copyright {yyyy} {name of copyright owner}
189
+
190
+   Licensed under the Apache License, Version 2.0 (the "License");
191
+   you may not use this file except in compliance with the License.
192
+   You may obtain a copy of the License at
193
+
194
+       http://www.apache.org/licenses/LICENSE-2.0
195
+
196
+   Unless required by applicable law or agreed to in writing, software
197
+   distributed under the License is distributed on an "AS IS" BASIS,
198
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
199
+   See the License for the specific language governing permissions and
200
+   limitations under the License.
0 201
new file mode 100644
... ...
@@ -0,0 +1,8 @@
0
+Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp> (@AkihiroSuda)
1
+Aleksa Sarai <cyphar@cyphar.com> (@cyphar)
2
+Kir Kolyshkin <kolyshkin@gmail.com> (@kolyshkin)
3
+Mrunal Patel <mpatel@redhat.com> (@mrunalp)
4
+Sebastiaan van Stijn <github@gone.nl> (@thaJeztah)
5
+Odin Ugedal <odin@uged.al> (@odinuge)
6
+Peter Hunt <pehunt@redhat.com> (@haircommander)
7
+Davanum Srinivas <davanum@gmail.com> (@dims)
0 8
new file mode 100644
... ...
@@ -0,0 +1,92 @@
0
+## Introduction
1
+
2
+Dear maintainer. Thank you for investing the time and energy to help
3
+make this project as useful as possible. Maintaining a project is difficult,
4
+sometimes unrewarding work.  Sure, you will get to contribute cool
5
+features to the project. But most of your time will be spent reviewing,
6
+cleaning up, documenting, answering questions, justifying design
7
+decisions - while everyone has all the fun! But remember - the quality
8
+of the maintainers' work is what distinguishes the good projects from the
9
+great.  So please be proud of your work, even the unglamorous parts,
10
+and encourage a culture of appreciation and respect for *every* aspect
11
+of improving the project - not just the hot new features.
12
+
13
+This document is a manual for maintainers old and new. It explains what
14
+is expected of maintainers, how they should work, and what tools are
15
+available to them.
16
+
17
+This is a living document - if you see something out of date or missing,
18
+speak up!
19
+
20
+## What are a maintainer's responsibilities?
21
+
22
+It is every maintainer's responsibility to:
23
+
24
+* Expose a clear roadmap for improving their component.
25
+* Deliver prompt feedback and decisions on pull requests.
26
+* Be available to anyone with questions, bug reports, criticism etc. on their component.
27
+  This includes IRC and GitHub issues and pull requests.
28
+* Make sure their component respects the philosophy, design and roadmap of the project.
29
+
30
+## How are decisions made?
31
+
32
+This project is an open-source project with an open design philosophy. This
33
+means that the repository is the source of truth for EVERY aspect of the
34
+project, including its philosophy, design, roadmap and APIs. *If it's
35
+part of the project, it's in the repo. It's in the repo, it's part of
36
+the project.*
37
+
38
+As a result, all decisions can be expressed as changes to the
39
+repository. An implementation change is a change to the source code. An
40
+API change is a change to the API specification. A philosophy change is
41
+a change to the philosophy manifesto. And so on.
42
+
43
+All decisions affecting this project, big and small, follow the same procedure:
44
+
45
+1. Discuss a proposal on the [mailing list](CONTRIBUTING.md#mailing-list).
46
+   Anyone can do this.
47
+2. Open a pull request.
48
+   Anyone can do this.
49
+3. Discuss the pull request.
50
+   Anyone can do this.
51
+4. Endorse (`LGTM`) or oppose (`Rejected`) the pull request.
52
+   The relevant maintainers do this (see below [Who decides what?](#who-decides-what)).
53
+   Changes that affect project management (changing policy, cutting releases, etc.) are [proposed and voted on the mailing list](GOVERNANCE.md).
54
+5. Merge or close the pull request.
55
+   The relevant maintainers do this.
56
+
57
+### I'm a maintainer, should I make pull requests too?
58
+
59
+Yes. Nobody should ever push to master directly. All changes should be
60
+made through a pull request.
61
+
62
+## Who decides what?
63
+
64
+All decisions are pull requests, and the relevant maintainers make
65
+decisions by accepting or refusing the pull request. Review and acceptance
66
+by anyone is denoted by adding a comment in the pull request: `LGTM`.
67
+However, only currently listed `MAINTAINERS` are counted towards the required
68
+two LGTMs. In addition, if a maintainer has created a pull request, they cannot
69
+count toward the two LGTM rule (to ensure equal amounts of review for every pull
70
+request, no matter who wrote it).
71
+
72
+Overall the maintainer system works because of mutual respect.
73
+The maintainers trust one another to act in the best interests of the project.
74
+Sometimes maintainers can disagree and this is part of a healthy project to represent the points of view of various people.
75
+In the case where maintainers cannot find agreement on a specific change, maintainers should use the [governance procedure](GOVERNANCE.md) to attempt to reach a consensus.
76
+
77
+### How are maintainers added?
78
+
79
+The best maintainers have a vested interest in the project.  Maintainers
80
+are first and foremost contributors that have shown they are committed to
81
+the long term success of the project.  Contributors wanting to become
82
+maintainers are expected to be deeply involved in contributing code,
83
+pull request review, and triage of issues in the project for more than two months.
84
+
85
+Just contributing does not make you a maintainer; it is about building trust with the current maintainers of the project and being a person that they can depend on to act in the best interest of the project.
86
+The final vote to add a new maintainer should be approved by the [governance procedure](GOVERNANCE.md).
87
+
88
+### How are maintainers removed?
89
+
90
+When a maintainer is unable to perform the [required duties](#what-are-a-maintainers-responsibilities) they can be removed by the [governance procedure](GOVERNANCE.md).
91
+Issues related to a maintainer's performance should be discussed with them among the other maintainers so that they are not surprised by a pull request removing them.
0 92
new file mode 100644
... ...
@@ -0,0 +1,11 @@
0
+# OCI Project Template
1
+
2
+Useful boilerplate and organizational information for all OCI projects.
3
+
4
+* README (this file)
5
+* [The Apache License, Version 2.0](LICENSE)
6
+* [A list of maintainers](MAINTAINERS)
7
+* [Maintainer guidelines](MAINTAINERS_GUIDE.md)
8
+* [Contributor guidelines](CONTRIBUTING.md)
9
+* [Project governance](GOVERNANCE.md)
10
+* [Release procedures](RELEASES.md)
0 11
new file mode 100644
... ...
@@ -0,0 +1,51 @@
0
+# Releases
1
+
2
+The release process hopes to encourage early, consistent consensus-building during project development.
3
+The mechanisms used are regular community communication on the mailing list about progress, scheduled meetings for issue resolution and release triage, and regularly paced and communicated releases.
4
+Releases are proposed and adopted or rejected using the usual [project governance](GOVERNANCE.md) rules and procedures.
5
+
6
+An anti-pattern that we want to avoid is heavy development or discussions "late cycle" around major releases.
7
+We want to build a community that is involved and communicates consistently through all releases instead of relying on "silent periods" as a judge of stability.
8
+
9
+## Parallel releases
10
+
11
+A single project MAY consider several motions to release in parallel.
12
+However, each motion to release after the initial 0.1.0 MUST be based on a previous release that has already landed.
13
+
14
+For example, runtime-spec maintainers may propose a v1.0.0-rc2 on the 1st of the month and a v0.9.1 bugfix on the 2nd of the month.
15
+They may not propose a v1.0.0-rc3 until the v1.0.0-rc2 is accepted (on the 7th if the vote initiated on the 1st passes).
16
+
17
+## Specifications
18
+
19
+The OCI maintains three categories of projects: specifications, applications, and conformance-testing tools.
20
+However, specification releases have special restrictions in the [OCI charter][charter]:
21
+
22
+* They are the target of backwards compatibility (§7.g), and
23
+* They are subject to the OWFa patent grant (§8.d and e).
24
+
25
+To avoid unfortunate side effects (onerous backwards compatibility requirements or Member resignations), the following additional procedures apply to specification releases:
26
+
27
+### Planning a release
28
+
29
+Every OCI specification project SHOULD hold meetings that involve maintainers reviewing pull requests, debating outstanding issues, and planning releases.
30
+This meeting MUST be advertised on the project README and MAY happen on a phone call, video conference, or on IRC.
31
+Maintainers MUST send updates to dev@opencontainers.org with the results of these meetings.
32
+
33
+Before the specification reaches v1.0.0, the meetings SHOULD be weekly.
34
+Once a specification has reached v1.0.0, the maintainers may alter the cadence, but a meeting MUST be held within four weeks of the previous meeting.
35
+
36
+The release plans, corresponding milestones and estimated due dates MUST be published on GitHub (e.g. https://github.com/opencontainers/runtime-spec/milestones).
37
+GitHub milestones and issues are only used for community organization and all releases MUST follow the [project governance](GOVERNANCE.md) rules and procedures.
38
+
39
+### Timelines
40
+
41
+Specifications have a variety of different timelines in their lifecycle.
42
+
43
+* Pre-v1.0.0 specifications SHOULD release on a monthly cadence to garner feedback.
44
+* Major specification releases MUST release at least three release candidates spaced a minimum of one week apart.
45
+  This means a major release like a v1.0.0 or v2.0.0 release will take 1 month at minimum: one week for rc1, one week for rc2, one week for rc3, and one week for the major release itself.
46
+  Maintainers SHOULD strive to make zero breaking changes during this cycle of release candidates and SHOULD restart the three-candidate count when a breaking change is introduced.
47
+  For example, if a breaking change is introduced in v1.0.0-rc2 then the series would end with v1.0.0-rc4 and v1.0.0.
48
+* Minor and patch releases SHOULD be made on an as-needed basis.
49
+
50
+[charter]: https://www.opencontainers.org/about/governance
0 51
new file mode 100644
... ...
@@ -0,0 +1,78 @@
0
+package cgroups
1
+
2
+import (
3
+	"errors"
4
+)
5
+
6
+var (
7
+	// ErrDevicesUnsupported is an error returned when a cgroup manager
8
+	// is not configured to set device rules.
9
+	ErrDevicesUnsupported = errors.New("cgroup manager is not configured to set device rules")
10
+
11
+	// ErrRootless is returned by [Manager.Apply] when there is an error
12
+	// creating cgroup directory, and cgroup.Rootless is set. In general,
13
+	// this error is to be ignored.
14
+	ErrRootless = errors.New("cgroup manager can not access cgroup (rootless container)")
15
+
16
+	// DevicesSetV1 and DevicesSetV2 are functions to set devices for
17
+	// cgroup v1 and v2, respectively. Unless
18
+	// [github.com/opencontainers/cgroups/devices]
19
+	// package is imported, they are set to nil, so cgroup managers can't
20
+	// manage devices.
21
+	DevicesSetV1 func(path string, r *Resources) error
22
+	DevicesSetV2 func(path string, r *Resources) error
23
+)
24
+
25
+type Manager interface {
26
+	// Apply creates a cgroup, if not yet created, and adds a process
27
+	// with the specified pid into that cgroup.  A special value of -1
28
+	// can be used to merely create a cgroup.
29
+	Apply(pid int) error
30
+
31
+	// GetPids returns the PIDs of all processes inside the cgroup.
32
+	GetPids() ([]int, error)
33
+
34
+	// GetAllPids returns the PIDs of all processes inside the cgroup
35
+	// and all its sub-cgroups.
36
+	GetAllPids() ([]int, error)
37
+
38
+	// GetStats returns cgroups statistics.
39
+	GetStats() (*Stats, error)
40
+
41
+	// Freeze sets the freezer cgroup to the specified state.
42
+	Freeze(state FreezerState) error
43
+
44
+	// Destroy removes cgroup.
45
+	Destroy() error
46
+
47
+	// Path returns a cgroup path to the specified controller/subsystem.
48
+	// For cgroupv2, the argument is unused and can be empty.
49
+	Path(string) string
50
+
51
+	// Set sets cgroup resources parameters/limits. If the argument is nil,
52
+	// the resources specified during Manager creation (or the previous call
53
+	// to Set) are used.
54
+	Set(r *Resources) error
55
+
56
+	// GetPaths returns cgroup path(s) to save in a state file in order to
57
+	// restore later.
58
+	//
59
+	// For cgroup v1, a key is cgroup subsystem name, and the value is the
60
+	// path to the cgroup for this subsystem.
61
+	//
62
+	// For cgroup v2 unified hierarchy, a key is "", and the value is the
63
+	// unified path.
64
+	GetPaths() map[string]string
65
+
66
+	// GetCgroups returns the cgroup data as configured.
67
+	GetCgroups() (*Cgroup, error)
68
+
69
+	// GetFreezerState retrieves the current FreezerState of the cgroup.
70
+	GetFreezerState() (FreezerState, error)
71
+
72
+	// Exists returns whether the cgroup path exists or not.
73
+	Exists() bool
74
+
75
+	// OOMKillCount reports OOM kill count for the cgroup.
76
+	OOMKillCount() (uint64, error)
77
+}
0 78
new file mode 100644
... ...
@@ -0,0 +1,66 @@
0
+package cgroups
1
+
2
+import "fmt"
3
+
4
+// BlockIODevice holds major:minor format supported in blkio cgroup.
5
+type BlockIODevice struct {
6
+	// Major is the device's major number
7
+	Major int64 `json:"major"`
8
+	// Minor is the device's minor number
9
+	Minor int64 `json:"minor"`
10
+}
11
+
12
+// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair
13
+type WeightDevice struct {
14
+	BlockIODevice
15
+	// Weight is the bandwidth rate for the device, range is from 10 to 1000
16
+	Weight uint16 `json:"weight"`
17
+	// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
18
+	LeafWeight uint16 `json:"leafWeight"`
19
+}
20
+
21
+// NewWeightDevice returns a configured WeightDevice pointer
22
+func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice {
23
+	wd := &WeightDevice{}
24
+	wd.Major = major
25
+	wd.Minor = minor
26
+	wd.Weight = weight
27
+	wd.LeafWeight = leafWeight
28
+	return wd
29
+}
30
+
31
+// WeightString formats the struct to be writable to the cgroup specific file
32
+func (wd *WeightDevice) WeightString() string {
33
+	return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight)
34
+}
35
+
36
+// LeafWeightString formats the struct to be writable to the cgroup specific file
37
+func (wd *WeightDevice) LeafWeightString() string {
38
+	return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight)
39
+}
40
+
41
+// ThrottleDevice struct holds a `major:minor rate_per_second` pair
42
+type ThrottleDevice struct {
43
+	BlockIODevice
44
+	// Rate is the IO rate limit per cgroup per device
45
+	Rate uint64 `json:"rate"`
46
+}
47
+
48
+// NewThrottleDevice returns a configured ThrottleDevice pointer
49
+func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice {
50
+	td := &ThrottleDevice{}
51
+	td.Major = major
52
+	td.Minor = minor
53
+	td.Rate = rate
54
+	return td
55
+}
56
+
57
+// String formats the struct to be writable to the cgroup specific file
58
+func (td *ThrottleDevice) String() string {
59
+	return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate)
60
+}
61
+
62
+// StringName formats the struct to be writable to the cgroup specific file
63
+func (td *ThrottleDevice) StringName(name string) string {
64
+	return fmt.Sprintf("%d:%d %s=%d", td.Major, td.Minor, name, td.Rate)
65
+}
0 66
new file mode 100644
... ...
@@ -0,0 +1,9 @@
0
+package cgroups
1
+
2
+type HugepageLimit struct {
3
+	// which type of hugepage to limit.
4
+	Pagesize string `json:"page_size"`
5
+
6
+	// usage limit for hugepage.
7
+	Limit uint64 `json:"limit"`
8
+}
0 9
new file mode 100644
... ...
@@ -0,0 +1,14 @@
0
+package cgroups
1
+
2
+import (
3
+	"fmt"
4
+)
5
+
6
+type IfPrioMap struct {
7
+	Interface string `json:"interface"`
8
+	Priority  int64  `json:"priority"`
9
+}
10
+
11
+func (i *IfPrioMap) CgroupString() string {
12
+	return fmt.Sprintf("%s %d", i.Interface, i.Priority)
13
+}
0 14
new file mode 100644
... ...
@@ -0,0 +1,169 @@
0
+package cgroups
1
+
2
+import (
3
+	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
4
+	devices "github.com/opencontainers/cgroups/devices/config"
5
+)
6
+
7
+type FreezerState string
8
+
9
+const (
10
+	Undefined FreezerState = ""
11
+	Frozen    FreezerState = "FROZEN"
12
+	Thawed    FreezerState = "THAWED"
13
+)
14
+
15
+// Cgroup holds properties of a cgroup on Linux.
16
+type Cgroup struct {
17
+	// Name specifies the name of the cgroup
18
+	Name string `json:"name,omitempty"`
19
+
20
+	// Parent specifies the name of parent of cgroup or slice
21
+	Parent string `json:"parent,omitempty"`
22
+
23
+	// Path specifies the path to cgroups that are created and/or joined by the container.
24
+	// The path is assumed to be relative to the host system cgroup mountpoint.
25
+	Path string `json:"path"`
26
+
27
+	// ScopePrefix describes prefix for the scope name
28
+	ScopePrefix string `json:"scope_prefix"`
29
+
30
+	// Resources contains various cgroups settings to apply
31
+	*Resources
32
+
33
+	// Systemd tells if systemd should be used to manage cgroups.
34
+	Systemd bool
35
+
36
+	// SystemdProps are any additional properties for systemd,
37
+	// derived from org.systemd.property.xxx annotations.
38
+	// Ignored unless systemd is used for managing cgroups.
39
+	SystemdProps []systemdDbus.Property `json:"-"`
40
+
41
+	// Rootless tells if rootless cgroups should be used.
42
+	Rootless bool
43
+
44
+	// The host UID that should own the cgroup, or nil to accept
45
+	// the default ownership.  This should only be set when the
46
+	// cgroupfs is to be mounted read/write.
47
+	// Not all cgroup manager implementations support changing
48
+	// the ownership.
49
+	OwnerUID *int `json:"owner_uid,omitempty"`
50
+}
51
+
52
+type Resources struct {
53
+	// Devices is the set of access rules for devices in the container.
54
+	Devices []*devices.Rule `json:"devices"`
55
+
56
+	// Memory limit (in bytes)
57
+	Memory int64 `json:"memory"`
58
+
59
+	// Memory reservation or soft_limit (in bytes)
60
+	MemoryReservation int64 `json:"memory_reservation"`
61
+
62
+	// Total memory usage (memory + swap); set `-1` to enable unlimited swap
63
+	MemorySwap int64 `json:"memory_swap"`
64
+
65
+	// CPU shares (relative weight vs. other containers)
66
+	CpuShares uint64 `json:"cpu_shares"`
67
+
68
+	// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
69
+	CpuQuota int64 `json:"cpu_quota"`
70
+
71
+	// CPU hardcap burst limit (in usecs). Allowed accumulated cpu time additionally for burst in a given period.
72
+	CpuBurst *uint64 `json:"cpu_burst"` //nolint:revive
73
+
74
+	// CPU period to be used for hardcapping (in usecs). 0 to use system default.
75
+	CpuPeriod uint64 `json:"cpu_period"`
76
+
77
+	// How much time the CPU will use in realtime scheduling (in usecs).
78
+	CpuRtRuntime int64 `json:"cpu_rt_quota"`
79
+
80
+	// CPU period to be used for realtime scheduling (in usecs).
81
+	CpuRtPeriod uint64 `json:"cpu_rt_period"`
82
+
83
+	// CPU to use
84
+	CpusetCpus string `json:"cpuset_cpus"`
85
+
86
+	// MEM to use
87
+	CpusetMems string `json:"cpuset_mems"`
88
+
89
+	// cgroup SCHED_IDLE
90
+	CPUIdle *int64 `json:"cpu_idle,omitempty"`
91
+
92
+	// Process limit; set <= `0` to disable limit.
93
+	PidsLimit int64 `json:"pids_limit"`
94
+
95
+	// Specifies per cgroup weight, range is from 10 to 1000.
96
+	BlkioWeight uint16 `json:"blkio_weight"`
97
+
98
+	// Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
99
+	BlkioLeafWeight uint16 `json:"blkio_leaf_weight"`
100
+
101
+	// Weight per cgroup per device, can override BlkioWeight.
102
+	BlkioWeightDevice []*WeightDevice `json:"blkio_weight_device"`
103
+
104
+	// IO read rate limit per cgroup per device, bytes per second.
105
+	BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"`
106
+
107
+	// IO write rate limit per cgroup per device, bytes per second.
108
+	BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"`
109
+
110
+	// IO read rate limit per cgroup per device, IO per second.
111
+	BlkioThrottleReadIOPSDevice []*ThrottleDevice `json:"blkio_throttle_read_iops_device"`
112
+
113
+	// IO write rate limit per cgroup per device, IO per second.
114
+	BlkioThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkio_throttle_write_iops_device"`
115
+
116
+	// set the freeze value for the process
117
+	Freezer FreezerState `json:"freezer"`
118
+
119
+	// Hugetlb limit (in bytes)
120
+	HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"`
121
+
122
+	// Whether to disable OOM Killer
123
+	OomKillDisable bool `json:"oom_kill_disable"`
124
+
125
+	// Tuning swappiness behaviour per cgroup
126
+	MemorySwappiness *uint64 `json:"memory_swappiness"`
127
+
128
+	// Set priority of network traffic for container
129
+	NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"`
130
+
131
+	// Set class identifier for container's network packets
132
+	NetClsClassid uint32 `json:"net_cls_classid_u"`
133
+
134
+	// Rdma resource restriction configuration
135
+	Rdma map[string]LinuxRdma `json:"rdma"`
136
+
137
+	// Used on cgroups v2:
138
+
139
+	// CpuWeight sets a proportional bandwidth limit.
140
+	CpuWeight uint64 `json:"cpu_weight"`
141
+
142
+	// Unified is cgroupv2-only key-value map.
143
+	Unified map[string]string `json:"unified"`
144
+
145
+	// SkipDevices allows skipping the configuration of device permissions.
146
+	// Used by e.g. kubelet while creating a parent cgroup (kubepods)
147
+	// common for many containers, and by runc update.
148
+	//
149
+	// NOTE it is impossible to start a container which has this flag set.
150
+	SkipDevices bool `json:"-"`
151
+
152
+	// SkipFreezeOnSet is a flag for cgroup manager to skip the cgroup
153
+	// freeze when setting resources. Only applicable to systemd legacy
154
+	// (i.e. cgroup v1) manager (which uses freeze by default to avoid
155
+	// spurious permission errors caused by systemd inability to update
156
+	// device rules in a non-disruptive manner).
157
+	//
158
+	// If not set, a few methods (such as looking into cgroup's
159
+	// devices.list and querying the systemd unit properties) are used
160
+	// during Set() to figure out whether the freeze is required. Those
161
+	// methods may be relatively slow, thus this flag.
162
+	SkipFreezeOnSet bool `json:"-"`
163
+
164
+	// MemoryCheckBeforeUpdate is a flag for cgroup v2 managers to check
165
+	// if the new memory limits (Memory and MemorySwap) being set are lower
166
+	// than the current memory usage, and reject if so.
167
+	MemoryCheckBeforeUpdate bool `json:"memory_check_before_update"`
168
+}
0 169
new file mode 100644
... ...
@@ -0,0 +1,9 @@
0
+package cgroups
1
+
2
+// LinuxRdma for Linux cgroup 'rdma' resource management (Linux 4.11)
3
+type LinuxRdma struct {
4
+	// Maximum number of HCA handles that can be opened. Default is "no limit".
5
+	HcaHandles *uint32 `json:"hca_handles,omitempty"`
6
+	// Maximum number of HCA objects that can be created. Default is "no limit".
7
+	HcaObjects *uint32 `json:"hca_objects,omitempty"`
8
+}
0 9
new file mode 100644
... ...
@@ -0,0 +1,8 @@
0
+//go:build !linux
1
+
2
+package cgroups
3
+
4
+// Cgroup holds properties of a cgroup on Linux
5
+// TODO Windows: This can ultimately be entirely factored out on Windows as
6
+// cgroups are a Unix-specific construct.
7
+type Cgroup struct{}
0 8
new file mode 100644
... ...
@@ -0,0 +1,174 @@
0
+package config
1
+
2
+import (
3
+	"fmt"
4
+	"os"
5
+	"strconv"
6
+)
7
+
8
+const (
9
+	Wildcard = -1
10
+)
11
+
12
+type Device struct {
13
+	Rule
14
+
15
+	// Path to the device.
16
+	Path string `json:"path"`
17
+
18
+	// FileMode permission bits for the device.
19
+	FileMode os.FileMode `json:"file_mode"`
20
+
21
+	// Uid of the device.
22
+	Uid uint32 `json:"uid"`
23
+
24
+	// Gid of the device.
25
+	Gid uint32 `json:"gid"`
26
+}
27
+
28
+// Permissions is a cgroupv1-style string to represent device access. It
29
+// has to be a string for backward compatibility reasons, hence why it has
30
+// methods to do set operations.
31
+type Permissions string
32
+
33
+const (
34
+	deviceRead uint = (1 << iota)
35
+	deviceWrite
36
+	deviceMknod
37
+)
38
+
39
+func (p Permissions) toSet() uint {
40
+	var set uint
41
+	for _, perm := range p {
42
+		switch perm {
43
+		case 'r':
44
+			set |= deviceRead
45
+		case 'w':
46
+			set |= deviceWrite
47
+		case 'm':
48
+			set |= deviceMknod
49
+		}
50
+	}
51
+	return set
52
+}
53
+
54
+func fromSet(set uint) Permissions {
55
+	var perm string
56
+	if set&deviceRead == deviceRead {
57
+		perm += "r"
58
+	}
59
+	if set&deviceWrite == deviceWrite {
60
+		perm += "w"
61
+	}
62
+	if set&deviceMknod == deviceMknod {
63
+		perm += "m"
64
+	}
65
+	return Permissions(perm)
66
+}
67
+
68
+// Union returns the union of the two sets of Permissions.
69
+func (p Permissions) Union(o Permissions) Permissions {
70
+	lhs := p.toSet()
71
+	rhs := o.toSet()
72
+	return fromSet(lhs | rhs)
73
+}
74
+
75
+// Difference returns the set difference of the two sets of Permissions.
76
+// In set notation, A.Difference(B) gives you A\B.
77
+func (p Permissions) Difference(o Permissions) Permissions {
78
+	lhs := p.toSet()
79
+	rhs := o.toSet()
80
+	return fromSet(lhs &^ rhs)
81
+}
82
+
83
+// Intersection computes the intersection of the two sets of Permissions.
84
+func (p Permissions) Intersection(o Permissions) Permissions {
85
+	lhs := p.toSet()
86
+	rhs := o.toSet()
87
+	return fromSet(lhs & rhs)
88
+}
89
+
90
+// IsEmpty returns whether the set of permissions in a Permissions is
91
+// empty.
92
+func (p Permissions) IsEmpty() bool {
93
+	return p == Permissions("")
94
+}
95
+
96
+// IsValid returns whether the set of permissions is a subset of valid
97
+// permissions (namely, {r,w,m}).
98
+func (p Permissions) IsValid() bool {
99
+	return p == fromSet(p.toSet())
100
+}
101
+
102
+type Type rune
103
+
104
+const (
105
+	WildcardDevice Type = 'a'
106
+	BlockDevice    Type = 'b'
107
+	CharDevice     Type = 'c' // or 'u'
108
+	FifoDevice     Type = 'p'
109
+)
110
+
111
+func (t Type) IsValid() bool {
112
+	switch t {
113
+	case WildcardDevice, BlockDevice, CharDevice, FifoDevice:
114
+		return true
115
+	default:
116
+		return false
117
+	}
118
+}
119
+
120
+func (t Type) CanMknod() bool {
121
+	switch t {
122
+	case BlockDevice, CharDevice, FifoDevice:
123
+		return true
124
+	default:
125
+		return false
126
+	}
127
+}
128
+
129
+func (t Type) CanCgroup() bool {
130
+	switch t {
131
+	case WildcardDevice, BlockDevice, CharDevice:
132
+		return true
133
+	default:
134
+		return false
135
+	}
136
+}
137
+
138
+type Rule struct {
139
+	// Type of device ('c' for char, 'b' for block). If set to 'a', this rule
140
+	// acts as a wildcard and all fields other than Allow are ignored.
141
+	Type Type `json:"type"`
142
+
143
+	// Major is the device's major number.
144
+	Major int64 `json:"major"`
145
+
146
+	// Minor is the device's minor number.
147
+	Minor int64 `json:"minor"`
148
+
149
+	// Permissions is the set of permissions that this rule applies to (in the
150
+	// cgroupv1 format -- any combination of "rwm").
151
+	Permissions Permissions `json:"permissions"`
152
+
153
+	// Allow specifies whether this rule is allowed.
154
+	Allow bool `json:"allow"`
155
+}
156
+
157
+func (d *Rule) CgroupString() string {
158
+	var (
159
+		major = strconv.FormatInt(d.Major, 10)
160
+		minor = strconv.FormatInt(d.Minor, 10)
161
+	)
162
+	if d.Major == Wildcard {
163
+		major = "*"
164
+	}
165
+	if d.Minor == Wildcard {
166
+		minor = "*"
167
+	}
168
+	return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions)
169
+}
170
+
171
+func (d *Rule) Mkdev() (uint64, error) {
172
+	return mkDev(d)
173
+}
0 174
new file mode 100644
... ...
@@ -0,0 +1,14 @@
0
+package config
1
+
2
+import (
3
+	"errors"
4
+
5
+	"golang.org/x/sys/unix"
6
+)
7
+
8
+func mkDev(d *Rule) (uint64, error) {
9
+	if d.Major == Wildcard || d.Minor == Wildcard {
10
+		return 0, errors.New("cannot mkdev() device with wildcards")
11
+	}
12
+	return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil
13
+}
0 14
new file mode 100644
... ...
@@ -0,0 +1,216 @@
0
+package cgroups
1
+
2
+import (
3
+	"bytes"
4
+	"errors"
5
+	"fmt"
6
+	"os"
7
+	"path/filepath"
8
+	"strconv"
9
+	"strings"
10
+	"sync"
11
+
12
+	"github.com/sirupsen/logrus"
13
+	"golang.org/x/sys/unix"
14
+)
15
+
16
+// OpenFile opens a cgroup file in a given dir with given flags.
17
+// It is supposed to be used for cgroup files only, and returns
18
+// an error if the file is not a cgroup file.
19
+//
20
+// Arguments dir and file are joined together to form an absolute path
21
+// to a file being opened.
22
+func OpenFile(dir, file string, flags int) (*os.File, error) {
23
+	if dir == "" {
24
+		return nil, fmt.Errorf("no directory specified for %s", file)
25
+	}
26
+	return openFile(dir, file, flags)
27
+}
28
+
29
+// ReadFile reads data from a cgroup file in dir.
30
+// It is supposed to be used for cgroup files only.
31
+func ReadFile(dir, file string) (string, error) {
32
+	fd, err := OpenFile(dir, file, unix.O_RDONLY)
33
+	if err != nil {
34
+		return "", err
35
+	}
36
+	defer fd.Close()
37
+	var buf bytes.Buffer
38
+
39
+	_, err = buf.ReadFrom(fd)
40
+	return buf.String(), err
41
+}
42
+
43
+// WriteFile writes data to a cgroup file in dir.
44
+// It is supposed to be used for cgroup files only.
45
+func WriteFile(dir, file, data string) error {
46
+	fd, err := OpenFile(dir, file, unix.O_WRONLY)
47
+	if err != nil {
48
+		return err
49
+	}
50
+	defer fd.Close()
51
+	if _, err := fd.WriteString(data); err != nil {
52
+		// Having data in the error message helps in debugging.
53
+		return fmt.Errorf("failed to write %q: %w", data, err)
54
+	}
55
+	return nil
56
+}
57
+
58
+// WriteFileByLine is the same as WriteFile, except if data contains newlines,
59
+// it is written line by line.
60
+func WriteFileByLine(dir, file, data string) error {
61
+	i := strings.Index(data, "\n")
62
+	if i == -1 {
63
+		return WriteFile(dir, file, data)
64
+	}
65
+
66
+	fd, err := OpenFile(dir, file, unix.O_WRONLY)
67
+	if err != nil {
68
+		return err
69
+	}
70
+	defer fd.Close()
71
+	start := 0
72
+	for {
73
+		var line string
74
+		if i == -1 {
75
+			line = data[start:]
76
+		} else {
77
+			line = data[start : start+i+1]
78
+		}
79
+		_, err := fd.WriteString(line)
80
+		if err != nil {
81
+			return fmt.Errorf("failed to write %q: %w", line, err)
82
+		}
83
+		if i == -1 {
84
+			break
85
+		}
86
+		start += i + 1
87
+		i = strings.Index(data[start:], "\n")
88
+	}
89
+	return nil
90
+}
91
+
92
+const (
93
+	cgroupfsDir    = "/sys/fs/cgroup"
94
+	cgroupfsPrefix = cgroupfsDir + "/"
95
+)
96
+
97
+var (
98
+	// TestMode is set to true by unit tests that need "fake" cgroupfs.
99
+	TestMode bool
100
+
101
+	cgroupRootHandle *os.File
102
+	prepOnce         sync.Once
103
+	prepErr          error
104
+	resolveFlags     uint64
105
+)
106
+
107
+func prepareOpenat2() error {
108
+	prepOnce.Do(func() {
109
+		fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
110
+			Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC,
111
+		})
112
+		if err != nil {
113
+			prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
114
+			if err != unix.ENOSYS {
115
+				logrus.Warnf("falling back to securejoin: %s", prepErr)
116
+			} else {
117
+				logrus.Debug("openat2 not available, falling back to securejoin")
118
+			}
119
+			return
120
+		}
121
+		file := os.NewFile(uintptr(fd), cgroupfsDir)
122
+
123
+		var st unix.Statfs_t
124
+		if err := unix.Fstatfs(int(file.Fd()), &st); err != nil {
125
+			prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
126
+			logrus.Warnf("falling back to securejoin: %s", prepErr)
127
+			return
128
+		}
129
+
130
+		cgroupRootHandle = file
131
+		resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
132
+		if st.Type == unix.CGROUP2_SUPER_MAGIC {
133
+			// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
134
+			resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS
135
+		}
136
+	})
137
+
138
+	return prepErr
139
+}
140
+
141
+func openFile(dir, file string, flags int) (*os.File, error) {
142
+	mode := os.FileMode(0)
143
+	if TestMode && flags&os.O_WRONLY != 0 {
144
+		// "emulate" cgroup fs for unit tests
145
+		flags |= os.O_TRUNC | os.O_CREATE
146
+		mode = 0o600
147
+	}
148
+	// NOTE it is important to use filepath.Clean("/"+file) here
149
+	// (see https://github.com/opencontainers/runc/issues/4103)!
150
+	path := filepath.Join(dir, filepath.Clean("/"+file))
151
+
152
+	if prepareOpenat2() != nil {
153
+		return openFallback(path, flags, mode)
154
+	}
155
+	relPath, ok := strings.CutPrefix(path, cgroupfsPrefix)
156
+	if !ok { // Non-standard path, old system?
157
+		return openFallback(path, flags, mode)
158
+	}
159
+
160
+	fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relPath,
161
+		&unix.OpenHow{
162
+			Resolve: resolveFlags,
163
+			Flags:   uint64(flags) | unix.O_CLOEXEC,
164
+			Mode:    uint64(mode),
165
+		})
166
+	if err != nil {
167
+		err = &os.PathError{Op: "openat2", Path: path, Err: err}
168
+		// Check if cgroupRootHandle is still opened to cgroupfsDir
169
+		// (happens when this package is incorrectly used
170
+		// across the chroot/pivot_root/mntns boundary, or
171
+		// when /sys/fs/cgroup is remounted).
172
+		//
173
+		// TODO: if such usage will ever be common, amend this
174
+		// to reopen cgroupRootHandle and retry openat2.
175
+		fdDest, fdErr := os.Readlink("/proc/thread-self/fd/" + strconv.Itoa(int(cgroupRootHandle.Fd())))
176
+		if fdErr == nil && fdDest != cgroupfsDir {
177
+			// Wrap the error so it is clear that cgroupRootHandle
178
+			// is opened to an unexpected/wrong directory.
179
+			err = fmt.Errorf("cgroupRootHandle %d unexpectedly opened to %s != %s: %w",
180
+				cgroupRootHandle.Fd(), fdDest, cgroupfsDir, err)
181
+		}
182
+		return nil, err
183
+	}
184
+
185
+	return os.NewFile(uintptr(fd), path), nil
186
+}
187
+
188
+var errNotCgroupfs = errors.New("not a cgroup file")
189
+
190
+// Can be changed by unit tests.
191
+var openFallback = openAndCheck
192
+
193
+// openAndCheck is used when openat2(2) is not available. It checks the opened
194
+// file is on cgroupfs, returning an error otherwise.
195
+func openAndCheck(path string, flags int, mode os.FileMode) (*os.File, error) {
196
+	fd, err := os.OpenFile(path, flags, mode)
197
+	if err != nil {
198
+		return nil, err
199
+	}
200
+	if TestMode {
201
+		return fd, nil
202
+	}
203
+	// Check this is a cgroupfs file.
204
+	var st unix.Statfs_t
205
+	if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil {
206
+		_ = fd.Close()
207
+		return nil, &os.PathError{Op: "statfs", Path: path, Err: err}
208
+	}
209
+	if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC {
210
+		_ = fd.Close()
211
+		return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs}
212
+	}
213
+
214
+	return fd, nil
215
+}
0 216
new file mode 100644
... ...
@@ -0,0 +1,27 @@
0
+package cgroups
1
+
2
+import (
3
+	"io/fs"
4
+	"path/filepath"
5
+)
6
+
7
+// GetAllPids returns all pids from the cgroup identified by path, and all its
8
+// sub-cgroups.
9
+func GetAllPids(path string) ([]int, error) {
10
+	var pids []int
11
+	err := filepath.WalkDir(path, func(p string, d fs.DirEntry, iErr error) error {
12
+		if iErr != nil {
13
+			return iErr
14
+		}
15
+		if !d.IsDir() {
16
+			return nil
17
+		}
18
+		cPids, err := readProcsFile(p)
19
+		if err != nil {
20
+			return err
21
+		}
22
+		pids = append(pids, cPids...)
23
+		return nil
24
+	})
25
+	return pids, err
26
+}
0 27
new file mode 100644
... ...
@@ -0,0 +1,200 @@
0
+package cgroups
1
+
2
// ThrottlingData holds CPU throttling counters. Zero-valued fields are
// omitted from the JSON encoding.
type ThrottlingData struct {
	// Periods is the number of periods with throttling active.
	Periods uint64 `json:"periods,omitempty"`
	// ThrottledPeriods is the number of periods when the container hit its
	// throttling limit.
	ThrottledPeriods uint64 `json:"throttled_periods,omitempty"`
	// ThrottledTime is the aggregate time the container was throttled for,
	// in nanoseconds.
	ThrottledTime uint64 `json:"throttled_time,omitempty"`
}
10
+
11
// CpuUsage describes CPU time consumption.
// Every value is an aggregate since container inception and is expressed
// in nanoseconds.
type CpuUsage struct {
	// TotalUsage is the total CPU time consumed.
	TotalUsage uint64 `json:"total_usage,omitempty"`
	// PercpuUsage is the total CPU time consumed, per core.
	PercpuUsage []uint64 `json:"percpu_usage,omitempty"`
	// PercpuUsageInKernelmode is the CPU time consumed in kernel mode,
	// per core.
	PercpuUsageInKernelmode []uint64 `json:"percpu_usage_in_kernelmode"`
	// PercpuUsageInUsermode is the CPU time consumed in user mode, per core.
	PercpuUsageInUsermode []uint64 `json:"percpu_usage_in_usermode"`
	// UsageInKernelmode is the time spent by tasks of the cgroup in
	// kernel mode.
	UsageInKernelmode uint64 `json:"usage_in_kernelmode"`
	// UsageInUsermode is the time spent by tasks of the cgroup in user mode.
	UsageInUsermode uint64 `json:"usage_in_usermode"`
}
33
+
34
// PSIData holds the avg10, avg60, avg300 and total values of a single
// pressure record (presumably one line of a kernel pressure-stall file;
// confirm against the code that fills it in). All four fields are always
// present in the JSON encoding.
type PSIData struct {
	Avg10  float64 `json:"avg10"`
	Avg60  float64 `json:"avg60"`
	Avg300 float64 `json:"avg300"`
	Total  uint64  `json:"total"`
}
40
+
41
+type PSIStats struct {
42
+	Some PSIData `json:"some,omitempty"`
43
+	Full PSIData `json:"full,omitempty"`
44
+}
45
+
46
+type CpuStats struct {
47
+	CpuUsage       CpuUsage       `json:"cpu_usage,omitempty"`
48
+	ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
49
+	PSI            *PSIStats      `json:"psi,omitempty"`
50
+}
51
+
52
// CPUSetStats holds the cpuset-related values of a cgroup.
type CPUSetStats struct {
	// CPUs lists the physical numbers of the CPUs on which processes
	// in that cpuset are allowed to execute.
	CPUs []uint16 `json:"cpus,omitempty"`
	// CPUExclusive is the cpu_exclusive flag.
	CPUExclusive uint64 `json:"cpu_exclusive"`
	// Mems lists the memory nodes on which processes in that cpuset
	// are allowed to allocate memory.
	Mems []uint16 `json:"mems,omitempty"`
	// MemHardwall is the mem_hardwall flag.
	MemHardwall uint64 `json:"mem_hardwall"`
	// MemExclusive is the mem_exclusive flag.
	MemExclusive uint64 `json:"mem_exclusive"`
	// MemoryMigrate is the memory_migrate flag.
	MemoryMigrate uint64 `json:"memory_migrate"`
	// MemorySpreadPage is the memory_spread_page flag.
	MemorySpreadPage uint64 `json:"memory_spread_page"`
	// MemorySpreadSlab is the memory_spread_slab flag.
	MemorySpreadSlab uint64 `json:"memory_spread_slab"`
	// MemoryPressure is the memory_pressure value.
	MemoryPressure uint64 `json:"memory_pressure"`
	// SchedLoadBalance is the sched_load_balance flag.
	SchedLoadBalance uint64 `json:"sched_load_balance"`
	// SchedRelaxDomainLevel is the sched_relax_domain_level value.
	SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"`
}
78
+
79
// MemoryData is one usage / max-usage / failure-count / limit record for a
// kind of memory. Failcnt and Limit are always encoded in JSON; Usage and
// MaxUsage are omitted when zero.
type MemoryData struct {
	Usage    uint64 `json:"usage,omitempty"`
	MaxUsage uint64 `json:"max_usage,omitempty"`
	Failcnt  uint64 `json:"failcnt"`
	Limit    uint64 `json:"limit"`
}
85
+
86
+type MemoryStats struct {
87
+	// memory used for cache
88
+	Cache uint64 `json:"cache,omitempty"`
89
+	// usage of memory
90
+	Usage MemoryData `json:"usage,omitempty"`
91
+	// usage of memory + swap
92
+	SwapUsage MemoryData `json:"swap_usage,omitempty"`
93
+	// usage of swap only
94
+	SwapOnlyUsage MemoryData `json:"swap_only_usage,omitempty"`
95
+	// usage of kernel memory
96
+	KernelUsage MemoryData `json:"kernel_usage,omitempty"`
97
+	// usage of kernel TCP memory
98
+	KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
99
+	// usage of memory pages by NUMA node
100
+	// see chapter 5.6 of memory controller documentation
101
+	PageUsageByNUMA PageUsageByNUMA `json:"page_usage_by_numa,omitempty"`
102
+	// if true, memory usage is accounted for throughout a hierarchy of cgroups.
103
+	UseHierarchy bool `json:"use_hierarchy"`
104
+
105
+	Stats map[string]uint64 `json:"stats,omitempty"`
106
+	PSI   *PSIStats         `json:"psi,omitempty"`
107
+}
108
+
109
+type PageUsageByNUMA struct {
110
+	// Embedding is used as types can't be recursive.
111
+	PageUsageByNUMAInner
112
+	Hierarchical PageUsageByNUMAInner `json:"hierarchical,omitempty"`
113
+}
114
+
115
+type PageUsageByNUMAInner struct {
116
+	Total       PageStats `json:"total,omitempty"`
117
+	File        PageStats `json:"file,omitempty"`
118
+	Anon        PageStats `json:"anon,omitempty"`
119
+	Unevictable PageStats `json:"unevictable,omitempty"`
120
+}
121
+
122
// PageStats is a page count with an optional per-node breakdown.
type PageStats struct {
	// Total is the overall count.
	Total uint64 `json:"total,omitempty"`
	// Nodes maps a node number to its count (presumably a NUMA node ID,
	// given the types that use PageStats; confirm against the parser).
	Nodes map[uint8]uint64 `json:"nodes,omitempty"`
}
126
+
127
// PidsStats holds the pids-related statistics of a cgroup.
type PidsStats struct {
	// Current is the number of pids in the cgroup.
	Current uint64 `json:"current,omitempty"`
	// Limit is the active pids hard limit.
	Limit uint64 `json:"limit,omitempty"`
}
133
+
134
// BlkioStatEntry is a single block I/O statistic: a value for an operation
// on the device identified by its major and minor numbers. Zero-valued
// fields are omitted from the JSON encoding.
type BlkioStatEntry struct {
	Major uint64 `json:"major,omitempty"`
	Minor uint64 `json:"minor,omitempty"`
	Op    string `json:"op,omitempty"`
	Value uint64 `json:"value,omitempty"`
}
140
+
141
+type BlkioStats struct {
142
+	// number of bytes transferred to and from the block device
143
+	IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"`
144
+	IoServicedRecursive     []BlkioStatEntry `json:"io_serviced_recursive,omitempty"`
145
+	IoQueuedRecursive       []BlkioStatEntry `json:"io_queue_recursive,omitempty"`
146
+	IoServiceTimeRecursive  []BlkioStatEntry `json:"io_service_time_recursive,omitempty"`
147
+	IoWaitTimeRecursive     []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"`
148
+	IoMergedRecursive       []BlkioStatEntry `json:"io_merged_recursive,omitempty"`
149
+	IoTimeRecursive         []BlkioStatEntry `json:"io_time_recursive,omitempty"`
150
+	SectorsRecursive        []BlkioStatEntry `json:"sectors_recursive,omitempty"`
151
+	PSI                     *PSIStats        `json:"psi,omitempty"`
152
+}
153
+
154
// HugetlbStats holds the hugetlb statistics for one huge page size.
type HugetlbStats struct {
	// Usage is the current res_counter usage for hugetlb.
	Usage uint64 `json:"usage,omitempty"`
	// MaxUsage is the maximum usage ever recorded.
	MaxUsage uint64 `json:"max_usage,omitempty"`
	// Failcnt is the number of hugetlb usage allocation failures.
	Failcnt uint64 `json:"failcnt"`
}
162
+
163
// RdmaEntry holds the hca_handles and hca_objects values for one RDMA
// device. Zero-valued fields are omitted from the JSON encoding.
type RdmaEntry struct {
	Device     string `json:"device,omitempty"`
	HcaHandles uint32 `json:"hca_handles,omitempty"`
	HcaObjects uint32 `json:"hca_objects,omitempty"`
}
168
+
169
+type RdmaStats struct {
170
+	RdmaLimit   []RdmaEntry `json:"rdma_limit,omitempty"`
171
+	RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"`
172
+}
173
+
174
// MiscStats holds the statistics for one key of the misc controller.
type MiscStats struct {
	// Usage is the current resource usage for a key in misc.
	Usage uint64 `json:"usage,omitempty"`
	// Events is the number of times the resource usage was about to go
	// over the max boundary.
	Events uint64 `json:"events,omitempty"`
}
180
+
181
+type Stats struct {
182
+	CpuStats    CpuStats    `json:"cpu_stats,omitempty"`
183
+	CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"`
184
+	MemoryStats MemoryStats `json:"memory_stats,omitempty"`
185
+	PidsStats   PidsStats   `json:"pids_stats,omitempty"`
186
+	BlkioStats  BlkioStats  `json:"blkio_stats,omitempty"`
187
+	// the map is in the format "size of hugepage: stats of the hugepage"
188
+	HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
189
+	RdmaStats    RdmaStats               `json:"rdma_stats,omitempty"`
190
+	// the map is in the format "misc resource name: stats of the key"
191
+	MiscStats map[string]MiscStats `json:"misc_stats,omitempty"`
192
+}
193
+
194
+func NewStats() *Stats {
195
+	memoryStats := MemoryStats{Stats: make(map[string]uint64)}
196
+	hugetlbStats := make(map[string]HugetlbStats)
197
+	miscStats := make(map[string]MiscStats)
198
+	return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats, MiscStats: miscStats}
199
+}
0 200
new file mode 100644
... ...
@@ -0,0 +1,468 @@
0
+package cgroups
1
+
2
+import (
3
+	"bufio"
4
+	"errors"
5
+	"fmt"
6
+	"io"
7
+	"os"
8
+	"path/filepath"
9
+	"strconv"
10
+	"strings"
11
+	"sync"
12
+	"time"
13
+
14
+	"github.com/moby/sys/userns"
15
+	"github.com/sirupsen/logrus"
16
+	"golang.org/x/sys/unix"
17
+)
18
+
19
const (
	// CgroupProcesses is the name of the file, inside a cgroup directory,
	// that pids are read from and written to.
	CgroupProcesses = "cgroup.procs"
	// unifiedMountpoint is the path probed to detect cgroup v2 unified mode.
	unifiedMountpoint = "/sys/fs/cgroup"
	// hybridMountpoint is the path probed to detect cgroup v2 hybrid mode.
	hybridMountpoint = "/sys/fs/cgroup/unified"
)
24
+
25
// Cached results of the cgroup mode probes. Each flag is computed at most
// once, guarded by its sync.Once, on the first call to
// IsCgroup2UnifiedMode or IsCgroup2HybridMode respectively.
var (
	isUnifiedOnce sync.Once
	isUnified     bool
	isHybridOnce  sync.Once
	isHybrid      bool
)
31
+
32
+// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
33
+func IsCgroup2UnifiedMode() bool {
34
+	isUnifiedOnce.Do(func() {
35
+		var st unix.Statfs_t
36
+		err := unix.Statfs(unifiedMountpoint, &st)
37
+		if err != nil {
38
+			level := logrus.WarnLevel
39
+			if os.IsNotExist(err) && userns.RunningInUserNS() {
40
+				// For rootless containers, sweep it under the rug.
41
+				level = logrus.DebugLevel
42
+			}
43
+			logrus.StandardLogger().Logf(level,
44
+				"statfs %s: %v; assuming cgroup v1", unifiedMountpoint, err)
45
+		}
46
+		isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
47
+	})
48
+	return isUnified
49
+}
50
+
51
+// IsCgroup2HybridMode returns whether we are running in cgroup v2 hybrid mode.
52
+func IsCgroup2HybridMode() bool {
53
+	isHybridOnce.Do(func() {
54
+		var st unix.Statfs_t
55
+		err := unix.Statfs(hybridMountpoint, &st)
56
+		if err != nil {
57
+			isHybrid = false
58
+			if !os.IsNotExist(err) {
59
+				// Report unexpected errors.
60
+				logrus.WithError(err).Debugf("statfs(%q) failed", hybridMountpoint)
61
+			}
62
+			return
63
+		}
64
+		isHybrid = st.Type == unix.CGROUP2_SUPER_MAGIC
65
+	})
66
+	return isHybrid
67
+}
68
+
69
// Mount describes one cgroup mount: where it is mounted, the root of the
// mount within the filesystem, and the subsystems (controllers) attached
// to it.
type Mount struct {
	Mountpoint string
	Root       string
	Subsystems []string
}
74
+
75
+// GetCgroupMounts returns the mounts for the cgroup subsystems.
76
+// all indicates whether to return just the first instance or all the mounts.
77
+// This function should not be used from cgroupv2 code, as in this case
78
+// all the controllers are available under the constant unifiedMountpoint.
79
+func GetCgroupMounts(all bool) ([]Mount, error) {
80
+	if IsCgroup2UnifiedMode() {
81
+		// TODO: remove cgroupv2 case once all external users are converted
82
+		availableControllers, err := GetAllSubsystems()
83
+		if err != nil {
84
+			return nil, err
85
+		}
86
+		m := Mount{
87
+			Mountpoint: unifiedMountpoint,
88
+			Root:       unifiedMountpoint,
89
+			Subsystems: availableControllers,
90
+		}
91
+		return []Mount{m}, nil
92
+	}
93
+
94
+	return getCgroupMountsV1(all)
95
+}
96
+
97
+// GetAllSubsystems returns all the cgroup subsystems supported by the kernel
98
+func GetAllSubsystems() ([]string, error) {
99
+	// /proc/cgroups is meaningless for v2
100
+	// https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features
101
+	if IsCgroup2UnifiedMode() {
102
+		// "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers.
103
+		// - devices: implemented in kernel 4.15
104
+		// - freezer: implemented in kernel 5.2
105
+		// We assume these are always available, as it is hard to detect availability.
106
+		pseudo := []string{"devices", "freezer"}
107
+		data, err := ReadFile("/sys/fs/cgroup", "cgroup.controllers")
108
+		if err != nil {
109
+			return nil, err
110
+		}
111
+		subsystems := append(pseudo, strings.Fields(data)...)
112
+		return subsystems, nil
113
+	}
114
+	f, err := os.Open("/proc/cgroups")
115
+	if err != nil {
116
+		return nil, err
117
+	}
118
+	defer f.Close()
119
+
120
+	subsystems := []string{}
121
+
122
+	s := bufio.NewScanner(f)
123
+	for s.Scan() {
124
+		text := s.Text()
125
+		if text[0] != '#' {
126
+			parts := strings.Fields(text)
127
+			if len(parts) >= 4 && parts[3] != "0" {
128
+				subsystems = append(subsystems, parts[0])
129
+			}
130
+		}
131
+	}
132
+	if err := s.Err(); err != nil {
133
+		return nil, err
134
+	}
135
+	return subsystems, nil
136
+}
137
+
138
+func readProcsFile(dir string) (out []int, _ error) {
139
+	file := CgroupProcesses
140
+	retry := true
141
+
142
+again:
143
+	f, err := OpenFile(dir, file, os.O_RDONLY)
144
+	if err != nil {
145
+		return nil, err
146
+	}
147
+	defer f.Close()
148
+
149
+	s := bufio.NewScanner(f)
150
+	for s.Scan() {
151
+		if t := s.Text(); t != "" {
152
+			pid, err := strconv.Atoi(t)
153
+			if err != nil {
154
+				return nil, err
155
+			}
156
+			out = append(out, pid)
157
+		}
158
+	}
159
+	if errors.Is(s.Err(), unix.ENOTSUP) && retry {
160
+		// For a threaded cgroup, read returns ENOTSUP, and we should
161
+		// read from cgroup.threads instead.
162
+		file = "cgroup.threads"
163
+		retry = false
164
+		goto again
165
+	}
166
+	return out, s.Err()
167
+}
168
+
169
+// ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup
170
+// or /proc/<pid>/cgroup, into a map of subsystems to cgroup paths, e.g.
171
+//
172
+//	"cpu": "/user.slice/user-1000.slice"
173
+//	"pids": "/user.slice/user-1000.slice"
174
+//
175
+// etc.
176
+//
177
+// Note that for cgroup v2 unified hierarchy, there are no per-controller
178
+// cgroup paths, so the resulting map will have a single element where the key
179
+// is empty string ("") and the value is the cgroup path the <pid> is in.
180
+func ParseCgroupFile(path string) (map[string]string, error) {
181
+	f, err := os.Open(path)
182
+	if err != nil {
183
+		return nil, err
184
+	}
185
+	defer f.Close()
186
+
187
+	return parseCgroupFromReader(f)
188
+}
189
+
190
// parseCgroupFromReader does the actual parsing for ParseCgroupFile; it
// takes an io.Reader to make testing easier.
//
// Every line must have the form described in cgroups(7) for
// /proc/[pid]/cgroup:
//
//	hierarchy-ID:subsystem-list:cgroup-path
//
// Each name in the comma-separated subsystem list becomes a key mapping to
// the cgroup path. A line with fewer than two colons is an error.
func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
	result := make(map[string]string)

	scanner := bufio.NewScanner(r)
	for scanner.Scan() {
		line := scanner.Text()
		// Split into at most three fields so that colons inside the
		// cgroup path are preserved.
		fields := strings.SplitN(line, ":", 3)
		if len(fields) < 3 {
			return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", line)
		}

		subsystemList, cgroupPath := fields[1], fields[2]
		for _, name := range strings.Split(subsystemList, ",") {
			result[name] = cgroupPath
		}
	}
	if err := scanner.Err(); err != nil {
		return nil, err
	}

	return result, nil
}
218
+
219
// PathExists reports whether os.Stat succeeds on path. Any stat error, not
// only "does not exist", results in false.
func PathExists(path string) bool {
	_, err := os.Stat(path)
	return err == nil
}
225
+
226
+// rmdir tries to remove a directory, optionally retrying on EBUSY.
227
+func rmdir(path string, retry bool) error {
228
+	delay := time.Millisecond
229
+	tries := 10
230
+
231
+again:
232
+	err := unix.Rmdir(path)
233
+	switch err { // nolint:errorlint // unix errors are bare
234
+	case nil, unix.ENOENT:
235
+		return nil
236
+	case unix.EINTR:
237
+		goto again
238
+	case unix.EBUSY:
239
+		if retry && tries > 0 {
240
+			time.Sleep(delay)
241
+			delay *= 2
242
+			tries--
243
+			goto again
244
+
245
+		}
246
+	}
247
+	return &os.PathError{Op: "rmdir", Path: path, Err: err}
248
+}
249
+
250
+// RemovePath aims to remove cgroup path. It does so recursively,
251
+// by removing any subdirectories (sub-cgroups) first.
252
+func RemovePath(path string) error {
253
+	// Try the fast path first; don't retry on EBUSY yet.
254
+	if err := rmdir(path, false); err == nil {
255
+		return nil
256
+	}
257
+
258
+	// There are many reasons why rmdir can fail, including:
259
+	// 1. cgroup have existing sub-cgroups;
260
+	// 2. cgroup (still) have some processes (that are about to vanish);
261
+	// 3. lack of permission (one example is read-only /sys/fs/cgroup mount,
262
+	//    in which case rmdir returns EROFS even for for a non-existent path,
263
+	//    see issue 4518).
264
+	//
265
+	// Using os.ReadDir here kills two birds with one stone: check if
266
+	// the directory exists (handling scenario 3 above), and use
267
+	// directory contents to remove sub-cgroups (handling scenario 1).
268
+	infos, err := os.ReadDir(path)
269
+	if err != nil {
270
+		if os.IsNotExist(err) {
271
+			return nil
272
+		}
273
+		return err
274
+	}
275
+	// Let's remove sub-cgroups, if any.
276
+	for _, info := range infos {
277
+		if info.IsDir() {
278
+			if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
279
+				return err
280
+			}
281
+		}
282
+	}
283
+	// Finally, try rmdir again, this time with retries on EBUSY,
284
+	// which may help with scenario 2 above.
285
+	return rmdir(path, true)
286
+}
287
+
288
+// RemovePaths iterates over the provided paths removing them.
289
+func RemovePaths(paths map[string]string) (err error) {
290
+	for s, p := range paths {
291
+		if err := RemovePath(p); err == nil {
292
+			delete(paths, s)
293
+		}
294
+	}
295
+	if len(paths) == 0 {
296
+		clear(paths)
297
+		return nil
298
+	}
299
+	return fmt.Errorf("Failed to remove paths: %v", paths)
300
+}
301
+
302
+var (
303
+	hugePageSizes []string
304
+	initHPSOnce   sync.Once
305
+)
306
+
307
+func HugePageSizes() []string {
308
+	initHPSOnce.Do(func() {
309
+		dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
310
+		if err != nil {
311
+			return
312
+		}
313
+		files, err := dir.Readdirnames(0)
314
+		dir.Close()
315
+		if err != nil {
316
+			return
317
+		}
318
+
319
+		hugePageSizes, err = getHugePageSizeFromFilenames(files)
320
+		if err != nil {
321
+			logrus.Warn("HugePageSizes: ", err)
322
+		}
323
+	})
324
+
325
+	return hugePageSizes
326
+}
327
+
328
// getHugePageSizeFromFilenames converts directory entry names of the form
// "hugepages-<N>kB" (example: hugepages-1048576kB) into human-readable
// sizes: "<N>KB" below 1024 kB, "<N/1024>MB" below 1048576 kB and
// "<N/1048576>GB" above that.
//
// Names without the "hugepages-" prefix are silently ignored. Names with a
// suffix other than "kB" or a non-numeric size are skipped; the first such
// problem is returned as the error, alongside the sizes that did parse.
func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
	var firstErr error
	// remember keeps only the first problem encountered.
	remember := func(err error) {
		if firstErr == nil {
			firstErr = err
		}
	}

	sizes := make([]string, 0, len(fileNames))
	for _, name := range fileNames {
		rest, found := strings.CutPrefix(name, "hugepages-")
		if !found {
			// Unexpected file name: no prefix found, ignore it.
			continue
		}
		// The suffix is always "kB" (as of Linux 5.13). If we find
		// something else, produce an error but keep going.
		digits, found := strings.CutSuffix(rest, "kB")
		if !found {
			// Highly unlikely.
			remember(errors.New(name + `: invalid suffix (expected "kB")`))
			continue
		}
		kb, err := strconv.Atoi(digits)
		if err != nil {
			// Highly unlikely.
			remember(fmt.Errorf("%s: %w", name, err))
			continue
		}
		// Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
		// but in our case the size is in KB already.
		switch {
		case kb >= 1<<20:
			sizes = append(sizes, strconv.Itoa(kb>>20)+"GB")
		case kb >= 1<<10:
			sizes = append(sizes, strconv.Itoa(kb>>10)+"MB")
		default:
			sizes = append(sizes, digits+"KB")
		}
	}

	return sizes, firstErr
}
372
+
373
+// GetPids returns all pids, that were added to cgroup at path.
374
+func GetPids(dir string) ([]int, error) {
375
+	return readProcsFile(dir)
376
+}
377
+
378
+// WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file
379
+func WriteCgroupProc(dir string, pid int) error {
380
+	// Normally dir should not be empty, one case is that cgroup subsystem
381
+	// is not mounted, we will get empty dir, and we want it fail here.
382
+	if dir == "" {
383
+		return fmt.Errorf("no such directory for %s", CgroupProcesses)
384
+	}
385
+
386
+	// Dont attach any pid to the cgroup if -1 is specified as a pid
387
+	if pid == -1 {
388
+		return nil
389
+	}
390
+
391
+	file, err := OpenFile(dir, CgroupProcesses, os.O_WRONLY)
392
+	if err != nil {
393
+		return fmt.Errorf("failed to write %v: %w", pid, err)
394
+	}
395
+	defer file.Close()
396
+
397
+	for i := 0; i < 5; i++ {
398
+		_, err = file.WriteString(strconv.Itoa(pid))
399
+		if err == nil {
400
+			return nil
401
+		}
402
+
403
+		// EINVAL might mean that the task being added to cgroup.procs is in state
404
+		// TASK_NEW. We should attempt to do so again.
405
+		if errors.Is(err, unix.EINVAL) {
406
+			time.Sleep(30 * time.Millisecond)
407
+			continue
408
+		}
409
+
410
+		return fmt.Errorf("failed to write %v: %w", pid, err)
411
+	}
412
+	return err
413
+}
414
+
415
// ConvertCPUSharesToCgroupV2Value converts a cgroup v1 cpu shares value to
// a cgroup v2 weight.
//
// Since the OCI spec is designed for cgroup v1, in some cases there is need
// to convert from the cgroup v1 configuration to cgroup v2. The conversion
// maps [2-262144] linearly to [1-10000] using
//
//	y = (1 + ((x - 2) * 9999) / 262142)
//
// where 262144 comes from the Linux kernel definition
// "#define MAX_SHARES (1UL << 18)".
//
// A value of 0 means "unset" and is returned as 0. Non-zero inputs outside
// [2-262144] are clamped to the nearest bound, so the result never leaves
// [1-10000]; in particular cpuShares == 1 no longer underflows the
// unsigned subtraction.
func ConvertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 {
	const (
		minShares = 2
		maxShares = 262144
		minWeight = 1
		maxWeight = 10000
	)
	switch {
	case cpuShares == 0:
		return 0
	case cpuShares <= minShares:
		return minWeight
	case cpuShares >= maxShares:
		return maxWeight
	}
	return minWeight + ((cpuShares-minShares)*(maxWeight-minWeight))/(maxShares-minShares)
}
426
+
427
// ConvertMemorySwapToCgroupV2Value converts the MemorySwap value from the
// OCI spec for use by cgroup v2 drivers. A conversion is needed because
// Resources.MemorySwap is defined as memory+swap combined, while in cgroup
// v2 swap is a separate value, so memory is subtracted from it where that
// makes sense.
//
// In both arguments -1 means "unlimited" and 0 means "unset". An error is
// returned when swap is set but memory is unset, when memory is negative
// (other than -1), or when memorySwap is smaller than memory.
func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
	// For compatibility with the cgroup1 controller, unlimited memory
	// with swap not explicitly set is treated as a request to
	// "set both memory and swap to unlimited".
	if memory == -1 && memorySwap == 0 {
		return -1, nil
	}
	// Treat -1 ("max") and 0 ("unset") swap as is.
	if memorySwap == -1 || memorySwap == 0 {
		return memorySwap, nil
	}
	// Unlimited memory, so treat swap as is.
	if memory == -1 {
		return memorySwap, nil
	}
	// Unset or unknown memory, can't calculate swap.
	if memory == 0 {
		return 0, errors.New("unable to set swap limit without memory limit")
	}
	// Does not make sense to subtract a negative value.
	if memory < 0 {
		return 0, fmt.Errorf("invalid memory value: %d", memory)
	}
	// Sanity check.
	if memorySwap < memory {
		return 0, errors.New("memory+swap limit should be >= memory limit")
	}

	return memorySwap - memory, nil
}
457
+
458
// ConvertBlkIOToIOWeightValue converts a cgroup v1 blkio weight to a cgroup
// v2 io weight.
//
// Since the OCI spec is designed for cgroup v1, in some cases there is need
// to convert from the cgroup v1 configuration to cgroup v2. The conversion
// maps [10-1000] linearly to [1-10000] using
//
//	y = (1 + (x - 10) * 9999 / 990)
//
// A value of 0 means "unset" and is returned as 0. Non-zero inputs outside
// [10-1000] are clamped to the nearest bound, so the result never leaves
// [1-10000]; in particular weights 1..9 no longer underflow the unsigned
// subtraction.
func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 {
	const (
		minBlkIOWeight = 10
		maxBlkIOWeight = 1000
		minIOWeight    = 1
		maxIOWeight    = 10000
	)
	switch {
	case blkIoWeight == 0:
		return 0
	case blkIoWeight <= minBlkIOWeight:
		return minIOWeight
	case blkIoWeight >= maxBlkIOWeight:
		return maxIOWeight
	}
	return minIOWeight + (uint64(blkIoWeight)-minBlkIOWeight)*(maxIOWeight-minIOWeight)/(maxBlkIOWeight-minBlkIOWeight)
}
0 468
new file mode 100644
... ...
@@ -0,0 +1,277 @@
0
+package cgroups
1
+
2
+import (
3
+	"errors"
4
+	"fmt"
5
+	"os"
6
+	"path/filepath"
7
+	"strings"
8
+	"sync"
9
+	"syscall"
10
+
11
+	securejoin "github.com/cyphar/filepath-securejoin"
12
+	"github.com/moby/sys/mountinfo"
13
+	"golang.org/x/sys/unix"
14
+)
15
+
16
+// Code in this source file is specific to cgroup v1,
17
+// and must not be used from any cgroup v2 code.
18
+
19
const (
	// CgroupNamePrefix is the prefix carried by named cgroup v1
	// hierarchies (e.g. "name=systemd"); it is stripped before a name is
	// used as a path component or reported as a subsystem.
	CgroupNamePrefix = "name="
	// defaultPrefix is the directory under which tryDefaultPath looks for
	// per-subsystem cgroup v1 mounts.
	defaultPrefix = "/sys/fs/cgroup"
)
23
+
24
+var (
25
+	errUnified     = errors.New("not implemented for cgroup v2 unified hierarchy")
26
+	ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1")
27
+
28
+	readMountinfoOnce sync.Once
29
+	readMountinfoErr  error
30
+	cgroupMountinfo   []*mountinfo.Info
31
+)
32
+
33
// NotFoundError is the error returned when no mountpoint (or cgroup path)
// could be found for a subsystem.
type NotFoundError struct {
	// Subsystem is the name of the subsystem that was looked up.
	Subsystem string
}

// Error implements the error interface.
func (e *NotFoundError) Error() string {
	return fmt.Sprintf("mountpoint for %s not found", e.Subsystem)
}
40
+
41
+func NewNotFoundError(sub string) error {
42
+	return &NotFoundError{
43
+		Subsystem: sub,
44
+	}
45
+}
46
+
47
+func IsNotFound(err error) bool {
48
+	var nfErr *NotFoundError
49
+	return errors.As(err, &nfErr)
50
+}
51
+
52
+func tryDefaultPath(cgroupPath, subsystem string) string {
53
+	if !strings.HasPrefix(defaultPrefix, cgroupPath) {
54
+		return ""
55
+	}
56
+
57
+	// remove possible prefix
58
+	subsystem = strings.TrimPrefix(subsystem, CgroupNamePrefix)
59
+
60
+	// Make sure we're still under defaultPrefix, and resolve
61
+	// a possible symlink (like cpu -> cpu,cpuacct).
62
+	path, err := securejoin.SecureJoin(defaultPrefix, subsystem)
63
+	if err != nil {
64
+		return ""
65
+	}
66
+
67
+	// (1) path should be a directory.
68
+	st, err := os.Lstat(path)
69
+	if err != nil || !st.IsDir() {
70
+		return ""
71
+	}
72
+
73
+	// (2) path should be a mount point.
74
+	pst, err := os.Lstat(filepath.Dir(path))
75
+	if err != nil {
76
+		return ""
77
+	}
78
+
79
+	if st.Sys().(*syscall.Stat_t).Dev == pst.Sys().(*syscall.Stat_t).Dev {
80
+		// parent dir has the same dev -- path is not a mount point
81
+		return ""
82
+	}
83
+
84
+	// (3) path should have 'cgroup' fs type.
85
+	fst := unix.Statfs_t{}
86
+	err = unix.Statfs(path, &fst)
87
+	if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC {
88
+		return ""
89
+	}
90
+
91
+	return path
92
+}
93
+
94
+// readCgroupMountinfo returns a list of cgroup v1 mounts (i.e. the ones
95
+// with fstype of "cgroup") for the current running process.
96
+//
97
+// The results are cached (to avoid re-reading mountinfo which is relatively
98
+// expensive), so it is assumed that cgroup mounts are not being changed.
99
+func readCgroupMountinfo() ([]*mountinfo.Info, error) {
100
+	readMountinfoOnce.Do(func() {
101
+		// mountinfo.GetMounts uses /proc/thread-self, so we can use it without
102
+		// issues.
103
+		cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts(
104
+			mountinfo.FSTypeFilter("cgroup"),
105
+		)
106
+	})
107
+	return cgroupMountinfo, readMountinfoErr
108
+}
109
+
110
+// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
111
+func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
112
+	if IsCgroup2UnifiedMode() {
113
+		return "", errUnified
114
+	}
115
+
116
+	// If subsystem is empty, we look for the cgroupv2 hybrid path.
117
+	if len(subsystem) == 0 {
118
+		return hybridMountpoint, nil
119
+	}
120
+
121
+	// Avoid parsing mountinfo by trying the default path first, if possible.
122
+	if path := tryDefaultPath(cgroupPath, subsystem); path != "" {
123
+		return path, nil
124
+	}
125
+
126
+	mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem)
127
+	return mnt, err
128
+}
129
+
130
+func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) {
131
+	if IsCgroup2UnifiedMode() {
132
+		return "", "", errUnified
133
+	}
134
+
135
+	mi, err := readCgroupMountinfo()
136
+	if err != nil {
137
+		return "", "", err
138
+	}
139
+
140
+	return findCgroupMountpointAndRootFromMI(mi, cgroupPath, subsystem)
141
+}
142
+
143
+func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, subsystem string) (string, string, error) {
144
+	for _, mi := range mounts {
145
+		if strings.HasPrefix(mi.Mountpoint, cgroupPath) {
146
+			for _, opt := range strings.Split(mi.VFSOptions, ",") {
147
+				if opt == subsystem {
148
+					return mi.Mountpoint, mi.Root, nil
149
+				}
150
+			}
151
+		}
152
+	}
153
+
154
+	return "", "", NewNotFoundError(subsystem)
155
+}
156
+
157
+func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
158
+	if len(m.Subsystems) == 0 {
159
+		return "", errors.New("no subsystem for mount")
160
+	}
161
+
162
+	return getControllerPath(m.Subsystems[0], cgroups)
163
+}
164
+
165
+func getCgroupMountsHelper(ss map[string]bool, mounts []*mountinfo.Info, all bool) ([]Mount, error) {
166
+	res := make([]Mount, 0, len(ss))
167
+	numFound := 0
168
+	for _, mi := range mounts {
169
+		m := Mount{
170
+			Mountpoint: mi.Mountpoint,
171
+			Root:       mi.Root,
172
+		}
173
+		for _, opt := range strings.Split(mi.VFSOptions, ",") {
174
+			seen, known := ss[opt]
175
+			if !known || (!all && seen) {
176
+				continue
177
+			}
178
+			ss[opt] = true
179
+			opt = strings.TrimPrefix(opt, CgroupNamePrefix)
180
+			m.Subsystems = append(m.Subsystems, opt)
181
+			numFound++
182
+		}
183
+		if len(m.Subsystems) > 0 || all {
184
+			res = append(res, m)
185
+		}
186
+		if !all && numFound >= len(ss) {
187
+			break
188
+		}
189
+	}
190
+	return res, nil
191
+}
192
+
193
+func getCgroupMountsV1(all bool) ([]Mount, error) {
194
+	mi, err := readCgroupMountinfo()
195
+	if err != nil {
196
+		return nil, err
197
+	}
198
+
199
+	// We don't need to use /proc/thread-self here because runc always runs
200
+	// with every thread in the same cgroup. This lets us avoid having to do
201
+	// runtime.LockOSThread.
202
+	allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
203
+	if err != nil {
204
+		return nil, err
205
+	}
206
+
207
+	allMap := make(map[string]bool)
208
+	for s := range allSubsystems {
209
+		allMap[s] = false
210
+	}
211
+
212
+	return getCgroupMountsHelper(allMap, mi, all)
213
+}
214
+
215
+// GetOwnCgroup returns the relative path to the cgroup docker is running in.
216
+func GetOwnCgroup(subsystem string) (string, error) {
217
+	if IsCgroup2UnifiedMode() {
218
+		return "", errUnified
219
+	}
220
+
221
+	// We don't need to use /proc/thread-self here because runc always runs
222
+	// with every thread in the same cgroup. This lets us avoid having to do
223
+	// runtime.LockOSThread.
224
+	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
225
+	if err != nil {
226
+		return "", err
227
+	}
228
+
229
+	return getControllerPath(subsystem, cgroups)
230
+}
231
+
232
+func GetOwnCgroupPath(subsystem string) (string, error) {
233
+	cgroup, err := GetOwnCgroup(subsystem)
234
+	if err != nil {
235
+		return "", err
236
+	}
237
+
238
+	// If subsystem is empty, we look for the cgroupv2 hybrid path.
239
+	if len(subsystem) == 0 {
240
+		return hybridMountpoint, nil
241
+	}
242
+
243
+	return getCgroupPathHelper(subsystem, cgroup)
244
+}
245
+
246
+func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
247
+	mnt, root, err := FindCgroupMountpointAndRoot("", subsystem)
248
+	if err != nil {
249
+		return "", err
250
+	}
251
+
252
+	// This is needed for nested containers, because in /proc/self/cgroup we
253
+	// see paths from host, which don't exist in container.
254
+	relCgroup, err := filepath.Rel(root, cgroup)
255
+	if err != nil {
256
+		return "", err
257
+	}
258
+
259
+	return filepath.Join(mnt, relCgroup), nil
260
+}
261
+
262
+func getControllerPath(subsystem string, cgroups map[string]string) (string, error) {
263
+	if IsCgroup2UnifiedMode() {
264
+		return "", errUnified
265
+	}
266
+
267
+	if p, ok := cgroups[subsystem]; ok {
268
+		return p, nil
269
+	}
270
+
271
+	if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok {
272
+		return p, nil
273
+	}
274
+
275
+	return "", NewNotFoundError(subsystem)
276
+}
0 277
deleted file mode 100644
... ...
@@ -1,191 +0,0 @@
1
-
2
-                                 Apache License
3
-                           Version 2.0, January 2004
4
-                        http://www.apache.org/licenses/
5
-
6
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
-
8
-   1. Definitions.
9
-
10
-      "License" shall mean the terms and conditions for use, reproduction,
11
-      and distribution as defined by Sections 1 through 9 of this document.
12
-
13
-      "Licensor" shall mean the copyright owner or entity authorized by
14
-      the copyright owner that is granting the License.
15
-
16
-      "Legal Entity" shall mean the union of the acting entity and all
17
-      other entities that control, are controlled by, or are under common
18
-      control with that entity. For the purposes of this definition,
19
-      "control" means (i) the power, direct or indirect, to cause the
20
-      direction or management of such entity, whether by contract or
21
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
-      outstanding shares, or (iii) beneficial ownership of such entity.
23
-
24
-      "You" (or "Your") shall mean an individual or Legal Entity
25
-      exercising permissions granted by this License.
26
-
27
-      "Source" form shall mean the preferred form for making modifications,
28
-      including but not limited to software source code, documentation
29
-      source, and configuration files.
30
-
31
-      "Object" form shall mean any form resulting from mechanical
32
-      transformation or translation of a Source form, including but
33
-      not limited to compiled object code, generated documentation,
34
-      and conversions to other media types.
35
-
36
-      "Work" shall mean the work of authorship, whether in Source or
37
-      Object form, made available under the License, as indicated by a
38
-      copyright notice that is included in or attached to the work
39
-      (an example is provided in the Appendix below).
40
-
41
-      "Derivative Works" shall mean any work, whether in Source or Object
42
-      form, that is based on (or derived from) the Work and for which the
43
-      editorial revisions, annotations, elaborations, or other modifications
44
-      represent, as a whole, an original work of authorship. For the purposes
45
-      of this License, Derivative Works shall not include works that remain
46
-      separable from, or merely link (or bind by name) to the interfaces of,
47
-      the Work and Derivative Works thereof.
48
-
49
-      "Contribution" shall mean any work of authorship, including
50
-      the original version of the Work and any modifications or additions
51
-      to that Work or Derivative Works thereof, that is intentionally
52
-      submitted to Licensor for inclusion in the Work by the copyright owner
53
-      or by an individual or Legal Entity authorized to submit on behalf of
54
-      the copyright owner. For the purposes of this definition, "submitted"
55
-      means any form of electronic, verbal, or written communication sent
56
-      to the Licensor or its representatives, including but not limited to
57
-      communication on electronic mailing lists, source code control systems,
58
-      and issue tracking systems that are managed by, or on behalf of, the
59
-      Licensor for the purpose of discussing and improving the Work, but
60
-      excluding communication that is conspicuously marked or otherwise
61
-      designated in writing by the copyright owner as "Not a Contribution."
62
-
63
-      "Contributor" shall mean Licensor and any individual or Legal Entity
64
-      on behalf of whom a Contribution has been received by Licensor and
65
-      subsequently incorporated within the Work.
66
-
67
-   2. Grant of Copyright License. Subject to the terms and conditions of
68
-      this License, each Contributor hereby grants to You a perpetual,
69
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
-      copyright license to reproduce, prepare Derivative Works of,
71
-      publicly display, publicly perform, sublicense, and distribute the
72
-      Work and such Derivative Works in Source or Object form.
73
-
74
-   3. Grant of Patent License. Subject to the terms and conditions of
75
-      this License, each Contributor hereby grants to You a perpetual,
76
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
-      (except as stated in this section) patent license to make, have made,
78
-      use, offer to sell, sell, import, and otherwise transfer the Work,
79
-      where such license applies only to those patent claims licensable
80
-      by such Contributor that are necessarily infringed by their
81
-      Contribution(s) alone or by combination of their Contribution(s)
82
-      with the Work to which such Contribution(s) was submitted. If You
83
-      institute patent litigation against any entity (including a
84
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
85
-      or a Contribution incorporated within the Work constitutes direct
86
-      or contributory patent infringement, then any patent licenses
87
-      granted to You under this License for that Work shall terminate
88
-      as of the date such litigation is filed.
89
-
90
-   4. Redistribution. You may reproduce and distribute copies of the
91
-      Work or Derivative Works thereof in any medium, with or without
92
-      modifications, and in Source or Object form, provided that You
93
-      meet the following conditions:
94
-
95
-      (a) You must give any other recipients of the Work or
96
-          Derivative Works a copy of this License; and
97
-
98
-      (b) You must cause any modified files to carry prominent notices
99
-          stating that You changed the files; and
100
-
101
-      (c) You must retain, in the Source form of any Derivative Works
102
-          that You distribute, all copyright, patent, trademark, and
103
-          attribution notices from the Source form of the Work,
104
-          excluding those notices that do not pertain to any part of
105
-          the Derivative Works; and
106
-
107
-      (d) If the Work includes a "NOTICE" text file as part of its
108
-          distribution, then any Derivative Works that You distribute must
109
-          include a readable copy of the attribution notices contained
110
-          within such NOTICE file, excluding those notices that do not
111
-          pertain to any part of the Derivative Works, in at least one
112
-          of the following places: within a NOTICE text file distributed
113
-          as part of the Derivative Works; within the Source form or
114
-          documentation, if provided along with the Derivative Works; or,
115
-          within a display generated by the Derivative Works, if and
116
-          wherever such third-party notices normally appear. The contents
117
-          of the NOTICE file are for informational purposes only and
118
-          do not modify the License. You may add Your own attribution
119
-          notices within Derivative Works that You distribute, alongside
120
-          or as an addendum to the NOTICE text from the Work, provided
121
-          that such additional attribution notices cannot be construed
122
-          as modifying the License.
123
-
124
-      You may add Your own copyright statement to Your modifications and
125
-      may provide additional or different license terms and conditions
126
-      for use, reproduction, or distribution of Your modifications, or
127
-      for any such Derivative Works as a whole, provided Your use,
128
-      reproduction, and distribution of the Work otherwise complies with
129
-      the conditions stated in this License.
130
-
131
-   5. Submission of Contributions. Unless You explicitly state otherwise,
132
-      any Contribution intentionally submitted for inclusion in the Work
133
-      by You to the Licensor shall be under the terms and conditions of
134
-      this License, without any additional terms or conditions.
135
-      Notwithstanding the above, nothing herein shall supersede or modify
136
-      the terms of any separate license agreement you may have executed
137
-      with Licensor regarding such Contributions.
138
-
139
-   6. Trademarks. This License does not grant permission to use the trade
140
-      names, trademarks, service marks, or product names of the Licensor,
141
-      except as required for reasonable and customary use in describing the
142
-      origin of the Work and reproducing the content of the NOTICE file.
143
-
144
-   7. Disclaimer of Warranty. Unless required by applicable law or
145
-      agreed to in writing, Licensor provides the Work (and each
146
-      Contributor provides its Contributions) on an "AS IS" BASIS,
147
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
-      implied, including, without limitation, any warranties or conditions
149
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
-      PARTICULAR PURPOSE. You are solely responsible for determining the
151
-      appropriateness of using or redistributing the Work and assume any
152
-      risks associated with Your exercise of permissions under this License.
153
-
154
-   8. Limitation of Liability. In no event and under no legal theory,
155
-      whether in tort (including negligence), contract, or otherwise,
156
-      unless required by applicable law (such as deliberate and grossly
157
-      negligent acts) or agreed to in writing, shall any Contributor be
158
-      liable to You for damages, including any direct, indirect, special,
159
-      incidental, or consequential damages of any character arising as a
160
-      result of this License or out of the use or inability to use the
161
-      Work (including but not limited to damages for loss of goodwill,
162
-      work stoppage, computer failure or malfunction, or any and all
163
-      other commercial damages or losses), even if such Contributor
164
-      has been advised of the possibility of such damages.
165
-
166
-   9. Accepting Warranty or Additional Liability. While redistributing
167
-      the Work or Derivative Works thereof, You may choose to offer,
168
-      and charge a fee for, acceptance of support, warranty, indemnity,
169
-      or other liability obligations and/or rights consistent with this
170
-      License. However, in accepting such obligations, You may act only
171
-      on Your own behalf and on Your sole responsibility, not on behalf
172
-      of any other Contributor, and only if You agree to indemnify,
173
-      defend, and hold each Contributor harmless for any liability
174
-      incurred by, or claims asserted against, such Contributor by reason
175
-      of your accepting any such warranty or additional liability.
176
-
177
-   END OF TERMS AND CONDITIONS
178
-
179
-   Copyright 2014 Docker, Inc.
180
-
181
-   Licensed under the Apache License, Version 2.0 (the "License");
182
-   you may not use this file except in compliance with the License.
183
-   You may obtain a copy of the License at
184
-
185
-       http://www.apache.org/licenses/LICENSE-2.0
186
-
187
-   Unless required by applicable law or agreed to in writing, software
188
-   distributed under the License is distributed on an "AS IS" BASIS,
189
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
190
-   See the License for the specific language governing permissions and
191
-   limitations under the License.
192 1
deleted file mode 100644
... ...
@@ -1,17 +0,0 @@
1
-runc
2
-
3
-Copyright 2012-2015 Docker, Inc.
4
-
5
-This product includes software developed at Docker, Inc. (http://www.docker.com).
6
-
7
-The following is courtesy of our legal counsel:
8
-
9
-
10
-Use and transfer of Docker may be subject to certain restrictions by the
11
-United States and other governments.
12
-It is your responsibility to ensure that your use and/or transfer does not
13
-violate applicable laws.
14
-
15
-For more information, please see http://www.bis.doc.gov
16
-
17
-See also http://www.apache.org/dev/crypto.html and/or seek legal counsel.
18 1
deleted file mode 100644
... ...
@@ -1,80 +0,0 @@
1
-package cgroups
2
-
3
-import (
4
-	"errors"
5
-
6
-	"github.com/opencontainers/runc/libcontainer/configs"
7
-)
8
-
9
-var (
10
-	// ErrDevicesUnsupported is an error returned when a cgroup manager
11
-	// is not configured to set device rules.
12
-	ErrDevicesUnsupported = errors.New("cgroup manager is not configured to set device rules")
13
-
14
-	// ErrRootless is returned by [Manager.Apply] when there is an error
15
-	// creating cgroup directory, and cgroup.Rootless is set. In general,
16
-	// this error is to be ignored.
17
-	ErrRootless = errors.New("cgroup manager can not access cgroup (rootless container)")
18
-
19
-	// DevicesSetV1 and DevicesSetV2 are functions to set devices for
20
-	// cgroup v1 and v2, respectively. Unless
21
-	// [github.com/opencontainers/runc/libcontainer/cgroups/devices]
22
-	// package is imported, they are set to nil, so cgroup managers can't
23
-	// manage devices.
24
-	DevicesSetV1 func(path string, r *configs.Resources) error
25
-	DevicesSetV2 func(path string, r *configs.Resources) error
26
-)
27
-
28
-type Manager interface {
29
-	// Apply creates a cgroup, if not yet created, and adds a process
30
-	// with the specified pid into that cgroup.  A special value of -1
31
-	// can be used to merely create a cgroup.
32
-	Apply(pid int) error
33
-
34
-	// GetPids returns the PIDs of all processes inside the cgroup.
35
-	GetPids() ([]int, error)
36
-
37
-	// GetAllPids returns the PIDs of all processes inside the cgroup
38
-	// and all its sub-cgroups.
39
-	GetAllPids() ([]int, error)
40
-
41
-	// GetStats returns cgroups statistics.
42
-	GetStats() (*Stats, error)
43
-
44
-	// Freeze sets the freezer cgroup to the specified state.
45
-	Freeze(state configs.FreezerState) error
46
-
47
-	// Destroy removes cgroup.
48
-	Destroy() error
49
-
50
-	// Path returns a cgroup path to the specified controller/subsystem.
51
-	// For cgroupv2, the argument is unused and can be empty.
52
-	Path(string) string
53
-
54
-	// Set sets cgroup resources parameters/limits. If the argument is nil,
55
-	// the resources specified during Manager creation (or the previous call
56
-	// to Set) are used.
57
-	Set(r *configs.Resources) error
58
-
59
-	// GetPaths returns cgroup path(s) to save in a state file in order to
60
-	// restore later.
61
-	//
62
-	// For cgroup v1, a key is cgroup subsystem name, and the value is the
63
-	// path to the cgroup for this subsystem.
64
-	//
65
-	// For cgroup v2 unified hierarchy, a key is "", and the value is the
66
-	// unified path.
67
-	GetPaths() map[string]string
68
-
69
-	// GetCgroups returns the cgroup data as configured.
70
-	GetCgroups() (*configs.Cgroup, error)
71
-
72
-	// GetFreezerState retrieves the current FreezerState of the cgroup.
73
-	GetFreezerState() (configs.FreezerState, error)
74
-
75
-	// Exists returns whether the cgroup path exists or not.
76
-	Exists() bool
77
-
78
-	// OOMKillCount reports OOM kill count for the cgroup.
79
-	OOMKillCount() (uint64, error)
80
-}
81 1
deleted file mode 100644
... ...
@@ -1,216 +0,0 @@
1
-package cgroups
2
-
3
-import (
4
-	"bytes"
5
-	"errors"
6
-	"fmt"
7
-	"os"
8
-	"path"
9
-	"strconv"
10
-	"strings"
11
-	"sync"
12
-
13
-	"github.com/opencontainers/runc/libcontainer/utils"
14
-	"github.com/sirupsen/logrus"
15
-	"golang.org/x/sys/unix"
16
-)
17
-
18
-// OpenFile opens a cgroup file in a given dir with given flags.
19
-// It is supposed to be used for cgroup files only, and returns
20
-// an error if the file is not a cgroup file.
21
-//
22
-// Arguments dir and file are joined together to form an absolute path
23
-// to a file being opened.
24
-func OpenFile(dir, file string, flags int) (*os.File, error) {
25
-	if dir == "" {
26
-		return nil, fmt.Errorf("no directory specified for %s", file)
27
-	}
28
-	return openFile(dir, file, flags)
29
-}
30
-
31
-// ReadFile reads data from a cgroup file in dir.
32
-// It is supposed to be used for cgroup files only.
33
-func ReadFile(dir, file string) (string, error) {
34
-	fd, err := OpenFile(dir, file, unix.O_RDONLY)
35
-	if err != nil {
36
-		return "", err
37
-	}
38
-	defer fd.Close()
39
-	var buf bytes.Buffer
40
-
41
-	_, err = buf.ReadFrom(fd)
42
-	return buf.String(), err
43
-}
44
-
45
-// WriteFile writes data to a cgroup file in dir.
46
-// It is supposed to be used for cgroup files only.
47
-func WriteFile(dir, file, data string) error {
48
-	fd, err := OpenFile(dir, file, unix.O_WRONLY)
49
-	if err != nil {
50
-		return err
51
-	}
52
-	defer fd.Close()
53
-	if _, err := fd.WriteString(data); err != nil {
54
-		// Having data in the error message helps in debugging.
55
-		return fmt.Errorf("failed to write %q: %w", data, err)
56
-	}
57
-	return nil
58
-}
59
-
60
-// WriteFileByLine is the same as WriteFile, except if data contains newlines,
61
-// it is written line by line.
62
-func WriteFileByLine(dir, file, data string) error {
63
-	i := strings.Index(data, "\n")
64
-	if i == -1 {
65
-		return WriteFile(dir, file, data)
66
-	}
67
-
68
-	fd, err := OpenFile(dir, file, unix.O_WRONLY)
69
-	if err != nil {
70
-		return err
71
-	}
72
-	defer fd.Close()
73
-	start := 0
74
-	for {
75
-		var line string
76
-		if i == -1 {
77
-			line = data[start:]
78
-		} else {
79
-			line = data[start : start+i+1]
80
-		}
81
-		_, err := fd.WriteString(line)
82
-		if err != nil {
83
-			return fmt.Errorf("failed to write %q: %w", line, err)
84
-		}
85
-		if i == -1 {
86
-			break
87
-		}
88
-		start += i + 1
89
-		i = strings.Index(data[start:], "\n")
90
-	}
91
-	return nil
92
-}
93
-
94
-const (
95
-	cgroupfsDir    = "/sys/fs/cgroup"
96
-	cgroupfsPrefix = cgroupfsDir + "/"
97
-)
98
-
99
-var (
100
-	// TestMode is set to true by unit tests that need "fake" cgroupfs.
101
-	TestMode bool
102
-
103
-	cgroupRootHandle *os.File
104
-	prepOnce         sync.Once
105
-	prepErr          error
106
-	resolveFlags     uint64
107
-)
108
-
109
-func prepareOpenat2() error {
110
-	prepOnce.Do(func() {
111
-		fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
112
-			Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC,
113
-		})
114
-		if err != nil {
115
-			prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
116
-			if err != unix.ENOSYS {
117
-				logrus.Warnf("falling back to securejoin: %s", prepErr)
118
-			} else {
119
-				logrus.Debug("openat2 not available, falling back to securejoin")
120
-			}
121
-			return
122
-		}
123
-		file := os.NewFile(uintptr(fd), cgroupfsDir)
124
-
125
-		var st unix.Statfs_t
126
-		if err := unix.Fstatfs(int(file.Fd()), &st); err != nil {
127
-			prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
128
-			logrus.Warnf("falling back to securejoin: %s", prepErr)
129
-			return
130
-		}
131
-
132
-		cgroupRootHandle = file
133
-		resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
134
-		if st.Type == unix.CGROUP2_SUPER_MAGIC {
135
-			// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
136
-			resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS
137
-		}
138
-	})
139
-
140
-	return prepErr
141
-}
142
-
143
-func openFile(dir, file string, flags int) (*os.File, error) {
144
-	mode := os.FileMode(0)
145
-	if TestMode && flags&os.O_WRONLY != 0 {
146
-		// "emulate" cgroup fs for unit tests
147
-		flags |= os.O_TRUNC | os.O_CREATE
148
-		mode = 0o600
149
-	}
150
-	path := path.Join(dir, utils.CleanPath(file))
151
-	if prepareOpenat2() != nil {
152
-		return openFallback(path, flags, mode)
153
-	}
154
-	relPath := strings.TrimPrefix(path, cgroupfsPrefix)
155
-	if len(relPath) == len(path) { // non-standard path, old system?
156
-		return openFallback(path, flags, mode)
157
-	}
158
-
159
-	fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relPath,
160
-		&unix.OpenHow{
161
-			Resolve: resolveFlags,
162
-			Flags:   uint64(flags) | unix.O_CLOEXEC,
163
-			Mode:    uint64(mode),
164
-		})
165
-	if err != nil {
166
-		err = &os.PathError{Op: "openat2", Path: path, Err: err}
167
-		// Check if cgroupRootHandle is still opened to cgroupfsDir
168
-		// (happens when this package is incorrectly used
169
-		// across the chroot/pivot_root/mntns boundary, or
170
-		// when /sys/fs/cgroup is remounted).
171
-		//
172
-		// TODO: if such usage ever becomes common, amend this
173
-		// to reopen cgroupRootHandle and retry openat2.
174
-		fdPath, closer := utils.ProcThreadSelf("fd/" + strconv.Itoa(int(cgroupRootHandle.Fd())))
175
-		defer closer()
176
-		fdDest, _ := os.Readlink(fdPath)
177
-		if fdDest != cgroupfsDir {
178
-			// Wrap the error so it is clear that cgroupRootHandle
179
-			// is opened to an unexpected/wrong directory.
180
-			err = fmt.Errorf("cgroupRootHandle %d unexpectedly opened to %s != %s: %w",
181
-				cgroupRootHandle.Fd(), fdDest, cgroupfsDir, err)
182
-		}
183
-		return nil, err
184
-	}
185
-
186
-	return os.NewFile(uintptr(fd), path), nil
187
-}
188
-
189
-var errNotCgroupfs = errors.New("not a cgroup file")
190
-
191
-// Can be changed by unit tests.
192
-var openFallback = openAndCheck
193
-
194
-// openAndCheck is used when openat2(2) is not available. It checks the opened
195
-// file is on cgroupfs, returning an error otherwise.
196
-func openAndCheck(path string, flags int, mode os.FileMode) (*os.File, error) {
197
-	fd, err := os.OpenFile(path, flags, mode)
198
-	if err != nil {
199
-		return nil, err
200
-	}
201
-	if TestMode {
202
-		return fd, nil
203
-	}
204
-	// Check this is a cgroupfs file.
205
-	var st unix.Statfs_t
206
-	if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil {
207
-		_ = fd.Close()
208
-		return nil, &os.PathError{Op: "statfs", Path: path, Err: err}
209
-	}
210
-	if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC {
211
-		_ = fd.Close()
212
-		return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs}
213
-	}
214
-
215
-	return fd, nil
216
-}
217 1
deleted file mode 100644
... ...
@@ -1,27 +0,0 @@
1
-package cgroups
2
-
3
-import (
4
-	"io/fs"
5
-	"path/filepath"
6
-)
7
-
8
-// GetAllPids returns all pids from the cgroup identified by path, and all its
9
-// sub-cgroups.
10
-func GetAllPids(path string) ([]int, error) {
11
-	var pids []int
12
-	err := filepath.WalkDir(path, func(p string, d fs.DirEntry, iErr error) error {
13
-		if iErr != nil {
14
-			return iErr
15
-		}
16
-		if !d.IsDir() {
17
-			return nil
18
-		}
19
-		cPids, err := readProcsFile(p)
20
-		if err != nil {
21
-			return err
22
-		}
23
-		pids = append(pids, cPids...)
24
-		return nil
25
-	})
26
-	return pids, err
27
-}
28 1
deleted file mode 100644
... ...
@@ -1,200 +0,0 @@
1
-package cgroups
2
-
3
-type ThrottlingData struct {
4
-	// Number of periods with throttling active
5
-	Periods uint64 `json:"periods,omitempty"`
6
-	// Number of periods when the container hit its throttling limit.
7
-	ThrottledPeriods uint64 `json:"throttled_periods,omitempty"`
8
-	// Aggregate time the container was throttled for in nanoseconds.
9
-	ThrottledTime uint64 `json:"throttled_time,omitempty"`
10
-}
11
-
12
-// CpuUsage denotes the usage of a CPU.
13
-// All CPU stats are aggregate since container inception.
14
-type CpuUsage struct {
15
-	// Total CPU time consumed.
16
-	// Units: nanoseconds.
17
-	TotalUsage uint64 `json:"total_usage,omitempty"`
18
-	// Total CPU time consumed per core.
19
-	// Units: nanoseconds.
20
-	PercpuUsage []uint64 `json:"percpu_usage,omitempty"`
21
-	// CPU time consumed per core in kernel mode
22
-	// Units: nanoseconds.
23
-	PercpuUsageInKernelmode []uint64 `json:"percpu_usage_in_kernelmode"`
24
-	// CPU time consumed per core in user mode
25
-	// Units: nanoseconds.
26
-	PercpuUsageInUsermode []uint64 `json:"percpu_usage_in_usermode"`
27
-	// Time spent by tasks of the cgroup in kernel mode.
28
-	// Units: nanoseconds.
29
-	UsageInKernelmode uint64 `json:"usage_in_kernelmode"`
30
-	// Time spent by tasks of the cgroup in user mode.
31
-	// Units: nanoseconds.
32
-	UsageInUsermode uint64 `json:"usage_in_usermode"`
33
-}
34
-
35
-type PSIData struct {
36
-	Avg10  float64 `json:"avg10"`
37
-	Avg60  float64 `json:"avg60"`
38
-	Avg300 float64 `json:"avg300"`
39
-	Total  uint64  `json:"total"`
40
-}
41
-
42
-type PSIStats struct {
43
-	Some PSIData `json:"some,omitempty"`
44
-	Full PSIData `json:"full,omitempty"`
45
-}
46
-
47
-type CpuStats struct {
48
-	CpuUsage       CpuUsage       `json:"cpu_usage,omitempty"`
49
-	ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
50
-	PSI            *PSIStats      `json:"psi,omitempty"`
51
-}
52
-
53
-type CPUSetStats struct {
54
-	// List of the physical numbers of the CPUs on which processes
55
-	// in that cpuset are allowed to execute
56
-	CPUs []uint16 `json:"cpus,omitempty"`
57
-	// cpu_exclusive flag
58
-	CPUExclusive uint64 `json:"cpu_exclusive"`
59
-	// List of memory nodes on which processes in that cpuset
60
-	// are allowed to allocate memory
61
-	Mems []uint16 `json:"mems,omitempty"`
62
-	// mem_hardwall flag
63
-	MemHardwall uint64 `json:"mem_hardwall"`
64
-	// mem_exclusive flag
65
-	MemExclusive uint64 `json:"mem_exclusive"`
66
-	// memory_migrate flag
67
-	MemoryMigrate uint64 `json:"memory_migrate"`
68
-	// memory_spread_page flag
69
-	MemorySpreadPage uint64 `json:"memory_spread_page"`
70
-	// memory_spread_slab flag
71
-	MemorySpreadSlab uint64 `json:"memory_spread_slab"`
72
-	// memory_pressure
73
-	MemoryPressure uint64 `json:"memory_pressure"`
74
-	// sched_load_balance flag
75
-	SchedLoadBalance uint64 `json:"sched_load_balance"`
76
-	// sched_relax_domain_level
77
-	SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"`
78
-}
79
-
80
-type MemoryData struct {
81
-	Usage    uint64 `json:"usage,omitempty"`
82
-	MaxUsage uint64 `json:"max_usage,omitempty"`
83
-	Failcnt  uint64 `json:"failcnt"`
84
-	Limit    uint64 `json:"limit"`
85
-}
86
-
87
-type MemoryStats struct {
88
-	// memory used for cache
89
-	Cache uint64 `json:"cache,omitempty"`
90
-	// usage of memory
91
-	Usage MemoryData `json:"usage,omitempty"`
92
-	// usage of memory + swap
93
-	SwapUsage MemoryData `json:"swap_usage,omitempty"`
94
-	// usage of swap only
95
-	SwapOnlyUsage MemoryData `json:"swap_only_usage,omitempty"`
96
-	// usage of kernel memory
97
-	KernelUsage MemoryData `json:"kernel_usage,omitempty"`
98
-	// usage of kernel TCP memory
99
-	KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
100
-	// usage of memory pages by NUMA node
101
-	// see chapter 5.6 of memory controller documentation
102
-	PageUsageByNUMA PageUsageByNUMA `json:"page_usage_by_numa,omitempty"`
103
-	// if true, memory usage is accounted for throughout a hierarchy of cgroups.
104
-	UseHierarchy bool `json:"use_hierarchy"`
105
-
106
-	Stats map[string]uint64 `json:"stats,omitempty"`
107
-	PSI   *PSIStats         `json:"psi,omitempty"`
108
-}
109
-
110
-type PageUsageByNUMA struct {
111
-	// Embedding is used as types can't be recursive.
112
-	PageUsageByNUMAInner
113
-	Hierarchical PageUsageByNUMAInner `json:"hierarchical,omitempty"`
114
-}
115
-
116
-type PageUsageByNUMAInner struct {
117
-	Total       PageStats `json:"total,omitempty"`
118
-	File        PageStats `json:"file,omitempty"`
119
-	Anon        PageStats `json:"anon,omitempty"`
120
-	Unevictable PageStats `json:"unevictable,omitempty"`
121
-}
122
-
123
-type PageStats struct {
124
-	Total uint64           `json:"total,omitempty"`
125
-	Nodes map[uint8]uint64 `json:"nodes,omitempty"`
126
-}
127
-
128
-type PidsStats struct {
129
-	// number of pids in the cgroup
130
-	Current uint64 `json:"current,omitempty"`
131
-	// active pids hard limit
132
-	Limit uint64 `json:"limit,omitempty"`
133
-}
134
-
135
-type BlkioStatEntry struct {
136
-	Major uint64 `json:"major,omitempty"`
137
-	Minor uint64 `json:"minor,omitempty"`
138
-	Op    string `json:"op,omitempty"`
139
-	Value uint64 `json:"value,omitempty"`
140
-}
141
-
142
-type BlkioStats struct {
143
-	// number of bytes transferred to and from the block device
144
-	IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"`
145
-	IoServicedRecursive     []BlkioStatEntry `json:"io_serviced_recursive,omitempty"`
146
-	IoQueuedRecursive       []BlkioStatEntry `json:"io_queue_recursive,omitempty"`
147
-	IoServiceTimeRecursive  []BlkioStatEntry `json:"io_service_time_recursive,omitempty"`
148
-	IoWaitTimeRecursive     []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"`
149
-	IoMergedRecursive       []BlkioStatEntry `json:"io_merged_recursive,omitempty"`
150
-	IoTimeRecursive         []BlkioStatEntry `json:"io_time_recursive,omitempty"`
151
-	SectorsRecursive        []BlkioStatEntry `json:"sectors_recursive,omitempty"`
152
-	PSI                     *PSIStats        `json:"psi,omitempty"`
153
-}
154
-
155
-type HugetlbStats struct {
156
-	// current res_counter usage for hugetlb
157
-	Usage uint64 `json:"usage,omitempty"`
158
-	// maximum usage ever recorded.
159
-	MaxUsage uint64 `json:"max_usage,omitempty"`
160
-	// number of times a hugetlb usage allocation failed.
161
-	Failcnt uint64 `json:"failcnt"`
162
-}
163
-
164
-type RdmaEntry struct {
165
-	Device     string `json:"device,omitempty"`
166
-	HcaHandles uint32 `json:"hca_handles,omitempty"`
167
-	HcaObjects uint32 `json:"hca_objects,omitempty"`
168
-}
169
-
170
-type RdmaStats struct {
171
-	RdmaLimit   []RdmaEntry `json:"rdma_limit,omitempty"`
172
-	RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"`
173
-}
174
-
175
-type MiscStats struct {
176
-	// current resource usage for a key in misc
177
-	Usage uint64 `json:"usage,omitempty"`
178
-	// number of times the resource usage was about to go over the max boundary
179
-	Events uint64 `json:"events,omitempty"`
180
-}
181
-
182
-type Stats struct {
183
-	CpuStats    CpuStats    `json:"cpu_stats,omitempty"`
184
-	CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"`
185
-	MemoryStats MemoryStats `json:"memory_stats,omitempty"`
186
-	PidsStats   PidsStats   `json:"pids_stats,omitempty"`
187
-	BlkioStats  BlkioStats  `json:"blkio_stats,omitempty"`
188
-	// the map is in the format "size of hugepage: stats of the hugepage"
189
-	HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
190
-	RdmaStats    RdmaStats               `json:"rdma_stats,omitempty"`
191
-	// the map is in the format "misc resource name: stats of the key"
192
-	MiscStats map[string]MiscStats `json:"misc_stats,omitempty"`
193
-}
194
-
195
-func NewStats() *Stats {
196
-	memoryStats := MemoryStats{Stats: make(map[string]uint64)}
197
-	hugetlbStats := make(map[string]HugetlbStats)
198
-	miscStats := make(map[string]MiscStats)
199
-	return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats, MiscStats: miscStats}
200
-}
201 1
deleted file mode 100644
... ...
@@ -1,468 +0,0 @@
1
-package cgroups
2
-
3
-import (
4
-	"bufio"
5
-	"errors"
6
-	"fmt"
7
-	"io"
8
-	"os"
9
-	"path/filepath"
10
-	"strconv"
11
-	"strings"
12
-	"sync"
13
-	"time"
14
-
15
-	"github.com/moby/sys/userns"
16
-	"github.com/sirupsen/logrus"
17
-	"golang.org/x/sys/unix"
18
-)
19
-
20
-const (
21
-	CgroupProcesses   = "cgroup.procs"
22
-	unifiedMountpoint = "/sys/fs/cgroup"
23
-	hybridMountpoint  = "/sys/fs/cgroup/unified"
24
-)
25
-
26
-var (
27
-	isUnifiedOnce sync.Once
28
-	isUnified     bool
29
-	isHybridOnce  sync.Once
30
-	isHybrid      bool
31
-)
32
-
33
-// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
34
-func IsCgroup2UnifiedMode() bool {
35
-	isUnifiedOnce.Do(func() {
36
-		var st unix.Statfs_t
37
-		err := unix.Statfs(unifiedMountpoint, &st)
38
-		if err != nil {
39
-			level := logrus.WarnLevel
40
-			if os.IsNotExist(err) && userns.RunningInUserNS() {
41
-				// For rootless containers, sweep it under the rug.
42
-				level = logrus.DebugLevel
43
-			}
44
-			logrus.StandardLogger().Logf(level,
45
-				"statfs %s: %v; assuming cgroup v1", unifiedMountpoint, err)
46
-		}
47
-		isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
48
-	})
49
-	return isUnified
50
-}
51
-
52
-// IsCgroup2HybridMode returns whether we are running in cgroup v2 hybrid mode.
53
-func IsCgroup2HybridMode() bool {
54
-	isHybridOnce.Do(func() {
55
-		var st unix.Statfs_t
56
-		err := unix.Statfs(hybridMountpoint, &st)
57
-		if err != nil {
58
-			isHybrid = false
59
-			if !os.IsNotExist(err) {
60
-				// Report unexpected errors.
61
-				logrus.WithError(err).Debugf("statfs(%q) failed", hybridMountpoint)
62
-			}
63
-			return
64
-		}
65
-		isHybrid = st.Type == unix.CGROUP2_SUPER_MAGIC
66
-	})
67
-	return isHybrid
68
-}
69
-
70
-type Mount struct {
71
-	Mountpoint string
72
-	Root       string
73
-	Subsystems []string
74
-}
75
-
76
-// GetCgroupMounts returns the mounts for the cgroup subsystems.
77
-// all indicates whether to return just the first instance or all the mounts.
78
-// This function should not be used from cgroupv2 code, as in this case
79
-// all the controllers are available under the constant unifiedMountpoint.
80
-func GetCgroupMounts(all bool) ([]Mount, error) {
81
-	if IsCgroup2UnifiedMode() {
82
-		// TODO: remove cgroupv2 case once all external users are converted
83
-		availableControllers, err := GetAllSubsystems()
84
-		if err != nil {
85
-			return nil, err
86
-		}
87
-		m := Mount{
88
-			Mountpoint: unifiedMountpoint,
89
-			Root:       unifiedMountpoint,
90
-			Subsystems: availableControllers,
91
-		}
92
-		return []Mount{m}, nil
93
-	}
94
-
95
-	return getCgroupMountsV1(all)
96
-}
97
-
98
-// GetAllSubsystems returns all the cgroup subsystems supported by the kernel
99
-func GetAllSubsystems() ([]string, error) {
100
-	// /proc/cgroups is meaningless for v2
101
-	// https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features
102
-	if IsCgroup2UnifiedMode() {
103
-		// "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers.
104
-		// - devices: implemented in kernel 4.15
105
-		// - freezer: implemented in kernel 5.2
106
-		// We assume these are always available, as it is hard to detect availability.
107
-		pseudo := []string{"devices", "freezer"}
108
-		data, err := ReadFile("/sys/fs/cgroup", "cgroup.controllers")
109
-		if err != nil {
110
-			return nil, err
111
-		}
112
-		subsystems := append(pseudo, strings.Fields(data)...)
113
-		return subsystems, nil
114
-	}
115
-	f, err := os.Open("/proc/cgroups")
116
-	if err != nil {
117
-		return nil, err
118
-	}
119
-	defer f.Close()
120
-
121
-	subsystems := []string{}
122
-
123
-	s := bufio.NewScanner(f)
124
-	for s.Scan() {
125
-		text := s.Text()
126
-		if text[0] != '#' {
127
-			parts := strings.Fields(text)
128
-			if len(parts) >= 4 && parts[3] != "0" {
129
-				subsystems = append(subsystems, parts[0])
130
-			}
131
-		}
132
-	}
133
-	if err := s.Err(); err != nil {
134
-		return nil, err
135
-	}
136
-	return subsystems, nil
137
-}
138
-
139
-func readProcsFile(dir string) (out []int, _ error) {
140
-	file := CgroupProcesses
141
-	retry := true
142
-
143
-again:
144
-	f, err := OpenFile(dir, file, os.O_RDONLY)
145
-	if err != nil {
146
-		return nil, err
147
-	}
148
-	defer f.Close()
149
-
150
-	s := bufio.NewScanner(f)
151
-	for s.Scan() {
152
-		if t := s.Text(); t != "" {
153
-			pid, err := strconv.Atoi(t)
154
-			if err != nil {
155
-				return nil, err
156
-			}
157
-			out = append(out, pid)
158
-		}
159
-	}
160
-	if errors.Is(s.Err(), unix.ENOTSUP) && retry {
161
-		// For a threaded cgroup, read returns ENOTSUP, and we should
162
-		// read from cgroup.threads instead.
163
-		file = "cgroup.threads"
164
-		retry = false
165
-		goto again
166
-	}
167
-	return out, s.Err()
168
-}
169
-
170
-// ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup
171
-// or /proc/<pid>/cgroup, into a map of subsystems to cgroup paths, e.g.
172
-//
173
-//	"cpu": "/user.slice/user-1000.slice"
174
-//	"pids": "/user.slice/user-1000.slice"
175
-//
176
-// etc.
177
-//
178
-// Note that for cgroup v2 unified hierarchy, there are no per-controller
179
-// cgroup paths, so the resulting map will have a single element where the key
180
-// is empty string ("") and the value is the cgroup path the <pid> is in.
181
-func ParseCgroupFile(path string) (map[string]string, error) {
182
-	f, err := os.Open(path)
183
-	if err != nil {
184
-		return nil, err
185
-	}
186
-	defer f.Close()
187
-
188
-	return parseCgroupFromReader(f)
189
-}
190
-
191
-// helper function for ParseCgroupFile to make testing easier
192
-func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
193
-	s := bufio.NewScanner(r)
194
-	cgroups := make(map[string]string)
195
-
196
-	for s.Scan() {
197
-		text := s.Text()
198
-		// from cgroups(7):
199
-		// /proc/[pid]/cgroup
200
-		// ...
201
-		// For each cgroup hierarchy ... there is one entry
202
-		// containing three colon-separated fields of the form:
203
-		//     hierarchy-ID:subsystem-list:cgroup-path
204
-		parts := strings.SplitN(text, ":", 3)
205
-		if len(parts) < 3 {
206
-			return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text)
207
-		}
208
-
209
-		for _, subs := range strings.Split(parts[1], ",") {
210
-			cgroups[subs] = parts[2]
211
-		}
212
-	}
213
-	if err := s.Err(); err != nil {
214
-		return nil, err
215
-	}
216
-
217
-	return cgroups, nil
218
-}
219
-
220
-func PathExists(path string) bool {
221
-	if _, err := os.Stat(path); err != nil {
222
-		return false
223
-	}
224
-	return true
225
-}
226
-
227
-// rmdir tries to remove a directory, optionally retrying on EBUSY.
228
-func rmdir(path string, retry bool) error {
229
-	delay := time.Millisecond
230
-	tries := 10
231
-
232
-again:
233
-	err := unix.Rmdir(path)
234
-	switch err { // nolint:errorlint // unix errors are bare
235
-	case nil, unix.ENOENT:
236
-		return nil
237
-	case unix.EINTR:
238
-		goto again
239
-	case unix.EBUSY:
240
-		if retry && tries > 0 {
241
-			time.Sleep(delay)
242
-			delay *= 2
243
-			tries--
244
-			goto again
245
-
246
-		}
247
-	}
248
-	return &os.PathError{Op: "rmdir", Path: path, Err: err}
249
-}
250
-
251
-// RemovePath aims to remove cgroup path. It does so recursively,
252
-// by removing any subdirectories (sub-cgroups) first.
253
-func RemovePath(path string) error {
254
-	// Try the fast path first; don't retry on EBUSY yet.
255
-	if err := rmdir(path, false); err == nil {
256
-		return nil
257
-	}
258
-
259
-	// There are many reasons why rmdir can fail, including:
260
-	// 1. cgroup have existing sub-cgroups;
261
-	// 2. cgroup (still) have some processes (that are about to vanish);
262
-	// 3. lack of permission (one example is read-only /sys/fs/cgroup mount,
263
-	//    in which case rmdir returns EROFS even for for a non-existent path,
264
-	//    see issue 4518).
265
-	//
266
-	// Using os.ReadDir here kills two birds with one stone: check if
267
-	// the directory exists (handling scenario 3 above), and use
268
-	// directory contents to remove sub-cgroups (handling scenario 1).
269
-	infos, err := os.ReadDir(path)
270
-	if err != nil {
271
-		if os.IsNotExist(err) {
272
-			return nil
273
-		}
274
-		return err
275
-	}
276
-	// Let's remove sub-cgroups, if any.
277
-	for _, info := range infos {
278
-		if info.IsDir() {
279
-			if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
280
-				return err
281
-			}
282
-		}
283
-	}
284
-	// Finally, try rmdir again, this time with retries on EBUSY,
285
-	// which may help with scenario 2 above.
286
-	return rmdir(path, true)
287
-}
288
-
289
-// RemovePaths iterates over the provided paths removing them.
290
-func RemovePaths(paths map[string]string) (err error) {
291
-	for s, p := range paths {
292
-		if err := RemovePath(p); err == nil {
293
-			delete(paths, s)
294
-		}
295
-	}
296
-	if len(paths) == 0 {
297
-		clear(paths)
298
-		return nil
299
-	}
300
-	return fmt.Errorf("Failed to remove paths: %v", paths)
301
-}
302
-
303
-var (
304
-	hugePageSizes []string
305
-	initHPSOnce   sync.Once
306
-)
307
-
308
-func HugePageSizes() []string {
309
-	initHPSOnce.Do(func() {
310
-		dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
311
-		if err != nil {
312
-			return
313
-		}
314
-		files, err := dir.Readdirnames(0)
315
-		dir.Close()
316
-		if err != nil {
317
-			return
318
-		}
319
-
320
-		hugePageSizes, err = getHugePageSizeFromFilenames(files)
321
-		if err != nil {
322
-			logrus.Warn("HugePageSizes: ", err)
323
-		}
324
-	})
325
-
326
-	return hugePageSizes
327
-}
328
-
329
-func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
330
-	pageSizes := make([]string, 0, len(fileNames))
331
-	var warn error
332
-
333
-	for _, file := range fileNames {
334
-		// example: hugepages-1048576kB
335
-		val := strings.TrimPrefix(file, "hugepages-")
336
-		if len(val) == len(file) {
337
-			// Unexpected file name: no prefix found, ignore it.
338
-			continue
339
-		}
340
-		// The suffix is always "kB" (as of Linux 5.13). If we find
341
-		// something else, produce an error but keep going.
342
-		eLen := len(val) - 2
343
-		val = strings.TrimSuffix(val, "kB")
344
-		if len(val) != eLen {
345
-			// Highly unlikely.
346
-			if warn == nil {
347
-				warn = errors.New(file + `: invalid suffix (expected "kB")`)
348
-			}
349
-			continue
350
-		}
351
-		size, err := strconv.Atoi(val)
352
-		if err != nil {
353
-			// Highly unlikely.
354
-			if warn == nil {
355
-				warn = fmt.Errorf("%s: %w", file, err)
356
-			}
357
-			continue
358
-		}
359
-		// Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
360
-		// but in our case the size is in KB already.
361
-		if size >= (1 << 20) {
362
-			val = strconv.Itoa(size>>20) + "GB"
363
-		} else if size >= (1 << 10) {
364
-			val = strconv.Itoa(size>>10) + "MB"
365
-		} else {
366
-			val += "KB"
367
-		}
368
-		pageSizes = append(pageSizes, val)
369
-	}
370
-
371
-	return pageSizes, warn
372
-}
373
-
374
-// GetPids returns all pids, that were added to cgroup at path.
375
-func GetPids(dir string) ([]int, error) {
376
-	return readProcsFile(dir)
377
-}
378
-
379
-// WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file
380
-func WriteCgroupProc(dir string, pid int) error {
381
-	// Normally dir should not be empty, one case is that cgroup subsystem
382
-	// is not mounted, we will get empty dir, and we want it fail here.
383
-	if dir == "" {
384
-		return fmt.Errorf("no such directory for %s", CgroupProcesses)
385
-	}
386
-
387
-	// Dont attach any pid to the cgroup if -1 is specified as a pid
388
-	if pid == -1 {
389
-		return nil
390
-	}
391
-
392
-	file, err := OpenFile(dir, CgroupProcesses, os.O_WRONLY)
393
-	if err != nil {
394
-		return fmt.Errorf("failed to write %v: %w", pid, err)
395
-	}
396
-	defer file.Close()
397
-
398
-	for i := 0; i < 5; i++ {
399
-		_, err = file.WriteString(strconv.Itoa(pid))
400
-		if err == nil {
401
-			return nil
402
-		}
403
-
404
-		// EINVAL might mean that the task being added to cgroup.procs is in state
405
-		// TASK_NEW. We should attempt to do so again.
406
-		if errors.Is(err, unix.EINVAL) {
407
-			time.Sleep(30 * time.Millisecond)
408
-			continue
409
-		}
410
-
411
-		return fmt.Errorf("failed to write %v: %w", pid, err)
412
-	}
413
-	return err
414
-}
415
-
416
-// Since the OCI spec is designed for cgroup v1, in some cases
417
-// there is need to convert from the cgroup v1 configuration to cgroup v2
418
-// the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142)
419
-// convert from [2-262144] to [1-10000]
420
-// 262144 comes from Linux kernel definition "#define MAX_SHARES (1UL << 18)"
421
-func ConvertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 {
422
-	if cpuShares == 0 {
423
-		return 0
424
-	}
425
-	return (1 + ((cpuShares-2)*9999)/262142)
426
-}
427
-
428
-// ConvertMemorySwapToCgroupV2Value converts MemorySwap value from OCI spec
429
-// for use by cgroup v2 drivers. A conversion is needed since Resources.MemorySwap
430
-// is defined as memory+swap combined, while in cgroup v2 swap is a separate value,
431
-// so we need to subtract memory from it where it makes sense.
432
-func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
433
-	switch {
434
-	case memory == -1 && memorySwap == 0:
435
-		// For compatibility with cgroup1 controller, set swap to unlimited in
436
-		// case the memory is set to unlimited and the swap is not explicitly set,
437
-		// treating the request as "set both memory and swap to unlimited".
438
-		return -1, nil
439
-	case memorySwap == -1, memorySwap == 0:
440
-		// Treat -1 ("max") and 0 ("unset") swap as is.
441
-		return memorySwap, nil
442
-	case memory == -1:
443
-		// Unlimited memory, so treat swap as is.
444
-		return memorySwap, nil
445
-	case memory == 0:
446
-		// Unset or unknown memory, can't calculate swap.
447
-		return 0, errors.New("unable to set swap limit without memory limit")
448
-	case memory < 0:
449
-		// Does not make sense to subtract a negative value.
450
-		return 0, fmt.Errorf("invalid memory value: %d", memory)
451
-	case memorySwap < memory:
452
-		// Sanity check.
453
-		return 0, errors.New("memory+swap limit should be >= memory limit")
454
-	}
455
-
456
-	return memorySwap - memory, nil
457
-}
458
-
459
-// Since the OCI spec is designed for cgroup v1, in some cases
460
-// there is need to convert from the cgroup v1 configuration to cgroup v2
461
-// the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990)
462
-// convert linearly from [10-1000] to [1-10000]
463
-func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 {
464
-	if blkIoWeight == 0 {
465
-		return 0
466
-	}
467
-	return 1 + (uint64(blkIoWeight)-10)*9999/990
468
-}
469 1
deleted file mode 100644
... ...
@@ -1,277 +0,0 @@
1
-package cgroups
2
-
3
-import (
4
-	"errors"
5
-	"fmt"
6
-	"os"
7
-	"path/filepath"
8
-	"strings"
9
-	"sync"
10
-	"syscall"
11
-
12
-	securejoin "github.com/cyphar/filepath-securejoin"
13
-	"github.com/moby/sys/mountinfo"
14
-	"golang.org/x/sys/unix"
15
-)
16
-
17
-// Code in this source file are specific to cgroup v1,
18
-// and must not be used from any cgroup v2 code.
19
-
20
-const (
21
-	CgroupNamePrefix = "name="
22
-	defaultPrefix    = "/sys/fs/cgroup"
23
-)
24
-
25
-var (
26
-	errUnified     = errors.New("not implemented for cgroup v2 unified hierarchy")
27
-	ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1")
28
-
29
-	readMountinfoOnce sync.Once
30
-	readMountinfoErr  error
31
-	cgroupMountinfo   []*mountinfo.Info
32
-)
33
-
34
-type NotFoundError struct {
35
-	Subsystem string
36
-}
37
-
38
-func (e *NotFoundError) Error() string {
39
-	return fmt.Sprintf("mountpoint for %s not found", e.Subsystem)
40
-}
41
-
42
-func NewNotFoundError(sub string) error {
43
-	return &NotFoundError{
44
-		Subsystem: sub,
45
-	}
46
-}
47
-
48
-func IsNotFound(err error) bool {
49
-	var nfErr *NotFoundError
50
-	return errors.As(err, &nfErr)
51
-}
52
-
53
-func tryDefaultPath(cgroupPath, subsystem string) string {
54
-	if !strings.HasPrefix(defaultPrefix, cgroupPath) {
55
-		return ""
56
-	}
57
-
58
-	// remove possible prefix
59
-	subsystem = strings.TrimPrefix(subsystem, CgroupNamePrefix)
60
-
61
-	// Make sure we're still under defaultPrefix, and resolve
62
-	// a possible symlink (like cpu -> cpu,cpuacct).
63
-	path, err := securejoin.SecureJoin(defaultPrefix, subsystem)
64
-	if err != nil {
65
-		return ""
66
-	}
67
-
68
-	// (1) path should be a directory.
69
-	st, err := os.Lstat(path)
70
-	if err != nil || !st.IsDir() {
71
-		return ""
72
-	}
73
-
74
-	// (2) path should be a mount point.
75
-	pst, err := os.Lstat(filepath.Dir(path))
76
-	if err != nil {
77
-		return ""
78
-	}
79
-
80
-	if st.Sys().(*syscall.Stat_t).Dev == pst.Sys().(*syscall.Stat_t).Dev {
81
-		// parent dir has the same dev -- path is not a mount point
82
-		return ""
83
-	}
84
-
85
-	// (3) path should have 'cgroup' fs type.
86
-	fst := unix.Statfs_t{}
87
-	err = unix.Statfs(path, &fst)
88
-	if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC {
89
-		return ""
90
-	}
91
-
92
-	return path
93
-}
94
-
95
-// readCgroupMountinfo returns a list of cgroup v1 mounts (i.e. the ones
96
-// with fstype of "cgroup") for the current running process.
97
-//
98
-// The results are cached (to avoid re-reading mountinfo which is relatively
99
-// expensive), so it is assumed that cgroup mounts are not being changed.
100
-func readCgroupMountinfo() ([]*mountinfo.Info, error) {
101
-	readMountinfoOnce.Do(func() {
102
-		// mountinfo.GetMounts uses /proc/thread-self, so we can use it without
103
-		// issues.
104
-		cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts(
105
-			mountinfo.FSTypeFilter("cgroup"),
106
-		)
107
-	})
108
-	return cgroupMountinfo, readMountinfoErr
109
-}
110
-
111
-// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
112
-func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
113
-	if IsCgroup2UnifiedMode() {
114
-		return "", errUnified
115
-	}
116
-
117
-	// If subsystem is empty, we look for the cgroupv2 hybrid path.
118
-	if len(subsystem) == 0 {
119
-		return hybridMountpoint, nil
120
-	}
121
-
122
-	// Avoid parsing mountinfo by trying the default path first, if possible.
123
-	if path := tryDefaultPath(cgroupPath, subsystem); path != "" {
124
-		return path, nil
125
-	}
126
-
127
-	mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem)
128
-	return mnt, err
129
-}
130
-
131
-func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) {
132
-	if IsCgroup2UnifiedMode() {
133
-		return "", "", errUnified
134
-	}
135
-
136
-	mi, err := readCgroupMountinfo()
137
-	if err != nil {
138
-		return "", "", err
139
-	}
140
-
141
-	return findCgroupMountpointAndRootFromMI(mi, cgroupPath, subsystem)
142
-}
143
-
144
-func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, subsystem string) (string, string, error) {
145
-	for _, mi := range mounts {
146
-		if strings.HasPrefix(mi.Mountpoint, cgroupPath) {
147
-			for _, opt := range strings.Split(mi.VFSOptions, ",") {
148
-				if opt == subsystem {
149
-					return mi.Mountpoint, mi.Root, nil
150
-				}
151
-			}
152
-		}
153
-	}
154
-
155
-	return "", "", NewNotFoundError(subsystem)
156
-}
157
-
158
-func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
159
-	if len(m.Subsystems) == 0 {
160
-		return "", errors.New("no subsystem for mount")
161
-	}
162
-
163
-	return getControllerPath(m.Subsystems[0], cgroups)
164
-}
165
-
166
-func getCgroupMountsHelper(ss map[string]bool, mounts []*mountinfo.Info, all bool) ([]Mount, error) {
167
-	res := make([]Mount, 0, len(ss))
168
-	numFound := 0
169
-	for _, mi := range mounts {
170
-		m := Mount{
171
-			Mountpoint: mi.Mountpoint,
172
-			Root:       mi.Root,
173
-		}
174
-		for _, opt := range strings.Split(mi.VFSOptions, ",") {
175
-			seen, known := ss[opt]
176
-			if !known || (!all && seen) {
177
-				continue
178
-			}
179
-			ss[opt] = true
180
-			opt = strings.TrimPrefix(opt, CgroupNamePrefix)
181
-			m.Subsystems = append(m.Subsystems, opt)
182
-			numFound++
183
-		}
184
-		if len(m.Subsystems) > 0 || all {
185
-			res = append(res, m)
186
-		}
187
-		if !all && numFound >= len(ss) {
188
-			break
189
-		}
190
-	}
191
-	return res, nil
192
-}
193
-
194
-func getCgroupMountsV1(all bool) ([]Mount, error) {
195
-	mi, err := readCgroupMountinfo()
196
-	if err != nil {
197
-		return nil, err
198
-	}
199
-
200
-	// We don't need to use /proc/thread-self here because runc always runs
201
-	// with every thread in the same cgroup. This lets us avoid having to do
202
-	// runtime.LockOSThread.
203
-	allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
204
-	if err != nil {
205
-		return nil, err
206
-	}
207
-
208
-	allMap := make(map[string]bool)
209
-	for s := range allSubsystems {
210
-		allMap[s] = false
211
-	}
212
-
213
-	return getCgroupMountsHelper(allMap, mi, all)
214
-}
215
-
216
-// GetOwnCgroup returns the relative path to the cgroup docker is running in.
217
-func GetOwnCgroup(subsystem string) (string, error) {
218
-	if IsCgroup2UnifiedMode() {
219
-		return "", errUnified
220
-	}
221
-
222
-	// We don't need to use /proc/thread-self here because runc always runs
223
-	// with every thread in the same cgroup. This lets us avoid having to do
224
-	// runtime.LockOSThread.
225
-	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
226
-	if err != nil {
227
-		return "", err
228
-	}
229
-
230
-	return getControllerPath(subsystem, cgroups)
231
-}
232
-
233
-func GetOwnCgroupPath(subsystem string) (string, error) {
234
-	cgroup, err := GetOwnCgroup(subsystem)
235
-	if err != nil {
236
-		return "", err
237
-	}
238
-
239
-	// If subsystem is empty, we look for the cgroupv2 hybrid path.
240
-	if len(subsystem) == 0 {
241
-		return hybridMountpoint, nil
242
-	}
243
-
244
-	return getCgroupPathHelper(subsystem, cgroup)
245
-}
246
-
247
-func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
248
-	mnt, root, err := FindCgroupMountpointAndRoot("", subsystem)
249
-	if err != nil {
250
-		return "", err
251
-	}
252
-
253
-	// This is needed for nested containers, because in /proc/self/cgroup we
254
-	// see paths from host, which don't exist in container.
255
-	relCgroup, err := filepath.Rel(root, cgroup)
256
-	if err != nil {
257
-		return "", err
258
-	}
259
-
260
-	return filepath.Join(mnt, relCgroup), nil
261
-}
262
-
263
-func getControllerPath(subsystem string, cgroups map[string]string) (string, error) {
264
-	if IsCgroup2UnifiedMode() {
265
-		return "", errUnified
266
-	}
267
-
268
-	if p, ok := cgroups[subsystem]; ok {
269
-		return p, nil
270
-	}
271
-
272
-	if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok {
273
-		return p, nil
274
-	}
275
-
276
-	return "", NewNotFoundError(subsystem)
277
-}
278 1
deleted file mode 100644
... ...
@@ -1,66 +0,0 @@
1
-package configs
2
-
3
-import "fmt"
4
-
5
-// BlockIODevice holds major:minor format supported in blkio cgroup.
6
-type BlockIODevice struct {
7
-	// Major is the device's major number
8
-	Major int64 `json:"major"`
9
-	// Minor is the device's minor number
10
-	Minor int64 `json:"minor"`
11
-}
12
-
13
-// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair
14
-type WeightDevice struct {
15
-	BlockIODevice
16
-	// Weight is the bandwidth rate for the device, range is from 10 to 1000
17
-	Weight uint16 `json:"weight"`
18
-	// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
19
-	LeafWeight uint16 `json:"leafWeight"`
20
-}
21
-
22
-// NewWeightDevice returns a configured WeightDevice pointer
23
-func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice {
24
-	wd := &WeightDevice{}
25
-	wd.Major = major
26
-	wd.Minor = minor
27
-	wd.Weight = weight
28
-	wd.LeafWeight = leafWeight
29
-	return wd
30
-}
31
-
32
-// WeightString formats the struct to be writable to the cgroup specific file
33
-func (wd *WeightDevice) WeightString() string {
34
-	return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight)
35
-}
36
-
37
-// LeafWeightString formats the struct to be writable to the cgroup specific file
38
-func (wd *WeightDevice) LeafWeightString() string {
39
-	return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight)
40
-}
41
-
42
-// ThrottleDevice struct holds a `major:minor rate_per_second` pair
43
-type ThrottleDevice struct {
44
-	BlockIODevice
45
-	// Rate is the IO rate limit per cgroup per device
46
-	Rate uint64 `json:"rate"`
47
-}
48
-
49
-// NewThrottleDevice returns a configured ThrottleDevice pointer
50
-func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice {
51
-	td := &ThrottleDevice{}
52
-	td.Major = major
53
-	td.Minor = minor
54
-	td.Rate = rate
55
-	return td
56
-}
57
-
58
-// String formats the struct to be writable to the cgroup specific file
59
-func (td *ThrottleDevice) String() string {
60
-	return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate)
61
-}
62
-
63
-// StringName formats the struct to be writable to the cgroup specific file
64
-func (td *ThrottleDevice) StringName(name string) string {
65
-	return fmt.Sprintf("%d:%d %s=%d", td.Major, td.Minor, name, td.Rate)
66
-}
67 1
deleted file mode 100644
... ...
@@ -1,169 +0,0 @@
1
-package configs
2
-
3
-import (
4
-	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
5
-	"github.com/opencontainers/runc/libcontainer/devices"
6
-)
7
-
8
-type FreezerState string
9
-
10
-const (
11
-	Undefined FreezerState = ""
12
-	Frozen    FreezerState = "FROZEN"
13
-	Thawed    FreezerState = "THAWED"
14
-)
15
-
16
-// Cgroup holds properties of a cgroup on Linux.
17
-type Cgroup struct {
18
-	// Name specifies the name of the cgroup
19
-	Name string `json:"name,omitempty"`
20
-
21
-	// Parent specifies the name of parent of cgroup or slice
22
-	Parent string `json:"parent,omitempty"`
23
-
24
-	// Path specifies the path to cgroups that are created and/or joined by the container.
25
-	// The path is assumed to be relative to the host system cgroup mountpoint.
26
-	Path string `json:"path"`
27
-
28
-	// ScopePrefix describes prefix for the scope name
29
-	ScopePrefix string `json:"scope_prefix"`
30
-
31
-	// Resources contains various cgroups settings to apply
32
-	*Resources
33
-
34
-	// Systemd tells if systemd should be used to manage cgroups.
35
-	Systemd bool
36
-
37
-	// SystemdProps are any additional properties for systemd,
38
-	// derived from org.systemd.property.xxx annotations.
39
-	// Ignored unless systemd is used for managing cgroups.
40
-	SystemdProps []systemdDbus.Property `json:"-"`
41
-
42
-	// Rootless tells if rootless cgroups should be used.
43
-	Rootless bool
44
-
45
-	// The host UID that should own the cgroup, or nil to accept
46
-	// the default ownership.  This should only be set when the
47
-	// cgroupfs is to be mounted read/write.
48
-	// Not all cgroup manager implementations support changing
49
-	// the ownership.
50
-	OwnerUID *int `json:"owner_uid,omitempty"`
51
-}
52
-
53
-type Resources struct {
54
-	// Devices is the set of access rules for devices in the container.
55
-	Devices []*devices.Rule `json:"devices"`
56
-
57
-	// Memory limit (in bytes)
58
-	Memory int64 `json:"memory"`
59
-
60
-	// Memory reservation or soft_limit (in bytes)
61
-	MemoryReservation int64 `json:"memory_reservation"`
62
-
63
-	// Total memory usage (memory + swap); set `-1` to enable unlimited swap
64
-	MemorySwap int64 `json:"memory_swap"`
65
-
66
-	// CPU shares (relative weight vs. other containers)
67
-	CpuShares uint64 `json:"cpu_shares"`
68
-
69
-	// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
70
-	CpuQuota int64 `json:"cpu_quota"`
71
-
72
-	// CPU hardcap burst limit (in usecs). Allowed accumulated cpu time additionally for burst in a given period.
73
-	CpuBurst *uint64 `json:"cpu_burst"` //nolint:revive
74
-
75
-	// CPU period to be used for hardcapping (in usecs). 0 to use system default.
76
-	CpuPeriod uint64 `json:"cpu_period"`
77
-
78
-	// How many time CPU will use in realtime scheduling (in usecs).
79
-	CpuRtRuntime int64 `json:"cpu_rt_quota"`
80
-
81
-	// CPU period to be used for realtime scheduling (in usecs).
82
-	CpuRtPeriod uint64 `json:"cpu_rt_period"`
83
-
84
-	// CPU to use
85
-	CpusetCpus string `json:"cpuset_cpus"`
86
-
87
-	// MEM to use
88
-	CpusetMems string `json:"cpuset_mems"`
89
-
90
-	// cgroup SCHED_IDLE
91
-	CPUIdle *int64 `json:"cpu_idle,omitempty"`
92
-
93
-	// Process limit; set <= `0' to disable limit.
94
-	PidsLimit int64 `json:"pids_limit"`
95
-
96
-	// Specifies per cgroup weight, range is from 10 to 1000.
97
-	BlkioWeight uint16 `json:"blkio_weight"`
98
-
99
-	// Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
100
-	BlkioLeafWeight uint16 `json:"blkio_leaf_weight"`
101
-
102
-	// Weight per cgroup per device, can override BlkioWeight.
103
-	BlkioWeightDevice []*WeightDevice `json:"blkio_weight_device"`
104
-
105
-	// IO read rate limit per cgroup per device, bytes per second.
106
-	BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"`
107
-
108
-	// IO write rate limit per cgroup per device, bytes per second.
109
-	BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"`
110
-
111
-	// IO read rate limit per cgroup per device, IO per second.
112
-	BlkioThrottleReadIOPSDevice []*ThrottleDevice `json:"blkio_throttle_read_iops_device"`
113
-
114
-	// IO write rate limit per cgroup per device, IO per second.
115
-	BlkioThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkio_throttle_write_iops_device"`
116
-
117
-	// set the freeze value for the process
118
-	Freezer FreezerState `json:"freezer"`
119
-
120
-	// Hugetlb limit (in bytes)
121
-	HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"`
122
-
123
-	// Whether to disable OOM Killer
124
-	OomKillDisable bool `json:"oom_kill_disable"`
125
-
126
-	// Tuning swappiness behaviour per cgroup
127
-	MemorySwappiness *uint64 `json:"memory_swappiness"`
128
-
129
-	// Set priority of network traffic for container
130
-	NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"`
131
-
132
-	// Set class identifier for container's network packets
133
-	NetClsClassid uint32 `json:"net_cls_classid_u"`
134
-
135
-	// Rdma resource restriction configuration
136
-	Rdma map[string]LinuxRdma `json:"rdma"`
137
-
138
-	// Used on cgroups v2:
139
-
140
-	// CpuWeight sets a proportional bandwidth limit.
141
-	CpuWeight uint64 `json:"cpu_weight"`
142
-
143
-	// Unified is cgroupv2-only key-value map.
144
-	Unified map[string]string `json:"unified"`
145
-
146
-	// SkipDevices allows to skip configuring device permissions.
147
-	// Used by e.g. kubelet while creating a parent cgroup (kubepods)
148
-	// common for many containers, and by runc update.
149
-	//
150
-	// NOTE it is impossible to start a container which has this flag set.
151
-	SkipDevices bool `json:"-"`
152
-
153
-	// SkipFreezeOnSet is a flag for cgroup manager to skip the cgroup
154
-	// freeze when setting resources. Only applicable to systemd legacy
155
-	// (i.e. cgroup v1) manager (which uses freeze by default to avoid
156
-	// spurious permission errors caused by systemd inability to update
157
-	// device rules in a non-disruptive manner).
158
-	//
159
-	// If not set, a few methods (such as looking into cgroup's
160
-	// devices.list and querying the systemd unit properties) are used
161
-	// during Set() to figure out whether the freeze is required. Those
162
-	// methods may be relatively slow, thus this flag.
163
-	SkipFreezeOnSet bool `json:"-"`
164
-
165
-	// MemoryCheckBeforeUpdate is a flag for cgroup v2 managers to check
166
-	// if the new memory limits (Memory and MemorySwap) being set are lower
167
-	// than the current memory usage, and reject if so.
168
-	MemoryCheckBeforeUpdate bool `json:"memory_check_before_update"`
169
-}
170 1
deleted file mode 100644
... ...
@@ -1,8 +0,0 @@
1
-//go:build !linux
2
-
3
-package configs
4
-
5
-// Cgroup holds properties of a cgroup on Linux
6
-// TODO Windows: This can ultimately be entirely factored out on Windows as
7
-// cgroups are a Unix-specific construct.
8
-type Cgroup struct{}
9 1
deleted file mode 100644
... ...
@@ -1,508 +0,0 @@
1
-package configs
2
-
3
-import (
4
-	"bytes"
5
-	"encoding/json"
6
-	"fmt"
7
-	"os/exec"
8
-	"time"
9
-
10
-	"github.com/sirupsen/logrus"
11
-	"golang.org/x/sys/unix"
12
-
13
-	"github.com/opencontainers/runc/libcontainer/devices"
14
-	"github.com/opencontainers/runtime-spec/specs-go"
15
-)
16
-
17
-type Rlimit struct {
18
-	Type int    `json:"type"`
19
-	Hard uint64 `json:"hard"`
20
-	Soft uint64 `json:"soft"`
21
-}
22
-
23
-// IDMap represents UID/GID Mappings for User Namespaces.
24
-type IDMap struct {
25
-	ContainerID int64 `json:"container_id"`
26
-	HostID      int64 `json:"host_id"`
27
-	Size        int64 `json:"size"`
28
-}
29
-
30
-// Seccomp represents syscall restrictions
31
-// By default, only the native architecture of the kernel is allowed to be used
32
-// for syscalls. Additional architectures can be added by specifying them in
33
-// Architectures.
34
-type Seccomp struct {
35
-	DefaultAction    Action                   `json:"default_action"`
36
-	Architectures    []string                 `json:"architectures"`
37
-	Flags            []specs.LinuxSeccompFlag `json:"flags"`
38
-	Syscalls         []*Syscall               `json:"syscalls"`
39
-	DefaultErrnoRet  *uint                    `json:"default_errno_ret"`
40
-	ListenerPath     string                   `json:"listener_path,omitempty"`
41
-	ListenerMetadata string                   `json:"listener_metadata,omitempty"`
42
-}
43
-
44
-// Action is taken upon rule match in Seccomp
45
-type Action int
46
-
47
-const (
48
-	Kill Action = iota + 1
49
-	Errno
50
-	Trap
51
-	Allow
52
-	Trace
53
-	Log
54
-	Notify
55
-	KillThread
56
-	KillProcess
57
-)
58
-
59
-// Operator is a comparison operator to be used when matching syscall arguments in Seccomp
60
-type Operator int
61
-
62
-const (
63
-	EqualTo Operator = iota + 1
64
-	NotEqualTo
65
-	GreaterThan
66
-	GreaterThanOrEqualTo
67
-	LessThan
68
-	LessThanOrEqualTo
69
-	MaskEqualTo
70
-)
71
-
72
-// Arg is a rule to match a specific syscall argument in Seccomp
73
-type Arg struct {
74
-	Index    uint     `json:"index"`
75
-	Value    uint64   `json:"value"`
76
-	ValueTwo uint64   `json:"value_two"`
77
-	Op       Operator `json:"op"`
78
-}
79
-
80
-// Syscall is a rule to match a syscall in Seccomp
81
-type Syscall struct {
82
-	Name     string `json:"name"`
83
-	Action   Action `json:"action"`
84
-	ErrnoRet *uint  `json:"errnoRet"`
85
-	Args     []*Arg `json:"args"`
86
-}
87
-
88
-// Config defines configuration options for executing a process inside a contained environment.
89
-type Config struct {
90
-	// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs
91
-	// This is a common option when the container is running in ramdisk
92
-	NoPivotRoot bool `json:"no_pivot_root"`
93
-
94
-	// ParentDeathSignal specifies the signal that is sent to the container's process in the case
95
-	// that the parent process dies.
96
-	ParentDeathSignal int `json:"parent_death_signal"`
97
-
98
-	// Path to a directory containing the container's root filesystem.
99
-	Rootfs string `json:"rootfs"`
100
-
101
-	// Umask is the umask to use inside of the container.
102
-	Umask *uint32 `json:"umask"`
103
-
104
-	// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
105
-	// bind mounts are writtable.
106
-	Readonlyfs bool `json:"readonlyfs"`
107
-
108
-	// Specifies the mount propagation flags to be applied to /.
109
-	RootPropagation int `json:"rootPropagation"`
110
-
111
-	// Mounts specify additional source and destination paths that will be mounted inside the container's
112
-	// rootfs and mount namespace if specified
113
-	Mounts []*Mount `json:"mounts"`
114
-
115
-	// The device nodes that should be automatically created within the container upon container start.  Note, make sure that the node is marked as allowed in the cgroup as well!
116
-	Devices []*devices.Device `json:"devices"`
117
-
118
-	MountLabel string `json:"mount_label"`
119
-
120
-	// Hostname optionally sets the container's hostname if provided
121
-	Hostname string `json:"hostname"`
122
-
123
-	// Domainname optionally sets the container's domainname if provided
124
-	Domainname string `json:"domainname"`
125
-
126
-	// Namespaces specifies the container's namespaces that it should setup when cloning the init process
127
-	// If a namespace is not provided that namespace is shared from the container's parent process
128
-	Namespaces Namespaces `json:"namespaces"`
129
-
130
-	// Capabilities specify the capabilities to keep when executing the process inside the container
131
-	// All capabilities not specified will be dropped from the processes capability mask
132
-	Capabilities *Capabilities `json:"capabilities"`
133
-
134
-	// Networks specifies the container's network setup to be created
135
-	Networks []*Network `json:"networks"`
136
-
137
-	// Routes can be specified to create entries in the route table as the container is started
138
-	Routes []*Route `json:"routes"`
139
-
140
-	// Cgroups specifies specific cgroup settings for the various subsystems that the container is
141
-	// placed into to limit the resources the container has available
142
-	Cgroups *Cgroup `json:"cgroups"`
143
-
144
-	// AppArmorProfile specifies the profile to apply to the process running in the container and is
145
-	// change at the time the process is execed
146
-	AppArmorProfile string `json:"apparmor_profile,omitempty"`
147
-
148
-	// ProcessLabel specifies the label to apply to the process running in the container.  It is
149
-	// commonly used by selinux
150
-	ProcessLabel string `json:"process_label,omitempty"`
151
-
152
-	// Rlimits specifies the resource limits, such as max open files, to set in the container
153
-	// If Rlimits are not set, the container will inherit rlimits from the parent process
154
-	Rlimits []Rlimit `json:"rlimits,omitempty"`
155
-
156
-	// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
157
-	// for a process. Valid values are between the range [-1000, '1000'], where processes with
158
-	// higher scores are preferred for being killed. If it is unset then we don't touch the current
159
-	// value.
160
-	// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
161
-	OomScoreAdj *int `json:"oom_score_adj,omitempty"`
162
-
163
-	// UIDMappings is an array of User ID mappings for User Namespaces
164
-	UIDMappings []IDMap `json:"uid_mappings"`
165
-
166
-	// GIDMappings is an array of Group ID mappings for User Namespaces
167
-	GIDMappings []IDMap `json:"gid_mappings"`
168
-
169
-	// MaskPaths specifies paths within the container's rootfs to mask over with a bind
170
-	// mount pointing to /dev/null as to prevent reads of the file.
171
-	MaskPaths []string `json:"mask_paths"`
172
-
173
-	// ReadonlyPaths specifies paths within the container's rootfs to remount as read-only
174
-	// so that these files prevent any writes.
175
-	ReadonlyPaths []string `json:"readonly_paths"`
176
-
177
-	// Sysctl is a map of properties and their values. It is the equivalent of using
178
-	// sysctl -w my.property.name value in Linux.
179
-	Sysctl map[string]string `json:"sysctl"`
180
-
181
-	// Seccomp allows actions to be taken whenever a syscall is made within the container.
182
-	// A number of rules are given, each having an action to be taken if a syscall matches it.
183
-	// A default action to be taken if no rules match is also given.
184
-	Seccomp *Seccomp `json:"seccomp"`
185
-
186
-	// NoNewPrivileges controls whether processes in the container can gain additional privileges.
187
-	NoNewPrivileges bool `json:"no_new_privileges,omitempty"`
188
-
189
-	// Hooks are a collection of actions to perform at various container lifecycle events.
190
-	// CommandHooks are serialized to JSON, but other hooks are not.
191
-	Hooks Hooks
192
-
193
-	// Version is the version of opencontainer specification that is supported.
194
-	Version string `json:"version"`
195
-
196
-	// Labels are user defined metadata that is stored in the config and populated on the state
197
-	Labels []string `json:"labels"`
198
-
199
-	// NoNewKeyring will not allocated a new session keyring for the container.  It will use the
200
-	// callers keyring in this case.
201
-	NoNewKeyring bool `json:"no_new_keyring"`
202
-
203
-	// IntelRdt specifies settings for Intel RDT group that the container is placed into
204
-	// to limit the resources (e.g., L3 cache, memory bandwidth) the container has available
205
-	IntelRdt *IntelRdt `json:"intel_rdt,omitempty"`
206
-
207
-	// RootlessEUID is set when the runc was launched with non-zero EUID.
208
-	// Note that RootlessEUID is set to false when launched with EUID=0 in userns.
209
-	// When RootlessEUID is set, runc creates a new userns for the container.
210
-	// (config.json needs to contain userns settings)
211
-	RootlessEUID bool `json:"rootless_euid,omitempty"`
212
-
213
-	// RootlessCgroups is set when unlikely to have the full access to cgroups.
214
-	// When RootlessCgroups is set, cgroups errors are ignored.
215
-	RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
216
-
217
-	// TimeOffsets specifies the offset for supporting time namespaces.
218
-	TimeOffsets map[string]specs.LinuxTimeOffset `json:"time_offsets,omitempty"`
219
-
220
-	// Scheduler represents the scheduling attributes for a process.
221
-	Scheduler *Scheduler `json:"scheduler,omitempty"`
222
-
223
-	// Personality contains configuration for the Linux personality syscall.
224
-	Personality *LinuxPersonality `json:"personality,omitempty"`
225
-
226
-	// IOPriority is the container's I/O priority.
227
-	IOPriority *IOPriority `json:"io_priority,omitempty"`
228
-}
229
-
230
-// Scheduler is based on the Linux sched_setattr(2) syscall.
231
-type Scheduler = specs.Scheduler
232
-
233
-// ToSchedAttr is to convert *configs.Scheduler to *unix.SchedAttr.
234
-func ToSchedAttr(scheduler *Scheduler) (*unix.SchedAttr, error) {
235
-	var policy uint32
236
-	switch scheduler.Policy {
237
-	case specs.SchedOther:
238
-		policy = 0
239
-	case specs.SchedFIFO:
240
-		policy = 1
241
-	case specs.SchedRR:
242
-		policy = 2
243
-	case specs.SchedBatch:
244
-		policy = 3
245
-	case specs.SchedISO:
246
-		policy = 4
247
-	case specs.SchedIdle:
248
-		policy = 5
249
-	case specs.SchedDeadline:
250
-		policy = 6
251
-	default:
252
-		return nil, fmt.Errorf("invalid scheduler policy: %s", scheduler.Policy)
253
-	}
254
-
255
-	var flags uint64
256
-	for _, flag := range scheduler.Flags {
257
-		switch flag {
258
-		case specs.SchedFlagResetOnFork:
259
-			flags |= 0x01
260
-		case specs.SchedFlagReclaim:
261
-			flags |= 0x02
262
-		case specs.SchedFlagDLOverrun:
263
-			flags |= 0x04
264
-		case specs.SchedFlagKeepPolicy:
265
-			flags |= 0x08
266
-		case specs.SchedFlagKeepParams:
267
-			flags |= 0x10
268
-		case specs.SchedFlagUtilClampMin:
269
-			flags |= 0x20
270
-		case specs.SchedFlagUtilClampMax:
271
-			flags |= 0x40
272
-		default:
273
-			return nil, fmt.Errorf("invalid scheduler flag: %s", flag)
274
-		}
275
-	}
276
-
277
-	return &unix.SchedAttr{
278
-		Size:     unix.SizeofSchedAttr,
279
-		Policy:   policy,
280
-		Flags:    flags,
281
-		Nice:     scheduler.Nice,
282
-		Priority: uint32(scheduler.Priority),
283
-		Runtime:  scheduler.Runtime,
284
-		Deadline: scheduler.Deadline,
285
-		Period:   scheduler.Period,
286
-	}, nil
287
-}
288
-
289
-var IOPrioClassMapping = map[specs.IOPriorityClass]int{
290
-	specs.IOPRIO_CLASS_RT:   1,
291
-	specs.IOPRIO_CLASS_BE:   2,
292
-	specs.IOPRIO_CLASS_IDLE: 3,
293
-}
294
-
295
-type IOPriority = specs.LinuxIOPriority
296
-
297
-type (
298
-	HookName string
299
-	HookList []Hook
300
-	Hooks    map[HookName]HookList
301
-)
302
-
303
-const (
304
-	// Prestart commands are executed after the container namespaces are created,
305
-	// but before the user supplied command is executed from init.
306
-	// Note: This hook is now deprecated
307
-	// Prestart commands are called in the Runtime namespace.
308
-	Prestart HookName = "prestart"
309
-
310
-	// CreateRuntime commands MUST be called as part of the create operation after
311
-	// the runtime environment has been created but before the pivot_root has been executed.
312
-	// CreateRuntime is called immediately after the deprecated Prestart hook.
313
-	// CreateRuntime commands are called in the Runtime Namespace.
314
-	CreateRuntime HookName = "createRuntime"
315
-
316
-	// CreateContainer commands MUST be called as part of the create operation after
317
-	// the runtime environment has been created but before the pivot_root has been executed.
318
-	// CreateContainer commands are called in the Container namespace.
319
-	CreateContainer HookName = "createContainer"
320
-
321
-	// StartContainer commands MUST be called as part of the start operation and before
322
-	// the container process is started.
323
-	// StartContainer commands are called in the Container namespace.
324
-	StartContainer HookName = "startContainer"
325
-
326
-	// Poststart commands are executed after the container init process starts.
327
-	// Poststart commands are called in the Runtime Namespace.
328
-	Poststart HookName = "poststart"
329
-
330
-	// Poststop commands are executed after the container init process exits.
331
-	// Poststop commands are called in the Runtime Namespace.
332
-	Poststop HookName = "poststop"
333
-)
334
-
335
-// KnownHookNames returns the known hook names.
336
-// Used by `runc features`.
337
-func KnownHookNames() []string {
338
-	return []string{
339
-		string(Prestart), // deprecated
340
-		string(CreateRuntime),
341
-		string(CreateContainer),
342
-		string(StartContainer),
343
-		string(Poststart),
344
-		string(Poststop),
345
-	}
346
-}
347
-
348
-type Capabilities struct {
349
-	// Bounding is the set of capabilities checked by the kernel.
350
-	Bounding []string
351
-	// Effective is the set of capabilities checked by the kernel.
352
-	Effective []string
353
-	// Inheritable is the capabilities preserved across execve.
354
-	Inheritable []string
355
-	// Permitted is the limiting superset for effective capabilities.
356
-	Permitted []string
357
-	// Ambient is the ambient set of capabilities that are kept.
358
-	Ambient []string
359
-}
360
-
361
-// Deprecated: use (Hooks).Run instead.
362
-func (hooks HookList) RunHooks(state *specs.State) error {
363
-	for i, h := range hooks {
364
-		if err := h.Run(state); err != nil {
365
-			return fmt.Errorf("error running hook #%d: %w", i, err)
366
-		}
367
-	}
368
-
369
-	return nil
370
-}
371
-
372
-func (hooks *Hooks) UnmarshalJSON(b []byte) error {
373
-	var state map[HookName][]CommandHook
374
-
375
-	if err := json.Unmarshal(b, &state); err != nil {
376
-		return err
377
-	}
378
-
379
-	*hooks = Hooks{}
380
-	for n, commandHooks := range state {
381
-		if len(commandHooks) == 0 {
382
-			continue
383
-		}
384
-
385
-		(*hooks)[n] = HookList{}
386
-		for _, h := range commandHooks {
387
-			(*hooks)[n] = append((*hooks)[n], h)
388
-		}
389
-	}
390
-
391
-	return nil
392
-}
393
-
394
-func (hooks *Hooks) MarshalJSON() ([]byte, error) {
395
-	serialize := func(hooks []Hook) (serializableHooks []CommandHook) {
396
-		for _, hook := range hooks {
397
-			switch chook := hook.(type) {
398
-			case CommandHook:
399
-				serializableHooks = append(serializableHooks, chook)
400
-			default:
401
-				logrus.Warnf("cannot serialize hook of type %T, skipping", hook)
402
-			}
403
-		}
404
-
405
-		return serializableHooks
406
-	}
407
-
408
-	return json.Marshal(map[string]interface{}{
409
-		"prestart":        serialize((*hooks)[Prestart]),
410
-		"createRuntime":   serialize((*hooks)[CreateRuntime]),
411
-		"createContainer": serialize((*hooks)[CreateContainer]),
412
-		"startContainer":  serialize((*hooks)[StartContainer]),
413
-		"poststart":       serialize((*hooks)[Poststart]),
414
-		"poststop":        serialize((*hooks)[Poststop]),
415
-	})
416
-}
417
-
418
-// Run executes all hooks for the given hook name.
419
-func (hooks Hooks) Run(name HookName, state *specs.State) error {
420
-	list := hooks[name]
421
-	for i, h := range list {
422
-		if err := h.Run(state); err != nil {
423
-			return fmt.Errorf("error running %s hook #%d: %w", name, i, err)
424
-		}
425
-	}
426
-
427
-	return nil
428
-}
429
-
430
-type Hook interface {
431
-	// Run executes the hook with the provided state.
432
-	Run(*specs.State) error
433
-}
434
-
435
-// NewFunctionHook will call the provided function when the hook is run.
436
-func NewFunctionHook(f func(*specs.State) error) FuncHook {
437
-	return FuncHook{
438
-		run: f,
439
-	}
440
-}
441
-
442
-type FuncHook struct {
443
-	run func(*specs.State) error
444
-}
445
-
446
-func (f FuncHook) Run(s *specs.State) error {
447
-	return f.run(s)
448
-}
449
-
450
-type Command struct {
451
-	Path    string         `json:"path"`
452
-	Args    []string       `json:"args"`
453
-	Env     []string       `json:"env"`
454
-	Dir     string         `json:"dir"`
455
-	Timeout *time.Duration `json:"timeout"`
456
-}
457
-
458
-// NewCommandHook will execute the provided command when the hook is run.
459
-func NewCommandHook(cmd Command) CommandHook {
460
-	return CommandHook{
461
-		Command: cmd,
462
-	}
463
-}
464
-
465
-type CommandHook struct {
466
-	Command
467
-}
468
-
469
-func (c Command) Run(s *specs.State) error {
470
-	b, err := json.Marshal(s)
471
-	if err != nil {
472
-		return err
473
-	}
474
-	var stdout, stderr bytes.Buffer
475
-	cmd := exec.Cmd{
476
-		Path:   c.Path,
477
-		Args:   c.Args,
478
-		Env:    c.Env,
479
-		Stdin:  bytes.NewReader(b),
480
-		Stdout: &stdout,
481
-		Stderr: &stderr,
482
-	}
483
-	if err := cmd.Start(); err != nil {
484
-		return err
485
-	}
486
-	errC := make(chan error, 1)
487
-	go func() {
488
-		err := cmd.Wait()
489
-		if err != nil {
490
-			err = fmt.Errorf("%w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
491
-		}
492
-		errC <- err
493
-	}()
494
-	var timerCh <-chan time.Time
495
-	if c.Timeout != nil {
496
-		timer := time.NewTimer(*c.Timeout)
497
-		defer timer.Stop()
498
-		timerCh = timer.C
499
-	}
500
-	select {
501
-	case err := <-errC:
502
-		return err
503
-	case <-timerCh:
504
-		_ = cmd.Process.Kill()
505
-		<-errC
506
-		return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
507
-	}
508
-}
509 1
deleted file mode 100644
... ...
@@ -1,97 +0,0 @@
1
-package configs
2
-
3
-import (
4
-	"errors"
5
-	"fmt"
6
-	"math"
7
-)
8
-
9
-var (
10
-	errNoUIDMap = errors.New("user namespaces enabled, but no uid mappings found")
11
-	errNoGIDMap = errors.New("user namespaces enabled, but no gid mappings found")
12
-)
13
-
14
-// Please check https://man7.org/linux/man-pages/man2/personality.2.html for const details.
15
-// https://raw.githubusercontent.com/torvalds/linux/master/include/uapi/linux/personality.h
16
-const (
17
-	PerLinux   = 0x0000
18
-	PerLinux32 = 0x0008
19
-)
20
-
21
-type LinuxPersonality struct {
22
-	// Domain for the personality
23
-	// can only contain values "LINUX" and "LINUX32"
24
-	Domain int `json:"domain"`
25
-}
26
-
27
-// HostUID gets the translated uid for the process on host which could be
28
-// different when user namespaces are enabled.
29
-func (c Config) HostUID(containerId int) (int, error) {
30
-	if c.Namespaces.Contains(NEWUSER) {
31
-		if len(c.UIDMappings) == 0 {
32
-			return -1, errNoUIDMap
33
-		}
34
-		id, found := c.hostIDFromMapping(int64(containerId), c.UIDMappings)
35
-		if !found {
36
-			return -1, fmt.Errorf("user namespaces enabled, but no mapping found for uid %d", containerId)
37
-		}
38
-		// If we are a 32-bit binary running on a 64-bit system, it's possible
39
-		// the mapped user is too large to store in an int, which means we
40
-		// cannot do the mapping. We can't just return an int64, because
41
-		// os.Setuid() takes an int.
42
-		if id > math.MaxInt {
43
-			return -1, fmt.Errorf("mapping for uid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt)
44
-		}
45
-		return int(id), nil
46
-	}
47
-	// Return unchanged id.
48
-	return containerId, nil
49
-}
50
-
51
-// HostRootUID gets the root uid for the process on host which could be non-zero
52
-// when user namespaces are enabled.
53
-func (c Config) HostRootUID() (int, error) {
54
-	return c.HostUID(0)
55
-}
56
-
57
-// HostGID gets the translated gid for the process on host which could be
58
-// different when user namespaces are enabled.
59
-func (c Config) HostGID(containerId int) (int, error) {
60
-	if c.Namespaces.Contains(NEWUSER) {
61
-		if len(c.GIDMappings) == 0 {
62
-			return -1, errNoGIDMap
63
-		}
64
-		id, found := c.hostIDFromMapping(int64(containerId), c.GIDMappings)
65
-		if !found {
66
-			return -1, fmt.Errorf("user namespaces enabled, but no mapping found for gid %d", containerId)
67
-		}
68
-		// If we are a 32-bit binary running on a 64-bit system, it's possible
69
-		// the mapped user is too large to store in an int, which means we
70
-		// cannot do the mapping. We can't just return an int64, because
71
-		// os.Setgid() takes an int.
72
-		if id > math.MaxInt {
73
-			return -1, fmt.Errorf("mapping for gid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt)
74
-		}
75
-		return int(id), nil
76
-	}
77
-	// Return unchanged id.
78
-	return containerId, nil
79
-}
80
-
81
-// HostRootGID gets the root gid for the process on host which could be non-zero
82
-// when user namespaces are enabled.
83
-func (c Config) HostRootGID() (int, error) {
84
-	return c.HostGID(0)
85
-}
86
-
87
-// Utility function that gets a host ID for a container ID from user namespace map
88
-// if that ID is present in the map.
89
-func (c Config) hostIDFromMapping(containerID int64, uMap []IDMap) (int64, bool) {
90
-	for _, m := range uMap {
91
-		if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) {
92
-			hostID := m.HostID + (containerID - m.ContainerID)
93
-			return hostID, true
94
-		}
95
-	}
96
-	return -1, false
97
-}
98 1
deleted file mode 100644
... ...
@@ -1,9 +0,0 @@
1
-//go:build gofuzz
2
-
3
-package configs
4
-
5
-func FuzzUnmarshalJSON(data []byte) int {
6
-	hooks := Hooks{}
7
-	_ = hooks.UnmarshalJSON(data)
8
-	return 1
9
-}
10 1
deleted file mode 100644
... ...
@@ -1,9 +0,0 @@
1
-package configs
2
-
3
-type HugepageLimit struct {
4
-	// which type of hugepage to limit.
5
-	Pagesize string `json:"page_size"`
6
-
7
-	// usage limit for hugepage.
8
-	Limit uint64 `json:"limit"`
9
-}
10 1
deleted file mode 100644
... ...
@@ -1,16 +0,0 @@
1
-package configs
2
-
3
-type IntelRdt struct {
4
-	// The identity for RDT Class of Service
5
-	ClosID string `json:"closID,omitempty"`
6
-
7
-	// The schema for L3 cache id and capacity bitmask (CBM)
8
-	// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
9
-	L3CacheSchema string `json:"l3_cache_schema,omitempty"`
10
-
11
-	// The schema of memory bandwidth per L3 cache id
12
-	// Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
13
-	// The unit of memory bandwidth is specified in "percentages" by
14
-	// default, and in "MBps" if MBA Software Controller is enabled.
15
-	MemBwSchema string `json:"memBwSchema,omitempty"`
16
-}
17 1
deleted file mode 100644
... ...
@@ -1,14 +0,0 @@
1
-package configs
2
-
3
-import (
4
-	"fmt"
5
-)
6
-
7
-type IfPrioMap struct {
8
-	Interface string `json:"interface"`
9
-	Priority  int64  `json:"priority"`
10
-}
11
-
12
-func (i *IfPrioMap) CgroupString() string {
13
-	return fmt.Sprintf("%s %d", i.Interface, i.Priority)
14
-}
15 1
deleted file mode 100644
... ...
@@ -1,7 +0,0 @@
1
-package configs
2
-
3
-const (
4
-	// EXT_COPYUP is a directive to copy up the contents of a directory when
5
-	// a tmpfs is mounted over it.
6
-	EXT_COPYUP = 1 << iota //nolint:golint,revive // ignore "don't use ALL_CAPS" warning
7
-)
8 1
deleted file mode 100644
... ...
@@ -1,66 +0,0 @@
1
-package configs
2
-
3
-import "golang.org/x/sys/unix"
4
-
5
-type MountIDMapping struct {
6
-	// Recursive indicates if the mapping needs to be recursive.
7
-	Recursive bool `json:"recursive"`
8
-
9
-	// UserNSPath is a path to a user namespace that indicates the necessary
10
-	// id-mappings for MOUNT_ATTR_IDMAP. If set to non-"", UIDMappings and
11
-	// GIDMappings must be set to nil.
12
-	UserNSPath string `json:"userns_path,omitempty"`
13
-
14
-	// UIDMappings is the uid mapping set for this mount, to be used with
15
-	// MOUNT_ATTR_IDMAP.
16
-	UIDMappings []IDMap `json:"uid_mappings,omitempty"`
17
-
18
-	// GIDMappings is the gid mapping set for this mount, to be used with
19
-	// MOUNT_ATTR_IDMAP.
20
-	GIDMappings []IDMap `json:"gid_mappings,omitempty"`
21
-}
22
-
23
-type Mount struct {
24
-	// Source path for the mount.
25
-	Source string `json:"source"`
26
-
27
-	// Destination path for the mount inside the container.
28
-	Destination string `json:"destination"`
29
-
30
-	// Device the mount is for.
31
-	Device string `json:"device"`
32
-
33
-	// Mount flags.
34
-	Flags int `json:"flags"`
35
-
36
-	// Mount flags that were explicitly cleared in the configuration (meaning
37
-	// the user explicitly requested that these flags *not* be set).
38
-	ClearedFlags int `json:"cleared_flags"`
39
-
40
-	// Propagation Flags
41
-	PropagationFlags []int `json:"propagation_flags"`
42
-
43
-	// Mount data applied to the mount.
44
-	Data string `json:"data"`
45
-
46
-	// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
47
-	Relabel string `json:"relabel"`
48
-
49
-	// RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2).
50
-	RecAttr *unix.MountAttr `json:"rec_attr"`
51
-
52
-	// Extensions are additional flags that are specific to runc.
53
-	Extensions int `json:"extensions"`
54
-
55
-	// Mapping is the MOUNT_ATTR_IDMAP configuration for the mount. If non-nil,
56
-	// the mount is configured to use MOUNT_ATTR_IDMAP-style id mappings.
57
-	IDMapping *MountIDMapping `json:"id_mapping,omitempty"`
58
-}
59
-
60
-func (m *Mount) IsBind() bool {
61
-	return m.Flags&unix.MS_BIND != 0
62
-}
63
-
64
-func (m *Mount) IsIDMapped() bool {
65
-	return m.IDMapping != nil
66
-}
67 1
deleted file mode 100644
... ...
@@ -1,9 +0,0 @@
1
-//go:build !linux
2
-
3
-package configs
4
-
5
-type Mount struct{}
6
-
7
-func (m *Mount) IsBind() bool {
8
-	return false
9
-}
10 1
deleted file mode 100644
... ...
@@ -1,5 +0,0 @@
1
-package configs
2
-
3
-type NamespaceType string
4
-
5
-type Namespaces []Namespace
6 1
deleted file mode 100644
... ...
@@ -1,133 +0,0 @@
1
-package configs
2
-
3
-import (
4
-	"fmt"
5
-	"os"
6
-	"sync"
7
-)
8
-
9
-const (
10
-	NEWNET    NamespaceType = "NEWNET"
11
-	NEWPID    NamespaceType = "NEWPID"
12
-	NEWNS     NamespaceType = "NEWNS"
13
-	NEWUTS    NamespaceType = "NEWUTS"
14
-	NEWIPC    NamespaceType = "NEWIPC"
15
-	NEWUSER   NamespaceType = "NEWUSER"
16
-	NEWCGROUP NamespaceType = "NEWCGROUP"
17
-	NEWTIME   NamespaceType = "NEWTIME"
18
-)
19
-
20
-var (
21
-	nsLock              sync.Mutex
22
-	supportedNamespaces = make(map[NamespaceType]bool)
23
-)
24
-
25
-// NsName converts the namespace type to its filename
26
-func NsName(ns NamespaceType) string {
27
-	switch ns {
28
-	case NEWNET:
29
-		return "net"
30
-	case NEWNS:
31
-		return "mnt"
32
-	case NEWPID:
33
-		return "pid"
34
-	case NEWIPC:
35
-		return "ipc"
36
-	case NEWUSER:
37
-		return "user"
38
-	case NEWUTS:
39
-		return "uts"
40
-	case NEWCGROUP:
41
-		return "cgroup"
42
-	case NEWTIME:
43
-		return "time"
44
-	}
45
-	return ""
46
-}
47
-
48
-// IsNamespaceSupported returns whether a namespace is available or
49
-// not
50
-func IsNamespaceSupported(ns NamespaceType) bool {
51
-	nsLock.Lock()
52
-	defer nsLock.Unlock()
53
-	supported, ok := supportedNamespaces[ns]
54
-	if ok {
55
-		return supported
56
-	}
57
-	nsFile := NsName(ns)
58
-	// if the namespace type is unknown, just return false
59
-	if nsFile == "" {
60
-		return false
61
-	}
62
-	// We don't need to use /proc/thread-self here because the list of
63
-	// namespace types is unrelated to the thread. This lets us avoid having to
64
-	// do runtime.LockOSThread.
65
-	_, err := os.Stat("/proc/self/ns/" + nsFile)
66
-	// a namespace is supported if it exists and we have permissions to read it
67
-	supported = err == nil
68
-	supportedNamespaces[ns] = supported
69
-	return supported
70
-}
71
-
72
-func NamespaceTypes() []NamespaceType {
73
-	return []NamespaceType{
74
-		NEWUSER, // Keep user NS always first, don't move it.
75
-		NEWIPC,
76
-		NEWUTS,
77
-		NEWNET,
78
-		NEWPID,
79
-		NEWNS,
80
-		NEWCGROUP,
81
-		NEWTIME,
82
-	}
83
-}
84
-
85
-// Namespace defines configuration for each namespace.  It specifies an
86
-// alternate path that is able to be joined via setns.
87
-type Namespace struct {
88
-	Type NamespaceType `json:"type"`
89
-	Path string        `json:"path"`
90
-}
91
-
92
-func (n *Namespace) GetPath(pid int) string {
93
-	return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type))
94
-}
95
-
96
-func (n *Namespaces) Remove(t NamespaceType) bool {
97
-	i := n.index(t)
98
-	if i == -1 {
99
-		return false
100
-	}
101
-	*n = append((*n)[:i], (*n)[i+1:]...)
102
-	return true
103
-}
104
-
105
-func (n *Namespaces) Add(t NamespaceType, path string) {
106
-	i := n.index(t)
107
-	if i == -1 {
108
-		*n = append(*n, Namespace{Type: t, Path: path})
109
-		return
110
-	}
111
-	(*n)[i].Path = path
112
-}
113
-
114
-func (n *Namespaces) index(t NamespaceType) int {
115
-	for i, ns := range *n {
116
-		if ns.Type == t {
117
-			return i
118
-		}
119
-	}
120
-	return -1
121
-}
122
-
123
-func (n *Namespaces) Contains(t NamespaceType) bool {
124
-	return n.index(t) != -1
125
-}
126
-
127
-func (n *Namespaces) PathOf(t NamespaceType) string {
128
-	i := n.index(t)
129
-	if i == -1 {
130
-		return ""
131
-	}
132
-	return (*n)[i].Path
133
-}
134 1
deleted file mode 100644
... ...
@@ -1,45 +0,0 @@
1
-//go:build linux
2
-
3
-package configs
4
-
5
-import "golang.org/x/sys/unix"
6
-
7
-func (n *Namespace) Syscall() int {
8
-	return namespaceInfo[n.Type]
9
-}
10
-
11
-var namespaceInfo = map[NamespaceType]int{
12
-	NEWNET:    unix.CLONE_NEWNET,
13
-	NEWNS:     unix.CLONE_NEWNS,
14
-	NEWUSER:   unix.CLONE_NEWUSER,
15
-	NEWIPC:    unix.CLONE_NEWIPC,
16
-	NEWUTS:    unix.CLONE_NEWUTS,
17
-	NEWPID:    unix.CLONE_NEWPID,
18
-	NEWCGROUP: unix.CLONE_NEWCGROUP,
19
-	NEWTIME:   unix.CLONE_NEWTIME,
20
-}
21
-
22
-// CloneFlags parses the container's Namespaces options to set the correct
23
-// flags on clone, unshare. This function returns flags only for new namespaces.
24
-func (n *Namespaces) CloneFlags() uintptr {
25
-	var flag int
26
-	for _, v := range *n {
27
-		if v.Path != "" {
28
-			continue
29
-		}
30
-		flag |= namespaceInfo[v.Type]
31
-	}
32
-	return uintptr(flag)
33
-}
34
-
35
-// IsPrivate tells whether the namespace of type t is configured as private
36
-// (i.e. it exists and is not shared).
37
-func (n Namespaces) IsPrivate(t NamespaceType) bool {
38
-	for _, v := range n {
39
-		if v.Type == t {
40
-			return v.Path == ""
41
-		}
42
-	}
43
-	// Not found, so implicitly sharing a parent namespace.
44
-	return false
45
-}
46 1
deleted file mode 100644
... ...
@@ -1,13 +0,0 @@
1
-//go:build !linux && !windows
2
-
3
-package configs
4
-
5
-func (n *Namespace) Syscall() int {
6
-	panic("No namespace syscall support")
7
-}
8
-
9
-// CloneFlags parses the container's Namespaces options to set the correct
10
-// flags on clone, unshare. This function returns flags only for new namespaces.
11
-func (n *Namespaces) CloneFlags() uintptr {
12
-	panic("No namespace syscall support")
13
-}
14 1
deleted file mode 100644
... ...
@@ -1,7 +0,0 @@
1
-//go:build !linux
2
-
3
-package configs
4
-
5
-// Namespace defines configuration for each namespace.  It specifies an
6
-// alternate path that is able to be joined via setns.
7
-type Namespace struct{}
8 1
deleted file mode 100644
... ...
@@ -1,75 +0,0 @@
1
-package configs
2
-
3
-// Network defines configuration for a container's networking stack
4
-//
5
-// The network configuration can be omitted from a container causing the
6
-// container to be setup with the host's networking stack
7
-type Network struct {
8
-	// Type sets the networks type, commonly veth and loopback
9
-	Type string `json:"type"`
10
-
11
-	// Name of the network interface
12
-	Name string `json:"name"`
13
-
14
-	// The bridge to use.
15
-	Bridge string `json:"bridge"`
16
-
17
-	// MacAddress contains the MAC address to set on the network interface
18
-	MacAddress string `json:"mac_address"`
19
-
20
-	// Address contains the IPv4 and mask to set on the network interface
21
-	Address string `json:"address"`
22
-
23
-	// Gateway sets the gateway address that is used as the default for the interface
24
-	Gateway string `json:"gateway"`
25
-
26
-	// IPv6Address contains the IPv6 and mask to set on the network interface
27
-	IPv6Address string `json:"ipv6_address"`
28
-
29
-	// IPv6Gateway sets the ipv6 gateway address that is used as the default for the interface
30
-	IPv6Gateway string `json:"ipv6_gateway"`
31
-
32
-	// Mtu sets the mtu value for the interface and will be mirrored on both the host and
33
-	// container's interfaces if a pair is created, specifically in the case of type veth
34
-	// Note: This does not apply to loopback interfaces.
35
-	Mtu int `json:"mtu"`
36
-
37
-	// TxQueueLen sets the tx_queuelen value for the interface and will be mirrored on both the host and
38
-	// container's interfaces if a pair is created, specifically in the case of type veth
39
-	// Note: This does not apply to loopback interfaces.
40
-	TxQueueLen int `json:"txqueuelen"`
41
-
42
-	// HostInterfaceName is a unique name of a veth pair that resides on in the host interface of the
43
-	// container.
44
-	HostInterfaceName string `json:"host_interface_name"`
45
-
46
-	// HairpinMode specifies if hairpin NAT should be enabled on the virtual interface
47
-	// bridge port in the case of type veth
48
-	// Note: This is unsupported on some systems.
49
-	// Note: This does not apply to loopback interfaces.
50
-	HairpinMode bool `json:"hairpin_mode"`
51
-}
52
-
53
-// Route defines a routing table entry.
54
-//
55
-// Routes can be specified to create entries in the routing table as the container
56
-// is started.
57
-//
58
-// All of destination, source, and gateway should be either IPv4 or IPv6.
59
-// One of the three options must be present, and omitted entries will use their
60
-// IP family default for the route table.  For IPv4 for example, setting the
61
-// gateway to 1.2.3.4 and the interface to eth0 will set up a standard
62
-// destination of 0.0.0.0(or *) when viewed in the route table.
63
-type Route struct {
64
-	// Destination specifies the destination IP address and mask in the CIDR form.
65
-	Destination string `json:"destination"`
66
-
67
-	// Source specifies the source IP address and mask in the CIDR form.
68
-	Source string `json:"source"`
69
-
70
-	// Gateway specifies the gateway IP address.
71
-	Gateway string `json:"gateway"`
72
-
73
-	// InterfaceName specifies the device to set this route up for, for example eth0.
74
-	InterfaceName string `json:"interface_name"`
75
-}
76 1
deleted file mode 100644
... ...
@@ -1,9 +0,0 @@
1
-package configs
2
-
3
-// LinuxRdma for Linux cgroup 'rdma' resource management (Linux 4.11)
4
-type LinuxRdma struct {
5
-	// Maximum number of HCA handles that can be opened. Default is "no limit".
6
-	HcaHandles *uint32 `json:"hca_handles,omitempty"`
7
-	// Maximum number of HCA objects that can be created. Default is "no limit".
8
-	HcaObjects *uint32 `json:"hca_objects,omitempty"`
9
-}
10 1
deleted file mode 100644
... ...
@@ -1,174 +0,0 @@
1
-package devices
2
-
3
-import (
4
-	"fmt"
5
-	"os"
6
-	"strconv"
7
-)
8
-
9
-const (
10
-	Wildcard = -1
11
-)
12
-
13
-type Device struct {
14
-	Rule
15
-
16
-	// Path to the device.
17
-	Path string `json:"path"`
18
-
19
-	// FileMode permission bits for the device.
20
-	FileMode os.FileMode `json:"file_mode"`
21
-
22
-	// Uid of the device.
23
-	Uid uint32 `json:"uid"`
24
-
25
-	// Gid of the device.
26
-	Gid uint32 `json:"gid"`
27
-}
28
-
29
-// Permissions is a cgroupv1-style string to represent device access. It
30
-// has to be a string for backward compatibility reasons, hence why it has
31
-// methods to do set operations.
32
-type Permissions string
33
-
34
-const (
35
-	deviceRead uint = (1 << iota)
36
-	deviceWrite
37
-	deviceMknod
38
-)
39
-
40
-func (p Permissions) toSet() uint {
41
-	var set uint
42
-	for _, perm := range p {
43
-		switch perm {
44
-		case 'r':
45
-			set |= deviceRead
46
-		case 'w':
47
-			set |= deviceWrite
48
-		case 'm':
49
-			set |= deviceMknod
50
-		}
51
-	}
52
-	return set
53
-}
54
-
55
-func fromSet(set uint) Permissions {
56
-	var perm string
57
-	if set&deviceRead == deviceRead {
58
-		perm += "r"
59
-	}
60
-	if set&deviceWrite == deviceWrite {
61
-		perm += "w"
62
-	}
63
-	if set&deviceMknod == deviceMknod {
64
-		perm += "m"
65
-	}
66
-	return Permissions(perm)
67
-}
68
-
69
-// Union returns the union of the two sets of Permissions.
70
-func (p Permissions) Union(o Permissions) Permissions {
71
-	lhs := p.toSet()
72
-	rhs := o.toSet()
73
-	return fromSet(lhs | rhs)
74
-}
75
-
76
-// Difference returns the set difference of the two sets of Permissions.
77
-// In set notation, A.Difference(B) gives you A\B.
78
-func (p Permissions) Difference(o Permissions) Permissions {
79
-	lhs := p.toSet()
80
-	rhs := o.toSet()
81
-	return fromSet(lhs &^ rhs)
82
-}
83
-
84
-// Intersection computes the intersection of the two sets of Permissions.
85
-func (p Permissions) Intersection(o Permissions) Permissions {
86
-	lhs := p.toSet()
87
-	rhs := o.toSet()
88
-	return fromSet(lhs & rhs)
89
-}
90
-
91
-// IsEmpty returns whether the set of permissions in a Permissions is
92
-// empty.
93
-func (p Permissions) IsEmpty() bool {
94
-	return p == Permissions("")
95
-}
96
-
97
-// IsValid returns whether the set of permissions is a subset of valid
98
-// permissions (namely, {r,w,m}).
99
-func (p Permissions) IsValid() bool {
100
-	return p == fromSet(p.toSet())
101
-}
102
-
103
-type Type rune
104
-
105
-const (
106
-	WildcardDevice Type = 'a'
107
-	BlockDevice    Type = 'b'
108
-	CharDevice     Type = 'c' // or 'u'
109
-	FifoDevice     Type = 'p'
110
-)
111
-
112
-func (t Type) IsValid() bool {
113
-	switch t {
114
-	case WildcardDevice, BlockDevice, CharDevice, FifoDevice:
115
-		return true
116
-	default:
117
-		return false
118
-	}
119
-}
120
-
121
-func (t Type) CanMknod() bool {
122
-	switch t {
123
-	case BlockDevice, CharDevice, FifoDevice:
124
-		return true
125
-	default:
126
-		return false
127
-	}
128
-}
129
-
130
-func (t Type) CanCgroup() bool {
131
-	switch t {
132
-	case WildcardDevice, BlockDevice, CharDevice:
133
-		return true
134
-	default:
135
-		return false
136
-	}
137
-}
138
-
139
-type Rule struct {
140
-	// Type of device ('c' for char, 'b' for block). If set to 'a', this rule
141
-	// acts as a wildcard and all fields other than Allow are ignored.
142
-	Type Type `json:"type"`
143
-
144
-	// Major is the device's major number.
145
-	Major int64 `json:"major"`
146
-
147
-	// Minor is the device's minor number.
148
-	Minor int64 `json:"minor"`
149
-
150
-	// Permissions is the set of permissions that this rule applies to (in the
151
-	// cgroupv1 format -- any combination of "rwm").
152
-	Permissions Permissions `json:"permissions"`
153
-
154
-	// Allow specifies whether this rule is allowed.
155
-	Allow bool `json:"allow"`
156
-}
157
-
158
-func (d *Rule) CgroupString() string {
159
-	var (
160
-		major = strconv.FormatInt(d.Major, 10)
161
-		minor = strconv.FormatInt(d.Minor, 10)
162
-	)
163
-	if d.Major == Wildcard {
164
-		major = "*"
165
-	}
166
-	if d.Minor == Wildcard {
167
-		minor = "*"
168
-	}
169
-	return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions)
170
-}
171
-
172
-func (d *Rule) Mkdev() (uint64, error) {
173
-	return mkDev(d)
174
-}
175 1
deleted file mode 100644
... ...
@@ -1,119 +0,0 @@
1
-//go:build !windows
2
-
3
-package devices
4
-
5
-import (
6
-	"errors"
7
-	"os"
8
-	"path/filepath"
9
-
10
-	"golang.org/x/sys/unix"
11
-)
12
-
13
-// ErrNotADevice denotes that a file is not a valid linux device.
14
-var ErrNotADevice = errors.New("not a device node")
15
-
16
-// Testing dependencies
17
-var (
18
-	unixLstat = unix.Lstat
19
-	osReadDir = os.ReadDir
20
-)
21
-
22
-func mkDev(d *Rule) (uint64, error) {
23
-	if d.Major == Wildcard || d.Minor == Wildcard {
24
-		return 0, errors.New("cannot mkdev() device with wildcards")
25
-	}
26
-	return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil
27
-}
28
-
29
-// DeviceFromPath takes the path to a device and its cgroup_permissions (which
30
-// cannot be easily queried) to look up the information about a linux device
31
-// and returns that information as a Device struct.
32
-func DeviceFromPath(path, permissions string) (*Device, error) {
33
-	var stat unix.Stat_t
34
-	err := unixLstat(path, &stat)
35
-	if err != nil {
36
-		return nil, err
37
-	}
38
-
39
-	var (
40
-		devType   Type
41
-		mode      = stat.Mode
42
-		devNumber = uint64(stat.Rdev) //nolint:unconvert // Rdev is uint32 on e.g. MIPS.
43
-		major     = unix.Major(devNumber)
44
-		minor     = unix.Minor(devNumber)
45
-	)
46
-	switch mode & unix.S_IFMT {
47
-	case unix.S_IFBLK:
48
-		devType = BlockDevice
49
-	case unix.S_IFCHR:
50
-		devType = CharDevice
51
-	case unix.S_IFIFO:
52
-		devType = FifoDevice
53
-	default:
54
-		return nil, ErrNotADevice
55
-	}
56
-	return &Device{
57
-		Rule: Rule{
58
-			Type:        devType,
59
-			Major:       int64(major),
60
-			Minor:       int64(minor),
61
-			Permissions: Permissions(permissions),
62
-		},
63
-		Path:     path,
64
-		FileMode: os.FileMode(mode &^ unix.S_IFMT),
65
-		Uid:      stat.Uid,
66
-		Gid:      stat.Gid,
67
-	}, nil
68
-}
69
-
70
-// HostDevices returns all devices that can be found under /dev directory.
71
-func HostDevices() ([]*Device, error) {
72
-	return GetDevices("/dev")
73
-}
74
-
75
-// GetDevices recursively traverses a directory specified by path
76
-// and returns all devices found there.
77
-func GetDevices(path string) ([]*Device, error) {
78
-	files, err := osReadDir(path)
79
-	if err != nil {
80
-		return nil, err
81
-	}
82
-	var out []*Device
83
-	for _, f := range files {
84
-		switch {
85
-		case f.IsDir():
86
-			switch f.Name() {
87
-			// ".lxc" & ".lxd-mounts" added to address https://github.com/lxc/lxd/issues/2825
88
-			// ".udev" added to address https://github.com/opencontainers/runc/issues/2093
89
-			case "pts", "shm", "fd", "mqueue", ".lxc", ".lxd-mounts", ".udev":
90
-				continue
91
-			default:
92
-				sub, err := GetDevices(filepath.Join(path, f.Name()))
93
-				if err != nil {
94
-					return nil, err
95
-				}
96
-
97
-				out = append(out, sub...)
98
-				continue
99
-			}
100
-		case f.Name() == "console":
101
-			continue
102
-		}
103
-		device, err := DeviceFromPath(filepath.Join(path, f.Name()), "rwm")
104
-		if err != nil {
105
-			if errors.Is(err, ErrNotADevice) {
106
-				continue
107
-			}
108
-			if os.IsNotExist(err) {
109
-				continue
110
-			}
111
-			return nil, err
112
-		}
113
-		if device.Type == FifoDevice {
114
-			continue
115
-		}
116
-		out = append(out, device)
117
-	}
118
-	return out, nil
119
-}
120 1
deleted file mode 100644
... ...
@@ -1,135 +0,0 @@
1
-package utils
2
-
3
-/*
4
- * Copyright 2016, 2017 SUSE LLC
5
- *
6
- * Licensed under the Apache License, Version 2.0 (the "License");
7
- * you may not use this file except in compliance with the License.
8
- * You may obtain a copy of the License at
9
- *
10
- *     http://www.apache.org/licenses/LICENSE-2.0
11
- *
12
- * Unless required by applicable law or agreed to in writing, software
13
- * distributed under the License is distributed on an "AS IS" BASIS,
14
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
- * See the License for the specific language governing permissions and
16
- * limitations under the License.
17
- */
18
-
19
-import (
20
-	"fmt"
21
-	"os"
22
-	"runtime"
23
-
24
-	"golang.org/x/sys/unix"
25
-)
26
-
27
-// MaxNameLen is the maximum length of the name of a file descriptor being sent
28
-// using SendFile. The name of the file handle returned by RecvFile will never be
29
-// larger than this value.
30
-const MaxNameLen = 4096
31
-
32
-// oobSpace is the size of the oob slice required to store a single FD. Note
33
-// that unix.UnixRights appears to make the assumption that fd is always int32,
34
-// so sizeof(fd) = 4.
35
-var oobSpace = unix.CmsgSpace(4)
36
-
37
-// RecvFile waits for a file descriptor to be sent over the given AF_UNIX
38
-// socket. The file name of the remote file descriptor will be recreated
39
-// locally (it is sent as non-auxiliary data in the same payload).
40
-func RecvFile(socket *os.File) (_ *os.File, Err error) {
41
-	name := make([]byte, MaxNameLen)
42
-	oob := make([]byte, oobSpace)
43
-
44
-	sockfd := socket.Fd()
45
-	var (
46
-		n, oobn int
47
-		err     error
48
-	)
49
-
50
-	for {
51
-		n, oobn, _, _, err = unix.Recvmsg(int(sockfd), name, oob, unix.MSG_CMSG_CLOEXEC)
52
-		if err != unix.EINTR { //nolint:errorlint // unix errors are bare
53
-			break
54
-		}
55
-	}
56
-
57
-	if err != nil {
58
-		return nil, os.NewSyscallError("recvmsg", err)
59
-	}
60
-	if n >= MaxNameLen || oobn != oobSpace {
61
-		return nil, fmt.Errorf("recvfile: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
62
-	}
63
-	// Truncate.
64
-	name = name[:n]
65
-	oob = oob[:oobn]
66
-
67
-	scms, err := unix.ParseSocketControlMessage(oob)
68
-	if err != nil {
69
-		return nil, err
70
-	}
71
-
72
-	// We cannot control how many SCM_RIGHTS we receive, and upon receiving
73
-	// them all of the descriptors are installed in our fd table, so we need to
74
-	// parse all of the SCM_RIGHTS we received in order to close all of the
75
-	// descriptors on error.
76
-	var fds []int
77
-	defer func() {
78
-		for i, fd := range fds {
79
-			if i == 0 && Err == nil {
80
-				// Only close the first one on error.
81
-				continue
82
-			}
83
-			// Always close extra ones.
84
-			_ = unix.Close(fd)
85
-		}
86
-	}()
87
-	var lastErr error
88
-	for _, scm := range scms {
89
-		if scm.Header.Type == unix.SCM_RIGHTS {
90
-			scmFds, err := unix.ParseUnixRights(&scm)
91
-			if err != nil {
92
-				lastErr = err
93
-			} else {
94
-				fds = append(fds, scmFds...)
95
-			}
96
-		}
97
-	}
98
-	if lastErr != nil {
99
-		return nil, lastErr
100
-	}
101
-
102
-	// We do this after collecting the fds to make sure we close them all when
103
-	// returning an error here.
104
-	if len(scms) != 1 {
105
-		return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
106
-	}
107
-	if len(fds) != 1 {
108
-		return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
109
-	}
110
-	return os.NewFile(uintptr(fds[0]), string(name)), nil
111
-}
112
-
113
-// SendFile sends a file over the given AF_UNIX socket. file.Name() is also
114
-// included so that if the other end uses RecvFile, the file will have the same
115
-// name information.
116
-func SendFile(socket *os.File, file *os.File) error {
117
-	name := file.Name()
118
-	if len(name) >= MaxNameLen {
119
-		return fmt.Errorf("sendfd: filename too long: %s", name)
120
-	}
121
-	err := SendRawFd(socket, name, file.Fd())
122
-	runtime.KeepAlive(file)
123
-	return err
124
-}
125
-
126
-// SendRawFd sends a specific file descriptor over the given AF_UNIX socket.
127
-func SendRawFd(socket *os.File, msg string, fd uintptr) error {
128
-	oob := unix.UnixRights(int(fd))
129
-	for {
130
-		err := unix.Sendmsg(int(socket.Fd()), []byte(msg), oob, nil, 0)
131
-		if err != unix.EINTR { //nolint:errorlint // unix errors are bare
132
-			return os.NewSyscallError("sendmsg", err)
133
-		}
134
-	}
135
-}
136 1
deleted file mode 100644
... ...
@@ -1,115 +0,0 @@
1
-package utils
2
-
3
-import (
4
-	"encoding/json"
5
-	"io"
6
-	"os"
7
-	"path/filepath"
8
-	"strings"
9
-
10
-	"golang.org/x/sys/unix"
11
-)
12
-
13
-const (
14
-	exitSignalOffset = 128
15
-)
16
-
17
-// ExitStatus returns the correct exit status for a process based on if it
18
-// was signaled or exited cleanly
19
-func ExitStatus(status unix.WaitStatus) int {
20
-	if status.Signaled() {
21
-		return exitSignalOffset + int(status.Signal())
22
-	}
23
-	return status.ExitStatus()
24
-}
25
-
26
-// WriteJSON writes the provided struct v to w using standard json marshaling
27
-// without a trailing newline. This is used instead of json.Encoder because
28
-// there might be a problem in json decoder in some cases, see:
29
-// https://github.com/docker/docker/issues/14203#issuecomment-174177790
30
-func WriteJSON(w io.Writer, v interface{}) error {
31
-	data, err := json.Marshal(v)
32
-	if err != nil {
33
-		return err
34
-	}
35
-	_, err = w.Write(data)
36
-	return err
37
-}
38
-
39
-// CleanPath makes a path safe for use with filepath.Join. This is done by not
40
-// only cleaning the path, but also (if the path is relative) adding a leading
41
-// '/' and cleaning it (then removing the leading '/'). This ensures that a
42
-// path resulting from prepending another path will always resolve to lexically
43
-// be a subdirectory of the prefixed path. This is all done lexically, so paths
44
-// that include symlinks won't be safe as a result of using CleanPath.
45
-func CleanPath(path string) string {
46
-	// Deal with empty strings nicely.
47
-	if path == "" {
48
-		return ""
49
-	}
50
-
51
-	// Ensure that all paths are cleaned (especially problematic ones like
52
-	// "/../../../../../" which can cause lots of issues).
53
-	path = filepath.Clean(path)
54
-
55
-	// If the path isn't absolute, we need to do more processing to fix paths
56
-	// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
57
-	// paths to relative ones.
58
-	if !filepath.IsAbs(path) {
59
-		path = filepath.Clean(string(os.PathSeparator) + path)
60
-		// This can't fail, as (by definition) all paths are relative to root.
61
-		path, _ = filepath.Rel(string(os.PathSeparator), path)
62
-	}
63
-
64
-	// Clean the path again for good measure.
65
-	return filepath.Clean(path)
66
-}
67
-
68
-// stripRoot returns the passed path, stripping the root path if it was
69
-// (lexically) inside it. Note that both passed paths will always be treated
70
-// as absolute, and the returned path will also always be absolute. In
71
-// addition, the paths are cleaned before stripping the root.
72
-func stripRoot(root, path string) string {
73
-	// Make the paths clean and absolute.
74
-	root, path = CleanPath("/"+root), CleanPath("/"+path)
75
-	switch {
76
-	case path == root:
77
-		path = "/"
78
-	case root == "/":
79
-		// do nothing
80
-	case strings.HasPrefix(path, root+"/"):
81
-		path = strings.TrimPrefix(path, root+"/")
82
-	}
83
-	return CleanPath("/" + path)
84
-}
85
-
86
-// SearchLabels searches through a list of key=value pairs for a given key,
87
-// returning its value, and the binary flag telling whether the key exists.
88
-func SearchLabels(labels []string, key string) (string, bool) {
89
-	key += "="
90
-	for _, s := range labels {
91
-		if strings.HasPrefix(s, key) {
92
-			return s[len(key):], true
93
-		}
94
-	}
95
-	return "", false
96
-}
97
-
98
-// Annotations returns the bundle path and user defined annotations from the
99
-// libcontainer state.  We need to remove the bundle because that is a label
100
-// added by libcontainer.
101
-func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
102
-	userAnnotations = make(map[string]string)
103
-	for _, l := range labels {
104
-		name, value, ok := strings.Cut(l, "=")
105
-		if !ok {
106
-			continue
107
-		}
108
-		if name == "bundle" {
109
-			bundle = value
110
-		} else {
111
-			userAnnotations[name] = value
112
-		}
113
-	}
114
-	return
115
-}
116 1
deleted file mode 100644
... ...
@@ -1,360 +0,0 @@
1
-//go:build !windows
2
-
3
-package utils
4
-
5
-import (
6
-	"fmt"
7
-	"math"
8
-	"os"
9
-	"path/filepath"
10
-	"runtime"
11
-	"strconv"
12
-	"strings"
13
-	"sync"
14
-	_ "unsafe" // for go:linkname
15
-
16
-	securejoin "github.com/cyphar/filepath-securejoin"
17
-	"github.com/sirupsen/logrus"
18
-	"golang.org/x/sys/unix"
19
-)
20
-
21
-// EnsureProcHandle returns whether or not the given file handle is on procfs.
22
-func EnsureProcHandle(fh *os.File) error {
23
-	var buf unix.Statfs_t
24
-	if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil {
25
-		return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err)
26
-	}
27
-	if buf.Type != unix.PROC_SUPER_MAGIC {
28
-		return fmt.Errorf("%s is not on procfs", fh.Name())
29
-	}
30
-	return nil
31
-}
32
-
33
-var (
34
-	haveCloseRangeCloexecBool bool
35
-	haveCloseRangeCloexecOnce sync.Once
36
-)
37
-
38
-func haveCloseRangeCloexec() bool {
39
-	haveCloseRangeCloexecOnce.Do(func() {
40
-		// Make sure we're not closing a random file descriptor.
41
-		tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0)
42
-		if err != nil {
43
-			return
44
-		}
45
-		defer unix.Close(tmpFd)
46
-
47
-		err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC)
48
-		// Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC).
49
-		// -ENOSYS and -EINVAL ultimately mean we don't have support, but any
50
-		// other potential error would imply that even the most basic close
51
-		// operation wouldn't work.
52
-		haveCloseRangeCloexecBool = err == nil
53
-	})
54
-	return haveCloseRangeCloexecBool
55
-}
56
-
57
-type fdFunc func(fd int)
58
-
59
-// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in
60
-// the current process.
61
-func fdRangeFrom(minFd int, fn fdFunc) error {
62
-	procSelfFd, closer := ProcThreadSelf("fd")
63
-	defer closer()
64
-
65
-	fdDir, err := os.Open(procSelfFd)
66
-	if err != nil {
67
-		return err
68
-	}
69
-	defer fdDir.Close()
70
-
71
-	if err := EnsureProcHandle(fdDir); err != nil {
72
-		return err
73
-	}
74
-
75
-	fdList, err := fdDir.Readdirnames(-1)
76
-	if err != nil {
77
-		return err
78
-	}
79
-	for _, fdStr := range fdList {
80
-		fd, err := strconv.Atoi(fdStr)
81
-		// Ignore non-numeric file names.
82
-		if err != nil {
83
-			continue
84
-		}
85
-		// Ignore descriptors lower than our specified minimum.
86
-		if fd < minFd {
87
-			continue
88
-		}
89
-		// Ignore the file descriptor we used for readdir, as it will be closed
90
-		// when we return.
91
-		if uintptr(fd) == fdDir.Fd() {
92
-			continue
93
-		}
94
-		// Run the closure.
95
-		fn(fd)
96
-	}
97
-	return nil
98
-}
99
-
100
-// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or
101
-// equal to minFd in the current process.
102
-func CloseExecFrom(minFd int) error {
103
-	// Use close_range(CLOSE_RANGE_CLOEXEC) if possible.
104
-	if haveCloseRangeCloexec() {
105
-		err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC)
106
-		return os.NewSyscallError("close_range", err)
107
-	}
108
-	// Otherwise, fall back to the standard loop.
109
-	return fdRangeFrom(minFd, unix.CloseOnExec)
110
-}
111
-
112
-//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor
113
-
114
-// In order to make sure we do not close the internal epoll descriptors the Go
115
-// runtime uses, we need to ensure that we skip descriptors that match
116
-// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing,
117
-// unfortunately there's no other way to be sure we're only keeping the file
118
-// descriptors the Go runtime needs. Hopefully nothing blows up doing this...
119
-func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive
120
-
121
-// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the
122
-// current process, except for those critical to Go's runtime (such as the
123
-// netpoll management descriptors).
124
-//
125
-// NOTE: This function is incredibly dangerous to use in most Go code, as
126
-// closing file descriptors from underneath *os.File handles can lead to very
127
-// bad behaviour (the closed file descriptor can be re-used and then any
128
-// *os.File operations would apply to the wrong file). This function is only
129
-// intended to be called from the last stage of runc init.
130
-func UnsafeCloseFrom(minFd int) error {
131
-	// We cannot use close_range(2) even if it is available, because we must
132
-	// not close some file descriptors.
133
-	return fdRangeFrom(minFd, func(fd int) {
134
-		if runtime_IsPollDescriptor(uintptr(fd)) {
135
-			// These are the Go runtime's internal netpoll file descriptors.
136
-			// These file descriptors are operated on deep in the Go scheduler,
137
-			// and closing those files from underneath Go can result in panics.
138
-			// There is no issue with keeping them because they are not
139
-			// executable and are not useful to an attacker anyway. Also we
140
-			// don't have any choice.
141
-			return
142
-		}
143
-		// There's nothing we can do about errors from close(2), and the
144
-		// only likely error to be seen is EBADF which indicates the fd was
145
-		// already closed (in which case, we got what we wanted).
146
-		_ = unix.Close(fd)
147
-	})
148
-}
149
-
150
-// NewSockPair returns a new SOCK_STREAM unix socket pair.
151
-func NewSockPair(name string) (parent, child *os.File, err error) {
152
-	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
153
-	if err != nil {
154
-		return nil, nil, err
155
-	}
156
-	return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
157
-}
158
-
159
-// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
160
-// corresponding to the unsafePath resolved within the root. Before passing the
161
-// fd, this path is verified to have been inside the root -- so operating on it
162
-// through the passed fdpath should be safe. Do not access this path through
163
-// the original path strings, and do not attempt to use the pathname outside of
164
-// the passed closure (the file handle will be freed once the closure returns).
165
-func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
166
-	// Remove the root then forcefully resolve inside the root.
167
-	unsafePath = stripRoot(root, unsafePath)
168
-	path, err := securejoin.SecureJoin(root, unsafePath)
169
-	if err != nil {
170
-		return fmt.Errorf("resolving path inside rootfs failed: %w", err)
171
-	}
172
-
173
-	procSelfFd, closer := ProcThreadSelf("fd/")
174
-	defer closer()
175
-
176
-	// Open the target path.
177
-	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
178
-	if err != nil {
179
-		return fmt.Errorf("open o_path procfd: %w", err)
180
-	}
181
-	defer fh.Close()
182
-
183
-	procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd())))
184
-	// Double-check the path is the one we expected.
185
-	if realpath, err := os.Readlink(procfd); err != nil {
186
-		return fmt.Errorf("procfd verification failed: %w", err)
187
-	} else if realpath != path {
188
-		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
189
-	}
190
-
191
-	return fn(procfd)
192
-}
193
-
194
-type ProcThreadSelfCloser func()
195
-
196
-var (
197
-	haveProcThreadSelf     bool
198
-	haveProcThreadSelfOnce sync.Once
199
-)
200
-
201
-// ProcThreadSelf returns a string that is equivalent to
202
-// /proc/thread-self/<subpath>, with a graceful fallback on older kernels where
203
-// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin,
204
-// meaning that the passed string needs to be trusted. The caller _must_ call
205
-// the returned ProcThreadSelfCloser function (which is runtime.UnlockOSThread)
206
-// *only once* after it has finished using the returned path string.
207
-func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) {
208
-	haveProcThreadSelfOnce.Do(func() {
209
-		if _, err := os.Stat("/proc/thread-self/"); err == nil {
210
-			haveProcThreadSelf = true
211
-		} else {
212
-			logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/<tid>", err)
213
-		}
214
-	})
215
-
216
-	// We need to lock our thread until the caller is done with the path string
217
-	// because any non-atomic operation on the path (such as opening a file,
218
-	// then reading it) could be interrupted by the Go runtime where the
219
-	// underlying thread is swapped out and the original thread is killed,
220
-	// resulting in pull-your-hair-out-hard-to-debug issues in the caller. In
221
-	// addition, the pre-3.17 fallback makes everything non-atomic because the
222
-	// same thing could happen between unix.Gettid() and the path operations.
223
-	//
224
-	// In theory, we don't need to lock in the atomic user case when using
225
-	// /proc/thread-self/, but it's better to be safe than sorry (and there are
226
-	// only one or two truly atomic users of /proc/thread-self/).
227
-	runtime.LockOSThread()
228
-
229
-	threadSelf := "/proc/thread-self/"
230
-	if !haveProcThreadSelf {
231
-		// Pre-3.17 kernels did not have /proc/thread-self, so do it manually.
232
-		threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/"
233
-		if _, err := os.Stat(threadSelf); err != nil {
234
-			// Unfortunately, this code is called from rootfs_linux.go where we
235
-			// are running inside the pid namespace of the container but /proc
236
-			// is the host's procfs. Unfortunately there is no real way to get
237
-			// the correct tid to use here (the kernel age means we cannot do
238
-			// things like set up a private fsopen("proc") -- even scanning
239
-			// NSpid in all of the tasks in /proc/self/task/*/status requires
240
-			// Linux 4.1).
241
-			//
242
-			// So, we just have to assume that /proc/self is acceptable in this
243
-			// one specific case.
244
-			if os.Getpid() == 1 {
245
-				logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err)
246
-			} else {
247
-				// This should never happen, but the fallback should work in most cases...
248
-				logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err)
249
-			}
250
-			threadSelf = "/proc/self/"
251
-		}
252
-	}
253
-	return threadSelf + subpath, runtime.UnlockOSThread
254
-}
255
-
256
-// ProcThreadSelfFd is a small wrapper around ProcThreadSelf to make it easier to
257
-// create a /proc/thread-self handle for given file descriptor.
258
-//
259
-// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but
260
-// without using fmt.Sprintf to avoid unneeded overhead.
261
-func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) {
262
-	return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10))
263
-}
264
-
265
-// IsLexicallyInRoot is shorthand for strings.HasPrefix(path+"/", root+"/"),
266
-// but properly handling the case where path or root are "/".
267
-//
268
-// NOTE: The return value only makes sense if the path doesn't contain "..".
269
-func IsLexicallyInRoot(root, path string) bool {
270
-	if root != "/" {
271
-		root += "/"
272
-	}
273
-	if path != "/" {
274
-		path += "/"
275
-	}
276
-	return strings.HasPrefix(path, root)
277
-}
278
-
279
-// MkdirAllInRootOpen attempts to make
280
-//
281
-//	path, _ := securejoin.SecureJoin(root, unsafePath)
282
-//	os.MkdirAll(path, mode)
283
-//	os.Open(path)
284
-//
285
-// safer against attacks where components in the path are changed between
286
-// SecureJoin returning and MkdirAll (or Open) being called. In particular, we
287
-// try to detect any symlink components in the path while we are doing the
288
-// MkdirAll.
289
-//
290
-// NOTE: If unsafePath is a subpath of root, we assume that you have already
291
-// called SecureJoin and so we use the provided path verbatim without resolving
292
-// any symlinks (this is done in a way that avoids symlink-exchange races).
293
-// This means that the path also must not contain ".." elements, otherwise an
294
-// error will occur.
295
-//
296
-// This uses securejoin.MkdirAllHandle under the hood, but it has special
297
-// handling if unsafePath has already been scoped within the rootfs (this is
298
-// needed for a lot of runc callers and fixing this would require reworking a
299
-// lot of path logic).
300
-func MkdirAllInRootOpen(root, unsafePath string, mode os.FileMode) (_ *os.File, Err error) {
301
-	// If the path is already "within" the root, get the path relative to the
302
-	// root and use that as the unsafe path. This is necessary because a lot of
303
-	// MkdirAllInRootOpen callers have already done SecureJoin, and refactoring
304
-	// all of them to stop using these SecureJoin'd paths would require a fair
305
-	// amount of work.
306
-	// TODO(cyphar): Do the refactor to libpathrs once it's ready.
307
-	if IsLexicallyInRoot(root, unsafePath) {
308
-		subPath, err := filepath.Rel(root, unsafePath)
309
-		if err != nil {
310
-			return nil, err
311
-		}
312
-		unsafePath = subPath
313
-	}
314
-
315
-	// Check for any silly mode bits.
316
-	if mode&^0o7777 != 0 {
317
-		return nil, fmt.Errorf("tried to include non-mode bits in MkdirAll mode: 0o%.3o", mode)
318
-	}
319
-	// Linux (and thus os.MkdirAll) silently ignores the suid and sgid bits if
320
-	// passed. While it would make sense to return an error in that case (since
321
-	// the user has asked for a mode that won't be applied), for compatibility
322
-	// reasons we have to ignore these bits.
323
-	if ignoredBits := mode &^ 0o1777; ignoredBits != 0 {
324
-		logrus.Warnf("MkdirAll called with no-op mode bits that are ignored by Linux: 0o%.3o", ignoredBits)
325
-		mode &= 0o1777
326
-	}
327
-
328
-	rootDir, err := os.OpenFile(root, unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
329
-	if err != nil {
330
-		return nil, fmt.Errorf("open root handle: %w", err)
331
-	}
332
-	defer rootDir.Close()
333
-
334
-	return securejoin.MkdirAllHandle(rootDir, unsafePath, mode)
335
-}
336
-
337
-// MkdirAllInRoot is a wrapper around MkdirAllInRootOpen which closes the
338
-// returned handle, for callers that don't need to use it.
339
-func MkdirAllInRoot(root, unsafePath string, mode os.FileMode) error {
340
-	f, err := MkdirAllInRootOpen(root, unsafePath, mode)
341
-	if err == nil {
342
-		_ = f.Close()
343
-	}
344
-	return err
345
-}
346
-
347
-// Openat is a Go-friendly openat(2) wrapper.
348
-func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) {
349
-	dirFd := unix.AT_FDCWD
350
-	if dir != nil {
351
-		dirFd = int(dir.Fd())
352
-	}
353
-	flags |= unix.O_CLOEXEC
354
-
355
-	fd, err := unix.Openat(dirFd, path, flags, mode)
356
-	if err != nil {
357
-		return nil, &os.PathError{Op: "openat", Path: path, Err: err}
358
-	}
359
-	return os.NewFile(uintptr(fd), dir.Name()+"/"+path), nil
360
-}
... ...
@@ -1041,6 +1041,10 @@ github.com/morikuni/aec
1041 1041
 # github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822
1042 1042
 ## explicit
1043 1043
 github.com/munnerz/goautoneg
1044
+# github.com/opencontainers/cgroups v0.0.1
1045
+## explicit; go 1.23.0
1046
+github.com/opencontainers/cgroups
1047
+github.com/opencontainers/cgroups/devices/config
1044 1048
 # github.com/opencontainers/go-digest v1.0.0
1045 1049
 ## explicit; go 1.13
1046 1050
 github.com/opencontainers/go-digest
... ...
@@ -1050,12 +1054,6 @@ github.com/opencontainers/go-digest/digestset
1050 1050
 github.com/opencontainers/image-spec/identity
1051 1051
 github.com/opencontainers/image-spec/specs-go
1052 1052
 github.com/opencontainers/image-spec/specs-go/v1
1053
-# github.com/opencontainers/runc v1.2.6
1054
-## explicit; go 1.22
1055
-github.com/opencontainers/runc/libcontainer/cgroups
1056
-github.com/opencontainers/runc/libcontainer/configs
1057
-github.com/opencontainers/runc/libcontainer/devices
1058
-github.com/opencontainers/runc/libcontainer/utils
1059 1053
 # github.com/opencontainers/runtime-spec v1.2.0
1060 1054
 ## explicit
1061 1055
 github.com/opencontainers/runtime-spec/specs-go