... | ... |
@@ -11,6 +11,8 @@ Makefile: ; |
11 | 11 |
|
12 | 12 |
include $(MAKEROOT)/makedefs.mk |
13 | 13 |
|
14 |
+export PATH := $(SRCROOT)/tools/bin:$(PATH) |
|
15 |
+ |
|
14 | 16 |
ifdef PHOTON_CACHE_PATH |
15 | 17 |
PHOTON_PACKAGES := packages-cached |
16 | 18 |
else |
... | ... |
@@ -29,6 +31,9 @@ else |
29 | 29 |
PHOTON_PUBLISH_RPMS := publish-rpms |
30 | 30 |
endif |
31 | 31 |
|
32 |
+TOOLS_BIN := $(SRCROOT)/tools/bin |
|
33 |
+CONTAIN := $(TOOLS_BIN)/contain |
|
34 |
+ |
|
32 | 35 |
.PHONY : all iso clean photon-build-machine photon-vagrant-build photon-vagrant-local \ |
33 | 36 |
check check-bison check-g++ check-gawk check-createrepo check-vagrant check-packer check-packer-ovf-plugin check-sanity \ |
34 | 37 |
clean-install clean-chroot |
... | ... |
@@ -59,7 +64,7 @@ iso: check $(PHOTON_STAGE) $(PHOTON_PACKAGES) |
59 | 59 |
-f > \ |
60 | 60 |
$(PHOTON_LOGS_DIR)/installer.log 2>&1 |
61 | 61 |
|
62 |
-packages: check $(PHOTON_PUBLISH_RPMS) $(PHOTON_SOURCES) |
|
62 |
+packages: check $(PHOTON_PUBLISH_RPMS) $(PHOTON_SOURCES) $(CONTAIN) |
|
63 | 63 |
@echo "Building all RPMS..." |
64 | 64 |
@cd $(PHOTON_PKG_BUILDER_DIR) && \ |
65 | 65 |
$(PHOTON_PACKAGE_BUILDER) -o full \ |
... | ... |
@@ -119,6 +124,8 @@ clean: clean-install clean-chroot |
119 | 119 |
@$(RMDIR) $(PHOTON_STAGE) |
120 | 120 |
@echo "Deleting chroot path..." |
121 | 121 |
@$(RMDIR) $(PHOTON_CHROOT_PATH) |
122 |
+ @echo "Deleting tools/bin..." |
|
123 |
+ @$(RMDIR) $(TOOLS_BIN) |
|
122 | 124 |
|
123 | 125 |
clean-install: |
124 | 126 |
@echo "Cleaning installer working directory..." |
... | ... |
@@ -203,7 +210,7 @@ endif |
203 | 203 |
check-packer-ovf-plugin: |
204 | 204 |
@[[ -e ~/.packer.d/plugins/packer-post-processor-vagrant-vmware-ovf ]] || { echo "Packer OVF post processor not installed. Aborting" >&2; exit 1; } |
205 | 205 |
|
206 |
-%: check $(PHOTON_PUBLISH_RPMS) $(PHOTON_SOURCES) |
|
206 |
+%: check $(PHOTON_PUBLISH_RPMS) $(PHOTON_SOURCES) $(CONTAIN) |
|
207 | 207 |
$(eval PKG_NAME = $@) |
208 | 208 |
@echo "Building package $(PKG_NAME) ..." |
209 | 209 |
@cd $(PHOTON_PKG_BUILDER_DIR) && \ |
... | ... |
@@ -214,3 +221,9 @@ check-packer-ovf-plugin: |
214 | 214 |
-x $(PHOTON_SRCS_DIR) \ |
215 | 215 |
-p $(PHOTON_PUBLISH_RPMS_DIR) \ |
216 | 216 |
-l $(PHOTON_LOGS_DIR) |
217 |
+ |
|
218 |
# Directory that receives the host-side helper binaries built by this
# Makefile (removed again by the `clean` target).
$(TOOLS_BIN):
	mkdir -p $@

# Compile the `contain` helper from every C source in tools/src/contain.
# $(TOOLS_BIN) is listed after `|` as an order-only prerequisite: the
# directory must exist before the compiler writes into it, but its timestamp
# (which changes whenever a file is created or removed inside it) must not
# by itself cause `contain` to be rebuilt.
$(CONTAIN): | $(TOOLS_BIN)
	gcc -O2 -std=gnu99 -Wall -Wextra $(SRCROOT)/tools/src/contain/*.c -o $@
217 | 223 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,19 @@ |
0 |
+Copyright (C) 2013 Chris Webb <chris@arachsys.com> |
|
1 |
+ |
|
2 |
+Permission is hereby granted, free of charge, to any person obtaining a copy |
|
3 |
+of this software and associated documentation files (the "Software"), to |
|
4 |
+deal in the Software without restriction, including without limitation the |
|
5 |
+rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
|
6 |
+sell copies of the Software, and to permit persons to whom the Software is |
|
7 |
+furnished to do so, subject to the following conditions: |
|
8 |
+ |
|
9 |
+The above copyright notice and this permission notice shall be included in |
|
10 |
+all copies or substantial portions of the Software. |
|
11 |
+ |
|
12 |
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|
13 |
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|
14 |
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
|
15 |
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|
16 |
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
|
17 |
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
|
18 |
+IN THE SOFTWARE. |
0 | 19 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,304 @@ |
0 |
+Containers |
|
1 |
+========== |
|
2 |
+ |
|
3 |
+This package is a simple implementation of containers for Linux, making |
|
4 |
+secure containers as easy to create and use as a traditional chroot. It |
|
5 |
+comprises three utilities, contain, inject and pseudo, which use the kernel |
|
6 |
+support for user namespaces merged in Linux 3.8. |
|
7 |
+ |
|
8 |
+ |
|
9 |
+Demonstration |
|
10 |
+------------- |
|
11 |
+ |
|
12 |
+With the utilities already installed, the demo begins in an unprivileged |
|
13 |
+user's shell: |
|
14 |
+ |
|
15 |
+ $ echo $$ $UID |
|
16 |
+ 21260 1000 |
|
17 |
+ |
|
18 |
+To create a simple test container, copy /bin and /lib* from the host into a |
|
19 |
+temporary directory with the default UID/GID mappings applied: |
|
20 |
+ |
|
21 |
+ $ cd $(mktemp -d) |
|
22 |
+ $ tar -c -f - -C / bin lib lib32 lib64 | pseudo tar -x -f - |
|
23 |
+ |
|
24 |
+It is very straightforward to launch a container with this newly-created |
|
25 |
+root filesystem: |
|
26 |
+ |
|
27 |
+ $ contain . /bin/bash |
|
28 |
+ # |
|
29 |
+ |
|
30 |
+The new shell has PID 1 within the container, and cannot see other processes |
|
31 |
+on the host: |
|
32 |
+ |
|
33 |
+ # echo $$ $UID |
|
34 |
+ 1 0 |
|
35 |
+ # ps ax |
|
36 |
+ PID TTY STAT TIME COMMAND |
|
37 |
+ 1 console Ss 0:00 /bin/bash |
|
38 |
+ 2 console R+ 0:00 ps ax |
|
39 |
+ |
|
40 |
+The container root user is able to manipulate ownerships and permissions |
|
41 |
+within its filesystem: |
|
42 |
+ |
|
43 |
+ # ls -l /dev/console |
|
44 |
+ crw--w---- 1 0 5 136, 9 Jul 1 14:00 /dev/console |
|
45 |
+ # chown 12:34 /dev/console |
|
46 |
+ # chmod a+rw /dev/console |
|
47 |
+ # ls -l /dev/console |
|
48 |
+ crw-rw-rw- 1 12 34 136, 9 Jul 1 14:00 /dev/console |
|
49 |
+ |
|
50 |
+and can also make other privileged changes such as setting the hostname: |
|
51 |
+ |
|
52 |
+ # echo -n "hostname $(hostname) -> " && hostname brian && hostname |
|
53 |
+ hostname alice -> brian |
|
54 |
+ |
|
55 |
+or configuring the network stack: |
|
56 |
+ |
|
57 |
+ # ip link show |
|
58 |
+ 1: lo: <LOOPBACK> mtu 65536 qdisc noop state DOWN mode DEFAULT |
|
59 |
+ link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 |
|
60 |
+ # ping -w 1 1.2.3.4 &>/dev/null && echo up || echo down |
|
61 |
+ down |
|
62 |
+ # ip addr add 1.2.3.4/32 dev lo && ip link set lo up |
|
63 |
+ # ping -w 1 1.2.3.4 &>/dev/null && echo up || echo down |
|
64 |
+ up |
|
65 |
+ # ip link add type veth && ip link show |
|
66 |
+ 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT |
|
67 |
+ link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 |
|
68 |
+ 2: veth0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT qlen 1000 |
|
69 |
+ link/ether 3a:0c:96:36:2d:ff brd ff:ff:ff:ff:ff:ff |
|
70 |
+ 3: veth1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT qlen 1000 |
|
71 |
+ link/ether a2:86:1a:92:58:cb brd ff:ff:ff:ff:ff:ff |
|
72 |
+ |
|
73 |
+In all cases, these changes affect the container but not the host as a |
|
74 |
+whole. Processes in the container live in different resource namespaces |
|
75 |
+isolated from the host, and the container root user is unable to do anything |
|
76 |
+that would require elevated capabilities or root privilege on the host |
|
77 |
+itself. |
|
78 |
+ |
|
79 |
+ |
|
80 |
+contain |
|
81 |
+------- |
|
82 |
+ |
|
83 |
+The contain utility is invoked as |
|
84 |
+ |
|
85 |
+ contain [OPTIONS] DIR [CMD [ARG]...] |
|
86 |
+ |
|
87 |
+with options |
|
88 |
+ |
|
89 |
+ -c disable console emulation in the container |
|
90 |
+ -g MAP set the container-to-host GID map |
|
91 |
+ -i CMD run a helper child inside the new namespaces |
|
92 |
+ -n share the host network unprivileged in the container |
|
93 |
+ -o CMD run a helper child outside the new namespaces |
|
94 |
+ -u MAP set the container-to-host UID map |
|
95 |
+ |
|
96 |
+and creates a new container with DIR recursively bound as its root |
|
97 |
+filesystem, running CMD as PID 1 within that container. If unspecified, CMD |
|
98 |
+defaults to /bin/sh to start a shell, so to fully boot a distribution, |
|
99 |
+specify CMD as /bin/init or /sbin/init. |
|
100 |
+ |
|
101 |
+The container init process is isolated in new user, mount, IPC, UTS, and PID |
|
102 |
+namespaces. A synthetic /dev with device nodes bound from the host /dev is |
|
103 |
+automatically mounted within the new mount namespace, together with standard |
|
104 |
+/dev/pts, /proc and /sys filesystems. |
|
105 |
+ |
|
106 |
+Because it runs in its own user namespace, users and groups seen inside a |
|
107 |
+container are not the same as the underlying credentials visible for the |
|
108 |
+same processes and files on the host. Sensible default container-to-host UID |
|
109 |
+and GID mappings are provided and described below, but the -u and -g options |
|
110 |
+can be used to override the defaults. |
|
111 |
+ |
|
112 |
+The container console is a host pseudo-terminal bound at /dev/console in the |
|
113 |
+new /dev filesystem: stdin and stdout are copied to/from this, and it serves |
|
114 |
+as stdin, stdout and stderr for the container init process. This console |
|
115 |
+emulation can be disabled using the -c option: if -c is used, init is run |
|
116 |
+directly with the stdin, stdout and stderr of the contain command. |
|
117 |
+ |
|
118 |
+Containers are usually isolated in their own network namespace, with a |
|
119 |
+distinct set of network interfaces from the host. By specifying the -n |
|
120 |
+option, it is possible to safely share the host network stack instead. If |
|
121 |
+you do this, user networking within the container will work normally, but |
|
122 |
+the container has no privileges with respect to its network namespace so it |
|
123 |
+isn't possible to (re)configure interfaces or routes, and setuid utilities |
|
124 |
+like ping which use a raw socket will fail. |
|
125 |
+ |
|
126 |
+Two different kinds of helper program can be used to help set up a |
|
127 |
+container. A program specified with -i is run inside the new namespaces with |
|
128 |
+the new root filesystem as its working directory, just before pivoting into |
|
129 |
+it. Typically this type of helper is used to bind-mount additional parts of |
|
130 |
+the host filesystem inside the container. |
|
131 |
+ |
|
132 |
+A helper specified with -o is run outside the namespaces but as a direct |
|
133 |
+child of the supervisor process which is running within them. This type of |
|
134 |
+helper can be used to move host network interfaces (such as a macvtap |
|
135 |
+interface or one half of a veth pair) into the container's network |
|
136 |
+namespace. |
|
137 |
+ |
|
138 |
+The environment of the container init process includes "container=contain" |
|
139 |
+so that distributions can identify when they are running under contain. |
|
140 |
+ |
|
141 |
+ |
|
142 |
+inject |
|
143 |
+------ |
|
144 |
+ |
|
145 |
+The inject utility is invoked as |
|
146 |
+ |
|
147 |
+ inject PID [CMD [ARG]...] |
|
148 |
+ |
|
149 |
+where PID is the process ID of a running container supervisor, and runs a |
|
150 |
+command or shell inside the existing container. The environment, stdin, |
|
151 |
+stdout and stderr of inject are all inherited by the command to be run. |
|
152 |
+ |
|
153 |
+The container supervisor PID (i.e. that of contain itself) should be given |
|
154 |
+to inject, not the PID of the descendant init process. The inject utility |
|
155 |
+will only work if the process specified has a child with "container=contain" |
|
156 |
+in its environment, which it assumes to be the container init. |
|
157 |
+ |
|
158 |
+Linux allows an unprivileged user to join the user namespace of any |
|
159 |
+container started by his UID, so inject need not be installed setuid even if |
|
160 |
+contain and pseudo are setuid root. It will refuse to run if it detects |
|
161 |
+setuid/setgid operation. |
|
162 |
+ |
|
163 |
+ |
|
164 |
+pseudo |
|
165 |
+------ |
|
166 |
+ |
|
167 |
+The pseudo utility is invoked as |
|
168 |
+ |
|
169 |
+ pseudo [OPTIONS] [CMD [ARG]...] |
|
170 |
+ |
|
171 |
+with options |
|
172 |
+ |
|
173 |
+ -g MAP set the user namespace GID map |
|
174 |
+ -u MAP set the user namespace UID map |
|
175 |
+ |
|
176 |
+and runs a command or shell as root in a new user namespace, by analogy with |
|
177 |
+sudo which runs a command as root in the host user namespace. |
|
178 |
+ |
|
179 |
+Unlike contain, pseudo does not unshare other namespaces or attempt to |
|
180 |
+isolate the new process from the rest of the host. It has identical default |
|
181 |
+UID/GID mappings, -u and -g options, and support for /etc/subuid and |
|
182 |
+/etc/subgid when installed setuid root, but no other contain options are |
|
183 |
+supported. |
|
184 |
+ |
|
185 |
+One use for pseudo is as a more capable replacement for fakeroot, useful for |
|
186 |
+testing, when building software packages or for constructing system images. |
|
187 |
+Unlike the traditional fakeroot approach based on LD_PRELOAD, static |
|
188 |
+binaries and chroot jails are both handled correctly. |
|
189 |
+ |
|
190 |
+It is also invaluable for running host software to access the same |
|
191 |
+filesystem as a container, replicating the user and group file ownerships |
|
192 |
+that the container would see. For example, in the demo above, the system |
|
193 |
+image is untarred under pseudo so that files are written into the filesystem |
|
194 |
+with UIDs and GIDs mapped for the container rather than unmapped as on the |
|
195 |
+host. |
|
196 |
+ |
|
197 |
+ |
|
198 |
+User and group mappings |
|
199 |
+----------------------- |
|
200 |
+ |
|
201 |
+By default, when run as root, contain and pseudo will map container UID/GID |
|
202 |
+0 onto the highest available host UID/GID (4294967294 unless nested), and |
|
203 |
+all other UIDs/GIDs are mapped onto themselves apart from the top container |
|
204 |
+UID and GID which must be left unmapped. |
|
205 |
+ |
|
206 |
+The default mappings avoid host UID and GID 0 as the host root user is still |
|
207 |
+granted a variety of privileges even after dropping all capabilities in the |
|
208 |
+host user namespace. For example, /proc and /sys files typically have (host) |
|
209 |
+root:root ownership, and allowing the container unfiltered access to |
|
210 |
+things like /proc/sys is dangerous. |
|
211 |
+ |
|
212 |
+Run as an unprivileged user, container UID/GID 0 is mapped onto the |
|
213 |
+unprivileged user's UID/GID, then container UIDs/GIDs 1, 2, etc. are |
|
214 |
+successively mapped onto any ranges delegated to that user in /etc/subuid |
|
215 |
+and /etc/subgid. |
|
216 |
+ |
|
217 |
+The -u and -g options can be used to specify custom mappings, in the format |
|
218 |
+START:LOWER:COUNT[,START:LOWER:COUNT]... where START is the first UID/GID in |
|
219 |
+a container range, LOWER is the first UID/GID in the corresponding range in |
|
220 |
+the host, and COUNT is the length of these ranges. |
|
221 |
+ |
|
222 |
+For example, -u 0:1000:1,1:4000:2000 will map container UID 0 onto host UID |
|
223 |
+1000 and container UIDs 1...2000 onto host UIDs 4000...5999. |
|
224 |
+ |
|
225 |
+It is not possible to map more than one container ID onto a given host ID, |
|
226 |
+nor to list the same container ID twice in a map specification. When invoked |
|
227 |
+by an unprivileged user, all host ranges are checked against /etc/subuid and |
|
228 |
+/etc/subgid. |
|
229 |
+ |
|
230 |
+Unmapped users and groups are mapped by the kernel onto the overflow UID and |
|
231 |
+GID set in /proc/sys/kernel/overflowuid and /proc/sys/kernel/overflowgid. By |
|
232 |
+default the kernel sets both these values to 65534. |
|
233 |
+ |
|
234 |
+ |
|
235 |
+Unprivileged operation, /etc/subuid and /etc/subgid |
|
236 |
+--------------------------------------------------- |
|
237 |
+ |
|
238 |
+When a non-root user runs contain or pseudo unprivileged, these tools can |
|
239 |
+only map container UID/GIDs onto the host UID/GID of that user. The |
|
240 |
+resulting container is not very useful as it has just a single user and |
|
241 |
+group available. (Typically only root is mapped in the container.) |
|
242 |
+ |
|
243 |
+However, contain and pseudo can also be installed setuid root, and in this |
|
244 |
+case, unprivileged users can also map onto ranges of UIDs/GIDs that have |
|
245 |
+been delegated for their use in /etc/subuid and /etc/subgid. |
|
246 |
+ |
|
247 |
+The format of these files is similar to /etc/passwd, /etc/group and |
|
248 |
+/etc/shadow. Each line specifies an additional range of UIDs/GIDs allocated |
|
249 |
+to a particular user, and there can be zero, one, or multiple lines for any |
|
250 |
+given user. There are three colon-delimited fields: the user's login name, |
|
251 |
+the first UID/GID in the range, and the number of UIDs/GIDs in the range. |
|
252 |
+For example, an /etc/subuid containing the lines |
|
253 |
+ |
|
254 |
+ chris:100000:10000 |
|
255 |
+ chris:120000:10000 |
|
256 |
+ |
|
257 |
+allocates UID ranges 100000-109999 and 120000-129999 to my user 'chris' in |
|
258 |
+addition to my normal login UID. |
|
259 |
+ |
|
260 |
+The kernel user namespace author Eric Biederman <ebiederm@xmission.com> has |
|
261 |
+proposed patches against the standard GNU/Linux Shadow package which add |
|
262 |
+support for creating and updating these files in this format; they are |
|
263 |
+likely to become a standard way to delegate sub-users and sub-groups. |
|
264 |
+ |
|
265 |
+Linux 3.19 and later do not allow unprivileged processes to write a GID map |
|
266 |
+unless the setgroups() call has been permanently disabled by writing "deny" |
|
267 |
+to /proc/PID/setgroups. This is a fix for CVE-2014-8989 which applied to |
|
268 |
+strangely-configured systems where group membership implies more restricted |
|
269 |
+permissions rather than supplementary permissions. |
|
270 |
+ |
|
271 |
+As a result, when run non-setuid by an unprivileged user, contain and pseudo |
|
272 |
+must disable setgroups() in the container. Conversely, when installed setuid |
|
273 |
+root, they will use their privilege to bypass this kernel restriction, |
|
274 |
+resulting in fully-functional containers which still support setgroups(). |
|
275 |
+However, this also means that they can be used to bypass restrictions |
|
276 |
+implemented by group membership. |
|
277 |
+ |
|
278 |
+ |
|
279 |
+Building and installing |
|
280 |
+----------------------- |
|
281 |
+ |
|
282 |
+Unpack the source tar.gz file and change to the unpacked directory. |
|
283 |
+ |
|
284 |
+Run 'make', then 'make install' as root to install both binaries setuid root |
|
285 |
+in /bin. Alternatively, you can set DESTDIR and/or BINDIR to install in a |
|
286 |
+different location, or strip and copy the compiled binaries into the correct |
|
287 |
+place manually. |
|
288 |
+ |
|
289 |
+Note that setuid contain and pseudo effectively enable unprivileged users to |
|
290 |
+drop supplementary group memberships using setgroups(). Consequently, |
|
291 |
+they should NOT be installed setuid root on systems where group membership |
|
292 |
+implies more restricted permissions rather than supplementary permissions. |
|
293 |
+ |
|
294 |
+These utilities were developed on GNU/Linux and are not portable to other |
|
295 |
+platforms as they rely on Linux-specific facilities such as namespaces. |
|
296 |
+Please report any problems or bugs to Chris Webb <chris@arachsys.com>. |
|
297 |
+ |
|
298 |
+ |
|
299 |
+Copying |
|
300 |
+------- |
|
301 |
+ |
|
302 |
+This software was written by Chris Webb <chris@arachsys.com> and is |
|
303 |
+distributed as Free Software under the terms of the MIT license in COPYING. |
0 | 304 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,93 @@ |
0 |
+Shutting down or killing a container |
|
1 |
+------------------------------------ |
|
2 |
+ |
|
3 |
+From the host, the inject utility can be used to run an appropriate command |
|
4 |
+within the container to start a graceful shut down. For example |
|
5 |
+ |
|
6 |
+ inject PID /bin/halt |
|
7 |
+ |
|
8 |
+To immediately kill a container and all its processes, it is sufficient to |
|
9 |
+send the init process a SIGKILL from the host using |
|
10 |
+ |
|
11 |
+ pkill -KILL -P PID |
|
12 |
+ |
|
13 |
+where PID is the process ID of a running container supervisor. It is very |
|
14 |
+important not to SIGKILL the container supervisor itself or the container |
|
15 |
+will be orphaned, continuing to run unsupervised as a child of the host |
|
16 |
+init. |
|
17 |
+ |
|
18 |
+ |
|
19 |
+Using cgroups to limit memory and CPU-share available to a container |
|
20 |
+-------------------------------------------------------------------- |
|
21 |
+ |
|
22 |
+If cgroup support including memcg and memcg-swap is compiled into the kernel |
|
23 |
+and the cgroup filesystem is mounted with the cpu and memory controllers |
|
24 |
+enabled, it is straightforward to apply memory and CPU-share limits to a |
|
25 |
+container as it is started. For example, the shell script |
|
26 |
+ |
|
27 |
+ #!/bin/sh -e |
|
28 |
+ mkdir /sys/fs/cgroup/mycontainer |
|
29 |
+ echo $$ >/sys/fs/cgroup/mycontainer/tasks |
|
30 |
+ echo 2G >/sys/fs/cgroup/mycontainer/memory.limit_in_bytes |
|
31 |
+ echo 2G >/sys/fs/cgroup/mycontainer/memory.memsw.limit_in_bytes |
|
32 |
+ echo 1000 >/sys/fs/cgroup/mycontainer/cpu.shares |
|
33 |
+ exec contain [...] |
|
34 |
+ |
|
35 |
+applies a limit of 2GB virtual memory and a CPU-share of 1000 before |
|
36 |
+starting the container. It might also be useful to apply a |
|
37 |
+memory.kmem.limit_in_bytes setting to prevent a container from using |
|
38 |
+excessive amounts of kernel memory. |
|
39 |
+ |
|
40 |
+Note that to set the virtual memory limit in memory.memsw.limit_in_bytes, it |
|
41 |
+is first necessary to set a smaller or equal physical memory limit in |
|
42 |
+memory.limit_in_bytes. |
|
43 |
+ |
|
44 |
+When a container lives inside a memory cgroup, memory.memsw.usage_in_bytes |
|
45 |
+gives a measure of the total virtual memory in use by the container, and |
|
46 |
+memory.usage_in_bytes measures its physical memory footprint. The accounting |
|
47 |
+policy is explained in linux/kernel/Documentation/cgroups/memory.txt. |
|
48 |
+ |
|
49 |
+ |
|
50 |
+Troubleshooting |
|
51 |
+--------------- |
|
52 |
+ |
|
53 |
+The contain/pseudo error message 'Failed to unshare user namespace: Invalid |
|
54 |
+argument' typically means that your kernel is not compiled with support for |
|
55 |
+user namespaces, i.e. CONFIG_USER_NS is not set. The contain tool will also |
|
56 |
+die with a similar message referring to one of the other required namespaces |
|
57 |
+if support for that is not available in the kernel. |
|
58 |
+ |
|
59 |
+To run these tools you need to be running Linux 3.8 or later with |
|
60 |
+ |
|
61 |
+ CONFIG_UTS_NS=y |
|
62 |
+ CONFIG_IPC_NS=y |
|
63 |
+ CONFIG_USER_NS=y |
|
64 |
+ CONFIG_PID_NS=y |
|
65 |
+ CONFIG_NET_NS=y |
|
66 |
+ |
|
67 |
+set in the kernel build config. Note that before Linux 3.12, CONFIG_XFS_FS |
|
68 |
+conflicted with CONFIG_USER_NS, so these tools could not be used where XFS |
|
69 |
+support was compiled either into the kernel or as a module. |
|
70 |
+ |
|
71 |
+The contain tool will fail to mount /dev/pts unless |
|
72 |
+ |
|
73 |
+ CONFIG_DEVPTS_MULTIPLE_INSTANCES=y |
|
74 |
+ |
|
75 |
+is set in the kernel build config. Both container and host /dev/pts must be |
|
76 |
+mounted with -o newinstance, with /dev/ptmx symlinked to pts/ptmx. |
|
77 |
+ |
|
78 |
+Linux 3.12 introduced tighter restrictions on mounting proc and sysfs, which |
|
79 |
+broke older versions of contain. To comply with these new rules, contain |
|
80 |
+now ensures that procfs and sysfs are mounted in the new mount namespace |
|
81 |
+before pivoting into the container and detaching the host root. |
|
82 |
+ |
|
83 |
+A bug in Linux 3.12 will prevent contain from mounting /proc in a container |
|
84 |
+if binfmt_misc is mounted on /proc/sys/fs/binfmt_misc in the host |
|
85 |
+filesystem. This was fixed in Linux 3.13. |
|
86 |
+ |
|
87 |
+Linux 3.19 introduced restrictions on writing a user namespace GID map as an |
|
88 |
+unprivileged user unless setgroups() has been permanently disabled, which |
|
89 |
+broke older versions of contain. Run non-setuid and unprivileged, contain |
|
90 |
+and pseudo must now disable setgroups() to create containers, but if they |
|
91 |
+are installed setuid, they will bypass this kernel restriction and leave |
|
92 |
+setgroups() enabled in the resulting containers. |
0 | 93 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,154 @@ |
0 |
+#define _GNU_SOURCE |
|
1 |
+#include <errno.h> |
|
2 |
+#include <error.h> |
|
3 |
+#include <fcntl.h> |
|
4 |
+#include <limits.h> |
|
5 |
+#include <poll.h> |
|
6 |
+#include <signal.h> |
|
7 |
+#include <stdlib.h> |
|
8 |
+#include <termios.h> |
|
9 |
+#include <unistd.h> |
|
10 |
+#include <sys/ioctl.h> |
|
11 |
+#include <sys/signalfd.h> |
|
12 |
+#include <sys/syscall.h> |
|
13 |
+#include <sys/types.h> |
|
14 |
+#include <sys/wait.h> |
|
15 |
+#include "contain.h" |
|
16 |
+ |
|
17 |
+static struct termios saved; |
|
18 |
+ |
|
19 |
/*
 * Allocate a pseudo-terminal to serve as the container console.
 *
 * Opens a new pty master read/write with O_NOCTTY, then grants and unlocks
 * the corresponding slave so that it can be opened afterwards.
 *
 * Returns the file descriptor of the pty master. Every failure is fatal:
 * a diagnostic including the errno description is printed and the process
 * exits with status 1.
 */
int getconsole(void) {
  int master;

  if ((master = posix_openpt(O_RDWR | O_NOCTTY)) < 0)
    error(1, errno, "Failed to allocate a console pseudo-terminal");
  /* A slave that was not granted/unlocked cannot be opened later, so
     report the problem here rather than ignoring the return values. */
  if (grantpt(master) < 0)
    error(1, errno, "Failed to grant access to the console pseudo-terminal");
  if (unlockpt(master) < 0)
    error(1, errno, "Failed to unlock the console pseudo-terminal");
  return master;
}
|
28 |
+ |
|
29 |
/*
 * Switch the terminal attached to stdin into raw mode.
 *
 * Does nothing when stdin is not a terminal. A tcgetattr() failure is
 * fatal (exit status 1); the result of tcsetattr() is not checked.
 */
static void rawmode() {
  struct termios raw;

  if (isatty(STDIN_FILENO)) {
    if (tcgetattr(STDIN_FILENO, &raw) < 0)
      error(1, errno, "tcgetattr");
    cfmakeraw(&raw);
    tcsetattr(STDIN_FILENO, TCSANOW, &raw);
  }
}
|
39 |
+ |
|
40 |
+static void restoremode() { |
|
41 |
+ if (isatty(STDIN_FILENO)) |
|
42 |
+ tcsetattr(STDIN_FILENO, TCSANOW, &saved); |
|
43 |
+} |
|
44 |
+ |
|
45 |
+static void savemode() { |
|
46 |
+ if (isatty(STDIN_FILENO) && tcgetattr(STDIN_FILENO, &saved) < 0) |
|
47 |
+ error(1, errno, "tcgetattr"); |
|
48 |
+} |
|
49 |
+ |
|
50 |
/*
 * Make the terminal device `name` the console of the calling process.
 *
 * Starts a new session, opens `name` read/write, requests it as the
 * controlling terminal, sets IGNBRK and IUTF8 in its input flags, and
 * duplicates it onto stdin, stdout and stderr. The extra descriptor is
 * closed unless it already is one of those three.
 *
 * name: path of the terminal device to open.
 *
 * Failing to open the device or to read its attributes is fatal (exit
 * status 1, with the errno description in the message). The results of
 * setsid(), ioctl(), tcsetattr() and dup2() are not checked.
 */
void setconsole(char *name) {
  int console;
  struct termios termios;

  setsid();

  if ((console = open(name, O_RDWR)) < 0)
    /* Pass errno so the message says why the open failed. */
    error(1, errno, "Failed to open console in container");
  ioctl(console, TIOCSCTTY, NULL);

  if (tcgetattr(console, &termios) < 0)
    error(1, errno, "tcgetattr");
  termios.c_iflag |= IGNBRK | IUTF8;
  tcsetattr(console, TCSANOW, &termios);

  dup2(console, STDIN_FILENO);
  dup2(console, STDOUT_FILENO);
  dup2(console, STDERR_FILENO);

  /* Keep the descriptor only if it is itself one of the standard streams. */
  if (console != STDIN_FILENO && console != STDOUT_FILENO
      && console != STDERR_FILENO)
    close(console);
}
|
73 |
+ |
|
74 |
+int supervise(pid_t child, int console) { |
|
75 |
+ char buffer[PIPE_BUF]; |
|
76 |
+ int signals, status; |
|
77 |
+ sigset_t mask; |
|
78 |
+ ssize_t count, length, offset; |
|
79 |
+ struct pollfd fds[3]; |
|
80 |
+ |
|
81 |
+ if (console < 0) { |
|
82 |
+ if (waitpid(child, &status, 0) < 0) |
|
83 |
+ error(1, errno, "waitpid"); |
|
84 |
+ return WIFEXITED(status) ? WEXITSTATUS(status) : EXIT_FAILURE; |
|
85 |
+ } |
|
86 |
+ |
|
87 |
+ sigemptyset(&mask); |
|
88 |
+ sigaddset(&mask, SIGCHLD); |
|
89 |
+ sigprocmask(SIG_BLOCK, &mask, NULL); |
|
90 |
+ if ((signals = signalfd(-1, &mask, 0)) < 0) |
|
91 |
+ error(1, errno, "signalfd"); |
|
92 |
+ |
|
93 |
+ if (waitpid(child, &status, WNOHANG) > 0) |
|
94 |
+ if (WIFEXITED(status) || WIFSIGNALED(status)) |
|
95 |
+ raise(SIGCHLD); |
|
96 |
+ |
|
97 |
+ savemode(); |
|
98 |
+ atexit(restoremode); |
|
99 |
+ rawmode(); |
|
100 |
+ |
|
101 |
+ fds[0].fd = console; |
|
102 |
+ fds[0].events = POLLIN; |
|
103 |
+ fds[1].fd = STDIN_FILENO; |
|
104 |
+ fds[1].events = POLLIN; |
|
105 |
+ fds[2].fd = signals; |
|
106 |
+ fds[2].events = POLLIN; |
|
107 |
+ |
|
108 |
+ while (1) { |
|
109 |
+ if (poll(fds, 3, -1) < 0) |
|
110 |
+ if (errno != EAGAIN && errno != EINTR) |
|
111 |
+ error(1, errno, "poll"); |
|
112 |
+ |
|
113 |
+ if (fds[0].revents & (POLLIN | POLLHUP)) { |
|
114 |
+ while ((length = read(console, buffer, sizeof(buffer))) < 0) |
|
115 |
+ if (errno != EAGAIN && errno != EINTR) |
|
116 |
+ error(1, errno, "read"); |
|
117 |
+ if (length > 0) { |
|
118 |
+ for (offset = 0; length > 0; offset += count, length -= count) |
|
119 |
+ while ((count = write(STDOUT_FILENO, buffer + offset, length)) < 0) |
|
120 |
+ if (errno != EAGAIN && errno != EINTR) |
|
121 |
+ error(1, errno, "write"); |
|
122 |
+ } else { |
|
123 |
+ fds[0].events = 0; |
|
124 |
+ } |
|
125 |
+ } |
|
126 |
+ |
|
127 |
+ if (fds[1].revents & (POLLIN | POLLHUP)) { |
|
128 |
+ while ((length = read(STDIN_FILENO, buffer, sizeof(buffer))) < 0) |
|
129 |
+ if (errno != EAGAIN && errno != EINTR) |
|
130 |
+ error(1, errno, "read"); |
|
131 |
+ if (length > 0) { |
|
132 |
+ for (offset = 0; length > 0; offset += count, length -= count) |
|
133 |
+ while ((count = write(console, buffer + offset, length)) < 0) |
|
134 |
+ if (errno != EAGAIN && errno != EINTR) |
|
135 |
+ error(1, errno, "write"); |
|
136 |
+ } else { |
|
137 |
+ fds[1].events = 0; |
|
138 |
+ } |
|
139 |
+ } |
|
140 |
+ |
|
141 |
+ if (fds[2].revents & POLLIN) { |
|
142 |
+ while (read(signals, buffer, sizeof(buffer)) < 0) |
|
143 |
+ if (errno != EAGAIN && errno != EINTR) |
|
144 |
+ error(1, errno, "read"); |
|
145 |
+ if (waitpid(child, &status, WNOHANG) > 0) |
|
146 |
+ if (WIFEXITED(status) || WIFSIGNALED(status)) |
|
147 |
+ break; |
|
148 |
+ } |
|
149 |
+ } |
|
150 |
+ |
|
151 |
+ close(signals); |
|
152 |
+ return WIFEXITED(status) ? WEXITSTATUS(status) : EXIT_FAILURE; |
|
153 |
+} |
0 | 154 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,137 @@ |
0 |
+#define _GNU_SOURCE |
|
1 |
+#include <errno.h> |
|
2 |
+#include <error.h> |
|
3 |
+#include <fcntl.h> |
|
4 |
+#include <grp.h> |
|
5 |
+#include <sched.h> |
|
6 |
+#include <signal.h> |
|
7 |
+#include <stdio.h> |
|
8 |
+#include <stdlib.h> |
|
9 |
+#include <string.h> |
|
10 |
+#include <sysexits.h> |
|
11 |
+#include <unistd.h> |
|
12 |
+#include <sys/syscall.h> |
|
13 |
+#include <sys/types.h> |
|
14 |
+#include "contain.h" |
|
15 |
+ |
|
16 |
/*
 * Print the command-line synopsis to stderr and terminate the process with
 * EX_USAGE.  progname is substituted into the "Usage:" line.  Never returns.
 */
void usage(char *progname) {
  fprintf(stderr,
      "Usage: %s [OPTIONS] DIR [CMD [ARG]...]\n"
      "Options:\n"
      " -b BND bind host path into container\n"
      " -c disable console emulation in the container\n"
      " -g MAP set the container-to-host GID map\n"
      " -i CMD run a helper child inside the new namespaces\n"
      " -n share the host network unprivileged in the container\n"
      " -o CMD run a helper child outside the new namespaces\n"
      " -u MAP set the container-to-host UID map\n"
      "BND is specified as HOST_DIR:CONTAINER_DIR[,HOST_DIR2:CONTAINER_DIR2]...\n"
      "GID and UID maps are specified as START:LOWER:COUNT[,START:LOWER:COUNT]...\n",
      progname);
  exit(EX_USAGE);
}
|
32 |
+ |
|
33 |
+int main(int argc, char **argv) { |
|
34 |
+ char *gidmap = NULL, *inside = NULL, *outside = NULL, *uidmap = NULL; |
|
35 |
+ char *bind = NULL; |
|
36 |
+ int hostnet = 0, master, option, stdio = 0; |
|
37 |
+ pid_t child, parent; |
|
38 |
+ |
|
39 |
+ while ((option = getopt(argc, argv, "+:b:cg:i:no:u:")) > 0) |
|
40 |
+ switch (option) { |
|
41 |
+ case 'b': |
|
42 |
+ bind = optarg; |
|
43 |
+ break; |
|
44 |
+ case 'c': |
|
45 |
+ stdio++; |
|
46 |
+ break; |
|
47 |
+ case 'g': |
|
48 |
+ gidmap = optarg; |
|
49 |
+ break; |
|
50 |
+ case 'i': |
|
51 |
+ inside = optarg; |
|
52 |
+ break; |
|
53 |
+ case 'n': |
|
54 |
+ hostnet++; |
|
55 |
+ break; |
|
56 |
+ case 'o': |
|
57 |
+ outside = optarg; |
|
58 |
+ break; |
|
59 |
+ case 'u': |
|
60 |
+ uidmap = optarg; |
|
61 |
+ break; |
|
62 |
+ default: |
|
63 |
+ usage(argv[0]); |
|
64 |
+ } |
|
65 |
+ |
|
66 |
+ if (argc <= optind) |
|
67 |
+ usage(argv[0]); |
|
68 |
+ |
|
69 |
+ parent = getpid(); |
|
70 |
+ switch (child = fork()) { |
|
71 |
+ case -1: |
|
72 |
+ error(1, errno, "fork"); |
|
73 |
+ case 0: |
|
74 |
+ raise(SIGSTOP); |
|
75 |
+// if (geteuid() != 0) |
|
76 |
+// denysetgroups(parent); |
|
77 |
+ writemap(parent, GID, gidmap); |
|
78 |
+ writemap(parent, UID, uidmap); |
|
79 |
+ |
|
80 |
+ if (outside) { |
|
81 |
+ if (setgid(getgid()) < 0 || setuid(getuid()) < 0) |
|
82 |
+ error(1, 0, "Failed to drop privileges"); |
|
83 |
+ execlp(SHELL, SHELL, "-c", outside, NULL); |
|
84 |
+ error(1, errno, "exec %s", outside); |
|
85 |
+ } |
|
86 |
+ |
|
87 |
+ exit(EXIT_SUCCESS); |
|
88 |
+ } |
|
89 |
+ |
|
90 |
+ if (setgid(getgid()) < 0 || setuid(getuid()) < 0) |
|
91 |
+ error(1, 0, "Failed to drop privileges"); |
|
92 |
+ |
|
93 |
+ if (unshare(CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWUTS) < 0) |
|
94 |
+ error(1, 0, "Failed to unshare namespaces"); |
|
95 |
+ |
|
96 |
+ if (!hostnet && unshare(CLONE_NEWNET) < 0) |
|
97 |
+ error(1, 0, "Failed to unshare network namespace"); |
|
98 |
+ |
|
99 |
+ waitforstop(child); |
|
100 |
+ kill(child, SIGCONT); |
|
101 |
+ waitforexit(child); |
|
102 |
+ |
|
103 |
+ setgid(0); |
|
104 |
+ setgroups(0, NULL); |
|
105 |
+ setuid(0); |
|
106 |
+ |
|
107 |
+ master = stdio ? -1 : getconsole(); |
|
108 |
+ createroot(argv[optind], master, inside, bind); |
|
109 |
+ |
|
110 |
+ unshare(CLONE_NEWPID); |
|
111 |
+ switch (child = fork()) { |
|
112 |
+ case -1: |
|
113 |
+ error(1, errno, "fork"); |
|
114 |
+ case 0: |
|
115 |
+ mountproc(); |
|
116 |
+ if (!hostnet) |
|
117 |
+ mountsys(); |
|
118 |
+ enterroot(); |
|
119 |
+ |
|
120 |
+ if (master >= 0) { |
|
121 |
+ close(master); |
|
122 |
+ setconsole("/dev/console"); |
|
123 |
+ } |
|
124 |
+ |
|
125 |
+ clearenv(); |
|
126 |
+ putenv("container=contain"); |
|
127 |
+ |
|
128 |
+ if (argv[optind + 1]) |
|
129 |
+ execv(argv[optind + 1], argv + optind + 1); |
|
130 |
+ else |
|
131 |
+ execl(SHELL, SHELL, NULL); |
|
132 |
+ error(1, errno, "exec"); |
|
133 |
+ } |
|
134 |
+ |
|
135 |
+ return supervise(child, master); |
|
136 |
+} |
0 | 137 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,29 @@ |
0 |
#ifndef CONTAIN_H
#define CONTAIN_H

/* Make the header self-contained: pid_t is used in the prototypes below and
 * getgid()/getuid() are used by the getid() macro. */
#include <sys/types.h>
#include <unistd.h>

/* Selectors for the `type` argument of the ID-map helpers and macros. */
#define GID 0
#define UID 1
/* Sentinel meaning "no ID seen yet". */
#define INVALID ((unsigned) -1)
/* Shell used to run helper commands and as the default contained command. */
#define SHELL "/bin/sh"

/* Real GID or UID of the calling process, depending on type. */
#define getid(type) ((unsigned) ((type) == GID ? getgid() : getuid()))
/* Name of the per-process map file under /proc/<pid>/. */
#define idfile(type) ((type) == GID ? "gid_map" : "uid_map")
/* Human-readable label used in diagnostics. */
#define idname(type) ((type) == GID ? "GID" : "UID")
/* Path of the subordinate-ID delegation file. */
#define subpath(type) ((type) == GID ? "/etc/subgid" : "/etc/subuid")

extern char *append(char **destination, const char *format, ...);
extern void createroot(char *src, int console, char *helper, char *bind);
extern void denysetgroups(pid_t pid);
extern void enterroot(void);
extern int getconsole(void);
extern void mountproc(void);
extern void mountsys(void);
extern void setconsole(char *name);
extern char *string(const char *format, ...);
extern int supervise(pid_t child, int console);
extern char *tmpdir(void);
extern void waitforstop(pid_t child);
extern void waitforexit(pid_t child);
extern void writemap(pid_t pid, int type, char *map);

#endif
0 | 29 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,222 @@ |
0 |
+#define _GNU_SOURCE |
|
1 |
+#include <errno.h> |
|
2 |
+#include <error.h> |
|
3 |
+#include <grp.h> |
|
4 |
+#include <fcntl.h> |
|
5 |
+#include <pwd.h> |
|
6 |
+#include <sched.h> |
|
7 |
+#include <stdio.h> |
|
8 |
+#include <stdlib.h> |
|
9 |
+#include <string.h> |
|
10 |
+#include <unistd.h> |
|
11 |
+#include "contain.h" |
|
12 |
+ |
|
13 |
/*
 * Write "deny" to /proc/<pid>/setgroups.
 * Exits through error() if the file cannot be opened or fully written.
 */
void denysetgroups(pid_t pid) {
  static const char verdict[] = "deny";
  const size_t wanted = sizeof(verdict) - 1;
  char *procfile = string("/proc/%d/setgroups", pid);
  int handle = open(procfile, O_WRONLY);

  if (handle < 0)
    error(1, 0, "Failed to disable setgroups() in container");
  if (write(handle, verdict, wanted) != (ssize_t) wanted)
    error(1, 0, "Failed to disable setgroups() in container");

  close(handle);
  free(procfile);
}
|
25 |
+ |
|
26 |
/*
 * Read the GID or UID map of a process from /proc and return it as a newly
 * allocated "FIRST:LOWER:COUNT[,FIRST:LOWER:COUNT]..." string.
 *
 * pid == -1 selects /proc/self.  Exits through error() if the file cannot be
 * opened, a line does not hold three unsigned numbers, or the file is empty.
 */
static char *getmap(pid_t pid, int type) {
  char *buffer = NULL, *joined = NULL, *procfile;
  size_t capacity;
  unsigned base, target, span;
  FILE *stream;

  procfile = pid == -1
    ? string("/proc/self/%s", idfile(type))
    : string("/proc/%d/%s", pid, idfile(type));

  stream = fopen(procfile, "r");
  if (stream == NULL)
    error(1, 0, "Cannot read %s", procfile);

  for (;;) {
    if (getline(&buffer, &capacity, stream) < 0)
      break;
    if (sscanf(buffer, " %u %u %u", &base, &target, &span) != 3)
      error(1, 0, "Invalid map data in %s", procfile);
    /* Entries after the first are comma-separated. */
    append(&joined, "%s%u:%u:%u", joined ? "," : "", base, target, span);
  }

  if (joined == NULL)
    error(1, 0, "Invalid map data in %s", procfile);

  fclose(stream);
  free(buffer);
  free(procfile);
  return joined;
}
|
53 |
+ |
|
54 |
/*
 * Parse the next "FIRST:LOWER:COUNT" entry of an ID map string.
 *
 * Leading ',' and ';' separators are skipped.  On success the three numbers
 * are stored through first/lower/count and a pointer just past the parsed
 * entry is returned.  Returns NULL when map is NULL or exhausted; exits
 * through error() when the entry is malformed.
 */
static char *mapitem(char *map, unsigned *first, unsigned *lower,
    unsigned *count) {
  ssize_t consumed;

  if (map == NULL)
    return NULL;

  map += strspn(map, ",;");
  if (*map == '\0')
    return NULL;

  if (sscanf(map, "%u:%u:%u%zn", first, lower, count, &consumed) < 3)
    error(1, 0, "Invalid ID map '%s'", map);

  return map + consumed;
}
|
66 |
+ |
|
67 |
/*
 * Parse the next "START:LENGTH" entry of an ID range string.
 *
 * Leading ',' and ';' separators are skipped.  On success the two numbers are
 * stored through start/length and a pointer just past the parsed entry is
 * returned.  Returns NULL when range is NULL or exhausted; exits through
 * error() when the entry is malformed.
 */
static char *rangeitem(char *range, unsigned *start, unsigned *length) {
  ssize_t consumed;

  if (range == NULL)
    return NULL;

  range += strspn(range, ",;");
  if (*range == '\0')
    return NULL;

  if (sscanf(range, "%u:%u%zn", start, length, &consumed) < 2)
    error(1, 0, "Invalid ID range '%s'", range);

  return range + consumed;
}
|
78 |
+ |
|
79 |
/*
 * Build the list of host IDs the calling user may map onto, as a newly
 * allocated "START:LENGTH[,START:LENGTH]..." string.
 *
 * The list always begins with the caller's own ID (length 1).  If the
 * subordinate-ID file for this type can be opened, every well-formed
 * "user:START:LENGTH" line belonging to the caller is appended.
 *
 * The user name comes from $USER, $LOGNAME or getlogin(), and is only trusted
 * if it resolves to the real UID; otherwise the passwd entry for the real UID
 * is used.  Exits through error() if no name can be established.
 */
static char *readranges(int type) {
  char *line = NULL, *range, *user;
  size_t namelen, size = 0;
  ssize_t end;
  struct passwd *passwd;
  unsigned length, start;
  FILE *file;

  range = string("%u:1", getid(type));
  if (!(file = fopen(subpath(type), "r")))
    return range;

  user = getenv("USER");
  user = user ? user : getenv("LOGNAME");
  user = user ? user : getlogin();
  if (!user || !(passwd = getpwnam(user)) || passwd->pw_uid != getuid()) {
    if (!(passwd = getpwuid(getuid())))
      error(1, 0, "Failed to validate your username");
    user = passwd->pw_name;
  }
  endpwent();

  namelen = strlen(user);
  while (getline(&line, &size, file) >= 0) {
    if (strncmp(line, user, namelen))
      continue;
    /* Requiring ':' straight after the name rejects longer names that merely
     * share this prefix. */
    if (sscanf(line + namelen, ":%u:%u%zn", &start, &length, &end) < 2)
      continue;
    /* The character directly after the parsed fields must end the entry.
     * strchr() also matches the terminating NUL, so a final line without a
     * newline is accepted too. */
    if (strchr(":\n", line[namelen + (size_t) end]))
      append(&range, ",%u:%u", start, length);
  }

  free(line);
  fclose(file);
  return range;
}
|
113 |
+ |
|
114 |
+static char *rootdefault(int type) { |
|
115 |
+ char *cursor, *map, *result; |
|
116 |
+ unsigned count, first, last = INVALID, lower; |
|
117 |
+ |
|
118 |
+ cursor = map = getmap(-1, type); |
|
119 |
+ while ((cursor = mapitem(cursor, &first, &lower, &count))) |
|
120 |
+ if (last == INVALID || last < first + count - 1) |
|
121 |
+ last = first + count - 1; |
|
122 |
+ result = string("0:%u:1", last); |
|
123 |
+ |
|
124 |
+ cursor = map; |
|
125 |
+ while ((cursor = mapitem(cursor, &first, &lower, &count))) { |
|
126 |
+ if (first == 0) { |
|
127 |
+ if (count == 1 && first >= last) |
|
128 |
+ error(1, 0, "No unprivileged %s available\n", idname(type)); |
|
129 |
+ first++, lower++, count--; |
|
130 |
+ } |
|
131 |
+ |
|
132 |
+ if (last <= first + count - 1 && count > 0) |
|
133 |
+ count--; |
|
134 |
+ |
|
135 |
+ if (count > 0) |
|
136 |
+ append(&result, "%s%u:%u:%u", result ? "," : "", first, first, count); |
|
137 |
+ } |
|
138 |
+ |
|
139 |
+ free(map); |
|
140 |
+ return result; |
|
141 |
+} |
|
142 |
+ |
|
143 |
/*
 * Build the default ID map used when the real UID is not 0 and no map was
 * given.
 *
 * Without effective root the result is the single entry "0:<own id>:1".
 * Otherwise each range returned by readranges() is intersected with the
 * current map of this process and the overlaps are assigned consecutive
 * container IDs starting at 0.
 *
 * Returns a newly allocated map string, or NULL when no range overlaps the
 * current map.
 */
static char *userdefault(int type) {
  char *cursor, *map, *range, *ranges, *result = NULL;
  unsigned count, first, index = 0, length, lower, start;

  if (geteuid() != 0)
    return string("0:%u:1", getid(type));

  map = getmap(-1, type);
  /* Keep the head of the list: rangeitem() advances its cursor to NULL. */
  ranges = readranges(type);

  for (range = ranges; (range = rangeitem(range, &start, &length)); ) {
    cursor = map;
    while ((cursor = mapitem(cursor, &first, &lower, &count))) {
      /* Skip map entries that do not overlap [start, start + length). */
      if (start + length <= first || first + count <= start)
        continue;
      /* Clamp the end of the range to the end of the map entry. */
      if (first + count < start + length)
        length = first + count - start;
      /* Clamp the beginning of the range to the beginning of the map entry. */
      if (start < first) {
        index += first - start;
        length -= first - start;
        start = first;
      }
      append(&result, "%s%u:%u:%u", result ? "," : "", index, start, length);
      index += length;
      /* NOTE(review): start/length stay clamped for the remaining map
       * entries, as in the original code. */
    }
  }

  free(map);
  free(ranges);
  return result;
}
|
174 |
+ |
|
175 |
/*
 * Check that the host IDs [first, first + count) are covered by the ranges in
 * `range`; exit through error() otherwise.
 *
 * At the first overlapping range, the parts of the interval before and after
 * that range are checked recursively against the ranges that follow it.
 */
static void validate(char *range, unsigned first, unsigned count) {
  unsigned length, start;

  for (;;) {
    unsigned limit, stop;

    range = rangeitem(range, &start, &length);
    if (range == NULL)
      break;

    stop = start + length;
    limit = first + count;
    if (!(first < stop) || !(start < limit))
      continue;

    if (first < start)
      validate(range, first, start - first);
    if (limit > stop)
      validate(range, stop, limit - stop);
    return;
  }

  error(1, 0, "Cannot map onto IDs that are not delegated to you");
}
|
188 |
+ |
|
189 |
/*
 * Run validate() on the host side (LOWER, COUNT) of every entry in `map`,
 * checking it against the permitted ranges in `range`.
 */
static void verifymap(char *map, char *range) {
  unsigned count, first, lower;
  char *cursor;

  for (cursor = mapitem(map, &first, &lower, &count); cursor != NULL;
      cursor = mapitem(cursor, &first, &lower, &count))
    validate(range, lower, count);
}
|
195 |
+ |
|
196 |
+void writemap(pid_t pid, int type, char *map) { |
|
197 |
+ char *path, *range, *text = NULL; |
|
198 |
+ int fd; |
|
199 |
+ unsigned count, first, lower; |
|
200 |
+ |
|
201 |
+ if (!map) { |
|
202 |
+ map = (getuid() == 0 ? rootdefault : userdefault)(type); |
|
203 |
+ } else if (getuid() != 0) { |
|
204 |
+ range = readranges(type); |
|
205 |
+ verifymap(map, range); |
|
206 |
+ free(range); |
|
207 |
+ } |
|
208 |
+ |
|
209 |
+ while ((map = mapitem(map, &first, &lower, &count))) |
|
210 |
+ append(&text, "%u %u %u\n", first, lower, count); |
|
211 |
+ |
|
212 |
+ path = string("/proc/%d/%s", pid, idfile(type)); |
|
213 |
+ if ((fd = open(path, O_WRONLY)) < 0) |
|
214 |
+ error(1, 0, "Failed to set container %s map", idname(type)); |
|
215 |
+ else if (write(fd, text, strlen(text)) != (ssize_t) strlen(text)) |
|
216 |
+ error(1, 0, "Failed to set container %s map", idname(type)); |
|
217 |
+ |
|
218 |
+ close(fd); |
|
219 |
+ free(path); |
|
220 |
+ free(text); |
|
221 |
+} |
0 | 222 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,143 @@ |
0 |
+#define _GNU_SOURCE |
|
1 |
+#include <errno.h> |
|
2 |
+#include <error.h> |
|
3 |
+#include <fcntl.h> |
|
4 |
+#include <stdlib.h> |
|
5 |
+#include <unistd.h> |
|
6 |
+#include <sys/mount.h> |
|
7 |
+#include <sys/stat.h> |
|
8 |
+#include <sys/syscall.h> |
|
9 |
+#include <sys/types.h> |
|
10 |
+#include <string.h> |
|
11 |
+#include "contain.h" |
|
12 |
+ |
|
13 |
+static char *root; |
|
14 |
+ |
|
15 |
/*
 * Bind-mount src onto dst, creating dst first if it does not exist.
 *
 * The mount point is created as a directory when src is a directory and as
 * an empty regular file otherwise.  Creation is best effort (dst may already
 * exist); a failing mount() is fatal and exits through error().
 */
static void bindnode(char *src, char *dst) {
  struct stat info;
  int fd;

  if (stat(src, &info) == 0 && S_ISDIR(info.st_mode)) {
    /* A directory can only be bound onto a directory. */
    mkdir(dst, 0755);
  } else if ((fd = open(dst, O_WRONLY | O_CREAT, 0600)) >= 0) {
    close(fd);
  }

  if (mount(src, dst, NULL, MS_BIND, NULL) < 0)
    error(1, 0, "Failed to bind '%s' into '%s'", src, dst);
}
|
23 |
+ |
|
24 |
+void cleanup(void) { |
|
25 |
+ if (root) { |
|
26 |
+ umount2(root, MNT_DETACH); |
|
27 |
+ rmdir(root); |
|
28 |
+ } |
|
29 |
+} |
|
30 |
+ |
|
31 |
/*
 * Split the next "SRC:DST" pair off a bind specification, in place.
 *
 * Leading ',' and ';' separators are skipped.  The ':' between SRC and DST
 * and the separator after DST (if any) are overwritten with NUL, *s and *d
 * are pointed at the two resulting strings, and a pointer to the remainder
 * of the specification is returned.  Returns NULL when b is NULL or
 * exhausted; exits through error() when the ':' is missing or DST contains
 * a second ':'.
 */
static char *binditem(char *b, char **s, char **d) {
  char *orig = b;

  if (b == NULL)
    return NULL;

  b += strspn(b, ",;");
  if (*b == '\0')
    return NULL;

  /* Source part: everything up to the first ':'. */
  *s = b;
  b += strcspn(b, ":");
  if (*b != ':')
    error(1, 0, "Invalid bind format '%s'", orig);
  *b++ = '\0';

  /* Destination part: everything up to the next separator. */
  *d = b;
  b += strcspn(b, ",;:");
  if (*b == ':')
    error(1, 0, "Invalid bind format '%s'", orig);
  if (*b != '\0')
    *b++ = '\0';

  return b;
}
|
53 |
+ |
|
54 |
+void createroot(char *src, int console, char *helper, char *bind) { |
|
55 |
+ mode_t mask; |
|
56 |
+ pid_t child; |
|
57 |
+ char *bindsrc = NULL, *binddst = NULL; |
|
58 |
+ |
|
59 |
+ root = tmpdir(); |
|
60 |
+ atexit(cleanup); |
|
61 |
+ |
|
62 |
+ if (mount(src, root, NULL, MS_BIND | MS_REC, NULL) < 0) |
|
63 |
+ error(1, 0, "Failed to bind new root filesystem"); |
|
64 |
+ else if (chdir(root) < 0) |
|
65 |
+ error(1, 0, "Failed to enter new root filesystem"); |
|
66 |
+ |
|
67 |
+ mask = umask(0); |
|
68 |
+ mkdir("dev" , 0755); |
|
69 |
+ if (mount("tmpfs", "dev", "tmpfs", 0, "mode=0755") < 0) |
|
70 |
+ error(1, 0, "Failed to mount /dev tmpfs in new root filesystem"); |
|
71 |
+ |
|
72 |
+ mkdir("dev/pts", 0755); |
|
73 |
+ if (mount("devpts", "dev/pts", "devpts", 0, "newinstance,ptmxmode=666") < 0) |
|
74 |
+ error(1, 0, "Failed to mount /dev/pts in new root filesystem"); |
|
75 |
+ |
|
76 |
+ mkdir("dev/tmp", 0755); |
|
77 |
+ umask(mask); |
|
78 |
+ |
|
79 |
+ if (console >= 0) |
|
80 |
+ bindnode(ptsname(console), "dev/console"); |
|
81 |
+ bindnode("/dev/full", "dev/full"); |
|
82 |
+ bindnode("/dev/null", "dev/null"); |
|
83 |
+ bindnode("/dev/random", "dev/random"); |
|
84 |
+ bindnode("/dev/tty", "dev/tty"); |
|
85 |
+ bindnode("/dev/urandom", "dev/urandom"); |
|
86 |
+ bindnode("/dev/zero", "dev/zero"); |
|
87 |
+ symlink("pts/ptmx", "dev/ptmx"); |
|
88 |
+ |
|
89 |
+ while ((bind = binditem(bind, &bindsrc, &binddst))) |
|
90 |
+ bindnode(bindsrc, binddst); |
|
91 |
+ |
|
92 |
+ if (helper) |
|
93 |
+ switch (child = fork()) { |
|
94 |
+ case -1: |
|
95 |
+ error(1, errno, "fork"); |
|
96 |
+ case 0: |
|
97 |
+ execlp(SHELL, SHELL, "-c", helper, NULL); |
|
98 |
+ error(1, errno, "exec %s", helper); |
|
99 |
+ default: |
|
100 |
+ waitforexit(child); |
|
101 |
+ } |
|
102 |
+} |
|
103 |
+ |
|
104 |
+void enterroot(void) { |
|
105 |
+ if (syscall(__NR_pivot_root, ".", "dev/tmp") < 0) |
|
106 |
+ error(1, 0, "Failed to pivot into new root filesystem"); |
|
107 |
+ |
|
108 |
+ if (chdir("/dev/tmp") >= 0) { |
|
109 |
+ while (*root == '/') |
|
110 |
+ root++; |
|
111 |
+ rmdir(root); |
|
112 |
+ } |
|
113 |
+ |
|
114 |
+ root = NULL; |
|
115 |
+ |
|
116 |
+ if (chdir("/") < 0 || umount2("/dev/tmp", MNT_DETACH) < 0) |
|
117 |
+ error(1, 0, "Failed to detach old root filesystem"); |
|
118 |
+ else |
|
119 |
+ rmdir("/dev/tmp"); |
|
120 |
+} |
|
121 |
+ |
|
122 |
+void mountproc(void) { |
|
123 |
+ mode_t mask; |
|
124 |
+ |
|
125 |
+ mask = umask(0); |
|
126 |
+ mkdir("proc" , 0755); |
|
127 |
+ umask(mask); |
|
128 |
+ |
|
129 |
+ if (mount("proc", "proc", "proc", 0, NULL) < 0) |
|
130 |
+ error(1, 0, "Failed to mount /proc in new root filesystem"); |
|
131 |
+} |
|
132 |
+ |
|
133 |
+void mountsys(void) { |
|
134 |
+ mode_t mask; |
|
135 |
+ |
|
136 |
+ mask = umask(0); |
|
137 |
+ mkdir("sys" , 0755); |
|
138 |
+ umask(mask); |
|
139 |
+ |
|
140 |
+ if (mount("sysfs", "sys", "sysfs", 0, NULL) < 0) |
|
141 |
+ error(1, 0, "Failed to mount /sys in new root filesystem"); |
|
142 |
+} |
0 | 143 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,71 @@ |
0 |
+#define _GNU_SOURCE |
|
1 |
+#include <errno.h> |
|
2 |
+#include <error.h> |
|
3 |
+#include <stdarg.h> |
|
4 |
+#include <stdio.h> |
|
5 |
+#include <stdlib.h> |
|
6 |
+#include <string.h> |
|
7 |
+#include <sys/types.h> |
|
8 |
+#include <sys/wait.h> |
|
9 |
+#include "contain.h" |
|
10 |
+ |
|
11 |
/*
 * Append printf-style formatted text to the heap string *destination.
 *
 * When *destination is NULL the formatted text becomes the new string;
 * otherwise a new string holding the old contents followed by the formatted
 * text replaces it and the old one is freed.  Returns the new value of
 * *destination.  Exits through error() on allocation failure.
 */
char *append(char **destination, const char *format, ...) {
  char *addition, *combined;
  va_list ap;

  va_start(ap, format);
  if (vasprintf(&addition, format, ap) < 0)
    error(1, errno, "asprintf");
  va_end(ap);

  if (*destination != NULL) {
    if (asprintf(&combined, "%s%s", *destination, addition) < 0)
      error(1, errno, "asprintf");
    free(*destination);
    free(addition);
  } else {
    combined = addition;
  }

  *destination = combined;
  return combined;
}
|
32 |
+ |
|
33 |
/*
 * Return a newly allocated string built from a printf-style format.
 * The caller owns the result.  Exits through error() on allocation failure.
 */
char *string(const char *format, ...) {
  char *text;
  va_list ap;
  int status;

  va_start(ap, format);
  status = vasprintf(&text, format, ap);
  if (status < 0)
    error(1, errno, "asprintf");
  va_end(ap);

  return text;
}
|
43 |
+ |
|
44 |
/*
 * Create a uniquely named directory from the template "/tmp/XXXXXX" and
 * return its path as a heap string owned by the caller.
 * Exits through error() if the template cannot be copied or the directory
 * cannot be created.
 */
char *tmpdir(void) {
  char *template = strdup("/tmp/XXXXXX");

  if (template == NULL)
    error(1, errno, "strdup");
  if (mkdtemp(template) == NULL)
    error(1, 0, "Failed to create temporary directory");

  return template;
}
|
53 |
+ |
|
54 |
/*
 * Wait for `child` to terminate and return only if it exited with status 0.
 *
 * If the child exited with a non-zero status, this process exits with the
 * same status.  If the child did not exit normally (for example it was killed
 * by a signal), this process exits with EXIT_FAILURE: WEXITSTATUS() is only
 * meaningful when WIFEXITED() is true, and would otherwise read as 0.
 * A failing waitpid() exits through error().
 */
void waitforexit(pid_t child) {
  int status;

  if (waitpid(child, &status, 0) < 0)
    error(1, errno, "waitpid");
  else if (!WIFEXITED(status))
    exit(EXIT_FAILURE);
  else if (WEXITSTATUS(status) != EXIT_SUCCESS)
    exit(WEXITSTATUS(status));
}
|
62 |
+ |
|
63 |
/*
 * Wait until `child` has stopped (waitpid() with WUNTRACED) and return.
 *
 * If the child terminated instead of stopping, this process exits with the
 * child's exit status, or with EXIT_FAILURE when the child did not exit
 * normally: WEXITSTATUS() is only meaningful when WIFEXITED() is true.
 * A failing waitpid() exits through error().
 */
void waitforstop(pid_t child) {
  int status;

  if (waitpid(child, &status, WUNTRACED) < 0)
    error(1, errno, "waitpid");
  if (!WIFSTOPPED(status))
    exit(WIFEXITED(status) ? WEXITSTATUS(status) : EXIT_FAILURE);
}