Browse code

Merge pull request #44210 from corhere/chrootarchive-without-reexec

Fix 'docker cp' mount table explosion, take four

Brian Goff authored on 2022/11/12 03:47:09
Showing 22 changed files
1 1
deleted file mode 100644
... ...
@@ -1,83 +0,0 @@
1
-package container // import "github.com/docker/docker/container"
2
-
3
-import (
4
-	"os"
5
-	"path/filepath"
6
-
7
-	"github.com/docker/docker/api/types"
8
-	"github.com/docker/docker/pkg/archive"
9
-	"github.com/docker/docker/pkg/system"
10
-	"github.com/pkg/errors"
11
-)
12
-
13
-// ResolvePath resolves the given path in the container to a resource on the
14
-// host. Returns a resolved path (absolute path to the resource on the host),
15
-// the absolute path to the resource relative to the container's rootfs, and
16
-// an error if the path points to outside the container's rootfs.
17
-func (container *Container) ResolvePath(path string) (resolvedPath, absPath string, err error) {
18
-	if container.BaseFS == "" {
19
-		return "", "", errors.New("ResolvePath: BaseFS of container " + container.ID + " is unexpectedly empty")
20
-	}
21
-	// Check if a drive letter supplied, it must be the system drive. No-op except on Windows
22
-	path, err = system.CheckSystemDriveAndRemoveDriveLetter(path)
23
-	if err != nil {
24
-		return "", "", err
25
-	}
26
-
27
-	// Consider the given path as an absolute path in the container.
28
-	absPath = archive.PreserveTrailingDotOrSeparator(filepath.Join(string(filepath.Separator), path), path)
29
-
30
-	// Split the absPath into its Directory and Base components. We will
31
-	// resolve the dir in the scope of the container then append the base.
32
-	dirPath, basePath := filepath.Split(absPath)
33
-
34
-	resolvedDirPath, err := container.GetResourcePath(dirPath)
35
-	if err != nil {
36
-		return "", "", err
37
-	}
38
-
39
-	// resolvedDirPath will have been cleaned (no trailing path separators) so
40
-	// we can manually join it with the base path element.
41
-	resolvedPath = resolvedDirPath + string(filepath.Separator) + basePath
42
-	return resolvedPath, absPath, nil
43
-}
44
-
45
-// StatPath is the unexported version of StatPath. Locks and mounts should
46
-// be acquired before calling this method and the given path should be fully
47
-// resolved to a path on the host corresponding to the given absolute path
48
-// inside the container.
49
-func (container *Container) StatPath(resolvedPath, absPath string) (stat *types.ContainerPathStat, err error) {
50
-	if container.BaseFS == "" {
51
-		return nil, errors.New("StatPath: BaseFS of container " + container.ID + " is unexpectedly empty")
52
-	}
53
-
54
-	lstat, err := os.Lstat(resolvedPath)
55
-	if err != nil {
56
-		return nil, err
57
-	}
58
-
59
-	var linkTarget string
60
-	if lstat.Mode()&os.ModeSymlink != 0 {
61
-		// Fully evaluate the symlink in the scope of the container rootfs.
62
-		hostPath, err := container.GetResourcePath(absPath)
63
-		if err != nil {
64
-			return nil, err
65
-		}
66
-
67
-		linkTarget, err = filepath.Rel(container.BaseFS, hostPath)
68
-		if err != nil {
69
-			return nil, err
70
-		}
71
-
72
-		// Make it an absolute path.
73
-		linkTarget = filepath.Join(string(filepath.Separator), linkTarget)
74
-	}
75
-
76
-	return &types.ContainerPathStat{
77
-		Name:       filepath.Base(absPath),
78
-		Size:       lstat.Size(),
79
-		Mode:       lstat.Mode(),
80
-		Mtime:      lstat.ModTime(),
81
-		LinkTarget: linkTarget,
82
-	}, nil
83
-}
84 1
new file mode 100644
... ...
@@ -0,0 +1,83 @@
0
+package container // import "github.com/docker/docker/container"
1
+
2
+import (
3
+	"os"
4
+	"path/filepath"
5
+
6
+	"github.com/docker/docker/api/types"
7
+	"github.com/docker/docker/pkg/archive"
8
+	"github.com/docker/docker/pkg/system"
9
+	"github.com/pkg/errors"
10
+)
11
+
12
+// ResolvePath resolves the given path in the container to a resource on the
13
+// host. Returns a resolved path (absolute path to the resource on the host),
14
+// the absolute path to the resource relative to the container's rootfs, and
15
+// an error if the path points to outside the container's rootfs.
16
+func (container *Container) ResolvePath(path string) (resolvedPath, absPath string, err error) {
17
+	if container.BaseFS == "" {
18
+		return "", "", errors.New("ResolvePath: BaseFS of container " + container.ID + " is unexpectedly empty")
19
+	}
20
+	// If a drive letter is supplied, it must be the system drive.
21
+	path, err = system.CheckSystemDriveAndRemoveDriveLetter(path)
22
+	if err != nil {
23
+		return "", "", err
24
+	}
25
+
26
+	// Consider the given path as an absolute path in the container.
27
+	absPath = archive.PreserveTrailingDotOrSeparator(filepath.Join(string(filepath.Separator), path), path)
28
+
29
+	// Split the absPath into its Directory and Base components. We will
30
+	// resolve the dir in the scope of the container then append the base.
31
+	dirPath, basePath := filepath.Split(absPath)
32
+
33
+	resolvedDirPath, err := container.GetResourcePath(dirPath)
34
+	if err != nil {
35
+		return "", "", err
36
+	}
37
+
38
+	// resolvedDirPath will have been cleaned (no trailing path separators) so
39
+	// we can manually join it with the base path element.
40
+	resolvedPath = resolvedDirPath + string(filepath.Separator) + basePath
41
+	return resolvedPath, absPath, nil
42
+}
43
+
44
+// StatPath returns stat info about the resource at resolvedPath. Locks and mounts should
45
+// be acquired before calling this method and the given path should be fully
46
+// resolved to a path on the host corresponding to the given absolute path
47
+// inside the container.
48
+func (container *Container) StatPath(resolvedPath, absPath string) (stat *types.ContainerPathStat, err error) {
49
+	if container.BaseFS == "" {
50
+		return nil, errors.New("StatPath: BaseFS of container " + container.ID + " is unexpectedly empty")
51
+	}
52
+
53
+	lstat, err := os.Lstat(resolvedPath)
54
+	if err != nil {
55
+		return nil, err
56
+	}
57
+
58
+	var linkTarget string
59
+	if lstat.Mode()&os.ModeSymlink != 0 {
60
+		// Fully evaluate the symlink in the scope of the container rootfs.
61
+		hostPath, err := container.GetResourcePath(absPath)
62
+		if err != nil {
63
+			return nil, err
64
+		}
65
+
66
+		linkTarget, err = filepath.Rel(container.BaseFS, hostPath)
67
+		if err != nil {
68
+			return nil, err
69
+		}
70
+
71
+		// Make it an absolute path.
72
+		linkTarget = filepath.Join(string(filepath.Separator), linkTarget)
73
+	}
74
+
75
+	return &types.ContainerPathStat{
76
+		Name:       filepath.Base(absPath),
77
+		Size:       lstat.Size(),
78
+		Mode:       lstat.Mode(),
79
+		Mtime:      lstat.ModTime(),
80
+		LinkTarget: linkTarget,
81
+	}, nil
82
+}
... ...
@@ -3,17 +3,9 @@ package daemon // import "github.com/docker/docker/daemon"
3 3
 import (
4 4
 	"io"
5 5
 	"os"
6
-	"path/filepath"
7
-	"strings"
8 6
 
9 7
 	"github.com/docker/docker/api/types"
10
-	"github.com/docker/docker/container"
11 8
 	"github.com/docker/docker/errdefs"
12
-	"github.com/docker/docker/pkg/archive"
13
-	"github.com/docker/docker/pkg/chrootarchive"
14
-	"github.com/docker/docker/pkg/ioutils"
15
-	"github.com/docker/docker/pkg/system"
16
-	"github.com/pkg/errors"
17 9
 )
18 10
 
19 11
 // ContainerCopy performs a deprecated operation of archiving the resource at
... ...
@@ -24,11 +16,6 @@ func (daemon *Daemon) ContainerCopy(name string, res string) (io.ReadCloser, err
24 24
 		return nil, err
25 25
 	}
26 26
 
27
-	// Make sure an online file-system operation is permitted.
28
-	if err := daemon.isOnlineFSOperationPermitted(ctr); err != nil {
29
-		return nil, errdefs.System(err)
30
-	}
31
-
32 27
 	data, err := daemon.containerCopy(ctr, res)
33 28
 	if err == nil {
34 29
 		return data, nil
... ...
@@ -48,11 +35,6 @@ func (daemon *Daemon) ContainerStatPath(name string, path string) (stat *types.C
48 48
 		return nil, err
49 49
 	}
50 50
 
51
-	// Make sure an online file-system operation is permitted.
52
-	if err := daemon.isOnlineFSOperationPermitted(ctr); err != nil {
53
-		return nil, errdefs.System(err)
54
-	}
55
-
56 51
 	stat, err = daemon.containerStatPath(ctr, path)
57 52
 	if err == nil {
58 53
 		return stat, nil
... ...
@@ -73,11 +55,6 @@ func (daemon *Daemon) ContainerArchivePath(name string, path string) (content io
73 73
 		return nil, nil, err
74 74
 	}
75 75
 
76
-	// Make sure an online file-system operation is permitted.
77
-	if err := daemon.isOnlineFSOperationPermitted(ctr); err != nil {
78
-		return nil, nil, errdefs.System(err)
79
-	}
80
-
81 76
 	content, stat, err = daemon.containerArchivePath(ctr, path)
82 77
 	if err == nil {
83 78
 		return content, stat, nil
... ...
@@ -101,11 +78,6 @@ func (daemon *Daemon) ContainerExtractToDir(name, path string, copyUIDGID, noOve
101 101
 		return err
102 102
 	}
103 103
 
104
-	// Make sure an online file-system operation is permitted.
105
-	if err := daemon.isOnlineFSOperationPermitted(ctr); err != nil {
106
-		return errdefs.System(err)
107
-	}
108
-
109 104
 	err = daemon.containerExtractToDir(ctr, path, copyUIDGID, noOverwriteDirNonDir, content)
110 105
 	if err == nil {
111 106
 		return nil
... ...
@@ -116,299 +88,3 @@ func (daemon *Daemon) ContainerExtractToDir(name, path string, copyUIDGID, noOve
116 116
 	}
117 117
 	return errdefs.System(err)
118 118
 }
119
-
120
-// containerStatPath stats the filesystem resource at the specified path in this
121
-// container. Returns stat info about the resource.
122
-func (daemon *Daemon) containerStatPath(container *container.Container, path string) (stat *types.ContainerPathStat, err error) {
123
-	container.Lock()
124
-	defer container.Unlock()
125
-
126
-	if err = daemon.Mount(container); err != nil {
127
-		return nil, err
128
-	}
129
-	defer daemon.Unmount(container)
130
-
131
-	err = daemon.mountVolumes(container)
132
-	defer container.DetachAndUnmount(daemon.LogVolumeEvent)
133
-	if err != nil {
134
-		return nil, err
135
-	}
136
-
137
-	// Normalize path before sending to rootfs
138
-	path = filepath.FromSlash(path)
139
-
140
-	resolvedPath, absPath, err := container.ResolvePath(path)
141
-	if err != nil {
142
-		return nil, err
143
-	}
144
-
145
-	return container.StatPath(resolvedPath, absPath)
146
-}
147
-
148
-// containerArchivePath creates an archive of the filesystem resource at the specified
149
-// path in this container. Returns a tar archive of the resource and stat info
150
-// about the resource.
151
-func (daemon *Daemon) containerArchivePath(container *container.Container, path string) (content io.ReadCloser, stat *types.ContainerPathStat, err error) {
152
-	container.Lock()
153
-
154
-	defer func() {
155
-		if err != nil {
156
-			// Wait to unlock the container until the archive is fully read
157
-			// (see the ReadCloseWrapper func below) or if there is an error
158
-			// before that occurs.
159
-			container.Unlock()
160
-		}
161
-	}()
162
-
163
-	if err = daemon.Mount(container); err != nil {
164
-		return nil, nil, err
165
-	}
166
-
167
-	defer func() {
168
-		if err != nil {
169
-			// unmount any volumes
170
-			container.DetachAndUnmount(daemon.LogVolumeEvent)
171
-			// unmount the container's rootfs
172
-			daemon.Unmount(container)
173
-		}
174
-	}()
175
-
176
-	if err = daemon.mountVolumes(container); err != nil {
177
-		return nil, nil, err
178
-	}
179
-
180
-	// Normalize path before sending to rootfs
181
-	path = filepath.FromSlash(path)
182
-
183
-	resolvedPath, absPath, err := container.ResolvePath(path)
184
-	if err != nil {
185
-		return nil, nil, err
186
-	}
187
-
188
-	stat, err = container.StatPath(resolvedPath, absPath)
189
-	if err != nil {
190
-		return nil, nil, err
191
-	}
192
-
193
-	// We need to rebase the archive entries if the last element of the
194
-	// resolved path was a symlink that was evaluated and is now different
195
-	// than the requested path. For example, if the given path was "/foo/bar/",
196
-	// but it resolved to "/var/lib/docker/containers/{id}/foo/baz/", we want
197
-	// to ensure that the archive entries start with "bar" and not "baz". This
198
-	// also catches the case when the root directory of the container is
199
-	// requested: we want the archive entries to start with "/" and not the
200
-	// container ID.
201
-
202
-	// Get the source and the base paths of the container resolved path in order
203
-	// to get the proper tar options for the rebase tar.
204
-	resolvedPath = filepath.Clean(resolvedPath)
205
-	if filepath.Base(resolvedPath) == "." {
206
-		resolvedPath += string(filepath.Separator) + "."
207
-	}
208
-
209
-	sourceDir := resolvedPath
210
-	sourceBase := "."
211
-
212
-	if stat.Mode&os.ModeDir == 0 { // not dir
213
-		sourceDir, sourceBase = filepath.Split(resolvedPath)
214
-	}
215
-	opts := archive.TarResourceRebaseOpts(sourceBase, filepath.Base(absPath))
216
-
217
-	data, err := chrootarchive.Tar(sourceDir, opts, container.BaseFS)
218
-	if err != nil {
219
-		return nil, nil, err
220
-	}
221
-
222
-	content = ioutils.NewReadCloserWrapper(data, func() error {
223
-		err := data.Close()
224
-		container.DetachAndUnmount(daemon.LogVolumeEvent)
225
-		daemon.Unmount(container)
226
-		container.Unlock()
227
-		return err
228
-	})
229
-
230
-	daemon.LogContainerEvent(container, "archive-path")
231
-
232
-	return content, stat, nil
233
-}
234
-
235
-// containerExtractToDir extracts the given tar archive to the specified location in the
236
-// filesystem of this container. The given path must be of a directory in the
237
-// container. If it is not, the error will be an errdefs.InvalidParameter. If
238
-// noOverwriteDirNonDir is true then it will be an error if unpacking the
239
-// given content would cause an existing directory to be replaced with a non-
240
-// directory and vice versa.
241
-func (daemon *Daemon) containerExtractToDir(container *container.Container, path string, copyUIDGID, noOverwriteDirNonDir bool, content io.Reader) (err error) {
242
-	container.Lock()
243
-	defer container.Unlock()
244
-
245
-	if err = daemon.Mount(container); err != nil {
246
-		return err
247
-	}
248
-	defer daemon.Unmount(container)
249
-
250
-	err = daemon.mountVolumes(container)
251
-	defer container.DetachAndUnmount(daemon.LogVolumeEvent)
252
-	if err != nil {
253
-		return err
254
-	}
255
-
256
-	// Normalize path before sending to rootfs'
257
-	path = filepath.FromSlash(path)
258
-
259
-	// Check if a drive letter supplied, it must be the system drive. No-op except on Windows
260
-	path, err = system.CheckSystemDriveAndRemoveDriveLetter(path)
261
-	if err != nil {
262
-		return err
263
-	}
264
-
265
-	// The destination path needs to be resolved to a host path, with all
266
-	// symbolic links followed in the scope of the container's rootfs. Note
267
-	// that we do not use `container.ResolvePath(path)` here because we need
268
-	// to also evaluate the last path element if it is a symlink. This is so
269
-	// that you can extract an archive to a symlink that points to a directory.
270
-
271
-	// Consider the given path as an absolute path in the container.
272
-	absPath := archive.PreserveTrailingDotOrSeparator(filepath.Join(string(filepath.Separator), path), path)
273
-
274
-	// This will evaluate the last path element if it is a symlink.
275
-	resolvedPath, err := container.GetResourcePath(absPath)
276
-	if err != nil {
277
-		return err
278
-	}
279
-
280
-	stat, err := os.Lstat(resolvedPath)
281
-	if err != nil {
282
-		return err
283
-	}
284
-
285
-	if !stat.IsDir() {
286
-		return errdefs.InvalidParameter(errors.New("extraction point is not a directory"))
287
-	}
288
-
289
-	// Need to check if the path is in a volume. If it is, it cannot be in a
290
-	// read-only volume. If it is not in a volume, the container cannot be
291
-	// configured with a read-only rootfs.
292
-
293
-	// Use the resolved path relative to the container rootfs as the new
294
-	// absPath. This way we fully follow any symlinks in a volume that may
295
-	// lead back outside the volume.
296
-	//
297
-	// The Windows implementation of filepath.Rel in golang 1.4 does not
298
-	// support volume style file path semantics. On Windows when using the
299
-	// filter driver, we are guaranteed that the path will always be
300
-	// a volume file path.
301
-	var baseRel string
302
-	if strings.HasPrefix(resolvedPath, `\\?\Volume{`) {
303
-		if strings.HasPrefix(resolvedPath, container.BaseFS) {
304
-			baseRel = resolvedPath[len(container.BaseFS):]
305
-			if baseRel[:1] == `\` {
306
-				baseRel = baseRel[1:]
307
-			}
308
-		}
309
-	} else {
310
-		baseRel, err = filepath.Rel(container.BaseFS, resolvedPath)
311
-	}
312
-	if err != nil {
313
-		return err
314
-	}
315
-	// Make it an absolute path.
316
-	absPath = filepath.Join(string(filepath.Separator), baseRel)
317
-
318
-	toVolume, err := checkIfPathIsInAVolume(container, absPath)
319
-	if err != nil {
320
-		return err
321
-	}
322
-
323
-	if !toVolume && container.HostConfig.ReadonlyRootfs {
324
-		return errdefs.InvalidParameter(errors.New("container rootfs is marked read-only"))
325
-	}
326
-
327
-	options := daemon.defaultTarCopyOptions(noOverwriteDirNonDir)
328
-
329
-	if copyUIDGID {
330
-		var err error
331
-		// tarCopyOptions will appropriately pull in the right uid/gid for the
332
-		// user/group and will set the options.
333
-		options, err = daemon.tarCopyOptions(container, noOverwriteDirNonDir)
334
-		if err != nil {
335
-			return err
336
-		}
337
-	}
338
-
339
-	if err := chrootarchive.UntarWithRoot(content, resolvedPath, options, container.BaseFS); err != nil {
340
-		return err
341
-	}
342
-
343
-	daemon.LogContainerEvent(container, "extract-to-dir")
344
-
345
-	return nil
346
-}
347
-
348
-func (daemon *Daemon) containerCopy(container *container.Container, resource string) (rc io.ReadCloser, err error) {
349
-	if resource[0] == '/' || resource[0] == '\\' {
350
-		resource = resource[1:]
351
-	}
352
-	container.Lock()
353
-
354
-	defer func() {
355
-		if err != nil {
356
-			// Wait to unlock the container until the archive is fully read
357
-			// (see the ReadCloseWrapper func below) or if there is an error
358
-			// before that occurs.
359
-			container.Unlock()
360
-		}
361
-	}()
362
-
363
-	if err := daemon.Mount(container); err != nil {
364
-		return nil, err
365
-	}
366
-
367
-	defer func() {
368
-		if err != nil {
369
-			// unmount any volumes
370
-			container.DetachAndUnmount(daemon.LogVolumeEvent)
371
-			// unmount the container's rootfs
372
-			daemon.Unmount(container)
373
-		}
374
-	}()
375
-
376
-	if err := daemon.mountVolumes(container); err != nil {
377
-		return nil, err
378
-	}
379
-
380
-	// Normalize path before sending to rootfs
381
-	resource = filepath.FromSlash(resource)
382
-
383
-	basePath, err := container.GetResourcePath(resource)
384
-	if err != nil {
385
-		return nil, err
386
-	}
387
-	stat, err := os.Stat(basePath)
388
-	if err != nil {
389
-		return nil, err
390
-	}
391
-	var filter []string
392
-	if !stat.IsDir() {
393
-		d, f := filepath.Split(basePath)
394
-		basePath = d
395
-		filter = []string{f}
396
-	}
397
-	archv, err := chrootarchive.Tar(basePath, &archive.TarOptions{
398
-		Compression:  archive.Uncompressed,
399
-		IncludeFiles: filter,
400
-	}, container.BaseFS)
401
-	if err != nil {
402
-		return nil, err
403
-	}
404
-
405
-	reader := ioutils.NewReadCloserWrapper(archv, func() error {
406
-		err := archv.Close()
407
-		container.DetachAndUnmount(daemon.LogVolumeEvent)
408
-		daemon.Unmount(container)
409
-		container.Unlock()
410
-		return err
411
-	})
412
-	daemon.LogContainerEvent(container, "copy")
413
-	return reader, nil
414
-}
... ...
@@ -4,12 +4,212 @@
4 4
 package daemon // import "github.com/docker/docker/daemon"
5 5
 
6 6
 import (
7
+	"context"
8
+	"io"
9
+	"os"
10
+	"path/filepath"
11
+
12
+	"github.com/docker/docker/api/types"
7 13
 	"github.com/docker/docker/container"
8 14
 	"github.com/docker/docker/errdefs"
15
+	"github.com/docker/docker/pkg/archive"
16
+	"github.com/docker/docker/pkg/ioutils"
9 17
 	volumemounts "github.com/docker/docker/volume/mounts"
10 18
 	"github.com/pkg/errors"
11 19
 )
12 20
 
21
+// containerStatPath stats the filesystem resource at the specified path in this
22
+// container. Returns stat info about the resource.
23
+func (daemon *Daemon) containerStatPath(container *container.Container, path string) (stat *types.ContainerPathStat, err error) {
24
+	container.Lock()
25
+	defer container.Unlock()
26
+
27
+	cfs, err := daemon.openContainerFS(container)
28
+	if err != nil {
29
+		return nil, err
30
+	}
31
+	defer cfs.Close()
32
+
33
+	return cfs.Stat(context.TODO(), path)
34
+}
35
+
36
+// containerArchivePath creates an archive of the filesystem resource at the specified
37
+// path in this container. Returns a tar archive of the resource and stat info
38
+// about the resource.
39
+func (daemon *Daemon) containerArchivePath(container *container.Container, path string) (content io.ReadCloser, stat *types.ContainerPathStat, err error) {
40
+	container.Lock()
41
+
42
+	defer func() {
43
+		if err != nil {
44
+			// Wait to unlock the container until the archive is fully read
45
+			// (see the ReadCloseWrapper func below) or if there is an error
46
+			// before that occurs.
47
+			container.Unlock()
48
+		}
49
+	}()
50
+
51
+	cfs, err := daemon.openContainerFS(container)
52
+	if err != nil {
53
+		return nil, nil, err
54
+	}
55
+
56
+	defer func() {
57
+		if err != nil {
58
+			cfs.Close()
59
+		}
60
+	}()
61
+
62
+	absPath := archive.PreserveTrailingDotOrSeparator(filepath.Join("/", path), path)
63
+
64
+	stat, err = cfs.Stat(context.TODO(), absPath)
65
+	if err != nil {
66
+		return nil, nil, err
67
+	}
68
+
69
+	sourceDir, sourceBase := absPath, "."
70
+	if stat.Mode&os.ModeDir == 0 { // not dir
71
+		sourceDir, sourceBase = filepath.Split(absPath)
72
+	}
73
+	opts := archive.TarResourceRebaseOpts(sourceBase, filepath.Base(absPath))
74
+
75
+	tb, err := archive.NewTarballer(sourceDir, opts)
76
+	if err != nil {
77
+		return nil, nil, err
78
+	}
79
+
80
+	cfs.GoInFS(context.TODO(), tb.Do)
81
+	data := tb.Reader()
82
+	content = ioutils.NewReadCloserWrapper(data, func() error {
83
+		err := data.Close()
84
+		_ = cfs.Close()
85
+		container.Unlock()
86
+		return err
87
+	})
88
+
89
+	daemon.LogContainerEvent(container, "archive-path")
90
+
91
+	return content, stat, nil
92
+}
93
+
94
+// containerExtractToDir extracts the given tar archive to the specified location in the
95
+// filesystem of this container. The given path must be of a directory in the
96
+// container. If it is not, the error will be an errdefs.InvalidParameter. If
97
+// noOverwriteDirNonDir is true then it will be an error if unpacking the
98
+// given content would cause an existing directory to be replaced with a non-
99
+// directory and vice versa.
100
+func (daemon *Daemon) containerExtractToDir(container *container.Container, path string, copyUIDGID, noOverwriteDirNonDir bool, content io.Reader) (err error) {
101
+	container.Lock()
102
+	defer container.Unlock()
103
+
104
+	cfs, err := daemon.openContainerFS(container)
105
+	if err != nil {
106
+		return err
107
+	}
108
+	defer cfs.Close()
109
+
110
+	err = cfs.RunInFS(context.TODO(), func() error {
111
+		// The destination path needs to be resolved with all symbolic links
112
+		// followed. Note that we need to also evaluate the last path element if
113
+		// it is a symlink. This is so that you can extract an archive to a
114
+		// symlink that points to a directory.
115
+		absPath, err := filepath.EvalSymlinks(filepath.Join("/", path))
116
+		if err != nil {
117
+			return err
118
+		}
119
+		absPath = archive.PreserveTrailingDotOrSeparator(absPath, path)
120
+
121
+		stat, err := os.Lstat(absPath)
122
+		if err != nil {
123
+			return err
124
+		}
125
+		if !stat.IsDir() {
126
+			return errdefs.InvalidParameter(errors.New("extraction point is not a directory"))
127
+		}
128
+
129
+		// Need to check if the path is in a volume. If it is, it cannot be in a
130
+		// read-only volume. If it is not in a volume, the container cannot be
131
+		// configured with a read-only rootfs.
132
+		toVolume, err := checkIfPathIsInAVolume(container, absPath)
133
+		if err != nil {
134
+			return err
135
+		}
136
+
137
+		if !toVolume && container.HostConfig.ReadonlyRootfs {
138
+			return errdefs.InvalidParameter(errors.New("container rootfs is marked read-only"))
139
+		}
140
+
141
+		options := daemon.defaultTarCopyOptions(noOverwriteDirNonDir)
142
+
143
+		if copyUIDGID {
144
+			var err error
145
+			// tarCopyOptions will appropriately pull in the right uid/gid for the
146
+			// user/group and will set the options.
147
+			options, err = daemon.tarCopyOptions(container, noOverwriteDirNonDir)
148
+			if err != nil {
149
+				return err
150
+			}
151
+		}
152
+
153
+		return archive.Untar(content, absPath, options)
154
+	})
155
+	if err != nil {
156
+		return err
157
+	}
158
+
159
+	daemon.LogContainerEvent(container, "extract-to-dir")
160
+
161
+	return nil
162
+}
163
+
164
+func (daemon *Daemon) containerCopy(container *container.Container, resource string) (rc io.ReadCloser, err error) {
165
+	container.Lock()
166
+
167
+	defer func() {
168
+		if err != nil {
169
+			// Wait to unlock the container until the archive is fully read
170
+			// (see the ReadCloseWrapper func below) or if there is an error
171
+			// before that occurs.
172
+			container.Unlock()
173
+		}
174
+	}()
175
+
176
+	cfs, err := daemon.openContainerFS(container)
177
+	if err != nil {
178
+		return nil, err
179
+	}
180
+	defer func() {
181
+		if err != nil {
182
+			cfs.Close()
183
+		}
184
+	}()
185
+
186
+	err = cfs.RunInFS(context.TODO(), func() error {
187
+		_, err := os.Stat(resource)
188
+		return err
189
+	})
190
+	if err != nil {
191
+		return nil, err
192
+	}
193
+
194
+	tb, err := archive.NewTarballer(resource, &archive.TarOptions{
195
+		Compression: archive.Uncompressed,
196
+	})
197
+	if err != nil {
198
+		return nil, err
199
+	}
200
+
201
+	cfs.GoInFS(context.TODO(), tb.Do)
202
+	archv := tb.Reader()
203
+	reader := ioutils.NewReadCloserWrapper(archv, func() error {
204
+		err := archv.Close()
205
+		_ = cfs.Close()
206
+		container.Unlock()
207
+		return err
208
+	})
209
+	daemon.LogContainerEvent(container, "copy")
210
+	return reader, nil
211
+}
212
+
13 213
 // checkIfPathIsInAVolume checks if the path is in a volume. If it is, it
14 214
 // cannot be in a read-only volume. If it is not in a volume, the container
15 215
 // cannot be configured with a read-only rootfs.
... ...
@@ -26,9 +226,3 @@ func checkIfPathIsInAVolume(container *container.Container, absPath string) (boo
26 26
 	}
27 27
 	return toVolume, nil
28 28
 }
29
-
30
-// isOnlineFSOperationPermitted returns an error if an online filesystem operation
31
-// is not permitted.
32
-func (daemon *Daemon) isOnlineFSOperationPermitted(container *container.Container) error {
33
-	return nil
34
-}
... ...
@@ -2,11 +2,337 @@ package daemon // import "github.com/docker/docker/daemon"
2 2
 
3 3
 import (
4 4
 	"errors"
5
+	"io"
6
+	"os"
7
+	"path/filepath"
8
+	"strings"
5 9
 
10
+	"github.com/docker/docker/api/types"
6 11
 	containertypes "github.com/docker/docker/api/types/container"
7 12
 	"github.com/docker/docker/container"
13
+	"github.com/docker/docker/errdefs"
14
+	"github.com/docker/docker/pkg/archive"
15
+	"github.com/docker/docker/pkg/chrootarchive"
16
+	"github.com/docker/docker/pkg/ioutils"
17
+	"github.com/docker/docker/pkg/system"
8 18
 )
9 19
 
20
+// containerStatPath stats the filesystem resource at the specified path in this
21
+// container. Returns stat info about the resource.
22
+func (daemon *Daemon) containerStatPath(container *container.Container, path string) (stat *types.ContainerPathStat, err error) {
23
+	container.Lock()
24
+	defer container.Unlock()
25
+
26
+	// Make sure an online file-system operation is permitted.
27
+	if err := daemon.isOnlineFSOperationPermitted(container); err != nil {
28
+		return nil, err
29
+	}
30
+
31
+	if err = daemon.Mount(container); err != nil {
32
+		return nil, err
33
+	}
34
+	defer daemon.Unmount(container)
35
+
36
+	err = daemon.mountVolumes(container)
37
+	defer container.DetachAndUnmount(daemon.LogVolumeEvent)
38
+	if err != nil {
39
+		return nil, err
40
+	}
41
+
42
+	// Normalize path before sending to rootfs
43
+	path = filepath.FromSlash(path)
44
+
45
+	resolvedPath, absPath, err := container.ResolvePath(path)
46
+	if err != nil {
47
+		return nil, err
48
+	}
49
+
50
+	return container.StatPath(resolvedPath, absPath)
51
+}
52
+
53
+// containerArchivePath creates an archive of the filesystem resource at the specified
54
+// path in this container. Returns a tar archive of the resource and stat info
55
+// about the resource.
56
+func (daemon *Daemon) containerArchivePath(container *container.Container, path string) (content io.ReadCloser, stat *types.ContainerPathStat, err error) {
57
+	container.Lock()
58
+
59
+	defer func() {
60
+		if err != nil {
61
+			// Wait to unlock the container until the archive is fully read
62
+			// (see the ReadCloseWrapper func below) or if there is an error
63
+			// before that occurs.
64
+			container.Unlock()
65
+		}
66
+	}()
67
+
68
+	// Make sure an online file-system operation is permitted.
69
+	if err := daemon.isOnlineFSOperationPermitted(container); err != nil {
70
+		return nil, nil, err
71
+	}
72
+
73
+	if err = daemon.Mount(container); err != nil {
74
+		return nil, nil, err
75
+	}
76
+
77
+	defer func() {
78
+		if err != nil {
79
+			// unmount any volumes
80
+			container.DetachAndUnmount(daemon.LogVolumeEvent)
81
+			// unmount the container's rootfs
82
+			daemon.Unmount(container)
83
+		}
84
+	}()
85
+
86
+	if err = daemon.mountVolumes(container); err != nil {
87
+		return nil, nil, err
88
+	}
89
+
90
+	// Normalize path before sending to rootfs
91
+	path = filepath.FromSlash(path)
92
+
93
+	resolvedPath, absPath, err := container.ResolvePath(path)
94
+	if err != nil {
95
+		return nil, nil, err
96
+	}
97
+
98
+	stat, err = container.StatPath(resolvedPath, absPath)
99
+	if err != nil {
100
+		return nil, nil, err
101
+	}
102
+
103
+	// We need to rebase the archive entries if the last element of the
104
+	// resolved path was a symlink that was evaluated and is now different
105
+	// than the requested path. For example, if the given path was "/foo/bar/",
106
+	// but it resolved to "/var/lib/docker/containers/{id}/foo/baz/", we want
107
+	// to ensure that the archive entries start with "bar" and not "baz". This
108
+	// also catches the case when the root directory of the container is
109
+	// requested: we want the archive entries to start with "/" and not the
110
+	// container ID.
111
+
112
+	// Get the source and the base paths of the container resolved path in order
113
+	// to get the proper tar options for the rebase tar.
114
+	resolvedPath = filepath.Clean(resolvedPath)
115
+	if filepath.Base(resolvedPath) == "." {
116
+		resolvedPath += string(filepath.Separator) + "."
117
+	}
118
+
119
+	sourceDir := resolvedPath
120
+	sourceBase := "."
121
+
122
+	if stat.Mode&os.ModeDir == 0 { // not dir
123
+		sourceDir, sourceBase = filepath.Split(resolvedPath)
124
+	}
125
+	opts := archive.TarResourceRebaseOpts(sourceBase, filepath.Base(absPath))
126
+
127
+	data, err := chrootarchive.Tar(sourceDir, opts, container.BaseFS)
128
+	if err != nil {
129
+		return nil, nil, err
130
+	}
131
+
132
+	content = ioutils.NewReadCloserWrapper(data, func() error {
133
+		err := data.Close()
134
+		container.DetachAndUnmount(daemon.LogVolumeEvent)
135
+		daemon.Unmount(container)
136
+		container.Unlock()
137
+		return err
138
+	})
139
+
140
+	daemon.LogContainerEvent(container, "archive-path")
141
+
142
+	return content, stat, nil
143
+}
144
+
145
+// containerExtractToDir extracts the given tar archive to the specified location in the
146
+// filesystem of this container. The given path must be of a directory in the
147
+// container. If it is not, the error will be an errdefs.InvalidParameter. If
148
+// noOverwriteDirNonDir is true then it will be an error if unpacking the
149
+// given content would cause an existing directory to be replaced with a non-
150
+// directory and vice versa.
151
+func (daemon *Daemon) containerExtractToDir(container *container.Container, path string, copyUIDGID, noOverwriteDirNonDir bool, content io.Reader) (err error) {
152
+	container.Lock()
153
+	defer container.Unlock()
154
+
155
+	// Make sure an online file-system operation is permitted.
156
+	if err := daemon.isOnlineFSOperationPermitted(container); err != nil {
157
+		return err
158
+	}
159
+
160
+	if err = daemon.Mount(container); err != nil {
161
+		return err
162
+	}
163
+	defer daemon.Unmount(container)
164
+
165
+	err = daemon.mountVolumes(container)
166
+	defer container.DetachAndUnmount(daemon.LogVolumeEvent)
167
+	if err != nil {
168
+		return err
169
+	}
170
+
171
+	// Normalize path before sending to rootfs
172
+	path = filepath.FromSlash(path)
173
+
174
+	// If a drive letter is supplied, it must be the system drive. No-op except on Windows
175
+	path, err = system.CheckSystemDriveAndRemoveDriveLetter(path)
176
+	if err != nil {
177
+		return err
178
+	}
179
+
180
+	// The destination path needs to be resolved to a host path, with all
181
+	// symbolic links followed in the scope of the container's rootfs. Note
182
+	// that we do not use `container.ResolvePath(path)` here because we need
183
+	// to also evaluate the last path element if it is a symlink. This is so
184
+	// that you can extract an archive to a symlink that points to a directory.
185
+
186
+	// Consider the given path as an absolute path in the container.
187
+	absPath := archive.PreserveTrailingDotOrSeparator(filepath.Join(string(filepath.Separator), path), path)
188
+
189
+	// This will evaluate the last path element if it is a symlink.
190
+	resolvedPath, err := container.GetResourcePath(absPath)
191
+	if err != nil {
192
+		return err
193
+	}
194
+
195
+	stat, err := os.Lstat(resolvedPath)
196
+	if err != nil {
197
+		return err
198
+	}
199
+
200
+	if !stat.IsDir() {
201
+		return errdefs.InvalidParameter(errors.New("extraction point is not a directory"))
202
+	}
203
+
204
+	// Need to check if the path is in a volume. If it is, it cannot be in a
205
+	// read-only volume. If it is not in a volume, the container cannot be
206
+	// configured with a read-only rootfs.
207
+
208
+	// Use the resolved path relative to the container rootfs as the new
209
+	// absPath. This way we fully follow any symlinks in a volume that may
210
+	// lead back outside the volume.
211
+	//
212
+	// The Windows implementation of filepath.Rel in golang 1.4 does not
213
+	// support volume style file path semantics. On Windows when using the
214
+	// filter driver, we are guaranteed that the path will always be
215
+	// a volume file path.
216
+	var baseRel string
217
+	if strings.HasPrefix(resolvedPath, `\\?\Volume{`) {
218
+		if strings.HasPrefix(resolvedPath, container.BaseFS) {
219
+			baseRel = resolvedPath[len(container.BaseFS):]
220
+			if baseRel[:1] == `\` {
221
+				baseRel = baseRel[1:]
222
+			}
223
+		}
224
+	} else {
225
+		baseRel, err = filepath.Rel(container.BaseFS, resolvedPath)
226
+	}
227
+	if err != nil {
228
+		return err
229
+	}
230
+	// Make it an absolute path.
231
+	absPath = filepath.Join(string(filepath.Separator), baseRel)
232
+
233
+	toVolume, err := checkIfPathIsInAVolume(container, absPath)
234
+	if err != nil {
235
+		return err
236
+	}
237
+
238
+	if !toVolume && container.HostConfig.ReadonlyRootfs {
239
+		return errdefs.InvalidParameter(errors.New("container rootfs is marked read-only"))
240
+	}
241
+
242
+	options := daemon.defaultTarCopyOptions(noOverwriteDirNonDir)
243
+
244
+	if copyUIDGID {
245
+		var err error
246
+		// tarCopyOptions will appropriately pull in the right uid/gid for the
247
+		// user/group and will set the options.
248
+		options, err = daemon.tarCopyOptions(container, noOverwriteDirNonDir)
249
+		if err != nil {
250
+			return err
251
+		}
252
+	}
253
+
254
+	if err := chrootarchive.UntarWithRoot(content, resolvedPath, options, container.BaseFS); err != nil {
255
+		return err
256
+	}
257
+
258
+	daemon.LogContainerEvent(container, "extract-to-dir")
259
+
260
+	return nil
261
+}
262
+
263
+func (daemon *Daemon) containerCopy(container *container.Container, resource string) (rc io.ReadCloser, err error) {
264
+	if resource[0] == '/' || resource[0] == '\\' {
265
+		resource = resource[1:]
266
+	}
267
+	container.Lock()
268
+
269
+	defer func() {
270
+		if err != nil {
271
+			// Wait to unlock the container until the archive is fully read
272
+			// (see the ReadCloseWrapper func below) or if there is an error
273
+			// before that occurs.
274
+			container.Unlock()
275
+		}
276
+	}()
277
+
278
+	// Make sure an online file-system operation is permitted.
279
+	if err := daemon.isOnlineFSOperationPermitted(container); err != nil {
280
+		return nil, err
281
+	}
282
+
283
+	if err := daemon.Mount(container); err != nil {
284
+		return nil, err
285
+	}
286
+
287
+	defer func() {
288
+		if err != nil {
289
+			// unmount any volumes
290
+			container.DetachAndUnmount(daemon.LogVolumeEvent)
291
+			// unmount the container's rootfs
292
+			daemon.Unmount(container)
293
+		}
294
+	}()
295
+
296
+	if err := daemon.mountVolumes(container); err != nil {
297
+		return nil, err
298
+	}
299
+
300
+	// Normalize path before sending to rootfs
301
+	resource = filepath.FromSlash(resource)
302
+
303
+	basePath, err := container.GetResourcePath(resource)
304
+	if err != nil {
305
+		return nil, err
306
+	}
307
+	stat, err := os.Stat(basePath)
308
+	if err != nil {
309
+		return nil, err
310
+	}
311
+	var filter []string
312
+	if !stat.IsDir() {
313
+		d, f := filepath.Split(basePath)
314
+		basePath = d
315
+		filter = []string{f}
316
+	}
317
+	archv, err := chrootarchive.Tar(basePath, &archive.TarOptions{
318
+		Compression:  archive.Uncompressed,
319
+		IncludeFiles: filter,
320
+	}, container.BaseFS)
321
+	if err != nil {
322
+		return nil, err
323
+	}
324
+
325
+	reader := ioutils.NewReadCloserWrapper(archv, func() error {
326
+		err := archv.Close()
327
+		container.DetachAndUnmount(daemon.LogVolumeEvent)
328
+		daemon.Unmount(container)
329
+		container.Unlock()
330
+		return err
331
+	})
332
+	daemon.LogContainerEvent(container, "copy")
333
+	return reader, nil
334
+}
335
+
10 336
 // checkIfPathIsInAVolume checks if the path is in a volume. If it is, it
11 337
 // cannot be in a read-only volume. If it is not in a volume, the container
12 338
 // cannot be configured with a read-only rootfs.
... ...
@@ -21,9 +347,9 @@ func checkIfPathIsInAVolume(container *container.Container, absPath string) (boo
21 21
 // is not permitted (such as stat or for copying). Running Hyper-V containers
22 22
 // cannot have their file-system interrogated from the host as the filter is
23 23
 // loaded inside the utility VM, not the host.
24
-// IMPORTANT: The container lock must NOT be held when calling this function.
24
+// IMPORTANT: The container lock MUST be held when calling this function.
25 25
 func (daemon *Daemon) isOnlineFSOperationPermitted(container *container.Container) error {
26
-	if !container.IsRunning() {
26
+	if !container.Running {
27 27
 		return nil
28 28
 	}
29 29
 
30 30
new file mode 100644
... ...
@@ -0,0 +1,221 @@
0
+package daemon // import "github.com/docker/docker/daemon"
1
+
2
+import (
3
+	"context"
4
+	"os"
5
+	"path/filepath"
6
+	"runtime"
7
+	"strings"
8
+
9
+	"github.com/hashicorp/go-multierror"
10
+	"github.com/moby/sys/mount"
11
+	"github.com/moby/sys/symlink"
12
+	"golang.org/x/sys/unix"
13
+
14
+	"github.com/docker/docker/api/types"
15
+	"github.com/docker/docker/container"
16
+	"github.com/docker/docker/internal/mounttree"
17
+	"github.com/docker/docker/internal/unshare"
18
+	"github.com/docker/docker/pkg/fileutils"
19
+)
20
+
21
+type future struct {
22
+	fn  func() error
23
+	res chan<- error
24
+}
25
+
26
+// containerFSView allows functions to be run in the context of a container's
27
+// filesystem. Inside these functions, the root directory is the container root
28
+// for all native OS filesystem APIs, including, but not limited to, the [os]
29
+// and [golang.org/x/sys/unix] packages. The view of the container's filesystem
30
+// is live and read-write. Each view has its own private set of tmpfs mounts.
31
+// Any files written under a tmpfs mount are not visible to processes inside the
32
+// container nor any other view of the container's filesystem, and vice versa.
33
+//
34
+// Each view has its own current working directory which is initialized to the
35
+// root of the container filesystem and can be changed with [os.Chdir]. Changes
36
+// to the current directory persist across successive [*containerFSView.RunInFS]
37
+// and [*containerFSView.GoInFS] calls.
38
+//
39
+// Multiple views of the same container filesystem can coexist at the same time.
40
+// Only one function can be running in a particular filesystem view at any given
41
+// time. Calls to [*containerFSView.RunInFS] or [*containerFSView.GoInFS] will
42
+// block while another function is running. If more than one call is blocked
43
+// concurrently, the order they are unblocked is undefined.
44
+type containerFSView struct {
45
+	d    *Daemon
46
+	ctr  *container.Container
47
+	todo chan future
48
+	done chan error
49
+}
50
+
51
+// openContainerFS opens a new view of the container's filesystem.
52
+func (daemon *Daemon) openContainerFS(container *container.Container) (_ *containerFSView, err error) {
53
+	if err := daemon.Mount(container); err != nil {
54
+		return nil, err
55
+	}
56
+	defer func() {
57
+		if err != nil {
58
+			_ = daemon.Unmount(container)
59
+		}
60
+	}()
61
+
62
+	mounts, err := daemon.setupMounts(container)
63
+	if err != nil {
64
+		return nil, err
65
+	}
66
+	defer func() {
67
+		if err != nil {
68
+			_ = container.UnmountVolumes(daemon.LogVolumeEvent)
69
+		}
70
+	}()
71
+
72
+	// Setup in initial mount namespace complete. We're ready to unshare the
73
+	// mount namespace and bind the volume mounts into that private view of
74
+	// the container FS.
75
+	todo := make(chan future)
76
+	done := make(chan error)
77
+	err = unshare.Go(unix.CLONE_NEWNS,
78
+		func() error {
79
+			if err := mount.MakeRSlave("/"); err != nil {
80
+				return err
81
+			}
82
+			for _, m := range mounts {
83
+				dest, err := container.GetResourcePath(m.Destination)
84
+				if err != nil {
85
+					return err
86
+				}
87
+
88
+				var stat os.FileInfo
89
+				stat, err = os.Stat(m.Source)
90
+				if err != nil {
91
+					return err
92
+				}
93
+				if err := fileutils.CreateIfNotExists(dest, stat.IsDir()); err != nil {
94
+					return err
95
+				}
96
+
97
+				bindMode := "rbind"
98
+				if m.NonRecursive {
99
+					bindMode = "bind"
100
+				}
101
+				writeMode := "ro"
102
+				if m.Writable {
103
+					writeMode = "rw"
104
+				}
105
+
106
+				// openContainerFS() is called for temporary mounts
107
+				// outside the container. Soon these will be unmounted
108
+				// with lazy unmount option and given we have mounted
109
+				// them rbind, all the submounts will propagate if these
110
+				// are shared. If daemon is running in host namespace
111
+				// and has / as shared then these unmounts will
112
+				// propagate and unmount original mount as well. So make
113
+				// all these mounts rprivate.  Do not use propagation
114
+				// property of volume as that should apply only when
115
+				// mounting happens inside the container.
116
+				opts := strings.Join([]string{bindMode, writeMode, "rprivate"}, ",")
117
+				if err := mount.Mount(m.Source, dest, "", opts); err != nil {
118
+					return err
119
+				}
120
+			}
121
+
122
+			return mounttree.SwitchRoot(container.BaseFS)
123
+		},
124
+		func() {
125
+			defer close(done)
126
+
127
+			for it := range todo {
128
+				err := it.fn()
129
+				if it.res != nil {
130
+					it.res <- err
131
+				}
132
+			}
133
+
134
+			// The thread will terminate when this goroutine returns, taking the
135
+			// mount namespace and all the volume bind-mounts with it.
136
+		},
137
+	)
138
+	if err != nil {
139
+		return nil, err
140
+	}
141
+	vw := &containerFSView{
142
+		d:    daemon,
143
+		ctr:  container,
144
+		todo: todo,
145
+		done: done,
146
+	}
147
+	runtime.SetFinalizer(vw, (*containerFSView).Close)
148
+	return vw, nil
149
+}
150
+
151
+// RunInFS synchronously runs fn in the context of the container filesystem and
152
+// passes through its return value.
153
+//
154
+// The container filesystem is only visible to functions called in the same
155
+// goroutine as fn. Goroutines started from fn will see the host's filesystem.
156
+func (vw *containerFSView) RunInFS(ctx context.Context, fn func() error) error {
157
+	res := make(chan error)
158
+	select {
159
+	case vw.todo <- future{fn: fn, res: res}:
160
+	case <-ctx.Done():
161
+		return ctx.Err()
162
+	}
163
+	return <-res
164
+}
165
+
166
+// GoInFS starts fn in the container FS. It blocks until fn is started but does
167
+// not wait until fn returns. An error is returned if ctx is canceled before fn
168
+// has been started.
169
+//
170
+// The container filesystem is only visible to functions called in the same
171
+// goroutine as fn. Goroutines started from fn will see the host's filesystem.
172
+func (vw *containerFSView) GoInFS(ctx context.Context, fn func()) error {
173
+	select {
174
+	case vw.todo <- future{fn: func() error { fn(); return nil }}:
175
+		return nil
176
+	case <-ctx.Done():
177
+		return ctx.Err()
178
+	}
179
+}
180
+
181
+// Close waits until any in-flight operations complete and frees all
182
+// resources associated with vw.
183
+func (vw *containerFSView) Close() error {
184
+	runtime.SetFinalizer(vw, nil)
185
+	close(vw.todo)
186
+	err := multierror.Append(nil, <-vw.done)
187
+	err = multierror.Append(err, vw.ctr.UnmountVolumes(vw.d.LogVolumeEvent))
188
+	err = multierror.Append(err, vw.d.Unmount(vw.ctr))
189
+	return err.ErrorOrNil()
190
+}
191
+
192
+// Stat returns the metadata for path, relative to the current working directory
193
+// of vw inside the container filesystem view.
194
+func (vw *containerFSView) Stat(ctx context.Context, path string) (*types.ContainerPathStat, error) {
195
+	var stat *types.ContainerPathStat
196
+	err := vw.RunInFS(ctx, func() error {
197
+		lstat, err := os.Lstat(path)
198
+		if err != nil {
199
+			return err
200
+		}
201
+		var target string
202
+		if lstat.Mode()&os.ModeSymlink != 0 {
203
+			// Fully evaluate symlinks along path to the ultimate
204
+			// target, or as much as possible with broken links.
205
+			target, err = symlink.FollowSymlinkInScope(path, "/")
206
+			if err != nil {
207
+				return err
208
+			}
209
+		}
210
+		stat = &types.ContainerPathStat{
211
+			Name:       filepath.Base(path),
212
+			Size:       lstat.Size(),
213
+			Mode:       lstat.Mode(),
214
+			Mtime:      lstat.ModTime(),
215
+			LinkTarget: target,
216
+		}
217
+		return nil
218
+	})
219
+	return stat, err
220
+}
... ...
@@ -12,9 +12,7 @@ import (
12 12
 
13 13
 	mounttypes "github.com/docker/docker/api/types/mount"
14 14
 	"github.com/docker/docker/container"
15
-	"github.com/docker/docker/pkg/fileutils"
16 15
 	volumemounts "github.com/docker/docker/volume/mounts"
17
-	"github.com/moby/sys/mount"
18 16
 )
19 17
 
20 18
 // setupMounts iterates through each of the mount points for a container and
... ...
@@ -112,51 +110,3 @@ func setBindModeIfNull(bind *volumemounts.MountPoint) {
112 112
 		bind.Mode = "z"
113 113
 	}
114 114
 }
115
-
116
-func (daemon *Daemon) mountVolumes(container *container.Container) error {
117
-	mounts, err := daemon.setupMounts(container)
118
-	if err != nil {
119
-		return err
120
-	}
121
-
122
-	for _, m := range mounts {
123
-		dest, err := container.GetResourcePath(m.Destination)
124
-		if err != nil {
125
-			return err
126
-		}
127
-
128
-		var stat os.FileInfo
129
-		stat, err = os.Stat(m.Source)
130
-		if err != nil {
131
-			return err
132
-		}
133
-		if err = fileutils.CreateIfNotExists(dest, stat.IsDir()); err != nil {
134
-			return err
135
-		}
136
-
137
-		bindMode := "rbind"
138
-		if m.NonRecursive {
139
-			bindMode = "bind"
140
-		}
141
-		writeMode := "ro"
142
-		if m.Writable {
143
-			writeMode = "rw"
144
-		}
145
-
146
-		// mountVolumes() seems to be called for temporary mounts
147
-		// outside the container. Soon these will be unmounted with
148
-		// lazy unmount option and given we have mounted the rbind,
149
-		// all the submounts will propagate if these are shared. If
150
-		// daemon is running in host namespace and has / as shared
151
-		// then these unmounts will propagate and unmount original
152
-		// mount as well. So make all these mounts rprivate.
153
-		// Do not use propagation property of volume as that should
154
-		// apply only when mounting happens inside the container.
155
-		opts := strings.Join([]string{bindMode, writeMode, "rprivate"}, ",")
156
-		if err := mount.Mount(m.Source, dest, "", opts); err != nil {
157
-			return err
158
-		}
159
-	}
160
-
161
-	return nil
162
-}
... ...
@@ -37,6 +37,10 @@ if [ -f /sys/fs/cgroup/cgroup.controllers ]; then
37 37
 		> /sys/fs/cgroup/cgroup.subtree_control
38 38
 fi
39 39
 
40
+# Change mount propagation to shared to make the environment more similar to a
41
+# modern Linux system, e.g. with SystemD as PID 1.
42
+mount --make-rshared /
43
+
40 44
 if [ $# -gt 0 ]; then
41 45
 	exec "$@"
42 46
 fi
... ...
@@ -13,6 +13,11 @@ if [ ! -t 0 ]; then
13 13
 	exit 1
14 14
 fi
15 15
 
16
+# Change mount propagation to shared, which SystemD PID 1 would normally do
17
+# itself when started by the kernel. SystemD skips that when it detects it is
18
+# running in a container.
19
+mount --make-rshared /
20
+
16 21
 env > /etc/docker-entrypoint-env
17 22
 
18 23
 cat > /etc/systemd/system/docker-entrypoint.target << EOF
... ...
@@ -158,16 +158,23 @@ func TestCopyFromContainer(t *testing.T) {
158 158
 		expect map[string]string
159 159
 	}{
160 160
 		{"/", map[string]string{"/": "", "/foo": "hello", "/bar/quux/baz": "world", "/bar/filesymlink": "", "/bar/dirsymlink": "", "/bar/notarget": ""}},
161
+		{".", map[string]string{"./": "", "./foo": "hello", "./bar/quux/baz": "world", "./bar/filesymlink": "", "./bar/dirsymlink": "", "./bar/notarget": ""}},
162
+		{"/.", map[string]string{"./": "", "./foo": "hello", "./bar/quux/baz": "world", "./bar/filesymlink": "", "./bar/dirsymlink": "", "./bar/notarget": ""}},
163
+		{"./", map[string]string{"./": "", "./foo": "hello", "./bar/quux/baz": "world", "./bar/filesymlink": "", "./bar/dirsymlink": "", "./bar/notarget": ""}},
164
+		{"/./", map[string]string{"./": "", "./foo": "hello", "./bar/quux/baz": "world", "./bar/filesymlink": "", "./bar/dirsymlink": "", "./bar/notarget": ""}},
161 165
 		{"/bar/root", map[string]string{"root": ""}},
162 166
 		{"/bar/root/", map[string]string{"root/": "", "root/foo": "hello", "root/bar/quux/baz": "world", "root/bar/filesymlink": "", "root/bar/dirsymlink": "", "root/bar/notarget": ""}},
167
+		{"/bar/root/.", map[string]string{"./": "", "./foo": "hello", "./bar/quux/baz": "world", "./bar/filesymlink": "", "./bar/dirsymlink": "", "./bar/notarget": ""}},
163 168
 
164 169
 		{"bar/quux", map[string]string{"quux/": "", "quux/baz": "world"}},
165 170
 		{"bar/quux/", map[string]string{"quux/": "", "quux/baz": "world"}},
171
+		{"bar/quux/.", map[string]string{"./": "", "./baz": "world"}},
166 172
 		{"bar/quux/baz", map[string]string{"baz": "world"}},
167 173
 
168 174
 		{"bar/filesymlink", map[string]string{"filesymlink": ""}},
169 175
 		{"bar/dirsymlink", map[string]string{"dirsymlink": ""}},
170 176
 		{"bar/dirsymlink/", map[string]string{"dirsymlink/": "", "dirsymlink/baz": "world"}},
177
+		{"bar/dirsymlink/.", map[string]string{"./": "", "./baz": "world"}},
171 178
 		{"bar/notarget", map[string]string{"notarget": ""}},
172 179
 	} {
173 180
 		t.Run(x.src, func(t *testing.T) {
... ...
@@ -393,3 +393,38 @@ func TestContainerVolumesMountedAsSlave(t *testing.T) {
393 393
 		t.Fatal(err)
394 394
 	}
395 395
 }
396
+
397
+// Regression test for #38995 and #43390.
398
+func TestContainerCopyLeaksMounts(t *testing.T) {
399
+	defer setupTest(t)()
400
+
401
+	bindMount := mounttypes.Mount{
402
+		Type:   mounttypes.TypeBind,
403
+		Source: "/var",
404
+		Target: "/hostvar",
405
+		BindOptions: &mounttypes.BindOptions{
406
+			Propagation: mounttypes.PropagationRSlave,
407
+		},
408
+	}
409
+
410
+	ctx := context.Background()
411
+	client := testEnv.APIClient()
412
+	cid := container.Run(ctx, t, client, container.WithMount(bindMount), container.WithCmd("sleep", "120s"))
413
+
414
+	getMounts := func() string {
415
+		t.Helper()
416
+		res, err := container.Exec(ctx, client, cid, []string{"cat", "/proc/self/mountinfo"})
417
+		assert.NilError(t, err)
418
+		assert.Equal(t, res.ExitCode, 0)
419
+		return res.Stdout()
420
+	}
421
+
422
+	mountsBefore := getMounts()
423
+
424
+	_, _, err := client.CopyFromContainer(ctx, cid, "/etc/passwd")
425
+	assert.NilError(t, err)
426
+
427
+	mountsAfter := getMounts()
428
+
429
+	assert.Equal(t, mountsBefore, mountsAfter)
430
+}
396 431
new file mode 100644
... ...
@@ -0,0 +1,94 @@
0
+package mounttree // import "github.com/docker/docker/internal/mounttree"
1
+
2
+import (
3
+	"fmt"
4
+	"os"
5
+	"path/filepath"
6
+
7
+	"github.com/moby/sys/mount"
8
+	"github.com/moby/sys/mountinfo"
9
+	"golang.org/x/sys/unix"
10
+)
11
+
12
+// SwitchRoot changes path to be the root of the mount tree and changes the
13
+// current working directory to the new root.
14
+//
15
+// This function bind-mounts onto path; it is the caller's responsibility to set
16
+// the desired propagation mode of path's parent mount beforehand to prevent
17
+// unwanted propagation into different mount namespaces.
18
+func SwitchRoot(path string) error {
19
+	if mounted, _ := mountinfo.Mounted(path); !mounted {
20
+		if err := mount.Mount(path, path, "bind", "rbind,rw"); err != nil {
21
+			return realChroot(path)
22
+		}
23
+	}
24
+
25
+	// setup oldRoot for pivot_root
26
+	pivotDir, err := os.MkdirTemp(path, ".pivot_root")
27
+	if err != nil {
28
+		return fmt.Errorf("Error setting up pivot dir: %v", err)
29
+	}
30
+
31
+	var mounted bool
32
+	defer func() {
33
+		if mounted {
34
+			// make sure pivotDir is not mounted before we try to remove it
35
+			if errCleanup := unix.Unmount(pivotDir, unix.MNT_DETACH); errCleanup != nil {
36
+				if err == nil {
37
+					err = errCleanup
38
+				}
39
+				return
40
+			}
41
+		}
42
+
43
+		errCleanup := os.Remove(pivotDir)
44
+		// pivotDir doesn't exist if pivot_root failed and chroot+chdir was successful
45
+		// because we already cleaned it up on failed pivot_root
46
+		if errCleanup != nil && !os.IsNotExist(errCleanup) {
47
+			errCleanup = fmt.Errorf("Error cleaning up after pivot: %v", errCleanup)
48
+			if err == nil {
49
+				err = errCleanup
50
+			}
51
+		}
52
+	}()
53
+
54
+	if err := unix.PivotRoot(path, pivotDir); err != nil {
55
+		// If pivot fails, fall back to the normal chroot after cleaning up temp dir
56
+		if err := os.Remove(pivotDir); err != nil {
57
+			return fmt.Errorf("Error cleaning up after failed pivot: %v", err)
58
+		}
59
+		return realChroot(path)
60
+	}
61
+	mounted = true
62
+
63
+	// This is the new path for where the old root (prior to the pivot) has been moved to
64
+	// This dir contains the rootfs of the caller, which we need to remove so it is not visible during extraction
65
+	pivotDir = filepath.Join("/", filepath.Base(pivotDir))
66
+
67
+	if err := unix.Chdir("/"); err != nil {
68
+		return fmt.Errorf("Error changing to new root: %v", err)
69
+	}
70
+
71
+	// Make the pivotDir (where the old root lives) private so it can be unmounted without propagating to the host
72
+	if err := unix.Mount("", pivotDir, "", unix.MS_PRIVATE|unix.MS_REC, ""); err != nil {
73
+		return fmt.Errorf("Error making old root private after pivot: %v", err)
74
+	}
75
+
76
+	// Now unmount the old root so it's no longer visible from the new root
77
+	if err := unix.Unmount(pivotDir, unix.MNT_DETACH); err != nil {
78
+		return fmt.Errorf("Error while unmounting old root after pivot: %v", err)
79
+	}
80
+	mounted = false
81
+
82
+	return nil
83
+}
84
+
85
+func realChroot(path string) error {
86
+	if err := unix.Chroot(path); err != nil {
87
+		return fmt.Errorf("Error after fallback to chroot: %v", err)
88
+	}
89
+	if err := unix.Chdir("/"); err != nil {
90
+		return fmt.Errorf("Error changing to new root after chroot: %v", err)
91
+	}
92
+	return nil
93
+}
0 94
new file mode 100644
... ...
@@ -0,0 +1,176 @@
0
+//go:build go1.10
1
+// +build go1.10
2
+
3
+package unshare // import "github.com/docker/docker/internal/unshare"
4
+
5
+import (
6
+	"fmt"
7
+	"os"
8
+	"runtime"
9
+
10
+	"golang.org/x/sys/unix"
11
+)
12
+
13
+func init() {
14
+	// The startup thread of a process is special in a few different ways.
15
+	// Most pertinent to the discussion at hand, any per-thread kernel state
16
+	// reflected in the /proc/[pid]/ directory for a process is taken from
17
+	// the state of the startup thread. Same goes for /proc/self/; it shows
18
+	// the state of the current process' startup thread, no matter which
19
+	// thread the files are being opened from. For most programs this is a
20
+	// distinction without a difference as the kernel state, such as the
21
+	// mount namespace and current working directory, is shared among (and
22
+	// kept synchronized across) all threads of a process. But things start
23
+	// to break down once threads start unsharing and modifying parts of
24
+	// their kernel state.
25
+	//
26
+	// The Go runtime schedules goroutines to execute on the startup thread,
27
+	// same as any other. How this could be problematic is best illustrated
28
+	// with a concrete example. Consider what happens if a call to
29
+	// Go(unix.CLONE_NEWNS, ...) spawned a goroutine which gets scheduled
30
+	// onto the startup thread. The thread's mount namespace will be
31
+	// unshared and modified. The contents of the /proc/[pid]/mountinfo file
32
+	// will then describe the mount tree of the unshared namespace, not the
33
+	// namespace of any other thread. It will remain this way until the
34
+	// process exits. (The startup thread is special in another way: exiting
35
+	// it puts the process into a "non-waitable zombie" state. To avoid this
36
+	// fate, the Go runtime parks the thread instead of exiting if a
37
+	// goroutine returns while locked to the startup thread. More
38
+	// information can be found in the Go runtime sources:
39
+	// `go doc -u -src runtime.mexit`.) The github.com/moby/sys/mountinfo
40
+	// package reads from /proc/self/mountinfo, so will read the mount tree
41
+	// for the wrong namespace if the startup thread has had its mount
42
+	// namespace unshared! The /proc/thread-self/ directory, introduced in
43
+	// Linux 3.17, is one potential solution to this problem, but every
44
+	// package which opens files in /proc/self/ would need to be updated,
45
+	// and fallbacks to /proc/self/task/[tid]/ would be required to support
46
+	// older kernels. Overlooking any reference to /proc/self/ would
47
+	// manifest as stochastically-reproducible bugs, so this is far from an
48
+	// ideal solution.
49
+	//
50
+	// Reading from /proc/self/ would not be a problem if we could prevent
51
+	// the per-thread state of the startup thread from being modified
52
+	// nondeterministically in the first place. We can accomplish this
53
+	// simply by locking the main() function to the startup thread! Doing so
54
+	// excludes any other goroutine from being scheduled on the thread.
55
+	runtime.LockOSThread()
56
+}
57
+
58
+// reversibleSetnsFlags maps the unshare(2) flags whose effects can be fully
59
+// reversed using setns(2). The values are the basenames of the corresponding
60
+// /proc/self/task/[tid]/ns/ magic symlinks to use to save and restore the
61
+// state.
62
+var reversibleSetnsFlags = map[int]string{
63
+	unix.CLONE_NEWCGROUP: "cgroup",
64
+	unix.CLONE_NEWNET:    "net",
65
+	unix.CLONE_NEWUTS:    "uts",
66
+	unix.CLONE_NEWPID:    "pid",
67
+	unix.CLONE_NEWTIME:   "time",
68
+
69
+	// The following CLONE_NEW* flags are not included because they imply
70
+	// another, irreversible flag when used with unshare(2).
71
+	//  - unix.CLONE_NEWIPC:  implies CLONE_SYSVMEM
72
+	//  - unix.CLONE_NEWNS:   implies CLONE_FS
73
+	//  - unix.CLONE_NEWUSER: implies CLONE_FS since Linux 3.9
74
+}
75
+
76
+// Go calls the given functions in a new goroutine, locked to an OS thread,
77
+// which has had the parts of its execution state disassociated from the rest of
78
+// the current process using [unshare(2)]. It blocks until the new goroutine has
79
+// started and setupfn has returned. fn is only called if setupfn returns nil. A
80
+// nil setupfn or fn is equivalent to passing a no-op function.
81
+//
82
+// The disassociated execution state and any changes made to it are only visible
83
+// to the goroutine which the functions are called in. Any other goroutines,
84
+// including ones started from the function, will see the same execution state
85
+// as the rest of the process.
86
+//
87
+// The acceptable flags are documented in the [unshare(2)] Linux man-page.
88
+// The corresponding CLONE_* constants are defined in package [unix].
89
+//
90
+// # Warning
91
+//
92
+// This function may terminate the thread which the new goroutine executed on
93
+// after fn returns, which could cause subprocesses started with the
94
+// [syscall.SysProcAttr] Pdeathsig field set to be signaled before process
95
+// termination. Any subprocess started before this function is called may be
96
+// affected, in addition to any subprocesses started inside setupfn or fn.
97
+// There are more details at https://go.dev/issue/27505.
98
+//
99
+// [unshare(2)]: https://man7.org/linux/man-pages/man2/unshare.2.html
100
+func Go(flags int, setupfn func() error, fn func()) error {
101
+	started := make(chan error)
102
+
103
+	maskedFlags := flags
104
+	for f := range reversibleSetnsFlags {
105
+		maskedFlags &^= f
106
+	}
107
+	isReversible := maskedFlags == 0
108
+
109
+	go func() {
110
+		// Prepare to manipulate per-thread kernel state.
111
+		runtime.LockOSThread()
112
+
113
+		// Not all changes to the execution state can be reverted.
114
+		// If an irreversible change to the execution state is made, our
115
+		// only recourse is to have the tampered thread terminated by
116
+		// returning from this function while the goroutine remains
117
+		// wired to the thread. The Go runtime will terminate the thread
118
+		// and replace it with a fresh one as needed.
119
+
120
+		if isReversible {
121
+			defer func() {
122
+				if isReversible {
123
+					// All execution state has been restored without error.
124
+					// The thread is once again fungible.
125
+					runtime.UnlockOSThread()
126
+				}
127
+			}()
128
+			tid := unix.Gettid()
129
+			for f, ns := range reversibleSetnsFlags {
130
+				if flags&f != f {
131
+					continue
132
+				}
133
+				// The /proc/thread-self directory was added in Linux 3.17.
134
+				// We are not using it to maximize compatibility.
135
+				pth := fmt.Sprintf("/proc/self/task/%d/ns/%s", tid, ns)
136
+				fd, err := unix.Open(pth, unix.O_RDONLY|unix.O_CLOEXEC, 0)
137
+				if err != nil {
138
+					started <- &os.PathError{Op: "open", Path: pth, Err: err}
139
+					return
140
+				}
141
+				defer func() {
142
+					if isReversible {
143
+						if err := unix.Setns(fd, 0); err != nil {
144
+							isReversible = false
145
+						}
146
+					}
147
+					_ = unix.Close(fd)
148
+				}()
149
+			}
150
+		}
151
+
152
+		// Threads are implemented under Linux as processes which share
153
+		// a virtual memory space. Therefore in a multithreaded process
154
+		// unshare(2) disassociates parts of the calling thread's
155
+		// context from the thread it was clone(2)'d from.
156
+		if err := unix.Unshare(flags); err != nil {
157
+			started <- os.NewSyscallError("unshare", err)
158
+			return
159
+		}
160
+
161
+		if setupfn != nil {
162
+			if err := setupfn(); err != nil {
163
+				started <- err
164
+				return
165
+			}
166
+		}
167
+		close(started)
168
+
169
+		if fn != nil {
170
+			fn()
171
+		}
172
+	}()
173
+
174
+	return <-started
175
+}
... ...
@@ -821,10 +821,29 @@ func Tar(path string, compression Compression) (io.ReadCloser, error) {
821 821
 // TarWithOptions creates an archive from the directory at `path`, only including files whose relative
822 822
 // paths are included in `options.IncludeFiles` (if non-nil) or not in `options.ExcludePatterns`.
823 823
 func TarWithOptions(srcPath string, options *TarOptions) (io.ReadCloser, error) {
824
-	// Fix the source path to work with long path names. This is a no-op
825
-	// on platforms other than Windows.
826
-	srcPath = fixVolumePathPrefix(srcPath)
824
+	tb, err := NewTarballer(srcPath, options)
825
+	if err != nil {
826
+		return nil, err
827
+	}
828
+	go tb.Do()
829
+	return tb.Reader(), nil
830
+}
831
+
832
+// Tarballer is a lower-level interface to TarWithOptions which gives the caller
833
+// control over which goroutine the archiving operation executes on.
834
+type Tarballer struct {
835
+	srcPath           string
836
+	options           *TarOptions
837
+	pm                *patternmatcher.PatternMatcher
838
+	pipeReader        *io.PipeReader
839
+	pipeWriter        *io.PipeWriter
840
+	compressWriter    io.WriteCloser
841
+	whiteoutConverter tarWhiteoutConverter
842
+}
827 843
 
844
+// NewTarballer constructs a new tarballer. The arguments are the same as for
845
+// TarWithOptions.
846
+func NewTarballer(srcPath string, options *TarOptions) (*Tarballer, error) {
828 847
 	pm, err := patternmatcher.New(options.ExcludePatterns)
829 848
 	if err != nil {
830 849
 		return nil, err
... ...
@@ -842,183 +861,201 @@ func TarWithOptions(srcPath string, options *TarOptions) (io.ReadCloser, error)
842 842
 		return nil, err
843 843
 	}
844 844
 
845
-	go func() {
846
-		ta := newTarAppender(
847
-			options.IDMap,
848
-			compressWriter,
849
-			options.ChownOpts,
850
-		)
851
-		ta.WhiteoutConverter = whiteoutConverter
852
-
853
-		defer func() {
854
-			// Make sure to check the error on Close.
855
-			if err := ta.TarWriter.Close(); err != nil {
856
-				logrus.Errorf("Can't close tar writer: %s", err)
857
-			}
858
-			if err := compressWriter.Close(); err != nil {
859
-				logrus.Errorf("Can't close compress writer: %s", err)
860
-			}
861
-			if err := pipeWriter.Close(); err != nil {
862
-				logrus.Errorf("Can't close pipe writer: %s", err)
863
-			}
864
-		}()
845
+	return &Tarballer{
846
+		// Fix the source path to work with long path names. This is a no-op
847
+		// on platforms other than Windows.
848
+		srcPath:           fixVolumePathPrefix(srcPath),
849
+		options:           options,
850
+		pm:                pm,
851
+		pipeReader:        pipeReader,
852
+		pipeWriter:        pipeWriter,
853
+		compressWriter:    compressWriter,
854
+		whiteoutConverter: whiteoutConverter,
855
+	}, nil
856
+}
865 857
 
866
-		// this buffer is needed for the duration of this piped stream
867
-		defer pools.BufioWriter32KPool.Put(ta.Buffer)
858
+// Reader returns the reader for the created archive.
859
+func (t *Tarballer) Reader() io.ReadCloser {
860
+	return t.pipeReader
861
+}
868 862
 
869
-		// In general we log errors here but ignore them because
870
-		// during e.g. a diff operation the container can continue
871
-		// mutating the filesystem and we can see transient errors
872
-		// from this
863
+// Do performs the archiving operation in the background. The resulting archive
864
+// can be read from t.Reader(). Do should only be called once on each Tarballer
865
+// instance.
866
+func (t *Tarballer) Do() {
867
+	ta := newTarAppender(
868
+		t.options.IDMap,
869
+		t.compressWriter,
870
+		t.options.ChownOpts,
871
+	)
872
+	ta.WhiteoutConverter = t.whiteoutConverter
873 873
 
874
-		stat, err := os.Lstat(srcPath)
875
-		if err != nil {
876
-			return
874
+	defer func() {
875
+		// Make sure to check the error on Close.
876
+		if err := ta.TarWriter.Close(); err != nil {
877
+			logrus.Errorf("Can't close tar writer: %s", err)
877 878
 		}
878
-
879
-		if !stat.IsDir() {
880
-			// We can't later join a non-dir with any includes because the
881
-			// 'walk' will error if "file/." is stat-ed and "file" is not a
882
-			// directory. So, we must split the source path and use the
883
-			// basename as the include.
884
-			if len(options.IncludeFiles) > 0 {
885
-				logrus.Warn("Tar: Can't archive a file with includes")
886
-			}
887
-
888
-			dir, base := SplitPathDirEntry(srcPath)
889
-			srcPath = dir
890
-			options.IncludeFiles = []string{base}
879
+		if err := t.compressWriter.Close(); err != nil {
880
+			logrus.Errorf("Can't close compress writer: %s", err)
891 881
 		}
892
-
893
-		if len(options.IncludeFiles) == 0 {
894
-			options.IncludeFiles = []string{"."}
882
+		if err := t.pipeWriter.Close(); err != nil {
883
+			logrus.Errorf("Can't close pipe writer: %s", err)
895 884
 		}
885
+	}()
896 886
 
897
-		seen := make(map[string]bool)
898
-
899
-		for _, include := range options.IncludeFiles {
900
-			rebaseName := options.RebaseNames[include]
887
+	// this buffer is needed for the duration of this piped stream
888
+	defer pools.BufioWriter32KPool.Put(ta.Buffer)
901 889
 
902
-			var (
903
-				parentMatchInfo []patternmatcher.MatchInfo
904
-				parentDirs      []string
905
-			)
890
+	// In general we log errors here but ignore them because
891
+	// during e.g. a diff operation the container can continue
892
+	// mutating the filesystem and we can see transient errors
893
+	// from this
906 894
 
907
-			walkRoot := getWalkRoot(srcPath, include)
908
-			filepath.WalkDir(walkRoot, func(filePath string, f os.DirEntry, err error) error {
909
-				if err != nil {
910
-					logrus.Errorf("Tar: Can't stat file %s to tar: %s", srcPath, err)
911
-					return nil
912
-				}
895
+	stat, err := os.Lstat(t.srcPath)
896
+	if err != nil {
897
+		return
898
+	}
913 899
 
914
-				relFilePath, err := filepath.Rel(srcPath, filePath)
915
-				if err != nil || (!options.IncludeSourceDir && relFilePath == "." && f.IsDir()) {
916
-					// Error getting relative path OR we are looking
917
-					// at the source directory path. Skip in both situations.
918
-					return nil
919
-				}
900
+	if !stat.IsDir() {
901
+		// We can't later join a non-dir with any includes because the
902
+		// 'walk' will error if "file/." is stat-ed and "file" is not a
903
+		// directory. So, we must split the source path and use the
904
+		// basename as the include.
905
+		if len(t.options.IncludeFiles) > 0 {
906
+			logrus.Warn("Tar: Can't archive a file with includes")
907
+		}
920 908
 
921
-				if options.IncludeSourceDir && include == "." && relFilePath != "." {
922
-					relFilePath = strings.Join([]string{".", relFilePath}, string(filepath.Separator))
923
-				}
909
+		dir, base := SplitPathDirEntry(t.srcPath)
910
+		t.srcPath = dir
911
+		t.options.IncludeFiles = []string{base}
912
+	}
924 913
 
925
-				skip := false
926
-
927
-				// If "include" is an exact match for the current file
928
-				// then even if there's an "excludePatterns" pattern that
929
-				// matches it, don't skip it. IOW, assume an explicit 'include'
930
-				// is asking for that file no matter what - which is true
931
-				// for some files, like .dockerignore and Dockerfile (sometimes)
932
-				if include != relFilePath {
933
-					for len(parentDirs) != 0 {
934
-						lastParentDir := parentDirs[len(parentDirs)-1]
935
-						if strings.HasPrefix(relFilePath, lastParentDir+string(os.PathSeparator)) {
936
-							break
937
-						}
938
-						parentDirs = parentDirs[:len(parentDirs)-1]
939
-						parentMatchInfo = parentMatchInfo[:len(parentMatchInfo)-1]
940
-					}
914
+	if len(t.options.IncludeFiles) == 0 {
915
+		t.options.IncludeFiles = []string{"."}
916
+	}
941 917
 
942
-					var matchInfo patternmatcher.MatchInfo
943
-					if len(parentMatchInfo) != 0 {
944
-						skip, matchInfo, err = pm.MatchesUsingParentResults(relFilePath, parentMatchInfo[len(parentMatchInfo)-1])
945
-					} else {
946
-						skip, matchInfo, err = pm.MatchesUsingParentResults(relFilePath, patternmatcher.MatchInfo{})
947
-					}
948
-					if err != nil {
949
-						logrus.Errorf("Error matching %s: %v", relFilePath, err)
950
-						return err
951
-					}
918
+	seen := make(map[string]bool)
952 919
 
953
-					if f.IsDir() {
954
-						parentDirs = append(parentDirs, relFilePath)
955
-						parentMatchInfo = append(parentMatchInfo, matchInfo)
956
-					}
957
-				}
920
+	for _, include := range t.options.IncludeFiles {
921
+		rebaseName := t.options.RebaseNames[include]
958 922
 
959
-				if skip {
960
-					// If we want to skip this file and its a directory
961
-					// then we should first check to see if there's an
962
-					// excludes pattern (e.g. !dir/file) that starts with this
963
-					// dir. If so then we can't skip this dir.
923
+		var (
924
+			parentMatchInfo []patternmatcher.MatchInfo
925
+			parentDirs      []string
926
+		)
964 927
 
965
-					// Its not a dir then so we can just return/skip.
966
-					if !f.IsDir() {
967
-						return nil
968
-					}
928
+		walkRoot := getWalkRoot(t.srcPath, include)
929
+		filepath.WalkDir(walkRoot, func(filePath string, f os.DirEntry, err error) error {
930
+			if err != nil {
931
+				logrus.Errorf("Tar: Can't stat file %s to tar: %s", t.srcPath, err)
932
+				return nil
933
+			}
969 934
 
970
-					// No exceptions (!...) in patterns so just skip dir
971
-					if !pm.Exclusions() {
972
-						return filepath.SkipDir
973
-					}
935
+			relFilePath, err := filepath.Rel(t.srcPath, filePath)
936
+			if err != nil || (!t.options.IncludeSourceDir && relFilePath == "." && f.IsDir()) {
937
+				// Error getting relative path OR we are looking
938
+				// at the source directory path. Skip in both situations.
939
+				return nil
940
+			}
974 941
 
975
-					dirSlash := relFilePath + string(filepath.Separator)
942
+			if t.options.IncludeSourceDir && include == "." && relFilePath != "." {
943
+				relFilePath = strings.Join([]string{".", relFilePath}, string(filepath.Separator))
944
+			}
976 945
 
977
-					for _, pat := range pm.Patterns() {
978
-						if !pat.Exclusion() {
979
-							continue
980
-						}
981
-						if strings.HasPrefix(pat.String()+string(filepath.Separator), dirSlash) {
982
-							// found a match - so can't skip this dir
983
-							return nil
984
-						}
946
+			skip := false
947
+
948
+			// If "include" is an exact match for the current file
949
+			// then even if there's an "excludePatterns" pattern that
950
+			// matches it, don't skip it. IOW, assume an explicit 'include'
951
+			// is asking for that file no matter what - which is true
952
+			// for some files, like .dockerignore and Dockerfile (sometimes)
953
+			if include != relFilePath {
954
+				for len(parentDirs) != 0 {
955
+					lastParentDir := parentDirs[len(parentDirs)-1]
956
+					if strings.HasPrefix(relFilePath, lastParentDir+string(os.PathSeparator)) {
957
+						break
985 958
 					}
959
+					parentDirs = parentDirs[:len(parentDirs)-1]
960
+					parentMatchInfo = parentMatchInfo[:len(parentMatchInfo)-1]
961
+				}
986 962
 
987
-					// No matching exclusion dir so just skip dir
988
-					return filepath.SkipDir
963
+				var matchInfo patternmatcher.MatchInfo
964
+				if len(parentMatchInfo) != 0 {
965
+					skip, matchInfo, err = t.pm.MatchesUsingParentResults(relFilePath, parentMatchInfo[len(parentMatchInfo)-1])
966
+				} else {
967
+					skip, matchInfo, err = t.pm.MatchesUsingParentResults(relFilePath, patternmatcher.MatchInfo{})
968
+				}
969
+				if err != nil {
970
+					logrus.Errorf("Error matching %s: %v", relFilePath, err)
971
+					return err
972
+				}
973
+
974
+				if f.IsDir() {
975
+					parentDirs = append(parentDirs, relFilePath)
976
+					parentMatchInfo = append(parentMatchInfo, matchInfo)
989 977
 				}
978
+			}
979
+
980
+			if skip {
981
+				// If we want to skip this file and its a directory
982
+				// then we should first check to see if there's an
983
+				// excludes pattern (e.g. !dir/file) that starts with this
984
+				// dir. If so then we can't skip this dir.
990 985
 
991
-				if seen[relFilePath] {
986
+				// Its not a dir then so we can just return/skip.
987
+				if !f.IsDir() {
992 988
 					return nil
993 989
 				}
994
-				seen[relFilePath] = true
995
-
996
-				// Rename the base resource.
997
-				if rebaseName != "" {
998
-					var replacement string
999
-					if rebaseName != string(filepath.Separator) {
1000
-						// Special case the root directory to replace with an
1001
-						// empty string instead so that we don't end up with
1002
-						// double slashes in the paths.
1003
-						replacement = rebaseName
1004
-					}
1005 990
 
1006
-					relFilePath = strings.Replace(relFilePath, include, replacement, 1)
991
+				// No exceptions (!...) in patterns so just skip dir
992
+				if !t.pm.Exclusions() {
993
+					return filepath.SkipDir
1007 994
 				}
1008 995
 
1009
-				if err := ta.addTarFile(filePath, relFilePath); err != nil {
1010
-					logrus.Errorf("Can't add file %s to tar: %s", filePath, err)
1011
-					// if pipe is broken, stop writing tar stream to it
1012
-					if err == io.ErrClosedPipe {
1013
-						return err
996
+				dirSlash := relFilePath + string(filepath.Separator)
997
+
998
+				for _, pat := range t.pm.Patterns() {
999
+					if !pat.Exclusion() {
1000
+						continue
1001
+					}
1002
+					if strings.HasPrefix(pat.String()+string(filepath.Separator), dirSlash) {
1003
+						// found a match - so can't skip this dir
1004
+						return nil
1014 1005
 					}
1015 1006
 				}
1007
+
1008
+				// No matching exclusion dir so just skip dir
1009
+				return filepath.SkipDir
1010
+			}
1011
+
1012
+			if seen[relFilePath] {
1016 1013
 				return nil
1017
-			})
1018
-		}
1019
-	}()
1014
+			}
1015
+			seen[relFilePath] = true
1016
+
1017
+			// Rename the base resource.
1018
+			if rebaseName != "" {
1019
+				var replacement string
1020
+				if rebaseName != string(filepath.Separator) {
1021
+					// Special case the root directory to replace with an
1022
+					// empty string instead so that we don't end up with
1023
+					// double slashes in the paths.
1024
+					replacement = rebaseName
1025
+				}
1020 1026
 
1021
-	return pipeReader, nil
1027
+				relFilePath = strings.Replace(relFilePath, include, replacement, 1)
1028
+			}
1029
+
1030
+			if err := ta.addTarFile(filePath, relFilePath); err != nil {
1031
+				logrus.Errorf("Can't add file %s to tar: %s", filePath, err)
1032
+				// if pipe is broken, stop writing tar stream to it
1033
+				if err == io.ErrClosedPipe {
1034
+					return err
1035
+				}
1036
+			}
1037
+			return nil
1038
+		})
1039
+	}
1022 1040
 }
1023 1041
 
1024 1042
 // Unpack unpacks the decompressedArchive to dest with options.
... ...
@@ -87,7 +87,7 @@ func UnpackLayer(dest string, layer io.Reader, options *TarOptions) (size int64,
87 87
 				basename := filepath.Base(hdr.Name)
88 88
 				aufsHardlinks[basename] = hdr
89 89
 				if aufsTempdir == "" {
90
-					if aufsTempdir, err = os.MkdirTemp("", "dockerplnk"); err != nil {
90
+					if aufsTempdir, err = os.MkdirTemp(dest, "dockerplnk"); err != nil {
91 91
 						return 0, err
92 92
 					}
93 93
 					defer os.RemoveAll(aufsTempdir)
... ...
@@ -4,223 +4,71 @@
4 4
 package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
5 5
 
6 6
 import (
7
-	"bytes"
8
-	"encoding/json"
9
-	"flag"
10
-	"fmt"
11 7
 	"io"
12
-	"os"
13 8
 	"path/filepath"
14
-	"runtime"
15 9
 	"strings"
16 10
 
17 11
 	"github.com/docker/docker/pkg/archive"
18
-	"github.com/docker/docker/pkg/reexec"
19 12
 	"github.com/pkg/errors"
20 13
 )
21 14
 
22
-// untar is the entry-point for docker-untar on re-exec. This is not used on
23
-// Windows as it does not support chroot, hence no point sandboxing through
24
-// chroot and rexec.
25
-func untar() {
26
-	runtime.LockOSThread()
27
-	flag.Parse()
28
-
29
-	var options archive.TarOptions
30
-
31
-	// read the options from the pipe "ExtraFiles"
32
-	if err := json.NewDecoder(os.NewFile(3, "options")).Decode(&options); err != nil {
33
-		fatal(err)
34
-	}
35
-
36
-	dst := flag.Arg(0)
37
-	var root string
38
-	if len(flag.Args()) > 1 {
39
-		root = flag.Arg(1)
40
-	}
41
-
42
-	if root == "" {
43
-		root = dst
44
-	}
45
-
46
-	if err := chroot(root); err != nil {
47
-		fatal(err)
48
-	}
49
-
50
-	if err := archive.Unpack(os.Stdin, dst, &options); err != nil {
51
-		fatal(err)
52
-	}
53
-	// fully consume stdin in case it is zero padded
54
-	if _, err := flush(os.Stdin); err != nil {
55
-		fatal(err)
56
-	}
57
-
58
-	os.Exit(0)
59
-}
60
-
61 15
 func invokeUnpack(decompressedArchive io.Reader, dest string, options *archive.TarOptions, root string) error {
62
-	if root == "" {
63
-		return errors.New("must specify a root to chroot to")
64
-	}
65
-
66
-	// We can't pass a potentially large exclude list directly via cmd line
67
-	// because we easily overrun the kernel's max argument/environment size
68
-	// when the full image list is passed (e.g. when this is used by
69
-	// `docker load`). We will marshall the options via a pipe to the
70
-	// child
71
-	r, w, err := os.Pipe()
16
+	relDest, err := resolvePathInChroot(root, dest)
72 17
 	if err != nil {
73
-		return fmt.Errorf("Untar pipe failure: %v", err)
74
-	}
75
-
76
-	if root != "" {
77
-		relDest, err := filepath.Rel(root, dest)
78
-		if err != nil {
79
-			return err
80
-		}
81
-		if relDest == "." {
82
-			relDest = "/"
83
-		}
84
-		if relDest[0] != '/' {
85
-			relDest = "/" + relDest
86
-		}
87
-		dest = relDest
18
+		return err
88 19
 	}
89 20
 
90
-	cmd := reexec.Command("docker-untar", dest, root)
91
-	cmd.Stdin = decompressedArchive
92
-
93
-	cmd.ExtraFiles = append(cmd.ExtraFiles, r)
94
-	output := bytes.NewBuffer(nil)
95
-	cmd.Stdout = output
96
-	cmd.Stderr = output
97
-
98
-	// reexec.Command() sets cmd.SysProcAttr.Pdeathsig on Linux, which
99
-	// causes the started process to be signaled when the creating OS thread
100
-	// dies. Ensure that the reexec is not prematurely signaled. See
101
-	// https://go.dev/issue/27505 for more information.
102
-	runtime.LockOSThread()
103
-	defer runtime.UnlockOSThread()
104
-	if err := cmd.Start(); err != nil {
105
-		w.Close()
106
-		return fmt.Errorf("Untar error on re-exec cmd: %v", err)
107
-	}
108
-
109
-	// write the options to the pipe for the untar exec to read
110
-	if err := json.NewEncoder(w).Encode(options); err != nil {
111
-		w.Close()
112
-		return fmt.Errorf("Untar json encode to pipe failed: %v", err)
113
-	}
114
-	w.Close()
115
-
116
-	if err := cmd.Wait(); err != nil {
117
-		// when `xz -d -c -q | docker-untar ...` failed on docker-untar side,
118
-		// we need to exhaust `xz`'s output, otherwise the `xz` side will be
119
-		// pending on write pipe forever
120
-		io.Copy(io.Discard, decompressedArchive)
121
-
122
-		return fmt.Errorf("Error processing tar file(%v): %s", err, output)
123
-	}
124
-	return nil
125
-}
126
-
127
-func tar() {
128
-	runtime.LockOSThread()
129
-	flag.Parse()
130
-
131
-	src := flag.Arg(0)
132
-	var root string
133
-	if len(flag.Args()) > 1 {
134
-		root = flag.Arg(1)
135
-	}
136
-
137
-	if root == "" {
138
-		root = src
139
-	}
140
-
141
-	if err := realChroot(root); err != nil {
142
-		fatal(err)
143
-	}
144
-
145
-	var options archive.TarOptions
146
-	if err := json.NewDecoder(os.Stdin).Decode(&options); err != nil {
147
-		fatal(err)
148
-	}
149
-
150
-	rdr, err := archive.TarWithOptions(src, &options)
21
+	done := make(chan error)
22
+	err = goInChroot(root, func() { done <- archive.Unpack(decompressedArchive, relDest, options) })
151 23
 	if err != nil {
152
-		fatal(err)
153
-	}
154
-	defer rdr.Close()
155
-
156
-	if _, err := io.Copy(os.Stdout, rdr); err != nil {
157
-		fatal(err)
24
+		return err
158 25
 	}
159
-
160
-	os.Exit(0)
26
+	return <-done
161 27
 }
162 28
 
163 29
 func invokePack(srcPath string, options *archive.TarOptions, root string) (io.ReadCloser, error) {
164
-	if root == "" {
165
-		return nil, errors.New("root path must not be empty")
166
-	}
167
-
168
-	relSrc, err := filepath.Rel(root, srcPath)
30
+	relSrc, err := resolvePathInChroot(root, srcPath)
169 31
 	if err != nil {
170 32
 		return nil, err
171 33
 	}
172
-	if relSrc == "." {
173
-		relSrc = "/"
174
-	}
175
-	if relSrc[0] != '/' {
176
-		relSrc = "/" + relSrc
177
-	}
178 34
 
179
-	// make sure we didn't trim a trailing slash with the call to `Rel`
35
+	// make sure we didn't trim a trailing slash with the call to `resolvePathInChroot`
180 36
 	if strings.HasSuffix(srcPath, "/") && !strings.HasSuffix(relSrc, "/") {
181 37
 		relSrc += "/"
182 38
 	}
183 39
 
184
-	cmd := reexec.Command("docker-tar", relSrc, root)
185
-
186
-	errBuff := bytes.NewBuffer(nil)
187
-	cmd.Stderr = errBuff
188
-
189
-	tarR, tarW := io.Pipe()
190
-	cmd.Stdout = tarW
191
-
192
-	stdin, err := cmd.StdinPipe()
40
+	tb, err := archive.NewTarballer(relSrc, options)
193 41
 	if err != nil {
194
-		return nil, errors.Wrap(err, "error getting options pipe for tar process")
42
+		return nil, errors.Wrap(err, "error processing tar file")
195 43
 	}
196
-
197
-	started := make(chan error)
198
-	go func() {
199
-		// reexec.Command() sets cmd.SysProcAttr.Pdeathsig on Linux,
200
-		// which causes the started process to be signaled when the
201
-		// creating OS thread dies. Ensure that the subprocess is not
202
-		// prematurely signaled. See https://go.dev/issue/27505 for more
203
-		// information.
204
-		runtime.LockOSThread()
205
-		defer runtime.UnlockOSThread()
206
-		if err := cmd.Start(); err != nil {
207
-			started <- err
208
-			return
209
-		}
210
-		close(started)
211
-		err := cmd.Wait()
212
-		err = errors.Wrapf(err, "error processing tar file: %s", errBuff)
213
-		tarW.CloseWithError(err)
214
-	}()
215
-	if err := <-started; err != nil {
216
-		return nil, errors.Wrap(err, "tar error on re-exec cmd")
44
+	err = goInChroot(root, tb.Do)
45
+	if err != nil {
46
+		return nil, errors.Wrap(err, "could not chroot")
217 47
 	}
48
+	return tb.Reader(), nil
49
+}
218 50
 
219
-	if err := json.NewEncoder(stdin).Encode(options); err != nil {
220
-		stdin.Close()
221
-		return nil, errors.Wrap(err, "tar json encode to pipe failed")
51
+// resolvePathInChroot returns the equivalent to path inside a chroot rooted at root.
52
+// The returned path always begins with '/'.
53
+//
54
+//   - resolvePathInChroot("/a/b", "/a/b/c/d") -> "/c/d"
55
+//   - resolvePathInChroot("/a/b", "/a/b")     -> "/"
56
+//
57
+// The implementation is buggy, and some bugs may be load-bearing.
58
+// Here be dragons.
59
+func resolvePathInChroot(root, path string) (string, error) {
60
+	if root == "" {
61
+		return "", errors.New("root path must not be empty")
222 62
 	}
223
-	stdin.Close()
224
-
225
-	return tarR, nil
63
+	rel, err := filepath.Rel(root, path)
64
+	if err != nil {
65
+		return "", err
66
+	}
67
+	if rel == "." {
68
+		rel = "/"
69
+	}
70
+	if rel[0] != '/' {
71
+		rel = "/" + rel
72
+	}
73
+	return rel, nil
226 74
 }
... ...
@@ -7,11 +7,6 @@ import (
7 7
 	"github.com/docker/docker/pkg/longpath"
8 8
 )
9 9
 
10
-// chroot is not supported by Windows
11
-func chroot(path string) error {
12
-	return nil
13
-}
14
-
15 10
 func invokeUnpack(decompressedArchive io.ReadCloser,
16 11
 	dest string,
17 12
 	options *archive.TarOptions, root string) error {
... ...
@@ -1,113 +1,34 @@
1 1
 package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
2 2
 
3 3
 import (
4
-	"fmt"
5
-	"os"
6
-	"path/filepath"
7
-
8
-	"github.com/containerd/containerd/pkg/userns"
4
+	"github.com/docker/docker/internal/mounttree"
5
+	"github.com/docker/docker/internal/unshare"
9 6
 	"github.com/moby/sys/mount"
10
-	"github.com/moby/sys/mountinfo"
11 7
 	"golang.org/x/sys/unix"
12 8
 )
13 9
 
14
-// chroot on linux uses pivot_root instead of chroot
15
-// pivot_root takes a new root and an old root.
16
-// Old root must be a sub-dir of new root, it is where the current rootfs will reside after the call to pivot_root.
17
-// New root is where the new rootfs is set to.
18
-// Old root is removed after the call to pivot_root so it is no longer available under the new root.
19
-// This is similar to how libcontainer sets up a container's rootfs
20
-func chroot(path string) (err error) {
21
-	// if the engine is running in a user namespace we need to use actual chroot
22
-	if userns.RunningInUserNS() {
23
-		return realChroot(path)
24
-	}
25
-	if err := unix.Unshare(unix.CLONE_NEWNS); err != nil {
26
-		return fmt.Errorf("Error creating mount namespace before pivot: %v", err)
27
-	}
28
-
29
-	// Make everything in new ns slave.
30
-	// Don't use `private` here as this could race where the mountns gets a
31
-	//   reference to a mount and an unmount from the host does not propagate,
32
-	//   which could potentially cause transient errors for other operations,
33
-	//   even though this should be relatively small window here `slave` should
34
-	//   not cause any problems.
35
-	if err := mount.MakeRSlave("/"); err != nil {
36
-		return err
37
-	}
38
-
39
-	if mounted, _ := mountinfo.Mounted(path); !mounted {
40
-		if err := mount.Mount(path, path, "bind", "rbind,rw"); err != nil {
41
-			return realChroot(path)
42
-		}
43
-	}
44
-
45
-	// setup oldRoot for pivot_root
46
-	pivotDir, err := os.MkdirTemp(path, ".pivot_root")
47
-	if err != nil {
48
-		return fmt.Errorf("Error setting up pivot dir: %v", err)
49
-	}
50
-
51
-	var mounted bool
52
-	defer func() {
53
-		if mounted {
54
-			// make sure pivotDir is not mounted before we try to remove it
55
-			if errCleanup := unix.Unmount(pivotDir, unix.MNT_DETACH); errCleanup != nil {
56
-				if err == nil {
57
-					err = errCleanup
58
-				}
59
-				return
60
-			}
61
-		}
62
-
63
-		errCleanup := os.Remove(pivotDir)
64
-		// pivotDir doesn't exist if pivot_root failed and chroot+chdir was successful
65
-		// because we already cleaned it up on failed pivot_root
66
-		if errCleanup != nil && !os.IsNotExist(errCleanup) {
67
-			errCleanup = fmt.Errorf("Error cleaning up after pivot: %v", errCleanup)
68
-			if err == nil {
69
-				err = errCleanup
10
+// goInChroot starts fn in a goroutine where the root directory, current working
11
+// directory and umask are unshared from other goroutines and the root directory
12
+// has been changed to path. These changes are only visible to the goroutine in
13
+// which fn is executed. Any other goroutines, including ones started from fn,
14
+// will see the same root directory and file system attributes as the rest of
15
+// the process.
16
+func goInChroot(path string, fn func()) error {
17
+	return unshare.Go(
18
+		unix.CLONE_FS|unix.CLONE_NEWNS,
19
+		func() error {
20
+			// Make everything in new ns slave.
21
+			// Don't use `private` here as this could race where the mountns gets a
22
+			//   reference to a mount and an unmount from the host does not propagate,
23
+			//   which could potentially cause transient errors for other operations,
24
+			//   even though this should be relatively small window here `slave` should
25
+			//   not cause any problems.
26
+			if err := mount.MakeRSlave("/"); err != nil {
27
+				return err
70 28
 			}
71
-		}
72
-	}()
73
-
74
-	if err := unix.PivotRoot(path, pivotDir); err != nil {
75
-		// If pivot fails, fall back to the normal chroot after cleaning up temp dir
76
-		if err := os.Remove(pivotDir); err != nil {
77
-			return fmt.Errorf("Error cleaning up after failed pivot: %v", err)
78
-		}
79
-		return realChroot(path)
80
-	}
81
-	mounted = true
82
-
83
-	// This is the new path for where the old root (prior to the pivot) has been moved to
84
-	// This dir contains the rootfs of the caller, which we need to remove so it is not visible during extraction
85
-	pivotDir = filepath.Join("/", filepath.Base(pivotDir))
86
-
87
-	if err := unix.Chdir("/"); err != nil {
88
-		return fmt.Errorf("Error changing to new root: %v", err)
89
-	}
90
-
91
-	// Make the pivotDir (where the old root lives) private so it can be unmounted without propagating to the host
92
-	if err := unix.Mount("", pivotDir, "", unix.MS_PRIVATE|unix.MS_REC, ""); err != nil {
93
-		return fmt.Errorf("Error making old root private after pivot: %v", err)
94
-	}
95
-
96
-	// Now unmount the old root so it's no longer visible from the new root
97
-	if err := unix.Unmount(pivotDir, unix.MNT_DETACH); err != nil {
98
-		return fmt.Errorf("Error while unmounting old root after pivot: %v", err)
99
-	}
100
-	mounted = false
101
-
102
-	return nil
103
-}
104 29
 
105
-func realChroot(path string) error {
106
-	if err := unix.Chroot(path); err != nil {
107
-		return fmt.Errorf("Error after fallback to chroot: %v", err)
108
-	}
109
-	if err := unix.Chdir("/"); err != nil {
110
-		return fmt.Errorf("Error changing to new root after chroot: %v", err)
111
-	}
112
-	return nil
30
+			return mounttree.SwitchRoot(path)
31
+		},
32
+		fn,
33
+	)
113 34
 }
114 35
deleted file mode 100644
... ...
@@ -1,17 +0,0 @@
1
-//go:build !windows && !linux
2
-// +build !windows,!linux
3
-
4
-package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
5
-
6
-import "golang.org/x/sys/unix"
7
-
8
-func chroot(path string) error {
9
-	if err := unix.Chroot(path); err != nil {
10
-		return err
11
-	}
12
-	return unix.Chdir("/")
13
-}
14
-
15
-func realChroot(path string) error {
16
-	return chroot(path)
17
-}
... ...
@@ -4,78 +4,14 @@
4 4
 package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
5 5
 
6 6
 import (
7
-	"bytes"
8
-	"encoding/json"
9
-	"flag"
10
-	"fmt"
11 7
 	"io"
12
-	"os"
13 8
 	"path/filepath"
14
-	"runtime"
15 9
 
16 10
 	"github.com/containerd/containerd/pkg/userns"
17 11
 	"github.com/docker/docker/pkg/archive"
18
-	"github.com/docker/docker/pkg/reexec"
19 12
 	"golang.org/x/sys/unix"
20 13
 )
21 14
 
22
-type applyLayerResponse struct {
23
-	LayerSize int64 `json:"layerSize"`
24
-}
25
-
26
-// applyLayer is the entry-point for docker-applylayer on re-exec. This is not
27
-// used on Windows as it does not support chroot, hence no point sandboxing
28
-// through chroot and rexec.
29
-func applyLayer() {
30
-
31
-	var (
32
-		tmpDir  string
33
-		err     error
34
-		options *archive.TarOptions
35
-	)
36
-	runtime.LockOSThread()
37
-	flag.Parse()
38
-
39
-	inUserns := userns.RunningInUserNS()
40
-	if err := chroot(flag.Arg(0)); err != nil {
41
-		fatal(err)
42
-	}
43
-
44
-	// We need to be able to set any perms
45
-	oldmask := unix.Umask(0)
46
-	defer unix.Umask(oldmask)
47
-
48
-	if err := json.Unmarshal([]byte(os.Getenv("OPT")), &options); err != nil {
49
-		fatal(err)
50
-	}
51
-
52
-	if inUserns {
53
-		options.InUserNS = true
54
-	}
55
-
56
-	if tmpDir, err = os.MkdirTemp("/", "temp-docker-extract"); err != nil {
57
-		fatal(err)
58
-	}
59
-
60
-	os.Setenv("TMPDIR", tmpDir)
61
-	size, err := archive.UnpackLayer("/", os.Stdin, options)
62
-	os.RemoveAll(tmpDir)
63
-	if err != nil {
64
-		fatal(err)
65
-	}
66
-
67
-	encoder := json.NewEncoder(os.Stdout)
68
-	if err := encoder.Encode(applyLayerResponse{size}); err != nil {
69
-		fatal(fmt.Errorf("unable to encode layerSize JSON: %s", err))
70
-	}
71
-
72
-	if _, err := flush(os.Stdin); err != nil {
73
-		fatal(err)
74
-	}
75
-
76
-	os.Exit(0)
77
-}
78
-
79 15
 // applyLayerHandler parses a diff in the standard layer format from `layer`, and
80 16
 // applies it to the directory `dest`. Returns the size in bytes of the
81 17
 // contents of the layer.
... ...
@@ -92,42 +28,30 @@ func applyLayerHandler(dest string, layer io.Reader, options *archive.TarOptions
92 92
 	}
93 93
 	if options == nil {
94 94
 		options = &archive.TarOptions{}
95
-		if userns.RunningInUserNS() {
96
-			options.InUserNS = true
97
-		}
95
+	}
96
+	if userns.RunningInUserNS() {
97
+		options.InUserNS = true
98 98
 	}
99 99
 	if options.ExcludePatterns == nil {
100 100
 		options.ExcludePatterns = []string{}
101 101
 	}
102 102
 
103
-	data, err := json.Marshal(options)
104
-	if err != nil {
105
-		return 0, fmt.Errorf("ApplyLayer json encode: %v", err)
103
+	type result struct {
104
+		layerSize int64
105
+		err       error
106 106
 	}
107 107
 
108
-	cmd := reexec.Command("docker-applyLayer", dest)
109
-	cmd.Stdin = layer
110
-	cmd.Env = append(cmd.Env, fmt.Sprintf("OPT=%s", data))
111
-
112
-	outBuf, errBuf := new(bytes.Buffer), new(bytes.Buffer)
113
-	cmd.Stdout, cmd.Stderr = outBuf, errBuf
108
+	done := make(chan result)
109
+	err = goInChroot(dest, func() {
110
+		// We need to be able to set any perms
111
+		_ = unix.Umask(0)
114 112
 
115
-	// reexec.Command() sets cmd.SysProcAttr.Pdeathsig on Linux, which
116
-	// causes the started process to be signaled when the creating OS thread
117
-	// dies. Ensure that the reexec is not prematurely signaled. See
118
-	// https://go.dev/issue/27505 for more information.
119
-	runtime.LockOSThread()
120
-	defer runtime.UnlockOSThread()
121
-	if err = cmd.Run(); err != nil {
122
-		return 0, fmt.Errorf("ApplyLayer %s stdout: %s stderr: %s", err, outBuf, errBuf)
123
-	}
124
-
125
-	// Stdout should be a valid JSON struct representing an applyLayerResponse.
126
-	response := applyLayerResponse{}
127
-	decoder := json.NewDecoder(outBuf)
128
-	if err = decoder.Decode(&response); err != nil {
129
-		return 0, fmt.Errorf("unable to decode ApplyLayer JSON response: %s", err)
113
+		size, err := archive.UnpackLayer("/", layer, options)
114
+		done <- result{layerSize: size, err: err}
115
+	})
116
+	if err != nil {
117
+		return 0, err
130 118
 	}
131
-
132
-	return response.LayerSize, nil
119
+	res := <-done
120
+	return res.layerSize, res.err
133 121
 }
... ...
@@ -3,7 +3,6 @@ package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
3 3
 import (
4 4
 	"fmt"
5 5
 	"io"
6
-	"os"
7 6
 	"path/filepath"
8 7
 
9 8
 	"github.com/docker/docker/pkg/archive"
... ...
@@ -29,13 +28,7 @@ func applyLayerHandler(dest string, layer io.Reader, options *archive.TarOptions
29 29
 		layer = decompressed
30 30
 	}
31 31
 
32
-	tmpDir, err := os.MkdirTemp(os.Getenv("temp"), "temp-docker-extract")
33
-	if err != nil {
34
-		return 0, fmt.Errorf("ApplyLayer failed to create temp-docker-extract under %s. %s", dest, err)
35
-	}
36
-
37 32
 	s, err := archive.UnpackLayer(dest, layer, nil)
38
-	os.RemoveAll(tmpDir)
39 33
 	if err != nil {
40 34
 		return 0, fmt.Errorf("ApplyLayer %s failed UnpackLayer to %s: %s", layer, dest, err)
41 35
 	}
42 36
deleted file mode 100644
... ...
@@ -1,29 +0,0 @@
1
-//go:build !windows
2
-// +build !windows
3
-
4
-package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
5
-
6
-import (
7
-	"fmt"
8
-	"io"
9
-	"os"
10
-
11
-	"github.com/docker/docker/pkg/reexec"
12
-)
13
-
14
-func init() {
15
-	reexec.Register("docker-applyLayer", applyLayer)
16
-	reexec.Register("docker-untar", untar)
17
-	reexec.Register("docker-tar", tar)
18
-}
19
-
20
-func fatal(err error) {
21
-	fmt.Fprint(os.Stderr, err)
22
-	os.Exit(1)
23
-}
24
-
25
-// flush consumes all the bytes from the reader discarding
26
-// any errors
27
-func flush(r io.Reader) (bytes int64, err error) {
28
-	return io.Copy(io.Discard, r)
29
-}