This changeset allows Docker's VFS and Overlay to take advantage of
Linux's zero-copy APIs.
The copy function first tries to use the FICLONE ioctl, for two reasons:
- clones do not allow partial success (i.e. short writes)
- clones are expected to be a fast metadata operation
See: http://oss.sgi.com/archives/xfs/2015-12/msg00356.html
If the clone fails, we fall back to copy_file_range, which internally
may fall back to splice, which has an upper limit on the size
of copy it can perform. Given that, we have to loop until the copy
is done.
For a given DirCopy operation, once the clone fails, we do not try
it again for any subsequent file copy. The same is true of copy_file_range.
If all else fails, we fall back to traditional copy.
Signed-off-by: Sargun Dhillon <sargun@sargun.me>
| ... | ... |
@@ -2,8 +2,17 @@ |
| 2 | 2 |
|
| 3 | 3 |
package copy |
| 4 | 4 |
|
| 5 |
+/* |
|
| 6 |
+#include <linux/fs.h> |
|
| 7 |
+ |
|
| 8 |
+#ifndef FICLONE |
|
| 9 |
+#define FICLONE _IOW(0x94, 9, int) |
|
| 10 |
+#endif |
|
| 11 |
+*/ |
|
| 12 |
+import "C" |
|
| 5 | 13 |
import ( |
| 6 | 14 |
"fmt" |
| 15 |
+ "io" |
|
| 7 | 16 |
"os" |
| 8 | 17 |
"path/filepath" |
| 9 | 18 |
"syscall" |
| ... | ... |
@@ -15,6 +24,7 @@ import ( |
| 15 | 15 |
"golang.org/x/sys/unix" |
| 16 | 16 |
) |
| 17 | 17 |
|
| 18 |
+// Mode indicates whether to use hardlink or copy content |
|
| 18 | 19 |
type Mode int |
| 19 | 20 |
|
| 20 | 21 |
const ( |
| ... | ... |
@@ -24,20 +34,61 @@ const ( |
| 24 | 24 |
Hardlink |
| 25 | 25 |
) |
| 26 | 26 |
|
| 27 |
-func copyRegular(srcPath, dstPath string, mode os.FileMode) error {
|
|
| 27 |
+func copyRegular(srcPath, dstPath string, fileinfo os.FileInfo, copyWithFileRange, copyWithFileClone *bool) error {
|
|
| 28 | 28 |
srcFile, err := os.Open(srcPath) |
| 29 | 29 |
if err != nil {
|
| 30 | 30 |
return err |
| 31 | 31 |
} |
| 32 | 32 |
defer srcFile.Close() |
| 33 | 33 |
|
| 34 |
- dstFile, err := os.OpenFile(dstPath, os.O_WRONLY|os.O_CREATE, mode) |
|
| 34 |
+ // If the destination file already exists, we shouldn't blow it away |
|
| 35 |
+ dstFile, err := os.OpenFile(dstPath, os.O_WRONLY|os.O_CREATE|os.O_EXCL, fileinfo.Mode()) |
|
| 35 | 36 |
if err != nil {
|
| 36 | 37 |
return err |
| 37 | 38 |
} |
| 38 | 39 |
defer dstFile.Close() |
| 39 | 40 |
|
| 40 |
- _, err = pools.Copy(dstFile, srcFile) |
|
| 41 |
+ if *copyWithFileClone {
|
|
| 42 |
+ _, _, err = unix.Syscall(unix.SYS_IOCTL, dstFile.Fd(), C.FICLONE, srcFile.Fd()) |
|
| 43 |
+ if err == nil {
|
|
| 44 |
+ return nil |
|
| 45 |
+ } |
|
| 46 |
+ |
|
| 47 |
+ *copyWithFileClone = false |
|
| 48 |
+ if err == unix.EXDEV {
|
|
| 49 |
+ *copyWithFileRange = false |
|
| 50 |
+ } |
|
| 51 |
+ } |
|
| 52 |
+ if *copyWithFileRange {
|
|
| 53 |
+ err = doCopyWithFileRange(srcFile, dstFile, fileinfo) |
|
| 54 |
+ // Trying the file_clone may not have caught the exdev case |
|
| 55 |
+ // as the ioctl may not have been available (therefore EINVAL) |
|
| 56 |
+ if err == unix.EXDEV || err == unix.ENOSYS {
|
|
| 57 |
+ *copyWithFileRange = false |
|
| 58 |
+ } else if err != nil {
|
|
| 59 |
+ return err |
|
| 60 |
+ } |
|
| 61 |
+ } |
|
| 62 |
+ return legacyCopy(srcFile, dstFile) |
|
| 63 |
+} |
|
| 64 |
+ |
|
| 65 |
+func doCopyWithFileRange(srcFile, dstFile *os.File, fileinfo os.FileInfo) error {
|
|
| 66 |
+ amountLeftToCopy := fileinfo.Size() |
|
| 67 |
+ |
|
| 68 |
+ for amountLeftToCopy > 0 {
|
|
| 69 |
+ n, err := unix.CopyFileRange(int(srcFile.Fd()), nil, int(dstFile.Fd()), nil, int(amountLeftToCopy), 0) |
|
| 70 |
+ if err != nil {
|
|
| 71 |
+ return err |
|
| 72 |
+ } |
|
| 73 |
+ |
|
| 74 |
+ amountLeftToCopy = amountLeftToCopy - int64(n) |
|
| 75 |
+ } |
|
| 76 |
+ |
|
| 77 |
+ return nil |
|
| 78 |
+} |
|
| 79 |
+ |
|
| 80 |
+func legacyCopy(srcFile io.Reader, dstFile io.Writer) error {
|
|
| 81 |
+ _, err := pools.Copy(dstFile, srcFile) |
|
| 41 | 82 |
|
| 42 | 83 |
return err |
| 43 | 84 |
} |
| ... | ... |
@@ -58,6 +109,8 @@ func copyXattr(srcPath, dstPath, attr string) error {
|
| 58 | 58 |
// DirCopy copies or hardlinks the contents of one directory to another, |
| 59 | 59 |
// properly handling xattrs, and soft links |
| 60 | 60 |
func DirCopy(srcDir, dstDir string, copyMode Mode) error {
|
| 61 |
+ copyWithFileRange := true |
|
| 62 |
+ copyWithFileClone := true |
|
| 61 | 63 |
err := filepath.Walk(srcDir, func(srcPath string, f os.FileInfo, err error) error {
|
| 62 | 64 |
if err != nil {
|
| 63 | 65 |
return err |
| ... | ... |
@@ -85,13 +138,12 @@ func DirCopy(srcDir, dstDir string, copyMode Mode) error {
|
| 85 | 85 |
case 0: // Regular file |
| 86 | 86 |
if copyMode == Hardlink {
|
| 87 | 87 |
isHardlink = true |
| 88 |
- if err := os.Link(srcPath, dstPath); err != nil {
|
|
| 89 |
- return err |
|
| 88 |
+ if err2 := os.Link(srcPath, dstPath); err2 != nil {
|
|
| 89 |
+ return err2 |
|
| 90 | 90 |
} |
| 91 | 91 |
} else {
|
| 92 |
- // Always fall back to Content copymode |
|
| 93 |
- if err := copyRegular(srcPath, dstPath, f.Mode()); err != nil {
|
|
| 94 |
- return err |
|
| 92 |
+ if err2 := copyRegular(srcPath, dstPath, f, ©WithFileRange, ©WithFileClone); err2 != nil {
|
|
| 93 |
+ return err2 |
|
| 95 | 94 |
} |
| 96 | 95 |
} |
| 97 | 96 |
|
| 98 | 97 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,67 @@ |
| 0 |
+// +build linux |
|
| 1 |
+ |
|
| 2 |
+package copy |
|
| 3 |
+ |
|
| 4 |
+import ( |
|
| 5 |
+ "io/ioutil" |
|
| 6 |
+ "math/rand" |
|
| 7 |
+ "os" |
|
| 8 |
+ "path/filepath" |
|
| 9 |
+ "testing" |
|
| 10 |
+ |
|
| 11 |
+ "github.com/docker/docker/pkg/parsers/kernel" |
|
| 12 |
+ "github.com/stretchr/testify/assert" |
|
| 13 |
+ "github.com/stretchr/testify/require" |
|
| 14 |
+) |
|
| 15 |
+ |
|
| 16 |
+func TestIsCopyFileRangeSyscallAvailable(t *testing.T) {
|
|
| 17 |
+ // Verifies: |
|
| 18 |
+ // 1. That copyFileRangeEnabled is being set to true when copy_file_range syscall is available |
|
| 19 |
+ // 2. That isCopyFileRangeSyscallAvailable() works on "new" kernels |
|
| 20 |
+ v, err := kernel.GetKernelVersion() |
|
| 21 |
+ require.NoError(t, err) |
|
| 22 |
+ |
|
| 23 |
+ copyWithFileRange := true |
|
| 24 |
+ copyWithFileClone := false |
|
| 25 |
+ doCopyTest(t, ©WithFileRange, ©WithFileClone) |
|
| 26 |
+ |
|
| 27 |
+ if kernel.CompareKernelVersion(*v, kernel.VersionInfo{Kernel: 4, Major: 5, Minor: 0}) < 0 {
|
|
| 28 |
+ assert.False(t, copyWithFileRange) |
|
| 29 |
+ } else {
|
|
| 30 |
+ assert.True(t, copyWithFileRange) |
|
| 31 |
+ } |
|
| 32 |
+ |
|
| 33 |
+} |
|
| 34 |
+ |
|
| 35 |
+func TestCopy(t *testing.T) {
|
|
| 36 |
+ copyWithFileRange := true |
|
| 37 |
+ copyWithFileClone := true |
|
| 38 |
+ doCopyTest(t, ©WithFileRange, ©WithFileClone) |
|
| 39 |
+} |
|
| 40 |
+ |
|
| 41 |
+func TestCopyWithoutRange(t *testing.T) {
|
|
| 42 |
+ copyWithFileRange := false |
|
| 43 |
+ copyWithFileClone := false |
|
| 44 |
+ doCopyTest(t, ©WithFileRange, ©WithFileClone) |
|
| 45 |
+} |
|
| 46 |
+ |
|
| 47 |
+func doCopyTest(t *testing.T, copyWithFileRange, copyWithFileClone *bool) {
|
|
| 48 |
+ dir, err := ioutil.TempDir("", "docker-copy-check")
|
|
| 49 |
+ require.NoError(t, err) |
|
| 50 |
+ defer os.RemoveAll(dir) |
|
| 51 |
+ srcFilename := filepath.Join(dir, "srcFilename") |
|
| 52 |
+ dstFilename := filepath.Join(dir, "dstilename") |
|
| 53 |
+ |
|
| 54 |
+ r := rand.New(rand.NewSource(0)) |
|
| 55 |
+ buf := make([]byte, 1024) |
|
| 56 |
+ _, err = r.Read(buf) |
|
| 57 |
+ require.NoError(t, err) |
|
| 58 |
+ require.NoError(t, ioutil.WriteFile(srcFilename, buf, 0777)) |
|
| 59 |
+ fileinfo, err := os.Stat(srcFilename) |
|
| 60 |
+ require.NoError(t, err) |
|
| 61 |
+ |
|
| 62 |
+ require.NoError(t, copyRegular(srcFilename, dstFilename, fileinfo, copyWithFileRange, copyWithFileClone)) |
|
| 63 |
+ readBuf, err := ioutil.ReadFile(dstFilename) |
|
| 64 |
+ require.NoError(t, err) |
|
| 65 |
+ assert.Equal(t, buf, readBuf) |
|
| 66 |
+} |