Reworked some of the hardlink / same file detection code to be a
little more general purpose. Now it can be used to detect duplicate
files on either remote or local side.
When transferring remote->local, if we already have a local copy
(same md5sum) of a file that we would otherwise transfer, don't
transfer it; hardlink to the existing copy instead. Should hardlinks
not be available (e.g. on Windows), use shutil.copy2() instead. This
lets us avoid the second download completely.
_get_filelist_local() grew an initial list argument. This lets us
avoid copying / merging / updating a bunch of different lists back
into one - it starts as one list and grows. Much cleaner (and the
fact these were separate cost me several hours of debugging to track
down why something would get set, like the by_md5 hash, only to have
it be empty shortly thereafter).
... | ... |
@@ -137,7 +137,7 @@ def handle_exclude_include_walk(root, dirs, files): |
137 | 137 |
debug(u"PASS: %s" % (file)) |
138 | 138 |
|
139 | 139 |
def fetch_local_list(args, recursive = None): |
140 |
- def _get_filelist_local(local_uri): |
|
140 |
+ def _get_filelist_local(loc_list, local_uri): |
|
141 | 141 |
info(u"Compiling list of local files...") |
142 | 142 |
if local_uri.isdir(): |
143 | 143 |
local_base = deunicodise(local_uri.basename()) |
... | ... |
@@ -149,7 +149,6 @@ def fetch_local_list(args, recursive = None): |
149 | 149 |
local_path = deunicodise(local_uri.dirname()) |
150 | 150 |
filelist = [( local_path, [], [deunicodise(local_uri.basename())] )] |
151 | 151 |
single_file = True |
152 |
- loc_list = SortedDict(ignore_case = False) |
|
153 | 152 |
for root, dirs, files in filelist: |
154 | 153 |
rel_root = root.replace(local_path, local_base, 1) |
155 | 154 |
for f in files: |
... | ... |
@@ -173,7 +172,6 @@ def fetch_local_list(args, recursive = None): |
173 | 173 |
'full_name' : full_name, |
174 | 174 |
'size' : sr.st_size, |
175 | 175 |
'mtime' : sr.st_mtime, |
176 |
- 'nlink' : sr.st_nlink, # record hardlink information |
|
177 | 176 |
'dev' : sr.st_dev, |
178 | 177 |
'inode' : sr.st_ino, |
179 | 178 |
'uid' : sr.st_uid, |
... | ... |
@@ -181,7 +179,9 @@ def fetch_local_list(args, recursive = None): |
181 | 181 |
'sr': sr # save it all, may need it in preserve_attrs_list |
182 | 182 |
## TODO: Possibly more to save here... |
183 | 183 |
} |
184 |
- loc_list.record_hardlink(relative_file, sr.st_dev, sr.st_ino) |
|
184 |
+ if 'md5' in cfg.sync_checks: |
|
185 |
+ md5 = loc_list.get_md5(relative_file) |
|
186 |
+ loc_list.record_hardlink(relative_file, sr.st_dev, sr.st_ino, md5) |
|
185 | 187 |
return loc_list, single_file |
186 | 188 |
|
187 | 189 |
cfg = Config() |
... | ... |
@@ -204,8 +204,7 @@ def fetch_local_list(args, recursive = None): |
204 | 204 |
local_uris.append(uri) |
205 | 205 |
|
206 | 206 |
for uri in local_uris: |
207 |
- list_for_uri, single_file = _get_filelist_local(uri) |
|
208 |
- local_list.update(list_for_uri) |
|
207 |
+ list_for_uri, single_file = _get_filelist_local(local_list, uri) |
|
209 | 208 |
|
210 | 209 |
## Single file is True if and only if the user |
211 | 210 |
## specified one local URI and that URI represents |
... | ... |
@@ -264,7 +263,6 @@ def fetch_remote_list(args, require_attribs = False, recursive = None): |
264 | 264 |
'object_key' : object['Key'], |
265 | 265 |
'object_uri_str' : object_uri_str, |
266 | 266 |
'base_uri' : remote_uri, |
267 |
- 'nlink' : 1, # S3 doesn't support hardlinks itself |
|
268 | 267 |
'dev' : None, |
269 | 268 |
'inode' : None, |
270 | 269 |
} |
... | ... |
@@ -406,7 +404,7 @@ def compare_filelists(src_list, dst_list, src_remote, dst_remote, delay_updates |
406 | 406 |
debug("Comparing filelists (direction: %s -> %s)" % (__direction_str(src_remote), __direction_str(dst_remote))) |
407 | 407 |
|
408 | 408 |
for relative_file in src_list.keys(): |
409 |
- debug(u"CHECK: %s: %s" % (relative_file, src_list.get_md5(relative_file))) |
|
409 |
+ debug(u"CHECK: %s" % (relative_file)) |
|
410 | 410 |
|
411 | 411 |
if dst_list.has_key(relative_file): |
412 | 412 |
## Was --skip-existing requested? |
... | ... |
@@ -416,7 +414,14 @@ def compare_filelists(src_list, dst_list, src_remote, dst_remote, delay_updates |
416 | 416 |
del(dst_list[relative_file]) |
417 | 417 |
continue |
418 | 418 |
|
419 |
- if _compare(src_list, dst_list, src_remote, dst_remote, relative_file): |
|
419 |
+ try: |
|
420 |
+ compare_result = _compare(src_list, dst_list, src_remote, dst_remote, relative_file) |
|
421 |
+ except (IOError,OSError), e: |
|
422 |
+ del(src_list[relative_file]) |
|
423 |
+ del(dst_list[relative_file]) |
|
424 |
+ continue |
|
425 |
+ |
|
426 |
+ if compare_result: |
|
420 | 427 |
debug(u"IGNR: %s (transfer not needed)" % relative_file) |
421 | 428 |
del(src_list[relative_file]) |
422 | 429 |
del(dst_list[relative_file]) |
... | ... |
@@ -434,7 +439,6 @@ def compare_filelists(src_list, dst_list, src_remote, dst_remote, delay_updates |
434 | 434 |
else: |
435 | 435 |
# record that we will get this file transferred to us (before all the copies), so if we come across it later again, |
436 | 436 |
# we can copy from _this_ copy (e.g. we only upload it once, and copy thereafter). |
437 |
- debug(u"REMOTE COPY src before") |
|
438 | 437 |
dst_list.record_md5(relative_file, md5) |
439 | 438 |
update_list[relative_file] = src_list[relative_file] |
440 | 439 |
del src_list[relative_file] |
... | ... |
@@ -448,7 +452,6 @@ def compare_filelists(src_list, dst_list, src_remote, dst_remote, delay_updates |
448 | 448 |
if dst1 is not None: |
449 | 449 |
# Found one, we want to copy |
450 | 450 |
debug(u"REMOTE COPY dst: %s -> %s" % (dst1, relative_file)) |
451 |
- # FIXME this blows up when dst1 is not in dst_list, because we added it below in record_md5 but it's not really in dst_list. |
|
452 | 451 |
copy_pairs.append((dst1, relative_file)) |
453 | 452 |
del(src_list[relative_file]) |
454 | 453 |
else: |
... | ... |
@@ -456,7 +459,6 @@ def compare_filelists(src_list, dst_list, src_remote, dst_remote, delay_updates |
456 | 456 |
# record that we will get this file transferred to us (before all the copies), so if we come across it later again, |
457 | 457 |
# we can copy from _this_ copy (e.g. we only upload it once, and copy thereafter). |
458 | 458 |
dst_list.record_md5(relative_file, md5) |
459 |
- debug(u"REMOTE COPY dst before") |
|
460 | 459 |
|
461 | 460 |
for f in dst_list.keys(): |
462 | 461 |
if not src_list.has_key(f) and not update_list.has_key(f): |
... | ... |
@@ -27,7 +27,7 @@ class SortedDict(dict): |
27 | 27 |
""" |
28 | 28 |
dict.__init__(self, mapping, **kwargs) |
29 | 29 |
self.ignore_case = ignore_case |
30 |
- self.hardlinks = dict() |
|
30 |
+ self.hardlinks = dict() # { dev: { inode : {'md5':, 'relative_files':}}} |
|
31 | 31 |
self.by_md5 = dict() # {md5: set(relative_files)} |
32 | 32 |
|
33 | 33 |
def keys(self): |
... | ... |
@@ -60,39 +60,24 @@ class SortedDict(dict): |
60 | 60 |
except: |
61 | 61 |
return None |
62 | 62 |
|
63 |
- |
|
64 | 63 |
def get_md5(self, relative_file): |
65 | 64 |
md5 = None |
66 | 65 |
if 'md5' in self[relative_file]: |
67 | 66 |
return self[relative_file]['md5'] |
68 |
- if self.is_hardlinked(relative_file): # speedup by getting it from one of the hardlinks already processed |
|
69 |
- md5 = self.get_hardlink_md5(relative_file) |
|
70 |
- if md5 is None: |
|
71 |
- md5 = Utils.hash_file_md5(self[relative_file]['full_name']) |
|
72 |
- self.record_md5(relative_file, md5) |
|
73 |
- self.set_hardlink_md5(relative_file, md5) |
|
74 |
- else: |
|
75 |
- md5 = Utils.hash_file_md5(self[relative_file]['full_name']) |
|
76 |
- self[relative_file]['md5'] = md5 |
|
77 |
- l.record_md5(relative_file, md5) |
|
67 |
+ md5 = self.get_hardlink_md5(relative_file) |
|
68 |
+ if md5 is None: |
|
69 |
+ md5 = Utils.hash_file_md5(self[relative_file]['full_name']) |
|
70 |
+ self.record_md5(relative_file, md5) |
|
71 |
+ self[relative_file]['md5'] = md5 |
|
78 | 72 |
return md5 |
79 | 73 |
|
80 |
- def record_hardlink(self, relative_file, dev, inode): |
|
74 |
+ def record_hardlink(self, relative_file, dev, inode, md5): |
|
81 | 75 |
if dev not in self.hardlinks: |
82 | 76 |
self.hardlinks[dev] = dict() |
83 | 77 |
if inode not in self.hardlinks[dev]: |
84 |
- self.hardlinks[dev][inode] = dict(md5=None, relative_files=set()) |
|
78 |
+ self.hardlinks[dev][inode] = dict(md5=md5, relative_files=set()) |
|
85 | 79 |
self.hardlinks[dev][inode]['relative_files'].add(relative_file) |
86 | 80 |
|
87 |
- def set_hardlink_md5(self, relative_file, md5): |
|
88 |
- dev = self[relative_file]['dev'] |
|
89 |
- inode = self[relative_file]['inode'] |
|
90 |
- self.record_hardlink(relative_file, dev, inode) |
|
91 |
- self.hardlinks[dev][inode]['md5'] = md5 |
|
92 |
- |
|
93 |
- def is_hardlinked(self, relative_file): |
|
94 |
- return self[relative_file]['nlink'] > 1 |
|
95 |
- |
|
96 | 81 |
def get_hardlink_md5(self, relative_file): |
97 | 82 |
md5 = None |
98 | 83 |
dev = self[relative_file]['dev'] |
... | ... |
@@ -23,6 +23,7 @@ import locale |
23 | 23 |
import subprocess |
24 | 24 |
import htmlentitydefs |
25 | 25 |
import socket |
26 |
+import shutil |
|
26 | 27 |
|
27 | 28 |
from copy import copy |
28 | 29 |
from optparse import OptionParser, Option, OptionValueError, IndentedHelpFormatter |
... | ... |
@@ -700,8 +701,9 @@ def cmd_sync_remote2local(args): |
700 | 700 |
local_count = len(local_list) |
701 | 701 |
remote_count = len(remote_list) |
702 | 702 |
update_count = len(update_list) |
703 |
+ copy_pairs_count = len(copy_pairs) |
|
703 | 704 |
|
704 |
- info(u"Summary: %d remote files to download, %d local files to delete" % (remote_count + update_count, local_count)) |
|
705 |
+ info(u"Summary: %d remote files to download, %d local files to delete, %d local files to hardlink" % (remote_count + update_count, local_count, copy_pairs_count)) |
|
705 | 706 |
|
706 | 707 |
def _set_local_filename(remote_list, destination_base): |
707 | 708 |
if not os.path.isdir(destination_base): |
... | ... |
@@ -826,6 +828,7 @@ def cmd_sync_remote2local(args): |
826 | 826 |
seq = 0 |
827 | 827 |
seq, total_size = _download(remote_list, seq, remote_count + update_count, total_size, dir_cache) |
828 | 828 |
seq, total_size = _download(update_list, seq, remote_count + update_count, total_size, dir_cache) |
829 |
+ local_hardlink(copy_pairs, destination_base) |
|
829 | 830 |
|
830 | 831 |
total_elapsed = time.time() - timestamp_start |
831 | 832 |
speed_fmt = formatSize(total_size/total_elapsed, human_readable = True, floating_point = True) |
... | ... |
@@ -841,6 +844,15 @@ def cmd_sync_remote2local(args): |
841 | 841 |
if cfg.delete_removed and cfg.delete_after: |
842 | 842 |
_do_deletes(local_list) |
843 | 843 |
|
844 |
+def local_hardlink(copy_pairs, destination_base): |
|
845 |
+ for (dst1, dst2) in copy_pairs: |
|
846 |
+ try: |
|
847 |
+ os.link(destination_base + dst1, destination_base + dst2) |
|
848 |
+ debug(u"Hardlinking %s to %s" % (destination_base + dst1, destination_base + dst2)) |
|
849 |
+ except: |
|
850 |
+ shutil.copy2(destination_base + dst1, destination_base + dst2) |
|
851 |
+ debug(u"Hardlinking unavailable, copying %s to %s" % (destination_base + dst1, destination_base + dst2)) |
|
852 |
+ |
|
844 | 853 |
def remote_copy(s3, copy_pairs, destination_base): |
845 | 854 |
saved_bytes = 0 |
846 | 855 |
for (dst1, dst2) in copy_pairs: |