Browse code

handle remote->local transfers with local hardlink/copy if possible

Reworked some of the hardlink / same file detection code to be a
little more general purpose. Now it can be used to detect duplicate
files on either remote or local side.

When transferring remote->local, if we already have a copy (same
md5sum) of a file locally that we would otherwise transfer, don't
transfer, but hardlink it. Should hardlink not be available (e.g. on
Windows), use shutil.copy2() instead. This lets us avoid the second
download completely.

_get_filelist_local() grew an initial list argument. This lets us
avoid copying / merging / updating a bunch of different lists back
into one - it starts as one list and grows. Much cleaner (and the
fact these were separate cost me several hours of debugging to track
down why something would get set, like the by_md5 hash, only to have
it be empty shortly thereafter).

Matt Domsch authored on 2012/06/19 01:36:06
Showing 3 changed files
... ...
@@ -137,7 +137,7 @@ def handle_exclude_include_walk(root, dirs, files):
137 137
             debug(u"PASS: %s" % (file))
138 138
 
139 139
 def fetch_local_list(args, recursive = None):
140
-    def _get_filelist_local(local_uri):
140
+    def _get_filelist_local(loc_list, local_uri):
141 141
         info(u"Compiling list of local files...")
142 142
         if local_uri.isdir():
143 143
             local_base = deunicodise(local_uri.basename())
... ...
@@ -149,7 +149,6 @@ def fetch_local_list(args, recursive = None):
149 149
             local_path = deunicodise(local_uri.dirname())
150 150
             filelist = [( local_path, [], [deunicodise(local_uri.basename())] )]
151 151
             single_file = True
152
-        loc_list = SortedDict(ignore_case = False)
153 152
         for root, dirs, files in filelist:
154 153
             rel_root = root.replace(local_path, local_base, 1)
155 154
             for f in files:
... ...
@@ -173,7 +172,6 @@ def fetch_local_list(args, recursive = None):
173 173
                     'full_name' : full_name,
174 174
                     'size' : sr.st_size,
175 175
                     'mtime' : sr.st_mtime,
176
-		    'nlink' : sr.st_nlink, # record hardlink information
177 176
 		    'dev'   : sr.st_dev,
178 177
 		    'inode' : sr.st_ino,
179 178
 		    'uid' : sr.st_uid,
... ...
@@ -181,7 +179,9 @@ def fetch_local_list(args, recursive = None):
181 181
 		    'sr': sr # save it all, may need it in preserve_attrs_list
182 182
                     ## TODO: Possibly more to save here...
183 183
                 }
184
-		loc_list.record_hardlink(relative_file, sr.st_dev, sr.st_ino)
184
+		if 'md5' in cfg.sync_checks:
185
+		    md5 = loc_list.get_md5(relative_file)
186
+		    loc_list.record_hardlink(relative_file, sr.st_dev, sr.st_ino, md5)
185 187
         return loc_list, single_file
186 188
 
187 189
     cfg = Config()
... ...
@@ -204,8 +204,7 @@ def fetch_local_list(args, recursive = None):
204 204
         local_uris.append(uri)
205 205
 
206 206
     for uri in local_uris:
207
-        list_for_uri, single_file = _get_filelist_local(uri)
208
-        local_list.update(list_for_uri)
207
+        list_for_uri, single_file = _get_filelist_local(local_list, uri)
209 208
 
210 209
     ## Single file is True if and only if the user
211 210
     ## specified one local URI and that URI represents
... ...
@@ -264,7 +263,6 @@ def fetch_remote_list(args, require_attribs = False, recursive = None):
264 264
                 'object_key' : object['Key'],
265 265
                 'object_uri_str' : object_uri_str,
266 266
                 'base_uri' : remote_uri,
267
-		'nlink' : 1, # S3 doesn't support hardlinks itself
268 267
 		'dev' : None,
269 268
 		'inode' : None,
270 269
             }
... ...
@@ -406,7 +404,7 @@ def compare_filelists(src_list, dst_list, src_remote, dst_remote, delay_updates
406 406
     debug("Comparing filelists (direction: %s -> %s)" % (__direction_str(src_remote), __direction_str(dst_remote)))
407 407
 
408 408
     for relative_file in src_list.keys():
409
-        debug(u"CHECK: %s: %s" % (relative_file, src_list.get_md5(relative_file)))
409
+        debug(u"CHECK: %s" % (relative_file))
410 410
 
411 411
         if dst_list.has_key(relative_file):
412 412
             ## Was --skip-existing requested?
... ...
@@ -416,7 +414,14 @@ def compare_filelists(src_list, dst_list, src_remote, dst_remote, delay_updates
416 416
 	        del(dst_list[relative_file])
417 417
 	        continue
418 418
 
419
-	    if _compare(src_list, dst_list, src_remote, dst_remote, relative_file):
419
+	    try:
420
+                compare_result = _compare(src_list, dst_list, src_remote, dst_remote, relative_file)
421
+	    except (IOError,OSError), e:
422
+	        del(src_list[relative_file])
423
+		del(dst_list[relative_file])
424
+                continue
425
+
426
+	    if compare_result:
420 427
 	        debug(u"IGNR: %s (transfer not needed)" % relative_file)
421 428
 	        del(src_list[relative_file])
422 429
 		del(dst_list[relative_file])
... ...
@@ -434,7 +439,6 @@ def compare_filelists(src_list, dst_list, src_remote, dst_remote, delay_updates
434 434
                 else:
435 435
 		    # record that we will get this file transferred to us (before all the copies), so if we come across it later again,
436 436
 		    # we can copy from _this_ copy (e.g. we only upload it once, and copy thereafter).
437
-                    debug(u"REMOTE COPY src before")
438 437
 		    dst_list.record_md5(relative_file, md5) 
439 438
 		    update_list[relative_file] = src_list[relative_file]
440 439
 		    del src_list[relative_file]
... ...
@@ -448,7 +452,6 @@ def compare_filelists(src_list, dst_list, src_remote, dst_remote, delay_updates
448 448
 	    if dst1 is not None:
449 449
                 # Found one, we want to copy
450 450
                 debug(u"REMOTE COPY dst: %s -> %s" % (dst1, relative_file))
451
-		# FIXME this blows up when dst1 is not in dst_list, because we added it below in record_md5 but it's not really in dst_list.
452 451
 		copy_pairs.append((dst1, relative_file))
453 452
 		del(src_list[relative_file])
454 453
 	    else:
... ...
@@ -456,7 +459,6 @@ def compare_filelists(src_list, dst_list, src_remote, dst_remote, delay_updates
456 456
 	        # record that we will get this file transferred to us (before all the copies), so if we come across it later again,
457 457
 	        # we can copy from _this_ copy (e.g. we only upload it once, and copy thereafter).
458 458
 	        dst_list.record_md5(relative_file, md5) 
459
-		debug(u"REMOTE COPY dst before")
460 459
 
461 460
     for f in dst_list.keys():
462 461
         if not src_list.has_key(f) and not update_list.has_key(f):
... ...
@@ -27,7 +27,7 @@ class SortedDict(dict):
27 27
         """
28 28
         dict.__init__(self, mapping, **kwargs)
29 29
         self.ignore_case = ignore_case
30
-        self.hardlinks = dict()
30
+        self.hardlinks = dict() # { dev: { inode : {'md5':, 'relative_files':}}}
31 31
         self.by_md5 = dict() # {md5: set(relative_files)}
32 32
 
33 33
     def keys(self):
... ...
@@ -60,39 +60,24 @@ class SortedDict(dict):
60 60
         except:
61 61
             return None
62 62
             
63
-
64 63
     def get_md5(self, relative_file):
65 64
         md5 = None
66 65
         if 'md5' in self[relative_file]:
67 66
             return self[relative_file]['md5']
68
-	if self.is_hardlinked(relative_file): # speedup by getting it from one of the hardlinks already processed
69
-            md5 = self.get_hardlink_md5(relative_file)
70
-	    if md5 is None:
71
-                md5 = Utils.hash_file_md5(self[relative_file]['full_name'])
72
-		self.record_md5(relative_file, md5)
73
-	        self.set_hardlink_md5(relative_file, md5)
74
-	else:
75
-                md5 = Utils.hash_file_md5(self[relative_file]['full_name'])
76
-	        self[relative_file]['md5'] = md5
77
-		l.record_md5(relative_file, md5)
67
+        md5 = self.get_hardlink_md5(relative_file)
68
+        if md5 is None:
69
+            md5 = Utils.hash_file_md5(self[relative_file]['full_name'])
70
+        self.record_md5(relative_file, md5)
71
+        self[relative_file]['md5'] = md5
78 72
 	return md5
79 73
 
80
-    def record_hardlink(self, relative_file, dev, inode):
74
+    def record_hardlink(self, relative_file, dev, inode, md5):
81 75
         if dev not in self.hardlinks:
82 76
             self.hardlinks[dev] = dict()
83 77
         if inode not in self.hardlinks[dev]:
84
-            self.hardlinks[dev][inode] = dict(md5=None, relative_files=set())
78
+            self.hardlinks[dev][inode] = dict(md5=md5, relative_files=set())
85 79
         self.hardlinks[dev][inode]['relative_files'].add(relative_file)
86 80
 
87
-    def set_hardlink_md5(self, relative_file, md5):
88
-        dev = self[relative_file]['dev']
89
-        inode = self[relative_file]['inode']
90
-        self.record_hardlink(relative_file, dev, inode)
91
-        self.hardlinks[dev][inode]['md5'] = md5
92
-
93
-    def is_hardlinked(self, relative_file):
94
-        return self[relative_file]['nlink'] > 1
95
-
96 81
     def get_hardlink_md5(self, relative_file):
97 82
         md5 = None
98 83
         dev = self[relative_file]['dev']
... ...
@@ -23,6 +23,7 @@ import locale
23 23
 import subprocess
24 24
 import htmlentitydefs
25 25
 import socket
26
+import shutil
26 27
 
27 28
 from copy import copy
28 29
 from optparse import OptionParser, Option, OptionValueError, IndentedHelpFormatter
... ...
@@ -700,8 +701,9 @@ def cmd_sync_remote2local(args):
700 700
     local_count = len(local_list)
701 701
     remote_count = len(remote_list)
702 702
     update_count = len(update_list)
703
+    copy_pairs_count = len(copy_pairs)
703 704
 
704
-    info(u"Summary: %d remote files to download, %d local files to delete" % (remote_count + update_count, local_count))
705
+    info(u"Summary: %d remote files to download, %d local files to delete, %d local files to hardlink" % (remote_count + update_count, local_count, copy_pairs_count))
705 706
 
706 707
     def _set_local_filename(remote_list, destination_base):
707 708
         if not os.path.isdir(destination_base):
... ...
@@ -826,6 +828,7 @@ def cmd_sync_remote2local(args):
826 826
     seq = 0
827 827
     seq, total_size = _download(remote_list, seq, remote_count + update_count, total_size, dir_cache)
828 828
     seq, total_size = _download(update_list, seq, remote_count + update_count, total_size, dir_cache)
829
+    local_hardlink(copy_pairs, destination_base)
829 830
     
830 831
     total_elapsed = time.time() - timestamp_start
831 832
     speed_fmt = formatSize(total_size/total_elapsed, human_readable = True, floating_point = True)
... ...
@@ -841,6 +844,15 @@ def cmd_sync_remote2local(args):
841 841
     if cfg.delete_removed and cfg.delete_after:
842 842
         _do_deletes(local_list)
843 843
 
844
+def local_hardlink(copy_pairs, destination_base):
845
+    for (dst1, dst2) in copy_pairs:
846
+        try:
847
+            os.link(destination_base + dst1, destination_base + dst2)
848
+            debug(u"Hardlinking %s to %s" % (destination_base + dst1, destination_base + dst2))
849
+        except:
850
+            shutil.copy2(destination_base + dst1, destination_base + dst2)
851
+            debug(u"Hardlinking unavailable, copying %s to %s" % (destination_base + dst1, destination_base + dst2))
852
+
844 853
 def remote_copy(s3, copy_pairs, destination_base):
845 854
     saved_bytes = 0
846 855
     for (dst1, dst2) in copy_pairs: