Handle hardlinks and duplicate files

Minimize uploads in sync local->remote by looking for an existing
identical file elsewhere in the remote destination and issuing an S3
COPY command instead of uploading the file again.
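
In outline, the per-file decision works like the sketch below; this is only an
illustrative summary with made-up names (plan_transfer, remote_index), not the
actual compare_filelists() code shown in the diff:

    # Hypothetical sketch of the duplicate-detection step for one local file.
    def plan_transfer(local_md5, remote_index, copy_pairs, upload_list, relative_file):
        """remote_index maps md5 -> set of keys already present in the destination."""
        duplicate = next(iter(remote_index.get(local_md5, [])), None)
        if duplicate is not None:
            # Same content already exists remotely: schedule a server-side COPY.
            copy_pairs.append((duplicate, relative_file))
        else:
            # No identical object yet: upload once, later files can copy from it.
            remote_index.setdefault(local_md5, set()).add(relative_file)
            upload_list.append(relative_file)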

We now store the (locally generated) md5 of the file in the
x-amz-meta-s3cmd-attrs metadata, because we can't count on the ETag
being correct due to multipart uploads. Use this value if it's
available.
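
The metadata travels in a single header whose key:val pairs are joined with
"/", matching the parse_attrs_header() helper added below; the round trip here
is only a sketch with sample values:

    # Sketch of the x-amz-meta-s3cmd-attrs format; values are made up.
    def build_attrs_header(attrs):
        return "/".join("%s:%s" % (k, v) for k, v in attrs.items())

    def parse_attrs_header(attrs_header):
        return dict(attr.split(":", 1) for attr in attrs_header.split("/"))

    header = build_attrs_header({'uid': 500, 'gid': 500, 'md5': 'd41d8cd98f00b204e9800998ecf8427e'})
    assert parse_attrs_header(header)['md5'] == 'd41d8cd98f00b204e9800998ecf8427e'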

This also reduces the number of local stat() calls by recording more
useful information during the initial os.walk(). This cuts the number
of stat()s in half.
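
A minimal sketch of the idea, assuming a hypothetical scan() helper: stat()
each file exactly once inside os.walk() and keep the fields needed later:

    import os

    def scan(root):
        entries = {}
        for dirpath, dirnames, filenames in os.walk(root):
            for name in filenames:
                full = os.path.join(dirpath, name)
                sr = os.stat(full)          # single stat() per file
                entries[full] = {
                    'size': sr.st_size, 'mtime': sr.st_mtime,
                    'nlink': sr.st_nlink, 'dev': sr.st_dev, 'inode': sr.st_ino,
                    'uid': sr.st_uid, 'gid': sr.st_gid,
                    'sr': sr,               # reused later instead of re-stat()ing
                }
        return entries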

Matt Domsch authored on 2012/06/16 13:43:00
Showing 4 changed files
@@ -50,6 +50,7 @@ class Config(object):
         'mtime',    # Modification timestamp
         'ctime',    # Creation timestamp
         'mode',     # File mode (e.g. rwxr-xr-x = 755)
+        'md5',      # File MD5 (if known)
         #'acl',     # Full ACL (not yet supported)
     ]
     delete_removed = False
@@ -16,7 +16,7 @@ import os
 import glob
 import copy
 
-__all__ = ["fetch_local_list", "fetch_remote_list", "compare_filelists", "filter_exclude_include"]
+__all__ = ["fetch_local_list", "fetch_remote_list", "compare_filelists", "filter_exclude_include", "parse_attrs_header"]
 
 def _fswalk_follow_symlinks(path):
         '''
@@ -173,8 +173,15 @@ def fetch_local_list(args, recursive = None):
                     'full_name' : full_name,
                     'size' : sr.st_size,
                     'mtime' : sr.st_mtime,
+                    'nlink' : sr.st_nlink, # record hardlink information
+                    'dev'   : sr.st_dev,
+                    'inode' : sr.st_ino,
+                    'uid' : sr.st_uid,
+                    'gid' : sr.st_gid,
+                    'sr': sr # save it all, may need it in preserve_attrs_list
                     ## TODO: Possibly more to save here...
                 }
+                loc_list.record_hardlink(relative_file, sr.st_dev, sr.st_ino)
         return loc_list, single_file
 
     cfg = Config()
@@ -257,7 +264,12 @@ def fetch_remote_list(args, require_attribs = False, recursive = None):
                 'object_key' : object['Key'],
                 'object_uri_str' : object_uri_str,
                 'base_uri' : remote_uri,
+                'nlink' : 1, # S3 doesn't support hardlinks itself
+                'dev' : None,
+                'inode' : None,
             }
+            md5 = object['ETag'][1:-1]
+            rem_list.record_md5(key, md5)
             if break_now:
                 break
         return rem_list
@@ -283,6 +295,7 @@ def fetch_remote_list(args, require_attribs = False, recursive = None):
             objectlist = _get_filelist_remote(uri)
             for key in objectlist:
                 remote_list[key] = objectlist[key]
+                remote_list.record_md5(key, objectlist.get_md5(key))
     else:
         for uri in remote_uris:
             uri_str = str(uri)
@@ -320,90 +333,135 @@ def fetch_remote_list(args, require_attribs = False, recursive = None):
                     'md5': response['headers']['etag'].strip('"\''),
                     'timestamp' : dateRFC822toUnix(response['headers']['date'])
                     })
+                    # get md5 from header if it's present.  We would have set that during upload
+                    if response['headers'].has_key('x-amz-meta-s3cmd-attrs'):
+                        attrs = parse_attrs_header(response['headers']['x-amz-meta-s3cmd-attrs'])
+                        if attrs.has_key('md5'):
+                            remote_item.update({'md5': attrs['md5']})
+
                 remote_list[key] = remote_item
     return remote_list
 
+def parse_attrs_header(attrs_header):
+    attrs = {}
+    for attr in attrs_header.split("/"):
+        key, val = attr.split(":")
+        attrs[key] = val
+    return attrs
+
+
 def compare_filelists(src_list, dst_list, src_remote, dst_remote, delay_updates = False):
     def __direction_str(is_remote):
         return is_remote and "remote" or "local"
 
-    # We don't support local->local sync, use 'rsync' or something like that instead ;-)
+    def _compare(src_list, dst_list, src_remote, dst_remote, file):
+        """Return True if src_list[file] matches dst_list[file], else False"""
+        attribs_match = True
+        if not (src_list.has_key(file) and dst_list.has_key(file)):
+            info(u"file does not exist in one side or the other: src_list=%s, dst_list=%s" % (src_list.has_key(file), dst_list.has_key(file)))
+            return False
+
+        ## check size first
+        if 'size' in cfg.sync_checks and dst_list[file]['size'] != src_list[file]['size']:
+            debug(u"xfer: %s (size mismatch: src=%s dst=%s)" % (file, src_list[file]['size'], dst_list[file]['size']))
+            attribs_match = False
+
+        ## check md5
+        compare_md5 = 'md5' in cfg.sync_checks
+        # Multipart-uploaded files don't have a valid md5 sum - it ends with "...-nn"
+        if compare_md5:
+            if (src_remote == True and src_list[file]['md5'].find("-") >= 0) or (dst_remote == True and dst_list[file]['md5'].find("-") >= 0):
+                compare_md5 = False
+                info(u"disabled md5 check for %s" % file)
+        if attribs_match and compare_md5:
+            try:
+                src_md5 = src_list.get_md5(file)
+                dst_md5 = dst_list.get_md5(file)
+            except (IOError,OSError), e:
+                # md5 sum verification failed - ignore that file altogether
+                debug(u"IGNR: %s (disappeared)" % (file))
+                warning(u"%s: file disappeared, ignoring." % (file))
+                raise
+
+            if src_md5 != dst_md5:
+                ## checksums are different.
+                attribs_match = False
+                debug(u"XFER: %s (md5 mismatch: src=%s dst=%s)" % (file, src_md5, dst_md5))
+
+        return attribs_match
+
+    # we don't support local->local sync, use 'rsync' or something like that instead ;-)
     assert(not(src_remote == False and dst_remote == False))
 
     info(u"Verifying attributes...")
     cfg = Config()
-    exists_list = SortedDict(ignore_case = False)
+    ## Items left on src_list will be transferred
+    ## Items left on update_list will be transferred after src_list
+    ## Items left on copy_pairs will be copied from dst1 to dst2
    update_list = SortedDict(ignore_case = False)
+    ## Items left on dst_list will be deleted
+    copy_pairs = []
+
 
     debug("Comparing filelists (direction: %s -> %s)" % (__direction_str(src_remote), __direction_str(dst_remote)))
-    debug("src_list.keys: %s" % src_list.keys())
-    debug("dst_list.keys: %s" % dst_list.keys())
 
-    for file in src_list.keys():
-        debug(u"CHECK: %s" % file)
-        if dst_list.has_key(file):
-            ## Was --skip-existing requested?
-            if cfg.skip_existing:
-                debug(u"IGNR: %s (used --skip-existing)" % (file))
-                exists_list[file] = src_list[file]
-                del(src_list[file])
-                ## Remove from destination-list, all that is left there will be deleted
-                del(dst_list[file])
-                continue
-
-            attribs_match = True
-            ## Check size first
-            if 'size' in cfg.sync_checks and dst_list[file]['size'] != src_list[file]['size']:
-                debug(u"XFER: %s (size mismatch: src=%s dst=%s)" % (file, src_list[file]['size'], dst_list[file]['size']))
-                attribs_match = False
+    for relative_file in src_list.keys():
+        debug(u"CHECK: %s: %s" % (relative_file, src_list.get_md5(relative_file)))
 
-            ## Check MD5
-            compare_md5 = 'md5' in cfg.sync_checks
-            # Multipart-uploaded files don't have a valid MD5 sum - it ends with "...-NN"
-            if compare_md5:
-                if (src_remote == True and src_list[file]['md5'].find("-") >= 0) or (dst_remote == True and dst_list[file]['md5'].find("-") >= 0):
-                    compare_md5 = False
-                    info(u"Disabled MD5 check for %s" % file)
-            if attribs_match and compare_md5:
-                try:
-                    if src_remote == False and dst_remote == True:
-                        src_md5 = hash_file_md5(src_list[file]['full_name'])
-                        dst_md5 = dst_list[file]['md5']
-                    elif src_remote == True and dst_remote == False:
-                        src_md5 = src_list[file]['md5']
-                        dst_md5 = hash_file_md5(dst_list[file]['full_name'])
-                    elif src_remote == True and dst_remote == True:
-                        src_md5 = src_list[file]['md5']
-                        dst_md5 = dst_list[file]['md5']
-                except (IOError,OSError), e:
-                    # MD5 sum verification failed - ignore that file altogether
-                    debug(u"IGNR: %s (disappeared)" % (file))
-                    warning(u"%s: file disappeared, ignoring." % (file))
-                    del(src_list[file])
-                    del(dst_list[file])
-                    continue
+        if dst_list.has_key(relative_file):
+            ## Was --skip-existing requested?
+            if cfg.skip_existing:
+                debug(u"IGNR: %s (used --skip-existing)" % (relative_file))
+                del(src_list[relative_file])
+                del(dst_list[relative_file])
+                continue
 
-                if src_md5 != dst_md5:
-                    ## Checksums are different.
-                    attribs_match = False
-                    debug(u"XFER: %s (md5 mismatch: src=%s dst=%s)" % (file, src_md5, dst_md5))
+            if _compare(src_list, dst_list, src_remote, dst_remote, relative_file):
+                debug(u"IGNR: %s (transfer not needed)" % relative_file)
+                del(src_list[relative_file])
+                del(dst_list[relative_file])
 
-            if attribs_match:
-                ## Remove from source-list, all that is left there will be transferred
-                debug(u"IGNR: %s (transfer not needed)" % file)
-                exists_list[file] = src_list[file]
-                del(src_list[file])
             else:
-                if delay_updates:
-                    ## Remove from source-list, all that is left there will be transferred
-                    ## Add to update-list to transfer last
-                    debug(u"XFER UPDATE: %s" % file)
-                    update_list[file] = src_list[file]
-                    del(src_list[file])
-
-            ## Remove from destination-list, all that is left there will be deleted
-            del(dst_list[file])
-
-    return src_list, dst_list, exists_list, update_list
+                # look for matching file in src
+                md5 = src_list.get_md5(relative_file)
+                if md5 is not None and dst_list.by_md5.has_key(md5):
+                    # Found one, we want to copy
+                    dst1 = list(dst_list.by_md5[md5])[0]
+                    debug(u"REMOTE COPY src: %s -> %s" % (dst1, relative_file))
+                    copy_pairs.append((dst_list[dst1], relative_file))
+                    del(src_list[relative_file])
+                    del(dst_list[relative_file])
+                else:
+                    # record that we will get this file transferred to us (before all the copies), so if we come across it later again,
+                    # we can copy from _this_ copy (e.g. we only upload it once, and copy thereafter).
+                    debug(u"REMOTE COPY src before")
+                    dst_list.record_md5(relative_file, md5)
+                    update_list[relative_file] = src_list[relative_file]
+                    del src_list[relative_file]
+                    del dst_list[relative_file]
+
+        else:
+            # dst doesn't have this file
+            # look for matching file elsewhere in dst
+            md5 = src_list.get_md5(relative_file)
+            dst1 = dst_list.find_md5_one(md5)
+            if dst1 is not None:
+                # Found one, we want to copy
+                debug(u"REMOTE COPY dst: %s -> %s" % (dst1, relative_file))
+                copy_pairs.append((dst_list[dst1], relative_file))
+                del(src_list[relative_file])
+            else:
+                # we don't have this file, and we don't have a copy of this file elsewhere.  Get it.
+                # record that we will get this file transferred to us (before all the copies), so if we come across it later again,
+                # we can copy from _this_ copy (e.g. we only upload it once, and copy thereafter).
+                dst_list.record_md5(relative_file, md5)
+                debug(u"REMOTE COPY dst before")
+
+    for f in dst_list.keys():
+        if not src_list.has_key(f) and not update_list.has_key(f):
+            # leave only those not on src_list + update_list
+            del dst_list[f]
+
+    return src_list, dst_list, update_list, copy_pairs
 
 # vim:et:ts=4:sts=4:ai
@@ -4,6 +4,7 @@
 ## License: GPL Version 2
 
 from BidirMap import BidirMap
+import Utils
 
 class SortedDictIterator(object):
     def __init__(self, sorted_dict, keys):
@@ -26,6 +27,8 @@ class SortedDict(dict):
         """
         dict.__init__(self, mapping, **kwargs)
         self.ignore_case = ignore_case
+        self.hardlinks = dict()
+        self.by_md5 = dict() # {md5: set(relative_files)}
 
     def keys(self):
         keys = dict.keys(self)
@@ -45,6 +48,61 @@ class SortedDict(dict):
     def __iter__(self):
         return SortedDictIterator(self, self.keys())
 
+
+    def record_md5(self, relative_file, md5):
+        if md5 not in self.by_md5:
+            self.by_md5[md5] = set()
+        self.by_md5[md5].add(relative_file)
+
+    def find_md5_one(self, md5):
+        try:
+            return list(self.by_md5.get(md5, set()))[0]
+        except:
+            return None
+
+
+    def get_md5(self, relative_file):
+        md5 = None
+        if 'md5' in self[relative_file]:
+            return self[relative_file]['md5']
+        if self.is_hardlinked(relative_file): # speedup by getting it from one of the hardlinks already processed
+            md5 = self.get_hardlink_md5(relative_file)
+            if md5 is None:
+                md5 = Utils.hash_file_md5(self[relative_file]['full_name'])
+                self.record_md5(relative_file, md5)
+                self.set_hardlink_md5(relative_file, md5)
+        else:
+            md5 = Utils.hash_file_md5(self[relative_file]['full_name'])
+            self[relative_file]['md5'] = md5
+            self.record_md5(relative_file, md5)
+        return md5
+
+    def record_hardlink(self, relative_file, dev, inode):
+        if dev not in self.hardlinks:
+            self.hardlinks[dev] = dict()
+        if inode not in self.hardlinks[dev]:
+            self.hardlinks[dev][inode] = dict(md5=None, relative_files=set())
+        self.hardlinks[dev][inode]['relative_files'].add(relative_file)
+
+    def set_hardlink_md5(self, relative_file, md5):
+        dev = self[relative_file]['dev']
+        inode = self[relative_file]['inode']
+        self.record_hardlink(relative_file, dev, inode)
+        self.hardlinks[dev][inode]['md5'] = md5
+
+    def is_hardlinked(self, relative_file):
+        return self[relative_file]['nlink'] > 1
+
+    def get_hardlink_md5(self, relative_file):
+        md5 = None
+        dev = self[relative_file]['dev']
+        inode = self[relative_file]['inode']
+        try:
+            md5 = self.hardlinks[dev][inode]['md5']
+        except:
+            pass
+        return md5
+
 if __name__ == "__main__":
     d = { 'AWS' : 1, 'Action' : 2, 'america' : 3, 'Auckland' : 4, 'America' : 5 }
     sd = SortedDict(d)
@@ -608,7 +608,7 @@ def cmd_sync_remote2remote(args):
 
     src_list, exclude_list = filter_exclude_include(src_list)
 
-    src_list, dst_list, existing_list, update_list = compare_filelists(src_list, dst_list, src_remote = True, dst_remote = True, delay_updates = cfg.delay_updates)
+    src_list, dst_list, update_list, copy_pairs = compare_filelists(src_list, dst_list, src_remote = True, dst_remote = True, delay_updates = cfg.delay_updates)
 
     src_count = len(src_list)
     update_count = len(update_list)
@@ -632,6 +632,11 @@ def cmd_sync_remote2remote(args):
         warning(u"Exitting now because of --dry-run")
         return
 
+    # if there are copy pairs, we can't do delete_before, on the chance
+    # we need one of the to-be-deleted files as a copy source.
+    if len(copy_pairs) > 0:
+        cfg.delete_after = True
+
     # Delete items in destination that are not in source
     if cfg.delete_removed and not cfg.delete_after:
         _do_deletes(s3, dst_list)
@@ -658,6 +663,7 @@ def cmd_sync_remote2remote(args):
     seq = 0
     seq = _upload(src_list, seq, src_count + update_count)
     seq = _upload(update_list, seq, src_count + update_count)
+    n_copied, bytes_saved = remote_copy(s3, dst_list, copy_pairs, destination_base)
 
     total_elapsed = time.time() - timestamp_start
     outstr = "Done. Copied %d files in %0.1f seconds, %0.2f files/s" % (seq, total_elapsed, seq/total_elapsed)
@@ -671,13 +677,6 @@ def cmd_sync_remote2remote(args):
         _do_deletes(s3, dst_list)
 
 def cmd_sync_remote2local(args):
-    def _parse_attrs_header(attrs_header):
-        attrs = {}
-        for attr in attrs_header.split("/"):
-            key, val = attr.split(":")
-            attrs[key] = val
-        return attrs
-
     def _do_deletes(local_list):
         for key in local_list:
             os.unlink(local_list[key]['full_name'])
@@ -696,7 +695,7 @@ def cmd_sync_remote2local(args):
 
     remote_list, exclude_list = filter_exclude_include(remote_list)
 
-    remote_list, local_list, existing_list, update_list = compare_filelists(remote_list, local_list, src_remote = True, dst_remote = False, delay_updates = cfg.delay_updates)
+    remote_list, local_list, update_list, copy_pairs = compare_filelists(remote_list, local_list, src_remote = True, dst_remote = False, delay_updates = cfg.delay_updates)
 
     local_count = len(local_list)
     remote_count = len(remote_list)
@@ -736,6 +735,11 @@ def cmd_sync_remote2local(args):
         warning(u"Exitting now because of --dry-run")
         return
 
+    # if there are copy pairs, we can't do delete_before, on the chance
+    # we need one of the to-be-deleted files as a copy source.
+    if len(copy_pairs) > 0:
+        cfg.delete_after = True
+
     if cfg.delete_removed and not cfg.delete_after:
         _do_deletes(local_list)
 
@@ -768,7 +772,7 @@ def cmd_sync_remote2local(args):
                     response = s3.object_get(uri, dst_stream, extra_label = seq_label)
                     dst_stream.close()
                     if response['headers'].has_key('x-amz-meta-s3cmd-attrs') and cfg.preserve_attrs:
-                        attrs = _parse_attrs_header(response['headers']['x-amz-meta-s3cmd-attrs'])
+                        attrs = parse_attrs_header(response['headers']['x-amz-meta-s3cmd-attrs'])
                         if attrs.has_key('mode'):
                             os.chmod(dst_file, int(attrs['mode']))
                         if attrs.has_key('mtime') or attrs.has_key('atime'):
@@ -837,33 +841,48 @@ def cmd_sync_remote2local(args):
     if cfg.delete_removed and cfg.delete_after:
         _do_deletes(local_list)
 
+def remote_copy(s3, remote_list, copy_pairs, destination_base):
+    saved_bytes = 0
+    for (dst1, dst2) in copy_pairs:
+        debug(u"Remote Copying from %s to %s" % (dst1, dst2))
+        dst_uri = S3Uri(destination_base + dst2)
+        extra_headers = copy(cfg.extra_headers)
+        try:
+            s3.object_copy(S3Uri(dst1['object_uri_str']), dst_uri, extra_headers)
+            saved_bytes = saved_bytes + dst1['size']
+        except:
+            raise
+    return (len(copy_pairs), saved_bytes)
+
+
 def cmd_sync_local2remote(args):
-    def _build_attr_header(src):
+    def _build_attr_header(local_list, src):
         import pwd, grp
         attrs = {}
-        src = deunicodise(src)
-        try:
-            st = os.stat_result(os.stat(src))
-        except OSError, e:
-            raise InvalidFileError(u"%s: %s" % (unicodise(src), e.strerror))
         for attr in cfg.preserve_attrs_list:
             if attr == 'uname':
                 try:
-                    val = pwd.getpwuid(st.st_uid).pw_name
+                    val = pwd.getpwuid(local_list[src]['uid']).pw_name
                 except KeyError:
                     attr = "uid"
-                    val = st.st_uid
-                    warning(u"%s: Owner username not known. Storing UID=%d instead." % (unicodise(src), val))
+                    val = local_list[src].get('uid')
+                    warning(u"%s: Owner username not known. Storing UID=%d instead." % (src, val))
             elif attr == 'gname':
                 try:
-                    val = grp.getgrgid(st.st_gid).gr_name
+                    val = grp.getgrgid(local_list[src].get('gid')).gr_name
                 except KeyError:
                     attr = "gid"
-                    val = st.st_gid
-                    warning(u"%s: Owner groupname not known. Storing GID=%d instead." % (unicodise(src), val))
+                    val = local_list[src].get('gid')
+                    warning(u"%s: Owner groupname not known. Storing GID=%d instead." % (src, val))
+            elif attr == 'md5':
+                val = local_list.get_md5(src)
             else:
-                val = getattr(st, 'st_' + attr)
+                val = getattr(local_list[src]['sr'], 'st_' + attr)
             attrs[attr] = val
+
+        if 'md5' in attrs and attrs['md5'] is None:
+            del attrs['md5']
+
         result = ""
         for k in attrs: result += "%s:%s/" % (k, attrs[k])
         return { 'x-amz-meta-s3cmd-attrs' : result[:-1] }
@@ -904,15 +923,15 @@ def cmd_sync_local2remote(args):
         # Flush remote_list, by the way
         remote_list = { local_list.keys()[0] : remote_list_entry }
 
-    local_list, remote_list, existing_list, update_list = compare_filelists(local_list, remote_list, src_remote = False, dst_remote = True, delay_updates = cfg.delay_updates)
+    local_list, remote_list, update_list, copy_pairs = compare_filelists(local_list, remote_list, src_remote = False, dst_remote = True, delay_updates = cfg.delay_updates)
 
 
     local_count = len(local_list)
     update_count = len(update_list)
+    copy_count = len(copy_pairs)
     remote_count = len(remote_list)
 
-    info(u"Summary: %d local files to upload, %d remote files to delete" % (local_count + update_count, remote_count))
-
+    info(u"Summary: %d local files to upload, %d files to remote copy, %d remote files to delete" % (local_count + update_count, copy_count, remote_count))
 
     def _set_remote_uri(local_list, destination_base, single_file_local):
         if len(local_list) > 0:
@@ -931,17 +950,24 @@ def cmd_sync_local2remote(args):
     if cfg.dry_run:
         for key in exclude_list:
             output(u"exclude: %s" % unicodise(key))
-        if cfg.delete_removed:
-            for key in remote_list:
-                output(u"delete: %s" % remote_list[key]['object_uri_str'])
         for key in local_list:
             output(u"upload: %s -> %s" % (local_list[key]['full_name_unicode'], local_list[key]['remote_uri']))
         for key in update_list:
             output(u"upload: %s -> %s" % (update_list[key]['full_name_unicode'], update_list[key]['remote_uri']))
+        for (dst1, dst2) in copy_pairs:
+            output(u"remote copy: %s -> %s" % (dst1['object_key'], remote_list[dst2]['object_key']))
+        if cfg.delete_removed:
+            for key in remote_list:
+                output(u"delete: %s" % remote_list[key]['object_uri_str'])
 
         warning(u"Exitting now because of --dry-run")
         return
 
+    # if there are copy pairs, we can't do delete_before, on the chance
+    # we need one of the to-be-deleted files as a copy source.
+    if len(copy_pairs) > 0:
+        cfg.delete_after = True
+
     if cfg.delete_removed and not cfg.delete_after:
         _do_deletes(s3, remote_list)
 
@@ -962,7 +988,7 @@ def cmd_sync_local2remote(args):
             extra_headers = copy(cfg.extra_headers)
             try:
                 if cfg.preserve_attrs:
-                    attr_header = _build_attr_header(src)
+                    attr_header = _build_attr_header(local_list, file)
                     debug(u"attr_header: %s" % attr_header)
                     extra_headers.update(attr_header)
                 response = s3.object_put(src, uri, extra_headers, extra_label = seq_label)
@@ -981,23 +1007,24 @@ def cmd_sync_local2remote(args):
             uploaded_objects_list.append(uri.object())
         return seq, total_size
 
+
     n, total_size = _upload(local_list, 0, local_count, total_size)
     n, total_size = _upload(update_list, n, local_count, total_size)
+    n_copies, saved_bytes = remote_copy(s3, remote_list, copy_pairs, destination_base)
+    if cfg.delete_removed and cfg.delete_after:
+        _do_deletes(s3, remote_list)
     total_elapsed = time.time() - timestamp_start
     total_speed = total_elapsed and total_size/total_elapsed or 0.0
     speed_fmt = formatSize(total_speed, human_readable = True, floating_point = True)
 
     # Only print out the result if any work has been done or
     # if the user asked for verbose output
-    outstr = "Done. Uploaded %d bytes in %0.1f seconds, %0.2f %sB/s" % (total_size, total_elapsed, speed_fmt[0], speed_fmt[1])
-    if total_size > 0:
+    outstr = "Done. Uploaded %d bytes in %0.1f seconds, %0.2f %sB/s.  Copied %d files saving %d bytes transfer." % (total_size, total_elapsed, speed_fmt[0], speed_fmt[1], n_copies, saved_bytes)
+    if total_size + saved_bytes > 0:
         output(outstr)
     else:
         info(outstr)
 
-    if cfg.delete_removed and cfg.delete_after:
-        _do_deletes(s3, remote_list)
-
     if cfg.invalidate_on_cf:
         if len(uploaded_objects_list) == 0:
             info("Nothing to invalidate in CloudFront")