Minimize uploads in local->remote sync by looking for identical files
already present elsewhere in the remote destination and issuing an S3
COPY command instead of uploading the file again.
We now store the (locally generated) md5 of the file in the
x-amz-meta-s3cmd-attrs metadata, because we can't count on the ETag
being correct due to multipart uploads. Use this value if it's
available.
This also reduces the number of local stat() calls made by
recording more useful information during the initial
os.walk(). This cuts the number of stat()s in half.
... | ... |
@@ -16,7 +16,7 @@ import os |
16 | 16 |
import glob |
17 | 17 |
import copy |
18 | 18 |
|
19 |
-__all__ = ["fetch_local_list", "fetch_remote_list", "compare_filelists", "filter_exclude_include"] |
|
19 |
+__all__ = ["fetch_local_list", "fetch_remote_list", "compare_filelists", "filter_exclude_include", "parse_attrs_header"] |
|
20 | 20 |
|
21 | 21 |
def _fswalk_follow_symlinks(path): |
22 | 22 |
''' |
... | ... |
@@ -173,8 +173,15 @@ def fetch_local_list(args, recursive = None): |
173 | 173 |
'full_name' : full_name, |
174 | 174 |
'size' : sr.st_size, |
175 | 175 |
'mtime' : sr.st_mtime, |
176 |
+ 'nlink' : sr.st_nlink, # record hardlink information |
|
177 |
+ 'dev' : sr.st_dev, |
|
178 |
+ 'inode' : sr.st_ino, |
|
179 |
+ 'uid' : sr.st_uid, |
|
180 |
+ 'gid' : sr.st_gid, |
|
181 |
+ 'sr': sr # save it all, may need it in preserve_attrs_list |
|
176 | 182 |
## TODO: Possibly more to save here... |
177 | 183 |
} |
184 |
+ loc_list.record_hardlink(relative_file, sr.st_dev, sr.st_ino) |
|
178 | 185 |
return loc_list, single_file |
179 | 186 |
|
180 | 187 |
cfg = Config() |
... | ... |
@@ -257,7 +264,12 @@ def fetch_remote_list(args, require_attribs = False, recursive = None): |
257 | 257 |
'object_key' : object['Key'], |
258 | 258 |
'object_uri_str' : object_uri_str, |
259 | 259 |
'base_uri' : remote_uri, |
260 |
+ 'nlink' : 1, # S3 doesn't support hardlinks itself |
|
261 |
+ 'dev' : None, |
|
262 |
+ 'inode' : None, |
|
260 | 263 |
} |
264 |
+ md5 = object['ETag'][1:-1] |
|
265 |
+ rem_list.record_md5(key, md5) |
|
261 | 266 |
if break_now: |
262 | 267 |
break |
263 | 268 |
return rem_list |
... | ... |
@@ -283,6 +295,7 @@ def fetch_remote_list(args, require_attribs = False, recursive = None): |
283 | 283 |
objectlist = _get_filelist_remote(uri) |
284 | 284 |
for key in objectlist: |
285 | 285 |
remote_list[key] = objectlist[key] |
286 |
+ remote_list.record_md5(key, objectlist.get_md5(key)) |
|
286 | 287 |
else: |
287 | 288 |
for uri in remote_uris: |
288 | 289 |
uri_str = str(uri) |
... | ... |
@@ -320,90 +333,135 @@ def fetch_remote_list(args, require_attribs = False, recursive = None): |
320 | 320 |
'md5': response['headers']['etag'].strip('"\''), |
321 | 321 |
'timestamp' : dateRFC822toUnix(response['headers']['date']) |
322 | 322 |
}) |
323 |
+ # get md5 from header if it's present. We would have set that during upload |
|
324 |
+ if response['headers'].has_key('x-amz-meta-s3cmd-attrs'): |
|
325 |
+ attrs = parse_attrs_header(response['headers']['x-amz-meta-s3cmd-attrs']) |
|
326 |
+ if attrs.has_key('md5'): |
|
327 |
+ remote_item.update({'md5': attrs['md5']}) |
|
328 |
+ |
|
323 | 329 |
remote_list[key] = remote_item |
324 | 330 |
return remote_list |
325 | 331 |
|
332 |
+def parse_attrs_header(attrs_header): |
|
333 |
+ attrs = {} |
|
334 |
+ for attr in attrs_header.split("/"): |
|
335 |
+ key, val = attr.split(":") |
|
336 |
+ attrs[key] = val |
|
337 |
+ return attrs |
|
338 |
+ |
|
339 |
+ |
|
326 | 340 |
def compare_filelists(src_list, dst_list, src_remote, dst_remote, delay_updates = False): |
327 | 341 |
def __direction_str(is_remote): |
328 | 342 |
return is_remote and "remote" or "local" |
329 | 343 |
|
330 |
- # We don't support local->local sync, use 'rsync' or something like that instead ;-) |
|
344 |
+ def _compare(src_list, dst_lst, src_remote, dst_remote, file): |
|
345 |
+ """Return True if src_list[file] matches dst_list[file], else False""" |
|
346 |
+ attribs_match = True |
|
347 |
+ if not (src_list.has_key(file) and dst_list.has_key(file)): |
|
348 |
+ info(u"file does not exist in one side or the other: src_list=%s, dst_list=%s" % (src_list.has_key(file), dst_list.has_key(file))) |
|
349 |
+ return False |
|
350 |
+ |
|
351 |
+ ## check size first |
|
352 |
+ if 'size' in cfg.sync_checks and dst_list[file]['size'] != src_list[file]['size']: |
|
353 |
+ debug(u"xfer: %s (size mismatch: src=%s dst=%s)" % (file, src_list[file]['size'], dst_list[file]['size'])) |
|
354 |
+ attribs_match = False |
|
355 |
+ |
|
356 |
+ ## check md5 |
|
357 |
+ compare_md5 = 'md5' in cfg.sync_checks |
|
358 |
+ # Multipart-uploaded files don't have a valid md5 sum - it ends with "...-nn" |
|
359 |
+ if compare_md5: |
|
360 |
+ if (src_remote == True and src_list[file]['md5'].find("-") >= 0) or (dst_remote == True and dst_list[file]['md5'].find("-") >= 0): |
|
361 |
+ compare_md5 = False |
|
362 |
+ info(u"disabled md5 check for %s" % file) |
|
363 |
+ if attribs_match and compare_md5: |
|
364 |
+ try: |
|
365 |
+ src_md5 = src_list.get_md5(file) |
|
366 |
+ dst_md5 = dst_list.get_md5(file) |
|
367 |
+ except (IOError,OSError), e: |
|
368 |
+ # md5 sum verification failed - ignore that file altogether |
|
369 |
+ debug(u"IGNR: %s (disappeared)" % (file)) |
|
370 |
+ warning(u"%s: file disappeared, ignoring." % (file)) |
|
371 |
+ raise |
|
372 |
+ |
|
373 |
+ if src_md5 != dst_md5: |
|
374 |
+ ## checksums are different. |
|
375 |
+ attribs_match = False |
|
376 |
+ debug(u"XFER: %s (md5 mismatch: src=%s dst=%s)" % (file, src_md5, dst_md5)) |
|
377 |
+ |
|
378 |
+ return attribs_match |
|
379 |
+ |
|
380 |
+ # we don't support local->local sync, use 'rsync' or something like that instead ;-) |
|
331 | 381 |
assert(not(src_remote == False and dst_remote == False)) |
332 | 382 |
|
333 | 383 |
info(u"Verifying attributes...") |
334 | 384 |
cfg = Config() |
335 |
- exists_list = SortedDict(ignore_case = False) |
|
385 |
+ ## Items left on src_list will be transferred |
|
386 |
+ ## Items left on update_list will be transferred after src_list |
|
387 |
+ ## Items left on copy_pairs will be copied from dst1 to dst2 |
|
336 | 388 |
update_list = SortedDict(ignore_case = False) |
389 |
+ ## Items left on dst_list will be deleted |
|
390 |
+ copy_pairs = [] |
|
391 |
+ |
|
337 | 392 |
|
338 | 393 |
debug("Comparing filelists (direction: %s -> %s)" % (__direction_str(src_remote), __direction_str(dst_remote))) |
339 |
- debug("src_list.keys: %s" % src_list.keys()) |
|
340 |
- debug("dst_list.keys: %s" % dst_list.keys()) |
|
341 | 394 |
|
342 |
- for file in src_list.keys(): |
|
343 |
- debug(u"CHECK: %s" % file) |
|
344 |
- if dst_list.has_key(file): |
|
345 |
- ## Was --skip-existing requested? |
|
346 |
- if cfg.skip_existing: |
|
347 |
- debug(u"IGNR: %s (used --skip-existing)" % (file)) |
|
348 |
- exists_list[file] = src_list[file] |
|
349 |
- del(src_list[file]) |
|
350 |
- ## Remove from destination-list, all that is left there will be deleted |
|
351 |
- del(dst_list[file]) |
|
352 |
- continue |
|
353 |
- |
|
354 |
- attribs_match = True |
|
355 |
- ## Check size first |
|
356 |
- if 'size' in cfg.sync_checks and dst_list[file]['size'] != src_list[file]['size']: |
|
357 |
- debug(u"XFER: %s (size mismatch: src=%s dst=%s)" % (file, src_list[file]['size'], dst_list[file]['size'])) |
|
358 |
- attribs_match = False |
|
395 |
+ for relative_file in src_list.keys(): |
|
396 |
+ debug(u"CHECK: %s: %s" % (relative_file, src_list.get_md5(relative_file))) |
|
359 | 397 |
|
360 |
- ## Check MD5 |
|
361 |
- compare_md5 = 'md5' in cfg.sync_checks |
|
362 |
- # Multipart-uploaded files don't have a valid MD5 sum - it ends with "...-NN" |
|
363 |
- if compare_md5: |
|
364 |
- if (src_remote == True and src_list[file]['md5'].find("-") >= 0) or (dst_remote == True and dst_list[file]['md5'].find("-") >= 0): |
|
365 |
- compare_md5 = False |
|
366 |
- info(u"Disabled MD5 check for %s" % file) |
|
367 |
- if attribs_match and compare_md5: |
|
368 |
- try: |
|
369 |
- if src_remote == False and dst_remote == True: |
|
370 |
- src_md5 = hash_file_md5(src_list[file]['full_name']) |
|
371 |
- dst_md5 = dst_list[file]['md5'] |
|
372 |
- elif src_remote == True and dst_remote == False: |
|
373 |
- src_md5 = src_list[file]['md5'] |
|
374 |
- dst_md5 = hash_file_md5(dst_list[file]['full_name']) |
|
375 |
- elif src_remote == True and dst_remote == True: |
|
376 |
- src_md5 = src_list[file]['md5'] |
|
377 |
- dst_md5 = dst_list[file]['md5'] |
|
378 |
- except (IOError,OSError), e: |
|
379 |
- # MD5 sum verification failed - ignore that file altogether |
|
380 |
- debug(u"IGNR: %s (disappeared)" % (file)) |
|
381 |
- warning(u"%s: file disappeared, ignoring." % (file)) |
|
382 |
- del(src_list[file]) |
|
383 |
- del(dst_list[file]) |
|
384 |
- continue |
|
398 |
+ if dst_list.has_key(relative_file): |
|
399 |
+ ## Was --skip-existing requested? |
|
400 |
+ if cfg.skip_existing: |
|
401 |
+ debug(u"IGNR: %s (used --skip-existing)" % (relative_file)) |
|
402 |
+ del(src_list[relative_file]) |
|
403 |
+ del(dst_list[relative_file]) |
|
404 |
+ continue |
|
385 | 405 |
|
386 |
- if src_md5 != dst_md5: |
|
387 |
- ## Checksums are different. |
|
388 |
- attribs_match = False |
|
389 |
- debug(u"XFER: %s (md5 mismatch: src=%s dst=%s)" % (file, src_md5, dst_md5)) |
|
406 |
+ if _compare(src_list, dst_list, src_remote, dst_remote, relative_file): |
|
407 |
+ debug(u"IGNR: %s (transfer not needed)" % relative_file) |
|
408 |
+ del(src_list[relative_file]) |
|
409 |
+ del(dst_list[relative_file]) |
|
390 | 410 |
|
391 |
- if attribs_match: |
|
392 |
- ## Remove from source-list, all that is left there will be transferred |
|
393 |
- debug(u"IGNR: %s (transfer not needed)" % file) |
|
394 |
- exists_list[file] = src_list[file] |
|
395 |
- del(src_list[file]) |
|
396 | 411 |
else: |
397 |
- if delay_updates: |
|
398 |
- ## Remove from source-list, all that is left there will be transferred |
|
399 |
- ## Add to update-list to transfer last |
|
400 |
- debug(u"XFER UPDATE: %s" % file) |
|
401 |
- update_list[file] = src_list[file] |
|
402 |
- del(src_list[file]) |
|
403 |
- |
|
404 |
- ## Remove from destination-list, all that is left there will be deleted |
|
405 |
- del(dst_list[file]) |
|
406 |
- |
|
407 |
- return src_list, dst_list, exists_list, update_list |
|
412 |
+ # look for matching file in src |
|
413 |
+ md5 = src_list.get_md5(relative_file) |
|
414 |
+ if md5 is not None and dst_list.by_md5.has_key(md5): |
|
415 |
+ # Found one, we want to copy |
|
416 |
+ dst1 = list(dst_list.by_md5[md5])[0] |
|
417 |
+ debug(u"REMOTE COPY src: %s -> %s" % (dst1, relative_file)) |
|
418 |
+ copy_pairs.append((dst_list[dst1], relative_file)) |
|
419 |
+ del(src_list[relative_file]) |
|
420 |
+ del(dst_list[relative_file]) |
|
421 |
+ else: |
|
422 |
+ # record that we will get this file transferred to us (before all the copies), so if we come across it later again, |
|
423 |
+ # we can copy from _this_ copy (e.g. we only upload it once, and copy thereafter). |
|
424 |
+ debug(u"REMOTE COPY src before") |
|
425 |
+ dst_list.record_md5(relative_file, md5) |
|
426 |
+ update_list[relative_file] = src_list[relative_file] |
|
427 |
+ del src_list[relative_file] |
|
428 |
+ del dst_list[relative_file] |
|
429 |
+ |
|
430 |
+ else: |
|
431 |
+ # dst doesn't have this file |
|
432 |
+ # look for matching file elsewhere in dst |
|
433 |
+ md5 = src_list.get_md5(relative_file) |
|
434 |
+ dst1 = dst_list.find_md5_one(md5) |
|
435 |
+ if dst1 is not None: |
|
436 |
+ # Found one, we want to copy |
|
437 |
+ debug(u"REMOTE COPY dst: %s -> %s" % (dst1, relative_file)) |
|
438 |
+ copy_pairs.append((dst_list[dst1], relative_file)) |
|
439 |
+ del(src_list[relative_file]) |
|
440 |
+ else: |
|
441 |
+ # we don't have this file, and we don't have a copy of this file elsewhere. Get it. |
|
442 |
+ # record that we will get this file transferred to us (before all the copies), so if we come across it later again, |
|
443 |
+ # we can copy from _this_ copy (e.g. we only upload it once, and copy thereafter). |
|
444 |
+ dst_list.record_md5(relative_file, md5) |
|
445 |
+ debug(u"REMOTE COPY dst before") |
|
446 |
+ |
|
447 |
+ for f in dst_list.keys(): |
|
448 |
+ if not src_list.has_key(f) and not update_list.has_key(f): |
|
449 |
+ # leave only those not on src_list + update_list |
|
450 |
+ del dst_list[f] |
|
451 |
+ |
|
452 |
+ return src_list, dst_list, update_list, copy_pairs |
|
408 | 453 |
|
409 | 454 |
# vim:et:ts=4:sts=4:ai |
... | ... |
@@ -4,6 +4,7 @@ |
4 | 4 |
## License: GPL Version 2 |
5 | 5 |
|
6 | 6 |
from BidirMap import BidirMap |
7 |
+import Utils |
|
7 | 8 |
|
8 | 9 |
class SortedDictIterator(object): |
9 | 10 |
def __init__(self, sorted_dict, keys): |
... | ... |
@@ -26,6 +27,8 @@ class SortedDict(dict): |
26 | 26 |
""" |
27 | 27 |
dict.__init__(self, mapping, **kwargs) |
28 | 28 |
self.ignore_case = ignore_case |
29 |
+ self.hardlinks = dict() |
|
30 |
+ self.by_md5 = dict() # {md5: set(relative_files)} |
|
29 | 31 |
|
30 | 32 |
def keys(self): |
31 | 33 |
keys = dict.keys(self) |
... | ... |
@@ -45,6 +48,61 @@ class SortedDict(dict): |
45 | 45 |
def __iter__(self): |
46 | 46 |
return SortedDictIterator(self, self.keys()) |
47 | 47 |
|
48 |
+ |
|
49 |
+ def record_md5(self, relative_file, md5): |
|
50 |
+ if md5 not in self.by_md5: |
|
51 |
+ self.by_md5[md5] = set() |
|
52 |
+ self.by_md5[md5].add(relative_file) |
|
53 |
+ |
|
54 |
+ def find_md5_one(self, md5): |
|
55 |
+ try: |
|
56 |
+ return list(self.by_md5.get(md5, set()))[0] |
|
57 |
+ except: |
|
58 |
+ return None |
|
59 |
+ |
|
60 |
+ |
|
61 |
+ def get_md5(self, relative_file): |
|
62 |
+ md5 = None |
|
63 |
+ if 'md5' in self[relative_file]: |
|
64 |
+ return self[relative_file]['md5'] |
|
65 |
+ if self.is_hardlinked(relative_file): # speedup by getting it from one of the hardlinks already processed |
|
66 |
+ md5 = self.get_hardlink_md5(relative_file) |
|
67 |
+ if md5 is None: |
|
68 |
+ md5 = Utils.hash_file_md5(self[relative_file]['full_name']) |
|
69 |
+ self.record_md5(relative_file, md5) |
|
70 |
+ self.set_hardlink_md5(relative_file, md5) |
|
71 |
+ else: |
|
72 |
+ md5 = Utils.hash_file_md5(self[relative_file]['full_name']) |
|
73 |
+ self[relative_file]['md5'] = md5 |
|
74 |
+ l.record_md5(relative_file, md5) |
|
75 |
+ return md5 |
|
76 |
+ |
|
77 |
+ def record_hardlink(self, relative_file, dev, inode): |
|
78 |
+ if dev not in self.hardlinks: |
|
79 |
+ self.hardlinks[dev] = dict() |
|
80 |
+ if inode not in self.hardlinks[dev]: |
|
81 |
+ self.hardlinks[dev][inode] = dict(md5=None, relative_files=set()) |
|
82 |
+ self.hardlinks[dev][inode]['relative_files'].add(relative_file) |
|
83 |
+ |
|
84 |
+ def set_hardlink_md5(self, relative_file, md5): |
|
85 |
+ dev = self[relative_file]['dev'] |
|
86 |
+ inode = self[relative_file]['inode'] |
|
87 |
+ self.record_hardlink(relative_file, dev, inode) |
|
88 |
+ self.hardlinks[dev][inode]['md5'] = md5 |
|
89 |
+ |
|
90 |
+ def is_hardlinked(self, relative_file): |
|
91 |
+ return self[relative_file]['nlink'] > 1 |
|
92 |
+ |
|
93 |
+ def get_hardlink_md5(self, relative_file): |
|
94 |
+ md5 = None |
|
95 |
+ dev = self[relative_file]['dev'] |
|
96 |
+ inode = self[relative_file]['inode'] |
|
97 |
+ try: |
|
98 |
+ md5 = self.hardlinks[dev][inode]['md5'] |
|
99 |
+ except: |
|
100 |
+ pass |
|
101 |
+ return md5 |
|
102 |
+ |
|
48 | 103 |
if __name__ == "__main__": |
49 | 104 |
d = { 'AWS' : 1, 'Action' : 2, 'america' : 3, 'Auckland' : 4, 'America' : 5 } |
50 | 105 |
sd = SortedDict(d) |
... | ... |
@@ -608,7 +608,7 @@ def cmd_sync_remote2remote(args): |
608 | 608 |
|
609 | 609 |
src_list, exclude_list = filter_exclude_include(src_list) |
610 | 610 |
|
611 |
- src_list, dst_list, existing_list, update_list = compare_filelists(src_list, dst_list, src_remote = True, dst_remote = True, delay_updates = cfg.delay_updates) |
|
611 |
+ src_list, dst_list, update_list, copy_pairs = compare_filelists(src_list, dst_list, src_remote = True, dst_remote = True, delay_updates = cfg.delay_updates) |
|
612 | 612 |
|
613 | 613 |
src_count = len(src_list) |
614 | 614 |
update_count = len(update_list) |
... | ... |
@@ -632,6 +632,11 @@ def cmd_sync_remote2remote(args): |
632 | 632 |
warning(u"Exitting now because of --dry-run") |
633 | 633 |
return |
634 | 634 |
|
635 |
+ # if there are copy pairs, we can't do delete_before, on the chance |
|
636 |
+ # we need one of the to-be-deleted files as a copy source. |
|
637 |
+ if len(copy_pairs) > 0: |
|
638 |
+ cfg.delete_after = True |
|
639 |
+ |
|
635 | 640 |
# Delete items in destination that are not in source |
636 | 641 |
if cfg.delete_removed and not cfg.delete_after: |
637 | 642 |
_do_deletes(s3, dst_list) |
... | ... |
@@ -658,6 +663,7 @@ def cmd_sync_remote2remote(args): |
658 | 658 |
seq = 0 |
659 | 659 |
seq = _upload(src_list, seq, src_count + update_count) |
660 | 660 |
seq = _upload(update_list, seq, src_count + update_count) |
661 |
+ n_copied, bytes_saved = remote_copy(s3, dst_list, copy_pairs, destination_base) |
|
661 | 662 |
|
662 | 663 |
total_elapsed = time.time() - timestamp_start |
663 | 664 |
outstr = "Done. Copied %d files in %0.1f seconds, %0.2f files/s" % (seq, total_elapsed, seq/total_elapsed) |
... | ... |
@@ -671,13 +677,6 @@ def cmd_sync_remote2remote(args): |
671 | 671 |
_do_deletes(s3, dst_list) |
672 | 672 |
|
673 | 673 |
def cmd_sync_remote2local(args): |
674 |
- def _parse_attrs_header(attrs_header): |
|
675 |
- attrs = {} |
|
676 |
- for attr in attrs_header.split("/"): |
|
677 |
- key, val = attr.split(":") |
|
678 |
- attrs[key] = val |
|
679 |
- return attrs |
|
680 |
- |
|
681 | 674 |
def _do_deletes(local_list): |
682 | 675 |
for key in local_list: |
683 | 676 |
os.unlink(local_list[key]['full_name']) |
... | ... |
@@ -696,7 +695,7 @@ def cmd_sync_remote2local(args): |
696 | 696 |
|
697 | 697 |
remote_list, exclude_list = filter_exclude_include(remote_list) |
698 | 698 |
|
699 |
- remote_list, local_list, existing_list, update_list = compare_filelists(remote_list, local_list, src_remote = True, dst_remote = False, delay_updates = cfg.delay_updates) |
|
699 |
+ remote_list, local_list, update_list, copy_pairs = compare_filelists(remote_list, local_list, src_remote = True, dst_remote = False, delay_updates = cfg.delay_updates) |
|
700 | 700 |
|
701 | 701 |
local_count = len(local_list) |
702 | 702 |
remote_count = len(remote_list) |
... | ... |
@@ -736,6 +735,11 @@ def cmd_sync_remote2local(args): |
736 | 736 |
warning(u"Exitting now because of --dry-run") |
737 | 737 |
return |
738 | 738 |
|
739 |
+ # if there are copy pairs, we can't do delete_before, on the chance |
|
740 |
+ # we need one of the to-be-deleted files as a copy source. |
|
741 |
+ if len(copy_pairs) > 0: |
|
742 |
+ cfg.delete_after = True |
|
743 |
+ |
|
739 | 744 |
if cfg.delete_removed and not cfg.delete_after: |
740 | 745 |
_do_deletes(local_list) |
741 | 746 |
|
... | ... |
@@ -768,7 +772,7 @@ def cmd_sync_remote2local(args): |
768 | 768 |
response = s3.object_get(uri, dst_stream, extra_label = seq_label) |
769 | 769 |
dst_stream.close() |
770 | 770 |
if response['headers'].has_key('x-amz-meta-s3cmd-attrs') and cfg.preserve_attrs: |
771 |
- attrs = _parse_attrs_header(response['headers']['x-amz-meta-s3cmd-attrs']) |
|
771 |
+ attrs = parse_attrs_header(response['headers']['x-amz-meta-s3cmd-attrs']) |
|
772 | 772 |
if attrs.has_key('mode'): |
773 | 773 |
os.chmod(dst_file, int(attrs['mode'])) |
774 | 774 |
if attrs.has_key('mtime') or attrs.has_key('atime'): |
... | ... |
@@ -837,33 +841,48 @@ def cmd_sync_remote2local(args): |
837 | 837 |
if cfg.delete_removed and cfg.delete_after: |
838 | 838 |
_do_deletes(local_list) |
839 | 839 |
|
840 |
+def remote_copy(s3, remote_list, copy_pairs, destination_base): |
|
841 |
+ saved_bytes = 0 |
|
842 |
+ for (dst1, dst2) in copy_pairs: |
|
843 |
+ debug(u"Remote Copying from %s to %s" % (dst1, dst2)) |
|
844 |
+ dst_uri = S3Uri(destination_base + dst2) |
|
845 |
+ extra_headers = copy(cfg.extra_headers) |
|
846 |
+ try: |
|
847 |
+ s3.object_copy(S3Uri(dst1['object_uri_str']), dst_uri, extra_headers) |
|
848 |
+ saved_bytes = saved_bytes + dst1['size'] |
|
849 |
+ except: |
|
850 |
+ raise |
|
851 |
+ return (len(copy_pairs), saved_bytes) |
|
852 |
+ |
|
853 |
+ |
|
840 | 854 |
def cmd_sync_local2remote(args): |
841 |
- def _build_attr_header(src): |
|
855 |
+ def _build_attr_header(local_list, src): |
|
842 | 856 |
import pwd, grp |
843 | 857 |
attrs = {} |
844 |
- src = deunicodise(src) |
|
845 |
- try: |
|
846 |
- st = os.stat_result(os.stat(src)) |
|
847 |
- except OSError, e: |
|
848 |
- raise InvalidFileError(u"%s: %s" % (unicodise(src), e.strerror)) |
|
849 | 858 |
for attr in cfg.preserve_attrs_list: |
850 | 859 |
if attr == 'uname': |
851 | 860 |
try: |
852 |
- val = pwd.getpwuid(st.st_uid).pw_name |
|
861 |
+ val = pwd.getpwuid(local_list[src]['uid']).pw_name |
|
853 | 862 |
except KeyError: |
854 | 863 |
attr = "uid" |
855 |
- val = st.st_uid |
|
856 |
- warning(u"%s: Owner username not known. Storing UID=%d instead." % (unicodise(src), val)) |
|
864 |
+ val = local_list[src].get(uid) |
|
865 |
+ warning(u"%s: Owner username not known. Storing UID=%d instead." % (src, val)) |
|
857 | 866 |
elif attr == 'gname': |
858 | 867 |
try: |
859 |
- val = grp.getgrgid(st.st_gid).gr_name |
|
868 |
+ val = grp.getgrgid(local_list[src].get('gid')).gr_name |
|
860 | 869 |
except KeyError: |
861 | 870 |
attr = "gid" |
862 |
- val = st.st_gid |
|
863 |
- warning(u"%s: Owner groupname not known. Storing GID=%d instead." % (unicodise(src), val)) |
|
871 |
+ val = local_list[src].get('gid') |
|
872 |
+ warning(u"%s: Owner groupname not known. Storing GID=%d instead." % (src, val)) |
|
873 |
+ elif attr == 'md5': |
|
874 |
+ val = local_list.get_md5(src) |
|
864 | 875 |
else: |
865 |
- val = getattr(st, 'st_' + attr) |
|
876 |
+ val = getattr(local_list[src]['sr'], 'st_' + attr) |
|
866 | 877 |
attrs[attr] = val |
878 |
+ |
|
879 |
+ if 'md5' in attrs and attrs['md5'] is None: |
|
880 |
+ del attrs['md5'] |
|
881 |
+ |
|
867 | 882 |
result = "" |
868 | 883 |
for k in attrs: result += "%s:%s/" % (k, attrs[k]) |
869 | 884 |
return { 'x-amz-meta-s3cmd-attrs' : result[:-1] } |
... | ... |
@@ -904,15 +923,15 @@ def cmd_sync_local2remote(args): |
904 | 904 |
# Flush remote_list, by the way |
905 | 905 |
remote_list = { local_list.keys()[0] : remote_list_entry } |
906 | 906 |
|
907 |
- local_list, remote_list, existing_list, update_list = compare_filelists(local_list, remote_list, src_remote = False, dst_remote = True, delay_updates = cfg.delay_updates) |
|
907 |
+ local_list, remote_list, update_list, copy_pairs = compare_filelists(local_list, remote_list, src_remote = False, dst_remote = True, delay_updates = cfg.delay_updates) |
|
908 | 908 |
|
909 | 909 |
|
910 | 910 |
local_count = len(local_list) |
911 | 911 |
update_count = len(update_list) |
912 |
+ copy_count = len(copy_pairs) |
|
912 | 913 |
remote_count = len(remote_list) |
913 | 914 |
|
914 |
- info(u"Summary: %d local files to upload, %d remote files to delete" % (local_count + update_count, remote_count)) |
|
915 |
- |
|
915 |
+ info(u"Summary: %d local files to upload, %d files to remote copy, %d remote files to delete" % (local_count + update_count, copy_count, remote_count)) |
|
916 | 916 |
|
917 | 917 |
def _set_remote_uri(local_list, destination_base, single_file_local): |
918 | 918 |
if len(local_list) > 0: |
... | ... |
@@ -931,17 +950,24 @@ def cmd_sync_local2remote(args): |
931 | 931 |
if cfg.dry_run: |
932 | 932 |
for key in exclude_list: |
933 | 933 |
output(u"exclude: %s" % unicodise(key)) |
934 |
- if cfg.delete_removed: |
|
935 |
- for key in remote_list: |
|
936 |
- output(u"delete: %s" % remote_list[key]['object_uri_str']) |
|
937 | 934 |
for key in local_list: |
938 | 935 |
output(u"upload: %s -> %s" % (local_list[key]['full_name_unicode'], local_list[key]['remote_uri'])) |
939 | 936 |
for key in update_list: |
940 | 937 |
output(u"upload: %s -> %s" % (update_list[key]['full_name_unicode'], update_list[key]['remote_uri'])) |
938 |
+ for (dst1, dst2) in copy_pairs: |
|
939 |
+ output(u"remote copy: %s -> %s" % (dst1['object_key'], remote_list[dst2]['object_key'])) |
|
940 |
+ if cfg.delete_removed: |
|
941 |
+ for key in remote_list: |
|
942 |
+ output(u"delete: %s" % remote_list[key]['object_uri_str']) |
|
941 | 943 |
|
942 | 944 |
warning(u"Exitting now because of --dry-run") |
943 | 945 |
return |
944 | 946 |
|
947 |
+ # if there are copy pairs, we can't do delete_before, on the chance |
|
948 |
+ # we need one of the to-be-deleted files as a copy source. |
|
949 |
+ if len(copy_pairs) > 0: |
|
950 |
+ cfg.delete_after = True |
|
951 |
+ |
|
945 | 952 |
if cfg.delete_removed and not cfg.delete_after: |
946 | 953 |
_do_deletes(s3, remote_list) |
947 | 954 |
|
... | ... |
@@ -962,7 +988,7 @@ def cmd_sync_local2remote(args): |
962 | 962 |
extra_headers = copy(cfg.extra_headers) |
963 | 963 |
try: |
964 | 964 |
if cfg.preserve_attrs: |
965 |
- attr_header = _build_attr_header(src) |
|
965 |
+ attr_header = _build_attr_header(local_list, file) |
|
966 | 966 |
debug(u"attr_header: %s" % attr_header) |
967 | 967 |
extra_headers.update(attr_header) |
968 | 968 |
response = s3.object_put(src, uri, extra_headers, extra_label = seq_label) |
... | ... |
@@ -981,23 +1007,24 @@ def cmd_sync_local2remote(args): |
981 | 981 |
uploaded_objects_list.append(uri.object()) |
982 | 982 |
return seq, total_size |
983 | 983 |
|
984 |
+ |
|
984 | 985 |
n, total_size = _upload(local_list, 0, local_count, total_size) |
985 | 986 |
n, total_size = _upload(update_list, n, local_count, total_size) |
987 |
+ n_copies, saved_bytes = remote_copy(s3, remote_list, copy_pairs, destination_base) |
|
988 |
+ if cfg.delete_removed and cfg.delete_after: |
|
989 |
+ _do_deletes(s3, remote_list) |
|
986 | 990 |
total_elapsed = time.time() - timestamp_start |
987 | 991 |
total_speed = total_elapsed and total_size/total_elapsed or 0.0 |
988 | 992 |
speed_fmt = formatSize(total_speed, human_readable = True, floating_point = True) |
989 | 993 |
|
990 | 994 |
# Only print out the result if any work has been done or |
991 | 995 |
# if the user asked for verbose output |
992 |
- outstr = "Done. Uploaded %d bytes in %0.1f seconds, %0.2f %sB/s" % (total_size, total_elapsed, speed_fmt[0], speed_fmt[1]) |
|
993 |
- if total_size > 0: |
|
996 |
+ outstr = "Done. Uploaded %d bytes in %0.1f seconds, %0.2f %sB/s. Copied %d files saving %d bytes transfer." % (total_size, total_elapsed, speed_fmt[0], speed_fmt[1], n_copies, saved_bytes) |
|
997 |
+ if total_size + saved_bytes > 0: |
|
994 | 998 |
output(outstr) |
995 | 999 |
else: |
996 | 1000 |
info(outstr) |
997 | 1001 |
|
998 |
- if cfg.delete_removed and cfg.delete_after: |
|
999 |
- _do_deletes(s3, remote_list) |
|
1000 |
- |
|
1001 | 1002 |
if cfg.invalidate_on_cf: |
1002 | 1003 |
if len(uploaded_objects_list) == 0: |
1003 | 1004 |
info("Nothing to invalidate in CloudFront") |