I should not have overloaded SortedDict with all the md5 and hardlink
tracking stuff, which is only used by file lists. There are other uses of
SortedDict elsewhere in the app that need not be cluttered by these
additional features in SortedDict.
Solution: derive a new class, FileDict, from SortedDict, and put the md5
and hardlink tracking code into it. Then use FileDict instead of
SortedDict where those features are needed.
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,53 @@ |
0 |
+## Amazon S3 manager |
|
1 |
+## Author: Michal Ludvig <michal@logix.cz> |
|
2 |
+## http://www.logix.cz/michal |
|
3 |
+## License: GPL Version 2 |
|
4 |
+ |
|
5 |
+from SortedDict import SortedDict |
|
6 |
+import Utils |
|
7 |
+ |
|
8 |
+class FileDict(SortedDict): |
|
9 |
+ def __init__(self, mapping = {}, ignore_case = True, **kwargs): |
|
10 |
+ SortedDict.__init__(self, mapping = mapping, ignore_case = ignore_case, **kwargs) |
|
11 |
+ self.hardlinks = dict() # { dev: { inode : {'md5':, 'relative_files':}}} |
|
12 |
+ self.by_md5 = dict() # {md5: set(relative_files)} |
|
13 |
+ |
|
14 |
+ def record_md5(self, relative_file, md5): |
|
15 |
+ if md5 not in self.by_md5: |
|
16 |
+ self.by_md5[md5] = set() |
|
17 |
+ self.by_md5[md5].add(relative_file) |
|
18 |
+ |
|
19 |
+ def find_md5_one(self, md5): |
|
20 |
+ try: |
|
21 |
+ return list(self.by_md5.get(md5, set()))[0] |
|
22 |
+ except: |
|
23 |
+ return None |
|
24 |
+ |
|
25 |
+ def get_md5(self, relative_file): |
|
26 |
+ """returns md5 if it can, or raises IOError if file is unreadable""" |
|
27 |
+ md5 = None |
|
28 |
+ if 'md5' in self[relative_file]: |
|
29 |
+ return self[relative_file]['md5'] |
|
30 |
+ md5 = self.get_hardlink_md5(relative_file) |
|
31 |
+ if md5 is None: |
|
32 |
+ md5 = Utils.hash_file_md5(self[relative_file]['full_name']) |
|
33 |
+ self.record_md5(relative_file, md5) |
|
34 |
+ self[relative_file]['md5'] = md5 |
|
35 |
+ return md5 |
|
36 |
+ |
|
37 |
+ def record_hardlink(self, relative_file, dev, inode, md5): |
|
38 |
+ if dev not in self.hardlinks: |
|
39 |
+ self.hardlinks[dev] = dict() |
|
40 |
+ if inode not in self.hardlinks[dev]: |
|
41 |
+ self.hardlinks[dev][inode] = dict(md5=md5, relative_files=set()) |
|
42 |
+ self.hardlinks[dev][inode]['relative_files'].add(relative_file) |
|
43 |
+ |
|
44 |
+ def get_hardlink_md5(self, relative_file): |
|
45 |
+ md5 = None |
|
46 |
+ dev = self[relative_file]['dev'] |
|
47 |
+ inode = self[relative_file]['inode'] |
|
48 |
+ try: |
|
49 |
+ md5 = self.hardlinks[dev][inode]['md5'] |
|
50 |
+ except: |
|
51 |
+ pass |
|
52 |
+ return md5 |
... | ... |
@@ -6,7 +6,7 @@ |
6 | 6 |
from S3 import S3 |
7 | 7 |
from Config import Config |
8 | 8 |
from S3Uri import S3Uri |
9 |
-from SortedDict import SortedDict |
|
9 |
+from FileDict import FileDict |
|
10 | 10 |
from Utils import * |
11 | 11 |
from Exceptions import ParameterError |
12 | 12 |
from HashCache import HashCache |
... | ... |
@@ -58,7 +58,7 @@ def _fswalk_no_symlinks(path): |
58 | 58 |
def filter_exclude_include(src_list): |
59 | 59 |
info(u"Applying --exclude/--include") |
60 | 60 |
cfg = Config() |
61 |
- exclude_list = SortedDict(ignore_case = False) |
|
61 |
+ exclude_list = FileDict(ignore_case = False) |
|
62 | 62 |
for file in src_list.keys(): |
63 | 63 |
debug(u"CHECK: %s" % file) |
64 | 64 |
excluded = False |
... | ... |
@@ -224,7 +224,7 @@ def fetch_local_list(args, recursive = None): |
224 | 224 |
info(u"No cache file found, creating it.") |
225 | 225 |
|
226 | 226 |
local_uris = [] |
227 |
- local_list = SortedDict(ignore_case = False) |
|
227 |
+ local_list = FileDict(ignore_case = False) |
|
228 | 228 |
single_file = False |
229 | 229 |
|
230 | 230 |
if type(args) not in (list, tuple): |
... | ... |
@@ -284,7 +284,7 @@ def fetch_remote_list(args, require_attribs = False, recursive = None): |
284 | 284 |
rem_base = rem_base[:rem_base.rfind('/')+1] |
285 | 285 |
remote_uri = S3Uri("s3://%s/%s" % (remote_uri.bucket(), rem_base)) |
286 | 286 |
rem_base_len = len(rem_base) |
287 |
- rem_list = SortedDict(ignore_case = False) |
|
287 |
+ rem_list = FileDict(ignore_case = False) |
|
288 | 288 |
break_now = False |
289 | 289 |
for object in response['list']: |
290 | 290 |
if object['Key'] == rem_base_original and object['Key'][-1] != os.path.sep: |
... | ... |
@@ -292,7 +292,7 @@ def fetch_remote_list(args, require_attribs = False, recursive = None): |
292 | 292 |
key = os.path.basename(object['Key']) |
293 | 293 |
object_uri_str = remote_uri_original.uri() |
294 | 294 |
break_now = True |
295 |
- rem_list = SortedDict(ignore_case = False) ## Remove whatever has already been put to rem_list |
|
295 |
+ rem_list = FileDict(ignore_case = False) ## Remove whatever has already been put to rem_list |
|
296 | 296 |
else: |
297 | 297 |
key = object['Key'][rem_base_len:] ## Beware - this may be '' if object['Key']==rem_base !! |
298 | 298 |
object_uri_str = remote_uri.uri() + key |
... | ... |
@@ -314,7 +314,7 @@ def fetch_remote_list(args, require_attribs = False, recursive = None): |
314 | 314 |
|
315 | 315 |
cfg = Config() |
316 | 316 |
remote_uris = [] |
317 |
- remote_list = SortedDict(ignore_case = False) |
|
317 |
+ remote_list = FileDict(ignore_case = False) |
|
318 | 318 |
|
319 | 319 |
if type(args) not in (list, tuple): |
320 | 320 |
args = [args] |
... | ... |
@@ -436,7 +436,7 @@ def compare_filelists(src_list, dst_list, src_remote, dst_remote, delay_updates |
436 | 436 |
## Items left on src_list will be transferred |
437 | 437 |
## Items left on update_list will be transferred after src_list |
438 | 438 |
## Items left on copy_pairs will be copied from dst1 to dst2 |
439 |
- update_list = SortedDict(ignore_case = False) |
|
439 |
+ update_list = FileDict(ignore_case = False) |
|
440 | 440 |
## Items left on dst_list will be deleted |
441 | 441 |
copy_pairs = [] |
442 | 442 |
|
... | ... |
@@ -27,8 +27,6 @@ class SortedDict(dict): |
27 | 27 |
""" |
28 | 28 |
dict.__init__(self, mapping, **kwargs) |
29 | 29 |
self.ignore_case = ignore_case |
30 |
- self.hardlinks = dict() # { dev: { inode : {'md5':, 'relative_files':}}} |
|
31 |
- self.by_md5 = dict() # {md5: set(relative_files)} |
|
32 | 30 |
|
33 | 31 |
def keys(self): |
34 | 32 |
keys = dict.keys(self) |
... | ... |
@@ -49,45 +47,6 @@ class SortedDict(dict): |
49 | 49 |
return SortedDictIterator(self, self.keys()) |
50 | 50 |
|
51 | 51 |
|
52 |
- def record_md5(self, relative_file, md5): |
|
53 |
- if md5 not in self.by_md5: |
|
54 |
- self.by_md5[md5] = set() |
|
55 |
- self.by_md5[md5].add(relative_file) |
|
56 |
- |
|
57 |
- def find_md5_one(self, md5): |
|
58 |
- try: |
|
59 |
- return list(self.by_md5.get(md5, set()))[0] |
|
60 |
- except: |
|
61 |
- return None |
|
62 |
- |
|
63 |
- def get_md5(self, relative_file): |
|
64 |
- """returns md5 if it can, or raises IOError if file is unreadable""" |
|
65 |
- md5 = None |
|
66 |
- if 'md5' in self[relative_file]: |
|
67 |
- return self[relative_file]['md5'] |
|
68 |
- md5 = self.get_hardlink_md5(relative_file) |
|
69 |
- if md5 is None: |
|
70 |
- md5 = Utils.hash_file_md5(self[relative_file]['full_name']) |
|
71 |
- self.record_md5(relative_file, md5) |
|
72 |
- self[relative_file]['md5'] = md5 |
|
73 |
- return md5 |
|
74 |
- |
|
75 |
- def record_hardlink(self, relative_file, dev, inode, md5): |
|
76 |
- if dev not in self.hardlinks: |
|
77 |
- self.hardlinks[dev] = dict() |
|
78 |
- if inode not in self.hardlinks[dev]: |
|
79 |
- self.hardlinks[dev][inode] = dict(md5=md5, relative_files=set()) |
|
80 |
- self.hardlinks[dev][inode]['relative_files'].add(relative_file) |
|
81 |
- |
|
82 |
- def get_hardlink_md5(self, relative_file): |
|
83 |
- md5 = None |
|
84 |
- dev = self[relative_file]['dev'] |
|
85 |
- inode = self[relative_file]['inode'] |
|
86 |
- try: |
|
87 |
- md5 = self.hardlinks[dev][inode]['md5'] |
|
88 |
- except: |
|
89 |
- pass |
|
90 |
- return md5 |
|
91 | 52 |
|
92 | 53 |
if __name__ == "__main__": |
93 | 54 |
d = { 'AWS' : 1, 'Action' : 2, 'america' : 3, 'Auckland' : 4, 'America' : 5 } |
... | ... |
@@ -894,7 +894,7 @@ def local_copy(copy_pairs, destination_base): |
894 | 894 |
# Do NOT hardlink local files by default, that'd be silly |
895 | 895 |
# For instance all empty files would become hardlinked together! |
896 | 896 |
|
897 |
- failed_copy_list = SortedDict() |
|
897 |
+ failed_copy_list = FileDict() |
|
898 | 898 |
for (src_obj, dst1, relative_file) in copy_pairs: |
899 | 899 |
src_file = os.path.join(destination_base, dst1) |
900 | 900 |
dst_file = os.path.join(destination_base, relative_file) |
... | ... |
@@ -1059,7 +1059,7 @@ def cmd_sync_local2remote(args): |
1059 | 1059 |
## Make remote_key same as local_key for comparison if we're dealing with only one file |
1060 | 1060 |
remote_list_entry = remote_list[remote_list.keys()[0]] |
1061 | 1061 |
# Flush remote_list, by the way |
1062 |
- remote_list = SortedDict() |
|
1062 |
+ remote_list = FileDict() |
|
1063 | 1063 |
remote_list[local_list.keys()[0]] = remote_list_entry |
1064 | 1064 |
|
1065 | 1065 |
local_list, remote_list, update_list, copy_pairs = compare_filelists(local_list, remote_list, src_remote = False, dst_remote = True, delay_updates = cfg.delay_updates) |
... | ... |
@@ -2053,6 +2053,7 @@ if __name__ == '__main__': |
2053 | 2053 |
from S3.S3 import S3 |
2054 | 2054 |
from S3.Config import Config |
2055 | 2055 |
from S3.SortedDict import SortedDict |
2056 |
+ from S3.FileDict import FileDict |
|
2056 | 2057 |
from S3.S3Uri import S3Uri |
2057 | 2058 |
from S3 import Utils |
2058 | 2059 |
from S3.Utils import * |