Split md5/hardlink stuff out of SortedDict into FileDict
Michal Ludvig authored on 2013/03/09 09:31:381 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,53 @@ |
0 |
+## Amazon S3 manager |
|
1 |
+## Author: Michal Ludvig <michal@logix.cz> |
|
2 |
+## http://www.logix.cz/michal |
|
3 |
+## License: GPL Version 2 |
|
4 |
+ |
|
5 |
+from SortedDict import SortedDict |
|
6 |
+import Utils |
|
7 |
+ |
|
8 |
class FileDict(SortedDict):
    """SortedDict of file entries with md5 and hardlink bookkeeping.

    Each value is expected to be a per-file attribute dict; the methods
    below read its 'md5', 'full_name', 'dev' and 'inode' keys.

    Two side indexes are kept next to the mapping itself:
      hardlinks: { dev: { inode: {'md5': md5, 'relative_files': set()} } }
      by_md5:    { md5: set(relative_files) }
    """

    def __init__(self, mapping = None, ignore_case = True, **kwargs):
        """Build the dict and start with empty md5 / hardlink indexes.

        mapping defaults to None (meaning "start empty") rather than to a
        shared mutable {} literal.
        """
        if mapping is None:
            mapping = {}
        SortedDict.__init__(self, mapping = mapping, ignore_case = ignore_case, **kwargs)
        self.hardlinks = dict() # { dev: { inode : {'md5':, 'relative_files':}}}
        self.by_md5 = dict() # {md5: set(relative_files)}

    def record_md5(self, relative_file, md5):
        """Remember that relative_file has the given md5."""
        self.by_md5.setdefault(md5, set()).add(relative_file)

    def find_md5_one(self, md5):
        """Return one relative file recorded with this md5, or None.

        Which file is returned when several share the md5 is arbitrary
        (set iteration order).
        """
        # Returning from the first iteration picks the same element that
        # list(...)[0] would, without building a list or relying on an
        # exception for the "nothing recorded" case.
        for relative_file in self.by_md5.get(md5, ()):
            return relative_file
        return None

    def get_md5(self, relative_file):
        """returns md5 if it can, or raises IOError if file is unreadable

        Lookup order: md5 already stored on the entry, then md5 recorded
        for the entry's hardlink (dev, inode), then hashing 'full_name'
        with Utils.hash_file_md5. The result is cached on the entry and
        added to by_md5. Raises KeyError if relative_file is not in the
        dict.
        """
        if 'md5' in self[relative_file]:
            return self[relative_file]['md5']
        md5 = self.get_hardlink_md5(relative_file)
        if md5 is None:
            md5 = Utils.hash_file_md5(self[relative_file]['full_name'])
        self.record_md5(relative_file, md5)
        self[relative_file]['md5'] = md5
        return md5

    def record_hardlink(self, relative_file, dev, inode, md5):
        """Register relative_file as a name for the file at (dev, inode).

        The md5 is stored only when the (dev, inode) pair is seen for the
        first time; later calls just add relative_file to the set of names.
        """
        by_inode = self.hardlinks.setdefault(dev, dict())
        if inode not in by_inode:
            by_inode[inode] = dict(md5=md5, relative_files=set())
        by_inode[inode]['relative_files'].add(relative_file)

    def get_hardlink_md5(self, relative_file):
        """Return the md5 recorded for relative_file's (dev, inode), or None.

        None is returned when the entry carries no 'dev'/'inode' keys or
        when no hardlink has been recorded for that pair. Raises KeyError
        if relative_file itself is not in the dict.
        """
        # Looked up outside the try so an unknown file still raises.
        entry = self[relative_file]
        try:
            return self.hardlinks[entry['dev']][entry['inode']]['md5']
        except KeyError:
            # Only "not recorded" is expected here; anything else should
            # surface instead of being silently swallowed.
            return None
... | ... |
@@ -6,7 +6,7 @@ |
6 | 6 |
from S3 import S3 |
7 | 7 |
from Config import Config |
8 | 8 |
from S3Uri import S3Uri |
9 |
-from SortedDict import SortedDict |
|
9 |
+from FileDict import FileDict |
|
10 | 10 |
from Utils import * |
11 | 11 |
from Exceptions import ParameterError |
12 | 12 |
from HashCache import HashCache |
... | ... |
@@ -58,7 +58,7 @@ def _fswalk_no_symlinks(path): |
58 | 58 |
def filter_exclude_include(src_list): |
59 | 59 |
info(u"Applying --exclude/--include") |
60 | 60 |
cfg = Config() |
61 |
- exclude_list = SortedDict(ignore_case = False) |
|
61 |
+ exclude_list = FileDict(ignore_case = False) |
|
62 | 62 |
for file in src_list.keys(): |
63 | 63 |
debug(u"CHECK: %s" % file) |
64 | 64 |
excluded = False |
... | ... |
@@ -224,7 +224,7 @@ def fetch_local_list(args, recursive = None): |
224 | 224 |
info(u"No cache file found, creating it.") |
225 | 225 |
|
226 | 226 |
local_uris = [] |
227 |
- local_list = SortedDict(ignore_case = False) |
|
227 |
+ local_list = FileDict(ignore_case = False) |
|
228 | 228 |
single_file = False |
229 | 229 |
|
230 | 230 |
if type(args) not in (list, tuple): |
... | ... |
@@ -284,7 +284,7 @@ def fetch_remote_list(args, require_attribs = False, recursive = None): |
284 | 284 |
rem_base = rem_base[:rem_base.rfind('/')+1] |
285 | 285 |
remote_uri = S3Uri("s3://%s/%s" % (remote_uri.bucket(), rem_base)) |
286 | 286 |
rem_base_len = len(rem_base) |
287 |
- rem_list = SortedDict(ignore_case = False) |
|
287 |
+ rem_list = FileDict(ignore_case = False) |
|
288 | 288 |
break_now = False |
289 | 289 |
for object in response['list']: |
290 | 290 |
if object['Key'] == rem_base_original and object['Key'][-1] != os.path.sep: |
... | ... |
@@ -292,7 +292,7 @@ def fetch_remote_list(args, require_attribs = False, recursive = None): |
292 | 292 |
key = os.path.basename(object['Key']) |
293 | 293 |
object_uri_str = remote_uri_original.uri() |
294 | 294 |
break_now = True |
295 |
- rem_list = SortedDict(ignore_case = False) ## Remove whatever has already been put to rem_list |
|
295 |
+ rem_list = FileDict(ignore_case = False) ## Remove whatever has already been put to rem_list |
|
296 | 296 |
else: |
297 | 297 |
key = object['Key'][rem_base_len:] ## Beware - this may be '' if object['Key']==rem_base !! |
298 | 298 |
object_uri_str = remote_uri.uri() + key |
... | ... |
@@ -314,7 +314,7 @@ def fetch_remote_list(args, require_attribs = False, recursive = None): |
314 | 314 |
|
315 | 315 |
cfg = Config() |
316 | 316 |
remote_uris = [] |
317 |
- remote_list = SortedDict(ignore_case = False) |
|
317 |
+ remote_list = FileDict(ignore_case = False) |
|
318 | 318 |
|
319 | 319 |
if type(args) not in (list, tuple): |
320 | 320 |
args = [args] |
... | ... |
@@ -436,7 +436,7 @@ def compare_filelists(src_list, dst_list, src_remote, dst_remote, delay_updates |
436 | 436 |
## Items left on src_list will be transferred |
437 | 437 |
## Items left on update_list will be transferred after src_list |
438 | 438 |
## Items left on copy_pairs will be copied from dst1 to dst2 |
439 |
- update_list = SortedDict(ignore_case = False) |
|
439 |
+ update_list = FileDict(ignore_case = False) |
|
440 | 440 |
## Items left on dst_list will be deleted |
441 | 441 |
copy_pairs = [] |
442 | 442 |
|
... | ... |
@@ -27,8 +27,6 @@ class SortedDict(dict): |
27 | 27 |
""" |
28 | 28 |
dict.__init__(self, mapping, **kwargs) |
29 | 29 |
self.ignore_case = ignore_case |
30 |
- self.hardlinks = dict() # { dev: { inode : {'md5':, 'relative_files':}}} |
|
31 |
- self.by_md5 = dict() # {md5: set(relative_files)} |
|
32 | 30 |
|
33 | 31 |
def keys(self): |
34 | 32 |
keys = dict.keys(self) |
... | ... |
@@ -49,45 +47,6 @@ class SortedDict(dict): |
49 | 49 |
return SortedDictIterator(self, self.keys()) |
50 | 50 |
|
51 | 51 |
|
52 |
- def record_md5(self, relative_file, md5): |
|
53 |
- if md5 not in self.by_md5: |
|
54 |
- self.by_md5[md5] = set() |
|
55 |
- self.by_md5[md5].add(relative_file) |
|
56 |
- |
|
57 |
- def find_md5_one(self, md5): |
|
58 |
- try: |
|
59 |
- return list(self.by_md5.get(md5, set()))[0] |
|
60 |
- except: |
|
61 |
- return None |
|
62 |
- |
|
63 |
- def get_md5(self, relative_file): |
|
64 |
- """returns md5 if it can, or raises IOError if file is unreadable""" |
|
65 |
- md5 = None |
|
66 |
- if 'md5' in self[relative_file]: |
|
67 |
- return self[relative_file]['md5'] |
|
68 |
- md5 = self.get_hardlink_md5(relative_file) |
|
69 |
- if md5 is None: |
|
70 |
- md5 = Utils.hash_file_md5(self[relative_file]['full_name']) |
|
71 |
- self.record_md5(relative_file, md5) |
|
72 |
- self[relative_file]['md5'] = md5 |
|
73 |
- return md5 |
|
74 |
- |
|
75 |
- def record_hardlink(self, relative_file, dev, inode, md5): |
|
76 |
- if dev not in self.hardlinks: |
|
77 |
- self.hardlinks[dev] = dict() |
|
78 |
- if inode not in self.hardlinks[dev]: |
|
79 |
- self.hardlinks[dev][inode] = dict(md5=md5, relative_files=set()) |
|
80 |
- self.hardlinks[dev][inode]['relative_files'].add(relative_file) |
|
81 |
- |
|
82 |
- def get_hardlink_md5(self, relative_file): |
|
83 |
- md5 = None |
|
84 |
- dev = self[relative_file]['dev'] |
|
85 |
- inode = self[relative_file]['inode'] |
|
86 |
- try: |
|
87 |
- md5 = self.hardlinks[dev][inode]['md5'] |
|
88 |
- except: |
|
89 |
- pass |
|
90 |
- return md5 |
|
91 | 52 |
|
92 | 53 |
if __name__ == "__main__": |
93 | 54 |
d = { 'AWS' : 1, 'Action' : 2, 'america' : 3, 'Auckland' : 4, 'America' : 5 } |
... | ... |
@@ -911,7 +911,7 @@ def local_copy(copy_pairs, destination_base): |
911 | 911 |
# Do NOT hardlink local files by default, that'd be silly |
912 | 912 |
# For instance all empty files would become hardlinked together! |
913 | 913 |
|
914 |
- failed_copy_list = SortedDict() |
|
914 |
+ failed_copy_list = FileDict() |
|
915 | 915 |
for (src_obj, dst1, relative_file) in copy_pairs: |
916 | 916 |
src_file = os.path.join(destination_base, dst1) |
917 | 917 |
dst_file = os.path.join(destination_base, relative_file) |
... | ... |
@@ -1076,7 +1076,7 @@ def cmd_sync_local2remote(args): |
1076 | 1076 |
## Make remote_key same as local_key for comparison if we're dealing with only one file |
1077 | 1077 |
remote_list_entry = remote_list[remote_list.keys()[0]] |
1078 | 1078 |
# Flush remote_list, by the way |
1079 |
- remote_list = SortedDict() |
|
1079 |
+ remote_list = FileDict() |
|
1080 | 1080 |
remote_list[local_list.keys()[0]] = remote_list_entry |
1081 | 1081 |
|
1082 | 1082 |
local_list, remote_list, update_list, copy_pairs = compare_filelists(local_list, remote_list, src_remote = False, dst_remote = True, delay_updates = cfg.delay_updates) |
... | ... |
@@ -2079,6 +2079,7 @@ if __name__ == '__main__': |
2079 | 2079 |
from S3.S3 import S3 |
2080 | 2080 |
from S3.Config import Config |
2081 | 2081 |
from S3.SortedDict import SortedDict |
2082 |
+ from S3.FileDict import FileDict |
|
2082 | 2083 |
from S3.S3Uri import S3Uri |
2083 | 2084 |
from S3 import Utils |
2084 | 2085 |
from S3.Utils import * |