Browse code

Split md5/hardlink stuff out of SortedDict into FileDict

I should not have overloaded SortedDict with all the md5 and hardlink
tracking stuff, which is only used by file lists. There are other uses of
SortedDict elsewhere in the app that need not be cluttered by these
additional features in SortedDict.

Solution: derive new class FileDict from SortedDict, and put the md5
and hardlink tracking code into that. Then use FileDict instead of
SortedDict where those features are needed.

Matt Domsch authored on 2013/03/07 23:29:34
Showing 4 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,53 @@
0
+## Amazon S3 manager
1
+## Author: Michal Ludvig <michal@logix.cz>
2
+##         http://www.logix.cz/michal
3
+## License: GPL Version 2
4
+
5
+from SortedDict import SortedDict
6
+import Utils
7
+
8
class FileDict(SortedDict):
    """SortedDict of per-file entries with md5 and hardlink bookkeeping.

    Values are dicts describing one file each; the methods below read the
    'md5', 'dev', 'inode' and 'full_name' keys from them.

    Attributes:
        hardlinks: { dev: { inode: {'md5': ..., 'relative_files': set()} } }
        by_md5:    { md5: set(relative_files) }
    """

    def __init__(self, mapping = None, ignore_case = True, **kwargs):
        """Create the dict; arguments are forwarded to SortedDict.__init__."""
        # Build a fresh dict per call rather than sharing one mutable
        # default argument object between all instances.
        if mapping is None:
            mapping = {}
        SortedDict.__init__(self, mapping = mapping, ignore_case = ignore_case, **kwargs)
        self.hardlinks = dict() # { dev: { inode : {'md5':, 'relative_files':}}}
        self.by_md5 = dict() # {md5: set(relative_files)}

    def record_md5(self, relative_file, md5):
        """Remember that relative_file has the given md5."""
        self.by_md5.setdefault(md5, set()).add(relative_file)

    def find_md5_one(self, md5):
        """Return one (arbitrary) relative file recorded under md5, or None."""
        files = self.by_md5.get(md5)
        if not files:
            return None
        # Sets are unordered, so "first" is arbitrary -- same element that
        # list(files)[0] would yield, without building the list.
        return next(iter(files))

    def get_md5(self, relative_file):
        """returns md5 if it can, or raises IOError if file is unreadable

        Lookup order: md5 cached on the entry, md5 recorded for the entry's
        hardlink (dev, inode), and finally hashing the file at 'full_name'.
        A freshly obtained md5 is cached on the entry and in by_md5.
        Raises KeyError if relative_file is not in this dict.
        """
        entry = self[relative_file]
        if 'md5' in entry:
            return entry['md5']
        md5 = self.get_hardlink_md5(relative_file)
        if md5 is None:
            md5 = Utils.hash_file_md5(entry['full_name'])
        self.record_md5(relative_file, md5)
        entry['md5'] = md5
        return md5

    def record_hardlink(self, relative_file, dev, inode, md5):
        """Register relative_file as a name for the file at (dev, inode)."""
        by_inode = self.hardlinks.setdefault(dev, dict())
        record = by_inode.get(inode)
        if record is None:
            record = by_inode[inode] = dict(md5=md5, relative_files=set())
        elif record['md5'] is None and md5 is not None:
            # The inode was first seen without a known md5; all names of one
            # inode share content, so fill the value in once it is known.
            record['md5'] = md5
        record['relative_files'].add(relative_file)

    def get_hardlink_md5(self, relative_file):
        """Return the md5 recorded for relative_file's (dev, inode), or None.

        None is returned when the entry carries no 'dev'/'inode' keys or no
        hardlink has been recorded for them. Raises KeyError if
        relative_file itself is not in this dict.
        """
        entry = self[relative_file]
        try:
            return self.hardlinks[entry['dev']][entry['inode']]['md5']
        except KeyError:
            # Only a missing key means "unknown"; anything else is a real
            # error and must not be silently swallowed.
            return None
... ...
@@ -6,7 +6,7 @@
6 6
 from S3 import S3
7 7
 from Config import Config
8 8
 from S3Uri import S3Uri
9
-from SortedDict import SortedDict
9
+from FileDict import FileDict
10 10
 from Utils import *
11 11
 from Exceptions import ParameterError
12 12
 from HashCache import HashCache
... ...
@@ -58,7 +58,7 @@ def _fswalk_no_symlinks(path):
58 58
 def filter_exclude_include(src_list):
59 59
     info(u"Applying --exclude/--include")
60 60
     cfg = Config()
61
-    exclude_list = SortedDict(ignore_case = False)
61
+    exclude_list = FileDict(ignore_case = False)
62 62
     for file in src_list.keys():
63 63
         debug(u"CHECK: %s" % file)
64 64
         excluded = False
... ...
@@ -224,7 +224,7 @@ def fetch_local_list(args, recursive = None):
224 224
             info(u"No cache file found, creating it.")
225 225
 
226 226
     local_uris = []
227
-    local_list = SortedDict(ignore_case = False)
227
+    local_list = FileDict(ignore_case = False)
228 228
     single_file = False
229 229
 
230 230
     if type(args) not in (list, tuple):
... ...
@@ -284,7 +284,7 @@ def fetch_remote_list(args, require_attribs = False, recursive = None):
284 284
             rem_base = rem_base[:rem_base.rfind('/')+1]
285 285
             remote_uri = S3Uri("s3://%s/%s" % (remote_uri.bucket(), rem_base))
286 286
         rem_base_len = len(rem_base)
287
-        rem_list = SortedDict(ignore_case = False)
287
+        rem_list = FileDict(ignore_case = False)
288 288
         break_now = False
289 289
         for object in response['list']:
290 290
             if object['Key'] == rem_base_original and object['Key'][-1] != os.path.sep:
... ...
@@ -292,7 +292,7 @@ def fetch_remote_list(args, require_attribs = False, recursive = None):
292 292
                 key = os.path.basename(object['Key'])
293 293
                 object_uri_str = remote_uri_original.uri()
294 294
                 break_now = True
295
-                rem_list = SortedDict(ignore_case = False)   ## Remove whatever has already been put to rem_list
295
+                rem_list = FileDict(ignore_case = False)   ## Remove whatever has already been put to rem_list
296 296
             else:
297 297
                 key = object['Key'][rem_base_len:]      ## Beware - this may be '' if object['Key']==rem_base !!
298 298
                 object_uri_str = remote_uri.uri() + key
... ...
@@ -314,7 +314,7 @@ def fetch_remote_list(args, require_attribs = False, recursive = None):
314 314
 
315 315
     cfg = Config()
316 316
     remote_uris = []
317
-    remote_list = SortedDict(ignore_case = False)
317
+    remote_list = FileDict(ignore_case = False)
318 318
 
319 319
     if type(args) not in (list, tuple):
320 320
         args = [args]
... ...
@@ -436,7 +436,7 @@ def compare_filelists(src_list, dst_list, src_remote, dst_remote, delay_updates
436 436
     ## Items left on src_list will be transferred
437 437
     ## Items left on update_list will be transferred after src_list
438 438
     ## Items left on copy_pairs will be copied from dst1 to dst2
439
-    update_list = SortedDict(ignore_case = False)
439
+    update_list = FileDict(ignore_case = False)
440 440
     ## Items left on dst_list will be deleted
441 441
     copy_pairs = []
442 442
 
... ...
@@ -27,8 +27,6 @@ class SortedDict(dict):
27 27
         """
28 28
         dict.__init__(self, mapping, **kwargs)
29 29
         self.ignore_case = ignore_case
30
-        self.hardlinks = dict() # { dev: { inode : {'md5':, 'relative_files':}}}
31
-        self.by_md5 = dict() # {md5: set(relative_files)}
32 30
 
33 31
     def keys(self):
34 32
         keys = dict.keys(self)
... ...
@@ -49,45 +47,6 @@ class SortedDict(dict):
49 49
         return SortedDictIterator(self, self.keys())
50 50
 
51 51
 
52
-    def record_md5(self, relative_file, md5):
53
-        if md5 not in self.by_md5:
54
-            self.by_md5[md5] = set()
55
-        self.by_md5[md5].add(relative_file)
56
-
57
-    def find_md5_one(self, md5):
58
-        try:
59
-            return list(self.by_md5.get(md5, set()))[0]
60
-        except:
61
-            return None
62
-
63
-    def get_md5(self, relative_file):
64
-        """returns md5 if it can, or raises IOError if file is unreadable"""
65
-        md5 = None
66
-        if 'md5' in self[relative_file]:
67
-            return self[relative_file]['md5']
68
-        md5 = self.get_hardlink_md5(relative_file)
69
-        if md5 is None:
70
-            md5 = Utils.hash_file_md5(self[relative_file]['full_name'])
71
-        self.record_md5(relative_file, md5)
72
-        self[relative_file]['md5'] = md5
73
-        return md5
74
-
75
-    def record_hardlink(self, relative_file, dev, inode, md5):
76
-        if dev not in self.hardlinks:
77
-            self.hardlinks[dev] = dict()
78
-        if inode not in self.hardlinks[dev]:
79
-            self.hardlinks[dev][inode] = dict(md5=md5, relative_files=set())
80
-        self.hardlinks[dev][inode]['relative_files'].add(relative_file)
81
-
82
-    def get_hardlink_md5(self, relative_file):
83
-        md5 = None
84
-        dev = self[relative_file]['dev']
85
-        inode = self[relative_file]['inode']
86
-        try:
87
-            md5 = self.hardlinks[dev][inode]['md5']
88
-        except:
89
-            pass
90
-        return md5
91 52
 
92 53
 if __name__ == "__main__":
93 54
     d = { 'AWS' : 1, 'Action' : 2, 'america' : 3, 'Auckland' : 4, 'America' : 5 }
... ...
@@ -894,7 +894,7 @@ def local_copy(copy_pairs, destination_base):
894 894
     # Do NOT hardlink local files by default, that'd be silly
895 895
     # For instance all empty files would become hardlinked together!
896 896
 
897
-    failed_copy_list = SortedDict()
897
+    failed_copy_list = FileDict()
898 898
     for (src_obj, dst1, relative_file) in copy_pairs:
899 899
         src_file = os.path.join(destination_base, dst1)
900 900
         dst_file = os.path.join(destination_base, relative_file)
... ...
@@ -1059,7 +1059,7 @@ def cmd_sync_local2remote(args):
1059 1059
             ## Make remote_key same as local_key for comparison if we're dealing with only one file
1060 1060
             remote_list_entry = remote_list[remote_list.keys()[0]]
1061 1061
             # Flush remote_list, by the way
1062
-            remote_list = SortedDict()
1062
+            remote_list = FileDict()
1063 1063
             remote_list[local_list.keys()[0]] =  remote_list_entry
1064 1064
 
1065 1065
         local_list, remote_list, update_list, copy_pairs = compare_filelists(local_list, remote_list, src_remote = False, dst_remote = True, delay_updates = cfg.delay_updates)
... ...
@@ -2053,6 +2053,7 @@ if __name__ == '__main__':
2053 2053
         from S3.S3 import S3
2054 2054
         from S3.Config import Config
2055 2055
         from S3.SortedDict import SortedDict
2056
+        from S3.FileDict import FileDict
2056 2057
         from S3.S3Uri import S3Uri
2057 2058
         from S3 import Utils
2058 2059
         from S3.Utils import *