Browse code

add local tree MD5 caching

This creates and maintains a cache (aka HashCache) of each inode in
the local tree. This is used to avoid doing local disk I/O to
calculate an MD5 value for a file if its inode, mtime, and size
haven't changed. If these values have changed, then it does the disk
I/O.

This introduces command line option --cache-file <foo>. The file is
created if it does not exist, is read upon start and written upon
close. The contents are only useful for a given directory tree, so
caches should not be reused for different directory tree syncs.

Matt Domsch authored on 2012/06/19 10:10:33
Showing 4 changed files
... ...
@@ -87,6 +87,7 @@ class Config(object):
87 87
     website_error = ""
88 88
     website_endpoint = "http://%(bucket)s.s3-website-%(location)s.amazonaws.com/"
89 89
     additional_destinations = []
90
+    cache_file = ""
90 91
 
91 92
     ## Creating a singleton
92 93
     def __new__(self, configfile = None):
... ...
@@ -9,6 +9,7 @@ from S3Uri import S3Uri
9 9
 from SortedDict import SortedDict
10 10
 from Utils import *
11 11
 from Exceptions import ParameterError
12
+from HashCache import HashCache
12 13
 
13 14
 from logging import debug, info, warning, error
14 15
 
... ...
@@ -137,7 +138,7 @@ def handle_exclude_include_walk(root, dirs, files):
137 137
             debug(u"PASS: %s" % (file))
138 138
 
139 139
 def fetch_local_list(args, recursive = None):
140
-    def _get_filelist_local(loc_list, local_uri):
140
+    def _get_filelist_local(loc_list, local_uri, cache):
141 141
         info(u"Compiling list of local files...")
142 142
         if local_uri.isdir():
143 143
             local_base = deunicodise(local_uri.basename())
... ...
@@ -180,11 +181,30 @@ def fetch_local_list(args, recursive = None):
180 180
                     ## TODO: Possibly more to save here...
181 181
                 }
182 182
 		if 'md5' in cfg.sync_checks:
183
-		    md5 = loc_list.get_md5(relative_file)
183
+                    md5 = cache.md5(sr.st_dev, sr.st_ino, sr.st_mtime, sr.st_size)
184
+		    if md5 is None:
185
+                        md5 = loc_list.get_md5(relative_file) # this does the file I/O
186
+			cache.add(sr.st_dev, sr.st_ino, sr.st_mtime, sr.st_size, md5)
184 187
 		    loc_list.record_hardlink(relative_file, sr.st_dev, sr.st_ino, md5)
185 188
         return loc_list, single_file
186 189
 
190
+    def _maintain_cache(cache, local_list):
191
+        if cfg.cache_file:
192
+            cache.mark_all_for_purge()
193
+	    for i in local_list.keys():
194
+                cache.unmark_for_purge(local_list[i]['dev'], local_list[i]['inode'], local_list[i]['mtime'], local_list[i]['size'])
195
+            cache.purge()
196
+	    cache.save(cfg.cache_file)
197
+
187 198
     cfg = Config()
199
+
200
+    cache = HashCache()
201
+    if cfg.cache_file:
202
+        try:
203
+            cache.load(cfg.cache_file)
204
+	except IOError:
205
+	    info(u"No cache file found, creating it.")
206
+    
188 207
     local_uris = []
189 208
     local_list = SortedDict(ignore_case = False)
190 209
     single_file = False
... ...
@@ -204,7 +224,7 @@ def fetch_local_list(args, recursive = None):
204 204
         local_uris.append(uri)
205 205
 
206 206
     for uri in local_uris:
207
-        list_for_uri, single_file = _get_filelist_local(local_list, uri)
207
+        list_for_uri, single_file = _get_filelist_local(local_list, uri, cache)
208 208
 
209 209
     ## Single file is True if and only if the user
210 210
     ## specified one local URI and that URI represents
... ...
@@ -214,6 +234,8 @@ def fetch_local_list(args, recursive = None):
214 214
     if len(local_list) > 1:
215 215
         single_file = False
216 216
 
217
+    _maintain_cache(cache, local_list)
218
+
217 219
     return local_list, single_file
218 220
 
219 221
 def fetch_remote_list(args, require_attribs = False, recursive = None):
220 222
new file mode 100644
... ...
@@ -0,0 +1,52 @@
0
+import cPickle as pickle
1
+
2
+class HashCache(object):
3
+    def __init__(self):
4
+        self.inodes = dict()
5
+    
6
+    def add(self, dev, inode, mtime, size, md5):
7
+        if dev not in self.inodes:
8
+            self.inodes[dev] = dict()
9
+        if inode not in self.inodes[dev]:
10
+            self.inodes[dev][inode] = dict()
11
+        self.inodes[dev][inode][mtime] = dict(md5=md5, size=size)
12
+
13
+    def md5(self, dev, inode, mtime, size):
14
+        try:
15
+            d = self.inodes[dev][inode][mtime]
16
+            if d['size'] != size:
17
+                return None
18
+        except:
19
+            return None
20
+        return d['md5']
21
+
22
+    def mark_all_for_purge(self):
23
+        for d in self.inodes.keys():
24
+            for i in self.inodes[d].keys():
25
+                for c in self.inodes[d][i].keys():
26
+                    self.inodes[d][i][c]['purge'] = True
27
+
28
+    def unmark_for_purge(self, dev, inode, mtime, size):
29
+        d = self.inodes[dev][inode][mtime]
30
+        if d['size'] == size and 'purge' in d:
31
+            del self.inodes[dev][inode][mtime]['purge']
32
+
33
+    def purge(self):
34
+        for d in self.inodes.keys():
35
+            for i in self.inodes[d].keys():
36
+                for m in self.inodes[d][i].keys():
37
+                    if 'purge' in self.inodes[d][i][m]:
38
+                        del self.inodes[d][i]
39
+    
40
+    def save(self, f):
41
+        d = dict(inodes=self.inodes, version=1)
42
+        f = open(f, 'w')
43
+        p = pickle.dump(d, f)
44
+        f.close()
45
+
46
+    def load(self, f):
47
+        f = open(f, 'r')
48
+        d = pickle.load(f)
49
+        f.close()
50
+        if d.get('version') == 1 and 'inodes' in d:
51
+            self.inodes = d['inodes']
... ...
@@ -1671,6 +1671,7 @@ def main():
1671 1671
     optparser.add_option("-d", "--debug", dest="verbosity", action="store_const", const=logging.DEBUG, help="Enable debug output.")
1672 1672
     optparser.add_option(      "--version", dest="show_version", action="store_true", help="Show s3cmd version (%s) and exit." % (PkgInfo.version))
1673 1673
     optparser.add_option("-F", "--follow-symlinks", dest="follow_symlinks", action="store_true", default=False, help="Follow symbolic links as if they are regular files")
1674
+    optparser.add_option(      "--cache-file", dest="cache_file", action="store", default="",  metavar="FILE", help="Cache FILE containing local source MD5 values")
1674 1675
 
1675 1676
     optparser.set_usage(optparser.usage + " COMMAND [parameters]")
1676 1677
     optparser.set_description('S3cmd is a tool for managing objects in '+