Browse code

Defer the local file list stat() and md5sum work until after excludes are handled

and move the exclude/include filtering into fetch_local_list() instead of running it after the call.

This lets us build the list, filter out the excluded entries, and only then
run stat() and do the file I/O needed to calculate the md5sums.

This greatly speeds up any run that uses excludes, because we no longer do
the stat() calls and md5sum disk I/O for files that end up excluded anyway.

Matt Domsch authored on 2014/03/16 07:49:40
Showing 2 changed files
... ...
@@ -159,6 +159,41 @@ def _get_filelist_from_file(cfg, local_path):
159 159
     return result
160 160
 
161 161
 def fetch_local_list(args, is_src = False, recursive = None):
162
+
163
+    def _fetch_local_list_info(loc_list):
164
+        for relative_file in sorted(loc_list.keys()):
165
+            if relative_file == '-': continue
166
+
167
+            full_name = loc_list[relative_file]['full_name']
168
+            try:
169
+                sr = os.stat_result(os.stat(full_name))
170
+            except OSError, e:
171
+                if e.errno == errno.ENOENT:
172
+                    # file was removed async to us getting the list
173
+                    continue
174
+                else:
175
+                    raise
176
+            loc_list[relative_file].update({
177
+                'size' : sr.st_size,
178
+                'mtime' : sr.st_mtime,
179
+                'dev'   : sr.st_dev,
180
+                'inode' : sr.st_ino,
181
+                'uid' : sr.st_uid,
182
+                'gid' : sr.st_gid,
183
+                'sr': sr # save it all, may need it in preserve_attrs_list
184
+                ## TODO: Possibly more to save here...
185
+            })
186
+            if 'md5' in cfg.sync_checks:
187
+                md5 = cache.md5(sr.st_dev, sr.st_ino, sr.st_mtime, sr.st_size)
188
+                if md5 is None:
189
+                        try:
190
+                            md5 = loc_list.get_md5(relative_file) # this does the file I/O
191
+                        except IOError:
192
+                            continue
193
+                        cache.add(sr.st_dev, sr.st_ino, sr.st_mtime, sr.st_size, md5)
194
+                loc_list.record_hardlink(relative_file, sr.st_dev, sr.st_ino, md5, sr.st_size)
195
+
196
+
162 197
     def _get_filelist_local(loc_list, local_uri, cache):
163 198
         info(u"Compiling list of local files...")
164 199
 
... ...
@@ -214,35 +249,11 @@ def fetch_local_list(args, is_src = False, recursive = None):
214 214
                     relative_file = replace_nonprintables(relative_file)
215 215
                 if relative_file.startswith('./'):
216 216
                     relative_file = relative_file[2:]
217
-                try:
218
-                    sr = os.stat_result(os.stat(full_name))
219
-                except OSError, e:
220
-                    if e.errno == errno.ENOENT:
221
-                        # file was removed async to us getting the list
222
-                        continue
223
-                    else:
224
-                        raise
225 217
                 loc_list[relative_file] = {
226 218
                     'full_name_unicode' : unicodise(full_name),
227 219
                     'full_name' : full_name,
228
-                    'size' : sr.st_size,
229
-                    'mtime' : sr.st_mtime,
230
-                    'dev'   : sr.st_dev,
231
-                    'inode' : sr.st_ino,
232
-                    'uid' : sr.st_uid,
233
-                    'gid' : sr.st_gid,
234
-                    'sr': sr # save it all, may need it in preserve_attrs_list
235
-                    ## TODO: Possibly more to save here...
236 220
                 }
237
-                if 'md5' in cfg.sync_checks:
238
-                    md5 = cache.md5(sr.st_dev, sr.st_ino, sr.st_mtime, sr.st_size)
239
-                    if md5 is None:
240
-                            try:
241
-                                md5 = loc_list.get_md5(relative_file) # this does the file I/O
242
-                            except IOError:
243
-                                continue
244
-                            cache.add(sr.st_dev, sr.st_ino, sr.st_mtime, sr.st_size, md5)
245
-                    loc_list.record_hardlink(relative_file, sr.st_dev, sr.st_ino, md5, sr.st_size)
221
+
246 222
         return loc_list, single_file
247 223
 
248 224
     def _maintain_cache(cache, local_list):
... ...
@@ -295,9 +306,10 @@ def fetch_local_list(args, is_src = False, recursive = None):
295 295
     if len(local_list) > 1:
296 296
         single_file = False
297 297
 
298
+    local_list, exclude_list = filter_exclude_include(local_list)
299
+    _fetch_local_list_info(local_list)
298 300
     _maintain_cache(cache, local_list)
299
-
300
-    return local_list, single_file
301
+    return local_list, single_file, exclude_list
301 302
 
302 303
 def fetch_remote_list(args, require_attribs = False, recursive = None, batch_mode = False, uri_params = {}):
303 304
     def _get_remote_attribs(uri, remote_item):
... ...
@@ -284,9 +284,7 @@ def cmd_object_put(args):
284 284
     if len(args) == 0:
285 285
         raise ParameterError("Nothing to upload. Expecting a local file or directory.")
286 286
 
287
-    local_list, single_file_local = fetch_local_list(args, is_src = True)
288
-
289
-    local_list, exclude_list = filter_exclude_include(local_list)
287
+    local_list, single_file_local, exclude_list = fetch_local_list(args, is_src = True)
290 288
 
291 289
     local_count = len(local_list)
292 290
 
... ...
@@ -875,7 +873,7 @@ def cmd_sync_remote2local(args):
875 875
     s3 = S3(Config())
876 876
 
877 877
     destination_base = args[-1]
878
-    local_list, single_file_local = fetch_local_list(destination_base, is_src = False, recursive = True)
878
+    local_list, single_file_local, dst_exclude_list = fetch_local_list(destination_base, is_src = False, recursive = True)
879 879
     remote_list = fetch_remote_list(args[:-1], recursive = True, require_attribs = True)
880 880
 
881 881
     local_count = len(local_list)
... ...
@@ -885,7 +883,6 @@ def cmd_sync_remote2local(args):
885 885
     info(u"Found %d remote files, %d local files" % (remote_count, local_count))
886 886
 
887 887
     remote_list, src_exclude_list = filter_exclude_include(remote_list)
888
-    local_list, dst_exclude_list = filter_exclude_include(local_list)
889 888
 
890 889
     remote_list, local_list, update_list, copy_pairs = compare_filelists(remote_list, local_list, src_remote = True, dst_remote = False, delay_updates = cfg.delay_updates)
891 890
 
... ...
@@ -1363,8 +1360,7 @@ def cmd_sync_local2remote(args):
1363 1363
         error(u"or disable encryption with --no-encrypt parameter.")
1364 1364
         sys.exit(1)
1365 1365
 
1366
-    local_list, single_file_local = fetch_local_list(args[:-1], is_src = True, recursive = True)
1367
-    local_list, exclude_list = filter_exclude_include(local_list)
1366
+    local_list, single_file_local, exclude_list = fetch_local_list(args[:-1], is_src = True, recursive = True)
1368 1367
 
1369 1368
     destinations = [args[-1]]
1370 1369
     if cfg.additional_destinations: