S3/FileLists.py
 ## Create and compare lists of files/objects
 ## Author: Michal Ludvig <michal@logix.cz>
 ##         http://www.logix.cz/michal
 ## License: GPL Version 2
 
 from S3 import S3
 from Config import Config
 from S3Uri import S3Uri
 from SortedDict import SortedDict
 from Utils import *
 from Exceptions import ParameterError
 from HashCache import HashCache
 
 from logging import debug, info, warning, error
 
 import os
 import glob
 import copy

 __all__ = ["fetch_local_list", "fetch_remote_list", "compare_filelists", "filter_exclude_include", "parse_attrs_header"]
 
 def _fswalk_follow_symlinks(path):
     '''
     Walk filesystem, following symbolic links (but without recursion), on Python 2.4 and later

     If a symlink directory loop is detected, emit a warning and skip it.
     E.g.: dir1/dir2/sym-dir -> ../dir2
     '''
     assert os.path.isdir(path) # only designed for directory argument
     walkdirs = set([path])
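     # First pass: walk the tree and collect symlinked directories that do
     # not loop back into their own path.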
     for dirpath, dirnames, filenames in os.walk(path):
         handle_exclude_include_walk(dirpath, dirnames, [])
         real_dirpath = os.path.realpath(dirpath)
         for dirname in dirnames:
             current = os.path.join(dirpath, dirname)
             real_current = os.path.realpath(current)
             if os.path.islink(current):
                 if (real_dirpath == real_current or
                     real_dirpath.startswith(real_current + os.path.sep)):
                     warning("Skipping recursively symlinked directory %s" % dirname)
                 else:
                     walkdirs.add(current)
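     # Second pass: walk the root and each safe symlinked directory, yielding entries.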
     for walkdir in walkdirs:
         for dirpath, dirnames, filenames in os.walk(walkdir):
             handle_exclude_include_walk(dirpath, dirnames, [])
             yield (dirpath, dirnames, filenames)

 def _fswalk_no_symlinks(path):
     '''
     Directory tree generator

     path (str) is the root of the directory tree to walk
     '''
     for dirpath, dirnames, filenames in os.walk(path):
         handle_exclude_include_walk(dirpath, dirnames, filenames)
         yield (dirpath, dirnames, filenames)

 def filter_exclude_include(src_list):
     info(u"Applying --exclude/--include")
     cfg = Config()
     exclude_list = SortedDict(ignore_case = False)
     for file in src_list.keys():
         debug(u"CHECK: %s" % file)
         excluded = False
         for r in cfg.exclude:
             if r.search(file):
                 excluded = True
                 debug(u"EXCL-MATCH: '%s'" % (cfg.debug_exclude[r]))
                 break
         if excluded:
             ## No need to check for --include if not excluded
             for r in cfg.include:
                 if r.search(file):
                     excluded = False
                     debug(u"INCL-MATCH: '%s'" % (cfg.debug_include[r]))
                     break
         if excluded:
             ## Still excluded - ok, action it
             debug(u"EXCLUDE: %s" % file)
             exclude_list[file] = src_list[file]
             del(src_list[file])
             continue
         else:
             debug(u"PASS: %r" % (file))
     return src_list, exclude_list
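
 ## A sketch of the precedence implemented above: --exclude is tested first
 ## and --include can then rescue a match.  E.g. with --exclude '*.jpg'
 ## --include 'photos/*' the key 'photos/cat.jpg' matches the exclude, is
 ## re-included, and stays in src_list, while 'misc/dog.jpg' moves to
 ## exclude_list.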
 def handle_exclude_include_walk(root, dirs, files):
     cfg = Config()
     copydirs = copy.copy(dirs)
     copyfiles = copy.copy(files)
 
     # exclude dir matches in the current directory
     # this prevents us from recursing down trees we know we want to ignore
     for x in copydirs:
         d = os.path.join(root, x, '')   # the trailing '' appends a separator, so directory patterns like 'dir/' can match
         debug(u"CHECK: %r" % d)
         excluded = False
         for r in cfg.exclude:
             if r.search(d):
                 excluded = True
                 debug(u"EXCL-MATCH: '%s'" % (cfg.debug_exclude[r]))
                 break
         if excluded:
             ## No need to check for --include if not excluded
             for r in cfg.include:
                 if r.search(d):
                     excluded = False
                     debug(u"INCL-MATCH: '%s'" % (cfg.debug_include[r]))
                     break
         if excluded:
             ## Still excluded - ok, action it
             debug(u"EXCLUDE: %r" % d)
             dirs.remove(x)
             continue
         else:
             debug(u"PASS: %r" % (d))
 
     # exclude file matches in the current directory
     for x in copyfiles:
         file = os.path.join(root, x)
         debug(u"CHECK: %r" % file)
         excluded = False
         for r in cfg.exclude:
             if r.search(file):
                 excluded = True
                 debug(u"EXCL-MATCH: '%s'" % (cfg.debug_exclude[r]))
                 break
         if excluded:
             ## No need to check for --include if not excluded
             for r in cfg.include:
                 if r.search(file):
                     excluded = False
                     debug(u"INCL-MATCH: '%s'" % (cfg.debug_include[r]))
                     break
         if excluded:
             ## Still excluded - ok, action it
             debug(u"EXCLUDE: %s" % file)
             files.remove(x)
             continue
         else:
             debug(u"PASS: %r" % (file))
 

 def fetch_local_list(args, recursive = None):
     def _get_filelist_local(loc_list, local_uri, cache):
         info(u"Compiling list of local files...")

         if deunicodise(local_uri.basename()) == "-":
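             ## "-" means: read from stdin; size and mtime are unknown, hence -1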
             loc_list = SortedDict(ignore_case = False)
             loc_list["-"] = {
                 'full_name_unicode' : '-',
                 'full_name' : '-',
                 'size' : -1,
                 'mtime' : -1,
             }
             return loc_list, True
         if local_uri.isdir():
             local_base = deunicodise(local_uri.basename())
             local_path = deunicodise(local_uri.path())
             if cfg.follow_symlinks:
                 filelist = _fswalk_follow_symlinks(local_path)
             else:
                 filelist = _fswalk_no_symlinks(local_path)
             single_file = False
         else:
             local_base = ""
             local_path = deunicodise(local_uri.dirname())
             filelist = [( local_path, [], [deunicodise(local_uri.basename())] )]
             single_file = True
         for root, dirs, files in filelist:
             rel_root = root.replace(local_path, local_base, 1)
             for f in files:
                 full_name = os.path.join(root, f)
                 if not os.path.isfile(full_name):
                     continue
                 if os.path.islink(full_name):
                     if not cfg.follow_symlinks:
                         continue
                 relative_file = unicodise(os.path.join(rel_root, f))
                 if os.path.sep != "/":
                     # Convert non-unix dir separators to '/'
                     relative_file = "/".join(relative_file.split(os.path.sep))
                 if cfg.urlencoding_mode == "normal":
                     relative_file = replace_nonprintables(relative_file)
                 if relative_file.startswith('./'):
                     relative_file = relative_file[2:]
                 sr = os.stat_result(os.lstat(full_name))
                 loc_list[relative_file] = {
                     'full_name_unicode' : unicodise(full_name),
                     'full_name' : full_name,
                     'size' : sr.st_size,
                     'mtime' : sr.st_mtime,
                     'dev'   : sr.st_dev,
                     'inode' : sr.st_ino,
                     'uid' : sr.st_uid,
                     'gid' : sr.st_gid,
                     'sr': sr # save it all, may need it in preserve_attrs_list
                     ## TODO: Possibly more to save here...
                 }
                 if 'md5' in cfg.sync_checks:
                     md5 = cache.md5(sr.st_dev, sr.st_ino, sr.st_mtime, sr.st_size)
                     if md5 is None:
                         try:
                             md5 = loc_list.get_md5(relative_file) # this does the file I/O
                         except IOError:
                             continue
                         cache.add(sr.st_dev, sr.st_ino, sr.st_mtime, sr.st_size, md5)
                     loc_list.record_hardlink(relative_file, sr.st_dev, sr.st_ino, md5)
         return loc_list, single_file
 
     def _maintain_cache(cache, local_list):
         if cfg.cache_file:
             cache.mark_all_for_purge()
             for i in local_list.keys():
                 cache.unmark_for_purge(local_list[i]['dev'], local_list[i]['inode'], local_list[i]['mtime'], local_list[i]['size'])
             cache.purge()
             cache.save(cfg.cache_file)

     cfg = Config()
 
     cache = HashCache()
     if cfg.cache_file:
         try:
             cache.load(cfg.cache_file)
         except IOError:
             info(u"No cache file found, creating it.")

     local_uris = []
     local_list = SortedDict(ignore_case = False)
     single_file = False
 
     if type(args) not in (list, tuple):
         args = [args]
 
     if recursive is None:
         recursive = cfg.recursive
 
     for arg in args:
         uri = S3Uri(arg)
         if not uri.type == 'file':
             raise ParameterError("Expecting filename or directory instead of: %s" % arg)
         if uri.isdir() and not recursive:
             raise ParameterError("Use --recursive to upload a directory: %s" % arg)
         local_uris.append(uri)
 
     for uri in local_uris:
         list_for_uri, single_file = _get_filelist_local(local_list, uri, cache)
 
     ## single_file is True if and only if the user
     ## specified one local URI and that URI represents
     ## a FILE.  I.e. it is False if the URI was of a DIR
     ## and that dir contained only one FILE - that's not
     ## a case of single_file==True.
     if len(local_list) > 1:
         single_file = False
 
     _maintain_cache(cache, local_list)
 
     return local_list, single_file
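
 ## A hypothetical usage sketch:
 ##   local_list, single_file = fetch_local_list("photos/", recursive = True)
 ## local_list maps relative file names to stat info ('size', 'mtime',
 ## 'dev', 'inode', ...); single_file is True only when the user asked
 ## for exactly one plain FILE.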
 
 def fetch_remote_list(args, require_attribs = False, recursive = None):
     def _get_filelist_remote(remote_uri, recursive = True):
         ## If remote_uri ends with '/' then all remote files will have
         ## the remote_uri prefix removed in the relative path.
         ## If, on the other hand, the remote_uri ends with something else
         ## (probably alphanumeric symbol) we'll use the last path part
         ## in the relative path.
         ##
         ## Complicated, eh? See an example:
         ## _get_filelist_remote("s3://bckt/abc/def") may yield:
         ## { 'def/file1.jpg' : {}, 'def/xyz/blah.txt' : {} }
         ## _get_filelist_remote("s3://bckt/abc/def/") will yield:
         ## { 'file1.jpg' : {}, 'xyz/blah.txt' : {} }
         ## Furthermore a prefix-magic can restrict the return list:
         ## _get_filelist_remote("s3://bckt/abc/def/x") yields:
         ## { 'xyz/blah.txt' : {} }
 
         info(u"Retrieving list of remote files for %s ..." % remote_uri)
 
         s3 = S3(Config())
         response = s3.bucket_list(remote_uri.bucket(), prefix = remote_uri.object(), recursive = recursive)
 
         rem_base_original = rem_base = remote_uri.object()
         remote_uri_original = remote_uri
         if rem_base != '' and rem_base[-1] != '/':
             rem_base = rem_base[:rem_base.rfind('/')+1]
             remote_uri = S3Uri("s3://%s/%s" % (remote_uri.bucket(), rem_base))
         rem_base_len = len(rem_base)
         rem_list = SortedDict(ignore_case = False)
         break_now = False
         for object in response['list']:
             if object['Key'] == rem_base_original and object['Key'][-1] != "/":   # S3 keys always use '/', never os.path.sep
                 ## We asked for one file and we got that file :-)
                 key = os.path.basename(object['Key'])
                 object_uri_str = remote_uri_original.uri()
                 break_now = True
                 rem_list = SortedDict(ignore_case = False)   ## Remove whatever has already been put to rem_list
             else:
                 key = object['Key'][rem_base_len:]      ## Beware - this may be '' if object['Key']==rem_base !!
                 object_uri_str = remote_uri.uri() + key
             rem_list[key] = {
                 'size' : int(object['Size']),
                 'timestamp' : dateS3toUnix(object['LastModified']), ## Sadly it's upload time, not our lastmod time :-(
                 'md5' : object['ETag'][1:-1],
                 'object_key' : object['Key'],
                 'object_uri_str' : object_uri_str,
                 'base_uri' : remote_uri,
                 'dev' : None,
                 'inode' : None,
             }
             md5 = object['ETag'][1:-1]
             rem_list.record_md5(key, md5)
             if break_now:
                 break
         return rem_list
 
     cfg = Config()
     remote_uris = []
     remote_list = SortedDict(ignore_case = False)
 
     if type(args) not in (list, tuple):
         args = [args]
 
     if recursive is None:
         recursive = cfg.recursive
 
     for arg in args:
         uri = S3Uri(arg)
         if not uri.type == 's3':
             raise ParameterError("Expecting S3 URI instead of '%s'" % arg)
         remote_uris.append(uri)
 
     if recursive:
         for uri in remote_uris:
             objectlist = _get_filelist_remote(uri)
             for key in objectlist:
                 remote_list[key] = objectlist[key]
                 remote_list.record_md5(key, objectlist.get_md5(key))
     else:
         for uri in remote_uris:
             uri_str = str(uri)
             ## Wildcards used in remote URI?
             ## If yes we'll need a bucket listing...
             if uri_str.find('*') > -1 or uri_str.find('?') > -1:
                 first_wildcard = uri_str.find('*')
                 first_questionmark = uri_str.find('?')
                 ## Use whichever wildcard comes first; also handles a URI with '?' but no '*'
                 if first_questionmark > -1 and (first_wildcard == -1 or first_questionmark < first_wildcard):
                     first_wildcard = first_questionmark
                 prefix = uri_str[:first_wildcard]
                 rest = uri_str[first_wildcard+1:]
                 ## Only request recursive listing if the 'rest' of the URI,
                 ## i.e. the part after first wildcard, contains '/'
                 need_recursion = rest.find('/') > -1
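                 ## Example: for 's3://bckt/20??/img/*.jpg' the prefix is
                 ## 's3://bckt/20', the rest '?/img/*.jpg' contains '/', so a
                 ## recursive listing is requested and fnmatch() below filters
                 ## the keys against the full wildcard URI.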
                 objectlist = _get_filelist_remote(S3Uri(prefix), recursive = need_recursion)
                 for key in objectlist:
                     ## Check whether the 'key' matches the requested wildcards
                     if glob.fnmatch.fnmatch(objectlist[key]['object_uri_str'], uri_str):
                         remote_list[key] = objectlist[key]
             else:
                 ## No wildcards - simply append the given URI to the list
                 key = os.path.basename(uri.object())
                 if not key:
                     raise ParameterError(u"Expecting S3 URI with a filename or --recursive: %s" % uri.uri())
                 remote_item = {
                     'base_uri': uri,
                     'object_uri_str': unicode(uri),
                     'object_key': uri.object()
                 }
                 if require_attribs:
                     response = S3(cfg).object_info(uri)
                     remote_item.update({
                         'size': int(response['headers']['content-length']),
                         'md5': response['headers']['etag'].strip('"\''),
                         'timestamp' : dateRFC822toUnix(response['headers']['date'])
                     })
                     # get md5 from header if it's present.  We would have set that during upload
                     if response['headers'].has_key('x-amz-meta-s3cmd-attrs'):
                         attrs = parse_attrs_header(response['headers']['x-amz-meta-s3cmd-attrs'])
                         if attrs.has_key('md5'):
                             remote_item.update({'md5': attrs['md5']})

                 remote_list[key] = remote_item
     return remote_list

 def parse_attrs_header(attrs_header):
     attrs = {}
     for attr in attrs_header.split("/"):
         key, val = attr.split(":", 1)   # split on the first ':' only, so values containing ':' survive
         attrs[key] = val
     return attrs
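
 ## A minimal sketch of the header format this parser expects
 ## (hypothetical values):
 ##   parse_attrs_header("uid:1000/gid:1000/mode:33188/mtime:1290834046")
 ##   -> {'uid': '1000', 'gid': '1000', 'mode': '33188', 'mtime': '1290834046'}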
 
 
 def compare_filelists(src_list, dst_list, src_remote, dst_remote, delay_updates = False):
     def __direction_str(is_remote):
         return is_remote and "remote" or "local"
 
     def _compare(src_list, dst_list, src_remote, dst_remote, file):
         """Return True if src_list[file] matches dst_list[file], else False"""
         attribs_match = True
         if not (src_list.has_key(file) and dst_list.has_key(file)):
             info(u"%s: does not exist on one side or the other: src_list=%s, dst_list=%s" % (file, src_list.has_key(file), dst_list.has_key(file)))
             return False

         ## check size first
         if 'size' in cfg.sync_checks and dst_list[file]['size'] != src_list[file]['size']:
             debug(u"xfer: %s (size mismatch: src=%s dst=%s)" % (file, src_list[file]['size'], dst_list[file]['size']))
             attribs_match = False
 
         ## check md5
         compare_md5 = 'md5' in cfg.sync_checks
         # Multipart-uploaded files don't have a valid md5 sum - it ends with "...-nn"
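         # (an AWS multipart ETag has the form "<hex digest>-<part count>",
         #  e.g. "9bb58f26192e4ba00f01e2e7b136bbd8-12", which is not a plain
         #  MD5 of the file contents and so cannot be compared as one)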
         if compare_md5:
             if (src_remote == True and src_list[file]['md5'].find("-") >= 0) or (dst_remote == True and dst_list[file]['md5'].find("-") >= 0):
                 compare_md5 = False
                 info(u"disabled md5 check for %s" % file)
         if attribs_match and compare_md5:
             try:
                 src_md5 = src_list.get_md5(file)
                 dst_md5 = dst_list.get_md5(file)
             except (IOError,OSError), e:
                 # md5 sum verification failed - re-raise; the caller ignores that file altogether
                 debug(u"IGNR: %s (disappeared)" % (file))
                 warning(u"%s: file disappeared, ignoring." % (file))
                 raise
 
             if src_md5 != dst_md5:
                 ## checksums are different.
                 attribs_match = False
                 debug(u"XFER: %s (md5 mismatch: src=%s dst=%s)" % (file, src_md5, dst_md5))
 
         return attribs_match
 
     # we don't support local->local sync, use 'rsync' or something like that instead ;-)
     assert(not(src_remote == False and dst_remote == False))
 
     info(u"Verifying attributes...")
     cfg = Config()
     ## Items left on src_list will be transferred
     ## Items left on update_list will be transferred after src_list
     ## Items left on copy_pairs will be copied from dst1 to dst2
     update_list = SortedDict(ignore_case = False)
     ## Items left on dst_list will be deleted
     copy_pairs = []
 
     debug("Comparing filelists (direction: %s -> %s)" % (__direction_str(src_remote), __direction_str(dst_remote)))
 
     for relative_file in src_list.keys():
         debug(u"CHECK: %s" % (relative_file))

         if dst_list.has_key(relative_file):
             ## Was --skip-existing requested?
             if cfg.skip_existing:
                 debug(u"IGNR: %s (used --skip-existing)" % (relative_file))
                 del(src_list[relative_file])
                 del(dst_list[relative_file])
                 continue

             try:
                 same_file = _compare(src_list, dst_list, src_remote, dst_remote, relative_file)
             except (IOError,OSError), e:
                 debug(u"IGNR: %s (disappeared)" % (relative_file))
                 warning(u"%s: file disappeared, ignoring." % (relative_file))
                 del(src_list[relative_file])
                 del(dst_list[relative_file])
                 continue

             if same_file:
                 debug(u"IGNR: %s (transfer not needed)" % relative_file)
                 del(src_list[relative_file])
                 del(dst_list[relative_file])

             else:
                 # look for a matching file elsewhere in dst
                 try:
                     md5 = src_list.get_md5(relative_file)
                 except IOError:
                     md5 = None
                 if md5 is not None and dst_list.by_md5.has_key(md5):
                     # Found one, we want to copy
                     dst1 = list(dst_list.by_md5[md5])[0]
                     debug(u"DST COPY src: %s -> %s" % (dst1, relative_file))
                     copy_pairs.append((src_list[relative_file], dst1, relative_file))
                     del(src_list[relative_file])
                     del(dst_list[relative_file])
                 else:
                     # record that we will get this file transferred to us (before all the copies), so if we come across it later again,
                     # we can copy from _this_ copy (i.e. we upload it only once, and copy thereafter).
                     dst_list.record_md5(relative_file, md5)
                     update_list[relative_file] = src_list[relative_file]
                     del src_list[relative_file]
                     del dst_list[relative_file]
 
         else:
             # dst doesn't have this file
             # look for matching file elsewhere in dst
             try:
                 md5 = src_list.get_md5(relative_file)
             except IOError:
                 md5 = None
             dst1 = dst_list.find_md5_one(md5)
             if dst1 is not None:
                 # Found one, we want to copy
                 debug(u"DST COPY dst: %s -> %s" % (dst1, relative_file))
                 copy_pairs.append((src_list[relative_file], dst1, relative_file))
                 del(src_list[relative_file])
             else:
                 # we don't have this file, and we don't have a copy of this file elsewhere.  Get it.
                 # record that we will get this file transferred to us (before all the copies), so if we come across it later again,
                 # we can copy from _this_ copy (i.e. we upload it only once, and copy thereafter).
                 dst_list.record_md5(relative_file, md5)
 
     for f in dst_list.keys():
         if src_list.has_key(f) or update_list.has_key(f):
             # leave only those not on src_list + update_list
             del dst_list[f]
 
     return src_list, dst_list, update_list, copy_pairs
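
 ## A hypothetical caller's view of the result: transfer src_list, then
 ## update_list, replay copy_pairs on the destination (dst1 -> relative
 ## file), and finally delete whatever is left in dst_list.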
 
 # vim:et:ts=4:sts=4:ai