Browse code

* s3cmd: Migrated 'sync' remote->local to the new scheme with fetch_{local,remote}_list(). Changed fetch_remote_list() to return dict() compatible with fetch_local_list(). Re-implemented --exclude / --include processing.
* S3/Utils.py: functions for parsing RFC822 dates (for HTTP header responses).
* S3/Config.py: placeholders for --include.

git-svn-id: https://s3tools.svn.sourceforge.net/svnroot/s3tools/s3cmd/trunk@347 830e0280-6d2a-0410-9c65-932aecc39d9d

Michal Ludvig authored on 2009/01/20 15:01:45
Showing 4 changed files
... ...
@@ -1,3 +1,14 @@
1
+2009-01-20  Michal Ludvig  <michal@logix.cz>
2
+
3
+	* s3cmd: Migrated 'sync' remote->local to the new
4
+	  scheme with fetch_{local,remote}_list().
5
+	  Changed fetch_remote_list() to return dict() compatible
6
+	  with fetch_local_list().
7
+	  Re-implemented --exclude / --include processing.
8
+	* S3/Utils.py: functions for parsing RFC822 dates (for HTTP
9
+	  header responses).
10
+	* S3/Config.py: placeholders for --include.
11
+
1 12
 2009-01-15  Michal Ludvig  <michal@logix.cz>
2 13
 
3 14
 	* s3cmd, S3/S3Uri.py, NEWS: Support for recursive 'put'.
... ...
@@ -55,10 +55,14 @@ class Config(object):
55 55
 	default_mime_type = "binary/octet-stream"
56 56
 	guess_mime_type = False
57 57
 	debug_syncmatch = False
58
+	# List of checks to be performed for 'sync'
59
+	sync_checks = ['size', 'md5']	# 'weak-timestamp'
58 60
 	# List of compiled REGEXPs
59 61
 	exclude = []
62
+	include = []
60 63
 	# Dict mapping compiled REGEXPs back to their textual form
61 64
 	debug_exclude = {}
65
+	debug_include = {}
62 66
 	encoding = "utf-8"
63 67
 
64 68
 	## Creating a singleton
... ...
@@ -8,6 +8,7 @@ import time
8 8
 import re
9 9
 import string
10 10
 import random
11
+import rfc822
11 12
 try:
12 13
 	from hashlib import md5
13 14
 except ImportError:
... ...
@@ -84,6 +85,12 @@ def dateS3toUnix(date):
84 84
 	## treats it as "localtime". Anyway...
85 85
 	return time.mktime(dateS3toPython(date))
86 86
 
87
+def dateRFC822toPython(date):
88
+	return rfc822.parsedate(date)
89
+
90
+def dateRFC822toUnix(date):
91
+	return time.mktime(dateRFC822toPython(date))
92
+
87 93
 def formatSize(size, human_readable = False, floating_point = False):
88 94
 	size = floating_point and float(size) or int(size)
89 95
 	if human_readable:
... ...
@@ -185,9 +192,9 @@ def unicodise(string, encoding = None, errors = "replace"):
185 185
 	if not encoding:
186 186
 		encoding = Config.Config().encoding
187 187
 
188
-	debug("Unicodising %r using %s" % (string, encoding))
189 188
 	if type(string) == unicode:
190 189
 		return string
190
+	debug("Unicodising %r using %s" % (string, encoding))
191 191
 	try:
192 192
 		return string.decode(encoding, errors)
193 193
 	except UnicodeDecodeError:
... ...
@@ -202,9 +209,9 @@ def deunicodise(string, encoding = None, errors = "replace"):
202 202
 	if not encoding:
203 203
 		encoding = Config.Config().encoding
204 204
 
205
-	debug("DeUnicodising %r using %s" % (string, encoding))
206 205
 	if type(string) != unicode:
207 206
 		return str(string)
207
+	debug("DeUnicodising %r using %s" % (string, encoding))
208 208
 	try:
209 209
 		return string.encode(encoding, errors)
210 210
 	except UnicodeEncodeError:
... ...
@@ -173,32 +173,38 @@ def cmd_bucket_delete(args):
173 173
 		_bucket_delete_one(uri)
174 174
 		output(u"Bucket '%s' removed" % uri.uri())
175 175
 
176
-def fetch_local_list(args):
176
+def fetch_local_list(args, recursive = None):
177 177
 	local_uris = []
178 178
 	local_list = {}
179 179
 
180
+	if type(args) not in (list, tuple):
181
+		args = [args]
182
+
183
+	if recursive == None:
184
+		recursive = cfg.recursive
185
+
180 186
 	for arg in args:
181 187
 		uri = S3Uri(arg)
182 188
 		if not uri.type == 'file':
183 189
 			raise ParameterError("Expecting filename or directory instead of: %s" % arg)
184
-		if uri.isdir() and not cfg.recursive:
190
+		if uri.isdir() and not recursive:
185 191
 			raise ParameterError("Use --recursive to upload a directory: %s" % arg)
186 192
 		local_uris.append(uri)
187 193
 
188 194
 	for uri in local_uris:
189
-		filelist = _get_filelist_local(uri)
190
-		for key in filelist:
191
-			upload_item = {
192
-				'file_info' : filelist[key],
193
-				'relative_key' : key,
194
-			}
195
-			local_list[key] = filelist[key]
195
+		local_list.update(_get_filelist_local(uri))
196 196
 
197 197
 	return local_list
198 198
 
199
-def fetch_remote_list(args):
199
+def fetch_remote_list(args, require_attribs = False, recursive = None):
200 200
 	remote_uris = []
201
-	remote_list = []
201
+	remote_list = {}
202
+
203
+	if type(args) not in (list, tuple):
204
+		args = [args]
205
+
206
+	if recursive == None:
207
+		recursive = cfg.recursive
202 208
 
203 209
 	for arg in args:
204 210
 		uri = S3Uri(arg)
... ...
@@ -206,19 +212,15 @@ def fetch_remote_list(args):
206 206
 			raise ParameterError("Expecting S3 URI instead of '%s'" % arg)
207 207
 		remote_uris.append(uri)
208 208
 
209
-	if cfg.recursive:
209
+	if recursive:
210 210
 		for uri in remote_uris:
211 211
 			objectlist = _get_filelist_remote(uri)
212
-			for key in objectlist.iterkeys():
213
-				object = S3Uri(objectlist[key]['object_uri_str'])
212
+			for key in objectlist:
213
+				#object = S3Uri(objectlist[key]['object_uri_str'])
214 214
 				## Remove leading '/' from remote filenames
215
-				if key.find("/") == 0:
216
-					key = key[1:]
217
-				download_item = {
218
-					'remote_uri' : object,
219
-					'key' : key
220
-				}
221
-				remote_list.append(download_item)
215
+				#if key.find("/") == 0:
216
+				#	key = key[1:]
217
+				remote_list[key] = objectlist[key]
222 218
 	else:
223 219
 		for uri in remote_uris:
224 220
 			uri_str = str(uri)
... ...
@@ -238,21 +240,25 @@ def fetch_remote_list(args):
238 238
 				for key in objectlist:
239 239
 					## Check whether the 'key' matches the requested wildcards
240 240
 					if glob.fnmatch.fnmatch(objectlist[key]['object_uri_str'], uri_str):
241
-						download_item = {
242
-							'remote_uri' : S3Uri(objectlist[key]['object_uri_str']),
243
-							'key' : key,
244
-						}
245
-						remote_list.append(download_item)
241
+						remote_list[key] = objectlist[key]
246 242
 			else:
247 243
 				## No wildcards - simply append the given URI to the list
248 244
 				key = os.path.basename(uri.object())
249 245
 				if not key:
250 246
 					raise ParameterError(u"Expecting S3 URI with a filename or --recursive: %s" % uri.uri())
251
-				download_item = {
252
-					'remote_uri' : uri,
253
-					'key' : key
247
+				remote_item = {
248
+					'base_uri': uri,
249
+					'object_uri_str': unicode(uri),
250
+					'object_key': uri.object()
254 251
 				}
255
-				remote_list.append(download_item)
252
+				if require_attribs:
253
+					response = S3(cfg).object_info(uri)
254
+					remote_item.update({
255
+					'size': int(response['headers']['content-length']),
256
+					'md5': response['headers']['etag'].strip('"\''),
257
+					'timestamp' : Utils.dateRFC822toUnix(response['headers']['date'])
258
+					})
259
+				remote_list[key] = remote_item
256 260
 	return remote_list
257 261
 
258 262
 def cmd_object_put(args):
... ...
@@ -362,26 +368,27 @@ def cmd_object_get(args):
362 362
 	if len(args) == 0:
363 363
 		raise ParameterError("Nothing to download. Expecting S3 URI.")
364 364
 
365
-	remote_list = fetch_remote_list(args)
365
+	remote_list = fetch_remote_list(args, require_attribs = False)
366 366
 	remote_count = len(remote_list)
367 367
 
368 368
 	if not os.path.isdir(destination_base) or destination_base == '-':
369 369
 		## We were either given a file name (existing or not) or want STDOUT
370 370
 		if remote_count > 1:
371 371
 			raise ParameterError("Destination must be a directory when downloading multiple sources.")
372
-		remote_list[0]['local_filename'] = destination_base
372
+		remote_list[remote_list.keys()[0]]['local_filename'] = deunicodise(destination_base)
373 373
 	elif os.path.isdir(destination_base):
374 374
 		if destination_base[-1] != os.path.sep:
375 375
 			destination_base += os.path.sep
376 376
 		for key in remote_list:
377
-			key['local_filename'] = destination_base + key['key']
377
+			remote_list[key]['local_filename'] = destination_base + key
378 378
 	else:
379 379
 		raise InternalError("WTF? Is it a dir or not? -- %s" % destination_base)
380 380
 
381 381
 	seq = 0
382
-	for item in remote_list:
382
+	for key in remote_list:
383 383
 		seq += 1
384
-		uri = item['remote_uri']
384
+		item = remote_list[key]
385
+		uri = S3Uri(item['object_uri_str'])
385 386
 		## Encode / Decode destination with "replace" to make sure it's compatible with current encoding
386 387
 		destination = unicodise_safe(item['local_filename'])
387 388
 		seq_label = "[%d of %d]" % (seq, remote_count)
... ...
@@ -606,35 +613,45 @@ def _get_filelist_remote(remote_uri, recursive = True):
606 606
 			break
607 607
 	return rem_list
608 608
 
609
-def _compare_filelists(src_list, dst_list, src_is_local_and_dst_is_remote):
610
-	info(u"Verifying checksums...")
609
+def _filelist_filter_exclude_include(src_list):
610
+	info(u"Applying --exclude/--include")
611 611
 	cfg = Config()
612
-	exists_list = {}
613 612
 	exclude_list = {}
614
-	if cfg.debug_syncmatch:
615
-		logging.root.setLevel(logging.DEBUG)
616 613
 	for file in src_list.keys():
617
-		if not cfg.debug_syncmatch:
618
-			debug(u"CHECK: %s" % (os.sep + file))
614
+		debug(u"CHECK: %s" % file)
619 615
 		excluded = False
620 616
 		for r in cfg.exclude:
621
-			## all paths start with '/' from the base dir
622
-			if r.search(os.sep + file):
623
-				## Can't directly 'continue' to the outer loop
624
-				## therefore this awkward excluded switch :-(
617
+			if r.search(file):
625 618
 				excluded = True
626
-				if cfg.debug_syncmatch:
627
-					debug(u"EXCL: %s" % (os.sep + file))
628
-					debug(u"RULE: '%s'" % (cfg.debug_exclude[r]))
629
-				else:
630
-					info(u"%s: excluded" % file)
619
+				debug(u"EXCL-MATCH: '%s'" % (cfg.debug_exclude[r]))
631 620
 				break
632 621
 		if excluded:
633
-			exclude_list = src_list[file]
622
+			## No need to check for --include if not excluded
623
+			for r in cfg.include:
624
+				if r.search(file):
625
+					excluded = False
626
+					debug(u"INCL-MATCH: '%s'" % (cfg.debug_include[r]))
627
+					break
628
+		if excluded:
629
+			## Still excluded - ok, action it
630
+			debug(u"EXCLUDE: %s" % file)
631
+			exclude_list[file] = src_list[file]
634 632
 			del(src_list[file])
635 633
 			continue
636 634
 		else:
637
-			debug(u"PASS: %s" % (os.sep + file))
635
+			debug(u"PASS: %s" % (file))
636
+	return src_list, exclude_list
637
+
638
+def _compare_filelists(src_list, dst_list, src_is_local_and_dst_is_remote):
639
+	info(u"Verifying attributes...")
640
+	cfg = Config()
641
+	exists_list = {}
642
+	if cfg.debug_syncmatch:
643
+		logging.root.setLevel(logging.DEBUG)
644
+
645
+	for file in src_list.keys():
646
+		if not cfg.debug_syncmatch:
647
+			debug(u"CHECK: %s" % file)
638 648
 		if dst_list.has_key(file):
639 649
 			## Was --skip-existing requested?
640 650
 			if cfg.skip_existing:
... ...
@@ -645,9 +662,13 @@ def _compare_filelists(src_list, dst_list, src_is_local_and_dst_is_remote):
645 645
 				del(dst_list[file])
646 646
 				continue
647 647
 
648
+			attribs_match = True
648 649
 			## Check size first
649
-			if dst_list[file]['size'] == src_list[file]['size']:
650
-				#debug(u"%s same size: %s" % (file, dst_list[file]['size']))
650
+			if 'size' in cfg.sync_checks and dst_list[file]['size'] != src_list[file]['size']:
651
+				debug(u"XFER: %s (size mismatch: src=%s dst=%s)" % (file, src_list[file]['size'], dst_list[file]['size']))
652
+				attribs_match = False
653
+			
654
+			if attribs_match and 'md5' in cfg.sync_checks:
651 655
 				## ... same size, check MD5
652 656
 				if src_is_local_and_dst_is_remote:
653 657
 					src_md5 = Utils.hash_file_md5(src_list[file]['full_name'])
... ...
@@ -655,28 +676,27 @@ def _compare_filelists(src_list, dst_list, src_is_local_and_dst_is_remote):
655 655
 				else:
656 656
 					src_md5 = src_list[file]['md5']
657 657
 					dst_md5 = Utils.hash_file_md5(dst_list[file]['full_name'])
658
-				if src_md5 == dst_md5:
659
-					#debug(u"%s md5 matches: %s" % (file, dst_md5))
660
-					## Checksums are the same.
661
-					## Remove from source-list, all that is left there will be transferred
662
-					debug(u"IGNR: %s (transfer not needed: MD5 OK, Size OK)" % file)
663
-					exists_list[file] = src_list[file]
664
-					del(src_list[file])
665
-				else:
658
+				if src_md5 != dst_md5:
659
+					## Checksums are different.
660
+					attribs_match = False
666 661
 					debug(u"XFER: %s (md5 mismatch: src=%s dst=%s)" % (file, src_md5, dst_md5))
667
-			else:
668
-				debug(u"XFER: %s (size mismatch: src=%s dst=%s)" % (file, src_list[file]['size'], dst_list[file]['size']))
669
-                        
662
+
663
+			if attribs_match:
664
+				## Remove from source-list, all that is left there will be transferred
665
+				debug(u"IGNR: %s (transfer not needed)" % file)
666
+				exists_list[file] = src_list[file]
667
+				del(src_list[file])
668
+
670 669
 			## Remove from destination-list, all that is left there will be deleted
671
-			#debug(u"%s removed from destination list" % file)
672 670
 			del(dst_list[file])
671
+
673 672
 	if cfg.debug_syncmatch:
674 673
 		warning(u"Exiting because of --debug-syncmatch")
675
-		sys.exit(0)
674
+		sys.exit(1)
676 675
 
677
-	return src_list, dst_list, exists_list, exclude_list
676
+	return src_list, dst_list, exists_list
678 677
 
679
-def cmd_sync_remote2local(src, dst):
678
+def cmd_sync_remote2local(args):
680 679
 	def _parse_attrs_header(attrs_header):
681 680
 		attrs = {}
682 681
 		for attr in attrs_header.split("/"):
... ...
@@ -686,45 +706,65 @@ def cmd_sync_remote2local(src, dst):
686 686
 		
687 687
 	s3 = S3(Config())
688 688
 
689
-	src_uri = S3Uri(src)
690
-	dst_uri = S3Uri(dst)
689
+	destination_base = args[-1]
690
+	local_list = fetch_local_list(destination_base, recursive = True)
691
+	remote_list = fetch_remote_list(args[:-1], recursive = True, require_attribs = True)
692
+
693
+	local_count = len(local_list)
694
+	remote_count = len(remote_list)
691 695
 
692
-	src_base = src_uri.uri()
693
-	dst_base = dst_uri.path()
694
-	if not src_base[-1] == "/": src_base += "/"
696
+	info(u"Found %d remote files, %d local files" % (remote_count, local_count))
695 697
 
696
-	rem_list = _get_filelist_remote(src_uri)
697
-	rem_count = len(rem_list)
698
+	remote_list, exclude_list = _filelist_filter_exclude_include(remote_list)
698 699
 
699
-	loc_list = _get_filelist_local(dst_uri)
700
-	loc_count = len(loc_list)
701
-	
702
-	info(u"Found %d remote files, %d local files" % (rem_count, loc_count))
700
+	remote_list, local_list, existing_list = _compare_filelists(remote_list, local_list, False)
701
+
702
+	local_count = len(local_list)
703
+	remote_count = len(remote_list)
703 704
 
704
-	_compare_filelists(rem_list, loc_list, False)
705
+	if not os.path.isdir(destination_base):
706
+		## We were either given a file name (existing or not) or want STDOUT
707
+		if remote_count > 1:
708
+			raise ParameterError("Destination must be a directory when downloading multiple sources.")
709
+		remote_list[remote_list.keys()[0]]['local_filename'] = deunicodise(destination_base)
710
+	else:
711
+		if destination_base[-1] != os.path.sep:
712
+			destination_base += os.path.sep
713
+		for key in remote_list:
714
+			remote_list[key]['local_filename'] = deunicodise(destination_base + key)
705 715
 
706
-	info(u"Summary: %d remote files to download, %d local files to delete" % (len(rem_list), len(loc_list)))
716
+	info(u"Summary: %d remote files to download, %d local files to delete" % (remote_count, local_count))
707 717
 
708
-	for file in loc_list:
718
+	for file in local_list:
709 719
 		if cfg.delete_removed:
710
-			os.unlink(dst_base + file)
711
-			output(u"deleted '%s'" % (dst_base + file))
720
+			os.unlink(local_list[file]['full_name'])
721
+			output(u"deleted: %s" % local_list[file]['full_name'])
712 722
 		else:
713
-			output(u"not-deleted '%s'" % file)
723
+			info(u"deleted: %s" % local_list[file]['full_name'])
724
+
725
+	if cfg.verbosity == logging.DEBUG:
726
+		for key in exclude_list:
727
+			debug(u"excluded: %s" % unicodise(key))
728
+		for key in remote_list:
729
+			debug(u"download: %s" % unicodise(key))
730
+
731
+	if cfg.dry_run:
732
+		warning(u"Exitting now because of --dry-run")
733
+		return
714 734
 
715 735
 	total_size = 0
716
-	total_count = len(rem_list)
717 736
 	total_elapsed = 0.0
718 737
 	timestamp_start = time.time()
719 738
 	seq = 0
720 739
 	dir_cache = {}
721
-	file_list = rem_list.keys()
740
+	file_list = remote_list.keys()
722 741
 	file_list.sort()
723 742
 	for file in file_list:
724 743
 		seq += 1
725
-		uri = S3Uri(src_base + file)
726
-		dst_file = dst_base + file
727
-		seq_label = "[%d of %d]" % (seq, total_count)
744
+		item = remote_list[file]
745
+		uri = S3Uri(item['object_uri_str'])
746
+		dst_file = item['local_filename']
747
+		seq_label = "[%d of %d]" % (seq, remote_count)
728 748
 		try:
729 749
 			dst_dir = os.path.dirname(dst_file)
730 750
 			if not dir_cache.has_key(dst_dir):
... ...
@@ -734,10 +774,8 @@ def cmd_sync_remote2local(src, dst):
734 734
 				continue
735 735
 			try:
736 736
 				open_flags = os.O_CREAT
737
-				if cfg.force:
738
-					open_flags |= os.O_TRUNC
739
-				else:
740
-					open_flags |= os.O_EXCL
737
+				open_flags |= os.O_TRUNC
738
+				# open_flags |= os.O_EXCL
741 739
 
742 740
 				debug(u"dst_file=%s" % dst_file)
743 741
 				# This will have failed should the file exist
... ...
@@ -907,21 +945,14 @@ def cmd_sync_local2remote(src, dst):
907 907
 		info(outstr)
908 908
 
909 909
 def cmd_sync(args):
910
-	src = args.pop(0)
911
-	dst = args.pop(0)
912
-	if (len(args)):
913
-		raise ParameterError("Too many parameters! Expected: %s" % commands['sync']['param'])
914
-
915
-	if S3Uri(src).type == "s3" and not src.endswith('/'):
916
-		src += "/"
917
-
918
-	if not dst.endswith('/'):
919
-		dst += "/"
910
+	if (len(args) < 2):
911
+		raise ParameterError("Too few parameters! Expected: %s" % commands['sync']['param'])
920 912
 
921
-	if S3Uri(src).type == "file" and S3Uri(dst).type == "s3":
922
-		return cmd_sync_local2remote(src, dst)
923
-	if S3Uri(src).type == "s3" and S3Uri(dst).type == "file":
924
-		return cmd_sync_remote2local(src, dst)
913
+	if S3Uri(args[0]).type == "file" and S3Uri(args[-1]).type == "s3":
914
+		return cmd_sync_local2remote(args)
915
+	if S3Uri(args[0]).type == "s3" and S3Uri(args[-1]).type == "file":
916
+		return cmd_sync_remote2local(args)
917
+	raise ParameterError("Invalid source/destination: '%s'" % "' '".join(args))
925 918
 
926 919
 def cmd_setacl(args):
927 920
 	s3 = S3(cfg)
... ...
@@ -1326,8 +1357,7 @@ def main():
1326 1326
 			debug(u"processing rule: %s" % ex)
1327 1327
 			exc = re.compile(glob.fnmatch.translate(ex))
1328 1328
 			cfg.exclude.append(exc)
1329
-			if options.debug_syncmatch:
1330
-				cfg.debug_exclude[exc] = ex
1329
+			cfg.debug_exclude[exc] = ex
1331 1330
 
1332 1331
 	## Process REGEXP style excludes
1333 1332
 	if options.rexclude is None:
... ...
@@ -1343,8 +1373,7 @@ def main():
1343 1343
 			debug(u"processing rule: %s" % ex)
1344 1344
 			exc = re.compile(ex)
1345 1345
 			cfg.exclude.append(exc)
1346
-			if options.debug_syncmatch:
1347
-				cfg.debug_exclude[exc] = ex
1346
+			cfg.debug_exclude[exc] = ex
1348 1347
 
1349 1348
 	if cfg.encrypt and cfg.gpg_passphrase == "":
1350 1349
 		error(u"Encryption requested but no passphrase set in config file.")