Browse code

2007-09-02 Michal Ludvig <michal@logix.cz>

* s3cmd: Initial support for 'sync' operation. For
now only the local->s3 direction is supported. In this
version it doesn't work well with non-ASCII filenames and
doesn't support encryption.



git-svn-id: https://s3tools.svn.sourceforge.net/svnroot/s3tools/s3cmd/trunk@145 830e0280-6d2a-0410-9c65-932aecc39d9d

Michal Ludvig authored on 2007/09/02 00:50:44
Showing 5 changed files
... ...
@@ -1,3 +1,10 @@
1
+2007-09-02  Michal Ludvig  <michal@logix.cz>
2
+
3
+	* s3cmd: Initial support for 'sync' operation. For
4
+	  now only local->s3 direction. In this version doesn't
5
+	  work well with non-ASCII filenames and doesn't support
6
+	  encryption.
7
+
1 8
 2007-08-24  Michal Ludvig  <michal@logix.cz>
2 9
 
3 10
 	* s3cmd, S3/Util.py: More ElementTree imports cleanup
... ...
@@ -10,6 +10,7 @@ import re
10 10
 class Config(object):
11 11
 	_instance = None
12 12
 	_parsed_files = []
13
+	_doc = {}
13 14
 	access_key = ""
14 15
 	secret_key = ""
15 16
 	host = "s3.amazonaws.com"
... ...
@@ -22,6 +23,8 @@ class Config(object):
22 22
 	proxy_host = ""
23 23
 	proxy_port = 3128
24 24
 	encrypt = False
25
+	delete_removed = False
26
+	_doc['delete_removed'] = "[sync] Remove remote S3 objects when local file has been deleted"
25 27
 	gpg_passphrase = ""
26 28
 	gpg_command = ""
27 29
 	gpg_encrypt = "%(gpg_command)s -c --verbose --no-use-agent --batch --yes --passphrase-fd %(passphrase_fd)s -o %(output_file)s %(input_file)s"
... ...
@@ -108,8 +108,10 @@ class S3(object):
108 108
 		## TODO: use prefix if supplied
109 109
 		request = self.create_request("BUCKET_LIST", bucket = bucket, prefix = prefix)
110 110
 		response = self.send_request(request)
111
-		debug(response)
111
+		#debug(response)
112 112
 		response["list"] = getListFromXml(response["data"], "Contents")
113
+		if getTextFromXml(response['data'], ".//IsTruncated").lower() != "false":
114
+			raise Exception("Listing truncated. Please notify s3cmd developers.")
113 115
 		return response
114 116
 
115 117
 	def bucket_create(self, bucket):
... ...
@@ -15,20 +15,30 @@ try:
15 15
 except ImportError:
16 16
 	import elementtree.ElementTree as ET
17 17
 
18
+def stripTagXmlns(xmlns, tag):
19
+	"""
20
+	Returns a function that, given a tag name argument, removes
21
+	eventual ElementTree xmlns from it.
22
+
23
+	Example:
24
+		stripTagXmlns("{myXmlNS}tag") -> "tag"
25
+	"""
26
+	if not xmlns:
27
+		return tag
28
+	return re.sub(xmlns, "", tag)
29
+
30
+def fixupXPath(xmlns, xpath, max = 0):
31
+	if not xmlns:
32
+		return xpath
33
+	retval = re.subn("//", "//%s" % xmlns, xpath, max)[0]
34
+	return retval
35
+
18 36
 def parseNodes(nodes, xmlns = ""):
19 37
 	retval = []
20 38
 	for node in nodes:
21 39
 		retval_item = {}
22
-		if xmlns != "":
23
-			## Take regexp compilation out of the loop
24
-			r = re.compile(xmlns)
25
-			fixup = lambda string : r.sub("", string)
26
-		else:
27
-			## Do-nothing function
28
-			fixup = lambda string : string
29
-
30 40
 		for child in node.getchildren():
31
-			name = fixup(child.tag)
41
+			name = stripTagXmlns(xmlns, child.tag)
32 42
 			retval_item[name] = node.findtext(".//%s" % child.tag)
33 43
 
34 44
 		retval.append(retval_item)
... ...
@@ -45,6 +55,11 @@ def getListFromXml(xml, node):
45 45
 	nodes = tree.findall('.//%s%s' % (xmlns, node))
46 46
 	return parseNodes(nodes, xmlns)
47 47
 	
48
+def getTextFromXml(xml, xpath):
49
+	tree = ET.fromstring(xml)
50
+	xmlns = getNameSpace(tree)
51
+	return tree.findtext(fixupXPath(xmlns, xpath))
52
+
48 53
 def dateS3toPython(date):
49 54
 	date = re.compile("\.\d\d\dZ").sub(".000Z", date)
50 55
 	return time.strptime(date, "%Y-%m-%dT%H:%M:%S.000Z")
... ...
@@ -256,6 +256,97 @@ def cmd_object_del(args):
256 256
 		response = s3.object_delete_uri(uri)
257 257
 		output("Object %s deleted" % uri)
258 258
 
259
+def cmd_sync(agrs):
260
+	src = args.pop(0)
261
+	if S3Uri(src).type != "file":
262
+		raise ParameterError("Source must be a local path instead of: %s" % src)
263
+	dst = args.pop(0)
264
+	dst_uri = S3Uri(dst)
265
+	if dst_uri.type != "s3":
266
+		raise ParameterError("Destination must be a S3 URI instead of: %s" % dst)
267
+	if (len(args)):
268
+		raise ParameterError("Too many parameters! Expected: %s" % commands['sync']['param'])
269
+
270
+	s3 = S3(Config())
271
+
272
+	output("Compiling list of local files...")
273
+	loc_base = os.path.join(src, "")
274
+	loc_base_len = len(loc_base)
275
+	loc_list = {}
276
+	for root, dirs, files in os.walk(src):
277
+		## TODO: implement explicit exclude
278
+		for f in files:
279
+			full_name = os.path.join(root, f)
280
+			if not os.path.isfile(full_name):
281
+				continue
282
+			file = full_name[loc_base_len:]
283
+			sr = os.stat_result(os.lstat(full_name))
284
+			loc_list[file] = {
285
+				'full_name' : full_name,
286
+				'size' : sr.st_size, 
287
+				'mtime' : sr.st_mtime,
288
+				## TODO: Possibly more to save here...
289
+			}
290
+	
291
+	output("Retrieving list of remote files...")
292
+	response = s3.bucket_list(dst_uri.bucket(), prefix = dst_uri.object())
293
+
294
+	rem_base = dst_uri.object()
295
+	rem_base_len = len(rem_base)
296
+	rem_list = {}
297
+	for object in response['list']:
298
+		key = object['Key'][rem_base_len:]
299
+		rem_list[key] = { 
300
+			'size' : int(object['Size']),
301
+			# 'mtime' : dateS3toUnix(object['LastModified']), ## That's upload time, not our lastmod time :-(
302
+			'md5' : object['ETag'][1:-1],
303
+			'object_key' : object['Key'],
304
+		}
305
+	output("Verifying checksums...")
306
+	for file in loc_list.keys():
307
+		debug("Checking %s ..." % file)
308
+		if rem_list.has_key(file):
309
+			debug("%s exists in remote list" % file)
310
+			## Check size first
311
+			if rem_list[file]['size'] == loc_list[file]['size']:
312
+				debug("%s same size: %s" % (file, rem_list[file]['size']))
313
+				## ... same size, check MD5
314
+				loc_md5 = Utils.hash_file_md5(loc_list[file]['full_name'])
315
+				if loc_md5 == rem_list[file]['md5']:
316
+					debug("%s md5 matches: %s" % (file, rem_list[file]['md5']))
317
+					## Checksums are the same.
318
+					## Remove from local-list, all that is left there will be uploaded
319
+					debug("%s removed from local list - upload not needed" % file)
320
+					del(loc_list[file])
321
+				else:
322
+					debug("! %s md5 mismatch: local=%s remote=%s" % (file, loc_md5, rem_list[file]['md5']))
323
+			else:
324
+				debug("! %s size mismatch: local=%s remote=%s" % (file, loc_list[file]['size'], rem_list[file]['size']))
325
+                        
326
+			## Remove from remote-list, all that is left there will be deleted
327
+			debug("%s removed from remote list" % file)
328
+			del(rem_list[file])
329
+
330
+	output("Summary: %d local files to upload, %d remote files to delete" % (len(loc_list), len(rem_list)))
331
+	if cfg.delete_removed:
332
+		for file in rem_list:
333
+			uri = S3Uri("s3://" + dst_uri.bucket()+"/"+rem_list[file]['object_key'])
334
+			response = s3.object_delete_uri(uri)
335
+			output("%s deleted" % uri)
336
+
337
+	total_size = 0
338
+	dst_base = dst_uri.uri()
339
+	if not dst_base[-1] == "/": dst_base += "/"
340
+	file_list = loc_list.keys()
341
+	file_list.sort()
342
+	for file in file_list:
343
+		src = loc_list[file]['full_name']
344
+		uri = S3Uri(dst_base + file)
345
+		response = s3.object_put_uri(src, uri)
346
+		output("%s stored as %s (%d bytes)" % (src, uri, response["size"]))
347
+		total_size += response["size"]
348
+	output("Done. Uploaded %d bytes." % total_size)
349
+
259 350
 def resolve_list(lst, args):
260 351
 	retval = []
261 352
 	for item in lst:
... ...
@@ -425,6 +516,8 @@ commands_list = [
425 425
 	{"cmd":"put", "label":"Put file into bucket", "param":"FILE [FILE...] s3://BUCKET[/PREFIX]", "func":cmd_object_put, "argc":2},
426 426
 	{"cmd":"get", "label":"Get file from bucket", "param":"s3://BUCKET/OBJECT LOCAL_FILE", "func":cmd_object_get, "argc":1},
427 427
 	{"cmd":"del", "label":"Delete file from bucket", "param":"s3://BUCKET/OBJECT", "func":cmd_object_del, "argc":1},
428
+	#{"cmd":"mkdir", "label":"Make a virtual S3 directory", "param":"s3://BUCKET/path/to/dir", "func":cmd_mkdir, "argc":1},
429
+	{"cmd":"sync", "label":"Synchronize a directory tree to S3 and back", "param":"LOCAL_DIR s3://BUCKET[/PREFIX]", "func":cmd_sync, "argc":2},
428 430
 	{"cmd":"du", "label":"Disk usage by buckets", "param":"[s3://BUCKET[/PREFIX]]", "func":cmd_du, "argc":0},
429 431
 	]
430 432
 
... ...
@@ -475,6 +568,7 @@ if __name__ == '__main__':
475 475
 	optparser.add_option("-e", "--encrypt", dest="encrypt", action="store_true", help="Encrypt files before uploading to S3.")
476 476
 	optparser.add_option("-f", "--force", dest="force", action="store_true", help="Force overwrite and other dangerous operations.")
477 477
 	optparser.add_option("-P", "--acl-public", dest="acl_public", action="store_true", help="Store objects with ACL allowing read by anyone.")
478
+	optparser.add_option(      "--delete-removed", dest="delete_removed", action="store_true", help="Delete remote objects with no corresponding local file [sync]")
478 479
 
479 480
 	optparser.add_option("-m", "--mime-type", dest="default_mime_type", type="mimetype", metavar="MIME/TYPE", help="Default MIME-type to be set for objects stored.")
480 481
 	optparser.add_option("-M", "--guess-mime-type", dest="guess_mime_type", action="store_true", help="Guess MIME-type of files by their extension. Falls back to default MIME-Type as specified by --mime-type option")