Browse code

* s3cmd: New [fixbucket] command for fixing invalid object names in a given Bucket. For instance names with &#x08; in them (not sure how people manage to upload them but they do). * S3/S3.py, S3/Utils.py, S3/Config.py: Support methods for the above, plus advise user to run 'fixbucket' when XML parsing fails. * NEWS: Updated.

git-svn-id: https://s3tools.svn.sourceforge.net/svnroot/s3tools/s3cmd/trunk@395 830e0280-6d2a-0410-9c65-932aecc39d9d

Michal Ludvig authored on 2009/06/02 20:23:15
Showing 6 changed files
... ...
@@ -1,3 +1,13 @@
1
+2009-06-02  Michal Ludvig  <michal@logix.cz>
2
+
3
+	* s3cmd: New [fixbucket] command for fixing invalid object
4
+	  names in a given Bucket. For instance names with &#x08; in
5
+	  them (not sure how people manage to upload them but they do).
6
+	* S3/S3.py, S3/Utils.py, S3/Config.py: Support methods for 
7
+	  the above, plus advise user to run 'fixbucket' when XML parsing 
8
+	  fails.
9
+	* NEWS: Updated.
10
+	
1 11
 2009-05-29  Michal Ludvig  <michal@logix.cz>
2 12
 
3 13
 	* S3/Utils.py: New function replace_nonprintables()
... ...
@@ -10,6 +10,9 @@ s3cmd 1.0.0
10 10
 * Added --exclude/--include and --dry-run for [del], [setacl].
11 11
 * Neutralise characters that are invalid in XML to avoid ExpatErrors.
12 12
   http://boodebr.org/main/python/all-about-python-and-unicode
13
+* New command [fixbucket] for fixing invalid object names
14
+  in a given Bucket. For instance names with &#x08; in them
15
+  (not sure how people manage to upload them but they do).
13 16
 
14 17
 s3cmd 0.9.9   -   2009-02-17
15 18
 ===========
... ...
@@ -68,7 +68,7 @@ class Config(object):
68 68
 	debug_exclude = {}
69 69
 	debug_include = {}
70 70
 	encoding = "utf-8"
71
-	verbatim = False
71
+	urlencoding_mode = "normal"
72 72
 
73 73
 	## Creating a singleton
74 74
 	def __new__(self, configfile = None):
... ...
@@ -174,26 +174,29 @@ class S3(object):
174 174
 			return getListFromXml(data, "CommonPrefixes")
175 175
 
176 176
 		uri_params = {}
177
-		if prefix:
178
-			uri_params['prefix'] = self.urlencode_string(prefix)
179
-		if not self.config.recursive and not recursive:
180
-			uri_params['delimiter'] = "/"
181
-		request = self.create_request("BUCKET_LIST", bucket = bucket, **uri_params)
182
-		response = self.send_request(request)
183
-		#debug(response)
177
+		response = self.bucket_list_noparse(bucket, prefix, recursive, uri_params)
184 178
 		list = _get_contents(response["data"])
185 179
 		prefixes = _get_common_prefixes(response["data"])
186 180
 		while _list_truncated(response["data"]):
187 181
 			uri_params['marker'] = self.urlencode_string(list[-1]["Key"])
188 182
 			debug("Listing continues after '%s'" % uri_params['marker'])
189
-			request = self.create_request("BUCKET_LIST", bucket = bucket, **uri_params)
190
-			response = self.send_request(request)
183
+			response = self.bucket_list_noparse(bucket, prefix, recursive, uri_params)
191 184
 			list += _get_contents(response["data"])
192 185
 			prefixes += _get_common_prefixes(response["data"])
193 186
 		response['list'] = list
194 187
 		response['common_prefixes'] = prefixes
195 188
 		return response
196 189
 
190
+	def bucket_list_noparse(self, bucket, prefix = None, recursive = None, uri_params = {}):
191
+		if prefix:
192
+			uri_params['prefix'] = self.urlencode_string(prefix)
193
+		if not self.config.recursive and not recursive:
194
+			uri_params['delimiter'] = "/"
195
+		request = self.create_request("BUCKET_LIST", bucket = bucket, **uri_params)
196
+		response = self.send_request(request)
197
+		#debug(response)
198
+		return response
199
+
197 200
 	def bucket_create(self, bucket, bucket_location = None):
198 201
 		headers = SortedDict(ignore_case = True)
199 202
 		body = ""
... ...
@@ -320,11 +323,14 @@ class S3(object):
320 320
 		return response
321 321
 
322 322
 	## Low level methods
323
-	def urlencode_string(self, string):
323
+	def urlencode_string(self, string, urlencoding_mode = None):
324 324
 		if type(string) == unicode:
325 325
 			string = string.encode("utf-8")
326 326
 
327
-		if self.config.verbatim:
327
+		if urlencoding_mode is None:
328
+			urlencoding_mode = self.config.urlencoding_mode
329
+
330
+		if urlencoding_mode == "verbatim":
328 331
 			## Don't do any pre-processing
329 332
 			return string
330 333
 
... ...
@@ -345,9 +351,12 @@ class S3(object):
345 345
 					#           [hope that sounds reassuring ;-)]
346 346
 			o = ord(c)
347 347
 			if (o < 0x20 or o == 0x7f):
348
-				error(u"Non-printable character 0x%02x in: %s" % (o, string))
349
-				error(u"Please report it to s3tools-bugs@lists.sourceforge.net")
350
-				encoded += replace_nonprintables(c)
348
+				if urlencoding_mode == "fixbucket":
349
+					encoded += "%%%02X" % o
350
+				else:
351
+					error(u"Non-printable character 0x%02x in: %s" % (o, string))
352
+					error(u"Please report it to s3tools-bugs@lists.sourceforge.net")
353
+					encoded += replace_nonprintables(c)
351 354
 			elif (o == 0x20 or	# Space and below
352 355
 			    o == 0x22 or	# "
353 356
 			    o == 0x23 or	# #
... ...
@@ -21,11 +21,13 @@ import errno
21 21
 from logging import debug, info, warning, error
22 22
 
23 23
 import Config
24
+import Exceptions
24 25
 
25 26
 try:
26 27
 	import xml.etree.ElementTree as ET
27 28
 except ImportError:
28 29
 	import elementtree.ElementTree as ET
30
+from xml.parsers.expat import ExpatError
29 31
 
30 32
 def parseNodes(nodes):
31 33
 	## WARNING: Ignores text nodes from mixed xml/text.
... ...
@@ -57,10 +59,14 @@ def stripNameSpace(xml):
57 57
 
58 58
 def getTreeFromXml(xml):
59 59
 	xml, xmlns = stripNameSpace(xml)
60
-	tree = ET.fromstring(xml)
61
-	if xmlns:
62
-		tree.attrib['xmlns'] = xmlns
63
-	return tree
60
+	try:
61
+		tree = ET.fromstring(xml)
62
+		if xmlns:
63
+			tree.attrib['xmlns'] = xmlns
64
+		return tree
65
+	except ExpatError, e:
66
+		error(e)
67
+		raise Exceptions.ParameterError("Bucket contains invalid filenames. Please run: s3cmd fixbucket s3://your-bucket/")
64 68
 	
65 69
 def getListFromXml(xml, node):
66 70
 	tree = getTreeFromXml(xml)
... ...
@@ -275,7 +281,7 @@ def replace_nonprintables(string):
275 275
 			modified += 1
276 276
 		else:
277 277
 			new_string += c
278
-	if modified:
278
+	if modified and Config.Config().urlencoding_mode != "fixbucket":
279 279
 		warning("%d non-printable characters replaced in: %s" % (modified, new_string))
280 280
 	return new_string
281 281
 
... ...
@@ -21,6 +21,7 @@ import traceback
21 21
 import codecs
22 22
 import locale
23 23
 import subprocess
24
+import htmlentitydefs
24 25
 
25 26
 from copy import copy
26 27
 from optparse import OptionParser, Option, OptionValueError, IndentedHelpFormatter
... ...
@@ -638,7 +639,7 @@ def _get_filelist_local(local_uri):
638 638
 				## for now skip over
639 639
 				continue
640 640
 			relative_file = unicodise(os.path.join(rel_root, f))
641
-			if not cfg.verbatim:
641
+			if cfg.urlencoding_mode == "normal":
642 642
 				relative_file = replace_nonprintables(relative_file)
643 643
 			if relative_file.startswith('./'):
644 644
 				relative_file = relative_file[2:]
... ...
@@ -1117,6 +1118,71 @@ def cmd_sign(args):
1117 1117
 	signature = Utils.sign_string(string_to_sign)
1118 1118
 	output("Signature: %s" % signature)
1119 1119
 
1120
+def cmd_fixbucket(args):
1121
+	def _unescape(text):
1122
+		##
1123
+		# Removes HTML or XML character references and entities from a text string.
1124
+		#
1125
+		# @param text The HTML (or XML) source text.
1126
+		# @return The plain text, as a Unicode string, if necessary.
1127
+		# 
1128
+		# From: http://effbot.org/zone/re-sub.htm#unescape-html
1129
+		def _unescape_fixup(m):
1130
+			text = m.group(0)
1131
+			if not htmlentitydefs.name2codepoint.has_key('apos'):
1132
+				htmlentitydefs.name2codepoint['apos'] = ord("'")
1133
+			if text[:2] == "&#":
1134
+				# character reference
1135
+				try:
1136
+					if text[:3] == "&#x":
1137
+						return unichr(int(text[3:-1], 16))
1138
+					else:
1139
+						return unichr(int(text[2:-1]))
1140
+				except ValueError:
1141
+					pass
1142
+			else:
1143
+				# named entity
1144
+				try:
1145
+					text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
1146
+				except KeyError:
1147
+					pass
1148
+			return text # leave as is
1149
+		return re.sub("&#?\w+;", _unescape_fixup, text)
1150
+
1151
+	cfg.urlencoding_mode = "fixbucket"
1152
+	s3 = S3(cfg)
1153
+
1154
+	count = 0
1155
+	for arg in args:
1156
+		culprit = S3Uri(arg)
1157
+		if culprit.type != "s3":
1158
+			raise ParameterError("Expecting S3Uri instead of: %s" % arg)
1159
+		response = s3.bucket_list_noparse(culprit.bucket(), culprit.object(), recursive = True)
1160
+		r_xent = re.compile("&#x[\da-fA-F]+;")
1161
+		keys = re.findall("<Key>(.*?)</Key>", response['data'], re.MULTILINE)
1162
+		debug("Keys: %r" % keys)
1163
+		for key in keys:
1164
+			if r_xent.search(key):
1165
+				info("Fixing: %s" % key)
1166
+				debug("Step 1: Transforming %s" % key)
1167
+				key_bin = _unescape(key)
1168
+				debug("Step 2:       ... to %s" % key_bin)
1169
+				key_new = replace_nonprintables(key_bin)
1170
+				debug("Step 3:  ... then to %s" % key_new)
1171
+				src = S3Uri("s3://%s/%s" % (culprit.bucket(), key_bin))
1172
+				dst = S3Uri("s3://%s/%s" % (culprit.bucket(), key_new))
1173
+				resp_move = s3.object_move(src, dst)
1174
+				if resp_move['status'] == 200:
1175
+					output("File %r renamed to %s" % (key_bin, key_new))
1176
+					count += 1
1177
+				else:
1178
+					error("Something went wrong for: %r" % key)
1179
+					error("Please report the problem to s3tools-bugs@lists.sourceforge.net")
1180
+	if count > 0:
1181
+		warning("Fixed %d files' names. Their ACL were reset to Private." % count)
1182
+		warning("Use 's3cmd setacl --acl-public s3://...' to make")
1183
+		warning("them publicly readable if required.")
1184
+
1120 1185
 def resolve_list(lst, args):
1121 1186
 	retval = []
1122 1187
 	for item in lst:
... ...
@@ -1351,6 +1417,7 @@ def get_commands_list():
1351 1351
 	{"cmd":"mv", "label":"Move object", "param":"s3://BUCKET1/OBJECT1 s3://BUCKET2[/OBJECT2]", "func":cmd_mv, "argc":2},
1352 1352
 	{"cmd":"setacl", "label":"Modify Access control list for Bucket or Files", "param":"s3://BUCKET[/OBJECT]", "func":cmd_setacl, "argc":1},
1353 1353
 	{"cmd":"sign", "label":"Sign arbitrary string using the secret key", "param":"STRING-TO-SIGN", "func":cmd_sign, "argc":1},
1354
+	{"cmd":"fixbucket", "label":"Fix invalid file names in a bucket", "param":"s3://BUCKET[/PREFIX]", "func":cmd_fixbucket, "argc":1},
1354 1355
 
1355 1356
 	## CloudFront commands
1356 1357
 	{"cmd":"cflist", "label":"List CloudFront distribution points", "param":"", "func":CfCmd.info, "argc":0},
... ...
@@ -1445,7 +1512,7 @@ def main():
1445 1445
 	optparser.add_option(      "--add-header", dest="add_header", action="append", metavar="NAME:VALUE", help="Add a given HTTP header to the upload request. Can be used multiple times. For instance set 'Expires' or 'Cache-Control' headers (or both) using this options if you like.")
1446 1446
 
1447 1447
 	optparser.add_option(      "--encoding", dest="encoding", metavar="ENCODING", help="Override autodetected terminal and filesystem encoding (character set). Autodetected: %s" % preferred_encoding)
1448
-	optparser.add_option(      "--verbatim", dest="verbatim", action="store_true", help="Use the S3 name as given on the command line. No pre-processing, encoding, etc. Use with caution!")
1448
+	optparser.add_option(      "--verbatim", dest="urlencoding_mode", action="store_const", const="verbatim", help="Use the S3 name as given on the command line. No pre-processing, encoding, etc. Use with caution!")
1449 1449
 
1450 1450
 	optparser.add_option(      "--list-md5", dest="list_md5", action="store_true", help="Include MD5 sums in bucket listings (only for 'ls' command).")
1451 1451
 	optparser.add_option("-H", "--human-readable-sizes", dest="human_readable_sizes", action="store_true", help="Print sizes in human readable form (eg 1kB instead of 1234).")