GitList

Browse code

* s3cmd: New [fixbucket] command for fixing invalid object names in a given Bucket. For instance names with &#x08; in them (not sure how people manage to upload them but they do). * S3/S3.py, S3/Utils.py, S3/Config.py: Support methods for the above, plus advise user to run 'fixbucket' when XML parsing fails. * NEWS: Updated.

git-svn-id: https://s3tools.svn.sourceforge.net/svnroot/s3tools/s3cmd/trunk@395 830e0280-6d2a-0410-9c65-932aecc39d9d

Michal Ludvig authored on 2009/06/02 20:23:15
Showing 6 changed files

ChangeLog index 1b279fd..9490f17 100644
NEWS index befb1e9..7222feb 100644
S3/Config.py index e1f0145..f0fb0ef 100644
S3/S3.py index 01a63e2..950d5ed 100644
S3/Utils.py index b0e5c7e..ae91841 100644
s3cmd index 0f1f31b..2c9f114 100755

@@ -1,3 +1,13 @@
                     +2009-06-02  Michal Ludvig  <michal@logix.cz>
+                    +
                     +	* s3cmd: New [fixbucket] command for fixing invalid object
                     +	  names in a given Bucket. For instance names with &#x08; in
                     +	  them (not sure how people manage to upload them but they do).
                     +	* S3/S3.py, S3/Utils.py, S3/Config.py: Support methods for
                     +	  the above, plus advise user to run 'fixbucket' when XML parsing
                     +	  fails.
                     +	* NEWS: Updated.
+                    +
                      2009-05-29  Michal Ludvig  <michal@logix.cz>
                      	* S3/Utils.py: New function replace_nonprintables()

NEWS

History View file @ 3c07424

@@ -10,6 +10,9 @@ s3cmd 1.0.0
                      * Added --exclude/--include and --dry-run for [del], [setacl].
                      * Neutralise characters that are invalid in XML to avoid ExpatErrors.
                        http://boodebr.org/main/python/all-about-python-and-unicode
                     +* New command [fixbucket] for fixing invalid object names
                     +  in a given Bucket. For instance names with &#x08; in them
                     +  (not sure how people manage to upload them but they do).
                      s3cmd 0.9.9   -   2009-02-17
                      ===========

S3/Config.py

History View file @ 3c07424

@@ -68,7 +68,7 @@ class Config(object):
                      	debug_exclude = {}
                      	debug_include = {}
                      	encoding = "utf-8"
                     -	verbatim = False
                     +	urlencoding_mode = "normal"
                      	## Creating a singleton
                      	def __new__(self, configfile = None):

S3/S3.py

History View file @ 3c07424

@@ -174,26 +174,29 @@ class S3(object):
                      			return getListFromXml(data, "CommonPrefixes")
                      		uri_params = {}
                     -		if prefix:
                     -			uri_params['prefix'] = self.urlencode_string(prefix)
                     -		if not self.config.recursive and not recursive:
                     -			uri_params['delimiter'] = "/"
                     -		request = self.create_request("BUCKET_LIST", bucket = bucket, **uri_params)
                     -		response = self.send_request(request)
                     -		#debug(response)
                     +		response = self.bucket_list_noparse(bucket, prefix, recursive, uri_params)
                      		list = _get_contents(response["data"])
                      		prefixes = _get_common_prefixes(response["data"])
                      		while _list_truncated(response["data"]):
                      			uri_params['marker'] = self.urlencode_string(list[-1]["Key"])
                      			debug("Listing continues after '%s'" % uri_params['marker'])
                     -			request = self.create_request("BUCKET_LIST", bucket = bucket, **uri_params)
                     -			response = self.send_request(request)
                     +			response = self.bucket_list_noparse(bucket, prefix, recursive, uri_params)
                      			list += _get_contents(response["data"])
                      			prefixes += _get_common_prefixes(response["data"])
                      		response['list'] = list
                      		response['common_prefixes'] = prefixes
                      		return response
                     +	def bucket_list_noparse(self, bucket, prefix = None, recursive = None, uri_params = {}):
                     +		if prefix:
                     +			uri_params['prefix'] = self.urlencode_string(prefix)
                     +		if not self.config.recursive and not recursive:
                     +			uri_params['delimiter'] = "/"
                     +		request = self.create_request("BUCKET_LIST", bucket = bucket, **uri_params)
                     +		response = self.send_request(request)
                     +		#debug(response)
                     +		return response
+                    +
                      	def bucket_create(self, bucket, bucket_location = None):
                      		headers = SortedDict(ignore_case = True)
                      		body = ""
@@ -320,11 +323,14 @@ class S3(object):
                      		return response
                      	## Low level methods
                     -	def urlencode_string(self, string):
                     +	def urlencode_string(self, string, urlencoding_mode = None):
                      		if type(string) == unicode:
                      			string = string.encode("utf-8")
                     -		if self.config.verbatim:
                     +		if urlencoding_mode is None:
                     +			urlencoding_mode = self.config.urlencoding_mode
+                    +
                     +		if urlencoding_mode == "verbatim":
                      			## Don't do any pre-processing
                      			return string
@@ -345,9 +351,12 @@ class S3(object):
                      					#           [hope that sounds reassuring ;-)]
                      			o = ord(c)
                      			if (o < 0x20 or o == 0x7f):
                     -				error(u"Non-printable character 0x%02x in: %s" % (o, string))
                     -				error(u"Please report it to s3tools-bugs@lists.sourceforge.net")
                     -				encoded += replace_nonprintables(c)
                     +				if urlencoding_mode == "fixbucket":
                     +					encoded += "%%%02X" % o
                     +				else:
                     +					error(u"Non-printable character 0x%02x in: %s" % (o, string))
                     +					error(u"Please report it to s3tools-bugs@lists.sourceforge.net")
                     +					encoded += replace_nonprintables(c)
                      			elif (o == 0x20 or	# Space and below
                      			    o == 0x22 or	# "
                      			    o == 0x23 or	# #

S3/Utils.py

History View file @ 3c07424

@@ -21,11 +21,13 @@ import errno
                      from logging import debug, info, warning, error
                      import Config
                     +import Exceptions
                      try:
                      	import xml.etree.ElementTree as ET
                      except ImportError:
                      	import elementtree.ElementTree as ET
                     +from xml.parsers.expat import ExpatError
                      def parseNodes(nodes):
                      	## WARNING: Ignores text nodes from mixed xml/text.
@@ -57,10 +59,14 @@ def stripNameSpace(xml):
                      def getTreeFromXml(xml):
                      	xml, xmlns = stripNameSpace(xml)
                     -	tree = ET.fromstring(xml)
                     -	if xmlns:
                     -		tree.attrib['xmlns'] = xmlns
                     -	return tree
                     +	try:
                     +		tree = ET.fromstring(xml)
                     +		if xmlns:
                     +			tree.attrib['xmlns'] = xmlns
                     +		return tree
                     +	except ExpatError, e:
                     +		error(e)
                     +		raise Exceptions.ParameterError("Bucket contains invalid filenames. Please run: s3cmd fixbucket s3://your-bucket/")
                      def getListFromXml(xml, node):
                      	tree = getTreeFromXml(xml)
@@ -275,7 +281,7 @@ def replace_nonprintables(string):
                      			modified += 1
                      		else:
                      			new_string += c
                     -	if modified:
                     +	if modified and Config.Config().urlencoding_mode != "fixbucket":
                      		warning("%d non-printable characters replaced in: %s" % (modified, new_string))
                      	return new_string

s3cmd

History View file @ 3c07424

@@ -21,6 +21,7 @@ import traceback
                      import codecs
                      import locale
                      import subprocess
                     +import htmlentitydefs
                      from copy import copy
                      from optparse import OptionParser, Option, OptionValueError, IndentedHelpFormatter
@@ -638,7 +639,7 @@ def _get_filelist_local(local_uri):
                      				## for now skip over
                      				continue
                      			relative_file = unicodise(os.path.join(rel_root, f))
                     -			if not cfg.verbatim:
                     +			if cfg.urlencoding_mode == "normal":
                      				relative_file = replace_nonprintables(relative_file)
                      			if relative_file.startswith('./'):
                      				relative_file = relative_file[2:]
@@ -1117,6 +1118,71 @@ def cmd_sign(args):
                      	signature = Utils.sign_string(string_to_sign)
                      	output("Signature: %s" % signature)
                     +def cmd_fixbucket(args):
                     +	def _unescape(text):
                     +		##
                     +		# Removes HTML or XML character references and entities from a text string.
                     +		#
                     +		# @param text The HTML (or XML) source text.
                     +		# @return The plain text, as a Unicode string, if necessary.
                     +		#
                     +		# From: http://effbot.org/zone/re-sub.htm#unescape-html
                     +		def _unescape_fixup(m):
                     +			text = m.group(0)
                     +			if not htmlentitydefs.name2codepoint.has_key('apos'):
                     +				htmlentitydefs.name2codepoint['apos'] = ord("'")
                     +			if text[:2] == "&#":
                     +				# character reference
                     +				try:
                     +					if text[:3] == "&#x":
                     +						return unichr(int(text[3:-1], 16))
                     +					else:
                     +						return unichr(int(text[2:-1]))
                     +				except ValueError:
                     +					pass
                     +			else:
                     +				# named entity
                     +				try:
                     +					text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
                     +				except KeyError:
                     +					pass
                     +			return text # leave as is
                     +		return re.sub("&#?\w+;", _unescape_fixup, text)
+                    +
                     +	cfg.urlencoding_mode = "fixbucket"
                     +	s3 = S3(cfg)
+                    +
                     +	count = 0
                     +	for arg in args:
                     +		culprit = S3Uri(arg)
                     +		if culprit.type != "s3":
                     +			raise ParameterError("Expecting S3Uri instead of: %s" % arg)
                     +		response = s3.bucket_list_noparse(culprit.bucket(), culprit.object(), recursive = True)
                     +		r_xent = re.compile("&#x[\da-fA-F]+;")
                     +		keys = re.findall("<Key>(.*?)</Key>", response['data'], re.MULTILINE)
                     +		debug("Keys: %r" % keys)
                     +		for key in keys:
                     +			if r_xent.search(key):
                     +				info("Fixing: %s" % key)
                     +				debug("Step 1: Transforming %s" % key)
                     +				key_bin = _unescape(key)
                     +				debug("Step 2:       ... to %s" % key_bin)
                     +				key_new = replace_nonprintables(key_bin)
                     +				debug("Step 3:  ... then to %s" % key_new)
                     +				src = S3Uri("s3://%s/%s" % (culprit.bucket(), key_bin))
                     +				dst = S3Uri("s3://%s/%s" % (culprit.bucket(), key_new))
                     +				resp_move = s3.object_move(src, dst)
                     +				if resp_move['status'] == 200:
                     +					output("File %r renamed to %s" % (key_bin, key_new))
                     +					count += 1
                     +				else:
                     +					error("Something went wrong for: %r" % key)
                     +					error("Please report the problem to s3tools-bugs@lists.sourceforge.net")
                     +	if count > 0:
                     +		warning("Fixed %d files' names. Their ACL were reset to Private." % count)
                     +		warning("Use 's3cmd setacl --acl-public s3://...' to make")
                     +		warning("them publicly readable if required.")
+                    +
                      def resolve_list(lst, args):
                      	retval = []
                      	for item in lst:
@@ -1351,6 +1417,7 @@ def get_commands_list():
                      	{"cmd":"mv", "label":"Move object", "param":"s3://BUCKET1/OBJECT1 s3://BUCKET2[/OBJECT2]", "func":cmd_mv, "argc":2},
                      	{"cmd":"setacl", "label":"Modify Access control list for Bucket or Files", "param":"s3://BUCKET[/OBJECT]", "func":cmd_setacl, "argc":1},
                      	{"cmd":"sign", "label":"Sign arbitrary string using the secret key", "param":"STRING-TO-SIGN", "func":cmd_sign, "argc":1},
                     +	{"cmd":"fixbucket", "label":"Fix invalid file names in a bucket", "param":"s3://BUCKET[/PREFIX]", "func":cmd_fixbucket, "argc":1},
                      	## CloudFront commands
                      	{"cmd":"cflist", "label":"List CloudFront distribution points", "param":"", "func":CfCmd.info, "argc":0},
@@ -1445,7 +1512,7 @@ def main():
                      	optparser.add_option(      "--add-header", dest="add_header", action="append", metavar="NAME:VALUE", help="Add a given HTTP header to the upload request. Can be used multiple times. For instance set 'Expires' or 'Cache-Control' headers (or both) using this options if you like.")
                      	optparser.add_option(      "--encoding", dest="encoding", metavar="ENCODING", help="Override autodetected terminal and filesystem encoding (character set). Autodetected: %s" % preferred_encoding)
                     -	optparser.add_option(      "--verbatim", dest="verbatim", action="store_true", help="Use the S3 name as given on the command line. No pre-processing, encoding, etc. Use with caution!")
                     +	optparser.add_option(      "--verbatim", dest="urlencoding_mode", action="store_const", const="verbatim", help="Use the S3 name as given on the command line. No pre-processing, encoding, etc. Use with caution!")
                      	optparser.add_option(      "--list-md5", dest="list_md5", action="store_true", help="Include MD5 sums in bucket listings (only for 'ls' command).")
                      	optparser.add_option("-H", "--human-readable-sizes", dest="human_readable_sizes", action="store_true", help="Print sizes in human readable form (eg 1kB instead of 1234).")