git-svn-id: https://s3tools.svn.sourceforge.net/svnroot/s3tools/s3cmd/trunk@395 830e0280-6d2a-0410-9c65-932aecc39d9d
Michal Ludvig authored on 2009/06/02 20:23:15... | ... |
@@ -1,3 +1,13 @@ |
1 |
+2009-06-02 Michal Ludvig <michal@logix.cz> |
|
2 |
+ |
|
3 |
+ * s3cmd: New [fixbucket] command for fixing invalid object |
|
4 |
+ names in a given Bucket. For instance names with &#x08; in |
|
5 |
+ them (not sure how people manage to upload them but they do). |
|
6 |
+ * S3/S3.py, S3/Utils.py, S3/Config.py: Support methods for |
|
7 |
+ the above, plus advise user to run 'fixbucket' when XML parsing |
|
8 |
+ fails. |
|
9 |
+ * NEWS: Updated. |
|
10 |
+ |
|
1 | 11 |
2009-05-29 Michal Ludvig <michal@logix.cz> |
2 | 12 |
|
3 | 13 |
* S3/Utils.py: New function replace_nonprintables() |
... | ... |
@@ -10,6 +10,9 @@ s3cmd 1.0.0 |
10 | 10 |
* Added --exclude/--include and --dry-run for [del], [setacl]. |
11 | 11 |
* Neutralise characters that are invalid in XML to avoid ExpatErrors. |
12 | 12 |
http://boodebr.org/main/python/all-about-python-and-unicode |
13 |
+* New command [fixbucket] for fixing invalid object names |
|
14 |
+ in a given Bucket. For instance names with &#x08; in them |
|
15 |
+ (not sure how people manage to upload them but they do). |
|
13 | 16 |
|
14 | 17 |
s3cmd 0.9.9 - 2009-02-17 |
15 | 18 |
=========== |
... | ... |
@@ -174,26 +174,29 @@ class S3(object): |
174 | 174 |
return getListFromXml(data, "CommonPrefixes") |
175 | 175 |
|
176 | 176 |
uri_params = {} |
177 |
- if prefix: |
|
178 |
- uri_params['prefix'] = self.urlencode_string(prefix) |
|
179 |
- if not self.config.recursive and not recursive: |
|
180 |
- uri_params['delimiter'] = "/" |
|
181 |
- request = self.create_request("BUCKET_LIST", bucket = bucket, **uri_params) |
|
182 |
- response = self.send_request(request) |
|
183 |
- #debug(response) |
|
177 |
+ response = self.bucket_list_noparse(bucket, prefix, recursive, uri_params) |
|
184 | 178 |
list = _get_contents(response["data"]) |
185 | 179 |
prefixes = _get_common_prefixes(response["data"]) |
186 | 180 |
while _list_truncated(response["data"]): |
187 | 181 |
uri_params['marker'] = self.urlencode_string(list[-1]["Key"]) |
188 | 182 |
debug("Listing continues after '%s'" % uri_params['marker']) |
189 |
- request = self.create_request("BUCKET_LIST", bucket = bucket, **uri_params) |
|
190 |
- response = self.send_request(request) |
|
183 |
+ response = self.bucket_list_noparse(bucket, prefix, recursive, uri_params) |
|
191 | 184 |
list += _get_contents(response["data"]) |
192 | 185 |
prefixes += _get_common_prefixes(response["data"]) |
193 | 186 |
response['list'] = list |
194 | 187 |
response['common_prefixes'] = prefixes |
195 | 188 |
return response |
196 | 189 |
|
190 |
+ def bucket_list_noparse(self, bucket, prefix = None, recursive = None, uri_params = {}): |
|
191 |
+ if prefix: |
|
192 |
+ uri_params['prefix'] = self.urlencode_string(prefix) |
|
193 |
+ if not self.config.recursive and not recursive: |
|
194 |
+ uri_params['delimiter'] = "/" |
|
195 |
+ request = self.create_request("BUCKET_LIST", bucket = bucket, **uri_params) |
|
196 |
+ response = self.send_request(request) |
|
197 |
+ #debug(response) |
|
198 |
+ return response |
|
199 |
+ |
|
197 | 200 |
def bucket_create(self, bucket, bucket_location = None): |
198 | 201 |
headers = SortedDict(ignore_case = True) |
199 | 202 |
body = "" |
... | ... |
@@ -320,11 +323,14 @@ class S3(object): |
320 | 320 |
return response |
321 | 321 |
|
322 | 322 |
## Low level methods |
323 |
- def urlencode_string(self, string): |
|
323 |
+ def urlencode_string(self, string, urlencoding_mode = None): |
|
324 | 324 |
if type(string) == unicode: |
325 | 325 |
string = string.encode("utf-8") |
326 | 326 |
|
327 |
- if self.config.verbatim: |
|
327 |
+ if urlencoding_mode is None: |
|
328 |
+ urlencoding_mode = self.config.urlencoding_mode |
|
329 |
+ |
|
330 |
+ if urlencoding_mode == "verbatim": |
|
328 | 331 |
## Don't do any pre-processing |
329 | 332 |
return string |
330 | 333 |
|
... | ... |
@@ -345,9 +351,12 @@ class S3(object): |
345 | 345 |
# [hope that sounds reassuring ;-)] |
346 | 346 |
o = ord(c) |
347 | 347 |
if (o < 0x20 or o == 0x7f): |
348 |
- error(u"Non-printable character 0x%02x in: %s" % (o, string)) |
|
349 |
- error(u"Please report it to s3tools-bugs@lists.sourceforge.net") |
|
350 |
- encoded += replace_nonprintables(c) |
|
348 |
+ if urlencoding_mode == "fixbucket": |
|
349 |
+ encoded += "%%%02X" % o |
|
350 |
+ else: |
|
351 |
+ error(u"Non-printable character 0x%02x in: %s" % (o, string)) |
|
352 |
+ error(u"Please report it to s3tools-bugs@lists.sourceforge.net") |
|
353 |
+ encoded += replace_nonprintables(c) |
|
351 | 354 |
elif (o == 0x20 or # Space and below |
352 | 355 |
o == 0x22 or # " |
353 | 356 |
o == 0x23 or # # |
... | ... |
@@ -21,11 +21,13 @@ import errno |
21 | 21 |
from logging import debug, info, warning, error |
22 | 22 |
|
23 | 23 |
import Config |
24 |
+import Exceptions |
|
24 | 25 |
|
25 | 26 |
try: |
26 | 27 |
import xml.etree.ElementTree as ET |
27 | 28 |
except ImportError: |
28 | 29 |
import elementtree.ElementTree as ET |
30 |
+from xml.parsers.expat import ExpatError |
|
29 | 31 |
|
30 | 32 |
def parseNodes(nodes): |
31 | 33 |
## WARNING: Ignores text nodes from mixed xml/text. |
... | ... |
@@ -57,10 +59,14 @@ def stripNameSpace(xml): |
57 | 57 |
|
58 | 58 |
def getTreeFromXml(xml): |
59 | 59 |
xml, xmlns = stripNameSpace(xml) |
60 |
- tree = ET.fromstring(xml) |
|
61 |
- if xmlns: |
|
62 |
- tree.attrib['xmlns'] = xmlns |
|
63 |
- return tree |
|
60 |
+ try: |
|
61 |
+ tree = ET.fromstring(xml) |
|
62 |
+ if xmlns: |
|
63 |
+ tree.attrib['xmlns'] = xmlns |
|
64 |
+ return tree |
|
65 |
+ except ExpatError, e: |
|
66 |
+ error(e) |
|
67 |
+ raise Exceptions.ParameterError("Bucket contains invalid filenames. Please run: s3cmd fixbucket s3://your-bucket/") |
|
64 | 68 |
|
65 | 69 |
def getListFromXml(xml, node): |
66 | 70 |
tree = getTreeFromXml(xml) |
... | ... |
@@ -275,7 +281,7 @@ def replace_nonprintables(string): |
275 | 275 |
modified += 1 |
276 | 276 |
else: |
277 | 277 |
new_string += c |
278 |
- if modified: |
|
278 |
+ if modified and Config.Config().urlencoding_mode != "fixbucket": |
|
279 | 279 |
warning("%d non-printable characters replaced in: %s" % (modified, new_string)) |
280 | 280 |
return new_string |
281 | 281 |
|
... | ... |
@@ -21,6 +21,7 @@ import traceback |
21 | 21 |
import codecs |
22 | 22 |
import locale |
23 | 23 |
import subprocess |
24 |
+import htmlentitydefs |
|
24 | 25 |
|
25 | 26 |
from copy import copy |
26 | 27 |
from optparse import OptionParser, Option, OptionValueError, IndentedHelpFormatter |
... | ... |
@@ -638,7 +639,7 @@ def _get_filelist_local(local_uri): |
638 | 638 |
## for now skip over |
639 | 639 |
continue |
640 | 640 |
relative_file = unicodise(os.path.join(rel_root, f)) |
641 |
- if not cfg.verbatim: |
|
641 |
+ if cfg.urlencoding_mode == "normal": |
|
642 | 642 |
relative_file = replace_nonprintables(relative_file) |
643 | 643 |
if relative_file.startswith('./'): |
644 | 644 |
relative_file = relative_file[2:] |
... | ... |
@@ -1117,6 +1118,71 @@ def cmd_sign(args): |
1117 | 1117 |
signature = Utils.sign_string(string_to_sign) |
1118 | 1118 |
output("Signature: %s" % signature) |
1119 | 1119 |
|
1120 |
+def cmd_fixbucket(args): |
|
1121 |
+ def _unescape(text): |
|
1122 |
+ ## |
|
1123 |
+ # Removes HTML or XML character references and entities from a text string. |
|
1124 |
+ # |
|
1125 |
+ # @param text The HTML (or XML) source text. |
|
1126 |
+ # @return The plain text, as a Unicode string, if necessary. |
|
1127 |
+ # |
|
1128 |
+ # From: http://effbot.org/zone/re-sub.htm#unescape-html |
|
1129 |
+ def _unescape_fixup(m): |
|
1130 |
+ text = m.group(0) |
|
1131 |
+ if not htmlentitydefs.name2codepoint.has_key('apos'): |
|
1132 |
+ htmlentitydefs.name2codepoint['apos'] = ord("'") |
|
1133 |
+ if text[:2] == "&#": |
|
1134 |
+ # character reference |
|
1135 |
+ try: |
|
1136 |
+ if text[:3] == "&#x": |
|
1137 |
+ return unichr(int(text[3:-1], 16)) |
|
1138 |
+ else: |
|
1139 |
+ return unichr(int(text[2:-1])) |
|
1140 |
+ except ValueError: |
|
1141 |
+ pass |
|
1142 |
+ else: |
|
1143 |
+ # named entity |
|
1144 |
+ try: |
|
1145 |
+ text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) |
|
1146 |
+ except KeyError: |
|
1147 |
+ pass |
|
1148 |
+ return text # leave as is |
|
1149 |
+ return re.sub("&#?\w+;", _unescape_fixup, text) |
|
1150 |
+ |
|
1151 |
+ cfg.urlencoding_mode = "fixbucket" |
|
1152 |
+ s3 = S3(cfg) |
|
1153 |
+ |
|
1154 |
+ count = 0 |
|
1155 |
+ for arg in args: |
|
1156 |
+ culprit = S3Uri(arg) |
|
1157 |
+ if culprit.type != "s3": |
|
1158 |
+ raise ParameterError("Expecting S3Uri instead of: %s" % arg) |
|
1159 |
+ response = s3.bucket_list_noparse(culprit.bucket(), culprit.object(), recursive = True) |
|
1160 |
+ r_xent = re.compile("&#x[\da-fA-F]+;") |
|
1161 |
+ keys = re.findall("<Key>(.*?)</Key>", response['data'], re.MULTILINE) |
|
1162 |
+ debug("Keys: %r" % keys) |
|
1163 |
+ for key in keys: |
|
1164 |
+ if r_xent.search(key): |
|
1165 |
+ info("Fixing: %s" % key) |
|
1166 |
+ debug("Step 1: Transforming %s" % key) |
|
1167 |
+ key_bin = _unescape(key) |
|
1168 |
+ debug("Step 2: ... to %s" % key_bin) |
|
1169 |
+ key_new = replace_nonprintables(key_bin) |
|
1170 |
+ debug("Step 3: ... then to %s" % key_new) |
|
1171 |
+ src = S3Uri("s3://%s/%s" % (culprit.bucket(), key_bin)) |
|
1172 |
+ dst = S3Uri("s3://%s/%s" % (culprit.bucket(), key_new)) |
|
1173 |
+ resp_move = s3.object_move(src, dst) |
|
1174 |
+ if resp_move['status'] == 200: |
|
1175 |
+ output("File %r renamed to %s" % (key_bin, key_new)) |
|
1176 |
+ count += 1 |
|
1177 |
+ else: |
|
1178 |
+ error("Something went wrong for: %r" % key) |
|
1179 |
+ error("Please report the problem to s3tools-bugs@lists.sourceforge.net") |
|
1180 |
+ if count > 0: |
|
1181 |
+ warning("Fixed %d files' names. Their ACL were reset to Private." % count) |
|
1182 |
+ warning("Use 's3cmd setacl --acl-public s3://...' to make") |
|
1183 |
+ warning("them publicly readable if required.") |
|
1184 |
+ |
|
1120 | 1185 |
def resolve_list(lst, args): |
1121 | 1186 |
retval = [] |
1122 | 1187 |
for item in lst: |
... | ... |
@@ -1351,6 +1417,7 @@ def get_commands_list(): |
1351 | 1351 |
{"cmd":"mv", "label":"Move object", "param":"s3://BUCKET1/OBJECT1 s3://BUCKET2[/OBJECT2]", "func":cmd_mv, "argc":2}, |
1352 | 1352 |
{"cmd":"setacl", "label":"Modify Access control list for Bucket or Files", "param":"s3://BUCKET[/OBJECT]", "func":cmd_setacl, "argc":1}, |
1353 | 1353 |
{"cmd":"sign", "label":"Sign arbitrary string using the secret key", "param":"STRING-TO-SIGN", "func":cmd_sign, "argc":1}, |
1354 |
+ {"cmd":"fixbucket", "label":"Fix invalid file names in a bucket", "param":"s3://BUCKET[/PREFIX]", "func":cmd_fixbucket, "argc":1}, |
|
1354 | 1355 |
|
1355 | 1356 |
## CloudFront commands |
1356 | 1357 |
{"cmd":"cflist", "label":"List CloudFront distribution points", "param":"", "func":CfCmd.info, "argc":0}, |
... | ... |
@@ -1445,7 +1512,7 @@ def main(): |
1445 | 1445 |
optparser.add_option( "--add-header", dest="add_header", action="append", metavar="NAME:VALUE", help="Add a given HTTP header to the upload request. Can be used multiple times. For instance set 'Expires' or 'Cache-Control' headers (or both) using this options if you like.") |
1446 | 1446 |
|
1447 | 1447 |
optparser.add_option( "--encoding", dest="encoding", metavar="ENCODING", help="Override autodetected terminal and filesystem encoding (character set). Autodetected: %s" % preferred_encoding) |
1448 |
- optparser.add_option( "--verbatim", dest="verbatim", action="store_true", help="Use the S3 name as given on the command line. No pre-processing, encoding, etc. Use with caution!") |
|
1448 |
+ optparser.add_option( "--verbatim", dest="urlencoding_mode", action="store_const", const="verbatim", help="Use the S3 name as given on the command line. No pre-processing, encoding, etc. Use with caution!") |
|
1449 | 1449 |
|
1450 | 1450 |
optparser.add_option( "--list-md5", dest="list_md5", action="store_true", help="Include MD5 sums in bucket listings (only for 'ls' command).") |
1451 | 1451 |
optparser.add_option("-H", "--human-readable-sizes", dest="human_readable_sizes", action="store_true", help="Print sizes in human readable form (eg 1kB instead of 1234).") |