GitList

Browse code

Fixes #346 - Add support for listing and resuming multipart uploads of more than 1000 parts

Add server pagination handling support to:
"get_multipart" (list incomplete multipart files)
"list_multipart" (list parts of an incomplete multipart file)

This makes it possible to list and resume multipart uploads that have more
than 1000 parts.

Florent Viard authored on 2020/06/22 10:54:16
Showing 3 changed files

S3/MultiPart.py index 81554e6..c3282ed 100644
S3/S3.py index b40fd72..24a5492 100644
s3cmd index d8c59df..7b1bfa6 100755

S3/MultiPart.py

History View file @ cefa4b6

@@ -48,11 +48,10 @@ class MultiPartUpload(object):
                              self.upload_id = self.initiate_multipart_upload()
                          def get_parts_information(self, uri, upload_id):
                     -        multipart_response = self.s3.list_multipart(uri, upload_id)
                     -        tree = getTreeFromXml(multipart_response['data'])
                     +        part_list = self.s3.list_multipart(uri, upload_id)
                              parts = dict()
                     -        for elem in parseNodes(tree):
                     +        for elem in part_list:
                                  try:
                                      parts[int(elem['PartNumber'])] = {
                                          'checksum': elem['ETag'],
@@ -65,9 +64,8 @@ class MultiPartUpload(object):
                          def get_unique_upload_id(self, uri):
                              upload_id = ""
                     -        multipart_response = self.s3.get_multipart(uri)
                     -        tree = getTreeFromXml(multipart_response['data'])
                     -        for mpupload in parseNodes(tree):
                     +        multipart_list = self.s3.get_multipart(uri)
                     +        for mpupload in multipart_list:
                                  try:
                                      mp_upload_id = mpupload['UploadId']
                                      mp_path = mpupload['Key']

S3/S3.py

History View file @ cefa4b6

@@ -1135,9 +1135,131 @@ class S3(object):
                              response = self.send_request(request)
                              return response
                     -    def get_multipart(self, uri):
                     -        request = self.create_request("BUCKET_LIST", bucket = uri.bucket(),
                     -                                      uri_params = {'uploads': None})
                     +    def get_multipart(self, uri, uri_params=None, limit=-1):
                     +        upload_list = []
                     +        for truncated, uploads in self.get_multipart_streaming(uri,
                     +                                                               uri_params,
                     +                                                               limit):
                     +            upload_list.extend(uploads)
+                    +
                     +        return upload_list
+                    +
                     +    def get_multipart_streaming(self, uri, uri_params=None, limit=-1):
                     +        uri_params = uri_params and uri_params.copy() or {}
                     +        bucket = uri.bucket()
+                    +
                     +        truncated = True
                     +        num_objects = 0
                     +        max_keys = limit
+                    +
                     +        # It is the "uploads: None" in uri_params that will change the
                     +        # behavior of bucket_list to return multiparts instead of keys
                     +        uri_params['uploads'] = None
                     +        while truncated:
                     +            response = self.bucket_list_noparse(bucket, recursive=True,
                     +                                                uri_params=uri_params,
                     +                                                max_keys=max_keys)
+                    +
                     +            xml_data = response["data"]
                     +            # extract list of info of uploads
                     +            upload_list = getListFromXml(xml_data, "Upload")
                     +            num_objects += len(upload_list)
                     +            if limit > num_objects:
                     +                max_keys = limit - num_objects
+                    +
                     +            xml_truncated = getTextFromXml(xml_data, ".//IsTruncated")
                     +            if not xml_truncated or xml_truncated.lower() == "false":
                     +                truncated = False
+                    +
                     +            if truncated:
                     +                if limit == -1 or num_objects < limit:
                     +                    if upload_list:
                     +                        next_key = getTextFromXml(xml_data, "NextKeyMarker")
                     +                        if not next_key:
                     +                            next_key = upload_list[-1]["Key"]
                     +                        uri_params['KeyMarker'] = next_key
+                    +
                     +                        upload_id_marker = getTextFromXml(
                     +                            xml_data, "NextUploadIdMarker")
                     +                        if upload_id_marker:
                     +                            uri_params['UploadIdMarker'] = upload_id_marker
                     +                        elif 'UploadIdMarker' in uri_params:
                     +                            # Clear any pre-existing value
                     +                            del uri_params['UploadIdMarker']
                     +                    else:
                     +                        # Unexpectedly, the server lied, and so the previous
                     +                        # response was not truncated. So, no new key to get.
                     +                        yield False, upload_list
                     +                        break
                     +                    debug("Listing continues after '%s'" %
                     +                          uri_params['KeyMarker'])
                     +                else:
                     +                    yield truncated, upload_list
                     +                    break
                     +            yield truncated, upload_list
+                    +
                     +    def list_multipart(self, uri, upload_id, uri_params=None, limit=-1):
                     +        part_list = []
                     +        for truncated, parts in self.list_multipart_streaming(uri,
                     +                                                              upload_id,
                     +                                                              uri_params,
                     +                                                              limit):
                     +            part_list.extend(parts)
+                    +
                     +        return part_list
+                    +
                     +    def list_multipart_streaming(self, uri, upload_id, uri_params=None,
                     +                                 limit=-1):
                     +        uri_params = uri_params and uri_params.copy() or {}
+                    +
                     +        truncated = True
                     +        num_objects = 0
                     +        max_parts = limit
+                    +
                     +        while truncated:
                     +            response = self.list_multipart_noparse(uri, upload_id,
                     +                                                   uri_params, max_parts)
+                    +
                     +            xml_data = response["data"]
                     +            # extract list of multipart upload parts
                     +            part_list = getListFromXml(xml_data, "Part")
                     +            num_objects += len(part_list)
                     +            if limit > num_objects:
                     +                max_parts = limit - num_objects
+                    +
                     +            xml_truncated = getTextFromXml(xml_data, ".//IsTruncated")
                     +            if not xml_truncated or xml_truncated.lower() == "false":
                     +                truncated = False
+                    +
                     +            if truncated:
                     +                if limit == -1 or num_objects < limit:
                     +                    if part_list:
                     +                        next_part_number = getTextFromXml(
                     +                            xml_data, "NextPartNumberMarker")
                     +                        if not next_part_number:
                     +                            next_part_number = part_list[-1]["PartNumber"]
                     +                        uri_params['part-number-marker'] = next_part_number
                     +                    else:
                     +                        # Unexpectedly, the server lied, and so the previous
                     +                        # response was not truncated. So, no new part to get.
                     +                        yield False, part_list
                     +                        break
                     +                    debug("Listing continues after Part '%s'" %
                     +                          uri_params['part-number-marker'])
                     +                else:
                     +                    yield truncated, part_list
                     +                    break
                     +            yield truncated, part_list
+                    +
                     +    def list_multipart_noparse(self, uri, upload_id, uri_params=None,
                     +                               max_parts=-1):
                     +        if uri_params is None:
                     +            uri_params = {}
                     +        if max_parts != -1:
                     +            uri_params['max-parts'] = str(max_parts)
                     +        uri_params['uploadId'] = upload_id
                     +        request = self.create_request("OBJECT_GET", uri=uri,
                     +                                      uri_params=uri_params)
                              response = self.send_request(request)
                              return response
@@ -1147,12 +1269,6 @@ class S3(object):
                              response = self.send_request(request)
                              return response
                     -    def list_multipart(self, uri, id):
                     -        request = self.create_request("OBJECT_GET", uri = uri,
                     -                                      uri_params = {'uploadId': id})
                     -        response = self.send_request(request)
                     -        return response
+                    -
                          def get_accesslog(self, uri):
                              request = self.create_request("BUCKET_LIST", bucket = uri.bucket(),
                                                            uri_params = {'logging': None})

s3cmd

History View file @ cefa4b6

@@ -2120,15 +2120,16 @@ def cmd_multipart(args):
                          #id = ''
                          #if(len(args) > 1): id = args[1]
                     -    response = s3.get_multipart(uri)
                     -    debug(u"response - %s" % response['status'])
                     +    upload_list = s3.get_multipart(uri)
                          output(u"%s" % uri)
                     -    tree = getTreeFromXml(response['data'])
                     -    debug(parseNodes(tree))
                     +    debug(upload_list)
                          output(u"Initiated\tPath\tId")
                     -    for mpupload in parseNodes(tree):
                     +    for mpupload in upload_list:
                              try:
                     -            output(u"%s\t%s\t%s" % (mpupload['Initiated'], "s3://" + uri.bucket() + "/" + mpupload['Key'], mpupload['UploadId']))
                     +            output(u"%s\t%s\t%s" % (
                     +                mpupload['Initiated'],
                     +                "s3://" + uri.bucket() + "/" + mpupload['Key'],
                     +                mpupload['UploadId']))
                              except KeyError:
                                  pass
                          return EX_OK
@@ -2151,14 +2152,15 @@ def cmd_list_multipart(args):
                          uri = S3Uri(args[0])
                          id = args[1]
                     -    response = s3.list_multipart(uri, id)
                     -    debug(u"response - %s" % response['status'])
                     -    tree = getTreeFromXml(response['data'])
                     +    part_list = s3.list_multipart(uri, id)
                          output(u"LastModified\t\t\tPartNumber\tETag\tSize")
                     -    for mpupload in parseNodes(tree):
                     +    for mpupload in part_list:
                              try:
                     -            output(u"%s\t%s\t%s\t%s" % (mpupload['LastModified'], mpupload['PartNumber'], mpupload['ETag'], mpupload['Size']))
                     -        except Exception:
                     +            output(u"%s\t%s\t%s\t%s" % (mpupload['LastModified'],
                     +                                        mpupload['PartNumber'],
                     +                                        mpupload['ETag'],
                     +                                        mpupload['Size']))
                     +        except KeyError:
                                  pass
                          return EX_OK