Browse code

Add a MultiPartCopy class and enable it in object_copy when the remote file is larger than 5 GB.

Damian Martinez authored on 2012/07/17 16:45:39
Showing 3 changed files
... ...
@@ -163,6 +163,9 @@ class Config(object):
163 163
     enable_multipart = True
164 164
     multipart_chunk_size_mb = 15    # MB
165 165
     multipart_max_chunks = 10000    # Maximum chunks on AWS S3, could be different on other S3-compatible APIs
166
+    # Minimum size at which to use multipart remote S3-to-S3 copy with byte ranges: 5 GB
167
+    #multipart_copy_size = (5 * 1024 * 1024 * 1024) - 1
168
+    multipart_copy_size = 5 * 1024 * 1024 * 1024
166 169
     # List of checks to be performed for 'sync'
167 170
     sync_checks = ['size', 'md5']   # 'weak-timestamp'
168 171
     # List of compiled REGEXPs
... ...
@@ -210,4 +210,100 @@ class MultiPartUpload(object):
210 210
         response = None
211 211
         return response
212 212
 
213
+
214
class MultiPartCopy(MultiPartUpload):
    """
    Server-side (remote) S3-to-S3 multipart copy.

    Reuses the MultiPartUpload machinery to initiate/complete/abort the
    multipart transaction, but each part is produced with an
    "x-amz-copy-source" + "x-amz-copy-source-range" header pair instead of
    uploading file data, so no object bytes travel through the client.
    """

    def __init__(self, s3, src_uri, dst_uri, src_size, headers_baseline = None):
        self.s3 = s3
        # MultiPartUpload expects self.file / self.uri; alias them to the
        # source/destination URIs so the inherited initiate/complete/abort
        # requests target the right objects.
        self.file = self.src_uri = src_uri
        self.uri  = self.dst_uri = dst_uri
        self.src_size = src_size
        self.parts = {}
        # No mutable default argument: each instance gets its own dict.
        self.headers_baseline = {} if headers_baseline is None else headers_baseline
        self.upload_id = self.initiate_multipart_copy()

    def initiate_multipart_copy(self):
        # Initiating a copy is identical to initiating an upload (?uploads).
        return self.initiate_multipart_upload()

    def complete_multipart_copy(self):
        # Completing a copy is identical to completing an upload (?uploadId).
        return self.complete_multipart_upload()

    def abort_copy(self):
        return self.abort_upload()

    def copy_all_parts(self):
        """
        Execute a full multipart copy of a remote file, one ranged part at
        a time. Part ETags are collected into self.parts; on any failure
        the multipart transaction is aborted and the exception re-raised.
        """
        if not self.upload_id:
            raise RuntimeError("Attempting to use a multipart copy that has not been initiated.")

        size_left = file_size = self.src_size
        self.chunk_size = self.s3.config.multipart_copy_size
        # Integer part count: one extra part when the size is not an exact
        # multiple of the chunk size ('//' keeps this an int on Python 3,
        # where '/' would yield a float and break the "%d" formatting).
        nr_parts = file_size // self.chunk_size + (1 if file_size % self.chunk_size else 0)
        debug("MultiPart: Copying %s in %d parts" % (self.src_uri, nr_parts))

        seq = 1
        while size_left > 0:
            offset = self.chunk_size * (seq - 1)
            current_chunk_size = min(file_size - offset, self.chunk_size)
            size_left -= current_chunk_size
            labels = {
                'source' : unicodise(self.src_uri.uri()),
                'destination' : unicodise(self.uri.uri()),
                'extra' : "[part %d of %d, %s]" % (seq, nr_parts, "%d%sB" % formatSize(current_chunk_size, human_readable = True))
            }
            try:
                self.copy_part(seq, offset, current_chunk_size, labels)
            except BaseException:
                # Abort on *any* failure (including KeyboardInterrupt) so
                # S3 does not keep storing/charging for orphaned parts,
                # then let the caller see the original exception.
                # TODO: recover from some "retriable" errors?
                error(u"Upload copy of '%s' part %d failed. Aborting multipart upload copy." % (self.src_uri, seq))
                self.abort_copy()
                raise
            seq += 1

        debug("MultiPart: Copy finished: %d parts", seq - 1)

    def copy_part(self, seq, offset, chunk_size, labels):
        """
        Copy one chunk of the remote source object into part 'seq' of the
        destination upload via UploadPartCopy.
        http://docs.amazonwebservices.com/AmazonS3/latest/API/mpUploadUploadPartCopy.html
        Returns the raw S3 response; the part's ETag is stored in self.parts.
        """
        debug("Copying part %i of %r (%s bytes)" % (seq, self.upload_id, chunk_size))

        # UploadPartCopy takes the source object and range as headers, e.g.:
        #    x-amz-copy-source: /source_bucket/sourceObject
        #    x-amz-copy-source-range: bytes=first-last
        # NOTE(review): the object key is not URL-encoded here, unlike the
        # non-multipart object_copy path which uses urlencode_string() --
        # keys containing special characters may fail; verify against that path.
        headers = { "x-amz-copy-source": "/%s/%s" % (self.src_uri.bucket(), self.src_uri.object()) }

        # Include a byte-range header on every part after the first, or when
        # the chunk itself reaches the multipart threshold (file > 5 GB).
        if (seq > 1) or (chunk_size >= self.s3.config.multipart_copy_size):
            # Ranges are inclusive: a 10 byte file has bytes=0-9
            headers["x-amz-copy-source-range"] = "bytes=%d-%d" % (offset, (offset + chunk_size - 1))

        query_string = "?partNumber=%i&uploadId=%s" % (seq, self.upload_id)

        request = self.s3.create_request("OBJECT_PUT", uri = self.uri, headers = headers, extra = query_string)
        response = self.s3.send_request(request)

        # NOTE: Amazon sends whitespace while the copy progresses, which
        # accumulates in the response body and seems to confuse the XML
        # parser; strip newlines before looking for the ETag.
        data = response["data"].replace("\n", '')
        self.parts[seq] = getTextFromXml(data, "ETag")
        # TODO: how to fail if no ETag found ... raise Exception?

        return response
308
+
213 309
 # vim:et:ts=4:sts=4:ai
... ...
@@ -40,7 +40,7 @@ from .ACL import ACL, GranteeLogDelivery
40 40
 from .BidirMap import BidirMap
41 41
 from .Config import Config
42 42
 from .Exceptions import *
43
-from .MultiPart import MultiPartUpload
43
+from .MultiPart import MultiPartUpload, MultiPartCopy
44 44
 from .S3Uri import S3Uri
45 45
 from .ConnMan import ConnMan
46 46
 from .Crypto import (sign_request_v2, sign_request_v4, checksum_sha256_file,
... ...
@@ -826,9 +826,7 @@ class S3(object):
826 826
                     raise exc
827 827
                 acl = None
828 828
         headers = SortedDict(ignore_case = True)
829
-        headers['x-amz-copy-source'] = "/%s/%s" % (src_uri.bucket(),
830
-                                                   urlencode_string(src_uri.object(), unicode_output=True))
831
-        headers['x-amz-metadata-directive'] = "COPY"
829
+
832 830
         if self.config.acl_public:
833 831
             headers["x-amz-acl"] = "public-read"
834 832
 
... ...
@@ -846,6 +844,28 @@ class S3(object):
846 846
         if extra_headers:
847 847
             headers.update(extra_headers)
848 848
 
849
+        ## Multipart decision - only do multipart copy for remote s3 files > 5gb
850
+        multipart = False
851
+        # TODO: does it need new config option for: enable_multipart_copy ?
852
+        if self.config.enable_multipart:
853
+            # get size of remote src only if multipart is enabled
854
+            src_info = self.object_info(src_uri)
855
+            size = int(src_info["headers"]["content-length"])
856
+
857
+            if size > self.config.multipart_copy_size:
858
+                multipart = True
859
+
860
+        if multipart:
861
+            # Multipart requests are quite different... drop here
862
+            return self.copy_file_multipart(src_uri, dst_uri, size, headers)
863
+
864
+        ## Not multipart...
865
+        headers['x-amz-copy-source'] = "/%s/%s" % (
866
+            src_uri.bucket(),
867
+            urlencode_string(src_uri.object(), unicode_output=True)
868
+        )
869
+        headers['x-amz-metadata-directive'] = "COPY"
870
+
849 871
         request = self.create_request("OBJECT_PUT", uri = dst_uri, headers = headers)
850 872
         response = self.send_request(request)
851 873
         if response["data"] and getRootTagName(response["data"]) == "Error":
... ...
@@ -1623,6 +1643,18 @@ class S3(object):
1623 1623
             raise S3UploadError(getTextFromXml(response["data"], 'Message'))
1624 1624
         return response
1625 1625
 
1626
+    def copy_file_multipart(self, src_uri, dst_uri, size, headers):
1627
+        debug("copying multi-part ..." )
1628
+        timestamp_start = time.time()
1629
+        multicopy = MultiPartCopy(self, src_uri, dst_uri, size, headers)
1630
+        multicopy.copy_all_parts()
1631
+        response = multicopy.complete_multipart_copy()
1632
+        timestamp_end = time.time()
1633
+        response["elapsed"] = timestamp_end - timestamp_start
1634
+        response["size"] = size
1635
+        response["speed"] = response["elapsed"] and float(response["size"]) / response["elapsed"] or float(-1)
1636
+        return response
1637
+
1626 1638
     def recv_file(self, request, stream, labels, start_position = 0, retries = _max_retries):
1627 1639
         self.update_region_inner_request(request)
1628 1640