AWS S3 does not preserve metadata when an object is copied using
MultipartUploadCopy, unlike when a simple, single-request copy is done.
Sadly, in the case of the multipart copy, that forces us to issue an
additional object_info request as we need to have the original headers
before initiating the MultipartUpload operation.
This patch could still be improved.
Much earlier in the code, we might have already issued an object_info
request on the source object, but sadly we don't keep the object headers
to re-use them. That could probably be changed to avoid the redundant
extra object_info requests.
... | ... |
@@ -809,6 +809,9 @@ class S3(object): |
809 | 809 |
'server', |
810 | 810 |
'x-amz-id-2', |
811 | 811 |
'x-amz-request-id', |
812 |
+ # Other headers that are not copying by a direct copy |
|
813 |
+ 'x-amz-storage-class', |
|
814 |
+ ## We should probably also add server-side encryption headers |
|
812 | 815 |
] |
813 | 816 |
|
814 | 817 |
for h in to_remove + self.config.remove_headers: |
... | ... |
@@ -831,7 +834,33 @@ class S3(object): |
831 | 831 |
if exc.status != 501: |
832 | 832 |
raise exc |
833 | 833 |
acl = None |
834 |
- headers = SortedDict(ignore_case=True) |
|
834 |
+ |
|
835 |
+ multipart = False |
|
836 |
+ if self.config.enable_multipart: |
|
837 |
+ # Get size of remote source only if multipart is enabled and that no |
|
838 |
+ # size info was provided |
|
839 |
+ src_headers = None |
|
840 |
+ if src_size is None: |
|
841 |
+ src_info = self.object_info(src_uri) |
|
842 |
+ src_headers = src_info['headers'] |
|
843 |
+ src_size = int(src_headers["content-length"]) |
|
844 |
+ |
|
845 |
+ if src_size > self.config.multipart_copy_chunk_size_mb * SIZE_1MB: |
|
846 |
+ # Sadly, s3 is badly done as metadata will not be copied in |
|
847 |
+ # multipart copy unlike what is done in the case of direct |
|
848 |
+ # copy. |
|
849 |
+ # TODO: Optimize by re-using the object_info request done |
|
850 |
+ # earlier earlier at fetch remote stage, and preserve headers. |
|
851 |
+ if src_headers is None: |
|
852 |
+ src_info = self.object_info(src_uri) |
|
853 |
+ src_headers = src_info['headers'] |
|
854 |
+ src_size = int(src_headers["content-length"]) |
|
855 |
+ self._sanitize_headers(src_headers) |
|
856 |
+ headers = SortedDict(src_headers, ignore_case=True) |
|
857 |
+ multipart = True |
|
858 |
+ |
|
859 |
+ if not multipart: |
|
860 |
+ headers = SortedDict(ignore_case=True) |
|
835 | 861 |
|
836 | 862 |
if self.config.acl_public: |
837 | 863 |
headers["x-amz-acl"] = "public-read" |
... | ... |
@@ -853,17 +882,10 @@ class S3(object): |
853 | 853 |
|
854 | 854 |
headers['x-amz-metadata-directive'] = "COPY" |
855 | 855 |
|
856 |
- # Get size of remote source only if multipart is enabled and that no |
|
857 |
- # size info was provided |
|
858 |
- if src_size is None and self.config.enable_multipart: |
|
859 |
- src_info = self.object_info(src_uri) |
|
860 |
- src_size = int(src_info["headers"]["content-length"]) |
|
861 |
- |
|
862 |
- # Multipart decision. Only do multipart copy for remote s3 files |
|
863 |
- # bigger than the multipart copy threshlod. |
|
856 |
+ if multipart: |
|
857 |
+ # Multipart decision. Only do multipart copy for remote s3 files |
|
858 |
+ # bigger than the multipart copy threshlod. |
|
864 | 859 |
|
865 |
- if self.config.enable_multipart and \ |
|
866 |
- src_size > self.config.multipart_copy_chunk_size_mb * SIZE_1MB: |
|
867 | 860 |
# Multipart requests are quite different... delegate |
868 | 861 |
response = self.copy_file_multipart(src_uri, dst_uri, src_size, |
869 | 862 |
headers, extra_label) |