Browse code

MultipartCopy - Preserve metadata of the object that is copied

AWS S3 does not preserve the source object's metadata when a copy is done
through a multipart upload copy, unlike a simple single-request copy.

Sadly, for a multipart copy, this forces us to issue an additional
object_info request, because we need the original headers before
initiating the MultipartUpload operation.

This patch could still be improved.
Much earlier in the code, we might have already issued an object_info
request on the source object, but we don't keep the object headers
to re-use them. That could probably be changed to avoid the redundant
extra object_info requests.

Florent Viard authored on 2020/04/14 11:18:17
Showing 1 changed files
... ...
@@ -809,6 +809,9 @@ class S3(object):
809 809
             'server',
810 810
             'x-amz-id-2',
811 811
             'x-amz-request-id',
812
+            # Other headers that are not copying by a direct copy
813
+            'x-amz-storage-class',
814
+            ## We should probably also add server-side encryption headers
812 815
         ]
813 816
 
814 817
         for h in to_remove + self.config.remove_headers:
... ...
@@ -831,7 +834,33 @@ class S3(object):
831 831
                 if exc.status != 501:
832 832
                     raise exc
833 833
                 acl = None
834
-        headers = SortedDict(ignore_case=True)
834
+
835
+        multipart = False
836
+        if self.config.enable_multipart:
837
+            # Get size of remote source only if multipart is enabled and that no
838
+            # size info was provided
839
+            src_headers = None
840
+            if src_size is None:
841
+                src_info = self.object_info(src_uri)
842
+                src_headers = src_info['headers']
843
+                src_size = int(src_headers["content-length"])
844
+
845
+            if src_size > self.config.multipart_copy_chunk_size_mb * SIZE_1MB:
846
+                # Sadly, s3 is badly done as metadata will not be copied in
847
+                # multipart copy unlike what is done in the case of direct
848
+                # copy.
849
+                # TODO: Optimize by re-using the object_info request done
850
+                # earlier earlier at fetch remote stage, and preserve headers.
851
+                if src_headers is None:
852
+                    src_info = self.object_info(src_uri)
853
+                    src_headers = src_info['headers']
854
+                    src_size = int(src_headers["content-length"])
855
+                self._sanitize_headers(src_headers)
856
+                headers = SortedDict(src_headers, ignore_case=True)
857
+                multipart = True
858
+
859
+        if not multipart:
860
+            headers = SortedDict(ignore_case=True)
835 861
 
836 862
         if self.config.acl_public:
837 863
             headers["x-amz-acl"] = "public-read"
... ...
@@ -853,17 +882,10 @@ class S3(object):
853 853
 
854 854
         headers['x-amz-metadata-directive'] = "COPY"
855 855
 
856
-        # Get size of remote source only if multipart is enabled and that no
857
-        # size info was provided
858
-        if src_size is None and self.config.enable_multipart:
859
-            src_info = self.object_info(src_uri)
860
-            src_size = int(src_info["headers"]["content-length"])
861
-
862
-        # Multipart decision. Only do multipart copy for remote s3 files
863
-        # bigger than the multipart copy threshlod.
856
+        if multipart:
857
+            # Multipart decision. Only do multipart copy for remote s3 files
858
+            # bigger than the multipart copy threshlod.
864 859
 
865
-        if self.config.enable_multipart and \
866
-           src_size > self.config.multipart_copy_chunk_size_mb * SIZE_1MB:
867 860
             # Multipart requests are quite different... delegate
868 861
             response = self.copy_file_multipart(src_uri, dst_uri, src_size,
869 862
                                                 headers, extra_label)