- Converted to non-threaded upload again
(will add threading for all uploads, not only multipart, later on)
- Using S3.send_file() instead of S3.send_request()
- Don't read data in the main loop; only compute the offset and chunk size
  and leave it to S3.send_file() to read the data (see the sketch below).
- Re-enabled progress indicator.
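
A minimal sketch of that per-part arithmetic (Python 2, as in the codebase;
file_size and chunk_size stand in for the values the patch takes from
os.stat() and Config.multipart_chunk_size_mb):

    nr_parts = file_size / chunk_size + (file_size % chunk_size and 1)  # integer division
    for id in range(1, nr_parts + 1):
        offset = chunk_size * (id - 1)                            # where this part starts
        current_chunk_size = min(file_size - offset, chunk_size)  # last part may be short
        # S3.send_file() then seeks to offset and reads exactly
        # current_chunk_size bytes; nothing is read in this loop.
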
Still broken:
- "s3cmd sync" doesn't work with multipart uploaded files because
the ETag no longer contains MD5sum of the file. MAJOR!
- Multipart upload abort is not triggered with all failures.
- s3cmd commands "mplist" and "mpabort" to be added.
- s3cmd should resume failed multipart uploads.
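
For context on the sync breakage: the ETag S3 reports for a multipart
object appears to be derived from the per-part digests rather than from
the whole file's MD5 (observed behaviour, not a documented guarantee),
roughly as below; part_md5s is a hypothetical list of md5 objects, one
per uploaded part:

    from hashlib import md5
    binary_digests = "".join([m.digest() for m in part_md5s])
    multipart_etag = "%s-%d" % (md5(binary_digests).hexdigest(), len(part_md5s))
    # ...which a plain MD5 sum of the local file can never match.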
@@ -2,53 +2,16 @@
 ## Author: Jerome Leclanche <jerome.leclanche@gmail.com>
 ## License: GPL Version 2

-from Queue import Queue
-from threading import Thread
+import os
+from stat import ST_SIZE
 from logging import debug, info, warning, error
-from Utils import getTextFromXml
-
-class Worker(Thread):
-    """
-    Thread executing tasks from a given tasks queue
-    """
-    def __init__(self, tasks):
-        super(Worker, self).__init__()
-        self.tasks = tasks
-        self.daemon = True
-        self.start()
-
-    def run(self):
-        while True:
-            func, args, kargs = self.tasks.get()
-            func(*args, **kargs)
-            self.tasks.task_done()
-
-class ThreadPool(object):
-    """
-    Pool of threads consuming tasks from a queue
-    """
-    def __init__(self, num_threads):
-        self.tasks = Queue(num_threads)
-        for _ in range(num_threads):
-            Worker(self.tasks)
-
-    def add_task(self, func, *args, **kargs):
-        """
-        Add a task to the queue
-        """
-        self.tasks.put((func, args, kargs))
-
-    def wait_completion(self):
-        """
-        Wait for completion of all the tasks in the queue
-        """
-        self.tasks.join()
+from Utils import getTextFromXml, formatSize, unicodise
+from Exceptions import S3UploadError

 class MultiPartUpload(object):

     MIN_CHUNK_SIZE_MB = 5 # 5MB
     MAX_CHUNK_SIZE_MB = 5120 # 5GB
-    MAX_CHUNKS = 100
     MAX_FILE_SIZE = 42949672960 # 5TB

     def __init__(self, s3, file, uri):
@@ -66,11 +29,10 @@ class MultiPartUpload(object):
         request = self.s3.create_request("OBJECT_POST", uri = self.uri, extra = "?uploads")
         response = self.s3.send_request(request)
         data = response["data"]
-        s3, key, upload_id = getTextFromXml(data, "Bucket"), getTextFromXml(data, "Key"), getTextFromXml(data, "UploadId")
-        self.upload_id = upload_id
-        return s3, key, upload_id
+        self.upload_id = getTextFromXml(data, "UploadId")
+        return self.upload_id

-    def upload_all_parts(self, num_threads, chunk_size):
+    def upload_all_parts(self):
         """
         Execute a full multipart upload on a file
         Returns the id/etag dict
@@ -79,50 +41,52 @@ class MultiPartUpload(object):
         if not self.upload_id:
             raise RuntimeError("Attempting to use a multipart upload that has not been initiated.")

+        size_left = file_size = os.stat(self.file.name)[ST_SIZE]
+        self.chunk_size = self.s3.config.multipart_chunk_size_mb * 1024 * 1024
+        nr_parts = file_size / self.chunk_size + (file_size % self.chunk_size and 1)
+        debug("MultiPart: Uploading %s in %d parts" % (self.file.name, nr_parts))
+
         id = 1
-        if num_threads > 1:
-            debug("MultiPart: Uploading in %d threads" % num_threads)
-            pool = ThreadPool(num_threads)
-        else:
-            debug("MultiPart: Uploading in a single thread")
-
-        while True:
-            if id == self.MAX_CHUNKS:
-                data = self.file.read(-1)
-            else:
-                data = self.file.read(chunk_size)
-            if not data:
-                break
-            if num_threads > 1:
-                pool.add_task(self.upload_part, data, id)
-            else:
-                self.upload_part(data, id)
+        while size_left > 0:
+            offset = self.chunk_size * (id - 1)
+            current_chunk_size = min(file_size - offset, self.chunk_size)
+            size_left -= current_chunk_size
+            labels = {
+                'source' : unicodise(self.file.name),
+                'destination' : unicodise(self.uri.uri()),
+                'extra' : "[part %d of %d, %s]" % (id, nr_parts, "%d%sB" % formatSize(current_chunk_size, human_readable = True))
+            }
+            try:
+                self.upload_part(id, offset, current_chunk_size, labels)
+            except S3UploadError, e:
+                error(u"Upload of '%s' part %d failed too many times. Aborting multipart upload." % (self.file.name, id))
+                self.abort_upload()
+                raise e
             id += 1

-        if num_threads > 1:
-            debug("Thread pool with %i threads and %i tasks awaiting completion." % (num_threads, id))
-            pool.wait_completion()
+        debug("MultiPart: Upload finished: %d parts", id - 1)

-    def upload_part(self, data, id):
+    def upload_part(self, id, offset, chunk_size, labels):
         """
         Upload a file chunk
         http://docs.amazonwebservices.com/AmazonS3/latest/API/index.html?mpUploadUploadPart.html
         """
         # TODO implement Content-MD5
-        content_length = str(len(data))
-        debug("Uploading part %i of %r (%s bytes)" % (id, self.upload_id, content_length))
-        headers = { "content-length": content_length }
+        debug("Uploading part %i of %r (%s bytes)" % (id, self.upload_id, chunk_size))
+        headers = { "content-length": chunk_size }
         query_string = "?partNumber=%i&uploadId=%s" % (id, self.upload_id)
         request = self.s3.create_request("OBJECT_PUT", uri = self.uri, headers = headers, extra = query_string)
-        response = self.s3.send_request(request, body = data)
-
+        response = self.s3.send_file(request, self.file, labels, offset = offset, chunk_size = chunk_size)
         self.parts[id] = response["headers"]["etag"]
+        return response

     def complete_multipart_upload(self):
         """
         Finish a multipart upload
         http://docs.amazonwebservices.com/AmazonS3/latest/API/index.html?mpUploadComplete.html
         """
+        debug("MultiPart: Completing upload: %s" % self.upload_id)
+
         parts_xml = []
         part_xml = "<Part><PartNumber>%i</PartNumber><ETag>%s</ETag></Part>"
         for id, etag in self.parts.items():
@@ -135,4 +99,14 @@ class MultiPartUpload(object):

         return response

+    def abort_upload(self):
+        """
+        Abort multipart upload
+        http://docs.amazonwebservices.com/AmazonS3/latest/API/index.html?mpUploadAbort.html
+        """
+        debug("MultiPart: Aborting upload: %s" % self.upload_id)
+        request = self.s3.create_request("OBJECT_DELETE", uri = self.uri, extra = "?uploadId=%s" % (self.upload_id))
+        response = self.s3.send_request(request)
+        return response
+
 # vim:et:ts=4:sts=4:ai
@@ -616,7 +616,7 @@ class S3(object):

         return response

-    def send_file(self, request, file, labels, throttle = 0, retries = _max_retries):
+    def send_file(self, request, file, labels, throttle = 0, retries = _max_retries, offset = 0, chunk_size = -1):
         method_string, resource, headers = request.get_triplet()
         size_left = size_total = headers.get("content-length")
         if self.config.progress_meter:
@@ -639,15 +639,15 @@ class S3(object):
                 warning("Waiting %d sec..." % self._fail_wait(retries))
                 time.sleep(self._fail_wait(retries))
                 # Connection error -> same throttle value
-                return self.send_file(request, file, labels, throttle, retries - 1)
+                return self.send_file(request, file, labels, throttle, retries - 1, offset, chunk_size)
             else:
                 raise S3UploadError("Upload failed for: %s" % resource['uri'])
-        file.seek(0)
+        file.seek(offset)
         md5_hash = md5()
         try:
             while (size_left > 0):
                 #debug("SendFile: Reading up to %d bytes from '%s'" % (self.config.send_chunk, file.name))
-                data = file.read(self.config.send_chunk)
+                data = file.read(min(self.config.send_chunk, size_left))
                 md5_hash.update(data)
                 conn.send(data)
                 if self.config.progress_meter:
@@ -676,7 +676,7 @@ class S3(object):
                 warning("Waiting %d sec..." % self._fail_wait(retries))
                 time.sleep(self._fail_wait(retries))
                 # Connection error -> same throttle value
-                return self.send_file(request, file, labels, throttle, retries - 1)
+                return self.send_file(request, file, labels, throttle, retries - 1, offset, chunk_size)
             else:
                 debug("Giving up on '%s' %s" % (file.name, e))
                 raise S3UploadError("Upload failed for: %s" % resource['uri'])
@@ -698,7 +698,7 @@ class S3(object):
            redir_hostname = getTextFromXml(response['data'], ".//Endpoint")
            self.set_hostname(redir_bucket, redir_hostname)
            warning("Redirected to: %s" % (redir_hostname))
-           return self.send_file(request, file, labels)
+           return self.send_file(request, file, labels, offset = offset, chunk_size = chunk_size)

        # S3 from time to time doesn't send ETag back in a response :-(
        # Force re-upload here.
@@ -721,7 +721,7 @@ class S3(object):
                warning("Upload failed: %s (%s)" % (resource['uri'], S3Error(response)))
                warning("Waiting %d sec..." % self._fail_wait(retries))
                time.sleep(self._fail_wait(retries))
-               return self.send_file(request, file, labels, throttle, retries - 1)
+               return self.send_file(request, file, labels, throttle, retries - 1, offset, chunk_size)
            else:
                warning("Too many failures. Giving up on '%s'" % (file.name))
                raise S3UploadError
@@ -734,7 +734,7 @@ class S3(object):
            warning("MD5 Sums don't match!")
            if retries:
                warning("Retrying upload of %s" % (file.name))
-               return self.send_file(request, file, labels, throttle, retries - 1)
+               return self.send_file(request, file, labels, throttle, retries - 1, offset, chunk_size)
            else:
                warning("Too many failures. Giving up on '%s'" % (file.name))
                raise S3UploadError
@@ -743,13 +743,12 @@ class S3(object):

     def send_file_multipart(self, file, headers, uri, size):
         upload = MultiPartUpload(self, file, uri)
-        bucket, key, upload_id = upload.initiate_multipart_upload()
+        upload_id = upload.initiate_multipart_upload()

         num_threads = self.config.multipart_num_threads
         chunk_size = self.config.multipart_chunk_size_mb * 1024 * 1024

-        file.seek(0)
-        upload.upload_all_parts(num_threads, chunk_size)
+        upload.upload_all_parts()
         response = upload.complete_multipart_upload()
         response["speed"] = 0 # XXX
         return response