
s3cmd du can gobble gigabytes of RAM on a bucket with millions of objects. Reworked 'du' to traverse the structure and store only the sum at each level, allowing Python to free memory after each folder. Went from 4GB of consumption (and being killed) on a test bucket with ~50M objects in thousands of directories to a maximum of 80MB.

Charlie Schluting authored on 2012/07/11 13:06:30
Showing 1 changed file
@@ -65,18 +65,30 @@ def subcmd_bucket_usage(s3, uri):
 
     if object.endswith('*'):
         object = object[:-1]
-    try:
-        response = s3.bucket_list(bucket, prefix = object, recursive = True)
-    except S3Error, e:
-        if S3.codes.has_key(e.info["Code"]):
-            error(S3.codes[e.info["Code"]] % bucket)
-            return
-        else:
-            raise
+
     bucket_size = 0
-    for object in response["list"]:
-        size, size_coeff = formatSize(object["Size"], False)
-        bucket_size += size
+    # iterate and store directories to traverse, while summing objects:
+    dirs = [object]
+    while dirs:
+        try:
+            response = s3.bucket_list(bucket, prefix=dirs.pop())
+        except S3Error, e:
+            if S3.codes.has_key(e.info["Code"]):
+                error(S3.codes[e.info["Code"]] % bucket)
+                return
+            else:
+                raise
+
+        # objects in the current scope:
+        for obj in response["list"]:
+            if len(response['list']) < 1:
+                break
+            bucket_size += int(obj["Size"])
+
+        # directories found in current scope:
+        for obj in response["common_prefixes"]:
+            dirs.append(obj["Prefix"])
+
     total_size, size_coeff = formatSize(bucket_size, Config().human_readable_sizes)
     total_size_str = str(total_size) + size_coeff
     output(u"%s %s" % (total_size_str.ljust(8), uri))
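
For readers who want to experiment with the same idea outside s3cmd, here is a minimal sketch of the stack-of-prefixes traversal written against boto3. This is only an illustration under assumed names: boto3, the bucket_usage helper, and the list_objects_v2 paginator are not part of this commit, which uses s3cmd's own S3 wrapper.

import boto3

def bucket_usage(bucket, prefix=""):
    """Sum object sizes one 'directory' (common prefix) at a time, so only the
    running total and a stack of pending prefixes are held in memory."""
    client = boto3.client("s3")
    paginator = client.get_paginator("list_objects_v2")
    total = 0
    pending = [prefix]                      # prefixes still to visit
    while pending:
        current = pending.pop()
        # Delimiter="/" keeps each listing shallow: objects directly under
        # `current` arrive in Contents, sub-"directories" in CommonPrefixes.
        for page in paginator.paginate(Bucket=bucket, Prefix=current, Delimiter="/"):
            for obj in page.get("Contents", []):
                total += obj["Size"]
            for cp in page.get("CommonPrefixes", []):
                pending.append(cp["Prefix"])
    return total

# Hypothetical usage; "my-bucket" is a placeholder name.
print(bucket_usage("my-bucket"))

The Delimiter="/" argument is what keeps each listing shallow, so memory stays proportional to the number of pending prefixes rather than the number of objects. The commit achieves the same effect by calling bucket_list without recursive=True, which returns the sub-directories of the current prefix in common_prefixes for later traversal.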