Browse code

PullSources: Provide atomicity for file downloads

Packages such as linux, linux-esx, linux-secure and linux-aws all
share the same source tarball. So, when building these packages in
parallel, the package-builder will try to download the source tarball
to the same destination (filename) on the disk, simultaneously.

Unfortunately, there is no protection against partial file downloads;
that is, the destination file is visible well before the download is
actually complete. This can result in a race condition where some of
these packages observe that the source tarball already exists, and
compute a sha1 checksum on the (partially downloaded) file and error
out, complaining that the checksum doesn't match the expected value
from that package's spec file. Linux is particularly prone to this
race, as its tarballs are quite large, thus extending this race
window.

Fix this by downloading to a temporary filename and renaming it to the
final destination filename after the download is actually complete.
(Note that it is important to make sure that the temporary file and
the final filename both exist on the same filesystem; otherwise it
doesn't fix the race completely). Also, we utilize the package name
(eg: linux-esx) in making the temporary filename unique.

Change-Id: I5dda3da4d7bd4e75f211774246adeded7a0d095b
Reviewed-on: http://photon-jenkins.eng.vmware.com:8082/5136
Tested-by: gerrit-photon <photon-checkins@vmware.com>
Reviewed-by: Alexey Makhalov <amakhalov@vmware.com>

Srivatsa S. Bhat authored on 2018/05/09 08:45:23
Showing 3 changed files
... ...
@@ -154,7 +154,7 @@ def buildSourcesList(yamlDir, blackListPkgs, logger, singleFile=True):
154 154
             sourceName = listSourceNames[0]
155 155
             sha1 = SPECS.getData().getSHA1(package, sourceName)
156 156
             if sha1 is not None:
157
-                PullSources.get(sourceName, sha1, yamlSourceDir,
157
+                PullSources.get(package, sourceName, sha1, yamlSourceDir,
158 158
                                 constants.pullsourcesConfig, logger)
159 159
 
160 160
         if not singleFile:
... ...
@@ -450,8 +450,8 @@ class PackageUtils(object):
450 450
         # Fetch/verify sources if sha1 not None.
451 451
         sha1 = SPECS.getData().getSHA1(package, source, index)
452 452
         if sha1 is not None:
453
-            PullSources.get(source, sha1, constants.sourcePath, constants.pullsourcesConfig,
454
-                            self.logger)
453
+            PullSources.get(package, source, sha1, constants.sourcePath,
454
+                            constants.pullsourcesConfig, self.logger)
455 455
 
456 456
         sourcePath = cmdUtils.findFile(source, constants.sourcePath)
457 457
         if not sourcePath:
... ...
@@ -22,7 +22,7 @@ def getFileHash(filepath):
22 22
         f.close()
23 23
     return sha1.hexdigest()
24 24
 
25
-def get(source, sha1, sourcesPath, configs, logger):
25
+def get(package, source, sha1, sourcesPath, configs, logger):
26 26
     cmdUtils = CommandUtils()
27 27
     sourcePath = cmdUtils.findFile(source, sourcesPath)
28 28
     if sourcePath is not None and len(sourcePath) > 0:
... ...
@@ -40,7 +40,7 @@ def get(source, sha1, sourcesPath, configs, logger):
40 40
         p = pullSources(config, logger)
41 41
         package_path = os.path.join(sourcesPath, source)
42 42
         try:
43
-            p.downloadFileHelper(source, package_path, sha1)
43
+            p.downloadFileHelper(package, source, package_path, sha1)
44 44
             return
45 45
         except Exception as e:
46 46
             logger.exception(e)
... ...
@@ -63,13 +63,21 @@ class pullSources:
63 63
         with open(conf_file) as jsonFile:
64 64
             self._config = json.load(jsonFile)
65 65
 
66
-    def downloadFile(self, filename, file_path):
66
+    def downloadFile(self, package, filename, file_path):
67 67
         #form url: https://dl.bintray.com/vmware/photon_sources/1.0/<filename>.
68 68
         url = '%s/%s' % (self._config['baseurl'], filename)
69 69
 
70 70
         self.logger.info("Downloading: " + url)
71 71
 
72
-        with open(file_path, 'wb') as handle:
72
+        # We need to provide atomicity for file downloads. That is,
73
+        # the file should be visible in its canonical location only
74
+        # when the download is complete. To achieve that, first
75
+        # download to a temporary location (on the same filesystem)
76
+        # and then rename it to the final destination filename.
77
+
78
+        temp_file_path = file_path + "-" + package
79
+
80
+        with open(temp_file_path, 'wb') as handle:
73 81
             response = requests.get(url, auth=self._auth, stream=True)
74 82
 
75 83
             if not response.ok:
... ...
@@ -82,9 +90,15 @@ class pullSources:
82 82
                 handle.write(block)
83 83
             handle.flush()
84 84
             response.close()
85
+
86
+        if os.path.exists(file_path):
87
+            os.remove(temp_file_path)
88
+        else:
89
+            os.rename(temp_file_path, file_path)
90
+
85 91
         return file_path
86 92
 
87
-    def downloadFileHelper(self, package_name, package_path, package_sha1=None):
88
-        self.downloadFile(package_name, package_path)
89
-        if package_sha1 != getFileHash(package_path):
90
-            raise Exception('Invalid sha1 for package %s' % package_name)
93
+    def downloadFileHelper(self, package, filename, file_path, package_sha1=None):
94
+        self.downloadFile(package, filename, file_path)
95
+        if package_sha1 != getFileHash(file_path):
96
+            raise Exception('Invalid sha1 for package %s file %s' % package, filename)