Browse code

PullSources: Provide atomicity for file downloads

Packages such as linux and linux-esx both share the same source
tarball. So, when building these packages in parallel, the
package-builder will try to download the source tarball to the same
destination (filename) on the disk, simultaneously.

Unfortunately, there is no protection against partial file downloads;
that is, the destination file is visible well before the download is
actually complete. This can result in a race condition where some of
these packages observe that the source tarball already exists, and
compute a sha1 checksum on the (partially downloaded) file and error
out, complaining that the checksum doesn't match the expected value
from that package's spec file. Linux is particularly prone to this
race, as its tarballs are quite large, thus extending this race
window.

Fix this by downloading to a temporary filename and renaming it to the
final destination filename after the download is actually complete.
(Note that it is important to make sure that the temporary file and
the final filename both exist on the same filesystem; otherwise it
doesn't fix the race completely). Also, we utilize the package name
(eg: linux-esx) in making the temporary filename unique.

Change-Id: Ia442ee2caab4e3b521d2b13eac751eb9532a14f4
Reviewed-on: http://photon-jenkins.eng.vmware.com:8082/5135
Tested-by: gerrit-photon <photon-checkins@vmware.com>
Reviewed-by: Alexey Makhalov <amakhalov@vmware.com>

Srivatsa S. Bhat authored on 2018/05/09 08:40:51
Showing 3 changed files
... ...
@@ -111,7 +111,7 @@ class PackageUtils(object):
111 111
             # Fetch/verify sources if sha1 not None.
112 112
             sha1 = constants.specData.getSHA1(package, source)
113 113
             if sha1 is not None:
114
-                PullSources.get(source, sha1, constants.sourcePath, constants.pullsourcesConfig, self.logger)
114
+                PullSources.get(package, source, sha1, constants.sourcePath, constants.pullsourcesConfig, self.logger)
115 115
 
116 116
             sourcePath = cmdUtils.findFile(source,constants.sourcePath)
117 117
             if sourcePath is None or len(sourcePath) == 0:
... ...
@@ -25,7 +25,7 @@ def getFileHash(filepath):
25 25
         f.close()
26 26
     return sha1.hexdigest()
27 27
 
28
-def get(source, sha1, sourcesPath, configs, logger):
28
+def get(package, source, sha1, sourcesPath, configs, logger):
29 29
     cmdUtils = CommandUtils()
30 30
     sourcePath = cmdUtils.findFile(source, sourcesPath)
31 31
     if sourcePath is not None and len(sourcePath) > 0:
... ...
@@ -41,7 +41,7 @@ def get(source, sha1, sourcesPath, configs, logger):
41 41
         p = pullSources(config, logger)
42 42
         package_path = os.path.join(sourcesPath, source)
43 43
         try: 
44
-            p.downloadFileHelper(source, package_path, sha1)
44
+            p.downloadFileHelper(package, source, package_path, sha1)
45 45
             return
46 46
         except Exception as e:
47 47
             logger.error(e)
... ...
@@ -64,13 +64,21 @@ class pullSources:
64 64
         with open(conf_file) as jsonFile:
65 65
             self._config = json.load(jsonFile)
66 66
 
67
-    def downloadFile(self, filename, file_path):
67
+    def downloadFile(self, package, filename, file_path):
68 68
         #form url: https://dl.bintray.com/vmware/photon_sources/1.0/<filename>.
69 69
         url = '%s/%s' % (self._config['baseurl'], filename)
70 70
 
71 71
         self.logger.info("Downloading: "+url)
72 72
 
73
-        with open(file_path, 'wb') as handle:
73
+	# We need to provide atomicity for file downloads. That is,
74
+	# the file should be visible in its canonical location only
75
+	# when the download is complete. To achieve that, first
76
+	# download to a temporary location (on the same filesystem)
77
+	# and then rename it to the final destination filename.
78
+
79
+        temp_file_path = file_path + "-" + package
80
+
81
+        with open(temp_file_path, 'wb') as handle:
74 82
             response = requests.get(url, auth=self._auth, stream=True)
75 83
 
76 84
             if not response.ok:
... ...
@@ -83,10 +91,16 @@ class pullSources:
83 83
 
84 84
                 handle.write(block)
85 85
             response.close()
86
+
87
+        if os.path.exists(file_path):
88
+            os.remove(temp_file_path)
89
+        else:
90
+            os.rename(temp_file_path, file_path)
91
+
86 92
         return file_path
87 93
 
88
-    def downloadFileHelper(self, package_name, package_path, package_sha1 = None):
89
-        self.downloadFile(package_name, package_path)
90
-        if package_sha1 != getFileHash(package_path):
91
-            raise Exception('Invalid sha1 for package %s' % package_name)
94
+    def downloadFileHelper(self, package, filename, file_path, package_sha1 = None):
95
+        self.downloadFile(package, filename, file_path)
96
+        if package_sha1 != getFileHash(file_path):
97
+            raise Exception('Invalid sha1 for package %s file %s' % package, filename)
92 98
 
... ...
@@ -213,7 +213,7 @@ def buildSourcesList(yamlDir, blackListPkgs, logger, singleFile=True):
213 213
             sourceName=listSourceNames[0]
214 214
             sha1 = constants.specData.getSHA1(package, sourceName)
215 215
             if sha1 is not None:
216
-                PullSources.get(sourceName, sha1, yamlSourceDir, constants.pullsourcesConfig, logger)
216
+                PullSources.get(package, sourceName, sha1, yamlSourceDir, constants.pullsourcesConfig, logger)
217 217
 
218 218
         if not singleFile:
219 219
             yamlFile = open(yamlSourceDir+"/"+ossname+"-"+ossversion+".yaml", "w")