Browse code

PullSources: Provide atomicity for file downloads

Packages such as linux, linux-esx, linux-secure and linux-aws all
share the same source tarball. So, when building these packages in
parallel, the package-builder will try to download the source tarball
to the same destination (filename) on the disk, simultaneously.

Unfortunately, there is no protection against partial file downloads;
that is, the destination file is visible well before the download is
actually complete. This can result in a race condition where some of
these packages observe that the source tarball already exists, and
compute a sha1 checksum on the (partially downloaded) file and error
out, complaining that the checksum doesn't match the expected value
from that package's spec file. Linux is particularly prone to this
race, as its tarballs are quite large, thus extending this race
window.

Fix this by downloading to a temporary filename and renaming it to the
final destination filename after the download is actually complete.
(Note that it is important to make sure that the temporary file and
the final filename both exist on the same filesystem; otherwise it
doesn't fix the race completely). Also, we utilize the package name
(eg: linux-esx) in making the temporary filename unique.

Change-Id: I3a332b8c44edc48780f60711a143da1beb3c46ed
Reviewed-on: http://photon-jenkins.eng.vmware.com:8082/5134
Tested-by: gerrit-photon <photon-checkins@vmware.com>
Reviewed-by: Alexey Makhalov <amakhalov@vmware.com>

Srivatsa S. Bhat authored on 2018/05/09 07:43:34
Showing 3 changed files
... ...
@@ -113,7 +113,7 @@ class PackageUtils(object):
113 113
         # Fetch/verify sources if sha1 not None.
114 114
         sha1 = SPECS.getData().getSHA1(package, source, index)
115 115
         if sha1 is not None:
116
-            PullSources.get(source, sha1, constants.sourcePath, constants.pullsourcesConfig, self.logger)
116
+            PullSources.get(package, source, sha1, constants.sourcePath, constants.pullsourcesConfig, self.logger)
117 117
 
118 118
         sourcePath = cmdUtils.findFile(source,constants.sourcePath)
119 119
         if sourcePath is None or len(sourcePath) == 0:
... ...
@@ -25,7 +25,7 @@ def getFileHash(filepath):
25 25
         f.close()
26 26
     return sha1.hexdigest()
27 27
 
28
-def get(source, sha1, sourcesPath, configs, logger):
28
+def get(package, source, sha1, sourcesPath, configs, logger):
29 29
     cmdUtils = CommandUtils()
30 30
     sourcePath = cmdUtils.findFile(source, sourcesPath)
31 31
     if sourcePath is not None and len(sourcePath) > 0:
... ...
@@ -41,7 +41,7 @@ def get(source, sha1, sourcesPath, configs, logger):
41 41
         p = pullSources(config, logger)
42 42
         package_path = os.path.join(sourcesPath, source)
43 43
         try: 
44
-            p.downloadFileHelper(source, package_path, sha1)
44
+            p.downloadFileHelper(package, source, package_path, sha1)
45 45
             return
46 46
         except Exception as e:
47 47
             logger.error(e)
... ...
@@ -64,13 +64,21 @@ class pullSources:
64 64
         with open(conf_file) as jsonFile:
65 65
             self._config = json.load(jsonFile)
66 66
 
67
-    def downloadFile(self, filename, file_path):
67
+    def downloadFile(self, package, filename, file_path):
68 68
         #form url: https://dl.bintray.com/vmware/photon_sources/1.0/<filename>.
69 69
         url = '%s/%s' % (self._config['baseurl'], filename)
70 70
 
71 71
         self.logger.info("Downloading: "+url)
72 72
 
73
-        with open(file_path, 'wb') as handle:
73
+        # We need to provide atomicity for file downloads. That is,
74
+        # the file should be visible in its canonical location only
75
+        # when the download is complete. To achieve that, first
76
+        # download to a temporary location (on the same filesystem)
77
+        # and then rename it to the final destination filename.
78
+
79
+        temp_file_path = file_path + "-" + package
80
+
81
+        with open(temp_file_path, 'wb') as handle:
74 82
             response = requests.get(url, auth=self._auth, stream=True)
75 83
 
76 84
             if not response.ok:
... ...
@@ -83,10 +91,16 @@ class pullSources:
83 83
 
84 84
                 handle.write(block)
85 85
             response.close()
86
+
87
+        if os.path.exists(file_path):
88
+            os.remove(temp_file_path)
89
+        else:
90
+            os.rename(temp_file_path, file_path)
91
+
86 92
         return file_path
87 93
 
88
-    def downloadFileHelper(self, package_name, package_path, package_sha1 = None):
89
-        self.downloadFile(package_name, package_path)
90
-        if package_sha1 != getFileHash(package_path):
91
-            raise Exception('Invalid sha1 for package %s' % package_name)
94
+    def downloadFileHelper(self, package, filename, file_path, package_sha1 = None):
95
+        self.downloadFile(package, filename, file_path)
96
+        if package_sha1 != getFileHash(file_path):
97
+            raise Exception('Invalid sha1 for package %s file %s' % package, filename)
92 98
 
... ...
@@ -223,7 +223,7 @@ def buildSourcesList(yamlDir, blackListPkgs, logger, singleFile=True):
223 223
             sourceName=listSourceNames[0]
224 224
             sha1 = SPECS.getData().getSHA1(package, sourceName)
225 225
             if sha1 is not None:
226
-                PullSources.get(sourceName, sha1, yamlSourceDir, constants.pullsourcesConfig, logger)
226
+                PullSources.get(package, sourceName, sha1, yamlSourceDir, constants.pullsourcesConfig, logger)
227 227
 
228 228
         if not singleFile:
229 229
             yamlFile = open(yamlSourceDir+"/"+ossname+"-"+ossversion+".yaml", "w")