Browse Source

download_http doesn't download again

refs #17
boyska 2 years ago
parent
commit
b35da0a8d0
1 changed file with 44 additions and 4 deletions
  1. 44 4
      larigira/fsutils.py

+ 44 - 4
larigira/fsutils.py

@@ -6,6 +6,10 @@ import posixpath
 import urllib.request
 from tempfile import mkstemp
 from urllib.parse import urlparse
+from pathlib import Path
+import hashlib
+
+import requests
 
 log = logging.getLogger(__name__)
 
@@ -46,6 +50,14 @@ def shortname(path):
     return name
 
 
+def http_expected_length(url, timeout=30):
+    """Return the size in bytes that the server advertises for ``url``.
+
+    A HEAD request is sent (redirects are followed) and the
+    ``Content-Length`` header of the final response is parsed.
+
+    :param url: URL to probe
+    :param timeout: seconds to wait for the server before giving up;
+        without it a stalled server would block the caller indefinitely
+    :returns: the advertised length as an int
+    :raises requests.RequestException: on network errors, timeouts or
+        HTTP error statuses
+    :raises ValueError: if the header is missing or is not an integer
+    """
+    resp = requests.head(url, allow_redirects=True, timeout=timeout)
+    resp.raise_for_status()
+    header_value = resp.headers.get('content-length')
+    if header_value is None:
+        # No advertised length means there is nothing to compare against;
+        # fail with an explicit message instead of int(None)'s TypeError.
+        raise ValueError("No Content-Length header for %s" % url)
+    return int(header_value)
+
+
 def download_http(url, destdir=None, copy=False, prefix="httpdl"):
     if url.split(":")[0] not in ("http", "https"):
         log.warning("Not a valid URL: %s", url)
@@ -56,15 +68,43 @@ def download_http(url, destdir=None, copy=False, prefix="httpdl"):
         return None
     if not copy:
         return url
+    if destdir is None:
+        destdir = os.getenv('TMPDIR', '/tmp/')
     fname = posixpath.basename(urlparse(url).path)
     # sanitize
     fname = "".join(
-        c for c in fname if c.isalnum() or c in list("._-")
+        c for c in fname if c.isalnum() or c in list("_-")
     ).rstrip()
+    url_hash = hashlib.sha1(url.encode('utf8')).hexdigest()
+
+    final_path = Path(destdir) / ('%s-%s-%s.%s' % (prefix, fname[:20], url_hash, ext))
+
+    # it might be already fully downloaded, let's check
+    if final_path.exists():
+
+        # this "touch" helps avoiding a race condition in which the
+        # UnusedCleaner could delete  this
+        final_path.touch()
+
+        actual_size = final_path.stat().st_size
+        try:
+            expected_size = http_expected_length(url)
+        except Exception as exc:
+            log.debug("Could not determine expected length for %s: %s", url, exc)
+        else:
+            if expected_size == actual_size:
+                log.debug("File %s already present and complete, download not needed", final_path)
+                return final_path.as_uri()
+            else:
+                log.debug("File %s is already present, but has the wrong length: %d but expected %d", final_path, actual_size, expected_size)
+    else:
+        log.debug("File %s does not exist", final_path)
     tmp = mkstemp(
-        suffix="." + ext, prefix="%s-%s-" % (prefix, fname), dir=destdir
+        suffix="." + ext, prefix="%s-%s-%s-" % (prefix, fname, url_hash), dir=destdir
     )
     os.close(tmp[0])
-    log.info("downloading %s -> %s", url, tmp[1])
+    log.info("downloading %s -> %s -> %s", url, tmp[1], final_path)
     fname, headers = urllib.request.urlretrieve(url, tmp[1])
-    return "file://%s" % os.path.realpath(tmp[1])
+    Path(fname).rename(final_path)
+    return final_path.as_uri()
+# "file://%s" % os.path.realpath(final_path)