download_http: skip the download when the file is already present and complete

refs #17
This commit is contained in:
boyska 2022-01-02 20:16:55 +01:00
parent 46eb5c400b
commit b35da0a8d0

View file

@ -6,6 +6,10 @@ import posixpath
import urllib.request
from tempfile import mkstemp
from urllib.parse import urlparse
from pathlib import Path
import hashlib
import requests
log = logging.getLogger(__name__)
@ -46,6 +50,14 @@ def shortname(path):
return name
def http_expected_length(url):
    """Return the expected body size of *url* in bytes.

    Issues a HEAD request (following redirects) and parses the
    Content-Length response header.

    Raises:
        requests.HTTPError: if the HEAD request returns an error status.
        ValueError: if the server sent no usable Content-Length header.
    """
    resp = requests.head(url, allow_redirects=True)
    resp.raise_for_status()
    header_value = resp.headers.get('content-length')
    if header_value is None:
        # Some servers omit Content-Length (e.g. chunked transfer);
        # fail with a clear error instead of int(None)'s cryptic TypeError.
        # The caller treats any exception as "length unknown".
        raise ValueError("no Content-Length header for %s" % url)
    return int(header_value)
def download_http(url, destdir=None, copy=False, prefix="httpdl"):
if url.split(":")[0] not in ("http", "https"):
log.warning("Not a valid URL: %s", url)
@ -56,15 +68,43 @@ def download_http(url, destdir=None, copy=False, prefix="httpdl"):
return None
if not copy:
return url
if destdir is None:
destdir = os.getenv('TMPDIR', '/tmp/')
fname = posixpath.basename(urlparse(url).path)
# sanitize
fname = "".join(
c for c in fname if c.isalnum() or c in list("._-")
c for c in fname if c.isalnum() or c in list("_-")
).rstrip()
url_hash = hashlib.sha1(url.encode('utf8')).hexdigest()
final_path = Path(destdir) / ('%s-%s-%s.%s' % (prefix, fname[:20], url_hash, ext))
# it might be already fully downloaded, let's check
if final_path.exists():
# this "touch" helps avoiding a race condition in which the
# UnusedCleaner could delete this
final_path.touch()
actual_size = final_path.stat().st_size
try:
expected_size = http_expected_length(url)
except Exception as exc:
log.debug("Could not determine expected length for %s: %s", url, exc)
else:
if expected_size == actual_size:
log.debug("File %s already present and complete, download not needed", final_path)
return final_path.as_uri()
else:
log.debug("File %s is already present, but has the wrong length: %d but expected %d", final_path, actual_size, expected_size)
else:
log.debug("File %s does not exist", final_path)
tmp = mkstemp(
suffix="." + ext, prefix="%s-%s-" % (prefix, fname), dir=destdir
suffix="." + ext, prefix="%s-%s-%s-" % (prefix, fname, url_hash), dir=destdir
)
os.close(tmp[0])
log.info("downloading %s -> %s", url, tmp[1])
log.info("downloading %s -> %s -> %s", url, tmp[1], final_path)
fname, headers = urllib.request.urlretrieve(url, tmp[1])
return "file://%s" % os.path.realpath(tmp[1])
Path(fname).rename(final_path)
return final_path.as_uri()
# "file://%s" % os.path.realpath(final_path)