From acc966b488cdf14cf17f4da703f459000771ef5c Mon Sep 17 00:00:00 2001 From: Blallo Date: Wed, 15 Sep 2021 17:53:39 +0200 Subject: [PATCH] Do not singletonize the retriever --- techrec/forge.py | 8 +++-- techrec/http_retriever.py | 69 +++++++++++++-------------------------- 2 files changed, 27 insertions(+), 50 deletions(-) diff --git a/techrec/forge.py b/techrec/forge.py index 138601c..e006fc6 100644 --- a/techrec/forge.py +++ b/techrec/forge.py @@ -8,7 +8,7 @@ from time import sleep from typing import Callable, Optional from .config_manager import get_config -from .http_retriever import RETRIEVER +from .http_retriever import download logger = logging.getLogger("forge") Validator = Callable[[datetime, datetime, str], bool] @@ -23,8 +23,10 @@ async def get_timefile_exact(time) -> str: remote_path = os.path.join( remote_repo, time.strftime(get_config()["AUDIO_INPUT_FORMAT"]) ) - if remote.startswith("http"): - local = await RETRIEVER.get(remote_path) + if remote_path.startswith("http://") or remote_path.startswith("https://"): + logger.info(f"downloading {remote_path}") + print(f"DOWNLOADING -> {remote_path}") + local = await download(remote_path) return local return local_path diff --git a/techrec/http_retriever.py b/techrec/http_retriever.py index 1a3bd67..68d8934 100644 --- a/techrec/http_retriever.py +++ b/techrec/http_retriever.py @@ -1,6 +1,8 @@ # -*- encoding: utf-8 -*- import asyncio import os +from typing import Optional +from tempfile import mkdtemp import aiohttp # type: ignore @@ -9,52 +11,25 @@ from .config_manager import get_config CHUNK_SIZE = 2 ** 12 -class HTTPRetriever(object): +async def download(remote: str, staging: Optional[str] = None) -> str: """ - This class offers the `get` method to retrieve the file from the local staging path - or, if missing, from the given remote. + This will download to AUDIO_STAGING the remote file and return the local + path of the downloaded file """ - _instance = None - - def __new__(cls): - if self._instance is None: - self._instance = super().__new__(cls) - return self._instance - - def __init__(self): - self.repo_path = get_config()["AUDIO_STAGING"] - self.repo = dict( - path=os.path.join(self.repo_path, path) for i in os.listdir(self.repo_path) - ) - - async def get(remote: str) -> str: - """ - This will look in the local staging path (ideally on a tmpfs or something - similar), and return the file from there if present. Otherwise, it will download - it from the remote. - """ - if remote in self.repo: - return self.repo[remote] - file = await self._download_from_remote(remote) - self.repo[] = file - return file - - async def _download(remote: str) -> str: - """ - This will download to AUDIO_STAGING the remote file and return the local path - of the downloaded file - """ - _, filename = os.path.split(remote) - local = os.path.join(get_config()["AUDIO_STAGING"], filename) - async with aiohttp.ClientSession() as session: - async with session.get(remote) as resp: - with open(local) as f: - while True: - chunk = await resp.content.read(CHUNK_SIZE) - if not chunk: - break - f.write(chunk) - return local - - -RETRIEVER = HTTPRetriever() + _, filename = os.path.split(remote) + if staging: + base = staging + else: + # if no staging is specified, and you want to clean the storage + # used by techrec: rm -rf /tmp/techrec* + base = mkdtemp(prefix="techrec-", dir="/tmp") + local = os.path.join(base, filename) + async with aiohttp.ClientSession() as session: + async with session.get(remote) as resp: + with open(local, "wb") as f: + while True: + chunk = await resp.content.read(CHUNK_SIZE) + if not chunk: + break + f.write(chunk) + return local