From 7d7b63a4a68a5d3cbee304681983567b21455d59 Mon Sep 17 00:00:00 2001
From: boyska
Date: Mon, 2 Dec 2024 17:50:46 +0100
Subject: [PATCH] archive.org uploader: first round

---
 archiveorg.toml        |   3 +
 caricari/archiveorg.py | 178 ++++++++++++++++++++++++++++++++++++++++++
 caricari/private.py    |  16 ++++
 3 files changed, 197 insertions(+)
 create mode 100644 archiveorg.toml
 create mode 100755 caricari/archiveorg.py

diff --git a/archiveorg.toml b/archiveorg.toml
new file mode 100644
index 0000000..01e6e6c
--- /dev/null
+++ b/archiveorg.toml
@@ -0,0 +1,3 @@
+[auth]
+accesskey = "XXX"
+secretkey = "YYY"
diff --git a/caricari/archiveorg.py b/caricari/archiveorg.py
new file mode 100755
index 0000000..1f90748
--- /dev/null
+++ b/caricari/archiveorg.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+"""
+Upload files to archive.org as they come.
+
+Polls the private caricari web API for files that still need archiving,
+uploads each one to a fresh archive.org S3 bucket and records the
+resulting download link in the database.
+"""
+
+import time
+import logging
+import re
+import random
+from argparse import ArgumentParser
+from pathlib import Path
+
+import toml
+import requests
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+
+from caricari.httpcommon import get_config
+from caricari import database
+
+CONFIG = get_config()
+engine = create_engine(CONFIG["general"]["db"])
+database.metadata.create_all(engine)
+session_pool = sessionmaker(bind=engine)
+
+LOG = logging.getLogger(__name__)
+
+
+class ArchiveBackend:
+    """Uploader towards archive.org's S3-compatible (ias3) API."""
+
+    def __init__(self, accesskey: str, secret: str, bucketprefix: str = "upload"):
+        self.accesskey = accesskey
+        self.secret = secret
+        self.bucketprefix = bucketprefix
+        self.bucket: str | None = None  # final available bucket to be used
+        self.dl_url: str | None = None  # final download URL
+
+    def __str__(self):
+        return f"<ArchiveBackend bucket={self.bucket}>"
+
+    @property
+    def auth_headers(self):
+        """HTTP headers carrying the archive.org S3 credentials."""
+        return {"authorization": f"LOW {self.accesskey}:{self.secret}"}
+
+    def exists(self, destination_path: bytes):
+        # XXX: we could check the *existence* of buckets through a GET, then check if it is ours by
+        # looking at the *_meta.xml file, which has an uploader field
+        return False
+
+    def reserve(self, filename: Path) -> str:
+        """Create a bucket suitable for *filename*, retrying with random suffixes.
+
+        Sets ``self.bucket`` and ``self.dl_url``; returns the download URL.
+        """
+        bucketbase = re.sub(
+            r"""[^a-zA-Z0-9_.-]""",  # based on what the archive.org documentation says
+            "_",
+            filename.name.rsplit(".", maxsplit=1)[0],
+        )
+        if not bucketbase.startswith(f"{self.bucketprefix}-"):
+            bucketbase = f"{self.bucketprefix}-" + bucketbase
+        bucketname = bucketbase
+        attempts = 5
+        for attempt in range(attempts):
+            LOG.debug("trying %s", bucketname)
+            resp = requests.put(
+                f"https://s3.us.archive.org/{bucketname}",
+                headers=self.auth_headers,
+                timeout=60,
+            )
+            try:
+                resp.raise_for_status()
+            except requests.HTTPError as exc:
+                # XXX: check if this is a 403, in which case we must terminate immediately
+                if attempt < attempts - 1:
+                    bucketname = f"{bucketbase}-{random.randint(1000, 9999)}"
+                    continue
+                else:
+                    LOG.error("response was %s\n%s\n%s", resp, resp.headers, resp.text)
+                    raise ValueError(f"could not find a good bucket for {filename}") from exc
+            else:
+                break
+        LOG.info("Found good bucket: %s", bucketname)
+        self.bucket = bucketname
+        self.dl_url = f"https://archive.org/download/{self.bucket}/{filename.name}"
+        return self.dl_url
+
+    def copy(self, filename: Path) -> str:
+        """Upload *filename* into the reserved bucket; return the download URL."""
+        assert self.dl_url is not None
+        upload_url = f"https://s3.us.archive.org/{self.bucket}/{filename.name}"
+
+        # XXX: set some more header based on file metadata (date, title, etc.)
+        headers = {
+            "x-archive-meta01-collection": "opensource",
+            "x-archive-meta-language": "ita",
+        }
+        with filename.open("rb") as buf:
+            resp = requests.put(
+                upload_url,
+                data=buf,
+                headers={**headers, **self.auth_headers},
+            )
+        resp.raise_for_status()
+        LOG.info("loaded on %s", self.dl_url)
+        return self.dl_url
+
+
+def get_todo(args) -> list[dict]:
+    """Ask the private API which files still need to go to archive.org."""
+    resp = requests.get(args.private_url + '/api/todo/archive.org', timeout=60).json()
+    return resp["files"]
+
+
+def loop(args):
+    """One polling cycle: archive every pending file and record it in the DB."""
+    todos = get_todo(args)
+    for file in todos:
+        archived = process(file, args)
+        if archived is not None:
+            with session_pool() as conn:
+                conn.add(archived)
+                conn.commit()
+
+
+def process(file, args):
+    """Upload the single file described by *file*; return database.Archived or None."""
+    backend = ArchiveBackend(
+        args.config['auth']['accesskey'],
+        args.config['auth']['secretkey'],
+    )
+    print(file)
+
+    real_file = Path(CONFIG['general']['files']) / file['filepath']
+    print(real_file)
+    if not real_file.exists():
+        LOG.warning("File not found: %s", real_file)
+        return None
+
+    backend.reserve(real_file)
+    url = backend.copy(real_file)
+
+    archived = database.Archived(
+        original_id=file['id'],
+        link=url,
+        archive='archive.org',
+        archive_time=int(time.time()),
+        # all this is possibly wrong: we should look at the real metadata
+        sha256=file['sha256'],
+        size=file['size'],
+        format=file['mime'],
+    )
+    return archived
+
+
+def main():
+    p = ArgumentParser()
+    p.add_argument('--private-url', default='http://127.0.0.1:8000',
+                   help='URL of the private web server providing the /api/todo endpoint')
+    p.add_argument('--period', default=30, type=int,
+                   help='seconds to sleep between polling cycles')
+    p.add_argument('--config', type=lambda fname: toml.load(open(fname)), required=True,
+                   help='config specific to the archiveorg uploader. Not to be confused with CARICARI_CONFIG')
+    args = p.parse_args()
+
+    while True:
+        loop(args)
+        time.sleep(args.period)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/caricari/private.py b/caricari/private.py
index 9ebb7c7..4fe55b5 100644
--- a/caricari/private.py
+++ b/caricari/private.py
@@ -134,3 +134,19 @@ def list(request: Request):
     return templates.TemplateResponse(
         name="list.html", request=request, context=data
     )
+
+@app.get("/api/todo/{archive}")
+def todo(request: Request, archive: str):
+    """
+    This route will return items which still need to be archived.
+    """
+    files = []
+    with session_pool() as conn:
+        query = conn.query(database.Original).order_by(database.Original.id)
+        for original in query:
+            archived = [a for a in original.archived if a.archive == archive]
+            if not archived:
+                # strip SQLAlchemy internals (e.g. _sa_instance_state) before returning
+                files.append({k: v for k, v in vars(original).items() if not k.startswith('_')})
+    data = {"files": files}
+    return data