archive.org uploader: first round

This commit is contained in:
boyska 2024-12-02 17:50:46 +01:00
parent 2cd6d94172
commit 7d7b63a4a6
3 changed files with 182 additions and 0 deletions

3
archiveorg.toml Normal file
View file

@ -0,0 +1,3 @@
[auth]
accesskey = "XXX"
secretkey = "YYY"

164
caricari/archiveorg.py Executable file
View file

@ -0,0 +1,164 @@
#!/usr/bin/env python3
"""
Upload files to archive as they come
"""
import datetime
import time
import logging
import re
import random
from argparse import ArgumentParser
from pathlib import Path
import toml
import requests
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from caricari.httpcommon import get_config
from caricari import database
# Application-wide configuration, loaded via caricari.httpcommon.get_config().
CONFIG = get_config()
# One engine per process; ensure all tables exist before the upload loop runs.
engine = create_engine(CONFIG["general"]["db"])
database.metadata.create_all(engine)
# Session factory: used below as a context manager for short-lived ORM sessions.
session_pool = sessionmaker(bind=engine)
LOG = logging.getLogger(__name__)
class ArchiveBackend:
    """Uploader backend for archive.org's S3-like (IAS3) API.

    Typical usage: call :meth:`reserve` to find and create an available
    bucket for a file, then :meth:`copy` to upload the file's bytes into it.
    """

    def __init__(self, accesskey: str, secret: str, bucketprefix: str = "upload"):
        self.accesskey = accesskey
        self.secret = secret
        self.bucketprefix = bucketprefix
        self.bucket: str | None = None  # final available bucket to be used
        self.dl_url: str | None = None  # final download URL

    def __str__(self):
        return "<archive.org>"

    @property
    def auth_headers(self) -> dict:
        """HTTP headers carrying the archive.org "LOW" credential pair."""
        return {"authorization": f"LOW {self.accesskey}:{self.secret}"}

    def exists(self, destination_path: bytes) -> bool:
        """Whether *destination_path* is already archived; currently always False.

        XXX: we could check the *existence* of buckets through a GET, then
        check if it is ours by looking at the *_meta.xml file, which has an
        uploader field.
        """
        return False

    def reserve(self, filename: Path) -> str:
        """Find and create an available bucket for *filename*.

        Tries a name derived from the file's stem; on HTTP errors it retries
        with a random numeric suffix, up to 5 attempts. On success, stores
        the bucket name and download URL on the instance and returns the URL.

        Raises ValueError when no bucket could be created.
        """
        bucketbase = re.sub(
            r"""[^a-zA-Z0-9_.-]""",  # based on what the archive.org documentation says
            "_",
            filename.name.rsplit(".", maxsplit=1)[0],
        )
        if not bucketbase.startswith(f"{self.bucketprefix}-"):
            bucketbase = f"{self.bucketprefix}-" + bucketbase
        bucketname = bucketbase
        attempts = 5
        for attempt in range(attempts):
            LOG.debug("trying %s", bucketname)
            resp = requests.put(
                f"https://s3.us.archive.org/{bucketname}",
                headers=self.auth_headers,
            )
            try:
                resp.raise_for_status()
            except requests.HTTPError as exc:
                # XXX: check if this is a 403, in which case we must terminate immediately
                if attempt < attempts - 1:
                    bucketname = f"{bucketbase}-{random.randint(1000,9999)}"
                    continue
                else:
                    LOG.error("response was %s\n%s\n%s", resp, resp.headers, resp.text)
                    # include the filename: the original message was truncated
                    raise ValueError(f"could not find a good bucket for {filename}") from exc
            else:
                break
        LOG.info("Found good bucket: %s", bucketname)
        self.bucket = bucketname
        self.dl_url = f"https://archive.org/download/{self.bucket}/{filename.name}"
        return self.dl_url

    def copy(self, filename: Path) -> str:
        """Upload *filename* into the bucket chosen by :meth:`reserve`.

        Must be called after :meth:`reserve`. Returns the download URL.
        """
        assert self.dl_url is not None
        upload_url = f"https://s3.us.archive.org/{self.bucket}/{filename.name}"
        # XXX: set some more header based on file metadata (date, title, etc.)
        headers = {
            "x-archive-meta01-collection": "opensource",
            "x-archive-meta-language": "ita",
        }
        with filename.open("rb") as buf:
            resp = requests.put(
                upload_url,
                data=buf,
                headers={**headers, **self.auth_headers},
            )
        resp.raise_for_status()
        LOG.info("loaded on %s", self.dl_url)
        return self.dl_url
def get_todo(args) -> list[dict]:
    """Ask the private web API which files still need archiving on archive.org."""
    endpoint = args.private_url + '/api/todo/archive.org'
    payload = requests.get(endpoint).json()
    return payload["files"]
def loop(args):
    """Run one polling pass: upload every pending file and record the result."""
    for entry in get_todo(args):
        record = process(entry, args)
        if record is None:
            continue
        with session_pool() as conn:
            conn.add(record)
            conn.commit()
def process(file, args):
    """Upload a single todo entry to archive.org.

    *file* is a dict from the /api/todo endpoint (id, filepath, sha256,
    size, mime). Returns a database.Archived row ready to be committed,
    or None when the local file is missing.
    """
    backend = ArchiveBackend(
        args.config['auth']['accesskey'],
        args.config['auth']['secretkey'],
    )
    real_file = Path(CONFIG['general']['files']) / file['filepath']
    # use the logger instead of leftover debug print()s
    LOG.debug("processing %s -> %s", file, real_file)
    if not real_file.exists():
        # Logger.warn is a deprecated alias of Logger.warning
        LOG.warning("File not found: %s", real_file)
        return None
    backend.reserve(real_file)
    url = backend.copy(real_file)
    archived = database.Archived(
        original_id=file['id'],
        link=url,
        archive='archive.org',
        # strftime('%s') is a non-portable glibc extension; time.time() is
        # the portable way to get epoch seconds
        archive_time=int(time.time()),
        # all this is possibly wrong: we should look at the real metadata
        sha256=file['sha256'],
        size=file['size'],
        format=file['mime'],
    )
    return archived
def _load_toml(fname):
    """Parse a TOML config file, closing the file handle afterwards."""
    # the previous lambda used toml.load(open(fname)) and leaked the handle
    with open(fname) as fp:
        return toml.load(fp)


def main():
    """Poll the private API forever, uploading pending files every --period seconds."""
    p = ArgumentParser()
    p.add_argument('--private-url', default='http://127.0.0.1:8000',
                   help='URL of the private web server providing the /api/todo endpoint')
    p.add_argument('--period', default=30, type=int,
                   help='seconds to sleep between polling passes')
    p.add_argument('--config', type=_load_toml, required=True,
                   help='config specific to the archiveorg uploader. Not to be confused with CARICARI_CONFIG')
    args = p.parse_args()
    while True:
        loop(args)
        time.sleep(args.period)


if __name__ == '__main__':
    main()

View file

@ -134,3 +134,18 @@ def list(request: Request):
return templates.TemplateResponse(
name="list.html", request=request, context=data
)
@app.get("/api/todo/{archive}")
def todo(request: Request, archive: str):
    """
    This route will return items which still need to be archived.
    """
    files = []
    with session_pool() as conn:
        query = conn.query(database.Original).order_by(database.Original.id)
        for original in query:
            archived = [a for a in original.archived if a.archive == archive]
            if not archived:
                # __dict__ on a SQLAlchemy instance contains internal
                # bookkeeping (_sa_instance_state) that is not JSON
                # serializable; strip private attributes before returning
                row = {k: v for k, v in vars(original).items()
                       if not k.startswith('_')}
                files.append(row)
    data = {"files": files}
    return data