archive.org uploader: first round

This commit is contained in:
boyska 2024-12-02 17:50:46 +01:00
parent 2cd6d94172
commit 7d7b63a4a6
3 changed files with 182 additions and 0 deletions

3
archiveorg.toml Normal file
View file

@ -0,0 +1,3 @@
[auth]
accesskey = "XXX"
secretkey = "YYY"

164
caricari/archiveorg.py Executable file
View file

@ -0,0 +1,164 @@
#!/usr/bin/env python3
"""
Upload files to archive as they come
"""
import datetime
import time
import logging
import re
import random
from argparse import ArgumentParser
from pathlib import Path
import toml
import requests
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from caricari.httpcommon import get_config
from caricari import database
# Application-wide configuration, loaded via caricari.httpcommon.get_config().
CONFIG = get_config()
# One engine per process; ensure all tables exist before the upload loop runs.
engine = create_engine(CONFIG["general"]["db"])
database.metadata.create_all(engine)
# Session factory: used below as a context manager for short-lived ORM sessions.
session_pool = sessionmaker(bind=engine)
LOG = logging.getLogger(__name__)
class ArchiveBackend:
    """Uploader backend for archive.org's S3-like (IAS3) API.

    Typical usage: call :meth:`reserve` to find and create an available
    bucket for a file, then :meth:`copy` to upload the file's bytes into it.
    """

    def __init__(self, accesskey: str, secret: str, bucketprefix: str = "upload"):
        self.accesskey = accesskey
        self.secret = secret
        self.bucketprefix = bucketprefix
        self.bucket: str | None = None  # final available bucket to be used
        self.dl_url: str | None = None  # final download URL

    def __str__(self):
        return "<archive.org>"

    @property
    def auth_headers(self) -> dict:
        """HTTP headers carrying the archive.org "LOW" credential pair."""
        return {"authorization": f"LOW {self.accesskey}:{self.secret}"}

    def exists(self, destination_path: bytes) -> bool:
        """Whether *destination_path* is already archived; currently always False.

        XXX: we could check the *existence* of buckets through a GET, then
        check if it is ours by looking at the *_meta.xml file, which has an
        uploader field.
        """
        return False

    def reserve(self, filename: Path) -> str:
        """Find and create an available bucket for *filename*.

        Tries a name derived from the file's stem; on HTTP errors it retries
        with a random numeric suffix, up to 5 attempts. On success, stores
        the bucket name and download URL on the instance and returns the URL.

        Raises ValueError when no bucket could be created.
        """
        bucketbase = re.sub(
            r"""[^a-zA-Z0-9_.-]""",  # based on what the archive.org documentation says
            "_",
            filename.name.rsplit(".", maxsplit=1)[0],
        )
        if not bucketbase.startswith(f"{self.bucketprefix}-"):
            bucketbase = f"{self.bucketprefix}-" + bucketbase
        bucketname = bucketbase
        attempts = 5
        for attempt in range(attempts):
            LOG.debug("trying %s", bucketname)
            resp = requests.put(
                f"https://s3.us.archive.org/{bucketname}",
                headers=self.auth_headers,
            )
            try:
                resp.raise_for_status()
            except requests.HTTPError as exc:
                # XXX: check if this is a 403, in which case we must terminate immediately
                if attempt < attempts - 1:
                    bucketname = f"{bucketbase}-{random.randint(1000,9999)}"
                    continue
                else:
                    LOG.error("response was %s\n%s\n%s", resp, resp.headers, resp.text)
                    # include the filename: the original message was truncated
                    raise ValueError(f"could not find a good bucket for {filename}") from exc
            else:
                break
        LOG.info("Found good bucket: %s", bucketname)
        self.bucket = bucketname
        self.dl_url = f"https://archive.org/download/{self.bucket}/{filename.name}"
        return self.dl_url

    def copy(self, filename: Path) -> str:
        """Upload *filename* into the bucket chosen by :meth:`reserve`.

        Must be called after :meth:`reserve`. Returns the download URL.
        """
        assert self.dl_url is not None
        upload_url = f"https://s3.us.archive.org/{self.bucket}/{filename.name}"
        # XXX: set some more header based on file metadata (date, title, etc.)
        headers = {
            "x-archive-meta01-collection": "opensource",
            "x-archive-meta-language": "ita",
        }
        with filename.open("rb") as buf:
            resp = requests.put(
                upload_url,
                data=buf,
                headers={**headers, **self.auth_headers},
            )
        resp.raise_for_status()
        LOG.info("loaded on %s", self.dl_url)
        return self.dl_url
def get_todo(args) -> list[dict]:
    """Ask the private web API which files still need archiving on archive.org."""
    endpoint = args.private_url + '/api/todo/archive.org'
    payload = requests.get(endpoint).json()
    return payload["files"]
def loop(args):
    """Run one polling pass: upload every pending file and record the result."""
    for entry in get_todo(args):
        record = process(entry, args)
        if record is None:
            continue
        with session_pool() as conn:
            conn.add(record)
            conn.commit()
def process(file, args):
    """Upload a single todo entry to archive.org.

    *file* is a dict from the /api/todo endpoint (id, filepath, sha256,
    size, mime). Returns a database.Archived row ready to be committed,
    or None when the local file is missing.
    """
    backend = ArchiveBackend(
        args.config['auth']['accesskey'],
        args.config['auth']['secretkey'],
    )
    real_file = Path(CONFIG['general']['files']) / file['filepath']
    # use the logger instead of leftover debug print()s
    LOG.debug("processing %s -> %s", file, real_file)
    if not real_file.exists():
        # Logger.warn is a deprecated alias of Logger.warning
        LOG.warning("File not found: %s", real_file)
        return None
    backend.reserve(real_file)
    url = backend.copy(real_file)
    archived = database.Archived(
        original_id=file['id'],
        link=url,
        archive='archive.org',
        # strftime('%s') is a non-portable glibc extension; time.time() is
        # the portable way to get epoch seconds
        archive_time=int(time.time()),
        # all this is possibly wrong: we should look at the real metadata
        sha256=file['sha256'],
        size=file['size'],
        format=file['mime'],
    )
    return archived
def _load_toml(fname):
    """Parse a TOML config file, closing the file handle afterwards."""
    # the previous lambda used toml.load(open(fname)) and leaked the handle
    with open(fname) as fp:
        return toml.load(fp)


def main():
    """Poll the private API forever, uploading pending files every --period seconds."""
    p = ArgumentParser()
    p.add_argument('--private-url', default='http://127.0.0.1:8000',
                   help='URL of the private web server providing the /api/todo endpoint')
    p.add_argument('--period', default=30, type=int,
                   help='seconds to sleep between polling passes')
    p.add_argument('--config', type=_load_toml, required=True,
                   help='config specific to the archiveorg uploader. Not to be confused with CARICARI_CONFIG')
    args = p.parse_args()
    while True:
        loop(args)
        time.sleep(args.period)


if __name__ == '__main__':
    main()

View file

@ -134,3 +134,18 @@ def list(request: Request):
return templates.TemplateResponse(
name="list.html", request=request, context=data
)
@app.get("/api/todo/{archive}")
def todo(request: Request, archive: str):
    """
    This route will return items which still need to be archived.
    """
    files = []
    with session_pool() as conn:
        query = conn.query(database.Original).order_by(database.Original.id)
        for original in query:
            archived = [a for a in original.archived if a.archive == archive]
            if not archived:
                # __dict__ on a SQLAlchemy instance contains internal
                # bookkeeping (_sa_instance_state) that is not JSON
                # serializable; strip private attributes before returning
                row = {k: v for k, v in vars(original).items()
                       if not k.startswith('_')}
                files.append(row)
    data = {"files": files}
    return data