archive.org uploader: first round
parent 2cd6d94172
commit 7d7b63a4a6
3 changed files with 182 additions and 0 deletions
archiveorg.toml (Normal file, +3)
@@ -0,0 +1,3 @@
[auth]
accesskey = "XXX"
secretkey = "YYY"
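For context, a minimal sketch of how this file is consumed (the XXX/YYY values above are placeholders, and the snippet below is illustrative rather than part of the commit): the uploader's --config option runs the file through toml.load, and the two keys end up in the LOW-style S3 authorization header built by ArchiveBackend.auth_headers.

import toml

# illustrative only: load the uploader config and build the same header
# that ArchiveBackend.auth_headers produces
config = toml.load(open("archiveorg.toml"))
auth = {"authorization": f"LOW {config['auth']['accesskey']}:{config['auth']['secretkey']}"}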
caricari/archiveorg.py (Executable file, +164)
@@ -0,0 +1,164 @@
#!/usr/bin/env python3
"""
Upload files to archive.org as they come.
"""

import datetime
import time
import logging
import re
import random
from argparse import ArgumentParser
from pathlib import Path

import toml
import requests
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from caricari.httpcommon import get_config
from caricari import database

CONFIG = get_config()
engine = create_engine(CONFIG["general"]["db"])
database.metadata.create_all(engine)
session_pool = sessionmaker(bind=engine)

LOG = logging.getLogger(__name__)


class ArchiveBackend:
    def __init__(self, accesskey: str, secret: str, bucketprefix: str = "upload"):
        self.accesskey = accesskey
        self.secret = secret
        self.bucketprefix = bucketprefix
        self.bucket: str | None = None  # final available bucket to be used
        self.dl_url: str | None = None  # final download URL

    def __str__(self):
        return "<archive.org>"

    @property
    def auth_headers(self):
        return {"authorization": f"LOW {self.accesskey}:{self.secret}"}

    def exists(self, destination_path: bytes):
        # XXX: we could check the *existence* of buckets through a GET, then check if it is ours
        # by looking at the *_meta.xml file, which has an uploader field
        return False

    def reserve(self, filename: Path) -> str:
        bucketbase = re.sub(
            r"""[^a-zA-Z0-9_.-]""",  # based on what the archive.org documentation says
            "_",
            filename.name.rsplit(".", maxsplit=1)[0],
        )
        if not bucketbase.startswith(f"{self.bucketprefix}-"):
            bucketbase = f"{self.bucketprefix}-" + bucketbase
        bucketname = bucketbase
        attempts = 5
        for attempt in range(attempts):
            LOG.debug("trying %s", bucketname)
            resp = requests.put(
                f"https://s3.us.archive.org/{bucketname}",
                headers=self.auth_headers,
            )
            try:
                resp.raise_for_status()
            except requests.HTTPError as exc:
                # XXX: check if this is a 403, in which case we must terminate immediately
                if attempt < attempts - 1:
                    bucketname = f"{bucketbase}-{random.randint(1000,9999)}"
                    continue
                else:
                    LOG.error("response was %s\n%s\n%s", resp, resp.headers, resp.text)
                    raise ValueError(f"could not find a good bucket for {filename}") from exc
            else:
                break
        LOG.info("Found good bucket: %s", bucketname)
        self.bucket = bucketname
        self.dl_url = f"https://archive.org/download/{self.bucket}/{filename.name}"
        return self.dl_url

    def copy(self, filename: Path) -> str:
        """
        destination_path is ignored
        """
        assert self.dl_url is not None
        upload_url = f"https://s3.us.archive.org/{self.bucket}/{filename.name}"

        # XXX: set some more headers based on file metadata (date, title, etc.)
        headers = {
            "x-archive-meta01-collection": "opensource",
            "x-archive-meta-language": "ita",
        }
        with filename.open("rb") as buf:
            resp = requests.put(
                upload_url,
                data=buf,
                headers={**headers, **self.auth_headers},
            )
        resp.raise_for_status()
        LOG.info("loaded on %s", self.dl_url)
        return self.dl_url


def get_todo(args) -> list[dict]:
    resp = requests.get(args.private_url + '/api/todo/archive.org').json()
    return resp["files"]


def loop(args):
    todos = get_todo(args)
    for file in todos:
        archived = process(file, args)
        if archived is not None:
            with session_pool() as conn:
                conn.add(archived)
                conn.commit()


def process(file, args):
    backend = ArchiveBackend(
        args.config['auth']['accesskey'],
        args.config['auth']['secretkey'],
    )
    print(file)

    real_file = Path(CONFIG['general']['files']) / file['filepath']
    print(real_file)
    if not real_file.exists():
        LOG.warning("File not found: %s", real_file)
        return None

    backend.reserve(real_file)
    url = backend.copy(real_file)

    archived = database.Archived(
        original_id=file['id'],
        link=url,
        archive='archive.org',
        archive_time=int(datetime.datetime.now().strftime('%s')),
        # all this is possibly wrong: we should look at the real metadata
        sha256=file['sha256'],
        size=file['size'],
        format=file['mime'],
    )
    return archived


def main():
    p = ArgumentParser()
    p.add_argument('--private-url', default='http://127.0.0.1:8000',
                   help='URL of the private web server providing the /api/todo endpoint')
    p.add_argument('--period', default=30, type=int)
    p.add_argument('--config', type=lambda fname: toml.load(open(fname)), required=True,
                   help='config specific to the archiveorg uploader. Not to be confused with CARICARI_CONFIG')
    args = p.parse_args()

    while True:
        loop(args)
        time.sleep(args.period)

if __name__ == '__main__':
    main()
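On the XXX note in ArchiveBackend.exists(): a possible shape for that check, as a sketch only. It assumes the item keeps the bucket's name and that archive.org serves a {bucket}_meta.xml file whose uploader element records the account that created the item; bucket_is_ours and our_email are illustrative names, not part of this commit.

import requests
import xml.etree.ElementTree as ET

def bucket_is_ours(bucket: str, our_email: str) -> bool:
    # probe the item's metadata file; a 404 means the bucket does not exist yet
    resp = requests.get(f"https://archive.org/download/{bucket}/{bucket}_meta.xml")
    if resp.status_code == 404:
        return False
    resp.raise_for_status()
    # compare the recorded uploader against our own account
    return ET.fromstring(resp.text).findtext("uploader") == our_email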
@@ -134,3 +134,18 @@ def list(request: Request):
    return templates.TemplateResponse(
        name="list.html", request=request, context=data
    )

@app.get("/api/todo/{archive}")
def todo(request: Request, archive: str):
    """
    This route will return items which still need to be archived.
    """
    files = []
    with session_pool() as conn:
        query = conn.query(database.Original).order_by(database.Original.id)
        for original in query:
            archived = [a for a in original.archived if a.archive == archive]
            if not archived:
                files.append(original.__dict__)
    data = {"files": files}
    return data
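For reference, this is the payload shape that get_todo() in archiveorg.py expects back from this endpoint, based on the fields process() reads; the concrete values below are made up for illustration:

{
    "files": [
        {
            "id": 1,                      # becomes original_id on the Archived row
            "filepath": "some/file.pdf",  # joined onto CONFIG["general"]["files"]
            "sha256": "...",
            "size": 12345,
            "mime": "application/pdf",
        },
    ],
}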
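Finally, a rough sketch of how the new pieces run together, using the options defined in main(): the script is committed as an executable, the config file is the archiveorg.toml added above, and the --private-url value shown is just the default (CARICARI_CONFIG, presumably read through get_config(), still has to point at the shared caricari configuration for the database and files paths).

./caricari/archiveorg.py --config archiveorg.toml --private-url http://127.0.0.1:8000

The uploader then polls /api/todo/archive.org every --period seconds (30 by default) and uploads whatever is not yet archived on archive.org.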