oloturia2pdf/oloturia2pdf.py

321 lines
11 KiB
Python
Raw Normal View History

2021-08-17 10:45:53 +02:00
#!/usr/bin/env python3
from mastodon import Mastodon
import json
import datetime
import os.path
2021-08-19 10:33:31 +02:00
from reportlab.lib import pagesizes
2021-08-17 10:45:53 +02:00
import requests
import html2text
import pdfkit
import locale
import PyPDF2
2023-03-27 22:43:36 +02:00
import html
2021-08-17 10:45:53 +02:00
# Format dates with Italian month names: main() renders post dates with
# strftime("%B"), which follows the LC_TIME category set here.
# NOTE(review): this runs at import time and needs the it_IT.UTF-8 locale to
# be installed on the host -- confirm for the deployment machine.
locale.setlocale(locale.LC_TIME, 'it_IT.UTF-8')
2021-08-19 10:33:31 +02:00
def copertina(text):
    """Build the cover page of a booklet with *text* stamped on it.

    The first page of ``copertina.pdf`` is used as the background. *text* is
    drawn on top of it in 36 pt Roboto, horizontally centred on an A5 page,
    with its baseline 100 points above the bottom edge.

    Args:
        text: Label to print on the cover. The caller passes the range of
            issue numbers contained in the booklet, e.g. ``"001 - 050"``.

    Returns:
        The merged PyPDF2 page object, ready to be added to a
        ``PdfFileWriter``.
    """
    import io

    from PyPDF2 import PdfFileReader
    from reportlab.lib.pagesizes import A5
    from reportlab.pdfbase import pdfmetrics
    from reportlab.pdfbase.pdfmetrics import stringWidth
    from reportlab.pdfbase.ttfonts import TTFont
    from reportlab.pdfgen import canvas

    FONT = 'Roboto'
    SIZE = 36
    PAGE_WIDTH = A5[0]

    pdfmetrics.registerFont(TTFont(FONT, "template/roboto-regular-webfont.ttf"))

    # Draw the label on an otherwise empty, in-memory A5 page.
    packet = io.BytesIO()
    can = canvas.Canvas(packet, pagesize=A5)
    can.setFont(FONT, SIZE)
    text_width = stringWidth(text, FONT, SIZE)
    can.drawString((PAGE_WIDTH - text_width) / 2, 100, text)
    can.save()
    packet.seek(0)
    overlay = PdfFileReader(packet)

    # Load the template into memory so the file handle can be closed right
    # away: the returned page is written out much later by the caller, and the
    # reader must still be able to resolve the page's objects at that point.
    with open("copertina.pdf", "rb") as template_file:
        template = PdfFileReader(io.BytesIO(template_file.read()))

    # Stamp the label (the "watermark") on top of the template page.
    page = template.getPage(0)
    page.mergePage(overlay.getPage(0))
    return page
2021-08-19 13:40:39 +02:00
def indice(text):
    """Render the booklet index as A5 PDF pages.

    Every line of *text* becomes one paragraph set in 12 pt Roboto with a
    32 pt hanging indent. The paragraphs are laid out on A5 pages and one
    blank page is appended after the rendered index.

    Args:
        text: Index contents, one entry per line. The caller builds each line
            as the issue number, a tab character and the issue title.

    Returns:
        A list with exactly two PyPDF2 page objects: the first two pages of
        the rendered index followed by the blank page. A one-page index is
        therefore returned together with the blank page, and any index page
        after the second one is dropped.
    """
    import io
    from xml.sax.saxutils import escape

    from PyPDF2 import PdfFileReader, PdfFileWriter
    from reportlab.lib.pagesizes import A5
    from reportlab.lib.styles import ParagraphStyle
    from reportlab.pdfbase import pdfmetrics
    from reportlab.pdfbase.ttfonts import TTFont
    from reportlab.platypus import Paragraph, SimpleDocTemplate

    pdfmetrics.registerFont(TTFont("Roboto", "template/roboto-regular-webfont.ttf"))
    style = ParagraphStyle(
        name='Normal',
        fontName='Roboto',
        fontSize=12,
        leading=14,
        leftIndent=32,
        firstLineIndent=-32,
        spaceBefore=5,
    )

    parts = []
    for ro in text.splitlines():
        # Paragraph text is parsed as XML-like markup, while titles are plain
        # text that may contain '&', '<' or '>': escape them first so they are
        # printed literally instead of confusing the markup parser.
        ro = escape(ro)
        # NOTE(review): as seen here the first replacement maps a space to a
        # space; the replacement text may originally have been a non-breaking
        # space that was lost in transit -- confirm against the repository.
        ro = ro.replace(' ',' ')
        ro = ro.replace('\t','    ')
        parts.append(Paragraph(ro, style=style))

    packet = io.BytesIO()
    doc = SimpleDocTemplate(
        packet,
        pagesize=A5,
        rightMargin=20,
        leftMargin=20,
        topMargin=40,
        bottomMargin=30,
    )
    doc.build(parts)

    # Copy the rendered pages into a writer so that a blank page can follow.
    pdfReader = PdfFileReader(packet)
    pdfWriter = PdfFileWriter()
    for page in pdfReader.pages:
        pdfWriter.addPage(page)
    pdfWriter.addBlankPage()
    return [pdfWriter.getPage(0), pdfWriter.getPage(1)]
2021-08-19 13:40:39 +02:00
2021-08-19 10:33:31 +02:00
2021-08-19 10:58:31 +02:00
# CSS placement of each picture in the media gallery, indexed first by
# (number of attachments - 1) and then by the position of the attachment.
_GALLERY_STYLES = [
    ["inset: auto; width: 100%; height: 100%;"],
    ["inset: auto 2px auto auto; width: 50%; height: 100%;","inset: auto auto auto 2px; width: 50%; height: 100%;"],
    ["inset: auto 2px auto auto; width: 50%; height: 100%;","inset: auto auto 2px 2px; width: 50%; height: 50%;","inset: 2px auto auto 2px; width: 50%; height: 50%;"],
    ["inset: auto 2px 2px auto; width: 50%; height: 50%;","inset: auto auto 2px 2px; width: 50%; height: 50%;","inset: 2px 2px auto auto; width: 50%; height: 50%;","inset: 2px auto auto 2px; width: 50%; height: 50%;"]
]


def _json_default(o):
    """Serialise dates and datetimes as ISO 8601 strings for ``json.dump``.

    Any other unsupported object yields ``None`` (written as JSON ``null``).
    """
    if isinstance(o, (datetime.date, datetime.datetime)):
        return o.isoformat()
    return None


def _vgo_number(vgo):
    """Return the issue number: the first space-separated token of the post text."""
    return html2text.html2text(vgo['content']).split(' ')[0]


def _vgo_title(vgo):
    """Return the issue title: the text after the last paragraph, line break and link."""
    title = vgo['content'].split("<p>")[-1].replace("</p>", "")
    title = title.split("<br />")[-1]
    title = title.split("</a>")[-1]
    return html.unescape(title).strip()


def _image_path(media):
    """Return the local path under ``media/`` where an attachment preview is stored."""
    ext = os.path.splitext(media['preview_url'])[1]
    return os.path.join('media', str(media['id']) + ext)


def _download_posts(dump_name):
    """Fetch every #vgo post by ``oloturia`` and dump them as JSON to *dump_name*."""
    mastodon = Mastodon(api_base_url="https://mastodon.bida.im")
    all_vgos = []
    last_id = None

    while True:
        page = mastodon.timeline_hashtag("vgo", local=True, max_id=last_id)
        # Stop only when the server has nothing older to return. A page that
        # merely contains no posts by oloturia must not end the pagination.
        if not page:
            break
        all_vgos.extend(
            {
                'id': s['id'],
                'uri': s['uri'],
                'content': s['content'],
                'replies_count': s['replies_count'],
                'created': s['created_at'],
                'reblogs': s['reblogs_count'],
                'favourites': s['favourites_count'],
                'media': s['media_attachments'],
            }
            for s in page
            if s['account']['username'] == 'oloturia'
        )
        last_id = page[-1]['id']

    with open(dump_name, 'w') as json_file:
        json.dump(all_vgos, json_file, indent=4, default=_json_default)


def _download_images(all_vgos):
    """Download missing attachment previews and return ``{number: title}`` for all posts."""
    os.makedirs('media', exist_ok=True)
    vgo_dict = {}

    for vgo in all_vgos:
        vgo_dict[_vgo_number(vgo)] = _vgo_title(vgo)
        for media in vgo['media']:
            img_name = _image_path(media)
            if os.path.isfile(img_name):
                continue
            print(img_name)
            try:
                response = requests.get(media['preview_url'], timeout=60)
                response.raise_for_status()
            except requests.RequestException as err:
                # Do not store an error page as if it were the picture: leave
                # the file missing so that the next run tries again.
                print(f"Impossibile scaricare {media['preview_url']}: {err}")
                continue
            with open(img_name, 'wb') as handler:
                handler.write(response.content)

    return vgo_dict


def _render_pdfs(all_vgos):
    """Render every post that has no PDF yet to ``pdf/<number>.pdf`` via wkhtmltopdf."""
    with open('template.html', encoding='utf-8') as html_file:
        html_base = html_file.read()
    with open('mediagallery.html', encoding='utf-8') as html_file:
        html_mediagallery = html_file.read()

    print("Genero i PDF")
    os.makedirs('pdf', exist_ok=True)

    html_name = 'oloturia.html'
    options = {
        'enable-local-file-access': None,
        'page-size': 'A5',
        'margin-top': '0.5cm',
        'margin-right': '0.5cm',
        'margin-bottom': '0.5cm',
        'margin-left': '0.5cm',
        'encoding': "UTF-8",
        'quiet': ''
    }

    for vgo in all_vgos:
        vgo_num = _vgo_number(vgo)
        pdf_name = os.path.join('pdf', vgo_num + '.pdf')
        if os.path.isfile(pdf_name):
            continue
        print(vgo_num + ' - ' + _vgo_title(vgo))

        media_tot = len(vgo['media'])
        sizes = "622px" if media_tot == 1 else "311px"
        mediagallery_tot = ''
        for media_num, media in enumerate(vgo['media']):
            mediagallery = html_mediagallery
            # Reference the file exactly as _download_images() saved it.
            mediagallery = mediagallery.replace("[media]", _image_path(media))
            mediagallery = mediagallery.replace("[style]", _GALLERY_STYLES[media_tot - 1][media_num])
            mediagallery = mediagallery.replace("[sizes]", sizes)
            mediagallery_tot += mediagallery

        content = html_base
        content = content.replace("[content]", vgo['content'])
        content = content.replace("[date]", datetime.datetime.fromisoformat(vgo['created']).strftime("%-d %B %Y, %H:%M"))
        content = content.replace("[reply]", str(vgo['replies_count']))
        content = content.replace("[reblogs]", str(vgo['reblogs']))
        content = content.replace("[favourites]", str(vgo['favourites']))
        content = content.replace("[mediagallery]", mediagallery_tot)

        # The page is declared as UTF-8 to wkhtmltopdf, so write it as UTF-8
        # regardless of the platform default encoding.
        with open(html_name, 'w', encoding='utf-8') as handler:
            handler.write(content)
        try:
            pdfkit.from_file(html_name, pdf_name, options=options)
        except Exception as err:
            # Best effort: report the problem and carry on with the next post.
            print(f"Errore durante la generazione di {pdf_name}: {err}")
        finally:
            os.remove(html_name)


def _build_booklets(vgo_dict):
    """Assemble the per-post PDFs into booklets of up to 50 issues under ``books/``."""
    import contextlib

    os.makedirs('books', exist_ok=True)
    total = len(vgo_dict)

    # "+ 1" so that a trailing booklet holding a single issue is not skipped.
    for pagstart in range(1, total + 1, 50):
        book_num = pagstart // 50 + 1
        pagend = min(book_num * 50, total)

        book_name = os.path.join('books', 'vgo_' + str(book_num).zfill(2) + '.pdf')
        if os.path.isfile(book_name):
            continue
        print(book_num)

        pdfWriter = PyPDF2.PdfFileWriter()
        # Source PDFs must stay open until the booklet has been written; the
        # stack closes all of them afterwards, even on errors.
        with contextlib.ExitStack() as stack:
            # Cover, followed by a blank page.
            pdfWriter.addPage(copertina(str(pagstart).zfill(3) + " - " + str(pagend).zfill(3)))
            pdfWriter.addBlankPage()

            index_lines = []
            for number in range(pagstart, pagend + 1):
                vgo_num = str(number).zfill(3)
                if vgo_num not in vgo_dict:
                    print(f"Numero {vgo_num} mancante")
                    continue
                index_lines.append(vgo_num + "\t" + vgo_dict[vgo_num])
                pdf_name = os.path.join('pdf', vgo_num + '.pdf')
                try:
                    pdfFileObj = stack.enter_context(open(pdf_name, 'rb'))
                    pdfWriter.addPage(PyPDF2.PdfFileReader(pdfFileObj).getPage(0))
                except Exception as err:
                    # Best effort: the issue stays in the index but its page
                    # is left out of the booklet.
                    print(f"Pagina {pdf_name} non aggiunta: {err}")

            # Pad with blank pages so that the finished booklet (what is in
            # the writer now, plus two index pages and the back cover) has a
            # page count that is a multiple of 8.
            trailing_pages = 3
            padding = (-(pdfWriter.getNumPages() + trailing_pages)) % 8
            for _ in range(padding):
                pdfWriter.addBlankPage()

            # Index.
            for indpag in indice("\n".join(index_lines)):
                pdfWriter.addPage(indpag)

            # Back cover.
            pdfFileObj = stack.enter_context(open("quarta.pdf", 'rb'))
            pdfWriter.addPage(PyPDF2.PdfFileReader(pdfFileObj).getPage(0))

            with open(book_name, 'wb') as pdfOutput:
                pdfWriter.write(pdfOutput)


def main():
    """Download the #vgo posts and turn them into printable A5 booklets.

    The pipeline has four stages; each one skips work whose output file is
    already on disk, so the script can be re-run after an interruption:

    1. download the posts into ``oloturiadump.json``;
    2. download the attachment previews into ``media/``;
    3. render one PDF per post into ``pdf/``;
    4. bind the PDFs, 50 issues at a time, into ``books/vgo_NN.pdf``.
    """
    dump_name = 'oloturiadump.json'

    # Download all the posts from Mastodon.
    print("Scarico i post")
    if not os.path.isfile(dump_name):
        _download_posts(dump_name)

    # Download all the images.
    print("Scarico le immagini")
    with open(dump_name) as json_file:
        all_vgos = json.load(json_file)
    vgo_dict = _download_images(all_vgos)

    # Generate the PDFs.
    _render_pdfs(all_vgos)

    # Generate the booklets.
    print("Genero i libretti")
    _build_booklets(vgo_dict)
2021-08-19 10:33:31 +02:00
2021-08-19 10:58:31 +02:00
# Run the whole pipeline only when the file is executed as a script.
if __name__ == "__main__":
    main()
2021-08-19 10:33:31 +02:00