oloturia2pdf/oloturia2pdf.py

#!/usr/bin/env python3

from mastodon import Mastodon
import json
import datetime
import os.path
from reportlab.lib import pagesizes
import requests
import html2text
import pdfkit
import locale
import PyPDF2
import html

# Switch time formatting to Italian so that the strftime("%B") call in main()
# renders month names in Italian. locale.setlocale raises locale.Error when
# the it_IT.UTF-8 locale is not available on the system.
locale.setlocale(locale.LC_TIME, 'it_IT.UTF-8')


def copertina(text):
    """Build the cover page of a booklet.

    Draws ``text`` horizontally centred, 100 points above the bottom edge, in
    36 pt Roboto on an in-memory A5 canvas, then merges that overlay onto the
    first page of ``copertina.pdf``.

    Args:
        text: Caption to stamp on the cover.

    Returns:
        The first page of ``copertina.pdf`` (a PyPDF2 page object) with the
        caption merged onto it.

    Raises:
        OSError: If ``copertina.pdf`` cannot be opened.
    """
    import io
    from PyPDF2 import PdfFileReader
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import A5
    from reportlab.pdfbase import pdfmetrics
    from reportlab.pdfbase.ttfonts import TTFont
    from reportlab.pdfbase.pdfmetrics import stringWidth

    FONT = 'Roboto'
    SIZE = 36
    BASELINE_Y = 100

    pdfmetrics.registerFont(TTFont(FONT, "template/roboto-regular-webfont.ttf"))

    # Render the caption into an in-memory, single-page PDF.
    packet = io.BytesIO()
    can = canvas.Canvas(packet, pagesize=A5)
    can.setFont(FONT, SIZE)
    page_width = A5[0]
    text_width = stringWidth(text, FONT, SIZE)
    can.drawString((page_width - text_width) / 2, BASELINE_Y, text)
    can.save()

    packet.seek(0)
    overlay = PdfFileReader(packet)

    # Load the template into memory instead of keeping the file open: the
    # handle is released immediately, while the returned page can still pull
    # data from its reader when the caller eventually writes it out.
    with open("copertina.pdf", "rb") as template_file:
        template = PdfFileReader(io.BytesIO(template_file.read()))

    # Stamp the caption on top of the template's first page.
    page = template.getPage(0)
    page.mergePage(overlay.getPage(0))

    return page

def indice(text):
    """Render the booklet index and return it as exactly two PDF pages.

    Every line of ``text`` becomes one 12 pt Roboto paragraph on an A5 page.
    Spaces are turned into ``&nbsp;`` and each tab into four ``&nbsp;`` before
    the line is handed to reportlab.

    Args:
        text: Index content, one entry per line (the caller separates the
            entry number from its title with a tab).

    Returns:
        A two-element list of PyPDF2 page objects: pages 0 and 1 of a
        document made of the rendered index pages followed by one blank page.
        A one-page index therefore comes back with a trailing blank page, and
        index pages beyond the second are not returned.
    """
    import io
    from PyPDF2 import PdfFileWriter, PdfFileReader
    from reportlab.lib.pagesizes import A5
    from reportlab.lib.styles import ParagraphStyle
    from reportlab.pdfbase import pdfmetrics
    from reportlab.pdfbase.ttfonts import TTFont
    from reportlab.platypus import SimpleDocTemplate, Paragraph

    pdfmetrics.registerFont(TTFont("Roboto", "template/roboto-regular-webfont.ttf"))
    style = ParagraphStyle(
        name='Normal',
        fontName='Roboto',
        fontSize=12,
        leading=14,
        # Hanging indent: continuation lines are indented by 32 points.
        leftIndent=32,
        firstLineIndent=-32,
        spaceBefore=5,
    )

    parts = [
        Paragraph(
            line.replace(' ', '&nbsp;').replace('\t', '&nbsp;' * 4),
            style=style,
        )
        for line in text.splitlines()
    ]

    packet = io.BytesIO()
    doc = SimpleDocTemplate(
        packet,
        pagesize=A5,
        rightMargin=20,
        leftMargin=20,
        topMargin=40,
        bottomMargin=30,
    )
    doc.build(parts)

    reader = PdfFileReader(packet)
    writer = PdfFileWriter()
    for page in reader.pages:
        writer.addPage(page)
    # Guarantees that a second page exists even when the index fits on one.
    writer.addBlankPage()

    return [writer.getPage(0), writer.getPage(1)]


def _json_default(o):
    """Serialise dates as ISO 8601 strings; any other unknown type becomes null."""
    if isinstance(o, (datetime.date, datetime.datetime)):
        return o.isoformat()
    return None


def _vgo_number(vgo):
    """Return the post number: the first space-separated token of the post text."""
    return html2text.html2text(vgo['content']).split(' ')[0]


def _vgo_title(vgo):
    """Return the post title: what follows the last <p>, <br /> and </a> in the content."""
    title = vgo['content'].split("<p>")[-1].replace("</p>", "")
    title = title.split("<br />")[-1]
    title = title.split("</a>")[-1]
    return html.unescape(title).strip()


def _media_path(media):
    """Return the local path under ``media/`` for an attachment's preview image."""
    ext = os.path.splitext(media['preview_url'])[1]
    return os.path.join('media', str(media['id']) + ext)


def _first_page(pdf_name):
    """Return page 0 of ``pdf_name``, read through memory so no file handle stays open."""
    import io

    with open(pdf_name, 'rb') as pdf_file:
        reader = PyPDF2.PdfFileReader(io.BytesIO(pdf_file.read()))
    return reader.getPage(0)


def _download_posts(dump_name):
    """Fetch every local #vgo status posted by ``oloturia`` and dump them to ``dump_name``."""
    mastodon = Mastodon(api_base_url="https://mastodon.bida.im")
    all_vgos = []
    last_id = None

    while True:
        page = mastodon.timeline_hashtag("vgo", local=True, max_id=last_id)
        # Stop only when the server has nothing older to return. A page that
        # merely contains no posts by 'oloturia' must not end the pagination.
        if not page:
            break
        all_vgos += [
            {
                'id': s['id'],
                'uri': s['uri'],
                'content': s['content'],
                'replies_count': s['replies_count'],
                'created': s['created_at'],
                'reblogs': s['reblogs_count'],
                'favourites': s['favourites_count'],
                'media': s['media_attachments'],
            }
            for s in page
            if s['account']['username'] == 'oloturia'
        ]
        last_id = page[-1]['id']

    with open(dump_name, 'w') as json_file:
        json.dump(all_vgos, json_file, indent=4, default=_json_default)


def _download_media(all_vgos):
    """Download missing preview images and return a ``{number: title}`` dict of the posts."""
    os.makedirs('media', exist_ok=True)
    vgo_dict = {}

    for vgo in all_vgos:
        vgo_dict[_vgo_number(vgo)] = _vgo_title(vgo)

        for media in vgo['media']:
            img_name = _media_path(media)
            if os.path.isfile(img_name):
                continue
            print(img_name)
            img_data = requests.get(media['preview_url']).content
            with open(img_name, 'wb') as handler:
                handler.write(img_data)

    return vgo_dict


def _render_pdfs(all_vgos, vgo_dict):
    """Render each post that has no PDF yet to ``pdf/<number>.pdf`` through wkhtmltopdf."""
    with open('template.html') as html_file:
        html_base = html_file.read()
    with open('mediagallery.html') as html_file:
        html_mediagallery = html_file.read()

    # styles[k - 1][i] is the CSS for the i-th image of a post with k images.
    styles = [
        ["inset: auto; width: 100%; height: 100%;"],
        ["inset: auto 2px auto auto; width: 50%; height: 100%;","inset: auto auto auto 2px; width: 50%; height: 100%;"],
        ["inset: auto 2px auto auto; width: 50%; height: 100%;","inset: auto auto 2px 2px; width: 50%; height: 50%;","inset: 2px auto auto 2px; width: 50%; height: 50%;"],
        ["inset: auto 2px 2px auto; width: 50%; height: 50%;","inset: auto auto 2px 2px; width: 50%; height: 50%;","inset: 2px 2px auto auto; width: 50%; height: 50%;","inset: 2px auto auto 2px; width: 50%; height: 50%;"]
    ]
    options = {
        'enable-local-file-access': None,
        'page-size': 'A5',
        'margin-top': '0.5cm',
        'margin-right': '0.5cm',
        'margin-bottom': '0.5cm',
        'margin-left': '0.5cm',
        'encoding': "UTF-8",
        'quiet': ''
    }
    html_name = 'oloturia.html'

    os.makedirs('pdf', exist_ok=True)
    for vgo in all_vgos:
        vgo_num = _vgo_number(vgo)
        pdf_name = os.path.join('pdf', vgo_num + '.pdf')
        if os.path.isfile(pdf_name):
            continue

        print(vgo_num + ' - ' + vgo_dict[vgo_num])

        media_tot = len(vgo['media'])
        sizes = "622px" if media_tot == 1 else "311px"
        # Reference images by the same path they were downloaded to, i.e. the
        # one derived from preview_url, so the extension always matches.
        mediagallery_tot = ''.join(
            html_mediagallery
            .replace("[media]", _media_path(media))
            .replace("[style]", styles[media_tot - 1][media_num])
            .replace("[sizes]", sizes)
            for media_num, media in enumerate(vgo['media'])
        )

        created = datetime.datetime.fromisoformat(vgo['created'])
        content = (
            html_base
            .replace("[content]", vgo['content'])
            # NOTE: "%-d" (day without zero padding) is a platform-specific
            # strftime extension.
            .replace("[date]", created.strftime("%-d %B %Y, %H:%M"))
            .replace("[reply]", str(vgo['replies_count']))
            .replace("[reblogs]", str(vgo['reblogs']))
            .replace("[favourites]", str(vgo['favourites']))
            .replace("[mediagallery]", mediagallery_tot)
        )

        # Must agree with the 'encoding' option handed to wkhtmltopdf above.
        with open(html_name, 'w', encoding='utf-8') as handler:
            handler.write(content)

        try:
            pdfkit.from_file(html_name, pdf_name, options=options)
        except Exception as err:
            # Best effort, as before: one failing post must not abort the
            # whole batch, but the failure is now reported instead of hidden.
            print(f"Attenzione: generazione di {pdf_name} non riuscita: {err}")
        finally:
            os.remove(html_name)


def _build_books(vgo_dict):
    """Assemble booklets of up to 50 posts each into ``books/vgo_NN.pdf``."""
    # Pages every booklet has besides the posts: cover, blank page after the
    # cover, two index pages and the back cover.
    fixed_pages = 5
    total = len(vgo_dict)

    os.makedirs('books', exist_ok=True)
    # "total + 1" so that a last booklet holding a single post (total equal
    # to 1 modulo 50) is generated as well.
    for pagstart in range(1, total + 1, 50):
        book_num = pagstart // 50 + 1
        pagend = min(book_num * 50, total)

        book_name = os.path.join('books', 'vgo_' + str(book_num).zfill(2) + '.pdf')
        if os.path.isfile(book_name):
            continue

        pdfWriter = PyPDF2.PdfFileWriter()
        print(book_num)

        # Cover, followed by a blank page.
        pdfWriter.addPage(copertina(str(pagstart).zfill(3) + " - " + str(pagend).zfill(3)))
        pdfWriter.addBlankPage()

        index_lines = []
        content_pages = 0
        for vgo_num in (str(x).zfill(3) for x in range(pagstart, pagend + 1)):
            if vgo_num not in vgo_dict:
                continue
            # The entry is listed in the index even if its PDF turns out to
            # be unreadable, as before.
            index_lines.append(vgo_num + "\t" + vgo_dict[vgo_num] + "\n")

            pdf_name = os.path.join('pdf', vgo_num + '.pdf')
            try:
                pdfWriter.addPage(_first_page(pdf_name))
            except Exception as err:
                # Missing or corrupt per-post PDF: skip it, but say so.
                print(f"Attenzione: impossibile leggere {pdf_name}: {err}")
                continue
            content_pages += 1

        # Pad with blanks so the booklet's page count is a multiple of 8,
        # counting the pages actually added; no padding when already aligned.
        for _ in range(-(content_pages + fixed_pages) % 8):
            pdfWriter.addBlankPage()

        # Index pages.
        for indpag in indice("".join(index_lines)):
            pdfWriter.addPage(indpag)

        # Back cover.
        pdfWriter.addPage(_first_page("quarta.pdf"))

        with open(book_name, 'wb') as pdfOutput:
            pdfWriter.write(pdfOutput)


def main():
    """Run the whole pipeline: dump posts, fetch images, render PDFs, build booklets.

    Steps, each skipping work whose output file already exists:

    1. Download the posts into ``oloturiadump.json`` (only if it is missing).
    2. Download the preview image of every attachment into ``media/``.
    3. Render one PDF per post into ``pdf/`` from ``template.html`` and
       ``mediagallery.html``.
    4. Assemble booklets of up to 50 posts into ``books/``, using
       ``copertina.pdf`` as the front cover and ``quarta.pdf`` as the back
       cover.
    """
    # Download all posts from Mastodon.
    print("Scarico i post")
    if not os.path.isfile('oloturiadump.json'):
        _download_posts('oloturiadump.json')

    # Download all images.
    print("Scarico le immagini")
    with open('oloturiadump.json') as json_file:
        all_vgos = json.load(json_file)
    vgo_dict = _download_media(all_vgos)

    # Generate the per-post PDFs.
    print("Genero i PDF")
    _render_pdfs(all_vgos, vgo_dict)

    # Generate the booklets.
    print("Genero i libretti")
    _build_books(vgo_dict)


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()