oloturia2pdf.py 10 KB


  1. #!/usr/bin/env python3
  2. from mastodon import Mastodon
  3. import json
  4. import datetime
  5. import os.path
  6. from reportlab.lib import pagesizes
  7. import requests
  8. import html2text
  9. import pdfkit
  10. import locale
  11. import PyPDF2
# Switch date/time formatting to Italian so that strftime("%B") in main()
# produces Italian month names. Raises locale.Error at import time if the
# 'it_IT.UTF-8' locale is not installed on the system.
locale.setlocale(locale.LC_TIME, 'it_IT.UTF-8')
  13. def copertina(text):
  14. from PyPDF2 import PdfFileWriter, PdfFileReader
  15. import io
  16. from reportlab.pdfgen import canvas
  17. from reportlab.lib.pagesizes import A5
  18. from reportlab.pdfbase import pdfmetrics
  19. from reportlab.pdfbase.ttfonts import TTFont
  20. from reportlab.pdfbase.pdfmetrics import stringWidth
  21. FONT = 'Roboto'
  22. SIZE = 36
  23. packet = io.BytesIO()
  24. # create a new PDF with Reportlab
  25. pdfmetrics.registerFont(TTFont("Roboto", "template/roboto-regular-webfont.ttf"))
  26. can = canvas.Canvas(packet, pagesize=A5)
  27. can.setFont(FONT, SIZE)
  28. PAGE_WIDTH = A5[0]
  29. text_width = stringWidth(text,FONT, SIZE)
  30. can.drawString((PAGE_WIDTH - text_width) / 2, 100, text)
  31. can.save()
  32. #move to the beginning of the StringIO buffer
  33. packet.seek(0)
  34. new_pdf = PdfFileReader(packet)
  35. # read your existing PDF
  36. existing_pdf = PdfFileReader(open("copertina.pdf", "rb"))
  37. output = PdfFileWriter()
  38. # add the "watermark" (which is the new pdf) on the existing page
  39. page = existing_pdf.getPage(0)
  40. page.mergePage(new_pdf.getPage(0))
  41. output.addPage(page)
  42. return(page)
  43. def indice(text):
  44. # PDF GENERATION LIBRARIES
  45. # import the report lab PDF generation tools
  46. from reportlab.lib.pagesizes import letter
  47. from reportlab.lib.styles import ParagraphStyle
  48. from reportlab.lib.units import inch
  49. from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
  50. from reportlab.pdfbase import pdfmetrics
  51. from reportlab.pdfbase.ttfonts import TTFont
  52. from reportlab.lib.pagesizes import A5
  53. import io
  54. from PyPDF2 import PdfFileWriter, PdfFileReader
  55. parts = []
  56. pdfmetrics.registerFont(TTFont("Roboto", "template/roboto-regular-webfont.ttf"))
  57. style = ParagraphStyle(
  58. name='Normal',
  59. fontName='Roboto',
  60. fontSize=12,
  61. leading = 14,
  62. leftIndent=32,
  63. firstLineIndent=-32,
  64. spaceBefore=5
  65. )
  66. for ro in text.splitlines():
  67. ro = ro.replace(' ',' ')
  68. ro = ro.replace('\t','    ')
  69. parts.append(Paragraph(ro, style = style))
  70. packet = io.BytesIO()
  71. doc = SimpleDocTemplate(packet,
  72. pagesize=A5,
  73. rightMargin=20,
  74. leftMargin=20,
  75. topMargin=40,
  76. bottomMargin=30)
  77. doc.build(parts)
  78. pdfReader = PdfFileReader(packet)
  79. pdfWriter = PdfFileWriter()
  80. for page in pdfReader.pages:
  81. pdfWriter.addPage(page)
  82. pdfWriter.addBlankPage()
  83. return([pdfWriter.getPage(0),pdfWriter.getPage(1)])
  84. def main():
  85. # Scarica tutti i post da Mastodon
  86. print("Scarico i post")
  87. def default(o):
  88. if isinstance(o, (datetime.date, datetime.datetime)):
  89. return o.isoformat()
  90. if not os.path.isfile('oloturiadump.json'):
  91. mastodon = Mastodon(api_base_url = "https://mastodon.bida.im")
  92. all_vgos = []
  93. last_id = None
  94. while True:
  95. statuses = list(filter(lambda s: s['account']['username'] == 'oloturia', mastodon.timeline_hashtag("vgo", local=True, max_id=last_id)))
  96. if not statuses:
  97. break
  98. all_vgos += list(map(
  99. lambda s: {
  100. 'id': s['id'],
  101. 'uri': s['uri'],
  102. 'content': s['content'],
  103. 'replies_count': s['replies_count'],
  104. #'replies': mastodon.status_context(s['id']) if s['replies_count'] > 0 else [],
  105. 'created': s['created_at'],
  106. 'reblogs': s['reblogs_count'],
  107. 'favourites': s['favourites_count'],
  108. 'media': s['media_attachments']
  109. }
  110. , statuses))
  111. last_id = statuses[-1]['id']
  112. #print(all_vgos)
  113. #print(json.dumps(all_vgos, default=default))
  114. with open('oloturiadump.json', 'w') as json_file:
  115. json.dump(all_vgos, json_file, indent=4, default=default)
  116. # Scarica tutte le immagini
  117. print("Scarico le immagini")
  118. with open('oloturiadump.json') as json_file:
  119. all_vgos = json.load(json_file)
  120. os.makedirs('media', exist_ok=True)
  121. vgo_dict={}
  122. for vgo in all_vgos:
  123. vgo_num = html2text.html2text(vgo['content']).split(' ')[0]
  124. vgo_name = os.linesep.join([s for s in html2text.html2text(vgo['content']).splitlines() if s]).splitlines()[-1]
  125. if len(vgo_name) < 10:
  126. vgo_name = [s for s in html2text.html2text(vgo['content']).split("\n\n") if s][-1].replace("\n"," ")
  127. #print(vgo_num +' - '+ vgo_name)
  128. #print(str(vgo['id']) +' '+ vgo['uri'])
  129. vgo_dict[vgo_num] = vgo_name
  130. for media in vgo['media']:
  131. #print(str(media['id']) +' '+ media['url'])
  132. ext = os.path.splitext(media['preview_url'])[1]
  133. img_name = os.path.join('media',str(media['id']) + ext)
  134. if not os.path.isfile(img_name):
  135. print(img_name)
  136. img_data = requests.get(media['preview_url']).content
  137. with open(img_name, 'wb') as handler:
  138. handler.write(img_data)
  139. with open('template.html') as html_file:
  140. html_base = html_file.read()
  141. with open('mediagallery.html') as html_file:
  142. html_mediagallery = html_file.read()
  143. # Genera i PDF
  144. print("Genero i PDF")
  145. os.makedirs('pdf', exist_ok=True)
  146. for vgo in all_vgos:
  147. vgo_num = html2text.html2text(vgo['content']).split(' ')[0]
  148. vgo_name = os.linesep.join([s for s in html2text.html2text(vgo['content']).splitlines() if s]).splitlines()[-1]
  149. html_name = 'oloturia.html'
  150. pdf_name = os.path.join('pdf', vgo_num + '.pdf')
  151. if not os.path.isfile(pdf_name):
  152. print(vgo_num +' - '+ vgo_name)
  153. media_num = 0
  154. mediagallery_tot = ''
  155. media_tot = len(vgo['media'])
  156. sizes = "622px" if media_tot == 1 else "311px"
  157. style = [
  158. ["inset: auto; width: 100%; height: 100%;"],
  159. ["inset: auto 2px auto auto; width: 50%; height: 100%;","inset: auto auto auto 2px; width: 50%; height: 100%;"],
  160. ["inset: auto 2px auto auto; width: 50%; height: 100%;","inset: auto auto 2px 2px; width: 50%; height: 50%;","inset: 2px auto auto 2px; width: 50%; height: 50%;"],
  161. ["inset: auto 2px 2px auto; width: 50%; height: 50%;","inset: auto auto 2px 2px; width: 50%; height: 50%;","inset: 2px 2px auto auto; width: 50%; height: 50%;","inset: 2px auto auto 2px; width: 50%; height: 50%;"]
  162. ]
  163. for media in vgo['media']:
  164. mediagallery = html_mediagallery
  165. ext = os.path.splitext(media['url'])[1]
  166. img_name = os.path.join('media',str(media['id']) + ext)
  167. mediagallery = mediagallery.replace("[media]", img_name)
  168. mediagallery = mediagallery.replace("[style]", style[media_tot-1][media_num])
  169. mediagallery = mediagallery.replace("[sizes]", sizes)
  170. mediagallery_tot = mediagallery_tot + mediagallery
  171. media_num = media_num + 1
  172. content = html_base
  173. content = content.replace("[content]", vgo['content'])
  174. content = content.replace("[date]", datetime.datetime.fromisoformat(vgo['created']).strftime("%-d %B %Y, %H:%M"))
  175. content = content.replace("[reply]", str(vgo['replies_count']))
  176. content = content.replace("[reblogs]", str(vgo['reblogs']))
  177. content = content.replace("[favourites]", str(vgo['favourites']))
  178. content = content.replace("[mediagallery]", mediagallery_tot)
  179. with open(html_name, 'w') as handler:
  180. handler.write(content)
  181. options = {
  182. 'page-size': 'A5',
  183. 'margin-top': '0.5cm',
  184. 'margin-right': '0.5cm',
  185. 'margin-bottom': '0.5cm',
  186. 'margin-left': '0.5cm',
  187. 'encoding': "UTF-8",
  188. 'quiet': ''
  189. }
  190. try:
  191. pdfkit.from_file(html_name, pdf_name, options=options)
  192. except:
  193. pass
  194. os.remove(html_name)
  195. # Genera i libretti
  196. print("Genero i libretti")
  197. os.makedirs('books', exist_ok=True)
  198. for pagstart in range(1, len(vgo_dict), 50):
  199. pdfWriter = PyPDF2.PdfFileWriter()
  200. book_num = int(pagstart / 50) + 1
  201. pagend = min(book_num * 50, len(vgo_dict))
  202. print(book_num)
  203. # aggiungere copertina
  204. pdfWriter.addPage(copertina(str(pagstart).zfill(3) + " - " + str(pagend).zfill(3)))
  205. pdfWriter.addBlankPage()
  206. indtext = ""
  207. for vgo_num in [str(x).zfill(3) for x in range(pagstart, pagend + 1)]:
  208. pdf_name = os.path.join('pdf', vgo_num + '.pdf')
  209. try:
  210. #print(vgo_num + " - " + vgo_dict[vgo_num])
  211. indtext = indtext + vgo_num + "\t" + vgo_dict[vgo_num] + "\n"
  212. pdfFileObj = open(pdf_name, 'rb')
  213. pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
  214. pageObj = pdfReader.getPage(0)
  215. pdfWriter.addPage(pageObj)
  216. except:
  217. pass
  218. for i in range(0, 8 - ((((pagend - 1) % 50) + 1 + 5) % 8)):
  219. pdfWriter.addBlankPage()
  220. # aggiungere indice
  221. for indpag in indice(indtext):
  222. pdfWriter.addPage(indpag)
  223. #Aggiungere pagina finale
  224. #pdfWriter.addBlankPage()
  225. pdfFileObj = open("quarta.pdf", 'rb')
  226. pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
  227. pageObj = pdfReader.getPage(0)
  228. pdfWriter.addPage(pageObj)
  229. book_name = os.path.join('books', 'vgo_' + str(book_num).zfill(2) + '.pdf')
  230. with open(book_name, 'wb') as pdfOutput:
  231. pdfWriter.write(pdfOutput)
# Run the pipeline only when the file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()