oloturia2pdf.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320
  1. #!/usr/bin/env python3
  2. from mastodon import Mastodon
  3. import json
  4. import datetime
  5. import os.path
  6. from reportlab.lib import pagesizes
  7. import requests
  8. import html2text
  9. import pdfkit
  10. import locale
  11. import PyPDF2
  12. import html
# Switch time formatting to Italian so that month names produced by
# strftime("%B") in main() come out in Italian.
locale.setlocale(locale.LC_TIME, 'it_IT.UTF-8')
  14. def copertina(text):
  15. from PyPDF2 import PdfFileWriter, PdfFileReader
  16. import io
  17. from reportlab.pdfgen import canvas
  18. from reportlab.lib.pagesizes import A5
  19. from reportlab.pdfbase import pdfmetrics
  20. from reportlab.pdfbase.ttfonts import TTFont
  21. from reportlab.pdfbase.pdfmetrics import stringWidth
  22. FONT = 'Roboto'
  23. SIZE = 36
  24. packet = io.BytesIO()
  25. # create a new PDF with Reportlab
  26. pdfmetrics.registerFont(TTFont("Roboto", "template/roboto-regular-webfont.ttf"))
  27. can = canvas.Canvas(packet, pagesize=A5)
  28. can.setFont(FONT, SIZE)
  29. PAGE_WIDTH = A5[0]
  30. text_width = stringWidth(text,FONT, SIZE)
  31. can.drawString((PAGE_WIDTH - text_width) / 2, 100, text)
  32. can.save()
  33. #move to the beginning of the StringIO buffer
  34. packet.seek(0)
  35. new_pdf = PdfFileReader(packet)
  36. # read your existing PDF
  37. existing_pdf = PdfFileReader(open("copertina.pdf", "rb"))
  38. output = PdfFileWriter()
  39. # add the "watermark" (which is the new pdf) on the existing page
  40. page = existing_pdf.getPage(0)
  41. page.mergePage(new_pdf.getPage(0))
  42. output.addPage(page)
  43. return(page)
  44. def indice(text):
  45. # PDF GENERATION LIBRARIES
  46. # import the report lab PDF generation tools
  47. from reportlab.lib.pagesizes import letter
  48. from reportlab.lib.styles import ParagraphStyle
  49. from reportlab.lib.units import inch
  50. from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
  51. from reportlab.pdfbase import pdfmetrics
  52. from reportlab.pdfbase.ttfonts import TTFont
  53. from reportlab.lib.pagesizes import A5
  54. import io
  55. from PyPDF2 import PdfFileWriter, PdfFileReader
  56. parts = []
  57. pdfmetrics.registerFont(TTFont("Roboto", "template/roboto-regular-webfont.ttf"))
  58. style = ParagraphStyle(
  59. name='Normal',
  60. fontName='Roboto',
  61. fontSize=12,
  62. leading = 14,
  63. leftIndent=32,
  64. firstLineIndent=-32,
  65. spaceBefore=5
  66. )
  67. for ro in text.splitlines():
  68. ro = ro.replace(' ',' ')
  69. ro = ro.replace('\t','    ')
  70. parts.append(Paragraph(ro, style = style))
  71. packet = io.BytesIO()
  72. doc = SimpleDocTemplate(packet,
  73. pagesize=A5,
  74. rightMargin=20,
  75. leftMargin=20,
  76. topMargin=40,
  77. bottomMargin=30)
  78. doc.build(parts)
  79. pdfReader = PdfFileReader(packet)
  80. pdfWriter = PdfFileWriter()
  81. for page in pdfReader.pages:
  82. pdfWriter.addPage(page)
  83. pdfWriter.addBlankPage()
  84. return([pdfWriter.getPage(0),pdfWriter.getPage(1)])
  85. def main():
  86. # Scarica tutti i post da Mastodon
  87. print("Scarico i post")
  88. def default(o):
  89. if isinstance(o, (datetime.date, datetime.datetime)):
  90. return o.isoformat()
  91. if not os.path.isfile('oloturiadump.json'):
  92. mastodon = Mastodon(api_base_url = "https://mastodon.bida.im")
  93. all_vgos = []
  94. last_id = None
  95. while True:
  96. statuses = list(filter(lambda s: s['account']['username'] == 'oloturia', mastodon.timeline_hashtag("vgo", local=True, max_id=last_id)))
  97. if not statuses:
  98. break
  99. all_vgos += list(map(
  100. lambda s: {
  101. 'id': s['id'],
  102. 'uri': s['uri'],
  103. 'content': s['content'],
  104. 'replies_count': s['replies_count'],
  105. #'replies': mastodon.status_context(s['id']) if s['replies_count'] > 0 else [],
  106. 'created': s['created_at'],
  107. 'reblogs': s['reblogs_count'],
  108. 'favourites': s['favourites_count'],
  109. 'media': s['media_attachments']
  110. }
  111. , statuses))
  112. last_id = statuses[-1]['id']
  113. #print(all_vgos)
  114. #print(json.dumps(all_vgos, default=default))
  115. with open('oloturiadump.json', 'w') as json_file:
  116. json.dump(all_vgos, json_file, indent=4, default=default)
  117. # Scarica tutte le immagini
  118. print("Scarico le immagini")
  119. with open('oloturiadump.json') as json_file:
  120. all_vgos = json.load(json_file)
  121. os.makedirs('media', exist_ok=True)
  122. vgo_dict={}
  123. for vgo in all_vgos:
  124. vgo_num = html2text.html2text(vgo['content']).split(' ')[0]
  125. # vgo_name = os.linesep.join([s for s in html2text.html2text(vgo['content']).splitlines() if s]).splitlines()[-1]
  126. # if len(vgo_name) < 10:
  127. # vgo_name = [s for s in html2text.html2text(vgo['content']).split("\n\n") if s][-1].replace("\n"," ")
  128. vgo_name = vgo['content'].split("<p>")[-1].replace("</p>","")
  129. vgo_name = vgo_name.split("<br />")[-1]
  130. vgo_name = vgo_name.split("</a>")[-1]
  131. vgo_name = html.unescape(vgo_name).strip()
  132. #print(vgo_num +' - '+ vgo_name)
  133. #print(str(vgo['id']) +' '+ vgo['uri'])
  134. vgo_dict[vgo_num] = vgo_name
  135. for media in vgo['media']:
  136. #print(str(media['id']) +' '+ media['url'])
  137. ext = os.path.splitext(media['preview_url'])[1]
  138. img_name = os.path.join('media',str(media['id']) + ext)
  139. if not os.path.isfile(img_name):
  140. print(img_name)
  141. img_data = requests.get(media['preview_url']).content
  142. with open(img_name, 'wb') as handler:
  143. handler.write(img_data)
  144. with open('template.html') as html_file:
  145. html_base = html_file.read()
  146. with open('mediagallery.html') as html_file:
  147. html_mediagallery = html_file.read()
  148. # Genera i PDF
  149. print("Genero i PDF")
  150. os.makedirs('pdf', exist_ok=True)
  151. for vgo in all_vgos:
  152. vgo_num = html2text.html2text(vgo['content']).split(' ')[0]
  153. vgo_name = os.linesep.join([s for s in html2text.html2text(vgo['content']).splitlines() if s]).splitlines()[-1]
  154. html_name = 'oloturia.html'
  155. pdf_name = os.path.join('pdf', vgo_num + '.pdf')
  156. if not os.path.isfile(pdf_name):
  157. print(vgo_num +' - '+ vgo_name)
  158. media_num = 0
  159. mediagallery_tot = ''
  160. media_tot = len(vgo['media'])
  161. sizes = "622px" if media_tot == 1 else "311px"
  162. style = [
  163. ["inset: auto; width: 100%; height: 100%;"],
  164. ["inset: auto 2px auto auto; width: 50%; height: 100%;","inset: auto auto auto 2px; width: 50%; height: 100%;"],
  165. ["inset: auto 2px auto auto; width: 50%; height: 100%;","inset: auto auto 2px 2px; width: 50%; height: 50%;","inset: 2px auto auto 2px; width: 50%; height: 50%;"],
  166. ["inset: auto 2px 2px auto; width: 50%; height: 50%;","inset: auto auto 2px 2px; width: 50%; height: 50%;","inset: 2px 2px auto auto; width: 50%; height: 50%;","inset: 2px auto auto 2px; width: 50%; height: 50%;"]
  167. ]
  168. for media in vgo['media']:
  169. mediagallery = html_mediagallery
  170. ext = os.path.splitext(media['url'])[1]
  171. img_name = os.path.join('media',str(media['id']) + ext)
  172. mediagallery = mediagallery.replace("[media]", img_name)
  173. mediagallery = mediagallery.replace("[style]", style[media_tot-1][media_num])
  174. mediagallery = mediagallery.replace("[sizes]", sizes)
  175. mediagallery_tot = mediagallery_tot + mediagallery
  176. media_num = media_num + 1
  177. content = html_base
  178. content = content.replace("[content]", vgo['content'])
  179. content = content.replace("[date]", datetime.datetime.fromisoformat(vgo['created']).strftime("%-d %B %Y, %H:%M"))
  180. content = content.replace("[reply]", str(vgo['replies_count']))
  181. content = content.replace("[reblogs]", str(vgo['reblogs']))
  182. content = content.replace("[favourites]", str(vgo['favourites']))
  183. content = content.replace("[mediagallery]", mediagallery_tot)
  184. with open(html_name, 'w') as handler:
  185. handler.write(content)
  186. options = {
  187. 'enable-local-file-access': None,
  188. 'page-size': 'A5',
  189. 'margin-top': '0.5cm',
  190. 'margin-right': '0.5cm',
  191. 'margin-bottom': '0.5cm',
  192. 'margin-left': '0.5cm',
  193. 'encoding': "UTF-8",
  194. 'quiet': ''
  195. }
  196. try:
  197. pdfkit.from_file(html_name, pdf_name, options=options)
  198. except:
  199. pass
  200. os.remove(html_name)
  201. # Genera i libretti
  202. print("Genero i libretti")
  203. os.makedirs('books', exist_ok=True)
  204. for pagstart in range(1, len(vgo_dict), 50):
  205. book_num = int(pagstart / 50) + 1
  206. pagend = min(book_num * 50, len(vgo_dict))
  207. book_name = os.path.join('books', 'vgo_' + str(book_num).zfill(2) + '.pdf')
  208. if not os.path.isfile(book_name):
  209. pdfWriter = PyPDF2.PdfFileWriter()
  210. print(book_num)
  211. # aggiungere copertina
  212. pdfWriter.addPage(copertina(str(pagstart).zfill(3) + " - " + str(pagend).zfill(3)))
  213. pdfWriter.addBlankPage()
  214. indtext = ""
  215. for vgo_num in [str(x).zfill(3) for x in range(pagstart, pagend + 1)]:
  216. pdf_name = os.path.join('pdf', vgo_num + '.pdf')
  217. try:
  218. #print(vgo_num + " - " + vgo_dict[vgo_num])
  219. indtext = indtext + vgo_num + "\t" + vgo_dict[vgo_num] + "\n"
  220. pdfFileObj = open(pdf_name, 'rb')
  221. pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
  222. pageObj = pdfReader.getPage(0)
  223. pdfWriter.addPage(pageObj)
  224. except:
  225. pass
  226. for i in range(0, 8 - ((((pagend - 1) % 50) + 1 + 5) % 8)):
  227. pdfWriter.addBlankPage()
  228. # aggiungere indice
  229. for indpag in indice(indtext):
  230. pdfWriter.addPage(indpag)
  231. #Aggiungere pagina finale
  232. #pdfWriter.addBlankPage()
  233. pdfFileObj = open("quarta.pdf", 'rb')
  234. pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
  235. pageObj = pdfReader.getPage(0)
  236. pdfWriter.addPage(pageObj)
  237. with open(book_name, 'wb') as pdfOutput:
  238. pdfWriter.write(pdfOutput)
  239. # # Genera indice
  240. # with open("index.txt", 'w') as handler:
  241. # for key in sorted(vgo_dict):
  242. # handler.write(key + ' - ' + vgo_dict[key] + '\n')
# Run the whole pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()