#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# import.py
#
# Copyright 2021 <avana@disperazione>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#
import datetime
from itertools import chain
from pathlib import Path

import toml
from lxml import etree
  29. class Article:
  30. BASE = Path('content')
  31. def __init__(self, post_id: str, title: str, body: str, img=None, date=None):
  32. self.post_id = post_id
  33. self.title = title
  34. self.body = body
  35. self.img = img
  36. self.date = date
  37. def get_dir(self):
  38. return self.BASE / 'posts'
  39. def get_fname(self):
  40. return self.get_dir() / (self.post_id + '.md')
  41. def get_frontmatter(self):
  42. fm = {
  43. 'title': self.title,
  44. }
  45. if self.img:
  46. fm['cover'] = self.img
  47. if self.date:
  48. fm['date'] = self.date
  49. return fm
  50. def save(self):
  51. with open(self.get_fname(), 'w') as buf:
  52. buf.write('+++\n')
  53. toml.dump(self.get_frontmatter(), buf)
  54. buf.write('+++\n')
  55. buf.write(self.body)
  56. def __str__(self):
  57. return "<%s '%s'>" % (self.__class__.__name__, self.post_id)
  58. class Event(Article):
  59. def __init__(self, time=None, location=None, **kwargs):
  60. super().__init__(**kwargs)
  61. self.time = time
  62. self.location = location
  63. def get_dir(self):
  64. return self.BASE / 'events'
  65. def get_frontmatter(self):
  66. fm = super().get_frontmatter()
  67. fm['eventDate'] = self.time
  68. fm['location'] = self.location
  69. return fm
  70. def xpath_in(xpath: str, elem):
  71. xp = etree.XPath(xpath)
  72. return xp(elem)
  73. def get_text(node):
  74. if type(node) is str:
  75. return node
  76. s = node.text
  77. if s is None:
  78. s = ''
  79. for child in node:
  80. s += get_text(s) #etree.tostring(child, encoding='unicode')
  81. return s
  82. def stringify_children(node):
  83. parts = ([node.text] +
  84. list(chain(*([c.text, etree.tostring(c).decode('utf8'), c.tail] for c in node.getchildren()))) +
  85. [node.tail])
  86. # filter removes possible Nones in texts and tails
  87. return ''.join(filter(None, parts))
  88. def main(args):
  89. articles = []
  90. parser = etree.HTMLParser()
  91. dom = etree.parse(open('home.html'), parser)
  92. for article in dom.xpath('//article'):
  93. elems = {}
  94. # XXX: questa cosa funziona male
  95. elems['body'] = '\n'.join(l.strip() for l in stringify_children(article).split('\n'))
  96. try:
  97. elems['title'] = str(xpath_in('.//h2/text()', article)[0])
  98. except IndexError:
  99. continue
  100. elems['post_id'] = article.attrib['id']
  101. try:
  102. time_el = xpath_in('.//time[@datetime]', article)[0]
  103. elems['time'] = datetime.datetime.fromisoformat(time_el.attrib['datetime'])
  104. elems['date'] = elems['time']
  105. except IndexError:
  106. pass
  107. links = xpath_in('.//a', article)
  108. for l in links:
  109. if l.attrib.get('href', '').startswith('https://forteprenestino.net'):
  110. elems['location'] = 'CSOA Forte Prenestino'
  111. # print('t', [get_text(e) for e in xpath_in('./h2', article)])
  112. try:
  113. elems['img'] = xpath_in('.//img', article)[0].attrib['src']
  114. except IndexError:
  115. img = None
  116. if 'time' in elems and 'location' in elems:
  117. cls = Event
  118. else:
  119. cls = Article
  120. for k in ['time', 'location']:
  121. if k in elems:
  122. del elems[k]
  123. a = cls(**elems)
  124. articles.append(a)
  125. for a in articles:
  126. print(a)
  127. a.save()
  128. return 0
  129. if __name__ == '__main__':
  130. import sys
  131. sys.exit(main(sys.argv))