|
@@ -0,0 +1,155 @@
|
|
|
+#!/usr/bin/env python
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+#
|
|
|
+# import.py
|
|
|
+#
|
|
|
+# Copyright 2021 <avana@disperazione>
|
|
|
+#
|
|
|
+# This program is free software; you can redistribute it and/or modify
|
|
|
+# it under the terms of the GNU General Public License as published by
|
|
|
+# the Free Software Foundation; either version 2 of the License, or
|
|
|
+# (at your option) any later version.
|
|
|
+#
|
|
|
+# This program is distributed in the hope that it will be useful,
|
|
|
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
+# GNU General Public License for more details.
|
|
|
+#
|
|
|
+# You should have received a copy of the GNU General Public License
|
|
|
+# along with this program; if not, write to the Free Software
|
|
|
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
|
+# MA 02110-1301, USA.
|
|
|
+#
|
|
|
+#
|
|
|
+
|
|
|
+import datetime
|
|
|
+from itertools import chain
|
|
|
+from pathlib import Path
|
|
|
+
|
|
|
+from lxml import etree
|
|
|
+import toml
|
|
|
+
|
|
|
class Article:
    """A post written to disk as a Markdown file with TOML front matter.

    Files are stored in ``BASE / 'posts'`` and named ``<post_id>.md``.
    """

    # Root of the generated content tree, relative to the working directory.
    BASE = Path('content')

    def __init__(self, post_id: str, title: str, body: str, img=None, date=None):
        """Store the fields of the article.

        Args:
            post_id: Identifier used as the stem of the output file name.
            title: Value of the ``title`` front-matter key.
            body: Text written after the front-matter block.
            img: Optional value for the ``cover`` front-matter key.
            date: Optional value for the ``date`` front-matter key.
        """
        self.post_id = post_id
        self.title = title
        self.body = body
        self.img = img
        self.date = date

    def get_dir(self):
        """Return the directory this kind of content is saved in."""
        return self.BASE / 'posts'

    def get_fname(self):
        """Return the path of the Markdown file for this article."""
        return self.get_dir() / (self.post_id + '.md')

    def get_frontmatter(self):
        """Return the front matter as a dict.

        ``cover`` and ``date`` are only included when ``img`` / ``date``
        are truthy.
        """
        fm = {
            'title': self.title,
        }
        if self.img:
            fm['cover'] = self.img
        if self.date:
            fm['date'] = self.date
        return fm

    def save(self):
        """Write the front matter (between ``+++`` lines) and the body to disk.

        The destination directory is created when missing, and the file is
        always written as UTF-8 (TOML documents must be UTF-8) instead of
        relying on the locale's default encoding.
        """
        path = Path(self.get_fname())
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as buf:
            buf.write('+++\n')
            toml.dump(self.get_frontmatter(), buf)
            buf.write('+++\n')
            buf.write(self.body)

    def __str__(self):
        """Return ``<ClassName 'post_id'>``."""
        return f"<{self.__class__.__name__} '{self.post_id}'>"
|
|
|
+
|
|
|
class Event(Article):
    """An article that additionally carries an event time and a location.

    Events are saved under ``BASE / 'events'`` rather than the posts
    directory.
    """

    def __init__(self, time=None, location=None, **kwargs):
        """Store the event fields; all other keyword arguments go to Article.

        Args:
            time: Value of the ``eventDate`` front-matter key.
            location: Value of the ``location`` front-matter key.
            **kwargs: Forwarded unchanged to ``Article.__init__``.
        """
        super().__init__(**kwargs)
        self.location = location
        self.time = time

    def get_dir(self):
        """Return the directory events are saved in."""
        return self.BASE / 'events'

    def get_frontmatter(self):
        """Return the article front matter extended with the event keys.

        ``eventDate`` and ``location`` are always set, even when their
        value is None.
        """
        frontmatter = super().get_frontmatter()
        frontmatter.update(eventDate=self.time, location=self.location)
        return frontmatter
|
|
|
+
|
|
|
def xpath_in(xpath: str, elem):
    """Compile the expression *xpath* and return its evaluation on *elem*."""
    return etree.XPath(xpath)(elem)
|
|
|
+
|
|
|
def get_text(node):
    """Return the text content of *node* with all markup removed.

    Args:
        node: Either a string (returned unchanged) or an element exposing
            ``.text``, ``.tail`` and iteration over its children.

    Returns:
        The node's own text followed, for each child in document order, by
        the child's text content and the text trailing that child. The tail
        of *node* itself is not included.
    """
    # isinstance also accepts str subclasses (e.g. the string results that
    # lxml returns from XPath text() queries).
    if isinstance(node, str):
        return node
    parts = [node.text or '']
    for child in node:
        # Recurse into the child element (the original recursed on the
        # accumulated string, so child text was never read).
        parts.append(get_text(child))
        # Text located after the child's closing tag still belongs to node.
        parts.append(child.tail or '')
    return ''.join(parts)
|
|
|
+
|
|
|
def stringify_children(node):
    """Return the content of *node* (text and serialized children) as a string.

    The result is the node's leading text, then each child serialized as
    markup followed by the text trailing that child, and finally the tail
    of *node* itself.

    Args:
        node: An lxml element.

    Returns:
        The concatenated string; missing (None) texts and tails are skipped.
    """
    parts = [node.text]
    for child in node:
        # tostring() already contains the child's own text; with_tail=False
        # keeps the tail out of the serialization so it is emitted only
        # once, right below. (Adding child.text and the tail again, as
        # before, duplicated content in the output.)
        parts.append(etree.tostring(child, with_tail=False).decode('utf8'))
        parts.append(child.tail)
    parts.append(node.tail)
    # filter removes possible Nones in texts and tails
    return ''.join(filter(None, parts))
|
|
|
+
|
|
|
def _article_from_element(article):
    """Build an Article or Event from one ``<article>`` element.

    Returns None when the element contains no ``<h2>`` text to use as title.
    Raises KeyError when the element has no ``id`` attribute.
    """
    elems = {}
    # XXX: this works poorly
    elems['body'] = '\n'.join(
        line.strip() for line in stringify_children(article).split('\n')
    )

    titles = xpath_in('.//h2/text()', article)
    if not titles:
        return None
    elems['title'] = str(titles[0])

    elems['post_id'] = article.attrib['id']

    # The first <time datetime="..."> provides both the event time and the
    # publication date.
    time_nodes = xpath_in('.//time[@datetime]', article)
    if time_nodes:
        elems['time'] = datetime.datetime.fromisoformat(
            time_nodes[0].attrib['datetime']
        )
        elems['date'] = elems['time']

    # A link to the venue's site is taken as the marker of the location.
    if any(
        link.attrib.get('href', '').startswith('https://forteprenestino.net')
        for link in xpath_in('.//a', article)
    ):
        elems['location'] = 'CSOA Forte Prenestino'

    # The cover is optional: skip it when there is no <img> or no src.
    images = xpath_in('.//img', article)
    if images:
        src = images[0].get('src')
        if src:
            elems['img'] = src

    # Only elements with both a time and a location become events; for
    # plain articles the event-only keys are dropped because Article does
    # not accept them.
    if 'time' in elems and 'location' in elems:
        cls = Event
    else:
        cls = Article
        for key in ('time', 'location'):
            elems.pop(key, None)

    return cls(**elems)


def main(args):
    """Import every ``<article>`` of ``home.html`` and save it as Markdown.

    Args:
        args: Command-line arguments (currently unused).

    Returns:
        0, used as the process exit status.
    """
    parser = etree.HTMLParser()
    # Close the input file as soon as it has been parsed.
    with open('home.html') as source:
        dom = etree.parse(source, parser)

    articles = []
    for element in dom.xpath('//article'):
        parsed = _article_from_element(element)
        if parsed is not None:
            articles.append(parsed)

    for a in articles:
        print(a)
        a.save()
    return 0
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Script entry point: run main() and use its return value as the
    # process exit status.
    import sys

    raise SystemExit(main(sys.argv))
|
|
|
+
|