1 year ago · 18f2b8bef5
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
 
				+/content/
			
--- a/import.py
+++ b/import.py
@@ -0,0 +1,155 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+#
			
 
				+#  import.py
			
 
				+#  
			
 
				+#  Copyright 2021  <avana@disperazione>
			
 
				+#  
			
 
				+#  This program is free software; you can redistribute it and/or modify
			
 
				+#  it under the terms of the GNU General Public License as published by
			
 
				+#  the Free Software Foundation; either version 2 of the License, or
			
 
				+#  (at your option) any later version.
			
 
				+#  
			
 
				+#  This program is distributed in the hope that it will be useful,
			
 
				+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
			
 
				+#  GNU General Public License for more details.
			
 
				+#  
			
 
				+#  You should have received a copy of the GNU General Public License
			
 
				+#  along with this program; if not, write to the Free Software
			
 
				+#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
			
 
				+#  MA 02110-1301, USA.
			
 
				+#  
			
 
				+#  
			
 
				+
			
 
				+import datetime
			
 
				+from itertools import chain
			
 
				+from pathlib import Path
			
 
				+
			
 
				+from lxml import etree
			
 
				+import toml
			
 
				+
			
 
				+class Article:
			
 
				+    BASE = Path('content')
			
 
				+    def __init__(self, post_id: str, title: str, body: str, img=None, date=None):
			
 
				+        self.post_id = post_id
			
 
				+        self.title = title
			
 
				+        self.body = body
			
 
				+        self.img = img
			
 
				+        self.date = date
			
 
				+
			
 
				+    def get_dir(self):
			
 
				+        return self.BASE / 'posts'
			
 
				+
			
 
				+    def get_fname(self):
			
 
				+        return self.get_dir() / (self.post_id + '.md')
			
 
				+
			
 
				+    def get_frontmatter(self):
			
 
				+        fm = {
			
 
				+                'title': self.title,
			
 
				+                }
			
 
				+        if self.img:
			
 
				+            fm['cover'] = self.img
			
 
				+        if self.date:
			
 
				+            fm['date'] = self.date
			
 
				+        return fm
			
 
				+
			
 
				+    def save(self):
			
 
				+        with open(self.get_fname(), 'w') as buf:
			
 
				+            buf.write('+++\n')
			
 
				+            toml.dump(self.get_frontmatter(), buf)
			
 
				+            buf.write('+++\n')
			
 
				+            buf.write(self.body)
			
 
				+
			
 
				+    def __str__(self):
			
 
				+        return "<%s '%s'>" % (self.__class__.__name__, self.post_id)
			
 
				+
			
 
				+class Event(Article):
			
 
				+    def __init__(self, time=None, location=None, **kwargs):
			
 
				+        super().__init__(**kwargs)
			
 
				+        self.time = time
			
 
				+        self.location = location
			
 
				+
			
 
				+    def get_dir(self):
			
 
				+        return self.BASE / 'events'
			
 
				+
			
 
				+    def get_frontmatter(self):
			
 
				+        fm = super().get_frontmatter()
			
 
				+        fm['eventDate'] = self.time
			
 
				+        fm['location'] = self.location
			
 
				+        return fm
			
 
				+
			
 
				+def xpath_in(xpath: str, elem):
			
 
				+    xp = etree.XPath(xpath)
			
 
				+    return xp(elem)
			
 
				+
			
 
				+def get_text(node):
			
 
				+    if type(node) is str:
			
 
				+        return node
			
 
				+    s = node.text
			
 
				+    if s is None:
			
 
				+        s = ''
			
 
				+    for child in node:
			
 
				+        s += get_text(s) #etree.tostring(child, encoding='unicode')
			
 
				+    return s
			
 
				+
			
 
				+def stringify_children(node):
			
 
				+    parts = ([node.text] +
			
 
				+            list(chain(*([c.text, etree.tostring(c).decode('utf8'), c.tail] for c in node.getchildren()))) +
			
 
				+            [node.tail])
			
 
				+    # filter removes possible Nones in texts and tails
			
 
				+    return ''.join(filter(None, parts))
			
 
				+
			
 
				+def main(args):
			
 
				+    articles = []
			
 
				+    parser = etree.HTMLParser()
			
 
				+    dom = etree.parse(open('home.html'), parser)
			
 
				+    for article in dom.xpath('//article'):
			
 
				+        elems = {}
			
 
				+        # XXX: questa cosa funziona male
			
 
				+        elems['body'] = '\n'.join(l.strip() for l in stringify_children(article).split('\n'))
			
 
				+        try:
			
 
				+            elems['title'] = str(xpath_in('.//h2/text()', article)[0])
			
 
				+        except IndexError:
			
 
				+            continue
			
 
				+
			
 
				+        elems['post_id'] = article.attrib['id']
			
 
				+
			
 
				+        try:
			
 
				+            time_el =  xpath_in('.//time[@datetime]', article)[0]
			
 
				+            elems['time'] = datetime.datetime.fromisoformat(time_el.attrib['datetime'])
			
 
				+            elems['date'] = elems['time']
			
 
				+        except IndexError:
			
 
				+            pass
			
 
				+
			
 
				+        links = xpath_in('.//a', article)
			
 
				+        for l in links:
			
 
				+            if l.attrib.get('href', '').startswith('https://forteprenestino.net'):
			
 
				+                elems['location'] = 'CSOA Forte Prenestino'
			
 
				+        
			
 
				+        # print('t', [get_text(e) for e in xpath_in('./h2', article)])
			
 
				+        try:
			
 
				+            elems['img'] = xpath_in('.//img', article)[0].attrib['src']
			
 
				+        except IndexError:
			
 
				+            img = None
			
 
				+
			
 
				+        if 'time' in elems and 'location' in elems:
			
 
				+            cls = Event
			
 
				+        else:
			
 
				+            cls = Article
			
 
				+            for k in ['time', 'location']:
			
 
				+                if k in elems:
			
 
				+                    del elems[k]
			
 
				+
			
 
				+        a = cls(**elems)
			
 
				+        articles.append(a)
			
 
				+
			
 
				+    for a in articles:
			
 
				+        print(a)
			
 
				+        a.save()
			
 
				+    return 0
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    import sys
			
 
				+    sys.exit(main(sys.argv))
			
 
				+