initial commit

2022-05-12 13:05:03 +02:00 · 2022-05-12 13:05:03 +02:00 · 18f2b8bef5
commit 18f2b8bef5
2 changed files with 156 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
 /content/
--- a/import.py
+++ b/import.py
@ -0,0 +1,155 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #
 #  import.py
 #  
 #  Copyright 2021  <avana@disperazione>
 #  
 #  This program is free software; you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as published by
 #  the Free Software Foundation; either version 2 of the License, or
 #  (at your option) any later version.
 #  
 #  This program is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #  GNU General Public License for more details.
 #  
 #  You should have received a copy of the GNU General Public License
 #  along with this program; if not, write to the Free Software
 #  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 #  MA 02110-1301, USA.
 #  
 #  
 import datetime
 from itertools import chain
 from pathlib import Path
 from lxml import etree
 import toml
 class Article:
    """A post that is written to disk as Markdown with TOML front matter.

    Attributes:
        post_id: identifier of the post; also used as the output file stem.
        title: title stored in the front matter.
        body: text written after the front matter block.
        img: optional cover image reference (stored as ``cover``).
        date: optional publication date (stored as ``date``).
    """

    # Root of the generated content tree; each class picks a sub-directory.
    BASE = Path('content')

    def __init__(self, post_id: str, title: str, body: str, img=None, date=None):
        self.post_id = post_id
        self.title = title
        self.body = body
        self.img = img
        self.date = date

    def get_dir(self):
        """Return the directory in which this kind of post is stored."""
        return self.BASE / 'posts'

    def get_fname(self):
        """Return the path of the Markdown file for this post."""
        return self.get_dir() / (self.post_id + '.md')

    def get_frontmatter(self):
        """Return the front matter as a dict, omitting unset optional fields."""
        fm = {'title': self.title}
        if self.img:
            fm['cover'] = self.img
        if self.date:
            fm['date'] = self.date
        return fm

    def save(self):
        """Write the post to ``get_fname()`` as ``+++``-delimited TOML plus body.

        The destination directory is created when missing, and the file is
        always encoded as UTF-8 regardless of the platform default.
        """
        target = self.get_fname()
        # The content tree is generated output (it is git-ignored), so it may
        # not exist yet on a fresh checkout.
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, 'w', encoding='utf-8') as buf:
            buf.write('+++\n')
            toml.dump(self.get_frontmatter(), buf)
            buf.write('+++\n')
            buf.write(self.body)

    def __str__(self):
        return f"<{self.__class__.__name__} '{self.post_id}'>"
 class Event(Article):
    """An article that additionally carries a time and a location.

    Events are stored under ``events`` instead of ``posts`` and expose the
    two extra values in their front matter as ``eventDate`` and ``location``.
    """

    def __init__(self, time=None, location=None, **kwargs):
        # Everything except the event-specific fields is handled by Article.
        super().__init__(**kwargs)
        self.time = time
        self.location = location

    def get_dir(self):
        """Return the directory in which events are stored."""
        return self.BASE / 'events'

    def get_frontmatter(self):
        """Return the article front matter extended with the event fields."""
        return {
            **super().get_frontmatter(),
            'eventDate': self.time,
            'location': self.location,
        }
 def xpath_in(xpath: str, elem):
    """Compile the *xpath* expression and return its result evaluated on *elem*."""
    return etree.XPath(xpath)(elem)
 def get_text(node):
    """Return the text contained in *node* with all markup removed.

    Args:
        node: either a string, which is returned unchanged, or an element
            exposing ``text``, ``tail`` and iteration over its children.

    Returns:
        The element's own text followed, for every child in document order,
        by the child's text (recursively) and the text trailing that child.
        The tail of *node* itself is not included.
    """
    # isinstance also accepts str subclasses, not only exact ``str`` objects.
    if isinstance(node, str):
        return node
    parts = [node.text or '']
    for child in node:
        # Recurse into the child element itself, then keep the text that
        # follows it inside this node.
        parts.append(get_text(child))
        parts.append(child.tail or '')
    return ''.join(parts)
 def stringify_children(node):
    """Return the inner content of *node* (text and child markup) as a string.

    Args:
        node: the element whose content is serialized.

    Returns:
        The node's leading text followed by the serialization of each child.
        The node's own tag and its own tail are not part of the result, since
        the tail lies outside the element.
    """
    # ``tostring`` already emits each child's text and, by default, its tail,
    # so adding them separately would duplicate them. ``encoding='unicode'``
    # yields ``str`` directly instead of ASCII bytes with character references.
    serialized = (etree.tostring(child, encoding='unicode') for child in node)
    return (node.text or '') + ''.join(serialized)
 def main(args):
    """Turn every ``<article>`` element of ``home.html`` into a content file.

    For each article the title (first ``h2`` text), the ``id`` attribute, the
    optional ``<time datetime=...>`` value, the optional first image and a
    location hint are collected. Articles having both a time and a location
    are saved as ``Event``, all the others as ``Article``. Articles without
    an ``h2`` title are skipped.

    Args:
        args: command line arguments; currently unused.

    Returns:
        0, to be used as the process exit status.
    """
    articles = []
    parser = etree.HTMLParser()
    # Close the input as soon as it has been parsed instead of leaking the
    # file handle for the rest of the run.
    with open('home.html') as source:
        dom = etree.parse(source, parser)

    for article in dom.xpath('//article'):
        titles = xpath_in('.//h2/text()', article)
        if not titles:
            # Without a title there is nothing useful to publish.
            continue

        elems = {'title': str(titles[0])}
        # XXX: this part works poorly
        elems['body'] = '\n'.join(
            line.strip() for line in stringify_children(article).split('\n'))
        elems['post_id'] = article.attrib['id']

        time_els = xpath_in('.//time[@datetime]', article)
        if time_els:
            elems['time'] = datetime.datetime.fromisoformat(
                time_els[0].attrib['datetime'])
            elems['date'] = elems['time']

        # Any link to the venue's site marks the article as happening there.
        if any(link.attrib.get('href', '').startswith('https://forteprenestino.net')
               for link in xpath_in('.//a', article)):
            elems['location'] = 'CSOA Forte Prenestino'

        images = xpath_in('.//img', article)
        if images:
            src = images[0].attrib.get('src')
            if src:
                elems['img'] = src

        if 'time' in elems and 'location' in elems:
            cls = Event
        else:
            cls = Article
            # Article.__init__ does not accept the event-only fields.
            for key in ('time', 'location'):
                elems.pop(key, None)
        articles.append(cls(**elems))

    # Save only after the whole page was parsed successfully.
    for a in articles:
        print(a)
        a.save()
    return 0
 if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv))