#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# import.py
#
# Copyright 2021
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#
"""Import the ``<article>`` elements of ``home.html`` as ``.md`` content files.

Each article becomes either an :class:`Article` (written under
``content/posts``) or an :class:`Event` (written under ``content/events``),
saved with a TOML front matter block delimited by ``+++`` lines.
"""

import datetime
from itertools import chain
from pathlib import Path

from lxml import etree
import toml


class Article:
    """A post extracted from the page and stored as ``<post_id>.md``."""

    # Root directory under which every content file is written.
    BASE = Path('content')

    def __init__(self, post_id: str, title: str, body: str, img=None, date=None):
        """Store the article fields.

        Args:
            post_id: Identifier used as the output file name (without suffix).
            title: Title written to the front matter.
            body: Text written after the front matter.
            img: Optional cover image reference; omitted from the front
                matter when falsy.
            date: Optional publication date; omitted from the front matter
                when falsy.
        """
        self.post_id = post_id
        self.title = title
        self.body = body
        self.img = img
        self.date = date

    def get_dir(self):
        """Return the directory this kind of content is saved in."""
        return self.BASE / 'posts'

    def get_fname(self):
        """Return the full path of the ``.md`` file for this article."""
        return self.get_dir() / (self.post_id + '.md')

    def get_frontmatter(self):
        """Return the front matter as a dict (``title``, optional ``cover``/``date``)."""
        fm = {
            'title': self.title,
        }
        if self.img:
            fm['cover'] = self.img
        if self.date:
            fm['date'] = self.date
        return fm

    def save(self):
        """Write the front matter and the body to :meth:`get_fname`.

        The target directory is created when missing, and the file is
        written as UTF-8 regardless of the locale so non-ASCII titles and
        bodies are stored consistently.
        """
        target = self.get_fname()
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, 'w', encoding='utf-8') as buf:
            buf.write('+++\n')
            toml.dump(self.get_frontmatter(), buf)
            buf.write('+++\n')
            buf.write(self.body)

    def __str__(self):
        return f"<{self.__class__.__name__} '{self.post_id}'>"


class Event(Article):
    """An article that additionally carries a time and a location."""

    def __init__(self, time=None, location=None, **kwargs):
        """Store ``time`` and ``location``; other keywords go to :class:`Article`."""
        super().__init__(**kwargs)
        self.time = time
        self.location = location

    def get_dir(self):
        """Return the directory events are saved in."""
        return self.BASE / 'events'

    def get_frontmatter(self):
        """Return the article front matter extended with ``eventDate`` and ``location``."""
        fm = super().get_frontmatter()
        fm['eventDate'] = self.time
        fm['location'] = self.location
        return fm


def xpath_in(xpath: str, elem):
    """Compile ``xpath`` and evaluate it with ``elem`` as the context node."""
    xp = etree.XPath(xpath)
    return xp(elem)


def get_text(node):
    """Return the text contained in ``node``, descending into its children.

    ``node`` may already be a string (for example the result of a
    ``text()`` XPath query), in which case it is returned unchanged.
    Otherwise the node's own text, the text of every descendant and the
    tail text following each child are concatenated in document order.
    """
    # isinstance (not an exact type check) so str subclasses are accepted too.
    if isinstance(node, str):
        return node
    parts = [node.text or '']
    for child in node:
        parts.append(get_text(child))
        # Text that follows the child still belongs to ``node``'s content.
        parts.append(child.tail or '')
    return ''.join(parts)


def stringify_children(node):
    """Return the inner markup of ``node``: its leading text plus serialized children.

    ``etree.tostring`` serializes an element together with its tail text
    by default, so child text and tails must not be added separately
    (doing so would emit them twice). ``node``'s own tail lies outside the
    element and is therefore not included.
    """
    parts = [node.text or '']
    parts.extend(etree.tostring(child, encoding='unicode') for child in node)
    return ''.join(parts)


def main(args):
    """Parse ``home.html`` and save every titled ``<article>`` it contains.

    Articles without an ``<h2>`` title are skipped. An article that has
    both a ``<time datetime=...>`` element and a link to
    ``https://forteprenestino.net`` is saved as an :class:`Event`; every
    other article is saved as a plain :class:`Article`.

    Args:
        args: Command-line arguments; currently unused.

    Returns:
        ``0``, used as the process exit status.

    Raises:
        KeyError: If a titled article has no ``id`` attribute.
        ValueError: If a ``datetime`` attribute is not a valid ISO date.
    """
    articles = []
    parser = etree.HTMLParser()
    with open('home.html') as page:
        dom = etree.parse(page, parser)

    for article in dom.xpath('//article'):
        titles = xpath_in('.//h2/text()', article)
        if not titles:
            continue

        # XXX: originally flagged as "works poorly"; the body is still the
        # raw inner HTML (each line stripped), not converted to Markdown.
        inner = stringify_children(article)
        elems = {
            'body': '\n'.join(line.strip() for line in inner.split('\n')),
            'title': str(titles[0]),
            'post_id': article.attrib['id'],
        }

        time_els = xpath_in('.//time[@datetime]', article)
        if time_els:
            elems['time'] = datetime.datetime.fromisoformat(
                time_els[0].attrib['datetime'])
            elems['date'] = elems['time']

        links = xpath_in('.//a', article)
        if any(link.attrib.get('href', '').startswith('https://forteprenestino.net')
               for link in links):
            elems['location'] = 'CSOA Forte Prenestino'

        # Only consider images that actually carry a ``src`` attribute.
        images = xpath_in('.//img[@src]', article)
        if images:
            elems['img'] = images[0].attrib['src']

        if 'time' in elems and 'location' in elems:
            cls = Event
        else:
            cls = Article
            # Article.__init__ does not accept these keywords.
            for key in ('time', 'location'):
                elems.pop(key, None)

        articles.append(cls(**elems))

    for a in articles:
        print(a)
        a.save()
    return 0


if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv))