From 18f2b8bef55a588a0fe4a549b6544e8e3e2254d5 Mon Sep 17 00:00:00 2001 From: boyska Date: Thu, 12 May 2022 13:05:03 +0200 Subject: [PATCH] initial commit --- .gitignore | 1 + import.py | 155 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 156 insertions(+) create mode 100644 .gitignore create mode 100644 import.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..57a3b59 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/content/ diff --git a/import.py b/import.py new file mode 100644 index 0000000..a675159 --- /dev/null +++ b/import.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# import.py +# +# Copyright 2021 +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +# MA 02110-1301, USA. 
#
#

import datetime
from itertools import chain
from pathlib import Path

from lxml import etree
import toml


class Article:
    """A post exported as a Markdown file with TOML front matter.

    The file is written to ``<BASE>/posts/<post_id>.md``; the front matter
    is delimited by ``+++`` lines and followed by the body.
    """

    # Root of the generated content tree, relative to the working directory.
    BASE = Path('content')

    def __init__(self, post_id: str, title: str, body: str, img=None, date=None):
        """Store the article fields.

        Args:
            post_id: Identifier used as the file name (without extension).
            title: Title written to the front matter.
            body: Text written after the front matter.
            img: Optional cover image URL (front matter key ``cover``).
            date: Optional publication date (front matter key ``date``).
        """
        self.post_id = post_id
        self.title = title
        self.body = body
        self.img = img
        self.date = date

    def get_dir(self):
        """Return the directory this kind of content is saved in."""
        return self.BASE / 'posts'

    def get_fname(self):
        """Return the full path of the Markdown file for this article."""
        return self.get_dir() / (self.post_id + '.md')

    def get_frontmatter(self):
        """Return the front matter as a dict; unset optional fields are omitted."""
        fm = {
            'title': self.title,
        }
        if self.img:
            fm['cover'] = self.img
        if self.date:
            fm['date'] = self.date
        return fm

    def save(self):
        """Write front matter and body to :meth:`get_fname`.

        The target directory is created when missing.
        """
        target = self.get_fname()
        # The content tree is generated output and may not exist yet.
        target.parent.mkdir(parents=True, exist_ok=True)
        # TOML documents must be UTF-8, whatever the locale encoding is.
        with open(target, 'w', encoding='utf-8') as buf:
            buf.write('+++\n')
            toml.dump(self.get_frontmatter(), buf)
            buf.write('+++\n')
            buf.write(self.body)

    def __str__(self):
        return f"<{self.__class__.__name__} '{self.post_id}'>"


class Event(Article):
    """An article that also carries an event time and a location.

    Saved under ``<BASE>/events`` instead of ``<BASE>/posts``.
    """

    def __init__(self, time=None, location=None, **kwargs):
        """Store ``time`` and ``location``; other keywords go to :class:`Article`."""
        super().__init__(**kwargs)
        self.time = time
        self.location = location

    def get_dir(self):
        """Return the directory events are saved in."""
        return self.BASE / 'events'

    def get_frontmatter(self):
        """Return the article front matter plus ``eventDate`` and ``location``."""
        fm = super().get_frontmatter()
        fm['eventDate'] = self.time
        fm['location'] = self.location
        return fm


def xpath_in(xpath: str, elem):
    """Evaluate the XPath expression ``xpath`` with ``elem`` as context node."""
    return etree.XPath(xpath)(elem)


def get_text(node):
    """Return the text contained in ``node``, markup removed.

    ``node`` may be a string (returned unchanged) or an element, in which
    case its own text, the text of every descendant and the tails that
    follow each child are concatenated in document order.
    """
    # isinstance, not an exact type check: XPath text results are str subclasses.
    if isinstance(node, str):
        return node
    parts = [node.text or '']
    for child in node:
        parts.append(get_text(child))
        # Text between this child's end tag and the next sibling.
        parts.append(child.tail or '')
    return ''.join(parts)


def stringify_children(node):
    """Return the inner markup of ``node`` as a string.

    The result is the node's leading text followed by the serialization of
    each child. ``etree.tostring`` already includes a child's own text and
    its tail, so they are not added separately; the tail of ``node`` itself
    is outside the node and is left out.
    """
    serialized = (etree.tostring(child, encoding='unicode') for child in node)
    # filter drops a missing (None) leading text.
    return ''.join(filter(None, chain([node.text], serialized)))


def _build_article(article):
    """Build an Article or Event from one ``<article>`` element.

    Returns ``None`` when the element has no ``<h2>`` text to use as title.
    Raises ``KeyError`` when the element has no ``id`` attribute.
    """
    titles = xpath_in('.//h2/text()', article)
    if not titles:
        return None

    # XXX: the original author notes that this works badly; the body is the
    # raw inner markup of <article> (heading included), only stripped of
    # surrounding whitespace line by line.
    markup = stringify_children(article)
    elems = {
        'body': '\n'.join(line.strip() for line in markup.split('\n')),
        'title': str(titles[0]),
        'post_id': article.attrib['id'],
    }

    time_els = xpath_in('.//time[@datetime]', article)
    if time_els:
        elems['time'] = datetime.datetime.fromisoformat(time_els[0].attrib['datetime'])
        elems['date'] = elems['time']

    # A link to the venue's site marks the article as located there.
    if any(link.attrib.get('href', '').startswith('https://forteprenestino.net')
           for link in xpath_in('.//a', article)):
        elems['location'] = 'CSOA Forte Prenestino'

    # Only images that actually carry a source can serve as cover.
    images = xpath_in('.//img[@src]', article)
    if images:
        elems['img'] = images[0].attrib['src']

    # Only articles with both a time and a location are events; otherwise
    # the event-only fields are dropped because Article does not accept them.
    if 'time' in elems and 'location' in elems:
        return Event(**elems)
    elems.pop('time', None)
    elems.pop('location', None)
    return Article(**elems)


def main(args):
    """Convert every ``<article>`` of ``home.html`` into a content file.

    Args:
        args: Command-line arguments (currently unused).

    Returns:
        The process exit status, always 0.
    """
    parser = etree.HTMLParser()
    with open('home.html') as buf:
        dom = etree.parse(buf, parser)

    articles = []
    for element in dom.xpath('//article'):
        item = _build_article(element)
        if item is not None:
            articles.append(item)

    for a in articles:
        print(a)
        a.save()
    return 0


if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv))