import-avana-html/import.py
2022-05-12 13:05:03 +02:00

155 lines
4.4 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# import.py
#
# Copyright 2021 <avana@disperazione>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#
import datetime
from itertools import chain
from pathlib import Path
from lxml import etree
import toml
class Article:
    """A post that is written out as a Markdown file with TOML front matter.

    The output path is ``<BASE>/posts/<post_id>.md``. Subclasses can override
    :meth:`get_dir` and :meth:`get_frontmatter` to change the target
    directory or to add front-matter keys.
    """

    # Root of the generated content tree, relative to the working directory.
    BASE = Path('content')

    def __init__(self, post_id: str, title: str, body: str, img=None, date=None):
        """Store the fields of the post.

        Args:
            post_id: Identifier used as the output file name (without ``.md``).
            title: Value of the ``title`` front-matter key.
            body: Text written verbatim after the front matter.
            img: Optional value of the ``cover`` front-matter key.
            date: Optional value of the ``date`` front-matter key.
        """
        self.post_id = post_id
        self.title = title
        self.body = body
        self.img = img
        self.date = date

    def get_dir(self) -> Path:
        """Return the directory in which the post file is stored."""
        return self.BASE / 'posts'

    def get_fname(self) -> Path:
        """Return the full path of the Markdown file for this post."""
        return self.get_dir() / (self.post_id + '.md')

    def get_frontmatter(self) -> dict:
        """Build the front-matter mapping.

        ``title`` is always present; ``cover`` and ``date`` are only added
        when the corresponding attribute is truthy.
        """
        fm = {
            'title': self.title,
        }
        if self.img:
            fm['cover'] = self.img
        if self.date:
            fm['date'] = self.date
        return fm

    def save(self) -> None:
        """Write the post to :meth:`get_fname`, overwriting any existing file.

        The front matter is serialized with ``toml.dump`` between two
        ``+++`` delimiter lines, followed by the body.
        """
        target = self.get_fname()
        # Create the output directory on demand so a fresh checkout does not
        # fail with FileNotFoundError.
        target.parent.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8: the default encoding of open() depends on the
        # locale and may be unable to represent non-ASCII titles or bodies.
        with open(target, 'w', encoding='utf-8') as buf:
            buf.write('+++\n')
            toml.dump(self.get_frontmatter(), buf)
            buf.write('+++\n')
            buf.write(self.body)

    def __str__(self) -> str:
        """Return ``<ClassName 'post_id'>``."""
        return f"<{type(self).__name__} '{self.post_id}'>"
class Event(Article):
    """An article that additionally carries a time and a location.

    Events are stored under ``<BASE>/events`` and add the ``eventDate`` and
    ``location`` keys to the front matter produced by the parent class.
    """

    def __init__(self, time=None, location=None, **kwargs):
        """Keep ``time`` and ``location``; pass everything else to the parent."""
        super().__init__(**kwargs)
        self.time, self.location = time, location

    def get_dir(self):
        """Return the directory in which event files are stored."""
        events_dir = self.BASE / 'events'
        return events_dir

    def get_frontmatter(self):
        """Return the parent's front matter extended with the event keys."""
        frontmatter = super().get_frontmatter()
        frontmatter.update(eventDate=self.time, location=self.location)
        return frontmatter
def xpath_in(xpath: str, elem):
    """Evaluate the XPath expression ``xpath`` with ``elem`` as context.

    The expression is compiled with ``etree.XPath`` and the result of
    calling the compiled object on ``elem`` is returned unchanged.
    """
    return etree.XPath(xpath)(elem)
def get_text(node):
    """Return the text contained in ``node``, descending into its children.

    Args:
        node: Either a string, which is returned unchanged, or an
            element-like object exposing ``text``, ``tail`` and iteration
            over its child elements.

    Returns:
        The element's own leading text followed, for every child in
        document order, by that child's text content and the tail text
        that trails it. Markup itself is dropped; missing text counts as
        the empty string.
    """
    # isinstance rather than an exact type comparison, so that str
    # subclasses are treated as plain text as well.
    if isinstance(node, str):
        return node
    pieces = [node.text or '']
    for child in node:
        # Recurse into the child itself. The previous code passed the string
        # accumulated so far, which merely doubled it and never looked at
        # the children.
        pieces.append(get_text(child))
        # Text between this child's end tag and the next sibling belongs to
        # the parent's content.
        pieces.append(child.tail or '')
    return ''.join(pieces)
def stringify_children(node):
    """Return the inner markup of ``node`` as a string.

    The result is the element's leading text followed by the serialization
    of each child element in document order. The tag of ``node`` itself and
    the tail text that follows ``node`` in its parent are not included.

    Args:
        node: Element whose content should be serialized.

    Returns:
        The concatenated markup; an empty string if the element has neither
        text nor children.
    """
    parts = [node.text]
    # tostring() output for a child already contains the child's own text
    # and, by default, the tail text after it. Adding ``child.text`` and
    # ``child.tail`` separately (as was done before) duplicated that content.
    parts.extend(etree.tostring(child).decode('utf8') for child in node)
    # filter() drops a missing (None) leading text.
    return ''.join(filter(None, parts))
def _parse_article(article):
    """Build an ``Article`` or ``Event`` from one ``<article>`` element.

    Returns ``None`` when the element contains no ``<h2>`` text to use as
    title. Raises ``KeyError`` if the element has no ``id`` attribute.
    """
    headings = xpath_in('.//h2/text()', article)
    if not headings:
        # Checked first so that no work is spent on elements that are skipped.
        return None

    fields = {
        'title': str(headings[0]),
        'post_id': article.attrib['id'],
        # XXX (original author's note): this works badly
        'body': '\n'.join(
            line.strip() for line in stringify_children(article).split('\n')
        ),
    }

    timed = xpath_in('.//time[@datetime]', article)
    if timed:
        when = datetime.datetime.fromisoformat(timed[0].attrib['datetime'])
        fields['time'] = when
        fields['date'] = when

    # A link to this site is taken as the marker for the event location.
    if any(
        link.attrib.get('href', '').startswith('https://forteprenestino.net')
        for link in xpath_in('.//a', article)
    ):
        fields['location'] = 'CSOA Forte Prenestino'

    # Only consider images that actually have a ``src`` (mirrors the
    # ``time[@datetime]`` lookup above) instead of failing with KeyError.
    images = xpath_in('.//img[@src]', article)
    if images:
        fields['img'] = images[0].attrib['src']

    if 'time' in fields and 'location' in fields:
        return Event(**fields)
    # Article's initializer does not accept the event-only keywords.
    for key in ('time', 'location'):
        fields.pop(key, None)
    return Article(**fields)


def main(args):
    """Convert the ``<article>`` elements of ``home.html`` into content files.

    The file ``home.html`` is read from the current working directory and
    parsed with lxml's HTML parser. Every ``<article>`` with an ``<h2>``
    title becomes an ``Event`` (when it has both a ``<time datetime=...>``
    element and a link to ``https://forteprenestino.net``) or otherwise an
    ``Article``. Each resulting object is printed and then saved.

    Args:
        args: Command-line arguments; accepted for the conventional
            ``main(argv)`` signature but currently unused.

    Returns:
        ``0``, intended as the process exit status.
    """
    parser = etree.HTMLParser()
    # The context manager closes the input file once parsing is done.
    with open('home.html') as source:
        dom = etree.parse(source, parser)

    articles = []
    for element in dom.xpath('//article'):
        parsed = _parse_article(element)
        if parsed is not None:
            articles.append(parsed)

    # Saving starts only after every element has been parsed successfully.
    for item in articles:
        print(item)
        item.save()
    return 0
if __name__ == '__main__':
    # Run the importer when executed as a script and use the return value
    # of main() as the process exit status.
    import sys

    raise SystemExit(main(sys.argv))