123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155 |
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- #
- # import.py
- #
- # Copyright 2021 <avana@disperazione>
- #
- # This program is free software; you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program; if not, write to the Free Software
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
- # MA 02110-1301, USA.
- #
- #
- import datetime
- from itertools import chain
- from pathlib import Path
- from lxml import etree
- import toml
class Article:
    """A post that is written to disk as Markdown with TOML front matter.

    The output file is ``<get_dir()>/<post_id>.md``. Subclasses can override
    :meth:`get_dir` and :meth:`get_frontmatter` to change where the file goes
    and which keys are emitted.
    """

    # Root of the generated content tree, relative to the working directory.
    BASE = Path('content')

    def __init__(self, post_id: str, title: str, body: str, img=None, date=None):
        """Store the fields of the post.

        Args:
            post_id: Identifier used as the base name of the output file.
            title: Value of the ``title`` front-matter key.
            body: Text written verbatim after the front matter.
            img: Optional cover image; emitted as ``cover`` when truthy.
            date: Optional date; emitted as ``date`` when truthy.
        """
        self.post_id = post_id
        self.title = title
        self.body = body
        self.img = img
        self.date = date

    def get_dir(self) -> Path:
        """Return the directory this kind of post is saved in."""
        return self.BASE / 'posts'

    def get_fname(self) -> Path:
        """Return the full path of the Markdown file for this post."""
        return self.get_dir() / (self.post_id + '.md')

    def get_frontmatter(self) -> dict:
        """Return the front-matter mapping; optional keys appear only when set."""
        fm = {'title': self.title}
        if self.img:
            fm['cover'] = self.img
        if self.date:
            fm['date'] = self.date
        return fm

    def save(self) -> None:
        """Write the post to :meth:`get_fname`, creating the directory if needed.

        The file consists of the front matter serialized as TOML between two
        ``+++`` lines, followed by the body.
        """
        target = self.get_fname()
        # A fresh checkout may not have the content tree yet.
        target.parent.mkdir(parents=True, exist_ok=True)
        # TOML documents must be UTF-8; do not depend on the locale default.
        with open(target, 'w', encoding='utf-8') as buf:
            buf.write('+++\n')
            toml.dump(self.get_frontmatter(), buf)
            buf.write('+++\n')
            buf.write(self.body)

    def __str__(self) -> str:
        return f"<{self.__class__.__name__} '{self.post_id}'>"
class Event(Article):
    """An Article that additionally carries an event time and a location.

    Events are stored under ``BASE / 'events'`` and add the ``eventDate`` and
    ``location`` keys to the front matter.
    """

    def __init__(self, time=None, location=None, **kwargs):
        """Store ``time`` and ``location``; all other keywords go to Article."""
        super().__init__(**kwargs)
        self.time, self.location = time, location

    def get_dir(self):
        """Return the directory events are saved in."""
        events_dir = self.BASE / 'events'
        return events_dir

    def get_frontmatter(self):
        """Return the Article front matter extended with the event keys."""
        return {
            **super().get_frontmatter(),
            'eventDate': self.time,
            'location': self.location,
        }
def xpath_in(xpath: str, elem):
    """Compile the XPath expression ``xpath`` and return its result on ``elem``."""
    return etree.XPath(xpath)(elem)
def get_text(node):
    """Return the text contained in ``node`` with all markup removed.

    Args:
        node: Either a string (returned unchanged) or an element exposing
            ``.text``, ``.tail`` and iteration over its children.

    Returns:
        The node's own text followed, for every child in document order, by
        the child's text content and the text trailing that child.
    """
    # isinstance also accepts str subclasses, which `type(node) is str` rejects.
    if isinstance(node, str):
        return node
    pieces = [node.text or '']
    for child in node:
        # Recurse into the child itself; its tail belongs to this node's content.
        pieces.append(get_text(child))
        pieces.append(child.tail or '')
    return ''.join(pieces)
def stringify_children(node):
    """Return the inner content of ``node`` (text and child markup) as a string.

    Args:
        node: Element whose content should be serialized.

    Returns:
        ``node.text``, then every child serialized with ``etree.tostring``,
        then ``node.tail``; missing (None/empty) pieces are skipped.
    """
    parts = [node.text]
    # tostring() already emits the child's own text and the tail following it,
    # so adding c.text / c.tail separately would duplicate that content.
    # Its default byte output is ASCII, so non-ASCII characters inside child
    # markup come out as numeric character references.
    parts.extend(etree.tostring(child).decode('utf8') for child in node)
    # Kept for backward compatibility: the text following the node itself.
    parts.append(node.tail)
    # filter() drops the Nones left by absent text or tail.
    return ''.join(filter(None, parts))
def _build_article(article):
    """Convert one ``<article>`` element into an Article or Event.

    Returns None when the element has no ``<h2>`` text to use as a title.
    Raises KeyError when the element has no ``id`` attribute.
    """
    titles = xpath_in('.//h2/text()', article)
    if not titles:
        return None

    elems = {
        'title': str(titles[0]),
        'post_id': article.attrib['id'],
    }
    # XXX (original author's note): this body extraction works poorly.
    elems['body'] = '\n'.join(
        line.strip() for line in stringify_children(article).split('\n')
    )

    # The first <time datetime="..."> provides both the post date and the
    # event time.
    times = xpath_in('.//time[@datetime]', article)
    if times:
        when = datetime.datetime.fromisoformat(times[0].attrib['datetime'])
        elems['time'] = when
        elems['date'] = when

    # Any link to the venue's site marks the article as taking place there.
    if any(
        link.attrib.get('href', '').startswith('https://forteprenestino.net')
        for link in xpath_in('.//a', article)
    ):
        elems['location'] = 'CSOA Forte Prenestino'

    images = xpath_in('.//img', article)
    if images:
        # An <img> without src yields no cover instead of a KeyError.
        src = images[0].attrib.get('src')
        if src:
            elems['img'] = src

    # Only an article with both a time and a location is an Event.
    if 'time' in elems and 'location' in elems:
        return Event(**elems)
    # Article.__init__ does not accept the event-only keywords.
    elems.pop('time', None)
    elems.pop('location', None)
    return Article(**elems)


def main(args):
    """Parse an HTML page and save every ``<article>`` in it as a Markdown file.

    Args:
        args: argv-style sequence. ``args[1]``, when present, is the HTML
            file to read; otherwise ``home.html`` in the working directory.

    Returns:
        0, for use as the process exit status.
    """
    source = args[1] if args and len(args) > 1 else 'home.html'
    parser = etree.HTMLParser()
    # Close the input file deterministically instead of leaking the handle.
    with open(source) as html_file:
        dom = etree.parse(html_file, parser)

    # Build everything first so a malformed article aborts before any write.
    articles = []
    for element in dom.xpath('//article'):
        built = _build_article(element)
        if built is not None:
            articles.append(built)

    for a in articles:
        print(a)
        a.save()
    return 0
if __name__ == '__main__':
    # Script entry point: pass the command line to main() and exit with the
    # status it returns.
    import sys

    raise SystemExit(main(sys.argv))
|