initial commit

This commit is contained in:
boyska 2020-08-23 16:28:31 +02:00
commit ac5d8a3cf7
8 changed files with 284 additions and 0 deletions

2
.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
__pycache__
news.db

6
README.md Normal file
View file

@ -0,0 +1,6 @@
sf-active fa un file per ogni articolo.
obiettivo di `sfaciolo` è ritrasformare il tutto in un db "vero".
Bisogna partire da una certa directory ed "eseguire" (parsare?) il vecchio PHP. Ovviamente poi buttiamo tutto in un
db "vero".

79
app.py Normal file
View file

@ -0,0 +1,79 @@
import sqlite3
from datetime import datetime
from flask import Flask, render_template, redirect, url_for, request
from flask_sqlalchemy import SQLAlchemy
from flask_paginate import Pagination
# from sqlalchemy import Column, Integer, String
# from sqlalchemy.ext.declarative import declarative_base
# Flask application object; every route below is registered on it.
app = Flask(__name__)
# SQLite file holding the imported articles (presumably the database written
# by parse_one.php -- verify the path used when running the importer).
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///news.db'
# The modification-tracking signals are not used by this app; leaving the
# option unset makes Flask-SQLAlchemy 2.x warn at startup and pay the
# tracking overhead.
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
db = SQLAlchemy(app)
class News(db.Model):
    """A single imported article, mapped onto the ``news`` table."""

    __tablename__ = 'news'

    # Primary key, kept as a string: lookups have to pass ``str(nid)``.
    nid = db.Column(db.String, primary_key=True)
    author = db.Column(db.Unicode)
    title = db.Column(db.Unicode)
    body = db.Column(db.Unicode)
    # Unix timestamp in seconds: it is fed to datetime.fromtimestamp() below
    # and compared against integers by the month search, and parse_one.php
    # creates the column as INTEGER, so it is declared Integer, not String.
    published = db.Column(db.Integer)
    # INTEGER in the importer's schema as well.
    last_modified = db.Column(db.Integer)
    display = db.Column(db.Unicode)

    @property
    def canonical_url(self):
        """URL of the article under the ``news`` endpoint (/news/<year>/<month>/<nid>.php)."""
        when = self.published_datetime
        return url_for('news', year=when.year, month=when.month, nid=self.nid)

    @property
    def published_datetime(self):
        """``published`` converted to a naive, local-time ``datetime``.

        Raises TypeError when ``published`` is NULL.
        """
        # int() also accepts a numeric string, in case a row stored text.
        return datetime.fromtimestamp(int(self.published))

    @property
    def published_str(self):
        """``published_datetime`` rendered with ``str()``."""
        return str(self.published_datetime)

    def __repr__(self):
        # One-element tuple so that %-formatting never unpacks the value.
        return "<News(%s)>" % (self.nid,)
@app.route('/')
def home():
    """Serve the placeholder text shown at the site root."""
    placeholder = 'homepage'
    return placeholder
@app.route('/search/')
def search_home():
    """Serve the placeholder text shown at the search entry point."""
    placeholder = 'cerca cerca'
    return placeholder
@app.route('/search/by-month/<int:year>/<int:month>')
def search_by_month(year, month):
    """List the news published during one calendar month, 50 per page.

    The page is selected with the ``page`` query argument (1-based; anything
    missing, non-numeric or below 1 is treated as page 1).  A month outside
    1..12, or a year that cannot be turned into a timestamp, yields a 404.
    """
    from flask import abort  # local import: only the error path needs it

    per_page = 50
    try:
        page = int(request.args.get('page', '1'))
    except ValueError:
        page = 1
    # Guard against ?page=0 / negative pages, which would give a negative slice.
    page = max(page, 1)
    first = (page - 1) * per_page
    last = first + per_page

    try:
        # Half-open interval [first day of month, first day of next month).
        ts_from = int(datetime(year=year, month=month, day=1).timestamp())
        if month < 12:
            month_after = datetime(year=year, month=month + 1, day=1)
        else:
            month_after = datetime(year=year + 1, month=1, day=1)
        ts_to = int(month_after.timestamp())
    except (ValueError, OverflowError, OSError):
        abort(404)

    matches = News.query.filter(
        News.published >= ts_from, News.published < ts_to
    ).order_by('published')
    # Pagination wants the 1-based page number, not the zero-based index
    # used to compute the slice.
    pagination = Pagination(page=page, total=matches.count(), per_page=per_page)
    results = matches[first:last]
    return render_template('search_results.html', results=results, pagination=pagination)
@app.route('/news/<int:nid>')
def news_by_nid(nid):
    """Redirect the short ``/news/<nid>`` URL to the article's canonical URL.

    Responds with 404 when no article has that id, instead of failing on
    ``None.canonical_url``.
    """
    # The primary key column is a string, hence str() around the URL integer.
    n = News.query.get_or_404(str(nid))
    return redirect(n.canonical_url)
@app.route('/news/<year>/<month>/<int:nid>.php')
def news(year, month, nid):
    """Render one article at its canonical ``/news/<year>/<month>/<nid>.php`` URL.

    ``year`` and ``month`` are only part of the URL shape: the article is
    looked up by ``nid`` alone.  Responds with 404 when the id is unknown,
    instead of rendering the template with ``None``.
    """
    # The primary key column is a string, hence str() around the URL integer.
    n = News.query.get_or_404(str(nid))
    return render_template('news.html', n=n)

138
parse_one.php Normal file
View file

@ -0,0 +1,138 @@
<?php
require 'vendor/autoload.php';
use PhpParser\Error;
use PhpParser\NodeDumper;
use PhpParser\ParserFactory;
// $f = '/mnt/i/disco_indy/var/www/sf-active/italy/website/news/2001/07/11102.php';
/**
 * Parse a date string with a strptime() format and return a Unix timestamp.
 *
 * @param string $date text to parse
 * @param string $fmt  strptime() format describing $date
 * @return int|false   result of mktime() on the parsed fields, or false when
 *                     strptime() cannot parse $date
 */
function strptimestamp($date, $fmt) {
    $parsed = strptime($date, $fmt);
    if ($parsed !== false) {
        // strptime() reports months as 0-11 and years as an offset from 1900.
        $month = $parsed['tm_mon'] + 1;
        $year = $parsed['tm_year'] + 1900;
        return mktime(
            $parsed['tm_hour'], $parsed['tm_min'], $parsed['tm_sec'],
            $month, $parsed['tm_mday'], $year
        );
    }
    return false;
}
/**
 * Run the external "pup" HTML selector tool on $html and return its output.
 *
 * The command is the PUP_BIN environment variable with $selector appended
 * verbatim, so callers pass the selector with a leading space.
 *
 * @param string $html     document fed to the tool on stdin
 * @param string $selector arguments appended to the PUP_BIN command line
 * @return string entity-decoded, trimmed stdout of the tool; '' when PUP_BIN
 *                is not set or the process cannot be started
 */
function pup_selector(string $html , string $selector): string {
    $pup = getenv("PUP_BIN");
    if ($pup === false) {
        // The declared return type is string: returning [] would raise a TypeError.
        return '';
    }
    $process = proc_open($pup . $selector,
        [0 => ['pipe', 'r'], 1 => ['pipe', 'w']],
        $pipes);
    if ($process === false) {
        return '';
    }
    fwrite($pipes[0], $html);
    // Closing stdin signals end of input to the child.
    fclose($pipes[0]);
    $out = stream_get_contents($pipes[1]);
    fclose($pipes[1]);
    // Reap the child so that a long import does not accumulate processes.
    proc_close($process);
    return trim(html_entity_decode((string)$out));
}
/**
 * Extract publication date and author from an article's rendered HTML.
 *
 * Both values are read through pup_selector() from the
 * "table td.titoloFTR .small" element (<i> for the date, <strong> for the
 * author).
 *
 * @param string $html article HTML
 * @return array may contain 'published' (Unix timestamp) and 'author';
 *               empty when PUP_BIN is not set
 */
function extract_metadata_from_html(string $html): array {
    if(getenv("PUP_BIN") === false) {
        return [];
    }
    $meta = [];
    $rawDate = pup_selector($html, ' table td.titoloFTR .small i text{}');
    // The format carries an AM/PM marker (%p), which strptime() only applies
    // to a 12-hour field (%I); with %H the marker is ignored and PM times
    // come out 12 hours early.
    $date = strptimestamp($rawDate, "%A, %b. %d, %Y at %I:%M %p");
    if($date === false) {
        // Keep accepting whatever the previous 24-hour format accepted
        // (e.g. an hour above 12).
        $date = strptimestamp($rawDate, "%A, %b. %d, %Y at %H:%M %p");
    }
    if($date !== false) {
        // Already a Unix timestamp; no need to round-trip it through the
        // non-portable strftime('%s').
        $meta['published'] = $date;
    }
    $author = pup_selector($html, ' table td.titoloFTR .small strong text{}');
    if($author !== '') {
        $meta['author'] = $author;
    }
    return $meta;
}
/**
 * Parse one article file and store it through save().
 *
 * The file is parsed as PHP: top-level `$GLOBALS[...] = ...` assignments
 * become metadata entries and inline HTML chunks are concatenated into the
 * body.  The article id comes from the file name, and a default publication
 * date (first day of the month) from the two directories above it
 * (.../<year>/<month>/<nid>.php).  Metadata extracted from the HTML itself
 * overrides those defaults.
 *
 * @param object $db       PDO connection handed to save()
 * @param string $filename path of the article file
 * @param mixed  $parser   PhpParser parser instance
 * @return void
 */
function parse_save(object $db, string $filename, $parser) {
    $source = file_get_contents($filename);
    if ($source === false) {
        // Do not insert an empty row for a file that could not be read.
        echo "Cannot read $filename\n";
        return;
    }
    try {
        $ast = $parser->parse($source);
    } catch (Error $error) {
        echo "Parse error on $filename: {$error->getMessage()}\n";
        return;
    }

    $nid = intval(basename($filename, '.php'));
    $fparts = explode('/', $filename);
    $year = $fparts[count($fparts)-3];
    $month = $fparts[count($fparts)-2];
    $dt = strtotime("01-$month-$year");
    $metadata = ['nid' => $nid, 'published' => $dt];
    $body = '';

    // parse() may return null; treat that as "no statements".
    foreach ($ast ?? [] as $part) {
        if ($part instanceof PhpParser\Node\Stmt\InlineHTML) {
            $body .= iconv('latin1', 'utf8', $part->value);
            continue;
        }
        if (!($part instanceof PhpParser\Node\Stmt\Expression)) {
            continue;
        }
        $assign = $part->expr;
        $isGlobalsAssignment = $assign instanceof PhpParser\Node\Expr\Assign
            && $assign->var instanceof PhpParser\Node\Expr\ArrayDimFetch
            && $assign->var->var instanceof PhpParser\Node\Expr\Variable
            && $assign->var->var->name === 'GLOBALS';
        if (!$isGlobalsAssignment) {
            continue;
        }
        // Only literal keys carry a ->value; skip `$GLOBALS[] = ...` and
        // computed keys instead of reading a property that does not exist.
        $key = $assign->var->dim->value ?? null;
        if ($key === null) {
            continue;
        }
        if ($assign->expr instanceof PhpParser\Node\Expr\Array_) {
            $val = $assign->expr->items;
        } elseif ($assign->expr instanceof PhpParser\Node\Scalar\String_) {
            $val = iconv('latin1', 'utf8', $assign->expr->value);
        } else {
            // Other scalars expose ->value; any other expression yields null.
            $val = $assign->expr->value ?? null;
        }
        $metadata[$key] = $val;
    }

    $metadata = array_merge($metadata, extract_metadata_from_html($body));
    save($db, $body, $metadata);
}
/**
 * Insert or replace one article row in the `news` table.
 *
 * @param PDO    $db       open database connection
 * @param string $body     article HTML
 * @param array  $metadata reads 'nid', 'page_title', 'author',
 *                         'page_display' and 'published'; missing keys are
 *                         stored as NULL
 * @return void  failures are printed, not raised
 */
function save($db, $body, $metadata) {
    // print("Loading ". $metadata['nid'] . "\n");
    $stm = $db->prepare('INSERT OR REPLACE INTO news(nid, title, author, body, display, published) VALUES (?,?,?,?,?,?)');
    if($stm === false) {
        print("error during INSERT: ");
        print($db->errorInfo()[2]);
        return;
    }
    $stm->bindValue(1, $metadata['nid'] ?? null);
    $stm->bindValue(2, $metadata['page_title'] ?? null);
    $stm->bindValue(3, $metadata['author'] ?? null);
    $stm->bindValue(4, $body);
    $stm->bindValue(5, $metadata['page_display'] ?? null);
    $stm->bindValue(6, $metadata['published'] ?? null);
    // Report a failed insert instead of dropping the article silently.
    if($stm->execute() === false) {
        print("error during INSERT of " . ($metadata['nid'] ?? '?') . ": ");
        print($stm->errorInfo()[2] . "\n");
    }
}
// Entry point: php parse_one.php DATABASE < list-of-article-files
// Reads one article path per line from stdin and imports each into DATABASE.
if (!isset($argv[1]) || $argv[1] === '') {
    // Without a path, 'sqlite:' would open a temporary database and the
    // whole import would be thrown away on exit.
    fwrite(STDERR, "Usage: php parse_one.php DATABASE < list-of-article-files\n");
    exit(1);
}
$db = new PDO('sqlite:' . $argv[1]);
$db->exec('CREATE TABLE IF NOT EXISTS news (nid PRIMARY KEY, body TEXT, author VARCHAR, title VARCHAR, display VARCHAR, published INTEGER, last_modified INTEGER);');
$db->exec('CREATE INDEX IF NOT EXISTS news_published ON news (published);');
$db->exec('CREATE INDEX IF NOT EXISTS news_author ON news (author);');
$parser = (new ParserFactory)->create(ParserFactory::PREFER_PHP5);

// Commit every 100 files so that inserts are batched into transactions.
$i = 0;
$db->beginTransaction();
// Compare against false explicitly: a final line such as "0" is falsy and
// would otherwise end the loop early.
while (($f = fgets(STDIN)) !== false) {
    $f = rtrim($f, "\r\n");
    if ($f === '') {
        // Blank lines are not file names.
        continue;
    }
    parse_save($db, $f, $parser);
    $i++;
    if($i >= 100) {
        $db->commit();
        $db->beginTransaction();
        $i = 0;
    }
}
$db->commit();
$db = null;
exit(0);

10
requirements.txt Normal file
View file

@ -0,0 +1,10 @@
Click==7.0
Flask==1.1.1
flask-paginate==0.5.5
Flask-SQLAlchemy==2.4.1
itsdangerous==1.1.0
Jinja2==2.11.1
MarkupSafe==1.1.1
pkg-resources==0.0.0
SQLAlchemy==1.3.13
Werkzeug==1.0.0

16
templates/base.html Normal file
View file

@ -0,0 +1,16 @@
{# Base layout: child templates fill the "content" block. #}
<html>
  <head>
    {# Stylesheet lives in <head>; it lays the pagination list items out inline. #}
    <style>
      .pagination ul li {
        display: inline;
        padding: 0 0.5em;
      }
    </style>
  </head>
  <body>
    {# The page content belongs inside <body>, not after its closing tag. #}
    <div>
      {% block content %}
      {% endblock %}
    </div>
  </body>
</html>

10
templates/news.html Normal file
View file

@ -0,0 +1,10 @@
{% extends "base.html" %}
{# Single article page; "n" is the News row to display. Extends the base
   layout like the search results page does, so it is a full HTML document. #}
{% block content %}
<article>
  <header> <h1>{{n.title}}</h1>
    <div>
      <time> {{n.published_datetime.strftime("%d %b %Y")}}</time>
    </div>
  </header>
  <div class="article-body">
    {# NOTE(review): the body is emitted unescaped because it is stored HTML;
       this is only safe as long as the imported content is trusted. #}
    {{n.body|safe}}
  </div>
</article>
{% endblock content %}

View file

@ -0,0 +1,23 @@
{% extends "base.html" %}
{# Search result list: one <li> per item of "results", followed by the
   links rendered by the "pagination" object. #}
{% block content %}
<style>
.post-hidden { font-size: 70%; }
.post-hidden a { color: gray; }
.result {
list-style: none;
}
.result .author { font-style: italic; font-size: 85%;}
</style>
<ul>
{% for r in results %}
{# Every row gets a post-display--<value> class; rows whose display value is
   'f' are additionally styled as post-hidden (presumably articles that were
   hidden on the original site -- TODO confirm). #}
<li class="result post-display--{{r.display}} {% if r.display == 'f' %}post-hidden{%endif%}"
>
{# Links go through the short /news/<nid> route. #}
<a href="{{url_for('news_by_nid', nid=r.nid)}}">{{r.nid}}</a>
- <time>{{r.published_datetime.strftime('%d/%m/%Y')}}</time>
- {{r.title}} by <span class="author">{{r.author}}</span>
</li>
{% endfor %}
</ul>
{{pagination.links}}
{% endblock content %}