initial commit
This commit is contained in:
commit
ac5d8a3cf7
8 changed files with 284 additions and 0 deletions
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
__pycache__
|
||||
news.db
|
6
README.md
Normal file
6
README.md
Normal file
|
@ -0,0 +1,6 @@
|
|||
sf-active fa un file per ogni articolo.
|
||||
|
||||
obiettivo di `sfaciolo` è ritrasformare il tutto in un db "vero".
|
||||
|
||||
Bisogna partire da una certa dir ed "eseguire" (parsare?) il vecchio php. Ovviamente poi buttiamo tutto in un
|
||||
db "vero".
|
79
app.py
Normal file
79
app.py
Normal file
|
@ -0,0 +1,79 @@
|
|||
import sqlite3
|
||||
from datetime import datetime
|
||||
|
||||
from flask import Flask, render_template, redirect, url_for, request
|
||||
from flask_sqlalchemy import SQLAlchemy
|
||||
from flask_paginate import Pagination
|
||||
# from sqlalchemy import Column, Integer, String
|
||||
# from sqlalchemy.ext.declarative import declarative_base
|
||||
|
||||
# Flask application and its SQLAlchemy handle; the data lives in the SQLite
# file ``news.db``.
app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///news.db'
# Nothing in this module subscribes to model-change signals, so disable the
# tracking explicitly: leaving the option unset makes Flask-SQLAlchemy 2.x
# emit a deprecation warning and keep the (costly) tracking enabled.
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
db = SQLAlchemy(app)
|
||||
|
||||
|
||||
class News(db.Model):
    """One article row of the ``news`` table.

    ``published`` and ``last_modified`` hold Unix timestamps in seconds; the
    helper properties below expose ``published`` as a ``datetime`` and as a
    string, and build the article's canonical URL.
    """

    __tablename__ = 'news'

    nid = db.Column(db.String, primary_key=True)
    author = db.Column(db.Unicode)
    title = db.Column(db.Unicode)
    body = db.Column(db.Unicode)
    # Integer timestamps: the import script creates both columns as INTEGER,
    # and the views compare ``published`` against integer bounds.
    published = db.Column(db.Integer)
    last_modified = db.Column(db.Integer)
    display = db.Column(db.Unicode)

    @property
    def canonical_url(self):
        """URL of the ``news`` view for this article (year/month/nid)."""
        when = self.published_datetime
        return url_for('news', year=when.year, month=when.month, nid=self.nid)

    @property
    def published_datetime(self):
        """``published`` converted to a naive local-time ``datetime``."""
        return datetime.fromtimestamp(self.published)

    @property
    def published_str(self):
        """``published_datetime`` rendered with ``str()``."""
        return str(self.published_datetime)

    def __repr__(self):
        return "<News(%s)>" % (self.nid,)
|
||||
|
||||
@app.route('/')
def home():
    """Serve the placeholder text shown at the site root."""
    placeholder = 'homepage'
    return placeholder
|
||||
|
||||
@app.route('/search/')
def search_home():
    """Serve the placeholder text shown at the search landing page."""
    placeholder = 'cerca cerca'
    return placeholder
|
||||
|
||||
@app.route('/search/by-month/<int:year>/<int:month>')
def search_by_month(year, month):
    """List the articles published in the given month, 50 per page.

    The 1-based page number is read from the ``page`` query argument; a
    missing, non-numeric or non-positive value falls back to the first page.
    A year/month pair that does not form a valid date yields a 404.
    """
    # Imported locally so this view does not depend on the module import list.
    from flask import abort

    per_page = 50
    page = max(request.args.get('page', 1, type=int), 1)
    offset = (page - 1) * per_page

    try:
        start = datetime(year=year, month=month, day=1)
        if month < 12:
            end = datetime(year=year, month=month + 1, day=1)
        else:
            # December rolls over into January of the following year.
            end = datetime(year=year + 1, month=1, day=1)
        ts_from = int(start.timestamp())
        ts_to = int(end.timestamp())
    except (ValueError, OverflowError, OSError):
        # e.g. month 13 or a year outside the representable range
        abort(404)

    query = News.query.filter(
        News.published >= ts_from,
        News.published < ts_to,
    ).order_by(News.published)

    # Pagination expects the 1-based page number, not the zero-based offset.
    pagination = Pagination(page=page, total=query.count(), per_page=per_page)
    results = query.offset(offset).limit(per_page).all()

    return render_template('search_results.html', results=results, pagination=pagination)
|
||||
|
||||
@app.route('/news/<int:nid>')
def news_by_nid(nid):
    """Redirect ``/news/<nid>`` to the article's canonical URL.

    Responds with 404 when no article has that id, instead of failing on a
    missing row.
    """
    # The primary key is looked up as a string, matching the model's column.
    article = News.query.get_or_404(str(nid))
    return redirect(article.canonical_url)
|
||||
|
||||
@app.route('/news/<year>/<month>/<int:nid>.php')
def news(year, month, nid):
    """Render a single article.

    ``year`` and ``month`` are part of the URL only; the article is looked up
    by ``nid`` alone. Responds with 404 when no article has that id.
    """
    article = News.query.get_or_404(str(nid))
    return render_template('news.html', n=article)
|
||||
|
138
parse_one.php
Normal file
138
parse_one.php
Normal file
|
@ -0,0 +1,138 @@
|
|||
<?php
|
||||
|
||||
require 'vendor/autoload.php';
|
||||
|
||||
use PhpParser\Error;
|
||||
use PhpParser\NodeDumper;
|
||||
use PhpParser\ParserFactory;
|
||||
|
||||
// $f = '/mnt/i/disco_indy/var/www/sf-active/italy/website/news/2001/07/11102.php';
|
||||
|
||||
/**
 * Parse $date with the strptime() format $fmt and return a Unix timestamp.
 *
 * @param string $date text to parse
 * @param string $fmt  strptime()-style format
 * @return int|false   timestamp built with mktime(), or false when strptime()
 *                     cannot parse the input
 */
function strptimestamp($date, $fmt) {
    $tm = strptime($date, $fmt);
    if ($tm === false) {
        return false;
    }

    // strptime() counts months from 0 and years from 1900; mktime() wants
    // calendar values.
    $month = $tm['tm_mon'] + 1;
    $year = $tm['tm_year'] + 1900;

    return mktime($tm['tm_hour'], $tm['tm_min'], $tm['tm_sec'], $month, $tm['tm_mday'], $year);
}
|
||||
|
||||
/**
 * Run the external command named by the PUP_BIN environment variable with
 * $selector appended, feed it $html on stdin and return its output.
 *
 * @param string $html     markup written to the command's stdin
 * @param string $selector text appended verbatim to the command line (callers
 *                         pass it with a leading space)
 * @return string the command's stdout, HTML-entity-decoded and trimmed; an
 *                empty string when PUP_BIN is unset or the command cannot be
 *                started
 */
function pup_selector(string $html , string $selector): string {
    $bin = getenv("PUP_BIN");
    if ($bin === false) {
        // The declared return type is string, so "nothing" must be ''.
        return '';
    }

    $process = proc_open(
        $bin . $selector,
        [0 => ['pipe', 'r'], 1 => ['pipe', 'w']],
        $pipes
    );
    if (!is_resource($process)) {
        return '';
    }

    fwrite($pipes[0], $html);
    fclose($pipes[0]);
    $out = stream_get_contents($pipes[1]);
    fclose($pipes[1]);
    // Reap the child; without this every call leaves a process handle open.
    proc_close($process);

    if ($out === false) {
        return '';
    }
    return trim(html_entity_decode($out));
}
|
||||
|
||||
/**
 * Extract publication date and author from an article's HTML.
 *
 * Both values are read through pup_selector() from the "td.titoloFTR .small"
 * cell: the date from its <i> element, the author from its <strong> element.
 *
 * @param string $html article markup
 * @return array may contain 'published' (Unix timestamp as a string) and
 *               'author'; empty when PUP_BIN is not set
 */
function extract_metadata_from_html(string $html): array {
    if (getenv("PUP_BIN") === false) {
        return [];
    }

    $meta = [];

    // %I (12-hour clock) instead of %H: the AM/PM marker matched by %p is
    // only applied to a 12-hour hour field, so with %H afternoon times were
    // stored 12 hours early.
    // NOTE(review): assumes the byline uses a 12-hour clock, as the AM/PM
    // marker suggests - confirm against a few source pages.
    $date = strptimestamp(
        pup_selector($html, ' table td.titoloFTR .small i text{}'),
        "%A, %b. %d, %Y at %I:%M %p"
    );
    if ($date !== false) {
        // Same value strftime('%s', $date) produced, without relying on the
        // non-portable %s conversion.
        $meta['published'] = (string) $date;
    }

    $author = pup_selector($html, ' table td.titoloFTR .small strong text{}');
    if ($author !== '') {
        $meta['author'] = $author;
    }

    return $meta;
}
|
||||
|
||||
/**
 * Parse one legacy article file and store it through save().
 *
 * The file is parsed as PHP. Top-level `$GLOBALS[key] = value;` assignments
 * become metadata entries and every inline-HTML chunk is appended to the
 * body; strings and HTML are converted from latin1 to UTF-8. The article id
 * comes from the file name, and a provisional publication date (first day of
 * the month) from the two parent directories (.../<year>/<month>/<nid>.php);
 * values found by extract_metadata_from_html() override the defaults.
 *
 * @param object $db       database handle passed on to save()
 * @param string $filename path of the article file
 * @param mixed  $parser   PhpParser parser instance
 */
function parse_save(object $db, string $filename, $parser) {
    $source = file_get_contents($filename);
    if ($source === false) {
        // Skip unreadable files instead of storing an empty article.
        echo "Cannot read $filename\n";
        return;
    }

    try {
        $ast = $parser->parse($source);
    } catch (Error $error) {
        echo "Parse error on $filename: {$error->getMessage()}\n";
        return;
    }

    $nid = intval(basename($filename, '.php'));
    $fparts = explode('/', $filename);
    $year = $fparts[count($fparts)-3];
    $month = $fparts[count($fparts)-2];
    $dt = strtotime("01-$month-$year");
    $metadata = ['nid' => $nid, 'published' => $dt];
    $body = '';

    foreach ($ast as $part) {
        if ($part instanceof PhpParser\Node\Stmt\InlineHTML) {
            $body .= iconv('latin1', 'utf8', $part->value);
            continue;
        }
        if (!($part instanceof PhpParser\Node\Stmt\Expression)) {
            continue;
        }

        // Only `$GLOBALS[...] = ...;` statements carry metadata.
        $assign = $part->expr;
        if (!($assign instanceof PhpParser\Node\Expr\Assign
            && $assign->var instanceof PhpParser\Node\Expr\ArrayDimFetch
            && $assign->var->var instanceof PhpParser\Node\Expr\Variable
            && $assign->var->var->name === 'GLOBALS')
        ) {
            continue;
        }

        // A key that is not a literal (e.g. `$GLOBALS[] = ...`) has no usable
        // name; skip it rather than filing the value under an empty key.
        $key = $assign->var->dim->value ?? null;
        if ($key === null) {
            continue;
        }

        $value = $assign->expr;
        if ($value instanceof PhpParser\Node\Expr\Array_) {
            $val = $value->items;
        } elseif ($value instanceof PhpParser\Node\Scalar\String_) {
            $val = iconv('latin1', 'utf8', $value->value);
        } else {
            // Other literals expose ->value; anything else is stored as null.
            $val = $value->value ?? null;
        }
        $metadata[$key] = $val;
    }

    $metadata = array_merge($metadata, extract_metadata_from_html($body));
    save($db, $body, $metadata);
}
|
||||
/**
 * Insert (or replace) one article row in the `news` table.
 *
 * @param PDO    $db       open database handle
 * @param string $body     article HTML
 * @param array  $metadata reads 'nid', 'page_title', 'author', 'page_display'
 *                         and 'published'; missing keys are stored as NULL
 */
function save($db, $body, $metadata) {
    $stm = $db->prepare('INSERT OR REPLACE INTO news(nid, title, author, body, display, published) VALUES (?,?,?,?,?,?)');
    if ($stm === false) {
        print("error during INSERT: ");
        print($db->errorInfo()[2]);
        print("\n");
        return;
    }

    // Values are bound with PDO's default (string) parameter type, exactly as
    // before, so what ends up stored is unchanged.
    $values = [
        $metadata['nid'] ?? null,
        $metadata['page_title'] ?? null,
        $metadata['author'] ?? null,
        $body,
        $metadata['page_display'] ?? null,
        $metadata['published'] ?? null,
    ];
    foreach ($values as $i => $value) {
        $stm->bindValue($i + 1, $value);
    }

    if ($stm->execute() === false) {
        // Report the failed row instead of dropping it silently.
        print("error during INSERT: ");
        print($stm->errorInfo()[2]);
        print("\n");
    }
}
|
||||
|
||||
|
||||
|
||||
// Entry point: the first argument is the SQLite database file; the article
// files to import are read from stdin, one path per line.
if (!isset($argv[1])) {
    fwrite(STDERR, "Usage: php parse_one.php <database-file> < list-of-article-files\n");
    exit(1);
}

$db = new PDO('sqlite:' . $argv[1]);
$db->exec('CREATE TABLE IF NOT EXISTS news (nid PRIMARY KEY, body TEXT, author VARCHAR, title VARCHAR, display VARCHAR, published INTEGER, last_modified INTEGER);');
$db->exec('CREATE INDEX IF NOT EXISTS news_published ON news (published);');
$db->exec('CREATE INDEX IF NOT EXISTS news_author ON news (author);');

$parser = (new ParserFactory)->create(ParserFactory::PREFER_PHP5);

// Commit every 100 files so the import is not one huge transaction.
$batch_size = 100;
$pending = 0;
$db->beginTransaction();
while (($line = fgets(STDIN)) !== false) {
    // Strip the line terminator (including a stray "\r") and ignore blank
    // lines, which would otherwise be treated as file names.
    $path = rtrim($line, "\r\n");
    if ($path === '') {
        continue;
    }

    parse_save($db, $path, $parser);
    $pending++;
    if ($pending >= $batch_size) {
        $db->commit();
        $db->beginTransaction();
        $pending = 0;
    }
}
$db->commit();
$db = null;
exit(0);
|
10
requirements.txt
Normal file
10
requirements.txt
Normal file
|
@ -0,0 +1,10 @@
|
|||
Click==7.0
|
||||
Flask==1.1.1
|
||||
flask-paginate==0.5.5
|
||||
Flask-SQLAlchemy==2.4.1
|
||||
itsdangerous==1.1.0
|
||||
Jinja2==2.11.1
|
||||
MarkupSafe==1.1.1
|
||||
pkg-resources==0.0.0
|
||||
SQLAlchemy==1.3.13
|
||||
Werkzeug==1.0.0
|
16
templates/base.html
Normal file
16
templates/base.html
Normal file
|
@ -0,0 +1,16 @@
|
|||
{# Base layout: page-wide styles live in <head>; child templates fill the "content" block. #}
<html>
<head>
<style>
.pagination ul li {
    display: inline;
    padding: 0 0.5em;
}
</style>
</head>
<body>
<div>
{% block content %}
{% endblock %}
</div>
</body>
</html>
|
||||
|
||||
|
10
templates/news.html
Normal file
10
templates/news.html
Normal file
|
@ -0,0 +1,10 @@
|
|||
{# Single article page; expects "n", the article object passed by the view. #}
<article>
    <header>
        <h1>{{ n.title }}</h1>
        <div>
            <time> {{ n.published_datetime.strftime("%d %b %Y") }}</time>
        </div>
    </header>
    {# The body is stored as ready-made HTML, hence the "safe" filter. #}
    <div class="article-body">
        {{ n.body|safe }}
    </div>
</article>
|
23
templates/search_results.html
Normal file
23
templates/search_results.html
Normal file
|
@ -0,0 +1,23 @@
|
|||
{% extends "base.html" %}
{# Result list; expects "results" (articles) and "pagination" from the view. #}
{% block content %}
<style>
.post-hidden { font-size: 70%; }
.post-hidden a { color: gray; }
.result { list-style: none; }
.result .author { font-style: italic; font-size: 85%; }
</style>
<ul>
{% for r in results %}
    {# Articles whose display flag is 'f' get the "post-hidden" class. #}
    <li class="result post-display--{{ r.display }} {% if r.display == 'f' %}post-hidden{% endif %}">
        <a href="{{ url_for('news_by_nid', nid=r.nid) }}">{{ r.nid }}</a>
        - <time>{{ r.published_datetime.strftime('%d/%m/%Y') }}</time>
        - {{ r.title }} by <span class="author">{{ r.author }}</span>
    </li>
{% endfor %}
</ul>
{{ pagination.links }}
{% endblock content %}
|
||||
|
Loading…
Reference in a new issue