initial commit

2020-08-23 16:28:31 +02:00 · 2020-08-23 16:28:31 +02:00 · ac5d8a3cf7
commit ac5d8a3cf7
8 changed files with 284 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
+__pycache__
+news.db
--- a/README.md
+++ b/README.md
@ -0,0 +1,6 @@
+sf-active fa un file per ogni articolo.
+
+obiettivo di `sfaciolo` è ritrasformare il tutto in un db "vero".
+
+Bisogna partire da una certa dir ed "eseguire" (parsare?) il vecchio php. Ovviamente poi buttiamo tutto in un
+db "vero".
--- a/app.py
+++ b/app.py
@ -0,0 +1,79 @@
+import sqlite3
+from datetime import datetime
+
+from  flask import Flask, render_template, redirect, url_for, request
+from  flask_sqlalchemy import SQLAlchemy
+from  flask_paginate import Pagination
+# from sqlalchemy import Column, Integer, String
+# from sqlalchemy.ext.declarative import declarative_base
+
+# Flask application object; the route decorators further down register on it.
+app = Flask(__name__)
+# SQLite database file, given as a relative sqlite:/// URI.
+app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///news.db'
+# Flask-SQLAlchemy handle; provides db.Model and the column types used by News.
+db = SQLAlchemy(app)
+
+
+class News(db.Model):
+    __tablename__ = 'news'
+
+    nid = db.Column(db.String, primary_key=True)
+    author = db.Column(db.Unicode)
+    title = db.Column(db.Unicode)
+    body = db.Column(db.Unicode)
+    published = db.Column(db.String)
+    last_modified = db.Column(db.String)
+    display = db.Column(db.Unicode)
+
+    @property
+    def canonical_url(self):
+        return url_for('news', year=self.published_datetime.year, month=self.published_datetime.month, nid=self.nid)
+
+    @property
+    def published_datetime(self):
+        return datetime.fromtimestamp(self.published)
+
+    @property
+    def published_str(self):
+        return str(self.published_datetime)
+
+    def __repr__(self):
+       return "<News(%s)>" % (self.nid)
+
+@app.route('/')
+def home():
+    return 'homepage'
+
+@app.route('/search/')
+def search_home():
+    return 'cerca cerca'
+
+@app.route('/search/by-month/<int:year>/<int:month>')
+def search_by_month(year, month):
+    # TODO: proper pagination
+    page = int(request.args.get('page', '1')) - 1
+    first = page * 50
+    last = first + 50
+
+    d = datetime(year=year, month=month, day=1)
+    ts_from = int(d.timestamp())
+    if month < 12:
+        d = datetime(year=year, month=month+1, day=1)
+    else:
+        d = datetime(year=year+1, month=1, day=1)
+    ts_to = int(d.timestamp())
+    news = News.query.filter(News.published >= ts_from, News.published < ts_to).order_by('published')
+
+    pagination = Pagination(page=page, total=news.count(), per_page=50)
+    news = news[first:last]
+
+    return render_template('search_results.html', results=news, pagination=pagination)
+
+@app.route('/news/<int:nid>')
+def news_by_nid(nid):
+    n = News.query.get(str(nid))
+    return redirect(n.canonical_url)
+
+@app.route('/news/<year>/<month>/<int:nid>.php')
+def news(year, month, nid):
+    n = News.query.get(str(nid))
+    return render_template('news.html', n=n)
+
--- a/parse_one.php
+++ b/parse_one.php
@ -0,0 +1,138 @@
+<?php
+
+require 'vendor/autoload.php';
+
+use PhpParser\Error;
+use PhpParser\NodeDumper;
+use PhpParser\ParserFactory;
+
+// $f = '/mnt/i/disco_indy/var/www/sf-active/italy/website/news/2001/07/11102.php';
+
+/**
+ * Parse $date with the strptime() format $fmt and return a Unix timestamp.
+ *
+ * Returns false when strptime() cannot parse the input.
+ */
+function strptimestamp($date, $fmt) {
+    $parsed = strptime($date, $fmt);
+    if ($parsed === false) {
+        return false;
+    }
+    // strptime() reports months as 0-11 and years as an offset from 1900;
+    // mktime() wants the calendar values.
+    $month = $parsed['tm_mon'] + 1;
+    $year = $parsed['tm_year'] + 1900;
+    return mktime(
+        $parsed['tm_hour'], $parsed['tm_min'], $parsed['tm_sec'],
+        $month, $parsed['tm_mday'], $year
+    );
+}
+
+/**
+ * Run the external program named by the PUP_BIN environment variable on
+ * $html with $selector appended to the command line, and return its output
+ * with HTML entities decoded and surrounding whitespace trimmed.
+ *
+ * Returns '' when PUP_BIN is not set or the process cannot be started.
+ *
+ * NOTE: $selector is concatenated into the command line unescaped; only
+ * pass trusted, hard-coded selectors.
+ */
+function pup_selector(string $html , string $selector): string {
+    $pup = getenv("PUP_BIN");
+    if($pup === false) {
+        // The declared return type is string; returning an array here
+        // would raise a TypeError.
+        return '';
+    }
+    $process = proc_open($pup . $selector,
+        [0=>['pipe', 'r'], 1=>['pipe', 'w']],
+        $pipes);
+    if(!is_resource($process)) {
+        return '';
+    }
+    fwrite($pipes[0], $html);
+    fclose($pipes[0]);
+    $out = stream_get_contents($pipes[1]);
+    fclose($pipes[1]);
+    // Reap the child so a long import does not accumulate processes.
+    proc_close($process);
+    if($out === false) {
+        return '';
+    }
+    return trim(html_entity_decode($out));
+}
+
+/**
+ * Extract the publication date and the author from an article's HTML.
+ *
+ * Returns an array that may contain 'published' (Unix timestamp) and
+ * 'author'; a key is left out when its value cannot be extracted. Returns
+ * an empty array when the PUP_BIN environment variable is not set.
+ */
+function extract_metadata_from_html(string $html): array {
+    if(getenv("PUP_BIN") === false) {
+        return [];
+    }
+    $meta = [];
+    $raw_date = pup_selector($html, ' table td.titoloFTR .small i text{}');
+    // Try the 12-hour clock first: with %H the AM/PM marker matched by %p
+    // is not applied to the hour, so afternoon times would be stored as
+    // morning ones.
+    $date = strptimestamp($raw_date, "%A, %b. %d, %Y at %I:%M %p");
+    if($date === false) {
+        // Fall back to the original format for hours outside 1-12.
+        $date = strptimestamp($raw_date, "%A, %b. %d, %Y at %H:%M %p");
+    }
+
+    if($date !== false) {
+        // $date already is a Unix timestamp; no strftime('%s') round trip.
+        $meta['published'] = $date;
+    }
+    $author = pup_selector($html, ' table td.titoloFTR .small strong text{}');
+    if($author !== '') {
+        $meta['author'] = $author;
+    }
+    return $meta;
+}
+
+/**
+ * Parse one legacy article file and store it in the database.
+ *
+ * The article id is the file's basename without '.php'. A fallback
+ * publication date (first day of the month) is taken from the two directory
+ * names above the file (.../<year>/<month>/<nid>.php). Assignments to
+ * $GLOBALS[...] found in the file become metadata entries, inline HTML
+ * becomes the body, and metadata extracted from that HTML overrides the
+ * defaults. Strings are converted from latin1 to utf8.
+ *
+ * Files that cannot be read or parsed are reported on stdout and skipped.
+ */
+function parse_save(object $db, string $filename, $parser) {
+    $source = file_get_contents($filename);
+    if($source === false) {
+        echo "Cannot read $filename\n";
+        return;
+    }
+    try {
+        $ast = $parser->parse($source);
+    } catch (Error $error) {
+        echo "Parse error on $filename: {$error->getMessage()}\n";
+        return;
+    }
+
+    $nid = intval(basename($filename, '.php'));
+    $fparts = explode('/', $filename);
+    $year = $fparts[count($fparts)-3];
+    $month = $fparts[count($fparts)-2];
+    $dt = strtotime("01-$month-$year");
+    $metadata = ['nid' => $nid, 'published' => $dt];
+    $body = '';
+    foreach($ast ?? [] as $part)
+    {
+        if($part instanceof PhpParser\Node\Stmt\Expression) {
+            // Only statements of the form $GLOBALS[<key>] = <value>; count.
+            if($part->expr instanceof PhpParser\Node\Expr\Assign
+                && $part->expr->var instanceof PhpParser\Node\Expr\ArrayDimFetch
+                && $part->expr->var->var instanceof PhpParser\Node\Expr\Variable
+                && $part->expr->var->var->name === 'GLOBALS'
+                // skip $GLOBALS[] = ... and non-literal keys
+                && isset($part->expr->var->dim->value)
+            ) {
+                if($part->expr->expr instanceof PhpParser\Node\Expr\Array_) {
+                    $val = $part->expr->expr->items;
+                } elseif($part->expr->expr instanceof PhpParser\Node\Scalar\String_ ){
+                    $val = iconv('latin1', 'utf8', $part->expr->expr->value);
+                } else {
+                    // Nodes without a literal value yield null.
+                    $val = $part->expr->expr->value ?? null;
+                }
+                $metadata[$part->expr->var->dim->value] = $val;
+            }
+        }elseif($part instanceof PhpParser\Node\Stmt\InlineHTML ) {
+            $body .= iconv('latin1', 'utf8', $part->value);
+        }
+    }
+    // Values scraped from the HTML win over the path-derived defaults.
+    $metadata = array_merge($metadata, extract_metadata_from_html($body));
+    save($db, $body, $metadata);
+}
+/**
+ * Insert (or replace) one article row in the news table.
+ *
+ * $metadata must contain 'nid'; 'page_title', 'author', 'page_display' and
+ * 'published' are stored as NULL when missing. Failures are printed, not
+ * raised.
+ */
+function save($db, $body, $metadata) {
+    // print("Loading ". $metadata['nid'] . "\n");
+    $stm = $db->prepare('INSERT OR REPLACE INTO news(nid, title, author, body, display, published) VALUES (?,?,?,?,?,?)');
+    if($stm === false) {
+        print("error during INSERT:   ");
+        print($db->errorInfo()[2]);
+        return;
+    }
+    // bindValue with an explicit null default: optional keys may be absent.
+    $stm->bindValue(1, $metadata['nid']);
+    $stm->bindValue(2, $metadata['page_title'] ?? null);
+    $stm->bindValue(3, $metadata['author'] ?? null);
+    $stm->bindValue(4, $body);
+    $stm->bindValue(5, $metadata['page_display'] ?? null);
+    $stm->bindValue(6, $metadata['published'] ?? null);
+    if($stm->execute() === false) {
+        print("error during INSERT:   ");
+        print($stm->errorInfo()[2]);
+    }
+}
+
+
+
+// Entry point: php parse_one.php <sqlite-db-file>, with one article path per
+// line on standard input.
+if($argc < 2) {
+    fwrite(STDERR, "Usage: php parse_one.php <sqlite-db-file> < list-of-files\n");
+    exit(1);
+}
+
+$db = new PDO('sqlite:' . $argv[1]);
+$db->exec('CREATE TABLE IF NOT EXISTS news (nid PRIMARY KEY,  body TEXT, author VARCHAR, title VARCHAR, display VARCHAR, published INTEGER, last_modified INTEGER);');
+$db->exec('CREATE INDEX IF NOT EXISTS news_published ON news (published);');
+$db->exec('CREATE INDEX IF NOT EXISTS news_author ON news (author);');
+
+
+$parser = (new ParserFactory)->create(ParserFactory::PREFER_PHP5);
+$i = 0;
+$db->beginTransaction();
+// Compare against false explicitly so a falsy line does not end the loop.
+while(($f = fgets(STDIN)) !== false) {
+    $f = rtrim($f, "\r\n");
+    if($f === '') {
+        // blank line: nothing to import
+        continue;
+    }
+    parse_save($db, $f, $parser);
+    $i++;
+    // Commit every 100 files so a single transaction does not grow unbounded.
+    if($i >= 100) {
+        $db->commit();
+        $db->beginTransaction();
+        $i = 0;
+    }
+}
+$db->commit();
+$db = null;
+exit(0);
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,10 @@
+Click==7.0
+Flask==1.1.1
+flask-paginate==0.5.5
+Flask-SQLAlchemy==2.4.1
+itsdangerous==1.1.0
+Jinja2==2.11.1
+MarkupSafe==1.1.1
+pkg-resources==0.0.0
+SQLAlchemy==1.3.13
+Werkzeug==1.0.0
--- a/templates/base.html
+++ b/templates/base.html
@ -0,0 +1,16 @@
+{# Base layout: child templates fill the "content" block. #}
+<html>
+    <head>
+<style>
+.pagination ul li {
+    display: inline;
+    padding: 0 0.5em;
+}
+</style>
+    </head>
+    <body>
+    <div>
+        {% block content %}
+        {% endblock %}
+    </div>
+    </body>
+</html>
+
+
--- a/templates/news.html
+++ b/templates/news.html
@ -0,0 +1,10 @@
+{% extends "base.html" %}
+{# Single article page; "n" is the article object passed by the view. #}
+{% block content %}
+<article>
+    <header> <h1>{{n.title}}</h1>
+        <div>
+        <time> {{n.published_datetime.strftime("%d %b %Y")}}</time>
+        </div>
+    </header>
+    <div class="article-body">
+        {# The body is stored as HTML and is emitted unescaped on purpose. #}
+        {{n.body|safe}}
+    </div>
+</article>
+{% endblock content %}
--- a/templates/search_results.html
+++ b/templates/search_results.html
@ -0,0 +1,23 @@
+{% extends "base.html" %}
+{# Result list: expects "results" (iterable of articles) and "pagination". #}
+{% block content %}
+<style>
+.post-hidden { font-size: 70%; }
+.post-hidden a { color: gray; }
+.result {
+    list-style: none;
+}
+.result .author { font-style: italic;  font-size: 85%;}
+</style>
+        <ul>
+            {# Entries whose display value is 'f' get the smaller, grayed post-hidden style. #}
+            {% for r in results %}
+            <li class="result post-display--{{r.display}} {% if r.display == 'f' %}post-hidden{%endif%}"
+                >
+                <a href="{{url_for('news_by_nid', nid=r.nid)}}">{{r.nid}}</a>
+                - <time>{{r.published_datetime.strftime('%d/%m/%Y')}}</time>
+                - {{r.title}} by <span class="author">{{r.author}}</span>
+            </li>
+            {% endfor %}
+        </ul>
+        {{pagination.links}}
+{% endblock content %}
+