|
@@ -0,0 +1,138 @@
|
|
|
+<?php
|
|
|
+
|
|
|
+require 'vendor/autoload.php';
|
|
|
+
|
|
|
+use PhpParser\Error;
|
|
|
+use PhpParser\NodeDumper;
|
|
|
+use PhpParser\ParserFactory;
|
|
|
+
|
|
|
+// $f = '/mnt/i/disco_indy/var/www/sf-active/italy/website/news/2001/07/11102.php';
|
|
|
+
|
|
|
/**
 * Parse a date string with a strptime()-style format and turn it into a timestamp.
 *
 * @param string $date Date text to parse.
 * @param string $fmt  strptime() format describing $date.
 * @return int|false Value produced by mktime() from the parsed fields, or
 *                   false when strptime() cannot parse $date.
 */
function strptimestamp($date, $fmt) {
    $parsed = strptime($date, $fmt);
    if ($parsed !== false) {
        // strptime() reports the month as 0-11 and the year as an offset from 1900.
        $month = $parsed['tm_mon'] + 1;
        $year = $parsed['tm_year'] + 1900;
        return mktime(
            $parsed['tm_hour'],
            $parsed['tm_min'],
            $parsed['tm_sec'],
            $month,
            $parsed['tm_mday'],
            $year
        );
    }
    return false;
}
|
|
|
+
|
|
|
/**
 * Run the external command named by the PUP_BIN environment variable on an
 * HTML document and return what it prints for the given selector.
 *
 * The HTML is written to the command's stdin and its stdout is read back,
 * entity-decoded and trimmed.
 *
 * @param string $html     HTML document fed to the command's stdin.
 * @param string $selector Selector expression passed to the command as a single
 *                         shell-escaped argument (surrounding whitespace is ignored,
 *                         so callers may keep passing a leading space).
 * @return string Decoded, trimmed output; '' when PUP_BIN is unset/empty or the
 *                process cannot be started.
 */
function pup_selector(string $html , string $selector): string {
    $bin = getenv("PUP_BIN");
    if ($bin === false || $bin === '') {
        // The declared return type is string, so "no result" must be '' (not []).
        return '';
    }

    // Quote the selector so characters such as '>', '*' or '|' reach the tool
    // instead of being interpreted by the shell.
    $command = $bin . ' ' . escapeshellarg(trim($selector));
    $process = proc_open(
        $command,
        [0 => ['pipe', 'r'], 1 => ['pipe', 'w']],
        $pipes
    );
    if (!is_resource($process)) {
        return '';
    }

    // Send the whole document, then close stdin so the child sees EOF.
    fwrite($pipes[0], $html);
    fclose($pipes[0]);

    $out = stream_get_contents($pipes[1]);
    fclose($pipes[1]);

    // Reap the child; without this every call leaves a process handle open.
    proc_close($process);

    return trim(html_entity_decode((string) $out));
}
|
|
|
+
|
|
|
/**
 * Extract publication date and author from a rendered article page.
 *
 * Both values are located with pup_selector() inside `table td.titoloFTR .small`.
 *
 * @param string $html Article HTML.
 * @return array Possibly empty map with the optional keys:
 *               'published' (timestamp as a numeric string) and 'author'.
 *               Always [] when the PUP_BIN environment variable is not set.
 */
function extract_metadata_from_html(string $html): array {
    if (getenv("PUP_BIN") === false) {
        // Without the external tool there is nothing to extract; avoid spawning anything.
        return [];
    }

    $meta = [];

    // NOTE(review): the format combines %H (24-hour clock) with %p (AM/PM).
    // If the pages print 12-hour times, %I is presumably what is wanted -
    // confirm against real input before changing it.
    $date = strptimestamp(
        pup_selector($html, ' table td.titoloFTR .small i text{}'),
        "%A, %b. %d, %Y at %H:%M %p"
    );
    if ($date !== false) {
        // $date is already a Unix timestamp; a plain cast yields the same
        // digits strftime('%s') did, without the deprecated, platform-specific call.
        $meta['published'] = (string) $date;
    }

    $author = pup_selector($html, ' table td.titoloFTR .small strong text{}');
    if ($author !== '') {
        $meta['author'] = $author;
    }

    return $meta;
}
|
|
|
+
|
|
|
/**
 * Parse one generated article file and store it through save().
 *
 * The file is parsed as PHP. Top-level `$GLOBALS[<key>] = <value>;` assignments
 * become metadata entries, and inline HTML chunks are concatenated into the
 * article body. Strings and HTML are converted from ISO-8859-1 to UTF-8.
 * Metadata extracted from the HTML itself (extract_metadata_from_html())
 * overrides the values derived from the path.
 *
 * The numeric id comes from the file name (`<nid>.php`); the fallback
 * publication date is the first day of the `<year>/<month>` taken from the
 * two directories that contain the file.
 *
 * @param object $db       Database handle handed to save().
 * @param string $filename Path of the file to import.
 * @param mixed  $parser   Object exposing parse(string) that returns the list of
 *                         top-level statements (a PhpParser parser in this script).
 * @return void
 */
function parse_save(object $db, string $filename, $parser) {
    // A missing/unreadable file would otherwise parse as an empty document
    // and be saved as an empty row.
    $source = is_readable($filename) ? file_get_contents($filename) : false;
    if ($source === false) {
        echo "Cannot read $filename\n";
        return;
    }

    try {
        $ast = $parser->parse($source);
    } catch (Error $error) {
        echo "Parse error on $filename: {$error->getMessage()}\n";
        return;
    }

    // Derive id and fallback date from ".../<year>/<month>/<nid>.php".
    $nid = intval(basename($filename, '.php'));
    $fparts = explode('/', $filename);
    $year = $fparts[count($fparts)-3];
    $month = $fparts[count($fparts)-2];
    $dt = strtotime("01-$month-$year");
    $metadata = ['nid' => $nid, 'published' => $dt];
    $body = '';

    foreach ($ast as $part) {
        if ($part instanceof PhpParser\Node\Stmt\InlineHTML) {
            $body .= iconv('ISO-8859-1', 'UTF-8', $part->value);
            continue;
        }
        if (!($part instanceof PhpParser\Node\Stmt\Expression)) {
            continue;
        }

        // Only `$GLOBALS[...] = ...` assignments carry metadata.
        $assign = $part->expr;
        if (!($assign instanceof PhpParser\Node\Expr\Assign
            && $assign->var instanceof PhpParser\Node\Expr\ArrayDimFetch
            && $assign->var->var instanceof PhpParser\Node\Expr\Variable
            && $assign->var->var->name === 'GLOBALS')
        ) {
            continue;
        }

        // `$GLOBALS[] = ...` or a computed key has no literal value to use as a name.
        if (!isset($assign->var->dim->value)) {
            continue;
        }

        $rhs = $assign->expr;
        if ($rhs instanceof PhpParser\Node\Expr\Array_) {
            $val = $rhs->items;
        } elseif ($rhs instanceof PhpParser\Node\Scalar\String_) {
            $val = iconv('ISO-8859-1', 'UTF-8', $rhs->value);
        } else {
            // Other scalars expose ->value; anything else is recorded as null.
            $val = $rhs->value ?? null;
        }
        $metadata[$assign->var->dim->value] = $val;
    }

    $metadata = array_merge($metadata, extract_metadata_from_html($body));
    save($db, $body, $metadata);
}
|
|
|
/**
 * Insert (or replace) one article row in the `news` table.
 *
 * @param PDO    $db       Open database connection.
 * @param string $body     Article HTML body.
 * @param array  $metadata Map read for the keys 'nid', 'page_title', 'author',
 *                         'page_display' and 'published'. Missing or non-scalar
 *                         entries are stored as NULL.
 * @return void Failures are printed, not raised.
 */
function save($db, $body, $metadata) {
    $stm = $db->prepare('INSERT OR REPLACE INTO news(nid, title, author, body, display, published) VALUES (?,?,?,?,?,?)');
    if ($stm === false) {
        print("error during INSERT: ");
        print($db->errorInfo()[2]);
        print("\n");
        return;
    }

    // Metadata values can be arrays of parser nodes; binding those would store
    // the literal text "Array", so anything non-scalar becomes NULL.
    $field = static function ($key) use ($metadata) {
        $value = $metadata[$key] ?? null;
        return is_scalar($value) ? $value : null;
    };

    // Passing the values to execute() binds them all as strings, exactly like
    // the default bindParam() type does.
    $ok = $stm->execute([
        $field('nid'),
        $field('page_title'),
        $field('author'),
        $body,
        $field('page_display'),
        $field('published'),
    ]);
    if ($ok === false) {
        print("error during INSERT: ");
        print($stm->errorInfo()[2]);
        print("\n");
    }
}
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
// Entry point: `php <script> <sqlite-db-file>` with one file path per line on stdin.
if (!isset($argv[1]) || $argv[1] === '') {
    // Without a path PDO would be handed an empty file name and the import
    // would not end up in the database the caller expects.
    fwrite(STDERR, "Usage: php " . ($argv[0] ?? 'script') . " <sqlite-db-file> < file-list\n");
    exit(1);
}

$db = new PDO('sqlite:' . $argv[1]);
$db->exec('CREATE TABLE IF NOT EXISTS news (nid PRIMARY KEY, body TEXT, author VARCHAR, title VARCHAR, display VARCHAR, published INTEGER, last_modified INTEGER);');
$db->exec('CREATE INDEX IF NOT EXISTS news_published ON news (published);');
$db->exec('CREATE INDEX IF NOT EXISTS news_author ON news (author);');

$parser = (new ParserFactory)->create(ParserFactory::PREFER_PHP5);

// Commit every 100 files so a long import is written in batches.
$batch_size = 100;
$i = 0;
$db->beginTransaction();
// Compare against false explicitly: a line such as "0" is falsy and would end the loop early.
while (($line = fgets(STDIN)) !== false) {
    // Strip the line terminator (including a Windows "\r") and skip blank lines.
    $f = rtrim($line, "\r\n");
    if ($f === '') {
        continue;
    }
    parse_save($db, $f, $parser);
    $i++;
    if ($i >= $batch_size) {
        $db->commit();
        $db->beginTransaction();
        $i = 0;
    }
}
$db->commit();
$db = null;
exit(0);
|