italy-indy-py/parse_comment.php
2020-08-23 19:50:47 +02:00

206 sor
6,9 KiB
PHP

<?php
/*
* To run this script, feed it on standard input the paths of all the PHP files you want to parse (one per line),
* for example: echo 11102_comment.php | php parse_comment.php a.db
* or
* find italy/website/ -name '*_comment.php' | php parse_comment.php a.db
*/
require 'vendor/autoload.php';
use PhpParser\Error;
use PhpParser\NodeDumper;
use PhpParser\ParserFactory;
// $f = '/mnt/i/disco_indy/var/www/sf-active/italy/website/news/2001/07/11102.php';
/**
 * Parse a date string with a strptime()-style format and turn it into a Unix timestamp.
 *
 * @param string $date Text to parse.
 * @param string $fmt  strptime() format describing $date.
 * @return int|false Timestamp built by mktime() from the parsed fields,
 *                   or false when strptime() cannot parse the input.
 */
function strptimestamp($date, $fmt) {
    $parsed = strptime($date, $fmt);
    if ($parsed !== false) {
        // strptime() reports months as 0-11 and years as an offset from 1900,
        // whereas mktime() expects 1-12 and the full year.
        $month = $parsed['tm_mon'] + 1;
        $year = $parsed['tm_year'] + 1900;
        return mktime(
            $parsed['tm_hour'],
            $parsed['tm_min'],
            $parsed['tm_sec'],
            $month,
            $parsed['tm_mday'],
            $year
        );
    }
    return false;
}
/**
 * Run the external `pup` binary on a piece of HTML and return what it prints.
 *
 * The command to run is taken from the PUP_BIN environment variable; $html is
 * written to its standard input and $selector is passed as its single argument.
 *
 * @param string $html     HTML document or fragment fed to the process on stdin.
 * @param string $selector Selector expression handed to the binary (callers in this
 *                         file also append display functions such as `text{}` / `json{}`).
 * @return string The process output, HTML-entity-decoded and trimmed. An empty string
 *                when PUP_BIN is not set or the process cannot be started.
 */
function pup_selector(string $html , string $selector): string {
    $bin = getenv("PUP_BIN");
    if ($bin === false) {
        // The declared return type is string: returning an array here would raise a TypeError.
        return '';
    }
    // escapeshellarg() quotes the selector like the previous hand-written '...' wrapping did,
    // but also survives selectors that themselves contain a single quote.
    $process = proc_open(
        $bin . ' ' . escapeshellarg($selector),
        [0 => ['pipe', 'r'], 1 => ['pipe', 'w']],
        $pipes
    );
    if (!is_resource($process)) {
        return '';
    }
    fwrite($pipes[0], $html);
    // Closing stdin signals end-of-input to the child process.
    fclose($pipes[0]);
    $out = stream_get_contents($pipes[1]);
    fclose($pipes[1]);
    // Release the process handle once both pipes are closed.
    proc_close($process);
    return trim(html_entity_decode($out === false ? '' : $out));
}
/**
 * Extract the publication date and the author from an HTML fragment using `pup`.
 *
 * The date is read from `td.titoloFTR .small i` and the author from the `text`
 * field of the first `td.titoloFTR .small strong` element reported by pup.
 *
 * @param string $html HTML fragment to inspect.
 * @return array Map that may contain 'published' (Unix timestamp as a string) and
 *               'author' (non-empty string). Keys are simply absent when the value
 *               cannot be extracted. Empty array when PUP_BIN is not set.
 */
function extract_metadata_from_html(string $html): array {
    if (getenv("PUP_BIN") === false) {
        return [];
    }
    $meta = [];
    // NOTE(review): the format pairs %H (24-hour clock) with %p (AM/PM). If the pages
    // print 12-hour times, %I is probably what is wanted - confirm against real data.
    $date = strptimestamp(
        pup_selector($html, 'td.titoloFTR .small i text{}'),
        "%A, %b. %d, %Y at %H:%M %p"
    );
    if ($date !== false) {
        // Store the timestamp directly rather than through strftime('%s'):
        // '%s' is a non-standard specifier and strftime() is deprecated since PHP 8.1.
        $meta['published'] = (string) $date;
    }
    // pup may print nothing (or no match), in which case json_decode() does not
    // return a usable list; only a real, non-empty string is accepted as author so
    // that a null never overrides a value the caller already has.
    $matches = json_decode(pup_selector($html, 'td.titoloFTR .small strong json{}'), true);
    $author = is_array($matches) ? ($matches[0]['text'] ?? null) : null;
    if (is_string($author) && $author !== '') {
        $meta['author'] = $author;
    }
    return $meta;
}
/**
 * Split an HTML fragment into comments (one per `table` element) and extract
 * text, markup, date and author from each of them using `pup`.
 *
 * @param string $html HTML fragment holding the comment tables.
 * @return array List of maps, one per table, with the keys 'text' and 'html'
 *               (content of `tr:nth-child(3) td.testoFTR`), plus 'published'
 *               (Unix timestamp as a string) and 'author' when they can be extracted.
 *               Empty array when PUP_BIN is not set, when $html is blank, or when
 *               pup reports no table.
 */
function extract_comments_from_html(string $html): array {
    if (getenv("PUP_BIN") === false) {
        return [];
    }
    if (trim($html) === '') {
        return [];
    }
    // The "associative" flag belongs to json_decode(), not to pup_selector().
    // The result is validated before counting because json_decode() returns null
    // on empty or invalid output, and count(null) is a TypeError on PHP 8.
    $tables = json_decode(pup_selector($html, 'table json{}'), true);
    if (!is_array($tables) || count($tables) === 0) {
        return [];
    }
    $length = count($tables);
    $comments = [];
    // nth-of-type() is 1-based, hence the loop bounds.
    for ($i = 1; $i <= $length; $i++) {
        $comment = pup_selector($html, 'table:nth-of-type(' . $i . ')');
        $meta = [];
        // NOTE(review): the format pairs %H (24-hour clock) with %p (AM/PM). If the pages
        // print 12-hour times, %I is probably what is wanted - confirm against real data.
        $date = strptimestamp(
            pup_selector($comment, 'table td.titoloFTR .small i text{}'),
            "%A, %b. %d, %Y at %H:%M %p"
        );
        $meta['text'] = pup_selector($comment, 'tr:nth-child(3) td.testoFTR text{}');
        $meta['html'] = pup_selector($comment, 'tr:nth-child(3) td.testoFTR');
        if ($date !== false) {
            // Store the timestamp directly rather than through strftime('%s'):
            // '%s' is a non-standard specifier and strftime() is deprecated since PHP 8.1.
            $meta['published'] = (string) $date;
        }
        // Only a real, non-empty string counts as an author; a missing match
        // must not end up as a null 'author' entry.
        $matches = json_decode(pup_selector($comment, 'table td.titoloFTR .small strong json{}'), true);
        $author = is_array($matches) ? ($matches[0]['text'] ?? null) : null;
        if (is_string($author) && $author !== '') {
            $meta['author'] = $author;
        }
        $comments[] = $meta;
    }
    return $comments;
}
/**
 * Parse one PHP page, collect its metadata, body and comments, and store them.
 *
 * Top-level `$GLOBALS[...] = ...` assignments found in the file become metadata
 * entries; inline HTML outside PHP tags is concatenated into the page body. Both
 * are converted from latin1 to utf8. The first table of the body is treated as the
 * main article and the following tables as comments. The result is handed to save().
 *
 * @param object $db       Database handle, passed through to save().
 * @param string $filename Path of the PHP file to parse. The numeric id is taken from
 *                         the file name, and a default publication date from the two
 *                         directory names that precede it (used as year and month).
 * @param mixed  $parser   Object exposing parse(string): array, as built by ParserFactory.
 * @return void Prints a message and returns early when the file cannot be read or parsed.
 */
function parse_save(object $db, string $filename, $parser) {
    // An empty path makes file_get_contents() throw on PHP 8; an unreadable file
    // yields false, which would otherwise be parsed as an empty document and saved
    // as a blank row.
    $source = $filename === '' ? false : file_get_contents($filename);
    if ($source === false) {
        echo "Cannot read $filename\n";
        return;
    }
    try {
        $ast = $parser->parse($source);
    } catch (Error $error) {
        echo "Parse error on $filename: {$error->getMessage()}\n";
        return;
    }
    // intval() keeps the leading digits, so "11102_comment" gives 11102.
    $nid = intval(basename($filename, '.php'));
    $fparts = explode('/', $filename);
    $nparts = count($fparts);
    // Paths are expected to end in <year>/<month>/<file>; shorter paths give empty parts
    // (and strtotime() then returns false) instead of undefined-offset warnings.
    $year = $fparts[$nparts - 3] ?? '';
    $month = $fparts[$nparts - 2] ?? '';
    $dt = strtotime("01-$month-$year");
    $metadata = ['nid' => $nid, 'published' => $dt];
    $body = '';
    foreach ($ast ?? [] as $part) {
        if ($part instanceof PhpParser\Node\Stmt\Expression) {
            $expr = $part->expr;
            // Only `$GLOBALS[<scalar key>] = <something>;` statements are of interest.
            if ($expr instanceof PhpParser\Node\Expr\Assign
                && $expr->var instanceof PhpParser\Node\Expr\ArrayDimFetch
                && $expr->var->var instanceof PhpParser\Node\Expr\Variable
                && $expr->var->var->name === 'GLOBALS'
                && isset($expr->var->dim->value)
            ) {
                $assigned = $expr->expr;
                if ($assigned instanceof PhpParser\Node\Expr\Array_) {
                    $val = $assigned->items;
                } elseif ($assigned instanceof PhpParser\Node\Scalar\String_) {
                    $val = iconv('latin1', 'utf8', $assigned->value);
                } else {
                    // Nodes without a `value` property (e.g. calls) are stored as null.
                    $val = $assigned->value ?? null;
                }
                $metadata[$expr->var->dim->value] = $val;
            }
        } elseif ($part instanceof PhpParser\Node\Stmt\InlineHTML) {
            $body .= iconv('latin1', 'utf8', $part->value);
        }
    }
    $main = pup_selector($body, 'table:first-of-type');
    // Values extracted from the HTML take precedence over the defaults computed above.
    $metadata = array_merge($metadata, extract_metadata_from_html($main));
    $metadata['body'] = pup_selector($main, '.testoFTR');
    $metadata['text'] = pup_selector($body, 'text{}');
    $others = pup_selector($body, 'table:nth-of-type(n+2)');
    $metadata['comments'] = extract_comments_from_html($others);
    save($db, $body, $metadata);
}
/**
 * Insert (or replace) one news row and its comments.
 *
 * @param PDO    $db       Open connection to a database holding the `news` and `comments` tables.
 * @param string $body     Raw page body; not used by this function, kept for the callers.
 * @param array  $metadata Values to store: 'nid', 'page_title', 'author', 'body',
 *                         'page_display', 'published' for the news row, and 'comments',
 *                         a list of maps with 'published', 'author' and 'html'.
 *                         Missing keys are stored as NULL.
 * @return void Prints a message when a statement cannot be prepared or executed.
 */
function save($db, $body, $metadata) {
    $stm = $db->prepare('INSERT OR REPLACE INTO news(nid, title, author, body, display, published) VALUES (?,?,?,?,?,?)');
    if ($stm === false) {
        print("error during INSERT: ");
        print($db->errorInfo()[2]);
        return;
    }
    // Values passed to execute() are bound as strings, which is also what
    // bindParam() does by default.
    $ok = $stm->execute([
        $metadata['nid'] ?? null,
        $metadata['page_title'] ?? null,
        $metadata['author'] ?? null,
        $metadata['body'] ?? null,
        $metadata['page_display'] ?? null,
        $metadata['published'] ?? null,
    ]);
    if ($ok === false) {
        print("error during INSERT: ");
        print($stm->errorInfo()[2]);
    }

    $comments = $metadata['comments'] ?? [];
    if (count($comments) === 0) {
        return;
    }
    // Prepared once and reused for every comment instead of once per iteration.
    $stm = $db->prepare('INSERT OR REPLACE INTO comments(nid, num, published, author, body) VALUES (?,?,?,?,?)');
    if ($stm === false) {
        print("error during INSERT: ");
        print($db->errorInfo()[2]);
        return;
    }
    // Comments are numbered from 1 in the order they are listed.
    $num = 0;
    foreach ($comments as $comm) {
        $num++;
        $ok = $stm->execute([
            $metadata['nid'] ?? null,
            $num,
            $comm['published'] ?? null,
            $comm['author'] ?? null,
            $comm['html'] ?? null,
        ]);
        if ($ok === false) {
            print("error during INSERT: ");
            print($stm->errorInfo()[2]);
        }
    }
}
// Script entry point: open (or create) the SQLite database named by the first
// command-line argument, make sure the schema exists, then parse every file
// whose path is read from standard input (one path per line).
if (!isset($argv[1]) || $argv[1] === '') {
    fwrite(STDERR, "Usage: php parse_comment.php <database file> < list-of-php-files\n");
    exit(1);
}
$db = new PDO('sqlite:' . $argv[1]);
$db->exec('CREATE TABLE IF NOT EXISTS news (nid PRIMARY KEY, body TEXT, author VARCHAR, title VARCHAR, display VARCHAR, published INTEGER, last_modified INTEGER);');
$db->exec('CREATE TABLE IF NOT EXISTS comments (nid INTEGER, num INTEGER, body TEXT, author VARCHAR, title VARCHAR, display VARCHAR, published INTEGER,
FOREIGN KEY(nid) REFERENCES news(nid),
PRIMARY KEY(nid, num)
);');
$db->exec('CREATE INDEX IF NOT EXISTS news_published ON news (published);');
$db->exec('CREATE INDEX IF NOT EXISTS news_author ON news (author);');
$parser = (new ParserFactory)->create(ParserFactory::PREFER_PHP5);
// Number of files handled in the currently open transaction.
$i = 0;
$db->beginTransaction();
// Compare against false explicitly so that a falsy line (e.g. "0" without a
// trailing newline) does not end the loop early.
while (($f = fgets(STDIN)) !== false) {
    // Strip the line terminator, including the CR of CRLF input.
    $f = rtrim($f, "\r\n");
    if ($f === '') {
        // Blank lines are not file names.
        continue;
    }
    parse_save($db, $f, $parser);
    $i++;
    // Commit in batches of 100 files.
    if ($i >= 100) {
        $db->commit();
        $db->beginTransaction();
        $i = 0;
    }
}
$db->commit();
$db = null;
exit(0);