123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206 |
- <?php
/*
 * To run this script, feed it on standard input the paths of all the PHP files you want to parse,
 * for example: echo 11102_comment.php | php parse_comment.php a.db
 * or
 * find italy/website/ -name '*_comment.php' | php parse_comment.php a.db
 */
- require 'vendor/autoload.php';
- use PhpParser\Error;
- use PhpParser\NodeDumper;
- use PhpParser\ParserFactory;
- // $f = '/mnt/i/disco_indy/var/www/sf-active/italy/website/news/2001/07/11102.php';
/**
 * Parse a date string with a strptime() format and turn it into a Unix timestamp.
 *
 * @param string $date Text to parse.
 * @param string $fmt  strptime()-style format describing $date.
 * @return int|false   Timestamp built by mktime(), or false when strptime() cannot parse $date.
 */
function strptimestamp($date, $fmt) {
    $parsed = strptime($date, $fmt);
    if ($parsed === false) {
        return false;
    }

    // strptime() reports the month as 0-11 and the year as an offset from 1900,
    // while mktime() expects a 1-12 month and a full year.
    $month = $parsed['tm_mon'] + 1;
    $year = $parsed['tm_year'] + 1900;

    return mktime(
        $parsed['tm_hour'],
        $parsed['tm_min'],
        $parsed['tm_sec'],
        $month,
        $parsed['tm_mday'],
        $year
    );
}
/**
 * Pipe an HTML fragment through the external `pup` binary and return what it prints.
 *
 * The executable is read from the PUP_BIN environment variable; its value is used
 * verbatim as the start of the shell command, so it may carry extra arguments.
 *
 * @param string $html     HTML written to the process's standard input.
 * @param string $selector Selector expression handed to the binary as a single argument
 *                         (callers also append display functions such as `text{}` or `json{}`).
 * @return string The process output, HTML-entity-decoded and trimmed. An empty string when
 *                PUP_BIN is not set or the process cannot be started.
 */
function pup_selector(string $html, string $selector): string {
    $bin = getenv("PUP_BIN");
    if ($bin === false) {
        // The declared return type is string; returning an array here raised a TypeError.
        return '';
    }

    // escapeshellarg() quotes the selector safely even if it contains a quote character.
    $process = proc_open(
        $bin . ' ' . escapeshellarg($selector),
        [0 => ['pipe', 'r'], 1 => ['pipe', 'w']],
        $pipes
    );
    if (!is_resource($process)) {
        return '';
    }

    // Send the whole document, then close stdin so the child sees end-of-input.
    fwrite($pipes[0], $html);
    fclose($pipes[0]);

    $out = stream_get_contents($pipes[1]);
    fclose($pipes[1]);
    proc_close($process);

    return trim(html_entity_decode($out === false ? '' : $out));
}
/**
 * Extract the publication date and author from an article's HTML table.
 *
 * Both values are located with pup selectors under `td.titoloFTR .small`.
 *
 * @param string $html HTML of the article table.
 * @return array Map with an optional 'published' entry (Unix timestamp as a string) and an
 *               optional 'author' entry (non-empty string). Empty when PUP_BIN is not set.
 */
function extract_metadata_from_html(string $html): array {
    if (getenv("PUP_BIN") === false) {
        return [];
    }
    $meta = [];

    $rawDate = pup_selector($html, 'td.titoloFTR .small i text{}');
    // The text carries an AM/PM marker. strptime() only applies %p to a 12-hour %I field,
    // so %I is tried first; the original %H form is kept as a fallback for hours
    // outside the 1-12 range.
    $formats = [
        "%A, %b. %d, %Y at %I:%M %p",
        "%A, %b. %d, %Y at %H:%M %p",
    ];
    foreach ($formats as $fmt) {
        $date = strptimestamp($rawDate, $fmt);
        if ($date !== false) {
            // Same value strftime('%s', $date) produced, without the deprecated call.
            $meta['published'] = (string) $date;
            break;
        }
    }

    // pup prints a JSON list of matched nodes; it may be empty or undecodable when nothing matches.
    $nodes = json_decode(pup_selector($html, 'td.titoloFTR .small strong json{}'));
    $author = (is_array($nodes) && isset($nodes[0]->text)) ? $nodes[0]->text : '';
    if (is_string($author) && $author !== '') {
        $meta['author'] = $author;
    }

    return $meta;
}
/**
 * Split an HTML fragment made of comment tables into one record per comment.
 *
 * The number of tables is obtained from pup's JSON output, then each position is
 * selected with `table:nth-of-type(i)` and mined for date, author, text and HTML.
 *
 * @param string $html HTML containing the comment tables.
 * @return array List of maps with keys 'text', 'html' and, when they could be extracted,
 *               'published' (Unix timestamp as a string) and 'author'. Empty when PUP_BIN
 *               is not set, when $html is blank, or when no table is found.
 */
function extract_comments_from_html(string $html): array {
    if (getenv("PUP_BIN") === false) {
        return [];
    }
    if (trim($html) === '') {
        return [];
    }

    // Decode to associative arrays; the flag belongs to json_decode(), not to pup_selector().
    $tables = json_decode(pup_selector($html, 'table json{}'), true);
    if (!is_array($tables) || count($tables) === 0) {
        return [];
    }
    $length = count($tables);

    // %p is only honoured together with the 12-hour %I field, so try that first and
    // keep the original %H form as a fallback.
    $formats = [
        "%A, %b. %d, %Y at %I:%M %p",
        "%A, %b. %d, %Y at %H:%M %p",
    ];

    $comments = [];
    for ($i = 1; $i <= $length; $i++) {
        $comment = pup_selector($html, 'table:nth-of-type(' . $i . ')');
        if ($comment === '') {
            // Nothing selected at this position (the count above may exceed the number
            // of sibling tables, e.g. with nested tables): do not emit an empty comment.
            continue;
        }

        $meta = [];
        $meta['text'] = pup_selector($comment, 'tr:nth-child(3) td.testoFTR text{}');
        $meta['html'] = pup_selector($comment, 'tr:nth-child(3) td.testoFTR');

        $rawDate = pup_selector($comment, 'table td.titoloFTR .small i text{}');
        foreach ($formats as $fmt) {
            $date = strptimestamp($rawDate, $fmt);
            if ($date !== false) {
                // Same value strftime('%s', $date) produced, without the deprecated call.
                $meta['published'] = (string) $date;
                break;
            }
        }

        // The JSON list may be empty or undecodable when the author node is missing.
        $nodes = json_decode(pup_selector($comment, 'table td.titoloFTR .small strong json{}'));
        $author = (is_array($nodes) && isset($nodes[0]->text)) ? $nodes[0]->text : '';
        if (is_string($author) && $author !== '') {
            $meta['author'] = $author;
        }

        $comments[] = $meta;
    }

    return $comments;
}
/**
 * Parse one archived PHP page, extract its metadata, body and comments, and store them.
 *
 * The page is parsed into an AST. Top-level `$GLOBALS[...] = ...` assignments become
 * metadata entries and inline HTML is concatenated into the page body (both converted
 * from latin1 to utf8). The first table of the body is treated as the article, the
 * following tables as comments. The result is handed to save().
 *
 * @param object $db       Database handle passed through to save().
 * @param string $filename Path of the page; the numeric id is taken from the file name and
 *                         a default publication date from the two enclosing directories
 *                         (.../<year>/<month>/<file>).
 * @param mixed  $parser   Object exposing parse(string): array of AST statements.
 * @return void
 */
function parse_save(object $db, string $filename, $parser) {
    // Do not store an empty record for a path that cannot be read.
    if (!is_file($filename) || !is_readable($filename)) {
        echo "Cannot read $filename\n";
        return;
    }
    $code = file_get_contents($filename);
    if ($code === false) {
        echo "Cannot read $filename\n";
        return;
    }

    try {
        $ast = $parser->parse($code);
    } catch (Error $error) { // Error is the class imported at the top of the file
        echo "Parse error on $filename: {$error->getMessage()}\n";
        return;
    }

    $nid = intval(basename($filename, '.php'));

    // Default publication date: first day of the <year>/<month> directories, when present.
    $fparts = explode('/', $filename);
    $nparts = count($fparts);
    $dt = false;
    if ($nparts >= 3) {
        $year = $fparts[$nparts - 3];
        $month = $fparts[$nparts - 2];
        $dt = strtotime("01-$month-$year");
    }
    $metadata = ['nid' => $nid, 'published' => ($dt === false ? null : $dt)];

    $body = '';
    foreach ($ast ?? [] as $part) {
        if ($part instanceof PhpParser\Node\Stmt\InlineHTML) {
            $body .= iconv('latin1', 'utf8', $part->value);
            continue;
        }
        if (!($part instanceof PhpParser\Node\Stmt\Expression)) {
            continue;
        }

        // Only `$GLOBALS[<key>] = <value>;` statements carry metadata.
        $assign = $part->expr;
        if (!($assign instanceof PhpParser\Node\Expr\Assign
            && $assign->var instanceof PhpParser\Node\Expr\ArrayDimFetch
            && $assign->var->var instanceof PhpParser\Node\Expr\Variable
            && $assign->var->var->name === 'GLOBALS')
        ) {
            continue;
        }

        // `$GLOBALS[] = ...` or a non-literal key has no usable name: skip it.
        $key = $assign->var->dim->value ?? null;
        if (!is_string($key) && !is_int($key)) {
            continue;
        }

        $rhs = $assign->expr;
        if ($rhs instanceof PhpParser\Node\Expr\Array_) {
            $val = $rhs->items;
        } elseif ($rhs instanceof PhpParser\Node\Scalar\String_) {
            $val = iconv('latin1', 'utf8', $rhs->value);
        } else {
            // Not every expression node has a literal value (e.g. calls, constants).
            $val = $rhs->value ?? null;
        }
        $metadata[$key] = $val;
    }

    $main = pup_selector($body, 'table:first-of-type');
    // Values found in the HTML (date, author) take precedence over the defaults above.
    $metadata = array_merge($metadata, extract_metadata_from_html($main));
    $metadata['body'] = pup_selector($main, '.testoFTR');
    $metadata['text'] = pup_selector($body, 'text{}');

    $others = pup_selector($body, 'table:nth-of-type(n+2)');
    $metadata['comments'] = extract_comments_from_html($others);

    save($db, $body, $metadata);
}
/**
 * Insert (or replace) one news row and its comment rows.
 *
 * Values are passed to execute() as a list, which binds them as strings — the same type
 * bindParam() uses by default — and binds null for missing entries.
 *
 * @param mixed $db       PDO-like handle exposing prepare() and errorInfo().
 * @param mixed $body     Unused; kept so existing callers keep working.
 * @param array $metadata Map read for the keys 'nid', 'page_title', 'author', 'body',
 *                        'page_display', 'published' and 'comments' (a list of maps read
 *                        for 'published', 'author' and 'html'). Missing keys are stored as NULL.
 * @return void
 */
function save($db, $body, $metadata) {
    $stm = $db->prepare('INSERT OR REPLACE INTO news(nid, title, author, body, display, published) VALUES (?,?,?,?,?,?)');
    if ($stm === false) {
        print("error during INSERT: ");
        print($db->errorInfo()[2]);
        return;
    }
    $nid = $metadata['nid'] ?? null;
    $ok = $stm->execute([
        $nid,
        $metadata['page_title'] ?? null,
        $metadata['author'] ?? null,
        $metadata['body'] ?? null,
        $metadata['page_display'] ?? null,
        $metadata['published'] ?? null,
    ]);
    if ($ok === false) {
        // Report the failure but keep going, as the original never aborted here.
        print("error during INSERT: ");
        print($stm->errorInfo()[2]);
    }

    $comments = $metadata['comments'] ?? [];
    if (!is_array($comments) || count($comments) === 0) {
        return;
    }

    // Prepare the comment statement once instead of once per comment.
    $stm = $db->prepare('INSERT OR REPLACE INTO comments(nid, num, published, author, body) VALUES (?,?,?,?,?)');
    if ($stm === false) {
        print("error during INSERT: ");
        print($db->errorInfo()[2]);
        return;
    }

    $num = 0;
    foreach ($comments as $comm) {
        // Comments are numbered from 1 in the order they were extracted.
        $num++;
        $ok = $stm->execute([
            $nid,
            $num,
            $comm['published'] ?? null,
            $comm['author'] ?? null,
            $comm['html'] ?? null,
        ]);
        if ($ok === false) {
            print("error during INSERT: ");
            print($stm->errorInfo()[2]);
        }
    }
}
// Entry point: open (or create) the SQLite database named by the first argument, make sure
// the schema exists, then parse every file whose path arrives on standard input.
if (!isset($argv[1]) || $argv[1] === '') {
    fwrite(STDERR, "usage: php {$argv[0]} <database-file> < list-of-php-files\n");
    exit(1);
}

$db = new PDO('sqlite:' . $argv[1]);
$db->exec('CREATE TABLE IF NOT EXISTS news (nid PRIMARY KEY, body TEXT, author VARCHAR, title VARCHAR, display VARCHAR, published INTEGER, last_modified INTEGER);');
$db->exec('CREATE TABLE IF NOT EXISTS comments (nid INTEGER, num INTEGER, body TEXT, author VARCHAR, title VARCHAR, display VARCHAR, published INTEGER,
    FOREIGN KEY(nid) REFERENCES news(nid),
    PRIMARY KEY(nid, num)
    );');
$db->exec('CREATE INDEX IF NOT EXISTS news_published ON news (published);');
$db->exec('CREATE INDEX IF NOT EXISTS news_author ON news (author);');

$parser = (new ParserFactory)->create(ParserFactory::PREFER_PHP5);

// Files are committed in batches of 100 to avoid one transaction per file.
$i = 0;
$db->beginTransaction();
// Compare against false explicitly so a line that is falsy as a string (e.g. "0")
// does not end the loop early.
while (($line = fgets(STDIN)) !== false) {
    // Drop the line terminator, including the "\r" of CRLF input.
    $f = rtrim($line, "\r\n");
    if ($f === '') {
        continue; // blank line: nothing to parse
    }
    parse_save($db, $f, $parser);
    $i++;
    if ($i >= 100) {
        $db->commit();
        $db->beginTransaction();
        $i = 0;
    }
}
$db->commit();
$db = null;
exit(0);
|