Browse Source

parse comments, too

boyska 3 years ago
parent
commit
1cad2bd291
2 changed files with 213 additions and 0 deletions
  1. 206 0
      parse_comment.php
  2. 7 0
      parse_one.php

+ 206 - 0
parse_comment.php

@@ -0,0 +1,206 @@
+<?php
+
+/*
+ * Per lanciare questo script, dagli in input tutti i file php che vuoi parsare
+ * esempio echo 11102_comment.php | php parse_comment.php a.db
+ * oppure
+ * find italy/website/ -name '*_comment.php' | php parse_comment.php a.db
+ */
+
+require 'vendor/autoload.php';
+
+use PhpParser\Error;
+use PhpParser\NodeDumper;
+use PhpParser\ParserFactory;
+
+// $f = '/mnt/i/disco_indy/var/www/sf-active/italy/website/news/2001/07/11102.php';
+
+/**
+ * Parse a date string with strptime() and convert it to a Unix timestamp.
+ *
+ * @param string $date Date text to parse.
+ * @param string $fmt  strptime()-style format describing $date.
+ * @return int|false   Timestamp built by mktime() (so it is interpreted in
+ *                     PHP's default timezone), or false when strptime()
+ *                     cannot parse the input.
+ */
+function strptimestamp($date, $fmt) {
+    $parsed = strptime($date, $fmt);
+    if($parsed !== false) {
+        // strptime() reports months as 0-11 and years as an offset from 1900,
+        // while mktime() expects 1-12 and a full year.
+        $month = $parsed['tm_mon'] + 1;
+        $year = $parsed['tm_year'] + 1900;
+        return mktime(
+            $parsed['tm_hour'],
+            $parsed['tm_min'],
+            $parsed['tm_sec'],
+            $month,
+            $parsed['tm_mday'],
+            $year
+        );
+    }
+    return false;
+}
+
+/**
+ * Run the external command named by the PUP_BIN environment variable on a
+ * piece of HTML and return what it prints.
+ *
+ * The selector is passed as a single shell argument, the HTML is written to
+ * the command's standard input, and the command's standard output is
+ * entity-decoded and trimmed before being returned.
+ *
+ * @param string $html     HTML fed to the command on stdin.
+ * @param string $selector Selector expression handed to the command.
+ * @return string Decoded, trimmed output; an empty string when PUP_BIN is not
+ *                set or the command cannot be started.
+ */
+function pup_selector(string $html , string $selector): string {
+    $bin = getenv("PUP_BIN");
+    if($bin === false) {
+        // The declared return type is string, so "nothing found" must be ''.
+        return '';
+    }
+    // escapeshellarg() keeps selectors containing quotes or other shell
+    // metacharacters from breaking (or injecting into) the command line.
+    $process = proc_open(
+        $bin . ' ' . escapeshellarg($selector),
+        [0 => ['pipe', 'r'], 1 => ['pipe', 'w']],
+        $pipes
+    );
+    if(!is_resource($process)) {
+        return '';
+    }
+    fwrite($pipes[0], $html);
+    // Closing stdin signals end of input to the child.
+    fclose($pipes[0]);
+    $out = stream_get_contents($pipes[1]);
+    fclose($pipes[1]);
+    // Reap the child so a long batch run does not accumulate process handles.
+    proc_close($process);
+    if($out === false) {
+        return '';
+    }
+    return trim(html_entity_decode($out));
+}
+
+/**
+ * Extract the publication date and author from an article's HTML fragment.
+ *
+ * Both values are looked up under `td.titoloFTR .small` via pup_selector():
+ * the date from the `i` element's text, the author from the `strong`
+ * element's JSON representation.
+ *
+ * @param string $html HTML fragment to inspect.
+ * @return array Map that may contain 'published' (Unix timestamp as a string)
+ *               and 'author'; keys are omitted when the value is not found.
+ *               Empty when PUP_BIN is not set.
+ */
+function extract_metadata_from_html(string $html): array {
+    if(getenv("PUP_BIN") === false) {
+        return [];
+    }
+    $meta = [];
+    // NOTE(review): "%H" (24-hour) combined with "%p" (AM/PM) looks suspicious;
+    // if the pages use 12-hour times this should probably be "%I" - confirm
+    // against real input before changing.
+    $date = strptimestamp(
+        pup_selector($html, 'td.titoloFTR .small i text{}'),
+        "%A, %b. %d, %Y at %H:%M %p"
+    );
+
+    if($date !== false) {
+        // Plain cast instead of strftime('%s', ...): strftime() is deprecated
+        // and '%s' is a non-portable libc extension.
+        $meta['published'] = (string) $date;
+    }
+
+    // The selector may match nothing, or the matched node may carry no text;
+    // in both cases there is simply no author to record.
+    $nodes = json_decode(pup_selector($html, 'td.titoloFTR .small strong json{}'));
+    $author = '';
+    if(is_array($nodes) && isset($nodes[0]->text)) {
+        $author = $nodes[0]->text;
+    }
+    if($author !== '') {
+        $meta['author'] = $author;
+    }
+    return $meta;
+}
+
+/**
+ * Extract one record per `table` element found in the given HTML.
+ *
+ * The number of tables is taken from pup_selector()'s JSON output; each table
+ * is then selected by position and mined for date, author and body.
+ *
+ * @param string $html HTML containing the comment tables.
+ * @return array List of maps with 'text' and 'html' (content of
+ *               `tr:nth-child(3) td.testoFTR`), plus 'published' (Unix
+ *               timestamp as a string) and 'author' when they can be found.
+ *               Empty when PUP_BIN is not set, the input is blank, or no
+ *               table is found.
+ */
+function extract_comments_from_html(string $html): array {
+    if(getenv("PUP_BIN") === false) {
+        return [];
+    }
+    if(trim($html) === '') { return []; }
+
+    // Decode as arrays; a non-array result (e.g. null on unparsable output)
+    // means there is nothing to iterate over.
+    $tables = json_decode(pup_selector($html, 'table json{}'), true);
+    if(!is_array($tables) || count($tables) === 0) {
+        return [];
+    }
+    $length = count($tables);
+
+    $comments = [];
+    // :nth-of-type() is 1-based.
+    for($i=1; $i <= $length; $i++) {
+        $comment = pup_selector($html, 'table:nth-of-type(' . $i . ')');
+        $meta = [];
+        // NOTE(review): "%H" (24-hour) combined with "%p" (AM/PM) looks
+        // suspicious; if the pages use 12-hour times this should probably be
+        // "%I" - confirm against real input before changing.
+        $date = strptimestamp(
+            pup_selector($comment, 'table td.titoloFTR .small i text{}'),
+            "%A, %b. %d, %Y at %H:%M %p"
+        );
+        $meta['text'] = pup_selector($comment, 'tr:nth-child(3) td.testoFTR text{}');
+        $meta['html'] = pup_selector($comment, 'tr:nth-child(3) td.testoFTR');
+
+        if($date !== false) {
+            // Plain cast instead of strftime('%s', ...): strftime() is
+            // deprecated and '%s' is a non-portable libc extension.
+            $meta['published'] = (string) $date;
+        }
+
+        // The selector may match nothing, or the node may carry no text.
+        $nodes = json_decode(pup_selector($comment, 'table td.titoloFTR .small strong json{}'));
+        $author = '';
+        if(is_array($nodes) && isset($nodes[0]->text)) {
+            $author = $nodes[0]->text;
+        }
+        if($author !== '') {
+            $meta['author'] = $author;
+        }
+        $comments[] = $meta;
+    }
+    return $comments;
+}
+
+
+/**
+ * Parse one PHP page, collect its metadata, body and comments, and store
+ * them through save().
+ *
+ * Metadata comes from three places:
+ *  - the file name (numeric prefix -> 'nid') and the two directories above it
+ *    (year/month -> a default 'published' timestamp for the 1st of that month);
+ *  - top-level `$GLOBALS[...] = ...;` assignments found in the parsed file;
+ *  - the inline HTML of the file, queried with pup_selector().
+ *
+ * @param object $db       Database handle forwarded to save().
+ * @param string $filename Path of the PHP file to parse.
+ * @param mixed  $parser   Parser exposing parse(string); the caller passes
+ *                         one built by PhpParser\ParserFactory.
+ * @return void
+ */
+function parse_save(object $db, string $filename, $parser) {
+    $source = file_get_contents($filename);
+    if($source === false) {
+        // Without this check an unreadable file would be parsed as an empty
+        // document and stored as an empty row.
+        echo "Cannot read $filename\n";
+        return;
+    }
+
+    try {
+        $ast = $parser->parse($source);
+    } catch (Error $error) {
+        echo "Parse error on $filename: {$error->getMessage()}\n";
+        return;
+    }
+    if($ast === null) {
+        // Defensive: treat "no statements" as an empty statement list.
+        $ast = [];
+    }
+
+    // intval() keeps only the leading digits, so "11102_comment" gives 11102.
+    $nid = intval(basename($filename, '.php'));
+    // Expected layout: .../<year>/<month>/<file>.php
+    $fparts = explode('/', $filename);
+    $depth = count($fparts);
+    $year = $fparts[$depth-3] ?? '';
+    $month = $fparts[$depth-2] ?? '';
+    $dt = strtotime("01-$month-$year");
+    // An unparsable path date is stored as NULL rather than as false/''.
+    $metadata = ['nid' => $nid, 'published' => ($dt === false ? null : $dt)];
+    $body = '';
+    foreach($ast as $part)
+    {
+        if($part instanceof PhpParser\Node\Stmt\Expression) {
+            if($part->expr instanceof PhpParser\Node\Expr\Assign
+                && $part->expr->var instanceof PhpParser\Node\Expr\ArrayDimFetch
+                && $part->expr->var->var instanceof PhpParser\Node\Expr\Variable
+                && $part->expr->var->var->name === 'GLOBALS'
+            ) {
+                $dim = $part->expr->var->dim;
+                if($dim === null || !isset($dim->value)) {
+                    // `$GLOBALS[] = ...` or a computed key: no usable name.
+                    continue;
+                }
+                if($part->expr->expr instanceof PhpParser\Node\Expr\Array_) {
+                    $val = $part->expr->expr->items;
+                } elseif($part->expr->expr instanceof PhpParser\Node\Scalar\String_ ){
+                    // String literals are converted from latin1 to UTF-8.
+                    $val = iconv('latin1', 'utf8', $part->expr->expr->value);
+                } else {
+                    // Not every expression node carries a literal value.
+                    $val = $part->expr->expr->value ?? null;
+                }
+                $metadata[$dim->value] = $val;
+            }
+        }elseif($part instanceof PhpParser\Node\Stmt\InlineHTML ) {
+            $body .= iconv('latin1', 'utf8', $part->value);
+        }
+    }
+
+    // The first table is treated as the article itself, later tables as comments.
+    $main = pup_selector($body, 'table:first-of-type');
+    $metadata = array_merge($metadata, extract_metadata_from_html($main));
+    $metadata['body'] = pup_selector($main, '.testoFTR');
+    $metadata['text'] = pup_selector($body, 'text{}');
+    $others = pup_selector($body, 'table:nth-of-type(n+2)');
+    $metadata['comments'] = extract_comments_from_html($others);
+    save($db, $body, $metadata);
+}
+/**
+ * Insert (or replace) one news row and its comment rows.
+ *
+ * @param object $db       PDO-like handle offering prepare() and errorInfo().
+ * @param mixed  $body     Unused; kept so existing callers keep working.
+ * @param array  $metadata Values to store. Keys read: 'nid', 'page_title',
+ *                         'author', 'body', 'page_display', 'published' and
+ *                         'comments' (list of maps with 'published', 'author'
+ *                         and 'html'). Missing keys are stored as NULL.
+ * @return void
+ */
+function save($db, $body, $metadata) {
+    // print("Loading ". $metadata['nid'] . "\n");
+    $stm = $db->prepare('INSERT OR REPLACE INTO news(nid, title, author, body, display, published) VALUES (?,?,?,?,?,?)');
+    if($stm === false) {
+        print("error during INSERT:   ");
+        print($db->errorInfo()[2]);
+        return;
+    }
+    $ok = $stm->execute([
+        $metadata['nid'] ?? null,
+        $metadata['page_title'] ?? null,
+        $metadata['author'] ?? null,
+        $metadata['body'] ?? null,
+        $metadata['page_display'] ?? null,
+        $metadata['published'] ?? null,
+    ]);
+    if($ok === false) {
+        // Report failed writes instead of silently dropping the row.
+        print("error during INSERT:   ");
+        print($stm->errorInfo()[2]);
+        return;
+    }
+
+    $comments = $metadata['comments'] ?? [];
+    if(count($comments) === 0) {
+        return;
+    }
+
+    // Prepared once and reused for every comment.
+    $stm = $db->prepare('INSERT OR REPLACE INTO comments(nid, num, published, author, body) VALUES (?,?,?,?,?)');
+    if($stm === false) {
+        print("error during INSERT:   ");
+        print($db->errorInfo()[2]);
+        return;
+    }
+    // Comments are numbered from 1 in the order they were extracted.
+    $i=0;
+    foreach($comments as $comm)
+    {
+        $i++;
+        $ok = $stm->execute([
+            $metadata['nid'] ?? null,
+            $i,
+            $comm['published'] ?? null,
+            $comm['author'] ?? null,
+            $comm['html'] ?? null,
+        ]);
+        if($ok === false) {
+            print("error during INSERT:   ");
+            print($stm->errorInfo()[2]);
+        }
+    }
+}
+
+
+
+// Entry point: the first argument is the SQLite database file; the list of
+// PHP files to parse is read from standard input, one path per line.
+if(!isset($argv[1]) || $argv[1] === '') {
+    fwrite(STDERR, "usage: php parse_comment.php <database file> < list of files\n");
+    exit(1);
+}
+
+$db = new PDO('sqlite:' . $argv[1]);
+$db->exec('CREATE TABLE IF NOT EXISTS news (nid PRIMARY KEY,  body TEXT, author VARCHAR, title VARCHAR, display VARCHAR, published INTEGER, last_modified INTEGER);');
+$db->exec('CREATE TABLE IF NOT EXISTS comments (nid INTEGER, num INTEGER, body TEXT, author VARCHAR, title VARCHAR, display VARCHAR, published INTEGER,
+    FOREIGN KEY(nid) REFERENCES news(nid),
+    PRIMARY KEY(nid, num)
+);');
+$db->exec('CREATE INDEX IF NOT EXISTS news_published ON news (published);');
+$db->exec('CREATE INDEX IF NOT EXISTS news_author ON news (author);');
+
+
+$parser = (new ParserFactory)->create(ParserFactory::PREFER_PHP5);
+$i = 0;
+$db->beginTransaction();
+// Compare against false explicitly so a line such as "0" does not end the loop.
+while(($f = fgets(STDIN)) !== false) {
+    // Strip the line terminator, including "\r" from CRLF input.
+    $f = rtrim($f, "\r\n");
+    if($f === '') {
+        // Blank lines carry no file name to parse.
+        continue;
+    }
+    parse_save($db, $f, $parser);
+    $i++;
+    // Commit in batches of 100 files.
+    if($i >= 100) {
+        $db->commit();
+        $db->beginTransaction();
+        $i = 0;
+    }
+}
+$db->commit();
+$db = null;
+exit(0);

+ 7 - 0
parse_one.php

@@ -1,5 +1,12 @@
 <?php
 
+/*
+ * Per lanciare questo script, dagli in input tutti i file php che vuoi parsare
+ * esempio echo 11102.php | php parse_one.php a.db
+ * oppure
+ * find italy/website/ -name '*.php' | php parse_one.php a.db
+ */
+
 require 'vendor/autoload.php';
 
 use PhpParser\Error;