['pipe', 'r'], 1=>['pipe', 'w']], $pipes);
    fwrite($pipes[0], $html);
    fclose($pipes[0]);
    $out = stream_get_contents($pipes[1]);
    fclose($pipes[1]);
    return trim(html_entity_decode($out));
}

/**
 * Extract the publication date and author from an article's header markup.
 *
 * @param string $html HTML fragment handed to pup_selector().
 *
 * @return array Empty when the PUP_BIN environment variable is not set.
 *               Otherwise 'author' is always present (possibly ''), and
 *               'published' is present only when the header date parsed.
 */
function extract_metadata_from_html(string $html): array
{
    if (getenv("PUP_BIN") === false) {
        return [];
    }

    $meta = [];

    // NOTE(review): strptimestamp() is defined outside this excerpt. It is
    // treated here as returning a Unix timestamp or false, which is how the
    // previous strftime('%s', ...) call used it -- confirm.
    $date = strptimestamp(
        pup_selector($html, 'td.titoloFTR .small i text{}'),
        "%A, %b. %d, %Y at %H:%M %p"
    );
    if ($date !== false) {
        // strftime() is deprecated since PHP 8.1 and '%s' is not available
        // on every platform; casting the timestamp yields the same string.
        $meta['published'] = (string) (int) $date;
    }

    $authors = json_decode(pup_selector($html, 'td.titoloFTR .small strong json{}'));

    // json_decode() yields null on empty or invalid input, and the first
    // entry may lack a "text" property; fall back to '' in both cases
    // instead of calling count() on null or reading a missing property.
    $meta['author'] = '';
    if (is_array($authors)) {
        $meta['author'] = $authors[0]->text ?? '';
    }

    return $meta;
}

/**
 * Extract one record per <table> found in the given markup.
 *
 * @param string $html HTML fragment handed to pup_selector().
 *
 * @return array List of arrays with the keys 'text', 'html', 'title' and
 *               'author', plus 'published' when the date could be parsed.
 *               Empty when PUP_BIN is unset, when $html is blank, or when
 *               no table is reported.
 */
function extract_comments_from_html(string $html): array
{
    if (getenv("PUP_BIN") === false) {
        return [];
    }
    if (trim($html) === '') {
        return [];
    }

    // The "decode as array" flag belongs to json_decode(); it used to be
    // passed to pup_selector() instead (misplaced parenthesis).
    // NOTE(review): assumes pup_selector() takes no third parameter -- its
    // signature is outside this excerpt, confirm.
    $tables = json_decode(pup_selector($html, 'table json{}'), true);
    if (!is_array($tables) || count($tables) === 0) {
        return [];
    }
    $length = count($tables);

    $comments = [];
    // The nth-of-type selector is 1-based.
    for ($i = 1; $i <= $length; $i++) {
        $comment = pup_selector($html, 'table:nth-of-type(' . $i . ')');

        $date = strptimestamp(
            pup_selector($comment, 'table td.titoloFTR .small i text{}'),
            "%A, %b. %d, %Y at %H:%M %p"
        );

        $meta = [
            'text'  => pup_selector($comment, 'tr:nth-child(3) td.testoFTR text{}'),
            'html'  => pup_selector($comment, 'tr:nth-child(3) td.testoFTR'),
            'title' => pup_selector($comment, 'tr:first-child td.titoloFTR:nth-child(2) text{}'),
        ];
        if ($date !== false) {
            // Same replacement for the deprecated strftime('%s', ...) as in
            // extract_metadata_from_html().
            $meta['published'] = (string) (int) $date;
        }
        $meta['author'] = pup_selector($comment, 'td.titoloFTR .small strong text{}');

        $comments[] = $meta;
    }

    return $comments;
}

/**
 * Parse one legacy PHP page into a metadata array.
 *
 * Top-level `$GLOBALS[<key>] = <value>;` assignments are collected under
 * <key>, inline HTML is concatenated into the page body, and the body is
 * then split into the main article (first table) and the remaining tables.
 *
 * @param string $filename Path of the page. The numeric basename becomes
 *                         'nid'; the two directories above the file are
 *                         read as year and month.
 * @param mixed  $parser   Object exposing parse(string): iterable of nodes
 *                         (created by ParserFactory in the calling script).
 *
 * @return array|null Metadata array, or null when the file cannot be read
 *                    or parsed (a message is echoed in that case).
 */
function parse(string $filename, $parser)
{
    $source = file_get_contents($filename);
    if ($source === false) {
        echo "Cannot read $filename\n";
        return null;
    }

    try {
        $ast = $parser->parse($source);
    } catch (Error $error) {
        echo "Parse error on $filename: {$error->getMessage()}\n";
        return null;
    }

    $nid = intval(basename($filename, '.php'));

    // Expected layout: .../<year>/<month>/<nid>.php. Shorter paths yield
    // empty parts instead of undefined-offset warnings.
    $fparts = explode('/', $filename);
    $depth = count($fparts);
    $year = $fparts[$depth - 3] ?? '';
    $month = $fparts[$depth - 2] ?? '';
    $dt = strtotime("01-$month-$year");

    // strtotime() returns false for an unparsable date; store null rather
    // than a boolean as the fallback publication date.
    $metadata = ['nid' => $nid, 'published' => $dt === false ? null : $dt];
    $body = '';

    foreach ($ast ?? [] as $part) {
        if ($part instanceof PhpParser\Node\Stmt\Expression) {
            $assign = $part->expr;
            if (
                $assign instanceof PhpParser\Node\Expr\Assign
                && $assign->var instanceof PhpParser\Node\Expr\ArrayDimFetch
                && $assign->var->var instanceof PhpParser\Node\Expr\Variable
                && $assign->var->var->name === 'GLOBALS'
                // Skip `$GLOBALS[] = ...` and non-literal keys: they have no
                // usable ->value to serve as the metadata key.
                && isset($assign->var->dim->value)
            ) {
                $value = $assign->expr;
                if ($value instanceof PhpParser\Node\Expr\Array_) {
                    $val = $value->items;
                } elseif ($value instanceof PhpParser\Node\Scalar\String_) {
                    $val = iconv('latin1', 'utf8', $value->value);
                } else {
                    // Nodes without a ->value property are recorded as null.
                    $val = $value->value ?? null;
                }
                $metadata[$assign->var->dim->value] = $val;
            }
        } elseif ($part instanceof PhpParser\Node\Stmt\InlineHTML) {
            $body .= iconv('latin1', 'utf8', $part->value);
        }
    }

    // The first table is the article itself; every later table is handed
    // to the comment extractor.
    $main = pup_selector($body, 'table:first-of-type');
    $metadata = array_merge($metadata, extract_metadata_from_html($main));
    $metadata['body'] = pup_selector($main, '.testoFTR');
    $metadata['text'] = pup_selector($body, 'text{}');

    $others = pup_selector($body, 'table:nth-of-type(n+2)');
    $metadata['comments'] = extract_comments_from_html($others);

    return $metadata;
}

/**
 * Insert or replace one article and its comments.
 *
 * @param PDO   $db       Open connection holding the news/comments tables.
 * @param array $metadata Array as returned by parse(); missing keys are
 *                        stored as NULL.
 *
 * @return void
 */
function save($db, $metadata)
{
    $stm = $db->prepare('INSERT OR REPLACE INTO news(nid, title, author, body, display, published) VALUES (?,?,?,?,?,?)');
    if ($stm === false) {
        print("error during INSERT: ");
        print($db->errorInfo()[2]);
        return;
    }

    $inserted = $stm->execute([
        $metadata['nid'] ?? null,
        $metadata['page_title'] ?? null,
        $metadata['author'] ?? null,
        $metadata['body'] ?? null,
        $metadata['page_display'] ?? null,
        $metadata['published'] ?? null,
    ]);
    if ($inserted === false) {
        // Do not attach comments to an article row that was not written.
        print("error during INSERT: " . $stm->errorInfo()[2] . "\n");
        return;
    }

    $comments = $metadata['comments'] ?? [];
    if (count($comments) === 0) {
        return;
    }

    // Prepared once and reused for every comment of this article.
    $stm = $db->prepare('INSERT OR REPLACE INTO comments(nid, num, published, author, title, body) VALUES (?,?,?,?,?,?)');
    if ($stm === false) {
        print("error during INSERT: ");
        print($db->errorInfo()[2]);
        return;
    }

    $num = 0;
    foreach ($comments as $comm) {
        $num++; // comments are numbered from 1 within their article
        $inserted = $stm->execute([
            $metadata['nid'] ?? null,
            $num,
            $comm['published'] ?? null,
            $comm['author'] ?? null,
            $comm['title'] ?? null,
            $comm['html'] ?? null,
        ]);
        if ($inserted === false) {
            print("error during INSERT: " . $stm->errorInfo()[2] . "\n");
        }
    }
}

// ---------------------------------------------------------------------------
// Script entry point: argv[1] is the SQLite database path, and the files to
// import are read from standard input, one path per line.
// ---------------------------------------------------------------------------

if (!isset($argv[1]) || $argv[1] === '') {
    // Without this check 'sqlite:' alone would be opened and the imported
    // data would not end up in the intended database file.
    fwrite(STDERR, "usage: pass the SQLite database path as first argument; file names are read from stdin\n");
    exit(1);
}

$db = new PDO('sqlite:' . $argv[1]);
$db->exec('CREATE TABLE IF NOT EXISTS news (nid PRIMARY KEY, body TEXT, author VARCHAR, title VARCHAR, display VARCHAR, published INTEGER, last_modified INTEGER);');
$db->exec('CREATE TABLE IF NOT EXISTS comments (nid INTEGER, num INTEGER, body TEXT, author VARCHAR, title VARCHAR, display VARCHAR, published INTEGER, FOREIGN KEY(nid) REFERENCES news(nid), PRIMARY KEY(nid, num) );');
$db->exec('CREATE INDEX IF NOT EXISTS news_published ON news (published);');
$db->exec('CREATE INDEX IF NOT EXISTS news_author ON news (author);');

$parser = (new ParserFactory)->create(ParserFactory::PREFER_PHP5);

$i = 0;
$db->beginTransaction();
// Compare against false explicitly so a final line such as "0" without a
// trailing newline does not end the loop early.
while (($f = fgets(STDIN)) !== false) {
    fwrite(STDERR, $f);

    // Strip the line terminator, including the "\r" of CRLF input.
    $f = rtrim($f, "\r\n");
    if ($f === '') {
        continue;
    }

    $parsed = parse($f, $parser);
    if ($parsed === null) {
        // parse() already reported the problem; never hand null to save(),
        // which would insert a row made of NULLs.
        continue;
    }

    save($db, $parsed);

    // Commit in batches of 100 saved articles.
    $i++;
    if ($i >= 100) {
        $db->commit();
        $db->beginTransaction();
        $i = 0;
    }
}
$db->commit();
$db = null;
exit(0);