|
@@ -85,21 +85,14 @@ function extract_comments_from_html(string $html): array {
|
|
|
if($date !== false) {
|
|
|
$meta['published'] = strftime('%s', $date);
|
|
|
}
|
|
|
- $authors = json_decode(pup_selector($html, 'td.titoloFTR .small strong json{}'));
|
|
|
- $meta['author'] = '';
|
|
|
- if(count($authors) > 0) {
|
|
|
- $author = $authors[0]->{'text'};
|
|
|
- if($author !== '') {
|
|
|
- $meta['author'] = $author;
|
|
|
- }
|
|
|
- }
|
|
|
+ $meta['author'] = pup_selector($comment, 'td.titoloFTR .small strong text{}');
|
|
|
array_push($comments, $meta);
|
|
|
}
|
|
|
return $comments;
|
|
|
}
|
|
|
|
|
|
|
|
|
-function parse_save(object $db, string $filename, $parser) {
|
|
|
+function parse(string $filename, $parser) {
|
|
|
try {
|
|
|
$ast = $parser->parse(file_get_contents($filename));
|
|
|
} catch (Error $error) {
|
|
@@ -144,11 +137,11 @@ function parse_save(object $db, string $filename, $parser) {
|
|
|
$metadata['text'] = pup_selector($body, 'text{}');
|
|
|
$others = pup_selector($body, 'table:nth-of-type(n+2)');
|
|
|
$metadata['comments'] = extract_comments_from_html($others);
|
|
|
- save($db, $body, $metadata);
|
|
|
+ return $metadata;
|
|
|
|
|
|
|
|
|
}
|
|
|
-function save($db, $body, $metadata) {
|
|
|
+function save($db, $metadata) {
|
|
|
// print("Loading ". $metadata['nid'] . "\n");
|
|
|
$stm = $db->prepare('INSERT OR REPLACE INTO news(nid, title, author, body, display, published) VALUES (?,?,?,?,?,?)');
|
|
|
if($stm === false) {
|
|
@@ -200,9 +193,11 @@ $parser = (new ParserFactory)->create(ParserFactory::PREFER_PHP5);
|
|
|
$i = 0;
|
|
|
$db->beginTransaction();
|
|
|
while($f = fgets(STDIN)) {
|
|
|
- echo $f;
|
|
|
+ fwrite(STDERR, $f);
|
|
|
$f = str_replace("\n", '', $f);
|
|
|
- parse_save($db, $f, $parser);
|
|
|
+ $parsed = parse($f, $parser);
|
|
|
+ //echo json_encode($parsed);
|
|
|
+ save($db, $parsed);
|
|
|
$i++;
|
|
|
if($i >= 100) {
|
|
|
$db->commit();
|