parse_comment.php 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. <?php
  2. /*
  3. * Per lanciare questo script, dagli in input tutti i file php che vuoi parsare
  4. * esempio echo 11102_comment.php | php parse_comment.php a.db
  5. * oppure
  6. * find italy/website/ -name '*_comment.php' | php parse_comment.php a.db
  7. */
  8. require 'vendor/autoload.php';
  9. use PhpParser\Error;
  10. use PhpParser\NodeDumper;
  11. use PhpParser\ParserFactory;
  12. // $f = '/mnt/i/disco_indy/var/www/sf-active/italy/website/news/2001/07/11102.php';
  13. function strptimestamp($date, $fmt) {
  14. $timeArray = strptime($date, $fmt);
  15. if($timeArray === false) { return false; }
  16. return mktime(
  17. $timeArray['tm_hour'], $timeArray['tm_min'], $timeArray['tm_sec'],
  18. $timeArray['tm_mon']+1, $timeArray['tm_mday'], $timeArray['tm_year']+1900
  19. );
  20. }
  21. function pup_selector(string $html , string $selector): string {
  22. if(getenv("PUP_BIN") === false) {
  23. return [];
  24. }
  25. $process = proc_open(getenv("PUP_BIN") . " '" . $selector . "'",
  26. [0=>['pipe', 'r'], 1=>['pipe', 'w']],
  27. $pipes);
  28. fwrite($pipes[0], $html);
  29. fclose($pipes[0]);
  30. $out = stream_get_contents($pipes[1]);
  31. fclose($pipes[1]);
  32. return trim(html_entity_decode($out));
  33. }
  34. function extract_metadata_from_html(string $html): array {
  35. if(getenv("PUP_BIN") === false) {
  36. return [];
  37. }
  38. $meta = [];
  39. $date = strptimestamp(
  40. pup_selector($html, 'td.titoloFTR .small i text{}'),
  41. "%A, %b. %d, %Y at %H:%M %p"
  42. );
  43. if($date !== false) {
  44. $meta['published'] = strftime('%s', $date);
  45. }
  46. $authors = json_decode(pup_selector($html, 'td.titoloFTR .small strong json{}'));
  47. $meta['author'] = '';
  48. if(count($authors) > 0) {
  49. $author = $authors[0]->{'text'};
  50. if($author !== '') {
  51. $meta['author'] = $author;
  52. }
  53. }
  54. return $meta;
  55. }
  56. function extract_comments_from_html(string $html): array {
  57. if(getenv("PUP_BIN") === false) {
  58. return [];
  59. }
  60. if(trim($html) === '') { return []; }
  61. $length = count(json_decode(pup_selector($html, 'table json{}', TRUE)));
  62. if($length === 0) {
  63. return [];
  64. }
  65. $comments = [];
  66. for($i=1; $i <= $length; $i++) {
  67. $comment = pup_selector($html, 'table:nth-of-type(' . $i . ')');
  68. $meta = [];
  69. $date = strptimestamp(
  70. pup_selector($comment, 'table td.titoloFTR .small i text{}'),
  71. "%A, %b. %d, %Y at %H:%M %p"
  72. );
  73. $meta['text'] = pup_selector($comment, 'tr:nth-child(3) td.testoFTR text{}');
  74. $meta['html'] = pup_selector($comment, 'tr:nth-child(3) td.testoFTR');
  75. $meta['title'] = pup_selector($comment, 'tr:first-child td.titoloFTR:nth-child(2) text{}');
  76. if($date !== false) {
  77. $meta['published'] = strftime('%s', $date);
  78. }
  79. $meta['author'] = pup_selector($comment, 'td.titoloFTR .small strong text{}');
  80. array_push($comments, $meta);
  81. }
  82. return $comments;
  83. }
  84. function parse(string $filename, $parser) {
  85. try {
  86. $ast = $parser->parse(file_get_contents($filename));
  87. } catch (Error $error) {
  88. echo "Parse error on $filename: {$error->getMessage()}\n";
  89. return;
  90. }
  91. $dumper = new NodeDumper;
  92. // do magic things now
  93. $nid = intval(basename($filename, '.php'));
  94. $fparts = explode('/', $filename);
  95. $year = $fparts[count($fparts)-3];
  96. $month = $fparts[count($fparts)-2];
  97. $dt = strtotime("01-$month-$year");
  98. $metadata = ['nid' => $nid, 'published' => $dt];
  99. $body = '';
  100. foreach($ast as $part)
  101. {
  102. if($part instanceof PhpParser\Node\Stmt\Expression) {
  103. if($part->expr instanceof PhpParser\Node\Expr\Assign
  104. && $part->expr->var instanceof PhpParser\Node\Expr\ArrayDimFetch
  105. && $part->expr->var->var instanceof PhpParser\Node\Expr\Variable
  106. && $part->expr->var->var->name === 'GLOBALS'
  107. ) {
  108. if($part->expr->expr instanceof PhpParser\Node\Expr\Array_) {
  109. $val = $part->expr->expr->items;
  110. } elseif($part->expr->expr instanceof PhpParser\Node\Scalar\String_ ){
  111. $val = iconv('latin1', 'utf8', $part->expr->expr->value);
  112. } else {
  113. $val = $part->expr->expr->value;
  114. }
  115. $metadata[$part->expr->var->dim->value] = $val;
  116. }
  117. }elseif($part instanceof PhpParser\Node\Stmt\InlineHTML ) {
  118. $body .= iconv('latin1', 'utf8', $part->value);
  119. }
  120. }
  121. $main = pup_selector($body, 'table:first-of-type');
  122. $metadata = array_merge($metadata, extract_metadata_from_html($main));
  123. $metadata['body'] = pup_selector($main, '.testoFTR');
  124. $metadata['text'] = pup_selector($body, 'text{}');
  125. $others = pup_selector($body, 'table:nth-of-type(n+2)');
  126. $metadata['comments'] = extract_comments_from_html($others);
  127. return $metadata;
  128. }
  129. function save($db, $metadata) {
  130. // print("Loading ". $metadata['nid'] . "\n");
  131. $stm = $db->prepare('INSERT OR REPLACE INTO news(nid, title, author, body, display, published) VALUES (?,?,?,?,?,?)');
  132. if($stm === false) {
  133. print("error during INSERT: ");
  134. print($db->errorInfo()[2]);
  135. return;
  136. }
  137. $stm->bindParam(1, $metadata['nid']);
  138. $stm->bindParam(2, $metadata['page_title']);
  139. $stm->bindParam(3, $metadata['author']);
  140. $stm->bindParam(4, $metadata['body']);
  141. $stm->bindParam(5, $metadata['page_display']);
  142. $stm->bindParam(6, $metadata['published']);
  143. $stm->execute();
  144. $i=0;
  145. foreach($metadata['comments'] as $comm)
  146. {
  147. $i++;
  148. $stm = $db->prepare('INSERT OR REPLACE INTO comments(nid, num, published, author, title, body) VALUES (?,?,?,?,?,?)');
  149. if($stm === false) {
  150. print("error during INSERT: ");
  151. print($db->errorInfo()[2]);
  152. return;
  153. }
  154. $stm->bindParam(1, $metadata['nid']);
  155. $stm->bindParam(2, $i);
  156. $stm->bindParam(3, $comm['published']);
  157. $stm->bindParam(4, $comm['author']);
  158. $stm->bindParam(5, $comm['title']);
  159. $stm->bindParam(6, $comm['html']);
  160. $stm->execute();
  161. }
  162. }
  163. $db = new PDO('sqlite:' . $argv[1]);
  164. $db->exec('CREATE TABLE IF NOT EXISTS news (nid PRIMARY KEY, body TEXT, author VARCHAR, title VARCHAR, display VARCHAR, published INTEGER, last_modified INTEGER);');
  165. $db->exec('CREATE TABLE IF NOT EXISTS comments (nid INTEGER, num INTEGER, body TEXT, author VARCHAR, title VARCHAR, display VARCHAR, published INTEGER,
  166. FOREIGN KEY(nid) REFERENCES news(nid),
  167. PRIMARY KEY(nid, num)
  168. );');
  169. $db->exec('CREATE INDEX IF NOT EXISTS news_published ON news (published);');
  170. $db->exec('CREATE INDEX IF NOT EXISTS news_author ON news (author);');
  171. $parser = (new ParserFactory)->create(ParserFactory::PREFER_PHP5);
  172. $i = 0;
  173. $db->beginTransaction();
  174. while($f = fgets(STDIN)) {
  175. fwrite(STDERR, $f);
  176. $f = str_replace("\n", '', $f);
  177. $parsed = parse($f, $parser);
  178. //echo json_encode($parsed);
  179. save($db, $parsed);
  180. $i++;
  181. if($i >= 100) {
  182. $db->commit();
  183. $db->beginTransaction();
  184. $i = 0;
  185. }
  186. }
  187. $db->commit();
  188. $db = null;
  189. exit(0);