parse_comment.php 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206
  1. <?php
  2. /*
  3. * Per lanciare questo script, dagli in input tutti i file php che vuoi parsare
  4. * esempio echo 11102_comment.php | php parse_comment.php a.db
  5. * oppure
  6. * find italy/website/ -name '*_comment.php' | php parse_comment.php a.db
  7. */
  8. require 'vendor/autoload.php';
  9. use PhpParser\Error;
  10. use PhpParser\NodeDumper;
  11. use PhpParser\ParserFactory;
  12. // $f = '/mnt/i/disco_indy/var/www/sf-active/italy/website/news/2001/07/11102.php';
  /**
   * Parse a date string with strptime() and convert it to a Unix timestamp.
   *
   * @param string $date Text to parse.
   * @param string $fmt  strptime()-style format describing $date.
   *
   * @return int|false Timestamp built by mktime() from the parsed fields,
   *                   or false when strptime() cannot parse $date.
   */
  function strptimestamp($date, $fmt) {
      $fields = strptime($date, $fmt);
      if ($fields === false) {
          return false;
      }

      // strptime() reports months as 0-11 and years as an offset from 1900.
      $month = $fields['tm_mon'] + 1;
      $year = $fields['tm_year'] + 1900;

      return mktime(
          $fields['tm_hour'],
          $fields['tm_min'],
          $fields['tm_sec'],
          $month,
          $fields['tm_mday'],
          $year
      );
  }
  /**
   * Pipe an HTML fragment through the external command named by the PUP_BIN
   * environment variable and return what it prints.
   *
   * The selector is passed as the single command-line argument, $html is
   * written to the command's stdin and its stdout is collected.
   *
   * @param string $html     HTML written to the command's standard input.
   * @param string $selector Selector expression handed to the command
   *                         (callers in this file append `text{}` / `json{}`).
   *
   * @return string Trimmed, HTML-entity-decoded output; '' when PUP_BIN is
   *                not set or the process cannot be started.
   */
  function pup_selector(string $html , string $selector): string {
      $bin = getenv("PUP_BIN");
      if ($bin === false) {
          // The declared return type is string, so returning [] here raised a TypeError.
          return '';
      }

      // escapeshellarg() keeps the selector a single argument even if it
      // contains quotes or other shell metacharacters.
      $process = proc_open(
          $bin . ' ' . escapeshellarg($selector),
          [0 => ['pipe', 'r'], 1 => ['pipe', 'w']],
          $pipes
      );
      if (!is_resource($process)) {
          return '';
      }

      fwrite($pipes[0], $html);
      fclose($pipes[0]);
      $out = stream_get_contents($pipes[1]);
      fclose($pipes[1]);
      // Reap the child so repeated calls do not accumulate process handles.
      proc_close($process);

      return trim(html_entity_decode($out === false ? '' : $out));
  }
  /**
   * Extract the publication date and author from an article's HTML table.
   *
   * Both values are located with pup_selector() under `td.titoloFTR .small`:
   * the date in an <i> element, the author in a <strong> element.
   *
   * @param string $html HTML fragment to inspect.
   *
   * @return array May contain 'published' (Unix timestamp as a string) and
   *               'author' (string). Empty when PUP_BIN is not set.
   */
  function extract_metadata_from_html(string $html): array {
      if (getenv("PUP_BIN") === false) {
          return [];
      }

      $meta = [];

      // NOTE(review): the format mixes %H (24-hour) with %p (AM/PM) - confirm
      // against the real page markup whether %I was intended.
      $date = strptimestamp(
          pup_selector($html, 'td.titoloFTR .small i text{}'),
          "%A, %b. %d, %Y at %H:%M %p"
      );
      if ($date !== false) {
          // Same value strftime('%s', $date) produced, without relying on the
          // deprecated, platform-specific strftime() conversion.
          $meta['published'] = (string) $date;
      }

      // The selector may match nothing or the output may not be valid JSON;
      // in both cases there is no author and none must be recorded (previously
      // a null author was stored because null !== '').
      $decoded = json_decode(pup_selector($html, 'td.titoloFTR .small strong json{}'));
      $author = (is_array($decoded) && isset($decoded[0]->text)) ? $decoded[0]->text : '';
      if (is_string($author) && $author !== '') {
          $meta['author'] = $author;
      }

      return $meta;
  }
  /**
   * Split an HTML fragment into one record per <table> and extract the
   * comment fields from each of them.
   *
   * @param string $html HTML containing zero or more comment tables.
   *
   * @return array List of arrays with the keys 'text' and 'html' (content of
   *               `tr:nth-child(3) td.testoFTR`), plus 'published' (Unix
   *               timestamp as a string) and 'author' when they can be
   *               extracted. Empty when PUP_BIN is not set, $html is blank or
   *               no table is found.
   */
  function extract_comments_from_html(string $html): array {
      if (getenv("PUP_BIN") === false) {
          return [];
      }
      if (trim($html) === '') {
          return [];
      }

      // The "true" flag belongs to json_decode(), not to pup_selector(); a
      // non-array result (empty or invalid JSON) means there is nothing to count.
      $tables = json_decode(pup_selector($html, 'table json{}'), true);
      if (!is_array($tables) || count($tables) === 0) {
          return [];
      }
      $length = count($tables);

      $comments = [];
      // :nth-of-type() is 1-based.
      for ($i = 1; $i <= $length; $i++) {
          $comment = pup_selector($html, 'table:nth-of-type(' . $i . ')');
          $meta = [];

          // NOTE(review): the format mixes %H (24-hour) with %p (AM/PM) -
          // confirm against the real page markup whether %I was intended.
          $date = strptimestamp(
              pup_selector($comment, 'table td.titoloFTR .small i text{}'),
              "%A, %b. %d, %Y at %H:%M %p"
          );

          $meta['text'] = pup_selector($comment, 'tr:nth-child(3) td.testoFTR text{}');
          $meta['html'] = pup_selector($comment, 'tr:nth-child(3) td.testoFTR');

          if ($date !== false) {
              // Same value strftime('%s', $date) produced, without the
              // deprecated, platform-specific conversion.
              $meta['published'] = (string) $date;
          }

          // No match or invalid JSON means no author; do not store null.
          $decoded = json_decode(pup_selector($comment, 'table td.titoloFTR .small strong json{}'));
          $author = (is_array($decoded) && isset($decoded[0]->text)) ? $decoded[0]->text : '';
          if (is_string($author) && $author !== '') {
              $meta['author'] = $author;
          }

          $comments[] = $meta;
      }

      return $comments;
  }
  /**
   * Parse one PHP file, collect its metadata and HTML body, and store the
   * result through save().
   *
   * Metadata comes from top-level `$GLOBALS[...] = ...` assignments found in
   * the parsed file; the body is the concatenation of its inline HTML parts,
   * converted from latin1 to utf8. The numeric id is taken from the file name
   * and a default publication date from the two directories above it
   * (".../<year>/<month>/<file>").
   *
   * @param object $db       Database handle passed on to save().
   * @param string $filename Path of the PHP file to process.
   * @param mixed  $parser   Object whose parse() method turns PHP source into
   *                         a list of PhpParser statement nodes.
   *
   * @return void Problems are reported on standard output and the file is
   *              skipped.
   */
  function parse_save(object $db, string $filename, $parser) {
      // Check readability first: a failed read used to be handed to the
      // parser as `false` and the file was stored as an empty article.
      if (!is_file($filename) || !is_readable($filename)) {
          echo "Cannot read $filename\n";
          return;
      }
      $source = file_get_contents($filename);
      if ($source === false) {
          echo "Cannot read $filename\n";
          return;
      }

      try {
          $ast = $parser->parse($source);
      } catch (Error $error) {
          echo "Parse error on $filename: {$error->getMessage()}\n";
          return;
      }

      $nid = intval(basename($filename, '.php'));
      $fparts = explode('/', $filename);
      $depth = count($fparts);
      // A bare file name has no year/month directories; fall back to empty
      // strings (strtotime() then yields false) instead of reading
      // undefined offsets.
      $year = $fparts[$depth - 3] ?? '';
      $month = $fparts[$depth - 2] ?? '';
      $dt = strtotime("01-$month-$year");

      $metadata = ['nid' => $nid, 'published' => $dt];
      $body = '';

      foreach ($ast ?? [] as $part) {
          if ($part instanceof PhpParser\Node\Stmt\InlineHTML) {
              $body .= iconv('latin1', 'utf8', $part->value);
              continue;
          }
          if (!($part instanceof PhpParser\Node\Stmt\Expression)) {
              continue;
          }

          // Only statements of the form $GLOBALS[<key>] = <value>; matter.
          $assign = $part->expr;
          if (!($assign instanceof PhpParser\Node\Expr\Assign
              && $assign->var instanceof PhpParser\Node\Expr\ArrayDimFetch
              && $assign->var->var instanceof PhpParser\Node\Expr\Variable
              && $assign->var->var->name === 'GLOBALS')
          ) {
              continue;
          }

          // Skip keys that are not literals ($GLOBALS[] = ..., $GLOBALS[$k] = ...).
          $key = $assign->var->dim->value ?? null;
          if ($key === null) {
              continue;
          }

          $value = $assign->expr;
          if ($value instanceof PhpParser\Node\Expr\Array_) {
              $val = $value->items;
          } elseif ($value instanceof PhpParser\Node\Scalar\String_) {
              $val = iconv('latin1', 'utf8', $value->value);
          } else {
              // Nodes without a "value" property yield null.
              $val = $value->value ?? null;
          }
          $metadata[$key] = $val;
      }

      // The first table is the article itself, the following ones are comments.
      $main = pup_selector($body, 'table:first-of-type');
      $metadata = array_merge($metadata, extract_metadata_from_html($main));
      $metadata['body'] = pup_selector($main, '.testoFTR');
      $metadata['text'] = pup_selector($body, 'text{}');
      $others = pup_selector($body, 'table:nth-of-type(n+2)');
      $metadata['comments'] = extract_comments_from_html($others);

      save($db, $body, $metadata);
  }
  /**
   * Store one article and its comments.
   *
   * The article is written to the `news` table from the metadata keys nid,
   * page_title, author, body, page_display and published. Every entry of
   * $metadata['comments'] is written to the `comments` table, numbered from 1
   * in iteration order, using its published, author and html keys. Missing
   * keys are stored as NULL. Existing rows are replaced (INSERT OR REPLACE).
   *
   * @param mixed $db       Database handle offering prepare() and errorInfo().
   * @param mixed $body     Unused by this function.
   * @param array $metadata Article fields plus an optional 'comments' list.
   *
   * @return void Failures are reported on standard output.
   */
  function save($db, $body, $metadata) {
      $stm = $db->prepare('INSERT OR REPLACE INTO news(nid, title, author, body, display, published) VALUES (?,?,?,?,?,?)');
      if ($stm === false) {
          print("error during INSERT: ");
          print($db->errorInfo()[2]);
          return;
      }
      // Values passed to execute() are bound as strings, exactly like the
      // default type of bindParam().
      $ok = $stm->execute([
          $metadata['nid'] ?? null,
          $metadata['page_title'] ?? null,
          $metadata['author'] ?? null,
          $metadata['body'] ?? null,
          $metadata['page_display'] ?? null,
          $metadata['published'] ?? null,
      ]);
      if ($ok === false) {
          print("error during INSERT: ");
          print($stm->errorInfo()[2]);
      }

      $comments = $metadata['comments'] ?? [];
      if (!is_array($comments) || count($comments) === 0) {
          return;
      }

      // Prepare once and reuse the statement for every comment instead of
      // re-preparing the same SQL on each iteration.
      $stm = $db->prepare('INSERT OR REPLACE INTO comments(nid, num, published, author, body) VALUES (?,?,?,?,?)');
      if ($stm === false) {
          print("error during INSERT: ");
          print($db->errorInfo()[2]);
          return;
      }

      $num = 0;
      foreach ($comments as $comm) {
          $num++;
          $ok = $stm->execute([
              $metadata['nid'] ?? null,
              $num,
              $comm['published'] ?? null,
              $comm['author'] ?? null,
              $comm['html'] ?? null,
          ]);
          if ($ok === false) {
              print("error during INSERT: ");
              print($stm->errorInfo()[2]);
          }
      }
  }
  // Entry point: open (or create) the SQLite database named by the first
  // command-line argument, then process every file path read from standard
  // input, one per line.
  if (!isset($argv[1]) || $argv[1] === '') {
      fwrite(STDERR, "Usage: find <dir> -name '*_comment.php' | php parse_comment.php <database.db>\n");
      exit(1);
  }

  $db = new PDO('sqlite:' . $argv[1]);
  $db->exec('CREATE TABLE IF NOT EXISTS news (nid PRIMARY KEY, body TEXT, author VARCHAR, title VARCHAR, display VARCHAR, published INTEGER, last_modified INTEGER);');
  $db->exec('CREATE TABLE IF NOT EXISTS comments (nid INTEGER, num INTEGER, body TEXT, author VARCHAR, title VARCHAR, display VARCHAR, published INTEGER,
      FOREIGN KEY(nid) REFERENCES news(nid),
      PRIMARY KEY(nid, num)
      );');
  $db->exec('CREATE INDEX IF NOT EXISTS news_published ON news (published);');
  $db->exec('CREATE INDEX IF NOT EXISTS news_author ON news (author);');

  $parser = (new ParserFactory)->create(ParserFactory::PREFER_PHP5);

  // Commit in batches of 100 files instead of one transaction per file.
  $i = 0;
  $db->beginTransaction();
  while (($f = fgets(STDIN)) !== false) {
      // Strip the line terminator (LF or CRLF) and ignore blank lines.
      $f = rtrim($f, "\r\n");
      if ($f === '') {
          continue;
      }
      parse_save($db, $f, $parser);
      $i++;
      if ($i >= 100) {
          $db->commit();
          $db->beginTransaction();
          $i = 0;
      }
  }
  $db->commit();
  $db = null;
  exit(0);