parse_one.php 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. <?php
  /*
   * To run this script, feed it on standard input the paths of all the PHP
   * files you want to parse, one per line; its only argument is the SQLite
   * database file to write to.
   * Example: echo 11102.php | php parse_one.php a.db
   * or
   * find italy/website/ -name '*.php' | php parse_one.php a.db
   */
  8. require 'vendor/autoload.php';
  9. use PhpParser\Error;
  10. use PhpParser\NodeDumper;
  11. use PhpParser\ParserFactory;
  12. // $f = '/mnt/i/disco_indy/var/www/sf-active/italy/website/news/2001/07/11102.php';
  /**
   * Parse a date string with strptime() and turn the result into a Unix timestamp.
   *
   * @param string $date Date text to parse.
   * @param string $fmt  strptime()-style format describing $date.
   *
   * @return int|false Timestamp computed by mktime() from the parsed fields,
   *                   or false when strptime() cannot parse $date.
   */
  function strptimestamp($date, $fmt) {
      $parsed = strptime($date, $fmt);
      if ($parsed !== false) {
          // strptime() reports months as 0-11 and years as an offset from 1900.
          $month = $parsed['tm_mon'] + 1;
          $year = $parsed['tm_year'] + 1900;

          return mktime(
              $parsed['tm_hour'],
              $parsed['tm_min'],
              $parsed['tm_sec'],
              $month,
              $parsed['tm_mday'],
              $year
          );
      }

      return false;
  }
  /**
   * Pipe an HTML document through the external command named by PUP_BIN and
   * return what it prints.
   *
   * The command line is the PUP_BIN environment variable with $selector
   * appended verbatim, so the caller supplies the separating space. The string
   * is handed to proc_open() unescaped; only pass selectors that come from
   * trusted code.
   *
   * @param string $html     Document written to the command's standard input.
   * @param string $selector Arguments appended to the PUP_BIN command.
   *
   * @return string Trimmed, entity-decoded standard output of the command, or
   *                '' when PUP_BIN is unset or the process cannot be started.
   */
  function pup_selector(string $html , string $selector): string {
      $pupBin = getenv("PUP_BIN");
      if ($pupBin === false) {
          // The declared return type is string: returning [] here raised a TypeError.
          return '';
      }

      $process = proc_open(
          $pupBin . $selector,
          [0 => ['pipe', 'r'], 1 => ['pipe', 'w']],
          $pipes
      );
      if (!is_resource($process)) {
          return '';
      }

      fwrite($pipes[0], $html);
      // Closing stdin signals end of input to the child.
      fclose($pipes[0]);
      $out = stream_get_contents($pipes[1]);
      fclose($pipes[1]);
      // Reap the child explicitly instead of leaving the handle open.
      proc_close($process);

      return trim(html_entity_decode($out === false ? '' : $out));
  }
  /**
   * Extract the publication date and author from an article's HTML.
   *
   * Both values are obtained by running pup_selector() on cells matching
   * `table td.titoloFTR .small`; nothing is extracted when the PUP_BIN
   * environment variable is unset.
   *
   * @param string $html HTML of the article page.
   *
   * @return array<string, string> May contain 'published' (Unix timestamp as a
   *                               decimal string) and 'author'. Keys are only
   *                               present when a value was found.
   */
  function extract_metadata_from_html(string $html): array {
      if (getenv("PUP_BIN") === false) {
          return [];
      }

      $meta = [];

      // NOTE(review): this format pairs %H (24-hour clock) with %p (AM/PM). If the
      // pages print a 12-hour clock, %I is probably what was meant — confirm
      // against real data before changing it.
      $date = strptimestamp(
          pup_selector($html, ' table td.titoloFTR .small i text{}'),
          "%A, %b. %d, %Y at %H:%M %p"
      );
      if ($date !== false) {
          // Same value strftime('%s', $date) produced, without depending on
          // strftime() (deprecated since PHP 8.1) or the non-portable %s conversion.
          $meta['published'] = (string) $date;
      }

      $author = pup_selector($html, ' table td.titoloFTR .small strong text{}');
      if ($author !== '') {
          $meta['author'] = $author;
      }

      return $meta;
  }
  /**
   * Parse one archived PHP page and store it through save().
   *
   * - The node id is the integer value of the file's basename without `.php`.
   * - The fallback publication date is the first day of the month/year named by
   *   the two directories above the file (.../<year>/<month>/<nid>.php); it is
   *   null when the path is too short or the date cannot be parsed.
   * - Top-level `$GLOBALS['key'] = value;` statements become metadata entries.
   * - Inline HTML chunks are concatenated into the body.
   * - String values and HTML are converted from Latin-1 to UTF-8.
   * - Metadata scraped from the body by extract_metadata_from_html() overrides
   *   the values collected here.
   *
   * Unreadable files and parse errors are reported on standard output and the
   * file is skipped.
   *
   * @param object $db       Database handle, passed through to save().
   * @param string $filename Path of the PHP file to import.
   * @param mixed  $parser   Object whose parse() method turns PHP source into a
   *                         list of AST nodes (presumably a PhpParser\Parser).
   */
  function parse_save(object $db, string $filename, $parser) {
      // Check the path first so a bad file is reported instead of handing
      // false to the parser.
      $source = is_file($filename) && is_readable($filename)
          ? file_get_contents($filename)
          : false;
      if ($source === false) {
          echo "Cannot read $filename\n";
          return;
      }

      try {
          $ast = $parser->parse($source);
      } catch (Error $error) {
          echo "Parse error on $filename: {$error->getMessage()}\n";
          return;
      }

      $nid = intval(basename($filename, '.php'));

      // Expect .../<year>/<month>/<nid>.php; shorter paths carry no date.
      $fparts = explode('/', $filename);
      $depth = count($fparts);
      $dt = $depth >= 3
          ? strtotime("01-{$fparts[$depth - 2]}-{$fparts[$depth - 3]}")
          : false;
      $metadata = ['nid' => $nid, 'published' => $dt === false ? null : $dt];

      $body = '';
      foreach ($ast ?? [] as $part) {
          if ($part instanceof PhpParser\Node\Stmt\InlineHTML) {
              $body .= iconv('latin1', 'utf8', $part->value);
              continue;
          }
          if (!($part instanceof PhpParser\Node\Stmt\Expression)) {
              continue;
          }

          // Only statements of the form `$GLOBALS[<key>] = <value>;` are of interest.
          $assign = $part->expr;
          if (!($assign instanceof PhpParser\Node\Expr\Assign
              && $assign->var instanceof PhpParser\Node\Expr\ArrayDimFetch
              && $assign->var->var instanceof PhpParser\Node\Expr\Variable
              && $assign->var->var->name === 'GLOBALS')
          ) {
              continue;
          }

          // `$GLOBALS[] = ...` or a non-literal key has no usable name: skip it.
          $key = $assign->var->dim->value ?? null;
          if (!is_string($key) && !is_int($key)) {
              continue;
          }

          $valueNode = $assign->expr;
          if ($valueNode instanceof PhpParser\Node\Expr\Array_) {
              $val = $valueNode->items;
          } elseif ($valueNode instanceof PhpParser\Node\Scalar\String_) {
              $val = iconv('latin1', 'utf8', $valueNode->value);
          } else {
              // Expressions without a ->value property are recorded as null.
              $val = $valueNode->value ?? null;
          }
          $metadata[$key] = $val;
      }

      $metadata = array_merge($metadata, extract_metadata_from_html($body));
      save($db, $body, $metadata);
  }
  /**
   * Insert one article into the `news` table, replacing any existing row that
   * conflicts with it.
   *
   * Failures of prepare() or execute() that are signalled by a false return
   * value are reported on standard output; the function then returns without
   * raising.
   *
   * @param PDO    $db       Connection exposing prepare()/errorInfo(); the
   *                         script passes a SQLite PDO handle.
   * @param string $body     Article body.
   * @param array  $metadata Reads the keys 'nid', 'page_title', 'author',
   *                         'page_display' and 'published'; missing keys are
   *                         stored as NULL.
   */
  function save($db, $body, $metadata) {
      $stm = $db->prepare('INSERT OR REPLACE INTO news(nid, title, author, body, display, published) VALUES (?,?,?,?,?,?)');
      if ($stm === false) {
          print("error during INSERT: " . $db->errorInfo()[2] . "\n");
          return;
      }

      // Positional values in the column order of the statement above.
      $row = [
          $metadata['nid'] ?? null,
          $metadata['page_title'] ?? null,
          $metadata['author'] ?? null,
          $body,
          $metadata['page_display'] ?? null,
          $metadata['published'] ?? null,
      ];

      // A failed insert used to go unnoticed; report it like a failed prepare.
      if ($stm->execute($row) === false) {
          print("error during INSERT: " . $stm->errorInfo()[2] . "\n");
      }
  }
  // Entry point. The first CLI argument is the SQLite database file; the paths
  // of the PHP files to import are read from standard input, one per line.
  if (!isset($argv[1]) || $argv[1] === '') {
      fwrite(STDERR, "Usage: php parse_one.php <database.db> < list-of-php-files\n");
      exit(1);
  }

  $db = new PDO('sqlite:' . $argv[1]);
  $db->exec('CREATE TABLE IF NOT EXISTS news (nid PRIMARY KEY, body TEXT, author VARCHAR, title VARCHAR, display VARCHAR, published INTEGER, last_modified INTEGER);');
  $db->exec('CREATE INDEX IF NOT EXISTS news_published ON news (published);');
  $db->exec('CREATE INDEX IF NOT EXISTS news_author ON news (author);');

  $parser = (new ParserFactory)->create(ParserFactory::PREFER_PHP5);

  // Inserts are grouped into transactions of 100 files each.
  $i = 0;
  $db->beginTransaction();
  // Compare against false explicitly: a final line such as "0" is falsy but valid.
  while (($f = fgets(STDIN)) !== false) {
      // Strip the line terminator, including the "\r" of CRLF input.
      $f = rtrim($f, "\r\n");
      if ($f === '') {
          continue;
      }
      parse_save($db, $f, $parser);
      $i++;
      if ($i >= 100) {
          $db->commit();
          $db->beginTransaction();
          $i = 0;
      }
  }
  $db->commit();
  $db = null;
  exit(0);