parse_one.php 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138
  1. <?php
  2. require 'vendor/autoload.php';
  3. use PhpParser\Error;
  4. use PhpParser\NodeDumper;
  5. use PhpParser\ParserFactory;
  6. // $f = '/mnt/i/disco_indy/var/www/sf-active/italy/website/news/2001/07/11102.php';
  /**
   * Turn a date string into a Unix timestamp using a strptime()-style format.
   *
   * @param mixed $date Date text handed to strptime().
   * @param mixed $fmt  strptime() format describing $date.
   * @return int|false  Result of mktime() on the parsed fields, or false when
   *                    strptime() cannot parse $date.
   */
  function strptimestamp($date, $fmt) {
      $parsed = strptime($date, $fmt);
      if ($parsed !== false) {
          // strptime() reports the month as 0-11 and the year as an offset from 1900.
          $month = $parsed['tm_mon'] + 1;
          $year = $parsed['tm_year'] + 1900;

          return mktime(
              $parsed['tm_hour'],
              $parsed['tm_min'],
              $parsed['tm_sec'],
              $month,
              $parsed['tm_mday'],
              $year
          );
      }

      return false;
  }
  /**
   * Pipe $html through the external program named by the PUP_BIN environment
   * variable (presumably the `pup` HTML query tool - confirm) and return what
   * it prints.
   *
   * The command line is PUP_BIN with $selector appended verbatim, so callers
   * pass the selector with a leading space.
   *
   * @param string $html     Markup written to the child's stdin.
   * @param string $selector Arguments appended to the PUP_BIN command line.
   * @return string Entity-decoded, trimmed stdout of the child process; an
   *                empty string when PUP_BIN is unset or the process cannot
   *                be started.
   */
  function pup_selector(string $html , string $selector): string {
      $pupBin = getenv("PUP_BIN");
      if ($pupBin === false) {
          // The declared return type is string: returning [] here raised a TypeError.
          return '';
      }

      $pipes = [];
      $process = proc_open(
          $pupBin . $selector,
          [0 => ['pipe', 'r'], 1 => ['pipe', 'w']],
          $pipes
      );
      if (!is_resource($process)) {
          // Could not spawn the tool; behave as if nothing matched.
          return '';
      }

      fwrite($pipes[0], $html);
      // Closing stdin signals end-of-input to the child.
      fclose($pipes[0]);
      $out = stream_get_contents($pipes[1]);
      fclose($pipes[1]);
      // Release the process handle once both pipes are closed.
      proc_close($process);

      return trim(html_entity_decode($out === false ? '' : $out));
  }
  /**
   * Pull the publication date and author out of a rendered article page.
   *
   * Both values are located with pup_selector() using selectors under
   * `table td.titoloFTR .small`. Nothing is extracted when the PUP_BIN
   * environment variable is not set.
   *
   * @param string $html Article markup.
   * @return array May contain 'published' (int Unix timestamp) and 'author'
   *               (non-empty string); a key is omitted when its value could
   *               not be extracted.
   */
  function extract_metadata_from_html(string $html): array {
      if (getenv("PUP_BIN") === false) {
          return [];
      }

      $meta = [];

      // NOTE(review): the format pairs %H (24-hour clock) with %p (AM/PM);
      // confirm against real pages whether %I was intended.
      $date = strptimestamp(
          pup_selector($html, ' table td.titoloFTR .small i text{}'),
          "%A, %b. %d, %Y at %H:%M %p"
      );
      if ($date !== false) {
          // $date already is a Unix timestamp; formatting it back through the
          // deprecated strftime('%s') only turned it into a string.
          $meta['published'] = $date;
      }

      $author = pup_selector($html, ' table td.titoloFTR .small strong text{}');
      if ($author !== '') {
          $meta['author'] = $author;
      }

      return $meta;
  }
  /**
   * Parse one PHP article file and store its content through save().
   *
   * Top-level `$GLOBALS[...] = ...` assignments become metadata entries keyed
   * by the array index; inline HTML chunks are concatenated into the body.
   * String values and HTML are converted from latin1 to utf8. The file name
   * (without `.php`) gives the `nid`, and the two parent directory names are
   * read as month and year for a fallback `published` timestamp. Metadata
   * returned by extract_metadata_from_html() overrides these defaults.
   *
   * @param object $db       Database handle, passed through to save().
   * @param string $filename Path of the file to parse.
   * @param mixed  $parser   Object whose parse() method returns the statement list.
   * @return void Problems reading or parsing the file are reported on stdout
   *              and the file is skipped.
   */
  function parse_save(object $db, string $filename, $parser) {
      $source = file_get_contents($filename);
      if ($source === false) {
          echo "Cannot read $filename\n";
          return;
      }

      try {
          $ast = $parser->parse($source);
      } catch (Error $error) {
          echo "Parse error on $filename: {$error->getMessage()}\n";
          return;
      }

      $nid = intval(basename($filename, '.php'));
      $fparts = explode('/', $filename);
      $depth = count($fparts);
      // Expected layout is .../<year>/<month>/<nid>.php; shorter paths yield
      // empty components instead of undefined-offset warnings.
      $year = $fparts[$depth - 3] ?? '';
      $month = $fparts[$depth - 2] ?? '';
      $dt = strtotime("01-$month-$year");

      $metadata = ['nid' => $nid, 'published' => $dt];
      $body = '';

      foreach ($ast ?? [] as $part) {
          if ($part instanceof PhpParser\Node\Stmt\InlineHTML) {
              $body .= iconv('latin1', 'utf8', $part->value);
              continue;
          }
          if (!($part instanceof PhpParser\Node\Stmt\Expression)) {
              continue;
          }

          $assign = $part->expr;
          $isGlobalsAssignment = $assign instanceof PhpParser\Node\Expr\Assign
              && $assign->var instanceof PhpParser\Node\Expr\ArrayDimFetch
              && $assign->var->var instanceof PhpParser\Node\Expr\Variable
              && $assign->var->var->name === 'GLOBALS';
          if (!$isGlobalsAssignment) {
              continue;
          }

          $dim = $assign->var->dim;
          if (!isset($dim->value)) {
              // `$GLOBALS[] = ...` or a computed index: there is no literal key to store under.
              continue;
          }

          $rhs = $assign->expr;
          if ($rhs instanceof PhpParser\Node\Expr\Array_) {
              $val = $rhs->items;
          } elseif ($rhs instanceof PhpParser\Node\Scalar\String_) {
              $val = iconv('latin1', 'utf8', $rhs->value);
          } else {
              // Other node kinds may not carry a literal value at all.
              $val = $rhs->value ?? null;
          }
          $metadata[$dim->value] = $val;
      }

      $metadata = array_merge($metadata, extract_metadata_from_html($body));
      save($db, $body, $metadata);
  }
  /**
   * Insert or replace one row of the `news` table.
   *
   * @param PDO    $db       Open database connection.
   * @param string $body     Article body.
   * @param array  $metadata Values read from the keys 'nid', 'page_title',
   *                         'author', 'page_display' and 'published'; a
   *                         missing key is stored as NULL.
   * @return void Failures to prepare or execute the statement are printed to
   *              stdout (when the connection does not throw on errors).
   */
  function save($db, $body, $metadata) {
      $stm = $db->prepare('INSERT OR REPLACE INTO news(nid, title, author, body, display, published) VALUES (?,?,?,?,?,?)');
      if ($stm === false) {
          print("error during INSERT: ");
          print($db->errorInfo()[2]);
          print("\n");
          return;
      }

      // Positional values in the column order of the statement above.
      $values = [
          $metadata['nid'] ?? null,
          $metadata['page_title'] ?? null,
          $metadata['author'] ?? null,
          $body,
          $metadata['page_display'] ?? null,
          $metadata['published'] ?? null,
      ];

      if ($stm->execute($values) === false) {
          // Previously the result was ignored, so failed inserts went unnoticed.
          print("error during INSERT: ");
          print($stm->errorInfo()[2]);
          print("\n");
      }
  }
  // Entry point: the first CLI argument is the SQLite database path; the files
  // to import are read from stdin, one path per line.
  if (!isset($argv[1]) || $argv[1] === '') {
      // Without a path, 'sqlite:' alone would be opened instead of a real database file.
      fwrite(STDERR, 'usage: php ' . basename($argv[0] ?? 'script') . " <sqlite-db> < file-list\n");
      exit(1);
  }

  $db = new PDO('sqlite:' . $argv[1]);
  $db->exec('CREATE TABLE IF NOT EXISTS news (nid PRIMARY KEY, body TEXT, author VARCHAR, title VARCHAR, display VARCHAR, published INTEGER, last_modified INTEGER);');
  $db->exec('CREATE INDEX IF NOT EXISTS news_published ON news (published);');
  $db->exec('CREATE INDEX IF NOT EXISTS news_author ON news (author);');

  $parser = (new ParserFactory)->create(ParserFactory::PREFER_PHP5);

  // Rows are committed in batches of 100 files per transaction.
  $i = 0;
  $db->beginTransaction();
  // Compare against false explicitly so a final line of "0" is not mistaken for end-of-input.
  while (($f = fgets(STDIN)) !== false) {
      $f = rtrim($f, "\r\n");
      if ($f === '') {
          // Blank lines do not name a file.
          continue;
      }
      parse_save($db, $f, $parser);
      $i++;
      if ($i >= 100) {
          $db->commit();
          $db->beginTransaction();
          $i = 0;
      }
  }
  // Flush the last, possibly partial, batch.
  $db->commit();
  $db = null;
  exit(0);