LWNprevBridge.php 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. <?php
  2. class LWNprevBridge extends BridgeAbstract{
  3. const MAINTAINER = 'Pierre Mazière';
  4. const NAME = 'LWN Free Weekly Edition';
  5. const URI = 'https://lwn.net/';
  6. const CACHE_TIMEOUT = 604800; // 1 week
  7. const DESCRIPTION = 'LWN Free Weekly Edition available one week late';
  8. function getURI(){
  9. return self::URI . 'free/bigpage';
  10. }
  11. private function jumpToNextTag(&$node){
  12. while($node && $node->nodeType === XML_TEXT_NODE) {
  13. $nextNode = $node->nextSibling;
  14. if(!$nextNode) {
  15. break;
  16. }
  17. $node = $nextNode;
  18. }
  19. }
  20. private function jumpToPreviousTag(&$node){
  21. while($node && $node->nodeType === XML_TEXT_NODE) {
  22. $previousNode = $node->previousSibling;
  23. if(!$previousNode) {
  24. break;
  25. }
  26. $node = $previousNode;
  27. }
  28. }
  29. public function collectData(){
  30. // Because the LWN page is written in loose HTML and not XHTML,
  31. // Simple HTML Dom is not accurate enough for the job
  32. $content = getContents($this->getURI())
  33. or returnServerError('No results for LWNprev');
  34. libxml_use_internal_errors(true);
  35. $html = new DOMDocument();
  36. $html->loadHTML($content);
  37. libxml_clear_errors();
  38. $cat1 = '';
  39. $cat2 = '';
  40. foreach($html->getElementsByTagName('a') as $a) {
  41. if($a->textContent === 'Multi-page format') {
  42. break;
  43. }
  44. }
  45. $realURI = self::URI . $a->getAttribute('href');
  46. $URICounter = 0;
  47. $edition = $html->getElementsByTagName('h1')->item(0)->textContent;
  48. $editionTimeStamp = strtotime(
  49. substr($edition, strpos($edition, 'for ') + strlen('for '))
  50. );
  51. foreach($html->getElementsByTagName('h2') as $h2) {
  52. if($h2->getAttribute('class') !== 'SummaryHL') {
  53. continue;
  54. }
  55. $item = array();
  56. $h2NextSibling = $h2->nextSibling;
  57. $this->jumpToNextTag($h2NextSibling);
  58. switch($h2NextSibling->getAttribute('class')) {
  59. case 'FeatureByline':
  60. $item['author'] = $h2NextSibling->getElementsByTagName('b')->item(0)->textContent;
  61. break;
  62. case 'GAByline':
  63. $text = $h2NextSibling->textContent;
  64. $item['author'] = substr($text, strpos($text, 'by '));
  65. break;
  66. default:
  67. $item['author'] = 'LWN';
  68. break;
  69. };
  70. $h2FirstChild = $h2->firstChild;
  71. $this->jumpToNextTag($h2FirstChild);
  72. if($h2FirstChild->nodeName === 'a') {
  73. $item['uri'] = self::URI . $h2FirstChild->getAttribute('href');
  74. } else{
  75. $item['uri'] = $realURI . '#' . $URICounter;
  76. }
  77. $URICounter++;
  78. $item['timestamp'] = $editionTimeStamp + $URICounter;
  79. $h2PrevSibling = $h2->previousSibling;
  80. $this->jumpToPreviousTag($h2PrevSibling);
  81. switch($h2PrevSibling->getAttribute('class')) {
  82. case 'Cat2HL':
  83. $cat2 = $h2PrevSibling->textContent;
  84. $h2PrevSibling = $h2PrevSibling->previousSibling;
  85. $this->jumpToPreviousTag($h2PrevSibling);
  86. if($h2PrevSibling->getAttribute('class') !== 'Cat1HL') {
  87. break;
  88. }
  89. $cat1 = $h2PrevSibling->textContent;
  90. break;
  91. case 'Cat1HL':
  92. $cat1 = $h2PrevSibling->textContent;
  93. $cat2 = '';
  94. break;
  95. default:
  96. break;
  97. }
  98. $h2PrevSibling = null;
  99. $item['title'] = '';
  100. if(!empty($cat1)) {
  101. $item['title'] .= '[' . $cat1 . ($cat2 ? '/' . $cat2 : '') . '] ';
  102. }
  103. $item['title'] .= $h2->textContent;
  104. $node = $h2;
  105. $content = '';
  106. $contentEnd = false;
  107. while(!$contentEnd) {
  108. $node = $node->nextSibling;
  109. if(!$node || (
  110. $node->nodeType !== XML_TEXT_NODE && (
  111. $node->nodeName === 'h2' || (
  112. !is_null($node->attributes) &&
  113. !is_null($class = $node->attributes->getNamedItem('class')) &&
  114. in_array($class->nodeValue, array('Cat1HL', 'Cat2HL'))
  115. )
  116. )
  117. )
  118. ) {
  119. $contentEnd = true;
  120. } else{
  121. $content .= $node->C14N();
  122. }
  123. }
  124. $item['content'] = $content;
  125. $this->items[] = $item;
  126. }
  127. }
  128. }