<?php
  2. class LWNprevBridge extends BridgeAbstract{
  3. const MAINTAINER = 'Pierre Mazière';
  4. const NAME = 'LWN Free Weekly Edition';
  5. const URI = 'https://lwn.net/';
  6. const CACHE_TIMEOUT = 604800; // 1 week
  7. const DESCRIPTION = 'LWN Free Weekly Edition available one week late';
  8. function getURI(){
  9. return self::URI.'free/bigpage';
  10. }
  11. private function jumpToNextTag(&$node){
  12. while($node && $node->nodeType===XML_TEXT_NODE){
  13. $nextNode=$node->nextSibling;
  14. if(!$nextNode){
  15. break;
  16. }
  17. $node=$nextNode;
  18. }
  19. }
  20. private function jumpToPreviousTag(&$node){
  21. while($node && $node->nodeType===XML_TEXT_NODE){
  22. $previousNode=$node->previousSibling;
  23. if(!$previousNode){
  24. break;
  25. }
  26. $node=$previousNode;
  27. }
  28. }
  29. public function collectData(){
  30. // Because the LWN page is written in loose HTML and not XHTML,
  31. // Simple HTML Dom is not accurate enough for the job
  32. $content=getContents($this->getURI())
  33. or returnServerError('No results for LWNprev');
  34. libxml_use_internal_errors(true);
  35. $html=new DOMDocument();
  36. $html->loadHTML($content);
  37. libxml_clear_errors();
  38. $cat1='';
  39. $cat2='';
  40. foreach($html->getElementsByTagName('a') as $a){
  41. if($a->textContent==='Multi-page format'){
  42. break;
  43. }
  44. }
  45. $realURI=self::URI.$a->getAttribute('href');
  46. $URICounter=0;
  47. $edition=$html->getElementsByTagName('h1')->item(0)->textContent;
  48. $editionTimeStamp=strtotime(
  49. substr($edition,strpos($edition,'for ')+strlen('for '))
  50. );
  51. foreach($html->getElementsByTagName('h2') as $h2){
  52. if($h2->getAttribute('class')!=='SummaryHL'){
  53. continue;
  54. }
  55. $item = array();
  56. $h2NextSibling=$h2->nextSibling;
  57. $this->jumpToNextTag($h2NextSibling);
  58. switch($h2NextSibling->getAttribute('class')){
  59. case 'FeatureByline':
  60. $item['author']=$h2NextSibling->getElementsByTagName('b')->item(0)->textContent;
  61. break;
  62. case 'GAByline':
  63. $text=$h2NextSibling->textContent;
  64. $item['author']=substr($text,strpos($text,'by '));
  65. break;
  66. default:
  67. $item['author']='LWN';
  68. break;
  69. };
  70. $h2FirstChild=$h2->firstChild;
  71. $this->jumpToNextTag($h2FirstChild);
  72. if($h2FirstChild->nodeName==='a'){
  73. $item['uri']=self::URI.$h2FirstChild->getAttribute('href');
  74. }else{
  75. $item['uri']=$realURI.'#'.$URICounter;
  76. }
  77. $URICounter++;
  78. $item['timestamp']=$editionTimeStamp+$URICounter;
  79. $h2PrevSibling=$h2->previousSibling;
  80. $this->jumpToPreviousTag($h2PrevSibling);
  81. switch($h2PrevSibling->getAttribute('class')){
  82. case 'Cat2HL':
  83. $cat2=$h2PrevSibling->textContent;
  84. $h2PrevSibling=$h2PrevSibling->previousSibling;
  85. $this->jumpToPreviousTag($h2PrevSibling);
  86. if($h2PrevSibling->getAttribute('class')!=='Cat1HL'){
  87. break;
  88. }
  89. $cat1=$h2PrevSibling->textContent;
  90. break;
  91. case 'Cat1HL':
  92. $cat1=$h2PrevSibling->textContent;
  93. $cat2='';
  94. break;
  95. default:
  96. break;
  97. }
  98. $h2PrevSibling=null;
  99. $item['title']='';
  100. if(!empty($cat1)){
  101. $item['title'].='['.$cat1.($cat2?'/'.$cat2:'').'] ';
  102. }
  103. $item['title'].=$h2->textContent;
  104. $node=$h2;
  105. $content='';
  106. $contentEnd=false;
  107. while(!$contentEnd){
  108. $node=$node->nextSibling;
  109. if(
  110. !$node || (
  111. $node->nodeType!==XML_TEXT_NODE && (
  112. $node->nodeName==='h2' ||
  113. (!is_null($node->attributes) && !is_null($class=$node->attributes->getNamedItem('class')) &&
  114. in_array($class->nodeValue,array('Cat1HL','Cat2HL')))
  115. )
  116. )
  117. ){
  118. $contentEnd=true;
  119. }else{
  120. $content.=$node->C14N();
  121. }
  122. }
  123. $item['content']=$content;
  124. $this->items[]=$item;
  125. }
  126. }
  127. }