LWNprevBridge.php 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. <?php
  /**
   * Feed bridge for the LWN free weekly edition.
   *
   * Downloads the single-page ("bigpage") rendering of the edition, parses it
   * with DOMDocument and emits one feed item per <h2 class="SummaryHL"> heading.
   * Headings carrying the classes "Cat1HL" / "Cat2HL" that precede an article
   * are used as a category prefix in the item title.
   */
  class LWNprevBridge extends BridgeAbstract{

      const MAINTAINER = 'Pierre Mazière';
      const NAME = 'LWN Free Weekly Edition';
      const URI = 'https://lwn.net/';
      const DESCRIPTION = 'LWN Free Weekly Edition available one week late';

      /**
       * @return string URI of the page that collectData() downloads and parses
       */
      public function getURI(){
          return self::URI.'free/bigpage';
      }

      /**
       * Moves $node forward along its siblings until it points at an element.
       *
       * Text, comment and every other non-element node is skipped. When the
       * sibling list is exhausted, $node becomes null, so callers can test for
       * it instead of calling element-only methods on a text node.
       *
       * @param DOMNode|null $node starting node, updated in place
       */
      private function jumpToNextTag(&$node){
          while($node && $node->nodeType !== XML_ELEMENT_NODE){
              $node = $node->nextSibling;
          }
      }

      /**
       * Moves $node backward along its siblings until it points at an element.
       *
       * Mirror image of jumpToNextTag(): $node becomes null when no preceding
       * element exists.
       *
       * @param DOMNode|null $node starting node, updated in place
       */
      private function jumpToPreviousTag(&$node){
          while($node && $node->nodeType !== XML_ELEMENT_NODE){
              $node = $node->previousSibling;
          }
      }

      /**
       * Builds a full URI from an href found in the page.
       *
       * An href that already carries a scheme is returned untouched; anything
       * else is appended to self::URI with exactly one slash in between, so a
       * root-relative href does not yield "https://lwn.net//...".
       *
       * @param string $href value of an href attribute
       * @return string
       */
      private function absoluteURI($href){
          if(preg_match('#^[a-z][a-z0-9+.-]*://#i', $href)){
              return $href;
          }
          return rtrim(self::URI, '/').'/'.ltrim($href, '/');
      }

      /**
       * Derives the author from the element that follows an article heading.
       *
       * @param DOMElement|null $byline first element after the heading, if any
       * @return string 'LWN' when no recognised byline is present
       */
      private function extractAuthor($byline){
          switch($byline ? $byline->getAttribute('class') : ''){
              case 'FeatureByline':
                  $bold = $byline->getElementsByTagName('b')->item(0);
                  return $bold ? $bold->textContent : 'LWN';
              case 'GAByline':
                  $text = $byline->textContent;
                  $position = strpos($text, 'by ');
                  // NOTE(review): the returned text still starts with "by ",
                  // exactly as before - confirm whether the prefix is wanted.
                  return $position === false ? $text : substr($text, $position);
              default:
                  return 'LWN';
          }
      }

      /**
       * Tells whether $node opens the next article or category section.
       *
       * @param DOMNode $node
       * @return bool true for any <h2> and for elements whose class attribute
       *              is exactly 'Cat1HL' or 'Cat2HL'
       */
      private function isSectionBoundary($node){
          if($node->nodeType !== XML_ELEMENT_NODE){
              return false;
          }
          return $node->nodeName === 'h2'
              || in_array($node->getAttribute('class'), array('Cat1HL', 'Cat2HL'), true);
      }

      /**
       * Serialises every sibling following $h2 up to the next section boundary.
       *
       * @param DOMElement $h2 article heading
       * @return string concatenated C14N() output of the collected nodes
       */
      private function extractContent($h2){
          $body = '';
          for(
              $node = $h2->nextSibling;
              $node && !$this->isSectionBoundary($node);
              $node = $node->nextSibling
          ){
              $body .= $node->C14N();
          }
          return $body;
      }

      /**
       * Downloads the edition page and appends one entry to $this->items per
       * article heading. Each entry has the keys 'author', 'uri', 'timestamp',
       * 'title' and 'content'.
       */
      public function collectData(){
          // Because the LWN page is written in loose HTML and not XHTML,
          // Simple HTML Dom is not accurate enough for the job
          $page = getContents($this->getURI())
              or returnServerError('No results for LWNprev');

          libxml_use_internal_errors(true);
          $html = new DOMDocument();
          $html->loadHTML($page);
          libxml_clear_errors();

          // Base URI for articles that have no link of their own. Falls back
          // to the parsed page itself when the "Multi-page format" link is
          // absent, instead of silently using whatever anchor came last.
          $realURI = $this->getURI();
          foreach($html->getElementsByTagName('a') as $anchor){
              if($anchor->textContent === 'Multi-page format'){
                  $realURI = $this->absoluteURI($anchor->getAttribute('href'));
                  break;
              }
          }

          // The edition date is whatever follows "for " in the first <h1>.
          $h1 = $html->getElementsByTagName('h1')->item(0);
          $edition = $h1 ? $h1->textContent : '';
          $forPosition = strpos($edition, 'for ');
          $editionTimeStamp = strtotime(
              $forPosition === false
                  ? $edition
                  : substr($edition, $forPosition + strlen('for '))
          );
          if($editionTimeStamp === false){
              // Unparseable heading: use the fetch time rather than 1970.
              $editionTimeStamp = time();
          }

          $URICounter = 0;
          // Categories persist from one article to the next until a new
          // category heading is met.
          $cat1 = '';
          $cat2 = '';

          foreach($html->getElementsByTagName('h2') as $h2){
              if($h2->getAttribute('class') !== 'SummaryHL'){
                  continue;
              }

              $item = array();

              $byline = $h2->nextSibling;
              $this->jumpToNextTag($byline);
              $item['author'] = $this->extractAuthor($byline);

              $link = $h2->firstChild;
              $this->jumpToNextTag($link);
              if($link && $link->nodeName === 'a'){
                  $item['uri'] = $this->absoluteURI($link->getAttribute('href'));
              }else{
                  // No link in the heading: make the URI unique per article.
                  $item['uri'] = $realURI.'#'.$URICounter;
              }

              $URICounter++;
              // Offset by the running counter so that every item of the
              // edition gets a distinct timestamp.
              $item['timestamp'] = $editionTimeStamp + $URICounter;

              $heading = $h2->previousSibling;
              $this->jumpToPreviousTag($heading);
              switch($heading ? $heading->getAttribute('class') : ''){
                  case 'Cat2HL':
                      $cat2 = $heading->textContent;
                      // A sub-category may itself be preceded by its category.
                      $heading = $heading->previousSibling;
                      $this->jumpToPreviousTag($heading);
                      if($heading && $heading->getAttribute('class') === 'Cat1HL'){
                          $cat1 = $heading->textContent;
                      }
                      break;
                  case 'Cat1HL':
                      $cat1 = $heading->textContent;
                      $cat2 = '';
                      break;
                  default:
                      break;
              }

              $item['title'] = '';
              if(!empty($cat1)){
                  $item['title'] .= '['.$cat1.($cat2 ? '/'.$cat2 : '').'] ';
              }
              $item['title'] .= $h2->textContent;

              $item['content'] = $this->extractContent($h2);

              $this->items[] = $item;
          }
      }

      /**
       * @return int cache lifetime in seconds
       */
      public function getCacheDuration(){
          return 604800; // one week
      }
  }