LWNprevBridge.php 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. <?php
  /**
   * RssBridgeLWNprev
   *
   * Builds feed items from the single-page ("bigpage") rendering of the free
   * LWN.net weekly edition by walking the page with DOMDocument.
   *
   * @name LWNPrev Bridge
   * @description Returns the articles from the previous LWN.net edition
   */
  class LWNprevBridge extends BridgeAbstract{

      /**
       * Fills in the static bridge metadata (maintainer, name, URI, description).
       */
      public function loadMetadatas() {
          $this->maintainer = 'Pierre Mazière';
          $this->name = 'LWN Free Weekly Edition';
          $this->uri = 'https://lwn.net/free/bigpage';
          $this->description = 'LWN Free Weekly Edition available one week late';
      }

      /**
       * Moves $node forward over text-node siblings.
       *
       * On return $node is the first non-text sibling at or after the starting
       * node. If only text nodes remain, $node is left on the last of them; a
       * null $node is left untouched. Callers must therefore still verify the
       * node type before treating the result as an element.
       *
       * @param DOMNode|null $node Starting node, updated in place.
       */
      private function jumpToNextTag(&$node){
          while($node && $node->nodeType===XML_TEXT_NODE){
              $nextNode=$node->nextSibling;
              if(!$nextNode){
                  break;
              }
              $node=$nextNode;
          }
      }

      /**
       * Moves $node backward over text-node siblings.
       *
       * Mirror image of jumpToNextTag(): stops on the first non-text sibling at
       * or before the starting node, or on the first text node when nothing else
       * precedes it; a null $node is left untouched.
       *
       * @param DOMNode|null $node Starting node, updated in place.
       */
      private function jumpToPreviousTag(&$node){
          while($node && $node->nodeType===XML_TEXT_NODE){
              $previousNode=$node->previousSibling;
              if(!$previousNode){
                  break;
              }
              $node=$previousNode;
          }
      }

      /**
       * Returns the "class" attribute of $node, or '' when $node is not an
       * element (null, text node, comment, ...), so callers can switch on the
       * result without risking a call to getAttribute() on a non-element.
       *
       * @param DOMNode|null $node
       * @return string
       */
      private function classOf($node){
          return $node instanceof DOMElement ? $node->getAttribute('class') : '';
      }

      /**
       * Derives the edition timestamp from the text of the first <h1>, taking
       * whatever follows the first "for " as the date.
       *
       * Falls back to the whole heading text when "for " is absent, and to the
       * current time when there is no <h1> or the date cannot be parsed, so
       * items never end up dated at the Unix epoch.
       *
       * @param DOMDocument $html Parsed edition page.
       * @return int Unix timestamp.
       */
      private function parseEditionTimestamp($html){
          $h1=$html->getElementsByTagName('h1')->item(0);
          if(!$h1){
              return time();
          }
          $edition=$h1->textContent;
          $marker='for ';
          $position=strpos($edition,$marker);
          $dateText=$position===false
              ? $edition
              : substr($edition,$position+strlen($marker));
          $timestamp=strtotime($dateText);
          return $timestamp===false ? time() : $timestamp;
      }

      /**
       * Determines the author from the byline element that follows an article
       * heading.
       *
       * - FeatureByline: text of the first <b> inside the byline.
       * - GAByline: text after the first "by ", trimmed, so the result is a bare
       *   name just like in the FeatureByline case.
       * - anything else, or a byline that does not match the expected shape: 'LWN'.
       *
       * @param DOMElement $h2 Article heading.
       * @return string
       */
      private function extractAuthor($h2){
          $byline=$h2->nextSibling;
          $this->jumpToNextTag($byline);
          switch($this->classOf($byline)){
              case 'FeatureByline':
                  $bold=$byline->getElementsByTagName('b')->item(0);
                  if($bold){
                      return $bold->textContent;
                  }
                  break;
              case 'GAByline':
                  $text=$byline->textContent;
                  $marker='by ';
                  $position=strpos($text,$marker);
                  if($position!==false){
                      return trim(substr($text,$position+strlen($marker)));
                  }
                  break;
              default:
                  break;
          }
          return 'LWN';
      }

      /**
       * Updates the running category names from the headings located just
       * before an article heading.
       *
       * A Cat2HL heading sets the sub-category, and the main category as well
       * when a Cat1HL heading directly precedes it. A Cat1HL heading sets the
       * main category and clears the sub-category. Otherwise both values are
       * left as they were, so an article inherits the categories of the
       * previous one.
       *
       * @param DOMElement $h2   Article heading.
       * @param string     $cat1 Main category, updated in place.
       * @param string     $cat2 Sub-category, updated in place.
       */
      private function updateCategories($h2,&$cat1,&$cat2){
          $previous=$h2->previousSibling;
          $this->jumpToPreviousTag($previous);
          switch($this->classOf($previous)){
              case 'Cat2HL':
                  $cat2=$previous->textContent;
                  $previous=$previous->previousSibling;
                  $this->jumpToPreviousTag($previous);
                  if($this->classOf($previous)==='Cat1HL'){
                      $cat1=$previous->textContent;
                  }
                  break;
              case 'Cat1HL':
                  $cat1=$previous->textContent;
                  $cat2='';
                  break;
              default:
                  break;
          }
      }

      /**
       * Tells whether $node ends the body of the current article: any <h2>, or
       * any element whose class is Cat1HL or Cat2HL.
       *
       * @param DOMNode $node
       * @return bool
       */
      private function isSectionBoundary($node){
          if(!($node instanceof DOMElement)){
              return false;
          }
          return $node->nodeName==='h2'
              || in_array($node->getAttribute('class'),array('Cat1HL','Cat2HL'),true);
      }

      /**
       * Concatenates the canonical (C14N) serialization of every sibling that
       * follows an article heading, up to the next section boundary or the end
       * of the sibling list.
       *
       * @param DOMElement $h2 Article heading.
       * @return string
       */
      private function collectContent($h2){
          $body='';
          for($node=$h2->nextSibling; $node && !$this->isSectionBoundary($node); $node=$node->nextSibling){
              $body.=$node->C14N();
          }
          return $body;
      }

      /**
       * Downloads the edition page and appends one item per article heading
       * (<h2 class="SummaryHL">) to $this->items. Each item carries the keys
       * author, uri, timestamp, title and content.
       *
       * @param array $param Request parameters; not read by this method.
       */
      public function collectData(array $param){
          // Because the LWN page is written in loose HTML and not XHTML,
          // Simple HTML Dom is not accurate enough for the job
          $uri='https://lwn.net/free/bigpage';

          $context=null;
          if(defined('PROXY_URL')) {
              $context = stream_context_create(array(
                  'http' => array(
                      'proxy' => PROXY_URL,
                      'request_fulluri' => true,
                  ),
              ));
          }

          // NOTE(review): returnServerError() is defined outside this file and is
          // presumably expected to abort the request - confirm.
          $page=file_get_contents($uri, false, $context)
              or $this->returnServerError('No results for LWNprev');

          // Silence the parser warnings caused by the loose markup, then put the
          // process-wide libxml error mode back the way it was.
          $previousErrorMode=libxml_use_internal_errors(true);
          $html=new DOMDocument();
          $html->loadHTML($page);
          libxml_clear_errors();
          libxml_use_internal_errors($previousErrorMode);

          // Base URI for articles that have no link of their own: the multi-page
          // rendering of the edition when it is linked, the big page otherwise.
          $realURI=$uri;
          foreach($html->getElementsByTagName('a') as $a){
              if($a->textContent==='Multi-page format'){
                  $realURI='https://lwn.net'.$a->getAttribute('href');
                  break;
              }
          }

          $editionTimeStamp=$this->parseEditionTimestamp($html);

          $cat1='';
          $cat2='';
          $URICounter=0;
          foreach($html->getElementsByTagName('h2') as $h2){
              if($h2->getAttribute('class')!=='SummaryHL'){
                  continue;
              }

              $item = array();
              $item['author']=$this->extractAuthor($h2);

              $h2FirstChild=$h2->firstChild;
              $this->jumpToNextTag($h2FirstChild);
              if($h2FirstChild instanceof DOMElement && $h2FirstChild->nodeName==='a'){
                  $item['uri']='https://lwn.net'.$h2FirstChild->getAttribute('href');
              }else{
                  $item['uri']=$realURI.'#'.$URICounter;
              }

              // The counter offsets each timestamp by one more second than the
              // previous item, so every item of an edition gets a distinct value.
              $URICounter++;
              $item['timestamp']=$editionTimeStamp+$URICounter;

              $this->updateCategories($h2,$cat1,$cat2);

              $item['title']='';
              if($cat1!==''){
                  $item['title'].='['.$cat1.($cat2!==''?'/'.$cat2:'').'] ';
              }
              $item['title'].=$h2->textContent;

              $item['content']=$this->collectContent($h2);

              $this->items[]=$item;
          }
      }

      /**
       * @return int Cache lifetime: 604800 seconds, i.e. one week.
       */
      public function getCacheDuration(){
          return 604800; // one week
      }
  }