LWNprevBridge.php 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. <?php
  2. /**
  3. * RssBridgeLWNprev
  4. *
  5. * @name LWNPrev Bridge
  6. * @description Returns the articles from the previous LWN.net edition
  7. */
  8. class LWNprevBridge extends BridgeAbstract{
  9. public function loadMetadatas() {
  10. $this->maintainer = 'Pierre Mazière';
  11. $this->name = 'LWN Free Weekly Edition';
  12. $this->uri = 'https://lwn.net/free/bigpage';
  13. $this->description = 'LWN Free Weekly Edition available one week late';
  14. $this->update = '2016-19-01';
  15. }
  16. private function jumpToNextTag(&$node){
  17. while($node && $node->nodeType===XML_TEXT_NODE){
  18. $nextNode=$node->nextSibling;
  19. if(!$nextNode){
  20. break;
  21. }
  22. $node=$nextNode;
  23. }
  24. }
  25. private function jumpToPreviousTag(&$node){
  26. while($node && $node->nodeType===XML_TEXT_NODE){
  27. $previousNode=$node->previousSibling;
  28. if(!$previousNode){
  29. break;
  30. }
  31. $node=$previousNode;
  32. }
  33. }
  34. public function collectData(array $param){
  35. // Because the LWN page is written in loose HTML and not XHTML,
  36. // Simple HTML Dom is not accurate enough for the job
  37. $uri='https://lwn.net/free/bigpage';
  38. $context=null;
  39. if(defined('PROXY_URL')) {
  40. $context = array(
  41. 'http' => array(
  42. 'proxy' => PROXY_URL,
  43. 'request_fulluri' => true,
  44. ),
  45. );
  46. $context = stream_context_create($context);
  47. }
  48. $html=file_get_contents($uri, false, $context)
  49. or $this->returnError('No results for LWNprev', 404);
  50. libxml_use_internal_errors(true);
  51. $html=DOMDocument::loadHTML($html);
  52. libxml_clear_errors();
  53. $cat1='';
  54. $cat2='';
  55. $realURI='https://lwn.net';
  56. foreach($html->getElementsByTagName('a') as $a){
  57. if($a->textContent==='Multi-page format'){
  58. break;
  59. }
  60. }
  61. $realURI.=$a->getAttribute('href');
  62. $URICounter=0;
  63. $edition=$html->getElementsByTagName('h1')->item(0)->textContent;
  64. $editionTimeStamp=strtotime(
  65. substr($edition,strpos($edition,'for ')+strlen('for '))
  66. );
  67. foreach($html->getElementsByTagName('h2') as $h2){
  68. if($h2->getAttribute('class')!=='SummaryHL'){
  69. continue;
  70. }
  71. $item = new \Item();
  72. $h2NextSibling=$h2->nextSibling;
  73. $this->jumpToNextTag($h2NextSibling);
  74. switch($h2NextSibling->getAttribute('class')){
  75. case 'FeatureByline':
  76. $item->name=$h2NextSibling->getElementsByTagName('b')->item(0)->textContent;
  77. break;
  78. case 'GAByline':
  79. $text=$h2NextSibling->textContent;
  80. $item->name=substr($text,strpos($text,'by '));
  81. break;
  82. default:
  83. $item->name='LWN';
  84. break;
  85. };
  86. $h2FirstChild=$h2->firstChild;
  87. $this->jumpToNextTag($h2FirstChild);
  88. if($h2FirstChild->tagName==='a'){
  89. $item->uri='https://lwn.net'.$h2FirstChild->getAttribute('href');
  90. }else{
  91. $item->uri=$realURI.'#'.$URICounter;
  92. }
  93. $URICounter++;
  94. $item->timestamp=$editionTimeStamp+$URICounter;
  95. $h2PrevSibling=$h2->previousSibling;
  96. $this->jumpToPreviousTag($h2PrevSibling);
  97. switch($h2PrevSibling->getAttribute('class')){
  98. case 'Cat2HL':
  99. $cat2=$h2PrevSibling->textContent;
  100. $h2PrevSibling=$h2PrevSibling->previousSibling;
  101. $this->jumpToPreviousTag($h2PrevSibling);
  102. if($h2PrevSibling->getAttribute('class')!=='Cat1HL'){
  103. break;
  104. }
  105. $cat1=$h2PrevSibling->textContent;
  106. break;
  107. case 'Cat1HL':
  108. $cat1=$h2PrevSibling->textContent;
  109. $cat2='';
  110. break;
  111. default:
  112. break;
  113. }
  114. $h2PrevSibling=null;
  115. $item->title='';
  116. if(!empty($cat1)){
  117. $item->title.='['.$cat1.($cat2?'/'.$cat2:'').'] ';
  118. }
  119. $item->title.=$h2->textContent;
  120. $node=$h2;
  121. $content='';
  122. $contentEnd=false;
  123. while(!$contentEnd){
  124. $node=$node->nextSibling;
  125. if(
  126. !$node || (
  127. $node->nodeType!==XML_TEXT_NODE && (
  128. $node->tagName==='h2' ||
  129. in_array($node->getAttribute('class'),array('Cat1HL','Cat2HL'))
  130. )
  131. )
  132. ){
  133. $contentEnd=true;
  134. }else{
  135. $content.=$node->C14N();
  136. }
  137. }
  138. $item->content=$content;
  139. $this->items[]=$item;
  140. }
  141. }
  142. public function getName(){
  143. return 'LWN Free Weekly Edition';
  144. }
  145. public function getURI(){
  146. return 'https://lwn.net/free/bigpage';
  147. }
  148. public function getCacheDuration(){
  149. return 604800; // one week
  150. }
  151. }