LWNprevBridge.php 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265
  1. <?php
  /**
   * Bridge for the LWN "Free Weekly Edition" big page.
   *
   * The page returned for getURI() is split on the '<b>Page editor</b>'
   * marker. Each piece is parsed with DOMDocument and converted into feed
   * items; which extractor runs depends on which 'Cat1HL' / 'Cat3HL' class
   * names occur in the piece (see collectData()).
   */
  class LWNprevBridge extends BridgeAbstract
  {
      const MAINTAINER = 'Pierre Mazière';
      const NAME = 'LWN Free Weekly Edition';
      const URI = 'https://lwn.net/';
      const CACHE_TIMEOUT = 604800; // 1 week
      const DESCRIPTION = 'LWN Free Weekly Edition available one week late';

      /**
       * Timestamp parsed from the most recent usable <h1> heading; it is
       * copied into every item produced afterwards.
       */
      private $editionTimeStamp;

      /**
       * @return string URL of the page that collectData() downloads.
       */
      public function getURI()
      {
          return self::URI . 'free/bigpage';
      }

      /**
       * Moves $node forward over text nodes until a non-text sibling is
       * reached. When the trailing siblings are all text nodes, $node is left
       * on the last one, so callers must still check what they got.
       *
       * @param DOMNode|null $node Start node, updated in place.
       */
      private function jumpToNextTag(&$node)
      {
          while ($node && $node->nodeType === XML_TEXT_NODE && $node->nextSibling) {
              $node = $node->nextSibling;
          }
      }

      /**
       * Mirror of jumpToNextTag() walking backwards through previous siblings.
       *
       * @param DOMNode|null $node Start node, updated in place.
       */
      private function jumpToPreviousTag(&$node)
      {
          while ($node && $node->nodeType === XML_TEXT_NODE && $node->previousSibling) {
              $node = $node->previousSibling;
          }
      }

      /**
       * Downloads the big page and fills $this->items.
       */
      public function collectData()
      {
          // Because the LWN page is written in loose HTML and not XHTML,
          // Simple HTML Dom is not accurate enough for the job
          $content = getContents($this->getURI())
              or returnServerError('No results for LWNprev');

          // Keep libxml's complaints about the loose markup internal while
          // parsing, and put the caller's setting back when done.
          $previousErrorMode = libxml_use_internal_errors(true);

          foreach (explode('<b>Page editor</b>', $content) as $chunk) {
              $document = $this->buildDocument($chunk);

              // NOTE(review): the synthetic wrapper declares no charset, so
              // loadHTML() falls back to its default encoding for those
              // pieces - confirm this matches the encoding of the page.
              $html = new DOMDocument();
              $html->loadHTML($document);
              libxml_clear_errors();

              $this->updateEditionTimeStamp($html);

              if (strpos($document, 'Cat1HL') === false) {
                  $items = $this->getFeatureContents($html);
              } elseif (strpos($document, 'Cat3HL') === false) {
                  $items = $this->getBriefItems($html);
              } else {
                  $items = $this->getAnnouncements($html);
              }

              $this->items = array_merge($this->items, $items);
          }

          libxml_use_internal_errors($previousErrorMode);
      }

      /**
       * Turns one piece of the split page into a complete HTML document.
       *
       * A piece containing '<html>' only gets its closing tags appended; any
       * other piece is wrapped in a minimal HTML 4.01 document.
       *
       * @param string $chunk Raw piece of the downloaded page.
       * @return string Markup to hand to DOMDocument::loadHTML().
       */
      private function buildDocument($chunk)
      {
          if (strpos($chunk, '<html>') !== false) {
              return $chunk . '</body></html>';
          }

          return '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" '
              . '"http://www.w3.org/TR/html4/loose.dtd">' . "\n"
              . '<html><head><title>LWN</title></head><body>' . $chunk . '</body></html>';
      }

      /**
       * Reads the edition date from the first <h1>, i.e. the text that follows
       * the first 'for ' in the heading.
       *
       * The stored timestamp is only replaced when a date could actually be
       * parsed, so a heading without 'for ' (strpos() returning false) or an
       * unparsable date no longer clobbers a previously found value.
       *
       * @param DOMDocument $html Parsed piece of the page.
       */
      private function updateEditionTimeStamp($html)
      {
          $headings = $html->getElementsByTagName('h1');
          if ($headings->length === 0) {
              return;
          }

          $marker = 'for ';
          $text = $headings->item(0)->textContent;
          $position = strpos($text, $marker);
          if ($position === false) {
              return;
          }

          $timestamp = strtotime(trim(substr($text, $position + strlen($marker))));
          if ($timestamp !== false) {
              $this->editionTimeStamp = $timestamp;
          }
      }

      /**
       * @param mixed $node Anything returned by a sibling lookup.
       * @return string The node's class attribute, or '' when $node is not an
       *                element (null, text node, comment, ...).
       */
      private function getClassOf($node)
      {
          return $node instanceof DOMElement ? $node->getAttribute('class') : '';
      }

      /**
       * Concatenates the canonical (C14N) serialisation of every sibling that
       * follows $start, stopping at the end of the sibling list or at the
       * first boundary node.
       *
       * A boundary is an element whose class attribute is one of
       * $stopClasses or, when $stopAtH2 is true, any <h2> element.
       *
       * @param DOMNode $start       Node whose following siblings are collected.
       * @param array   $stopClasses Class attribute values that end the content.
       * @param bool    $stopAtH2    Whether an <h2> also ends the content.
       * @return string Serialised content.
       */
      private function collectFollowingContent($start, array $stopClasses, $stopAtH2 = false)
      {
          $content = '';

          for ($node = $start->nextSibling; $node; $node = $node->nextSibling) {
              if ($stopAtH2 && $node->nodeType !== XML_TEXT_NODE && $node->nodeName === 'h2') {
                  break;
              }
              if (in_array($this->getClassOf($node), $stopClasses, true)) {
                  break;
              }
              $content .= $node->C14N();
          }

          return $content;
      }

      /**
       * Builds the 'uri', 'timestamp' and 'content' fields for the article
       * introduced by the heading $title.
       *
       * @param DOMElement $title Heading element of the article.
       * @return array Partial item.
       */
      private function getArticleContent(&$title)
      {
          $item = array();
          $item['uri'] = self::URI;

          // The link is expected to be the first tag inside the heading; the
          // heading may however be empty or contain text only.
          $link = $title->firstChild;
          $this->jumpToNextTag($link);
          if ($link instanceof DOMElement && $link->nodeName === 'a') {
              $item['uri'] .= $link->getAttribute('href');
          }

          $item['timestamp'] = $this->editionTimeStamp;
          $item['content'] = $this->collectFollowingContent(
              $title,
              array('Cat1HL', 'Cat2HL'),
              true
          );

          return $item;
      }

      /**
       * Extracts feature articles: every <h2 class="SummaryHL"> directly
       * followed by an element of class 'FeatureByline'. Headings without
       * such a byline are skipped.
       *
       * @param DOMDocument $html Parsed piece of the page.
       * @return array List of items.
       */
      private function getFeatureContents(&$html)
      {
          $items = array();

          foreach ($html->getElementsByTagName('h2') as $title) {
              if ($title->getAttribute('class') !== 'SummaryHL') {
                  continue;
              }

              $byline = $title->nextSibling;
              $this->jumpToNextTag($byline);
              if ($this->getClassOf($byline) !== 'FeatureByline') {
                  continue;
              }

              $item = array();
              // The author name is taken from the first <b> of the byline;
              // leave the field out when the byline has none.
              $name = $byline->getElementsByTagName('b')->item(0);
              if ($name) {
                  $item['author'] = $name->textContent;
              }
              $item['title'] = $title->textContent;

              $items[] = array_merge($item, $this->getArticleContent($title));
          }

          return $items;
      }

      /**
       * Updates the running category state from the category headings found
       * at $cat and just before it, then returns the title prefix.
       *
       * Starting at $cat, the headings are read innermost first
       * ('Cat3HL', then 'Cat2HL', then 'Cat1HL'), moving to the previous tag
       * after each level; the walk stops as soon as the expected class is not
       * found. Setting a level clears the deeper levels that were not seen in
       * this call. A $cat that is not an element leaves $cats untouched.
       *
       * @param DOMNode|null $cat  Candidate category heading; moved backwards
       *                           in place while walking.
       * @param array        $cats Running [level1, level2, level3] names,
       *                           updated in place.
       * @return string '[level1] ' or '[level1/level2] ' (with trailing
       *                space), or '' when no level-1 category is known.
       */
      private function getItemPrefix(&$cat, &$cats)
      {
          $level2 = '';
          $level3 = '';
          $class = $this->getClassOf($cat);

          if ($class === 'Cat3HL') {
              $level3 = $cat->textContent;
              $cats[2] = $level3;

              $cat = $cat->previousSibling;
              $this->jumpToPreviousTag($cat);
              // Continue only if the enclosing level-2 heading is right there.
              $class = $this->getClassOf($cat) === 'Cat2HL' ? 'Cat2HL' : '';
          }

          if ($class === 'Cat2HL') {
              $level2 = $cat->textContent;
              $cats[1] = $level2;
              if (empty($level3)) {
                  $cats[2] = '';
              }

              $cat = $cat->previousSibling;
              $this->jumpToPreviousTag($cat);
              $class = $this->getClassOf($cat) === 'Cat1HL' ? 'Cat1HL' : '';
          }

          if ($class === 'Cat1HL') {
              $cats[0] = $cat->textContent;
              if (empty($level3)) {
                  $cats[2] = '';
              }
              if (empty($level2)) {
                  $cats[1] = '';
              }
          }

          if (empty($cats[0])) {
              return '';
          }

          return '[' . $cats[0] . ($cats[1] ? '/' . $cats[1] : '') . '] ';
      }

      /**
       * Builds one item per <h2 class="SummaryHL">, titled with the category
       * prefix derived from the tag located two tags before the heading.
       *
       * @param DOMDocument $html Parsed piece of the page.
       * @param array       $cats Running category state, updated in place.
       * @return array List of items.
       */
      private function getSummaryItems($html, &$cats)
      {
          $items = array();

          foreach ($html->getElementsByTagName('h2') as $title) {
              if ($title->getAttribute('class') !== 'SummaryHL') {
                  continue;
              }

              // Step back two tags; either step may run off the start of the
              // sibling list, in which case no category heading is available.
              $cat = $title->previousSibling;
              $this->jumpToPreviousTag($cat);
              if ($cat) {
                  $cat = $cat->previousSibling;
                  $this->jumpToPreviousTag($cat);
              }

              $item = array();
              // The prefix already carries its own trailing space.
              $item['title'] = $this->getItemPrefix($cat, $cats) . $title->textContent;

              $items[] = array_merge($item, $this->getArticleContent($title));
          }

          return $items;
      }

      /**
       * Extracts announcements: one item per <p class="Cat3HL"> (content runs
       * up to the next 'Cat1HL' / 'Cat2HL' / 'Cat3HL' element), followed by
       * the SummaryHL articles of the same piece. The category state built by
       * the first pass is carried over into the second.
       *
       * @param DOMDocument $html Parsed piece of the page.
       * @return array List of items.
       */
      private function getAnnouncements(&$html)
      {
          $items = array();
          $cats = array('', '', '');

          foreach ($html->getElementsByTagName('p') as $newsletters) {
              if ($newsletters->getAttribute('class') !== 'Cat3HL') {
                  continue;
              }

              $item = array();
              // These entries have no link of their own; the fragment is just
              // a running index.
              $item['uri'] = self::URI . '#' . count($items);
              $item['timestamp'] = $this->editionTimeStamp;
              $item['author'] = 'LWN';

              $cat = $newsletters->previousSibling;
              $this->jumpToPreviousTag($cat);
              // The prefix already carries its own trailing space.
              $item['title'] = $this->getItemPrefix($cat, $cats) . $newsletters->textContent;

              $item['content'] = $this->collectFollowingContent(
                  $newsletters,
                  array('Cat1HL', 'Cat2HL', 'Cat3HL')
              );

              $items[] = $item;
          }

          return array_merge($items, $this->getSummaryItems($html, $cats));
      }

      /**
       * Extracts brief items: one item per <h2 class="SummaryHL">, titled
       * with its category prefix.
       *
       * @param DOMDocument $html Parsed piece of the page.
       * @return array List of items.
       */
      private function getBriefItems(&$html)
      {
          $cats = array('', '', '');

          return $this->getSummaryItems($html, $cats);
      }
  }
  228. ?>