1
0
Fork 0
forked from blallo/rss-bridge
rss-bridge/bridges/LWNprevBridge.php
Pierre Mazière f1fb95b257 [core] extract BridgeAbstract methods to make them functions
- returnError, returnServerError, returnClientError, debugMessage are
  moved to lib/error.php

- getContents, getSimpleHTMLDOM, getSimpleHTMLDOMCached are moved to
  lib/contents.php

Signed-off-by: Pierre Mazière <pierre.maziere@gmx.com>
2016-09-25 23:22:33 +02:00

147 lines
3.9 KiB
PHP

<?php
/**
 * Bridge turning the LWN.net free weekly edition "big page" into feed items.
 *
 * The page at URI.'free/bigpage' is parsed with DOMDocument; every
 * <h2 class="SummaryHL"> heading becomes one item whose content is made of
 * the sibling nodes following the heading, up to the next heading or
 * category marker (elements with class "Cat1HL" or "Cat2HL").
 */
class LWNprevBridge extends BridgeAbstract{
    const MAINTAINER = 'Pierre Mazière';
    const NAME = 'LWN Free Weekly Edition';
    const URI = 'https://lwn.net/';
    const DESCRIPTION = 'LWN Free Weekly Edition available one week late';

    /**
     * Address of the single-page ("big page") rendering of the free edition.
     *
     * @return string
     */
    public function getURI(){
        return self::URI.'free/bigpage';
    }

    /**
     * Move $node forward along its siblings until it points to an element.
     *
     * Text, comment and other non-element nodes are skipped. When no element
     * is found, $node is set to null so that callers can detect the absence
     * instead of calling element-only methods on a text node.
     *
     * @param DOMNode|null $node Starting node, updated in place.
     */
    private function jumpToNextTag(&$node){
        while($node && $node->nodeType!==XML_ELEMENT_NODE){
            $node=$node->nextSibling;
        }
    }

    /**
     * Move $node backward along its siblings until it points to an element.
     *
     * Mirror of jumpToNextTag(): $node ends up null when no element precedes
     * the starting node.
     *
     * @param DOMNode|null $node Starting node, updated in place.
     */
    private function jumpToPreviousTag(&$node){
        while($node && $node->nodeType!==XML_ELEMENT_NODE){
            $node=$node->previousSibling;
        }
    }

    /**
     * Value of the "class" attribute of $node.
     *
     * @param DOMNode|null $node Any node, or null.
     * @return string The attribute value; '' when $node is not an element or
     *                carries no class attribute.
     */
    private function getClassName($node){
        return $node instanceof DOMElement ? $node->getAttribute('class') : '';
    }

    /**
     * Tell whether $node ends the content of the current article, i.e. it is
     * an <h2> element or an element whose class is "Cat1HL" or "Cat2HL".
     *
     * @param DOMNode $node Sibling node being examined.
     * @return bool
     */
    private function isSectionBoundary(DOMNode $node){
        return $node instanceof DOMElement && (
            $node->nodeName==='h2' ||
            in_array($node->getAttribute('class'),array('Cat1HL','Cat2HL'),true)
        );
    }

    /**
     * Derive the edition timestamp from the first <h1> of the page.
     *
     * The date is taken to be whatever follows the first "for " in the
     * heading text. When the heading is missing or the date cannot be parsed,
     * the current time is used rather than a near-epoch value.
     *
     * @param DOMDocument $html Parsed page.
     * @return int Unix timestamp.
     */
    private function getEditionTimestamp(DOMDocument $html){
        $h1=$html->getElementsByTagName('h1')->item(0);
        $edition=$h1 ? $h1->textContent : '';

        $marker='for ';
        $position=strpos($edition,$marker);
        if($position!==false){
            $edition=substr($edition,$position+strlen($marker));
        }

        $timestamp=strtotime($edition);
        return $timestamp===false ? time() : $timestamp;
    }

    /**
     * Read the author from the byline element following an article heading.
     *
     * - class "FeatureByline": text of the first <b> inside the byline;
     * - class "GAByline": byline text starting at the first "by " (the "by "
     *   prefix is kept), or the whole text when "by " is absent;
     * - anything else, including a missing byline: 'LWN'.
     *
     * @param DOMElement $h2 Article heading.
     * @return string
     */
    private function extractAuthor(DOMElement $h2){
        $byline=$h2->nextSibling;
        $this->jumpToNextTag($byline);

        switch($this->getClassName($byline)){
            case 'FeatureByline':
                $bold=$byline->getElementsByTagName('b')->item(0);
                if($bold){
                    return $bold->textContent;
                }
                break;
            case 'GAByline':
                $text=$byline->textContent;
                $position=strpos($text,'by ');
                return $position===false ? $text : substr($text,$position);
        }

        return 'LWN';
    }

    /**
     * Fetch the big page and fill $this->items with one entry per article.
     *
     * Each item carries 'author', 'uri', 'timestamp', 'title' and 'content'.
     * Timestamps are the edition timestamp plus the 1-based position of the
     * article on the page, so that every item gets a distinct value.
     */
    public function collectData(){
        // Because the LWN page is written in loose HTML and not XHTML,
        // Simple HTML Dom is not accurate enough for the job
        $page=getContents($this->getURI())
            or returnServerError('No results for LWNprev');

        // Collect libxml parse errors internally instead of emitting warnings,
        // then discard them once the document is loaded.
        libxml_use_internal_errors(true);
        $html=new DOMDocument();
        $html->loadHTML($page);
        libxml_clear_errors();

        // Base address for articles without their own link. Falls back to the
        // big page itself when the "Multi-page format" link is not present.
        $realURI=$this->getURI();
        foreach($html->getElementsByTagName('a') as $a){
            if($a->textContent==='Multi-page format'){
                $realURI=self::URI.$a->getAttribute('href');
                break;
            }
        }

        $editionTimeStamp=$this->getEditionTimestamp($html);

        // Categories persist across headings until a new marker replaces them.
        $cat1='';
        $cat2='';
        $URICounter=0;

        foreach($html->getElementsByTagName('h2') as $h2){
            if($h2->getAttribute('class')!=='SummaryHL'){
                continue;
            }

            $item=array();
            $item['author']=$this->extractAuthor($h2);

            // Use the heading's own link when its first element child is an
            // anchor, otherwise a numbered fragment of the edition address.
            $firstTag=$h2->firstChild;
            $this->jumpToNextTag($firstTag);
            if($firstTag instanceof DOMElement && $firstTag->nodeName==='a'){
                $item['uri']=self::URI.$firstTag->getAttribute('href');
            }else{
                $item['uri']=$realURI.'#'.$URICounter;
            }

            $URICounter++;
            $item['timestamp']=$editionTimeStamp+$URICounter;

            // Category markers, when present, sit right before the heading:
            // either a lone Cat1HL, or a Cat2HL optionally preceded by Cat1HL.
            $previous=$h2->previousSibling;
            $this->jumpToPreviousTag($previous);
            switch($this->getClassName($previous)){
                case 'Cat2HL':
                    $cat2=$previous->textContent;
                    $previous=$previous->previousSibling;
                    $this->jumpToPreviousTag($previous);
                    if($this->getClassName($previous)==='Cat1HL'){
                        $cat1=$previous->textContent;
                    }
                    break;
                case 'Cat1HL':
                    $cat1=$previous->textContent;
                    $cat2='';
                    break;
            }

            $item['title']='';
            if(!empty($cat1)){
                $item['title'].='['.$cat1.($cat2 ? '/'.$cat2 : '').'] ';
            }
            $item['title'].=$h2->textContent;

            // Serialize every sibling after the heading until the next
            // heading or category marker (or the end of the sibling list).
            $content='';
            for(
                $node=$h2->nextSibling;
                $node && !$this->isSectionBoundary($node);
                $node=$node->nextSibling
            ){
                $content.=$node->C14N();
            }
            $item['content']=$content;

            $this->items[]=$item;
        }
    }

    /**
     * Cache lifetime in seconds.
     *
     * @return int
     */
    public function getCacheDuration(){
        return 604800; // one week
    }
}