2016-06-19 00:41:02 +02:00
|
|
|
<?php
|
|
|
|
/**
 * Bridge turning the LWN.net free weekly edition ("big page") into feed items.
 *
 * Each <h2 class="SummaryHL"> heading of the page becomes one item; the
 * category headings (class Cat1HL / Cat2HL) met along the way are prefixed
 * to the item title.
 */
class LWNprevBridge extends BridgeAbstract{

    public $maintainer = 'Pierre Mazière';
    public $name = 'LWN Free Weekly Edition';
    public $uri = 'https://lwn.net/free/bigpage';
    public $description = 'LWN Free Weekly Edition available one week late';

    /**
     * Moves $node forward to the first element node found at or after it
     * among its siblings.
     *
     * Text, comment and any other non-element nodes are skipped.
     *
     * @param DOMNode|null $node Starting node, updated in place; set to null
     *                           when no element sibling exists.
     */
    private function jumpToNextTag(&$node){
        while($node && $node->nodeType!==XML_ELEMENT_NODE){
            $node=$node->nextSibling;
        }
    }

    /**
     * Moves $node backward to the first element node found at or before it
     * among its siblings.
     *
     * Text, comment and any other non-element nodes are skipped.
     *
     * @param DOMNode|null $node Starting node, updated in place; set to null
     *                           when no element sibling exists.
     */
    private function jumpToPreviousTag(&$node){
        while($node && $node->nodeType!==XML_ELEMENT_NODE){
            $node=$node->previousSibling;
        }
    }

    /**
     * Returns the "class" attribute of a node, or '' when the node is not an
     * element (or is null), so callers can switch on it without guarding.
     *
     * @param DOMNode|null $node
     * @return string
     */
    private function getElementClass($node){
        return ($node instanceof DOMElement) ? $node->getAttribute('class') : '';
    }

    /**
     * Downloads the raw HTML of the weekly edition, going through PROXY_URL
     * when that constant is defined.
     *
     * @return string Page markup.
     */
    private function fetchPage(){
        $context=null;
        if(defined('PROXY_URL')){
            $context=stream_context_create(array(
                'http' => array(
                    'proxy' => PROXY_URL,
                    'request_fulluri' => true,
                ),
            ));
        }

        $page=file_get_contents($this->uri, false, $context);
        if($page===false || $page===''){
            // returnServerError() is inherited from BridgeAbstract;
            // presumably it aborts the request -- TODO confirm.
            $this->returnServerError('No results for LWNprev');
        }

        return $page;
    }

    /**
     * Builds the absolute URI of the multi-page version of the edition from
     * the "Multi-page format" link.
     *
     * @param DOMDocument $html    Parsed page.
     * @param string      $baseURI Site root prepended to the link's href.
     * @return string The multi-page URI, or the bridge URI when the link is
     *                not present on the page.
     */
    private function findMultiPageURI($html, $baseURI){
        foreach($html->getElementsByTagName('a') as $anchor){
            if($anchor->textContent==='Multi-page format'){
                return $baseURI.$anchor->getAttribute('href');
            }
        }

        return $this->uri;
    }

    /**
     * Extracts the edition date from the first <h1>, i.e. whatever follows
     * the first occurrence of "for " in its text.
     *
     * @param DOMDocument $html Parsed page.
     * @return int Unix timestamp of the edition; the current time when the
     *             heading is missing or its date cannot be parsed.
     */
    private function parseEditionTimestamp($html){
        $heading=$html->getElementsByTagName('h1')->item(0);
        if($heading){
            $edition=$heading->textContent;
            $marker='for ';
            $position=strpos($edition, $marker);
            if($position!==false){
                $timestamp=strtotime(substr($edition, $position+strlen($marker)));
                if($timestamp!==false){
                    return $timestamp;
                }
            }
        }

        return time();
    }

    /**
     * Determines the author of an article from the byline element that
     * follows its heading.
     *
     * @param DOMElement $h2 Article heading.
     * @return string Author name, 'LWN' when no usable byline is found.
     */
    private function extractAuthor($h2){
        $byline=$h2->nextSibling;
        $this->jumpToNextTag($byline);

        switch($this->getElementClass($byline)){
            case 'FeatureByline':
                // The author name is the first bold element of the byline.
                $bold=$byline->getElementsByTagName('b')->item(0);
                if($bold){
                    return $bold->textContent;
                }
                break;
            case 'GAByline':
                // Keep only what follows "by "; when the marker is absent
                // the whole byline text is used.
                $text=$byline->textContent;
                $marker='by ';
                $position=strpos($text, $marker);
                if($position!==false){
                    $text=substr($text, $position+strlen($marker));
                }
                $text=trim($text);
                if($text!==''){
                    return $text;
                }
                break;
        }

        return 'LWN';
    }

    /**
     * Tells whether a node starts a new section, i.e. is an <h2> element or
     * an element whose class is Cat1HL or Cat2HL.
     *
     * @param DOMNode $node
     * @return bool
     */
    private function isSectionBoundary($node){
        if(!($node instanceof DOMElement)){
            return false;
        }

        return $node->nodeName==='h2'
            || in_array($node->getAttribute('class'), array('Cat1HL','Cat2HL'), true);
    }

    /**
     * Concatenates the canonical XML (C14N) of every sibling following the
     * heading, up to the next section boundary or the end of the parent.
     *
     * @param DOMElement $h2 Article heading.
     * @return string Article markup.
     */
    private function extractContent($h2){
        $body='';
        for($node=$h2->nextSibling; $node && !$this->isSectionBoundary($node); $node=$node->nextSibling){
            $body.=$node->C14N();
        }

        return $body;
    }

    /**
     * Fetches the weekly edition and appends one entry to $this->items per
     * article, with the keys author, uri, timestamp, title and content.
     */
    public function collectData(){
        // Because the LWN page is written in loose HTML and not XHTML,
        // Simple HTML Dom is not accurate enough for the job
        $page=$this->fetchPage();

        // Silence parser warnings caused by the loose markup, then put the
        // libxml error mode back the way it was found.
        $previousErrorMode=libxml_use_internal_errors(true);
        $html=new DOMDocument();
        $html->loadHTML($page);
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);

        $baseURI='https://lwn.net';
        $realURI=$this->findMultiPageURI($html, $baseURI);
        $editionTimeStamp=$this->parseEditionTimestamp($html);

        // Categories persist from one article to the next until a new
        // category heading is met.
        $cat1='';
        $cat2='';
        $URICounter=0;

        foreach($html->getElementsByTagName('h2') as $h2){
            if($h2->getAttribute('class')!=='SummaryHL'){
                continue;
            }

            $item=array();
            $item['author']=$this->extractAuthor($h2);

            // Headings wrapping a link point to a dedicated article page;
            // the others get a numbered fragment on the multi-page edition.
            $link=$h2->firstChild;
            $this->jumpToNextTag($link);
            if($link && $link->nodeName==='a'){
                $item['uri']=$baseURI.$link->getAttribute('href');
            }else{
                $item['uri']=$realURI.'#'.$URICounter;
            }
            $URICounter++;

            // Offset by the (already incremented) counter so that every
            // item of the edition gets a distinct timestamp.
            $item['timestamp']=$editionTimeStamp+$URICounter;

            $previous=$h2->previousSibling;
            $this->jumpToPreviousTag($previous);
            switch($this->getElementClass($previous)){
                case 'Cat2HL':
                    $cat2=$previous->textContent;
                    // A sub-category may itself directly follow its
                    // top-level category heading.
                    $previous=$previous->previousSibling;
                    $this->jumpToPreviousTag($previous);
                    if($this->getElementClass($previous)==='Cat1HL'){
                        $cat1=$previous->textContent;
                    }
                    break;
                case 'Cat1HL':
                    $cat1=$previous->textContent;
                    $cat2='';
                    break;
                default:
                    break;
            }

            $item['title']='';
            if(!empty($cat1)){
                $item['title'].='['.$cat1.($cat2?'/'.$cat2:'').'] ';
            }
            $item['title'].=$h2->textContent;

            $item['content']=$this->extractContent($h2);

            $this->items[]=$item;
        }
    }

    /**
     * @return int Cache lifetime in seconds.
     */
    public function getCacheDuration(){
        return 604800; // one week
    }
}
|