From dc6928316b0740cc359b94ce83040285c76d9dcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pierre=20Mazi=C3=A8re?= Date: Sun, 19 Jun 2016 00:41:02 +0200 Subject: [PATCH 1/4] add bridge for LWN Free Weekly Edition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Pierre Mazière --- bridges/LWNprevBridge.php | 156 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 bridges/LWNprevBridge.php diff --git a/bridges/LWNprevBridge.php b/bridges/LWNprevBridge.php new file mode 100644 index 0000000..54fb71d --- /dev/null +++ b/bridges/LWNprevBridge.php @@ -0,0 +1,156 @@ +maintainer = 'Pierre Mazière'; + $this->name = 'LWN Free Weekly Edition'; + $this->uri = 'https://lwn.net/free/bigpage'; + $this->description = 'LWN Free Weekly Edition available one week late'; + $this->update = '2016-19-01'; + + } + + private function jumpToNextTag(&$node){ + while($node && $node->nodeType===XML_TEXT_NODE){ + $nextNode=$node->nextSibling; + if(!$nextNode){ + break; + } + $node=$nextNode; + } + } + + private function jumpToPreviousTag(&$node){ + while($node && $node->nodeType===XML_TEXT_NODE){ + $previousNode=$node->previousSibling; + if(!$previousNode){ + break; + } + $node=$previousNode; + } + } + + public function collectData(array $param){ + // Because the LWN page is written in loose HTML and not XHTML, + // Simple HTML Dom is not accurate enough for the job + $html = advanced_file_get_contents('https://lwn.net/free/bigpage') + or $this->returnError('No results for LWNprev', 404); + + libxml_use_internal_errors(true); + $html=DOMDocument::loadHTML($html); + libxml_clear_errors(); + + $cat1=''; + $cat2=''; + + $realURI='https://lwn.net'; + foreach($html->getElementsByTagName('a') as $a){ + if($a->textContent==='Multi-page format'){ + break; + } + } + $realURI.=$a->getAttribute('href'); + $URICounter=0; + + $edition=$html->getElementsByTagName('h1')->item(0)->textContent; + $editionTimeStamp=strtotime( + substr($edition,strpos($edition,'for ')+strlen('for ')) + ); + + foreach($html->getElementsByTagName('h2') as $h2){ + if($h2->getAttribute('class')!=='SummaryHL'){ + continue; + } + + $item = new \Item(); + + $h2NextSibling=$h2->nextSibling; + $this->jumpToNextTag($h2NextSibling); + + switch($h2NextSibling->getAttribute('class')){ + case 'FeatureByline': + $item->name=$h2NextSibling->getElementsByTagName('b')->item(0)->textContent; + break; + case 'GAByline': + $text=$h2NextSibling->textContent; + $item->name=substr($text,strpos($text,'by ')); + break; + default: + $item->name='LWN'; + break; + }; + + $h2FirstChild=$h2->firstChild; + $this->jumpToNextTag($h2FirstChild); + if($h2FirstChild->tagName==='a'){ + $item->uri='https://lwn.net'.$h2FirstChild->getAttribute('href'); + }else{ + $item->uri=$realURI.'#'.$URICounter; + } + $URICounter++; + + $item->timestamp=$editionTimeStamp+$URICounter; + + $h2PrevSibling=$h2->previousSibling; + $this->jumpToPreviousTag($h2PrevSibling); + switch($h2PrevSibling->getAttribute('class')){ + case 'Cat2HL': + $cat2=$h2PrevSibling->textContent; + $h2PrevSibling=$h2PrevSibling->previousSibling; + $this->jumpToPreviousTag($h2PrevSibling); + if($h2PrevSibling->getAttribute('class')!=='Cat1HL'){ + break; + } + $cat1=$h2PrevSibling->textContent; + break; + case 'Cat1HL': + $cat1=$h2PrevSibling->textContent; + $cat2=''; + break; + default: + break; + } + $h2PrevSibling=null; + + $item->title='['.$cat1.($cat2?'/'.$cat2:'').'] '.$h2->textContent; + $node=$h2; + $content=''; + $contentEnd=false; + while(!$contentEnd){ + $node=$node->nextSibling; + if( + !$node || ( + $node->nodeType!==XML_TEXT_NODE && ( + $node->tagName==='h2' || + in_array($node->getAttribute('class'),array('Cat1HL','Cat2HL')) + ) + ) + ){ + $contentEnd=true; + }else{ + $content.=$node->C14N(); + } + } + $item->content=$content; + $this->items[]=$item; + } + } + + public function getName(){ + return 'LWN Free Weekly Edition'; + } + + public function getURI(){ + return 'https://lwn.net/free/bigpage'; + } + + public function getCacheDuration(){ + return 604800; // one week + } +} From 1f85a2294df31161a705404496e052836c9398ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pierre=20Mazi=C3=A8re?= Date: Sat, 25 Jun 2016 09:52:17 +0200 Subject: [PATCH 2/4] remove empty brackets from title MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Pierre Mazière --- bridges/LWNprevBridge.php | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bridges/LWNprevBridge.php b/bridges/LWNprevBridge.php index 54fb71d..44bc555 100644 --- a/bridges/LWNprevBridge.php +++ b/bridges/LWNprevBridge.php @@ -118,7 +118,12 @@ class LWNprevBridge extends BridgeAbstract{ } $h2PrevSibling=null; - $item->title='['.$cat1.($cat2?'/'.$cat2:'').'] '.$h2->textContent; + $item->title=''; + if(!empty($cat1)){ + $item->title.='['.$cat1.($cat2?'/'.$cat2:'').'] '; + } + $item->title.=$h2->textContent; + $node=$h2; $content=''; $contentEnd=false; From 78b4500ba445ad6e67b897684275640fce2d4786 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pierre=20Mazi=C3=A8re?= Date: Sun, 26 Jun 2016 11:17:12 +0200 Subject: [PATCH 3/4] fix indentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Pierre Mazière --- bridges/LWNprevBridge.php | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/bridges/LWNprevBridge.php b/bridges/LWNprevBridge.php index 44bc555..224a01b 100644 --- a/bridges/LWNprevBridge.php +++ b/bridges/LWNprevBridge.php @@ -1,9 +1,9 @@ jumpToNextTag($h2NextSibling); switch($h2NextSibling->getAttribute('class')){ - case 'FeatureByline': - $item->name=$h2NextSibling->getElementsByTagName('b')->item(0)->textContent; - break; - case 'GAByline': - $text=$h2NextSibling->textContent; - $item->name=substr($text,strpos($text,'by ')); - break; - default: - $item->name='LWN'; - break; + case 'FeatureByline': + $item->name=$h2NextSibling->getElementsByTagName('b')->item(0)->textContent; + break; + case 'GAByline': + $text=$h2NextSibling->textContent; + $item->name=substr($text,strpos($text,'by ')); + break; + default: + $item->name='LWN'; + break; }; $h2FirstChild=$h2->firstChild; @@ -137,7 +137,7 @@ class LWNprevBridge extends BridgeAbstract{ ) ) ){ - $contentEnd=true; + $contentEnd=true; }else{ $content.=$node->C14N(); } From d73bfbab63de63d00001519045b2725d10c76f49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pierre=20Mazi=C3=A8re?= Date: Sun, 26 Jun 2016 11:18:23 +0200 Subject: [PATCH 4/4] do not use advanced_file_get_contents MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit as it should not exist in the first place Signed-off-by: Pierre Mazière --- bridges/LWNprevBridge.php | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/bridges/LWNprevBridge.php b/bridges/LWNprevBridge.php index 224a01b..9772799 100644 --- a/bridges/LWNprevBridge.php +++ b/bridges/LWNprevBridge.php @@ -39,7 +39,20 @@ class LWNprevBridge extends BridgeAbstract{ public function collectData(array $param){ // Because the LWN page is written in loose HTML and not XHTML, // Simple HTML Dom is not accurate enough for the job - $html = advanced_file_get_contents('https://lwn.net/free/bigpage') + + $uri='https://lwn.net/free/bigpage'; + $context=null; + if(defined('PROXY_URL')) { + $context = array( + 'http' => array( + 'proxy' => PROXY_URL, + 'request_fulluri' => true, + ), + ); + $context = stream_context_create($context); + } + + $html=file_get_contents($uri, false, $context) or $this->returnError('No results for LWNprev', 404); libxml_use_internal_errors(true);