123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145 |
- <?php
/**
 * Bridge turning the LWN "Free Weekly Edition" big page into feed items.
 *
 * The page is fetched from getURI(), parsed with DOMDocument, and every
 * <h2 class="SummaryHL"> heading becomes one item. The siblings following a
 * heading (up to the next <h2> or category heading) form the item content;
 * the siblings preceding it may carry category headings (classes "Cat1HL"
 * and "Cat2HL") which are prepended to the item title.
 */
class LWNprevBridge extends BridgeAbstract{

	const MAINTAINER = 'Pierre Mazière';
	const NAME = 'LWN Free Weekly Edition';
	const URI = 'https://lwn.net/';
	const CACHE_TIMEOUT = 604800; // 1 week
	const DESCRIPTION = 'LWN Free Weekly Edition available one week late';

	/**
	 * @return string URI of the single-page ("big page") free edition
	 */
	public function getURI(){
		return self::URI . 'free/bigpage';
	}

	/**
	 * Moves $node forward over text nodes until a non-text sibling is reached.
	 *
	 * If the run of text nodes reaches the end of the sibling list, $node is
	 * left on the last text node; callers must therefore not assume that the
	 * result is an element.
	 *
	 * @param DOMNode|null $node node to advance, updated in place
	 */
	private function jumpToNextTag(&$node){
		while($node && $node->nodeType === XML_TEXT_NODE) {
			$nextNode = $node->nextSibling;
			if(!$nextNode) {
				break;
			}
			$node = $nextNode;
		}
	}

	/**
	 * Moves $node backward over text nodes until a non-text sibling is reached.
	 *
	 * If the run of text nodes reaches the start of the sibling list, $node is
	 * left on the first text node; callers must therefore not assume that the
	 * result is an element.
	 *
	 * @param DOMNode|null $node node to move back, updated in place
	 */
	private function jumpToPreviousTag(&$node){
		while($node && $node->nodeType === XML_TEXT_NODE) {
			$previousNode = $node->previousSibling;
			if(!$previousNode) {
				break;
			}
			$node = $previousNode;
		}
	}

	/**
	 * Returns the "class" attribute of a node without assuming it is an element.
	 *
	 * Only DOMElement provides getAttribute(); text nodes, comment nodes and
	 * null would trigger a fatal error if queried directly.
	 *
	 * @param DOMNode|null $node
	 * @return string the class attribute, or '' when absent or not an element
	 */
	private function getNodeClass($node){
		return $node instanceof DOMElement ? $node->getAttribute('class') : '';
	}

	/**
	 * Builds the URI used as base for items that have no link of their own.
	 *
	 * It is taken from the first anchor whose text is exactly
	 * 'Multi-page format'. When the page has no such anchor, the big page URI
	 * is used instead of picking up an unrelated link.
	 *
	 * @param DOMDocument $html parsed page
	 * @return string
	 */
	private function findEditionURI(DOMDocument $html){
		foreach($html->getElementsByTagName('a') as $a) {
			if($a->textContent === 'Multi-page format') {
				return self::URI . $a->getAttribute('href');
			}
		}

		return $this->getURI();
	}

	/**
	 * Extracts the edition date from the first <h1> of the page.
	 *
	 * The date is the part of the heading following the first 'for '; when
	 * that marker is missing the whole heading is handed to strtotime().
	 *
	 * @param DOMDocument $html parsed page
	 * @return int Unix timestamp, or 0 when there is no <h1> or no parsable date
	 */
	private function getEditionTimestamp(DOMDocument $html){
		$h1 = $html->getElementsByTagName('h1')->item(0);
		if(!$h1) {
			return 0;
		}

		$edition = $h1->textContent;
		$marker = 'for ';
		$markerPosition = strpos($edition, $marker);
		if($markerPosition !== false) {
			$edition = substr($edition, $markerPosition + strlen($marker));
		}

		$timestamp = strtotime($edition);

		return $timestamp === false ? 0 : $timestamp;
	}

	/**
	 * Determines the author of an article from the node following its heading.
	 *
	 * - class "FeatureByline": text of the first <b> element
	 * - class "GAByline": byline text starting at the first 'by ' (the marker
	 *   itself is kept), or the whole text when 'by ' does not occur
	 * - anything else (including no node at all): 'LWN'
	 *
	 * @param DOMNode|null $byline
	 * @return string
	 */
	private function extractAuthor($byline){
		switch($this->getNodeClass($byline)) {
			case 'FeatureByline':
				$bold = $byline->getElementsByTagName('b')->item(0);
				return $bold ? $bold->textContent : 'LWN';
			case 'GAByline':
				$text = $byline->textContent;
				$byPosition = strpos($text, 'by ');
				return $byPosition === false ? $text : substr($text, $byPosition);
			default:
				return 'LWN';
		}
	}

	/**
	 * Concatenates the canonical XML of every sibling following a heading,
	 * stopping at the next <h2> or at a category heading ("Cat1HL"/"Cat2HL").
	 *
	 * @param DOMNode $h2 article heading
	 * @return string
	 */
	private function collectArticleContent(DOMNode $h2){
		$content = '';

		for($node = $h2->nextSibling; $node; $node = $node->nextSibling) {
			$startsNextSection = $node->nodeName === 'h2'
				|| in_array($this->getNodeClass($node), array('Cat1HL', 'Cat2HL'), true);
			if($startsNextSection) {
				break;
			}
			$content .= $node->C14N();
		}

		return $content;
	}

	/**
	 * Fetches the big page and fills $this->items with one entry per
	 * <h2 class="SummaryHL"> heading (keys: author, uri, timestamp, title,
	 * content).
	 */
	public function collectData(){
		// Because the LWN page is written in loose HTML and not XHTML,
		// Simple HTML Dom is not accurate enough for the job
		$page = getContents($this->getURI())
			or returnServerError('No results for LWNprev');

		// Loose HTML makes libxml report many errors; collect them internally
		// and discard them instead of letting them surface as warnings.
		libxml_use_internal_errors(true);
		$html = new DOMDocument();
		$html->loadHTML($page);
		libxml_clear_errors();

		$realURI = $this->findEditionURI($html);
		$editionTimeStamp = $this->getEditionTimestamp($html);

		// Categories persist from one heading to the next until replaced.
		$cat1 = '';
		$cat2 = '';
		$URICounter = 0;

		foreach($html->getElementsByTagName('h2') as $h2) {
			if($h2->getAttribute('class') !== 'SummaryHL') {
				continue;
			}

			$item = array();

			$byline = $h2->nextSibling;
			$this->jumpToNextTag($byline);
			$item['author'] = $this->extractAuthor($byline);

			// Headings wrapping a link point to the article itself; the others
			// get a synthetic, per-item fragment on the edition URI.
			$headingLink = $h2->firstChild;
			$this->jumpToNextTag($headingLink);
			if($headingLink instanceof DOMElement && $headingLink->nodeName === 'a') {
				$item['uri'] = self::URI . $headingLink->getAttribute('href');
			} else{
				$item['uri'] = $realURI . '#' . $URICounter;
			}

			// The counter is added to the edition date so that every item of
			// an edition gets a distinct timestamp.
			$URICounter++;
			$item['timestamp'] = $editionTimeStamp + $URICounter;

			$previous = $h2->previousSibling;
			$this->jumpToPreviousTag($previous);
			switch($this->getNodeClass($previous)) {
				case 'Cat2HL':
					$cat2 = $previous->textContent;
					// A sub-category may be directly preceded by its category.
					$previous = $previous->previousSibling;
					$this->jumpToPreviousTag($previous);
					if($this->getNodeClass($previous) === 'Cat1HL') {
						$cat1 = $previous->textContent;
					}
					break;
				case 'Cat1HL':
					$cat1 = $previous->textContent;
					$cat2 = '';
					break;
				default:
					break;
			}

			$item['title'] = '';
			if(!empty($cat1)) {
				$item['title'] .= '[' . $cat1 . ($cat2 ? '/' . $cat2 : '') . '] ';
			}
			$item['title'] .= $h2->textContent;

			$item['content'] = $this->collectArticleContent($h2);

			$this->items[] = $item;
		}
	}
}
|