Parcourir la source

[LWNprevBridge] full rewrite

Signed-off-by: Pierre Mazière <pierre.maziere@gmx.com>
[logmanoriginal@users.noreply.github.com: Fix coding style]
Pierre Mazière il y a 6 ans
Parent
commit
873a91259f
1 fichier modifié avec 193 ajouts et 73 suppressions
  1. 193 73
      bridges/LWNprevBridge.php

+ 193 - 73
bridges/LWNprevBridge.php

@@ -6,8 +6,10 @@ class LWNprevBridge extends BridgeAbstract{
 	const CACHE_TIMEOUT = 604800; // 1 week
 	const DESCRIPTION = 'LWN Free Weekly Edition available one week late';
 
+	private $editionTimeStamp;
+
 	function getURI(){
-		return self::URI . 'free/bigpage';
+		return self::URI.'free/bigpage';
 	}
 
 	private function jumpToNextTag(&$node){
@@ -36,110 +38,228 @@ class LWNprevBridge extends BridgeAbstract{
 		$content = getContents($this->getURI())
 			or returnServerError('No results for LWNprev');
 
-		libxml_use_internal_errors(true);
-		$html = new DOMDocument();
-		$html->loadHTML($content);
-		libxml_clear_errors();
+		$contents = explode('<b>Page editor</b>', $content);
 
-		$cat1 = '';
-		$cat2 = '';
+		foreach($contents as $content) {
+			if(strpos($content, '<html>') === false) {
+				$content = <<<EOD
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html><head><title>LWN</title></head><body>{$content}</body></html>
+EOD;
+			} else {
+				$content = $content.'</body></html>';
+			}
 
-		foreach($html->getElementsByTagName('a') as $a) {
-			if($a->textContent === 'Multi-page format') {
-				break;
+			libxml_use_internal_errors(true);
+			$html = new DOMDocument();
+			$html->loadHTML($content);
+			libxml_clear_errors();
+
+			$edition = $html->getElementsByTagName('h1');
+			if($edition->length !== 0) {
+				$text = $edition->item(0)->textContent;
+				$this->editionTimeStamp = strtotime(
+					substr($text, strpos($text, 'for ') + strlen('for '))
+				);
+			}
+
+			if(strpos($content, 'Cat1HL') === false) {
+				$items = $this->getFeatureContents($html);
+			} elseif(strpos($content, 'Cat3HL') === false) {
+				$items = $this->getBriefItems($html);
+			} else {
+				$items = $this->getAnnouncements($html);
 			}
+
+			$this->items = array_merge($this->items, $items);
+		}
+	}
+
+	private function getArticleContent(&$title){
+		$link = $title->firstChild;
+		$this->jumpToNextTag($link);
+		$item['uri'] = self::URI;
+		if($link->nodeName === 'a') {
+			$item['uri'] .= $link->getAttribute('href');
 		}
-		$realURI = self::URI . $a->getAttribute('href');
-		$URICounter = 0;
 
-		$edition = $html->getElementsByTagName('h1')->item(0)->textContent;
-		$editionTimeStamp = strtotime(
-			substr($edition, strpos($edition, 'for ') + strlen('for '))
-		);
+		$item['timestamp'] = $this->editionTimeStamp;
+
+		$node = $title;
+		$content = '';
+		$contentEnd = false;
+		while(!$contentEnd) {
+			$node = $node->nextSibling;
+			if(!$node || (
+					$node->nodeType !== XML_TEXT_NODE &&
+					$node->nodeName === 'h2' || (
+						!is_null($node->attributes) &&
+						!is_null($class = $node->attributes->getNamedItem('class')) &&
+						in_array($class->nodeValue, array('Cat1HL','Cat2HL'))
+					)
+				)
+			) {
+				$contentEnd = true;
+			} else {
+				$content .= $node->C14N();
+			}
+		}
+		$item['content'] = $content;
+		return $item;
+	}
 
-		foreach($html->getElementsByTagName('h2') as $h2) {
-			if($h2->getAttribute('class') !== 'SummaryHL') {
+	private function getFeatureContents(&$html){
+		$items = array();
+		foreach($html->getElementsByTagName('h2') as $title) {
+			if($title->getAttribute('class') !== 'SummaryHL') {
 				continue;
 			}
 
 			$item = array();
 
-			$h2NextSibling = $h2->nextSibling;
-			$this->jumpToNextTag($h2NextSibling);
+			$author = $title->nextSibling;
+			$this->jumpToNextTag($author);
+			if($author->getAttribute('class') === 'FeatureByline') {
+				$item['author'] = $author->getElementsByTagName('b')->item(0)->textContent;
+			} else {
+				continue;
+			}
 
-			switch($h2NextSibling->getAttribute('class')) {
-			case 'FeatureByline':
-				$item['author'] = $h2NextSibling->getElementsByTagName('b')->item(0)->textContent;
-				break;
-			case 'GAByline':
-				$text = $h2NextSibling->textContent;
-				$item['author'] = substr($text, strpos($text, 'by '));
-				break;
-			default:
-				$item['author'] = 'LWN';
-				break;
-			};
-
-			$h2FirstChild = $h2->firstChild;
-			$this->jumpToNextTag($h2FirstChild);
-			if($h2FirstChild->nodeName === 'a') {
-				$item['uri'] = self::URI . $h2FirstChild->getAttribute('href');
-			} else{
-				$item['uri'] = $realURI . '#' . $URICounter;
-			}
-			$URICounter++;
-
-			$item['timestamp'] = $editionTimeStamp + $URICounter;
-
-			$h2PrevSibling = $h2->previousSibling;
-			$this->jumpToPreviousTag($h2PrevSibling);
-			switch($h2PrevSibling->getAttribute('class')) {
-			case 'Cat2HL':
-				$cat2 = $h2PrevSibling->textContent;
-				$h2PrevSibling = $h2PrevSibling->previousSibling;
-				$this->jumpToPreviousTag($h2PrevSibling);
-				if($h2PrevSibling->getAttribute('class') !== 'Cat1HL') {
-					break;
-				}
-				$cat1 = $h2PrevSibling->textContent;
-				break;
-			case 'Cat1HL':
-				$cat1 = $h2PrevSibling->textContent;
-				$cat2 = '';
+			$item['title'] = $title->textContent;
+
+			$items[] = array_merge($item, $this->getArticleContent($title));
+		}
+		return $items;
+	}
+
+	private function getItemPrefix(&$cat, &$cats){
+		$cat1 = '';
+		$cat2 = '';
+		$cat3 = '';
+		switch($cat->getAttribute('class')) {
+		case 'Cat3HL':
+			$cat3 = $cat->textContent;
+			$cat = $cat->previousSibling;
+			$this->jumpToPreviousTag($cat);
+			$cats[2] = $cat3;
+			if($cat->getAttribute('class') !== 'Cat2HL') {
 				break;
-			default:
+			}
+		case 'Cat2HL':
+			$cat2 = $cat->textContent;
+			$cat = $cat->previousSibling;
+			$this->jumpToPreviousTag($cat);
+			$cats[1] = $cat2;
+			if(empty($cat3)) {
+				$cats[2] = '';
+			}
+			if($cat->getAttribute('class') !== 'Cat1HL') {
 				break;
 			}
-			$h2PrevSibling = null;
+		case 'Cat1HL':
+			$cat1 = $cat->textContent;
+			$cats[0] = $cat1;
+			if(empty($cat3)) {
+				$cats[2] = '';
+			}
+			if(empty($cat2)) {
+				$cats[1] = '';
+			}
+			break;
+		default:
+			break;
+		}
+
+		$prefix = '';
+		if(!empty($cats[0])) {
+			$prefix .= '['.$cats[0].($cats[1] ? '/'.$cats[1] : '').'] ';
+		}
+		return $prefix;
+	}
 
-			$item['title'] = '';
-			if(!empty($cat1)) {
-				$item['title'] .= '[' . $cat1 . ($cat2 ? '/' . $cat2 : '') . '] ';
+	private function getAnnouncements(&$html){
+		$items = array();
+		$cats = array('','','');
+
+		foreach($html->getElementsByTagName('p') as $newsletters) {
+			if($newsletters->getAttribute('class') !== 'Cat3HL') {
+				continue;
 			}
-			$item['title'] .= $h2->textContent;
 
-			$node = $h2;
+			$item = array();
+
+			$item['uri'] = self::URI.'#'.microtime(true);
+
+			$item['timestamp'] = $this->editionTimeStamp;//+$URICounter;
+
+			$item['author'] = 'LWN';
+
+			$cat = $newsletters->previousSibling;
+			$this->jumpToPreviousTag($cat);
+			$prefix = $this->getItemPrefix($cat, $cats);
+			$item['title'] = $prefix.' '.$newsletters->textContent;
+
+			$node = $newsletters;
 			$content = '';
 			$contentEnd = false;
 			while(!$contentEnd) {
 				$node = $node->nextSibling;
 				if(!$node || (
 						$node->nodeType !== XML_TEXT_NODE && (
-							$node->nodeName === 'h2' || (
-								!is_null($node->attributes) &&
-								!is_null($class = $node->attributes->getNamedItem('class')) &&
-								in_array($class->nodeValue, array('Cat1HL', 'Cat2HL'))
-							)
+							!is_null($node->attributes) &&
+							!is_null($class = $node->attributes->getNamedItem('class')) &&
+							in_array($class->nodeValue, array('Cat1HL','Cat2HL','Cat3HL'))
 						)
 					)
 				) {
 					$contentEnd = true;
-				} else{
+				} else {
 					$content .= $node->C14N();
 				}
 			}
 			$item['content'] = $content;
-			$this->items[] = $item;
+			$items[] = $item;
 		}
+
+		foreach($html->getElementsByTagName('h2') as $title) {
+			if($title->getAttribute('class') !== 'SummaryHL') {
+				continue;
+			}
+
+			$item = array();
+
+			$cat = $title->previousSibling;
+			$this->jumpToPreviousTag($cat);
+			$cat = $cat->previousSibling;
+			$this->jumpToPreviousTag($cat);
+			$prefix = $this->getItemPrefix($cat, $cats);
+			$item['title'] = $prefix.' '.$title->textContent;
+			$items[] = array_merge($item, $this->getArticleContent($title));
+		}
+
+		return $items;
+	}
+
+	private function getBriefItems(&$html){
+		$items = array();
+		$cats = array('','','');
+		foreach($html->getElementsByTagName('h2') as $title) {
+			if($title->getAttribute('class') !== 'SummaryHL') {
+				continue;
+			}
+
+			$item = array();
+
+			$cat = $title->previousSibling;
+			$this->jumpToPreviousTag($cat);
+			$cat = $cat->previousSibling;
+			$this->jumpToPreviousTag($cat);
+			$prefix = $this->getItemPrefix($cat, $cats);
+			$item['title'] = $prefix.' '.$title->textContent;
+			$items[] = array_merge($item, $this->getArticleContent($title));
+		}
+
+		return $items;
 	}
 }
+?>