Browse Source

Merge pull request #338 from LogMANOriginal/WikipediaBridge

Wikipedia bridge
Mitsu 7 years ago
parent
commit
73dc0efac6

+ 304 - 0
bridges/WikipediaBridge.php

@@ -0,0 +1,304 @@
+<?php
+
+// Internal subject identifiers: collectData() maps the 'subject' request
+// parameter to one of these and passes it on to the GetContents<XX>() methods.
+define('WIKIPEDIA_SUBJECT_TFA', 0); // Today's featured article
+define('WIKIPEDIA_SUBJECT_DYK', 1); // Did you know...
+
+class WikipediaBridge extends BridgeAbstract{
+	/**
+	* Sets the metadata of this bridge (maintainer, name, URI, description and
+	* update date) and registers its parameters.
+	*
+	* The parameters are appended as one JSON string describing three inputs:
+	* 'language' (list: en, de, fr, eo), 'subject' (list: tfa, dyk) and
+	* 'fullarticle' (checkbox). CheckLanguageCode() decodes this JSON again to
+	* validate the requested language.
+	*/
+	public function loadMetadatas(){
+		$this->maintainer = 'logmanoriginal';
+		$this->name = 'Wikipedia bridge for many languages';
+		$this->uri = 'https://www.wikipedia.org/';
+		$this->description = 'Returns articles for a language of your choice';
+		$this->update = '2016-08-07';
+
+		// Single-quoted PHP string holding JSON, hence the escaped apostrophes (\')
+		$this->parameters[] = 
+		'[
+			{
+				"name": "Language",
+				"identifier": "language",
+				"type": "list",
+				"required": "true",
+				"title": "Select your language",
+				"exampleValue": "English",
+				"values": [
+					{
+						"name": "English",
+						"value": "en"
+					},
+					{
+						"name": "German",
+						"value": "de"	
+					},
+					{
+						"name": "French",
+						"value": "fr"
+					},
+					{
+						"name": "Esperanto",
+						"value": "eo"
+					}
+				]
+			},
+			{
+				"name": "Subject",
+				"identifier": "subject",
+				"type": "list",
+				"required": "true",
+				"title": "What subject are you interested in?",
+				"exampleValue": "Today\'s featured article",
+				"values": [
+					{
+						"name": "Today\'s featured article",
+						"value": "tfa"
+					},
+					{
+						"name": "Did you know...",
+						"value": "dyk"
+					}
+				]
+			},
+			{
+				"name": "Load full article",
+				"identifier": "fullarticle",
+				"type": "checkbox",
+				"required": "false",
+				"title": "Activate to always load the full article",
+				"exampleValue": "false"
+			}
+		]';
+	}
+
+	/**
+	* Collects the feed items for the requested language and subject.
+	*
+	* Validates the request parameters, points the URI and name of this bridge
+	* at the selected Wikipedia, loads its main page and hands the page to the
+	* matching language-specific GetContents<XX>() method, which creates the items.
+	*
+	* @param $params Request parameters:
+	*  - 'language' (required): one of the language codes listed in the parameters
+	*  - 'subject' (required): 'tfa' or 'dyk', any other value is treated as 'tfa'
+	*  - 'fullarticle' (optional): 'on' to load the full article for each item
+	*/
+	public function collectData(array $params){
+		if(!isset($params['language']) || !is_string($params['language']))
+			$this->returnError('You must specify a valid language via \'&language=\'!', 400);
+
+		// Normalize once, the code is needed for validation, the host name and the feed name
+		$language = strtolower($params['language']);
+
+		if(!$this->CheckLanguageCode($language))
+			$this->returnError('The language code you provided (\'' . $params['language'] . '\') is not supported!', 400);
+
+		if(!isset($params['subject']))
+			$this->returnError('You must specify a valid subject via \'&subject=\'!', 400);
+
+		// Everything but 'dyk' (including unknown values) means today's featured article
+		$subject = ($params['subject'] === 'dyk') ? WIKIPEDIA_SUBJECT_DYK : WIKIPEDIA_SUBJECT_TFA;
+
+		$fullArticle = isset($params['fullarticle']) && $params['fullarticle'] === 'on';
+
+		// We store the correct URI as URI of this bridge (so it can be used later!)
+		$host = $language . '.wikipedia.org';
+		$this->uri = 'https://' . $host;
+
+		// While we are at it let's also update the name for the feed
+		if($subject === WIKIPEDIA_SUBJECT_DYK)
+			$this->name = 'Did you know? - articles from ' . $host;
+		else
+			$this->name = 'Today\'s featured article from ' . $host;
+
+		// This will automatically send us to the correct main page in any language (try it!)
+		$html = $this->file_get_html($this->uri . '/wiki');
+
+		if(!$html)
+			$this->returnError('Could not load site: ' . $this->uri . '!', 404);
+
+		/*
+		* Now read content depending on the language (make sure to create one function per language!)
+		* We build the function name automatically, just make sure you create a private function ending
+		* with your desired language code, where the language code is upper case! (en -> GetContentsEN).
+		*/
+		$function = 'GetContents' . strtoupper($language);
+
+		if(!method_exists($this, $function))
+			$this->returnError('A function to get the contents for your language is missing (\'' . $function . '\')!', 501);
+
+		/*
+		* The method takes care of creating all items.
+		*/
+		$this->$function($html, $subject, $fullArticle);
+	}
+
+	/**
+	* Returns true if the language code is part of the parameters list
+	*
+	* The list is read from the 'values' of the first entry of the JSON stored
+	* in $this->parameters[0]. The comparison is strict and case sensitive.
+	*
+	* @param $languageCode The language code to look for (e.g. 'en')
+	* @return True if the code is listed, false otherwise (also if the
+	* parameter JSON cannot be decoded)
+	*/
+	private function CheckLanguageCode($languageCode){
+		$parameter = json_decode($this->parameters[0], true);
+
+		// json_decode() returns null for malformed JSON, in that case nothing is supported
+		if(!is_array($parameter)
+		|| !isset($parameter[0]['values'])
+		|| !is_array($parameter[0]['values']))
+			return false;
+
+		foreach($parameter[0]['values'] as $language){
+			if(isset($language['value']) && $language['value'] === $languageCode)
+				return true;
+		}
+
+		return false;
+	}
+
+	/**
+	* Replaces all root-relative URIs in anchors with absolute ones
+	*
+	* Only links starting with a single slash ('/wiki/...') are prefixed with
+	* the URI of this bridge. Protocol-relative links ('//host/...') already
+	* name their host and are left untouched.
+	*
+	* @param $element A simplehtmldom element
+	* @return The $element->innertext with all URIs replaced
+	*/
+	private function ReplaceURIInHTMLElement($element){
+		$base = $this->uri;
+
+		// A callback is used so the URI is never interpreted as a replacement pattern
+		return preg_replace_callback('#href="/(?!/)#', function($matches) use ($base){
+			return 'href="' . $base . '/';
+		}, $element->innertext);
+	}
+
+	/**
+	* Adds a new item to $items using a generic operation (should work for most (all?) wikis)
+	*
+	* @param $element The simplehtmldom element containing today's featured
+	* article (the result of a find(), so it may be null if the section is missing)
+	* @param $fullArticle True to load the full article as content, false to
+	* use the (tag-reduced) teaser from the main page
+	*/
+	private function AddTodaysFeaturedArticleGeneric($element, $fullArticle){
+		if(!$element)
+			$this->returnError('Could not find today\'s featured article on ' . $this->uri . '!', 404);
+
+		// Clean the bottom of the featured article (if there is anything to clean)
+		$footer = $element->find('div', -1);
+		if($footer)
+			$footer->outertext = '';
+
+		// The title and URI of the article is best defined in an anchor containing the string '...' ('full article ...')
+		$target = $element->find('p/a', 0); // We'll use the first anchor as fallback
+		foreach($element->find('//a') as $anchor){
+			if(strpos($anchor->innertext, '...') !== false){
+				$target = $anchor;
+				break;
+			}
+		}
+
+		if(!$target)
+			$this->returnError('Could not find a link to today\'s featured article on ' . $this->uri . '!', 404);
+
+		$item = new \Item();
+		$item->uri = $this->uri . $target->href;
+		$item->title = $target->title;
+
+		if(!$fullArticle)
+			$item->content = strip_tags($this->ReplaceURIInHTMLElement($element), '<a><p><br><img>');
+		else
+			$item->content = $this->LoadFullArticle($item->uri);
+
+		$this->items[] = $item;
+	}
+
+	/**
+	* Adds one item per 'Did you know' entry to $items using a generic
+	* operation (should work for most (all?) wikis)
+	*
+	* The entries are the list items of the first unordered list in $element.
+	* Entries without any anchor are skipped, as there is nothing to link to.
+	*
+	* @param $element The simplehtmldom element containing the 'Did you know'
+	* list (the result of a find(), so it may be null if the section is missing)
+	* @param $fullArticle True to load the full article as content, false to
+	* use the entry itself
+	*/
+	private function AddDidYouKnowGeneric($element, $fullArticle){
+		$list = $element ? $element->find('ul', 0) : null;
+
+		if(!$list)
+			$this->returnError('Could not find the \'Did you know\' list on ' . $this->uri . '!', 404);
+
+		foreach($list->find('li') as $entry){
+			// We can only use the first anchor, there is no way of finding the 'correct' one if there are multiple
+			$anchor = $entry->find('a', 0);
+			if(!$anchor)
+				continue;
+
+			$item = new \Item();
+			$item->uri = $this->uri . $anchor->href;
+			$item->title = strip_tags($entry->innertext);
+
+			if(!$fullArticle)
+				$item->content = $this->ReplaceURIInHTMLElement($entry);
+			else
+				$item->content = $this->LoadFullArticle($item->uri);
+
+			$this->items[] = $item;
+		}
+	}
+
+	/**
+	* Loads the full article from a given URI
+	*
+	* The table of contents ('#toc') and all reference lists ('ol.references')
+	* are removed from the article. Root-relative links ('/wiki/...') are made
+	* absolute, protocol-relative links ('//host/...') are left untouched.
+	*
+	* @param $uri The absolute URI of the article
+	* @return The inner HTML of the '#mw-content-text' element of the article
+	*/
+	private function LoadFullArticle($uri){
+		$content_html = $this->file_get_html($uri);
+
+		if(!$content_html)
+			$this->returnError('Could not load site: ' . $uri . '!', 404);
+
+		$content = $content_html->find('#mw-content-text', 0);
+
+		if(!$content)
+			$this->returnError('Could not find content in page: ' . $uri . '!', 404);
+
+		// Let's remove a couple of things from the article
+		$table = $content->find('#toc', 0); // Table of contents
+		if($table)
+			$table->outertext = '';
+
+		foreach($content->find('ol.references') as $reference) // References
+			$reference->outertext = '';
+
+		$base = $this->uri;
+
+		// A callback is used so the URI is never interpreted as a replacement pattern
+		return preg_replace_callback('#href="/(?!/)#', function($matches) use ($base){
+			return 'href="' . $base . '/';
+		}, $content->innertext);
+	}
+
+	/**
+	* Creates the items for de.wikipedia.org
+	*
+	* @param $html The simplehtmldom document of the main page
+	* @param $subject One of the WIKIPEDIA_SUBJECT_* constants, other values are ignored
+	* @param $fullArticle Passed on to the generic item builders
+	*/
+	private function GetContentsDE($html, $subject, $fullArticle){
+		if($subject == WIKIPEDIA_SUBJECT_TFA){
+			// Today's featured article
+			$featured = $html->find('div[id=mf-tfa]', 0);
+			$this->AddTodaysFeaturedArticleGeneric($featured, $fullArticle);
+			return;
+		}
+
+		if($subject == WIKIPEDIA_SUBJECT_DYK){
+			// Did you know...
+			$didYouKnow = $html->find('div[id=mf-dyk]', 0);
+			$this->AddDidYouKnowGeneric($didYouKnow, $fullArticle);
+		}
+	}
+
+	/**
+	* Creates the items for fr.wikipedia.org
+	*
+	* @param $html The simplehtmldom document of the main page
+	* @param $subject One of the WIKIPEDIA_SUBJECT_* constants, other values are ignored
+	* @param $fullArticle Passed on to the generic item builders
+	*/
+	private function GetContentsFR($html, $subject, $fullArticle){
+		if($subject == WIKIPEDIA_SUBJECT_TFA){
+			// Today's featured article
+			$featured = $html->find('div[id=accueil-lumieresur]', 0);
+			$this->AddTodaysFeaturedArticleGeneric($featured, $fullArticle);
+			return;
+		}
+
+		if($subject == WIKIPEDIA_SUBJECT_DYK){
+			// Did you know...
+			$didYouKnow = $html->find('div[id=SaviezVous]', 0);
+			$this->AddDidYouKnowGeneric($didYouKnow, $fullArticle);
+		}
+	}
+
+	/**
+	* Creates the items for en.wikipedia.org
+	*
+	* @param $html The simplehtmldom document of the main page
+	* @param $subject One of the WIKIPEDIA_SUBJECT_* constants, other values are ignored
+	* @param $fullArticle Passed on to the generic item builders
+	*/
+	private function GetContentsEN($html, $subject, $fullArticle){
+		if($subject == WIKIPEDIA_SUBJECT_TFA){
+			// Today's featured article
+			$featured = $html->find('div[id=mp-tfa]', 0);
+			$this->AddTodaysFeaturedArticleGeneric($featured, $fullArticle);
+			return;
+		}
+
+		if($subject == WIKIPEDIA_SUBJECT_DYK){
+			// Did you know...
+			$didYouKnow = $html->find('div[id=mp-dyk]', 0);
+			$this->AddDidYouKnowGeneric($didYouKnow, $fullArticle);
+		}
+	}
+
+	/**
+	* Implementation for eo.wikipedia.org
+	*
+	* The 'Did you know' section has no id of its own on this wiki, so it is
+	* addressed by position (fifth cell of the fifth table of the content).
+	* An error is returned if that position cannot be resolved.
+	*
+	* @param $html The simplehtmldom document of the main page
+	* @param $subject One of the WIKIPEDIA_SUBJECT_* constants, other values are ignored
+	* @param $fullArticle Passed on to the generic item builders
+	*/
+	private function GetContentsEO($html, $subject, $fullArticle){
+		switch($subject){
+			case WIKIPEDIA_SUBJECT_TFA:
+				$element = $html->find('div[id=mf-artikolo-de-la-semajno]', 0);
+				$this->AddTodaysFeaturedArticleGeneric($element, $fullArticle);
+				break;
+			case WIKIPEDIA_SUBJECT_DYK:
+				// Resolve the chain step by step, each find() returns null if there is no match
+				$content = $html->find('div[id=mw-content-text]', 0);
+				$table = $content ? $content->find('table', 4) : null;
+				$element = $table ? $table->find('td', 4) : null;
+
+				if(!$element)
+					$this->returnError('Could not find the \'Did you know\' section on ' . $this->uri . '!', 404);
+
+				$this->AddDidYouKnowGeneric($element, $fullArticle);
+				break;
+			default:
+				break;
+		}
+	}
+}

+ 0 - 48
bridges/WikipediaDEBridge.php

@@ -1,48 +0,0 @@
-<?php
-/**
- * Bridge returning the featured article ('div[id=mf-tfa]') of the
- * de.wikipedia.org main page as a single item. The item content is the full
- * article, loaded with a second request.
- */
-class WikipediaDEBridge extends BridgeAbstract{
-
-	// Sets the static metadata of this bridge.
-	// NOTE(review): the description names en.wikipedia.org although this bridge reads de.wikipedia.org - confirm
-	public function loadMetadatas() {
-
-		$this->maintainer = "cnlpete";
-		$this->name = "Wikipedia DE Today's Featured Article...";
-		$this->uri = "https://de.wikipedia.org/";
-		$this->description = "Returns the highlighted en.wikipedia.org article.";
-		$this->update = "2015-11-04";
-
-	}
-
-    // Loads the main page, takes title and link from the first anchor of the
-    // first paragraph of the featured article and adds one item for it.
-    // $param is not read.
-    public function collectData(array $param){
-        $html = '';
-        $host = 'http://de.wikipedia.org';
-        // If you want HTTPS access instead, uncomment the following line:
-        //$host = 'https://de.wikipedia.org';
-        $link = '/wiki/Wikipedia:Hauptseite';
-
-        $html = $this->file_get_html($host.$link) or $this->returnError('Could not request Wikipedia DE.', 404);
-
-        $element = $html->find('div[id=mf-tfa]', 0);
-        // Blank the last div inside the featured article box
-        $element->find('div', -1)->outertext = '';
-
-        $item = new \Item();
-        $item->uri = $host.$element->find('p', 0)->find('a', 0)->href;
-        $item->title = $element->find('p',0)->find('a',0)->title;
-
-        // Second request: the article itself becomes the item content
-        $html2 = $this->file_get_html($item->uri) or $this->returnError('Could not request Wikipedia DE '.$item->title.'.', 404);
-        $element2 = $html2->find('div[id=mw-content-text]', 0);
-        // Prefix links starting with '/' with the host
-        $item->content = str_replace('href="/', 'href="'.$host.'/', $element2->innertext);
-
-        $this->items[] = $item;
-    }
-
-    // Name of the feed
-    public function getName(){
-        return 'Wikipedia DE "Today\'s Featured Article"';
-    }
-
-    // URI of the page this bridge reads
-    public function getURI(){
-        return 'https://de.wikipedia.org/wiki/Wikipedia:Hauptseite';
-    }
-
-    // Cache duration in seconds
-    public function getCacheDuration(){
-        return 3600*8; // 8 hours
-    }
-}

+ 0 - 44
bridges/WikipediaENBridge.php

@@ -1,44 +0,0 @@
-<?php
-/**
- * Bridge returning the featured article ('div[id=mp-tfa]') of the
- * en.wikipedia.org main page as a single item. The item content is the
- * teaser shown on the main page.
- */
-class WikipediaENBridge extends BridgeAbstract{
-
-	// Sets the static metadata of this bridge
-	public function loadMetadatas() {
-
-		$this->maintainer = "gsurrel";
-		$this->name = "Wikipedia EN 'Today's Featured Article...'";
-		$this->uri = "https://en.wikipedia.org/";
-		$this->description = "Returns the highlighted en.wikipedia.org article.";
-		$this->update = "2014-05-25";
-
-	}
-
-    // Loads the main page, takes title and link from the first anchor of the
-    // first paragraph of the featured article and adds one item for it.
-    // $param is not read.
-    public function collectData(array $param){
-        $html = '';
-        $host = 'http://en.wikipedia.org';
-        // If you want HTTPS access instead, uncomment the following line:
-        //$host = 'https://en.wikipedia.org';
-        $link = '/wiki/Main_Page';
-
-        $html = $this->file_get_html($host.$link) or $this->returnError('Could not request Wikipedia EN.', 404);
-
-		$element = $html->find('div[id=mp-tfa]', 0);
-		// Clean the bottom of the featured article
-		$element->find('div', -1)->outertext = '';
-		$item = new \Item();
-		$item->uri = $host.$element->find('p', 0)->find('a', 0)->href;
-		$item->title = $element->find('p',0)->find('a',0)->title;
-		// Prefix links starting with '/' with the host
-		$item->content = str_replace('href="/', 'href="'.$host.'/', $element->innertext);
-		$this->items[] = $item;
-    }
-
-    // Name of the feed
-    // NOTE(review): 'Featued' looks like a misspelling of 'Featured' - confirm
-    public function getName(){
-        return 'Wikipedia EN "Today\'s Featued Article"';
-    }
-
-    // URI of the page this bridge reads
-    public function getURI(){
-        return 'https://en.wikipedia.org/wiki/Main_Page';
-    }
-
-    // Cache duration in seconds
-    public function getCacheDuration(){
-        return 3600*4; // 4 hours
-    }
-}

+ 0 - 44
bridges/WikipediaEOBridge.php

@@ -1,44 +0,0 @@
-<?php
-/**
- * Bridge returning the article of the week
- * ('div[id=mf-artikolo-de-la-semajno]') of the eo.wikipedia.org main page as
- * a single item. The item content is the teaser shown on the main page.
- */
-class WikipediaEOBridge extends BridgeAbstract{
-
-	// Sets the static metadata of this bridge
-	public function loadMetadatas() {
-
-		$this->maintainer = "gsurrel";
-		$this->name = "Wikipedia EO 'Artikolo de la semajno'";
-		$this->uri = "https://eo.wikipedia.org/";
-		$this->description = "Returns the highlighted eo.wikipedia.org article.";
-		$this->update = "2014-05-25";
-
-	}
-
-    // Loads the main page and adds one item for the article of the week.
-    // The link is the first anchor of the fourth paragraph, the title is the
-    // first <i> element of the first paragraph. $param is not read.
-    public function collectData(array $param){
-        $html = '';
-        $host = 'http://eo.wikipedia.org';
-        // If you want HTTPS access instead, uncomment the following line:
-        //$host = 'https://eo.wikipedia.org';
-        $link = '/wiki/Vikipedio:%C4%88efpa%C4%9Do';
-
-        $html = $this->file_get_html($host.$link) or $this->returnError('Could not request Wikipedia EO.', 404);
-
-		$element = $html->find('div[id=mf-artikolo-de-la-semajno]', 0);
-		// Link to article
-		$link = $element->find('p', 3)->find('a', 0);
-		$item = new \Item();
-		$item->uri = $host.$link->href;
-		$item->title = $element->find('p',0)->find('i',0)->innertext;
-		// Prefix links starting with '/' with the host
-		$item->content = str_replace('href="/', 'href="'.$host.'/', $element->innertext);
-		$this->items[] = $item;
-    }
-
-    // Name of the feed
-    public function getName(){
-        return 'Wikipedia EO "Artikolo de la semajno"';
-    }
-
-    // URI of the page this bridge reads
-    public function getURI(){
-        return 'https://eo.wikipedia.org/wiki/Vikipedio:%C4%88efpa%C4%9Do';
-    }
-
-    // Cache duration in seconds
-    public function getCacheDuration(){
-        return 3600*12; // 12 hours
-    }
-}

+ 0 - 46
bridges/WikipediaFRBridge.php

@@ -1,46 +0,0 @@
-<?php
-/**
- * Bridge returning the highlighted article ('div[id=mf-lumieresur]') of the
- * fr.wikipedia.org main page as a single item. The item content is the
- * teaser shown on the main page.
- */
-class WikipediaFRBridge extends BridgeAbstract{
-
-	// Sets the static metadata of this bridge
-	public function loadMetadatas() {
-
-		$this->maintainer = "gsurrel";
-		$this->name = "Wikipedia FR 'Lumière sur...'";
-		$this->uri = "https://fr.wikipedia.org/";
-		$this->description = "Returns the highlighted fr.wikipedia.org article.";
-		$this->update = "2016-06-04";
-
-	}
-
-    // Loads the main page and adds one item for the highlighted article.
-    // Title and link are taken from the "Lire la suite" anchor.
-    // $param is not read.
-    public function collectData(array $param){
-        $html = '';
-        $host = 'http://fr.wikipedia.org';
-        // If you want HTTPS access instead, uncomment the following line:
-        //$host = 'https://fr.wikipedia.org';
-        $link = '/wiki/Wikip%C3%A9dia:Accueil_principal';
-
-        $html = $this->file_get_html($host.$link) or $this->returnError('Could not request Wikipedia FR.', 404);
-
-		$element = $html->find('div[id=mf-lumieresur]', 0);
-		# Use the "Lire la suite" link to dependably get the title of the article
-		# usually it's a child of a li.BA element (Bon article)
-		# occasionally it's a li.AdQ (Article de qualité)
-		$lirelasuite_link = $element->find('.BA > i > a, .AdQ > i > a', 0);
-		$item = new \Item();
-		$item->uri = $host.$lirelasuite_link->href;
-		$item->title = $lirelasuite_link->title;
-		// Prefix links starting with '/' with the host
-		$item->content = str_replace('href="/', 'href="'.$host.'/', $element->innertext);
-		$this->items[] = $item;
-    }
-
-    // Name of the feed
-    public function getName(){
-        return 'Wikipedia FR "Lumière sur..."';
-    }
-
-    // URI of the page this bridge reads
-    public function getURI(){
-        return 'https://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Accueil_principal';
-    }
-
-    // Cache duration in seconds
-    public function getCacheDuration(){
-        return 3600*4; // 4 hours
-    }
-}