2016-08-07 00:06:14 +02:00
|
|
|
<?php
|
2016-08-07 12:51:09 +02:00
|
|
|
|
2016-08-07 13:26:30 +02:00
|
|
|
define('WIKIPEDIA_SUBJECT_TFA', 0); // Subject id for "Today's featured article" (chosen by the 'tfa' subject value and used as the fallback subject)
|
2016-08-07 12:51:09 +02:00
|
|
|
define('WIKIPEDIA_SUBJECT_DYK', 1); // Subject id for "Did you know..." (chosen by the 'dyk' subject value)
|
|
|
|
|
2016-08-07 00:06:14 +02:00
|
|
|
/**
 * Bridge that turns sections of a Wikipedia main page into feed items.
 *
 * Two subjects are supported: "Today's featured article" and "Did you know...".
 * Each supported language has its own private GetContentsXX method (XX being the
 * upper-case language code) that knows where those sections live on that wiki.
 */
class WikipediaBridge extends BridgeAbstract{

    /**
     * Declares the bridge metadata and the parameters offered to the user:
     * the wiki language, the subject and whether full articles should be loaded.
     */
    public function loadMetadatas(){
        $this->maintainer = 'logmanoriginal';
        $this->name = 'Wikipedia bridge for many languages';
        $this->uri = 'https://www.wikipedia.org/';
        $this->description = 'Returns articles for a language of your choice';

        $this->parameters[] = array(
            'language'=>array(
                'name'=>'Language',
                'type'=>'list',
                'required'=>true,
                'title'=>'Select your language',
                'exampleValue'=>'English',
                'values'=>array(
                    'English'=>'en',
                    'Dutch'=>'nl',
                    'Esperanto'=>'eo',
                    'French'=>'fr',
                    'German'=>'de',
                )
            ),
            'subject'=>array(
                'name'=>'Subject',
                'type'=>'list',
                'required'=>true,
                'title'=>'What subject are you interested in?',
                'exampleValue'=>'Today\'s featured article',
                'values'=>array(
                    'Today\'s featured article'=>'tfa',
                    'Did you know…'=>'dyk'
                )
            ),
            'fullarticle'=>array(
                'name'=>'Load full article',
                'type'=>'checkbox',
                'title'=>'Activate to always load the full article'
            )
        );
    }

    /**
     * Validates the request parameters, points the bridge at the requested wiki,
     * loads its main page and delegates item creation to the language handler.
     *
     * Errors are reported through returnClientError / returnServerError; like the
     * rest of this class, this method relies on those calls aborting the request.
     */
    public function collectData(){
        $params = $this->parameters[$this->queriedContext];

        if(!isset($params['language']['value']))
            $this->returnClientError('You must specify a valid language via \'&language=\'!');

        // Normalize once; the lower-case code is used for validation, the host name and the handler name
        $language = strtolower($params['language']['value']);

        if(!$this->CheckLanguageCode($language))
            $this->returnClientError('The language code you provided (\'' . $params['language']['value'] . '\') is not supported!');

        if(!isset($params['subject']['value']))
            $this->returnClientError('You must specify a valid subject via \'&subject=\'!');

        // Every value other than 'dyk' (including 'tfa' and unknown values) selects the featured article
        $subject = ($params['subject']['value'] === 'dyk') ? WIKIPEDIA_SUBJECT_DYK : WIKIPEDIA_SUBJECT_TFA;

        $fullArticle = isset($params['fullarticle']['value']) ? $params['fullarticle']['value'] : false;

        // We store the correct URI as URI of this bridge (so it can be used later!)
        $host = $language . '.wikipedia.org';
        $this->uri = 'https://' . $host;

        // While we are at it, let's also update the name of the feed
        if($subject === WIKIPEDIA_SUBJECT_DYK)
            $this->name = 'Did you know? - articles from ' . $host;
        else
            $this->name = 'Today\'s featured article from ' . $host;

        /*
         * Content is read by one function per language. The function name is built
         * automatically: 'GetContents' followed by the upper-case language code
         * (en -> GetContentsEN). Check for it before doing any network request.
         */
        $function = 'GetContents' . strtoupper($language);

        if(!method_exists($this, $function))
            $this->returnServerError('A function to get the contents for your language is missing (\'' . $function . '\')!');

        // This will automatically send us to the correct main page in any language (try it!)
        $html = $this->getSimpleHTMLDOM($this->uri . '/wiki');

        if(!$html)
            $this->returnServerError('Could not load site: ' . $this->uri . '!');

        // The language handler takes care of creating all items
        $this->$function($html, $subject, $fullArticle);
    }

    /**
     * Returns true if the language code is part of the parameters list
     *
     * @param $languageCode Lower-case language code (e.g. 'en')
     * @return bool
     */
    private function CheckLanguageCode($languageCode){
        // Strict comparison: only the exact code strings declared in loadMetadatas() are accepted
        return in_array($languageCode, $this->parameters[0]['language']['values'], true);
    }

    /**
     * Turns root-relative and protocol-relative href attributes into absolute ones.
     *
     * Protocol-relative links (href="//host/path") are handled first and only get
     * the https scheme; prefixing them with the wiki URI would corrupt them.
     * Root-relative links (href="/path") are prefixed with the URI of the current wiki.
     *
     * @param $html HTML markup as a string
     * @return The markup with the href attributes rewritten
     */
    private function MakeLinksAbsolute($html){
        $html = str_replace('href="//', 'href="https://', $html);
        return str_replace('href="/', 'href="' . $this->uri . '/', $html);
    }

    /**
     * Walks a chain of find() calls and stops as soon as one step finds nothing.
     *
     * @param $root A simplehtmldom element (or null)
     * @param $steps List of array(selector, index) pairs, applied in order
     * @return The element found by the last step, or null if any step failed
     */
    private function FindNestedElement($root, $steps){
        $node = $root;

        foreach($steps as $step){
            if(!$node)
                return null;

            $node = $node->find($step[0], $step[1]);
        }

        return $node;
    }

    /**
     * Replaces all relative URIs with absolute ones
     * @param $element A simplehtmldom element
     * @return The $element->innertext with all URIs replaced
     */
    private function ReplaceURIInHTMLElement($element){
        return $this->MakeLinksAbsolute($element->innertext);
    }

    /**
     * Adds a new item to $items using a generic operation (should work for most (all?) wikis)
     *
     * @param $element The simplehtmldom element holding the featured article (may be null if not found)
     * @param $fullArticle If true the linked article is loaded and used as content
     * @param $anchorText Text identifying the 'read more' anchor; can be specified if the wiki
     *                    in question doesn't use '...' (like Dutch, French and Italian)
     * @param $anchorFallbackIndex Index of the fallback link if no anchor contains $anchorText
     *                             (e.g., -1 for the last)
     */
    private function AddTodaysFeaturedArticleGeneric($element, $fullArticle, $anchorText = '...', $anchorFallbackIndex = 0){
        if(!$element){
            $this->returnServerError('Could not find today\'s featured article on ' . $this->uri . '!');
            return;
        }

        // Clean the bottom of the featured article
        $footer = $element->find('div', -1);
        if($footer)
            $footer->outertext = '';

        // The title and URI of the article can be found in an anchor containing the string '...' in most wikis ('full article ...')
        $target = $element->find('p/a', $anchorFallbackIndex);
        foreach($element->find('//a') as $anchor){
            if(strpos($anchor->innertext, $anchorText) !== false){
                $target = $anchor;
                break;
            }
        }

        // Neither the 'read more' anchor nor the fallback link exists
        if(!$target){
            $this->returnServerError('Could not find a link to today\'s featured article on ' . $this->uri . '!');
            return;
        }

        $item = array();
        $item['uri'] = $this->uri . $target->href;
        $item['title'] = $target->title;

        if(!$fullArticle)
            $item['content'] = strip_tags($this->ReplaceURIInHTMLElement($element), '<a><p><br><img>');
        else
            $item['content'] = $this->LoadFullArticle($item['uri']);

        $this->items[] = $item;
    }

    /**
     * Adds one item per list entry to $items using a generic operation (should work for most (all?) wikis)
     *
     * @param $element The simplehtmldom element holding the 'Did you know' list (may be null if not found)
     * @param $fullArticle If true the linked article is loaded and used as content
     */
    private function AddDidYouKnowGeneric($element, $fullArticle){
        $list = $element ? $element->find('ul', 0) : null;

        if(!$list){
            $this->returnServerError('Could not find the \'Did you know\' section on ' . $this->uri . '!');
            return;
        }

        foreach($list->find('li') as $entry){
            // We can only use the first anchor, there is no way of finding the 'correct' one if there are multiple
            $anchor = $entry->find('a', 0);

            // An entry without any link has no article to point to, so it cannot become an item
            if(!$anchor)
                continue;

            $item = array();
            $item['uri'] = $this->uri . $anchor->href;
            $item['title'] = strip_tags($entry->innertext);

            if(!$fullArticle)
                $item['content'] = $this->ReplaceURIInHTMLElement($entry);
            else
                $item['content'] = $this->LoadFullArticle($item['uri']);

            $this->items[] = $item;
        }
    }

    /**
     * Loads the full article from a given URI
     *
     * @param $uri Absolute URI of the article
     * @return The article markup without table of contents and reference lists,
     *         with relative links made absolute
     */
    private function LoadFullArticle($uri){
        $content_html = $this->getSimpleHTMLDOM($uri);

        if(!$content_html)
            $this->returnServerError('Could not load site: ' . $uri . '!');

        $content = $content_html->find('#mw-content-text', 0);

        if(!$content)
            $this->returnServerError('Could not find content in page: ' . $uri . '!');

        // Let's remove a couple of things from the article
        $toc = $content->find('#toc', 0); // Table of contents (absent on short articles)
        if($toc)
            $toc->outertext = '';

        foreach($content->find('ol.references') as $reference) // References
            $reference->outertext = '';

        return $this->MakeLinksAbsolute($content->innertext);
    }

    /**
     * Implementation for de.wikipedia.org
     */
    private function GetContentsDE($html, $subject, $fullArticle){
        switch($subject){
            case WIKIPEDIA_SUBJECT_TFA:
                $element = $html->find('div[id=mf-tfa]', 0);
                $this->AddTodaysFeaturedArticleGeneric($element, $fullArticle);
                break;
            case WIKIPEDIA_SUBJECT_DYK:
                $element = $html->find('div[id=mf-dyk]', 0);
                $this->AddDidYouKnowGeneric($element, $fullArticle);
                break;
            default:
                break;
        }
    }

    /**
     * Implementation for fr.wikipedia.org
     */
    private function GetContentsFR($html, $subject, $fullArticle){
        switch($subject){
            case WIKIPEDIA_SUBJECT_TFA:
                $element = $html->find('div[id=accueil-lumieresur]', 0);
                $this->AddTodaysFeaturedArticleGeneric($element, $fullArticle, 'Lire la suite');
                break;
            case WIKIPEDIA_SUBJECT_DYK:
                $element = $html->find('div[id=SaviezVous]', 0);
                $this->AddDidYouKnowGeneric($element, $fullArticle);
                break;
            default:
                break;
        }
    }

    /**
     * Implementation for en.wikipedia.org
     */
    private function GetContentsEN($html, $subject, $fullArticle){
        switch($subject){
            case WIKIPEDIA_SUBJECT_TFA:
                $element = $html->find('div[id=mp-tfa]', 0);
                $this->AddTodaysFeaturedArticleGeneric($element, $fullArticle);
                break;
            case WIKIPEDIA_SUBJECT_DYK:
                $element = $html->find('div[id=mp-dyk]', 0);
                $this->AddDidYouKnowGeneric($element, $fullArticle);
                break;
            default:
                break;
        }
    }

    /**
     * Implementation for eo.wikipedia.org
     */
    private function GetContentsEO($html, $subject, $fullArticle){
        switch($subject){
            case WIKIPEDIA_SUBJECT_TFA:
                $element = $html->find('div[id=mf-artikolo-de-la-semajno]', 0);
                $this->AddTodaysFeaturedArticleGeneric($element, $fullArticle);
                break;
            case WIKIPEDIA_SUBJECT_DYK:
                // The section has no id of its own; it is addressed by position inside the content area
                $element = $this->FindNestedElement($html, array(
                    array('div[id=mw-content-text]', 0),
                    array('table', 4),
                    array('td', 4)
                ));
                $this->AddDidYouKnowGeneric($element, $fullArticle);
                break;
            default:
                break;
        }
    }

    /**
     * Implementation for nl.wikipedia.org
     */
    private function GetContentsNL($html, $subject, $fullArticle){
        switch($subject){
            case WIKIPEDIA_SUBJECT_TFA:
                $element = $html->find('div[id=mf-uitgelicht]', 0);
                $this->AddTodaysFeaturedArticleGeneric($element, $fullArticle, 'Lees meer');
                break;
            case WIKIPEDIA_SUBJECT_DYK:
                // The section has no id of its own; it is addressed by position inside the content area
                $element = $this->FindNestedElement($html, array(
                    array('div[id=mw-content-text]', 0),
                    array('table', 4),
                    array('td', 2)
                ));
                $this->AddDidYouKnowGeneric($element, $fullArticle);
                break;
            default:
                break;
        }
    }
}
|