WikipediaBridge.php 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278
  1. <?php
  2. define('WIKIPEDIA_SUBJECT_TFA', 0); // Today's featured article
  3. define('WIKIPEDIA_SUBJECT_DYK', 1); // Did you know...
  4. class WikipediaBridge extends BridgeAbstract{
  5. public function loadMetadatas(){
  6. $this->maintainer = 'logmanoriginal';
  7. $this->name = 'Wikipedia bridge for many languages';
  8. $this->uri = 'https://www.wikipedia.org/';
  9. $this->description = 'Returns articles for a language of your choice';
  10. $this->parameters[] = array(
  11. 'language'=>array(
  12. 'name'=>'Language',
  13. 'type'=>'list',
  14. 'required'=>true,
  15. 'title'=>'Select your language',
  16. 'exampleValue'=>'English',
  17. 'values'=>array(
  18. 'English'=>'en',
  19. 'German'=>'de',
  20. 'French'=>'fr',
  21. 'Esperanto'=>'es'
  22. )
  23. ),
  24. 'subject'=>array(
  25. 'name'=>'Subject',
  26. 'type'=>'list',
  27. 'required'=>true,
  28. 'title'=>'What subject are you interested in?',
  29. 'exampleValue'=>'Today\'s featured article',
  30. 'values'=>array(
  31. 'Today\'s featured article'=>'tfa',
  32. 'Did you know…'=>'dyk'
  33. )
  34. ),
  35. 'fullarticle'=>array(
  36. 'name'=>'Load full article',
  37. 'type'=>'checkbox',
  38. 'title'=>'Activate to always load the full article'
  39. )
  40. );
  41. }
  42. public function collectData(array $params){
  43. if(!isset($params['language']))
  44. $this->returnClientError('You must specify a valid language via \'&language=\'!');
  45. if(!$this->CheckLanguageCode(strtolower($params['language'])))
  46. $this->returnClientError('The language code you provided (\'' . $params['language'] . '\') is not supported!');
  47. if(!isset($params['subject']))
  48. $this->returnClientError('You must specify a valid subject via \'&subject=\'!');
  49. $subject = WIKIPEDIA_SUBJECT_TFA;
  50. switch($params['subject']){
  51. case 'tfa':
  52. $subject = WIKIPEDIA_SUBJECT_TFA;
  53. break;
  54. case 'dyk':
  55. $subject = WIKIPEDIA_SUBJECT_DYK;
  56. break;
  57. default:
  58. $subject = WIKIPEDIA_SUBJECT_TFA;
  59. break;
  60. }
  61. $fullArticle = false;
  62. if(isset($params['fullarticle']))
  63. $fullArticle = $params['fullarticle'] === 'on' ? true : false;
  64. // We store the correct URI as URI of this bridge (so it can be used later!)
  65. $this->uri = 'https://' . strtolower($params['language']) . '.wikipedia.org';
  66. // While we at it let's also update the name for the feed
  67. switch($subject){
  68. case WIKIPEDIA_SUBJECT_TFA:
  69. $this->name = 'Today\'s featured article from ' . strtolower($params['language']) . '.wikipedia.org';
  70. break;
  71. case WIKIPEDIA_SUBJECT_DYK:
  72. $this->name = 'Did you know? - articles from ' . strtolower($params['language']) . '.wikipedia.org';
  73. break;
  74. default:
  75. $this->name = 'Articles from ' . strtolower($params['language']) . '.wikipedia.org';
  76. break;
  77. }
  78. // This will automatically send us to the correct main page in any language (try it!)
  79. $html = $this->getSimpleHTMLDOM($this->uri . '/wiki');
  80. if(!$html)
  81. $this->returnServerError('Could not load site: ' . $this->uri . '!');
  82. /*
  83. * Now read content depending on the language (make sure to create one function per language!)
  84. * We build the function name automatically, just make sure you create a private function ending
  85. * with your desired language code, where the language code is upper case! (en -> GetContentsEN).
  86. */
  87. $function = 'GetContents' . strtoupper($params['language']);
  88. if(!method_exists($this, $function))
  89. $this->returnServerError('A function to get the contents for your langauage is missing (\'' . $function . '\')!');
  90. /*
  91. * The method takes care of creating all items.
  92. */
  93. $this->$function($html, $subject, $fullArticle);
  94. }
  95. /**
  96. * Returns true if the language code is part of the parameters list
  97. */
  98. private function CheckLanguageCode($languageCode){
  99. $languages = $this->parameters[0]['language']['values'];
  100. $language_names = array();
  101. foreach($languages as $name=>$value)
  102. $language_names[] = $value;
  103. return in_array($languageCode, $language_names);
  104. }
  105. /**
  106. * Replaces all relative URIs with absolute ones
  107. * @param $element A simplehtmldom element
  108. * @return The $element->innertext with all URIs replaced
  109. */
  110. private function ReplaceURIInHTMLElement($element){
  111. return str_replace('href="/', 'href="' . $this->uri . '/', $element->innertext);
  112. }
  113. /*
  114. * Adds a new item to $items using a generic operation (should work for most (all?) wikis)
  115. */
  116. private function AddTodaysFeaturedArticleGeneric($element, $fullArticle){
  117. // Clean the bottom of the featured article
  118. $element->find('div', -1)->outertext = '';
  119. // The title and URI of the article is best defined in an anchor containint the string '...' ('full article ...')
  120. $target = $element->find('p/a', 0); // We'll use the first anchor as fallback
  121. foreach($element->find('//a') as $anchor){
  122. if(strpos($anchor->innertext, '...') !== false){
  123. $target = $anchor;
  124. break;
  125. }
  126. }
  127. $item = array();
  128. $item['uri'] = $this->uri . $target->href;
  129. $item['title'] = $target->title;
  130. if(!$fullArticle)
  131. $item['content'] = strip_tags($this->ReplaceURIInHTMLElement($element), '<a><p><br><img>');
  132. else
  133. $item['content'] = $this->LoadFullArticle($item['uri']);
  134. $this->items[] = $item;
  135. }
  136. /*
  137. * Adds a new item to $items using a generic operation (should work for most (all?) wikis)
  138. */
  139. private function AddDidYouKnowGeneric($element, $fullArticle){
  140. foreach($element->find('ul', 0)->find('li') as $entry){
  141. $item = array();
  142. // We can only use the first anchor, there is no way of finding the 'correct' one if there are multiple
  143. $item['uri'] = $this->uri . $entry->find('a', 0)->href;
  144. $item['title'] = strip_tags($entry->innertext);
  145. if(!$fullArticle)
  146. $item['content'] = $this->ReplaceURIInHTMLElement($entry);
  147. else
  148. $item['content'] = $this->LoadFullArticle($item['uri']);
  149. $this->items[] = $item;
  150. }
  151. }
  152. /**
  153. * Loads the full article from a given URI
  154. */
  155. private function LoadFullArticle($uri){
  156. $content_html = $this->getSimpleHTMLDOM($uri);
  157. if(!$content_html)
  158. $this->returnServerError('Could not load site: ' . $uri . '!');
  159. $content = $content_html->find('#mw-content-text', 0);
  160. if(!$content)
  161. $this->returnServerError('Could not find content in page: ' . $uri . '!');
  162. // Let's remove a couple of things from the article
  163. $table = $content->find('#toc', 0); // Table of contents
  164. if(!$table === false)
  165. $table->outertext = '';
  166. foreach($content->find('ol.references') as $reference) // References
  167. $reference->outertext = '';
  168. return str_replace('href="/', 'href="' . $this->uri . '/', $content->innertext);
  169. }
  170. /**
  171. * Implementation for de.wikipedia.org
  172. */
  173. private function GetContentsDE($html, $subject, $fullArticle){
  174. switch($subject){
  175. case WIKIPEDIA_SUBJECT_TFA:
  176. $element = $html->find('div[id=mf-tfa]', 0);
  177. $this->AddTodaysFeaturedArticleGeneric($element, $fullArticle);
  178. break;
  179. case WIKIPEDIA_SUBJECT_DYK:
  180. $element = $html->find('div[id=mf-dyk]', 0);
  181. $this->AddDidYouKnowGeneric($element, $fullArticle);
  182. break;
  183. default:
  184. break;
  185. }
  186. }
  187. /**
  188. * Implementation for fr.wikipedia.org
  189. */
  190. private function GetContentsFR($html, $subject, $fullArticle){
  191. switch($subject){
  192. case WIKIPEDIA_SUBJECT_TFA:
  193. $element = $html->find('div[id=accueil-lumieresur]', 0);
  194. $this->AddTodaysFeaturedArticleGeneric($element, $fullArticle);
  195. break;
  196. case WIKIPEDIA_SUBJECT_DYK:
  197. $element = $html->find('div[id=SaviezVous]', 0);
  198. $this->AddDidYouKnowGeneric($element, $fullArticle);
  199. break;
  200. default:
  201. break;
  202. }
  203. }
  204. /**
  205. * Implementation for en.wikipedia.org
  206. */
  207. private function GetContentsEN($html, $subject, $fullArticle){
  208. switch($subject){
  209. case WIKIPEDIA_SUBJECT_TFA:
  210. $element = $html->find('div[id=mp-tfa]', 0);
  211. $this->AddTodaysFeaturedArticleGeneric($element, $fullArticle);
  212. break;
  213. case WIKIPEDIA_SUBJECT_DYK:
  214. $element = $html->find('div[id=mp-dyk]', 0);
  215. $this->AddDidYouKnowGeneric($element, $fullArticle);
  216. break;
  217. default:
  218. break;
  219. }
  220. }
  221. /**
  222. * Implementation for eo.wikipedia.org
  223. */
  224. private function GetContentsEO($html, $subject, $fullArticle){
  225. switch($subject){
  226. case WIKIPEDIA_SUBJECT_TFA:
  227. $element = $html->find('div[id=mf-artikolo-de-la-semajno]', 0);
  228. $this->AddTodaysFeaturedArticleGeneric($element, $fullArticle);
  229. break;
  230. case WIKIPEDIA_SUBJECT_DYK:
  231. $element = $html->find('div[id=mw-content-text]', 0)->find('table', 4)->find('td', 4);
  232. $this->AddDidYouKnowGeneric($element, $fullArticle);
  233. break;
  234. default:
  235. break;
  236. }
  237. }
  238. }