WikipediaBridge.php 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288
  1. <?php
  2. define('WIKIPEDIA_SUBJECT_TFA', 0); // Today's featured article
  3. define('WIKIPEDIA_SUBJECT_DYK', 1); // Did you know...
  4. class WikipediaBridge extends BridgeAbstract {
  5. const MAINTAINER = 'logmanoriginal';
  6. const NAME = 'Wikipedia bridge for many languages';
  7. const URI = 'https://www.wikipedia.org/';
  8. const DESCRIPTION = 'Returns articles for a language of your choice';
  9. const PARAMETERS = array( array(
  10. 'language'=>array(
  11. 'name'=>'Language',
  12. 'type'=>'list',
  13. 'required'=>true,
  14. 'title'=>'Select your language',
  15. 'exampleValue'=>'English',
  16. 'values'=>array(
  17. 'English'=>'en',
  18. 'Dutch'=>'nl',
  19. 'Esperanto'=>'eo',
  20. 'French'=>'fr',
  21. 'German'=>'de',
  22. )
  23. ),
  24. 'subject'=>array(
  25. 'name'=>'Subject',
  26. 'type'=>'list',
  27. 'required'=>true,
  28. 'title'=>'What subject are you interested in?',
  29. 'exampleValue'=>'Today\'s featured article',
  30. 'values'=>array(
  31. 'Today\'s featured article'=>'tfa',
  32. 'Did you know…'=>'dyk'
  33. )
  34. ),
  35. 'fullarticle'=>array(
  36. 'name'=>'Load full article',
  37. 'type'=>'checkbox',
  38. 'title'=>'Activate to always load the full article'
  39. )
  40. ));
  41. public function getURI(){
  42. return 'https://' . strtolower($this->getInput('language')) . '.wikipedia.org';
  43. }
  44. public function getName(){
  45. switch($this->getInput('subject')){
  46. case 'tfa':
  47. $subject = WIKIPEDIA_SUBJECT_TFA;
  48. break;
  49. case 'dyk':
  50. $subject = WIKIPEDIA_SUBJECT_DYK;
  51. break;
  52. default:
  53. $subject = WIKIPEDIA_SUBJECT_TFA;
  54. break;
  55. }
  56. switch($subject){
  57. case WIKIPEDIA_SUBJECT_TFA:
  58. $name = 'Today\'s featured article from ' . strtolower($this->getInput('language')) . '.wikipedia.org';
  59. break;
  60. case WIKIPEDIA_SUBJECT_DYK:
  61. $name = 'Did you know? - articles from ' . strtolower($this->getInput('language')) . '.wikipedia.org';
  62. break;
  63. default:
  64. $name = 'Articles from ' . strtolower($this->getInput('language')) . '.wikipedia.org';
  65. break;
  66. }
  67. return $name;
  68. }
  69. public function collectData(){
  70. switch($this->getInput('subject')){
  71. case 'tfa':
  72. $subject = WIKIPEDIA_SUBJECT_TFA;
  73. break;
  74. case 'dyk':
  75. $subject = WIKIPEDIA_SUBJECT_DYK;
  76. break;
  77. default:
  78. $subject = WIKIPEDIA_SUBJECT_TFA;
  79. break;
  80. }
  81. $fullArticle = $this->getInput('fullarticle');
  82. // This will automatically send us to the correct main page in any language (try it!)
  83. $html = getSimpleHTMLDOM($this->getURI() . '/wiki');
  84. if(!$html)
  85. returnServerError('Could not load site: ' . $this->getURI() . '!');
  86. /*
  87. * Now read content depending on the language (make sure to create one function per language!)
  88. * We build the function name automatically, just make sure you create a private function ending
  89. * with your desired language code, where the language code is upper case! (en -> GetContentsEN).
  90. */
  91. $function = 'GetContents' . strtoupper($this->getInput('language'));
  92. if(!method_exists($this, $function))
  93. returnServerError('A function to get the contents for your language is missing (\'' . $function . '\')!');
  94. /*
  95. * The method takes care of creating all items.
  96. */
  97. $this->$function($html, $subject, $fullArticle);
  98. }
  99. /**
  100. * Replaces all relative URIs with absolute ones
  101. * @param $element A simplehtmldom element
  102. * @return The $element->innertext with all URIs replaced
  103. */
  104. private function ReplaceURIInHTMLElement($element){
  105. return str_replace('href="/', 'href="' . $this->getURI() . '/', $element->innertext);
  106. }
  107. /*
  108. * Adds a new item to $items using a generic operation (should work for most (all?) wikis)
  109. * $anchorText can be specified if the wiki in question doesn't use '...' (like Dutch, French and Italian)
  110. * $anchorFallbackIndex can be used to specify a different fallback link than the first (e.g., -1 for the last)
  111. */
  112. private function AddTodaysFeaturedArticleGeneric($element, $fullArticle, $anchorText = '...', $anchorFallbackIndex = 0){
  113. // Clean the bottom of the featured article
  114. if ($element->find('div', -1))
  115. $element->find('div', -1)->outertext = '';
  116. // The title and URI of the article can be found in an anchor containing the string '...' in most wikis ('full article ...')
  117. $target = $element->find('p/a', $anchorFallbackIndex);
  118. foreach($element->find('//a') as $anchor){
  119. if(strpos($anchor->innertext, $anchorText) !== false){
  120. $target = $anchor;
  121. break;
  122. }
  123. }
  124. $item = array();
  125. $item['uri'] = $this->getURI() . $target->href;
  126. $item['title'] = $target->title;
  127. if(!$fullArticle)
  128. $item['content'] = strip_tags($this->ReplaceURIInHTMLElement($element), '<a><p><br><img>');
  129. else
  130. $item['content'] = $this->LoadFullArticle($item['uri']);
  131. $this->items[] = $item;
  132. }
  133. /*
  134. * Adds a new item to $items using a generic operation (should work for most (all?) wikis)
  135. */
  136. private function AddDidYouKnowGeneric($element, $fullArticle){
  137. foreach($element->find('ul', 0)->find('li') as $entry){
  138. $item = array();
  139. // We can only use the first anchor, there is no way of finding the 'correct' one if there are multiple
  140. $item['uri'] = $this->getURI() . $entry->find('a', 0)->href;
  141. $item['title'] = strip_tags($entry->innertext);
  142. if(!$fullArticle)
  143. $item['content'] = $this->ReplaceURIInHTMLElement($entry);
  144. else
  145. $item['content'] = $this->LoadFullArticle($item['uri']);
  146. $this->items[] = $item;
  147. }
  148. }
  149. /**
  150. * Loads the full article from a given URI
  151. */
  152. private function LoadFullArticle($uri){
  153. $content_html = getSimpleHTMLDOMCached($uri);
  154. if(!$content_html)
  155. returnServerError('Could not load site: ' . $uri . '!');
  156. $content = $content_html->find('#mw-content-text', 0);
  157. if(!$content)
  158. returnServerError('Could not find content in page: ' . $uri . '!');
  159. // Let's remove a couple of things from the article
  160. $table = $content->find('#toc', 0); // Table of contents
  161. if(!$table === false)
  162. $table->outertext = '';
  163. foreach($content->find('ol.references') as $reference) // References
  164. $reference->outertext = '';
  165. return str_replace('href="/', 'href="' . $this->getURI() . '/', $content->innertext);
  166. }
  167. /**
  168. * Implementation for de.wikipedia.org
  169. */
  170. private function GetContentsDE($html, $subject, $fullArticle){
  171. switch($subject){
  172. case WIKIPEDIA_SUBJECT_TFA:
  173. $element = $html->find('div[id=mf-tfa]', 0);
  174. $this->AddTodaysFeaturedArticleGeneric($element, $fullArticle);
  175. break;
  176. case WIKIPEDIA_SUBJECT_DYK:
  177. $element = $html->find('div[id=mf-dyk]', 0);
  178. $this->AddDidYouKnowGeneric($element, $fullArticle);
  179. break;
  180. default:
  181. break;
  182. }
  183. }
  184. /**
  185. * Implementation for fr.wikipedia.org
  186. */
  187. private function GetContentsFR($html, $subject, $fullArticle){
  188. switch($subject){
  189. case WIKIPEDIA_SUBJECT_TFA:
  190. $element = $html->find('div[id=accueil-lumieresur]', 0);
  191. $this->AddTodaysFeaturedArticleGeneric($element, $fullArticle, 'Lire la suite');
  192. break;
  193. case WIKIPEDIA_SUBJECT_DYK:
  194. $element = $html->find('div[id=SaviezVous]', 0);
  195. $this->AddDidYouKnowGeneric($element, $fullArticle);
  196. break;
  197. default:
  198. break;
  199. }
  200. }
  201. /**
  202. * Implementation for en.wikipedia.org
  203. */
  204. private function GetContentsEN($html, $subject, $fullArticle){
  205. switch($subject){
  206. case WIKIPEDIA_SUBJECT_TFA:
  207. $element = $html->find('div[id=mp-tfa]', 0);
  208. $this->AddTodaysFeaturedArticleGeneric($element, $fullArticle);
  209. break;
  210. case WIKIPEDIA_SUBJECT_DYK:
  211. $element = $html->find('div[id=mp-dyk]', 0);
  212. $this->AddDidYouKnowGeneric($element, $fullArticle);
  213. break;
  214. default:
  215. break;
  216. }
  217. }
  218. /**
  219. * Implementation for eo.wikipedia.org
  220. */
  221. private function GetContentsEO($html, $subject, $fullArticle){
  222. switch($subject){
  223. case WIKIPEDIA_SUBJECT_TFA:
  224. $element = $html->find('div[id=mf-artikolo-de-la-semajno]', 0);
  225. $this->AddTodaysFeaturedArticleGeneric($element, $fullArticle);
  226. break;
  227. case WIKIPEDIA_SUBJECT_DYK:
  228. $element = $html->find('div[id=mw-content-text]', 0)->find('table', 4)->find('td', 4);
  229. $this->AddDidYouKnowGeneric($element, $fullArticle);
  230. break;
  231. default:
  232. break;
  233. }
  234. }
  235. /**
  236. * Implementation for nl.wikipedia.org
  237. */
  238. private function GetContentsNL($html, $subject, $fullArticle){
  239. switch($subject){
  240. case WIKIPEDIA_SUBJECT_TFA:
  241. $element = $html->find('div[id=mf-uitgelicht]', 0);
  242. $this->AddTodaysFeaturedArticleGeneric($element, $fullArticle, 'Lees meer');
  243. break;
  244. case WIKIPEDIA_SUBJECT_DYK:
  245. $element = $html->find('div[id=mw-content-text]', 0)->find('table', 4)->find('td', 2);
  246. $this->AddDidYouKnowGeneric($element, $fullArticle);
  247. break;
  248. default:
  249. break;
  250. }
  251. }
  252. }