WikipediaBridge.php 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. <?php
  2. define('WIKIPEDIA_SUBJECT_TFA', 0); // Today's featured article
  3. define('WIKIPEDIA_SUBJECT_DYK', 1); // Did you know...
  4. class WikipediaBridge extends BridgeAbstract {
  5. const MAINTAINER = 'logmanoriginal';
  6. const NAME = 'Wikipedia bridge for many languages';
  7. const URI = 'https://www.wikipedia.org/';
  8. const DESCRIPTION = 'Returns articles for a language of your choice';
  9. const PARAMETERS = array( array(
  10. 'language' => array(
  11. 'name' => 'Language',
  12. 'type' => 'list',
  13. 'required' => true,
  14. 'title' => 'Select your language',
  15. 'exampleValue' => 'English',
  16. 'values' => array(
  17. 'English' => 'en',
  18. 'Dutch' => 'nl',
  19. 'Esperanto' => 'eo',
  20. 'French' => 'fr',
  21. 'German' => 'de',
  22. )
  23. ),
  24. 'subject' => array(
  25. 'name' => 'Subject',
  26. 'type' => 'list',
  27. 'required' => true,
  28. 'title' => 'What subject are you interested in?',
  29. 'exampleValue' => 'Today\'s featured article',
  30. 'values' => array(
  31. 'Today\'s featured article' => 'tfa',
  32. 'Did you know…' => 'dyk'
  33. )
  34. ),
  35. 'fullarticle' => array(
  36. 'name' => 'Load full article',
  37. 'type' => 'checkbox',
  38. 'title' => 'Activate to always load the full article'
  39. )
  40. ));
  41. public function getURI(){
  42. return 'https://'
  43. . strtolower($this->getInput('language'))
  44. . '.wikipedia.org';
  45. }
  46. public function getName(){
  47. switch($this->getInput('subject')){
  48. case 'tfa':
  49. $subject = WIKIPEDIA_SUBJECT_TFA;
  50. break;
  51. case 'dyk':
  52. $subject = WIKIPEDIA_SUBJECT_DYK;
  53. break;
  54. default:
  55. $subject = WIKIPEDIA_SUBJECT_TFA;
  56. break;
  57. }
  58. switch($subject){
  59. case WIKIPEDIA_SUBJECT_TFA:
  60. $name = 'Today\'s featured article from '
  61. . strtolower($this->getInput('language'))
  62. . '.wikipedia.org';
  63. break;
  64. case WIKIPEDIA_SUBJECT_DYK:
  65. $name = 'Did you know? - articles from '
  66. . strtolower($this->getInput('language'))
  67. . '.wikipedia.org';
  68. break;
  69. default:
  70. $name = 'Articles from '
  71. . strtolower($this->getInput('language'))
  72. . '.wikipedia.org';
  73. break;
  74. }
  75. return $name;
  76. }
  77. public function collectData(){
  78. switch($this->getInput('subject')){
  79. case 'tfa':
  80. $subject = WIKIPEDIA_SUBJECT_TFA;
  81. break;
  82. case 'dyk':
  83. $subject = WIKIPEDIA_SUBJECT_DYK;
  84. break;
  85. default:
  86. $subject = WIKIPEDIA_SUBJECT_TFA;
  87. break;
  88. }
  89. $fullArticle = $this->getInput('fullarticle');
  90. // This will automatically send us to the correct main page in any language (try it!)
  91. $html = getSimpleHTMLDOM($this->getURI() . '/wiki');
  92. if(!$html)
  93. returnServerError('Could not load site: ' . $this->getURI() . '!');
  94. /*
  95. * Now read content depending on the language (make sure to create one function per language!)
  96. * We build the function name automatically, just make sure you create a private function ending
  97. * with your desired language code, where the language code is upper case! (en -> getContentsEN).
  98. */
  99. $function = 'getContents' . ucfirst(strtolower($this->getInput('language')));
  100. if(!method_exists($this, $function))
  101. returnServerError('A function to get the contents for your language is missing (\'' . $function . '\')!');
  102. /*
  103. * The method takes care of creating all items.
  104. */
  105. $this->$function($html, $subject, $fullArticle);
  106. }
  107. /**
  108. * Replaces all relative URIs with absolute ones
  109. * @param $element A simplehtmldom element
  110. * @return The $element->innertext with all URIs replaced
  111. */
  112. private function replaceUriInHtmlElement($element){
  113. return str_replace('href="/', 'href="' . $this->getURI() . '/', $element->innertext);
  114. }
  115. /*
  116. * Adds a new item to $items using a generic operation (should work for most
  117. * (all?) wikis) $anchorText can be specified if the wiki in question doesn't
  118. * use '...' (like Dutch, French and Italian) $anchorFallbackIndex can be
  119. * used to specify a different fallback link than the first
  120. * (e.g., -1 for the last)
  121. */
  122. private function addTodaysFeaturedArticleGeneric($element,
  123. $fullArticle,
  124. $anchorText = '...',
  125. $anchorFallbackIndex = 0){
  126. // Clean the bottom of the featured article
  127. if ($element->find('div', -1))
  128. $element->find('div', -1)->outertext = '';
  129. // The title and URI of the article can be found in an anchor containing
  130. // the string '...' in most wikis ('full article ...')
  131. $target = $element->find('p/a', $anchorFallbackIndex);
  132. foreach($element->find('//a') as $anchor){
  133. if(strpos($anchor->innertext, $anchorText) !== false){
  134. $target = $anchor;
  135. break;
  136. }
  137. }
  138. $item = array();
  139. $item['uri'] = $this->getURI() . $target->href;
  140. $item['title'] = $target->title;
  141. if(!$fullArticle)
  142. $item['content'] = strip_tags($this->replaceUriInHtmlElement($element), '<a><p><br><img>');
  143. else
  144. $item['content'] = $this->loadFullArticle($item['uri']);
  145. $this->items[] = $item;
  146. }
  147. /*
  148. * Adds a new item to $items using a generic operation (should work for most (all?) wikis)
  149. */
  150. private function addDidYouKnowGeneric($element, $fullArticle){
  151. foreach($element->find('ul', 0)->find('li') as $entry){
  152. $item = array();
  153. // We can only use the first anchor, there is no way of finding the 'correct' one if there are multiple
  154. $item['uri'] = $this->getURI() . $entry->find('a', 0)->href;
  155. $item['title'] = strip_tags($entry->innertext);
  156. if(!$fullArticle)
  157. $item['content'] = $this->replaceUriInHtmlElement($entry);
  158. else
  159. $item['content'] = $this->loadFullArticle($item['uri']);
  160. $this->items[] = $item;
  161. }
  162. }
  163. /**
  164. * Loads the full article from a given URI
  165. */
  166. private function loadFullArticle($uri){
  167. $content_html = getSimpleHTMLDOMCached($uri);
  168. if(!$content_html)
  169. returnServerError('Could not load site: ' . $uri . '!');
  170. $content = $content_html->find('#mw-content-text', 0);
  171. if(!$content)
  172. returnServerError('Could not find content in page: ' . $uri . '!');
  173. // Let's remove a couple of things from the article
  174. $table = $content->find('#toc', 0); // Table of contents
  175. if(!$table === false)
  176. $table->outertext = '';
  177. foreach($content->find('ol.references') as $reference) // References
  178. $reference->outertext = '';
  179. return str_replace('href="/', 'href="' . $this->getURI() . '/', $content->innertext);
  180. }
  181. /**
  182. * Implementation for de.wikipedia.org
  183. */
  184. private function getContentsDe($html, $subject, $fullArticle){
  185. switch($subject){
  186. case WIKIPEDIA_SUBJECT_TFA:
  187. $element = $html->find('div[id=mf-tfa]', 0);
  188. $this->addTodaysFeaturedArticleGeneric($element, $fullArticle);
  189. break;
  190. case WIKIPEDIA_SUBJECT_DYK:
  191. $element = $html->find('div[id=mf-dyk]', 0);
  192. $this->addDidYouKnowGeneric($element, $fullArticle);
  193. break;
  194. default:
  195. break;
  196. }
  197. }
  198. /**
  199. * Implementation for fr.wikipedia.org
  200. */
  201. private function getContentsFr($html, $subject, $fullArticle){
  202. switch($subject){
  203. case WIKIPEDIA_SUBJECT_TFA:
  204. $element = $html->find('div[id=accueil-lumieresur]', 0);
  205. $this->addTodaysFeaturedArticleGeneric($element, $fullArticle, 'Lire la suite');
  206. break;
  207. case WIKIPEDIA_SUBJECT_DYK:
  208. $element = $html->find('div[id=SaviezVous]', 0);
  209. $this->addDidYouKnowGeneric($element, $fullArticle);
  210. break;
  211. default:
  212. break;
  213. }
  214. }
  215. /**
  216. * Implementation for en.wikipedia.org
  217. */
  218. private function getContentsEn($html, $subject, $fullArticle){
  219. switch($subject){
  220. case WIKIPEDIA_SUBJECT_TFA:
  221. $element = $html->find('div[id=mp-tfa]', 0);
  222. $this->addTodaysFeaturedArticleGeneric($element, $fullArticle);
  223. break;
  224. case WIKIPEDIA_SUBJECT_DYK:
  225. $element = $html->find('div[id=mp-dyk]', 0);
  226. $this->addDidYouKnowGeneric($element, $fullArticle);
  227. break;
  228. default:
  229. break;
  230. }
  231. }
  232. /**
  233. * Implementation for eo.wikipedia.org
  234. */
  235. private function getContentsEo($html, $subject, $fullArticle){
  236. switch($subject){
  237. case WIKIPEDIA_SUBJECT_TFA:
  238. $element = $html->find('div[id=mf-artikolo-de-la-semajno]', 0);
  239. $this->addTodaysFeaturedArticleGeneric($element, $fullArticle);
  240. break;
  241. case WIKIPEDIA_SUBJECT_DYK:
  242. $element = $html->find('div[id=mw-content-text]', 0)->find('table', 4)->find('td', 4);
  243. $this->addDidYouKnowGeneric($element, $fullArticle);
  244. break;
  245. default:
  246. break;
  247. }
  248. }
  249. /**
  250. * Implementation for nl.wikipedia.org
  251. */
  252. private function getContentsNl($html, $subject, $fullArticle){
  253. switch($subject){
  254. case WIKIPEDIA_SUBJECT_TFA:
  255. $element = $html->find('div[id=mf-uitgelicht]', 0);
  256. $this->addTodaysFeaturedArticleGeneric($element, $fullArticle, 'Lees meer');
  257. break;
  258. case WIKIPEDIA_SUBJECT_DYK:
  259. $element = $html->find('div[id=mw-content-text]', 0)->find('table', 4)->find('td', 2);
  260. $this->addDidYouKnowGeneric($element, $fullArticle);
  261. break;
  262. default:
  263. break;
  264. }
  265. }
  266. }