WikipediaBridge.php 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304
  1. <?php
  /** Subject identifier: "Today's featured article". */
  const WIKIPEDIA_SUBJECT_TFA = 0;

  /** Subject identifier: "Did you know...". */
  const WIKIPEDIA_SUBJECT_DYK = 1;
  /**
   * Feed bridge for the main page of a Wikipedia language edition.
   *
   * Depending on the 'subject' input it emits either a single item for the
   * featured article ('tfa') or one item per "Did you know" entry ('dyk').
   * Every supported language needs its own private getContents<Lang>()
   * method; collectData() looks that method up by name.
   */
  class WikipediaBridge extends BridgeAbstract {

      const MAINTAINER = 'logmanoriginal';
      const NAME = 'Wikipedia bridge for many languages';
      const URI = 'https://www.wikipedia.org/';
      const DESCRIPTION = 'Returns articles for a language of your choice';

      const PARAMETERS = array( array(
          'language' => array(
              'name' => 'Language',
              'type' => 'list',
              'required' => true,
              'title' => 'Select your language',
              'exampleValue' => 'English',
              'values' => array(
                  'English' => 'en',
                  'Dutch' => 'nl',
                  'Esperanto' => 'eo',
                  'French' => 'fr',
                  'German' => 'de',
              )
          ),
          'subject' => array(
              'name' => 'Subject',
              'type' => 'list',
              'required' => true,
              'title' => 'What subject are you interested in?',
              'exampleValue' => 'Today\'s featured article',
              'values' => array(
                  'Today\'s featured article' => 'tfa',
                  'Did you know…' => 'dyk'
              )
          ),
          'fullarticle' => array(
              'name' => 'Load full article',
              'type' => 'checkbox',
              'title' => 'Activate to always load the full article'
          )
      ));

      /**
       * Returns the base URI of the selected language edition
       * (e.g. 'https://en.wikipedia.org', without a trailing slash), or the
       * parent's URI when no language has been provided.
       *
       * @return string
       */
      public function getURI(){
          $language = $this->getInput('language');

          if(!is_null($language)) {
              return 'https://' . strtolower($language) . '.wikipedia.org';
          }

          return parent::getURI();
      }

      /**
       * Returns a feed name describing the selected subject and language, or
       * the parent's name when the subject input is missing or unknown.
       *
       * @return string
       */
      public function getName(){
          $subject = $this->resolveSubject(null);

          // Must be tested before the switch: WIKIPEDIA_SUBJECT_TFA is 0 and
          // switch compares loosely, so null would match the first case.
          if(is_null($subject)) {
              return parent::getName();
          }

          $host = strtolower($this->getInput('language')) . '.wikipedia.org';

          switch($subject) {
              case WIKIPEDIA_SUBJECT_DYK:
                  return 'Did you know? - articles from ' . $host;
              case WIKIPEDIA_SUBJECT_TFA:
              default:
                  return 'Today\'s featured article from ' . $host;
          }
      }

      /**
       * Loads the main page of the selected language edition and hands it to
       * the language specific getContents<Lang>() method, which fills
       * $this->items.
       *
       * An unknown subject falls back to the featured article.
       * NOTE(review): like the rest of this class, this relies on
       * returnServerError() aborting execution - confirm against the framework.
       */
      public function collectData(){
          $subject = $this->resolveSubject(WIKIPEDIA_SUBJECT_TFA);
          $fullArticle = $this->getInput('fullarticle');

          // This will automatically send us to the correct main page in any language (try it!)
          $html = getSimpleHTMLDOM($this->getURI() . '/wiki');

          if(!$html) {
              returnServerError('Could not load site: ' . $this->getURI() . '!');
          }

          /*
           * Now read content depending on the language (make sure to create one
           * function per language!). The function name is built automatically:
           * 'getContents' followed by the language code with only its first
           * letter in upper case (en -> getContentsEn).
           */
          $function = 'getContents' . ucfirst(strtolower($this->getInput('language')));

          if(!method_exists($this, $function)) {
              returnServerError('A function to get the contents for your language is missing (\'' . $function . '\')!');
          }

          // The method takes care of creating all items.
          $this->$function($html, $subject, $fullArticle);
      }

      /**
       * Maps the 'subject' input to one of the WIKIPEDIA_SUBJECT_* constants.
       *
       * @param mixed $fallback Value returned for a missing or unknown subject
       * @return mixed A WIKIPEDIA_SUBJECT_* constant or $fallback
       */
      private function resolveSubject($fallback){
          switch($this->getInput('subject')) {
              case 'tfa':
                  return WIKIPEDIA_SUBJECT_TFA;
              case 'dyk':
                  return WIKIPEDIA_SUBJECT_DYK;
              default:
                  return $fallback;
          }
      }

      /**
       * Turns a link target into an absolute URI.
       *
       * Protocol-relative ('//host/path') and already absolute http(s) targets
       * are not prefixed with the wiki host; everything else is appended to
       * getURI() as before.
       *
       * @param string $href The raw href attribute value
       * @return string
       */
      private function absoluteUri($href){
          if(strpos($href, '//') === 0) {
              return 'https:' . $href;
          }

          if(strpos($href, 'http://') === 0 || strpos($href, 'https://') === 0) {
              return $href;
          }

          return $this->getURI() . $href;
      }

      /**
       * Follows a chain of find() calls and stops as soon as one step yields
       * nothing, instead of calling find() on null.
       *
       * @param $node A simplehtmldom element (or null)
       * @param array $steps List of array(selector, index) pairs
       * @return The element found by the last step, or null
       */
      private function findNested($node, array $steps){
          foreach($steps as $step) {
              if(!$node) {
                  return null;
              }

              $node = $node->find($step[0], $step[1]);
          }

          return $node;
      }

      /**
       * Replaces all root-relative URIs in href attributes with absolute ones
       * @param $element A simplehtmldom element
       * @return The $element->innertext with all URIs replaced
       */
      private function replaceUriInHtmlElement($element){
          // Protocol-relative links first: they also start with 'href="/' and
          // would otherwise end up as '<wiki host>//other.host/...'.
          $html = str_replace('href="//', 'href="https://', $element->innertext);

          return str_replace('href="/', 'href="' . $this->getURI() . '/', $html);
      }

      /*
       * Adds a new item to $items using a generic operation (should work for most
       * (all?) wikis). $anchorText can be specified if the wiki in question doesn't
       * use '...' (in this file: Dutch and French). $anchorFallbackIndex can be
       * used to specify a different fallback link than the first
       * (e.g., -1 for the last).
       *
       * Reports a server error when the section or its article link is missing.
       */
      private function addTodaysFeaturedArticleGeneric($element,
          $fullArticle,
          $anchorText = '...',
          $anchorFallbackIndex = 0){

          if(!$element) {
              returnServerError('Could not find today\'s featured article on ' . $this->getURI() . '!');
          }

          // Clean the bottom of the featured article
          $footer = $element->find('div', -1);

          if($footer) {
              $footer->outertext = '';
          }

          // The title and URI of the article can be found in an anchor containing
          // the string '...' in most wikis ('full article ...'). If no anchor
          // matches, the anchor at $anchorFallbackIndex is used instead.
          $target = $element->find('p/a', $anchorFallbackIndex);

          foreach($element->find('//a') as $anchor) {
              if(strpos($anchor->innertext, $anchorText) !== false) {
                  $target = $anchor;
                  break;
              }
          }

          if(!$target) {
              returnServerError('Could not find a link to today\'s featured article on ' . $this->getURI() . '!');
          }

          $item = array();
          $item['uri'] = $this->absoluteUri($target->href);
          $item['title'] = $target->title;

          if($fullArticle) {
              $item['content'] = $this->loadFullArticle($item['uri']);
          } else {
              $item['content'] = strip_tags($this->replaceUriInHtmlElement($element), '<a><p><br><img>');
          }

          $this->items[] = $item;
      }

      /*
       * Adds one item per list entry to $items using a generic operation
       * (should work for most (all?) wikis).
       *
       * Reports a server error when the section or its list is missing;
       * entries without any anchor are skipped because they have no URI.
       */
      private function addDidYouKnowGeneric($element, $fullArticle){
          $list = $element ? $element->find('ul', 0) : null;

          if(!$list) {
              returnServerError('Could not find "Did you know" entries on ' . $this->getURI() . '!');
          }

          foreach($list->find('li') as $entry) {
              // We can only use the first anchor, there is no way of finding the 'correct' one if there are multiple
              $anchor = $entry->find('a', 0);

              if(!$anchor) {
                  continue;
              }

              $item = array();
              $item['uri'] = $this->absoluteUri($anchor->href);
              $item['title'] = strip_tags($entry->innertext);

              if($fullArticle) {
                  $item['content'] = $this->loadFullArticle($item['uri']);
              } else {
                  $item['content'] = $this->replaceUriInHtmlElement($entry);
              }

              $this->items[] = $item;
          }
      }

      /**
       * Loads the full article from a given URI
       *
       * The table of contents ('#toc') and all reference lists
       * ('ol.references') are blanked out before the content is returned.
       *
       * @param string $uri Absolute URI of the article
       * @return string Inner HTML of '#mw-content-text' with absolute links
       */
      private function loadFullArticle($uri){
          $content_html = getSimpleHTMLDOMCached($uri);

          if(!$content_html) {
              returnServerError('Could not load site: ' . $uri . '!');
          }

          $content = $content_html->find('#mw-content-text', 0);

          if(!$content) {
              returnServerError('Could not find content in page: ' . $uri . '!');
          }

          // Let's remove a couple of things from the article
          $toc = $content->find('#toc', 0); // Table of contents

          if($toc) {
              $toc->outertext = '';
          }

          foreach($content->find('ol.references') as $reference) { // References
              $reference->outertext = '';
          }

          return $this->replaceUriInHtmlElement($content);
      }

      /**
       * Implementation for de.wikipedia.org
       */
      private function getContentsDe($html, $subject, $fullArticle){
          switch($subject) {
              case WIKIPEDIA_SUBJECT_TFA:
                  $element = $html->find('div[id=mf-tfa]', 0);
                  $this->addTodaysFeaturedArticleGeneric($element, $fullArticle);
                  break;
              case WIKIPEDIA_SUBJECT_DYK:
                  $element = $html->find('div[id=mf-dyk]', 0);
                  $this->addDidYouKnowGeneric($element, $fullArticle);
                  break;
              default:
                  break;
          }
      }

      /**
       * Implementation for fr.wikipedia.org
       */
      private function getContentsFr($html, $subject, $fullArticle){
          switch($subject) {
              case WIKIPEDIA_SUBJECT_TFA:
                  $element = $html->find('div[class=accueil_2017_cadre]', 0);
                  $this->addTodaysFeaturedArticleGeneric($element, $fullArticle, 'Lire la suite');
                  break;
              case WIKIPEDIA_SUBJECT_DYK:
                  $element = $html->find('div[class=accueil_2017_cadre]', 2);
                  $this->addDidYouKnowGeneric($element, $fullArticle);
                  break;
              default:
                  break;
          }
      }

      /**
       * Implementation for en.wikipedia.org
       */
      private function getContentsEn($html, $subject, $fullArticle){
          switch($subject) {
              case WIKIPEDIA_SUBJECT_TFA:
                  $element = $html->find('div[id=mp-tfa]', 0);
                  $this->addTodaysFeaturedArticleGeneric($element, $fullArticle);
                  break;
              case WIKIPEDIA_SUBJECT_DYK:
                  $element = $html->find('div[id=mp-dyk]', 0);
                  $this->addDidYouKnowGeneric($element, $fullArticle);
                  break;
              default:
                  break;
          }
      }

      /**
       * Implementation for eo.wikipedia.org
       */
      private function getContentsEo($html, $subject, $fullArticle){
          switch($subject) {
              case WIKIPEDIA_SUBJECT_TFA:
                  $element = $html->find('div[id=mf-artikolo-de-la-semajno]', 0);
                  $this->addTodaysFeaturedArticleGeneric($element, $fullArticle);
                  break;
              case WIKIPEDIA_SUBJECT_DYK:
                  // Positional lookup: fifth cell of the fifth table in the content area
                  $element = $this->findNested($html, array(
                      array('div[id=mw-content-text]', 0),
                      array('table', 4),
                      array('td', 4)
                  ));
                  $this->addDidYouKnowGeneric($element, $fullArticle);
                  break;
              default:
                  break;
          }
      }

      /**
       * Implementation for nl.wikipedia.org
       */
      private function getContentsNl($html, $subject, $fullArticle){
          switch($subject) {
              case WIKIPEDIA_SUBJECT_TFA:
                  $element = $html->find('div[id=mf-uitgelicht]', 0);
                  $this->addTodaysFeaturedArticleGeneric($element, $fullArticle, 'Lees meer');
                  break;
              case WIKIPEDIA_SUBJECT_DYK:
                  // Positional lookup: third cell of the fifth table in the content area
                  $element = $this->findNested($html, array(
                      array('div[id=mw-content-text]', 0),
                      array('table', 4),
                      array('td', 2)
                  ));
                  $this->addDidYouKnowGeneric($element, $fullArticle);
                  break;
              default:
                  break;
          }
      }
  }