MoinMoinBridge.php 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. <?php
  2. class MoinMoinBridge extends BridgeAbstract {
  3. const MAINTAINER = 'logmanoriginal';
  4. const NAME = 'MoinMoin Bridge';
  5. const URI = 'https://moinmo.in';
  6. const DESCRIPTION = 'Generates feeds for pages of a MoinMoin (compatible) wiki';
  7. const PARAMETERS = array(
  8. array(
  9. 'source' => array(
  10. 'name' => 'Source',
  11. 'type' => 'text',
  12. 'required' => true,
  13. 'title' => 'Insert wiki page URI (e.g.: https://moinmo.in/MoinMoin)',
  14. 'exampleValue' => 'https://moinmo.in/MoinMoin'
  15. ),
  16. 'separator' => array(
  17. 'name' => 'Separator',
  18. 'type' => 'list',
  19. 'requied' => true,
  20. 'title' => 'Defines the separtor for splitting content into feeds',
  21. 'defaultValue' => 'h2',
  22. 'values' => array(
  23. 'Header (h1)' => 'h1',
  24. 'Header (h2)' => 'h2',
  25. 'Header (h3)' => 'h3',
  26. 'List element (li)' => 'li',
  27. 'Anchor (a)' => 'a'
  28. )
  29. ),
  30. 'limit' => array(
  31. 'name' => 'Limit',
  32. 'type' => 'number',
  33. 'required' => false,
  34. 'title' => 'Number of items to return (from top)',
  35. 'defaultValue' => -1
  36. ),
  37. 'content' => array(
  38. 'name' => 'Content',
  39. 'type' => 'list',
  40. 'required' => false,
  41. 'title' => 'Defines how feed contents are build',
  42. 'defaultValue' => 'separator',
  43. 'values' => array(
  44. 'By separator' => 'separator',
  45. 'Follow link (only for anchor)' => 'follow',
  46. 'None' => 'none'
  47. )
  48. )
  49. )
  50. );
  51. private $title = '';
  52. public function collectData(){
  53. /* MoinMoin uses a rather unpleasent representation of HTML. Instead of
  54. * using tags like <article/>, <navigation/>, <header/>, etc... it uses
  55. * <div/>, <span/> and <p/>. Also each line is literaly identified via
  56. * IDs. The only way to distinguish content is via headers, though not
  57. * in all cases.
  58. *
  59. * Example (indented for the sake of readability):
  60. * ...
  61. * <span class="anchor" id="line-1"></span>
  62. * <span class="anchor" id="line-2"></span>
  63. * <span class="anchor" id="line-3"></span>
  64. * <span class="anchor" id="line-4"></span>
  65. * <span class="anchor" id="line-5"></span>
  66. * <span class="anchor" id="line-6"></span>
  67. * <span class="anchor" id="line-7"></span>
  68. * <span class="anchor" id="line-8"></span>
  69. * <span class="anchor" id="line-9"></span>
  70. * <p class="line867">MoinMoin is a Wiki software implemented in
  71. * <a class="interwiki" href="/Python" title="MoinMoin">Python</a>
  72. * and distributed as Free Software under
  73. * <a class="interwiki" href="/GPL" title="MoinMoin">GNU GPL license</a>.
  74. * ...
  75. */
  76. $html = getSimpleHTMLDOM($this->getInput('source'))
  77. or returnServerError('Could not load ' . $this->getInput('source'));
  78. // Some anchors link to local sites or local IDs (both don't work well
  79. // in feeds)
  80. $html = $this->fixAnchors($html);
  81. $this->title = $html->find('title', 0)->innertext . ' | ' . self::NAME;
  82. // Here we focus on simple author and timestamp information from the given
  83. // page. Later we update this information in case the anchor is followed.
  84. $author = $this->findAuthor($html);
  85. $timestamp = $this->findTimestamp($html);
  86. $sections = $this->splitSections($html);
  87. foreach($sections as $section) {
  88. $item = array();
  89. $item['uri'] = $this->findSectionAnchor($section[0]);
  90. switch($this->getInput('content')) {
  91. case 'none': // Do not return any content
  92. break;
  93. case 'follow': // Follow the anchor
  94. // We can only follow anchors (use default otherwise)
  95. if($this->getInput('separator') === 'a') {
  96. $content = $this->followAnchor($item['uri']);
  97. // Return only actual content
  98. $item['content'] = $content->find('div#page', 0)->innertext;
  99. // Each page could have its own author and timestamp
  100. $author = $this->findAuthor($content);
  101. $timestamp = $this->findTimestamp($content);
  102. break;
  103. }
  104. case 'separator':
  105. default: // Use contents from the current page
  106. $item['content'] = $this->cleanArticle($section[2]);
  107. }
  108. if(!is_null($author)) $item['author'] = $author;
  109. if(!is_null($timestamp)) $item['timestamp'] = $timestamp;
  110. $item['title'] = strip_tags($section[1]);
  111. // Skip items with empty title
  112. if(empty(trim($item['title']))) {
  113. continue;
  114. }
  115. $this->items[] = $item;
  116. if($this->getInput('limit') > 0
  117. && count($this->items) >= $this->getInput('limit')) {
  118. break;
  119. }
  120. }
  121. }
  122. public function getName(){
  123. return $this->title ?: parent::getName();
  124. }
  125. public function getURI(){
  126. return $this->getInput('source') ?: parent::getURI();
  127. }
  128. /**
  129. * Splits the html into sections.
  130. *
  131. * Returns an array with one element per section. Each element consists of:
  132. * [0] The entire section
  133. * [1] The section title
  134. * [2] The section content
  135. */
  136. private function splitSections($html){
  137. $content = $html->find('div#page', 0)->innertext
  138. or returnServerError('Unable to find <div id="page"/>!');
  139. $sections = array();
  140. $regex = implode(
  141. '',
  142. array(
  143. "\<{$this->getInput('separator')}.+?(?=\>)\>",
  144. "(.+?)(?=\<\/{$this->getInput('separator')}\>)",
  145. "\<\/{$this->getInput('separator')}\>",
  146. "(.+?)((?=\<{$this->getInput('separator')})|(?=\<div\sid=\"pagebottom\")){1}"
  147. )
  148. );
  149. preg_match_all(
  150. '/' . $regex . '/m',
  151. $content,
  152. $sections,
  153. PREG_SET_ORDER
  154. );
  155. // Some pages don't use headers, return page as one feed
  156. if(count($sections) === 0) {
  157. return array(
  158. array(
  159. $content,
  160. $html->find('title', 0)->innertext,
  161. $content
  162. )
  163. );
  164. }
  165. return $sections;
  166. }
  167. /**
  168. * Returns the anchor for a given section
  169. */
  170. private function findSectionAnchor($section){
  171. $html = str_get_html($section);
  172. // For IDs
  173. $anchor = $html->find($this->getInput('separator') . '[id=]', 0);
  174. if(!is_null($anchor)) {
  175. return $this->getInput('source') . '#' . $anchor->id;
  176. }
  177. // For actual anchors
  178. $anchor = $html->find($this->getInput('separator') . '[href=]', 0);
  179. if(!is_null($anchor)) {
  180. return $anchor->href;
  181. }
  182. // Nothing found
  183. return $this->getInput('source');
  184. }
  185. /**
  186. * Returns the author
  187. *
  188. * Notice: Some pages don't provide author information
  189. */
  190. private function findAuthor($html){
  191. /* Example:
  192. * <p id="pageinfo" class="info" dir="ltr" lang="en">MoinMoin: LocalSpellingWords
  193. * (last edited 2017-02-16 15:36:31 by <span title="??? @ hosted-by.leaseweb.com
  194. * [178.162.199.143]">hosted-by</span>)</p>
  195. */
  196. $pageinfo = $html->find('[id="pageinfo"]', 0);
  197. if(is_null($pageinfo)) {
  198. return null;
  199. } else {
  200. $author = $pageinfo->find('[title=]', 0);
  201. if(is_null($author)) {
  202. return null;
  203. } else {
  204. return trim(explode('@', $author->title)[0]);
  205. }
  206. }
  207. }
  208. /**
  209. * Returns the time of last edit
  210. *
  211. * Notice: Some pages don't provide this information
  212. */
  213. private function findTimestamp($html){
  214. // See example of findAuthor()
  215. $pageinfo = $html->find('[id="pageinfo"]', 0);
  216. if(is_null($pageinfo)) {
  217. return null;
  218. } else {
  219. $timestamp = $pageinfo->innertext;
  220. $matches = array();
  221. preg_match('/.+?(?=\().+?(?=\d)([0-9\-\s\:]+)/m', $pageinfo, $matches);
  222. return strtotime($matches[1]);
  223. }
  224. }
  225. /**
  226. * Returns the original HTML with all anchors fixed (makes relative anchors
  227. * absolute)
  228. */
  229. private function fixAnchors($html, $source = null){
  230. $source = $source ?: $this->getURI();
  231. foreach($html->find('a') as $anchor) {
  232. switch(substr($anchor->href, 0, 1)) {
  233. case 'h': // http or https, no actions required
  234. break;
  235. case '/': // some relative path
  236. $anchor->href = $this->findDomain($source) . $anchor->href;
  237. break;
  238. case '#': // it's an ID
  239. default: // probably something like ? or &, skip empty ones
  240. if(!isset($anchor->href))
  241. break;
  242. $anchor->href = $source . $anchor->href;
  243. }
  244. }
  245. return $html;
  246. }
  247. /**
  248. * Loads the full article of a given anchor (if the anchor is from the same
  249. * wiki domain)
  250. */
  251. private function followAnchor($anchor){
  252. if(strrpos($anchor, $this->findDomain($this->getInput('source')) === false)) {
  253. return null;
  254. }
  255. $html = getSimpleHTMLDOMCached($anchor);
  256. if(!$html) { // Cannot load article
  257. return null;
  258. }
  259. return $this->fixAnchors($html, $anchor);
  260. }
  261. /**
  262. * Finds the domain for a given URI
  263. */
  264. private function findDomain($uri){
  265. $matches = array();
  266. preg_match('/(http[s]{0,1}:\/\/.+?(?=\/))/', $uri, $matches);
  267. return $matches[1];
  268. }
  269. /* This function is a copy from CNETBridge */
  270. private function stripWithDelimiters($string, $start, $end){
  271. while(strpos($string, $start) !== false) {
  272. $section_to_remove = substr($string, strpos($string, $start));
  273. $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end));
  274. $string = str_replace($section_to_remove, '', $string);
  275. }
  276. return $string;
  277. }
  278. /* This function is based on CNETBridge */
  279. private function cleanArticle($article_html){
  280. $article_html = $this->stripWithDelimiters($article_html, '<script', '</script>');
  281. return $article_html;
  282. }
  283. }