|
@@ -0,0 +1,327 @@
|
|
|
+<?php
|
|
|
+class MoinMoinBridge extends BridgeAbstract {
|
|
|
+
|
|
|
+    const MAINTAINER = 'logmanoriginal';
+    const NAME = 'MoinMoin Bridge';
+    const URI = 'https://moinmo.in';
+    const DESCRIPTION = 'Generates feeds for pages of a MoinMoin (compatible) wiki';
+
+    // Inputs offered to the user:
+    //  - source:    URI of the wiki page to turn into a feed
+    //  - separator: HTML tag used to split the page into feed items
+    //  - limit:     maximum number of items (values <= 0 mean "no limit")
+    //  - content:   how the content of each item is built
+    const PARAMETERS = array(
+        array(
+            'source' => array(
+                'name' => 'Source',
+                'type' => 'text',
+                'required' => true,
+                'title' => 'Insert wiki page URI (e.g.: https://moinmo.in/MoinMoin)',
+                'exampleValue' => 'https://moinmo.in/MoinMoin'
+            ),
+            'separator' => array(
+                'name' => 'Separator',
+                'type' => 'list',
+                // Was misspelled 'requied', which made the flag a silently ignored key
+                'required' => true,
+                'title' => 'Defines the separator for splitting content into feeds',
+                'defaultValue' => 'h2',
+                'values' => array(
+                    'Header (h1)' => 'h1',
+                    'Header (h2)' => 'h2',
+                    'Header (h3)' => 'h3',
+                    'List element (li)' => 'li',
+                    'Anchor (a)' => 'a'
+                )
+            ),
+            'limit' => array(
+                'name' => 'Limit',
+                'type' => 'number',
+                'required' => false,
+                'title' => 'Number of items to return (from top)',
+                'defaultValue' => -1
+            ),
+            'content' => array(
+                'name' => 'Content',
+                'type' => 'list',
+                'required' => false,
+                'title' => 'Defines how feed contents are built',
+                'defaultValue' => 'separator',
+                'values' => array(
+                    'By separator' => 'separator',
+                    'Follow link (only for anchor)' => 'follow',
+                    'None' => 'none'
+                )
+            )
+        )
+    );
+
+    // Feed title; stays empty until collectData() has read the page title
+    private $title = '';
|
|
|
+
|
|
|
+    /**
+     * Builds the feed items for the wiki page given by the 'source' input.
+     *
+     * The page is split into sections using the 'separator' input. Every
+     * section with a non-empty title becomes one item. Depending on the
+     * 'content' input an item carries no content ('none'), the content of the
+     * linked page ('follow', only when the separator is 'a') or the content of
+     * the section itself (everything else). At most 'limit' items are added
+     * when 'limit' is greater than zero.
+     */
+    public function collectData(){
+        /* MoinMoin uses a rather unpleasant representation of HTML. Instead of
+         * using tags like <article/>, <navigation/>, <header/>, etc... it uses
+         * <div/>, <span/> and <p/>. Also each line is literally identified via
+         * IDs. The only way to distinguish content is via headers, though not
+         * in all cases.
+         *
+         * Example (indented for the sake of readability):
+         * ...
+         * <span class="anchor" id="line-1"></span>
+         * <span class="anchor" id="line-2"></span>
+         * ...
+         * <span class="anchor" id="line-9"></span>
+         * <p class="line867">MoinMoin is a Wiki software implemented in
+         * <a class="interwiki" href="/Python" title="MoinMoin">Python</a>
+         * and distributed as Free Software under
+         * <a class="interwiki" href="/GPL" title="MoinMoin">GNU GPL license</a>.
+         * ...
+         */
+        $source = $this->getInput('source');
+
+        $html = getSimpleHTMLDOM($source)
+            or returnServerError('Could not load ' . $source);
+
+        // Some anchors link to local sites or local IDs (both don't work well
+        // in feeds)
+        $html = $this->fixAnchors($html);
+
+        // Without a <title> element the title stays empty, so getName() falls
+        // back to the bridge name
+        $titleElement = $html->find('title', 0);
+        if(!is_null($titleElement)){
+            $this->title = $titleElement->innertext . ' | ' . self::NAME;
+        }
+
+        // Author and timestamp of the source page are the defaults for every
+        // item. They are replaced per item when an anchor is followed.
+        $pageAuthor = $this->findAuthor($html);
+        $pageTimestamp = $this->findTimestamp($html);
+
+        $contentMode = $this->getInput('content');
+
+        // We can only follow anchors (section content is used otherwise)
+        $follow = $contentMode === 'follow' && $this->getInput('separator') === 'a';
+
+        $limit = $this->getInput('limit');
+
+        foreach($this->splitSections($html) as $section){
+            // Skip items with empty title before doing any expensive work.
+            // A strict comparison is used because empty() would also drop a
+            // title of "0".
+            $title = strip_tags($section[1]);
+            if(trim($title) === ''){
+                continue;
+            }
+
+            $item = array();
+            $item['uri'] = $this->findSectionAnchor($section[0]);
+            $item['title'] = $title;
+
+            $author = $pageAuthor;
+            $timestamp = $pageTimestamp;
+
+            if($contentMode !== 'none'){
+                $linked = null;
+                $article = null;
+
+                if($follow){
+                    // followAnchor() returns null for foreign or unloadable
+                    // pages, and a loaded page may still lack <div id="page">
+                    $linked = $this->followAnchor($item['uri']);
+                    if($linked){
+                        $article = $linked->find('div#page', 0);
+                    }
+                }
+
+                if(!is_null($article)){
+                    // Return only actual content
+                    $item['content'] = $article->innertext;
+
+                    // Each page could have its own author and timestamp
+                    $author = $this->findAuthor($linked);
+                    $timestamp = $this->findTimestamp($linked);
+                } else {
+                    // Use contents from the current page
+                    $item['content'] = $this->cleanArticle($section[2]);
+                }
+            }
+
+            if(!is_null($author)){
+                $item['author'] = $author;
+            }
+
+            // A timestamp that could not be parsed must not reach the feed
+            if(!is_null($timestamp) && $timestamp !== false){
+                $item['timestamp'] = $timestamp;
+            }
+
+            $this->items[] = $item;
+
+            if($limit > 0 && count($this->items) >= $limit){
+                break;
+            }
+        }
+    }
|
|
|
+
|
|
|
+    /**
+     * Returns the feed title collected from the wiki page, or the name
+     * provided by the parent class while no title is available.
+     */
+    public function getName(){
+        if($this->title){
+            return $this->title;
+        }
+
+        return parent::getName();
+    }
|
|
|
+
|
|
|
+    /**
+     * Returns the 'source' input as the feed URI, or the URI provided by the
+     * parent class while no source is set.
+     */
+    public function getURI(){
+        $source = $this->getInput('source');
+
+        return $source ? $source : parent::getURI();
+    }
|
|
|
+
|
|
|
+    /**
+     * Splits the html into sections.
+     *
+     * A section starts at an opening tag of the configured separator and ends
+     * right before the next separator tag, before <div id="pagebottom"> or at
+     * the end of the page content, whichever comes first.
+     *
+     * Returns an array with one element per section. Each element consists of:
+     * [0] The entire section
+     * [1] The section title
+     * [2] The section content
+     *
+     * If the page contains no separator at all, a single section covering the
+     * whole page is returned.
+     */
+    private function splitSections($html){
+        // Look the element up first; reading ->innertext from a missing
+        // element would hide the real problem behind a PHP warning
+        $page = $html->find('div#page', 0);
+        $content = is_null($page) ? '' : $page->innertext;
+
+        if(!$content){
+            returnServerError('Unable to find <div id="page"/>!');
+        }
+
+        $tag = preg_quote((string)$this->getInput('separator'), '/');
+
+        $regex = '/'
+            // Opening tag with optional attributes. The word boundary keeps
+            // 'a' from matching <abbr> and 'li' from matching <link>.
+            . '<' . $tag . '\b[^>]*>'
+            // [1] Title: everything up to the closing tag
+            . '(.+?)'
+            . '<\/' . $tag . '>'
+            // [2] Content: everything up to the next section or the page end
+            . '(.+?)'
+            . '(?=<' . $tag . '\b|<div\sid="pagebottom"|\z)'
+            // 's' lets '.' span line breaks in case the markup is multi-line
+            . '/s';
+
+        $sections = array();
+
+        preg_match_all($regex, $content, $sections, PREG_SET_ORDER);
+
+        if(count($sections) > 0){
+            return $sections;
+        }
+
+        // Some pages don't use headers, return page as one feed
+        $titleElement = $html->find('title', 0);
+
+        return array(
+            array(
+                $content,
+                is_null($titleElement) ? $this->getInput('source') : $titleElement->innertext,
+                $content
+            )
+        );
+    }
|
|
|
+
|
|
|
+    /**
+     * Returns the anchor for a given section
+     *
+     * The first separator element with an ID yields "<source>#<id>". If there
+     * is none, the href of the first separator element with an href is
+     * returned. The source URI is returned if neither exists or the section
+     * cannot be parsed.
+     */
+    private function findSectionAnchor($section){
+        $source = $this->getInput('source');
+        $separator = $this->getInput('separator');
+
+        // str_get_html() returns false for empty or oversized input
+        $html = str_get_html($section);
+        if(!$html){
+            return $source;
+        }
+
+        // For IDs
+        $anchor = $html->find($separator . '[id=]', 0);
+        if(!is_null($anchor)){
+            return $source . '#' . $anchor->id;
+        }
+
+        // For actual anchors
+        $anchor = $html->find($separator . '[href=]', 0);
+        if(!is_null($anchor)){
+            return $anchor->href;
+        }
+
+        // Nothing found
+        return $source;
+    }
|
|
|
+
|
|
|
+    /**
+     * Returns the author of the last edit, or null if the page provides none
+     *
+     * The author is the part before the first '@' in the title attribute of
+     * the first titled element inside the page info.
+     */
+    private function findAuthor($html){
+        /* Example:
+         * <p id="pageinfo" class="info" dir="ltr" lang="en">MoinMoin: LocalSpellingWords
+         * (last edited 2017-02-16 15:36:31 by <span title="??? @ hosted-by.leaseweb.com
+         * [178.162.199.143]">hosted-by</span>)</p>
+         */
+        $info = $html->find('[id="pageinfo"]', 0);
+        if(is_null($info)){
+            return null;
+        }
+
+        $editor = $info->find('[title=]', 0);
+        if(is_null($editor)){
+            return null;
+        }
+
+        $parts = explode('@', $editor->title);
+
+        return trim($parts[0]);
+    }
|
|
|
+
|
|
|
+    /**
+     * Returns the time of last edit as Unix timestamp
+     *
+     * Returns null if the page provides no page info, the page info contains
+     * no date or the date cannot be parsed.
+     */
+    private function findTimestamp($html){
+        /* Example:
+         * <p id="pageinfo" class="info" dir="ltr" lang="en">MoinMoin: LocalSpellingWords
+         * (last edited 2017-02-16 15:36:31 by <span ...>hosted-by</span>)</p>
+         */
+        $pageinfo = $html->find('[id="pageinfo"]', 0);
+
+        if(is_null($pageinfo)){
+            return null;
+        }
+
+        // Match against the text of the element, not the element object.
+        // The date is the first run of digits, dashes, colons and whitespace
+        // after the first opening parenthesis.
+        $matches = array();
+        if(preg_match('/\(.*?(\d[0-9\-\s\:]*)/', $pageinfo->innertext, $matches) !== 1){
+            return null;
+        }
+
+        // strtotime() returns false for text it cannot parse
+        $timestamp = strtotime(trim($matches[1]));
+
+        return $timestamp === false ? null : $timestamp;
+    }
|
|
|
+
|
|
|
+    /**
+     * Returns the original HTML with all anchors fixed (makes relative anchors
+     * absolute)
+     *
+     * $source is the URI the HTML was loaded from and defaults to getURI().
+     * Anchors are rewritten in place:
+     *  - no href attribute:          left untouched
+     *  - href with a scheme:         left untouched (http:, https:, mailto:, ...)
+     *  - protocol-relative (//host): prefixed with the scheme of $source
+     *  - root-relative (/path):      prefixed with the domain of $source
+     *  - anything else (#id, ?query): appended to $source
+     */
+    private function fixAnchors($html, $source = null){
+
+        $source = $source ?: $this->getURI();
+
+        // Resolved on first use only; most pages need it, some never do
+        $domain = null;
+
+        foreach($html->find('a') as $anchor){
+            // Named anchors without href have nothing to fix
+            if(!isset($anchor->href)){
+                continue;
+            }
+
+            $href = $anchor->href;
+
+            // Checking the first character only is not enough: "help" is not
+            // absolute while "mailto:..." is
+            if(preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href) === 1){
+                continue;
+            }
+
+            if(substr($href, 0, 2) === '//'){
+                $scheme = parse_url($source, PHP_URL_SCHEME);
+                $anchor->href = ($scheme ?: 'https') . ':' . $href;
+                continue;
+            }
+
+            if(substr($href, 0, 1) === '/'){ // some relative path
+                if(is_null($domain)){
+                    $domain = (string)$this->findDomain($source);
+                }
+
+                $anchor->href = $domain . $href;
+                continue;
+            }
+
+            // It's an ID or probably something like ? or &
+            $anchor->href = $source . $href;
+        }
+
+        return $html;
+    }
|
|
|
+
|
|
|
+    /**
+     * Loads the full article of a given anchor (if the anchor is from the same
+     * wiki domain)
+     *
+     * Returns the loaded HTML with fixed anchors, or null if the anchor points
+     * to a different domain or the page cannot be loaded.
+     */
+    private function followAnchor($anchor){
+        $domain = $this->findDomain($this->getInput('source'));
+
+        // The anchor must start with the wiki domain. The previous check
+        // compared the domain to false inside the strrpos() call and therefore
+        // never tested the anchor at all.
+        if(empty($domain) || strpos($anchor, $domain) !== 0){
+            return null;
+        }
+
+        // Reject hosts which merely share the prefix (example.org vs
+        // example.org.evil.net): the domain must be followed by a path, query
+        // or fragment, or by nothing at all
+        $rest = (string)substr($anchor, strlen($domain));
+        if($rest !== '' && strpos('/?#', $rest[0]) === false){
+            return null;
+        }
+
+        $html = getSimpleHTMLDOMCached($anchor);
+        if(!$html){ // Cannot load article
+            return null;
+        }
+
+        return $this->fixAnchors($html, $anchor);
+    }
|
|
|
+
|
|
|
+    /**
+     * Finds the domain for a given URI
+     *
+     * Returns scheme and host (including port, if any) without trailing
+     * slash, e.g. "https://moinmo.in" for "https://moinmo.in/MoinMoin".
+     * Returns null if the URI contains no http(s) domain.
+     */
+    private function findDomain($uri){
+        $matches = array();
+
+        // The host ends at the first '/', '?' or '#' or at the end of the URI.
+        // Requiring a '/' after the host would fail for "https://moinmo.in".
+        if(preg_match('/(https?:\/\/[^\/?#]+)/', $uri, $matches) !== 1){
+            return null;
+        }
+
+        return $matches[1];
+    }
|
|
|
+
|
|
|
+    /**
+     * Removes every part of $string which starts with $start and ends with the
+     * next $end (both delimiters included)
+     *
+     * A part which is started but never ended is removed up to the end of the
+     * string. The string is returned unchanged if a delimiter is empty.
+     *
+     * This function is based on CNETBridge
+     */
+    private function stripWithDelimiters($string, $start, $end){
+        // Empty delimiters match everywhere and would never terminate
+        if($start === '' || $end === ''){
+            return $string;
+        }
+
+        while(($from = strpos($string, $start)) !== false){
+            // Look for the end only behind the start delimiter
+            $to = strpos($string, $end, $from + strlen($start));
+
+            if($to === false){
+                // Unterminated part: nothing behind it can be trusted. The
+                // previous code removed strlen($end) arbitrary characters here.
+                return (string)substr($string, 0, $from);
+            }
+
+            $string = substr($string, 0, $from) . substr($string, $to + strlen($end));
+        }
+
+        return $string;
+    }
|
|
|
+
|
|
|
+    /**
+     * Returns the given article HTML without its <script> elements
+     *
+     * This function is based on CNETBridge
+     */
+    private function cleanArticle($article_html){
+        return $this->stripWithDelimiters($article_html, '<script', '</script>');
+    }
|
|
|
+}
|