Merge branch 'MoinMoinBridge' of https://github.com/logmanoriginal/rss-bridge

2017-02-18 10:16:05 +01:00 · 2017-02-18 10:16:05 +01:00 · 790bd17d41
commit 790bd17d41
parent 801ea837c9 1dcef02f27
1 changed files with 327 additions and 0 deletions
--- a/bridges/MoinMoinBridge.php
+++ b/bridges/MoinMoinBridge.php
@ -0,0 +1,327 @@
+<?php
+class MoinMoinBridge extends BridgeAbstract {
+
+	// Bridge metadata
+	const MAINTAINER = 'logmanoriginal';
+	const NAME = 'MoinMoin Bridge';
+	const URI = 'https://moinmo.in';
+	const DESCRIPTION = 'Generates feeds for pages of a MoinMoin (compatible) wiki';
+
+	/* Input parameters of the bridge:
+	 * - source:    URI of the wiki page to turn into a feed
+	 * - separator: HTML tag used to split the page into feed items
+	 * - limit:     maximum number of items to return (values <= 0 disable the
+	 *              limit, see collectData())
+	 * - content:   how the content of each item is built
+	 */
+	const PARAMETERS = array(
+		array(
+			'source' => array(
+				'name' => 'Source',
+				'type' => 'text',
+				'required' => true,
+				'title' => 'Insert wiki page URI (e.g.: https://moinmo.in/MoinMoin)',
+				'exampleValue' => 'https://moinmo.in/MoinMoin'
+			),
+			'separator' => array(
+				'name' => 'Separator',
+				'type' => 'list',
+				'required' => true,
+				'title' => 'Defines the separator for splitting content into feeds',
+				'defaultValue' => 'h2',
+				'values' => array(
+					'Header (h1)' => 'h1',
+					'Header (h2)' => 'h2',
+					'Header (h3)' => 'h3',
+					'List element (li)' => 'li',
+					'Anchor (a)' => 'a'
+				)
+			),
+			'limit' => array(
+				'name' => 'Limit',
+				'type' => 'number',
+				'required' => false,
+				'title' => 'Number of items to return (from top)',
+				'defaultValue' => -1
+			),
+			'content' => array(
+				'name' => 'Content',
+				'type' => 'list',
+				'required' => false,
+				'title' => 'Defines how feed contents are built',
+				'defaultValue' => 'separator',
+				'values' => array(
+					'By separator' => 'separator',
+					'Follow link (only for anchor)' => 'follow',
+					'None' => 'none'
+				)
+			)
+		)
+	);
+
+	// Feed title, set by collectData() once the page has been loaded
+	private $title = '';
+
+	/**
+	 * Loads the wiki page given by the 'source' input, splits it into sections
+	 * and appends one feed item per section to $this->items.
+	 *
+	 * Depending on the 'content' input an item carries no content ('none'),
+	 * the content of the linked page ('follow', only for the anchor separator)
+	 * or the content of the section itself (any other value).
+	 */
+	public function collectData(){
+		/* MoinMoin uses a rather unpleasant representation of HTML. Instead of
+		 * using tags like <article/>, <navigation/>, <header/>, etc... it uses
+		 * <div/>, <span/> and <p/>. Also each line is literally identified via
+		 * IDs. The only way to distinguish content is via headers, though not
+		 * in all cases.
+		 *
+		 * Example (indented for the sake of readability):
+		 * ...
+		 * <span class="anchor" id="line-1"></span>
+		 * <span class="anchor" id="line-2"></span>
+		 * <span class="anchor" id="line-3"></span>
+		 * <span class="anchor" id="line-4"></span>
+		 * <span class="anchor" id="line-5"></span>
+		 * <span class="anchor" id="line-6"></span>
+		 * <span class="anchor" id="line-7"></span>
+		 * <span class="anchor" id="line-8"></span>
+		 * <span class="anchor" id="line-9"></span>
+		 *   <p class="line867">MoinMoin is a Wiki software implemented in
+		 *     <a class="interwiki" href="/Python" title="MoinMoin">Python</a>
+		 *   and distributed as Free Software under
+		 *     <a class="interwiki" href="/GPL" title="MoinMoin">GNU GPL license</a>.
+		 * ...
+		 */
+		$html = getSimpleHTMLDOM($this->getInput('source'))
+			or returnServerError('Could not load ' . $this->getInput('source'));
+
+		// Some anchors link to local sites or local IDs (both don't work well
+		// in feeds)
+		$html = $this->fixAnchors($html);
+
+		// A page without <title/> still gets a usable feed name
+		$pageTitle = $html->find('title', 0);
+		$this->title = is_null($pageTitle)
+			? self::NAME
+			: $pageTitle->innertext . ' | ' . self::NAME;
+
+		// Author and timestamp of the given page. They are the default for
+		// every item and only replaced per item when an anchor is followed.
+		$pageAuthor = $this->findAuthor($html);
+		$pageTimestamp = $this->findTimestamp($html);
+
+		$separator = $this->getInput('separator');
+		$contentMode = $this->getInput('content');
+		$limit = $this->getInput('limit');
+
+		foreach($this->splitSections($html) as $section){
+			$title = strip_tags($section[1]);
+
+			// Skip items with empty title. This is checked before anything
+			// else so no page is fetched for an item that is dropped anyway.
+			// (A strict comparison is used because empty() would also drop
+			// a section titled "0".)
+			if(trim($title) === ''){
+				continue;
+			}
+
+			$item = array();
+			$item['uri'] = $this->findSectionAnchor($section[0]);
+
+			// Per-item copies, so data of a followed page never leaks into
+			// the following items
+			$author = $pageAuthor;
+			$timestamp = $pageTimestamp;
+
+			switch($contentMode){
+				case 'none': // Do not return any content
+					break;
+				case 'follow': // Follow the anchor
+					// We can only follow anchors (use default otherwise)
+					if($separator === 'a'){
+						$linked = $this->followAnchor($item['uri']);
+
+						// followAnchor() returns null for foreign or
+						// unloadable pages, and a loaded page may lack
+						// <div id="page"/>. Use the section content then.
+						$body = is_null($linked) ? null : $linked->find('div#page', 0);
+
+						if(!is_null($body)){
+							// Return only actual content
+							$item['content'] = $body->innertext;
+
+							// Each page could have its own author and timestamp
+							$author = $this->findAuthor($linked);
+							$timestamp = $this->findTimestamp($linked);
+
+							break;
+						}
+					}
+					// Intentional fall through
+				case 'separator':
+				default: // Use contents from the current page
+					$item['content'] = $this->cleanArticle($section[2]);
+			}
+
+			if(!is_null($author)) $item['author'] = $author;
+			if(!is_null($timestamp)) $item['timestamp'] = $timestamp;
+			$item['title'] = $title;
+
+			$this->items[] = $item;
+
+			// A limit of zero or below means "no limit"
+			if($limit > 0 && count($this->items) >= $limit){
+				break;
+			}
+		}
+	}
+
+	/**
+	 * Returns the feed title collected from the wiki page, or the name
+	 * provided by the parent class while no title is available.
+	 */
+	public function getName(){
+		if($this->title){
+			return $this->title;
+		}
+
+		return parent::getName();
+	}
+
+	/**
+	 * Returns the 'source' input, or the URI provided by the parent class
+	 * while no source is available.
+	 */
+	public function getURI(){
+		$source = $this->getInput('source');
+
+		if($source){
+			return $source;
+		}
+
+		return parent::getURI();
+	}
+
+	/**
+	 * Splits the html into sections.
+	 *
+	 * A section starts at an element of the type selected by the 'separator'
+	 * input and ends right before the next element of that type (or before
+	 * <div id="pagebottom"/>).
+	 *
+	 * Returns an array with one element per section. Each element consists of:
+	 * [0] The entire section
+	 * [1] The section title
+	 * [2] The section content
+	 *
+	 * If no section is found the whole page is returned as a single section
+	 * that uses the page title as section title.
+	 */
+	private function splitSections($html){
+		// find() returns null if the element is missing, so the element is
+		// checked before its content is read
+		$page = $html->find('div#page', 0);
+		$content = is_null($page) ? null : $page->innertext;
+
+		if(!$content){
+			returnServerError('Unable to find <div id="page"/>!');
+		}
+
+		$sections = array();
+
+		// The separator becomes part of the pattern and is therefore quoted
+		$separator = preg_quote((string)$this->getInput('separator'), '/');
+
+		$regex = implode(
+			'',
+			array(
+				'\<' . $separator . '.+?(?=\>)\>',         // opening tag
+				'(.+?)(?=\<\/' . $separator . '\>)',       // [1] title
+				'\<\/' . $separator . '\>',                // closing tag
+				'(.+?)((?=\<' . $separator . ')|(?=\<div\sid=\"pagebottom\"))' // [2] content
+			)
+		);
+
+		// The pattern has no line anchors, so the 'm' modifier had no effect.
+		// 's' lets '.' match line breaks, so sections that span several lines
+		// are found as well.
+		preg_match_all(
+			'/' . $regex . '/s',
+			$content,
+			$sections,
+			PREG_SET_ORDER
+		);
+
+		// Some pages don't use headers, return page as one feed
+		if(!is_array($sections) || count($sections) === 0){
+			$title = $html->find('title', 0);
+
+			return array(
+				array(
+					$content,
+					is_null($title) ? '' : $title->innertext,
+					$content
+				)
+			);
+		}
+
+		return $sections;
+	}
+
+	/**
+	 * Returns the anchor for a given section
+	 *
+	 * The first separator element with an ID yields "<source>#<id>", otherwise
+	 * the first separator element with a href yields that href. If neither
+	 * exists (or the section cannot be parsed) the source URI is returned.
+	 */
+	private function findSectionAnchor($section){
+		$source = $this->getInput('source');
+		$separator = $this->getInput('separator');
+
+		// NOTE(review): str_get_html() is expected to return false for input
+		// it refuses to parse (e.g. empty or oversized strings) - confirm
+		// against the parser version in use. Calling find() on that result
+		// would be fatal, so fall back to the source URI.
+		$html = str_get_html($section);
+		if(!$html){
+			return $source;
+		}
+
+		// For IDs
+		$anchor = $html->find($separator . '[id=]', 0);
+		if(!is_null($anchor)){
+			return $source . '#' . $anchor->id;
+		}
+
+		// For actual anchors
+		$anchor = $html->find($separator . '[href=]', 0);
+		if(!is_null($anchor)){
+			return $anchor->href;
+		}
+
+		// Nothing found
+		return $source;
+	}
+
+	/**
+	 * Returns the author of the page, or null if the page provides none
+	 *
+	 * The author is the part in front of the first '@' in the title attribute
+	 * of the first titled element inside the page info.
+	 */
+	private function findAuthor($html){
+		/* Example:
+		 * <p id="pageinfo" class="info" dir="ltr" lang="en">MoinMoin: LocalSpellingWords
+		 * (last edited 2017-02-16 15:36:31 by <span title="??? @ hosted-by.leaseweb.com
+		 * [178.162.199.143]">hosted-by</span>)</p>
+		*/
+		$info = $html->find('[id="pageinfo"]', 0);
+
+		if(!is_null($info)){
+			$editor = $info->find('[title=]', 0);
+
+			if(!is_null($editor)){
+				$parts = explode('@', $editor->title);
+				return trim($parts[0]);
+			}
+		}
+
+		// No page info or no titled element
+		return null;
+	}
+
+	/**
+	 * Returns the time of last edit as Unix timestamp
+	 *
+	 * Notice: Some pages don't provide this information. null is returned if
+	 * the page info is missing or contains no date that can be parsed.
+	 */
+	private function findTimestamp($html){
+		// See example of findAuthor()
+		$pageinfo = $html->find('[id="pageinfo"]', 0);
+
+		if(is_null($pageinfo)){
+			return null;
+		}
+
+		// Match against the text of the element, not the element object
+		$text = (string)$pageinfo->innertext;
+		$matches = array();
+
+		// Skip to the first '(' and from there to the first digit, then take
+		// the run of digits, dashes, blanks and colons ("2017-02-16 15:36:31").
+		// The leading part is optional because the text may start with '('.
+		if(preg_match('/.*?(?=\().+?(?=\d)([0-9\-\s\:]+)/m', $text, $matches) !== 1){
+			return null;
+		}
+
+		// strtotime() signals failure with false, callers expect null
+		$timestamp = strtotime(trim($matches[1]));
+
+		return $timestamp === false ? null : $timestamp;
+	}
+
+	/**
+	 * Returns the original HTML with all anchors fixed (makes relative anchors
+	 * absolute)
+	 *
+	 * $source is the URI the HTML was loaded from (defaults to getURI()).
+	 *
+	 * - anchors without href are skipped
+	 * - hrefs with a scheme (http:, https:, mailto:, ...) are left untouched
+	 * - protocol-relative hrefs (//host/path) get the scheme of $source
+	 * - hrefs starting with '/' are prefixed with the domain of $source
+	 * - everything else (#id, ?query, ...) is appended to $source
+	 */
+	private function fixAnchors($html, $source = null){
+
+		$source = $source ?: $this->getURI();
+
+		// Looked up on first use only, it is the same for every anchor
+		$domain = null;
+
+		foreach($html->find('a') as $anchor){
+			if(!isset($anchor->href) || !is_string($anchor->href)){
+				continue;
+			}
+
+			$href = $anchor->href;
+
+			// Already absolute. Checking for a scheme instead of a leading
+			// 'h' keeps mailto:, ftp:, etc. intact and no longer mistakes
+			// relative hrefs that merely start with 'h' for absolute ones.
+			if(preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href) === 1){
+				continue;
+			}
+
+			if(substr($href, 0, 2) === '//'){ // protocol-relative
+				$scheme = parse_url($source, PHP_URL_SCHEME);
+				$anchor->href = ($scheme ?: 'https') . ':' . $href;
+				continue;
+			}
+
+			if(substr($href, 0, 1) === '/'){ // some relative path
+				if(is_null($domain)){
+					$domain = (string)$this->findDomain($source);
+				}
+
+				$anchor->href = $domain . $href;
+				continue;
+			}
+
+			// it's an ID or probably something like ? or &
+			$anchor->href = $source . $href;
+		}
+
+		return $html;
+	}
+
+	/**
+	 * Loads the full article of a given anchor (if the anchor is from the same
+	 * wiki domain)
+	 *
+	 * Returns the loaded HTML with fixed anchors, or null if the anchor points
+	 * to a different domain or the article cannot be loaded.
+	 */
+	private function followAnchor($anchor){
+		$domain = $this->findDomain($this->getInput('source'));
+
+		if(!is_string($anchor) || !is_string($domain) || $domain === ''){
+			return null;
+		}
+
+		// The anchor must start with the wiki domain, followed by nothing or
+		// by the start of a path, query or fragment. The latter rejects
+		// look-alike hosts such as "<domain>.example.org".
+		// (The comparison with false used to be inside the strrpos() call, so
+		// the domain was never actually checked.)
+		$next = (string)substr($anchor, strlen($domain), 1);
+		$sameDomain = stripos($anchor, $domain) === 0
+			&& ($next === '' || strpos('/?#', $next) !== false);
+
+		if(!$sameDomain){
+			return null;
+		}
+
+		$html = getSimpleHTMLDOMCached($anchor);
+		if(!$html){ // Cannot load article
+			return null;
+		}
+
+		return $this->fixAnchors($html, $anchor);
+	}
+
+	/**
+	 * Finds the domain for a given URI
+	 *
+	 * Returns scheme and authority of the first http(s) URI found in $uri
+	 * (e.g. "https://moinmo.in" for "https://moinmo.in/MoinMoin"), or null if
+	 * there is none. URIs without a path ("https://moinmo.in") are supported.
+	 */
+	private function findDomain($uri){
+		$matches = array();
+
+		// The authority ends at the first '/', '?' or '#' or at the end of
+		// the string
+		if(preg_match('/(https?:\/\/[^\/?#]+)/i', (string)$uri, $matches) !== 1){
+			return null;
+		}
+
+		return $matches[1];
+	}
+
+	/* This function is based on the one from CNETBridge
+	 *
+	 * Removes every part of $string that starts with $start and ends with the
+	 * next $end (both included). A part that is never closed by $end is
+	 * removed up to the end of $string.
+	 */
+	private function stripWithDelimiters($string, $start, $end){
+		// An empty delimiter cannot mark a section
+		if((string)$start === '' || (string)$end === ''){
+			return $string;
+		}
+
+		while(($from = strpos($string, $start)) !== false){
+			$to = strpos($string, $end, $from + strlen($start));
+
+			// strpos() returns false if $end is missing. Using that as an
+			// offset would remove an arbitrary piece of text, so the
+			// unterminated section is cut off instead.
+			if($to === false){
+				return (string)substr($string, 0, $from);
+			}
+
+			$string = substr($string, 0, $from) . substr($string, $to + strlen($end));
+		}
+
+		return $string;
+	}
+
+	/* This function is based on CNETBridge
+	 *
+	 * Returns the given HTML without <script/> elements.
+	 */
+	private function cleanArticle($article_html){
+		return $this->stripWithDelimiters($article_html, '<script', '</script>');
+	}
+}