rss-bridge/bridges/MoinMoinBridge.php

<?php
/**
 * Bridge that turns a single page of a MoinMoin (compatible) wiki into a feed.
 *
 * The page is split into sections at a configurable separator tag (headers,
 * list elements or anchors). Every section with a non-empty title becomes one
 * feed item. Optionally the content of an item is loaded from the page an
 * anchor points to (only for the anchor separator).
 */
class MoinMoinBridge extends BridgeAbstract {

	const MAINTAINER = 'logmanoriginal';
	const NAME = 'MoinMoin Bridge';
	const URI = 'https://moinmo.in';
	const DESCRIPTION = 'Generates feeds for pages of a MoinMoin (compatible) wiki';
	const PARAMETERS = array(
		array(
			// URI of the wiki page to generate the feed from
			'source' => array(
				'name' => 'Source',
				'type' => 'text',
				'required' => true,
				'title' => 'Insert wiki page URI (e.g.: https://moinmo.in/MoinMoin)',
				'exampleValue' => 'https://moinmo.in/MoinMoin'
			),
			// Tag name used by splitSections() to cut the page into items
			'separator' => array(
				'name' => 'Separator',
				'type' => 'list',
				'required' => true,
				'title' => 'Defines the separator for splitting content into feeds',
				'defaultValue' => 'h2',
				'values' => array(
					'Header (h1)' => 'h1',
					'Header (h2)' => 'h2',
					'Header (h3)' => 'h3',
					'List element (li)' => 'li',
					'Anchor (a)' => 'a'
				)
			),
			// Maximum number of items; values <= 0 disable the limit
			'limit' => array(
				'name' => 'Limit',
				'type' => 'number',
				'required' => false,
				'title' => 'Number of items to return (from top)',
				'defaultValue' => -1
			),
			// Where the content of each item comes from (see collectData())
			'content' => array(
				'name' => 'Content',
				'type' => 'list',
				'required' => false,
				'title' => 'Defines how feed contents are built',
				'defaultValue' => 'separator',
				'values' => array(
					'By separator' => 'separator',
					'Follow link (only for anchor)' => 'follow',
					'None' => 'none'
				)
			)
		)
	);

	/** Feed title; filled by collectData() and returned by getName() */
	private $title = '';

	/**
	 * Loads the source page, splits it into sections and adds one feed item
	 * per section with a non-empty title to $this->items.
	 */
	public function collectData(){
		/* MoinMoin uses a rather unpleasant representation of HTML. Instead of
		 * using tags like <article/>, <navigation/>, <header/>, etc... it uses
		 * <div/>, <span/> and <p/>. Also each line is literally identified via
		 * IDs. The only way to distinguish content is via headers, though not
		 * in all cases.
		 *
		 * Example (indented for the sake of readability):
		 * ...
		 * <span class="anchor" id="line-1"></span>
		 * <span class="anchor" id="line-2"></span>
		 * <span class="anchor" id="line-3"></span>
		 * <span class="anchor" id="line-4"></span>
		 * <span class="anchor" id="line-5"></span>
		 * <span class="anchor" id="line-6"></span>
		 * <span class="anchor" id="line-7"></span>
		 * <span class="anchor" id="line-8"></span>
		 * <span class="anchor" id="line-9"></span>
		 *   <p class="line867">MoinMoin is a Wiki software implemented in
		 *     <a class="interwiki" href="/Python" title="MoinMoin">Python</a>
		 *   and distributed as Free Software under
		 *     <a class="interwiki" href="/GPL" title="MoinMoin">GNU GPL license</a>.
		 * ...
		 */
		$html = getSimpleHTMLDOM($this->getInput('source'))
			or returnServerError('Could not load ' . $this->getInput('source'));

		// Some anchors link to local sites or local IDs (both don't work well
		// in feeds)
		$html = $this->fixAnchors($html);

		// Pages without a <title/> fall back to the plain bridge name
		$titleNode = $html->find('title', 0);
		$this->title = is_null($titleNode)
			? self::NAME
			: $titleNode->innertext . ' | ' . self::NAME;

		// Author and timestamp of the source page are the defaults for every
		// item. They are replaced per item when an anchor is followed.
		$pageAuthor = $this->findAuthor($html);
		$pageTimestamp = $this->findTimestamp($html);

		$contentMode = $this->getInput('content');

		// We can only follow anchors (section content is used otherwise)
		$followAnchors = $contentMode === 'follow'
			&& $this->getInput('separator') === 'a';

		$limit = (int)$this->getInput('limit');

		foreach($this->splitSections($html) as $section){
			$title = strip_tags($section[1]);

			// Skip items with empty title. This is done before anything else
			// so no remote page is loaded for an item that is dropped anyway.
			// A strict comparison is used because empty() also treats the
			// title "0" as empty.
			if(trim($title) === ''){
				continue;
			}

			$item = array();
			$item['uri'] = $this->findSectionAnchor($section[0]);
			$item['title'] = $title;

			$author = $pageAuthor;
			$timestamp = $pageTimestamp;

			if($contentMode !== 'none'){ // 'none': do not return any content
				$content = null;

				if($followAnchors){
					$linked = $this->followAnchor($item['uri']);
					$linkedPage = is_null($linked) ? null : $linked->find('div#page', 0);

					// Foreign domains, unreachable pages and pages without
					// <div id="page"/> leave $content unset, so the section
					// content is used instead.
					if(!is_null($linkedPage)){
						// Return only actual content
						$content = $linkedPage->innertext;

						// Each page could have its own author and timestamp
						$author = $this->findAuthor($linked);
						$timestamp = $this->findTimestamp($linked);
					}
				}

				if(is_null($content)){ // Use contents from the current page
					$content = $this->cleanArticle($section[2]);
				}

				$item['content'] = $content;
			}

			if(!is_null($author)) $item['author'] = $author;
			if(!is_null($timestamp)) $item['timestamp'] = $timestamp;

			$this->items[] = $item;

			if($limit > 0 && count($this->items) >= $limit){
				break;
			}
		}
	}

	/**
	 * Returns the feed title collected from the page, or the name provided by
	 * the parent class if no data has been collected yet.
	 */
	public function getName(){
		return $this->title ?: parent::getName();
	}

	/**
	 * Returns the source page URI, or the URI provided by the parent class if
	 * no source is set.
	 */
	public function getURI(){
		return $this->getInput('source') ?: parent::getURI();
	}

	/**
	 * Splits the html into sections.
	 *
	 * Returns an array with one element per section. Each element consists of:
	 * [0] The entire section
	 * [1] The section title
	 * [2] The section content
	 *
	 * If the separator is not found on the page, the whole page is returned
	 * as a single section that uses the page title as section title.
	 */
	private function splitSections($html){
		$page = $html->find('div#page', 0);
		$content = is_null($page) ? '' : $page->innertext;

		if(!$content){
			returnServerError('Unable to find <div id="page"/>!');
		}

		$sections = array();

		$tag = preg_quote((string)$this->getInput('separator'), '/');

		// "\b[^>]*" accepts opening tags with and without attributes and keeps
		// the tag name exact (e.g. "a" must not match "<abbr>").
		// NOTE(review): the "m" modifier is kept from the original pattern; it
		// has no effect as long as the pattern has no ^ or $ anchors.
		$regex = '/'
			. '<' . $tag . '\b[^>]*>'                            // opening tag
			. '(.+?)(?=<\/' . $tag . '>)'                        // [1] title
			. '<\/' . $tag . '>'                                 // closing tag
			. '(.+?)(?=<' . $tag . '\b|<div\sid="pagebottom")'   // [2] content
			. '/m';

		preg_match_all($regex, $content, $sections, PREG_SET_ORDER);

		// Some pages don't use headers, return page as one feed
		if(count($sections) === 0){
			$titleNode = $html->find('title', 0);

			return array(
				array(
					$content,
					is_null($titleNode) ? '' : $titleNode->innertext,
					$content
				)
			);
		}

		return $sections;
	}

	/**
	 * Returns the anchor for a given section
	 *
	 * Looks at the first separator element of the section: if it has an ID the
	 * source URI with that ID as fragment is returned, if it has a href the
	 * href is returned. The plain source URI is returned otherwise.
	 */
	private function findSectionAnchor($section){
		$source = $this->getInput('source');
		$separator = $this->getInput('separator');

		$html = str_get_html($section);
		if(!$html){ // Section could not be parsed
			return $source;
		}

		// For IDs
		$anchor = $html->find($separator . '[id=]', 0);
		if(!is_null($anchor)){
			return $source . '#' . $anchor->id;
		}

		// For actual anchors
		$anchor = $html->find($separator . '[href=]', 0);
		if(!is_null($anchor)){
			return $anchor->href;
		}

		// Nothing found
		return $source;
	}

	/**
	 * Returns the author
	 *
	 * The author is the part in front of the first '@' of the title attribute
	 * of the first element with a title inside the page info element. Returns
	 * null if either element is missing.
	 *
	 * Notice: Some pages don't provide author information
	 */
	private function findAuthor($html){
		/* Example:
		 * <p id="pageinfo" class="info" dir="ltr" lang="en">MoinMoin: LocalSpellingWords
		 * (last edited 2017-02-16 15:36:31 by <span title="??? @ hosted-by.leaseweb.com
		 * [178.162.199.143]">hosted-by</span>)</p>
		*/
		$pageinfo = $html->find('[id="pageinfo"]', 0);
		if(is_null($pageinfo)){
			return null;
		}

		$author = $pageinfo->find('[title=]', 0);
		if(is_null($author)){
			return null;
		}

		return trim(explode('@', $author->title)[0]);
	}

	/**
	 * Returns the time of last edit
	 *
	 * Returns the value of strtotime() for the date found in the page info
	 * element, or null if the element is missing, contains no date or the date
	 * cannot be parsed.
	 *
	 * Notice: Some pages don't provide this information
	 */
	private function findTimestamp($html){
		// See example of findAuthor()
		$pageinfo = $html->find('[id="pageinfo"]', 0);
		if(is_null($pageinfo)){
			return null;
		}

		// Captures the first run of digits, dashes, colons and whitespace
		// that follows the opening bracket
		$matches = array();
		$found = preg_match(
			'/.+?(?=\().+?(?=\d)([0-9\-\s\:]+)/m',
			$pageinfo->innertext,
			$matches
		);

		if($found !== 1){
			return null;
		}

		$timestamp = strtotime(trim($matches[1]));

		return $timestamp === false ? null : $timestamp;
	}

	/**
	 * Returns the original HTML with all anchors fixed (makes relative anchors
	 * absolute)
	 *
	 * $source is the URI the HTML was loaded from. The URI of the bridge is
	 * used if it is omitted.
	 */
	private function fixAnchors($html, $source = null){

		$source = $source ?: $this->getURI();
		$domain = $this->findDomain($source);

		foreach($html->find('a') as $anchor){
			// Skip anchors without href
			if(!isset($anchor->href)){
				continue;
			}

			$href = (string)$anchor->href;

			// Anything with a scheme (http:, https:, mailto:, ...) is already
			// absolute, no actions required
			if(preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href) === 1){
				continue;
			}

			// Protocol-relative URI: only the scheme of the source is missing
			if(substr($href, 0, 2) === '//'){
				$scheme = strstr($domain, '//', true);
				if($scheme !== false){
					$anchor->href = $scheme . $href;
				}
				continue;
			}

			switch(substr($href, 0, 1)){
				case 'h': // never rewritten by the original check, kept as is
					break;
				case '/': // some relative path
					$anchor->href = $domain . $href;
					break;
				case '#': // it's an ID
				default: // probably something like ? or &
					$anchor->href = $source . $href;
			}
		}

		return $html;
	}

	/**
	 * Loads the full article of a given anchor (if the anchor is from the same
	 * wiki domain)
	 *
	 * Returns the loaded HTML with fixed anchors, or null if the anchor points
	 * to a different domain or the page cannot be loaded.
	 */
	private function followAnchor($anchor){
		$domain = $this->findDomain($this->getInput('source'));

		// Compare scheme + host of both URIs instead of searching the domain
		// somewhere inside the anchor
		if($domain === ''
		|| strcasecmp($this->findDomain($anchor), $domain) !== 0){
			return null;
		}

		$html = getSimpleHTMLDOMCached($anchor);
		if(!$html){ // Cannot load article
			return null;
		}

		return $this->fixAnchors($html, $anchor);
	}

	/**
	 * Finds the domain for a given URI
	 *
	 * Returns scheme and host (e.g. "https://moinmo.in") of the first http(s)
	 * URI found in $uri, or an empty string if there is none. The host does
	 * not need to be followed by a path.
	 */
	private function findDomain($uri){
		$matches = array();

		if(preg_match('/(https?:\/\/[^\/?#]+)/', (string)$uri, $matches) === 1){
			return $matches[1];
		}

		return '';
	}

	/**
	 * Removes every part of $string that starts with $start and ends with the
	 * next $end (both included). If $start has no matching $end, everything
	 * from $start to the end of the string is removed.
	 *
	 * This function is based on CNETBridge
	 */
	private function stripWithDelimiters($string, $start, $end){
		while(($from = strpos($string, $start)) !== false){
			$to = strpos($string, $end, $from + strlen($start));

			if($to === false){ // Unterminated section
				return substr($string, 0, $from);
			}

			$string = substr($string, 0, $from) . substr($string, $to + strlen($end));
		}

		return $string;
	}

	/**
	 * Returns the given HTML without <script/> elements
	 *
	 * This function is based on CNETBridge
	 */
	private function cleanArticle($article_html){
		return $this->stripWithDelimiters($article_html, '<script', '</script>');
	}
}