rss-bridge/bridges/MoinMoinBridge.php

<?php
/**
 * Bridge for MoinMoin (compatible) wikis.
 *
 * Loads a single wiki page, splits it into sections using a configurable HTML
 * tag (the "separator") and returns one feed item per section. When the
 * separator is an anchor, the linked page can optionally be loaded and used
 * as item content.
 */
class MoinMoinBridge extends BridgeAbstract {

	const MAINTAINER = 'logmanoriginal';
	const NAME = 'MoinMoin Bridge';
	const URI = 'https://moinmo.in';
	const DESCRIPTION = 'Generates feeds for pages of a MoinMoin (compatible) wiki';
	const PARAMETERS = array(
		array(
			'source' => array(
				'name' => 'Source',
				'type' => 'text',
				'required' => true,
				'title' => 'Insert wiki page URI (e.g.: https://moinmo.in/MoinMoin)',
				'exampleValue' => 'https://moinmo.in/MoinMoin'
			),
			'separator' => array(
				'name' => 'Separator',
				'type' => 'list',
				'required' => true,
				'title' => 'Defines the separtor for splitting content into feeds',
				'defaultValue' => 'h2',
				'values' => array(
					'Header (h1)' => 'h1',
					'Header (h2)' => 'h2',
					'Header (h3)' => 'h3',
					'List element (li)' => 'li',
					'Anchor (a)' => 'a'
				)
			),
			'limit' => array(
				'name' => 'Limit',
				'type' => 'number',
				'required' => false,
				'title' => 'Number of items to return (from top)',
				'defaultValue' => -1
			),
			'content' => array(
				'name' => 'Content',
				'type' => 'list',
				'required' => false,
				'title' => 'Defines how feed contents are build',
				'defaultValue' => 'separator',
				'values' => array(
					'By separator' => 'separator',
					'Follow link (only for anchor)' => 'follow',
					'None' => 'none'
				)
			)
		)
	);

	/** Feed title, set by collectData() once the page title is known */
	private $title = '';

	/**
	 * Loads the source page and fills $this->items with one item per section.
	 *
	 * Items whose title is empty after stripping tags are skipped. Collection
	 * stops as soon as 'limit' items were added (only if 'limit' is positive).
	 */
	public function collectData(){
		/* MoinMoin uses a rather unpleasant representation of HTML. Instead of
		 * using tags like <article/>, <navigation/>, <header/>, etc... it uses
		 * <div/>, <span/> and <p/>. Also each line is literally identified via
		 * IDs. The only way to distinguish content is via headers, though not
		 * in all cases.
		 *
		 * Example (indented for the sake of readability):
		 * ...
		 * <span class="anchor" id="line-1"></span>
		 * <span class="anchor" id="line-2"></span>
		 * <span class="anchor" id="line-3"></span>
		 * <span class="anchor" id="line-4"></span>
		 * <span class="anchor" id="line-5"></span>
		 * <span class="anchor" id="line-6"></span>
		 * <span class="anchor" id="line-7"></span>
		 * <span class="anchor" id="line-8"></span>
		 * <span class="anchor" id="line-9"></span>
		 *   <p class="line867">MoinMoin is a Wiki software implemented in
		 *     <a class="interwiki" href="/Python" title="MoinMoin">Python</a>
		 *   and distributed as Free Software under
		 *     <a class="interwiki" href="/GPL" title="MoinMoin">GNU GPL license</a>.
		 * ...
		 */
		$html = getSimpleHTMLDOM($this->getInput('source'))
			or returnServerError('Could not load ' . $this->getInput('source'));

		// Some anchors link to local sites or local IDs (both don't work well
		// in feeds)
		$html = $this->fixAnchors($html);

		// Without a <title/> the feed name falls back to the bridge name
		$pageTitle = $html->find('title', 0);
		if(!is_null($pageTitle)){
			$this->title = $pageTitle->innertext . ' | ' . self::NAME;
		}

		// Author and timestamp of the source page. They are the defaults for
		// every item and are only replaced (per item) if an anchor is followed.
		$pageAuthor = $this->findAuthor($html);
		$pageTimestamp = $this->findTimestamp($html);

		$sections = $this->splitSections($html);

		$limit = (int)$this->getInput('limit');
		$contentMode = $this->getInput('content');

		// We can only follow anchors (section content is used otherwise)
		$followAnchors = $contentMode === 'follow'
			&& $this->getInput('separator') === 'a';

		foreach($sections as $section){
			$title = strip_tags($section[1]);

			// Skip items with empty title. This is checked first so no linked
			// page is requested for an item that is dropped anyway.
			if(trim($title) === ''){
				continue;
			}

			$item = array();
			$item['uri'] = $this->findSectionAnchor($section[0]);

			$author = $pageAuthor;
			$timestamp = $pageTimestamp;

			if($contentMode !== 'none'){
				$linkedPage = null;
				$linkedContent = null;

				if($followAnchors){
					$linkedPage = $this->followAnchor($item['uri']);
					if(!is_null($linkedPage)){
						$linkedContent = $linkedPage->find('div#page', 0);
					}
				}

				if(!is_null($linkedContent)){
					// Return only actual content
					$item['content'] = $linkedContent->innertext;

					// Each page could have its own author and timestamp
					$author = $this->findAuthor($linkedPage);
					$timestamp = $this->findTimestamp($linkedPage);
				} else {
					// Use contents from the current page (also if the linked
					// page is on another domain or could not be loaded)
					$item['content'] = $this->cleanArticle($section[2]);
				}
			}

			if(!is_null($author)) $item['author'] = $author;
			if(!is_null($timestamp)) $item['timestamp'] = $timestamp;
			$item['title'] = $title;

			$this->items[] = $item;

			if($limit > 0 && count($this->items) >= $limit){
				break;
			}
		}
	}

	/**
	 * Returns the feed name (page title plus bridge name) or the default
	 * bridge name if no data was collected yet.
	 */
	public function getName(){
		return $this->title ?: parent::getName();
	}

	/**
	 * Returns the source URI provided by the user or the default bridge URI
	 * if no source is set.
	 */
	public function getURI(){
		return $this->getInput('source') ?: parent::getURI();
	}

	/**
	 * Splits the html into sections.
	 *
	 * A section starts at an opening separator tag and ends right before the
	 * next separator tag or before <div id="pagebottom">.
	 *
	 * Returns an array with one element per section. Each element consists of:
	 * [0] The entire section
	 * [1] The section title
	 * [2] The section content
	 *
	 * If no section is found the entire page is returned as one section with
	 * the page title as section title.
	 */
	private function splitSections($html){
		$page = $html->find('div#page', 0);
		$content = is_null($page) ? '' : $page->innertext;

		if(!$content){
			returnServerError('Unable to find <div id="page"/>!');
		}

		$sections = array();

		// The separator becomes part of the pattern, so it must be quoted
		$separator = preg_quote($this->getInput('separator'), '/');

		// \b makes sure 'a' doesn't match <abbr> and 'li' doesn't match <link>
		// [^>]* also accepts tags without attributes (like <h2>)
		$regex = '/'
			. '<' . $separator . '\b[^>]*>'
			. '(.+?)'
			. '<\/' . $separator . '>'
			. '(.+?)'
			. '(?=<' . $separator . '\b|<div\sid="pagebottom")'
			. '/s';

		preg_match_all(
			$regex,
			$content,
			$sections,
			PREG_SET_ORDER
		);

		// Some pages don't use headers, return page as one feed
		if(!is_array($sections) || count($sections) === 0){
			$pageTitle = $html->find('title', 0);

			return array(
				array(
					$content,
					is_null($pageTitle) ? '' : $pageTitle->innertext,
					$content
				)
			);
		}

		return $sections;
	}

	/**
	 * Returns the anchor for a given section
	 *
	 * The first separator element with an ID results in "<source>#<id>", the
	 * first separator element with a href results in that href. The source
	 * URI is returned if neither is found or the section cannot be parsed.
	 */
	private function findSectionAnchor($section){
		$source = $this->getInput('source');
		$separator = $this->getInput('separator');

		// str_get_html returns false for empty or oversized input
		$html = str_get_html($section);
		if(!$html){
			return $source;
		}

		$uri = $source; // Nothing found

		// For IDs
		$anchor = $html->find($separator . '[id=]', 0);
		if(!is_null($anchor)){
			$uri = $source . '#' . $anchor->id;
		} else {
			// For actual anchors
			$anchor = $html->find($separator . '[href=]', 0);
			if(!is_null($anchor)){
				$uri = $anchor->href;
			}
		}

		// One DOM is created per section, release it right away
		$html->clear();

		return $uri;
	}

	/**
	 * Returns the author or null if the page has no author information
	 *
	 * The author is the part before '@' in the title attribute of the first
	 * element with a title inside the element with ID "pageinfo".
	 *
	 * Notice: Some pages don't provide author information
	 */
	private function findAuthor($html){
		/* Example:
		 * <p id="pageinfo" class="info" dir="ltr" lang="en">MoinMoin: LocalSpellingWords
		 * (last edited 2017-02-16 15:36:31 by <span title="??? @ hosted-by.leaseweb.com
		 * [178.162.199.143]">hosted-by</span>)</p>
		*/
		$pageinfo = $html->find('[id="pageinfo"]', 0);
		if(is_null($pageinfo)){
			return null;
		}

		$author = $pageinfo->find('[title=]', 0);
		if(is_null($author)){
			return null;
		}

		$name = trim(explode('@', $author->title)[0]);

		return $name === '' ? null : $name;
	}

	/**
	 * Returns the time of last edit (Unix timestamp) or null if the page has
	 * no such information or it cannot be parsed
	 *
	 * Notice: Some pages don't provide this information
	 */
	private function findTimestamp($html){
		// See example of findAuthor()
		$pageinfo = $html->find('[id="pageinfo"]', 0);
		if(is_null($pageinfo)){
			return null;
		}

		// Captures the first run of digits, dashes, colons and spaces after
		// the opening parenthesis ("2017-02-16 15:36:31 " in the example)
		$matches = array();
		if(preg_match('/\(.+?(?=\d)([0-9\-\s\:]+)/', $pageinfo->innertext, $matches) !== 1){
			return null;
		}

		$timestamp = strtotime(trim($matches[1]));

		// The caller only checks for null, never return false
		return $timestamp === false ? null : $timestamp;
	}

	/**
	 * Returns the original HTML with all anchors fixed (makes relative anchors
	 * absolute)
	 *
	 * $source is the URI the HTML was loaded from (defaults to getURI()).
	 * Anchors without href and anchors with a scheme (http:, https:, mailto:,
	 * etc...) are not changed.
	 */
	private function fixAnchors($html, $source = null){

		$source = $source ?: $this->getURI();

		// Both values are the same for all anchors
		$domain = $this->findDomain($source);
		$scheme = parse_url($source, PHP_URL_SCHEME) ?: 'https';

		foreach($html->find('a') as $anchor){
			if(!isset($anchor->href) || !is_string($anchor->href)){
				continue; // skip empty ones
			}

			$href = $anchor->href;

			if(preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href)){
				continue; // already absolute, no actions required
			}

			if(substr($href, 0, 2) === '//'){ // only the scheme is missing
				$anchor->href = $scheme . ':' . $href;
			} elseif(substr($href, 0, 1) === '/'){ // some relative path
				$anchor->href = $domain . $href;
			} else { // it's an ID or something like ? or &
				$anchor->href = $source . $href;
			}
		}

		return $html;
	}

	/**
	 * Loads the full article of a given anchor (if the anchor is from the same
	 * wiki domain)
	 *
	 * Returns null if the anchor points to another domain or the article
	 * cannot be loaded.
	 */
	private function followAnchor($anchor){
		$domain = $this->findDomain($this->getInput('source'));

		if($domain === '' || strcasecmp($this->findDomain($anchor), $domain) !== 0){
			return null;
		}

		$html = getSimpleHTMLDOMCached($anchor);
		if(!$html){ // Cannot load article
			return null;
		}

		return $this->fixAnchors($html, $anchor);
	}

	/**
	 * Finds the domain for a given URI
	 *
	 * Returns scheme and host (e.g. "https://moinmo.in") or an empty string
	 * if the URI doesn't start with http:// or https://
	 */
	private function findDomain($uri){
		$matches = array();

		// The host ends at the first '/', '?' or '#' (or at the end of the URI)
		if(preg_match('/^(https?:\/\/[^\/?#]+)/i', trim($uri), $matches) === 1){
			return $matches[1];
		}

		return '';
	}

	/**
	 * Removes everything from $start to $end (inclusive) from $string
	 *
	 * If $start has no matching $end, everything beginning at $start is
	 * removed.
	 *
	 * This function is based on CNETBridge
	 */
	private function stripWithDelimiters($string, $start, $end){
		if($start === '' || $end === ''){
			return $string;
		}

		while(($from = strpos($string, $start)) !== false){
			$to = strpos($string, $end, $from);

			if($to === false){ // unterminated, drop the remainder
				return (string)substr($string, 0, $from);
			}

			$string = substr($string, 0, $from) . substr($string, $to + strlen($end));
		}

		return $string;
	}

	/**
	 * Returns the given HTML without <script/> elements
	 *
	 * This function is based on CNETBridge
	 */
	private function cleanArticle($article_html){
		$article_html = $this->stripWithDelimiters($article_html, '<script', '</script>');
		return $article_html;
	}
}
[MoinMoinBridge] Add new bridge This bridge returns feeds for each section (via given separator) from a given MoinMoin compatible wiki. The separator can be any tag of the following: - h1 - h2 - h3 - li - a The number of items returned can be specified. For anchor tags (a) the bridge can optionally follow the anchor to the linked page and return it as content. 2017-02-18 00:22:35 +01:00			`<?php`
			`class MoinMoinBridge extends BridgeAbstract {`

			`const MAINTAINER = 'logmanoriginal';`
			`const NAME = 'MoinMoin Bridge';`
			`const URI = 'https://moinmo.in';`
			`const DESCRIPTION = 'Generates feeds for pages of a MoinMoin (compatible) wiki';`
			`const PARAMETERS = array(`
			`array(`
			`'source' => array(`
			`'name' => 'Source',`
			`'type' => 'text',`
			`'required' => true,`
			`'title' => 'Insert wiki page URI (e.g.: https://moinmo.in/MoinMoin)',`
			`'exampleValue' => 'https://moinmo.in/MoinMoin'`
			`),`
			`'separator' => array(`
			`'name' => 'Separator',`
			`'type' => 'list',`
			`'requied' => true,`
			`'title' => 'Defines the separtor for splitting content into feeds',`
			`'defaultValue' => 'h2',`
			`'values' => array(`
			`'Header (h1)' => 'h1',`
			`'Header (h2)' => 'h2',`
			`'Header (h3)' => 'h3',`
			`'List element (li)' => 'li',`
			`'Anchor (a)' => 'a'`
			`)`
			`),`
			`'limit' => array(`
			`'name' => 'Limit',`
			`'type' => 'number',`
			`'required' => false,`
			`'title' => 'Number of items to return (from top)',`
			`'defaultValue' => -1`
			`),`
			`'content' => array(`
			`'name' => 'Content',`
			`'type' => 'list',`
			`'required' => false,`
			`'title' => 'Defines how feed contents are build',`
			`'defaultValue' => 'separator',`
			`'values' => array(`
			`'By separator' => 'separator',`
			`'Follow link (only for anchor)' => 'follow',`
			`'None' => 'none'`
			`)`
			`)`
			`)`
			`);`

			`private $title = '';`

			`public function collectData(){`
			`/* MoinMoin uses a rather unpleasent representation of HTML. Instead of`
			`* using tags like <article/>, <navigation/>, <header/>, etc... it uses`
			`* <div/>, <span/> and <p/>. Also each line is literaly identified via`
			`* IDs. The only way to distinguish content is via headers, though not`
			`* in all cases.`
			`*`
			`* Example (indented for the sake of readability):`
			`* ...`
			`* <span class="anchor" id="line-1"></span>`
			`* <span class="anchor" id="line-2"></span>`
			`* <span class="anchor" id="line-3"></span>`
			`* <span class="anchor" id="line-4"></span>`
			`* <span class="anchor" id="line-5"></span>`
			`* <span class="anchor" id="line-6"></span>`
			`* <span class="anchor" id="line-7"></span>`
			`* <span class="anchor" id="line-8"></span>`
			`* <span class="anchor" id="line-9"></span>`
			`* <p class="line867">MoinMoin is a Wiki software implemented in`
			`* <a class="interwiki" href="/Python" title="MoinMoin">Python</a>`
			`* and distributed as Free Software under`
			`* <a class="interwiki" href="/GPL" title="MoinMoin">GNU GPL license</a>.`
			`* ...`
			`*/`
			`$html = getSimpleHTMLDOM($this->getInput('source'))`
			`or returnServerError('Could not load ' . $this->getInput('source'));`

			`// Some anchors link to local sites or local IDs (both don't work well`
			`// in feeds)`
			`$html = $this->fixAnchors($html);`

			`$this->title = $html->find('title', 0)->innertext . ' \| ' . self::NAME;`

			`// Here we focus on simple author and timestamp information from the given`
			`// page. Later we update this information in case the anchor is followed.`
			`$author = $this->findAuthor($html);`
			`$timestamp = $this->findTimestamp($html);`

			`$sections = $this->splitSections($html);`

			`foreach($sections as $section){`
			`$item = array();`

			`$item['uri'] = $this->findSectionAnchor($section[0]);`

			`switch($this->getInput('content')){`
			`case 'none': // Do not return any content`
			`break;`
			`case 'follow': // Follow the anchor`
			`// We can only follow anchors (use default otherwise)`
			`if($this->getInput('separator') === 'a'){`
			`$content = $this->followAnchor($item['uri']);`

			`// Return only actual content`
			`$item['content'] = $content->find('div#page', 0)->innertext;`

			`// Each page could have its own author and timestamp`
			`$author = $this->findAuthor($content);`
			`$timestamp = $this->findTimestamp($content);`

			`break;`
			`}`
			`case 'separator':`
			`default: // Use contents from the current page`
			`$item['content'] = $this->cleanArticle($section[2]);`
			`}`

			`if(!is_null($author)) $item['author'] = $author;`
			`if(!is_null($timestamp)) $item['timestamp'] = $timestamp;`
			`$item['title'] = strip_tags($section[1]);`

			`// Skip items with empty title`
			`if(empty(trim($item['title']))){`
			`continue;`
			`}`

			`$this->items[] = $item;`

			`if($this->getInput('limit') > 0`
			`&& count($this->items) >= $this->getInput('limit')){`
			`break;`
			`}`
			`}`
			`}`

			`public function getName(){`
			`return $this->title ?: parent::getName();`
			`}`

			`public function getURI(){`
			`return $this->getInput('source') ?: parent::getURI();`
			`}`

			`/**`
			`* Splits the html into sections.`
			`*`
			`* Returns an array with one element per section. Each element consists of:`
			`* [0] The entire section`
			`* [1] The section title`
			`* [2] The section content`
			`*/`
			`private function splitSections($html){`
			`$content = $html->find('div#page', 0)->innertext`
			`or returnServerError('Unable to find <div id="page"/>!');`

			`$sections = array();`

			`$regex = implode(`
			`'',`
			`array(`
			`"\<{$this->getInput('separator')}.+?(?=\>)\>",`
			`"(.+?)(?=\<\/{$this->getInput('separator')}\>)",`
			`"\<\/{$this->getInput('separator')}\>",`
			`"(.+?)((?=\<{$this->getInput('separator')})\|(?=\<div\sid=\"pagebottom\")){1}"`
			`)`
			`);`

			`preg_match_all(`
			`'/' . $regex . '/m',`
			`$content,`
			`$sections,`
			`PREG_SET_ORDER`
			`);`

			`// Some pages don't use headers, return page as one feed`
			`if(count($sections) === 0){`
			`return array(`
			`array(`
			`$content,`
			`$html->find('title', 0)->innertext,`
			`$content`
			`)`
			`);`
			`}`

			`return $sections;`
			`}`

			`/**`
			`* Returns the anchor for a given section`
			`*/`
			`private function findSectionAnchor($section){`
			`$html = str_get_html($section);`

			`// For IDs`
			`$anchor = $html->find($this->getInput('separator') . '[id=]', 0);`
			`if(!is_null($anchor)){`
			`return $this->getInput('source') . '#' . $anchor->id;`
			`}`

			`// For actual anchors`
			`$anchor = $html->find($this->getInput('separator') . '[href=]', 0);`
			`if(!is_null($anchor)){`
			`return $anchor->href;`
			`}`

			`// Nothing found`
			`return $this->getInput('source');`
			`}`

			`/**`
			`* Returns the author`
			`*`
			`* Notice: Some pages don't provide author information`
			`*/`
			`private function findAuthor($html){`
			`/* Example:`
			`* <p id="pageinfo" class="info" dir="ltr" lang="en">MoinMoin: LocalSpellingWords`
			`* (last edited 2017-02-16 15:36:31 by <span title="??? @ hosted-by.leaseweb.com`
			`* [178.162.199.143]">hosted-by</span>)</p>`
			`*/`
			`$pageinfo = $html->find('[id="pageinfo"]', 0);`

			`if(is_null($pageinfo)){`
			`return null;`
			`} else {`
			`$author = $pageinfo->find('[title=]', 0);`
			`if(is_null($author)){`
			`return null;`
			`} else {`
			`return trim(explode('@', $author->title)[0]);`
			`}`
			`}`
			`}`

			`/**`
			`* Returns the time of last edit`
			`*`
			`* Notice: Some pages don't provide this information`
			`*/`
			`private function findTimestamp($html){`
			`// See example of findAuthor()`
			`$pageinfo = $html->find('[id="pageinfo"]', 0);`

			`if(is_null($pageinfo)){`
			`return null;`
			`} else {`
			`$timestamp = $pageinfo->innertext;`
			`$matches = array();`
			`preg_match('/.+?(?=\().+?(?=\d)([0-9\-\s\:]+)/m', $pageinfo, $matches);`
			`return strtotime($matches[1]);`
			`}`
			`}`

			`/**`
			`* Returns the original HTML with all anchors fixed (makes relative anchors`
			`* absolute)`
			`*/`
			`private function fixAnchors($html, $source = null){`

			`$source = $source ?: $this->getURI();`

			`foreach($html->find('a') as $anchor){`
			`switch(substr($anchor->href, 0, 1)){`
			`case 'h': // http or https, no actions required`
			`break;`
			`case '/': // some relative path`
			`$anchor->href = $this->findDomain($source) . $anchor->href;`
			`break;`
			`case '#': // it's an ID`
			`default: // probably something like ? or &, skip empty ones`
			`if(!isset($anchor->href))`
			`break;`
			`$anchor->href = $source . $anchor->href;`
			`}`
			`}`

			`return $html;`
			`}`

			`/**`
			`* Loads the full article of a given anchor (if the anchor is from the same`
			`* wiki domain)`
			`*/`
			`private function followAnchor($anchor){`
			`if(strrpos($anchor, $this->findDomain($this->getInput('source')) === false)){`
			`return null;`
			`}`

			`$html = getSimpleHTMLDOMCached($anchor);`
			`if(!$html){ // Cannot load article`
			`return null;`
			`}`

			`return $this->fixAnchors($html, $anchor);`
			`}`

			`/**`
			`* Finds the domain for a given URI`
			`*/`
			`private function findDomain($uri){`
			`$matches = array();`
			`preg_match('/(http[s]{0,1}:\/\/.+?(?=\/))/', $uri, $matches);`
			`return $matches[1];`
			`}`

			`/* This function is a copy from CNETBridge */`
			`private function stripWithDelimiters($string, $start, $end){`
			`while(strpos($string, $start) !== false){`
			`$section_to_remove = substr($string, strpos($string, $start));`
			`$section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end));`
			`$string = str_replace($section_to_remove, '', $string);`
			`}`

			`return $string;`
			`}`

			`/* This function is based on CNETBridge */`
			`private function cleanArticle($article_html){`
			`$article_html = $this->stripWithDelimiters($article_html, '<script', '</script>');`
			`return $article_html;`
			`}`
			`}`