This commit is contained in:
logmanoriginal 2017-02-18 10:16:05 +01:00
commit 790bd17d41

327
bridges/MoinMoinBridge.php Normal file
View file

@ -0,0 +1,327 @@
<?php
/**
 * Bridge that turns a single page of a MoinMoin (compatible) wiki into a feed.
 *
 * The page is loaded once, split into sections at a configurable separator
 * element (h1, h2, h3, li or a) and every section becomes one feed item.
 * Optionally the anchor of a section is followed and the linked page is used
 * as item content instead.
 */
class MoinMoinBridge extends BridgeAbstract {

    const MAINTAINER = 'logmanoriginal';
    const NAME = 'MoinMoin Bridge';
    const URI = 'https://moinmo.in';
    const DESCRIPTION = 'Generates feeds for pages of a MoinMoin (compatible) wiki';
    const PARAMETERS = array(
        array(
            'source' => array(
                'name' => 'Source',
                'type' => 'text',
                'required' => true,
                'title' => 'Insert wiki page URI (e.g.: https://moinmo.in/MoinMoin)',
                'exampleValue' => 'https://moinmo.in/MoinMoin'
            ),
            'separator' => array(
                'name' => 'Separator',
                'type' => 'list',
                'required' => true,
                'title' => 'Defines the separator for splitting content into feeds',
                'defaultValue' => 'h2',
                'values' => array(
                    'Header (h1)' => 'h1',
                    'Header (h2)' => 'h2',
                    'Header (h3)' => 'h3',
                    'List element (li)' => 'li',
                    'Anchor (a)' => 'a'
                )
            ),
            'limit' => array(
                'name' => 'Limit',
                'type' => 'number',
                'required' => false,
                'title' => 'Number of items to return (from top)',
                'defaultValue' => -1
            ),
            'content' => array(
                'name' => 'Content',
                'type' => 'list',
                'required' => false,
                'title' => 'Defines how feed contents are built',
                'defaultValue' => 'separator',
                'values' => array(
                    'By separator' => 'separator',
                    'Follow link (only for anchor)' => 'follow',
                    'None' => 'none'
                )
            )
        )
    );

    /** Feed title, built from the <title/> of the source page in collectData() */
    private $title = '';

    /**
     * Loads the source page, splits it into sections and adds one item per
     * section to $this->items.
     *
     * Items whose title is empty after stripping tags are skipped. A positive
     * 'limit' stops collecting once that many items were added.
     */
    public function collectData(){
        /* MoinMoin uses a rather unpleasant representation of HTML. Instead of
         * using tags like <article/>, <navigation/>, <header/>, etc... it uses
         * <div/>, <span/> and <p/>. Also each line is literally identified via
         * IDs. The only way to distinguish content is via headers, though not
         * in all cases.
         *
         * Example (indented for the sake of readability):
         * ...
         * <span class="anchor" id="line-1"></span>
         * <span class="anchor" id="line-2"></span>
         * <span class="anchor" id="line-3"></span>
         * <span class="anchor" id="line-4"></span>
         * <span class="anchor" id="line-5"></span>
         * <span class="anchor" id="line-6"></span>
         * <span class="anchor" id="line-7"></span>
         * <span class="anchor" id="line-8"></span>
         * <span class="anchor" id="line-9"></span>
         * <p class="line867">MoinMoin is a Wiki software implemented in
         * <a class="interwiki" href="/Python" title="MoinMoin">Python</a>
         * and distributed as Free Software under
         * <a class="interwiki" href="/GPL" title="MoinMoin">GNU GPL license</a>.
         * ...
         */
        $source = $this->getInput('source');

        $html = getSimpleHTMLDOM($source)
            or returnServerError('Could not load ' . $source);

        // Some anchors link to local sites or local IDs (both don't work well
        // in feeds)
        $html = $this->fixAnchors($html);

        $pageTitle = $this->findPageTitle($html);
        $this->title = ($pageTitle !== '' ? $pageTitle . ' | ' : '') . self::NAME;

        // Author and timestamp of the source page are the defaults for every
        // item; they are replaced per item when a followed page provides them.
        $pageAuthor = $this->findAuthor($html);
        $pageTimestamp = $this->findTimestamp($html);

        $mode = $this->getInput('content');
        $limit = (int)$this->getInput('limit');

        // Anchors can only be followed if the sections are split at anchors;
        // any other combination falls back to the section content.
        $follow = $mode === 'follow' && $this->getInput('separator') === 'a';

        foreach($this->splitSections($html) as $section){
            // Skip items with empty title. Checked first so that no linked
            // page is downloaded for an item that is dropped anyway.
            $title = strip_tags($section[1]);
            if(trim($title) === ''){
                continue;
            }

            $item = array();
            $item['uri'] = $this->findSectionAnchor($section[0]);
            $item['title'] = $title;

            $author = $pageAuthor;
            $timestamp = $pageTimestamp;

            if($mode !== 'none'){
                $article = $follow ? $this->followAnchor($item['uri']) : null;
                $body = is_null($article) ? null : $article->find('div#page', 0);

                if(!is_null($body)){
                    // Return only actual content of the followed page
                    $item['content'] = $body->innertext;

                    // Each page could have its own author and timestamp
                    $author = $this->findAuthor($article);
                    $timestamp = $this->findTimestamp($article);
                } else {
                    // Not following, or the linked page could not be used:
                    // use contents from the current page
                    $item['content'] = $this->cleanArticle($section[2]);
                }
            }

            if(!is_null($author)){
                $item['author'] = $author;
            }

            if(!is_null($timestamp)){
                $item['timestamp'] = $timestamp;
            }

            $this->items[] = $item;

            if($limit > 0 && count($this->items) >= $limit){
                break;
            }
        }
    }

    /**
     * Returns the feed title collected from the source page, or the parent's
     * name while no page has been loaded yet.
     */
    public function getName(){
        return $this->title ?: parent::getName();
    }

    /**
     * Returns the configured source page, or the parent's URI if none is set.
     */
    public function getURI(){
        return $this->getInput('source') ?: parent::getURI();
    }

    /**
     * Returns the inner text of the first <title/> element, or an empty
     * string if the document has none.
     */
    private function findPageTitle($html){
        $title = $html->find('title', 0);

        return is_null($title) ? '' : $title->innertext;
    }

    /**
     * Splits the html into sections.
     *
     * Returns an array with one element per section. Each element consists of:
     * [0] The entire section
     * [1] The section title
     * [2] The section content
     *
     * If the page contains no separator element at all, the whole page is
     * returned as one section titled with the page title.
     */
    private function splitSections($html){
        $page = $html->find('div#page', 0);

        if(is_null($page) || !$page->innertext){
            returnServerError('Unable to find <div id="page"/>!');
        }

        $content = $page->innertext;

        // The separator ends up inside a pattern, never trust it verbatim
        $tag = preg_quote($this->getInput('separator'), '/');

        // \b keeps <a from matching <abbr>/<article> and <li from matching
        // <link>; [^>]* also accepts tags without attributes. Title and
        // content may be empty, so that empty elements form their own
        // (later skipped) section instead of swallowing the following one.
        $regex = '/'
            . '<' . $tag . '\b[^>]*>'   // opening tag
            . '(.*?)'                   // [1] title
            . '<\/' . $tag . '>'        // closing tag
            . '(.*?)'                   // [2] content up to the next section
            . '(?=<' . $tag . '\b|<div\sid="pagebottom")'
            . '/s';

        $sections = array();
        $count = preg_match_all($regex, $content, $sections, PREG_SET_ORDER);

        // Some pages don't use headers (or matching failed), return page as
        // one feed
        if(!$count){
            return array(
                array(
                    $content,
                    $this->findPageTitle($html),
                    $content
                )
            );
        }

        return $sections;
    }

    /**
     * Returns the anchor for a given section
     *
     * Order of preference: the ID of the separator element (appended to the
     * source URI as fragment), the href of the separator element, the source
     * URI itself.
     */
    private function findSectionAnchor($section){
        $source = $this->getInput('source');
        $separator = $this->getInput('separator');

        $html = str_get_html($section);

        // The section could not be parsed
        if(!$html){
            return $source;
        }

        // For IDs
        $anchor = $html->find($separator . '[id=]', 0);
        if(!is_null($anchor)){
            return $source . '#' . $anchor->id;
        }

        // For actual anchors
        $anchor = $html->find($separator . '[href=]', 0);
        if(!is_null($anchor)){
            return $anchor->href;
        }

        // Nothing found
        return $source;
    }

    /**
     * Returns the author
     *
     * Notice: Some pages don't provide author information, null is returned
     * in that case.
     */
    private function findAuthor($html){
        /* Example:
         * <p id="pageinfo" class="info" dir="ltr" lang="en">MoinMoin: LocalSpellingWords
         * (last edited 2017-02-16 15:36:31 by <span title="??? @ hosted-by.leaseweb.com
         * [178.162.199.143]">hosted-by</span>)</p>
         */
        $pageinfo = $html->find('[id="pageinfo"]', 0);
        if(is_null($pageinfo)){
            return null;
        }

        $author = $pageinfo->find('[title=]', 0);
        if(is_null($author)){
            return null;
        }

        // The title reads "<name> @ <host> [<ip>]", only the name is used
        return trim(explode('@', $author->title)[0]);
    }

    /**
     * Returns the time of last edit as Unix timestamp
     *
     * Notice: Some pages don't provide this information, null is returned if
     * it is missing or cannot be parsed.
     */
    private function findTimestamp($html){
        // See example of findAuthor()
        $pageinfo = $html->find('[id="pageinfo"]', 0);
        if(is_null($pageinfo)){
            return null;
        }

        // First run of digits, dashes, colons and whitespace after the first
        // opening parenthesis, i.e. "2017-02-16 15:36:31" in the example
        $matches = array();
        if(preg_match('/\(.*?(\d[0-9\-\s:]*)/s', $pageinfo->innertext, $matches) !== 1){
            return null;
        }

        $timestamp = strtotime(trim($matches[1]));

        return $timestamp === false ? null : $timestamp;
    }

    /**
     * Returns the original HTML with all anchors fixed (makes relative anchors
     * absolute)
     *
     * $source is the URI the document was loaded from; it defaults to the URI
     * of this bridge.
     */
    private function fixAnchors($html, $source = null){
        $source = $source ?: $this->getURI();
        $domain = $this->findDomain($source);

        // IDs must be appended to the page itself, not to another fragment
        $fragmentAt = strpos($source, '#');
        $page = $fragmentAt === false ? $source : substr($source, 0, $fragmentAt);

        foreach($html->find('a') as $anchor){
            // Anchors without href (e.g. <a id="top"/>) are left alone
            if(!isset($anchor->href)){
                continue;
            }

            $href = (string)$anchor->href;

            // Anything with a scheme (http:, https:, mailto:, ...) is already
            // absolute, no actions required
            if(preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href) === 1){
                continue;
            }

            // Protocol-relative, only the scheme of the source is missing
            if(substr($href, 0, 2) === '//'){
                if(!is_null($domain)){
                    $anchor->href = substr($domain, 0, strpos($domain, ':') + 1) . $href;
                }
                continue;
            }

            // Some path relative to the domain
            if(substr($href, 0, 1) === '/'){
                if(!is_null($domain)){
                    $anchor->href = $domain . $href;
                }
                continue;
            }

            // It's an ID or probably something like ? or &
            $anchor->href = $page . $href;
        }

        return $html;
    }

    /**
     * Loads the full article of a given anchor (if the anchor is from the same
     * wiki domain)
     *
     * Returns null if the anchor points to another domain or the article
     * cannot be loaded.
     */
    private function followAnchor($anchor){
        $wikiDomain = $this->findDomain($this->getInput('source'));
        $anchorDomain = $this->findDomain($anchor);

        if(is_null($wikiDomain)
        || is_null($anchorDomain)
        || strcasecmp($wikiDomain, $anchorDomain) !== 0){
            return null;
        }

        $html = getSimpleHTMLDOMCached($anchor);
        if(!$html){ // Cannot load article
            return null;
        }

        return $this->fixAnchors($html, $anchor);
    }

    /**
     * Finds the domain for a given URI
     *
     * Returns scheme and host (e.g. "https://moinmo.in"), with or without a
     * path following in the URI, or null if the URI is not an absolute
     * http(s) URI.
     */
    private function findDomain($uri){
        $matches = array();

        if(preg_match('/^(https?:\/\/[^\/?#]+)/i', trim((string)$uri), $matches) === 1){
            return $matches[1];
        }

        return null;
    }

    /**
     * Removes every part of $string that starts with $start and ends with the
     * next $end (both delimiters included).
     *
     * A part that is opened but never closed is removed up to the end of the
     * string.
     *
     * This function is based on CNETBridge
     */
    private function stripWithDelimiters($string, $start, $end){
        // An empty start delimiter would match forever
        if((string)$start === ''){
            return $string;
        }

        while(($from = strpos($string, $start)) !== false){
            $to = strpos($string, $end, $from);

            if($to === false){
                return (string)substr($string, 0, $from);
            }

            $string = substr($string, 0, $from) . substr($string, $to + strlen($end));
        }

        return $string;
    }

    /**
     * Removes <script/> elements from the article
     *
     * This function is based on CNETBridge
     */
    private function cleanArticle($article_html){
        return $this->stripWithDelimiters($article_html, '<script', '</script>');
    }
}