forked from blallo/rss-bridge
Merge pull request #332 from LogMANOriginal/WordPressBridge
Improve WordPress bridge and remove LeMotDuJour + Raymond (test before merge!)
This commit is contained in:
commit
da1e32267b
3 changed files with 94 additions and 143 deletions
|
@ -1,55 +0,0 @@
|
|||
<?php
|
||||
/**
 * Bridge for lemotdujour.com.
 *
 * Reads the FeedBurner feed of the site and, for each of the newest feed
 * items, downloads the linked page to extract the article body.
 */
class LeMotDuJourBridge extends BridgeAbstract{

    /** Maximum number of feed items turned into bridge items. */
    const MAX_ITEMS = 10;

    /**
     * Fills in the descriptive metadata of this bridge.
     */
    public function loadMetadatas() {

        $this->maintainer = "qwertygc";
        $this->name = "LeMotDuJour Bridge";
        $this->uri = "http://www.lemotdujour.com/";
        $this->description = "Returns the newest articles.";
        $this->update = "2014-05-25";

    }

    /**
     * Removes the CDATA opening and closing markers from a string.
     *
     * Kept as a private method rather than a named function declared inside
     * collectData(): such a function becomes global on first call, so a second
     * call (or any other code declaring the same name) is a fatal redeclaration.
     *
     * @param string $string Raw feed text, possibly wrapped in CDATA markers
     * @return string The text without the markers
     */
    private function stripCDATA($string) {
        return str_replace(array('<![CDATA[', ']]>'), '', $string);
    }

    /**
     * Downloads an article page and returns the inner HTML of its content div.
     *
     * Must be a method: it needs $this to reach file_get_html().
     *
     * @param string $url Address of the article page
     * @return string Inner HTML of 'div.single-contenu', or '' when the page
     *                could not be loaded or has no such element
     */
    private function extractContent($url) {
        $page = $this->file_get_html($url);
        if(!$page) {
            return '';
        }

        $node = $page->find('div.single-contenu', 0);
        if(!$node) {
            return '';
        }

        return $node->innertext;
    }

    /**
     * Collects up to MAX_ITEMS items from the feed.
     *
     * @param array $param Request parameters (not used by this bridge)
     */
    public function collectData(array $param){

        $html = $this->file_get_html('http://feeds2.feedburner.com/lemotdujour/lemotdujour') or $this->returnError('Could not request LeMotDuJour.', 404);
        $limit = 0;

        foreach($html->find('item') as $element) {
            // Stop as soon as the limit is reached instead of walking the rest of the feed
            if($limit >= self::MAX_ITEMS) {
                break;
            }

            $item = new \Item();
            $item->title = $this->stripCDATA($element->find('title', 0)->innertext);
            $item->uri = $this->stripCDATA($element->find('guid', 0)->plaintext);
            $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext);
            $item->content = $this->extractContent($item->uri);
            $this->items[] = $item;
            $limit++;
        }

    }

    /**
     * @return string Display name of the bridge
     */
    public function getName(){
        return 'LeMotDuJour Bridge';
    }

    /**
     * @return string Address of the bridged site
     */
    public function getURI(){
        return 'http://lemotdujour.com/';
    }

    /**
     * @return int Cache lifetime in seconds
     */
    public function getCacheDuration(){
        return 3600*2; // 2 hours
    }
}
|
|
@ -1,53 +0,0 @@
|
|||
<?php
|
||||
/**
 * Bridge for raymond.cc.
 *
 * Reads the blog feed and, for each of the newest feed items, downloads the
 * linked post to extract a cleaned-up version of its body.
 */
class RaymondBridge extends BridgeAbstract{

    /** Maximum number of feed items turned into bridge items. */
    const MAX_ITEMS = 3;

    /**
     * Fills in the descriptive metadata of this bridge.
     */
    public function loadMetadatas() {

        $this->maintainer = "pit-fgfjiudghdf";
        $this->name = "Raymond";
        $this->uri = "http://www.raymond.cc";
        $this->description = "Returns the 3 newest posts from Raymond.cc (full text)";
        $this->update = "2014-05-26";

    }

    /**
     * Removes the CDATA opening and closing markers from a string.
     *
     * Kept as a private method rather than a named function declared inside
     * collectData(): such a function becomes global on first call, so a second
     * call is a fatal redeclaration.
     *
     * @param string $string Raw feed text, possibly wrapped in CDATA markers
     * @return string The text without the markers
     */
    private function stripCDATA($string) {
        return str_replace(array('<![CDATA[', ']]>'), '', $string);
    }

    /**
     * Downloads a post and returns its body with ads and most markup removed.
     *
     * Must be a method: it needs $this to reach file_get_html().
     *
     * @param string $url Address of the post
     * @return string Cleaned inner HTML of 'div.entry-content', or '' when the
     *                page could not be loaded or has no such element
     */
    private function extractContent($url) {
        $page = $this->file_get_html($url);
        if(!$page) {
            return '';
        }

        $node = $page->find('div.entry-content', 0);
        if(!$node) {
            return '';
        }

        // Drop everything from an ad marker to the end of its line
        $text = preg_replace('/class="ad".*/', '', $node->innertext);
        // Keep only basic text formatting, links and images
        $text = strip_tags($text, '<p><a><i><strong><em><img>');
        // strip_tags() removes the <script> tags but leaves their text behind
        $text = str_replace('(adsbygoogle = window.adsbygoogle || []).push({});', '', $text);

        return $text;
    }

    /**
     * Collects up to MAX_ITEMS items from the feed.
     *
     * @param array $param Request parameters (not used by this bridge)
     */
    public function collectData(array $param){

        $html = $this->file_get_html('http://www.raymond.cc/blog/feed') or $this->returnError('Could not request raymond.', 404);
        $limit = 0;

        foreach($html->find('item') as $element) {
            // Stop as soon as the limit is reached instead of walking the rest of the feed
            if($limit >= self::MAX_ITEMS) {
                break;
            }

            $item = new \Item();
            $item->title = $this->stripCDATA($element->find('title', 0)->innertext);
            $item->uri = $this->stripCDATA($element->find('guid', 0)->plaintext);
            $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext);
            $item->content = $this->extractContent($item->uri);
            $this->items[] = $item;
            $limit++;
        }

    }

    /**
     * @return string Display name of the bridge
     */
    public function getName(){
        return 'raymond';
    }

    /**
     * @return string Address of the bridged site
     */
    public function getURI(){
        return 'http://www.raymond.cc/blog';
    }

    /**
     * @return int Cache lifetime in seconds
     */
    public function getCacheDuration(){
        return 3600*12; // 12 hour
    }
}
|
||||
|
|
@ -1,7 +1,10 @@
|
|||
<?php
|
||||
define('WORDPRESS_TYPE_ATOM', 1); // Content is of type ATOM
|
||||
define('WORDPRESS_TYPE_RSS', 2); // Content is of type RSS
|
||||
class WordPressBridge extends BridgeAbstract {
|
||||
|
||||
private $url;
|
||||
public $sitename; // Name of the site
|
||||
|
||||
public function loadMetadatas() {
|
||||
|
||||
|
@ -9,7 +12,7 @@ class WordPressBridge extends BridgeAbstract {
|
|||
$this->name = "Wordpress Bridge";
|
||||
$this->uri = "https://wordpress.org/";
|
||||
$this->description = "Returns the 3 newest full posts of a Wordpress blog";
|
||||
$this->update = "2016-08-02";
|
||||
$this->update = "2016-08-04";
|
||||
|
||||
$this->parameters[] =
|
||||
'[
|
||||
|
@ -19,10 +22,26 @@ class WordPressBridge extends BridgeAbstract {
|
|||
"identifier" : "url"
|
||||
}
|
||||
]';
|
||||
|
||||
}
|
||||
|
||||
public function collectData(array $param) {
|
||||
// Returns the content type for a given html dom
|
||||
function DetectContentType($html){
    // 'entry' elements take precedence; 'item' is only looked up when none exist
    $atomEntries = $html->find('entry');

    if(!$atomEntries && $html->find('item')){
        return WORDPRESS_TYPE_RSS;
    }

    // Either entries were found or nothing matched at all: ATOM is the default
    return WORDPRESS_TYPE_ATOM;
}
|
||||
|
||||
// Replaces all 'link' tags with 'url' for simplehtmldom to actually find 'links' ('url')
|
||||
function ReplaceLinkTagsWithUrlTags($element){
    // simplehtmldom cannot parse 'link' tags, so rename them and load the
    // resulting markup back as a dom
    $linkForms = array('<link>', '</link>', '<link ');
    $urlForms = array('<url>', '</url>', '<url ');

    return str_get_html(str_replace($linkForms, $urlForms, $element->outertext));
}
|
||||
|
||||
function StripCDATA($string) {
|
||||
$string = str_replace('<![CDATA[', '', $string);
|
||||
|
@ -30,12 +49,14 @@ class WordPressBridge extends BridgeAbstract {
|
|||
return $string;
|
||||
}
|
||||
|
||||
/**
 * Removes scripts, the "wpa" div and forms from a piece of article HTML.
 *
 * Script and form elements are matched lazily, case-insensitively and across
 * line breaks, so a script containing '<' is still removed and the text
 * between two separate forms is kept.
 *
 * @param string $content Article HTML
 * @return string The HTML without the removed parts
 */
function ClearContent($content) {
    $patterns = array(
        '/<script\b[^>]*>.*?<\/script>/is',
        // Drops everything from the "wpa" div to the end of its line
        '/<div class="wpa".*/',
        '/<form\b[^>]*>.*?<\/form>/is',
    );

    foreach($patterns as $pattern) {
        $cleaned = preg_replace($pattern, '', $content);

        // preg_replace() returns null on a PCRE error; keep what we have
        // rather than wiping the whole article
        if($cleaned !== null) {
            $content = $cleaned;
        }
    }

    return $content;
}
|
||||
|
||||
public function collectData(array $param) {
|
||||
$this->processParams($param);
|
||||
|
||||
if (!$this->hasUrl()) {
|
||||
|
@ -44,35 +65,75 @@ class WordPressBridge extends BridgeAbstract {
|
|||
|
||||
$this->url = $this->url.'/feed/atom';
|
||||
$html = $this->file_get_html($this->url) or $this->returnError("Could not request {$this->url}.", 404);
|
||||
|
||||
// Notice: We requested an ATOM feed, however some sites return RSS feeds instead!
|
||||
$type = $this->DetectContentType($html);
|
||||
|
||||
if($type === WORDPRESS_TYPE_RSS)
|
||||
$posts = $html->find('item');
|
||||
else
|
||||
$posts = $html->find('entry');
|
||||
|
||||
if(!empty($posts) ) {
|
||||
$this->name = $html->find('title', 0)->plaintext;
|
||||
$this->sitename = $html->find('title', 0)->plaintext;
|
||||
$i=0;
|
||||
foreach ($html->find('entry') as $article) {
|
||||
|
||||
foreach ($posts as $article) {
|
||||
if($i < 3) {
|
||||
$this->items[$i]->uri = $article->find('link', 0)->getAttribute('href');
|
||||
$this->items[$i]->title = StripCDATA($article->find('title', 0)->plaintext);
|
||||
$this->items[$i]->author = trim($article->find('author', 0)->innertext);
|
||||
$this->items[$i]->timestamp = strtotime($article->find('updated', 0)->innertext);
|
||||
|
||||
$article_html = $this->file_get_html($this->items[$i]->uri);
|
||||
$this->items[$i]->content = clearContent($article_html->find('article', 0)->innertext);
|
||||
if(empty($this->items[$i]->content))
|
||||
$this->items[$i]->content = clearContent($article_html->find('.single-content', 0)->innertext); // another common content div
|
||||
if(empty($this->items[$i]->content))
|
||||
$this->items[$i]->content = clearContent($article_html->find('.post', 0)->innertext); // for old WordPress themes without HTML5
|
||||
$item = new \Item();
|
||||
|
||||
$article = $this->ReplaceLinkTagsWithUrlTags($article);
|
||||
|
||||
if($type === WORDPRESS_TYPE_RSS){
|
||||
$item->uri = $article->find('url', 0)->innertext; // 'link' => 'url'!
|
||||
$item->title = $article->find('title', 0)->plaintext;
|
||||
$item->author = trim($this->StripCDATA($article->find('dc:creator', 0)->innertext));
|
||||
$item->timestamp = strtotime($article->find('pubDate', 0)->innertext);
|
||||
} else {
|
||||
$item->uri = $article->find('url', 0)->getAttribute('href'); // 'link' => 'url'!
|
||||
$item->title = $this->StripCDATA($article->find('title', 0)->plaintext);
|
||||
$item->author = trim($article->find('author', 0)->innertext);
|
||||
$item->timestamp = strtotime($article->find('updated', 0)->innertext);
|
||||
}
|
||||
|
||||
$article_html = $this->file_get_html($item->uri);
|
||||
|
||||
// Attempt to find most common content div
|
||||
if(empty($item->content)){
|
||||
$article = $article_html->find('article', 0);
|
||||
if(!empty($article)){
|
||||
$item->content = $this->ClearContent($article->innertext);
|
||||
}
|
||||
}
|
||||
|
||||
// another common content div
|
||||
if(empty($item->content)){
|
||||
$article = $article_html->find('.single-content', 0);
|
||||
if(!empty($article)){
|
||||
$item->content = $this->ClearContent($article->innertext);
|
||||
}
|
||||
}
|
||||
|
||||
// for old WordPress themes without HTML5
|
||||
if(empty($item->content)){
|
||||
$article = $article_html->find('.post', 0);
|
||||
if(!empty($article)){
|
||||
$item->content = $this->ClearContent($article->innertext);
|
||||
}
|
||||
}
|
||||
|
||||
$this->items[] = $item;
|
||||
$i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
$this->returnError("Sorry, {$this->url} doesn't seem to be a Wordpress blog.", 404);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Returns the display name of the bridge, prefixed with the site name.
 *
 * Uses $this->sitename rather than $this->name; a leading return on
 * $this->name would make this one unreachable.
 *
 * @return string "<site name> - Wordpress Bridge"
 */
public function getName() {
    return "{$this->sitename} - Wordpress Bridge";
}
|
||||
|
||||
public function getURI() {
|
||||
|
@ -93,6 +154,4 @@ class WordPressBridge extends BridgeAbstract {
|
|||
/**
 * Copies the 'url' request parameter into $this->url.
 *
 * @param array $param Request parameters; the 'url' key is read
 */
private function processParams($param) {
    $requestedUrl = $param['url'];
    $this->url = $requestedUrl;
}
|
||||
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue