1
0
Fork 0
forked from blallo/rss-bridge

[bridges] make them WordPressBridge derivatives

The site-specific content filtering previously done by these bridges
will need to be reintegrated later, either in each individual bridge
or, for filters that are generic enough, in WordPressBridge itself
(like the already existing WordPressBridge <script> removal filter).

Signed-off-by: Pierre Mazière <pierre.maziere@gmx.com>
This commit is contained in:
Pierre Mazière 2016-09-15 12:40:26 +02:00
parent 43ac961284
commit 3f64d2d65a
7 changed files with 32 additions and 209 deletions

View file

@ -1,30 +1,11 @@
<?php <?php
class ArstechnicaBridge extends FeedExpander { require_once('WordPressBridge.php');
class ArstechnicaBridge extends WordPressBridge {
const MAINTAINER = "prysme"; const MAINTAINER = "prysme";
const NAME = "ArstechnicaBridge"; const NAME = "ArstechnicaBridge";
const URI = "http://arstechnica.com"; const URI = "http://arstechnica.com";
const DESCRIPTION = "The PC enthusiast's resource. Power users and the tools they love, without computing religion"; const DESCRIPTION = "The PC enthusiast's resource. Power users and the tools they love, without computing religion";
const PARAMETERS = array();
/**
 * Expand a feed entry with the full article body.
 *
 * The entry is first parsed by the parent class, then the article page
 * referenced by $item['uri'] is fetched and its '.article-guts' element
 * replaces the feed-provided content.
 *
 * @param mixed $item Raw feed entry, passed through to parent::parseItem().
 * @return array The parsed item; 'content' holds the article element, or the
 *               original feed content plus a failure note.
 */
protected function parseItem($item){
    $item = parent::parseItem($item);

    $html = $this->getSimpleHTMLDOMCached($item['uri']);
    // find() yields null when the selector matches nothing; treat that like a
    // failed request so the feed content is never overwritten with null.
    $articleBody = $html ? $html->find('.article-guts', 0) : null;

    if(!$articleBody){
        $item['content'] .= '<p>Requesting full article failed.</p>';
    }else{
        $item['content'] = $articleBody;
    }

    return $item;
}
/**
 * Collect items from the Ars Technica index feed.
 */
public function collectData(){
    $feedUrl = 'http://feeds.arstechnica.com/arstechnica/index/';
    $this->collectExpandableDatas($feedUrl);
}
/**
 * @return int Cache lifetime in seconds (two hours).
 */
public function getCacheDuration() {
    $twoHoursInSeconds = 7200;
    return $twoHoursInSeconds;
}
} }

View file

@ -1,22 +1,11 @@
<?php <?php
class FreenewsBridge extends FeedExpander { require_once('WordPressBridge.php');
class FreenewsBridge extends WordPressBridge {
const MAINTAINER = "mitsukarenai"; const MAINTAINER = "mitsukarenai";
const NAME = "Freenews"; const NAME = "Freenews";
const URI = "http://freenews.fr"; const URI = "http://freenews.fr";
const DESCRIPTION = "Un site d'actualité pour les freenautes (mais ne parlant pas que de la freebox). Ne rentrez pas d'id si vous voulez accéder aux actualités générales."; const DESCRIPTION = "Un site d'actualité pour les freenautes (mais ne parlant pas que de la freebox)";
const PARAMETERS = array();
/**
 * Collect items from the Freenews FeedBurner feed.
 */
public function collectData(){
    $feedUrl = 'http://feeds.feedburner.com/Freenews-Freebox?format=xml';
    parent::collectExpandableDatas($feedUrl);
}
/**
 * Expand a feed entry with the full article body.
 *
 * After the parent class has parsed the entry, the article page at
 * $item['uri'] is fetched and the inner HTML of its '.post-container'
 * element becomes the item content.
 *
 * @param mixed $newsItem Raw feed entry, passed through to parent::parseItem().
 * @return array The parsed item; when the page cannot be fetched or has no
 *               '.post-container', the feed content is kept and a failure
 *               note is appended.
 */
protected function parseItem($newsItem) {
    $item = parent::parseItem($newsItem);

    $articlePage = $this->getSimpleHTMLDOMCached($item['uri']);
    // Guard both the request and the lookup: dereferencing a failed fetch or
    // a null find() result would abort the whole feed with a fatal error.
    $content = $articlePage ? $articlePage->find('.post-container', 0) : null;
    if (!$content) {
        $item['content'] .= '<p>Requesting full article failed.</p>';
        return $item;
    }

    $item['content'] = $content->innertext;
    return $item;
}
} }

View file

@ -1,42 +1,13 @@
<?php <?php
class LeJournalDuGeekBridge extends FeedExpander { require_once('WordPressBridge.php');
class LeJournalDuGeekBridge extends WordPressBridge{
const MAINTAINER = "polopollo"; const MAINTAINER = "polopollo";
const NAME = "journaldugeek.com (FR)"; const NAME = "journaldugeek.com (FR)";
const URI = "http://www.journaldugeek.com/"; const URI = "http://www.journaldugeek.com/";
const DESCRIPTION = "Returns the 5 newest posts from LeJournalDuGeek (full text)."; const DESCRIPTION = "Returns the newest posts from LeJournalDuGeek (full text).";
const PARAMETERS = array();
/**
 * Collect the five newest items from the site's RSS feed.
 */
public function collectData(){
    $feedUrl = self::URI . 'rss';
    $maxItems = 5;
    $this->collectExpandableDatas($feedUrl, $maxItems);
}
/**
 * Parse a feed entry and replace its content with the text extracted from
 * the article page.
 *
 * @param mixed $newsItem Raw feed entry, passed through to parent::parseItem().
 * @return array The parsed item with 'content' replaced.
 */
protected function parseItem($newsItem){
    $parsed = parent::parseItem($newsItem);
    $fullText = $this->LeJournalDuGeekExtractContent($parsed['uri']);
    $parsed['content'] = $fullText;
    return $parsed;
}
/**
 * Fetch an article page and return a simplified version of its body.
 *
 * The inner HTML of 'div.post-content' is taken as the base text. A link to
 * the first 'a.more' element labelled "Source" is appended, followed by a
 * plain link for every iframe whose src mentions YouTube. Script elements
 * are removed and the result is reduced to a small set of allowed tags.
 *
 * @param string $url Address of the article page.
 * @return string Filtered HTML, or a short failure note when the page cannot
 *                be fetched or contains no 'div.post-content' element.
 */
private function LeJournalDuGeekExtractContent($url) {
    $articleHTMLContent = $this->getSimpleHTMLDOMCached($url);
    // Guard the request and the lookup: dereferencing a failed fetch or a
    // null find() result would end the whole feed with a fatal error.
    $postContent = $articleHTMLContent ? $articleHTMLContent->find('div.post-content', 0) : null;
    if (!$postContent) {
        return '<p>Requesting full article failed.</p>';
    }

    $text = $postContent->innertext;

    // Only the first "Source" link is reported.
    foreach ($articleHTMLContent->find('a.more') as $element) {
        if ($element->innertext === "Source") {
            $text = $text . '<p><a href="' . $element->href . '">Source : ' . $element->href . '</a></p>';
            break;
        }
    }

    // iframes do not survive strip_tags() below, so embedded YouTube players
    // are turned into plain links.
    foreach ($articleHTMLContent->find('iframe') as $element) {
        if (preg_match("/youtube/i", $element->src)) {
            $text = $text . '// An IFRAME to Youtube was included in the article: <a href="' . $element->src . '">' . $element->src . '</a><br>';
        }
    }

    $text = preg_replace('#<script(.*?)>(.*?)</script>#is', '', $text);
    $text = strip_tags($text, '<p><b><a><blockquote><img><em><br/><br><ul><li>');

    return $text;
}
public function getCacheDuration(){ public function getCacheDuration(){
return 1800; // 30min return 1800; // 30min

View file

@ -1,61 +1,11 @@
<?php <?php
class NakedSecurityBridge extends FeedExpander { require_once('WordPressBridge.php');
class NakedSecurityBridge extends WordPressBridge {
const MAINTAINER = 'ORelio'; const MAINTAINER = 'ORelio';
const NAME = 'Naked Security'; const NAME = 'Naked Security';
const URI = 'https://nakedsecurity.sophos.com/'; const URI = 'https://nakedsecurity.sophos.com/';
const DESCRIPTION = 'Returns the newest articles.'; const DESCRIPTION = 'Returns the newest articles.';
const PARAMETERS = array();
/**
 * Remove every section of $string that starts with $tag_start, including
 * nested elements of the same tag name, up to the matching closing tag.
 *
 * Matching is purely textual: opening tags are counted as occurrences of
 * '<' . $tag_name and closing tags as occurrences of '</' . $tag_name . '>'.
 * The section is extended closing tag by closing tag until the counts
 * balance, or until 100 closing tags have been examined.
 *
 * If no closing tag can be found for a section, processing stops and the
 * string is returned as it stands. Previously that case could loop forever,
 * because nothing was removed while the start marker was still present.
 *
 * @param string $string    HTML to filter.
 * @param string $tag_name  Tag name, e.g. 'div'.
 * @param string $tag_start Beginning of the opening tag to look for, e.g.
 *                          '<div class="entry-prefix"'. It must start with
 *                          '<' . $tag_name, otherwise $string is returned
 *                          untouched.
 * @return string The filtered HTML.
 */
private function StripRecursiveHTMLSection($string, $tag_name, $tag_start) {
    $open_tag = '<'.$tag_name;
    $close_tag = '</'.$tag_name.'>';
    $close_tag_length = strlen($close_tag);

    if (strpos($tag_start, $open_tag) !== 0) {
        return $string;
    }

    while (($section_start = strpos($string, $tag_start)) !== false) {
        $max_recursion = 100;
        $search_offset = $section_start;

        do {
            $max_recursion--;
            $section_end = strpos($string, $close_tag, $search_offset);
            if ($section_end === false) {
                // Unterminated section: nothing can be removed safely.
                return $string;
            }
            $search_offset = $section_end + $close_tag_length;
            $section_length = $search_offset - $section_start;
            $section_to_remove = substr($string, $section_start, $section_length);
            $open_tag_count = substr_count($section_to_remove, $open_tag);
            $close_tag_count = substr_count($section_to_remove, $close_tag);
        } while ($open_tag_count > $close_tag_count && $max_recursion > 0);

        // Cut out exactly the span that was measured, so the string shrinks
        // on every pass and the loop is guaranteed to terminate.
        $string = substr_replace($string, '', $section_start, $section_length);
    }

    return $string;
}
/**
 * Expand a feed entry with the full article from the site.
 *
 * The resulting content is the post image (when the page has one), the
 * feed-provided content in bold, then the inner HTML of 'div.entry-content'
 * with the 'entry-prefix' div, scripts and asides stripped out.
 *
 * @param mixed $item Raw feed entry, passed through to parent::parseItem().
 * @return array The parsed item. When the page cannot be requested, 'content'
 *               is an error message; when the page has no 'div.entry-content',
 *               the feed content is left untouched.
 */
protected function parseItem($item){
    $item = parent::parseItem($item);

    $article_html = $this->getSimpleHTMLDOMCached($item['uri']);
    if(!$article_html){
        $item['content'] = 'Could not request '.$this->getName().': '.$item['uri'];
        return $item;
    }

    // find() yields null when nothing matches; dereferencing it would abort
    // the whole feed, so fall back to the feed content instead.
    $content_node = $article_html->find('div.entry-content', 0);
    if(!$content_node){
        return $item;
    }

    $article_content = $content_node->innertext;
    $article_content = $this->StripRecursiveHTMLSection($article_content , 'div', '<div class="entry-prefix"');
    $article_content = $this->StripRecursiveHTMLSection($article_content , 'script', '<script');
    $article_content = $this->StripRecursiveHTMLSection($article_content , 'aside', '<aside');

    // The post image is optional: emit the <img> paragraph only when present.
    $image_node = $article_html->find('img.wp-post-image', 0);
    $image_html = $image_node ? '<p><img src="'.$image_node->src.'" /></p>' : '';

    $item['content'] = $image_html.'<p><b>'.$item['content'].'</b></p>'.$article_content;
    return $item;
}
/**
 * Collect items from the Naked Security FeedBurner feed.
 */
public function collectData(){
    $this->collectExpandableDatas('https://feeds.feedburner.com/nakedsecurity?format=xml');
}
} }

View file

@ -1,29 +1,15 @@
<?php <?php
class NumeramaBridge extends FeedExpander { require_once('WordPressBridge.php');
class NumeramaBridge extends WordPressBridge {
const MAINTAINER = 'mitsukarenai'; const MAINTAINER = 'mitsukarenai';
const NAME = 'Numerama'; const NAME = 'Numerama';
const URI = 'http://www.numerama.com/'; const URI = 'http://www.numerama.com/';
const DESCRIPTION = 'Returns the 5 newest posts from Numerama (full text)'; const DESCRIPTION = 'Returns the newest posts from Numerama (full text)';
const PARAMETERS = array();
/**
 * Collect the five newest items from the site's feed.
 */
public function collectData(){
    $feedUrl = self::URI . 'feed/';
    $maxItems = 5;
    $this->collectExpandableDatas($feedUrl, $maxItems);
}
/**
 * Parse a feed entry and replace its content with the HTML extracted from
 * the article page.
 *
 * @param mixed $newsItem Raw feed entry, passed through to parent::parseItem().
 * @return array The parsed item with 'content' replaced.
 */
protected function parseItem($newsItem){
    $parsed = parent::parseItem($newsItem);
    $fullText = $this->ExtractContent($parsed['uri']);
    $parsed['content'] = $fullText;
    return $parsed;
}
/**
 * Fetch an article page and build the item content from it.
 *
 * The related-articles section is emptied, then the content is assembled
 * from the og:image picture (when declared) followed by the inner HTML of
 * the 'article[class=post-content]' element (when present).
 *
 * @param string $url Address of the article page.
 * @return string HTML content, or an error message when the page cannot be
 *                requested.
 */
private function ExtractContent($url){
    // The request must use the bare URL. The error message used to be
    // concatenated in front of it, so the fetch could never succeed.
    $article_html = $this->getSimpleHTMLDOMCached($url);
    if(!$article_html){
        return 'Could not request Numerama: '.$url;
    }

    // Empty the related-articles block so it does not end up in the post.
    $related = $article_html->find('section[class=related-article]', 0);
    if($related){
        $related->innertext = '';
    }

    $contents = '';

    // Post picture, taken from the Open Graph metadata.
    $picture = $article_html->find('meta[property=og:image]', 0);
    if($picture){
        $contents = '<img alt="" style="max-width:300px;" src="'.$picture->getAttribute('content').'">';
    }

    // The post itself.
    $post = $article_html->find('article[class=post-content]', 0);
    if($post){
        $contents .= $post->innertext;
    }

    return $contents;
}
public function getCacheDuration() { public function getCacheDuration() {
return 1800; // 30min return 1800; // 30min
} }
} }

View file

@ -1,39 +1,13 @@
<?php <?php
class SiliconBridge extends FeedExpander { require_once('WordPressBridge.php');
class SiliconBridge extends WordPressBridge {
const MAINTAINER = "ORelio"; const MAINTAINER = "ORelio";
const NAME = 'Silicon Bridge'; const NAME = 'Silicon Bridge';
const URI = 'http://www.silicon.fr/'; const URI = 'http://www.silicon.fr/';
const DESCRIPTION = "Returns the newest articles."; const DESCRIPTION = "Returns the newest articles.";
const PARAMETERS = array();
/**
 * Expand a feed entry with the full article from the site.
 *
 * The content becomes the plain-text excerpt in bold (when the page has a
 * 'div.entry-excerpt') followed by the inner HTML of 'div.entry-content'
 * with all script elements removed.
 *
 * @param mixed $item Raw feed entry, passed through to parent::parseItem().
 * @return array The parsed item; when the page cannot be requested or has no
 *               'div.entry-content', the feed content is kept and an error
 *               note is appended.
 */
protected function parseItem($item){
    $item = parent::parseItem($item);

    $article_html = $this->getSimpleHTMLDOMCached($item['uri']);
    // find() yields null when nothing matches; handle that like a failed
    // request instead of dereferencing null.
    $content_node = $article_html ? $article_html->find('div.entry-content', 0) : null;
    if(!$content_node){
        $item['content'] .= '<p>Could not request Silicon: '.$item['uri'].'</p>';
        return $item;
    }

    $excerpt_node = $article_html->find('div.entry-excerpt', 0);
    $excerpt_html = $excerpt_node ? '<p><b>'.$excerpt_node->plaintext.'</b></p>' : '';
    $article_content = $excerpt_html.$content_node->innertext;

    //Remove useless scripts left in the page
    while (($script_start = strpos($article_content, '<script')) !== false) {
        $script_end = strpos($article_content, '</script>', $script_start);
        if ($script_end === false) {
            // Unterminated script: drop everything from its opening tag on,
            // rather than leaving raw script source in the content.
            $article_content = substr($article_content, 0, $script_start);
            break;
        }
        // 9 === strlen('</script>')
        $article_content = substr_replace($article_content, '', $script_start, $script_end + 9 - $script_start);
    }

    $item['content'] = $article_content;
    return $item;
}
/**
 * Collect items from the site's feed.
 */
public function collectData(){
    $this->collectExpandableDatas(self::URI.'feed');
}
public function getCacheDuration() { public function getCacheDuration() {
return 1800; // 30 minutes return 1800; // 30 minutes

View file

@ -1,41 +1,13 @@
<?php <?php
class ZatazBridge extends BridgeAbstract { require_once('WordPressBridge.php');
class ZatazBridge extends WordPressBridge{
const MAINTAINER = "aledeg"; const MAINTAINER = "aledeg";
const NAME = 'Zataz Magazine'; const NAME = 'Zataz Magazine';
const URI = 'http://www.zataz.com'; const URI = 'http://www.zataz.com';
const DESCRIPTION = "ZATAZ Magazine - S'informer, c'est déjà se sécuriser"; const DESCRIPTION = "ZATAZ Magazine - S'informer, c'est déjà se sécuriser";
const PARAMETERS = array();
/**
 * Collect up to five items from the "recent posts" list of the home page.
 *
 * Each list entry's first link is followed and turned into an item by
 * getDetails(). The cap of five applies to the total size of $this->items.
 */
public function collectData(){
    $html = $this->getSimpleHTMLDOM(self::URI) or $this->returnServerError('Could not request ' . self::URI);

    // Walk the selector chain step by step: find() yields null when nothing
    // matches, and chaining on null would be a fatal error.
    $widget = $html->find('#recent-posts-3', 0);
    $list = $widget ? $widget->find('ul', 0) : null;
    if (!$list) {
        // NOTE(review): returnServerError() presumably aborts the request;
        // the explicit return covers the case where it does not.
        $this->returnServerError('Could not find recent posts on ' . self::URI);
        return;
    }

    foreach ($list->find('li') as $article) {
        // Stop as soon as the cap is reached instead of scanning the rest.
        if (count($this->items) >= 5) {
            break;
        }
        $link = $article->find('a', 0);
        if (!$link) {
            continue;
        }
        $this->items[] = $this->getDetails($link->href);
    }
}
/**
 * Fetch an article page and build a feed item from it.
 *
 * Title, content and date are read from the '.gdl-blog-full' element of the
 * page; the date is derived from the href of the link inside '.blog-date'.
 *
 * @param string $uri Address of the article page.
 * @return array Item with the keys 'uri', 'title', 'content' and 'timestamp'.
 */
private function getDetails($uri) {
    // Report a failed request the same way collectData() does. A bare `exit`
    // here used to end the whole script silently, without any error output.
    $html = $this->getSimpleHTMLDOM($uri) or $this->returnServerError('Could not request ' . $uri);

    $article = $html->find('.gdl-blog-full', 0);

    $item = array();
    $item['uri'] = $uri;
    $item['title'] = $article->find('.blog-title', 0)->find('a', 0)->innertext;
    $item['content'] = $article->find('.blog-content', 0)->innertext;
    $item['timestamp'] = $this->getTimestampFromDate($article->find('.blog-date', 0)->find('a', 0)->href);

    return $item;
}
/**
 * Derive a Unix timestamp from a URI containing a YYYY/MM/DD path segment.
 *
 * @param string $uri URI expected to contain a date such as '2016/09/15'.
 * @return string Unix timestamp as produced by DateTime::format('U'). When
 *                the URI holds no date, the current time is used.
 */
private function getTimestampFromDate($uri) {
    // Make the no-match fallback explicit. Reading $matches[0] without a
    // match raised an undefined-offset notice and also resolved to "now".
    $found = preg_match('/\d{4}\/\d{2}\/\d{2}/', $uri, $matches);
    $date = new \DateTime($found === 1 ? $matches[0] : 'now');
    return $date->format('U');
}
public function getCacheDuration() { public function getCacheDuration() {
return 7200; // 2h return 7200; // 2h