forked from blallo/rss-bridge
[bridges] make them WordPressBridge derivatives
The content filtering specific to each of these bridges will need to be reintegrated later, either in the bridge itself or, if a filter is considered generic enough, in WordPressBridge (as was done for the existing WordPressBridge <script> removal filter).
Signed-off-by: Pierre Mazière <pierre.maziere@gmx.com>
This commit is contained in:
parent
43ac961284
commit
3f64d2d65a
7 changed files with 32 additions and 209 deletions
|
@ -1,30 +1,11 @@
|
|||
<?php
|
||||
class ArstechnicaBridge extends FeedExpander {
|
||||
require_once('WordPressBridge.php');
|
||||
|
||||
class ArstechnicaBridge extends WordPressBridge {
|
||||
|
||||
const MAINTAINER = "prysme";
|
||||
const NAME = "ArstechnicaBridge";
|
||||
const URI = "http://arstechnica.com";
|
||||
const DESCRIPTION = "The PC enthusiast's resource. Power users and the tools they love, without computing religion";
|
||||
|
||||
protected function parseItem($item){
|
||||
$item = parent::parseItem($item);
|
||||
|
||||
$html = $this->getSimpleHTMLDOMCached($item['uri']);
|
||||
if(!$html){
|
||||
$item['content'] .= '<p>Requesting full article failed.</p>';
|
||||
}else{
|
||||
$item['content'] = $html->find('.article-guts', 0);
|
||||
}
|
||||
|
||||
return $item;
|
||||
}
|
||||
|
||||
public function collectData(){
|
||||
$this->collectExpandableDatas('http://feeds.arstechnica.com/arstechnica/index/');
|
||||
}
|
||||
|
||||
public function getCacheDuration() {
|
||||
return 7200; // 2h
|
||||
}
|
||||
|
||||
const PARAMETERS = array();
|
||||
}
|
||||
|
|
|
@ -1,22 +1,11 @@
|
|||
<?php
|
||||
class FreenewsBridge extends FeedExpander {
|
||||
require_once('WordPressBridge.php');
|
||||
|
||||
class FreenewsBridge extends WordPressBridge {
|
||||
|
||||
const MAINTAINER = "mitsukarenai";
|
||||
const NAME = "Freenews";
|
||||
const URI = "http://freenews.fr";
|
||||
const DESCRIPTION = "Un site d'actualité pour les freenautes (mais ne parlant pas que de la freebox). Ne rentrez pas d'id si vous voulez accéder aux actualités générales.";
|
||||
|
||||
public function collectData(){
|
||||
parent::collectExpandableDatas('http://feeds.feedburner.com/Freenews-Freebox?format=xml');
|
||||
}
|
||||
|
||||
protected function parseItem($newsItem) {
|
||||
$item = parent::parseItem($newsItem);
|
||||
|
||||
$articlePage = $this->getSimpleHTMLDOMCached($item['uri']);
|
||||
$content = $articlePage->find('.post-container', 0);
|
||||
$item['content'] = $content->innertext;
|
||||
|
||||
return $item;
|
||||
}
|
||||
const DESCRIPTION = "Un site d'actualité pour les freenautes (mais ne parlant pas que de la freebox)";
|
||||
const PARAMETERS = array();
|
||||
}
|
||||
|
|
|
@ -1,42 +1,13 @@
|
|||
<?php
|
||||
class LeJournalDuGeekBridge extends FeedExpander {
|
||||
require_once('WordPressBridge.php');
|
||||
|
||||
class LeJournalDuGeekBridge extends WordPressBridge{
|
||||
|
||||
const MAINTAINER = "polopollo";
|
||||
const NAME = "journaldugeek.com (FR)";
|
||||
const URI = "http://www.journaldugeek.com/";
|
||||
const DESCRIPTION = "Returns the 5 newest posts from LeJournalDuGeek (full text).";
|
||||
|
||||
public function collectData(){
|
||||
$this->collectExpandableDatas(self::URI . 'rss', 5);
|
||||
}
|
||||
|
||||
protected function parseItem($newsItem){
|
||||
$item = parent::parseItem($newsItem);
|
||||
$item['content'] = $this->LeJournalDuGeekExtractContent($item['uri']);
|
||||
return $item;
|
||||
}
|
||||
|
||||
private function LeJournalDuGeekExtractContent($url) {
|
||||
$articleHTMLContent = $this->getSimpleHTMLDOMCached($url);
|
||||
$text = $articleHTMLContent->find('div.post-content', 0)->innertext;
|
||||
|
||||
foreach($articleHTMLContent->find('a.more') as $element) {
|
||||
if ($element->innertext == "Source") {
|
||||
$text = $text . '<p><a href="' . $element->href . '">Source : ' . $element->href . '</a></p>';
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
foreach($articleHTMLContent->find('iframe') as $element) {
|
||||
if (preg_match("/youtube/i", $element->src)) {
|
||||
$text = $text . '// An IFRAME to Youtube was included in the article: <a href="' . $element->src . '">' . $element->src . '</a><br>';
|
||||
}
|
||||
}
|
||||
|
||||
$text = preg_replace('#<script(.*?)>(.*?)</script>#is', '', $text);
|
||||
$text = strip_tags($text, '<p><b><a><blockquote><img><em><br/><br><ul><li>');
|
||||
return $text;
|
||||
}
|
||||
const DESCRIPTION = "Returns the newest posts from LeJournalDuGeek (full text).";
|
||||
const PARAMETERS = array();
|
||||
|
||||
public function getCacheDuration(){
|
||||
return 1800; // 30min
|
||||
|
|
|
@ -1,61 +1,11 @@
|
|||
<?php
|
||||
class NakedSecurityBridge extends FeedExpander {
|
||||
require_once('WordPressBridge.php');
|
||||
|
||||
class NakedSecurityBridge extends WordPressBridge {
|
||||
|
||||
const MAINTAINER = 'ORelio';
|
||||
const NAME = 'Naked Security';
|
||||
const URI = 'https://nakedsecurity.sophos.com/';
|
||||
const DESCRIPTION = 'Returns the newest articles.';
|
||||
|
||||
private function StripRecursiveHTMLSection($string, $tag_name, $tag_start) {
|
||||
$open_tag = '<'.$tag_name;
|
||||
$close_tag = '</'.$tag_name.'>';
|
||||
$close_tag_length = strlen($close_tag);
|
||||
if (strpos($tag_start, $open_tag) === 0) {
|
||||
while (strpos($string, $tag_start) !== false) {
|
||||
$max_recursion = 100;
|
||||
$section_to_remove = null;
|
||||
$section_start = strpos($string, $tag_start);
|
||||
$search_offset = $section_start;
|
||||
do {
|
||||
$max_recursion--;
|
||||
$section_end = strpos($string, $close_tag, $search_offset);
|
||||
$search_offset = $section_end + $close_tag_length;
|
||||
$section_to_remove = substr($string, $section_start, $section_end - $section_start + $close_tag_length);
|
||||
$open_tag_count = substr_count($section_to_remove, $open_tag);
|
||||
$close_tag_count = substr_count($section_to_remove, $close_tag);
|
||||
} while ($open_tag_count > $close_tag_count && $max_recursion > 0);
|
||||
$string = str_replace($section_to_remove, '', $string);
|
||||
}
|
||||
}
|
||||
return $string;
|
||||
}
|
||||
|
||||
|
||||
protected function parseItem($item){
|
||||
$item = parent::parseItem($item);
|
||||
|
||||
$article_html = $this->getSimpleHTMLDOMCached($item['uri']);
|
||||
if(!$article_html){
|
||||
$item['content'] = 'Could not request '.$this->getName().': '.$item['uri'];
|
||||
return $item;
|
||||
}
|
||||
|
||||
$article_image = $article_html->find('img.wp-post-image', 0)->src;
|
||||
$article_content = $article_html->find('div.entry-content', 0)->innertext;
|
||||
$article_content = $this->StripRecursiveHTMLSection($article_content , 'div', '<div class="entry-prefix"');
|
||||
$article_content = $this->StripRecursiveHTMLSection($article_content , 'script', '<script');
|
||||
$article_content = $this->StripRecursiveHTMLSection($article_content , 'aside', '<aside');
|
||||
$article_content = '<p><img src="'.$article_image.'" /></p><p><b>'.$item['content'].'</b></p>'.$article_content;
|
||||
|
||||
$item['content'] = $article_content;
|
||||
|
||||
return $item;
|
||||
|
||||
}
|
||||
|
||||
public function collectData(){
|
||||
|
||||
$feedUrl = 'https://feeds.feedburner.com/nakedsecurity?format=xml';
|
||||
$this->collectExpandableDatas($feedUrl);
|
||||
}
|
||||
const PARAMETERS = array();
|
||||
}
|
||||
|
|
|
@ -1,29 +1,15 @@
|
|||
<?php
|
||||
class NumeramaBridge extends FeedExpander {
|
||||
require_once('WordPressBridge.php');
|
||||
|
||||
class NumeramaBridge extends WordPressBridge {
|
||||
|
||||
const MAINTAINER = 'mitsukarenai';
|
||||
const NAME = 'Numerama';
|
||||
const URI = 'http://www.numerama.com/';
|
||||
const DESCRIPTION = 'Returns the 5 newest posts from Numerama (full text)';
|
||||
|
||||
public function collectData(){
|
||||
$this->collectExpandableDatas(self::URI . 'feed/', 5);
|
||||
}
|
||||
|
||||
protected function parseItem($newsItem){
|
||||
$item = parent::parseItem($newsItem);
|
||||
$item['content'] = $this->ExtractContent($item['uri']);
|
||||
return $item;
|
||||
}
|
||||
|
||||
private function ExtractContent($url){
|
||||
$article_html = $this->getSimpleHTMLDOMCached('Could not request Numerama: '.$url);
|
||||
$contents = $article_html->find('section[class=related-article]', 0)->innertext = ''; // remove related articles block
|
||||
$contents = '<img alt="" style="max-width:300px;" src="'.$article_html->find('meta[property=og:image]', 0)->getAttribute('content').'">'; // add post picture
|
||||
return $contents . $article_html->find('article[class=post-content]', 0)->innertext; // extract the post
|
||||
}
|
||||
|
||||
const DESCRIPTION = 'Returns the newest posts from Numerama (full text)';
|
||||
const PARAMETERS = array();
|
||||
public function getCacheDuration() {
|
||||
|
||||
return 1800; // 30min
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,39 +1,13 @@
|
|||
<?php
|
||||
class SiliconBridge extends FeedExpander {
|
||||
require_once('WordPressBridge.php');
|
||||
|
||||
class SiliconBridge extends WordPressBridge {
|
||||
|
||||
const MAINTAINER = "ORelio";
|
||||
const NAME = 'Silicon Bridge';
|
||||
const URI = 'http://www.silicon.fr/';
|
||||
const DESCRIPTION = "Returns the newest articles.";
|
||||
|
||||
protected function parseItem($item){
|
||||
$item = parent::parseItem($item);
|
||||
|
||||
$article_html = $this->getSimpleHTMLDOMCached($item['uri']);
|
||||
if(!$article_html){
|
||||
$item['content'] .= '<p>Could not request Silicon: '.$item['uri'].'</p>';
|
||||
return $item;
|
||||
}
|
||||
|
||||
$article_content = '<p><b>'.$article_html->find('div.entry-excerpt', 0)->plaintext.'</b></p>'
|
||||
.$article_html->find('div.entry-content', 0)->innertext;
|
||||
|
||||
//Remove useless scripts left in the page
|
||||
while (strpos($article_content, '<script') !== false) {
|
||||
$script_section = substr($article_content, strpos($article_content, '<script'));
|
||||
$script_section = substr($script_section, 0, strpos($script_section, '</script>') + 9);
|
||||
$article_content = str_replace($script_section, '', $article_content);
|
||||
}
|
||||
|
||||
$item['content'] = $article_content;
|
||||
|
||||
return $item;
|
||||
}
|
||||
|
||||
public function collectData(){
|
||||
$feedUrl = self::URI.'feed';
|
||||
$this->collectExpandableDatas($feedUrl);
|
||||
}
|
||||
const PARAMETERS = array();
|
||||
|
||||
public function getCacheDuration() {
|
||||
return 1800; // 30 minutes
|
||||
|
|
|
@ -1,41 +1,13 @@
|
|||
<?php
|
||||
class ZatazBridge extends BridgeAbstract {
|
||||
require_once('WordPressBridge.php');
|
||||
|
||||
class ZatazBridge extends WordPressBridge{
|
||||
|
||||
const MAINTAINER = "aledeg";
|
||||
const NAME = 'Zataz Magazine';
|
||||
const URI = 'http://www.zataz.com';
|
||||
const DESCRIPTION = "ZATAZ Magazine - S'informer, c'est déjà se sécuriser";
|
||||
|
||||
public function collectData(){
|
||||
$html = $this->getSimpleHTMLDOM(self::URI) or $this->returnServerError('Could not request ' . self::URI);
|
||||
|
||||
$recent_posts = $html->find('#recent-posts-3', 0)->find('ul', 0)->find('li');
|
||||
foreach ($recent_posts as $article) {
|
||||
if (count($this->items) < 5) {
|
||||
$uri = $article->find('a', 0)->href;
|
||||
$this->items[] = $this->getDetails($uri);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private function getDetails($uri) {
|
||||
$html = $this->getSimpleHTMLDOM($uri) or exit;
|
||||
|
||||
$item = array();
|
||||
|
||||
$article = $html->find('.gdl-blog-full', 0);
|
||||
$item['uri'] = $uri;
|
||||
$item['title'] = $article->find('.blog-title', 0)->find('a', 0)->innertext;
|
||||
$item['content'] = $article->find('.blog-content', 0)->innertext;
|
||||
$item['timestamp'] = $this->getTimestampFromDate($article->find('.blog-date', 0)->find('a', 0)->href);
|
||||
return $item;
|
||||
}
|
||||
|
||||
private function getTimestampFromDate($uri) {
|
||||
preg_match('/\d{4}\/\d{2}\/\d{2}/', $uri, $matches);
|
||||
$date = new \DateTime($matches[0]);
|
||||
return $date->format('U');
|
||||
}
|
||||
const PARAMETERS = array();
|
||||
|
||||
public function getCacheDuration() {
|
||||
return 7200; // 2h
|
||||
|
|
Loading…
Reference in a new issue