1
0
Fork 0
forked from blallo/rss-bridge

Merge branch 'HttpCachingBridgeAbstract' of https://github.com/logmanoriginal/rss-bridge

This commit is contained in:
logmanoriginal 2016-09-10 19:16:04 +02:00
commit 5ad2e38927
24 changed files with 83 additions and 73 deletions

View file

@ -16,7 +16,7 @@ class CADBridge extends FeedExpander {
} }
private function CADExtractContent($url) { private function CADExtractContent($url) {
$html3 = $this->get_cached($url); $html3 = $this->getSimpleHTMLDOMCached($url);
// The request might fail due to missing https support or wrong URL // The request might fail due to missing https support or wrong URL
if($html3 == false) if($html3 == false)

View file

@ -17,7 +17,7 @@ class CommonDreamsBridge extends FeedExpander {
} }
private function CommonDreamsExtractContent($url) { private function CommonDreamsExtractContent($url) {
$html3 = $this->get_cached($url); $html3 = $this->getSimpleHTMLDOMCached($url);
$text = $html3->find('div[class=field--type-text-with-summary]', 0)->innertext; $text = $html3->find('div[class=field--type-text-with-summary]', 0)->innertext;
$html3->clear(); $html3->clear();
unset ($html3); unset ($html3);

View file

@ -1,5 +1,5 @@
<?php <?php
class CpasbienBridge extends HttpCachingBridgeAbstract{ class CpasbienBridge extends BridgeAbstract {
const MAINTAINER = "lagaisse"; const MAINTAINER = "lagaisse";
const NAME = "Cpasbien Bridge"; const NAME = "Cpasbien Bridge";
@ -23,7 +23,7 @@ class CpasbienBridge extends HttpCachingBridgeAbstract{
if ($episode->getAttribute('class')=='ligne0' || if ($episode->getAttribute('class')=='ligne0' ||
$episode->getAttribute('class')=='ligne1') $episode->getAttribute('class')=='ligne1')
{ {
$htmlepisode=$this->get_cached($episode->find('a', 0)->getAttribute('href')); $htmlepisode=$this->getSimpleHTMLDOMCached($episode->find('a', 0)->getAttribute('href'));
$item = array(); $item = array();
$item['author'] = $episode->find('a', 0)->text(); $item['author'] = $episode->find('a', 0)->text();

View file

@ -47,7 +47,7 @@ class DauphineLibereBridge extends FeedExpander {
} }
private function ExtractContent($url) { private function ExtractContent($url) {
$html2 = $this->get_cached($url); $html2 = $this->getSimpleHTMLDOMCached($url);
$text = $html2->find('div.column', 0)->innertext; $text = $html2->find('div.column', 0)->innertext;
$text = preg_replace('@<script[^>]*?>.*?</script>@si', '', $text); $text = preg_replace('@<script[^>]*?>.*?</script>@si', '', $text);
return $text; return $text;

View file

@ -42,7 +42,7 @@ class DeveloppezDotComBridge extends FeedExpander {
} }
private function DeveloppezDotComExtractContent($url) { private function DeveloppezDotComExtractContent($url) {
$articleHTMLContent = $this->get_cached($url); $articleHTMLContent = $this->getSimpleHTMLDOMCached($url);
$text = $this->convert_smart_quotes($articleHTMLContent->find('div.content', 0)->innertext); $text = $this->convert_smart_quotes($articleHTMLContent->find('div.content', 0)->innertext);
$text = utf8_encode($text); $text = utf8_encode($text);
return trim($text); return trim($text);

View file

@ -13,7 +13,7 @@ class FreenewsBridge extends FeedExpander {
protected function parseItem($newsItem) { protected function parseItem($newsItem) {
$item = $this->parseRSS_2_0_Item($newsItem); $item = $this->parseRSS_2_0_Item($newsItem);
$articlePage = $this->get_cached($item['uri']); $articlePage = $this->getSimpleHTMLDOMCached($item['uri']);
$content = $articlePage->find('.post-container', 0); $content = $articlePage->find('.post-container', 0);
$item['content'] = $content->innertext; $item['content'] = $content->innertext;

View file

@ -86,7 +86,7 @@ class FuturaSciencesBridge extends FeedExpander {
protected function parseItem($newsItem){ protected function parseItem($newsItem){
$item = $this->parseRSS_2_0_Item($newsItem); $item = $this->parseRSS_2_0_Item($newsItem);
$item['uri'] = str_replace('#xtor=RSS-8', '', $item['uri']); $item['uri'] = str_replace('#xtor=RSS-8', '', $item['uri']);
$article = $this->get_cached($item['uri']) $article = $this->getSimpleHTMLDOMCached($item['uri'])
or $this->returnServerError('Could not request Futura-Sciences: ' . $item['uri']); or $this->returnServerError('Could not request Futura-Sciences: ' . $item['uri']);
$item['content'] = $this->ExtractArticleContent($article); $item['content'] = $this->ExtractArticleContent($article);
$item['author'] = empty($this->ExtractAuthor($article)) ? $item['author'] : $this->ExtractAuthor($article); $item['author'] = empty($this->ExtractAuthor($article)) ? $item['author'] : $this->ExtractAuthor($article);

View file

@ -1,5 +1,5 @@
<?php <?php
class JapanExpoBridge extends HttpCachingBridgeAbstract { class JapanExpoBridge extends BridgeAbstract {
const MAINTAINER = 'Ginko'; const MAINTAINER = 'Ginko';
const NAME = 'Japan Expo Actualités'; const NAME = 'Japan Expo Actualités';
@ -60,7 +60,7 @@ class JapanExpoBridge extends HttpCachingBridgeAbstract {
break; break;
} }
$article_html = $this->get_cached($url) or $this->returnServerError('Could not request JapanExpo: '.$url); $article_html = $this->getSimpleHTMLDOMCached('Could not request JapanExpo: '.$url);
$header = $article_html->find('header.pageHeadBox', 0); $header = $article_html->find('header.pageHeadBox', 0);
$timestamp = strtotime($header->find('time', 0)->datetime); $timestamp = strtotime($header->find('time', 0)->datetime);
$title_html = $header->find('div.section', 0)->next_sibling(); $title_html = $header->find('div.section', 0)->next_sibling();

View file

@ -1,5 +1,5 @@
<?php <?php
class KununuBridge extends HttpCachingBridgeAbstract { class KununuBridge extends BridgeAbstract {
const MAINTAINER = "logmanoriginal"; const MAINTAINER = "logmanoriginal";
const NAME = "Kununu Bridge"; const NAME = "Kununu Bridge";
const URI = "https://www.kununu.com/"; const URI = "https://www.kununu.com/";
@ -224,7 +224,7 @@ class KununuBridge extends HttpCachingBridgeAbstract {
*/ */
private function extract_full_description($uri){ private function extract_full_description($uri){
// Load full article // Load full article
$html = $this->get_cached($uri); $html = $this->getSimpleHTMLDOMCached($uri);
if($html === false) if($html === false)
$this->returnServerError('Could not load full description!'); $this->returnServerError('Could not load full description!');

View file

@ -17,7 +17,7 @@ class LeJournalDuGeekBridge extends FeedExpander {
} }
private function LeJournalDuGeekExtractContent($url) { private function LeJournalDuGeekExtractContent($url) {
$articleHTMLContent = $this->get_cached($url); $articleHTMLContent = $this->getSimpleHTMLDOMCached($url);
$text = $articleHTMLContent->find('div.post-content', 0)->innertext; $text = $articleHTMLContent->find('div.post-content', 0)->innertext;
foreach($articleHTMLContent->find('a.more') as $element) { foreach($articleHTMLContent->find('a.more') as $element) {

View file

@ -12,7 +12,7 @@ class LeMondeInformatiqueBridge extends FeedExpander {
protected function parseItem($newsItem){ protected function parseItem($newsItem){
$item = $this->parseRSS_1_0_Item($newsItem); $item = $this->parseRSS_1_0_Item($newsItem);
$article_html = $this->get_cached($item['uri']) $article_html = $this->getSimpleHTMLDOMCached($item['uri'])
or $this->returnServerError('Could not request LeMondeInformatique: ' . $item['uri']); or $this->returnServerError('Could not request LeMondeInformatique: ' . $item['uri']);
$item['content'] = $this->CleanArticle($article_html->find('div#article', 0)->innertext); $item['content'] = $this->CleanArticle($article_html->find('div#article', 0)->innertext);
$item['title'] = $article_html->find('h1.cleanprint-title', 0)->plaintext; $item['title'] = $article_html->find('h1.cleanprint-title', 0)->plaintext;

View file

@ -17,7 +17,7 @@ class LichessBridge extends FeedExpander {
} }
private function retrieve_lichess_post($blog_post_uri){ private function retrieve_lichess_post($blog_post_uri){
$blog_post_html = $this->get_cached($blog_post_uri); $blog_post_html = $this->getSimpleHTMLDOMCached($blog_post_uri);
$blog_post_div = $blog_post_html->find('#lichess_blog', 0); $blog_post_div = $blog_post_html->find('#lichess_blog', 0);
$post_chapo = $blog_post_div->find('.shortlede', 0)->innertext; $post_chapo = $blog_post_div->find('.shortlede', 0)->innertext;

View file

@ -17,7 +17,7 @@ class NextInpactBridge extends FeedExpander {
} }
private function ExtractContent($url) { private function ExtractContent($url) {
$html2 = $this->get_cached($url); $html2 = $this->getSimpleHTMLDOMCached($url);
$text = '<p><em>'.$html2->find('span.sub_title', 0)->innertext.'</em></p>' $text = '<p><em>'.$html2->find('span.sub_title', 0)->innertext.'</em></p>'
.'<p><img src="'.$html2->find('div.container_main_image_article', 0)->find('img.dedicated',0)->src.'" alt="-" /></p>' .'<p><img src="'.$html2->find('div.container_main_image_article', 0)->find('img.dedicated',0)->src.'" alt="-" /></p>'
.'<div>'.$html2->find('div[itemprop=articleBody]', 0)->innertext.'</div>'; .'<div>'.$html2->find('div[itemprop=articleBody]', 0)->innertext.'</div>';

View file

@ -56,7 +56,7 @@ class NextgovBridge extends FeedExpander {
} }
private function ExtractContent($url){ private function ExtractContent($url){
$article = $this->get_cached($url) $article = $this->getSimpleHTMLDOMCached($url)
or $this->returnServerError('Could not request Nextgov: ' . $url); or $this->returnServerError('Could not request Nextgov: ' . $url);
$contents = $article->find('div.wysiwyg', 0)->innertext; $contents = $article->find('div.wysiwyg', 0)->innertext;

View file

@ -17,7 +17,7 @@ class NiceMatinBridge extends FeedExpander {
} }
private function NiceMatinExtractContent($url) { private function NiceMatinExtractContent($url) {
$html = $this->get_cached($url); $html = $this->getSimpleHTMLDOMCached($url);
if(!$html) if(!$html)
return 'Could not acquire content from url: ' . $url . '!'; return 'Could not acquire content from url: ' . $url . '!';

View file

@ -17,7 +17,7 @@ class NumeramaBridge extends FeedExpander {
} }
private function ExtractContent($url){ private function ExtractContent($url){
$article_html = $this->get_cached($url) or $this->returnServerError('Could not request Numerama: '.$url); $article_html = $this->getSimpleHTMLDOMCached('Could not request Numerama: '.$url);
$contents = $article_html->find('section[class=related-article]', 0)->innertext = ''; // remove related articles block $contents = $article_html->find('section[class=related-article]', 0)->innertext = ''; // remove related articles block
$contents = '<img alt="" style="max-width:300px;" src="'.$article_html->find('meta[property=og:image]', 0)->getAttribute('content').'">'; // add post picture $contents = '<img alt="" style="max-width:300px;" src="'.$article_html->find('meta[property=og:image]', 0)->getAttribute('content').'">'; // add post picture
return $contents . $article_html->find('article[class=post-content]', 0)->innertext; // extract the post return $contents . $article_html->find('article[class=post-content]', 0)->innertext; // extract the post

View file

@ -13,7 +13,7 @@ class TheOatmealBridge extends FeedExpander{
protected function parseItem($newsItem) { protected function parseItem($newsItem) {
$item = $this->parseRSS_1_0_Item($newsItem); $item = $this->parseRSS_1_0_Item($newsItem);
$articlePage = $this->get_cached($item['uri']); $articlePage = $this->getSimpleHTMLDOMCached($item['uri']);
$content = $articlePage->find('#comic', 0); $content = $articlePage->find('#comic', 0);
if(is_null($content)) // load alternative if(is_null($content)) // load alternative
$content = $articlePage->find('#blog', 0); $content = $articlePage->find('#blog', 0);

View file

@ -3,7 +3,7 @@
define('WIKIPEDIA_SUBJECT_TFA', 0); // Today's featured article define('WIKIPEDIA_SUBJECT_TFA', 0); // Today's featured article
define('WIKIPEDIA_SUBJECT_DYK', 1); // Did you know... define('WIKIPEDIA_SUBJECT_DYK', 1); // Did you know...
class WikipediaBridge extends HttpCachingBridgeAbstract { class WikipediaBridge extends BridgeAbstract {
const MAINTAINER = 'logmanoriginal'; const MAINTAINER = 'logmanoriginal';
const NAME = 'Wikipedia bridge for many languages'; const NAME = 'Wikipedia bridge for many languages';
const URI = 'https://www.wikipedia.org/'; const URI = 'https://www.wikipedia.org/';
@ -175,7 +175,7 @@ class WikipediaBridge extends HttpCachingBridgeAbstract {
* Loads the full article from a given URI * Loads the full article from a given URI
*/ */
private function LoadFullArticle($uri){ private function LoadFullArticle($uri){
$content_html = $this->get_cached($uri); $content_html = $this->getSimpleHTMLDOMCached($uri);
if(!$content_html) if(!$content_html)
$this->returnServerError('Could not load site: ' . $uri . '!'); $this->returnServerError('Could not load site: ' . $uri . '!');

View file

@ -1,7 +1,7 @@
<?php <?php
define('WORDPRESS_TYPE_ATOM', 1); // Content is of type ATOM define('WORDPRESS_TYPE_ATOM', 1); // Content is of type ATOM
define('WORDPRESS_TYPE_RSS', 2); // Content is of type RSS define('WORDPRESS_TYPE_RSS', 2); // Content is of type RSS
class WordPressBridge extends HttpCachingBridgeAbstract { class WordPressBridge extends BridgeAbstract {
public $sitename; // Name of the site public $sitename; // Name of the site
@ -82,7 +82,7 @@ class WordPressBridge extends HttpCachingBridgeAbstract {
$item['timestamp'] = strtotime($article->find('updated', 0)->innertext); $item['timestamp'] = strtotime($article->find('updated', 0)->innertext);
} }
$article_html = $this->get_cached($item['uri']); $article_html = $this->getSimpleHTMLDOMCached($item['uri']);
// Attempt to find most common content div // Attempt to find most common content div
if(!isset($item['content'])){ if(!isset($item['content'])){

View file

@ -1,5 +1,5 @@
<?php <?php
class WorldOfTanksBridge extends HttpCachingBridgeAbstract{ class WorldOfTanksBridge extends BridgeAbstract {
const MAINTAINER = "mitsukarenai"; const MAINTAINER = "mitsukarenai";
const NAME = "World of Tanks"; const NAME = "World of Tanks";
@ -57,7 +57,7 @@ class WorldOfTanksBridge extends HttpCachingBridgeAbstract{
$item['uri'] = self::URI.$infoLink->href; $item['uri'] = self::URI.$infoLink->href;
// now load that uri from cache // now load that uri from cache
$this->debugMessage("loading page ".$item['uri']); $this->debugMessage("loading page ".$item['uri']);
$articlePage = $this->get_cached($item['uri']); $articlePage = $this->getSimpleHTMLDOMCached($item['uri']);
$content = $articlePage->find('.l-content', 0); $content = $articlePage->find('.l-content', 0);
HTMLSanitizer::defaultImageSrcTo($content, self::URI); HTMLSanitizer::defaultImageSrcTo($content, self::URI);
$item['title'] = $content->find('h1', 0)->innertext; $item['title'] = $content->find('h1', 0)->innertext;

View file

@ -386,4 +386,60 @@ abstract class BridgeAbstract implements BridgeInterface {
, $defaultBRText , $defaultBRText
, $defaultSpanText); , $defaultSpanText);
} }
/**
 * Returns a parsed DOM for a page, keeping a local copy of the raw page on
 * disk so that repeated requests for the same URL do not download it again.
 *
 * The cache file is stored in '../cache/pages/' relative to this file and is
 * named after the SHA-1 hash of the URL.
 *
 * @param string $url URL of the page to load and cache
 * @param int $duration Lifetime of the cache file in seconds (default: 24h/86400s)
 * @param bool $use_include_path Passed through to getContents()
 * @param resource|null $context Passed through to getContents()
 * @param int $offset Passed through to getContents()
 * @param int|null $maxLen Passed through to getContents()
 * @param bool $lowercase Passed through to str_get_html()
 * @param bool $forceTagsClosed Passed through to str_get_html()
 * @param string $target_charset Passed through to str_get_html()
 * @param bool $stripRN Passed through to str_get_html()
 * @param string $defaultBRText Passed through to str_get_html()
 * @param string $defaultSpanText Passed through to str_get_html()
 * @return mixed Whatever str_get_html() returns for the page content, or
 * false if the page could not be downloaded
 */
public function getSimpleHTMLDOMCached($url
	, $duration = 86400
	, $use_include_path = false
	, $context = null
	, $offset = 0
	, $maxLen = null
	, $lowercase = true
	, $forceTagsClosed = true
	, $target_charset = DEFAULT_TARGET_CHARSET
	, $stripRN = true
	, $defaultBRText = DEFAULT_BR_TEXT
	, $defaultSpanText = DEFAULT_SPAN_TEXT){
	$this->debugMessage('Caching url ' . $url . ', duration ' . $duration);

	$filepath = __DIR__ . '/../cache/pages/' . sha1($url) . '.cache';
	$this->debugMessage('Cache file ' . $filepath);

	// Drop the cache entry once it is older than $duration seconds.
	if(file_exists($filepath) && filectime($filepath) < time() - $duration){
		unlink($filepath);
		$this->debugMessage('Cached file deleted: ' . $filepath);
	}

	if(file_exists($filepath)){
		$this->debugMessage('Loading cached file ' . $filepath);
		// NOTE(review): expiry is checked with filectime() while touch() sets
		// the modification/access time; whether reading an entry extends its
		// lifetime therefore depends on the platform - confirm the intent.
		touch($filepath);
		$content = file_get_contents($filepath);
	} else {
		$this->debugMessage('Caching ' . $url . ' to ' . $filepath);

		$dir = dirname($filepath);
		if(!is_dir($dir)){
			$this->debugMessage('Creating directory ' . $dir);
			mkdir($dir, 0777, true);
		}

		$content = $this->getContents($url, $use_include_path, $context, $offset, $maxLen);

		// Download failed: report it to the caller and leave the cache untouched.
		if($content === false){
			return false;
		}

		// Do not persist an empty response: it would be served from the cache
		// on later calls instead of retrying the download.
		if($content !== ''){
			file_put_contents($filepath, $content);
		}
	}

	return str_get_html($content
		, $lowercase
		, $forceTagsClosed
		, $target_charset
		, $stripRN
		, $defaultBRText
		, $defaultSpanText);
}
} }

View file

@ -1,6 +1,6 @@
<?php <?php
require_once(__DIR__ . '/BridgeInterface.php'); require_once(__DIR__ . '/BridgeInterface.php');
abstract class FeedExpander extends HttpCachingBridgeAbstract { abstract class FeedExpander extends BridgeAbstract {
private $name; private $name;
private $uri; private $uri;

View file

@ -1,45 +0,0 @@
<?php
require_once(__DIR__ . '/BridgeInterface.php');
/**
 * Extension of BridgeAbstract allowing caching of files downloaded over http.
 */
abstract class HttpCachingBridgeAbstract extends BridgeAbstract {

	/**
	 * Returns a parsed DOM for a page, keeping a local copy of the raw page on
	 * disk so that repeated requests for the same URL do not download it again.
	 *
	 * The cache file is stored in '../cache/pages/' relative to this file and
	 * is named after the SHA-1 hash of the URL.
	 *
	 * @param string $url URL of the page to load and cache
	 * @param int $duration Lifetime of the cache file in seconds (default: 24h/86400s)
	 * @return mixed Whatever str_get_html() returns for the page content, or
	 * false if the page could not be downloaded
	 */
	public function get_cached($url, $duration = 86400){
		$this->debugMessage('Caching url ' . $url . ', duration ' . $duration);

		$filepath = __DIR__ . '/../cache/pages/' . sha1($url) . '.cache';
		$this->debugMessage('Cache file ' . $filepath);

		// Drop the cache entry once it is older than $duration seconds.
		if(file_exists($filepath) && filectime($filepath) < time() - $duration){
			unlink($filepath);
			$this->debugMessage('Cached file deleted: ' . $filepath);
		}

		if(file_exists($filepath)){
			$this->debugMessage('Loading cached file ' . $filepath);
			// NOTE(review): expiry is checked with filectime() while touch()
			// sets the modification/access time; whether reading an entry
			// extends its lifetime depends on the platform - confirm the intent.
			touch($filepath);
			$content = file_get_contents($filepath);
		} else {
			$this->debugMessage('Caching ' . $url . ' to ' . $filepath);

			$dir = dirname($filepath);
			if(!is_dir($dir)){
				$this->debugMessage('Creating directory ' . $dir);
				mkdir($dir, 0777, true);
			}

			$content = $this->getContents($url);

			// Download failed: report it to the caller and leave the cache untouched.
			if($content === false){
				return false;
			}

			// Do not persist an empty response: it would be served from the
			// cache on later calls instead of retrying the download.
			if($content !== ''){
				file_put_contents($filepath, $content);
			}
		}

		return str_get_html($content);
	}
}

View file

@ -12,7 +12,6 @@ require __DIR__ . '/Format.php';
require __DIR__ . '/FormatAbstract.php'; require __DIR__ . '/FormatAbstract.php';
require __DIR__ . '/Bridge.php'; require __DIR__ . '/Bridge.php';
require __DIR__ . '/BridgeAbstract.php'; require __DIR__ . '/BridgeAbstract.php';
require __DIR__ . '/HttpCachingBridgeAbstract.php';
require __DIR__ . '/FeedExpander.php'; require __DIR__ . '/FeedExpander.php';
require __DIR__ . '/Cache.php'; require __DIR__ . '/Cache.php';
require __DIR__ . '/CacheAbstract.php'; require __DIR__ . '/CacheAbstract.php';