From 9f2dd4868469a4362cd0ec8ef1aeae0ff9bf2b04 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 10 Sep 2016 19:04:01 +0200 Subject: [PATCH 1/3] [BridgeAbstract] Add getSimpleHTMLDOMCached This function is a copy of the get_cached function from HttpCachingBridgeAbstract, adding all parameters of getSimpleHTMLDOM in order to replace the need of HttpCachingBridgeAbstract entirely --- lib/BridgeAbstract.php | 56 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/lib/BridgeAbstract.php b/lib/BridgeAbstract.php index f58e056..abcda79 100644 --- a/lib/BridgeAbstract.php +++ b/lib/BridgeAbstract.php @@ -386,4 +386,60 @@ abstract class BridgeAbstract implements BridgeInterface { , $defaultBRText , $defaultSpanText); } + + /** + * Maintain locally cached versions of pages to avoid multiple downloads. Parameters after $duration are passed through to getContents() and str_get_html(). + * @param url url to load and cache (the cache file name is the sha1 of this url) + * @param duration cached files whose filectime() is older than this many seconds are deleted and downloaded again (default: 24h/86400s) + * @return result of str_get_html() on the page content (the parsed DOM the bridges call find() on, not a string; check it against false before use) + */ + public function getSimpleHTMLDOMCached($url + , $duration = 86400 + , $use_include_path = false + , $context = null + , $offset = 0 + , $maxLen = null + , $lowercase = true + , $forceTagsClosed = true + , $target_charset = DEFAULT_TARGET_CHARSET + , $stripRN = true + , $defaultBRText = DEFAULT_BR_TEXT + , $defaultSpanText = DEFAULT_SPAN_TEXT){ + $this->debugMessage('Caching url ' . $url . ', duration ' . $duration); + + $filepath = __DIR__ . '/../cache/pages/' . sha1($url) . '.cache'; + $this->debugMessage('Cache file ' . $filepath); + + if(file_exists($filepath) && filectime($filepath) < time() - $duration){ + unlink ($filepath); + $this->debugMessage('Cached file deleted: ' . $filepath); + } + + if(file_exists($filepath)){ + $this->debugMessage('Loading cached file ' . $filepath); + touch($filepath); + $content = file_get_contents($filepath); + } else { + $this->debugMessage('Caching ' . $url . ' to ' . 
$filepath); + $dir = dirname($filepath); + + if(!is_dir($dir)){ + $this->debugMessage('Creating directory ' . $dir); + mkdir($dir, 0777, true); + } + + $content = $this->getContents($url, $use_include_path, $context, $offset, $maxLen); + if($content !== false){ + file_put_contents($filepath, $content); + } + } + + return str_get_html($content + , $lowercase + , $forceTagsClosed + , $target_charset + , $stripRN + , $defaultBRText + , $defaultSpanText); + } } From 2eec89ab2718065bdb78560171fb55e280fb9806 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 10 Sep 2016 19:11:09 +0200 Subject: [PATCH 2/3] [bridges] Change all bridges to use BridgeAbstract with getSimpleHTMLDOMCached --- bridges/CADBridge.php | 2 +- bridges/CommonDreamsBridge.php | 2 +- bridges/CpasbienBridge.php | 4 ++-- bridges/DauphineLibereBridge.php | 2 +- bridges/DeveloppezDotComBridge.php | 2 +- bridges/FreenewsBridge.php | 2 +- bridges/FuturaSciencesBridge.php | 2 +- bridges/JapanExpoBridge.php | 4 ++-- bridges/KununuBridge.php | 4 ++-- bridges/LeJournalDuGeekBridge.php | 2 +- bridges/LeMondeInformatiqueBridge.php | 2 +- bridges/LichessBridge.php | 2 +- bridges/NextInpactBridge.php | 2 +- bridges/NextgovBridge.php | 2 +- bridges/NiceMatinBridge.php | 2 +- bridges/NumeramaBridge.php | 2 +- bridges/TheOatMealBridge.php | 2 +- bridges/WikipediaBridge.php | 4 ++-- bridges/WordPressBridge.php | 4 ++-- bridges/WorldOfTanksBridge.php | 4 ++-- lib/FeedExpander.php | 2 +- 21 files changed, 27 insertions(+), 27 deletions(-) diff --git a/bridges/CADBridge.php b/bridges/CADBridge.php index 86dfdb0..595160e 100644 --- a/bridges/CADBridge.php +++ b/bridges/CADBridge.php @@ -16,7 +16,7 @@ class CADBridge extends FeedExpander { } private function CADExtractContent($url) { - $html3 = $this->get_cached($url); + $html3 = $this->getSimpleHTMLDOMCached($url); // The request might fail due to missing https support or wrong URL if($html3 == false) diff --git 
a/bridges/CommonDreamsBridge.php b/bridges/CommonDreamsBridge.php index e8a4af3..224b309 100644 --- a/bridges/CommonDreamsBridge.php +++ b/bridges/CommonDreamsBridge.php @@ -17,7 +17,7 @@ class CommonDreamsBridge extends FeedExpander { } private function CommonDreamsExtractContent($url) { - $html3 = $this->get_cached($url); + $html3 = $this->getSimpleHTMLDOMCached($url); $text = $html3->find('div[class=field--type-text-with-summary]', 0)->innertext; $html3->clear(); unset ($html3); diff --git a/bridges/CpasbienBridge.php b/bridges/CpasbienBridge.php index 829c596..10af594 100644 --- a/bridges/CpasbienBridge.php +++ b/bridges/CpasbienBridge.php @@ -1,5 +1,5 @@ getAttribute('class')=='ligne0' || $episode->getAttribute('class')=='ligne1') { - $htmlepisode=$this->get_cached($episode->find('a', 0)->getAttribute('href')); + $htmlepisode=$this->getSimpleHTMLDOMCached($episode->find('a', 0)->getAttribute('href')); $item = array(); $item['author'] = $episode->find('a', 0)->text(); diff --git a/bridges/DauphineLibereBridge.php b/bridges/DauphineLibereBridge.php index fe4775c..9e9aacd 100644 --- a/bridges/DauphineLibereBridge.php +++ b/bridges/DauphineLibereBridge.php @@ -47,7 +47,7 @@ class DauphineLibereBridge extends FeedExpander { } private function ExtractContent($url) { - $html2 = $this->get_cached($url); + $html2 = $this->getSimpleHTMLDOMCached($url); $text = $html2->find('div.column', 0)->innertext; $text = preg_replace('@]*?>.*?@si', '', $text); return $text; diff --git a/bridges/DeveloppezDotComBridge.php b/bridges/DeveloppezDotComBridge.php index fe08d28..5cbd576 100644 --- a/bridges/DeveloppezDotComBridge.php +++ b/bridges/DeveloppezDotComBridge.php @@ -42,7 +42,7 @@ class DeveloppezDotComBridge extends FeedExpander { } private function DeveloppezDotComExtractContent($url) { - $articleHTMLContent = $this->get_cached($url); + $articleHTMLContent = $this->getSimpleHTMLDOMCached($url); $text = $this->convert_smart_quotes($articleHTMLContent->find('div.content', 
0)->innertext); $text = utf8_encode($text); return trim($text); diff --git a/bridges/FreenewsBridge.php b/bridges/FreenewsBridge.php index dbc46b9..1934e0b 100644 --- a/bridges/FreenewsBridge.php +++ b/bridges/FreenewsBridge.php @@ -13,7 +13,7 @@ class FreenewsBridge extends FeedExpander { protected function parseItem($newsItem) { $item = $this->parseRSS_2_0_Item($newsItem); - $articlePage = $this->get_cached($item['uri']); + $articlePage = $this->getSimpleHTMLDOMCached($item['uri']); $content = $articlePage->find('.post-container', 0); $item['content'] = $content->innertext; diff --git a/bridges/FuturaSciencesBridge.php b/bridges/FuturaSciencesBridge.php index 2cf846c..73f1b53 100644 --- a/bridges/FuturaSciencesBridge.php +++ b/bridges/FuturaSciencesBridge.php @@ -86,7 +86,7 @@ class FuturaSciencesBridge extends FeedExpander { protected function parseItem($newsItem){ $item = $this->parseRSS_2_0_Item($newsItem); $item['uri'] = str_replace('#xtor=RSS-8', '', $item['uri']); - $article = $this->get_cached($item['uri']) + $article = $this->getSimpleHTMLDOMCached($item['uri']) or $this->returnServerError('Could not request Futura-Sciences: ' . $item['uri']); $item['content'] = $this->ExtractArticleContent($article); $item['author'] = empty($this->ExtractAuthor($article)) ? 
$item['author'] : $this->ExtractAuthor($article); diff --git a/bridges/JapanExpoBridge.php b/bridges/JapanExpoBridge.php index 4019ae8..dcd951a 100644 --- a/bridges/JapanExpoBridge.php +++ b/bridges/JapanExpoBridge.php @@ -1,5 +1,5 @@ get_cached($url) or $this->returnServerError('Could not request JapanExpo: '.$url); + $article_html = $this->getSimpleHTMLDOMCached($url) or $this->returnServerError('Could not request JapanExpo: '.$url); $header = $article_html->find('header.pageHeadBox', 0); $timestamp = strtotime($header->find('time', 0)->datetime); $title_html = $header->find('div.section', 0)->next_sibling(); diff --git a/bridges/KununuBridge.php b/bridges/KununuBridge.php index da159ce..a958b77 100644 --- a/bridges/KununuBridge.php +++ b/bridges/KununuBridge.php @@ -1,5 +1,5 @@ get_cached($uri); + $html = $this->getSimpleHTMLDOMCached($uri); if($html === false) $this->returnServerError('Could not load full description!'); diff --git a/bridges/LeJournalDuGeekBridge.php b/bridges/LeJournalDuGeekBridge.php index c723a2f..95bd960 100644 --- a/bridges/LeJournalDuGeekBridge.php +++ b/bridges/LeJournalDuGeekBridge.php @@ -17,7 +17,7 @@ class LeJournalDuGeekBridge extends FeedExpander { } private function LeJournalDuGeekExtractContent($url) { - $articleHTMLContent = $this->get_cached($url); + $articleHTMLContent = $this->getSimpleHTMLDOMCached($url); $text = $articleHTMLContent->find('div.post-content', 0)->innertext; foreach($articleHTMLContent->find('a.more') as $element) { diff --git a/bridges/LeMondeInformatiqueBridge.php b/bridges/LeMondeInformatiqueBridge.php index 010228a..f609517 100644 --- a/bridges/LeMondeInformatiqueBridge.php +++ b/bridges/LeMondeInformatiqueBridge.php @@ -12,7 +12,7 @@ class LeMondeInformatiqueBridge extends FeedExpander { protected function parseItem($newsItem){ $item = $this->parseRSS_1_0_Item($newsItem); - $article_html = $this->get_cached($item['uri']) + $article_html = $this->getSimpleHTMLDOMCached($item['uri']) or $this->returnServerError('Could not request 
LeMondeInformatique: ' . $item['uri']); $item['content'] = $this->CleanArticle($article_html->find('div#article', 0)->innertext); $item['title'] = $article_html->find('h1.cleanprint-title', 0)->plaintext; diff --git a/bridges/LichessBridge.php b/bridges/LichessBridge.php index 638811d..6f64539 100644 --- a/bridges/LichessBridge.php +++ b/bridges/LichessBridge.php @@ -17,7 +17,7 @@ class LichessBridge extends FeedExpander { } private function retrieve_lichess_post($blog_post_uri){ - $blog_post_html = $this->get_cached($blog_post_uri); + $blog_post_html = $this->getSimpleHTMLDOMCached($blog_post_uri); $blog_post_div = $blog_post_html->find('#lichess_blog', 0); $post_chapo = $blog_post_div->find('.shortlede', 0)->innertext; diff --git a/bridges/NextInpactBridge.php b/bridges/NextInpactBridge.php index 815a236..3152b09 100644 --- a/bridges/NextInpactBridge.php +++ b/bridges/NextInpactBridge.php @@ -17,7 +17,7 @@ class NextInpactBridge extends FeedExpander { } private function ExtractContent($url) { - $html2 = $this->get_cached($url); + $html2 = $this->getSimpleHTMLDOMCached($url); $text = '

'.$html2->find('span.sub_title', 0)->innertext.'

' .'

-

' .'
'.$html2->find('div[itemprop=articleBody]', 0)->innertext.'
'; diff --git a/bridges/NextgovBridge.php b/bridges/NextgovBridge.php index 5d26ec5..d706119 100644 --- a/bridges/NextgovBridge.php +++ b/bridges/NextgovBridge.php @@ -56,7 +56,7 @@ class NextgovBridge extends FeedExpander { } private function ExtractContent($url){ - $article = $this->get_cached($url) + $article = $this->getSimpleHTMLDOMCached($url) or $this->returnServerError('Could not request Nextgov: ' . $url); $contents = $article->find('div.wysiwyg', 0)->innertext; diff --git a/bridges/NiceMatinBridge.php b/bridges/NiceMatinBridge.php index 6d148ad..4e83cff 100644 --- a/bridges/NiceMatinBridge.php +++ b/bridges/NiceMatinBridge.php @@ -17,7 +17,7 @@ class NiceMatinBridge extends FeedExpander { } private function NiceMatinExtractContent($url) { - $html = $this->get_cached($url); + $html = $this->getSimpleHTMLDOMCached($url); if(!$html) return 'Could not acquire content from url: ' . $url . '!'; diff --git a/bridges/NumeramaBridge.php b/bridges/NumeramaBridge.php index ead340a..d018fbd 100644 --- a/bridges/NumeramaBridge.php +++ b/bridges/NumeramaBridge.php @@ -17,7 +17,7 @@ class NumeramaBridge extends FeedExpander { } private function ExtractContent($url){ - $article_html = $this->get_cached($url) or $this->returnServerError('Could not request Numerama: '.$url); + $article_html = $this->getSimpleHTMLDOMCached($url) or $this->returnServerError('Could not request Numerama: '.$url); $contents = $article_html->find('section[class=related-article]', 0)->innertext = ''; // remove related articles block $contents = ''; // add post picture return $contents . 
$article_html->find('article[class=post-content]', 0)->innertext; // extract the post diff --git a/bridges/TheOatMealBridge.php b/bridges/TheOatMealBridge.php index eee9283..3c3d216 100644 --- a/bridges/TheOatMealBridge.php +++ b/bridges/TheOatMealBridge.php @@ -13,7 +13,7 @@ class TheOatmealBridge extends FeedExpander{ protected function parseItem($newsItem) { $item = $this->parseRSS_1_0_Item($newsItem); - $articlePage = $this->get_cached($item['uri']); + $articlePage = $this->getSimpleHTMLDOMCached($item['uri']); $content = $articlePage->find('#comic', 0); if(is_null($content)) // load alternative $content = $articlePage->find('#blog', 0); diff --git a/bridges/WikipediaBridge.php b/bridges/WikipediaBridge.php index d7a90dc..5feb429 100644 --- a/bridges/WikipediaBridge.php +++ b/bridges/WikipediaBridge.php @@ -3,7 +3,7 @@ define('WIKIPEDIA_SUBJECT_TFA', 0); // Today's featured article define('WIKIPEDIA_SUBJECT_DYK', 1); // Did you know... -class WikipediaBridge extends HttpCachingBridgeAbstract { +class WikipediaBridge extends BridgeAbstract { const MAINTAINER = 'logmanoriginal'; const NAME = 'Wikipedia bridge for many languages'; const URI = 'https://www.wikipedia.org/'; @@ -175,7 +175,7 @@ class WikipediaBridge extends HttpCachingBridgeAbstract { * Loads the full article from a given URI */ private function LoadFullArticle($uri){ - $content_html = $this->get_cached($uri); + $content_html = $this->getSimpleHTMLDOMCached($uri); if(!$content_html) $this->returnServerError('Could not load site: ' . $uri . 
'!'); diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php index 30e7e2a..6c7d5f6 100644 --- a/bridges/WordPressBridge.php +++ b/bridges/WordPressBridge.php @@ -1,7 +1,7 @@ find('updated', 0)->innertext); } - $article_html = $this->get_cached($item['uri']); + $article_html = $this->getSimpleHTMLDOMCached($item['uri']); // Attempt to find most common content div if(!isset($item['content'])){ diff --git a/bridges/WorldOfTanksBridge.php b/bridges/WorldOfTanksBridge.php index b723526..a3179be 100644 --- a/bridges/WorldOfTanksBridge.php +++ b/bridges/WorldOfTanksBridge.php @@ -1,5 +1,5 @@ href; // now load that uri from cache $this->debugMessage("loading page ".$item['uri']); - $articlePage = $this->get_cached($item['uri']); + $articlePage = $this->getSimpleHTMLDOMCached($item['uri']); $content = $articlePage->find('.l-content', 0); HTMLSanitizer::defaultImageSrcTo($content, self::URI); $item['title'] = $content->find('h1', 0)->innertext; diff --git a/lib/FeedExpander.php b/lib/FeedExpander.php index abaf121..5566f7c 100644 --- a/lib/FeedExpander.php +++ b/lib/FeedExpander.php @@ -1,6 +1,6 @@ Date: Sat, 10 Sep 2016 19:13:01 +0200 Subject: [PATCH 3/3] [core] Remove HttpCachingBridgeAbstract BridgeAbstract implements all functions to cover the implementation --- lib/HttpCachingBridgeAbstract.php | 45 ------------------------------- lib/RssBridge.php | 1 - 2 files changed, 46 deletions(-) delete mode 100644 lib/HttpCachingBridgeAbstract.php diff --git a/lib/HttpCachingBridgeAbstract.php b/lib/HttpCachingBridgeAbstract.php deleted file mode 100644 index 364606e..0000000 --- a/lib/HttpCachingBridgeAbstract.php +++ /dev/null @@ -1,45 +0,0 @@ -debugMessage('Caching url ' . $url . ', duration ' . $duration); - - $filepath = __DIR__ . '/../cache/pages/' . sha1($url) . '.cache'; - $this->debugMessage('Cache file ' . 
$filepath); - - if(file_exists($filepath) && filectime($filepath) < time() - $duration){ - unlink ($filepath); - $this->debugMessage('Cached file deleted: ' . $filepath); - } - - if(file_exists($filepath)){ - $this->debugMessage('Loading cached file ' . $filepath); - touch($filepath); - $content = file_get_contents($filepath); - } else { - $this->debugMessage('Caching ' . $url . ' to ' . $filepath); - $dir = substr($filepath, 0, strrpos($filepath, '/')); - - if(!is_dir($dir)){ - $this->debugMessage('Creating directory ' . $dir); - mkdir($dir, 0777, true); - } - - $content = $this->getContents($url); - if($content !== false){ - file_put_contents($filepath, $content); - } - } - - return str_get_html($content); - } -} diff --git a/lib/RssBridge.php b/lib/RssBridge.php index 6dd2663..0728683 100644 --- a/lib/RssBridge.php +++ b/lib/RssBridge.php @@ -12,7 +12,6 @@ require __DIR__ . '/Format.php'; require __DIR__ . '/FormatAbstract.php'; require __DIR__ . '/Bridge.php'; require __DIR__ . '/BridgeAbstract.php'; -require __DIR__ . '/HttpCachingBridgeAbstract.php'; require __DIR__ . '/FeedExpander.php'; require __DIR__ . '/Cache.php'; require __DIR__ . '/CacheAbstract.php';