From 5f3d28f3a6af421bd020457f1b7a948ea72f9b58 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sun, 28 Aug 2016 19:37:32 +0200 Subject: [PATCH 1/5] [Bridge] Return HTML DOM with get_cached --- lib/Bridge.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Bridge.php b/lib/Bridge.php index 55c85c0..a1e48e9 100644 --- a/lib/Bridge.php +++ b/lib/Bridge.php @@ -419,7 +419,7 @@ abstract class HttpCachingBridgeAbstract extends BridgeAbstract { } } - return $content; + return str_get_html($content); } public function get_cached_time($url){ From a7b3519c3536c6f02df9cfabeab724b061e736b9 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sun, 28 Aug 2016 19:38:34 +0200 Subject: [PATCH 2/5] [bridges] Fix all calls to get_cached --- bridges/CpasbienBridge.php | 2 +- bridges/FreenewsBridge.php | 2 +- bridges/GawkerBridge.php | 2 +- bridges/Les400CulsBridge.php | 2 +- bridges/TheOatMealBridge.php | 2 +- bridges/WorldOfTanksBridge.php | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bridges/CpasbienBridge.php b/bridges/CpasbienBridge.php index d41d442..862e58b 100644 --- a/bridges/CpasbienBridge.php +++ b/bridges/CpasbienBridge.php @@ -33,7 +33,7 @@ class CpasbienBridge extends HttpCachingBridgeAbstract{ if ($episode->getAttribute('class')=='ligne0' || $episode->getAttribute('class')=='ligne1') { - $htmlepisode=str_get_html($this->get_cached($episode->find('a', 0)->getAttribute('href'))); + $htmlepisode=$this->get_cached($episode->find('a', 0)->getAttribute('href')); $item = array(); $item['author'] = $episode->find('a', 0)->text(); diff --git a/bridges/FreenewsBridge.php b/bridges/FreenewsBridge.php index 727f9f5..0df3c6c 100644 --- a/bridges/FreenewsBridge.php +++ b/bridges/FreenewsBridge.php @@ -25,7 +25,7 @@ class FreenewsBridge extends RssExpander { } // now load that uri from cache $this->debugMessage("now loading page ".$item['uri']); - $articlePage = str_get_html($this->get_cached($item['uri'])); + $articlePage = 
$this->get_cached($item['uri']); $content = $articlePage->find('.post-container', 0); $item['content'] = $content->innertext; diff --git a/bridges/GawkerBridge.php b/bridges/GawkerBridge.php index 6012220..ac52ae0 100644 --- a/bridges/GawkerBridge.php +++ b/bridges/GawkerBridge.php @@ -45,7 +45,7 @@ class GawkerBridge extends RssExpander{ try { // now load that uri from cache $this->debugMessage("loading page ".$item['uri']); - $articlePage = str_get_html($this->get_cached($item['uri'])); + $articlePage = $this->get_cached($item['uri']); if(is_object($articlePage)) { $content = $articlePage->find('.post-content', 0); HTMLSanitizer::defaultImageSrcTo($content, $this->getURI()); diff --git a/bridges/Les400CulsBridge.php b/bridges/Les400CulsBridge.php index 2dd9883..2925a29 100644 --- a/bridges/Les400CulsBridge.php +++ b/bridges/Les400CulsBridge.php @@ -29,7 +29,7 @@ class Les400CulsBridge extends RssExpander{ } // now load that uri from cache $this->debugMessage("now loading page ".$item['uri']); -// $articlePage = str_get_html($this->get_cached($item['uri'])); +// $articlePage = $this->get_cached($item['uri']); // $content = $articlePage->find('.post-container', 0); $item['content'] = (string) $newsItem->description; diff --git a/bridges/TheOatMealBridge.php b/bridges/TheOatMealBridge.php index c6ab0cd..a152157 100644 --- a/bridges/TheOatMealBridge.php +++ b/bridges/TheOatMealBridge.php @@ -43,7 +43,7 @@ class TheOatmealBridge extends RssExpander{ $item['uri']=(string) $newsItem->attributes($namespaces['rdf'])->about; // now load that uri from cache $this->debugMessage("now loading page ".$item['uri']); - $articlePage = str_get_html($this->get_cached($item['uri'])); + $articlePage = $this->get_cached($item['uri']); $content = $articlePage->find('#comic', 0); if($content==null) { diff --git a/bridges/WorldOfTanksBridge.php b/bridges/WorldOfTanksBridge.php index 98f27bf..e327685 100644 --- a/bridges/WorldOfTanksBridge.php +++ b/bridges/WorldOfTanksBridge.php @@ -58,7 
+58,7 @@ class WorldOfTanksBridge extends HttpCachingBridgeAbstract{ $item['uri'] = $this->uri.$infoLink->href; // now load that uri from cache $this->debugMessage("loading page ".$item['uri']); - $articlePage = str_get_html($this->get_cached($item['uri'])); + $articlePage = $this->get_cached($item['uri']); $content = $articlePage->find('.l-content', 0); HTMLSanitizer::defaultImageSrcTo($content, $this->uri); $item['title'] = $content->find('h1', 0)->innertext; From 7363acfa6b39d14ca2b2a5d446de0165187a1ed8 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sun, 28 Aug 2016 19:39:23 +0200 Subject: [PATCH 3/5] [Wikipedia] Use cache for full articles --- bridges/WikipediaBridge.php | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bridges/WikipediaBridge.php b/bridges/WikipediaBridge.php index 7a28403..ea75441 100644 --- a/bridges/WikipediaBridge.php +++ b/bridges/WikipediaBridge.php @@ -3,7 +3,7 @@ define('WIKIPEDIA_SUBJECT_TFA', 0); // Today's featured article define('WIKIPEDIA_SUBJECT_DYK', 1); // Did you know... -class WikipediaBridge extends BridgeAbstract{ +class WikipediaBridge extends HttpCachingBridgeAbstract { public function loadMetadatas(){ $this->maintainer = 'logmanoriginal'; $this->name = 'Wikipedia bridge for many languages'; @@ -188,7 +188,10 @@ class WikipediaBridge extends BridgeAbstract{ * Loads the full article from a given URI */ private function LoadFullArticle($uri){ - $content_html = $this->getSimpleHTMLDOM($uri); + if($this->get_cached_time($uri) <= strtotime('-24 hours')) + $this->remove_from_cache($uri); + + $content_html = $this->get_cached($uri); if(!$content_html) $this->returnServerError('Could not load site: ' . $uri . 
'!'); From e4b314f78a1878a1ff33f4c3499e4a2c25d9b9f4 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sun, 28 Aug 2016 19:47:40 +0200 Subject: [PATCH 4/5] [Bridge] Enable cache file deletion --- lib/Bridge.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Bridge.php b/lib/Bridge.php index a1e48e9..7f4adee 100644 --- a/lib/Bridge.php +++ b/lib/Bridge.php @@ -465,8 +465,8 @@ abstract class HttpCachingBridgeAbstract extends BridgeAbstract { // TODO build this from the variable given to Cache $cacheDir = __DIR__ . '/../cache/pages/'; $filepath = $this->buildCacheFilePath($url, $cacheDir); - $this->debugMessage('removing from cache \'' . $filepath . '\' WELL, NOT REALLY'); - // unlink($filepath); + $this->debugMessage('removing from cache \'' . $filepath . '\''); + unlink($filepath); } } From 78f67576227a6b24a2498d2b04896346245206c8 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sun, 28 Aug 2016 20:07:56 +0200 Subject: [PATCH 5/5] [bridges] Use caching with applicable bridges --- bridges/JapanExpoBridge.php | 7 +++++-- bridges/KununuBridge.php | 7 +++++-- bridges/LichessBridge.php | 7 +++++-- bridges/NumeramaBridge.php | 7 +++++-- bridges/WordPressBridge.php | 7 +++++-- 5 files changed, 25 insertions(+), 10 deletions(-) diff --git a/bridges/JapanExpoBridge.php b/bridges/JapanExpoBridge.php index 7ac1f8a..ea20592 100644 --- a/bridges/JapanExpoBridge.php +++ b/bridges/JapanExpoBridge.php @@ -1,5 +1,5 @@ maintainer = 'Ginko'; @@ -64,7 +64,10 @@ class JapanExpoBridge extends BridgeAbstract{ if ($fullcontent) { if ($count < 5) { - $article_html = $this->getSimpleHTMLDOM($url) or $this->returnServerError('Could not request JapanExpo: '.$url); + if($this->get_cached_time($url) <= strtotime('-24 hours')) + $this->remove_from_cache($url); + + $article_html = $this->get_cached($url) or $this->returnServerError('Could not request JapanExpo: '.$url); $header = $article_html->find('header.pageHeadBox', 0); $timestamp = 
strtotime($header->find('time', 0)->datetime); $title_html = $header->find('div.section', 0)->next_sibling(); diff --git a/bridges/KununuBridge.php b/bridges/KununuBridge.php index e81917d..c4c2fa0 100644 --- a/bridges/KununuBridge.php +++ b/bridges/KununuBridge.php @@ -1,5 +1,5 @@ maintainer = "logmanoriginal"; $this->name = "Kununu Bridge"; /* This will be replaced later! */ @@ -248,7 +248,10 @@ class KununuBridge extends BridgeAbstract{ */ private function extract_full_description($uri){ // Load full article - $html = $this->getSimpleHTMLDOM($uri); + if($this->get_cached_time($uri) <= strtotime('-24 hours')) + $this->remove_from_cache($uri); + + $html = $this->get_cached($uri); if($html === false) $this->returnServerError('Could not load full description!'); diff --git a/bridges/LichessBridge.php b/bridges/LichessBridge.php index 839b183..1a34005 100644 --- a/bridges/LichessBridge.php +++ b/bridges/LichessBridge.php @@ -1,6 +1,6 @@ getSimpleHTMLDOM($blog_post_uri); + if($this->get_cached_time($blog_post_uri) <= strtotime('-24 hours')) + $this->remove_from_cache($blog_post_uri); + + $blog_post_html = $this->get_cached($blog_post_uri); $blog_post_div = $blog_post_html->find('#lichess_blog', 0); $post_chapo = $blog_post_div->find('.shortlede', 0)->innertext; diff --git a/bridges/NumeramaBridge.php b/bridges/NumeramaBridge.php index d9ae083..132c1e6 100644 --- a/bridges/NumeramaBridge.php +++ b/bridges/NumeramaBridge.php @@ -1,5 +1,5 @@ find('pubDate', 0)->plaintext); $article_url = NumeramaStripCDATA($element->find('guid', 0)->plaintext); - $article_html = $this->getSimpleHTMLDOM($article_url) or $this->returnServerError('Could not request Numerama: '.$article_url); + if($this->get_cached_time($article_url) <= strtotime('-24 hours')) + $this->remove_from_cache($article_url); + + $article_html = $this->get_cached($article_url) or $this->returnServerError('Could not request Numerama: '.$article_url); $contents = 
$article_html->find('section[class=related-article]', 0)->innertext = ''; // remove related articles block $contents = ''; // add post picture $contents = $contents.$article_html->find('article[class=post-content]', 0)->innertext; // extract the post diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php index 370b772..271d2cb 100644 --- a/bridges/WordPressBridge.php +++ b/bridges/WordPressBridge.php @@ -1,7 +1,7 @@ find('updated', 0)->innertext); } - $article_html = $this->getSimpleHTMLDOM($item['uri']); + if($this->get_cached_time($item['uri']) <= strtotime('-24 hours')) + $this->remove_from_cache($item['uri']); + + $article_html = $this->get_cached($item['uri']); // Attempt to find most common content div if(!isset($item['content'])){