From 2861a855e4641f1d072813fe60acef1edfe8c46f Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Mon, 5 Sep 2016 20:26:45 +0200 Subject: [PATCH] [bridges] Define max items and clear caches --- bridges/CADBridge.php | 4 +++- bridges/CommonDreamsBridge.php | 4 +++- bridges/DauphineLibereBridge.php | 6 ++++-- bridges/DeveloppezDotComBridge.php | 4 +++- bridges/FuturaSciencesBridge.php | 16 +++++++++------- bridges/LeJournalDuGeekBridge.php | 4 +++- bridges/LeMondeInformatiqueBridge.php | 16 +++++++++------- bridges/LichessBridge.php | 2 +- bridges/NextInpactBridge.php | 4 +++- bridges/NextgovBridge.php | 2 +- bridges/NiceMatinBridge.php | 4 +++- bridges/NumeramaBridge.php | 2 +- 12 files changed, 43 insertions(+), 25 deletions(-) diff --git a/bridges/CADBridge.php b/bridges/CADBridge.php index eb05fd1..1fdcfb5 100644 --- a/bridges/CADBridge.php +++ b/bridges/CADBridge.php @@ -6,7 +6,7 @@ class CADBridge extends FeedExpander { const DESCRIPTION = "Returns the newest articles."; public function collectData(){ - $this->collectExpandableDatas('http://cdn2.cad-comic.com/rss.xml'); + $this->collectExpandableDatas('http://cdn2.cad-comic.com/rss.xml', 10); } protected function parseItem($newsItem){ @@ -16,6 +16,8 @@ class CADBridge extends FeedExpander { } private function CADExtractContent($url) { + if($this->get_cached_time($url) <= strtotime('-24 hours')) + $this->remove_from_cache($url); $html3 = $this->get_cached($url); // The request might fail due to missing https support or wrong URL diff --git a/bridges/CommonDreamsBridge.php b/bridges/CommonDreamsBridge.php index e621db4..937590c 100644 --- a/bridges/CommonDreamsBridge.php +++ b/bridges/CommonDreamsBridge.php @@ -7,7 +7,7 @@ class CommonDreamsBridge extends FeedExpander { const DESCRIPTION = "Returns the newest articles."; public function collectData(){ - $this->collectExpandableDatas('http://www.commondreams.org/rss.xml'); + $this->collectExpandableDatas('http://www.commondreams.org/rss.xml', 10); } protected function parseItem($newsItem){ @@ -17,6 +17,8 @@ class CommonDreamsBridge extends FeedExpander { } private function CommonDreamsExtractContent($url) { + if($this->get_cached_time($url) <= strtotime('-24 hours')) + $this->remove_from_cache($url); $html3 = $this->get_cached($url); $text = $html3->find('div[class=field--type-text-with-summary]', 0)->innertext; $html3->clear(); diff --git a/bridges/DauphineLibereBridge.php b/bridges/DauphineLibereBridge.php index d8e10dd..2f645c9 100644 --- a/bridges/DauphineLibereBridge.php +++ b/bridges/DauphineLibereBridge.php @@ -37,7 +37,7 @@ class DauphineLibereBridge extends FeedExpander { $url = self::URI . $this->getInput('u') . '/rss'; } - $this->collectExpandableDatas($url); + $this->collectExpandableDatas($url, 10); } protected function parseItem($newsItem){ @@ -47,7 +47,9 @@ class DauphineLibereBridge extends FeedExpander { } private function ExtractContent($url) { - $html2 = $this->getSimpleHTMLDOM($url); + if($this->get_cached_time($url) <= strtotime('-24 hours')) + $this->remove_from_cache($url); + $html2 = $this->get_cached($url); $text = $html2->find('div.column', 0)->innertext; $text = preg_replace('@]*?>.*?@si', '', $text); return $text; diff --git a/bridges/DeveloppezDotComBridge.php b/bridges/DeveloppezDotComBridge.php index 52e52db..cb277ec 100644 --- a/bridges/DeveloppezDotComBridge.php +++ b/bridges/DeveloppezDotComBridge.php @@ -7,7 +7,7 @@ class DeveloppezDotComBridge extends FeedExpander { const DESCRIPTION = "Returns the 15 newest posts from DeveloppezDotCom (full text)."; public function collectData(){ - $this->collectExpandableDatas(self::URI . 'index/rss'); + $this->collectExpandableDatas(self::URI . 'index/rss', 15); } protected function parseItem($newsItem){ @@ -42,6 +42,8 @@ class DeveloppezDotComBridge extends FeedExpander { } private function DeveloppezDotComExtractContent($url) { + if($this->get_cached_time($url) <= strtotime('-24 hours')) + $this->remove_from_cache($url); $articleHTMLContent = $this->get_cached($url); $text = $this->convert_smart_quotes($articleHTMLContent->find('div.content', 0)->innertext); $text = utf8_encode($text); diff --git a/bridges/FuturaSciencesBridge.php b/bridges/FuturaSciencesBridge.php index beff9c8..aef5813 100644 --- a/bridges/FuturaSciencesBridge.php +++ b/bridges/FuturaSciencesBridge.php @@ -78,20 +78,22 @@ class FuturaSciencesBridge extends FeedExpander { ) )); - public function collectData(){ + public function collectData(){ $url = self::URI . 'rss/' . $this->getInput('feed') . '.xml'; - $this->collectExpandableDatas($url); - } + $this->collectExpandableDatas($url, 10); + } - protected function parseItem($newsItem){ - $item = $this->parseRSS_2_0_Item($newsItem); + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); $item['uri'] = str_replace('#xtor=RSS-8', '', $item['uri']); + if($this->get_cached_time($item['uri']) <= strtotime('-24 hours')) + $this->remove_from_cache($item['uri']); $article = $this->get_cached($item['uri']) or $this->returnServerError('Could not request Futura-Sciences: ' . $item['uri']); $item['content'] = $this->ExtractArticleContent($article); $item['author'] = empty($this->ExtractAuthor($article)) ? $item['author'] : $this->ExtractAuthor($article); - return $item; - } + return $item; + } function StripWithDelimiters($string, $start, $end) { while (strpos($string, $start) !== false) { diff --git a/bridges/LeJournalDuGeekBridge.php b/bridges/LeJournalDuGeekBridge.php index dd0c444..c537a15 100644 --- a/bridges/LeJournalDuGeekBridge.php +++ b/bridges/LeJournalDuGeekBridge.php @@ -7,7 +7,7 @@ class LeJournalDuGeekBridge extends FeedExpander { const DESCRIPTION = "Returns the 5 newest posts from LeJournalDuGeek (full text)."; public function collectData(){ - $this->collectExpandableDatas(self::URI . 'rss'); + $this->collectExpandableDatas(self::URI . 'rss', 5); } protected function parseItem($newsItem){ @@ -17,6 +17,8 @@ class LeJournalDuGeekBridge extends FeedExpander { } private function LeJournalDuGeekExtractContent($url) { + if($this->get_cached_time($url) <= strtotime('-24 hours')) + $this->remove_from_cache($url); $articleHTMLContent = $this->get_cached($url); $text = $articleHTMLContent->find('div.post-content', 0)->innertext; diff --git a/bridges/LeMondeInformatiqueBridge.php b/bridges/LeMondeInformatiqueBridge.php index e361ea8..3b3e5b4 100644 --- a/bridges/LeMondeInformatiqueBridge.php +++ b/bridges/LeMondeInformatiqueBridge.php @@ -6,18 +6,20 @@ class LeMondeInformatiqueBridge extends FeedExpander { const URI = "http://www.lemondeinformatique.fr/"; const DESCRIPTION = "Returns the newest articles."; - public function collectData(){ - $this->collectExpandableDatas(self::URI . 'rss/rss.xml'); - } + public function collectData(){ + $this->collectExpandableDatas(self::URI . 'rss/rss.xml', 10); + } - protected function parseItem($newsItem){ - $item = $this->parseRSS_1_0_Item($newsItem); + protected function parseItem($newsItem){ + $item = $this->parseRSS_1_0_Item($newsItem); + if($this->get_cached_time($item['uri']) <= strtotime('-24 hours')) + $this->remove_from_cache($item['uri']); $article_html = $this->get_cached($item['uri']) or $this->returnServerError('Could not request LeMondeInformatique: ' . $item['uri']); $item['content'] = $this->CleanArticle($article_html->find('div#article', 0)->innertext); $item['title'] = $article_html->find('h1.cleanprint-title', 0)->plaintext; - return $item; - } + return $item; + } function StripCDATA($string) { $string = str_replace('collectExpandableDatas(self::URI . '.atom'); + $this->collectExpandableDatas(self::URI . '.atom', 5); } protected function parseItem($newsItem){ diff --git a/bridges/NextInpactBridge.php b/bridges/NextInpactBridge.php index a24a02e..a047f63 100644 --- a/bridges/NextInpactBridge.php +++ b/bridges/NextInpactBridge.php @@ -7,7 +7,7 @@ class NextInpactBridge extends FeedExpander { const DESCRIPTION = "Returns the newest articles."; public function collectData(){ - $this->collectExpandableDatas(self::URI . 'rss/news.xml'); + $this->collectExpandableDatas(self::URI . 'rss/news.xml', 10); } protected function parseItem($newsItem){ @@ -17,6 +17,8 @@ class NextInpactBridge extends FeedExpander { } private function ExtractContent($url) { + if($this->get_cached_time($url) <= strtotime('-24 hours')) + $this->remove_from_cache($url); $html2 = $this->get_cached($url); $text = '

'.$html2->find('span.sub_title', 0)->innertext.'

' .'

-

' diff --git a/bridges/NextgovBridge.php b/bridges/NextgovBridge.php index dee8c37..5d26ec5 100644 --- a/bridges/NextgovBridge.php +++ b/bridges/NextgovBridge.php @@ -26,7 +26,7 @@ class NextgovBridge extends FeedExpander { )); public function collectData(){ - $this->collectExpandableDatas(self::URI . 'rss/' . $this->getInput('category') . '/'); + $this->collectExpandableDatas(self::URI . 'rss/' . $this->getInput('category') . '/', 10); } protected function parseItem($newsItem){ diff --git a/bridges/NiceMatinBridge.php b/bridges/NiceMatinBridge.php index 0f9d011..9f0e552 100644 --- a/bridges/NiceMatinBridge.php +++ b/bridges/NiceMatinBridge.php @@ -7,7 +7,7 @@ class NiceMatinBridge extends FeedExpander { const DESCRIPTION = "Returns the 10 newest posts from NiceMatin (full text)"; public function collectData(){ - $this->collectExpandableDatas(self::URI . 'derniere-minute/rss'); + $this->collectExpandableDatas(self::URI . 'derniere-minute/rss', 10); } protected function parseItem($newsItem){ @@ -17,6 +17,8 @@ class NiceMatinBridge extends FeedExpander { } private function NiceMatinExtractContent($url) { + if($this->get_cached_time($url) <= strtotime('-24 hours')) + $this->remove_from_cache($url); $html = $this->get_cached($url); if(!$html) return 'Could not acquire content from url: ' . $url . '!'; diff --git a/bridges/NumeramaBridge.php b/bridges/NumeramaBridge.php index 48260a0..202d552 100644 --- a/bridges/NumeramaBridge.php +++ b/bridges/NumeramaBridge.php @@ -7,7 +7,7 @@ class NumeramaBridge extends FeedExpander { const DESCRIPTION = 'Returns the 5 newest posts from Numerama (full text)'; public function collectData(){ - $this->collectExpandableDatas(self::URI . 'feed/'); + $this->collectExpandableDatas(self::URI . 'feed/', 5); } protected function parseItem($newsItem){