diff --git a/bridges/CADBridge.php b/bridges/CADBridge.php index 47ff165..eb05fd1 100644 --- a/bridges/CADBridge.php +++ b/bridges/CADBridge.php @@ -1,12 +1,22 @@ collectExpandableDatas('http://cdn2.cad-comic.com/rss.xml'); + } + + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->CADExtractContent($item['uri']); + return $item; + } + private function CADExtractContent($url) { - $html3 = $this->getSimpleHTMLDOM($url); + $html3 = $this->get_cached($url); // The request might fail due to missing https support or wrong URL if($html3 == false) @@ -32,33 +42,6 @@ class CADBridge extends BridgeAbstract{ return ''; } - public function collectData(){ - function CADUrl($string) { - $html2 = explode("\"", $string); - $string = $html2[1]; - if (substr($string,0,4) != 'http') - return 'notanurl'; - return $string; - } - - $html = $this->getSimpleHTMLDOM('http://cdn2.cad-comic.com/rss.xml') or $this->returnServerError('Could not request CAD.'); - $limit = 0; - - foreach($html->find('item') as $element) { - if($limit < 5) { - $item = array(); - $item['title'] = $element->find('title', 0)->innertext; - $item['uri'] = CADUrl($element->find('description', 0)->innertext); - if ($item['uri'] != 'notanurl') { - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $item['content'] = $this->CADExtractContent($item['uri']); - $this->items[] = $item; - $limit++; - } - } - } - } - public function getCacheDuration(){ return 3600*2; // 2 hours } diff --git a/bridges/CommonDreamsBridge.php b/bridges/CommonDreamsBridge.php index 446a6df..e621db4 100644 --- a/bridges/CommonDreamsBridge.php +++ b/bridges/CommonDreamsBridge.php @@ -1,39 +1,26 @@ collectExpandableDatas('http://www.commondreams.org/rss.xml'); + } + + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->CommonDreamsExtractContent($item['uri']); + return $item; + } + private function CommonDreamsExtractContent($url) { - $html3 = $this->getSimpleHTMLDOM($url); + $html3 = $this->get_cached($url); $text = $html3->find('div[class=field--type-text-with-summary]', 0)->innertext; $html3->clear(); unset ($html3); return $text; } - - public function collectData(){ - - function CommonDreamsUrl($string) { - $html2 = explode(" ", $string); - $string = $html2[2] . "/node/" . $html2[0]; - return $string; - } - - $html = $this->getSimpleHTMLDOM('http://www.commondreams.org/rss.xml') or $this->returnServerError('Could not request CommonDreams.'); - $limit = 0; - foreach($html->find('item') as $element) { - if($limit < 4) { - $item = array(); - $item['title'] = $element->find('title', 0)->innertext; - $item['uri'] = CommonDreamsUrl($element->find('guid', 0)->innertext); - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $item['content'] = $this->CommonDreamsExtractContent($item['uri']); - $this->items[] = $item; - $limit++; - } - } - } } diff --git a/bridges/DauphineLibereBridge.php b/bridges/DauphineLibereBridge.php index 143a6c0..d8e10dd 100644 --- a/bridges/DauphineLibereBridge.php +++ b/bridges/DauphineLibereBridge.php @@ -1,10 +1,10 @@ array( @@ -30,41 +30,31 @@ class DauphineLibereBridge extends BridgeAbstract { ) )); - private function ExtractContent($url, $context) { - $html2 = $this->getSimpleHTMLDOM($url); - $text = $html2->find('div.column', 0)->innertext; - $text = preg_replace('@@si', '', $text); - return $text; - } + public function collectData(){ + $url = self::URI . 'rss'; - public function collectData(){ + if (empty($this->getInput('u'))) { + $url = self::URI . $this->getInput('u') . '/rss'; + } - $context = stream_context_create($opts); + $this->collectExpandableDatas($url); + } - if (empty($this->getInput('u'))) { - $html = $this->getSimpleHTMLDOM(self::URI.$this->getInput('u').'/rss') - or $this->returnServerError('Could not request DauphineLibere.'); - } else { - $html = $this->getSimpleHTMLDOM(self::URI.'rss') - or $this->returnServerError('Could not request DauphineLibere.'); - } - $limit = 0; + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->ExtractContent($item['uri']); + return $item; + } - foreach($html->find('item') as $element) { - if($limit < 10) { - $item = array(); - $item['title'] = $element->find('title', 0)->innertext; - $item['uri'] = $element->find('guid', 0)->plaintext; - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $item['content'] = $this->ExtractContent($item['uri'], $context); - $this->items[] = $item; - $limit++; - } - } - } + private function ExtractContent($url) { + $html2 = $this->getSimpleHTMLDOM($url); + $text = $html2->find('div.column', 0)->innertext; + $text = preg_replace('@@si', '', $text); + return $text; + } - public function getCacheDuration(){ - return 3600*2; // 2 hours - } + public function getCacheDuration(){ + return 3600*2; // 2 hours + } } ?> diff --git a/bridges/DeveloppezDotComBridge.php b/bridges/DeveloppezDotComBridge.php index 48e2974..52e52db 100644 --- a/bridges/DeveloppezDotComBridge.php +++ b/bridges/DeveloppezDotComBridge.php @@ -1,11 +1,21 @@ collectExpandableDatas(self::URI . 'index/rss'); + } + + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->DeveloppezDotComExtractContent($item['uri']); + return $item; + } + private function DeveloppezDotComStripCDATA($string) { $string = str_replace('', '', $string); @@ -32,31 +42,12 @@ class DeveloppezDotComBridge extends BridgeAbstract{ } private function DeveloppezDotComExtractContent($url) { - $articleHTMLContent = $this->getSimpleHTMLDOM($url); + $articleHTMLContent = $this->get_cached($url); $text = $this->convert_smart_quotes($articleHTMLContent->find('div.content', 0)->innertext); $text = utf8_encode($text); return trim($text); } - public function collectData(){ - $rssFeed = $this->getSimpleHTMLDOM(self::URI.'index/rss') - or $this->returnServerError('Could not request '.self::URI.'index/rss'); - $limit = 0; - - foreach($rssFeed->find('item') as $element) { - if($limit < 10) { - $item = array(); - $item['title'] = $this->DeveloppezDotComStripCDATA($element->find('title', 0)->innertext); - $item['uri'] = $this->DeveloppezDotComStripCDATA($element->find('guid', 0)->plaintext); - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $content = $this->DeveloppezDotComExtractContent($item['uri']); - $item['content'] = strlen($content) ? $content : $element->description; //In case of it is a tutorial, we just keep the original description - $this->items[] = $item; - $limit++; - } - } - } - public function getCacheDuration(){ return 1800; // 30min } diff --git a/bridges/FuturaSciencesBridge.php b/bridges/FuturaSciencesBridge.php index e4c8471..beff9c8 100644 --- a/bridges/FuturaSciencesBridge.php +++ b/bridges/FuturaSciencesBridge.php @@ -1,5 +1,5 @@ getInput('feed') . '.xml'; + $this->collectExpandableDatas($url); + } - function StripCDATA($string) { - $string = str_replace('', '', $string); - return $string; - } + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['uri'] = str_replace('#xtor=RSS-8', '', $item['uri']); + $article = $this->get_cached($item['uri']) + or $this->returnServerError('Could not request Futura-Sciences: ' . $item['uri']); + $item['content'] = $this->ExtractArticleContent($article); + $item['author'] = empty($this->ExtractAuthor($article)) ? $item['author'] : $this->ExtractAuthor($article); + return $item; + } - function StripWithDelimiters($string, $start, $end) { - while (strpos($string, $start) !== false) { - $section_to_remove = substr($string, strpos($string, $start)); - $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end)); + function StripWithDelimiters($string, $start, $end) { + while (strpos($string, $start) !== false) { + $section_to_remove = substr($string, strpos($string, $start)); + $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end)); + $string = str_replace($section_to_remove, '', $string); + } return $string; + } + + function StripRecursiveHTMLSection($string, $tag_name, $tag_start) { + $open_tag = '<'.$tag_name; + $close_tag = ''.$tag_name.'>'; + $close_tag_length = strlen($close_tag); + if (strpos($tag_start, $open_tag) === 0) { + while (strpos($string, $tag_start) !== false) { + $max_recursion = 100; + $section_to_remove = null; + $section_start = strpos($string, $tag_start); + $search_offset = $section_start; + do { + $max_recursion--; + $section_end = strpos($string, $close_tag, $search_offset); + $search_offset = $section_end + $close_tag_length; + $section_to_remove = substr($string, $section_start, $section_end - $section_start + $close_tag_length); + $open_tag_count = substr_count($section_to_remove, $open_tag); + $close_tag_count = substr_count($section_to_remove, $close_tag); + } while ($open_tag_count > $close_tag_count && $max_recursion > 0); $string = str_replace($section_to_remove, '', $string); - } return $string; - } - - function StripRecursiveHTMLSection($string, $tag_name, $tag_start) { - $open_tag = '<'.$tag_name; - $close_tag = ''.$tag_name.'>'; - $close_tag_length = strlen($close_tag); - if (strpos($tag_start, $open_tag) === 0) { - while (strpos($string, $tag_start) !== false) { - $max_recursion = 100; - $section_to_remove = null; - $section_start = strpos($string, $tag_start); - $search_offset = $section_start; - do { - $max_recursion--; - $section_end = strpos($string, $close_tag, $search_offset); - $search_offset = $section_end + $close_tag_length; - $section_to_remove = substr($string, $section_start, $section_end - $section_start + $close_tag_length); - $open_tag_count = substr_count($section_to_remove, $open_tag); - $close_tag_count = substr_count($section_to_remove, $close_tag); - } while ($open_tag_count > $close_tag_count && $max_recursion > 0); - $string = str_replace($section_to_remove, '', $string); - } - } - return $string; - } - - // Extracts the author from an article or element - function ExtractAuthor($article, $element){ - $article_author = $article->find('span.author', 0); - if($article_author){ - $authorname = trim(str_replace(', Futura-Sciences', '', $article_author->plaintext)); - if(empty($authorname)){ - $element_author = $element->find('author', 0); - if($element_author) - $authorname = StripCDATA($element_author->plaintext); - else - return ''; - } - return $authorname; - } - return ''; - } - - $url = $this->getURI().'rss/'.$this->getInput('feed').'.xml'; - - $html = $this->getSimpleHTMLDOM($url) - or $this->returnServerError('Could not request Futura-Sciences: '.$url); - $limit = 0; - - foreach($html->find('item') as $element) { - if ($limit < 10) { - $article_url = str_replace('#xtor=RSS-8', '', StripCDATA($element->find('guid', 0)->plaintext)); - $article = $this->getSimpleHTMLDOM($article_url) or $this->returnServerError('Could not request Futura-Sciences: '.$article_url); - $contents = $article->find('div.content', 0)->innertext; - - foreach (array( - '
'); - $contents = StripWithDelimiters($contents, ''); - $contents = StripWithDelimiters($contents, 'fs:definition="', '"'); - $contents = StripWithDelimiters($contents, 'fs:xt:clicktype="', '"'); - $contents = StripWithDelimiters($contents, 'fs:xt:clickname="', '"'); - - $item = array(); - $item['author'] = ExtractAuthor($article, $element); - $item['uri'] = $article_url; - $item['title'] = StripCDATA($element->find('title', 0)->innertext); - $item['timestamp'] = strtotime(StripCDATA($element->find('pubDate', 0)->plaintext)); - $item['content'] = trim($contents); - $this->items[] = $item; - $limit++; } } + return $string; + } + function ExtractArticleContent($article){ + $contents = $article->find('div.content', 0)->innertext; + + foreach (array( + '
'); + $contents = $this->StripWithDelimiters($contents, 'fs:definition="', '"'); + $contents = $this->StripWithDelimiters($contents, 'fs:xt:clicktype="', '"'); + $contents = $this->StripWithDelimiters($contents, 'fs:xt:clickname="', '"'); + + return $contents; + } + + // Extracts the author from an article or element + function ExtractAuthor($article){ + $article_author = $article->find('span.author', 0); + if($article_author){ + return trim(str_replace(', Futura-Sciences', '', $article_author->plaintext)); + } + return ''; } } diff --git a/bridges/LeJournalDuGeekBridge.php b/bridges/LeJournalDuGeekBridge.php index e08f419..dd0c444 100644 --- a/bridges/LeJournalDuGeekBridge.php +++ b/bridges/LeJournalDuGeekBridge.php @@ -1,19 +1,23 @@ ', '', $string); - return $string; + public function collectData(){ + $this->collectExpandableDatas(self::URI . 'rss'); + } + + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->LeJournalDuGeekExtractContent($item['uri']); + return $item; } private function LeJournalDuGeekExtractContent($url) { - $articleHTMLContent = $this->getSimpleHTMLDOM($url); + $articleHTMLContent = $this->get_cached($url); $text = $articleHTMLContent->find('div.post-content', 0)->innertext; foreach($articleHTMLContent->find('a.more') as $element) { @@ -34,24 +38,6 @@ class LeJournalDuGeekBridge extends BridgeAbstract{ return $text; } - public function collectData(){ - $rssFeed = $this->getSimpleHTMLDOM(self::URI.'rss') - or $this->returnServerError('Could not request '.self::URI.'/rss'); - $limit = 0; - - foreach($rssFeed->find('item') as $element) { - if($limit < 5) { - $item = array(); - $item['title'] = $this->LeJournalDuGeekStripCDATA($element->find('title', 0)->innertext); - $item['uri'] = $this->LeJournalDuGeekStripCDATA($element->find('guid', 0)->plaintext); - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $item['content'] = $this->LeJournalDuGeekExtractContent($item['uri']); - $this->items[] = $item; - $limit++; - } - } - } - public function getCacheDuration(){ return 1800; // 30min } diff --git a/bridges/LeMondeInformatiqueBridge.php b/bridges/LeMondeInformatiqueBridge.php index 8fd1daa..e361ea8 100644 --- a/bridges/LeMondeInformatiqueBridge.php +++ b/bridges/LeMondeInformatiqueBridge.php @@ -1,60 +1,42 @@ collectExpandableDatas(self::URI . 'rss/rss.xml'); + } - function StripCDATA($string) { - $string = str_replace('', '', $string); - return $string; - } + protected function parseItem($newsItem){ + $item = $this->parseRSS_1_0_Item($newsItem); + $article_html = $this->get_cached($item['uri']) + or $this->returnServerError('Could not request LeMondeInformatique: ' . $item['uri']); + $item['content'] = $this->CleanArticle($article_html->find('div#article', 0)->innertext); + $item['title'] = $article_html->find('h1.cleanprint-title', 0)->plaintext; + return $item; + } - function StripWithDelimiters($string, $start, $end) { - while (strpos($string, $start) !== false) { - $section_to_remove = substr($string, strpos($string, $start)); - $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end)); - $string = str_replace($section_to_remove, '', $string); - } return $string; - } + function StripCDATA($string) { + $string = str_replace('', '', $string); + return $string; + } - function CleanArticle($article_html) { - $article_html = StripWithDelimiters($article_html, ''); + $article_html = $this->StripWithDelimiters($article_html, '
'.$html2->find('span.sub_title', 0)->innertext.'
' .'' .''.$premium_article->innertext.'
'; return $text; } - - public function collectData(){ - $html = $this->getSimpleHTMLDOM(self::URI.'rss/news.xml') or $this->returnServerError('Could not request NextInpact.'); - $limit = 0; - - foreach($html->find('item') as $element) { - if($limit < 3) { - $item = array(); - $item['title'] = $this->StripCDATA($element->find('title', 0)->innertext); - $item['uri'] = $this->StripCDATA($element->find('guid', 0)->plaintext); - $item['author'] = $this->StripCDATA($element->find('creator', 0)->innertext); - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $item['content'] = $this->ExtractContent($item['uri']); - $this->items[] = $item; - $limit++; - } - } - } } diff --git a/bridges/NextgovBridge.php b/bridges/NextgovBridge.php index ee4f299..dee8c37 100644 --- a/bridges/NextgovBridge.php +++ b/bridges/NextgovBridge.php @@ -1,5 +1,5 @@ collectExpandableDatas(self::URI . 'rss/' . $this->getInput('category') . '/'); + } - function ExtractFromDelimiters($string, $start, $end) { - if (strpos($string, $start) !== false) { - $section_retrieved = substr($string, strpos($string, $start) + strlen($start)); - $section_retrieved = substr($section_retrieved, 0, strpos($section_retrieved, $end)); - return $section_retrieved; - } return false; - } + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); - function StripWithDelimiters($string, $start, $end) { - while (strpos($string, $start) !== false) { - $section_to_remove = substr($string, strpos($string, $start)); - $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end)); - $string = str_replace($section_to_remove, '', $string); - } return $string; - } + $item['content'] = ''; - $category = $this->getInput('category'); - $url = $this->getURI().'rss/'.$category.'/'; - $html = $this->getSimpleHTMLDOM($url) or $this->returnServerError('Could not request Nextgov: '.$url); - $limit = 0; - - foreach ($html->find('item') as $element) { - if ($limit >= 10) { - break; + $namespaces = $newsItem->getNamespaces(true); + if(isset($namespaces['media'])){ + $media = $newsItem->children($namespaces['media']); + if(isset($media->content)){ + $attributes = $media->content->attributes(); + $item['content'] = ''; } - - $article_url = ExtractFromDelimiters($element->innertext, '', ''); - $article_author = ExtractFromDelimiters($element->innertext, 'dc/elements/1.1/">', ''); - $article_title = $element->find('title', 0)->plaintext; - $article_subtitle = $element->find('description', 0)->plaintext; - $article_timestamp = strtotime($element->find('pubDate', 0)->plaintext); - $article_thumbnail = ExtractFromDelimiters($element->innertext, ''.$article_subtitle.'
' + .trim($contents); } } diff --git a/bridges/NiceMatinBridge.php b/bridges/NiceMatinBridge.php index 3c18909..0f9d011 100644 --- a/bridges/NiceMatinBridge.php +++ b/bridges/NiceMatinBridge.php @@ -1,13 +1,23 @@ collectExpandableDatas(self::URI . 'derniere-minute/rss'); + } + + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->NiceMatinExtractContent($item['uri']); + return $item; + } + private function NiceMatinExtractContent($url) { - $html = $this->getSimpleHTMLDOM($url); + $html = $this->get_cached($url); if(!$html) return 'Could not acquire content from url: ' . $url . '!'; @@ -19,29 +29,4 @@ class NiceMatinBridge extends BridgeAbstract{ $text = strip_tags($text, '