From c07eacfd6af5646c795e5739e8d39964002fa87f Mon Sep 17 00:00:00 2001 From: Mitsukarenai Date: Mon, 26 May 2014 19:45:10 +0200 Subject: [PATCH] Add extra bridges, second batch (TPB inside) --- bridges/CollegeDeFranceBridge.php | 41 ++++++++++++++ bridges/FlickrTagBridge.php | 53 +++++++++++++++++ bridges/KoreusBridge.php | 56 ++++++++++++++++++ bridges/MsnMondeBridge.php | 55 ++++++++++++++++++ bridges/NiceMatinBridge.php | 62 ++++++++++++++++++++ bridges/PlanetLibreBridge.php | 49 ++++++++++++++++ bridges/RaymondBridge.php | 52 +++++++++++++++++ bridges/Sexactu.php | 91 ++++++++++++++++++++++++++++++ bridges/ThePirateBayBridge.php | 49 ++++++++++++++++ bridges/WordPressBridge.php | 94 +++++++++++++++++++++++++++++++ 10 files changed, 602 insertions(+) create mode 100644 bridges/CollegeDeFranceBridge.php create mode 100644 bridges/FlickrTagBridge.php create mode 100644 bridges/KoreusBridge.php create mode 100644 bridges/MsnMondeBridge.php create mode 100644 bridges/NiceMatinBridge.php create mode 100644 bridges/PlanetLibreBridge.php create mode 100644 bridges/RaymondBridge.php create mode 100644 bridges/Sexactu.php create mode 100644 bridges/ThePirateBayBridge.php create mode 100644 bridges/WordPressBridge.php diff --git a/bridges/CollegeDeFranceBridge.php b/bridges/CollegeDeFranceBridge.php new file mode 100644 index 0000000..6526eae --- /dev/null +++ b/bridges/CollegeDeFranceBridge.php @@ -0,0 +1,41 @@ +returnError('Could not request CollegeDeFrance.', 404); + $limit = 0; + foreach($html->find('li.audio') as $element) { + if($limit < 10) { + $item = new \Item(); + $item->title = $element->find('span.title', 0)->plaintext; + $item->timestamp = strtotime(str_replace($find, $replace, $element->find('span.date', 0)->plaintext)); + $item->content = $element->find('span.lecturer', 0)->innertext . ' - ' . $element->find('span.title', 0)->innertext; + $item->uri = $element->find('a', 0)->href; + $this->items[] = $item; + $limit++; + } + } + + } + public function getName(){ + return 'CollegeDeFrance'; + } + public function getURI(){ + return 'http://www.college-de-france.fr/'; + } + public function getCacheDuration(){ + return 3600*3; // 3 hour + } +} + diff --git a/bridges/FlickrTagBridge.php b/bridges/FlickrTagBridge.php new file mode 100644 index 0000000..fa274b1 --- /dev/null +++ b/bridges/FlickrTagBridge.php @@ -0,0 +1,53 @@ +returnError('Could not request Flickr.', 404); + if (isset($param['q'])) { /* keyword search mode */ + $this->request = $param['q']; + $html = file_get_html('http://www.flickr.com/search/?q='.urlencode($this->request).'&s=rec') or $this->returnError('No results for this query.', 404); + } + elseif (isset($param['u'])) { /* user timeline mode */ + $this->request = $param['u']; + $html = file_get_html('http://www.flickr.com/photos/'.urlencode($this->request).'/') or $this->returnError('Requested username can\'t be found.', 404); + } + + else { + $this->returnError('You must specify a keyword or a Flickr username.', 400); + } + + foreach($html->find('span.photo_container') as $element) { + $item = new \Item(); + $item->uri = 'http://flickr.com'.$element->find('a',0)->href; + $item->thumbnailUri = $element->find('img',0)->getAttribute('data-defer-src'); + $item->content = ''; // FIXME: Filter javascript ? + $item->title = $element->find('a',0)->title; + $this->items[] = $item; + } + } + + public function getName(){ + return 'Flickr Tag'; + } + + public function getURI(){ + return 'http://www.flickr.com/search/'; + } + + public function getCacheDuration(){ + return 21600; // 6 hours + } +} + diff --git a/bridges/KoreusBridge.php b/bridges/KoreusBridge.php new file mode 100644 index 0000000..8ae90e8 --- /dev/null +++ b/bridges/KoreusBridge.php @@ -0,0 +1,56 @@ +', '', $string); + return $string; + } + function KoreusExtractContent($url) { + $html2 = file_get_html($url); + $text = $html2->find('p[class=itemText]', 0)->innertext; + $text = utf8_encode(preg_replace('/(Sur le m.+?)+$/i','',$text)); + return $text; + } + $html = file_get_html('http://feeds.feedburner.com/Koreus-articles') or $this->returnError('Could not request Koreus.', 404); + $limit = 0; + + foreach($html->find('item') as $element) { + if($limit < 5) { + $item = new \Item(); + $item->title = KoreusStripCDATA($element->find('title', 0)->innertext); + $item->uri = KoreusStripCDATA($element->find('guid', 0)->plaintext); + $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); + $item->content = KoreusExtractContent($item->uri); + $this->items[] = $item; + $limit++; + } + } + + } + + public function getName(){ + return 'Koreus'; + } + + public function getURI(){ + return 'http://www.koreus.com/'; + } + + public function getCacheDuration(){ + return 3600; // 1 hour + } +} + diff --git a/bridges/MsnMondeBridge.php b/bridges/MsnMondeBridge.php new file mode 100644 index 0000000..bd7f5b1 --- /dev/null +++ b/bridges/MsnMondeBridge.php @@ -0,0 +1,55 @@ +find('div[id=m6_diaponews_placeholder]', 0)->outertext=''; //Supression de la partie "et aussi" + $text = $html2->find('div[class=svsubtorabs]', 0)->innertext; // ajout du resume + $text .= $html2->find('div[id=page1]', 0)->innertext; // article + $text = preg_replace('/

Lire aussi.*/i','',$text); //Supression de la partie "Lire aussi" + + return $text; + } + + $html = file_get_html('http://news.fr.msn.com/m6-actualite/RSS/News_RSS_Monde.aspx') or $this->returnError('Could not request MsnMonde.', 404); + $limit = 0; + + foreach($html->find('item') as $element) { + if($limit < 10) { + $item = new \Item(); + $item->title = $element->find('title', 0)->innertext; + $item->uri = $element->find('guid', 0)->plaintext; + $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); + $item->content = MsnMondeExtractContent($item->uri); + $this->items[] = $item; + $limit++; + } + } + + } + + public function getName(){ + return 'MSN Actu Monde'; + } + + public function getURI(){ + return 'http://news.fr.msn.com/m6-actualite/monde/'; + } + + public function getCacheDuration(){ + return 3600; // 1 hour + } +} + diff --git a/bridges/NiceMatinBridge.php b/bridges/NiceMatinBridge.php new file mode 100644 index 0000000..7c0441a --- /dev/null +++ b/bridges/NiceMatinBridge.php @@ -0,0 +1,62 @@ +', '', $string); + //$string = str_replace('.+', '', $string); + $string = preg_replace('/html.*http.*/i','html',$string); + $string = preg_replace('/.*http/i','http',$string); + return $string; + } + + function NiceMatinExtractContent($url) { + $html2 = file_get_html($url); + $text = $html2->find('figure[itemprop=associatedMedia]', 0)->innertext; + $text .= $html2->find('div[id=content-article]', 0)->innertext; + return $text; + } + + $html = file_get_html('http://www.nicematin.com/derniere-minute/rss') or $this->returnError('Could not request NiceMatin.', 404); + $limit = 0; + + foreach($html->find('item') as $element) { + if($limit < 10) { + $item = new \Item(); + //$item->title = NiceMatinStripCDATA($element->find('title', 0)->innertext); + $item->title = $element->find('title', 0)->innertext; + $item->uri = NiceMatinUrl($element->plaintext); + + $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); + $item->content = NiceMatinExtractContent($item->uri); + $this->items[] = $item; + $limit++; + } + } + + } + + public function getName(){ + return 'NiceMatin'; + } + + public function getURI(){ + return 'http://www.nicematin.com/'; + } + + public function getCacheDuration(){ + return 3600; // 1 hour + } +} + diff --git a/bridges/PlanetLibreBridge.php b/bridges/PlanetLibreBridge.php new file mode 100644 index 0000000..073ec28 --- /dev/null +++ b/bridges/PlanetLibreBridge.php @@ -0,0 +1,49 @@ +', '', $string); + return $string; + } + function PlanetLibreExtractContent($url) { + $html2 = file_get_html($url); + $text = $html2->find('div[class=post-text]', 0)->innertext; + return $text; + } + $html = file_get_html('http://www.planet-libre.org/rss10.php') or $this->returnError('Could not request PlanetLibre.', 404); + $limit = 0; + foreach($html->find('item') as $element) { + if($limit < 5) { + $item = new \Item(); + $item->title = PlanetLibreStripCDATA($element->find('title', 0)->innertext); + $item->uri = PlanetLibreStripCDATA($element->find('guid', 0)->plaintext); + $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); + $item->content = PlanetLibreExtractContent($item->uri); + $this->items[] = $item; + $limit++; + } + } + + } + public function getName(){ + return 'PlanetLibre'; + } + public function getURI(){ + return 'http://www.planet-libre.org/'; + } + public function getCacheDuration(){ + return 3600*2; // 1 hour + } +} + diff --git a/bridges/RaymondBridge.php b/bridges/RaymondBridge.php new file mode 100644 index 0000000..2d79ebb --- /dev/null +++ b/bridges/RaymondBridge.php @@ -0,0 +1,52 @@ +', '', $string); + return $string; + } + function raymondExtractContent($url) { + $html2 = file_get_html($url); + $text = $html2->find('div.entry-content', 0)->innertext; + $text = preg_replace('/class="ad".*/', '', $text); + $text = strip_tags($text, '

'); + $text = str_replace('(adsbygoogle = window.adsbygoogle || []).push({});', '', $text); + return $text; + } + $html = file_get_html('http://www.raymond.cc/blog/feed') or $this->returnError('Could not request raymond.', 404); + $limit = 0; + foreach($html->find('item') as $element) { + if($limit < 3) { + $item = new \Item(); + $item->title = raymondStripCDATA($element->find('title', 0)->innertext); + $item->uri = raymondStripCDATA($element->find('guid', 0)->plaintext); + $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); + $item->content = raymondExtractContent($item->uri); + $this->items[] = $item; + $limit++; + } + } + + } + public function getName(){ + return 'raymond'; + } + public function getURI(){ + return 'http://www.raymond.cc/blog'; + } + public function getCacheDuration(){ + return 3600*12; // 12 hour + } +} + diff --git a/bridges/Sexactu.php b/bridges/Sexactu.php new file mode 100644 index 0000000..6339095 --- /dev/null +++ b/bridges/Sexactu.php @@ -0,0 +1,91 @@ +getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); + + foreach($html->find('.content-holder') as $contentHolder) { + // only use first list as second one only contains pages numbers + $articles = $contentHolder->find('ul', 0); + foreach($articles->find('li') as $element) { + // if you ask about that method_exists, there seems to be a bug in simple html dom + // see stackoverflow for more details : http://stackoverflow.com/a/10828479/15619 + if(is_object($element)) { + $item = new Item(); + // various metadata + $titleBlock = $element->find('.title-holder', 0); + if(is_object($titleBlock)) { + $titleDetails = $titleBlock->find('.article-title',0); + $titleData = $titleDetails->find('h2', 0)->find('a',0); + $titleTimestamp =$titleDetails->find('h4',0); + $item->title = $this->correctCase(trim($titleData->innertext)); + $item->uri = GQ.$titleData->href; + + // Fugly date parsing due to the fact my DNS-323 doesn't support php intl extension + $dateText = $titleTimestamp->innertext; + $dateText = substr($dateText, strpos($dateText,',')+1); + $dateText = str_replace($find, $replace, strtolower($dateText)); + $date = strtotime($dateText); + $item->timestamp = $date; + + $item->name = "Maïa Mazaurette"; + $elementText = $element->find('.text-container', 0); + // don't forget to replace images server url with gq one + foreach($elementText->find('img') as $image) { + $image->src = GQ.$image->src; + } + $item->content = $elementText->innertext; + $this->items[] = $item; + } + + } + + } + } + } + + public function getName(){ + return 'Sexactu'; + } + + public function getURI(){ + return GQ.'/sexactu'; + } + + public function getCacheDuration(){ + return 7200; // 2h hours + } + public function getDescription(){ + return "Sexactu"; + } + + public function correctCase($str) { + $sentences=explode('.', mb_strtolower($str, "UTF-8")); + $str=""; + $sep=""; + foreach ($sentences as $sentence) + { + //upper case first char + $sentence=ucfirst(trim($sentence)); + + //append sentence to output + $str=$str.$sep.$sentence; + $sep=". "; + } + return $str; + } +} + diff --git a/bridges/ThePirateBayBridge.php b/bridges/ThePirateBayBridge.php new file mode 100644 index 0000000..c0ce493 --- /dev/null +++ b/bridges/ThePirateBayBridge.php @@ -0,0 +1,49 @@ +returnError('You must specify a keyword (?q=...)', 400); + + $html = file_get_html('https://thepiratebay.se/search/'.rawurlencode($param['q']).'/0/99/0') or $this->returnError('Could not request TPB.', 404); + + if($html->find('table#searchResult', 0) == FALSE) + $this->returnError('No result for this query', 404); + + foreach($html->find('tr') as $element) { + $item = new \Item(); + $item->uri = 'https://thepiratebay.se/'.$element->find('a.detLink',0)->href; + $item->id = $item->uri; + $item->timestamp = time(); + $item->title = $element->find('a.detLink',0)->plaintext; + $item->content = $element->find('font',0)->plaintext.'
download'; + if(!empty($item->title)) + $this->items[] = $item; + } + } + + public function getName(){ + return 'The Pirate Bay'; + } + + public function getURI(){ + return 'https://thepiratebay.se/'; + } + + public function getCacheDuration(){ + return 3600; // 1 hour + } +} diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php new file mode 100644 index 0000000..c61fbb7 --- /dev/null +++ b/bridges/WordPressBridge.php @@ -0,0 +1,94 @@ +processParams($param); + + if (!$this->hasUrl()) { + $this->returnError('You must specify a URL', 400); + } + + $html = file_get_html($this->url) or $this->returnError("Could not request {$this->url}.", 404); + + if(!empty($html->find('.post')) ) { + $i=0; + foreach ($html->find('.post') as $article) { + if($i < 3) { + $uri = $article->find('a', 0)->href; + $this->items[] = $this->getDetails($uri); + $i++; + } + } + } + else { + $this->returnError("Sorry, {$this->url} doesn't seem to be a Wordpress blog.", 404); + } + } + + private function getDetails($uri) { + $html = file_get_html($uri) or exit; + + $item = new \Item(); + + $article = $html->find('.post', 0); + $item->uri = $uri; + $item->title = $article->find('h1', 0)->innertext; + $item->content = $this->clearContent($article->find('.entry-content,.entry', 0)->innertext); + $item->timestamp = $this->getDate($uri); + + return $item; + } + + private function clearContent($content) { + $content = preg_replace('//', '', $content); + $content = preg_replace('/

format('U'); + } + + public function getName() { + return "{$this->name} - Wordpress Bridge"; + } + + public function getURI() { + return $this->url; + } + + public function getCacheDuration() { + return 3600*3; // 3 hours + } + + private function hasUrl() { + if (empty($this->url)) { + return false; + } + return true; + } + + private function processParams($param) { + $this->url = $param['url']; + $this->name = $param['name']; + } + +} +