From a9d6fd3e462d651a462547c7deb6a23579bfda24 Mon Sep 17 00:00:00 2001 From: Sebastien SAUVAGE Date: Sat, 8 Feb 2014 18:12:49 +0100 Subject: [PATCH 001/161] Default to http for DDG bridge Because some hosts do not support https. --- bridges/DuckDuckGoBridge.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bridges/DuckDuckGoBridge.php b/bridges/DuckDuckGoBridge.php index a194a5c..d9f47d7 100644 --- a/bridges/DuckDuckGoBridge.php +++ b/bridges/DuckDuckGoBridge.php @@ -12,7 +12,7 @@ class DuckDuckGoBridge extends BridgeAbstract{ public function collectData(array $param){ $html = ''; - $link = 'https://duckduckgo.com/html/?q='.$param[u].'+sort:date'; + $link = 'http://duckduckgo.com/html/?q='.$param[u].'+sort:date'; $html = file_get_html($link) or $this->returnError('Could not request DuckDuckGo.', 404); From 50a96a3f215607d4112eae71f324e7d20ec4b0eb Mon Sep 17 00:00:00 2001 From: Sebastien SAUVAGE Date: Sat, 8 Feb 2014 18:18:09 +0100 Subject: [PATCH 002/161] Corrected default number of articles in CryptomeBridge. The bridge was outputting a single entry when parameter n was empty. --- bridges/CryptomeBridge.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bridges/CryptomeBridge.php b/bridges/CryptomeBridge.php index 1409ecf..39e3136 100644 --- a/bridges/CryptomeBridge.php +++ b/bridges/CryptomeBridge.php @@ -18,10 +18,11 @@ class CryptomeBridge extends BridgeAbstract{ //$link = 'https://secure.netsolhost.com/cryptome.org/'; $html = file_get_html($link) or $this->returnError('Could not request Cryptome.', 404); - if (isset($param['n'])) { /* number of documents */ + if (!empty($param['n'])) { /* number of documents */ $num = min(max(1, $param['n']+0), $num); } + foreach($html->find('pre') as $element) { for ( $i = 0; $i < $num; ++$i ) { $item = new \Item(); From 531dddecdcb587caeec1e863743931a38e6f159b Mon Sep 17 00:00:00 2001 From: Sebastien SAUVAGE Date: Sun, 9 Feb 2014 10:23:50 +0100 Subject: [PATCH 003/161] Changed default number of article from 90 to 20 in CryptomeBridge. --- bridges/CryptomeBridge.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bridges/CryptomeBridge.php b/bridges/CryptomeBridge.php index 39e3136..87df363 100644 --- a/bridges/CryptomeBridge.php +++ b/bridges/CryptomeBridge.php @@ -12,7 +12,7 @@ class CryptomeBridge extends BridgeAbstract{ public function collectData(array $param){ $html = ''; - $num = 90; + $num = 20; $link = 'http://cryptome.org/'; // If you want HTTPS access instead, uncomment the following line: //$link = 'https://secure.netsolhost.com/cryptome.org/'; From 7c5118959deeee9a8cb5833fd509d25db7ba2727 Mon Sep 17 00:00:00 2001 From: Sebastien SAUVAGE Date: Sun, 9 Feb 2014 15:08:03 +0100 Subject: [PATCH 004/161] Removed non-working bridges --- README.md | 3 --- bridges/FSBridge.php | 56 --------------------------------------- bridges/GuruMedBridge.php | 55 -------------------------------------- 3 files changed, 114 deletions(-) delete mode 100644 bridges/FSBridge.php delete mode 100644 bridges/GuruMedBridge.php diff --git a/README.md b/README.md index 1baf5d2..e8b5151 100644 --- a/README.md +++ b/README.md @@ -12,11 +12,8 @@ Supported sites/pages * `Identi.ca` : Identica user timeline (Should be compatible with other Pump.io instances). * `YouTube` : YouTube user channel feed. * `Cryptome` : Returns the most recent documents from Cryptome.org. - * `Futurasciences` : Returns the most recent articles from futura-sciences.com. - * `GuruMed`: Returns the most recent articles for gurumed.org * `DansTonChat`: Most recent quotes from danstonchat.com * `DuckDuckGo`: Most recent results from DuckDuckGo.com - * `FSBridge`: Most recent article (full text) from futura-sciences.com * `GuruMed`: Most recent entries (full text) from gurumed.org * `Instagram`: Most recent photos from an instagram.com user. * `OpenClassrooms`: Lastest tutorials from fr.openclassrooms.com. diff --git a/bridges/FSBridge.php b/bridges/FSBridge.php deleted file mode 100644 index 1ffd25d..0000000 --- a/bridges/FSBridge.php +++ /dev/null @@ -1,56 +0,0 @@ -', '', $string); - return $string; - } - function FS_ExtractContent($url) { - $html2 = file_get_html($url); - $text = $html2->find('div.fiche-actualite', 0)->innertext; - return $text; - } - $html = file_get_html('http://www.futura-sciences.com/rss/actualites.xml') or $this->returnError('Could not request Futura Sciences.', 404); - $limit = 0; - - foreach($html->find('item') as $element) { - if($limit < 20) { - $item = new \Item(); - $item->title = FS_StripCDATA($element->find('title', 0)->innertext); - $item->uri = FS_StripCDATA($element->find('guid', 0)->plaintext); - $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); - $item->content = FS_ExtractContent($item->uri); - $this->items[] = $item; - $limit++; - } - } - - } - - public function getName(){ - return 'Futura Sciences'; - } - - public function getURI(){ - return 'http://www.futura-sciences.com/'; - } - - public function getCacheDuration(){ - // return 3600; // 1 hour - return 0; // 1 hour - } -} diff --git a/bridges/GuruMedBridge.php b/bridges/GuruMedBridge.php deleted file mode 100644 index d312c24..0000000 --- a/bridges/GuruMedBridge.php +++ /dev/null @@ -1,55 +0,0 @@ -', '', $string); - return $string; - } - function GurumedExtractContent($url) { - $html2 = file_get_html($url); - $text = $html2->find('div.entry', 0)->innertext; - return $text; - } - $html = file_get_html('http://gurumed.org/feed') or $this->returnError('Could not request Gurumed.', 404); - $limit = 0; - - foreach($html->find('item') as $element) { - if($limit < 10) { - $item = new \Item(); - $item->title = GurumedStripCDATA($element->find('title', 0)->innertext); - $item->uri = GurumedStripCDATA($element->find('guid', 0)->plaintext); - $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); - $item->content = GurumedExtractContent($item->uri); - $this->items[] = $item; - $limit++; - } - } - - } - - public function getName(){ - return 'Gurumed'; - } - - public function getURI(){ - return 'http://gurumed.org/'; - } - - public function getCacheDuration(){ - return 3600; // 1 hour - } -} From a1152aee96838bb3a2a328539b4d94f94c7a4838 Mon Sep 17 00:00:00 2001 From: Sebastien SAUVAGE Date: Sun, 9 Feb 2014 15:15:15 +0100 Subject: [PATCH 005/161] Added parameter check of OpenClassRooms. --- bridges/OpenClassroomsBridge.php | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bridges/OpenClassroomsBridge.php b/bridges/OpenClassroomsBridge.php index d2d3c4a..966fd4e 100644 --- a/bridges/OpenClassroomsBridge.php +++ b/bridges/OpenClassroomsBridge.php @@ -11,8 +11,13 @@ class OpenClassroomsBridge extends BridgeAbstract{ public function collectData(array $param){ + if ($param['u']!='informatique' && $param['u']!='sciences') + { + $this->returnError('Error: You must chose "informatique" or "science".', 404); + } + $html = ''; - $link = 'http://fr.openclassrooms.com/'.$param[u].'/cours?title=&sort=updatedAt+desc'; + $link = 'http://fr.openclassrooms.com/'.$param['u'].'/cours?title=&sort=updatedAt+desc'; $html = file_get_html($link) or $this->returnError('Could not request OpenClassrooms.', 404); From b4b3c4b0acb38715da8f2d4414a835f78dfcbebb Mon Sep 17 00:00:00 2001 From: Sebastien SAUVAGE Date: Sun, 9 Feb 2014 15:20:52 +0100 Subject: [PATCH 006/161] Added parameters checks for PinterestBridge --- bridges/PinterestBridge.php | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/bridges/PinterestBridge.php b/bridges/PinterestBridge.php index 8c24ecd..7f58e1d 100644 --- a/bridges/PinterestBridge.php +++ b/bridges/PinterestBridge.php @@ -16,7 +16,18 @@ class PinterestBridge extends BridgeAbstract{ public function collectData(array $param){ $html = ''; - if (isset($param['u']) && isset($param['b'])) { + if (isset($param['u']) || isset($param['b'])) { + + if (empty($param['u'])) + { + $this->returnError('You must specify a Pinterest username (?u=...).', 400); + } + + if (empty($param['b'])) + { + $this->returnError('You must specify a Pinterest board for this username (?b=...).', 400); + } + $this->username = $param['u']; $this->board = $param['b']; $html = file_get_html($this->getURI().'/'.urlencode($this->username).'/'.urlencode($this->board)) or $this->returnError('Could not request Pinterest.', 404); From a5591d9f5431cf9b24664f42a13327c3c34dbb88 Mon Sep 17 00:00:00 2001 From: Sebastien SAUVAGE Date: Sun, 9 Feb 2014 15:33:02 +0100 Subject: [PATCH 007/161] Added Bandcamp Merged manually from https://github.com/sebsauvage/rss-bridge/pull/24 --- README.md | 1 + bridges/BandcampBridge.php | 45 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 bridges/BandcampBridge.php diff --git a/README.md b/README.md index e8b5151..a6dd6da 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ Supported sites/pages * `WikipediaENLatest`: highlighted articles from Wikipedia in English. * `WikipediaFRLatest`: highlighted articles from Wikipedia in French. * `WikipediaEOLatest`: highlighted articles from Wikipedia in Esperanto. + * `Bandcamp` : Returns last release from bandcamp for a tag Output format diff --git a/bridges/BandcampBridge.php b/bridges/BandcampBridge.php new file mode 100644 index 0000000..98eaf9f --- /dev/null +++ b/bridges/BandcampBridge.php @@ -0,0 +1,45 @@ +request = $param['tag']; + $html = file_get_html('http://bandcamp.com/tag/'.urlencode($this->request).'?sort_field=date') or $this->returnError('No results for this query.', 404); + } + else { + $this->returnError('You must specify tag (/tag/...)', 400); + } + + foreach($html->find('li.item') as $release) { + $item = new \Item(); + $item->name = $release->find('div.itemsubtext',0)->plaintext . ' - ' . $release->find('div.itemtext',0)->plaintext; + $item->title = $release->find('div.itemsubtext',0)->plaintext . ' - ' . $release->find('div.itemtext',0)->plaintext; + $item->content = '
' . $release->find('div.itemsubtext',0)->plaintext . ' - ' . $release->find('div.itemtext',0)->plaintext; + $item->id = $release->find('a',0)->getAttribute('href'); + $item->uri = $release->find('a',0)->getAttribute('href'); + $this->items[] = $item; + } + } + + public function getName(){ + return (!empty($this->request) ? $this->request .' - ' : '') .'Bandcamp Tag'; + } + + public function getURI(){ + return 'http://bandcamp.com'; + } + + public function getCacheDuration(){ + return 600; // 10 minutes + } +} From 589f1dbbad45c3aa4cc71d4507aa08151e856047 Mon Sep 17 00:00:00 2001 From: Sebastien SAUVAGE Date: Sun, 9 Feb 2014 15:37:16 +0100 Subject: [PATCH 008/161] Small update on README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a6dd6da..bfacaab 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ Requirements * php 5.3 * [PHP Simple HTML DOM Parser](http://simplehtmldom.sourceforge.net). (Put `simple_html_dom.php` in `vendor/simplehtmldom/`). - * Ssl lib activated in PHP config + * TLS lib activated in PHP config for some bridges. Author From 9c8a9d1d1dde160d7311645bea7b2d77f558f0b2 Mon Sep 17 00:00:00 2001 From: Sebastien SAUVAGE Date: Sun, 9 Feb 2014 15:59:16 +0100 Subject: [PATCH 009/161] Updated README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index bfacaab..8c1673e 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,7 @@ Patch/contributors : * [Astalaseven](https://github.com/Astalaseven) * [qwertygc](https://github.com/qwertygc) * [Djuuu](https://github.com/Djuuu) + * [Anadrark](https://github.com/Anadrark]) Licence === From 45f147ec24b74d7d370bca717db5d6eca3ec0a81 Mon Sep 17 00:00:00 2001 From: pauder Date: Tue, 11 Mar 2014 11:17:57 +0100 Subject: [PATCH 010/161] Use the full size image in feed instead of the small sized version --- bridges/PinterestBridge.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bridges/PinterestBridge.php b/bridges/PinterestBridge.php index 8c24ecd..78fb9e2 100644 --- a/bridges/PinterestBridge.php +++ b/bridges/PinterestBridge.php @@ -39,7 +39,7 @@ class PinterestBridge extends BridgeAbstract{ $item = new \Item(); $item->uri = $this->getURI().$a->getAttribute('href'); - $item->content = ''; + $item->content = ''; if (isset($this->query)) @@ -83,6 +83,6 @@ class PinterestBridge extends BridgeAbstract{ } public function getCacheDuration(){ - return 0; + return 3600; } } From e9902e2391a2dfe644e15ff9be6c6d0e3b329c56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20SAUVAGE?= Date: Wed, 14 May 2014 12:39:12 +0200 Subject: [PATCH 011/161] Updated from Mitsukarenai --- bridges/YoutubeBridge.php | 62 ++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/bridges/YoutubeBridge.php b/bridges/YoutubeBridge.php index 8d73925..c7d7111 100644 --- a/bridges/YoutubeBridge.php +++ b/bridges/YoutubeBridge.php @@ -4,8 +4,10 @@ * Returns the newest videos * * @name Youtube Bridge -* @description Returns the newest videos +* @description Returns the newest videos by username or playlist * @use1(u="username") +* @use2(p="playlist id") +* @use3(s="search keyword",pa="page") */ class YoutubeBridge extends BridgeAbstract{ @@ -16,20 +18,52 @@ class YoutubeBridge extends BridgeAbstract{ if (isset($param['u'])) { /* user timeline mode */ $this->request = $param['u']; $html = file_get_html('https://www.youtube.com/user/'.urlencode($this->request).'/videos') or $this->returnError('Could not request Youtube.', 404); + + foreach($html->find('li.channels-content-item') as $element) { + $item = new \Item(); + $item->uri = 'https://www.youtube.com'.$element->find('a',0)->href; + $item->thumbnailUri = 'https:'.$element->find('img',0)->src; + $item->title = trim($element->find('h3',0)->plaintext); + $item->content = '
' . $item->title . ''; + $this->items[] = $item; + } + } + else if (isset($param['p'])) { /* playlist mode */ + $this->request = $param['p']; + $html = file_get_html('https://www.youtube.com/playlist?list='.urlencode($this->request).'') or $this->returnError('Could not request Youtube.', 404); + + foreach($html->find('li.playlist-video-item') as $element) { + $item = new \Item(); + $item->uri = 'https://www.youtube.com'.$element->find('a',0)->href; + $item->thumbnailUri = 'https:'.$element->find('img',0)->src; + $item->title = trim($element->find('h3',0)->plaintext); + $item->content = '
' . $item->title . ''; + $this->items[] = $item; + } + $this->request = 'Playlist '.str_replace(' - YouTube', '', $html->find('title', 0)->plaintext).', by '.$html->find('h1', 0)->plaintext; + } + else if (isset($param['s'])) { /* search mode */ + $this->request = $param['s']; $page = 1; if (isset($param['pa'])) $page = (int)preg_replace("/[^0-9]/",'', $param['pa']); + $html = file_get_html('https://www.youtube.com/results?search_query='.urlencode($this->request).'&page='.$page.'&filters=video&search_sort=video_date_uploaded') or $this->returnError('Could not request Youtube.', 404); + + foreach($html->find('li.context-data-item') as $element) { + $item = new \Item(); + $item->uri = 'https://www.youtube.com'.$element->find('a',0)->href; + $checkthumb = $element->find('img', 0)->getAttribute('data-thumb'); + if($checkthumb !== FALSE) + $item->thumbnailUri = $checkthumb; + else + $item->thumbnailUri = ''.$element->find('img',0)->src; + $item->title = trim($element->find('h3',0)->plaintext); + $item->content = '
' . $item->title . ''; + $this->items[] = $item; + } + $this->request = 'Search: '.str_replace(' - YouTube', '', $html->find('title', 0)->plaintext); } else { - $this->returnError('You must specify a Youtbe username (?u=...).', 400); - } - - - foreach($html->find('li.channels-content-item') as $element) { - $item = new \Item(); - $item->uri = 'https://www.youtube.com'.$element->find('a',0)->href; - $item->thumbnailUri = 'https:'.$element->find('img',0)->src; - $item->title = trim($element->find('h3',0)->plaintext); - $item->content = '
' . $item->title . ''; - $this->items[] = $item; - } + $this->returnError('You must either specify a Youtube username (?u=...) or a playlist id (?p=...) or search (?s=...)', 400); + } + } public function getName(){ @@ -41,6 +75,6 @@ class YoutubeBridge extends BridgeAbstract{ } public function getCacheDuration(){ - return 21600; // 6 hours + return 10800; // 3 hours } } From e81a19a04ba460bb4bac68136ceb9a4cef1f0427 Mon Sep 17 00:00:00 2001 From: Mitsukarenai Date: Wed, 14 May 2014 14:27:57 +0200 Subject: [PATCH 012/161] [pull] date fallback for ATOM format --- formats/AtomFormat.php | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/formats/AtomFormat.php b/formats/AtomFormat.php index cdd0a9a..c9a1e0d 100644 --- a/formats/AtomFormat.php +++ b/formats/AtomFormat.php @@ -53,6 +53,10 @@ EOD; - : RFC look with xhtml, keep this in spite of ? */ +// #### TEMPORARY FIX ### +$feedTimestamp = date(DATE_ATOM, time()); +// ################ + /* Data are prepared, now let's begin the "MAGIE !!!" */ $toReturn = ''; $toReturn .= <<{$title} http{$https}://{$httpHost}{$httpInfo}/ - + {$feedTimestamp} {$entries} From 1e40d6f492486658748dc12b7824d750525b780c Mon Sep 17 00:00:00 2001 From: Mitsukarenai Date: Wed, 14 May 2014 14:34:06 +0200 Subject: [PATCH 013/161] [pull] Fix Youtube, Twitter --- bridges/TwitterBridge.php | 10 +++++----- bridges/YoutubeBridge.php | 18 +++++++++++------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/bridges/TwitterBridge.php b/bridges/TwitterBridge.php index fcc3967..cf85cb6 100644 --- a/bridges/TwitterBridge.php +++ b/bridges/TwitterBridge.php @@ -26,15 +26,15 @@ class TwitterBridge extends BridgeAbstract{ $this->returnError('You must specify a keyword (?q=...) or a Twitter username (?u=...).', 400); } - foreach($html->find('div.tweet') as $tweet) { + foreach($html->find('div.js-stream-tweet') as $tweet) { $item = new \Item(); - $item->username = trim(substr($tweet->find('span.username', 0)->plaintext, 1)); // extract username and sanitize + $item->username = $tweet->getAttribute('data-screen-name'); // extract username and sanitize $item->fullname = $tweet->getAttribute('data-name'); // extract fullname (pseudonym) $item->avatar = $tweet->find('img', 0)->src; // get avatar link $item->id = $tweet->getAttribute('data-tweet-id'); // get TweetID - $item->uri = 'https://twitter.com'.$tweet->find('a.details', 0)->getAttribute('href'); // get tweet link - $item->timestamp = $tweet->find('span._timestamp', 0)->getAttribute('data-time'); // extract tweet timestamp - $item->content = str_replace('href="/', 'href="https://twitter.com/', strip_tags($tweet->find('p.tweet-text', 0)->innertext, '')); // extract tweet text + $item->uri = 'https://twitter.com'.$tweet->find('a.js-permalink', 0)->getAttribute('href'); // get tweet link + $item->timestamp = $tweet->find('span.js-short-timestamp', 0)->getAttribute('data-time'); // extract tweet timestamp + $item->content = str_replace('href="/', 'href="https://twitter.com/', strip_tags($tweet->find('p.js-tweet-text', 0)->innertext, '')); // extract tweet text $item->title = $item->fullname . ' (@'. $item->username . ') | ' . $item->content; $this->items[] = $item; } diff --git a/bridges/YoutubeBridge.php b/bridges/YoutubeBridge.php index c7d7111..974d169 100644 --- a/bridges/YoutubeBridge.php +++ b/bridges/YoutubeBridge.php @@ -8,6 +8,10 @@ * @use1(u="username") * @use2(p="playlist id") * @use3(s="search keyword",pa="page") +* +* WARNING: to parse big playlists (over ~90 videos), you need to edit simple_html_dom.php: +* change: define('MAX_FILE_SIZE', 600000); +* into: define('MAX_FILE_SIZE', 900000); (or more) */ class YoutubeBridge extends BridgeAbstract{ @@ -32,21 +36,21 @@ class YoutubeBridge extends BridgeAbstract{ $this->request = $param['p']; $html = file_get_html('https://www.youtube.com/playlist?list='.urlencode($this->request).'') or $this->returnError('Could not request Youtube.', 404); - foreach($html->find('li.playlist-video-item') as $element) { + foreach($html->find('tr.pl-video') as $element) { $item = new \Item(); - $item->uri = 'https://www.youtube.com'.$element->find('a',0)->href; - $item->thumbnailUri = 'https:'.$element->find('img',0)->src; - $item->title = trim($element->find('h3',0)->plaintext); + $item->uri = 'https://www.youtube.com'.$element->find('.pl-video-title a',0)->href; + $item->thumbnailUri = 'https:'.str_replace('/default.','/mqdefault.',$element->find('.pl-video-thumbnail img',0)->src); + $item->title = trim($element->find('.pl-video-title a',0)->plaintext); $item->content = '
' . $item->title . ''; $this->items[] = $item; } - $this->request = 'Playlist '.str_replace(' - YouTube', '', $html->find('title', 0)->plaintext).', by '.$html->find('h1', 0)->plaintext; + $this->request = 'Playlist '.trim(str_replace(' - YouTube', '', $html->find('title', 0)->plaintext)).', by '.$html->find('h1', 0)->plaintext; } else if (isset($param['s'])) { /* search mode */ $this->request = $param['s']; $page = 1; if (isset($param['pa'])) $page = (int)preg_replace("/[^0-9]/",'', $param['pa']); - $html = file_get_html('https://www.youtube.com/results?search_query='.urlencode($this->request).'&page='.$page.'&filters=video&search_sort=video_date_uploaded') or $this->returnError('Could not request Youtube.', 404); + $html = file_get_html('https://www.youtube.com/results?search_query='.urlencode($this->request).'&&page='.$page.'&filters=video&search_sort=video_date_uploaded') or $this->returnError('Could not request Youtube.', 404); - foreach($html->find('li.context-data-item') as $element) { + foreach($html->find('li.yt-lockup') as $element) { $item = new \Item(); $item->uri = 'https://www.youtube.com'.$element->find('a',0)->href; $checkthumb = $element->find('img', 0)->getAttribute('data-thumb'); From 13da0112f234d5d139276278f213a3feb3b4b5f2 Mon Sep 17 00:00:00 2001 From: Mitsukarenai Date: Wed, 14 May 2014 21:34:07 +0200 Subject: [PATCH 014/161] [pull/beta] UA spoofing --- index.php | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/index.php b/index.php index a95fcbb..e9754b0 100644 --- a/index.php +++ b/index.php @@ -15,6 +15,10 @@ date_default_timezone_set('UTC'); error_reporting(0); //ini_set('display_errors','1'); error_reporting(E_ALL); // For debugging only. +// FIXME : beta test UA spoofing, please report any blacklisting by PHP-fopen-unfriendly websites +ini_set('user_agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:30.0) Gecko/20121202 Firefox/30.0 (rss-bridge/0.1; +https://github.com/sebsauvage/rss-bridge)'); +// ------- + try{ require_once __DIR__ . '/lib/RssBridge.php'; @@ -32,8 +36,7 @@ try{ $format = $_REQUEST['format']; unset($_REQUEST['format']); - // FIXME : necessary ? - // ini_set('user_agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:20.0) Gecko/20100101 Firefox/20.0'); + $cache = Cache::create('FileCache'); @@ -135,4 +138,4 @@ $formats = Format::searchInformation(); RSS-Bridge alpha 0.1 - \ No newline at end of file + From e504573fea67972d91ea8b96f91911bfb40ca362 Mon Sep 17 00:00:00 2001 From: Mitsukarenai Date: Wed, 21 May 2014 17:21:53 +0200 Subject: [PATCH 015/161] updt readme --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8c1673e..ad6194c 100644 --- a/README.md +++ b/README.md @@ -48,8 +48,7 @@ Minecraft hashtag (#Minecraft) search on Twitter, in ATOM format (as displayed b Requirements === - * php 5.3 - * [PHP Simple HTML DOM Parser](http://simplehtmldom.sourceforge.net). (Put `simple_html_dom.php` in `vendor/simplehtmldom/`). + * PHP 5.3 * TLS lib activated in PHP config for some bridges. @@ -72,6 +71,8 @@ Licence === Code is public domain. +Included `PHP Simple HTML DOM Parser` is under the [MIT License](http://opensource.org/licenses/MIT) + Technical notes === From 7bee7773623afcb04bc312072c7ce10b863583eb Mon Sep 17 00:00:00 2001 From: Mitsukarenai Date: Wed, 21 May 2014 17:25:59 +0200 Subject: [PATCH 016/161] add simple_html_dom --- vendor/simplehtmldom/simple_html_dom.php | 1742 ++++++++++++++++++++++ 1 file changed, 1742 insertions(+) create mode 100644 vendor/simplehtmldom/simple_html_dom.php diff --git a/vendor/simplehtmldom/simple_html_dom.php b/vendor/simplehtmldom/simple_html_dom.php new file mode 100644 index 0000000..b5d3089 --- /dev/null +++ b/vendor/simplehtmldom/simple_html_dom.php @@ -0,0 +1,1742 @@ +size is the "real" number of bytes the dom was created from. + * but for most purposes, it's a really good estimation. + * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors. + * Allow the user to tell us how much they trust the html. + * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node. + * This allows for us to find tags based on the text they contain. + * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag. + * Paperg: added parse_charset so that we know about the character set of the source document. + * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type availalbe, we will assume it's returning the content-type header from the + * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection. + * + * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that. + * PaperG (John Schlick) Added get_display_size for "IMG" tags. + * + * Licensed under The MIT License + * Redistributions of files must retain the above copyright notice. + * + * @author S.C. Chen + * @author John Schlick + * @author Rus Carroll + * @version 1.5 ($Rev: 208 $) + * @package PlaceLocalInclude + * @subpackage simple_html_dom + */ + +/** + * All of the Defines for the classes below. + * @author S.C. Chen + */ +define('HDOM_TYPE_ELEMENT', 1); +define('HDOM_TYPE_COMMENT', 2); +define('HDOM_TYPE_TEXT', 3); +define('HDOM_TYPE_ENDTAG', 4); +define('HDOM_TYPE_ROOT', 5); +define('HDOM_TYPE_UNKNOWN', 6); +define('HDOM_QUOTE_DOUBLE', 0); +define('HDOM_QUOTE_SINGLE', 1); +define('HDOM_QUOTE_NO', 3); +define('HDOM_INFO_BEGIN', 0); +define('HDOM_INFO_END', 1); +define('HDOM_INFO_QUOTE', 2); +define('HDOM_INFO_SPACE', 3); +define('HDOM_INFO_TEXT', 4); +define('HDOM_INFO_INNER', 5); +define('HDOM_INFO_OUTER', 6); +define('HDOM_INFO_ENDSPACE',7); +define('DEFAULT_TARGET_CHARSET', 'UTF-8'); +define('DEFAULT_BR_TEXT', "\r\n"); +define('DEFAULT_SPAN_TEXT', " "); +define('MAX_FILE_SIZE', 10000000); +// helper functions +// ----------------------------------------------------------------------------- +// get html dom from file +// $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1. +function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) +{ + // We DO force the tags to be terminated. + $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText); + // For sourceforge users: uncomment the next line and comment the retreive_url_contents line 2 lines down if it is not already done. + $contents = file_get_contents($url, $use_include_path, $context, $offset); + // Paperg - use our own mechanism for getting the contents as we want to control the timeout. + //$contents = retrieve_url_contents($url); + if (empty($contents) || strlen($contents) > MAX_FILE_SIZE) + { + return false; + } + // The second parameter can force the selectors to all be lowercase. + $dom->load($contents, $lowercase, $stripRN); + return $dom; +} + +// get html dom from string +function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) +{ + $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText); + if (empty($str) || strlen($str) > MAX_FILE_SIZE) + { + $dom->clear(); + return false; + } + $dom->load($str, $lowercase, $stripRN); + return $dom; +} + +// dump html dom tree +function dump_html_tree($node, $show_attr=true, $deep=0) +{ + $node->dump($node); +} + + +/** + * simple html dom node + * PaperG - added ability for "find" routine to lowercase the value of the selector. + * PaperG - added $tag_start to track the start position of the tag in the total byte index + * + * @package PlaceLocalInclude + */ +class simple_html_dom_node +{ + public $nodetype = HDOM_TYPE_TEXT; + public $tag = 'text'; + public $attr = array(); + public $children = array(); + public $nodes = array(); + public $parent = null; + // The "info" array - see HDOM_INFO_... for what each element contains. + public $_ = array(); + public $tag_start = 0; + private $dom = null; + + function __construct($dom) + { + $this->dom = $dom; + $dom->nodes[] = $this; + } + + function __destruct() + { + $this->clear(); + } + + function __toString() + { + return $this->outertext(); + } + + // clean up memory due to php5 circular references memory leak... + function clear() + { + $this->dom = null; + $this->nodes = null; + $this->parent = null; + $this->children = null; + } + + // dump node's tree + function dump($show_attr=true, $deep=0) + { + $lead = str_repeat(' ', $deep); + + echo $lead.$this->tag; + if ($show_attr && count($this->attr)>0) + { + echo '('; + foreach ($this->attr as $k=>$v) + echo "[$k]=>\"".$this->$k.'", '; + echo ')'; + } + echo "\n"; + + if ($this->nodes) + { + foreach ($this->nodes as $c) + { + $c->dump($show_attr, $deep+1); + } + } + } + + + // Debugging function to dump a single dom node with a bunch of information about it. + function dump_node($echo=true) + { + + $string = $this->tag; + if (count($this->attr)>0) + { + $string .= '('; + foreach ($this->attr as $k=>$v) + { + $string .= "[$k]=>\"".$this->$k.'", '; + } + $string .= ')'; + } + if (count($this->_)>0) + { + $string .= ' $_ ('; + foreach ($this->_ as $k=>$v) + { + if (is_array($v)) + { + $string .= "[$k]=>("; + foreach ($v as $k2=>$v2) + { + $string .= "[$k2]=>\"".$v2.'", '; + } + $string .= ")"; + } else { + $string .= "[$k]=>\"".$v.'", '; + } + } + $string .= ")"; + } + + if (isset($this->text)) + { + $string .= " text: (" . $this->text . ")"; + } + + $string .= " HDOM_INNER_INFO: '"; + if (isset($node->_[HDOM_INFO_INNER])) + { + $string .= $node->_[HDOM_INFO_INNER] . "'"; + } + else + { + $string .= ' NULL '; + } + + $string .= " children: " . count($this->children); + $string .= " nodes: " . count($this->nodes); + $string .= " tag_start: " . $this->tag_start; + $string .= "\n"; + + if ($echo) + { + echo $string; + return; + } + else + { + return $string; + } + } + + // returns the parent of node + // If a node is passed in, it will reset the parent of the current node to that one. + function parent($parent=null) + { + // I am SURE that this doesn't work properly. + // It fails to unset the current node from it's current parents nodes or children list first. + if ($parent !== null) + { + $this->parent = $parent; + $this->parent->nodes[] = $this; + $this->parent->children[] = $this; + } + + return $this->parent; + } + + // verify that node has children + function has_child() + { + return !empty($this->children); + } + + // returns children of node + function children($idx=-1) + { + if ($idx===-1) + { + return $this->children; + } + if (isset($this->children[$idx])) + { + return $this->children[$idx]; + } + return null; + } + + // returns the first child of node + function first_child() + { + if (count($this->children)>0) + { + return $this->children[0]; + } + return null; + } + + // returns the last child of node + function last_child() + { + if (($count=count($this->children))>0) + { + return $this->children[$count-1]; + } + return null; + } + + // returns the next sibling of node + function next_sibling() + { + if ($this->parent===null) + { + return null; + } + + $idx = 0; + $count = count($this->parent->children); + while ($idx<$count && $this!==$this->parent->children[$idx]) + { + ++$idx; + } + if (++$idx>=$count) + { + return null; + } + return $this->parent->children[$idx]; + } + + // returns the previous sibling of node + function prev_sibling() + { + if ($this->parent===null) return null; + $idx = 0; + $count = count($this->parent->children); + while ($idx<$count && $this!==$this->parent->children[$idx]) + ++$idx; + if (--$idx<0) return null; + return $this->parent->children[$idx]; + } + + // function to locate a specific ancestor tag in the path to the root. + function find_ancestor_tag($tag) + { + global $debug_object; + if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } + + // Start by including ourselves in the comparison. + $returnDom = $this; + + while (!is_null($returnDom)) + { + if (is_object($debug_object)) { $debug_object->debug_log(2, "Current tag is: " . $returnDom->tag); } + + if ($returnDom->tag == $tag) + { + break; + } + $returnDom = $returnDom->parent; + } + return $returnDom; + } + + // get dom node's inner html + function innertext() + { + if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER]; + if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); + + $ret = ''; + foreach ($this->nodes as $n) + $ret .= $n->outertext(); + return $ret; + } + + // get dom node's outer text (with tag) + function outertext() + { + global $debug_object; + if (is_object($debug_object)) + { + $text = ''; + if ($this->tag == 'text') + { + if (!empty($this->text)) + { + $text = " with text: " . $this->text; + } + } + $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text); + } + + if ($this->tag==='root') return $this->innertext(); + + // trigger callback + if ($this->dom && $this->dom->callback!==null) + { + call_user_func_array($this->dom->callback, array($this)); + } + + if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER]; + if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); + + // render begin tag + if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) + { + $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup(); + } else { + $ret = ""; + } + + // render inner text + if (isset($this->_[HDOM_INFO_INNER])) + { + // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added. + if ($this->tag != "br") + { + $ret .= $this->_[HDOM_INFO_INNER]; + } + } else { + if ($this->nodes) + { + foreach ($this->nodes as $n) + { + $ret .= $this->convert_text($n->outertext()); + } + } + } + + // render end tag + if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0) + $ret .= 'tag.'>'; + return $ret; + } + + // get dom node's plain text + function text() + { + if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER]; + switch ($this->nodetype) + { + case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); + case HDOM_TYPE_COMMENT: return ''; + case HDOM_TYPE_UNKNOWN: return ''; + } + if (strcasecmp($this->tag, 'script')===0) return ''; + if (strcasecmp($this->tag, 'style')===0) return ''; + + $ret = ''; + // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL. + // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening. + // WHY is this happening? + if (!is_null($this->nodes)) + { + foreach ($this->nodes as $n) + { + $ret .= $this->convert_text($n->text()); + } + + // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all. + if ($this->tag == "span") + { + $ret .= $this->dom->default_span_text; + } + + + } + return $ret; + } + + function xmltext() + { + $ret = $this->innertext(); + $ret = str_ireplace('', '', $ret); + return $ret; + } + + // build node's text with tag + function makeup() + { + // text, comment, unknown + if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); + + $ret = '<'.$this->tag; + $i = -1; + + foreach ($this->attr as $key=>$val) + { + ++$i; + + // skip removed attribute + if ($val===null || $val===false) + continue; + + $ret .= $this->_[HDOM_INFO_SPACE][$i][0]; + //no value attr: nowrap, checked selected... + if ($val===true) + $ret .= $key; + else { + switch ($this->_[HDOM_INFO_QUOTE][$i]) + { + case HDOM_QUOTE_DOUBLE: $quote = '"'; break; + case HDOM_QUOTE_SINGLE: $quote = '\''; break; + default: $quote = ''; + } + $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote; + } + } + $ret = $this->dom->restore_noise($ret); + return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>'; + } + + // find elements by css selector + //PaperG - added ability for find to lowercase the value of the selector. + function find($selector, $idx=null, $lowercase=false) + { + $selectors = $this->parse_selector($selector); + if (($count=count($selectors))===0) return array(); + $found_keys = array(); + + // find each selector + for ($c=0; $c<$count; ++$c) + { + // The change on the below line was documented on the sourceforge code tracker id 2788009 + // used to be: if (($levle=count($selectors[0]))===0) return array(); + if (($levle=count($selectors[$c]))===0) return array(); + if (!isset($this->_[HDOM_INFO_BEGIN])) return array(); + + $head = array($this->_[HDOM_INFO_BEGIN]=>1); + + // handle descendant selectors, no recursive! + for ($l=0; $l<$levle; ++$l) + { + $ret = array(); + foreach ($head as $k=>$v) + { + $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k]; + //PaperG - Pass this optional parameter on to the seek function. + $n->seek($selectors[$c][$l], $ret, $lowercase); + } + $head = $ret; + } + + foreach ($head as $k=>$v) + { + if (!isset($found_keys[$k])) + { + $found_keys[$k] = 1; + } + } + } + + // sort keys + ksort($found_keys); + + $found = array(); + foreach ($found_keys as $k=>$v) + $found[] = $this->dom->nodes[$k]; + + // return nth-element or array + if (is_null($idx)) return $found; + else if ($idx<0) $idx = count($found) + $idx; + return (isset($found[$idx])) ? $found[$idx] : null; + } + + // seek for given conditions + // PaperG - added parameter to allow for case insensitive testing of the value of a selector. + protected function seek($selector, &$ret, $lowercase=false) + { + global $debug_object; + if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } + + list($tag, $key, $val, $exp, $no_key) = $selector; + + // xpath index + if ($tag && $key && is_numeric($key)) + { + $count = 0; + foreach ($this->children as $c) + { + if ($tag==='*' || $tag===$c->tag) { + if (++$count==$key) { + $ret[$c->_[HDOM_INFO_BEGIN]] = 1; + return; + } + } + } + return; + } + + $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0; + if ($end==0) { + $parent = $this->parent; + while (!isset($parent->_[HDOM_INFO_END]) && $parent!==null) { + $end -= 1; + $parent = $parent->parent; + } + $end += $parent->_[HDOM_INFO_END]; + } + + for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) { + $node = $this->dom->nodes[$i]; + + $pass = true; + + if ($tag==='*' && !$key) { + if (in_array($node, $this->children, true)) + $ret[$i] = 1; + continue; + } + + // compare tag + if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;} + // compare key + if ($pass && $key) { + if ($no_key) { + if (isset($node->attr[$key])) $pass=false; + } else { + if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false; + } + } + // compare value + if ($pass && $key && $val && $val!=='*') { + // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right? + if ($key == "plaintext") { + // $node->plaintext actually returns $node->text(); + $nodeKeyValue = $node->text(); + } else { + // this is a normal search, we want the value of that attribute of the tag. + $nodeKeyValue = $node->attr[$key]; + } + if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);} + + //PaperG - If lowercase is set, do a case insensitive test of the value of the selector. + if ($lowercase) { + $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue)); + } else { + $check = $this->match($exp, $val, $nodeKeyValue); + } + if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));} + + // handle multiple class + if (!$check && strcasecmp($key, 'class')===0) { + foreach (explode(' ',$node->attr[$key]) as $k) { + // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form. + if (!empty($k)) { + if ($lowercase) { + $check = $this->match($exp, strtolower($val), strtolower($k)); + } else { + $check = $this->match($exp, $val, $k); + } + if ($check) break; + } + } + } + if (!$check) $pass = false; + } + if ($pass) $ret[$i] = 1; + unset($node); + } + // It's passed by reference so this is actually what this function returns. + if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);} + } + + protected function match($exp, $pattern, $value) { + global $debug_object; + if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} + + switch ($exp) { + case '=': + return ($value===$pattern); + case '!=': + return ($value!==$pattern); + case '^=': + return preg_match("/^".preg_quote($pattern,'/')."/", $value); + case '$=': + return preg_match("/".preg_quote($pattern,'/')."$/", $value); + case '*=': + if ($pattern[0]=='/') { + return preg_match($pattern, $value); + } + return preg_match("/".$pattern."/i", $value); + } + return false; + } + + protected function parse_selector($selector_string) { + global $debug_object; + if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} + + // pattern of CSS selectors, modified from mootools + // Paperg: Add the colon to the attrbute, so that it properly finds like google does. + // Note: if you try to look at this attribute, yo MUST use getAttribute since $dom->x:y will fail the php syntax check. +// Notice the \[ starting the attbute? and the @? following? This implies that an attribute can begin with an @ sign that is not captured. +// This implies that an html attribute specifier may start with an @ sign that is NOT captured by the expression. +// farther study is required to determine of this should be documented or removed. +// $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; + $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; + preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER); + if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);} + + $selectors = array(); + $result = array(); + //print_r($matches); + + foreach ($matches as $m) { + $m[0] = trim($m[0]); + if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue; + // for browser generated xpath + if ($m[1]==='tbody') continue; + + list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false); + if (!empty($m[2])) {$key='id'; $val=$m[2];} + if (!empty($m[3])) {$key='class'; $val=$m[3];} + if (!empty($m[4])) {$key=$m[4];} + if (!empty($m[5])) {$exp=$m[5];} + if (!empty($m[6])) {$val=$m[6];} + + // convert to lowercase + if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower($key);} + //elements that do NOT have the specified attribute + if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;} + + $result[] = array($tag, $key, $val, $exp, $no_key); + if (trim($m[7])===',') { + $selectors[] = $result; + $result = array(); + } + } + if (count($result)>0) + $selectors[] = $result; + return $selectors; + } + + function __get($name) + { + if (isset($this->attr[$name])) + { + return $this->convert_text($this->attr[$name]); + } + switch ($name) + { + case 'outertext': return $this->outertext(); + case 'innertext': return $this->innertext(); + case 'plaintext': return $this->text(); + case 'xmltext': return $this->xmltext(); + default: return array_key_exists($name, $this->attr); + } + } + + function __set($name, $value) + { + global $debug_object; + if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} + + switch ($name) + { + case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value; + case 'innertext': + if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value; + return $this->_[HDOM_INFO_INNER] = $value; + } + if (!isset($this->attr[$name])) + { + $this->_[HDOM_INFO_SPACE][] = array(' ', '', ''); + $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; + } + $this->attr[$name] = $value; + } + + function __isset($name) + { + switch ($name) + { + case 'outertext': return true; + case 'innertext': return true; + case 'plaintext': return true; + } + //no value attr: nowrap, checked selected... + return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]); + } + + function __unset($name) { + if (isset($this->attr[$name])) + unset($this->attr[$name]); + } + + // PaperG - Function to convert the text from one character set to another if the two sets are not the same. + function convert_text($text) + { + global $debug_object; + if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} + + $converted_text = $text; + + $sourceCharset = ""; + $targetCharset = ""; + + if ($this->dom) + { + $sourceCharset = strtoupper($this->dom->_charset); + $targetCharset = strtoupper($this->dom->_target_charset); + } + if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);} + + if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0)) + { + // Check if the reported encoding could have been incorrect and the text is actually already UTF-8 + if ((strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text))) + { + $converted_text = $text; + } + else + { + $converted_text = iconv($sourceCharset, $targetCharset, $text); + } + } + + // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output. + if ($targetCharset == 'UTF-8') + { + if (substr($converted_text, 0, 3) == "\xef\xbb\xbf") + { + $converted_text = substr($converted_text, 3); + } + if (substr($converted_text, -3) == "\xef\xbb\xbf") + { + $converted_text = substr($converted_text, 0, -3); + } + } + + return $converted_text; + } + + /** + * Returns true if $string is valid UTF-8 and false otherwise. + * + * @param mixed $str String to be tested + * @return boolean + */ + static function is_utf8($str) + { + $c=0; $b=0; + $bits=0; + $len=strlen($str); + for($i=0; $i<$len; $i++) + { + $c=ord($str[$i]); + if($c > 128) + { + if(($c >= 254)) return false; + elseif($c >= 252) $bits=6; + elseif($c >= 248) $bits=5; + elseif($c >= 240) $bits=4; + elseif($c >= 224) $bits=3; + elseif($c >= 192) $bits=2; + else return false; + if(($i+$bits) > $len) return false; + while($bits > 1) + { + $i++; + $b=ord($str[$i]); + if($b < 128 || $b > 191) return false; + $bits--; + } + } + } + return true; + } + /* + function is_utf8($string) + { + //this is buggy + return (utf8_encode(utf8_decode($string)) == $string); + } + */ + + /** + * Function to try a few tricks to determine the displayed size of an img on the page. + * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types. + * + * @author John Schlick + * @version April 19 2012 + * @return array an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out. + */ + function get_display_size() + { + global $debug_object; + + $width = -1; + $height = -1; + + if ($this->tag !== 'img') + { + return false; + } + + // See if there is aheight or width attribute in the tag itself. + if (isset($this->attr['width'])) + { + $width = $this->attr['width']; + } + + if (isset($this->attr['height'])) + { + $height = $this->attr['height']; + } + + // Now look for an inline style. + if (isset($this->attr['style'])) + { + // Thanks to user gnarf from stackoverflow for this regular expression. + $attributes = array(); + preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER); + foreach ($matches as $match) { + $attributes[$match[1]] = $match[2]; + } + + // If there is a width in the style attributes: + if (isset($attributes['width']) && $width == -1) + { + // check that the last two characters are px (pixels) + if (strtolower(substr($attributes['width'], -2)) == 'px') + { + $proposed_width = substr($attributes['width'], 0, -2); + // Now make sure that it's an integer and not something stupid. + if (filter_var($proposed_width, FILTER_VALIDATE_INT)) + { + $width = $proposed_width; + } + } + } + + // If there is a width in the style attributes: + if (isset($attributes['height']) && $height == -1) + { + // check that the last two characters are px (pixels) + if (strtolower(substr($attributes['height'], -2)) == 'px') + { + $proposed_height = substr($attributes['height'], 0, -2); + // Now make sure that it's an integer and not something stupid. + if (filter_var($proposed_height, FILTER_VALIDATE_INT)) + { + $height = $proposed_height; + } + } + } + + } + + // Future enhancement: + // Look in the tag to see if there is a class or id specified that has a height or width attribute to it. + + // Far future enhancement + // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width + // Note that in this case, the class or id will have the img subselector for it to apply to the image. + + // ridiculously far future development + // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page. + + $result = array('height' => $height, + 'width' => $width); + return $result; + } + + // camel naming conventions + function getAllAttributes() {return $this->attr;} + function getAttribute($name) {return $this->__get($name);} + function setAttribute($name, $value) {$this->__set($name, $value);} + function hasAttribute($name) {return $this->__isset($name);} + function removeAttribute($name) {$this->__set($name, null);} + function getElementById($id) {return $this->find("#$id", 0);} + function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);} + function getElementByTagName($name) {return $this->find($name, 0);} + function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);} + function parentNode() {return $this->parent();} + function childNodes($idx=-1) {return $this->children($idx);} + function firstChild() {return $this->first_child();} + function lastChild() {return $this->last_child();} + function nextSibling() {return $this->next_sibling();} + function previousSibling() {return $this->prev_sibling();} + function hasChildNodes() {return $this->has_child();} + function nodeName() {return $this->tag;} + function appendChild($node) {$node->parent($this); return $node;} + +} + +/** + * simple html dom parser + * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector. + * Paperg - change $size from protected to public so we can easily access it + * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it. + * + * @package PlaceLocalInclude + */ +class simple_html_dom +{ + public $root = null; + public $nodes = array(); + public $callback = null; + public $lowercase = false; + // Used to keep track of how large the text was when we started. + public $original_size; + public $size; + protected $pos; + protected $doc; + protected $char; + protected $cursor; + protected $parent; + protected $noise = array(); + protected $token_blank = " \t\r\n"; + protected $token_equal = ' =/>'; + protected $token_slash = " />\r\n\t"; + protected $token_attr = ' >'; + // Note that this is referenced by a child node, and so it needs to be public for that node to see this information. + public $_charset = ''; + public $_target_charset = ''; + protected $default_br_text = ""; + public $default_span_text = ""; + + // use isset instead of in_array, performance boost about 30%... + protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1); + protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1); + // Known sourceforge issue #2977341 + // B tags that are not closed cause us to return everything to the end of the document. + protected $optional_closing_tags = array( + 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1), + 'th'=>array('th'=>1), + 'td'=>array('td'=>1), + 'li'=>array('li'=>1), + 'dt'=>array('dt'=>1, 'dd'=>1), + 'dd'=>array('dd'=>1, 'dt'=>1), + 'dl'=>array('dd'=>1, 'dt'=>1), + 'p'=>array('p'=>1), + 'nobr'=>array('nobr'=>1), + 'b'=>array('b'=>1), + 'option'=>array('option'=>1), + ); + + function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) + { + if ($str) + { + if (preg_match("/^http:\/\//i",$str) || is_file($str)) + { + $this->load_file($str); + } + else + { + $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); + } + } + // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html. + if (!$forceTagsClosed) { + $this->optional_closing_array=array(); + } + $this->_target_charset = $target_charset; + } + + function __destruct() + { + $this->clear(); + } + + // load html from string + function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) + { + global $debug_object; + + // prepare + $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); + // strip out cdata + $this->remove_noise("''is", true); + // strip out comments + $this->remove_noise("''is"); + // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 + // Script tags removal now preceeds style tag removal. + // strip out @si', '', $text); + return $text; + } + $html = file_get_html('http://www.futura-sciences.com/rss/actualites.xml') or $this->returnError('Could not request Futura Sciences.', 404); + $limit = 0; + + foreach($html->find('item') as $element) { + if($limit < 20) { + $item = new \Item(); + $item->title = FS_StripCDATA($element->find('title', 0)->innertext); + $item->uri = FS_StripCDATA($element->find('guid', 0)->plaintext); + $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); + $item->content = FS_ExtractContent($item->uri); + $this->items[] = $item; + $limit++; + } + } + + } + + public function getName(){ + return 'Futura Sciences'; + } + + public function getURI(){ + return 'http://www.futura-sciences.com/'; + } + + public function getCacheDuration(){ + return 3600; // 1 hour + // return 0; // 1 hour + } +} From ee19e5c6b53015354682b9178ab6a326164de81c Mon Sep 17 00:00:00 2001 From: Qwerty Date: Fri, 30 May 2014 12:11:02 +0200 Subject: [PATCH 041/161] Create OpenTheoryBridge.php --- bridges/OpenTheoryBridge.php | 58 ++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 bridges/OpenTheoryBridge.php diff --git a/bridges/OpenTheoryBridge.php b/bridges/OpenTheoryBridge.php new file mode 100644 index 0000000..adf4718 --- /dev/null +++ b/bridges/OpenTheoryBridge.php @@ -0,0 +1,58 @@ +', '', $string); + return $string; + } + function ExtractContent($url) { + $html2 = file_get_html($url); + $text = $html2->find('div.entry-content', 0)->innertext; + $text = preg_replace('@]*?>.*?@si', '', $text); + return $text; + } + $html = file_get_html('http://open1theory.com/feed') or $this->returnError('Could not request OpenTheory.', 404); + $limit = 0; + + foreach($html->find('item') as $element) { + if($limit < 10) { + $item = new \Item(); + $item->title = StripCDATA($element->find('title', 0)->innertext); + $item->uri = StripCDATA($element->find('guid', 0)->plaintext); + $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); + $item->content = ExtractContent($item->uri); + $this->items[] = $item; + $limit++; + } + } + + } + + public function getName(){ + return 'OpenTheory'; + } + + public function getURI(){ + return 'http://open1theory.com/feed'; + } + + public function getCacheDuration(){ + return 3600; // 1 hour + // return 0; // 1 hour + } +} From e816b2aa805caef02bf8d2ab63ca5cca2ba671bd Mon Sep 17 00:00:00 2001 From: Mitsukarenai Date: Fri, 30 May 2014 14:07:34 +0200 Subject: [PATCH 042/161] Fix post limit for FS, GuruMed, OpenTheory --- bridges/FSBridge.php | 4 ++-- bridges/GuruMedBridge.php | 6 +++--- bridges/OpenTheoryBridge.php | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/bridges/FSBridge.php b/bridges/FSBridge.php index 54876ba..1011b16 100644 --- a/bridges/FSBridge.php +++ b/bridges/FSBridge.php @@ -4,7 +4,7 @@ * Returns the 5 newest posts from http://www.futura-sciences.com (full text) * * @name Futurasciences -* @description Returns the 20 newest posts from FS (full text) +* @description Returns the 5 newest posts from FS (full text) *@maintainer qwertygc */ class FSBridge extends BridgeAbstract{ @@ -30,7 +30,7 @@ class FSBridge extends BridgeAbstract{ $limit = 0; foreach($html->find('item') as $element) { - if($limit < 20) { + if($limit < 5) { $item = new \Item(); $item->title = FS_StripCDATA($element->find('title', 0)->innertext); $item->uri = FS_StripCDATA($element->find('guid', 0)->plaintext); diff --git a/bridges/GuruMedBridge.php b/bridges/GuruMedBridge.php index 4fbd01c..1f44ec7 100644 --- a/bridges/GuruMedBridge.php +++ b/bridges/GuruMedBridge.php @@ -1,10 +1,10 @@ find('item') as $element) { - if($limit < 10) { + if($limit < 5) { $item = new \Item(); $item->title = GurumedStripCDATA($element->find('title', 0)->innertext); $item->uri = GurumedStripCDATA($element->find('guid', 0)->plaintext); diff --git a/bridges/OpenTheoryBridge.php b/bridges/OpenTheoryBridge.php index adf4718..f3f7cdb 100644 --- a/bridges/OpenTheoryBridge.php +++ b/bridges/OpenTheoryBridge.php @@ -1,10 +1,10 @@ find('item') as $element) { - if($limit < 10) { + if($limit < 5) { $item = new \Item(); $item->title = StripCDATA($element->find('title', 0)->innertext); $item->uri = StripCDATA($element->find('guid', 0)->plaintext); From 237886feb6092e6bd4c6b5adebc84b690a37340e Mon Sep 17 00:00:00 2001 From: Mitsukarenai Date: Fri, 30 May 2014 17:53:48 +0200 Subject: [PATCH 043/161] Add CoinDesk, update Maliki --- bridges/CoinDeskBridge.php | 56 ++++++++++++++++++++++++++++++++++++++ bridges/MalikiBridge.php | 32 +++++++++++++++++----- 2 files changed, 81 insertions(+), 7 deletions(-) create mode 100644 bridges/CoinDeskBridge.php diff --git a/bridges/CoinDeskBridge.php b/bridges/CoinDeskBridge.php new file mode 100644 index 0000000..6dc5a09 --- /dev/null +++ b/bridges/CoinDeskBridge.php @@ -0,0 +1,56 @@ +', '', $string); + return $string; + } + function CoinDeskExtractContent($url) { + $html2 = file_get_html($url); + $text = $html2->find('div.single-content', 0)->innertext; + $text = strip_tags($text, '

'); + return $text; + } + $html = file_get_html('http://www.coindesk.com/feed/atom/') or $this->returnError('Could not request CoinDesk.', 404); + $limit = 0; + + foreach($html->find('entry') as $element) { + if($limit < 5) { + $item = new \Item(); + $item->title = CoinDeskStripCDATA($element->find('title', 0)->innertext); + $item->author = $element->find('author', 0)->plaintext; + $item->uri = $element->find('link', 0)->href; + $item->timestamp = strtotime($element->find('published', 0)->plaintext); + $item->content = CoinDeskExtractContent($item->uri); + $this->items[] = $item; + $limit++; + } + } + + } + + public function getName(){ + return 'CoinDesk'; + } + + public function getURI(){ + return 'http://www.coindesk.com/'; + } + + public function getCacheDuration(){ + return 1800; // 30min + } +} diff --git a/bridges/MalikiBridge.php b/bridges/MalikiBridge.php index 55b09ab..29e4cfa 100644 --- a/bridges/MalikiBridge.php +++ b/bridges/MalikiBridge.php @@ -1,27 +1,45 @@ returnError('Could not request Maliki.', 404); $count=0; + $latest=1; $latest_title=""; + $latest = $html->find('div.conteneur_page a', 1)->href; + $latest_title = $html->find('div.conteneur_page img', 0)->title; + + function MalikiExtractContent($url) { + $html2 = file_get_html($url); + $text = 'http://www.maliki.com/'.$html2->find('img', 0)->src; + $text = '
'.$html2->find('div.imageetnews', 0)->plaintext; + return $text; + } + + $item = new \Item(); + $item->uri = 'http://www.maliki.com/'.$latest; + $item->title = $latest_title; + $item->timestamp = time(); + $item->content = MalikiExtractContent($item->uri); + $this->items[] = $item; + + foreach($html->find('div.boite_strip') as $element) { - if(!empty($element->find('a',0)->href) and $count < 20) { + if(!empty($element->find('a',0)->href) and $count < 3) { $item = new \Item(); $item->uri = 'http://www.maliki.com/'.$element->find('a',0)->href; - $item->thumbnailUri = 'http://www.maliki.com/'.$element->find('img',0)->src; $item->title = $element->find('img',0)->title; $item->timestamp = strtotime(str_replace('/', '-', $element->find('span.stylepetit', 0)->innertext)); - $item->content = '
'; + $item->content = MalikiExtractContent($item->uri); $this->items[] = $item; $count++; } @@ -37,6 +55,6 @@ class MalikiBridge extends BridgeAbstract{ } public function getCacheDuration(){ - return 86400; // 24 hours + return 86400*6; // 6 days } } From 6b2742b06753e012159a6a7ecdb53927dfb364e8 Mon Sep 17 00:00:00 2001 From: Mitsukarenai Date: Sat, 31 May 2014 11:46:54 +0200 Subject: [PATCH 044/161] fix PlanetLibre --- bridges/PlanetLibreBridge.php | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/bridges/PlanetLibreBridge.php b/bridges/PlanetLibreBridge.php index 073ec28..5ada535 100644 --- a/bridges/PlanetLibreBridge.php +++ b/bridges/PlanetLibreBridge.php @@ -4,31 +4,27 @@ * Returns the 5 newest posts from PlanetLibre (full text) * * @name PlanetLibre -* @homepage http://www.www.planet-libre.org +* @homepage http://www.planet-libre.org * @description Returns the 5 newest posts from PlanetLibre (full text) * @maintainer pit-fgfjiudghdf * @update 2014-05-26 */ class PlanetLibreBridge extends BridgeAbstract{ public function collectData(array $param){ - function PlanetLibreStripCDATA($string) { - $string = str_replace('', '', $string); - return $string; - } + function PlanetLibreExtractContent($url) { $html2 = file_get_html($url); - $text = $html2->find('div[class=post-text]', 0)->innertext; + $text = $html2->find('div[class="post-text"]', 0)->innertext; return $text; } - $html = file_get_html('http://www.planet-libre.org/rss10.php') or $this->returnError('Could not request PlanetLibre.', 404); + $html = file_get_html('http://www.planet-libre.org/') or $this->returnError('Could not request PlanetLibre.', 404); $limit = 0; - foreach($html->find('item') as $element) { + foreach($html->find('div.post') as $element) { if($limit < 5) { $item = new \Item(); - $item->title = PlanetLibreStripCDATA($element->find('title', 0)->innertext); - $item->uri = PlanetLibreStripCDATA($element->find('guid', 0)->plaintext); - $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); + $item->title = $element->find('h1', 0)->plaintext; + $item->uri = $element->find('a', 0)->href; + $item->timestamp = strtotime(str_replace('/', '-', $element->find('div[class="post-date"]', 0)->plaintext)); $item->content = PlanetLibreExtractContent($item->uri); $this->items[] = $item; $limit++; From 7f46cc3e91d8a7120501c3424ed7bc79360bcb86 Mon Sep 17 00:00:00 2001 From: pit-fgfjiudghdf Date: Sat, 31 May 2014 12:39:35 +0200 Subject: [PATCH 045/161] Create Rue89Bridge.php Voici le code. Mon serveur est en vrac, je ne peux pas valider qu il fonctionne tjs mais, il y a une semaine je n'avais pas de soucis. --- bridges/Rue89Bridge.php | 50 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 bridges/Rue89Bridge.php diff --git a/bridges/Rue89Bridge.php b/bridges/Rue89Bridge.php new file mode 100644 index 0000000..3bb313d --- /dev/null +++ b/bridges/Rue89Bridge.php @@ -0,0 +1,50 @@ +', '', $string); + return $string; + } + function Rue89ExtractContent($url) { + $html2 = file_get_html($url); + //$text = $html2->find('div[class=text]', 0)->innertext; + $text = $html2->find('div article', 0)->innertext; + //$text = $html2->find('div.article', 0)->innertext; + //$text = $html2->find('div[id=main]', 0)->innertext; + //$text = $html2->find('div[id=article]', 0)->innertext; + //$text = preg_replace('/(

@si', '', $text); + return $text; + } + $html = file_get_html('http://memo-linux.com/feed/') or $this->returnError('Could not request MemoLinux.', 404); + $limit = 0; + + foreach($html->find('item') as $element) { + if($limit < 10) { + $item = new \Item(); + $item->title = StripCDATA($element->find('title', 0)->innertext); + $item->uri = StripCDATA($element->find('guid', 0)->plaintext); + $item->timestamp = strtotime($element->find('pubDate', 0)->plaintext); + $item->content = ExtractContent($item->uri); + $this->items[] = $item; + $limit++; + } + } + + } + + public function getName(){ + return 'MemoLinux'; + } + + public function getURI(){ + return 'http://memo-linux.com/feed/'; + } + + public function getCacheDuration(){ + // return 3600; // 1 hour + return 0; // 1 hour + } +} From 75773f62f80be5dacafcedee7d4fac123ce7818d Mon Sep 17 00:00:00 2001 From: Mitsukarenai Date: Tue, 8 Jul 2014 16:17:50 +0200 Subject: [PATCH 054/161] fix Numerama: afterscript text --- bridges/NumeramaBridge.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bridges/NumeramaBridge.php b/bridges/NumeramaBridge.php index 3f3f0fd..8007075 100644 --- a/bridges/NumeramaBridge.php +++ b/bridges/NumeramaBridge.php @@ -22,7 +22,7 @@ class NumeramaBridge extends BridgeAbstract{ $html2 = file_get_html($url); $text = $html2->find('h2.intro', 0)->innertext; $text = $text.$html2->find('div.content', 0)->innertext; - $text = preg_replace('/@si', '', $text); $text = preg_replace('@]*?>.*?@si', '', $text); + $text = preg_replace("/returnError('Could not request MemoLinux.', 404); @@ -50,7 +49,6 @@ class MemoLinuxBridge extends BridgeAbstract{ } public function getCacheDuration(){ - // return 3600; // 1 hour - return 0; // 1 hour + return 3600*12; // 12 hours } } diff --git a/bridges/SegfaultMintBridge.php b/bridges/SegfaultMintBridge.php index 48d9aaf..bb89c45 100644 --- a/bridges/SegfaultMintBridge.php +++ b/bridges/SegfaultMintBridge.php @@ -1,11 +1,10 @@ find('item') as $element) { - if($limit < 10) { + if($limit < 5) { $item = new \Item(); $item->title = StripCDATA($element->find('title', 0)->innertext); $item->uri = StripCDATA($element->find('guid', 0)->plaintext); @@ -52,7 +51,6 @@ class SegfaultMintBridge extends BridgeAbstract{ } public function getCacheDuration(){ - // return 3600; // 1 hour - return 0; // 1 hour + return 3600*24; // 24 hours } } From d4de199501363f397b56cd48f76a74a9e7a74819 Mon Sep 17 00:00:00 2001 From: Mitsukarenai Date: Tue, 8 Jul 2014 17:06:49 +0200 Subject: [PATCH 056/161] fix issue #64 --- bridges/TuxboardBridge.php | 95 +++++++++++++------------------------- 1 file changed, 31 insertions(+), 64 deletions(-) diff --git a/bridges/TuxboardBridge.php b/bridges/TuxboardBridge.php index 4696d4f..f038fbd 100644 --- a/bridges/TuxboardBridge.php +++ b/bridges/TuxboardBridge.php @@ -1,79 +1,46 @@ returnError('Could not request Tuxboard.', 404); - foreach($html->find('div.posts') as $element) { - $a = $element->find("h2 a", 0); - $category = $element->find("div#category", 0); - $catTxt = $category->innertext; - $posFinDate = strpos(" -", $catTxt); - $list = explode(" ", trim(substr($catTxt, $posFinDate))); - $jour = $list[0]; - $mois = 1; - $annee = $list[2]; + function StripCDATA($string) { + $string = str_replace('', '', $string); + return $string; + } - switch (strtolower($list[1])) - { - case "janvier" : - $mois = 1; - break; - case "février" : - case "fevrier" : - $mois = 2; - break; - case "mars" : - $mois = 3; - break; - case "avril" : - $mois = 4; - break; - case "mai" : - $mois = 5; - break; - case "juin" : - $mois = 6; - break; - case "juillet" : - $mois = 7; - break; - case "aout" : - case "août" : - $mois = 8; - break; - case "septembre" : - $mois = 9; - break; - case "octobre" : - $mois = 10; - break; - case "novembre" : - $mois = 11; - break; - case "decembre" : - case "décembre" : - $mois = 12; - break; - } + function ExtractContent($url) { + $html2 = file_get_html($url); + $text = $html2->find('article#page', 0)->innertext; + $text = preg_replace('@]*?>.*?@si', '', $text); + return $text; + } - $item = new Item(); + $html = file_get_html('http://www.tuxboard.com/feed/atom/') or $this->returnError('Could not request Tuxboard.', 404); + $limit = 0; - $item->uri = $a->href; - $item->title = $a->innertext; - $item->content = trim($element->find("div.clear", 0)->innertext); - $item->timestamp = mktime(0, 0, 0, $mois, $jour, $annee); - - $this->items[] = $item; - } + foreach($html->find('entry') as $element) { + if($limit < 10) { + $item = new \Item(); + $item->title = StripCDATA($element->find('title', 0)->innertext); + $item->uri = $element->find('link', 0)->href; + $item->timestamp = strtotime($element->find('published', 0)->plaintext); + $item->content = ExtractContent($item->uri); + $this->items[] = $item; + $limit++; + } + } + + + } public function getName(){ @@ -89,7 +56,7 @@ class TuxboardBridge extends BridgeAbstract{ } public function getCacheDuration(){ - return 14600; // 4 hours + return 3600; // 1 hour } } ?> From ede0046d4fe1015a6b52934a24a0d4ac48c5f4ff Mon Sep 17 00:00:00 2001 From: Mitsu Date: Tue, 8 Jul 2014 17:36:44 +0200 Subject: [PATCH 057/161] README: add bridge guidelines --- README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1526fcb..cd42035 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,18 @@ Including `PHP Simple HTML DOM Parser` under the [MIT License](http://opensource Technical notes === * There is a cache so that source services won't ban you even if you hammer the rss-bridge with requests. Each bridge has a different duration for the cache. The `cache` subdirectory will be automatically created. You can purge it whenever you want. - * To implement a new rss-bridge, create a new class in `bridges` subdirectory. Look at existing bridges for examples. For items you generate in `$this->items`, only `uri` and `title` are mandatory in each item. `timestamp` and `content` are optional but recommended. Any additional key will be ignored by ATOM feed (but outputed to json). + * To implement a new rss-bridge, create a new class in `bridges` subdirectory. Look at existing bridges for examples and the guidelines below. For items you generate in `$this->items`, only `uri` and `title` are mandatory in each item. `timestamp` and `content` are optional but recommended. Any additional key will be ignored by ATOM feed (but outputed to json). + +### Bridge guidelines + + * metatags: `@name` {Name of service}, `@homepage` {URL to homepage}, `@description`, `@update` {YYYY-MM-DD}, `@maintainer` {Github username or nickname} + * scripts (eg. Javascript) must be stripped out. Make good use of `strip_tags()` and `preg_replace()` + * bridge must present data within 8 seconds (adjust iterators accordingly) + * cache timeout must be fine-tuned so that each refresh can provide 1 or 2 new elements on busy periods + * `