From 096c318d4e896582c73ae8b31c95228c74ba6725 Mon Sep 17 00:00:00 2001 From: Olivier Date: Fri, 1 Aug 2014 11:13:57 +0200 Subject: [PATCH 01/19] Create TwitterBridgeTweaked.php Extends TwitterBridgeExtended to provide access to more public fields. --- bridges/TwitterBridgeTweaked.php | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 bridges/TwitterBridgeTweaked.php diff --git a/bridges/TwitterBridgeTweaked.php b/bridges/TwitterBridgeTweaked.php new file mode 100644 index 0000000..9b0d4bc --- /dev/null +++ b/bridges/TwitterBridgeTweaked.php @@ -0,0 +1,11 @@ +items[0]->username; + } +} From 4e9a0df1bd8da7d111b5380cd64947d6d32e4b79 Mon Sep 17 00:00:00 2001 From: vinz Date: Thu, 6 Nov 2014 23:56:37 +0100 Subject: [PATCH 02/19] cloned TwitterBridge for stripping out links etc. for clean sharing to GNU Social --- bridges/TwitterBridgeClean.php | 70 ++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 bridges/TwitterBridgeClean.php diff --git a/bridges/TwitterBridgeClean.php b/bridges/TwitterBridgeClean.php new file mode 100644 index 0000000..8b3f71d --- /dev/null +++ b/bridges/TwitterBridgeClean.php @@ -0,0 +1,70 @@ +request = $param['q']; + $html = file_get_html('http://twitter.com/search/realtime?q='.urlencode($this->request).'+include:retweets&src=typd') or $this->returnError('No results for this query.', 404); + } + elseif (isset($param['u'])) { /* user timeline mode */ + $this->request = $param['u']; + $html = file_get_html('http://twitter.com/'.urlencode($this->request)) or $this->returnError('Requested username can\'t be found.', 404); + } + else { + $this->returnError('You must specify a keyword (?q=...) or a Twitter username (?u=...).', 400); + } + + foreach($html->find('div.js-stream-tweet') as $tweet) { + $item = new \Item(); + $item->username = $tweet->getAttribute('data-screen-name'); // extract username and sanitize + $item->fullname = $tweet->getAttribute('data-name'); // extract fullname (pseudonym) + $item->avatar = $tweet->find('img', 0)->src; // get avatar link + $item->id = $tweet->getAttribute('data-tweet-id'); // get TweetID + $item->uri = 'https://twitter.com'.$tweet->find('a.js-permalink', 0)->getAttribute('href'); // get tweet link + $item->timestamp = $tweet->find('span.js-short-timestamp', 0)->getAttribute('data-time'); // extract tweet timestamp + // processing content links + foreach($tweet->find('a') as $link) { + if($link->hasAttribute('data-expanded-url') ) { + $link->href = $link->getAttribute('data-expanded-url'); + } + $link->removeAttribute('data-expanded-url'); + $link->removeAttribute('data-query-source'); + $link->removeAttribute('rel'); + $link->removeAttribute('class'); + $link->removeAttribute('target'); + $link->removeAttribute('title'); + $link->removeAttribute('dir'); + } + $item->content = str_replace('pic.twitter.com', 'https://pic.twitter.com', strip_tags($tweet->find('p.js-tweet-text', 0)->innertext)); // extract tweet text + $item->title = $item->content; + $this->items[] = $item; + } + } + + public function getName(){ + return (!empty($this->request) ? $this->request .' - ' : '') .'Twitter Bridge'; + } + + public function getURI(){ + return 'http://twitter.com'; + } + + public function getCacheDuration(){ + return 300; // 5 minutes + } +} From 5922e37bc0777f5309cb8bed523bca3a5514abdf Mon Sep 17 00:00:00 2001 From: Olivier Date: Fri, 5 Dec 2014 13:18:37 +0100 Subject: [PATCH 03/19] Create GiphyBridge.php Add bridge for Giphy.com --- bridges/GiphyBridge.php | 85 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 bridges/GiphyBridge.php diff --git a/bridges/GiphyBridge.php b/bridges/GiphyBridge.php new file mode 100644 index 0000000..8312174 --- /dev/null +++ b/bridges/GiphyBridge.php @@ -0,0 +1,85 @@ +returnError('No results for this query.', 404); + } + else { + $this->returnError('You must specify a search worf (?s=...).', 400); + } + + $max = GIPHY_LIMIT; + if (isset($param['n'])) { + $max = (integer) $param['n']; + } + + $limit = 0; + $kw = urlencode($param['s']); + foreach($html->find('div.hoverable-gif') as $entry) { + if($limit < $max) { + $node = $entry->first_child(); + $href = $node->getAttribute('href'); + + $html2 = file_get_html($base_url . $href) or $this->returnError('No results for this query.', 404); + $figure = $html2->getElementByTagName('figure'); + $img = $figure->firstChild(); + $caption = $figure->lastChild(); + + $item = new \Item(); + $item->id = $img->getAttribute('data-gif_id'); + $item->uri = $img->getAttribute('data-bitly_gif_url'); + $item->username = 'Giphy - '.ucfirst($kw); + $title = $caption->innertext(); + $title = preg_replace('/\s+/', ' ',$title); + $title = str_replace('animated GIF', '', $title); + $title = str_replace($kw, '', $title); + $title = preg_replace('/\s+/', ' ',$title); + $title = trim($title); + if (strlen($title) <= 0) { + $title = $item->id; + } + $item->title = trim($title); + $item->content = + '' + .'' + .''; + + $this->items[] = $item; + $limit++; + } + } + } + + public function getName(){ + return 'Giphy Bridge'; + } + + public function getURI(){ + return 'http://giphy.com/'; + } + + public function getCacheDuration(){ + return 300; // 5 minutes + } + + public function getUsername(){ + return $this->items[0]->username; + } +} From 068557ed0830327f5fbca3455af484e714ebb835 Mon Sep 17 00:00:00 2001 From: Olivier Date: Fri, 5 Dec 2014 13:19:20 +0100 Subject: [PATCH 04/19] Update GiphyBridge.php --- bridges/GiphyBridge.php | 1 + 1 file changed, 1 insertion(+) diff --git a/bridges/GiphyBridge.php b/bridges/GiphyBridge.php index 8312174..b0a4f89 100644 --- a/bridges/GiphyBridge.php +++ b/bridges/GiphyBridge.php @@ -9,6 +9,7 @@ * @description Bridge for giphy.com * @maintainer kraoc * @use1(s="search tag") +* @use2(n="max number of returned items") */ define(GIPHY_LIMIT, 10); From 0d9df394ddd8d723934dc21d163cf67f8f510785 Mon Sep 17 00:00:00 2001 From: Olivier Date: Fri, 5 Dec 2014 13:20:17 +0100 Subject: [PATCH 05/19] Update TwitterBridgeTweaked.php --- bridges/TwitterBridgeTweaked.php | 77 +++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 2 deletions(-) diff --git a/bridges/TwitterBridgeTweaked.php b/bridges/TwitterBridgeTweaked.php index 9b0d4bc..53099ce 100644 --- a/bridges/TwitterBridgeTweaked.php +++ b/bridges/TwitterBridgeTweaked.php @@ -1,10 +1,83 @@ returnError('No results for this query.', 404); + } + elseif (isset($param['u'])) { /* user timeline mode */ + $html = file_get_html('https://twitter.com/'.urlencode($param['u']).'/with_replies') or $this->returnError('Requested username can\'t be found.', 404); + } + else { + $this->returnError('You must specify a keyword (?q=...) or a Twitter username (?u=...).', 400); + } + foreach($html->find('div.js-stream-tweet') as $tweet) { + $item = new \Item(); + // extract username and sanitize + $item->username = $tweet->getAttribute('data-screen-name'); + // extract fullname (pseudonym) + $item->fullname = $tweet->getAttribute('data-name'); + // get avatar link + $item->avatar = $tweet->find('img', 0)->src; + // get TweetID + $item->id = $tweet->getAttribute('data-tweet-id'); + // get tweet link + $item->uri = 'https://twitter.com'.$tweet->find('a.js-permalink', 0)->getAttribute('href'); + // extract tweet timestamp + $item->timestamp = $tweet->find('span.js-short-timestamp', 0)->getAttribute('data-time'); + // extract plaintext + $item->content_simple = str_replace('href="/', 'href="https://twitter.com/', html_entity_decode(strip_tags($tweet->find('p.js-tweet-text', 0)->innertext, ''))); + + // processing content links + foreach($tweet->find('a') as $link) { + if($link->hasAttribute('data-expanded-url') ) { + $link->href = $link->getAttribute('data-expanded-url'); + } + $link->removeAttribute('data-expanded-url'); + $link->removeAttribute('data-query-source'); + $link->removeAttribute('rel'); + $link->removeAttribute('class'); + $link->removeAttribute('target'); + $link->removeAttribute('title'); + } + // get tweet text + $item->content = 'avatar'.$item->username.' '.$item->fullname.'
'.str_replace('href="/', 'href="https://twitter.com/', $tweet->find('p.js-tweet-text', 0)->innertext).'
'; + // generate the title +// $item->title = $item->fullname . ' (@'. $item->username . ') | ' . $item->content_simple; + $item->title = $item->content_simple; + // put out + $this->items[] = $item; + } + } + + public function getName(){ + return 'Twitter Bridge Tweaked'; + } + + public function getURI(){ + return 'http://twitter.com'; + } + + public function getCacheDuration(){ + return 300; // 5 minutes + } + public function getUsername(){ return $this->items[0]->username; } From 30f339e3b2a40695c6f8c296e927294d4b8436a6 Mon Sep 17 00:00:00 2001 From: Olivier Date: Mon, 8 Dec 2014 16:31:16 +0100 Subject: [PATCH 06/19] Add title cleaning Try to remove all links from title's entry. --- bridges/TwitterBridgeTweaked.php | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/bridges/TwitterBridgeTweaked.php b/bridges/TwitterBridgeTweaked.php index 53099ce..3b6ed13 100644 --- a/bridges/TwitterBridgeTweaked.php +++ b/bridges/TwitterBridgeTweaked.php @@ -13,6 +13,29 @@ */ class TwitterBridgeTweaked extends BridgeAbstract{ + + private function containsTLD($string) { + preg_match( + "/(AC($|\/)|\.AD($|\/)|\.AE($|\/)|\.AERO($|\/)|\.AF($|\/)|\.AG($|\/)|\.AI($|\/)|\.AL($|\/)|\.AM($|\/)|\.AN($|\/)|\.AO($|\/)|\.AQ($|\/)|\.AR($|\/)|\.ARPA($|\/)|\.AS($|\/)|\.ASIA($|\/)|\.AT($|\/)|\.AU($|\/)|\.AW($|\/)|\.AX($|\/)|\.AZ($|\/)|\.BA($|\/)|\.BB($|\/)|\.BD($|\/)|\.BE($|\/)|\.BF($|\/)|\.BG($|\/)|\.BH($|\/)|\.BI($|\/)|\.BIZ($|\/)|\.BJ($|\/)|\.BM($|\/)|\.BN($|\/)|\.BO($|\/)|\.BR($|\/)|\.BS($|\/)|\.BT($|\/)|\.BV($|\/)|\.BW($|\/)|\.BY($|\/)|\.BZ($|\/)|\.CA($|\/)|\.CAT($|\/)|\.CC($|\/)|\.CD($|\/)|\.CF($|\/)|\.CG($|\/)|\.CH($|\/)|\.CI($|\/)|\.CK($|\/)|\.CL($|\/)|\.CM($|\/)|\.CN($|\/)|\.CO($|\/)|\.COM($|\/)|\.COOP($|\/)|\.CR($|\/)|\.CU($|\/)|\.CV($|\/)|\.CX($|\/)|\.CY($|\/)|\.CZ($|\/)|\.DE($|\/)|\.DJ($|\/)|\.DK($|\/)|\.DM($|\/)|\.DO($|\/)|\.DZ($|\/)|\.EC($|\/)|\.EDU($|\/)|\.EE($|\/)|\.EG($|\/)|\.ER($|\/)|\.ES($|\/)|\.ET($|\/)|\.EU($|\/)|\.FI($|\/)|\.FJ($|\/)|\.FK($|\/)|\.FM($|\/)|\.FO($|\/)|\.FR($|\/)|\.GA($|\/)|\.GB($|\/)|\.GD($|\/)|\.GE($|\/)|\.GF($|\/)|\.GG($|\/)|\.GH($|\/)|\.GI($|\/)|\.GL($|\/)|\.GM($|\/)|\.GN($|\/)|\.GOV($|\/)|\.GP($|\/)|\.GQ($|\/)|\.GR($|\/)|\.GS($|\/)|\.GT($|\/)|\.GU($|\/)|\.GW($|\/)|\.GY($|\/)|\.HK($|\/)|\.HM($|\/)|\.HN($|\/)|\.HR($|\/)|\.HT($|\/)|\.HU($|\/)|\.ID($|\/)|\.IE($|\/)|\.IL($|\/)|\.IM($|\/)|\.IN($|\/)|\.INFO($|\/)|\.INT($|\/)|\.IO($|\/)|\.IQ($|\/)|\.IR($|\/)|\.IS($|\/)|\.IT($|\/)|\.JE($|\/)|\.JM($|\/)|\.JO($|\/)|\.JOBS($|\/)|\.JP($|\/)|\.KE($|\/)|\.KG($|\/)|\.KH($|\/)|\.KI($|\/)|\.KM($|\/)|\.KN($|\/)|\.KP($|\/)|\.KR($|\/)|\.KW($|\/)|\.KY($|\/)|\.KZ($|\/)|\.LA($|\/)|\.LB($|\/)|\.LC($|\/)|\.LI($|\/)|\.LK($|\/)|\.LR($|\/)|\.LS($|\/)|\.LT($|\/)|\.LU($|\/)|\.LV($|\/)|\.LY($|\/)|\.MA($|\/)|\.MC($|\/)|\.MD($|\/)|\.ME($|\/)|\.MG($|\/)|\.MH($|\/)|\.MIL($|\/)|\.MK($|\/)|\.ML($|\/)|\.MM($|\/)|\.MN($|\/)|\.MO($|\/)|\.MOBI($|\/)|\.MP($|\/)|\.MQ($|\/)|\.MR($|\/)|\.MS($|\/)|\.MT($|\/)|\.MU($|\/)|\.MUSEUM($|\/)|\.MV($|\/)|\.MW($|\/)|\.MX($|\/)|\.MY($|\/)|\.MZ($|\/)|\.NA($|\/)|\.NAME($|\/)|\.NC($|\/)|\.NE($|\/)|\.NET($|\/)|\.NF($|\/)|\.NG($|\/)|\.NI($|\/)|\.NL($|\/)|\.NO($|\/)|\.NP($|\/)|\.NR($|\/)|\.NU($|\/)|\.NZ($|\/)|\.OM($|\/)|\.ORG($|\/)|\.PA($|\/)|\.PE($|\/)|\.PF($|\/)|\.PG($|\/)|\.PH($|\/)|\.PK($|\/)|\.PL($|\/)|\.PM($|\/)|\.PN($|\/)|\.PR($|\/)|\.PRO($|\/)|\.PS($|\/)|\.PT($|\/)|\.PW($|\/)|\.PY($|\/)|\.QA($|\/)|\.RE($|\/)|\.RO($|\/)|\.RS($|\/)|\.RU($|\/)|\.RW($|\/)|\.SA($|\/)|\.SB($|\/)|\.SC($|\/)|\.SD($|\/)|\.SE($|\/)|\.SG($|\/)|\.SH($|\/)|\.SI($|\/)|\.SJ($|\/)|\.SK($|\/)|\.SL($|\/)|\.SM($|\/)|\.SN($|\/)|\.SO($|\/)|\.SR($|\/)|\.ST($|\/)|\.SU($|\/)|\.SV($|\/)|\.SY($|\/)|\.SZ($|\/)|\.TC($|\/)|\.TD($|\/)|\.TEL($|\/)|\.TF($|\/)|\.TG($|\/)|\.TH($|\/)|\.TJ($|\/)|\.TK($|\/)|\.TL($|\/)|\.TM($|\/)|\.TN($|\/)|\.TO($|\/)|\.TP($|\/)|\.TR($|\/)|\.TRAVEL($|\/)|\.TT($|\/)|\.TV($|\/)|\.TW($|\/)|\.TZ($|\/)|\.UA($|\/)|\.UG($|\/)|\.UK($|\/)|\.US($|\/)|\.UY($|\/)|\.UZ($|\/)|\.VA($|\/)|\.VC($|\/)|\.VE($|\/)|\.VG($|\/)|\.VI($|\/)|\.VN($|\/)|\.VU($|\/)|\.WF($|\/)|\.WS($|\/)|\.XN--0ZWM56D($|\/)|\.XN--11B5BS3A9AJ6G($|\/)|\.XN--80AKHBYKNJ4F($|\/)|\.XN--9T4B11YI5A($|\/)|\.XN--DEBA0AD($|\/)|\.XN--G6W251D($|\/)|\.XN--HGBK6AJ7F53BBA($|\/)|\.XN--HLCJ6AYA9ESC7A($|\/)|\.XN--JXALPDLP($|\/)|\.XN--KGBECHTV($|\/)|\.XN--ZCKZAH($|\/)|\.YE($|\/)|\.YT($|\/)|\.YU($|\/)|\.ZA($|\/)|\.ZM($|\/)|\.ZW)/i", + $string, + $M + ); + $has_tld = (count($M) > 0) ? true : false; + return $has_tld; + } + private function cleaner($url) { + $U = explode(' ', $url); + $W =array(); + foreach ($U as $k => $u) { + if (stristr($u,".")) { //only preg_match if there is a dot + if ($this->containsTLD($u) === true) { + unset($U[$k]); + return $this->cleaner( implode(' ', $U) ); + } + } + } + return implode(' ', $U); + } public function collectData(array $param){ $html = ''; @@ -61,6 +84,10 @@ class TwitterBridgeTweaked extends BridgeAbstract{ // generate the title // $item->title = $item->fullname . ' (@'. $item->username . ') | ' . $item->content_simple; $item->title = $item->content_simple; + $item->title = preg_replace('|https?://www\.[a-z\.0-9]+|i', '', $item->title); // remove http(s) links + $item->title = preg_replace('|www\.[a-z\.0-9]+|i', '', $item->title); // remove www. links + $item->title = $this->cleaner($item->title); // remove all remaining links + $item->title = trim($item->title); // remove extra spaces at beginning and end // put out $this->items[] = $item; } From ff4ccf985ff20a31231ccfcd8d8d478e2adb1bde Mon Sep 17 00:00:00 2001 From: Olivier Date: Mon, 8 Dec 2014 16:53:58 +0100 Subject: [PATCH 07/19] Resolve content links Use some code to resolve content links to bypass shorteners... --- bridges/TwitterBridgeTweaked.php | 69 +++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 14 deletions(-) diff --git a/bridges/TwitterBridgeTweaked.php b/bridges/TwitterBridgeTweaked.php index 3b6ed13..6d7b7b8 100644 --- a/bridges/TwitterBridgeTweaked.php +++ b/bridges/TwitterBridgeTweaked.php @@ -1,6 +1,6 @@ 0) ? true : false; return $has_tld; - } + } private function cleaner($url) { $U = explode(' ', $url); $W =array(); foreach ($U as $k => $u) { - if (stristr($u,".")) { //only preg_match if there is a dot + if (stristr($u,".")) { //only preg_match if there is a dot if ($this->containsTLD($u) === true) { unset($U[$k]); return $this->cleaner( implode(' ', $U) ); - } + } } } return implode(' ', $U); } + // (c) Kraoc / urlclean + // https://github.com/kraoc/Leed-market/blob/master/urlclean/urlclean.plugin.disabled.php + private function resolve_url($link) { + // fallback to crawl to real url (slowest method and unsecure to privacy) + if (function_exists('curl_init') && !ini_get('safe_mode')) { + curl_setopt($ch, CURLOPT_USERAGENT, $ua); + curl_setopt($ch, CURLOPT_URL, $link); + curl_setopt($ch, CURLOPT_HEADER, true); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + // >>> anonimization + curl_setopt($ch, CURLOPT_COOKIESESSION, true); + curl_setopt($ch, CURLOPT_REFERER, ''); + // <<< anonimization + $ch = curl_init(); + $ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.16 (KHTML, like Gecko) Chrome/24.0.1304.0 Safari/537.16'; + $a = curl_exec($ch); + $link = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL); + } + + $link = preg_replace("/[&#?]xtor=(.)+/", "", $link); // remove: xtor + $link = preg_replace("/utm_([^&#]|(&))+&*/", "", $link); // remove: utm_ + + // cleanup end of url + $link = preg_replace("/\?&/", "", $link); + if (isset($link[strlen($link) -1])){ + if ($link[strlen($link) -1] == '?') + $link = substr($link, 0, strlen($link) -1); + } + + return $link; + } + public function collectData(array $param){ - $html = ''; + $html = ''; if (isset($param['q'])) { /* keyword search mode */ $html = file_get_html('https://twitter.com/search/realtime?q='.urlencode($param['q']).'+include:retweets&src=typd') or $this->returnError('No results for this query.', 404); } @@ -54,18 +87,18 @@ class TwitterBridgeTweaked extends BridgeAbstract{ // extract username and sanitize $item->username = $tweet->getAttribute('data-screen-name'); // extract fullname (pseudonym) - $item->fullname = $tweet->getAttribute('data-name'); + $item->fullname = $tweet->getAttribute('data-name'); // get avatar link - $item->avatar = $tweet->find('img', 0)->src; + $item->avatar = $tweet->find('img', 0)->src; // get TweetID $item->id = $tweet->getAttribute('data-tweet-id'); - // get tweet link - $item->uri = 'https://twitter.com'.$tweet->find('a.js-permalink', 0)->getAttribute('href'); + // get tweet link + $item->uri = 'https://twitter.com'.$tweet->find('a.js-permalink', 0)->getAttribute('href'); // extract tweet timestamp $item->timestamp = $tweet->find('span.js-short-timestamp', 0)->getAttribute('data-time'); - // extract plaintext - $item->content_simple = str_replace('href="/', 'href="https://twitter.com/', html_entity_decode(strip_tags($tweet->find('p.js-tweet-text', 0)->innertext, ''))); - + // extract plaintext + $item->content_simple = str_replace('href="/', 'href="https://twitter.com/', html_entity_decode(strip_tags($tweet->find('p.js-tweet-text', 0)->innertext, ''))); + // processing content links foreach($tweet->find('a') as $link) { if($link->hasAttribute('data-expanded-url') ) { @@ -88,6 +121,14 @@ class TwitterBridgeTweaked extends BridgeAbstract{ $item->title = preg_replace('|www\.[a-z\.0-9]+|i', '', $item->title); // remove www. links $item->title = $this->cleaner($item->title); // remove all remaining links $item->title = trim($item->title); // remove extra spaces at beginning and end + + // convert all content links to real ones + $regex = "/(http|https|ftp|ftps)\:\/\/[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(\/\S*)?/"; + $item->content = preg_replace_callback($regex, function($url) { + // do stuff with $url[0] here + return $this->resolve_url($url[0]); + }, $item->content); + // put out $this->items[] = $item; } @@ -104,7 +145,7 @@ class TwitterBridgeTweaked extends BridgeAbstract{ public function getCacheDuration(){ return 300; // 5 minutes } - + public function getUsername(){ return $this->items[0]->username; } From c44c569aa5d1ce6b6bd9ffde46fadee330ca2107 Mon Sep 17 00:00:00 2001 From: Paul de Rosanbo Date: Sun, 1 Feb 2015 15:03:39 +0100 Subject: [PATCH 08/19] Add paru vendu immo bridge --- bridges/ParuVenduImmoBridge.php | 77 +++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 bridges/ParuVenduImmoBridge.php diff --git a/bridges/ParuVenduImmoBridge.php b/bridges/ParuVenduImmoBridge.php new file mode 100644 index 0000000..cbb420e --- /dev/null +++ b/bridges/ParuVenduImmoBridge.php @@ -0,0 +1,77 @@ +getURI().'/immobilier/annonceimmofo/liste/listeAnnonces?tt=1&tbMai=1&tbVil=1&tbCha=1&tbPro=1&tbHot=1&tbMou=1&tbFer=1'; + + if (isset($param['minarea'])) { + $link .= '&sur0='.urlencode($param['minarea']); + } + + if (isset($param['maxprice'])) { + $link .= '&px1='.urlencode($param['maxprice']); + } + + if (isset($param['pa'])) { + $link .= '&pa='.urlencode($param['pa']); + } + + if (isset($param['lo'])) { + $link .= '&lo='.urlencode($param['lo']); + } + + $html = file_get_html($link) or $this->returnError('Could not request paruvendu.', 404); + + + foreach($html->find('div.annonce a') as $element) { + + $img =''; + foreach($element->find('span.img img') as $img) { + if ($img->original) { + $img = ''; + } + } + + $desc = $element->find('span.desc')[0]->innertext; + $desc = str_replace("voir l'annonce", '', $desc); + + $price = $element->find('span.price')[0]->innertext; + + $item = new \Item(); + $item->uri = $this->getURI().$element->href; + $item->title = $element->title; + $item->content = $img.$desc.$price; + $this->items[] = $item; + + } + } + + public function getName(){ + return 'ParuVenduImmo'; + } + + public function getURI(){ + return 'http://www.paruvendu.fr'; + } + + public function getCacheDuration(){ + return 0; + return 3600; // 1 hour + } +} From e1b5c9cda3196e67faf275da019e2b1cbffbcf54 Mon Sep 17 00:00:00 2001 From: Paul de Rosanbo Date: Sun, 1 Feb 2015 15:04:59 +0100 Subject: [PATCH 09/19] Set cache duration to 3 hours --- bridges/ParuVenduImmoBridge.php | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bridges/ParuVenduImmoBridge.php b/bridges/ParuVenduImmoBridge.php index cbb420e..303a090 100644 --- a/bridges/ParuVenduImmoBridge.php +++ b/bridges/ParuVenduImmoBridge.php @@ -71,7 +71,6 @@ class ParuVenduImmoBridge extends BridgeAbstract } public function getCacheDuration(){ - return 0; - return 3600; // 1 hour + return 10800; // 3 hours } } From 9cd174ceab49a79a5b11055c0e537bf33c360bcd Mon Sep 17 00:00:00 2001 From: Paul de Rosanbo Date: Sun, 1 Feb 2015 15:15:30 +0100 Subject: [PATCH 10/19] Set name --- bridges/ParuVenduImmoBridge.php | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bridges/ParuVenduImmoBridge.php b/bridges/ParuVenduImmoBridge.php index 303a090..cf3c3bb 100644 --- a/bridges/ParuVenduImmoBridge.php +++ b/bridges/ParuVenduImmoBridge.php @@ -5,7 +5,7 @@ * Returns the N most recent documents, sorting by date (most recent first). * 2014-05-25 * -* @name ParuVenduImmoBridge +* @name Paru Vendu Immobilier * @homepage http://www.paruvendu.fr/immobilier/ * @description Returns the N most recent documents. * @maintainer polo2ro @@ -13,6 +13,7 @@ */ class ParuVenduImmoBridge extends BridgeAbstract { + private $request = ''; public function collectData(array $param) { @@ -21,6 +22,7 @@ class ParuVenduImmoBridge extends BridgeAbstract $link = $this->getURI().'/immobilier/annonceimmofo/liste/listeAnnonces?tt=1&tbMai=1&tbVil=1&tbCha=1&tbPro=1&tbHot=1&tbMou=1&tbFer=1'; if (isset($param['minarea'])) { + $this->request .= ' '.$param['minarea'].' m2'; $link .= '&sur0='.urlencode($param['minarea']); } @@ -33,6 +35,7 @@ class ParuVenduImmoBridge extends BridgeAbstract } if (isset($param['lo'])) { + $this->request .= ' In: '.$param['lo']; $link .= '&lo='.urlencode($param['lo']); } @@ -63,7 +66,7 @@ class ParuVenduImmoBridge extends BridgeAbstract } public function getName(){ - return 'ParuVenduImmo'; + return 'Paru Vendu Immobilier'.$this->request; } public function getURI(){ From 4b04a77b00d2c7ebe47821a7a76374745b7d7bc3 Mon Sep 17 00:00:00 2001 From: Paul de Rosanbo Date: Sun, 1 Feb 2015 18:03:15 +0100 Subject: [PATCH 11/19] Remove empty entries --- bridges/ParuVenduImmoBridge.php | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bridges/ParuVenduImmoBridge.php b/bridges/ParuVenduImmoBridge.php index cf3c3bb..717b2c4 100644 --- a/bridges/ParuVenduImmoBridge.php +++ b/bridges/ParuVenduImmoBridge.php @@ -9,7 +9,7 @@ * @homepage http://www.paruvendu.fr/immobilier/ * @description Returns the N most recent documents. * @maintainer polo2ro -* @use1(minarea="Min area",maxprice="Max price",pa="Country code",lo="department number") +* @use1(minarea="Min area",maxprice="Max price",pa="Country code",lo="department numbers, comma-separated") */ class ParuVenduImmoBridge extends BridgeAbstract { @@ -44,6 +44,10 @@ class ParuVenduImmoBridge extends BridgeAbstract foreach($html->find('div.annonce a') as $element) { + if (!$element->title) { + continue; + } + $img =''; foreach($element->find('span.img img') as $img) { if ($img->original) { From 1a673766ecdc1f64b0592125c109b803d9caab49 Mon Sep 17 00:00:00 2001 From: Paul de Rosanbo Date: Sun, 1 Feb 2015 19:20:13 +0100 Subject: [PATCH 12/19] Add flat --- bridges/ParuVenduImmoBridge.php | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bridges/ParuVenduImmoBridge.php b/bridges/ParuVenduImmoBridge.php index 717b2c4..795d328 100644 --- a/bridges/ParuVenduImmoBridge.php +++ b/bridges/ParuVenduImmoBridge.php @@ -19,7 +19,9 @@ class ParuVenduImmoBridge extends BridgeAbstract { $html = ''; $num = 20; - $link = $this->getURI().'/immobilier/annonceimmofo/liste/listeAnnonces?tt=1&tbMai=1&tbVil=1&tbCha=1&tbPro=1&tbHot=1&tbMou=1&tbFer=1'; + $appartment = '&tbApp=1&tbDup=1&tbChb=1&tbLof=1&tbAtl=1&tbPla=1'; + $maison = '&tbMai=1&tbVil=1&tbCha=1&tbPro=1&tbHot=1&tbMou=1&tbFer=1'; + $link = $this->getURI().'/immobilier/annonceimmofo/liste/listeAnnonces?tt=1'.$appartment.$maison; if (isset($param['minarea'])) { $this->request .= ' '.$param['minarea'].' m2'; From 9171be9c504d03ba9213f4f20f9a1b252adbb6f2 Mon Sep 17 00:00:00 2001 From: Paul de Rosanbo Date: Mon, 2 Feb 2015 22:37:18 +0100 Subject: [PATCH 13/19] More descriptions --- bridges/ParuVenduImmoBridge.php | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/bridges/ParuVenduImmoBridge.php b/bridges/ParuVenduImmoBridge.php index 795d328..88068f9 100644 --- a/bridges/ParuVenduImmoBridge.php +++ b/bridges/ParuVenduImmoBridge.php @@ -2,14 +2,13 @@ /** * RssBridge Paru Vendu Immo * Retrieve lastest documents from http://www.paruvendu.fr/immobilier/. -* Returns the N most recent documents, sorting by date (most recent first). -* 2014-05-25 * * @name Paru Vendu Immobilier * @homepage http://www.paruvendu.fr/immobilier/ -* @description Returns the N most recent documents. +* @description Returns the ads from the first page of search result. * @maintainer polo2ro -* @use1(minarea="Min area",maxprice="Max price",pa="Country code",lo="department numbers, comma-separated") +* @update 2015-02-02 +* @use1(minarea="Min surface m²",maxprice="Max price",pa="Country code (ex: FR)",lo="department numbers or postal codes, comma-separated") */ class ParuVenduImmoBridge extends BridgeAbstract { From 401663aac0f6bafb8f9ccc470df9fb2672fb17f5 Mon Sep 17 00:00:00 2001 From: Enzo PALMA Date: Thu, 5 Mar 2015 14:42:25 +0100 Subject: [PATCH 14/19] Adding Frandroid bridge since they truncate their RSS --- bridges/FrandroidBridge.php | 63 +++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 bridges/FrandroidBridge.php diff --git a/bridges/FrandroidBridge.php b/bridges/FrandroidBridge.php new file mode 100644 index 0000000..4f075ac --- /dev/null +++ b/bridges/FrandroidBridge.php @@ -0,0 +1,63 @@ +', '', $string); + return $string; + } + function FrandroidExtractContent($url) { + $html2 = file_get_html($url); + $html3 = $html2->find('div.post-content', 0); + $html3->find('div.no-sidebar-ad-top',0)->outertext=''; + $ret=$html3->find('div.shortcode-container'); + foreach ($ret as $value){ + $value->outertext=''; + } + + $html3->find('div#hrr-link',0)->outertext=''; + $text = $html3->innertext; + $text = strip_tags($text, '