From 096c318d4e896582c73ae8b31c95228c74ba6725 Mon Sep 17 00:00:00 2001 From: Olivier Date: Fri, 1 Aug 2014 11:13:57 +0200 Subject: [PATCH 1/6] Create TwitterBridgeTweaked.php Extends TwitterBridgeExtended to provide access to more public fields. --- bridges/TwitterBridgeTweaked.php | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 bridges/TwitterBridgeTweaked.php diff --git a/bridges/TwitterBridgeTweaked.php b/bridges/TwitterBridgeTweaked.php new file mode 100644 index 0000000..9b0d4bc --- /dev/null +++ b/bridges/TwitterBridgeTweaked.php @@ -0,0 +1,11 @@ +items[0]->username; + } +} From 5922e37bc0777f5309cb8bed523bca3a5514abdf Mon Sep 17 00:00:00 2001 From: Olivier Date: Fri, 5 Dec 2014 13:18:37 +0100 Subject: [PATCH 2/6] Create GiphyBridge.php Add bridge for Giphy.com --- bridges/GiphyBridge.php | 85 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 bridges/GiphyBridge.php diff --git a/bridges/GiphyBridge.php b/bridges/GiphyBridge.php new file mode 100644 index 0000000..8312174 --- /dev/null +++ b/bridges/GiphyBridge.php @@ -0,0 +1,85 @@ +returnError('No results for this query.', 404); + } + else { + $this->returnError('You must specify a search worf (?s=...).', 400); + } + + $max = GIPHY_LIMIT; + if (isset($param['n'])) { + $max = (integer) $param['n']; + } + + $limit = 0; + $kw = urlencode($param['s']); + foreach($html->find('div.hoverable-gif') as $entry) { + if($limit < $max) { + $node = $entry->first_child(); + $href = $node->getAttribute('href'); + + $html2 = file_get_html($base_url . 
$href) or $this->returnError('No results for this query.', 404); + $figure = $html2->getElementByTagName('figure'); + $img = $figure->firstChild(); + $caption = $figure->lastChild(); + + $item = new \Item(); + $item->id = $img->getAttribute('data-gif_id'); + $item->uri = $img->getAttribute('data-bitly_gif_url'); + $item->username = 'Giphy - '.ucfirst($kw); + $title = $caption->innertext(); + $title = preg_replace('/\s+/', ' ',$title); + $title = str_replace('animated GIF', '', $title); + $title = str_replace($kw, '', $title); + $title = preg_replace('/\s+/', ' ',$title); + $title = trim($title); + if (strlen($title) <= 0) { + $title = $item->id; + } + $item->title = trim($title); + $item->content = + '' + .'' + .''; + + $this->items[] = $item; + $limit++; + } + } + } + + public function getName(){ + return 'Giphy Bridge'; + } + + public function getURI(){ + return 'http://giphy.com/'; + } + + public function getCacheDuration(){ + return 300; // 5 minutes + } + + public function getUsername(){ + return $this->items[0]->username; + } +} From 068557ed0830327f5fbca3455af484e714ebb835 Mon Sep 17 00:00:00 2001 From: Olivier Date: Fri, 5 Dec 2014 13:19:20 +0100 Subject: [PATCH 3/6] Update GiphyBridge.php --- bridges/GiphyBridge.php | 1 + 1 file changed, 1 insertion(+) diff --git a/bridges/GiphyBridge.php b/bridges/GiphyBridge.php index 8312174..b0a4f89 100644 --- a/bridges/GiphyBridge.php +++ b/bridges/GiphyBridge.php @@ -9,6 +9,7 @@ * @description Bridge for giphy.com * @maintainer kraoc * @use1(s="search tag") +* @use2(n="max number of returned items") */ define(GIPHY_LIMIT, 10); From 0d9df394ddd8d723934dc21d163cf67f8f510785 Mon Sep 17 00:00:00 2001 From: Olivier Date: Fri, 5 Dec 2014 13:20:17 +0100 Subject: [PATCH 4/6] Update TwitterBridgeTweaked.php --- bridges/TwitterBridgeTweaked.php | 77 +++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 2 deletions(-) diff --git a/bridges/TwitterBridgeTweaked.php b/bridges/TwitterBridgeTweaked.php index 
9b0d4bc..53099ce 100644 --- a/bridges/TwitterBridgeTweaked.php +++ b/bridges/TwitterBridgeTweaked.php @@ -1,10 +1,83 @@ returnError('No results for this query.', 404); + } + elseif (isset($param['u'])) { /* user timeline mode */ + $html = file_get_html('https://twitter.com/'.urlencode($param['u']).'/with_replies') or $this->returnError('Requested username can\'t be found.', 404); + } + else { + $this->returnError('You must specify a keyword (?q=...) or a Twitter username (?u=...).', 400); + } + foreach($html->find('div.js-stream-tweet') as $tweet) { + $item = new \Item(); + // extract username and sanitize + $item->username = $tweet->getAttribute('data-screen-name'); + // extract fullname (pseudonym) + $item->fullname = $tweet->getAttribute('data-name'); + // get avatar link + $item->avatar = $tweet->find('img', 0)->src; + // get TweetID + $item->id = $tweet->getAttribute('data-tweet-id'); + // get tweet link + $item->uri = 'https://twitter.com'.$tweet->find('a.js-permalink', 0)->getAttribute('href'); + // extract tweet timestamp + $item->timestamp = $tweet->find('span.js-short-timestamp', 0)->getAttribute('data-time'); + // extract plaintext + $item->content_simple = str_replace('href="/', 'href="https://twitter.com/', html_entity_decode(strip_tags($tweet->find('p.js-tweet-text', 0)->innertext, ''))); + + // processing content links + foreach($tweet->find('a') as $link) { + if($link->hasAttribute('data-expanded-url') ) { + $link->href = $link->getAttribute('data-expanded-url'); + } + $link->removeAttribute('data-expanded-url'); + $link->removeAttribute('data-query-source'); + $link->removeAttribute('rel'); + $link->removeAttribute('class'); + $link->removeAttribute('target'); + $link->removeAttribute('title'); + } + // get tweet text + $item->content = 'avatar'.$item->username.' '.$item->fullname.'
'.str_replace('href="/', 'href="https://twitter.com/', $tweet->find('p.js-tweet-text', 0)->innertext).'
'; + // generate the title +// $item->title = $item->fullname . ' (@'. $item->username . ') | ' . $item->content_simple; + $item->title = $item->content_simple; + // put out + $this->items[] = $item; + } + } + + public function getName(){ + return 'Twitter Bridge Tweaked'; + } + + public function getURI(){ + return 'http://twitter.com'; + } + + public function getCacheDuration(){ + return 300; // 5 minutes + } + public function getUsername(){ return $this->items[0]->username; } From 30f339e3b2a40695c6f8c296e927294d4b8436a6 Mon Sep 17 00:00:00 2001 From: Olivier Date: Mon, 8 Dec 2014 16:31:16 +0100 Subject: [PATCH 5/6] Add title cleaning Try to remove all links from title's entry. --- bridges/TwitterBridgeTweaked.php | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/bridges/TwitterBridgeTweaked.php b/bridges/TwitterBridgeTweaked.php index 53099ce..3b6ed13 100644 --- a/bridges/TwitterBridgeTweaked.php +++ b/bridges/TwitterBridgeTweaked.php @@ -13,6 +13,29 @@ */ class TwitterBridgeTweaked extends BridgeAbstract{ + + private function containsTLD($string) { + preg_match( + 
"/(AC($|\/)|\.AD($|\/)|\.AE($|\/)|\.AERO($|\/)|\.AF($|\/)|\.AG($|\/)|\.AI($|\/)|\.AL($|\/)|\.AM($|\/)|\.AN($|\/)|\.AO($|\/)|\.AQ($|\/)|\.AR($|\/)|\.ARPA($|\/)|\.AS($|\/)|\.ASIA($|\/)|\.AT($|\/)|\.AU($|\/)|\.AW($|\/)|\.AX($|\/)|\.AZ($|\/)|\.BA($|\/)|\.BB($|\/)|\.BD($|\/)|\.BE($|\/)|\.BF($|\/)|\.BG($|\/)|\.BH($|\/)|\.BI($|\/)|\.BIZ($|\/)|\.BJ($|\/)|\.BM($|\/)|\.BN($|\/)|\.BO($|\/)|\.BR($|\/)|\.BS($|\/)|\.BT($|\/)|\.BV($|\/)|\.BW($|\/)|\.BY($|\/)|\.BZ($|\/)|\.CA($|\/)|\.CAT($|\/)|\.CC($|\/)|\.CD($|\/)|\.CF($|\/)|\.CG($|\/)|\.CH($|\/)|\.CI($|\/)|\.CK($|\/)|\.CL($|\/)|\.CM($|\/)|\.CN($|\/)|\.CO($|\/)|\.COM($|\/)|\.COOP($|\/)|\.CR($|\/)|\.CU($|\/)|\.CV($|\/)|\.CX($|\/)|\.CY($|\/)|\.CZ($|\/)|\.DE($|\/)|\.DJ($|\/)|\.DK($|\/)|\.DM($|\/)|\.DO($|\/)|\.DZ($|\/)|\.EC($|\/)|\.EDU($|\/)|\.EE($|\/)|\.EG($|\/)|\.ER($|\/)|\.ES($|\/)|\.ET($|\/)|\.EU($|\/)|\.FI($|\/)|\.FJ($|\/)|\.FK($|\/)|\.FM($|\/)|\.FO($|\/)|\.FR($|\/)|\.GA($|\/)|\.GB($|\/)|\.GD($|\/)|\.GE($|\/)|\.GF($|\/)|\.GG($|\/)|\.GH($|\/)|\.GI($|\/)|\.GL($|\/)|\.GM($|\/)|\.GN($|\/)|\.GOV($|\/)|\.GP($|\/)|\.GQ($|\/)|\.GR($|\/)|\.GS($|\/)|\.GT($|\/)|\.GU($|\/)|\.GW($|\/)|\.GY($|\/)|\.HK($|\/)|\.HM($|\/)|\.HN($|\/)|\.HR($|\/)|\.HT($|\/)|\.HU($|\/)|\.ID($|\/)|\.IE($|\/)|\.IL($|\/)|\.IM($|\/)|\.IN($|\/)|\.INFO($|\/)|\.INT($|\/)|\.IO($|\/)|\.IQ($|\/)|\.IR($|\/)|\.IS($|\/)|\.IT($|\/)|\.JE($|\/)|\.JM($|\/)|\.JO($|\/)|\.JOBS($|\/)|\.JP($|\/)|\.KE($|\/)|\.KG($|\/)|\.KH($|\/)|\.KI($|\/)|\.KM($|\/)|\.KN($|\/)|\.KP($|\/)|\.KR($|\/)|\.KW($|\/)|\.KY($|\/)|\.KZ($|\/)|\.LA($|\/)|\.LB($|\/)|\.LC($|\/)|\.LI($|\/)|\.LK($|\/)|\.LR($|\/)|\.LS($|\/)|\.LT($|\/)|\.LU($|\/)|\.LV($|\/)|\.LY($|\/)|\.MA($|\/)|\.MC($|\/)|\.MD($|\/)|\.ME($|\/)|\.MG($|\/)|\.MH($|\/)|\.MIL($|\/)|\.MK($|\/)|\.ML($|\/)|\.MM($|\/)|\.MN($|\/)|\.MO($|\/)|\.MOBI($|\/)|\.MP($|\/)|\.MQ($|\/)|\.MR($|\/)|\.MS($|\/)|\.MT($|\/)|\.MU($|\/)|\.MUSEUM($|\/)|\.MV($|\/)|\.MW($|\/)|\.MX($|\/)|\.MY($|\/)|\.MZ($|\/)|\.NA($|\/)|\.NAME($|\/)|\.NC($|\/)|\.NE($|\/)|\.NET($|\/)|\.NF($|\/)|\.NG($|\/)|\.
NI($|\/)|\.NL($|\/)|\.NO($|\/)|\.NP($|\/)|\.NR($|\/)|\.NU($|\/)|\.NZ($|\/)|\.OM($|\/)|\.ORG($|\/)|\.PA($|\/)|\.PE($|\/)|\.PF($|\/)|\.PG($|\/)|\.PH($|\/)|\.PK($|\/)|\.PL($|\/)|\.PM($|\/)|\.PN($|\/)|\.PR($|\/)|\.PRO($|\/)|\.PS($|\/)|\.PT($|\/)|\.PW($|\/)|\.PY($|\/)|\.QA($|\/)|\.RE($|\/)|\.RO($|\/)|\.RS($|\/)|\.RU($|\/)|\.RW($|\/)|\.SA($|\/)|\.SB($|\/)|\.SC($|\/)|\.SD($|\/)|\.SE($|\/)|\.SG($|\/)|\.SH($|\/)|\.SI($|\/)|\.SJ($|\/)|\.SK($|\/)|\.SL($|\/)|\.SM($|\/)|\.SN($|\/)|\.SO($|\/)|\.SR($|\/)|\.ST($|\/)|\.SU($|\/)|\.SV($|\/)|\.SY($|\/)|\.SZ($|\/)|\.TC($|\/)|\.TD($|\/)|\.TEL($|\/)|\.TF($|\/)|\.TG($|\/)|\.TH($|\/)|\.TJ($|\/)|\.TK($|\/)|\.TL($|\/)|\.TM($|\/)|\.TN($|\/)|\.TO($|\/)|\.TP($|\/)|\.TR($|\/)|\.TRAVEL($|\/)|\.TT($|\/)|\.TV($|\/)|\.TW($|\/)|\.TZ($|\/)|\.UA($|\/)|\.UG($|\/)|\.UK($|\/)|\.US($|\/)|\.UY($|\/)|\.UZ($|\/)|\.VA($|\/)|\.VC($|\/)|\.VE($|\/)|\.VG($|\/)|\.VI($|\/)|\.VN($|\/)|\.VU($|\/)|\.WF($|\/)|\.WS($|\/)|\.XN--0ZWM56D($|\/)|\.XN--11B5BS3A9AJ6G($|\/)|\.XN--80AKHBYKNJ4F($|\/)|\.XN--9T4B11YI5A($|\/)|\.XN--DEBA0AD($|\/)|\.XN--G6W251D($|\/)|\.XN--HGBK6AJ7F53BBA($|\/)|\.XN--HLCJ6AYA9ESC7A($|\/)|\.XN--JXALPDLP($|\/)|\.XN--KGBECHTV($|\/)|\.XN--ZCKZAH($|\/)|\.YE($|\/)|\.YT($|\/)|\.YU($|\/)|\.ZA($|\/)|\.ZM($|\/)|\.ZW)/i", + $string, + $M + ); + $has_tld = (count($M) > 0) ? true : false; + return $has_tld; + } + private function cleaner($url) { + $U = explode(' ', $url); + $W =array(); + foreach ($U as $k => $u) { + if (stristr($u,".")) { //only preg_match if there is a dot + if ($this->containsTLD($u) === true) { + unset($U[$k]); + return $this->cleaner( implode(' ', $U) ); + } + } + } + return implode(' ', $U); + } public function collectData(array $param){ $html = ''; @@ -61,6 +84,10 @@ class TwitterBridgeTweaked extends BridgeAbstract{ // generate the title // $item->title = $item->fullname . ' (@'. $item->username . ') | ' . 
$item->content_simple; $item->title = $item->content_simple; + $item->title = preg_replace('|https?://www\.[a-z\.0-9]+|i', '', $item->title); // remove http(s) links + $item->title = preg_replace('|www\.[a-z\.0-9]+|i', '', $item->title); // remove www. links + $item->title = $this->cleaner($item->title); // remove all remaining links + $item->title = trim($item->title); // remove extra spaces at beginning and end // put out $this->items[] = $item; } From ff4ccf985ff20a31231ccfcd8d8d478e2adb1bde Mon Sep 17 00:00:00 2001 From: Olivier Date: Mon, 8 Dec 2014 16:53:58 +0100 Subject: [PATCH 6/6] Resolve content links Use some code to resolve content links to bypass shorteners... --- bridges/TwitterBridgeTweaked.php | 69 +++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 14 deletions(-) diff --git a/bridges/TwitterBridgeTweaked.php b/bridges/TwitterBridgeTweaked.php index 3b6ed13..6d7b7b8 100644 --- a/bridges/TwitterBridgeTweaked.php +++ b/bridges/TwitterBridgeTweaked.php @@ -1,6 +1,6 @@ 0) ? 
true : false; return $has_tld; - } + } private function cleaner($url) { $U = explode(' ', $url); $W =array(); foreach ($U as $k => $u) { - if (stristr($u,".")) { //only preg_match if there is a dot + if (stristr($u,".")) { //only preg_match if there is a dot if ($this->containsTLD($u) === true) { unset($U[$k]); return $this->cleaner( implode(' ', $U) ); - } + } } } return implode(' ', $U); } + // (c) Kraoc / urlclean + // https://github.com/kraoc/Leed-market/blob/master/urlclean/urlclean.plugin.disabled.php + // Resolves $link to the URL reached after following redirects (when cURL is usable), then strips xtor / utm_ tracking parameters and a dangling '?'. + // Returns the cleaned URL; when cURL is unavailable or the request fails, only the cleanup is applied to the original $link. + private function resolve_url($link) { + // fallback to crawl to real url (slowest method and unsecure to privacy) + if (function_exists('curl_init') && !ini_get('safe_mode')) { + // the handle and the user agent must exist before any curl_setopt() call + $ch = curl_init(); + $ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.16 (KHTML, like Gecko) Chrome/24.0.1304.0 Safari/537.16'; + if ($ch !== false) { + curl_setopt($ch, CURLOPT_USERAGENT, $ua); + curl_setopt($ch, CURLOPT_URL, $link); + curl_setopt($ch, CURLOPT_HEADER, true); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + // >>> anonymization + curl_setopt($ch, CURLOPT_COOKIESESSION, true); + curl_setopt($ch, CURLOPT_REFERER, ''); + // <<< anonymization + $response = curl_exec($ch); + $effective = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL); + curl_close($ch); + // keep the original link when the request failed instead of replacing it with an empty string + if ($response !== false && !empty($effective)) { + $link = $effective; + } + } + } + + $link = preg_replace("/[&#?]xtor=(.)+/", "", $link); // remove: xtor + $link = preg_replace("/utm_([^&#]|(&))+&*/", "", $link); // remove: utm_ + + // cleanup end of url + $link = preg_replace("/\?&/", "", $link); + if (isset($link[strlen($link) -1])){ + if ($link[strlen($link) -1] == '?') + $link = substr($link, 0, strlen($link) -1); + } + + return $link; + } + public function collectData(array $param){ - $html = ''; + $html = ''; if (isset($param['q'])) { /* keyword search mode */ $html = file_get_html('https://twitter.com/search/realtime?q='.urlencode($param['q']).'+include:retweets&src=typd') or $this->returnError('No results for this query.', 404); } @@ -54,18 +87,18 @@ class TwitterBridgeTweaked extends 
BridgeAbstract{ // extract username and sanitize $item->username = $tweet->getAttribute('data-screen-name'); // extract fullname (pseudonym) - $item->fullname = $tweet->getAttribute('data-name'); + $item->fullname = $tweet->getAttribute('data-name'); // get avatar link - $item->avatar = $tweet->find('img', 0)->src; + $item->avatar = $tweet->find('img', 0)->src; // get TweetID $item->id = $tweet->getAttribute('data-tweet-id'); - // get tweet link - $item->uri = 'https://twitter.com'.$tweet->find('a.js-permalink', 0)->getAttribute('href'); + // get tweet link + $item->uri = 'https://twitter.com'.$tweet->find('a.js-permalink', 0)->getAttribute('href'); // extract tweet timestamp $item->timestamp = $tweet->find('span.js-short-timestamp', 0)->getAttribute('data-time'); - // extract plaintext - $item->content_simple = str_replace('href="/', 'href="https://twitter.com/', html_entity_decode(strip_tags($tweet->find('p.js-tweet-text', 0)->innertext, ''))); - + // extract plaintext + $item->content_simple = str_replace('href="/', 'href="https://twitter.com/', html_entity_decode(strip_tags($tweet->find('p.js-tweet-text', 0)->innertext, ''))); + // processing content links foreach($tweet->find('a') as $link) { if($link->hasAttribute('data-expanded-url') ) { @@ -88,6 +121,14 @@ class TwitterBridgeTweaked extends BridgeAbstract{ $item->title = preg_replace('|www\.[a-z\.0-9]+|i', '', $item->title); // remove www. 
links $item->title = $this->cleaner($item->title); // remove all remaining links $item->title = trim($item->title); // remove extra spaces at beginning and end + + // convert all content links to real ones + $regex = "/(http|https|ftp|ftps)\:\/\/[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(\/\S*)?/"; + $item->content = preg_replace_callback($regex, function($url) { + // do stuff with $url[0] here + return $this->resolve_url($url[0]); + }, $item->content); + // put out $this->items[] = $item; } @@ -104,7 +145,7 @@ class TwitterBridgeTweaked extends BridgeAbstract{ public function getCacheDuration(){ return 300; // 5 minutes } - + public function getUsername(){ return $this->items[0]->username; }