TwitterBridgeTweaked.php 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. <?php
  2. class TwitterBridgeTweaked extends BridgeAbstract{
  /**
   * Fill in the bridge's descriptive metadata and declare its two query modes.
   *
   * Each entry of $this->parameters is a JSON-encoded list describing the
   * request fields of one mode: "q" for a keyword/hashtag search and "u" for
   * a user name.
   */
  public function loadMetadatas() {
    // Query modes, declared in display order: keyword search first, then user name.
    $this->parameters['By keyword or hashtag'] =
'[
{
"name" : "Keyword or #hashtag",
"identifier" : "q"
}
]';
    $this->parameters['By username'] =
'[
{
"name" : "username",
"identifier" : "u"
}
]';

    // Descriptive metadata.
    $this->name = 'Twitter Bridge Tweaked';
    $this->description = '(same as Twitter Bridge Extended, but with cleaned title & content)';
    $this->uri = 'https://twitter.com/';
    $this->maintainer = 'kraoc';
    $this->update = '2014-12-05';
  }
  /**
   * Tell whether a string contains a known top-level domain.
   *
   * A TLD only counts when it is preceded by a dot and followed either by the
   * end of the string or by a "/" (e.g. "example.com" or "example.org/page").
   * Matching is case-insensitive.
   *
   * @param string $string Text to inspect; cleaner() passes one space-separated word.
   * @return bool True when a listed TLD is found, false otherwise.
   */
  private function containsTLD($string) {
    // Built on first use, then reused for every word that gets checked.
    static $pattern = null;
    if ($pattern === null) {
      $tlds = array(
        'AC', 'AD', 'AE', 'AERO', 'AF', 'AG', 'AI', 'AL', 'AM', 'AN', 'AO', 'AQ', 'AR', 'ARPA', 'AS', 'ASIA', 'AT', 'AU', 'AW', 'AX', 'AZ',
        'BA', 'BB', 'BD', 'BE', 'BF', 'BG', 'BH', 'BI', 'BIZ', 'BJ', 'BM', 'BN', 'BO', 'BR', 'BS', 'BT', 'BV', 'BW', 'BY', 'BZ',
        'CA', 'CAT', 'CC', 'CD', 'CF', 'CG', 'CH', 'CI', 'CK', 'CL', 'CM', 'CN', 'CO', 'COM', 'COOP', 'CR', 'CU', 'CV', 'CX', 'CY', 'CZ',
        'DE', 'DJ', 'DK', 'DM', 'DO', 'DZ',
        'EC', 'EDU', 'EE', 'EG', 'ER', 'ES', 'ET', 'EU',
        'FI', 'FJ', 'FK', 'FM', 'FO', 'FR',
        'GA', 'GB', 'GD', 'GE', 'GF', 'GG', 'GH', 'GI', 'GL', 'GM', 'GN', 'GOV', 'GP', 'GQ', 'GR', 'GS', 'GT', 'GU', 'GW', 'GY',
        'HK', 'HM', 'HN', 'HR', 'HT', 'HU',
        'ID', 'IE', 'IL', 'IM', 'IN', 'INFO', 'INT', 'IO', 'IQ', 'IR', 'IS', 'IT',
        'JE', 'JM', 'JO', 'JOBS', 'JP',
        'KE', 'KG', 'KH', 'KI', 'KM', 'KN', 'KP', 'KR', 'KW', 'KY', 'KZ',
        'LA', 'LB', 'LC', 'LI', 'LK', 'LR', 'LS', 'LT', 'LU', 'LV', 'LY',
        'MA', 'MC', 'MD', 'ME', 'MG', 'MH', 'MIL', 'MK', 'ML', 'MM', 'MN', 'MO', 'MOBI', 'MP', 'MQ', 'MR', 'MS', 'MT', 'MU', 'MUSEUM', 'MV', 'MW', 'MX', 'MY', 'MZ',
        'NA', 'NAME', 'NC', 'NE', 'NET', 'NF', 'NG', 'NI', 'NL', 'NO', 'NP', 'NR', 'NU', 'NZ',
        'OM', 'ORG',
        'PA', 'PE', 'PF', 'PG', 'PH', 'PK', 'PL', 'PM', 'PN', 'PR', 'PRO', 'PS', 'PT', 'PW', 'PY',
        'QA',
        'RE', 'RO', 'RS', 'RU', 'RW',
        'SA', 'SB', 'SC', 'SD', 'SE', 'SG', 'SH', 'SI', 'SJ', 'SK', 'SL', 'SM', 'SN', 'SO', 'SR', 'ST', 'SU', 'SV', 'SY', 'SZ',
        'TC', 'TD', 'TEL', 'TF', 'TG', 'TH', 'TJ', 'TK', 'TL', 'TM', 'TN', 'TO', 'TP', 'TR', 'TRAVEL', 'TT', 'TV', 'TW', 'TZ',
        'UA', 'UG', 'UK', 'US', 'UY', 'UZ',
        'VA', 'VC', 'VE', 'VG', 'VI', 'VN', 'VU',
        'WF', 'WS',
        'XN--0ZWM56D', 'XN--11B5BS3A9AJ6G', 'XN--80AKHBYKNJ4F', 'XN--9T4B11YI5A', 'XN--DEBA0AD', 'XN--G6W251D',
        'XN--HGBK6AJ7F53BBA', 'XN--HLCJ6AYA9ESC7A', 'XN--JXALPDLP', 'XN--KGBECHTV', 'XN--ZCKZAH',
        'YE', 'YT', 'YU',
        'ZA', 'ZM', 'ZW',
      );
      // One shared "\." prefix and one shared terminator for every entry. The
      // hand-written alternation this replaces had no leading dot on "AC" and
      // no terminator on "ZW", so words such as "Dr.Isaac" or "foo.zwave"
      // were reported as containing a TLD.
      $pattern = '/\.(?:' . implode('|', $tlds) . ')(?:$|\/)/i';
    }

    // preg_match() returns 1 on a match, 0 on no match and false on error.
    return preg_match($pattern, $string) === 1;
  }
  /**
   * Remove every space-separated word that looks like a link.
   *
   * A word is dropped when it contains a dot and containsTLD() recognises a
   * top-level domain in it. The remaining words are joined back with single
   * spaces, in their original order.
   *
   * @param string $url Text to clean; collectData() passes the tweet title.
   * @return string The text without the link-like words.
   */
  private function cleaner($url) {
    $kept = array();
    // Single pass over the words; the result is the same as removing one
    // match and starting over, without one recursion level per removed link.
    foreach (explode(' ', $url) as $word) {
      // Cheap dot test first so the large TLD regex only runs on plausible words.
      if (strpos($word, '.') !== false && $this->containsTLD($word) === true) {
        continue;
      }
      $kept[] = $word;
    }
    return implode(' ', $kept);
  }
  // (c) Kraoc / urlclean
  // https://github.com/kraoc/Leed-market/blob/master/urlclean/urlclean.plugin.disabled.php
  /**
   * Resolve a link to its final destination and strip tracking parameters.
   *
   * When cURL is usable, the link is requested with redirects followed and
   * replaced by the URL the transfer ended on. If the transfer fails, the
   * link is kept as given. Afterwards "xtor" and "utm_" parameters are removed
   * and a dangling "?" at the end is trimmed.
   *
   * @param string $link URL to resolve and clean.
   * @return string The resolved (when possible) and cleaned URL.
   */
  private function resolve_url($link) {
    // Fallback: crawl to the real URL. This is the slowest method and it
    // discloses this server to the target site.
    if (function_exists('curl_init') && !ini_get('safe_mode')) {
      // The handle and the user agent must exist before any option is set.
      $ch = curl_init();
      if ($ch !== false) {
        $ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.16 (KHTML, like Gecko) Chrome/24.0.1304.0 Safari/537.16';
        curl_setopt($ch, CURLOPT_USERAGENT, $ua);
        curl_setopt($ch, CURLOPT_URL, $link);
        curl_setopt($ch, CURLOPT_HEADER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        // Bound the time one slow or looping link can cost the whole feed.
        curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);
        // >>> anonymization
        curl_setopt($ch, CURLOPT_COOKIESESSION, true);
        curl_setopt($ch, CURLOPT_REFERER, '');
        // <<< anonymization
        $response = curl_exec($ch);
        $effective = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
        curl_close($ch);
        // Only trust the effective URL of a transfer that actually succeeded;
        // otherwise keep the link we were given.
        if ($response !== false && is_string($effective) && $effective !== '') {
          $link = $effective;
        }
      }
    }
    $link = preg_replace("/[&#?]xtor=(.)+/", "", $link); // remove: xtor (and everything after it)
    $link = preg_replace("/utm_([^&#]|(&amp;))+&*/", "", $link); // remove: utm_
    // cleanup end of url: keep the "?" so path and remaining query stay separated
    $link = preg_replace("/\?&/", "?", $link);
    $length = strlen($link);
    if ($length > 0 && $link[$length - 1] == '?') {
      $link = substr($link, 0, $length - 1);
    }
    return $link;
  }
  /**
   * Scrape a Twitter result page and turn every tweet on it into a feed item.
   *
   * @param array $param Request parameters: 'q' selects the keyword/hashtag
   *                     search, 'u' selects a user's timeline. 'q' takes
   *                     precedence when both are present.
   */
  public function collectData(array $param){
    $html = '';
    if (isset($param['q'])) { /* keyword search mode */
      $html = $this->file_get_html('https://twitter.com/search?q='.urlencode($param['q']).'&f=tweets') or $this->returnError('No results for this query.', 404);
    }
    elseif (isset($param['u'])) { /* user timeline mode */
      $html = $this->file_get_html('https://twitter.com/'.urlencode($param['u']).'/with_replies') or $this->returnError('Requested username can\'t be found.', 404);
    }
    else {
      $this->returnError('You must specify a keyword (?q=...) or a Twitter username (?u=...).', 400);
    }
    // returnError() presumably aborts the request (verify against BridgeAbstract);
    // if it ever hands control back there is no document to walk.
    if (!is_object($html)) {
      return;
    }

    // Matches absolute links inside HTML. The path stops at whitespace, quotes
    // and angle brackets so the surrounding markup (closing quote of an href,
    // following tags) is never swallowed, and TLDs longer than three letters
    // are matched in full.
    $regex = '/(http|https|ftp|ftps)\:\/\/[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,}(\/[^\s"\'<>]*)?/';

    foreach($html->find('div.js-stream-tweet') as $tweet) {
      $avatarNode = $tweet->find('img', 0);
      $permalinkNode = $tweet->find('a.js-permalink', 0);
      $timeNode = $tweet->find('span.js-short-timestamp', 0);
      $textNode = $tweet->find('p.js-tweet-text', 0);
      // Skip entries lacking a part every item needs instead of dereferencing null.
      if (!is_object($avatarNode) || !is_object($permalinkNode) || !is_object($timeNode) || !is_object($textNode)) {
        continue;
      }

      $item = new \Item();
      // extract username and sanitize
      $item->username = $tweet->getAttribute('data-screen-name');
      // extract fullname (pseudonym)
      $item->fullname = $tweet->getAttribute('data-name');
      // get avatar link
      $item->avatar = $avatarNode->src;
      // get TweetID
      $item->id = $tweet->getAttribute('data-tweet-id');
      // get tweet link
      $item->uri = 'https://twitter.com'.$permalinkNode->getAttribute('href');
      // extract tweet timestamp
      $item->timestamp = $timeNode->getAttribute('data-time');
      // extract plaintext (taken before the links below are rewritten)
      $item->content_simple = str_replace('href="/', 'href="https://twitter.com/', html_entity_decode(strip_tags($textNode->innertext, '<a>')));

      // processing content links: point each link at its expanded URL and
      // drop the attributes that are of no use in a feed
      foreach($tweet->find('a') as $link) {
        if($link->hasAttribute('data-expanded-url') ) {
          $link->href = $link->getAttribute('data-expanded-url');
        }
        $link->removeAttribute('data-expanded-url');
        $link->removeAttribute('data-query-source');
        $link->removeAttribute('rel');
        $link->removeAttribute('class');
        $link->removeAttribute('target');
        $link->removeAttribute('title');
      }

      // get tweet text
      $item->content = '<a href="https://twitter.com/'.$item->username.'"><img style="align:top;width:75px;" alt="avatar" src="'.$item->avatar.'" />'.$item->username.'</a> '.$item->fullname.'<br/><blockquote>'.str_replace('href="/', 'href="https://twitter.com/', $textNode->innertext).'</blockquote>';

      // generate the title
      $item->title = $item->content_simple;
      $item->title = preg_replace('|https?://www\.[a-z\.0-9]+|i', '', $item->title); // remove http(s) links
      $item->title = preg_replace('|www\.[a-z\.0-9]+|i', '', $item->title); // remove www. links
      $item->title = $this->cleaner($item->title); // remove all remaining links
      $item->title = trim($item->title); // remove extra spaces at beginning and end

      // convert all content links to real ones
      $item->content = preg_replace_callback($regex, function($url) {
        // The match comes out of HTML, so "&amp;" and friends are decoded
        // before the URL is requested and re-encoded before it goes back in.
        $resolved = $this->resolve_url(html_entity_decode($url[0], ENT_QUOTES, 'UTF-8'));
        return htmlspecialchars($resolved, ENT_QUOTES, 'UTF-8');
      }, $item->content);

      // put out
      $this->items[] = $item;
    }
  }
  /**
   * Display name of this bridge.
   *
   * @return string
   */
  public function getName(){
    $name = 'Twitter Bridge Tweaked';

    return $name;
  }
  /**
   * Address of the site this bridge reads from.
   *
   * Uses the https scheme, like every other Twitter URL built in this class.
   *
   * @return string
   */
  public function getURI(){
    return 'https://twitter.com';
  }
  /**
   * How long a fetched result may be served from cache.
   *
   * @return int Duration in seconds (five minutes).
   */
  public function getCacheDuration(){
    $minutes = 5;

    return $minutes * 60;
  }
  /**
   * User name attached to the first collected item.
   *
   * @return string|null The first item's username, or null when no item has
   *                     been collected.
   */
  public function getUsername(){
    // An empty result page leaves no first item to read from.
    if (!isset($this->items[0])) {
      return null;
    }
    return $this->items[0]->username;
  }
  145. }