FacebookBridge.php 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583
  1. <?php
  2. class FacebookBridge extends BridgeAbstract {
  const MAINTAINER = 'teromene, logmanoriginal';
  const NAME = 'Facebook';
  const URI = 'https://www.facebook.com/';
  const CACHE_TIMEOUT = 300; // 5min
  const DESCRIPTION = 'Input a page title or a profile log. For a profile log,
please insert the parameter as follow : myExamplePage/132621766841117';

  /**
   * Input definitions, grouped by query context.
   *
   * - 'User':  'u' is the user/page name or a full profile URL,
   *            'media_type' filters posts by the presence of a video,
   *            'skip_reviews' drops timeline cells that contain the review
   *            composer when checked.
   * - 'Group': 'g' is a group name or a group URL.
   */
  const PARAMETERS = array(
      'User' => array(
          'u' => array(
              'name' => 'Username',
              'required' => true
          ),
          'media_type' => array(
              'name' => 'Media type',
              'type' => 'list',
              'required' => false,
              'values' => array(
                  'All' => 'all',
                  'Video' => 'video',
                  'No Video' => 'novideo'
              ),
              'defaultValue' => 'all'
          ),
          'skip_reviews' => array(
              'name' => 'Skip reviews',
              'type' => 'checkbox',
              'required' => false,
              'defaultValue' => false,
              // Checking the box REMOVES reviews, so the hint has to describe
              // the unchecked state as the one that includes them.
              'title' => 'Feed includes reviews when unchecked'
          )
      ),
      'Group' => array(
          'g' => array(
              'name' => 'Group',
              'type' => 'text',
              'required' => true,
              'exampleValue' => 'https://www.facebook.com/groups/743149642484225',
              'title' => 'Insert group name or facebook group URL'
          )
      )
  );

  // Author name scraped from the page title in the 'User' context; used by getName()
  private $authorName = '';

  // Group name scraped from the og:title meta tag in the 'Group' context; used by getName()
  private $groupName = '';
  /**
   * Builds the Facebook URI that belongs to the current query.
   *
   * In the 'Group' context the sanitized group identifier is appended below
   * 'groups/'. Every other context yields the plain base URI. The
   * '_fb_noscript=1' query parameter is attached in all cases.
   *
   * @return string
   */
  public function getURI() {
      $target = self::URI;

      if('Group' === $this->queriedContext) {
          $group = filter_var($this->getInput('g'), FILTER_SANITIZE_URL);
          $target = $target . 'groups/' . $this->sanitizeGroup($group);
      }

      return $target . '?_fb_noscript=1';
  }
  /**
   * Entry point of the bridge: dispatches to the collector that matches the
   * queried context and reports a client error for any other context.
   */
  public function collectData() {
      $context = $this->queriedContext;

      if($context === 'Group') {
          $this->collectGroupData();
      } elseif($context === 'User') {
          $this->collectUserData();
      } else {
          returnClientError('Unknown context: "' . $context . '"!');
      }
  }
  67. #region Group
  /**
   * Loads a public group page and turns every 'div.userContentWrapper'
   * element into a feed item.
   *
   * Fails with a server error when the page cannot be loaded or contains no
   * posts, and with a client error when the group is not public.
   */
  private function collectGroupData() {
      $uri = $this->getURI();

      // Forward the viewer's language preference to Facebook
      $headers = array('Accept-Language: ' . getenv('HTTP_ACCEPT_LANGUAGE') . "\r\n");

      $html = getSimpleHTMLDOM($uri, $headers);
      if(!$html) {
          returnServerError('Failed loading facebook page: ' . $uri);
      }

      if(!$this->isPublicGroup($html)) {
          returnClientError('This group is not public! RSS-Bridge only supports public groups!');
      }

      // Base for relative links: the bridge URI without its trailing slash
      defaultLinkTo($html, substr(self::URI, 0, -1));

      $this->groupName = $this->extractGroupName($html);

      $wrappers = $html->find('div.userContentWrapper');
      if(!$wrappers) {
          returnServerError('Failed finding posts!');
      }

      foreach($wrappers as $wrapper) {
          // Elements are evaluated top to bottom, i.e. in the same order the
          // individual extractors were called before
          $this->items[] = array(
              'uri' => $this->extractGroupURI($wrapper),
              'title' => $this->extractGroupTitle($wrapper),
              'author' => $this->extractGroupAuthor($wrapper),
              'content' => $this->extractGroupContent($wrapper),
              'timestamp' => $this->extractGroupTimestamp($wrapper),
              'enclosures' => $this->extractGroupEnclosures($wrapper)
          );
      }
  }
  /**
   * Reduces the user supplied group parameter to the bare group identifier.
   *
   * Accepts either a group URL on the bridge host (with or without the
   * leading 'www.'), in which case the third path segment is returned
   * (e.g. '/groups/<identifier>'), or a plain identifier without slashes,
   * which is returned unchanged.
   *
   * Reports a client error for URLs on a foreign host, URLs whose path has no
   * third segment and non-URL values that contain a slash.
   *
   * @param string $group Group name or group URL
   * @return string Group identifier
   */
  private function sanitizeGroup($group) {
      // FILTER_VALIDATE_URL always requires a host. The separate
      // FILTER_FLAG_HOST_REQUIRED flag was deprecated in PHP 7.3 and removed
      // in PHP 8.0, so only the path requirement is requested explicitly.
      if(filter_var($group, FILTER_VALIDATE_URL, FILTER_FLAG_PATH_REQUIRED)) {
          // User provided a URL
          $urlparts = parse_url($group);
          $expectedHost = parse_url(self::URI, PHP_URL_HOST);

          if($urlparts['host'] !== $expectedHost
          && 'www.' . $urlparts['host'] !== $expectedHost) {
              returnClientError('The host you provided is invalid! Received "'
              . $urlparts['host']
              . '", expected "'
              . $expectedHost
              . '"!');
          }

          // The path starts with '/', so index 0 is empty, index 1 is the
          // first segment and index 2 the identifier we are looking for
          $segments = explode('/', $urlparts['path']);

          if(!isset($segments[2]) || $segments[2] === '') {
              returnClientError('The group you provided is invalid: ' . $group);
          }

          return $segments[2];
      }

      if(strpos($group, '/') !== false) {
          returnClientError('The group you provided is invalid: ' . $group);
      }

      return $group;
  }
  /**
   * Tells whether the loaded group page belongs to a public group.
   *
   * Facebook redirects to the group's about page for non-public groups, so
   * the presence of the '#pagelet_group_about' element marks a non-public one.
   *
   * @param object $html Parsed group page
   * @return bool True if the about pagelet is absent
   */
  private function isPublicGroup($html) {
      return $html->find('#pagelet_group_about', 0) ? false : true;
  }
  /**
   * Reads the group name from the page's og:title meta tag.
   *
   * Reports a server error when the meta tag is missing.
   *
   * @param object $html Parsed group page
   * @return string Group name with HTML entities (including quotes) decoded
   */
  private function extractGroupName($html) {
      $meta = $html->find('meta[property="og:title"]', 0);

      if(!$meta) {
          returnServerError('Unable to find group title!');
      }

      return htmlspecialchars_decode($meta->content, ENT_QUOTES);
  }
  /**
   * Removes the query string (everything from the first '?') from a URI.
   *
   * strtok() is deliberately not used: it skips leading delimiters, so a
   * value such as '?a=b' would come back as 'a=b', and it overwrites the
   * tokenizer state shared with any other strtok() caller.
   *
   * @param string $uri URI with or without query string
   * @return string URI without query string; values that are not strings or
   * are empty are returned unchanged
   */
  private function uri_strip_args($uri) {
      if(!is_string($uri) || $uri === '') {
          return $uri;
      }

      $queryStart = strpos($uri, '?');

      if($queryStart === false) {
          return $uri;
      }

      return (string)substr($uri, 0, $queryStart);
  }
  /**
   * Finds the permalink of a group post.
   *
   * The first anchor whose href contains 'permalink' wins; its query string
   * is removed. Reports a server error when the post contains no anchors.
   *
   * @param object $post Post element
   * @return string|null Permalink, or null if no anchor qualifies
   */
  private function extractGroupURI($post) {
      $anchors = $post->find('a');

      if(!$anchors) {
          returnServerError('Unable to find URI!');
      }

      foreach($anchors as $candidate) {
          if(strpos($candidate->href, 'permalink') === false) {
              continue;
          }

          return $this->uri_strip_args($candidate->href);
      }

      return null;
  }
  /**
   * Returns the HTML of a group post: the inner HTML of 'div.userContent'
   * followed by the inner HTML of its next sibling, when there is one.
   *
   * Reports a server error when the post has no 'div.userContent' element.
   *
   * @param object $post Post element
   * @return string Post content as HTML
   */
  private function extractGroupContent($post) {
      $content = $post->find('div.userContent', 0)
          or returnServerError('Unable to find user content!');

      $html = $content->innertext;

      // The sibling is missing when the user content is the last element of
      // its parent; reading a property of null must be avoided in that case
      $sibling = $content->next_sibling();

      if($sibling) {
          $html .= $sibling->innertext;
      }

      return $html;
  }
  /**
   * Reads the value of the 'data-utime' attribute from the first
   * 'abbr[data-utime]' element of a group post.
   *
   * Reports a server error when no such element exists.
   *
   * @param object $post Post element
   * @return string Value of the 'data-utime' attribute
   */
  private function extractGroupTimestamp($post) {
      $abbr = $post->find('abbr[data-utime]', 0);

      if(!$abbr) {
          returnServerError('Unable to find timestamp!');
      }

      return $abbr->getAttribute('data-utime');
  }
  /**
   * Returns the 'aria-label' attribute of the first image of a group post,
   * which this bridge uses as the author name.
   *
   * Reports a server error when the post contains no image.
   *
   * @param object $post Post element
   * @return string Value of the image's 'aria-label' attribute
   */
  private function extractGroupAuthor($post) {
      $image = $post->find('img', 0);

      if(!$image) {
          returnServerError('Unable to find author information!');
      }

      return $image->{'aria-label'};
  }
  /**
   * Collects the image sources found in the element that follows
   * 'div.userContent' inside a group post.
   *
   * @param object $post Post element
   * @return array|null List of image URLs, or null when the post has no user
   * content, no following element or no usable images
   */
  private function extractGroupEnclosures($post) {
      // Both lookups may come back empty; calling a method on null would be
      // a fatal error, so each step is checked before it is used
      $content = $post->find('div.userContent', 0);
      $attachments = $content ? $content->next_sibling() : null;

      if(!$attachments) {
          return null;
      }

      $enclosures = array();

      foreach($attachments->find('img') as $image) {
          // Images without a src attribute cannot serve as enclosure
          if($image->src) {
              $enclosures[] = $image->src;
          }
      }

      return empty($enclosures) ? null : $enclosures;
  }
  /**
   * Builds the title of a group post.
   *
   * If the post's 'h5' heading contains the word 'shared', the heading text
   * is the title. Otherwise the title is '<author> posted: <summary>', where
   * the summary is the tag-free post content cut at the last word boundary
   * within 64 characters. The ellipsis is only appended when the content was
   * actually shortened.
   *
   * Reports a server error when the post has no 'h5' element.
   *
   * @param object $post Post element
   * @return string Title
   */
  private function extractGroupTitle($post) {
      $element = $post->find('h5', 0)
          or returnServerError('Unable to find title!');

      if(strpos($element->plaintext, 'shared') !== false) {
          return $element->plaintext;
      }

      $content = trim(strip_tags($this->extractGroupContent($post)));

      // wordwrap() replaces the space at the wrap position by a line break,
      // so the offset of the first line break is valid for $content as well
      $cut = strpos(wordwrap($content, 64), "\n");

      if($cut === false) {
          // Content fits into one line. Passing false on to substr() would
          // produce an empty summary, so the content is used as it is.
          $summary = $content;
      } else {
          $summary = substr($content, 0, $cut) . '...';
      }

      return $this->extractGroupAuthor($post) . ' posted: ' . $summary;
  }
  180. #endregion
  /**
   * Collects the posts of a user or page timeline.
   *
   * Steps:
   * 1. If the viewer submitted a captcha response, it is posted back to the
   *    form action remembered in the session and the answer page is used.
   * 2. Otherwise the page is requested from the 'u' input, which may be a
   *    full URL on the bridge host, a plain name or a 'name/id' pair (the
   *    latter is requested below 'pages/').
   * 3. If the page is a captcha form, the form is stored in the session, a
   *    captcha input form is sent to the viewer and the script ends.
   * 4. Otherwise every timeline cell is converted into a feed item, honouring
   *    the 'media_type' and 'skip_reviews' inputs.
   *
   * Reports a client error for invalid input and a server error when the
   * page cannot be loaded or requires a login.
   */
  private function collectUserData(){

      // Bridge URI without trailing slash, used to prefix links that start with '/'
      $baseUri = rtrim(self::URI, '/');

      // Extracts the part of $string between $start and $end.
      // This is a closure on purpose: a named function declared inside this
      // method would be declared a second time (fatal error) if the method
      // ran twice within one request.
      $extractFromDelimiters = function($string, $start, $end){
          $from = strpos($string, $start);

          if($from === false) {
              return false;
          }

          $section = (string)substr($string, $from + strlen($start));
          $to = strpos($section, $end);

          // Without closing delimiter everything after $start is the result
          return $to === false ? $section : substr($section, 0, $to);
      };

      // Utility function for cleaning a Facebook link: makes relative links
      // absolute and bypasses the external link redirection (l.php?u=...)
      $unescape_fb_link = function($matches) use ($extractFromDelimiters, $baseUri){
          if(!is_array($matches) || count($matches) < 2) {
              return '';
          }

          $link = $matches[1];

          if(strpos($link, '/') === 0) {
              // self::URI already ends with '/', use the trimmed base to
              // avoid a double slash
              $link = $baseUri . $link;
          }

          if(strpos($link, 'facebook.com/l.php?u=') !== false) {
              $link = urldecode($extractFromDelimiters($link, 'facebook.com/l.php?u=', '&'));
          }

          return ' href="' . $link . '"';
      };

      // Utility function for converting facebook emoticons
      $unescape_fb_emote = function($matches){
          static $facebook_emoticons = array(
              'smile' => ':)',
              'frown' => ':(',
              'tongue' => ':P',
              'grin' => ':D',
              'gasp' => ':O',
              'wink' => ';)',
              'pacman' => ':<',
              'grumpy' => '>_<',
              'unsure' => ':/',
              'cry' => ':\'(',
              'kiki' => '^_^',
              'glasses' => '8-)',
              'sunglasses' => 'B-)',
              'heart' => '<3',
              'devil' => ']:D',
              'angel' => '0:)',
              'squint' => '-_-',
              'confused' => 'o_O',
              'upset' => 'xD',
              'colonthree' => ':3',
              'like' => '&#x1F44D;');

          // The first captured word that names a known emoticon decides
          $len = count($matches);

          for($i = 1; $i < $len; $i++) {
              if(isset($facebook_emoticons[$matches[$i]])) {
                  return $facebook_emoticons[$matches[$i]];
              }
          }

          return $matches[0];
      };

      // Shortens $text to the last word boundary within $width characters and
      // appends an ellipsis. Texts that fit are returned unchanged.
      $truncate = function($text, $width){
          if(strlen($text) <= $width) {
              return $text;
          }

          $cut = strpos(wordwrap($text, $width), "\n");

          if($cut === false) {
              // No word boundary at all: cut hard instead of returning an
              // empty string
              $cut = $width;
          }

          return substr($text, 0, $cut) . '...';
      };

      $html = null;

      //Handle captcha response sent by the viewer
      if(isset($_POST['captcha_response'])) {
          if(session_status() == PHP_SESSION_NONE) {
              session_start();
          }

          if(isset($_SESSION['captcha_fields'], $_SESSION['captcha_action'])) {
              $captcha_action = $_SESSION['captcha_action'];
              $captcha_fields = $_SESSION['captcha_fields'];
              $captcha_fields['captcha_response'] = preg_replace(
                  '/[^a-zA-Z0-9]+/',
                  '',
                  $_POST['captcha_response']
              );

              // Header name and value have to share one line; a raw line
              // break between them makes the header invalid
              $header = array(
                  "Content-type: application/x-www-form-urlencoded\r\n"
                  . "Referer: $captcha_action\r\n"
                  . "Cookie: noscript=1\r\n"
              );

              $opts = array(
                  CURLOPT_POST => 1,
                  CURLOPT_POSTFIELDS => http_build_query($captcha_fields)
              );

              $response = getContents($captcha_action, $header, $opts);

              if($response === false) {
                  returnServerError('Failed to submit captcha response back to Facebook');
              }

              // If the response cannot be parsed, fall back to a regular
              // page request below
              $html = str_get_html($response) ?: null;
          }

          unset($_SESSION['captcha_fields']);
          unset($_SESSION['captcha_action']);
      }

      //Retrieve page contents
      if(is_null($html)) {
          $header = array('Accept-Language: ' . getenv('HTTP_ACCEPT_LANGUAGE') . "\r\n");
          $username = $this->getInput('u');

          // Check if the user provided a fully qualified URL
          if(filter_var($username, FILTER_VALIDATE_URL)) {
              $urlparts = parse_url($username);
              $expectedHost = parse_url(self::URI, PHP_URL_HOST);
              $host = isset($urlparts['host']) ? $urlparts['host'] : '';

              // Same rule as for group URLs: the host is accepted with or
              // without the leading 'www.'
              if($host !== $expectedHost
              && 'www.' . $host !== $expectedHost) {
                  returnClientError('The host you provided is invalid! Received "'
                  . $host
                  . '", expected "'
                  . $expectedHost
                  . '"!');
              }

              if(!array_key_exists('path', $urlparts)
              || $urlparts['path'] === '/') {
                  returnClientError('The URL you provided doesn\'t contain the user name!');
              }

              $user = explode('/', $urlparts['path'])[1];

              $html = getSimpleHTMLDOM(self::URI . urlencode($user) . '?_fb_noscript=1', $header)
                  or returnServerError('No results for this query.');
          } else {
              // First character cannot be a forward slash
              if(strpos($username, '/') === 0) {
                  returnClientError('Remove leading slash "/" from the username!');
              }

              if(strpos($username, '/') === false) {
                  $html = getSimpleHTMLDOM(self::URI . urlencode($username) . '?_fb_noscript=1', $header)
                      or returnServerError('No results for this query.');
              } else {
                  $html = getSimpleHTMLDOM(self::URI . 'pages/' . $username . '?_fb_noscript=1', $header)
                      or returnServerError('No results for this query.');
              }
          }
      }

      //Handle captcha form?
      $captcha = $html->find('div.captcha_interstitial', 0);

      if(!is_null($captcha)) {
          $form = $captcha->find('form', 0);
          $image = $captcha->find('img', 0);

          if(!$form || !$image) {
              returnServerError('Facebook requested a captcha, but the captcha form is incomplete.');
          }

          //Save form for submitting after getting captcha response
          if(session_status() == PHP_SESSION_NONE) {
              session_start();
          }

          $captcha_fields = array();

          foreach($captcha->find('input, button') as $input) {
              $captcha_fields[$input->name] = $input->value;
          }

          $_SESSION['captcha_fields'] = $captcha_fields;
          $_SESSION['captcha_action'] = $form->action;

          //Show captcha filling form to the viewer, proxying the captcha image
          $img = base64_encode(getContents($image->src));

          // The query string is controlled by the viewer and ends up inside
          // an HTML attribute, so it has to be escaped
          $query = htmlspecialchars(
              isset($_SERVER['QUERY_STRING']) ? $_SERVER['QUERY_STRING'] : '',
              ENT_QUOTES
          );

          http_response_code(500);
          header('Content-Type: text/html');

          $message = '<form method="post" action="?' . $query . '">' . "\n"
              . '<h2>Facebook captcha challenge</h2>' . "\n"
              . '<p>Unfortunately, rss-bridge cannot fetch the requested page.<br />' . "\n"
              . 'Facebook wants rss-bridge to resolve the following captcha:</p>' . "\n"
              . '<p><img src="data:image/png;base64,' . $img . '" /></p>' . "\n"
              . '<p><b>Response:</b> <input name="captcha_response" placeholder="please fill in" />' . "\n"
              . '<input type="submit" value="Submit!" /></p>' . "\n"
              . '</form>';

          die($message);
      }

      //No captcha? We can carry on retrieving page contents :)

      //First, we check wether the page is public or not
      if($html->find('._585r', 0)) {
          returnServerError('You must be logged in to view this page. This is not supported by RSS-Bridge.');
      }

      // Walk from the timeline column down to the element whose children are
      // the timeline cells. Every step is checked, so unexpected markup
      // results in an empty feed rather than a fatal error.
      $element = $html->find('#pagelet_timeline_main_column', 0);
      $path = array('children', 'children', 'children', 'next_sibling', 'children');

      foreach($path as $step) {
          if(!$element) {
              break;
          }

          $element = ($step === 'children') ? $element->children(0) : $element->next_sibling();
      }

      if(!$element) {
          return;
      }

      $pageTitle = $html->find('title#pageTitle', 0);
      $author = $pageTitle ? str_replace(' | Facebook', '', $pageTitle->innertext) : '';

      $this->authorName = $author;

      // Inputs do not change while iterating
      $media_type = $this->getInput('media_type');
      $skip_reviews = $this->getInput('skip_reviews');

      // Attributes that are removed from the remaining tags
      $properties = array(
          'onmouseover',
          'onclick',
          'target',
          'ajaxify',
          'tabindex',
          'class',
          'style',
          'data-[^=]*',
          'aria-[^=]*',
          'role',
          'rel',
          'id'
      );

      foreach($element->children() as $cell) {

          // Manage summary posts
          if(strpos($cell->class, '_3xaf') !== false) {
              $posts = $cell->children();
          } else {
              $posts = array($cell);
          }

          // Optionally skip reviews
          if($skip_reviews
          && !is_null($cell->find('#review_composer_container', 0))) {
              continue;
          }

          foreach($posts as $post) {

              // Check media type
              if($media_type === 'video' || $media_type === 'novideo') {
                  $videos = $post->find('[aria-label=Video]');
                  $hasVideo = !empty($videos);

                  if($hasVideo !== ($media_type === 'video')) {
                      continue;
                  }
              }

              // Posts without an abbr element are ignored (the element
              // provides both the date and the link of the post)
              $abbr = $post->find('abbr', 0);

              if(!$abbr) {
                  continue;
              }

              //Retrieve post contents
              $content = preg_replace(
                  '/(?i)><div class=\"clearfix([^>]+)>(.+?)div\ class=\"userContent\"/i',
                  '',
                  $post);

              $content = preg_replace(
                  '/(?i)><div class=\"_59tj([^>]+)>(.+?)<\/div><\/div><a/i',
                  '',
                  $content);

              $content = preg_replace(
                  '/(?i)><div class=\"_3dp([^>]+)>(.+?)div\ class=\"[^u]+userContent\"/i',
                  '',
                  $content);

              $content = preg_replace(
                  '/(?i)><div class=\"_4l5([^>]+)>(.+?)<\/div>/i',
                  '',
                  $content);

              //Remove html nodes, keep only img, links, basic formatting
              $content = strip_tags($content, '<a><img><i><u><br><p>');

              //Adapt link hrefs: convert relative links into absolute links and bypass external link redirection
              $content = preg_replace_callback('/ href=\"([^"]+)\"/i', $unescape_fb_link, $content);

              //Clean useless html tag properties and fix link closing tags
              foreach($properties as $property_name) {
                  $content = preg_replace('/ ' . $property_name . '=\"[^"]*\"/i', '', $content);
              }

              $content = preg_replace('/<\/a [^>]+>/i', '</a>', $content);

              //Convert textual representation of emoticons eg
              //"<i><u>smile emoticon</u></i>" back to ASCII emoticons eg ":)"
              $content = preg_replace_callback(
                  '/<i><u>([^ <>]+) ([^<>]+)<\/u><\/i>/i',
                  $unescape_fb_emote,
                  $content
              );

              //Retrieve date of the post
              if($abbr->hasAttribute('data-utime')) {
                  $date = $abbr->getAttribute('data-utime');
              } else {
                  $date = 0;
              }

              //Build title from username and content
              $title = $truncate($author, 24);
              $title = $truncate($title . ' | ' . strip_tags($content), 64);

              //Build the link of the post from the anchor around the date
              $anchor = $abbr->parent();
              $href = $anchor ? (string)$anchor->getAttribute('href') : '';

              if(strpos($href, '/') === 0) {
                  // Relative to the host; self::URI already ends with '/'
                  $uri = $baseUri . $href;
              } elseif(parse_url($href, PHP_URL_SCHEME)) {
                  // Already absolute
                  $uri = $href;
              } else {
                  $uri = self::URI . $href;
              }

              //Build and add final item
              $this->items[] = array(
                  'uri' => $this->uri_strip_args(htmlspecialchars_decode($uri)),
                  'content' => htmlspecialchars_decode($content),
                  'title' => $title,
                  'author' => $author,
                  'timestamp' => $date
              );
          }
      }
  }
  /**
   * Returns the name of the feed.
   *
   * - 'User' context with a known author name: the name stored in
   *   extraInfos, if there is one, is returned as it is; otherwise the author
   *   name followed by ' - Facebook Bridge'.
   * - 'Group' context with a known group name: the group name followed by
   *   ' - Facebook Bridge'.
   * - In every other case the name provided by the parent class.
   *
   * @return string
   */
  public function getName(){
      $context = $this->queriedContext;

      if($context === 'User' && !empty($this->authorName)) {
          // The suffix belongs to the author name only, a name taken from
          // extraInfos is passed through untouched
          if(isset($this->extraInfos['name'])) {
              return $this->extraInfos['name'];
          }

          return $this->authorName . ' - Facebook Bridge';
      }

      if($context === 'Group' && !empty($this->groupName)) {
          return $this->groupName . ' - Facebook Bridge';
      }

      return parent::getName();
  }
  452. }