1
0

FacebookBridge.php 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575
  1. <?php
  2. class FacebookBridge extends BridgeAbstract {
  // Bridge metadata
  const MAINTAINER = 'teromene, logmanoriginal';
  const NAME = 'Facebook';
  const URI = 'https://www.facebook.com/';
  const CACHE_TIMEOUT = 300; // 5min
  // The description spans two source lines; the line break is part of the string
  const DESCRIPTION = 'Input a page title or a profile log. For a profile log,
please insert the parameter as follow : myExamplePage/132621766841117';

  // Input parameters, grouped by context ('User' and 'Group')
  const PARAMETERS = array(
    'User' => array(
      'u' => array(
        'name' => 'Username',
        'required' => true
      ),
      'media_type' => array(
        'name' => 'Media type',
        'type' => 'list',
        'required' => false,
        'values' => array(
          'All' => 'all',
          'Video' => 'video',
          'No Video' => 'novideo'
        ),
        'defaultValue' => 'all'
      ),
      'skip_reviews' => array(
        'name' => 'Skip reviews',
        'type' => 'checkbox',
        'required' => false,
        'defaultValue' => false,
        // collectUserData() drops review cells when this box is checked
        'title' => 'Feed includes reviews when unchecked'
      )
    ),
    'Group' => array(
      'g' => array(
        'name' => 'Group',
        'type' => 'text',
        'required' => true,
        'exampleValue' => 'https://www.facebook.com/groups/743149642484225',
        'title' => 'Insert group name or facebook group URL'
      )
    )
  );

  // Page author, set by collectUserData() and used by getName()
  private $authorName = '';
  // Group title, set by collectGroupData() and used by getName()
  private $groupName = '';
  /**
   * Builds the URI for the current context.
   *
   * Starts from the bridge base URI, appends "groups/<group>" when the
   * queried context is 'Group', and always appends the "_fb_noscript=1"
   * query parameter. Any other context yields the base URI plus that
   * parameter.
   *
   * @return string
   */
  public function getURI() {
    $target = self::URI;

    // Only the group context contributes a path segment
    if($this->queriedContext == 'Group') {
      $rawGroup = filter_var($this->getInput('g'), FILTER_SANITIZE_URL);
      $target = $target . 'groups/' . $this->sanitizeGroup($rawGroup);
    }

    return $target . '?_fb_noscript=1';
  }
  /**
   * Entry point for data collection: dispatches to the collector that
   * matches the queried context and reports a client error for any
   * context other than 'Group' or 'User'.
   */
  public function collectData() {
    $context = $this->queriedContext;

    if($context == 'Group') {
      $this->collectGroupData();
    } elseif($context == 'User') {
      $this->collectUserData();
    } else {
      returnClientError('Unknown context: "' . $this->queriedContext . '"!');
    }
  }
  67. #region Group
  /**
   * Loads the group page returned by getURI() and turns every
   * "div.userContentWrapper" element into a feed item.
   *
   * Reports a server error when the page cannot be loaded or contains no
   * posts, and a client error when the group is not public.
   */
  private function collectGroupData() {
    // Forward the language preference found in the environment
    $language = getenv('HTTP_ACCEPT_LANGUAGE');
    $header = array('Accept-Language: ' . $language . "\r\n");

    $html = getSimpleHTMLDOM($this->getURI(), $header);
    if(!$html) {
      returnServerError('Failed loading facebook page: ' . $this->getURI());
    }

    if(!$this->isPublicGroup($html)) {
      returnClientError('This group is not public! RSS-Bridge only supports public groups!');
    }

    // Base for link resolution is the bridge URI without its trailing slash
    defaultLinkTo($html, substr(self::URI, 0, -1));

    $this->groupName = $this->extractGroupName($html);

    $posts = $html->find('div.userContentWrapper');
    if(!$posts) {
      returnServerError('Failed finding posts!');
    }

    foreach($posts as $post) {
      $this->items[] = array(
        'uri' => $this->extractGroupURI($post),
        'title' => $this->extractGroupTitle($post),
        'author' => $this->extractGroupAuthor($post),
        'content' => $this->extractGroupContent($post),
        'timestamp' => $this->extractGroupTimestamp($post),
        'enclosures' => $this->extractGroupEnclosures($post)
      );
    }
  }
  /**
   * Reduces the user supplied group parameter to the bare group identifier.
   *
   * Accepts either a full group URL (the third path segment is returned,
   * e.g. "743149642484225" for ".../groups/743149642484225") or a plain
   * group name, which is returned unchanged.
   *
   * Reports a client error when the URL points to a foreign host, when the
   * URL path carries no group segment, or when a non-URL value contains a
   * slash.
   *
   * @param string $group Group name or group URL
   * @return string|null Group identifier
   */
  private function sanitizeGroup($group) {
    // FILTER_VALIDATE_URL always requires a host; FILTER_FLAG_HOST_REQUIRED
    // is deprecated since PHP 7.3 and no longer exists in PHP 8
    if(filter_var($group, FILTER_VALIDATE_URL, FILTER_FLAG_PATH_REQUIRED)) {
      // User provided a URL
      $urlparts = parse_url($group);
      $expectedHost = parse_url(self::URI)['host'];

      // The host is accepted with or without the leading "www."
      if($urlparts['host'] !== $expectedHost
      && 'www.' . $urlparts['host'] !== $expectedHost) {
        returnClientError('The host you provided is invalid! Received "'
        . $urlparts['host']
        . '", expected "'
        . $expectedHost
        . '"!');
      }

      // Path layout is "/groups/<group>[/...]", so index 2 holds the group
      $segments = explode('/', $urlparts['path']);

      if(!isset($segments[2]) || $segments[2] === '') {
        returnClientError('The group you provided is invalid: ' . $group);
      } else {
        return $segments[2];
      }
    } elseif(strpos($group, '/') !== false) {
      returnClientError('The group you provided is invalid: ' . $group);
    } else {
      return $group;
    }
  }
  /**
   * Tells whether the loaded group page belongs to a public group.
   *
   * The page counts as public when it has no "#pagelet_group_about"
   * element.
   *
   * @param object $html Parsed group page
   * @return bool
   */
  private function isPublicGroup($html) {
    // Facebook redirects to the groups about page for non-public groups
    $aboutPagelet = $html->find('#pagelet_group_about', 0);

    return $aboutPagelet ? false : true;
  }
  /**
   * Reads the group name from the page's og:title meta tag and decodes
   * HTML entities (including quotes) in it.
   *
   * Reports a server error when the meta tag is missing.
   *
   * @param object $html Parsed group page
   * @return string
   */
  private function extractGroupName($html) {
    $meta = $html->find('meta[property="og:title"]', 0);

    if(!$meta) {
      returnServerError('Unable to find group title!');
    }

    return htmlspecialchars_decode($meta->content, ENT_QUOTES);
  }
  /**
   * Returns the href of the first anchor in the post whose href contains
   * "permalink", or null when no anchor qualifies.
   *
   * Reports a server error when the post contains no anchors at all.
   *
   * @param object $post Post element
   * @return string|null
   */
  private function extractGroupURI($post) {
    $anchors = $post->find('a');

    if(!$anchors) {
      returnServerError('Unable to find URI!');
    }

    foreach($anchors as $candidate) {
      $href = $candidate->href;

      // Skip everything that is not a permalink
      if(strpos($href, 'permalink') === false) {
        continue;
      }

      return $href;
    }

    return null;
  }
  /**
   * Returns the HTML of the post: the inner HTML of "div.userContent"
   * followed by the inner HTML of its next sibling, when one exists.
   *
   * Reports a server error when the post has no "div.userContent".
   *
   * @param object $post Post element
   * @return string
   */
  private function extractGroupContent($post) {
    $content = $post->find('div.userContent', 0)
      or returnServerError('Unable to find user content!');

    // A post without a following sibling contributes only its own content
    $sibling = $content->next_sibling();

    if(is_null($sibling)) {
      return $content->innertext;
    }

    return $content->innertext . $sibling->innertext;
  }
  /**
   * Returns the "data-utime" attribute of the first abbr element that
   * carries one.
   *
   * Reports a server error when no such element exists.
   *
   * @param object $post Post element
   * @return mixed Attribute value as delivered by the DOM element
   */
  private function extractGroupTimestamp($post) {
    $abbr = $post->find('abbr[data-utime]', 0);

    if(!$abbr) {
      returnServerError('Unable to find timestamp!');
    }

    return $abbr->getAttribute('data-utime');
  }
  /**
   * Returns the author of the post, taken from the "aria-label" attribute
   * of the first image in the post.
   *
   * Reports a server error when the post contains no image.
   *
   * @param object $post Post element
   * @return mixed Attribute value as delivered by the DOM element
   */
  private function extractGroupAuthor($post) {
    $image = $post->find('img', 0);

    if(!$image) {
      returnServerError('Unable to find author information!');
    }

    return $image->{'aria-label'};
  }
  /**
   * Collects the "src" of every image found in the element that follows
   * "div.userContent".
   *
   * @param object $post Post element
   * @return array|null List of image sources, or null when the post has no
   * user content, no following element or no images
   */
  private function extractGroupEnclosures($post) {
    $content = $post->find('div.userContent', 0);

    // Calling a method on a missing element would abort with a fatal error
    $attachments = is_null($content) ? null : $content->next_sibling();

    if(is_null($attachments)) {
      return null;
    }

    $enclosures = array();

    foreach($attachments->find('img') as $enclosure) {
      $enclosures[] = $enclosure->src;
    }

    return empty($enclosures) ? null : $enclosures;
  }
  /**
   * Builds the item title for a group post.
   *
   * When the h5 heading contains the word "shared" its plain text is the
   * title. Otherwise the title is "<author> posted: <summary>", where the
   * summary is the tag-stripped post content cut at the first line break
   * produced by wrapping at 64 characters and followed by "...". Content
   * that fits on a single line is used in full without the ellipsis.
   *
   * Reports a server error when the post has no h5 heading.
   *
   * @param object $post Post element
   * @return string
   */
  private function extractGroupTitle($post) {
    $element = $post->find('h5', 0)
      or returnServerError('Unable to find title!');

    if(strpos($element->plaintext, 'shared') === false) {
      $content = strip_tags($this->extractGroupContent($post));

      // strpos() yields false for content without a line break; feeding
      // that into substr() would produce an empty summary
      $cut = strpos(wordwrap($content, 64), "\n");

      if($cut === false) {
        $summary = $content;
      } else {
        $summary = substr($content, 0, $cut) . '...';
      }

      return $this->extractGroupAuthor($post)
      . ' posted: '
      . $summary;
    }

    return $element->plaintext;
  }
  172. #endregion
  /**
   * Collects the posts of a user or page timeline.
   *
   * Steps:
   * 1. If the viewer submitted a captcha response, replay the captcha form
   *    stored in the session and use the returned page.
   * 2. Otherwise load the page for input 'u' (full URL, plain user name or
   *    "name/id" for pages).
   * 3. If the page is a captcha challenge, store the form in the session,
   *    print a form asking the viewer for the answer and stop the script.
   * 4. Report a server error when the page shows the login form.
   * 5. Walk the timeline column and build one item per post that carries
   *    an abbr element, honouring the 'media_type' and 'skip_reviews'
   *    inputs.
   *
   * A page without the expected timeline structure yields no items.
   */
  private function collectUserData(){

    // Extract a string using start and end delimiters. Returns false when
    // the start delimiter is missing; without an end delimiter the rest of
    // the string is returned. A closure is used because a named function
    // declared here would be redeclared on a second call.
    $extractFromDelimiters = function($string, $start, $end){
      $offset = strpos($string, $start);

      if($offset === false) {
        return false;
      }

      $section = substr($string, $offset + strlen($start));
      $length = strpos($section, $end);

      return $length === false ? $section : substr($section, 0, $length);
    };

    // Utility function for cleaning a Facebook link
    $unescape_fb_link = function($matches) use ($extractFromDelimiters){
      if(is_array($matches) && count($matches) > 1) {
        $link = $matches[1];
        if(strpos($link, '/') === 0)
          $link = self::URI . $link;
        if(strpos($link, 'facebook.com/l.php?u=') !== false)
          $link = urldecode($extractFromDelimiters($link, 'facebook.com/l.php?u=', '&'));
        return ' href="' . $link . '"';
      }
    };

    // Utility function for converting facebook emoticons
    $unescape_fb_emote = function($matches){
      static $facebook_emoticons = array(
        'smile' => ':)',
        'frown' => ':(',
        'tongue' => ':P',
        'grin' => ':D',
        'gasp' => ':O',
        'wink' => ';)',
        'pacman' => ':<',
        'grumpy' => '>_<',
        'unsure' => ':/',
        'cry' => ':\'(',
        'kiki' => '^_^',
        'glasses' => '8-)',
        'sunglasses' => 'B-)',
        'heart' => '<3',
        'devil' => ']:D',
        'angel' => '0:)',
        'squint' => '-_-',
        'confused' => 'o_O',
        'upset' => 'xD',
        'colonthree' => ':3',
        'like' => '&#x1F44D;');

      // The first captured group that names a known emoticon wins
      $len = count($matches);
      for($i = 1; $i < $len; $i++) {
        if(isset($facebook_emoticons[$matches[$i]])) {
          return $facebook_emoticons[$matches[$i]];
        }
      }

      return $matches[0];
    };

    // Shortens text to the first line of its word-wrapped form and appends
    // an ellipsis. Text that fits, or that has no break point, is returned
    // unchanged instead of collapsing to a bare "...".
    $truncate = function($text, $width){
      if(strlen($text) <= $width) {
        return $text;
      }

      $cut = strpos(wordwrap($text, $width), "\n");

      return $cut === false ? $text : substr($text, 0, $cut) . '...';
    };

    $html = null;

    // Handle captcha response sent by the viewer
    if(isset($_POST['captcha_response'])) {
      if(session_status() == PHP_SESSION_NONE)
        session_start();

      if(isset($_SESSION['captcha_fields'], $_SESSION['captcha_action'])) {
        $captcha_action = $_SESSION['captcha_action'];
        $captcha_fields = $_SESSION['captcha_fields'];
        $captcha_fields['captcha_response'] = preg_replace('/[^a-zA-Z0-9]+/', '', $_POST['captcha_response']);

        // Built by concatenation so that no raw line break ends up inside
        // the Content-type header
        $header = array('Content-type: application/x-www-form-urlencoded'
          . "\r\nReferer: $captcha_action\r\nCookie: noscript=1\r\n");

        $opts = array(
          CURLOPT_POST => 1,
          CURLOPT_POSTFIELDS => http_build_query($captcha_fields)
        );

        $html = getContents($captcha_action, $header, $opts);

        if($html === false) {
          returnServerError('Failed to submit captcha response back to Facebook');
        }

        $html = str_get_html($html);

        if(!$html) {
          returnServerError('Failed to parse the page returned for the captcha response');
        }
      }

      // The stored form is single-use
      unset($_SESSION['captcha_fields']);
      unset($_SESSION['captcha_action']);
    }

    // Retrieve page contents
    if(is_null($html)) {
      $header = array('Accept-Language: ' . getenv('HTTP_ACCEPT_LANGUAGE') . "\r\n");
      $input = $this->getInput('u');

      // Check if the user provided a fully qualified URL
      if(filter_var($input, FILTER_VALIDATE_URL)) {
        $urlparts = parse_url($input);
        $expectedHost = parse_url(self::URI)['host'];

        // Same rule as for groups: the leading "www." is optional
        if($urlparts['host'] !== $expectedHost
        && 'www.' . $urlparts['host'] !== $expectedHost) {
          returnClientError('The host you provided is invalid! Received "'
          . $urlparts['host']
          . '", expected "'
          . $expectedHost
          . '"!');
        }

        if(!array_key_exists('path', $urlparts)
        || $urlparts['path'] === '/') {
          returnClientError('The URL you provided doesn\'t contain the user name!');
        }

        $user = explode('/', $urlparts['path'])[1];

        $html = getSimpleHTMLDOM(self::URI . urlencode($user) . '?_fb_noscript=1', $header)
          or returnServerError('No results for this query.');
      } else {
        // First character cannot be a forward slash
        if(strpos($input, '/') === 0) {
          returnClientError('Remove leading slash "/" from the username!');
        }

        if(!strpos($input, '/')) {
          $html = getSimpleHTMLDOM(self::URI . urlencode($input) . '?_fb_noscript=1', $header)
            or returnServerError('No results for this query.');
        } else {
          // "name/id" form: left unencoded because it contains a slash
          $html = getSimpleHTMLDOM(self::URI . 'pages/' . $input . '?_fb_noscript=1', $header)
            or returnServerError('No results for this query.');
        }
      }
    }

    // Handle captcha form?
    $captcha = $html->find('div.captcha_interstitial', 0);

    if(!is_null($captcha)) {
      // Save form for submitting after getting captcha response
      if(session_status() == PHP_SESSION_NONE)
        session_start();

      $captcha_fields = array();

      foreach($captcha->find('input, button') as $input)
        $captcha_fields[$input->name] = $input->value;

      $_SESSION['captcha_fields'] = $captcha_fields;
      $_SESSION['captcha_action'] = $captcha->find('form', 0)->action;

      // Show captcha filling form to the viewer, proxying the captcha image
      $img = base64_encode(getContents($captcha->find('img', 0)->src));

      // The query string is request data and must be escaped before it is
      // placed into an HTML attribute
      $query = htmlspecialchars(
        isset($_SERVER['QUERY_STRING']) ? $_SERVER['QUERY_STRING'] : '',
        ENT_QUOTES,
        'UTF-8'
      );

      http_response_code(500);
      header('Content-Type: text/html');

      $message = <<<EOD
<form method="post" action="?{$query}">
<h2>Facebook captcha challenge</h2>
<p>Unfortunately, rss-bridge cannot fetch the requested page.<br />
Facebook wants rss-bridge to resolve the following captcha:</p>
<p><img src="data:image/png;base64,{$img}" /></p>
<p><b>Response:</b> <input name="captcha_response" placeholder="please fill in" />
<input type="submit" value="Submit!" /></p>
</form>
EOD;

      die($message);
    }

    // No captcha? We can carry on retrieving page contents :)

    // First, we check whether the page is public or not
    $loginForm = $html->find('._585r', 0);

    if($loginForm != null) {
      returnServerError('You must be logged in to view this page. This is not supported by RSS-Bridge.');
    }

    // Descend to the post container: three times first child, then the
    // next sibling, then its first child. Every step is checked because a
    // method call on a missing element is a fatal error.
    $element = $html->find('#pagelet_timeline_main_column', 0);

    for($depth = 0; $depth < 3 && !is_null($element); $depth++) {
      $element = $element->children(0);
    }

    if(!is_null($element)) {
      $element = $element->next_sibling();
    }

    if(!is_null($element)) {
      $element = $element->children(0);
    }

    if(is_null($element)) {
      return;
    }

    $pageTitle = $html->find('title#pageTitle', 0);
    $author = str_replace(' | Facebook', '', is_null($pageTitle) ? '' : $pageTitle->innertext);

    $this->authorName = $author;

    foreach($element->children() as $cell) {
      // Manage summary posts
      if(strpos($cell->class, '_3xaf') !== false) {
        $posts = $cell->children();
      } else {
        $posts = array($cell);
      }

      // Optionally skip reviews
      if($this->getInput('skip_reviews')
      && !is_null($cell->find('#review_composer_container', 0))) {
        continue;
      }

      foreach($posts as $post) {
        // Check media type ("continue 2" leaves the switch and skips the post)
        switch($this->getInput('media_type')) {
          case 'all': break;
          case 'video':
            if(empty($post->find('[aria-label=Video]'))) continue 2;
            break;
          case 'novideo':
            if(!empty($post->find('[aria-label=Video]'))) continue 2;
            break;
          default: break;
        }

        // Only elements that carry an abbr element are posts
        $abbr = $post->find('abbr', 0);

        if(is_null($abbr)) {
          continue;
        }

        // Retrieve post contents ($post is used as a string here)
        $content = preg_replace(
          '/(?i)><div class=\"clearfix([^>]+)>(.+?)div\ class=\"userContent\"/i',
          '',
          $post);

        $content = preg_replace(
          '/(?i)><div class=\"_59tj([^>]+)>(.+?)<\/div><\/div><a/i',
          '',
          $content);

        $content = preg_replace(
          '/(?i)><div class=\"_3dp([^>]+)>(.+?)div\ class=\"[^u]+userContent\"/i',
          '',
          $content);

        $content = preg_replace(
          '/(?i)><div class=\"_4l5([^>]+)>(.+?)<\/div>/i',
          '',
          $content);

        // Remove html nodes, keep only img, links, basic formatting
        $content = strip_tags($content, '<a><img><i><u><br><p>');

        // Adapt link hrefs: convert relative links into absolute links and bypass external link redirection
        $content = preg_replace_callback('/ href=\"([^"]+)\"/i', $unescape_fb_link, $content);

        // Clean useless html tag properties and fix link closing tags
        foreach(array(
          'onmouseover',
          'onclick',
          'target',
          'ajaxify',
          'tabindex',
          'class',
          'style',
          'data-[^=]*',
          'aria-[^=]*',
          'role',
          'rel',
          'id') as $property_name)
            $content = preg_replace('/ ' . $property_name . '=\"[^"]*\"/i', '', $content);

        $content = preg_replace('/<\/a [^>]+>/i', '</a>', $content);

        // Convert textual representation of emoticons eg
        // "<i><u>smile emoticon</u></i>" back to ASCII emoticons eg ":)"
        $content = preg_replace_callback(
          '/<i><u>([^ <>]+) ([^<>]+)<\/u><\/i>/i',
          $unescape_fb_emote,
          $content
        );

        // Retrieve date of the post
        if($abbr->hasAttribute('data-utime')) {
          $date = $abbr->getAttribute('data-utime');
        } else {
          $date = 0;
        }

        // Build title from username and content
        $title = $truncate($author, 24);
        $title = $truncate($title . ' | ' . strip_tags($content), 64);

        $uri = self::URI . $abbr->parent()->getAttribute('href');

        // Build and add final item
        $item = array();
        $item['uri'] = htmlspecialchars_decode($uri);
        $item['content'] = htmlspecialchars_decode($content);
        $item['title'] = $title;
        $item['author'] = $author;
        $item['timestamp'] = $date;

        $this->items[] = $item;
      }
    }
  }
  /**
   * Returns the feed name for the current context.
   *
   * User context with a known author: the 'name' entry of extraInfos when
   * it is set, otherwise "<author> - Facebook Bridge". Group context with
   * a known group: "<group> - Facebook Bridge". In every other case the
   * parent implementation decides.
   *
   * @return string
   */
  public function getName(){
    $context = $this->queriedContext;

    if($context == 'User') {
      if(!empty($this->authorName)) {
        // The suffix belongs to the author fallback only
        if(isset($this->extraInfos['name'])) {
          return $this->extraInfos['name'];
        }

        return $this->authorName . ' - Facebook Bridge';
      }
    } elseif($context == 'Group') {
      if(!empty($this->groupName)) {
        return $this->groupName . ' - Facebook Bridge';
      }
    }

    return parent::getName();
  }
  444. }