diff --git a/bridges/Freenews.php b/bridges/Freenews.php
new file mode 100644
index 0000000..caaf769
--- /dev/null
+++ b/bridges/Freenews.php
@@ -0,0 +1,37 @@
+title = trim($newsItem->title);
+//        $this->message("item has for title \"".$item->title."\"");
+        if(empty($newsItem->guid)) {
+            $item->uri = $newsItem->link;
+        } else {
+            $item->uri = $newsItem->guid;
+        }
+        // now load that uri from cache
+//        $this->message("now loading page ".$item->uri);
+        $articlePage = str_get_html($this->get_cached($item->uri));
+
+        $content = $articlePage->find('.post-container', 0);
+        $item->content = $content->innertext;
+        $item->name = $articlePage->find('a[rel=author]', 0)->innertext;
+        // format should parse 2014-03-25T16:21:20Z. But, according to http://stackoverflow.com/a/10478469, it is not that simple
+        $item->timestamp = $this->RSS_2_0_time_to_timestamp($newsItem);
+        return $item;
+    }
+}
diff --git a/bridges/Gawker.php b/bridges/Gawker.php
new file mode 100644
index 0000000..f8b484f
--- /dev/null
+++ b/bridges/Gawker.php
@@ -0,0 +1,62 @@
+name = $param['site'];
+            $param['url'] = $this->toURI(strtolower($param['site']));
+        }
+//        $this->message("loading feed from ".$this->getURI());
+        parent::collectData($param);
+    }
+
+    protected function parseRSSItem($newsItem) {
+        $item = new Item();
+        $item->uri = trim($newsItem->link);
+        $item->title = trim($newsItem->title);
+        $item->timestamp = $this->RSS_2_0_time_to_timestamp($newsItem);
+//        $this->message("///////////////////////////////////////////////////////////////////////////////////////\nprocessing item ".var_export($item, true)."\n\n\nbuilt from\n\n\n".var_export($newsItem, true));
+        try {
+            // now load that uri from cache
+//            $this->message("loading page ".$item->uri);
+            $articlePage = str_get_html($this->get_cached($item->uri));
+            if(is_object($articlePage)) {
+                $content = $articlePage->find('.post-content', 0);
+                $this->defaultImageSrcTo($content, $this->getURI());
+                $vcard = $articlePage->find('.vcard', 0);
+                if(is_object($vcard)) {
+                    $authorLink = $vcard->find('a', 0);
+                    $item->name = $authorLink->innertext;
+                    // TODO use author link href to fill the feed info
+                }
+//                $this->message("item quite loaded : ".var_export($item, true));
+                // I set item content as last element, for easier var_export reading
+                $item->content = $content->innertext;
+            } else {
+                throw new Exception("cache content for ".$item->uri." is NOT a Simple DOM parser object !");
+            }
+        } catch(Exception $e) {
+            $this->message("obtaining ".$item->uri." resulted in exception ".$e->getMessage().". Deleting cached page ...");
+            // maybe file is incorrect. it should be discarded from cache (the item property is "uri", there is no "url")
+            $this->remove_from_cache($item->uri);
+            $item->content = $e->getMessage();
+        }
+        return $item;
+    }
+}
diff --git a/bridges/Les400Culs.php b/bridges/Les400Culs.php
index aea233b..e6dee73 100644
--- a/bridges/Les400Culs.php
+++ b/bridges/Les400Culs.php
@@ -5,94 +5,40 @@
  * @description La planète sexe vue par Agnès Girard via rss-bridge
  * @update 20/02/2014
  */
+require_once 'bridges/RssExpander.php';
 define("SEXE", "http://sexes.blogs.liberation.fr");
-class Les400Culs extends BridgeAbstract{
+define("RSS", "http://sexes.blogs.liberation.fr/feeds/");
+/**
+ * As it seems that Les 400 culs currently offers a full feed, we won't change its content here.
+ * But I'm ready for the day where it will ... again ... provide some truncated content
+ */
+class Les400Culs extends RssExpander{
     public function collectData(array $param){
-        $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404);
-
-        foreach($html->find('#alpha-inner') as $articles) {
-            foreach($articles->find('div.entry') as $article) {
-                $header = $article->find('h3.entry-header a', 0);
-                $content = $article->find('div.entry-content', 0);
-
-
-                $item = new Item();
-                $item->title = trim($header->innertext);
-                $item->uri = $header->href;
-                $item->name = "Agnès Girard";
-                // date is stored outside this node !
-                $dateHeader = $article->prev_sibling();
-                // http://stackoverflow.com/a/6239199/15619 (strtotime is typical amercian bullshit)
-                $item->timestamp = DateTime::createFromFormat('d/m/Y', $dateHeader->innertext)->getTimestamp();
-
-
-                $linkForMore = $content->find('p.entry-more-link a',0);
-                if($linkForMore==null) {
-                    $item->content = $content->innertext;
-                } else {
-                    $pageAddress = $linkForMore->href;
-                    $articlePage = str_get_html($this->get_cached($linkForMore->href));
-                    if($articlePage==null) {
-                        $item->content = $content->innertext."\n
".$linkForMore->outertext."
";
-                    } else {
-                        // TODO use some caching there !
-                        $fullContent = $articlePage->find('div.entry-content', 0);
-                        $item->content = $fullContent->innertext;
-                    }
-                }
-                $this->items[] = $item;
-            }
-        }
+        $param['url'] = RSS;
+        parent::collectData($param);
     }
+
+    protected function parseRSSItem($newsItem) {
+        $item = new Item();
+        $item->title = trim($newsItem->title);
+//        $this->message("browsing item ".var_export($newsItem, true));
+        if(empty($newsItem->guid)) {
+            $item->uri = $newsItem->link;
+        } else {
+            $item->uri = $newsItem->guid;
+        }
+        // now load that uri from cache
+//        $this->message("now loading page ".$item->uri);
+//        $articlePage = str_get_html($this->get_cached($item->uri));
-    public function getName(){
-        return 'Les 400 Culs';
+//        $content = $articlePage->find('.post-container', 0);
+        $item->content = $newsItem->description;
+        $item->name = $newsItem->author;
+        $item->timestamp = $this->RSS_2_0_time_to_timestamp($newsItem);
+        return $item;
     }
-
-    public function getURI(){
-        return SEXE;
-    }
     public function getCacheDuration(){
         return 7200; // 2h hours
     }
-    public function getDescription(){
-        return "La planète sexe, vue et racontée par Agnès Giard. Et par rss-bridge";
-    }
-
-    /**
-     * Maintain locally cached versions of pages to download to avoid multiple doiwnloads.
-     * A file name is generated by replacing all "/" by "_", and the file is saved below this bridge cache
-     * @param url url to cache
-     * @return content of file as string
-     */
-    public function get_cached($url) {
-        $simplified_url = str_replace(["http://", "https://", "?", "&"], ["", "", "/", "/"], $url);
-        $filename = __DIR__ . '/../cache/'."pages/".$simplified_url;
-        if (substr($filename, -1) == '/') {
-            $filename = $filename."index.html";
-        }
-        if(!file_exists($filename)) {
-            error_log("we have no local copy of ".$url." Downloading !");
-            $dir = substr($filename, 0, strrpos($filename, '/'));
-            if(!is_dir($dir)) {
-                mkdir($dir, 0777, true);
-            }
-            $this->download_remote($url, $filename);
-        }
-        return file_get_contents($filename);
-    }
-
-    public function download_remote($url , $save_path) {
-        $f = fopen( $save_path , 'w+');
-        $handle = fopen($url , "rb");
-        while (!feof($handle)) {
-            $contents = fread($handle, 8192);
-            fwrite($f , $contents);
-        }
-        fclose($handle);
-        fclose($f);
-    }
-
 }
diff --git a/bridges/RssExpander.php b/bridges/RssExpander.php
new file mode 100644
index 0000000..07268ea
--- /dev/null
+++ b/bridges/RssExpander.php
@@ -0,0 +1,79 @@
+returnError('There is no $param[\'url\'] for this RSS expander', 404);
+        }
+        // $this->message("Loading from ".$param['url']);
+        // Notice WE DO NOT use cache here on purpose : we want a fresh view of the RSS stream each time
+        $rssContent = simplexml_load_file($param['url']) or $this->returnError('Could not request '.$param['url'], 404);
+//        $this->message("loaded RSS from ".$param['url']);
+        // TODO insert RSS format detection
+        // we suppose for now, we have some RSS 2.0
+        $this->collect_RSS_2_0_data($rssContent);
+    }
+
+    private function collect_RSS_2_0_data($rssContent) {
+        $rssContent = $rssContent->channel[0];
+//        $this->message("RSS content is ===========\n".var_export($rssContent, true)."===========");
+        $this->load_RSS_2_0_feed_data($rssContent);
+        foreach($rssContent->item as $item) {
+//            $this->message("parsing item ".var_export($item, true));
+            $this->items[] = $this->parseRSSItem($item);
+        }
+    }
+
+    /**
+     * Convert the pubDate of an RSS 2.0 item to a Unix timestamp.
+     * @param $item the input rss item
+     * @return the timestamp, or false when pubDate cannot be understood
+     */
+    protected function RSS_2_0_time_to_timestamp($item) {
+        $pubDate = trim((string) $item->pubDate);
+        $date = DateTime::createFromFormat('D, d M Y H:i:s e', $pubDate);
+        if($date === false) {
+            // createFromFormat() returns false on mismatch : fall back on strtotime() rather than crash on getTimestamp()
+            return strtotime($pubDate);
+        }
+        return $date->getTimestamp();
+    }
+
+    // TODO set title, link, description, language, and so on
+    protected function load_RSS_2_0_feed_data($rssContent) {
+        $this->name = trim($rssContent->title);
+        $this->uri = trim($rssContent->link);
+        $this->description = trim($rssContent->description);
+    }
+
+    /**
+     * Method should return, from a source RSS item (an element of the feed loaded by simplexml_load_file), one of our Items objects
+     * @param $item the input rss item
+     * @return a RSS-Bridge Item, with (hopefully) the whole content
+     */
+    abstract protected function parseRSSItem($item);
+
+
+    public function getName(){
+        return $this->name;
+    }
+
+    public function getURI(){
+        return $this->uri;
+    }
+
+    public function getDescription() {
+        return $this->description;
+    }
+}
\ No newline at end of file
diff --git a/bridges/Sexactu.php b/bridges/Sexactu.php
index 20ba5f9..edc173f 100644
--- a/bridges/Sexactu.php
+++ b/bridges/Sexactu.php
@@ -1,6 +1,6 @@ lang = $param['lang'];
+        }
+        if(empty($param['category'])) {
+            $this->uri = WORLD_OF_TANKS.$this->lang.NEWS;
+        } else {
+            $this->uri = WORLD_OF_TANKS.$this->lang.NEWS.$param['category']."/";
+        }
+        $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404);
+        $this->message("loaded HTML from ".$this->getURI());
+        // customize name
+        $this->name = $html->find('title', 0)->innertext;
+        foreach($html->find('.b-imgblock_ico') as $infoLink) {
+            $this->parseLine($infoLink);
+        }
+    }
+
+    public function parseLine($infoLink) {
+        $item = new Item();
+        $item->uri = WORLD_OF_TANKS.$infoLink->href;
+        // now load that uri from cache
+//        $this->message("loading page ".$item->uri);
+        $articlePage = str_get_html($this->get_cached($item->uri));
+        $content = $articlePage->find('.l-content', 0);
+        $this->defaultImageSrcTo($content, WORLD_OF_TANKS);
+        $item->title = $content->find('h1', 0)->innertext;
+        $item->content = $content->find('.b-content', 0)->innertext;
+//        $item->name = $auteur->innertext;
+        $item->timestamp = $content->find('.b-statistic_time', 0)->getAttribute("data-timestamp");
+        $this->items[] = $item;
+    }
+
+    public function getName(){
+        return $this->name;
+    }
+
+    public function getURI(){
+        return $this->uri;
+    }
+
+    public function getCacheDuration(){
+        return 3600; // 1 hour
+    }
+    public function getDescription(){
+        return "Toutes les actualités les plus brulantes de ce simulateur de destruction d'acier.";
+    }
+}
diff --git a/index.php b/index.php
index 98e6d93..34c0509 100644
--- a/index.php
+++ b/index.php
@@ -88,9 +88,11 @@ try{
     // Data retrieval
     $bridge = Bridge::create($bridge);
-    $bridge
-        ->setCache($cache) // Comment this lign for avoid cache use
-        ->setDatas($_REQUEST);
+    // just add "disable_cache" to your query to disable caching for this request
+    if(!isset($_REQUEST["disable_cache"])) {
+        $bridge->setCache($cache);
+    }
+    $bridge->setDatas($_REQUEST);
     // Data transformation
     $format = Format::create($format);
diff --git a/lib/Bridge.php b/lib/Bridge.php
index 2dac94b..41b5e49 100644
--- a/lib/Bridge.php
+++ b/lib/Bridge.php
@@ -71,6 +71,126 @@ abstract class BridgeAbstract implements BridgeInterface{
         return $this;
     }
+
+    /**
+     * Prefix the SRC attribute of every image whose source starts with a single '/'
+     * with the given server. Other sources are left untouched.
+     * @param content node whose img elements are rewritten in place
+     * @param server text to prepend to the image source (expected without trailing '/')
+     */
+    public function defaultImageSrcTo($content, $server) {
+        foreach($content->find('img') as $image) {
+            $src = $image->src;
+            // strpos() returns false when there is no '/', and false==0 is true : compare strictly.
+            // Protocol-relative sources ("//host/path") are already absolute and are left alone.
+            if(strpos($src, '/')===0 && strpos($src, '//')!==0) {
+                $image->src = $server.$src;
+            }
+        }
+    }
+}
+
+/**
+ * Extension of BridgeAbstract allowing caching of files downloaded over http.
+ * This is specially useful for sites from Gawker or Liberation networks, which allow pages excerpts to be viewed together on index, while full pages have to be downloaded
+ * separately.
+ * This class mainly provides a get_cached method which will download the file from its remote location when no local copy exists.
+ * TODO allow file cache invalidation by touching files on access, and removing files/directories which have not been touched since ... a long time
+ * After all, rss-bridge is not respaw, isn't it ?
+ */
+abstract class HttpCachingBridgeAbstract extends BridgeAbstract {
+
+    /**
+     * Maintain locally cached versions of pages to download to avoid multiple downloads.
+     * A file name is generated by removing the scheme and replacing "?", "&" and "=" by "/", and the file is saved below this bridge cache
+     * @param url url to cache
+     * @return content of file as string (empty string when the page could not be downloaded)
+     */
+    public function get_cached($url) {
+        $simplified_url = str_replace(["http://", "https://", "?", "&", "="], ["", "", "/", "/", "/"], $url);
+        // TODO build this from the variable given to Cache
+        $pageCacheDir = __DIR__ . '/../cache/'."pages/";
+        $filename = $pageCacheDir.$simplified_url;
+        if (substr($filename, -1) == '/') {
+            $filename = $filename."index.html";
+        }
+        if(file_exists($filename)) {
+//            $this->message("loading cached file from ".$filename." for page at url ".$url);
+            // TODO touch file and its parent, and try to do neighbour deletion
+            $this->refresh_in_cache($pageCacheDir, $filename);
+        } else {
+//            $this->message("we have no local copy of ".$url." Downloading to ".$filename);
+            $dir = substr($filename, 0, strrpos($filename, '/'));
+            if(!is_dir($dir)) {
+//                $this->message("creating directories for ".$dir);
+                mkdir($dir, 0777, true);
+            }
+            $this->download_remote($url, $filename);
+        }
+        return file_exists($filename) ? file_get_contents($filename) : '';
+    }
+
+    /**
+     * Touch a cached file and every directory between it and the page cache root.
+     * @param pageCacheDir root directory of the page cache
+     * @param filename path of the cached file, built by prefixing pageCacheDir
+     */
+    private function refresh_in_cache($pageCacheDir, $filename) {
+        // dirname() never returns a trailing "/", so the root is compared without it
+        $cacheRoot = rtrim($pageCacheDir, '/');
+        $currentPath = $filename;
+        // "!$a==$b" is parsed as "(!$a)==$b" and never looped ; comparing lengths also guarantees termination
+        while(strlen($currentPath) > strlen($cacheRoot)) {
+            touch($currentPath);
+            $currentPath = dirname($currentPath);
+        }
+    }
+
+    /**
+     * Download a remote resource into a local file.
+     * The remote stream is opened first, so a failed request does not leave an empty
+     * file behind that get_cached() would then keep serving.
+     * @param url url to download
+     * @param save_path path of the local file to write
+     */
+    public function download_remote($url , $save_path) {
+        $handle = fopen($url , "rb");
+        if(!$handle) {
+            return;
+        }
+        $f = fopen( $save_path , 'w+');
+        if($f) {
+            while (!feof($handle)) {
+                $contents = fread($handle, 8192);
+                if($contents === false) {
+                    // read error : stop here instead of looping on a stream that never reaches EOF
+                    break;
+                }
+                fwrite($f , $contents);
+            }
+            fclose($f);
+        }
+        fclose($handle);
+    }
+
+    public function remove_from_cache($url) {
+        $simplified_url = str_replace(["http://", "https://", "?", "&", "="], ["", "", "/", "/", "/"], $url);
+        // TODO build this from the variable given to Cache
+        $pageCacheDir = __DIR__ . '/../cache/'."pages/";
+        $filename = realpath($pageCacheDir.$simplified_url);
+        $this->message("removing from cache \"".$filename."\" WELL, NOT REALLY");
+        // filename is NO GOOD
+//        unlink($filename);
+    }
+
+    public function message($text) {
+        $backtrace = debug_backtrace(DEBUG_BACKTRACE_IGNORE_ARGS, 3);
+        $calling = $backtrace[2];
+        $message = $calling["file"].":".$calling["line"]
+            ." class ".get_class($this)."->".$calling["function"]
+            ." - ".$text;
+        error_log($message);
+    }
 }
 
 class Bridge{