diff --git a/bridges/Freenews.php b/bridges/FreenewsBridge.php similarity index 89% rename from bridges/Freenews.php rename to bridges/FreenewsBridge.php index 04cbdaf..871d10b 100644 --- a/bridges/Freenews.php +++ b/bridges/FreenewsBridge.php @@ -1,6 +1,6 @@ title); -// $this->message("item has for title \"".$item['title']."\""); + $this->debugMessage("item has for title \"".$item['title']."\""); if(empty($newsItem->guid)) { $item['uri'] = (string) $newsItem->link; } else { $item['uri'] = (string) $newsItem->guid; } // now load that uri from cache -// $this->message("now loading page ".$item['uri']); + $this->debugMessage("now loading page ".$item['uri']); $articlePage = str_get_html($this->get_cached($item['uri'])); $content = $articlePage->find('.post-container', 0); diff --git a/bridges/Gawker.php b/bridges/GawkerBridge.php similarity index 80% rename from bridges/Gawker.php rename to bridges/GawkerBridge.php index e8c7581..ea89af3 100644 --- a/bridges/Gawker.php +++ b/bridges/GawkerBridge.php @@ -2,7 +2,7 @@ define("RSS_PREFIX", "http://feeds.gawker.com/"); define("RSS_SUFFIX", "/full"); -class Gawker extends RssExpander{ +class GawkerBridge extends RssExpander{ public function loadMetadatas() { @@ -28,7 +28,7 @@ class Gawker extends RssExpander{ $this->name = $param['site']; $url = $this->toURI(strtolower($param['site'])); } -// $this->message("loading feed from ".$this->getURI()); + $this->debugMessage("loading feed from ".$this->getURI()); parent::collectExpandableDatas($param, $url); } @@ -37,10 +37,10 @@ class Gawker extends RssExpander{ $item['uri'] = trim($newsItem->link); $item['title'] = trim($newsItem->title); $item['timestamp'] = $this->RSS_2_0_time_to_timestamp($newsItem); -// $this->message("///////////////////////////////////////////////////////////////////////////////////////\nprocessing item ".var_export($item, true)."\n\n\nbuilt from\n\n\n".var_export($newsItem, true)); + $this->debugMessage("///////////////////////////////////////////////////////////////////////////////////////\nprocessing item ".var_export($item, true)."\n\n\nbuilt from\n\n\n".var_export($newsItem, true)); try { // now load that uri from cache -// $this->message("loading page ".$item['uri']); + $this->debugMessage("loading page ".$item['uri']); $articlePage = str_get_html($this->get_cached($item['uri'])); if(is_object($articlePage)) { $content = $articlePage->find('.post-content', 0); @@ -51,14 +51,14 @@ class Gawker extends RssExpander{ $item['author'] = $authorLink->innertext; // TODO use author link href to fill the feed info } -// $this->message("item quite loaded : ".var_export($item, true)); + $this->debugMessage("item quite loaded : ".var_export($item, true)); // I set item content as last element, for easier var_export reading $item['content'] = $content->innertext; } else { throw new Exception("cache content for ".$item['uri']." is NOT a Simple DOM parser object !"); } } catch(Exception $e) { - $this->message("obtaining ".$item['uri']." resulted in exception ".$e->getMessage().". Deleting cached page ..."); + $this->debugMessage("obtaining ".$item['uri']." resulted in exception ".$e->getMessage().". Deleting cached page ..."); // maybe file is incorrect. it should be discarded from cache $this->remove_from_cache($item['url']); $item['content'] = $e->getMessage(); diff --git a/bridges/Les400Culs.php b/bridges/Les400CulsBridge.php similarity index 87% rename from bridges/Les400Culs.php rename to bridges/Les400CulsBridge.php index 49b2c1e..695f6e9 100644 --- a/bridges/Les400Culs.php +++ b/bridges/Les400CulsBridge.php @@ -2,7 +2,7 @@ define("SEXE", "http://sexes.blogs.liberation.fr"); define("SEXE_FEED", "http://sexes.blogs.liberation.fr/feeds/"); -class Les400Culs extends RssExpander{ +class Les400CulsBridge extends RssExpander{ public function loadMetadatas() { @@ -21,14 +21,14 @@ class Les400Culs extends RssExpander{ protected function parseRSSItem($newsItem) { $item = array(); $item['title'] = trim((string) $newsItem->title); -// $this->message("browsing item ".var_export($newsItem, true)); + $this->debugMessage("browsing item ".var_export($newsItem, true)); if(empty($newsItem->guid)) { $item['uri'] = (string) $newsItem->link; } else { $item['uri'] = (string) $newsItem->guid; } // now load that uri from cache -// $this->message("now loading page ".$item['uri']); + $this->debugMessage("now loading page ".$item['uri']); // $articlePage = str_get_html($this->get_cached($item['uri'])); // $content = $articlePage->find('.post-container', 0); diff --git a/bridges/TheOatMealBridge.php b/bridges/TheOatMealBridge.php index 4dfe8ba..a7bf308 100644 --- a/bridges/TheOatMealBridge.php +++ b/bridges/TheOatMealBridge.php @@ -24,10 +24,10 @@ class TheOatmealBridge extends RssExpander{ protected function collect_RSS_2_0_data($rssContent) { $rssContent->registerXPathNamespace("dc", "http://purl.org/dc/elements/1.1/"); $rssHeaderContent = $rssContent->channel[0]; -// $this->message("RSS content is ===========\n".var_export($rssHeaderContent, true)."==========="); + $this->debugMessage("RSS content is ===========\n".var_export($rssHeaderContent, true)."==========="); $this->load_RSS_2_0_feed_data($rssHeaderContent); foreach($rssContent->item as $item) { - $this->message("parsing item ".var_export($item, true)); + $this->debugMessage("parsing item ".var_export($item, true)); $this->items[] = $this->parseRSSItem($item); } } @@ -39,10 +39,10 @@ class TheOatmealBridge extends RssExpander{ $rdf = $newsItem->children($namespaces['rdf']); $item = array(); $item['title'] = trim($newsItem->title); - $this->message("browsing Oatmeal item ".var_export($newsItem, true)); + $this->debugMessage("browsing Oatmeal item ".var_export($newsItem, true)); $item['uri']=(string) $newsItem->attributes($namespaces['rdf'])->about; // now load that uri from cache - $this->message("now loading page ".$item['uri']); + $this->debugMessage("now loading page ".$item['uri']); $articlePage = str_get_html($this->get_cached($item['uri'])); $content = $articlePage->find('#comic', 0); @@ -51,10 +51,10 @@ class TheOatmealBridge extends RssExpander{ } $item['content'] = $content->innertext; - $this->message("dc content is ".var_export($dc, true)); + $this->debugMessage("dc content is ".var_export($dc, true)); $item['author'] = (string) $dc->creator; $item['timestamp'] = DateTime::createFromFormat(DateTime::ISO8601, $dc->date)->getTimestamp(); - $this->message("writtem by ".$item['author']." on ".$item['timestamp']); + $this->debugMessage("writtem by ".$item['author']." on ".$item['timestamp']); return $item; } diff --git a/bridges/WorldOfTanks.php b/bridges/WorldOfTanksBridge.php similarity index 92% rename from bridges/WorldOfTanks.php rename to bridges/WorldOfTanksBridge.php index 32e73df..b76c483 100644 --- a/bridges/WorldOfTanks.php +++ b/bridges/WorldOfTanksBridge.php @@ -1,7 +1,7 @@ uri = WORLD_OF_TANKS.$this->lang.NEWS.'pc-browser/'.$param['category']."/"; } $html = $this->getSimpleHTMLDOM($this->getURI()) or $this->returnServerError('Could not request '.$this->getURI()); - $this->message("loaded HTML from ".$this->getURI()); + $this->debugMessage("loaded HTML from ".$this->getURI()); // customize name $this->name = $html->find('title', 0)->innertext; foreach($html->find('.b-imgblock_ico') as $infoLink) { @@ -57,7 +57,7 @@ class WorldOfTanks extends HttpCachingBridgeAbstract{ $item = array(); $item['uri'] = WORLD_OF_TANKS.$infoLink->href; // now load that uri from cache -// $this->message("loading page ".$item['uri']); + $this->debugMessage("loading page ".$item['uri']); $articlePage = str_get_html($this->get_cached($item['uri'])); $content = $articlePage->find('.l-content', 0); HTMLSanitizer::defaultImageSrcTo($content, WORLD_OF_TANKS); diff --git a/lib/Bridge.php b/lib/Bridge.php index 05fe77e..41c71a9 100644 --- a/lib/Bridge.php +++ b/lib/Bridge.php @@ -1,10 +1,108 @@ IsInstantiable(); + } + + /** + * Create a new bridge object + * @param string $nameBridge Defined bridge name you want use + * @return Bridge object dedicated + */ + static public function create($nameBridge){ + if(!preg_match('@^[A-Z][a-zA-Z0-9-]*$@', $nameBridge)){ + $message = <<items; } - - /** * Defined datas with parameters depending choose bridge - * Note : you can define a cache before with "setCache" - * @param array $param $_REQUEST, $_GET, $_POST, or array with bridge expected paramters + * Note : you can define a cache with "setCache" + * @param array $param $_REQUEST, $_GET, $_POST, or array with expected + * bridge paramters */ public function setDatas(array $param){ - if( !is_null($this->cache) ){ + if(!is_null($this->cache)){ $this->cache->prepare($param); $time = $this->cache->getTime(); - } - else{ - $time = false; // No cache ? No time ! + } else { + $time = false; } - if( $time !== false && ( time() - $this->getCacheDuration() < $time ) ){ // Cache file has not expired. Serve it. + if($time !== false && (time() - $this->getCacheDuration() < $time)){ $this->items = $this->cache->loadData(); - } - else{ + } else { $this->collectData($param); - if( !is_null($this->cache) ){ // Cache defined ? We go to refresh is memory :D + if(!is_null($this->cache)){ $this->cache->saveData($this->getDatas()); } } } - /** - * Define default bridge name - */ public function getName(){ return $this->name; } - /** - * Define default bridge URI - */ public function getURI(){ return $this->uri; } - /** - * Define default duraction for cache - */ public function getCacheDuration(){ return 3600; } - /** - * Defined cache object to use - */ public function setCache(\CacheAbstract $cache){ $this->cache = $cache; - - return $this; } - public function message($text) { - if(!file_exists('DEBUG')){ - return; - } - $backtrace = debug_backtrace(DEBUG_BACKTRACE_IGNORE_ARGS, 3); - $calling = $backtrace[2]; - $message = $calling["file"].":".$calling["line"] - ." class ".get_class($this)."->".$calling["function"] - ." - ".$text; - error_log($message); - } - - protected function getContents($url,$use_include_path=false,$context=null,$offset=0,$maxlen=null){ - $contextOptions = array( - 'http' => array( - 'user_agent'=>ini_get('user_agent') - ), - ); - - if(defined('PROXY_URL') && $this->useProxy) { - $contextOptions['http']['proxy'] = PROXY_URL; - $contextOptions['http']['request_fulluri'] = true; - - if(is_null($context)){ - $context = stream_context_create($contextOptions); - } else { - $prevContext=$context; - if(!stream_context_set_option($context,$contextOptions)){ - $context=$prevContext; - }; + public function debugMessage($text){ + if(!file_exists('DEBUG')) { + return; } - } - if(is_null($maxlen)){ - $content=@file_get_contents($url, $use_include_path, $context, $offset); - }else{ - $content=@file_get_contents($url, $use_include_path, $context, $offset,$maxlen); - } + $backtrace = debug_backtrace(DEBUG_BACKTRACE_IGNORE_ARGS, 3); + $calling = $backtrace[2]; + $message = $calling['file'] . ':' + . $calling['line'] . ' class ' + . get_class($this) . '->' + . $calling['function'] . ' - ' + . $text; - if($content===false){ - $this->message('Cant\'t download '.$url ); - } - return $content; + error_log($message); } - protected function getSimpleHTMLDOM($url, $use_include_path = false, $context=null, $offset = 0, $maxLen=null, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT){ - $content=$this->getContents($url,$use_include_path,$context,$offset,$maxLen); - return str_get_html($content,$lowercase,$forceTagsClosed,$target_charset,$stripRN,$defaultBRText,$defaultSpanText); + protected function getContents($url + , $use_include_path = false + , $context = null + , $offset = 0 + , $maxlen = null){ + $contextOptions = array( + 'http' => array( + 'user_agent' => ini_get('user_agent') + ), + ); + + if(defined('PROXY_URL') && $this->useProxy){ + $contextOptions['http']['proxy'] = PROXY_URL; + $contextOptions['http']['request_fulluri'] = true; + + if(is_null($context)){ + $context = stream_context_create($contextOptions); + } else { + $prevContext=$context; + if(!stream_context_set_option($context, $contextOptions)){ + $context = $prevContext; + } + } + } + + if(is_null($maxlen)){ + $content = @file_get_contents($url, $use_include_path, $context, $offset); + } else { + $content = @file_get_contents($url, $use_include_path, $context, $offset, $maxlen); + } + + if($content === false) + $this->debugMessage('Cant\'t download ' . $url); + + return $content; } + protected function getSimpleHTMLDOM($url + , $use_include_path = false + , $context = null + , $offset = 0 + , $maxLen = null + , $lowercase = true + , $forceTagsClosed = true + , $target_charset = DEFAULT_TARGET_CHARSET + , $stripRN = true + , $defaultBRText = DEFAULT_BR_TEXT + , $defaultSpanText = DEFAULT_SPAN_TEXT){ + $content = $this->getContents($url, $use_include_path, $context, $offset, $maxLen); + return str_get_html($content + , $lowercase + , $forceTagsClosed + , $target_charset + , $stripRN + , $defaultBRText + , $defaultSpanText); + } } /** - * Extension of BridgeAbstract allowing caching of files downloaded over http files. - * This is specially useful for sites from Gawker or Liberation networks, which allow pages excerpts top be viewed together on index, while full pages have to be downloaded - * separately. - * This class mainly provides a get_cached method which will will download the file from its remote location. - * TODO allow file cache invalidation by touching files on access, and removing files/directories which have not been touched since ... a long time - * After all, rss-bridge is not respaw, isn't it ? + * Extension of BridgeAbstract allowing caching of files downloaded over http. + * TODO allow file cache invalidation by touching files on access, and removing + * files/directories which have not been touched since ... a long time */ abstract class HttpCachingBridgeAbstract extends BridgeAbstract { /** - * Maintain locally cached versions of pages to download to avoid multiple doiwnloads. - * A file name is generated by replacing all "/" by "_", and the file is saved below this bridge cache + * Maintain locally cached versions of pages to download, to avoid multiple downloads. * @param url url to cache - * @return content of file as string + * @return content of the file as string */ - public function get_cached($url) { - $simplified_url = str_replace(["http://", "https://", "?", "&", "="], ["", "", "/", "/", "/"], $url); - // TODO build this from the variable given to Cache - $pageCacheDir = __DIR__ . '/../cache/'."pages/"; - $filename = $pageCacheDir.$simplified_url; - if (substr($filename, -1) == '/') { - $filename = $filename."index.html"; - } - if(file_exists($filename)) { -// $this->message("loading cached file from ".$filename." for page at url ".$url); - // TODO touch file and its parent, and try to do neighbour deletion - $this->refresh_in_cache($pageCacheDir, $filename); - $content=file_get_contents($filename); - } else { -// $this->message("we have no local copy of ".$url." Downloading to ".$filename); - $dir = substr($filename, 0, strrpos($filename, '/')); - if(!is_dir($dir)) { -// $this->message("creating directories for ".$dir); + public function get_cached($url){ + // TODO build this from the variable given to Cache + $cacheDir = __DIR__ . '/../cache/pages/'; + $filepath = $this->buildCacheFilePath($url, $cacheDir); + + if(file_exists($filepath)){ + $this->debugMessage('loading cached file from ' . $filepath . ' for page at url ' . $url); + // TODO touch file and its parent, and try to do neighbour deletion + $this->refresh_in_cache($cacheDir, $filepath); + $content = file_get_contents($filepath); + } else { + $this->debugMessage('we have no local copy of ' . $url . ' Downloading to ' . $filepath); + $dir = substr($filepath, 0, strrpos($filepath, '/')); + + if(!is_dir($dir)){ + $this->debugMessage('creating directories for ' . $dir); mkdir($dir, 0777, true); } - $content=$this->getContents($url); - if($content!==false){ - file_put_contents($filename,$content); + + $content = $this->getContents($url); + if($content !== false){ + file_put_contents($filepath, $content); } } + return $content; } - public function get_cached_time($url) { - $simplified_url = str_replace(["http://", "https://", "?", "&", "="], ["", "", "/", "/", "/"], $url); + public function get_cached_time($url){ // TODO build this from the variable given to Cache - $pageCacheDir = __DIR__ . '/../cache/'."pages/"; - $filename = $pageCacheDir.$simplified_url; - if (substr($filename, -1) == '/') { - $filename = $filename."index.html"; - } - if(!file_exists($filename)) { + $cacheDir = __DIR__ . '/../cache/pages/'; + $filepath = $this->buildCacheFilePath($url, $cacheDir); + + if(!file_exists($filepath)){ $this->get_cached($url); } - return filectime($filename); + + return filectime($filepath); } - private function refresh_in_cache($pageCacheDir, $filename) { - $currentPath = $filename; - while(!$pageCacheDir==$currentPath) { - touch($currentPath); - $currentPath = dirname($currentPath); - } + private function refresh_in_cache($cacheDir, $filepath){ + $currentPath = $filepath; + while(!$cacheDir == $currentPath){ + touch($currentPath); + $currentPath = dirname($currentPath); + } } - public function remove_from_cache($url) { - $simplified_url = str_replace(["http://", "https://", "?", "&", "="], ["", "", "/", "/", "/"], $url); - // TODO build this from the variable given to Cache - $pageCacheDir = __DIR__ . '/../cache/'."pages/"; - $filename = realpath($pageCacheDir.$simplified_url); - $this->message("removing from cache \"".$filename."\" WELL, NOT REALLY"); - // filename is NO GOOD -// unlink($filename); + private function buildCacheFilePath($url, $cacheDir){ + $simplified_url = str_replace( + ['http://', 'https://', '?', '&', '='], + ['', '', '/', '/', '/'], + $url); + + if(substr($cacheDir, -1) !== '/'){ + $cacheDir .= '/'; + } + + $filepath = $cacheDir . $simplified_url; + + if(substr($filepath, -1) === '/'){ + $filepath .= 'index.html'; + } + + return $filepath; } + public function remove_from_cache($url){ + // TODO build this from the variable given to Cache + $cacheDir = __DIR__ . '/../cache/pages/'; + $filepath = $this->buildCacheFilePath($url, $cacheDir); + $this->debugMessage('removing from cache \'' . $filepath . '\' WELL, NOT REALLY'); + // unlink($filepath); + } } -class Bridge{ - - static protected $dirBridge; - - public function __construct(){ - throw new \LogicException('Please use ' . __CLASS__ . '::create for new object.'); - } - - /** - * Checks if a bridge is an instantiable bridge. - * @param string $nameBridge name of the bridge that you want to use - * @return true if it is an instantiable bridge, false otherwise. - */ - static public function isInstantiable($nameBridge) { - - $re = new ReflectionClass($nameBridge); - return $re->IsInstantiable(); - - } - - - /** - * Create a new bridge object - * @param string $nameBridge Defined bridge name you want use - * @return Bridge object dedicated - */ - static public function create($nameBridge){ - if( !preg_match('@^[A-Z][a-zA-Z0-9-]*$@', $nameBridge)){ - throw new \InvalidArgumentException('Name bridge must be at least one uppercase follow or not by alphanumeric or dash characters.'); - } - - $nameBridge=$nameBridge.'Bridge'; - $pathBridge = self::getDir() . $nameBridge . '.php'; - - if( !file_exists($pathBridge) ){ - throw new \Exception('The bridge you looking for does not exist. It should be at path '.$pathBridge); - } - - require_once $pathBridge; - - if(Bridge::isInstantiable($nameBridge)) { - return new $nameBridge(); - } else { - return FALSE; - } - } - - static public function setDir($dirBridge){ - if( !is_string($dirBridge) ){ - throw new \InvalidArgumentException('Dir bridge must be a string.'); - } - - if( !file_exists($dirBridge) ){ - throw new \Exception('Dir bridge does not exist.'); - } - - self::$dirBridge = $dirBridge; - } - - static public function getDir(){ - $dirBridge = self::$dirBridge; - - if( is_null($dirBridge) ){ - throw new \LogicException(__CLASS__ . ' class need to know bridge path !'); - } - - return $dirBridge; - } - - /** - * Lists the available bridges. - * @return array List of the bridges - */ - static public function listBridges() { - - $pathDirBridge = self::getDir(); - $listBridge = array(); - $dirFiles = scandir($pathDirBridge); - - if( $dirFiles !== false ){ - foreach( $dirFiles as $fileName ) { - if( preg_match('@^([^.]+)Bridge\.php$@U', $fileName, $out) ){ - $listBridge[] = $out[1]; - } - } - } - - return $listBridge; - } - static function isWhitelisted( $whitelist, $name ) { - if(in_array($name, $whitelist) or in_array($name.'.php', $whitelist) or - // DEPRECATED: the nameBridge notation will be removed in future releases - in_array($name.'Bridge', $whitelist) or in_array($name.'Bridge.php', $whitelist) or - count($whitelist) === 1 and trim($whitelist[0]) === '*') - return TRUE; - else - return FALSE; - } - -} - -abstract class RssExpander extends HttpCachingBridgeAbstract{ - - public $name; - public $uri; - public $description; +abstract class RssExpander extends HttpCachingBridgeAbstract { public function collectExpandableDatas(array $param, $name){ - if (empty($name)) { + if(empty($name)){ $this->returnServerError('There is no $name for this RSS expander'); } -// $this->message("Loading from ".$param['url']); - // Notice WE DO NOT use cache here on purpose : we want a fresh view of the RSS stream each time - $content=$this->getContents($name) or - $this->returnServerError('Could not request '.$name); + + $this->debugMessage('Loading from ' . $param['url']); + + /* Notice we do not use cache here on purpose: + * we want a fresh view of the RSS stream each time + */ + $content = $this->getContents($name) or $this->returnServerError('Could not request ' . $name); $rssContent = simplexml_load_string($content); - // $this->message("loaded RSS from ".$param['url']); + $this->debugMessage('loaded RSS from ' . $param['url']); // TODO insert RSS format detection - // we suppose for now, we have some RSS 2.0 + // For now we always assume RSS 2.0 $this->collect_RSS_2_0_data($rssContent); } - protected function collect_RSS_2_0_data($rssContent) { + protected function collect_RSS_2_0_data($rssContent){ $rssContent = $rssContent->channel[0]; -// $this->message("RSS content is ===========\n".var_export($rssContent, true)."==========="); + $this->debugMessage('RSS content is ===========\n' . var_export($rssContent, true) . '==========='); $this->load_RSS_2_0_feed_data($rssContent); - foreach($rssContent->item as $item) { -// $this->message("parsing item ".var_export($item, true)); + foreach($rssContent->item as $item){ + $this->debugMessage('parsing item ' . var_export($item, true)); $this->items[] = $this->parseRSSItem($item); } } - protected function RSS_2_0_time_to_timestamp($item) { + protected function RSS_2_0_time_to_timestamp($item){ return DateTime::createFromFormat('D, d M Y H:i:s e', $item->pubDate)->getTimestamp(); } // TODO set title, link, description, language, and so on - protected function load_RSS_2_0_feed_data($rssContent) { + protected function load_RSS_2_0_feed_data($rssContent){ $this->name = trim($rssContent->title); $this->uri = trim($rssContent->link); $this->description = trim($rssContent->description); @@ -386,9 +395,7 @@ abstract class RssExpander extends HttpCachingBridgeAbstract{ */ abstract protected function parseRSSItem($item); - public function getDescription() { + public function getDescription(){ return $this->description; } } - -