From f2f82c7d03cfa8c05717439420a706e011b48265 Mon Sep 17 00:00:00 2001
From: Riduidel
Date: Mon, 3 Mar 2014 14:12:24 +0100
Subject: [PATCH] Prepared Gawker bridge by extracting file cache from initial Liberation bridge

---
 bridges/Les400Culs.php | 37 +----------------------------
 lib/Bridge.php         | 85 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+), 36 deletions(-)

diff --git a/bridges/Les400Culs.php b/bridges/Les400Culs.php
index aea233b..1dd9e3e 100644
--- a/bridges/Les400Culs.php
+++ b/bridges/Les400Culs.php
@@ -6,7 +6,7 @@
  * @update 20/02/2014
  */
 define("SEXE", "http://sexes.blogs.liberation.fr");
-class Les400Culs extends BridgeAbstract{
+class Les400Culs extends HttpCachingBridgeAbstract{
 
     public function collectData(array $param){
         $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404);
@@ -60,39 +60,4 @@ class Les400Culs extends BridgeAbstract{
     public function getDescription(){
         return "La planète sexe, vue et racontée par Agnès Giard. Et par rss-bridge";
     }
-
-    /**
-     * Maintain locally cached versions of pages to download to avoid multiple doiwnloads.
-     * A file name is generated by replacing all "/" by "_", and the file is saved below this bridge cache
-     * @param url url to cache
-     * @return content of file as string
-     */
-    public function get_cached($url) {
-        $simplified_url = str_replace(["http://", "https://", "?", "&"], ["", "", "/", "/"], $url);
-        $filename = __DIR__ . '/../cache/'."pages/".$simplified_url;
-        if (substr($filename, -1) == '/') {
-            $filename = $filename."index.html";
-        }
-        if(!file_exists($filename)) {
-            error_log("we have no local copy of ".$url." Downloading !");
-            $dir = substr($filename, 0, strrpos($filename, '/'));
-            if(!is_dir($dir)) {
-                mkdir($dir, 0777, true);
-            }
-            $this->download_remote($url, $filename);
-        }
-        return file_get_contents($filename);
-    }
-
-    public function download_remote($url , $save_path) {
-        $f = fopen( $save_path , 'w+');
-        $handle = fopen($url , "rb");
-        while (!feof($handle)) {
-            $contents = fread($handle, 8192);
-            fwrite($f , $contents);
-        }
-        fclose($handle);
-        fclose($f);
-    }
-
 }
diff --git a/lib/Bridge.php b/lib/Bridge.php
index dbff16b..3ef4a21 100644
--- a/lib/Bridge.php
+++ b/lib/Bridge.php
@@ -73,6 +73,91 @@ abstract class BridgeAbstract implements BridgeInterface{
     }
 }
 
+/**
+ * Extension of BridgeAbstract that caches pages downloaded over HTTP on the local disk.
+ * This is especially useful for sites from the Gawker or Liberation networks, which allow page
+ * excerpts to be viewed together on the index, while full pages have to be downloaded separately.
+ * This class mainly provides a get_cached method, which downloads a page from its remote location
+ * when no local copy exists and returns the local copy otherwise.
+ * TODO allow file cache invalidation by removing files/directories which have not been touched
+ * since ... a long time (cached files and their parent folders are touched on each cache hit).
+ * After all, rss-bridge is not respaw, is it ?
+ */
+abstract class HttpCachingBridgeAbstract extends BridgeAbstract {
+
+    /**
+     * Return the content of a page, keeping a local copy to avoid multiple downloads.
+     * The cache file name is derived from the url: the scheme is dropped, "?" and "&" become
+     * directory separators, and a url ending with "/" is stored as "index.html". The file is
+     * saved below the "pages" folder of the cache directory.
+     * @param string $url url to cache
+     * @return string content of the cached file
+     * @throws Exception when the url cannot be mapped into the cache or cannot be stored
+     */
+    public function get_cached($url) {
+        $simplified_url = str_replace(["http://", "https://", "?", "&"], ["", "", "/", "/"], $url);
+        // Urls may come from remote pages: refuse ".." segments that would escape the cache folder
+        if(in_array('..', explode('/', $simplified_url), true)) {
+            throw new \Exception('Refusing to cache suspicious url '.$url);
+        }
+        // TODO build this from the variable given to Cache
+        $pageCacheDir = __DIR__ . '/../cache/'."pages/";
+        $filename = $pageCacheDir.$simplified_url;
+        if (substr($filename, -1) == '/') {
+            $filename = $filename."index.html";
+        }
+        if(file_exists($filename)) {
+            // Touch the file and its parent folders (up to the cache root, excluded) so that
+            // a future cleanup can spot entries which are no longer used
+            // TODO try to do neighbour deletion
+            // dirname() results carry no trailing "/", unlike $pageCacheDir: compare trimmed
+            $cacheRoot = rtrim($pageCacheDir, '/');
+            $currentPath = $filename;
+            // the length check guarantees termination even if the root is never matched exactly
+            while($currentPath !== $cacheRoot && strlen($currentPath) > strlen($cacheRoot)) {
+                touch($currentPath);
+                $currentPath = dirname($currentPath);
+            }
+        } else {
+            error_log("we have no local copy of ".$url." Downloading !");
+            $dir = dirname($filename);
+            // the second is_dir() covers a concurrent request creating the folder first
+            if(!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
+                throw new \Exception('Could not create cache directory '.$dir);
+            }
+            $this->download_remote($url, $filename);
+        }
+        return file_get_contents($filename);
+    }
+
+    /**
+     * Download a remote file and save it locally.
+     * @param string $url url of the file to download
+     * @param string $save_path path of the local file to write
+     * @throws Exception when the remote file cannot be read or the local file cannot be written
+     */
+    public function download_remote($url , $save_path) {
+        // Open the remote side first, so that nothing is created locally when it is unreachable
+        $handle = fopen($url , "rb");
+        if($handle === false) {
+            throw new \Exception('Could not download '.$url);
+        }
+        $f = fopen( $save_path , 'w+');
+        if($f === false) {
+            fclose($handle);
+            throw new \Exception('Could not write '.$save_path);
+        }
+        $copied = stream_copy_to_stream($handle, $f);
+        fclose($handle);
+        fclose($f);
+        if($copied === false) {
+            // Do not leave a truncated file behind: get_cached would return it as a valid copy
+            unlink($save_path);
+            throw new \Exception('Could not download '.$url.' to '.$save_path);
+        }
+    }
+}
+
 class Bridge{
 
     static protected $dirBridge;