Prepared Gawker bridge by extracting file cache from initial Liberation bridge
This commit is contained in:
parent
62a5265433
commit
f2f82c7d03
2 changed files with 55 additions and 36 deletions
|
@ -6,7 +6,7 @@
|
||||||
* @update 20/02/2014
|
* @update 20/02/2014
|
||||||
*/
|
*/
|
||||||
define("SEXE", "http://sexes.blogs.liberation.fr");
|
define("SEXE", "http://sexes.blogs.liberation.fr");
|
||||||
class Les400Culs extends BridgeAbstract{
|
class Les400Culs extends HttpCachingBridgeAbstract{
|
||||||
|
|
||||||
public function collectData(array $param){
|
public function collectData(array $param){
|
||||||
$html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404);
|
$html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404);
|
||||||
|
@ -60,39 +60,4 @@ class Les400Culs extends BridgeAbstract{
|
||||||
public function getDescription(){
|
public function getDescription(){
|
||||||
return "La planète sexe, vue et racontée par Agnès Giard. Et par rss-bridge";
|
return "La planète sexe, vue et racontée par Agnès Giard. Et par rss-bridge";
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Maintain locally cached versions of pages to download to avoid multiple downloads.
|
|
||||||
* A file name is generated by replacing all "/" by "_", and the file is saved below this bridge cache
|
|
||||||
* @param url url to cache
|
|
||||||
* @return content of file as string
|
|
||||||
*/
|
|
||||||
public function get_cached($url) {
|
|
||||||
$simplified_url = str_replace(["http://", "https://", "?", "&"], ["", "", "/", "/"], $url);
|
|
||||||
$filename = __DIR__ . '/../cache/'."pages/".$simplified_url;
|
|
||||||
if (substr($filename, -1) == '/') {
|
|
||||||
$filename = $filename."index.html";
|
|
||||||
}
|
|
||||||
if(!file_exists($filename)) {
|
|
||||||
error_log("we have no local copy of ".$url." Downloading !");
|
|
||||||
$dir = substr($filename, 0, strrpos($filename, '/'));
|
|
||||||
if(!is_dir($dir)) {
|
|
||||||
mkdir($dir, 0777, true);
|
|
||||||
}
|
|
||||||
$this->download_remote($url, $filename);
|
|
||||||
}
|
|
||||||
return file_get_contents($filename);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function download_remote($url , $save_path) {
|
|
||||||
$f = fopen( $save_path , 'w+');
|
|
||||||
$handle = fopen($url , "rb");
|
|
||||||
while (!feof($handle)) {
|
|
||||||
$contents = fread($handle, 8192);
|
|
||||||
fwrite($f , $contents);
|
|
||||||
}
|
|
||||||
fclose($handle);
|
|
||||||
fclose($f);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -73,6 +73,60 @@ abstract class BridgeAbstract implements BridgeInterface{
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extension of BridgeAbstract allowing caching of files downloaded over HTTP.
|
||||||
|
* This is especially useful for sites from the Gawker or Liberation networks, which allow page excerpts to be viewed together on the index, while full pages have to be downloaded
|
||||||
|
* separately.
|
||||||
|
* This class mainly provides a get_cached method which will download the file from its remote location.
|
||||||
|
* TODO allow file cache invalidation by touching files on access, and removing files/directories which have not been touched since ... a long time
|
||||||
|
* After all, rss-bridge is not respaw, is it?
|
||||||
|
*/
|
||||||
|
abstract class HttpCachingBridgeAbstract extends BridgeAbstract {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Maintain locally cached versions of pages to download to avoid multiple downloads.
|
||||||
|
* A file name is generated by replacing all "/" by "_", and the file is saved below this bridge cache
|
||||||
|
* @param url url to cache
|
||||||
|
* @return content of file as string
|
||||||
|
*/
|
||||||
|
public function get_cached($url) {
|
||||||
|
$simplified_url = str_replace(["http://", "https://", "?", "&"], ["", "", "/", "/"], $url);
|
||||||
|
// TODO build this from the variable given to Cache
|
||||||
|
$pageCacheDir = __DIR__ . '/../cache/'."pages/";
|
||||||
|
$filename = $pageCacheDir.$simplified_url;
|
||||||
|
if (substr($filename, -1) == '/') {
|
||||||
|
$filename = $filename."index.html";
|
||||||
|
}
|
||||||
|
if(file_exists($filename)) {
|
||||||
|
// TODO touch file and its parent, and try to do neighbour deletion
|
||||||
|
$currentPath = $filename;
|
||||||
|
while(!$pageCacheDir==$currentPath) {
|
||||||
|
touch($currentPath);
|
||||||
|
$currentPath = dirname($currentPath);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
error_log("we have no local copy of ".$url." Downloading !");
|
||||||
|
$dir = substr($filename, 0, strrpos($filename, '/'));
|
||||||
|
if(!is_dir($dir)) {
|
||||||
|
mkdir($dir, 0777, true);
|
||||||
|
}
|
||||||
|
$this->download_remote($url, $filename);
|
||||||
|
}
|
||||||
|
return file_get_contents($filename);
|
||||||
|
}
|
||||||
|
|
||||||
|
public function download_remote($url , $save_path) {
|
||||||
|
$f = fopen( $save_path , 'w+');
|
||||||
|
$handle = fopen($url , "rb");
|
||||||
|
while (!feof($handle)) {
|
||||||
|
$contents = fread($handle, 8192);
|
||||||
|
fwrite($f , $contents);
|
||||||
|
}
|
||||||
|
fclose($handle);
|
||||||
|
fclose($f);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
class Bridge{
|
class Bridge{
|
||||||
|
|
||||||
static protected $dirBridge;
|
static protected $dirBridge;
|
||||||
|
|
Loading…
Reference in a new issue