diff --git a/bridges/Gawker.php b/bridges/Gawker.php
new file mode 100644
index 0000000..4d13522
--- /dev/null
+++ b/bridges/Gawker.php
@@ -0,0 +1,121 @@
+uri = $param['site'];
+        }
+        $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404);
+        $this->message("loaded HTML from ".$this->getURI());
+        // customize name
+        $this->name = $html->find('title', 0)->innertext;
+        foreach($html->find('.main-column') as $content) {
+            $this->parseContent($content);
+        }
+    }
+
+    /**
+     * Turn every headline link of a content block into a feed item.
+     *
+     * @param object $content DOM node searched for ".headline" elements
+     */
+    public function parseContent($content) {
+        foreach($content->find('.headline') as $headline) {
+            foreach($headline->find('a') as $articleLink) {
+                // only keep articles hosted on this gawker site (gawker likes to link to its other sites).
+                // strpos() returns false when nothing matches and "false >= 0" is true in PHP,
+                // so the result has to be compared strictly against false.
+                if(strpos($articleLink->href, $this->getURI()) !== false) {
+                    $this->parseLink($articleLink);
+                }
+            }
+        }
+    }
+
+    /**
+     * Build a feed item from an article link and append it to $this->items.
+     *
+     * The article page is read through get_cached(). On failure the exception message
+     * becomes the item content and the cached page is handed to remove_from_cache().
+     *
+     * @param object $infoLink anchor node providing "href" and "innertext"
+     */
+    public function parseLink($infoLink) {
+        $item = new Item();
+        $item->uri = $infoLink->href;
+        $item->title = $infoLink->innertext;
+        try {
+            // now load that uri from cache
+//            $this->message("loading page ".$item->uri);
+            $articlePage = str_get_html($this->get_cached($item->uri));
+            if(is_object($articlePage)) {
+                $content = $articlePage->find('.post-content', 0);
+                // a missing node would otherwise be dereferenced right below
+                if(!is_object($content)) {
+                    throw new Exception("no .post-content element found in ".$item->uri);
+                }
+                $this->defaultImageSrcTo($content, $this->getURI());
+                $item->content = $content->innertext;
+                // http://stackoverflow.com/q/22715928/15619
+                $publishTimeNode = $articlePage->find('.publish-time', 0);
+                if(is_object($publishTimeNode)) {
+                    // data-publishtime is presumably a JavaScript (millisecond) timestamp, see
+                    // http://www.epochconverter.com/programming/functions-php.php#epoch2date
+                    $item->timestamp = $this->js_to_unix_timestamp($publishTimeNode->getAttribute("data-publishtime"));
+                }
+                $vcard = $articlePage->find('.vcard', 0);
+                if(is_object($vcard)) {
+                    $item->name = $vcard->find('a', 0)->innertext;
+                }
+            } else {
+                throw new Exception("cache content for ".$item->uri." is NOT a Simple DOM parser object !");
+            }
+        } catch(Exception $e) {
+            $this->message("obtaining ".$item->uri." resulted in exception ".$e->getMessage().". Deleting cached page ...");
+            // maybe file is incorrect. it should be discarded from cache
+            $this->remove_from_cache($item->uri);
+            $item->content = $e->getMessage();
+        }
+        $this->items[] = $item;
+    }
+
+    /**
+     * Convert a millisecond timestamp into whole seconds.
+     *
+     * @param int|string $jsTimestamp milliseconds (presumably since the Unix epoch)
+     * @return int the same instant in seconds, fraction discarded
+     */
+    function js_to_unix_timestamp($jsTimestamp){
+        return (int) ($jsTimestamp/1000);
+    }
+
+    /** Name of the feed, set from the page title when the page is loaded. */
+    public function getName(){
+        return $this->name;
+    }
+
+    /** URI of the site this bridge reads. */
+    public function getURI(){
+        return $this->uri;
+    }
+
+    /** Cache duration in seconds (one hour). */
+    public function getCacheDuration(){
+        return 3600; // 1h
+    }
+    /** Human-readable description of the bridge. */
+    public function getDescription(){
+        return "Gawker press blog content.";
+    }
+}
diff --git a/lib/Bridge.php b/lib/Bridge.php
index 4467da0..aeaba6f 100644
--- a/lib/Bridge.php
+++ b/lib/Bridge.php
@@ -109,33 +109,90 @@ abstract class HttpCachingBridgeAbstract extends BridgeAbstract {
             $filename = $filename."index.html";
         }
         if(file_exists($filename)) {
-//            $this->message("loading cached file from ".$filename." for page at url ".$url);
+//            $this->message("loading cached file from ".$filename." for page at url ".$url);
             // TODO touch file and its parent, and try to do neighbour deletion
-            $currentPath = $filename;
-            while(!$pageCacheDir==$currentPath) {
-                touch($currentPath);
-                $currentPath = dirname($currentPath);
-            }
+            $this->refresh_in_cache($pageCacheDir, $filename);
         } else {
-//            $this->message("we have no local copy of ".$url." Downloading !");
+//            $this->message("we have no local copy of ".$url." Downloading to ".$filename);
             $dir = substr($filename, 0, strrpos($filename, '/'));
             if(!is_dir($dir)) {
+//                $this->message("creating directories for ".$dir);
                 mkdir($dir, 0777, true);
             }
             $this->download_remote($url, $filename);
         }
         return file_get_contents($filename);
     }
+
+    /**
+     * Refresh the modification time of a cached file and of the directories above it.
+     *
+     * Every path from $filename up to, but not including, $pageCacheDir is touched.
+     * The former condition "!$pageCacheDir==$currentPath" negated $pageCacheDir before
+     * comparing, so the loop body never ran; the prefix test below also stops the walk
+     * when $pageCacheDir ends with a slash, which dirname() does not produce.
+     *
+     * @param string $pageCacheDir root directory of the page cache
+     * @param string $filename     cached file, expected below $pageCacheDir
+     */
+    private function refresh_in_cache($pageCacheDir, $filename) {
+        $cacheRootPrefix = rtrim($pageCacheDir, '/').'/';
+        $currentPath = $filename;
+        while(strpos($currentPath, $cacheRootPrefix) === 0) {
+            touch($currentPath);
+            $parentPath = dirname($currentPath);
+            if($parentPath === $currentPath) {
+                break; // filesystem root reached, dirname() cannot go any higher
+            }
+            $currentPath = $parentPath;
+        }
+    }
 
+    /**
+     * Copy the resource at $url into the local file $save_path, 8192 bytes at a time.
+     *
+     * Nothing is copied when either stream cannot be opened.
+     *
+     * @param string $url       location opened for reading with fopen()
+     * @param string $save_path local file, opened with mode 'w+'
+     */
     public function download_remote($url , $save_path) {
         $f = fopen( $save_path , 'w+');
-        $handle = fopen($url , "rb");
-        while (!feof($handle)) {
-            $contents = fread($handle, 8192);
-            fwrite($f , $contents);
+        if($f) {
+            $handle = fopen($url , "rb");
+            if($handle) {
+                while (!feof($handle)) {
+                    $contents = fread($handle, 8192);
+                    // fread() returns false on failure: stop instead of spinning on a broken stream
+                    if($contents === false) {
+                        break;
+                    }
+                    // written unconditionally: a truthiness test would drop a chunk equal to "0"
+                    fwrite($f , $contents);
+                }
+                fclose($handle);
+            }
+            fclose($f);
         }
-        fclose($handle);
-        fclose($f);
+    }
+
+    /**
+     * Report which cached copy of $url would be discarded.
+     *
+     * Deletion is still disabled: unlink() is commented out, so the computed path is
+     * only passed to message().
+     *
+     * @param string $url address of the page whose cached copy is targeted
+     */
+    public function remove_from_cache($url) {
+        $simplified_url = str_replace(["http://", "https://", "?", "&", "="], ["", "", "/", "/", "/"], $url);
+        // TODO build this from the variable given to Cache
+        $pageCacheDir = __DIR__ . '/../cache/'."pages/";
+        // realpath() returns false when the path does not exist
+        $filename = realpath($pageCacheDir.$simplified_url);
+        $this->message("removing from cache \"".$filename."\" WELL, NOT REALLY");
+        // filename is NO GOOD
+//        unlink($filename);
     }
 
     public function message($text) {