From 1d26c7f1c3a5adbf223431c4560beea429ffa820 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 18 Feb 2017 10:23:46 +0100 Subject: [PATCH 1/5] [FileCache] Do not delete .gitkeep This commit reduces the chance of accidentally removing the cache folder from the repository. --- caches/FileCache.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/caches/FileCache.php b/caches/FileCache.php index c8e4982..67078eb 100644 --- a/caches/FileCache.php +++ b/caches/FileCache.php @@ -39,7 +39,7 @@ class FileCache implements CacheInterface { ); foreach($cacheIterator as $cacheFile){ - if(in_array($cacheFile->getBasename(), array('.', '..'))) + if(in_array($cacheFile->getBasename(), array('.', '..', '.gitkeep'))) continue; elseif($cacheFile->isFile()){ if(filemtime($cacheFile->getPathname()) < time() - $duration) From 5de03d6b9fcc7a6e1167baeba92c4416516320bb Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 18 Feb 2017 12:54:26 +0100 Subject: [PATCH 2/5] [FileCache] Use serialize instead of json_encode json_encode causes a high memory footprint on large input data, where serialize is less problematic. Example: When using AcrimedBridge, items contain pictures in raw format (entire picture) which leads to a file size of about 2MB using serialize. 
json_encode will allocate about 98MB of memory for encoding, causing memory exhaustion errors (PHP allows for 128MB of memory by default) --- caches/FileCache.php | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/caches/FileCache.php b/caches/FileCache.php index 67078eb..59bced9 100644 --- a/caches/FileCache.php +++ b/caches/FileCache.php @@ -8,11 +8,13 @@ class FileCache implements CacheInterface { protected $param; public function loadData(){ - return json_decode(file_get_contents($this->getCacheFile()), true); + return unserialize(file_get_contents($this->getCacheFile())); } public function saveData($datas){ - $writeStream = file_put_contents($this->getCacheFile(), json_encode($datas, JSON_PRETTY_PRINT)); + // Notice: We use plain serialize() here to reduce memory footprint on + // large input data. + $writeStream = file_put_contents($this->getCacheFile(), serialize($datas)); if($writeStream === false) { throw new \Exception("Cannot write the cache... Do you have the right permissions ?"); @@ -110,6 +112,8 @@ class FileCache implements CacheInterface { throw new \Exception('Call "setParameters" first!'); } - return hash('md5', http_build_query($this->param)) . '.cache'; + // Change character when making incompatible changes to prevent loading + // errors due to incompatible file contents \|/ + return hash('md5', http_build_query($this->param) . 'A') . '.cache'; } } From bb8e7495d83e629d855c16cd148c684a05766b03 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 18 Feb 2017 13:13:40 +0100 Subject: [PATCH 3/5] [html] Fix img src replacement not working strpos returns false if the needle was not found. 
See: http://php.net/manual/en/function.strpos.php#refsect1-function.strpos-returnvalues --- lib/html.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/html.php b/lib/html.php index 1b9b5ab..24a2cab 100644 --- a/lib/html.php +++ b/lib/html.php @@ -280,9 +280,9 @@ $keptText = array()){ function defaultImageSrcTo($content, $server){ foreach($content->find('img') as $image){ - if(is_null(strpos($image->src, "http")) - && is_null(strpos($image->src, "//")) - && is_null(strpos($image->src, "data:"))) + if(strpos($image->src, 'http') === false + && strpos($image->src, '//') === false + && strpos($image->src, 'data:') === false) $image->src = $server . $image->src; } return $content; From cf7da1d41ca50ee9c772a469a83b1164da1c88cc Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 18 Feb 2017 13:40:58 +0100 Subject: [PATCH 4/5] [html] Fix anchors after fixing images Anchors will be fixed in a similar way as it is done with images, so it can be done in one go. --- lib/html.php | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lib/html.php b/lib/html.php index 24a2cab..8eb0deb 100644 --- a/lib/html.php +++ b/lib/html.php @@ -285,5 +285,14 @@ function defaultImageSrcTo($content, $server){ && strpos($image->src, 'data:') === false) $image->src = $server . $image->src; } + + foreach($content->find('a') as $anchor){ + if(strpos($anchor->href, 'http') === false + && strpos($anchor->href, '//') === false + && strpos($anchor->href, '#') !== 0 + && strpos($anchor->href, '?') !== 0) + $anchor->href = $server . 
$anchor->href; + } + return $content; } From 16bdf6b204e74fb1a8b2594e767206f8294d9df6 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sat, 18 Feb 2017 13:41:45 +0100 Subject: [PATCH 5/5] links: Rename defaultImageSrcTo to defaultLinkTo This function not only fixes image sources, but also anchors --- bridges/AcrimedBridge.php | 2 +- bridges/WorldOfTanksBridge.php | 2 +- lib/html.php | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bridges/AcrimedBridge.php b/bridges/AcrimedBridge.php index 3ca8e80..8b40d1d 100644 --- a/bridges/AcrimedBridge.php +++ b/bridges/AcrimedBridge.php @@ -16,7 +16,7 @@ class AcrimedBridge extends FeedExpander { $articlePage = getSimpleHTMLDOM($newsItem->link); $article = sanitize($articlePage->find('article.article1', 0)->innertext); - $article = defaultImageSrcTo($article, static::URI); + $article = defaultLinkTo($article, static::URI); $item['content'] = $article; return $item; diff --git a/bridges/WorldOfTanksBridge.php b/bridges/WorldOfTanksBridge.php index 1cc41b7..a894f6e 100644 --- a/bridges/WorldOfTanksBridge.php +++ b/bridges/WorldOfTanksBridge.php @@ -63,7 +63,7 @@ class WorldOfTanksBridge extends BridgeAbstract { debugMessage('loading page ' . 
$item['uri']); $articlePage = getSimpleHTMLDOMCached($item['uri']); $content = $articlePage->find('.l-content', 0); - defaultImageSrcTo($content, self::URI); + defaultLinkTo($content, self::URI); $item['title'] = $content->find('h1', 0)->innertext; $item['content'] = $content->find('.b-content', 0)->innertext; $item['timestamp'] = $content->find('.b-statistic_time', 0)->getAttribute("data-timestamp"); diff --git a/lib/html.php b/lib/html.php index 8eb0deb..d5f6667 100644 --- a/lib/html.php +++ b/lib/html.php @@ -278,7 +278,7 @@ $keptText = array()){ return $htmlContent; } -function defaultImageSrcTo($content, $server){ +function defaultLinkTo($content, $server){ foreach($content->find('img') as $image){ if(strpos($image->src, 'http') === false && strpos($image->src, '//') === false