From 8f76eebddb3d18cd70033a72c42b5cf146d1f353 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Tue, 2 Aug 2016 20:29:40 +0200 Subject: [PATCH 1/4] Fix parameters list Fixes warnings: "array_key_exists() expects parameter 2 to be array, string given in /volume1/web/rss-bridge_dev/lib/HTMLUtils.php on line 59" and "Warning: Invalid argument supplied for foreach() in /volume1/web/rss-bridge_dev/lib/HTMLUtils.php on line 64" --- bridges/ElsevierBridge.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bridges/ElsevierBridge.php b/bridges/ElsevierBridge.php index bb73477..9265a0c 100644 --- a/bridges/ElsevierBridge.php +++ b/bridges/ElsevierBridge.php @@ -12,9 +12,9 @@ class ElsevierBridge extends BridgeAbstract{ $this->name = 'Elsevier journals recent articles'; $this->uri = 'http://www.journals.elsevier.com'; $this->description = 'Returns the recent articles published in Elsevier journals'; - $this->update = '2016-06-26'; + $this->update = '2016-08-02'; - $this->parameters= + $this->parameters[] = '[ { "name" : "Journal name", From 399fce06ce5f5eb95ec2f91914d970ba9db32836 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Tue, 2 Aug 2016 20:35:27 +0200 Subject: [PATCH 2/4] Require input field, add example value and title Previously the bridge could be requested without any journal, causing error 403 due to a broken URL. 
--- bridges/ElsevierBridge.php | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bridges/ElsevierBridge.php b/bridges/ElsevierBridge.php index 9265a0c..408fc45 100644 --- a/bridges/ElsevierBridge.php +++ b/bridges/ElsevierBridge.php @@ -18,7 +18,10 @@ class ElsevierBridge extends BridgeAbstract{ '[ { "name" : "Journal name", - "identifier" : "j" + "identifier" : "j", + "required" : "true", + "exampleValue" : "academic-pediatrics", + "title" : "Insert html-part of your journal" } ]'; } From f7839697213d9a2c93b7d52519444ffd963f2c65 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Tue, 2 Aug 2016 21:35:13 +0200 Subject: [PATCH 3/4] Create member functions to extract information from articles The extractor function will handle many situations more specifically in order to provide better results. --- bridges/ElsevierBridge.php | 47 +++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/bridges/ElsevierBridge.php b/bridges/ElsevierBridge.php index 408fc45..7911feb 100644 --- a/bridges/ElsevierBridge.php +++ b/bridges/ElsevierBridge.php @@ -26,6 +26,47 @@ class ElsevierBridge extends BridgeAbstract{ ]'; } + // Extracts the list of names from an article as string + function ExtractArticleName ($article){ + $names = $article->find('small', 0); + if($names) + return trim($names->plaintext); + return ''; + } + + // Extracts the timestamp from an article + function ExtractArticleTimestamp ($article){ + $time = $article->find('.article-info', 0); + if($time){ + $timestring = trim($time->plaintext); + /* + The format depends on the age of an article: + - Available online 29 July 2016 + - July 2016 + - May–June 2016 + */ + if(preg_match('/\S*(\d+\s\S+\s\d{4})/ims', $timestring, $matches)){ + return strtotime($matches[0]); + } elseif (preg_match('/([A-Za-z]+\s\d{4})/ims', $timestring, $matches)){ + return strtotime($matches[0]); + } elseif (preg_match('/[A-Za-z]+\-([A-Za-z]+\s\d{4})/ims', $timestring, $matches)){ 
+ return strtotime($matches[0]); + } else { + return 0; + } + } + return 0; + } + + // Extracts the content from an article + function ExtractArticleContent ($article){ + $content = $article->find('.article-content', 0); + if($content){ + return trim($content->plaintext); + } + return ''; + } + public function collectData(array $param){ $uri = 'http://www.journals.elsevier.com/'.$param['j'].'/recent-articles/'; $html = file_get_html($uri) @@ -36,9 +77,9 @@ class ElsevierBridge extends BridgeAbstract{ $item = new \Item(); $item->uri=$article->find('.pod-listing-header>a',0)->getAttribute('href').'?np=y'; $item->title=$article->find('.pod-listing-header>a',0)->plaintext; - $item->name=trim($article->find('small',0)->plaintext); - $item->timestamp=strtotime($article->find('.article-info',0)->plaintext); - $item->content=trim($article->find('.article-content',0)->plaintext); + $item->name=$this->ExtractArticleName($article); + $item->timestamp=$this->ExtractArticleTimestamp($article); + $item->content=$this->ExtractArticleContent($article); $this->items[]=$item; } From a1c680f8e8dac7cb8086cf469d4ff05b469d4b90 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Tue, 2 Aug 2016 21:40:22 +0200 Subject: [PATCH 4/4] Fix indentation and improve code style - Use tab instead of spaces - Remove obsolete bridge description at start of the file - Add spaces at the assignment operator ('=' -> ' = ') - Remove unnecessary empty lines --- bridges/ElsevierBridge.php | 164 ++++++++++++++++++------------------- 1 file changed, 78 insertions(+), 86 deletions(-) diff --git a/bridges/ElsevierBridge.php b/bridges/ElsevierBridge.php index 7911feb..4f4cd57 100644 --- a/bridges/ElsevierBridge.php +++ b/bridges/ElsevierBridge.php @@ -1,99 +1,91 @@ maintainer = 'Pierre Mazière'; - $this->name = 'Elsevier journals recent articles'; - $this->uri = 'http://www.journals.elsevier.com'; - $this->description = 'Returns the recent articles published in Elsevier journals'; - $this->update = '2016-08-02'; 
+ $this->maintainer = 'Pierre Mazière'; + $this->name = 'Elsevier journals recent articles'; + $this->uri = 'http://www.journals.elsevier.com'; + $this->description = 'Returns the recent articles published in Elsevier journals'; + $this->update = '2016-08-02'; - $this->parameters[] = - '[ - { - "name" : "Journal name", - "identifier" : "j", - "required" : "true", - "exampleValue" : "academic-pediatrics", - "title" : "Insert html-part of your journal" - } - ]'; - } + $this->parameters[] = + '[ + { + "name" : "Journal name", + "identifier" : "j", + "required" : "true", + "exampleValue" : "academic-pediatrics", + "title" : "Insert html-part of your journal" + } + ]'; + } - // Extracts the list of names from an article as string - function ExtractArticleName ($article){ - $names = $article->find('small', 0); - if($names) - return trim($names->plaintext); - return ''; - } + // Extracts the list of names from an article as string + function ExtractArticleName ($article){ + $names = $article->find('small', 0); + if($names) + return trim($names->plaintext); + return ''; + } - // Extracts the timestamp from an article - function ExtractArticleTimestamp ($article){ - $time = $article->find('.article-info', 0); - if($time){ - $timestring = trim($time->plaintext); - /* - The format depends on the age of an article: - - Available online 29 July 2016 - - July 2016 - - May–June 2016 - */ - if(preg_match('/\S*(\d+\s\S+\s\d{4})/ims', $timestring, $matches)){ - return strtotime($matches[0]); - } elseif (preg_match('/([A-Za-z]+\s\d{4})/ims', $timestring, $matches)){ - return strtotime($matches[0]); - } elseif (preg_match('/[A-Za-z]+\-([A-Za-z]+\s\d{4})/ims', $timestring, $matches)){ - return strtotime($matches[0]); - } else { - return 0; - } - } - return 0; - } + // Extracts the timestamp from an article + function ExtractArticleTimestamp ($article){ + $time = $article->find('.article-info', 0); + if($time){ + $timestring = trim($time->plaintext); + /* + The format depends on the age 
of an article: + - Available online 29 July 2016 + - July 2016 + - May–June 2016 + */ + if(preg_match('/\S*(\d+\s\S+\s\d{4})/ims', $timestring, $matches)){ + return strtotime($matches[0]); + } elseif (preg_match('/([A-Za-z]+\s\d{4})/ims', $timestring, $matches)){ + return strtotime($matches[0]); + } elseif (preg_match('/[A-Za-z]+\-([A-Za-z]+\s\d{4})/ims', $timestring, $matches)){ + return strtotime($matches[0]); + } else { + return 0; + } + } + return 0; + } - // Extracts the content from an article - function ExtractArticleContent ($article){ - $content = $article->find('.article-content', 0); - if($content){ - return trim($content->plaintext); - } - return ''; - } + // Extracts the content from an article + function ExtractArticleContent ($article){ + $content = $article->find('.article-content', 0); + if($content){ + return trim($content->plaintext); + } + return ''; + } - public function collectData(array $param){ - $uri = 'http://www.journals.elsevier.com/'.$param['j'].'/recent-articles/'; - $html = file_get_html($uri) - or $this->returnError('No results for Elsevier journal '.$param['j'], 404); + public function collectData(array $param){ + $uri = 'http://www.journals.elsevier.com/' . $param['j'] . 
'/recent-articles/'; + $html = file_get_html($uri) or $this->returnError('No results for Elsevier journal '.$param['j'], 404); - foreach($html->find('.pod-listing') as $article){ + foreach($html->find('.pod-listing') as $article){ + $item = new \Item(); + $item->uri = $article->find('.pod-listing-header>a',0)->getAttribute('href').'?np=y'; + $item->title = $article->find('.pod-listing-header>a',0)->plaintext; + $item->name = $this->ExtractArticleName($article); + $item->timestamp = $this->ExtractArticleTimestamp($article); + $item->content = $this->ExtractArticleContent($article); + $this->items[] = $item; + } + } - $item = new \Item(); - $item->uri=$article->find('.pod-listing-header>a',0)->getAttribute('href').'?np=y'; - $item->title=$article->find('.pod-listing-header>a',0)->plaintext; - $item->name=$this->ExtractArticleName($article); - $item->timestamp=$this->ExtractArticleTimestamp($article); - $item->content=$this->ExtractArticleContent($article); + public function getName(){ + return 'Elsevier journals recent articles'; + } - $this->items[]=$item; - } - } + public function getURI(){ + return 'http://www.journals.elsevier.com'; + } - public function getName(){ - return 'Elsevier journals recent articles'; - } - - public function getURI(){ - return 'http://www.journals.elsevier.com'; - } - - public function getCacheDuration(){ - return 43200; // 12h - } + public function getCacheDuration(){ + return 43200; // 12h + } } +?> \ No newline at end of file