ElsevierBridge.php 2.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. <?php
  2. class ElsevierBridge extends BridgeAbstract{
  3. const MAINTAINER = 'Pierre Mazière';
  4. const NAME = 'Elsevier journals recent articles';
  5. const URI = 'http://www.journals.elsevier.com/';
  6. const DESCRIPTION = 'Returns the recent articles published in Elsevier journals';
  7. const PARAMETERS = array( array(
  8. 'j'=>array(
  9. 'name'=>'Journal name',
  10. 'required'=>true,
  11. 'exampleValue'=>'academic-pediactrics',
  12. 'title'=>'Insert html-part of your journal'
  13. )
  14. ));
  15. // Extracts the list of names from an article as string
  16. private function ExtractArticleName ($article){
  17. $names = $article->find('small', 0);
  18. if($names)
  19. return trim($names->plaintext);
  20. return '';
  21. }
  22. // Extracts the timestamp from an article
  23. private function ExtractArticleTimestamp ($article){
  24. $time = $article->find('.article-info', 0);
  25. if($time){
  26. $timestring = trim($time->plaintext);
  27. /*
  28. The format depends on the age of an article:
  29. - Available online 29 July 2016
  30. - July 2016
  31. - May–June 2016
  32. */
  33. if(preg_match('/\S*(\d+\s\S+\s\d{4})/ims', $timestring, $matches)){
  34. return strtotime($matches[0]);
  35. } elseif (preg_match('/[A-Za-z]+\-([A-Za-z]+\s\d{4})/ims', $timestring, $matches)){
  36. return strtotime($matches[0]);
  37. } elseif (preg_match('/([A-Za-z]+\s\d{4})/ims', $timestring, $matches)){
  38. return strtotime($matches[0]);
  39. } else {
  40. return 0;
  41. }
  42. }
  43. return 0;
  44. }
  45. // Extracts the content from an article
  46. private function ExtractArticleContent ($article){
  47. $content = $article->find('.article-content', 0);
  48. if($content){
  49. return trim($content->plaintext);
  50. }
  51. return '';
  52. }
  53. public function collectData(){
  54. $uri = self::URI . $this->getInput('j') . '/recent-articles/';
  55. $html = getSimpleHTMLDOM($uri) or returnServerError('No results for Elsevier journal '.$this->getInput('j'));
  56. foreach($html->find('.pod-listing') as $article){
  57. $item = array();
  58. $item['uri'] = $article->find('.pod-listing-header>a',0)->getAttribute('href').'?np=y';
  59. $item['title'] = $article->find('.pod-listing-header>a',0)->plaintext;
  60. $item['author'] = $this->ExtractArticleName($article);
  61. $item['timestamp'] = $this->ExtractArticleTimestamp($article);
  62. $item['content'] = $this->ExtractArticleContent($article);
  63. $this->items[] = $item;
  64. }
  65. }
  66. public function getCacheDuration(){
  67. return 43200; // 12h
  68. }
  69. }
  70. ?>