ElsevierBridge.php 2.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
  1. <?php
  /**
   * Bridge listing the recent articles of an Elsevier journal.
   *
   * The journal is selected through the 'j' input, which is inserted into the
   * URL '<URI><j>/recent-articles/'. Every '.pod-listing' element found on
   * that page becomes one feed item.
   */
  class ElsevierBridge extends BridgeAbstract{

      const MAINTAINER = 'Pierre Mazière';
      const NAME = 'Elsevier journals recent articles';
      const URI = 'http://www.journals.elsevier.com/';
      const CACHE_TIMEOUT = 43200; //12h
      const DESCRIPTION = 'Returns the recent articles published in Elsevier journals';

      const PARAMETERS = array( array(
          'j'=>array(
              'name'=>'Journal name',
              'required'=>true,
              'exampleValue'=>'academic-pediatrics',
              'title'=>'Insert html-part of your journal'
          )
      ));

      /**
       * Extracts the author names of an article as a single string.
       *
       * @param object $article DOM node of one article listing; only its
       *                        find() method is used here
       * @return string Trimmed text of the first <small> element, or an
       *                empty string when the listing has none
       */
      private function extractArticleName($article){
          $names = $article->find('small', 0);
          if($names){
              return trim($names->plaintext);
          }
          return '';
      }

      /**
       * Extracts the publication timestamp of an article.
       *
       * The text of the '.article-info' element depends on the age of the
       * article:
       *  - Available online 29 July 2016
       *  - July 2016
       *  - May–June 2016
       *
       * @param object $article DOM node of one article listing
       * @return int Unix timestamp, or 0 when no date could be found or parsed
       */
      private function extractArticleTimestamp($article){
          $info = $article->find('.article-info', 0);
          if(!$info){
              return 0;
          }

          $timestring = trim($info->plaintext);

          // Ordered from most to least specific. In every pattern, capture
          // group 1 holds only the part that strtotime() can parse, so the
          // leading month of a range such as 'May-June 2016' is dropped.
          $patterns = array(
              '/(\d+\s\S+\s\d{4})/ims',             // 29 July 2016
              '/[A-Za-z]+\-([A-Za-z]+\s\d{4})/ims', // May-June 2016 -> June 2016
              '/([A-Za-z]+\s\d{4})/ims'             // July 2016
          );

          foreach($patterns as $pattern){
              if(preg_match($pattern, $timestring, $matches)){
                  $timestamp = strtotime($matches[1]);

                  // strtotime() yields false on unparsable input; report that
                  // the same way as a missing date
                  return $timestamp === false ? 0 : $timestamp;
              }
          }

          return 0;
      }

      /**
       * Extracts the textual content of an article.
       *
       * @param object $article DOM node of one article listing
       * @return string Trimmed text of the first '.article-content' element,
       *                or an empty string when the listing has none
       */
      private function extractArticleContent($article){
          $content = $article->find('.article-content', 0);
          if($content){
              return trim($content->plaintext);
          }
          return '';
      }

      /**
       * Loads the recent-articles page of the requested journal and appends
       * one item per article listing to $this->items.
       *
       * Each item carries the keys 'uri', 'title', 'author', 'timestamp' and
       * 'content'. Listings without a header link are skipped, because both
       * the URI and the title are read from that link.
       */
      public function collectData(){
          $journal = $this->getInput('j');
          $uri = self::URI . $journal . '/recent-articles/';

          // NOTE(review): returnServerError() is defined outside this file and
          // presumably aborts the request; confirm against the framework.
          $html = getSimpleHTMLDOM($uri)
              or returnServerError('No results for Elsevier journal ' . $journal);

          foreach($html->find('.pod-listing') as $article){
              $link = $article->find('.pod-listing-header>a', 0);
              if(!$link){
                  // Without the header link there is neither a URI nor a title
                  continue;
              }

              $item = array();
              $item['uri'] = $link->getAttribute('href') . '?np=y';
              $item['title'] = trim($link->plaintext);
              $item['author'] = $this->extractArticleName($article);
              $item['timestamp'] = $this->extractArticleTimestamp($article);
              $item['content'] = $this->extractArticleContent($article);
              $this->items[] = $item;
          }
      }
  }
  68. ?>