ElsevierBridge.php 2.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. <?php
  /**
   * Bridge that lists the recent articles of an Elsevier journal.
   *
   * The journal is selected through the 'j' parameter, which is the journal's
   * URL slug on www.journals.elsevier.com. Each entry of the journal's
   * "recent-articles" page becomes one feed item.
   */
  class ElsevierBridge extends BridgeAbstract{

      /**
       * Declares the bridge metadata and its single required parameter 'j'.
       */
      public function loadMetadatas() {
          $this->maintainer = 'Pierre Mazière';
          $this->name = 'Elsevier journals recent articles';
          $this->uri = 'http://www.journals.elsevier.com';
          $this->description = 'Returns the recent articles published in Elsevier journals';

          $this->parameters[] = array(
              'j'=>array(
                  'name'=>'Journal name',
                  'required'=>true,
                  'exampleValue'=>'academic-pediatrics',
                  'title'=>'Insert html-part of your journal'
              )
          );
      }

      /**
       * Extracts the list of author names from an article as a string.
       *
       * @param object $article DOM node of one article listing; must offer find()
       * @return string Trimmed text of the first <small> element, or '' if absent
       */
      private function ExtractArticleName ($article){
          $names = $article->find('small', 0);
          if(!$names)
              return '';

          return trim($names->plaintext);
      }

      /**
       * Extracts the publication timestamp from an article.
       *
       * The text of the '.article-info' element depends on the age of the article:
       * - Available online 29 July 2016
       * - July 2016
       * - May–June 2016
       *
       * A full date is preferred. Otherwise the first "<month> <year>" pair is
       * used, which for a month range is its end month (the start month is
       * followed by the separator, not by the year). Per PHP's date format rules
       * a month-year string resolves to the first day of that month.
       *
       * @param object $article DOM node of one article listing; must offer find()
       * @return int Unix timestamp, or 0 when no date could be parsed
       */
      private function ExtractArticleTimestamp ($article){
          $time = $article->find('.article-info', 0);
          if(!$time)
              return 0;

          $timestring = trim($time->plaintext);

          // Ordered from most to least specific; group 1 is what gets parsed.
          $patterns = array(
              '/(\d+\s\S+\s\d{4})/',     // day month year
              '/([A-Za-z]+\s\d{4})/'     // month year, also the tail of a range
          );

          foreach($patterns as $pattern){
              if(!preg_match($pattern, $timestring, $matches))
                  continue;

              // Parse only the captured date, never the surrounding text.
              $timestamp = strtotime($matches[1]);
              if($timestamp !== false)
                  return $timestamp;
              // Unparseable candidate: fall through to the next pattern.
          }

          return 0;
      }

      /**
       * Extracts the content from an article.
       *
       * @param object $article DOM node of one article listing; must offer find()
       * @return string Trimmed text of the '.article-content' element, or ''
       */
      private function ExtractArticleContent ($article){
          $content = $article->find('.article-content', 0);
          if(!$content)
              return '';

          return trim($content->plaintext);
      }

      /**
       * Fetches the journal's recent-articles page and appends one item per
       * '.pod-listing' element to $this->items.
       *
       * @param array $param Request parameters; $param['j'] is the journal slug
       */
      public function collectData(array $param){
          // The slug comes from the user, so escape it before it enters the path.
          $uri = 'http://www.journals.elsevier.com/' . rawurlencode($param['j']) . '/recent-articles/';

          $html = $this->getSimpleHTMLDOM($uri);
          if(!$html){
              $this->returnServerError('No results for Elsevier journal '.$param['j']);
              // NOTE(review): returnServerError presumably aborts the request;
              // this return only guards the case where it does not.
              return;
          }

          foreach($html->find('.pod-listing') as $article){
              // A listing without a header link has neither URI nor title.
              $link = $article->find('.pod-listing-header>a', 0);
              if(!$link)
                  continue;

              $item = array();
              $item['uri'] = $link->getAttribute('href').'?np=y';
              $item['title'] = trim($link->plaintext);
              $item['author'] = $this->ExtractArticleName($article);
              $item['timestamp'] = $this->ExtractArticleTimestamp($article);
              $item['content'] = $this->ExtractArticleContent($article);
              $this->items[] = $item;
          }
      }

      /**
       * Returns the cache lifetime in seconds.
       *
       * @return int 43200 seconds (12 hours)
       */
      public function getCacheDuration(){
          return 43200; // 12h
      }
  }
  72. ?>