ElsevierBridge.php 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. <?php
  /**
   * Bridge listing the recent articles of an Elsevier journal.
   *
   * collectData() downloads <base URI>/<journal>/recent-articles/ and turns
   * every '.pod-listing' element of that page into one feed item.
   */
  class ElsevierBridge extends BridgeAbstract{

      /**
       * Declares the bridge metadata and its single parameter 'j'
       * (the journal name as it appears in the journal URL).
       */
      public function loadMetadatas() {
          $this->maintainer = 'Pierre Mazière';
          // Reuse the getters so name and URI are spelled out in one place only.
          $this->name = $this->getName();
          $this->uri = $this->getURI();
          $this->description = 'Returns the recent articles published in Elsevier journals';
          $this->update = '2016-08-02';
          // JSON parameter descriptor. Its lines stay flush-left on purpose so
          // the string content is not padded with indentation.
          $this->parameters[] =
  '[
{
"name" : "Journal name",
"identifier" : "j",
"required" : "true",
"exampleValue" : "academic-pediatrics",
"title" : "Insert html-part of your journal"
}
]';
      }

      /**
       * Extracts the list of names from an article as string.
       *
       * @param object $article DOM node of one listing; must provide find()
       * @return string trimmed text of the first <small> element, '' if there is none
       */
      public function ExtractArticleName ($article){
          $names = $article->find('small', 0);
          if($names)
              return trim($names->plaintext);
          return '';
      }

      /**
       * Extracts the timestamp from an article.
       *
       * The text of '.article-info' depends on the age of an article:
       * - Available online 29 July 2016
       * - July 2016
       * - May–June 2016
       *
       * @param object $article DOM node of one listing; must provide find()
       * @return int Unix timestamp, or 0 when no date is found or it cannot be parsed
       */
      public function ExtractArticleTimestamp ($article){
          $time = $article->find('.article-info', 0);
          if(!$time){
              return 0;
          }

          $timestring = trim($time->plaintext);

          // Tried from most to least specific. A month range needs no pattern
          // of its own: the month-and-year pattern picks up its closing month
          // ("June 2016"), whatever kind of dash separates the two months.
          $patterns = array(
              '/\d+\s+\S+\s+\d{4}/',     // day month year
              '/[A-Za-z]+\s+\d{4}/',     // month year
          );

          foreach($patterns as $pattern){
              if(!preg_match($pattern, $timestring, $matches)){
                  continue;
              }
              $timestamp = strtotime($matches[0]);
              // strtotime() yields false on unparsable input; never leak that
              // to the caller, fall back to the next pattern instead.
              if($timestamp !== false){
                  return $timestamp;
              }
          }

          return 0;
      }

      /**
       * Extracts the content from an article.
       *
       * @param object $article DOM node of one listing; must provide find()
       * @return string trimmed text of the first '.article-content' element, '' if there is none
       */
      public function ExtractArticleContent ($article){
          $content = $article->find('.article-content', 0);
          if($content){
              return trim($content->plaintext);
          }
          return '';
      }

      /**
       * Fetches the journal's "recent articles" page and appends one item per
       * listed article to $this->items.
       *
       * @param array $param bridge parameters; 'j' holds the journal name used in the URL
       */
      public function collectData(array $param){
          // The journal name comes from the user: encode it so it stays a single
          // path segment. A plain slug such as "academic-pediatrics" is unchanged.
          $uri = $this->getURI() . '/' . rawurlencode($param['j']) . '/recent-articles/';
          // NOTE(review): returnError() is defined outside this file; this code
          // relies on it not returning when the download fails - confirm.
          $html = file_get_html($uri) or $this->returnError('No results for Elsevier journal '.$param['j'], 404);

          foreach($html->find('.pod-listing') as $article){
              $link = $article->find('.pod-listing-header>a', 0);
              if(!$link){
                  // Listing without a header link: nothing to point the item at.
                  continue;
              }

              $href = (string)$link->getAttribute('href');
              // Append np=y without breaking a link that already has a query string.
              $separator = (strpos($href, '?') === false) ? '?' : '&';

              $item = new \Item();
              $item->uri = $href . $separator . 'np=y';
              $item->title = $link->plaintext;
              $item->name = $this->ExtractArticleName($article);
              $item->timestamp = $this->ExtractArticleTimestamp($article);
              $item->content = $this->ExtractArticleContent($article);
              $this->items[] = $item;
          }
      }

      /**
       * @return string display name of the bridge
       */
      public function getName(){
          return 'Elsevier journals recent articles';
      }

      /**
       * @return string base URI of the Elsevier journals site, without trailing slash
       */
      public function getURI(){
          return 'http://www.journals.elsevier.com';
      }

      /**
       * @return int cache lifetime in seconds
       */
      public function getCacheDuration(){
          return 43200; // 12h
      }
  }
  81. ?>