ElsevierBridge.php 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. <?php
  /**
   * Bridge that lists the recent articles of an Elsevier journal.
   *
   * The journal is selected through the 'j' input, which is appended to
   * self::URI to build the ".../<journal>/recent-articles/" page address.
   * Each '.pod-listing' element found on that page becomes one feed item.
   */
  class ElsevierBridge extends BridgeAbstract {
      const MAINTAINER = 'Pierre Mazière';
      const NAME = 'Elsevier journals recent articles';
      const URI = 'http://www.journals.elsevier.com/';
      const CACHE_TIMEOUT = 43200; //12h
      const DESCRIPTION = 'Returns the recent articles published in Elsevier journals';

      const PARAMETERS = array( array(
          'j' => array(
              'name' => 'Journal name',
              'required' => true,
              // Journal identifier as it appears in the journal page address.
              'exampleValue' => 'academic-pediatrics',
              'title' => 'Insert html-part of your journal'
          )
      ));

      /**
       * Extracts the author names of an article as a single string.
       *
       * @param object $article Element of one article listing; it must offer
       *                        find() (presumably a Simple HTML DOM node).
       * @return string Trimmed text of the first <small> element, or '' when
       *                the listing has none.
       */
      private function extractArticleName($article){
          $names = $article->find('small', 0);
          if(!$names) {
              return '';
          }

          return trim($names->plaintext);
      }

      /**
       * Extracts the publication timestamp of an article.
       *
       * The text of '.article-info' depends on the age of the article:
       *  - "Available online 29 July 2016"
       *  - "July 2016"
       *  - "May–June 2016"
       *
       * @param object $article Element of one article listing; it must offer
       *                        find() (presumably a Simple HTML DOM node).
       * @return int Timestamp as computed by strtotime(), or 0 when no date
       *             can be found or parsed.
       */
      private function extractArticleTimestamp($article){
          $time = $article->find('.article-info', 0);
          if(!$time) {
              return 0;
          }

          $timestring = trim($time->plaintext);

          if(preg_match('/\d+\s\S+\s\d{4}/', $timestring, $matches)) {
              // Full date ("29 July 2016"). Only the date itself is matched so
              // that text glued in front of the day number is not handed to
              // strtotime().
              $date = $matches[0];
          } elseif(preg_match('/[A-Za-z]+\-([A-Za-z]+\s\d{4})/', $timestring, $matches)) {
              // Hyphenated month range ("May-June 2016"): keep only the captured
              // "June 2016"; the whole range is not a date strtotime() is meant
              // to receive.
              $date = $matches[1];
          } elseif(preg_match('/[A-Za-z]+\s\d{4}/', $timestring, $matches)) {
              // Month and year only ("July 2016"). Ranges written with an en
              // dash also end up here, because the dash is not an ASCII letter.
              $date = $matches[0];
          } else {
              return 0;
          }

          $timestamp = strtotime($date);

          // strtotime() yields false for unparsable input; report "unknown" as 0
          // like every other failure path of this method.
          return $timestamp === false ? 0 : $timestamp;
      }

      /**
       * Extracts the textual content of an article.
       *
       * @param object $article Element of one article listing; it must offer
       *                        find() (presumably a Simple HTML DOM node).
       * @return string Trimmed text of the first '.article-content' element,
       *                or '' when the listing has none.
       */
      private function extractArticleContent($article){
          $content = $article->find('.article-content', 0);
          if(!$content) {
              return '';
          }

          return trim($content->plaintext);
      }

      /**
       * Loads the "recent articles" page of the requested journal and appends
       * one item (uri, title, author, timestamp, content) per '.pod-listing'
       * element to $this->items.
       *
       * Listings without a header link are skipped, since neither a URI nor a
       * title can be derived from them.
       */
      public function collectData(){
          $uri = self::URI . $this->getInput('j') . '/recent-articles/';

          // getSimpleHTMLDOM() and returnServerError() are defined outside this
          // file; presumably the former returns a falsy value on failure and
          // the latter aborts the request - TODO confirm.
          $html = getSimpleHTMLDOM($uri)
              or returnServerError('No results for Elsevier journal ' . $this->getInput('j'));

          foreach($html->find('.pod-listing') as $article) {
              // Look the header link up once; it may be missing.
              $link = $article->find('.pod-listing-header>a', 0);
              if(!$link) {
                  continue;
              }

              $item = array();
              $item['uri'] = $link->getAttribute('href') . '?np=y';
              $item['title'] = $link->plaintext;
              $item['author'] = $this->extractArticleName($article);
              $item['timestamp'] = $this->extractArticleTimestamp($article);
              $item['content'] = $this->extractArticleContent($article);
              $this->items[] = $item;
          }
      }
  }