feedparser.php 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236
  1. <?php
  /**
   * Parses a syndication feed held in a string and exposes its title, link,
   * items and <link> relations.
   *
   * Three document shapes are recognised, based on the first matching root
   * element found by init():
   *  - Atom, in the "http://www.w3.org/2005/Atom" or "http://purl.org/atom/ns#"
   *    namespace (FEED_ATOM);
   *  - RSS with an un-namespaced <channel> element (FEED_RSS);
   *  - RDF with an <rdf:RDF> root and "http://purl.org/rss/1.0/" children
   *    (FEED_RDF).
   *
   * Typical use: construct with the raw XML, call init(), then check error()
   * before reading get_title() / get_link() / get_items().
   */
  class FeedParser {
      const FEED_RDF = 0;
      const FEED_RSS = 1;
      const FEED_ATOM = 2;

      /** DOMDocument holding the most recent parse attempt. */
      private $doc;
      /** Error message; empty string when nothing went wrong. */
      private $error;
      /** List of FeedItem_Atom / FeedItem_RSS objects built by init(). */
      private $items;
      /** Feed-level link, or null when none was found. */
      private $link;
      /** Feed-level title, or null when none was found. */
      private $title;
      /** One of the FEED_* constants, or null until init() recognises the feed. */
      private $type;
      /** DOMXPath bound to $doc; created by init(). */
      private $xpath;

      /**
       * Parses $data and records the last libxml error, if any.
       *
       * Two recovery passes are attempted before giving up:
       *  - libxml error 32 (XML_ERR_UNSUPPORTED_ENCODING): the document is
       *    converted to UTF-8 with iconv(), the encoding declaration is removed
       *    and the result is parsed again;
       *  - libxml error 9 (XML_ERR_INVALID_CHAR): invalid UTF-8 sequences are
       *    dropped with iconv() and the result is parsed again.
       * If a conversion fails, the error from the previous parse is kept
       * instead of being overwritten by a parse of an empty string.
       *
       * Note: this switches libxml to internal error handling for the process
       * (libxml_use_internal_errors(true)) and clears its error buffer.
       *
       * @param string $data raw feed XML
       */
      public function __construct($data) {
          libxml_use_internal_errors(true);
          libxml_clear_errors();

          $this->items = array();

          // DOMDocument::loadXML() rejects an empty source (ValueError on
          // PHP 8, warning before that), so report it without calling it.
          if (!is_string($data) || $data === '') {
              $this->doc = new DOMDocument();
              $this->error = "Empty feed data";
              return;
          }

          $error = $this->load_document($data);

          // Unsupported encoding (e.g. libxml compiled without iconv):
          // transcode by hand and strip the encoding="..." declaration so that
          // libxml falls back to UTF-8. Both quote styles are accepted.
          if ($error && $error->code == 32) {
              $declaration = '/^(<\\?xml .*?)encoding=(["\'])(.+?)\\2(.*?\\?>)/';

              if (preg_match($declaration, $data, $matches) === 1) {
                  $converted = iconv($matches[3], 'UTF-8//IGNORE', $data);

                  if ($converted !== false) {
                      $converted = preg_replace('/^<\\?xml .*?\\?>/',
                          $matches[1] . $matches[4], $converted);
                  }

                  if (is_string($converted) && $converted !== '') {
                      $data = $converted;
                      libxml_clear_errors();
                      $error = $this->load_document($data);
                  }
              }
          }

          // Invalid character in the input: drop whatever is not valid UTF-8.
          // We might want to try guessing the input encoding here too.
          if ($error && $error->code == 9) {
              $cleaned = iconv("UTF-8", "UTF-8//IGNORE", $data);

              if (is_string($cleaned) && $cleaned !== '') {
                  libxml_clear_errors();
                  $error = $this->load_document($cleaned);
              }
          }

          $this->error = $this->format_error($error);
          libxml_clear_errors();
      }

      /**
       * Replaces $this->doc with a fresh document parsed from $data.
       *
       * @param string $data non-empty XML source
       * @return LibXMLError|false the last libxml error, false when there is none
       */
      private function load_document($data) {
          $this->doc = new DOMDocument();
          $this->doc->loadXML($data);

          return libxml_get_last_error();
      }

      /**
       * Detects the feed type and fills in title, link and items.
       *
       * When no supported root element is present, error() reports
       * "Unknown/unsupported feed type" unless an earlier (parse) error is
       * already recorded, in which case that earlier error is kept.
       */
      public function init() {
          $xpath = new DOMXPath($this->doc);

          $namespaces = array(
              'atom' => 'http://www.w3.org/2005/Atom',
              'atom03' => 'http://purl.org/atom/ns#',
              'media' => 'http://search.yahoo.com/mrss/',
              'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
              'slash' => 'http://purl.org/rss/1.0/modules/slash/',
              'dc' => 'http://purl.org/dc/elements/1.1/',
              'content' => 'http://purl.org/rss/1.0/modules/content/',
          );

          foreach ($namespaces as $prefix => $uri) {
              $xpath->registerNamespace($prefix, $uri);
          }

          $this->xpath = $xpath;

          $roots = $xpath->query("(//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF)");
          $root = $roots ? $roots->item(0) : null;

          // Kept in a local so that "nothing matched" stays null; FEED_RDF is 0
          // and would otherwise be indistinguishable from null in a loose
          // comparison.
          $type = null;

          if ($root) {
              switch (mb_strtolower($root->tagName)) {
                  case "rdf:rdf":
                      $type = self::FEED_RDF;
                      break;
                  case "channel":
                      $type = self::FEED_RSS;
                      break;
                  case "feed":
                      $type = self::FEED_ATOM;
                      break;
              }
          }

          if ($type === null) {
              $this->flag_unsupported();
              return;
          }

          $this->type = $type;

          switch ($type) {
              case self::FEED_ATOM:
                  $this->init_atom($xpath);
                  break;
              case self::FEED_RSS:
                  $this->init_rss($xpath);
                  break;
              case self::FEED_RDF:
                  $this->init_rdf($xpath);
                  break;
          }
      }

      /**
       * Records the "unsupported feed" error without hiding an earlier one.
       */
      private function flag_unsupported() {
          if ($this->error === null || $this->error === '') {
              $this->error = "Unknown/unsupported feed type";
          }
      }

      /**
       * Reads title, link and entries of an Atom feed; each lookup tries the
       * "atom" namespace first and the "atom03" namespace second.
       *
       * @param DOMXPath $xpath evaluator with the feed namespaces registered
       */
      private function init_atom($xpath) {
          $title = $xpath->query("//atom:feed/atom:title")->item(0);

          if (!$title) {
              $title = $xpath->query("//atom03:feed/atom03:title")->item(0);
          }

          if ($title) {
              $this->title = $title->nodeValue;
          }

          // A link without rel is preferred; rel="alternate" means the same
          // thing in Atom and is used as a fallback.
          $candidates = array(
              "//atom:feed/atom:link[not(@rel)]",
              "//atom03:feed/atom03:link[not(@rel)]",
              "//atom:feed/atom:link[@rel='alternate']",
              "//atom03:feed/atom03:link[@rel='alternate']",
          );

          $link = null;

          foreach ($candidates as $expression) {
              $link = $xpath->query($expression)->item(0);

              if ($link) {
                  break;
              }
          }

          if ($link && $link->hasAttributes()) {
              $this->link = $link->getAttribute("href");
          }

          $articles = $xpath->query("//atom:entry");

          if (!$articles || $articles->length == 0) {
              $articles = $xpath->query("//atom03:entry");
          }

          foreach ($articles as $article) {
              $this->items[] = new FeedItem_Atom($article, $this->doc, $this->xpath);
          }
      }

      /**
       * Reads title, link and items below the un-namespaced <channel>.
       *
       * @param DOMXPath $xpath evaluator with the feed namespaces registered
       */
      private function init_rss($xpath) {
          $title = $xpath->query("//channel/title")->item(0);

          if ($title) {
              $this->title = $title->nodeValue;
          }

          $link = $xpath->query("//channel/link")->item(0);

          if ($link) {
              // An href attribute wins over the element's text content.
              if ($link->getAttribute("href")) {
                  $this->link = $link->getAttribute("href");
              } else if ($link->nodeValue) {
                  $this->link = $link->nodeValue;
              }
          }

          foreach ($xpath->query("//channel/item") as $article) {
              $this->items[] = new FeedItem_RSS($article, $this->doc, $this->xpath);
          }
      }

      /**
       * Reads title, link and items of an RDF feed. Its elements live in the
       * "http://purl.org/rss/1.0/" namespace, registered here as "rssfake".
       *
       * @param DOMXPath $xpath evaluator with the feed namespaces registered
       */
      private function init_rdf($xpath) {
          $xpath->registerNamespace('rssfake', 'http://purl.org/rss/1.0/');

          $title = $xpath->query("//rssfake:channel/rssfake:title")->item(0);

          if ($title) {
              $this->title = $title->nodeValue;
          }

          $link = $xpath->query("//rssfake:channel/rssfake:link")->item(0);

          if ($link) {
              $this->link = $link->nodeValue;
          }

          foreach ($xpath->query("//rssfake:item") as $article) {
              $this->items[] = new FeedItem_RSS($article, $this->doc, $this->xpath);
          }
      }

      /**
       * Turns a libxml error into a one-line message.
       *
       * @param LibXMLError|false $error
       * @return string the message, or "" when $error is falsy
       */
      public function format_error($error) {
          if (!$error) {
              return "";
          }

          return sprintf("LibXML error %s at line %d (column %d): %s",
              $error->code, $error->line, $error->column,
              $error->message);
      }

      /**
       * @return string the recorded error message, "" when there is none
       */
      public function error() {
          return $this->error;
      }

      /**
       * @return string|null the feed link found by init(), if any
       */
      public function get_link() {
          return $this->link;
      }

      /**
       * @return string|null the feed title found by init(), if any
       */
      public function get_title() {
          return $this->title;
      }

      /**
       * @return array the item objects created by init()
       */
      public function get_items() {
          return $this->items;
      }

      /**
       * Collects href values of Atom <link> elements.
       *
       * For Atom feeds only links directly below <atom:feed> are considered;
       * for RSS feeds every <atom:link> in the document is. Other feed types
       * (and feeds that init() did not recognise) yield an empty list.
       *
       * @param string $rel relation to match; a falsy value selects every link
       * @return array list of href attribute values
       */
      public function get_links($rel) {
          switch ($this->type) {
              case self::FEED_ATOM:
                  $expression = "//atom:feed/atom:link";
                  break;
              case self::FEED_RSS:
                  $expression = "//atom:link";
                  break;
              default:
                  return array();
          }

          $rv = array();

          foreach ($this->xpath->query($expression) as $link) {
              if (!$rel || ($link->hasAttribute('rel') && $link->getAttribute('rel') == $rel)) {
                  $rv[] = $link->getAttribute('href');
              }
          }

          return $rv;
      }
  }