feedparser.php 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286
  1. <?php
  /**
   * Parses an RSS 2.0, RSS 1.0 (RDF) or Atom (1.0 / 0.3) document.
   *
   * Usage: construct with the raw XML, call init(), then read the results
   * through error(), get_title(), get_link(), get_items() and get_links().
   * The constructor only loads the XML (with a couple of recovery attempts
   * for broken input); init() detects the feed type and extracts metadata
   * and items.
   */
  class FeedParser {
      /** DOMDocument holding the parsed (possibly empty) document. */
      private $doc;
      /** First error message recorded, or unset/null when there is none. */
      private $error;
      /** Formatted messages for every fatal libxml error seen while loading. */
      private $libxml_errors = array();
      /** Array of FeedItem_Atom / FeedItem_RSS objects built by init(). */
      private $items;
      /** Feed link (trimmed) or null. */
      private $link;
      /** Feed title (trimmed) or null. */
      private $title;
      /** One of the FEED_* constants once init() has recognised the feed. */
      private $type;
      /** DOMXPath bound to $doc with the feed namespaces registered. */
      private $xpath;

      const FEED_RDF = 0;
      const FEED_RSS = 1;
      const FEED_ATOM = 2;

      /**
       * Converts a document to UTF-8 based on its XML declaration.
       *
       * If $data starts with an XML declaration that names an encoding, the
       * body is converted to UTF-8 when mbstring lists that encoding, and the
       * declaration is rewritten to say UTF-8. Data without such a declaration
       * is returned unchanged.
       *
       * @param string $data raw XML
       * @return string|null the normalized XML (null if the regex engine fails)
       */
      public function normalize_encoding($data) {
          if (preg_match('/^(<\?xml[\t\n\r ].*?encoding[\t\n\r ]*=[\t\n\r ]*["\'])(.+?)(["\'].*?\?>)/s', $data, $matches) === 1) {
              $encoding = strtolower($matches[2]);

              if (in_array($encoding, array_map('strtolower', mb_list_encodings()), true)) {
                  $data = mb_convert_encoding($data, 'UTF-8', $encoding);
              }

              // The new declaration is built from text captured out of the
              // document, so it must be inserted literally: a callback avoids
              // "$1" / "\1" sequences being interpreted as back-references.
              $declaration = $matches[1] . "UTF-8" . $matches[3];
              $data = preg_replace_callback(
                  '/^<\?xml[\t\n\r ].*?\?>/s',
                  function ($unused) use ($declaration) {
                      return $declaration;
                  },
                  $data
              );
          }

          return $data;
      }

      /**
       * Loads $data into a DOM document, retrying after cleanup when libxml
       * reports an unsupported encoding or an invalid character.
       *
       * Fatal libxml errors that remain are available through error() (first
       * one only) and errors() (all of them). Empty input is not handed to
       * libxml; it is reported as "Empty feed data provided".
       *
       * Side effects: enables libxml internal error handling and sets the
       * mbstring substitute character to "none".
       *
       * @param string $data raw feed XML
       */
      public function __construct($data) {
          libxml_use_internal_errors(true);
          libxml_clear_errors();
          mb_substitute_character("none");

          $this->items = array();
          $this->doc = new DOMDocument();

          // DOMDocument::loadXML() rejects an empty source (ValueError on
          // PHP 8, warning on PHP 7), so report it ourselves instead.
          if ($data === null || $data === false || $data === '') {
              $this->error = "Empty feed data provided";
              return;
          }

          $this->doc->loadXML($data);
          $error = libxml_get_last_error();

          // Code 32: unsupported encoding (libxml compiled without iconv?).
          if ($error && $error->code == 32) {
              $data = $this->normalize_encoding($data);

              if ($data) {
                  $error = $this->reload($data);
              }
          }

          // Code 9: invalid character somewhere in the document.
          if ($error) {
              foreach (libxml_get_errors() as $err) {
                  if ($err->code == 9) {
                      // If the source is not UTF-8 the next conversion would fail.
                      $data = $this->normalize_encoding($data);

                      // Remove dangling bytes.
                      $data = mb_convert_encoding($data, 'UTF-8', 'UTF-8');

                      // Not every UTF-8 character is valid XML: keep only the
                      // XML 1.0 "Char" ranges, including U+10000-U+10FFFF.
                      $data = preg_replace('/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]+/u', ' ', $data);

                      if ($data) {
                          $error = $this->reload($data);
                      }
                      break;
                  }
              }
          }

          if ($error) {
              foreach (libxml_get_errors() as $fatal) {
                  if ($fatal->level == LIBXML_ERR_FATAL) {
                      $message = $this->format_error($fatal);

                      // Only the first fatal error is exposed through error().
                      if (!isset($this->error)) {
                          $this->error = $message;
                      }
                      $this->libxml_errors[] = $message;
                  }
              }
          }

          libxml_clear_errors();
      }

      /**
       * Discards the current document and pending libxml errors, then parses
       * $data into a fresh DOMDocument.
       *
       * @param string $data XML to load
       * @return LibXMLError|false the last libxml error, false when none
       */
      private function reload($data) {
          libxml_clear_errors();
          $this->doc = new DOMDocument();
          $this->doc->loadXML($data);

          return libxml_get_last_error();
      }

      /**
       * Detects the feed type and extracts title, link and items.
       *
       * Sets error() to "Unknown/unsupported feed type" (unless an earlier
       * error is already recorded) when no known feed root element is found.
       * Items are created as FeedItem_Atom or FeedItem_RSS objects.
       */
      public function init() {
          // Without a root element nothing can match; skip the XPath work.
          if (!$this->doc || !$this->doc->documentElement) {
              $this->set_unsupported_error();
              return;
          }

          $xpath = new DOMXPath($this->doc);
          $namespaces = array(
              'atom' => 'http://www.w3.org/2005/Atom',
              'atom03' => 'http://purl.org/atom/ns#',
              'media' => 'http://search.yahoo.com/mrss/',
              'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
              'slash' => 'http://purl.org/rss/1.0/modules/slash/',
              'dc' => 'http://purl.org/dc/elements/1.1/',
              'content' => 'http://purl.org/rss/1.0/modules/content/',
              'thread' => 'http://purl.org/syndication/thread/1.0',
          );
          foreach ($namespaces as $prefix => $uri) {
              $xpath->registerNamespace($prefix, $uri);
          }
          $this->xpath = $xpath;

          $roots = $xpath->query("(//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF)");
          $root = ($roots && $roots->length > 0) ? $roots->item(0) : null;

          if (!$root) {
              $this->set_unsupported_error();
              return;
          }

          // The XPath above already matched by namespace, so decide on the
          // local name only: the tag name would include whatever prefix the
          // document happens to use (e.g. "a:feed") and miss valid feeds.
          switch (mb_strtolower($root->localName)) {
              case "rdf":
                  $this->type = self::FEED_RDF;
                  $this->init_rdf();
                  break;
              case "channel":
                  $this->type = self::FEED_RSS;
                  $this->init_rss();
                  break;
              case "feed":
                  $this->type = self::FEED_ATOM;
                  $this->init_atom();
                  break;
              default:
                  $this->set_unsupported_error();
                  return;
          }

          if ($this->title) $this->title = trim($this->title);
          if ($this->link) $this->link = trim($this->link);
      }

      /** Records the "unsupported feed" error unless an error is already set. */
      private function set_unsupported_error() {
          if (!isset($this->error)) {
              $this->error = "Unknown/unsupported feed type";
          }
      }

      /**
       * Returns the first node matching $query, or null when nothing matches.
       *
       * @param string $query XPath expression
       * @return DOMNode|null
       */
      private function first_node($query) {
          $nodes = $this->xpath->query($query);

          return $nodes ? $nodes->item(0) : null;
      }

      /** Extracts title, link and entries from an Atom 1.0 / 0.3 feed. */
      private function init_atom() {
          $title = $this->first_node("//atom:feed/atom:title");
          if (!$title) {
              $title = $this->first_node("//atom03:feed/atom03:title");
          }
          if ($title) {
              $this->title = $title->nodeValue;
          }

          // Preference order: Atom 1.0 before 0.3, a link without rel
          // before rel="alternate".
          $link_queries = array(
              "//atom:feed/atom:link[not(@rel)]",
              "//atom:feed/atom:link[@rel='alternate']",
              "//atom03:feed/atom03:link[not(@rel)]",
              "//atom03:feed/atom03:link[@rel='alternate']",
          );
          $link = null;
          foreach ($link_queries as $query) {
              $link = $this->first_node($query);
              if ($link) {
                  break;
              }
          }
          if ($link && $link->hasAttributes()) {
              $this->link = $link->getAttribute("href");
          }

          $articles = $this->xpath->query("//atom:entry");
          if (!$articles || $articles->length == 0) {
              $articles = $this->xpath->query("//atom03:entry");
          }
          foreach ($articles as $article) {
              array_push($this->items, new FeedItem_Atom($article, $this->doc, $this->xpath));
          }
      }

      /** Extracts title, link and items from an RSS 2.0 style channel. */
      private function init_rss() {
          $title = $this->first_node("//channel/title");
          if ($title) {
              $this->title = $title->nodeValue;
          }

          $link = $this->first_node("//channel/link");
          if ($link) {
              // Prefer an href attribute, fall back to the element text.
              if ($link->getAttribute("href")) {
                  $this->link = $link->getAttribute("href");
              } else if ($link->nodeValue) {
                  $this->link = $link->nodeValue;
              }
          }

          foreach ($this->xpath->query("//channel/item") as $article) {
              array_push($this->items, new FeedItem_RSS($article, $this->doc, $this->xpath));
          }
      }

      /** Extracts title, link and items from an RSS 1.0 (RDF) feed. */
      private function init_rdf() {
          $this->xpath->registerNamespace('rssfake', 'http://purl.org/rss/1.0/');

          $title = $this->first_node("//rssfake:channel/rssfake:title");
          if ($title) {
              $this->title = $title->nodeValue;
          }

          $link = $this->first_node("//rssfake:channel/rssfake:link");
          if ($link) {
              $this->link = $link->nodeValue;
          }

          foreach ($this->xpath->query("//rssfake:item") as $article) {
              array_push($this->items, new FeedItem_RSS($article, $this->doc, $this->xpath));
          }
      }

      /**
       * Formats a libxml error as a single human-readable line.
       *
       * @param LibXMLError|false|null $error
       * @return string the message, or "" when $error is empty
       */
      public function format_error($error) {
          if (!$error) {
              return "";
          }

          return sprintf("LibXML error %s at line %d (column %d): %s",
              $error->code, $error->line, $error->column,
              $error->message);
      }

      /** @return string|null the first recorded error message, if any */
      public function error() {
          return $this->error;
      }

      /** @return array formatted messages of all fatal libxml errors */
      public function errors() {
          return $this->libxml_errors;
      }

      /** @return string|null the feed link found by init() */
      public function get_link() {
          return $this->link;
      }

      /** @return string|null the feed title found by init() */
      public function get_title() {
          return $this->title;
      }

      /** @return array the items built by init() */
      public function get_items() {
          return $this->items;
      }

      /**
       * Returns the href of feed-level atom:link elements.
       *
       * For Atom feeds only links directly under atom:feed are considered;
       * for RSS feeds any atom:link in the document. Other feed types (or an
       * uninitialised parser) yield an empty array.
       *
       * @param string|null $rel rel value to match; a falsy value matches all
       * @return array trimmed href values
       */
      public function get_links($rel) {
          if ($this->type === self::FEED_ATOM) {
              $query = "//atom:feed/atom:link";
          } else if ($this->type === self::FEED_RSS) {
              $query = "//atom:link";
          } else {
              return array();
          }

          $rv = array();
          foreach ($this->xpath->query($query) as $link) {
              if (!$rel || $link->hasAttribute('rel') && $link->getAttribute('rel') == $rel) {
                  array_push($rv, trim($link->getAttribute('href')));
              }
          }

          return $rv;
      }
  }