Parcourir la source

add basic tinyparser/atom

Andrew Dolgov il y a 11 ans
Parent
commit
cd07592c29
5 fichiers modifiés avec 255 ajouts et 13 suppressions
  1. 7 0
      classes/feedenclosure.php
  2. 15 0
      classes/feeditem.php
  3. 118 0
      classes/feeditem/atom.php
  4. 101 0
      classes/feedparser.php
  5. 14 13
      include/rssfuncs.php

+ 7 - 0
classes/feedenclosure.php

@@ -0,0 +1,7 @@
+<?php
+/**
+ * Plain value holder describing one attachment (enclosure) of a feed item.
+ *
+ * Instances carry no behaviour; the item classes create them and fill the
+ * public properties directly (see FeedItem_Atom::get_enclosures()).
+ */
+class FeedEnclosure {
+	/** URL of the attached resource (Atom: "href" of the enclosure link). */
+	public $link;
+
+	/** Media type of the resource as advertised by the feed ("type"). */
+	public $type;
+
+	/** Size of the resource as advertised by the feed ("length"), unparsed. */
+	public $length;
+}
+?>

+ 15 - 0
classes/feeditem.php

@@ -0,0 +1,15 @@
+<?php
+/**
+ * Contract every format-specific feed item wrapper has to fulfil.
+ *
+ * The class must itself be declared abstract: PHP raises a fatal error for
+ * a non-abstract class that contains abstract methods, which made this file
+ * unloadable before.
+ */
+abstract class FeedItem {
+	/** Unique identifier (GUID) of the item. */
+	abstract function get_id();
+	/** Date of the item. */
+	abstract function get_date();
+	/** URL of the item. */
+	abstract function get_link();
+	/** Title of the item. */
+	abstract function get_title();
+	/** Short description / summary of the item. */
+	abstract function get_description();
+	/** Full content of the item. */
+	abstract function get_content();
+	/** URL of the comments page belonging to the item. */
+	abstract function get_comments_url();
+	/** Number of comments on the item. */
+	abstract function get_comments_count();
+	/** Categories (tags) attached to the item. */
+	abstract function get_categories();
+	/** Enclosures (attachments) of the item. */
+	abstract function get_enclosures();
+	/** Author of the item. */
+	abstract function get_author();
+}
+?>

+ 118 - 0
classes/feeditem/atom.php

@@ -0,0 +1,118 @@
+<?php
+/**
+ * Read-only accessor around a single Atom <entry> DOM element.
+ *
+ * Getters return null (implicitly) when the requested piece of data is not
+ * present in the entry, except get_categories()/get_enclosures() which
+ * always return an array.
+ */
+class FeedItem_Atom {
+	private $elem;
+
+	/**
+	 * @param DOMElement $elem the <entry> element this object wraps
+	 */
+	function __construct($elem) {
+		$this->elem = $elem;
+	}
+
+	/**
+	 * Returns the text of the first <id> element, falling back to the
+	 * entry link when the entry has no <id>.
+	 */
+	function get_id() {
+		$id = $this->elem->getElementsByTagName("id")->item(0);
+
+		if ($id) {
+			return $id->nodeValue;
+		} else {
+			return $this->get_link();
+		}
+	}
+
+	/**
+	 * Returns the entry date as a Unix timestamp.
+	 *
+	 * <updated> is preferred, <published> is used as a fallback. Nothing
+	 * (null) is returned when neither element exists or none of them can
+	 * be parsed by strtotime().
+	 */
+	function get_date() {
+		foreach (array("updated", "published") as $tag) {
+			$node = $this->elem->getElementsByTagName($tag)->item(0);
+
+			if ($node) {
+				$timestamp = strtotime($node->nodeValue);
+
+				if ($timestamp !== false) return $timestamp;
+			}
+		}
+	}
+
+	/**
+	 * Returns the href of the first link that points to the entry itself.
+	 *
+	 * Per RFC 4287 a link without "rel" is equivalent to rel="alternate",
+	 * so both spellings are accepted; links with any other relation
+	 * (enclosure, self, replies, ...) are skipped.
+	 */
+	function get_link() {
+		$links = $this->elem->getElementsByTagName("link");
+
+		foreach ($links as $link) {
+			if (!$link->hasAttribute("href")) continue;
+
+			if (!$link->hasAttribute("rel") || $link->getAttribute("rel") == "alternate") {
+				return $link->getAttribute("href");
+			}
+		}
+	}
+
+	/** Returns the text of the first <title> element, if any. */
+	function get_title() {
+		$title = $this->elem->getElementsByTagName("title")->item(0);
+
+		if ($title) {
+			return $title->nodeValue;
+		}
+	}
+
+	/** Returns the text value of the first <content> element, if any. */
+	function get_content() {
+		$content = $this->elem->getElementsByTagName("content")->item(0);
+
+		if ($content) {
+			return $content->nodeValue;
+		}
+	}
+
+	/** Returns the text value of the first <summary> element, if any. */
+	function get_description() {
+		$summary = $this->elem->getElementsByTagName("summary")->item(0);
+
+		if ($summary) {
+			return $summary->nodeValue;
+		}
+	}
+
+	// todo: not implemented yet, always yields null
+	function get_comments_url() {
+
+	}
+
+	// todo: not implemented yet, always yields null
+	function get_comments_count() {
+
+	}
+
+	/**
+	 * Returns the "term" attribute of every <category> element that has
+	 * one, in document order.
+	 */
+	function get_categories() {
+		$categories = $this->elem->getElementsByTagName("category");
+		$cats = array();
+
+		foreach ($categories as $cat) {
+			if ($cat->hasAttribute("term"))
+				array_push($cats, $cat->getAttribute("term"));
+		}
+
+		return $cats;
+	}
+
+	/**
+	 * Returns a FeedEnclosure for every <link rel="enclosure"> that has
+	 * an href; type and length are copied verbatim from the attributes.
+	 */
+	function get_enclosures() {
+		$links = $this->elem->getElementsByTagName("link");
+
+		$encs = array();
+
+		foreach ($links as $link) {
+			if ($link->hasAttribute("href") && $link->getAttribute("rel") == "enclosure") {
+				$enc = new FeedEnclosure();
+
+				$enc->type = $link->getAttribute("type");
+				$enc->link = $link->getAttribute("href");
+				$enc->length = $link->getAttribute("length");
+
+				array_push($encs, $enc);
+			}
+		}
+
+		return $encs;
+	}
+
+	/**
+	 * Returns the author's <name>, or the <email> when no name is given;
+	 * only the first <author> element is considered.
+	 */
+	function get_author() {
+		$author = $this->elem->getElementsByTagName("author")->item(0);
+
+		if ($author) {
+			$name = $author->getElementsByTagName("name")->item(0);
+
+			if ($name) return $name->nodeValue;
+
+			$email = $author->getElementsByTagName("email")->item(0);
+
+			if ($email) return $email->nodeValue;
+
+		}
+	}
+}
+?>

+ 101 - 0
classes/feedparser.php

@@ -0,0 +1,101 @@
+<?php
+/**
+ * Minimal DOM based feed parser.
+ *
+ * Usage: construct with the raw XML, call init(), then check error() and
+ * read get_title()/get_link()/get_items(). Only Atom feeds are actually
+ * parsed so far; RSS is recognised but yields no data, anything else is
+ * reported as an error.
+ */
+class FeedParser {
+	private $doc;
+	private $error;
+	private $items;
+	private $link;
+	private $title;
+	private $type;
+
+	const FEED_RDF = 0;
+	const FEED_RSS = 1;
+	const FEED_ATOM = 2;
+
+	/**
+	 * Loads $data into a DOMDocument and remembers the last libxml error.
+	 *
+	 * Note: switches libxml to internal error handling for the rest of
+	 * the request (the previous setting is not restored).
+	 *
+	 * @param string $data raw feed XML
+	 */
+	function __construct($data) {
+		libxml_use_internal_errors(true);
+		libxml_clear_errors();
+		$this->doc = new DOMDocument();
+		$this->items = array();
+
+		// loadXML() warns (PHP < 8) or throws ValueError (PHP 8+) on empty
+		// input and leaves no libxml error behind, so report it ourselves.
+		if (!is_string($data) || $data === '') {
+			$this->error = "Empty feed data";
+			return;
+		}
+
+		$this->doc->loadXML($data);
+		$this->error = $this->format_error(libxml_get_last_error());
+		libxml_clear_errors();
+	}
+
+	/**
+	 * Detects the feed type from the root element and extracts title,
+	 * link and items. Does nothing when the document has no root element
+	 * (e.g. the XML failed to load); sets the error message when the root
+	 * element is of an unknown type.
+	 */
+	function init() {
+		// documentElement rather than firstChild: the first child of the
+		// document may be a comment, doctype or processing instruction
+		// (xml-stylesheet), which has no tagName.
+		$root = $this->doc->documentElement;
+
+		if ($root) {
+			switch ($root->tagName) {
+			case "rss":
+				$this->type = self::FEED_RSS;
+				break;
+			case "feed":
+				$this->type = self::FEED_ATOM;
+				break;
+			default:
+				$this->error = "Unknown/unsupported feed type";
+				return;
+			}
+
+			$xpath = new DOMXPath($this->doc);
+
+			switch ($this->type) {
+			case self::FEED_ATOM:
+				$xpath->registerNamespace('atom', 'http://www.w3.org/2005/Atom');
+
+				$title = $xpath->query("//atom:feed/atom:title")->item(0);
+
+				if ($title) {
+					$this->title = $title->nodeValue;
+				}
+
+				// RFC 4287: a link without rel is the same as rel="alternate"
+				$link = $xpath->query("//atom:feed/atom:link[not(@rel) or @rel='alternate']")->item(0);
+
+				if ($link && $link->hasAttributes()) {
+					$this->link = $link->getAttribute("href");
+				}
+
+				$articles = $xpath->query("//atom:entry");
+
+				foreach ($articles as $article) {
+					array_push($this->items, new FeedItem_Atom($article));
+				}
+
+				break;
+			case self::FEED_RDF:
+				// not implemented yet
+				break;
+			case self::FEED_RSS:
+				// not implemented yet
+				break;
+			}
+		}
+	}
+
+	/**
+	 * Turns a LibXMLError into a one-line message; returns "" when there
+	 * is no error.
+	 *
+	 * @param LibXMLError|false $error result of libxml_get_last_error()
+	 */
+	function format_error($error) {
+		if ($error) {
+			return sprintf("LibXML error %s at line %d (column %d): %s",
+				$error->code, $error->line, $error->column,
+				$error->message);
+		} else {
+			return "";
+		}
+	}
+
+	/** Returns the last error message, "" when parsing went fine. */
+	function error() {
+		return $this->error;
+	}
+
+	/** Returns the site link found by init(), null when there is none. */
+	function get_link() {
+		return $this->link;
+	}
+
+	/** Returns the feed title found by init(), null when there is none. */
+	function get_title() {
+		return $this->title;
+	}
+
+	/** Returns the array of item objects collected by init(). */
+	function get_items() {
+		return $this->items;
+	}
+
+} ?>

+ 14 - 13
include/rssfuncs.php

@@ -361,16 +361,17 @@
 		}
 
 		if (!$rss) {
-			$rss = new SimplePie();
+			/* $rss = new SimplePie();
 			$rss->set_sanitize_class("SanitizeDummy");
 			// simplepie ignores the above and creates default sanitizer anyway,
 			// so let's override it...
 			$rss->sanitize = new SanitizeDummy();
 			$rss->set_output_encoding('UTF-8');
 			$rss->set_raw_data($feed_data);
-			$rss->enable_cache(false);
+			$rss->enable_cache(false); */
 
-			@$rss->init();
+			$rss = new FeedParser($feed_data);
+			$rss->init();
 		}
 
 //		print_r($rss);
@@ -386,7 +387,7 @@
 
 				if ($new_rss_hash != $rss_hash) {
 					_debug("saving $cache_filename", $debug_enabled);
-					@file_put_contents($cache_filename, serialize($rss));
+					//@file_put_contents($cache_filename, serialize($rss)); NOT YET
 				}
 			}
 
@@ -418,6 +419,9 @@
 
 			$site_url = db_escape_string(mb_substr(rewrite_relative_url($fetch_url, $rss->get_link()), 0, 245));
 
+			_debug("site_url: $site_url", $debug_enabled);
+			_debug("feed_title: " . $rss->get_title(), $debug_enabled);
+
 			if ($favicon_needs_check || $force_refetch) {
 
 				/* terrible hack: if we crash on floicon shit here, we won't check
@@ -533,6 +537,8 @@
 				if (!$entry_guid) $entry_guid = $item->get_link();
 				if (!$entry_guid) $entry_guid = make_guid_from_title($item->get_title());
 
+				_debug("f_guid $entry_guid", $debug_enabled);
+
 				if (!$entry_guid) continue;
 
 				$entry_guid = "$owner_uid,$entry_guid";
@@ -575,20 +581,15 @@
 					print "\n";
 				}
 
-				$entry_comments = $item->data["comments"];
-
-				if ($item->get_author()) {
-					$entry_author_item = $item->get_author();
-					$entry_author = $entry_author_item->get_name();
-					if (!$entry_author) $entry_author = $entry_author_item->get_email();
-				}
+				$entry_comments = $item->get_comments_url();
+				$entry_author = $item->get_author();
 
 				$entry_guid = db_escape_string(mb_substr($entry_guid, 0, 245));
 
 				$entry_comments = db_escape_string(mb_substr(trim($entry_comments), 0, 245));
 				$entry_author = db_escape_string(mb_substr(trim($entry_author), 0, 245));
 
-				$num_comments = $item->get_item_tags('http://purl.org/rss/1.0/modules/slash/', 'comments');
+				$num_comments = $item->get_comments_count();
 
 				if (is_array($num_comments) && is_array($num_comments[0])) {
 					$num_comments = (int) $num_comments[0]["data"];
@@ -608,7 +609,7 @@
 
 				if (is_array($additional_tags_src)) {
 					foreach ($additional_tags_src as $tobj) {
-						array_push($additional_tags, $tobj->get_term());
+						array_push($additional_tags, $tobj);
 					}
 				}