12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722 |
- package atom
- import (
- "encoding/base64"
- "io"
- "strings"
- "github.com/PuerkitoBio/goquery"
- "github.com/mmcdole/gofeed/extensions"
- "github.com/mmcdole/gofeed/internal/shared"
- "github.com/mmcdole/goxpp"
- )
// Parser decodes Atom feeds. It carries no state, so the zero value is
// ready to use.
type Parser struct{}
- // Parse parses an xml feed into an atom.Feed
- func (ap *Parser) Parse(feed io.Reader) (*Feed, error) {
- p := xpp.NewXMLPullParser(feed, false, shared.NewReaderLabel)
- _, err := shared.FindRoot(p)
- if err != nil {
- return nil, err
- }
- return ap.parseRoot(p)
- }
- func (ap *Parser) parseRoot(p *xpp.XMLPullParser) (*Feed, error) {
- if err := p.Expect(xpp.StartTag, "feed"); err != nil {
- return nil, err
- }
- atom := &Feed{}
- atom.Entries = []*Entry{}
- atom.Version = ap.parseVersion(p)
- atom.Language = ap.parseLanguage(p)
- contributors := []*Person{}
- authors := []*Person{}
- categories := []*Category{}
- links := []*Link{}
- extensions := ext.Extensions{}
- for {
- tok, err := shared.NextTag(p)
- if err != nil {
- return nil, err
- }
- if tok == xpp.EndTag {
- break
- }
- if tok == xpp.StartTag {
- name := strings.ToLower(p.Name)
- if shared.IsExtension(p) {
- e, err := shared.ParseExtension(extensions, p)
- if err != nil {
- return nil, err
- }
- extensions = e
- } else if name == "title" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- atom.Title = result
- } else if name == "id" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- atom.ID = result
- } else if name == "updated" ||
- name == "modified" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- atom.Updated = result
- date, err := shared.ParseDate(result)
- if err == nil {
- utcDate := date.UTC()
- atom.UpdatedParsed = &utcDate
- }
- } else if name == "subtitle" ||
- name == "tagline" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- atom.Subtitle = result
- } else if name == "link" {
- result, err := ap.parseLink(p)
- if err != nil {
- return nil, err
- }
- links = append(links, result)
- } else if name == "generator" {
- result, err := ap.parseGenerator(p)
- if err != nil {
- return nil, err
- }
- atom.Generator = result
- } else if name == "icon" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- atom.Icon = result
- } else if name == "logo" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- atom.Logo = result
- } else if name == "rights" ||
- name == "copyright" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- atom.Rights = result
- } else if name == "contributor" {
- result, err := ap.parsePerson("contributor", p)
- if err != nil {
- return nil, err
- }
- contributors = append(contributors, result)
- } else if name == "author" {
- result, err := ap.parsePerson("author", p)
- if err != nil {
- return nil, err
- }
- authors = append(authors, result)
- } else if name == "category" {
- result, err := ap.parseCategory(p)
- if err != nil {
- return nil, err
- }
- categories = append(categories, result)
- } else if name == "entry" {
- result, err := ap.parseEntry(p)
- if err != nil {
- return nil, err
- }
- atom.Entries = append(atom.Entries, result)
- } else {
- err := p.Skip()
- if err != nil {
- return nil, err
- }
- }
- }
- }
- if len(categories) > 0 {
- atom.Categories = categories
- }
- if len(authors) > 0 {
- atom.Authors = authors
- }
- if len(contributors) > 0 {
- atom.Contributors = contributors
- }
- if len(links) > 0 {
- atom.Links = links
- }
- if len(extensions) > 0 {
- atom.Extensions = extensions
- }
- if err := p.Expect(xpp.EndTag, "feed"); err != nil {
- return nil, err
- }
- return atom, nil
- }
- func (ap *Parser) parseEntry(p *xpp.XMLPullParser) (*Entry, error) {
- if err := p.Expect(xpp.StartTag, "entry"); err != nil {
- return nil, err
- }
- entry := &Entry{}
- contributors := []*Person{}
- authors := []*Person{}
- categories := []*Category{}
- links := []*Link{}
- extensions := ext.Extensions{}
- for {
- tok, err := shared.NextTag(p)
- if err != nil {
- return nil, err
- }
- if tok == xpp.EndTag {
- break
- }
- if tok == xpp.StartTag {
- name := strings.ToLower(p.Name)
- if shared.IsExtension(p) {
- e, err := shared.ParseExtension(extensions, p)
- if err != nil {
- return nil, err
- }
- extensions = e
- } else if name == "title" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- entry.Title = result
- } else if name == "id" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- entry.ID = result
- } else if name == "rights" ||
- name == "copyright" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- entry.Rights = result
- } else if name == "summary" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- entry.Summary = result
- } else if name == "source" {
- result, err := ap.parseSource(p)
- if err != nil {
- return nil, err
- }
- entry.Source = result
- } else if name == "updated" ||
- name == "modified" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- entry.Updated = result
- date, err := shared.ParseDate(result)
- if err == nil {
- utcDate := date.UTC()
- entry.UpdatedParsed = &utcDate
- }
- } else if name == "contributor" {
- result, err := ap.parsePerson("contributor", p)
- if err != nil {
- return nil, err
- }
- contributors = append(contributors, result)
- } else if name == "author" {
- result, err := ap.parsePerson("author", p)
- if err != nil {
- return nil, err
- }
- authors = append(authors, result)
- } else if name == "category" {
- result, err := ap.parseCategory(p)
- if err != nil {
- return nil, err
- }
- categories = append(categories, result)
- } else if name == "link" {
- result, err := ap.parseLink(p)
- if err != nil {
- return nil, err
- }
- links = append(links, result)
- } else if name == "published" ||
- name == "issued" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- entry.Published = result
- date, err := shared.ParseDate(result)
- if err == nil {
- utcDate := date.UTC()
- entry.PublishedParsed = &utcDate
- }
- } else if name == "content" {
- result, err := ap.parseContent(p)
- if err != nil {
- return nil, err
- }
- entry.Content = result
- } else {
- err := p.Skip()
- if err != nil {
- return nil, err
- }
- }
- }
- }
- if len(categories) > 0 {
- entry.Categories = categories
- }
- if len(authors) > 0 {
- entry.Authors = authors
- }
- if len(links) > 0 {
- entry.Links = links
- }
- if len(contributors) > 0 {
- entry.Contributors = contributors
- }
- if len(extensions) > 0 {
- entry.Extensions = extensions
- }
- if err := p.Expect(xpp.EndTag, "entry"); err != nil {
- return nil, err
- }
- return entry, nil
- }
- func (ap *Parser) parseSource(p *xpp.XMLPullParser) (*Source, error) {
- if err := p.Expect(xpp.StartTag, "source"); err != nil {
- return nil, err
- }
- source := &Source{}
- contributors := []*Person{}
- authors := []*Person{}
- categories := []*Category{}
- links := []*Link{}
- extensions := ext.Extensions{}
- for {
- tok, err := shared.NextTag(p)
- if err != nil {
- return nil, err
- }
- if tok == xpp.EndTag {
- break
- }
- if tok == xpp.StartTag {
- name := strings.ToLower(p.Name)
- if shared.IsExtension(p) {
- e, err := shared.ParseExtension(extensions, p)
- if err != nil {
- return nil, err
- }
- extensions = e
- } else if name == "title" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- source.Title = result
- } else if name == "id" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- source.ID = result
- } else if name == "updated" ||
- name == "modified" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- source.Updated = result
- date, err := shared.ParseDate(result)
- if err == nil {
- utcDate := date.UTC()
- source.UpdatedParsed = &utcDate
- }
- } else if name == "subtitle" ||
- name == "tagline" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- source.Subtitle = result
- } else if name == "link" {
- result, err := ap.parseLink(p)
- if err != nil {
- return nil, err
- }
- links = append(links, result)
- } else if name == "generator" {
- result, err := ap.parseGenerator(p)
- if err != nil {
- return nil, err
- }
- source.Generator = result
- } else if name == "icon" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- source.Icon = result
- } else if name == "logo" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- source.Logo = result
- } else if name == "rights" ||
- name == "copyright" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- source.Rights = result
- } else if name == "contributor" {
- result, err := ap.parsePerson("contributor", p)
- if err != nil {
- return nil, err
- }
- contributors = append(contributors, result)
- } else if name == "author" {
- result, err := ap.parsePerson("author", p)
- if err != nil {
- return nil, err
- }
- authors = append(authors, result)
- } else if name == "category" {
- result, err := ap.parseCategory(p)
- if err != nil {
- return nil, err
- }
- categories = append(categories, result)
- } else {
- err := p.Skip()
- if err != nil {
- return nil, err
- }
- }
- }
- }
- if len(categories) > 0 {
- source.Categories = categories
- }
- if len(authors) > 0 {
- source.Authors = authors
- }
- if len(contributors) > 0 {
- source.Contributors = contributors
- }
- if len(links) > 0 {
- source.Links = links
- }
- if len(extensions) > 0 {
- source.Extensions = extensions
- }
- if err := p.Expect(xpp.EndTag, "source"); err != nil {
- return nil, err
- }
- return source, nil
- }
- func (ap *Parser) parseContent(p *xpp.XMLPullParser) (*Content, error) {
- c := &Content{}
- c.Type = p.Attribute("type")
- c.Src = p.Attribute("src")
- text, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- c.Value = text
- return c, nil
- }
- func (ap *Parser) parsePerson(name string, p *xpp.XMLPullParser) (*Person, error) {
- if err := p.Expect(xpp.StartTag, name); err != nil {
- return nil, err
- }
- person := &Person{}
- for {
- tok, err := shared.NextTag(p)
- if err != nil {
- return nil, err
- }
- if tok == xpp.EndTag {
- break
- }
- if tok == xpp.StartTag {
- name := strings.ToLower(p.Name)
- if name == "name" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- person.Name = result
- } else if name == "email" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- person.Email = result
- } else if name == "uri" ||
- name == "url" ||
- name == "homepage" {
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- person.URI = result
- } else {
- err := p.Skip()
- if err != nil {
- return nil, err
- }
- }
- }
- }
- if err := p.Expect(xpp.EndTag, name); err != nil {
- return nil, err
- }
- return person, nil
- }
- func (ap *Parser) parseLink(p *xpp.XMLPullParser) (*Link, error) {
- if err := p.Expect(xpp.StartTag, "link"); err != nil {
- return nil, err
- }
- l := &Link{}
- l.Href = p.Attribute("href")
- l.Hreflang = p.Attribute("hreflang")
- l.Type = p.Attribute("type")
- l.Length = p.Attribute("length")
- l.Title = p.Attribute("title")
- l.Rel = p.Attribute("rel")
- if l.Rel == "" {
- l.Rel = "alternate"
- }
- if err := p.Skip(); err != nil {
- return nil, err
- }
- if err := p.Expect(xpp.EndTag, "link"); err != nil {
- return nil, err
- }
- return l, nil
- }
- func (ap *Parser) parseCategory(p *xpp.XMLPullParser) (*Category, error) {
- if err := p.Expect(xpp.StartTag, "category"); err != nil {
- return nil, err
- }
- c := &Category{}
- c.Term = p.Attribute("term")
- c.Scheme = p.Attribute("scheme")
- c.Label = p.Attribute("label")
- if err := p.Skip(); err != nil {
- return nil, err
- }
- if err := p.Expect(xpp.EndTag, "category"); err != nil {
- return nil, err
- }
- return c, nil
- }
- func (ap *Parser) parseGenerator(p *xpp.XMLPullParser) (*Generator, error) {
- if err := p.Expect(xpp.StartTag, "generator"); err != nil {
- return nil, err
- }
- g := &Generator{}
- uri := p.Attribute("uri") // Atom 1.0
- url := p.Attribute("url") // Atom 0.3
- if uri != "" {
- g.URI = uri
- } else if url != "" {
- g.URI = url
- }
- g.Version = p.Attribute("version")
- result, err := ap.parseAtomText(p)
- if err != nil {
- return nil, err
- }
- g.Value = result
- if err := p.Expect(xpp.EndTag, "generator"); err != nil {
- return nil, err
- }
- return g, nil
- }
- func (ap *Parser) parseAtomText(p *xpp.XMLPullParser) (string, error) {
- var text struct {
- Type string `xml:"type,attr"`
- Mode string `xml:"mode,attr"`
- InnerXML string `xml:",innerxml"`
- }
- err := p.DecodeElement(&text)
- if err != nil {
- return "", err
- }
- result := text.InnerXML
- result = strings.TrimSpace(result)
- if strings.HasPrefix(result, "<![CDATA[") &&
- strings.HasSuffix(result, "]]>") {
- result = strings.TrimPrefix(result, "<![CDATA[")
- result = strings.TrimSuffix(result, "]]>")
- return result, nil
- }
- lowerType := strings.ToLower(text.Type)
- lowerMode := strings.ToLower(text.Mode)
- if lowerType == "text" ||
- strings.HasPrefix(lowerType, "text/") ||
- (lowerType == "" && lowerMode == "") {
- result, err = shared.DecodeEntities(result)
- } else if strings.Contains(lowerType, "xhtml") {
- result = ap.stripWrappingDiv(result)
- } else if lowerType == "html" {
- result = ap.stripWrappingDiv(result)
- result, err = shared.DecodeEntities(result)
- } else {
- decodedStr, err := base64.StdEncoding.DecodeString(result)
- if err == nil {
- result = string(decodedStr)
- }
- }
- return result, err
- }
- func (ap *Parser) parseLanguage(p *xpp.XMLPullParser) string {
- return p.Attribute("lang")
- }
- func (ap *Parser) parseVersion(p *xpp.XMLPullParser) string {
- ver := p.Attribute("version")
- if ver != "" {
- return ver
- }
- ns := p.Attribute("xmlns")
- if ns == "http://purl.org/atom/ns#" {
- return "0.3"
- }
- if ns == "http://www.w3.org/2005/Atom" {
- return "1.0"
- }
- return ""
- }
- func (ap *Parser) stripWrappingDiv(content string) (result string) {
- result = content
- r := strings.NewReader(result)
- doc, err := goquery.NewDocumentFromReader(r)
- if err == nil {
- root := doc.Find("body").Children()
- if root.Is("div") && root.Siblings().Size() == 0 {
- html, err := root.Unwrap().Html()
- if err == nil {
- result = html
- }
- }
- }
- return
- }
|