extparser.go 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176
  1. package shared
  2. import (
  3. "strings"
  4. "github.com/mmcdole/gofeed/extensions"
  5. "github.com/mmcdole/goxpp"
  6. )
  7. // IsExtension returns whether or not the current
  8. // XML element is an extension element (if it has a
  9. // non empty prefix)
  10. func IsExtension(p *xpp.XMLPullParser) bool {
  11. space := strings.TrimSpace(p.Space)
  12. if prefix, ok := p.Spaces[space]; ok {
  13. return !(prefix == "" || prefix == "rss" || prefix == "rdf" || prefix == "content")
  14. }
  15. return p.Space != ""
  16. }
  17. // ParseExtension parses the current element of the
  18. // XMLPullParser as an extension element and updates
  19. // the extension map
  20. func ParseExtension(fe ext.Extensions, p *xpp.XMLPullParser) (ext.Extensions, error) {
  21. prefix := prefixForNamespace(p.Space, p)
  22. result, err := parseExtensionElement(p)
  23. if err != nil {
  24. return nil, err
  25. }
  26. // Ensure the extension prefix map exists
  27. if _, ok := fe[prefix]; !ok {
  28. fe[prefix] = map[string][]ext.Extension{}
  29. }
  30. // Ensure the extension element slice exists
  31. if _, ok := fe[prefix][p.Name]; !ok {
  32. fe[prefix][p.Name] = []ext.Extension{}
  33. }
  34. fe[prefix][p.Name] = append(fe[prefix][p.Name], result)
  35. return fe, nil
  36. }
  37. func parseExtensionElement(p *xpp.XMLPullParser) (e ext.Extension, err error) {
  38. if err = p.Expect(xpp.StartTag, "*"); err != nil {
  39. return e, err
  40. }
  41. e.Name = p.Name
  42. e.Children = map[string][]ext.Extension{}
  43. e.Attrs = map[string]string{}
  44. for _, attr := range p.Attrs {
  45. // TODO: Alright that we are stripping
  46. // namespace information from attributes ?
  47. e.Attrs[attr.Name.Local] = attr.Value
  48. }
  49. for {
  50. tok, err := p.Next()
  51. if err != nil {
  52. return e, err
  53. }
  54. if tok == xpp.EndTag {
  55. break
  56. }
  57. if tok == xpp.StartTag {
  58. child, err := parseExtensionElement(p)
  59. if err != nil {
  60. return e, err
  61. }
  62. if _, ok := e.Children[child.Name]; !ok {
  63. e.Children[child.Name] = []ext.Extension{}
  64. }
  65. e.Children[child.Name] = append(e.Children[child.Name], child)
  66. } else if tok == xpp.Text {
  67. e.Value += p.Text
  68. }
  69. }
  70. e.Value = strings.TrimSpace(e.Value)
  71. if err = p.Expect(xpp.EndTag, e.Name); err != nil {
  72. return e, err
  73. }
  74. return e, nil
  75. }
  76. func prefixForNamespace(space string, p *xpp.XMLPullParser) string {
  77. // First we check if the global namespace map
  78. // contains an entry for this namespace/prefix.
  79. // This way we can use the canonical prefix for this
  80. // ns instead of the one defined in the feed.
  81. if prefix, ok := canonicalNamespaces[space]; ok {
  82. return prefix
  83. }
  84. // Next we check if the feed itself defined this
  85. // this namespace and return it if we have a result.
  86. if prefix, ok := p.Spaces[space]; ok {
  87. return prefix
  88. }
  89. // Lastly, any namespace which is not defined in the
  90. // the feed will be the prefix itself when using Go's
  91. // xml.Decoder.Token() method.
  92. return space
  93. }
  // canonicalNamespaces maps namespace URIs of popular RSS/Atom extensions to
  // their canonical prefixes. The list is taken from
  // github.com/kurtmckee/feedparser.
  //
  // A canonical prefix listed here overrides whatever prefix the feed itself
  // declares for the same namespace.
  var canonicalNamespaces = map[string]string{
  	// Modules under http://purl.org/rss/1.0/modules/.
  	"http://purl.org/rss/1.0/modules/aggregation/":   "ag",
  	"http://purl.org/rss/1.0/modules/annotate/":      "annotate",
  	"http://purl.org/rss/1.0/modules/company":        "co",
  	"http://purl.org/rss/1.0/modules/content/":       "content",
  	"http://purl.org/rss/1.0/modules/email/":         "email",
  	"http://purl.org/rss/1.0/modules/event/":         "ev",
  	"http://purl.org/rss/1.0/modules/image/":         "image",
  	"http://purl.org/rss/1.0/modules/link/":          "l",
  	"http://purl.org/rss/1.0/modules/reference/":     "ref",
  	"http://purl.org/rss/1.0/modules/richequiv/":     "reqv",
  	"http://purl.org/rss/1.0/modules/search/":        "search",
  	"http://purl.org/rss/1.0/modules/slash/":         "slash",
  	"http://purl.org/rss/1.0/modules/servicestatus/": "ss",
  	"http://purl.org/rss/1.0/modules/subscription/":  "sub",
  	"http://purl.org/rss/1.0/modules/syndication/":   "sy",
  	"http://purl.org/rss/1.0/modules/taxonomy/":      "taxo",
  	"http://purl.org/rss/1.0/modules/threading/":     "thr",
  	"http://purl.org/rss/1.0/modules/textinput/":     "ti",
  	"http://purl.org/rss/1.0/modules/wiki/":          "wiki",

  	// Namespaces under http://purl.org/dc/.
  	"http://purl.org/dc/elements/1.1/": "dc",
  	"http://purl.org/dc/terms/":        "dcterms",

  	// Namespaces under http://www.w3.org/.
  	"http://www.w3.org/2003/01/geo/wgs84_pos#":    "geo",
  	"http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
  	"http://www.w3.org/2000/01/rdf-schema#":       "rdfs",
  	"http://www.w3.org/1999/xhtml":                "xhtml",
  	"http://www.w3.org/1999/xlink":                "xlink",
  	"http://www.w3.org/XML/1998/namespace":        "xml",

  	// All remaining namespaces.
  	"http://webns.net/mvcb/":                                         "admin",
  	"http://media.tangent.org/rss/1.0/":                              "audio",
  	"http://backend.userland.com/blogChannelModule":                  "blogChannel",
  	"http://creativecommons.org/ns#license":                          "cc",
  	"http://web.resource.org/cc/":                                    "cc",
  	"http://cyber.law.harvard.edu/rss/creativeCommonsRssModule.html": "creativeCommons",
  	"http://backend.userland.com/creativeCommonsRssModule":           "creativeCommons",
  	"http://my.theinfo.org/changed/1.0/rss/":                         "cp",
  	"http://rssnamespace.org/feedburner/ext/1.0":                     "feedburner",
  	"http://freshmeat.net/rss/fm/":                                   "fm",
  	"http://xmlns.com/foaf/0.1/":                                     "foaf",
  	"http://www.georss.org/georss":                                   "georss",
  	"http://www.opengis.net/gml":                                     "gml",
  	"http://postneo.com/icbm/":                                       "icbm",
  	"http://www.itunes.com/DTDs/PodCast-1.0.dtd":                     "itunes",
  	"http://example.com/DTDs/PodCast-1.0.dtd":                        "itunes",
  	"http://search.yahoo.com/mrss":                                   "media",
  	"http://search.yahoo.com/mrss/":                                  "media",
  	"http://madskills.com/public/xml/rss/module/pingback/":           "pingback",
  	"http://prismstandard.org/namespaces/1.2/basic/":                 "prism",
  	"http://schemas.xmlsoap.org/soap/envelope/":                      "soap",
  	"http://hacks.benhammersley.com/rss/streaming/":                  "str",
  	"http://schemas.pocketsoap.com/rss/myDescModule/":                "szf",
  	"http://madskills.com/public/xml/rss/module/trackback/":          "trackback",
  	"http://wellformedweb.org/commentAPI/":                           "wfw",
  	"http://podlove.org/simple-chapters":                             "psc",
  }