parser.go 15 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722
  1. package atom
  2. import (
  3. "encoding/base64"
  4. "io"
  5. "strings"
  6. "github.com/PuerkitoBio/goquery"
  7. "github.com/mmcdole/gofeed/extensions"
  8. "github.com/mmcdole/gofeed/internal/shared"
  9. "github.com/mmcdole/goxpp"
  10. )
  11. // Parser is an Atom parser. It has no fields, so it holds no state between Parse calls.
  12. type Parser struct{}
  13. // Parse parses an xml feed into an atom.Feed
  14. func (ap *Parser) Parse(feed io.Reader) (*Feed, error) {
  15. p := xpp.NewXMLPullParser(feed, false, shared.NewReaderLabel)
  16. _, err := shared.FindRoot(p)
  17. if err != nil {
  18. return nil, err
  19. }
  20. return ap.parseRoot(p)
  21. }
  22. func (ap *Parser) parseRoot(p *xpp.XMLPullParser) (*Feed, error) {
  23. if err := p.Expect(xpp.StartTag, "feed"); err != nil {
  24. return nil, err
  25. }
  26. atom := &Feed{}
  27. atom.Entries = []*Entry{}
  28. atom.Version = ap.parseVersion(p)
  29. atom.Language = ap.parseLanguage(p)
  30. contributors := []*Person{}
  31. authors := []*Person{}
  32. categories := []*Category{}
  33. links := []*Link{}
  34. extensions := ext.Extensions{}
  35. for {
  36. tok, err := shared.NextTag(p)
  37. if err != nil {
  38. return nil, err
  39. }
  40. if tok == xpp.EndTag {
  41. break
  42. }
  43. if tok == xpp.StartTag {
  44. name := strings.ToLower(p.Name)
  45. if shared.IsExtension(p) {
  46. e, err := shared.ParseExtension(extensions, p)
  47. if err != nil {
  48. return nil, err
  49. }
  50. extensions = e
  51. } else if name == "title" {
  52. result, err := ap.parseAtomText(p)
  53. if err != nil {
  54. return nil, err
  55. }
  56. atom.Title = result
  57. } else if name == "id" {
  58. result, err := ap.parseAtomText(p)
  59. if err != nil {
  60. return nil, err
  61. }
  62. atom.ID = result
  63. } else if name == "updated" ||
  64. name == "modified" {
  65. result, err := ap.parseAtomText(p)
  66. if err != nil {
  67. return nil, err
  68. }
  69. atom.Updated = result
  70. date, err := shared.ParseDate(result)
  71. if err == nil {
  72. utcDate := date.UTC()
  73. atom.UpdatedParsed = &utcDate
  74. }
  75. } else if name == "subtitle" ||
  76. name == "tagline" {
  77. result, err := ap.parseAtomText(p)
  78. if err != nil {
  79. return nil, err
  80. }
  81. atom.Subtitle = result
  82. } else if name == "link" {
  83. result, err := ap.parseLink(p)
  84. if err != nil {
  85. return nil, err
  86. }
  87. links = append(links, result)
  88. } else if name == "generator" {
  89. result, err := ap.parseGenerator(p)
  90. if err != nil {
  91. return nil, err
  92. }
  93. atom.Generator = result
  94. } else if name == "icon" {
  95. result, err := ap.parseAtomText(p)
  96. if err != nil {
  97. return nil, err
  98. }
  99. atom.Icon = result
  100. } else if name == "logo" {
  101. result, err := ap.parseAtomText(p)
  102. if err != nil {
  103. return nil, err
  104. }
  105. atom.Logo = result
  106. } else if name == "rights" ||
  107. name == "copyright" {
  108. result, err := ap.parseAtomText(p)
  109. if err != nil {
  110. return nil, err
  111. }
  112. atom.Rights = result
  113. } else if name == "contributor" {
  114. result, err := ap.parsePerson("contributor", p)
  115. if err != nil {
  116. return nil, err
  117. }
  118. contributors = append(contributors, result)
  119. } else if name == "author" {
  120. result, err := ap.parsePerson("author", p)
  121. if err != nil {
  122. return nil, err
  123. }
  124. authors = append(authors, result)
  125. } else if name == "category" {
  126. result, err := ap.parseCategory(p)
  127. if err != nil {
  128. return nil, err
  129. }
  130. categories = append(categories, result)
  131. } else if name == "entry" {
  132. result, err := ap.parseEntry(p)
  133. if err != nil {
  134. return nil, err
  135. }
  136. atom.Entries = append(atom.Entries, result)
  137. } else {
  138. err := p.Skip()
  139. if err != nil {
  140. return nil, err
  141. }
  142. }
  143. }
  144. }
  145. if len(categories) > 0 {
  146. atom.Categories = categories
  147. }
  148. if len(authors) > 0 {
  149. atom.Authors = authors
  150. }
  151. if len(contributors) > 0 {
  152. atom.Contributors = contributors
  153. }
  154. if len(links) > 0 {
  155. atom.Links = links
  156. }
  157. if len(extensions) > 0 {
  158. atom.Extensions = extensions
  159. }
  160. if err := p.Expect(xpp.EndTag, "feed"); err != nil {
  161. return nil, err
  162. }
  163. return atom, nil
  164. }
  165. func (ap *Parser) parseEntry(p *xpp.XMLPullParser) (*Entry, error) {
  166. if err := p.Expect(xpp.StartTag, "entry"); err != nil {
  167. return nil, err
  168. }
  169. entry := &Entry{}
  170. contributors := []*Person{}
  171. authors := []*Person{}
  172. categories := []*Category{}
  173. links := []*Link{}
  174. extensions := ext.Extensions{}
  175. for {
  176. tok, err := shared.NextTag(p)
  177. if err != nil {
  178. return nil, err
  179. }
  180. if tok == xpp.EndTag {
  181. break
  182. }
  183. if tok == xpp.StartTag {
  184. name := strings.ToLower(p.Name)
  185. if shared.IsExtension(p) {
  186. e, err := shared.ParseExtension(extensions, p)
  187. if err != nil {
  188. return nil, err
  189. }
  190. extensions = e
  191. } else if name == "title" {
  192. result, err := ap.parseAtomText(p)
  193. if err != nil {
  194. return nil, err
  195. }
  196. entry.Title = result
  197. } else if name == "id" {
  198. result, err := ap.parseAtomText(p)
  199. if err != nil {
  200. return nil, err
  201. }
  202. entry.ID = result
  203. } else if name == "rights" ||
  204. name == "copyright" {
  205. result, err := ap.parseAtomText(p)
  206. if err != nil {
  207. return nil, err
  208. }
  209. entry.Rights = result
  210. } else if name == "summary" {
  211. result, err := ap.parseAtomText(p)
  212. if err != nil {
  213. return nil, err
  214. }
  215. entry.Summary = result
  216. } else if name == "source" {
  217. result, err := ap.parseSource(p)
  218. if err != nil {
  219. return nil, err
  220. }
  221. entry.Source = result
  222. } else if name == "updated" ||
  223. name == "modified" {
  224. result, err := ap.parseAtomText(p)
  225. if err != nil {
  226. return nil, err
  227. }
  228. entry.Updated = result
  229. date, err := shared.ParseDate(result)
  230. if err == nil {
  231. utcDate := date.UTC()
  232. entry.UpdatedParsed = &utcDate
  233. }
  234. } else if name == "contributor" {
  235. result, err := ap.parsePerson("contributor", p)
  236. if err != nil {
  237. return nil, err
  238. }
  239. contributors = append(contributors, result)
  240. } else if name == "author" {
  241. result, err := ap.parsePerson("author", p)
  242. if err != nil {
  243. return nil, err
  244. }
  245. authors = append(authors, result)
  246. } else if name == "category" {
  247. result, err := ap.parseCategory(p)
  248. if err != nil {
  249. return nil, err
  250. }
  251. categories = append(categories, result)
  252. } else if name == "link" {
  253. result, err := ap.parseLink(p)
  254. if err != nil {
  255. return nil, err
  256. }
  257. links = append(links, result)
  258. } else if name == "published" ||
  259. name == "issued" {
  260. result, err := ap.parseAtomText(p)
  261. if err != nil {
  262. return nil, err
  263. }
  264. entry.Published = result
  265. date, err := shared.ParseDate(result)
  266. if err == nil {
  267. utcDate := date.UTC()
  268. entry.PublishedParsed = &utcDate
  269. }
  270. } else if name == "content" {
  271. result, err := ap.parseContent(p)
  272. if err != nil {
  273. return nil, err
  274. }
  275. entry.Content = result
  276. } else {
  277. err := p.Skip()
  278. if err != nil {
  279. return nil, err
  280. }
  281. }
  282. }
  283. }
  284. if len(categories) > 0 {
  285. entry.Categories = categories
  286. }
  287. if len(authors) > 0 {
  288. entry.Authors = authors
  289. }
  290. if len(links) > 0 {
  291. entry.Links = links
  292. }
  293. if len(contributors) > 0 {
  294. entry.Contributors = contributors
  295. }
  296. if len(extensions) > 0 {
  297. entry.Extensions = extensions
  298. }
  299. if err := p.Expect(xpp.EndTag, "entry"); err != nil {
  300. return nil, err
  301. }
  302. return entry, nil
  303. }
  304. func (ap *Parser) parseSource(p *xpp.XMLPullParser) (*Source, error) {
  305. if err := p.Expect(xpp.StartTag, "source"); err != nil {
  306. return nil, err
  307. }
  308. source := &Source{}
  309. contributors := []*Person{}
  310. authors := []*Person{}
  311. categories := []*Category{}
  312. links := []*Link{}
  313. extensions := ext.Extensions{}
  314. for {
  315. tok, err := shared.NextTag(p)
  316. if err != nil {
  317. return nil, err
  318. }
  319. if tok == xpp.EndTag {
  320. break
  321. }
  322. if tok == xpp.StartTag {
  323. name := strings.ToLower(p.Name)
  324. if shared.IsExtension(p) {
  325. e, err := shared.ParseExtension(extensions, p)
  326. if err != nil {
  327. return nil, err
  328. }
  329. extensions = e
  330. } else if name == "title" {
  331. result, err := ap.parseAtomText(p)
  332. if err != nil {
  333. return nil, err
  334. }
  335. source.Title = result
  336. } else if name == "id" {
  337. result, err := ap.parseAtomText(p)
  338. if err != nil {
  339. return nil, err
  340. }
  341. source.ID = result
  342. } else if name == "updated" ||
  343. name == "modified" {
  344. result, err := ap.parseAtomText(p)
  345. if err != nil {
  346. return nil, err
  347. }
  348. source.Updated = result
  349. date, err := shared.ParseDate(result)
  350. if err == nil {
  351. utcDate := date.UTC()
  352. source.UpdatedParsed = &utcDate
  353. }
  354. } else if name == "subtitle" ||
  355. name == "tagline" {
  356. result, err := ap.parseAtomText(p)
  357. if err != nil {
  358. return nil, err
  359. }
  360. source.Subtitle = result
  361. } else if name == "link" {
  362. result, err := ap.parseLink(p)
  363. if err != nil {
  364. return nil, err
  365. }
  366. links = append(links, result)
  367. } else if name == "generator" {
  368. result, err := ap.parseGenerator(p)
  369. if err != nil {
  370. return nil, err
  371. }
  372. source.Generator = result
  373. } else if name == "icon" {
  374. result, err := ap.parseAtomText(p)
  375. if err != nil {
  376. return nil, err
  377. }
  378. source.Icon = result
  379. } else if name == "logo" {
  380. result, err := ap.parseAtomText(p)
  381. if err != nil {
  382. return nil, err
  383. }
  384. source.Logo = result
  385. } else if name == "rights" ||
  386. name == "copyright" {
  387. result, err := ap.parseAtomText(p)
  388. if err != nil {
  389. return nil, err
  390. }
  391. source.Rights = result
  392. } else if name == "contributor" {
  393. result, err := ap.parsePerson("contributor", p)
  394. if err != nil {
  395. return nil, err
  396. }
  397. contributors = append(contributors, result)
  398. } else if name == "author" {
  399. result, err := ap.parsePerson("author", p)
  400. if err != nil {
  401. return nil, err
  402. }
  403. authors = append(authors, result)
  404. } else if name == "category" {
  405. result, err := ap.parseCategory(p)
  406. if err != nil {
  407. return nil, err
  408. }
  409. categories = append(categories, result)
  410. } else {
  411. err := p.Skip()
  412. if err != nil {
  413. return nil, err
  414. }
  415. }
  416. }
  417. }
  418. if len(categories) > 0 {
  419. source.Categories = categories
  420. }
  421. if len(authors) > 0 {
  422. source.Authors = authors
  423. }
  424. if len(contributors) > 0 {
  425. source.Contributors = contributors
  426. }
  427. if len(links) > 0 {
  428. source.Links = links
  429. }
  430. if len(extensions) > 0 {
  431. source.Extensions = extensions
  432. }
  433. if err := p.Expect(xpp.EndTag, "source"); err != nil {
  434. return nil, err
  435. }
  436. return source, nil
  437. }
  438. func (ap *Parser) parseContent(p *xpp.XMLPullParser) (*Content, error) {
  439. c := &Content{}
  440. c.Type = p.Attribute("type")
  441. c.Src = p.Attribute("src")
  442. text, err := ap.parseAtomText(p)
  443. if err != nil {
  444. return nil, err
  445. }
  446. c.Value = text
  447. return c, nil
  448. }
  449. func (ap *Parser) parsePerson(name string, p *xpp.XMLPullParser) (*Person, error) {
  450. if err := p.Expect(xpp.StartTag, name); err != nil {
  451. return nil, err
  452. }
  453. person := &Person{}
  454. for {
  455. tok, err := shared.NextTag(p)
  456. if err != nil {
  457. return nil, err
  458. }
  459. if tok == xpp.EndTag {
  460. break
  461. }
  462. if tok == xpp.StartTag {
  463. name := strings.ToLower(p.Name)
  464. if name == "name" {
  465. result, err := ap.parseAtomText(p)
  466. if err != nil {
  467. return nil, err
  468. }
  469. person.Name = result
  470. } else if name == "email" {
  471. result, err := ap.parseAtomText(p)
  472. if err != nil {
  473. return nil, err
  474. }
  475. person.Email = result
  476. } else if name == "uri" ||
  477. name == "url" ||
  478. name == "homepage" {
  479. result, err := ap.parseAtomText(p)
  480. if err != nil {
  481. return nil, err
  482. }
  483. person.URI = result
  484. } else {
  485. err := p.Skip()
  486. if err != nil {
  487. return nil, err
  488. }
  489. }
  490. }
  491. }
  492. if err := p.Expect(xpp.EndTag, name); err != nil {
  493. return nil, err
  494. }
  495. return person, nil
  496. }
  497. func (ap *Parser) parseLink(p *xpp.XMLPullParser) (*Link, error) {
  498. if err := p.Expect(xpp.StartTag, "link"); err != nil {
  499. return nil, err
  500. }
  501. l := &Link{}
  502. l.Href = p.Attribute("href")
  503. l.Hreflang = p.Attribute("hreflang")
  504. l.Type = p.Attribute("type")
  505. l.Length = p.Attribute("length")
  506. l.Title = p.Attribute("title")
  507. l.Rel = p.Attribute("rel")
  508. if l.Rel == "" {
  509. l.Rel = "alternate"
  510. }
  511. if err := p.Skip(); err != nil {
  512. return nil, err
  513. }
  514. if err := p.Expect(xpp.EndTag, "link"); err != nil {
  515. return nil, err
  516. }
  517. return l, nil
  518. }
  519. func (ap *Parser) parseCategory(p *xpp.XMLPullParser) (*Category, error) {
  520. if err := p.Expect(xpp.StartTag, "category"); err != nil {
  521. return nil, err
  522. }
  523. c := &Category{}
  524. c.Term = p.Attribute("term")
  525. c.Scheme = p.Attribute("scheme")
  526. c.Label = p.Attribute("label")
  527. if err := p.Skip(); err != nil {
  528. return nil, err
  529. }
  530. if err := p.Expect(xpp.EndTag, "category"); err != nil {
  531. return nil, err
  532. }
  533. return c, nil
  534. }
  535. func (ap *Parser) parseGenerator(p *xpp.XMLPullParser) (*Generator, error) {
  536. if err := p.Expect(xpp.StartTag, "generator"); err != nil {
  537. return nil, err
  538. }
  539. g := &Generator{}
  540. uri := p.Attribute("uri") // Atom 1.0
  541. url := p.Attribute("url") // Atom 0.3
  542. if uri != "" {
  543. g.URI = uri
  544. } else if url != "" {
  545. g.URI = url
  546. }
  547. g.Version = p.Attribute("version")
  548. result, err := ap.parseAtomText(p)
  549. if err != nil {
  550. return nil, err
  551. }
  552. g.Value = result
  553. if err := p.Expect(xpp.EndTag, "generator"); err != nil {
  554. return nil, err
  555. }
  556. return g, nil
  557. }
  558. func (ap *Parser) parseAtomText(p *xpp.XMLPullParser) (string, error) {
  559. var text struct {
  560. Type string `xml:"type,attr"`
  561. Mode string `xml:"mode,attr"`
  562. InnerXML string `xml:",innerxml"`
  563. }
  564. err := p.DecodeElement(&text)
  565. if err != nil {
  566. return "", err
  567. }
  568. result := text.InnerXML
  569. result = strings.TrimSpace(result)
  570. if strings.HasPrefix(result, "<![CDATA[") &&
  571. strings.HasSuffix(result, "]]>") {
  572. result = strings.TrimPrefix(result, "<![CDATA[")
  573. result = strings.TrimSuffix(result, "]]>")
  574. return result, nil
  575. }
  576. lowerType := strings.ToLower(text.Type)
  577. lowerMode := strings.ToLower(text.Mode)
  578. if lowerType == "text" ||
  579. strings.HasPrefix(lowerType, "text/") ||
  580. (lowerType == "" && lowerMode == "") {
  581. result, err = shared.DecodeEntities(result)
  582. } else if strings.Contains(lowerType, "xhtml") {
  583. result = ap.stripWrappingDiv(result)
  584. } else if lowerType == "html" {
  585. result = ap.stripWrappingDiv(result)
  586. result, err = shared.DecodeEntities(result)
  587. } else {
  588. decodedStr, err := base64.StdEncoding.DecodeString(result)
  589. if err == nil {
  590. result = string(decodedStr)
  591. }
  592. }
  593. return result, err
  594. }
  595. func (ap *Parser) parseLanguage(p *xpp.XMLPullParser) string {
  596. return p.Attribute("lang")
  597. }
  598. func (ap *Parser) parseVersion(p *xpp.XMLPullParser) string {
  599. ver := p.Attribute("version")
  600. if ver != "" {
  601. return ver
  602. }
  603. ns := p.Attribute("xmlns")
  604. if ns == "http://purl.org/atom/ns#" {
  605. return "0.3"
  606. }
  607. if ns == "http://www.w3.org/2005/Atom" {
  608. return "1.0"
  609. }
  610. return ""
  611. }
  612. func (ap *Parser) stripWrappingDiv(content string) (result string) {
  613. result = content
  614. r := strings.NewReader(result)
  615. doc, err := goquery.NewDocumentFromReader(r)
  616. if err == nil {
  617. root := doc.Find("body").Children()
  618. if root.Is("div") && root.Siblings().Size() == 0 {
  619. html, err := root.Unwrap().Html()
  620. if err == nil {
  621. result = html
  622. }
  623. }
  624. }
  625. return
  626. }