parser.go 15 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767
  1. package rss
  2. import (
  3. "fmt"
  4. "io"
  5. "strings"
  6. "github.com/mmcdole/gofeed/extensions"
  7. "github.com/mmcdole/gofeed/internal/shared"
  8. "github.com/mmcdole/goxpp"
  9. )
  10. // Parser is an RSS parser. The struct has no fields; parsing is done by its methods.
  11. type Parser struct{}
  12. // Parse parses an xml feed into an rss.Feed
  13. func (rp *Parser) Parse(feed io.Reader) (*Feed, error) {
  14. p := xpp.NewXMLPullParser(feed, false, shared.NewReaderLabel)
  15. _, err := shared.FindRoot(p)
  16. if err != nil {
  17. return nil, err
  18. }
  19. return rp.parseRoot(p)
  20. }
  21. func (rp *Parser) parseRoot(p *xpp.XMLPullParser) (*Feed, error) {
  22. rssErr := p.Expect(xpp.StartTag, "rss")
  23. rdfErr := p.Expect(xpp.StartTag, "rdf")
  24. if rssErr != nil && rdfErr != nil {
  25. return nil, fmt.Errorf("%s or %s", rssErr.Error(), rdfErr.Error())
  26. }
  27. // Items found in feed root
  28. var channel *Feed
  29. var textinput *TextInput
  30. var image *Image
  31. items := []*Item{}
  32. ver := rp.parseVersion(p)
  33. for {
  34. tok, err := shared.NextTag(p)
  35. if err != nil {
  36. return nil, err
  37. }
  38. if tok == xpp.EndTag {
  39. break
  40. }
  41. if tok == xpp.StartTag {
  42. // Skip any extensions found in the feed root.
  43. if shared.IsExtension(p) {
  44. p.Skip()
  45. continue
  46. }
  47. name := strings.ToLower(p.Name)
  48. if name == "channel" {
  49. channel, err = rp.parseChannel(p)
  50. if err != nil {
  51. return nil, err
  52. }
  53. } else if name == "item" {
  54. item, err := rp.parseItem(p)
  55. if err != nil {
  56. return nil, err
  57. }
  58. items = append(items, item)
  59. } else if name == "textinput" {
  60. textinput, err = rp.parseTextInput(p)
  61. if err != nil {
  62. return nil, err
  63. }
  64. } else if name == "image" {
  65. image, err = rp.parseImage(p)
  66. if err != nil {
  67. return nil, err
  68. }
  69. } else {
  70. p.Skip()
  71. }
  72. }
  73. }
  74. rssErr = p.Expect(xpp.EndTag, "rss")
  75. rdfErr = p.Expect(xpp.EndTag, "rdf")
  76. if rssErr != nil && rdfErr != nil {
  77. return nil, fmt.Errorf("%s or %s", rssErr.Error(), rdfErr.Error())
  78. }
  79. if channel == nil {
  80. channel = &Feed{}
  81. channel.Items = []*Item{}
  82. }
  83. if len(items) > 0 {
  84. channel.Items = append(channel.Items, items...)
  85. }
  86. if textinput != nil {
  87. channel.TextInput = textinput
  88. }
  89. if image != nil {
  90. channel.Image = image
  91. }
  92. channel.Version = ver
  93. return channel, nil
  94. }
  95. func (rp *Parser) parseChannel(p *xpp.XMLPullParser) (rss *Feed, err error) {
  96. if err = p.Expect(xpp.StartTag, "channel"); err != nil {
  97. return nil, err
  98. }
  99. rss = &Feed{}
  100. rss.Items = []*Item{}
  101. extensions := ext.Extensions{}
  102. categories := []*Category{}
  103. for {
  104. tok, err := shared.NextTag(p)
  105. if err != nil {
  106. return nil, err
  107. }
  108. if tok == xpp.EndTag {
  109. break
  110. }
  111. if tok == xpp.StartTag {
  112. name := strings.ToLower(p.Name)
  113. if shared.IsExtension(p) {
  114. ext, err := shared.ParseExtension(extensions, p)
  115. if err != nil {
  116. return nil, err
  117. }
  118. extensions = ext
  119. } else if name == "title" {
  120. result, err := shared.ParseText(p)
  121. if err != nil {
  122. return nil, err
  123. }
  124. rss.Title = result
  125. } else if name == "description" {
  126. result, err := shared.ParseText(p)
  127. if err != nil {
  128. return nil, err
  129. }
  130. rss.Description = result
  131. } else if name == "link" {
  132. result, err := shared.ParseText(p)
  133. if err != nil {
  134. return nil, err
  135. }
  136. rss.Link = result
  137. } else if name == "language" {
  138. result, err := shared.ParseText(p)
  139. if err != nil {
  140. return nil, err
  141. }
  142. rss.Language = result
  143. } else if name == "copyright" {
  144. result, err := shared.ParseText(p)
  145. if err != nil {
  146. return nil, err
  147. }
  148. rss.Copyright = result
  149. } else if name == "managingeditor" {
  150. result, err := shared.ParseText(p)
  151. if err != nil {
  152. return nil, err
  153. }
  154. rss.ManagingEditor = result
  155. } else if name == "webmaster" {
  156. result, err := shared.ParseText(p)
  157. if err != nil {
  158. return nil, err
  159. }
  160. rss.WebMaster = result
  161. } else if name == "pubdate" {
  162. result, err := shared.ParseText(p)
  163. if err != nil {
  164. return nil, err
  165. }
  166. rss.PubDate = result
  167. date, err := shared.ParseDate(result)
  168. if err == nil {
  169. utcDate := date.UTC()
  170. rss.PubDateParsed = &utcDate
  171. }
  172. } else if name == "lastbuilddate" {
  173. result, err := shared.ParseText(p)
  174. if err != nil {
  175. return nil, err
  176. }
  177. rss.LastBuildDate = result
  178. date, err := shared.ParseDate(result)
  179. if err == nil {
  180. utcDate := date.UTC()
  181. rss.LastBuildDateParsed = &utcDate
  182. }
  183. } else if name == "generator" {
  184. result, err := shared.ParseText(p)
  185. if err != nil {
  186. return nil, err
  187. }
  188. rss.Generator = result
  189. } else if name == "docs" {
  190. result, err := shared.ParseText(p)
  191. if err != nil {
  192. return nil, err
  193. }
  194. rss.Docs = result
  195. } else if name == "ttl" {
  196. result, err := shared.ParseText(p)
  197. if err != nil {
  198. return nil, err
  199. }
  200. rss.TTL = result
  201. } else if name == "rating" {
  202. result, err := shared.ParseText(p)
  203. if err != nil {
  204. return nil, err
  205. }
  206. rss.Rating = result
  207. } else if name == "skiphours" {
  208. result, err := rp.parseSkipHours(p)
  209. if err != nil {
  210. return nil, err
  211. }
  212. rss.SkipHours = result
  213. } else if name == "skipdays" {
  214. result, err := rp.parseSkipDays(p)
  215. if err != nil {
  216. return nil, err
  217. }
  218. rss.SkipDays = result
  219. } else if name == "item" {
  220. result, err := rp.parseItem(p)
  221. if err != nil {
  222. return nil, err
  223. }
  224. rss.Items = append(rss.Items, result)
  225. } else if name == "cloud" {
  226. result, err := rp.parseCloud(p)
  227. if err != nil {
  228. return nil, err
  229. }
  230. rss.Cloud = result
  231. } else if name == "category" {
  232. result, err := rp.parseCategory(p)
  233. if err != nil {
  234. return nil, err
  235. }
  236. categories = append(categories, result)
  237. } else if name == "image" {
  238. result, err := rp.parseImage(p)
  239. if err != nil {
  240. return nil, err
  241. }
  242. rss.Image = result
  243. } else if name == "textinput" {
  244. result, err := rp.parseTextInput(p)
  245. if err != nil {
  246. return nil, err
  247. }
  248. rss.TextInput = result
  249. } else {
  250. // Skip element as it isn't an extension and not
  251. // part of the spec
  252. p.Skip()
  253. }
  254. }
  255. }
  256. if err = p.Expect(xpp.EndTag, "channel"); err != nil {
  257. return nil, err
  258. }
  259. if len(categories) > 0 {
  260. rss.Categories = categories
  261. }
  262. if len(extensions) > 0 {
  263. rss.Extensions = extensions
  264. if itunes, ok := rss.Extensions["itunes"]; ok {
  265. rss.ITunesExt = ext.NewITunesFeedExtension(itunes)
  266. }
  267. if dc, ok := rss.Extensions["dc"]; ok {
  268. rss.DublinCoreExt = ext.NewDublinCoreExtension(dc)
  269. }
  270. }
  271. return rss, nil
  272. }
  273. func (rp *Parser) parseItem(p *xpp.XMLPullParser) (item *Item, err error) {
  274. if err = p.Expect(xpp.StartTag, "item"); err != nil {
  275. return nil, err
  276. }
  277. item = &Item{}
  278. extensions := ext.Extensions{}
  279. categories := []*Category{}
  280. for {
  281. tok, err := shared.NextTag(p)
  282. if err != nil {
  283. return nil, err
  284. }
  285. if tok == xpp.EndTag {
  286. break
  287. }
  288. if tok == xpp.StartTag {
  289. name := strings.ToLower(p.Name)
  290. if shared.IsExtension(p) {
  291. ext, err := shared.ParseExtension(extensions, p)
  292. if err != nil {
  293. return nil, err
  294. }
  295. item.Extensions = ext
  296. } else if name == "title" {
  297. result, err := shared.ParseText(p)
  298. if err != nil {
  299. return nil, err
  300. }
  301. item.Title = result
  302. } else if name == "description" {
  303. result, err := shared.ParseText(p)
  304. if err != nil {
  305. return nil, err
  306. }
  307. item.Description = result
  308. } else if name == "encoded" {
  309. space := strings.TrimSpace(p.Space)
  310. if prefix, ok := p.Spaces[space]; ok && prefix == "content" {
  311. result, err := shared.ParseText(p)
  312. if err != nil {
  313. return nil, err
  314. }
  315. item.Content = result
  316. }
  317. } else if name == "link" {
  318. result, err := shared.ParseText(p)
  319. if err != nil {
  320. return nil, err
  321. }
  322. item.Link = result
  323. } else if name == "author" {
  324. result, err := shared.ParseText(p)
  325. if err != nil {
  326. return nil, err
  327. }
  328. item.Author = result
  329. } else if name == "comments" {
  330. result, err := shared.ParseText(p)
  331. if err != nil {
  332. return nil, err
  333. }
  334. item.Comments = result
  335. } else if name == "pubdate" {
  336. result, err := shared.ParseText(p)
  337. if err != nil {
  338. return nil, err
  339. }
  340. item.PubDate = result
  341. date, err := shared.ParseDate(result)
  342. if err == nil {
  343. utcDate := date.UTC()
  344. item.PubDateParsed = &utcDate
  345. }
  346. } else if name == "source" {
  347. result, err := rp.parseSource(p)
  348. if err != nil {
  349. return nil, err
  350. }
  351. item.Source = result
  352. } else if name == "enclosure" {
  353. result, err := rp.parseEnclosure(p)
  354. if err != nil {
  355. return nil, err
  356. }
  357. item.Enclosure = result
  358. } else if name == "guid" {
  359. result, err := rp.parseGUID(p)
  360. if err != nil {
  361. return nil, err
  362. }
  363. item.GUID = result
  364. } else if name == "category" {
  365. result, err := rp.parseCategory(p)
  366. if err != nil {
  367. return nil, err
  368. }
  369. categories = append(categories, result)
  370. } else {
  371. // Skip any elements not part of the item spec
  372. p.Skip()
  373. }
  374. }
  375. }
  376. if len(categories) > 0 {
  377. item.Categories = categories
  378. }
  379. if len(extensions) > 0 {
  380. item.Extensions = extensions
  381. if itunes, ok := item.Extensions["itunes"]; ok {
  382. item.ITunesExt = ext.NewITunesItemExtension(itunes)
  383. }
  384. if dc, ok := item.Extensions["dc"]; ok {
  385. item.DublinCoreExt = ext.NewDublinCoreExtension(dc)
  386. }
  387. }
  388. if err = p.Expect(xpp.EndTag, "item"); err != nil {
  389. return nil, err
  390. }
  391. return item, nil
  392. }
  393. func (rp *Parser) parseSource(p *xpp.XMLPullParser) (source *Source, err error) {
  394. if err = p.Expect(xpp.StartTag, "source"); err != nil {
  395. return nil, err
  396. }
  397. source = &Source{}
  398. source.URL = p.Attribute("url")
  399. result, err := shared.ParseText(p)
  400. if err != nil {
  401. return source, err
  402. }
  403. source.Title = result
  404. if err = p.Expect(xpp.EndTag, "source"); err != nil {
  405. return nil, err
  406. }
  407. return source, nil
  408. }
  409. func (rp *Parser) parseEnclosure(p *xpp.XMLPullParser) (enclosure *Enclosure, err error) {
  410. if err = p.Expect(xpp.StartTag, "enclosure"); err != nil {
  411. return nil, err
  412. }
  413. enclosure = &Enclosure{}
  414. enclosure.URL = p.Attribute("url")
  415. enclosure.Length = p.Attribute("length")
  416. enclosure.Type = p.Attribute("type")
  417. // Ignore any enclosure text
  418. _, err = p.NextText()
  419. if err != nil {
  420. return enclosure, err
  421. }
  422. if err = p.Expect(xpp.EndTag, "enclosure"); err != nil {
  423. return nil, err
  424. }
  425. return enclosure, nil
  426. }
  427. func (rp *Parser) parseImage(p *xpp.XMLPullParser) (image *Image, err error) {
  428. if err = p.Expect(xpp.StartTag, "image"); err != nil {
  429. return nil, err
  430. }
  431. image = &Image{}
  432. for {
  433. tok, err := shared.NextTag(p)
  434. if err != nil {
  435. return image, err
  436. }
  437. if tok == xpp.EndTag {
  438. break
  439. }
  440. if tok == xpp.StartTag {
  441. name := strings.ToLower(p.Name)
  442. if name == "url" {
  443. result, err := shared.ParseText(p)
  444. if err != nil {
  445. return nil, err
  446. }
  447. image.URL = result
  448. } else if name == "title" {
  449. result, err := shared.ParseText(p)
  450. if err != nil {
  451. return nil, err
  452. }
  453. image.Title = result
  454. } else if name == "link" {
  455. result, err := shared.ParseText(p)
  456. if err != nil {
  457. return nil, err
  458. }
  459. image.Link = result
  460. } else if name == "width" {
  461. result, err := shared.ParseText(p)
  462. if err != nil {
  463. return nil, err
  464. }
  465. image.Width = result
  466. } else if name == "height" {
  467. result, err := shared.ParseText(p)
  468. if err != nil {
  469. return nil, err
  470. }
  471. image.Height = result
  472. } else if name == "description" {
  473. result, err := shared.ParseText(p)
  474. if err != nil {
  475. return nil, err
  476. }
  477. image.Description = result
  478. } else {
  479. p.Skip()
  480. }
  481. }
  482. }
  483. if err = p.Expect(xpp.EndTag, "image"); err != nil {
  484. return nil, err
  485. }
  486. return image, nil
  487. }
  488. func (rp *Parser) parseGUID(p *xpp.XMLPullParser) (guid *GUID, err error) {
  489. if err = p.Expect(xpp.StartTag, "guid"); err != nil {
  490. return nil, err
  491. }
  492. guid = &GUID{}
  493. guid.IsPermalink = p.Attribute("isPermalink")
  494. result, err := shared.ParseText(p)
  495. if err != nil {
  496. return
  497. }
  498. guid.Value = result
  499. if err = p.Expect(xpp.EndTag, "guid"); err != nil {
  500. return nil, err
  501. }
  502. return guid, nil
  503. }
  504. func (rp *Parser) parseCategory(p *xpp.XMLPullParser) (cat *Category, err error) {
  505. if err = p.Expect(xpp.StartTag, "category"); err != nil {
  506. return nil, err
  507. }
  508. cat = &Category{}
  509. cat.Domain = p.Attribute("domain")
  510. result, err := shared.ParseText(p)
  511. if err != nil {
  512. return nil, err
  513. }
  514. cat.Value = result
  515. if err = p.Expect(xpp.EndTag, "category"); err != nil {
  516. return nil, err
  517. }
  518. return cat, nil
  519. }
  520. func (rp *Parser) parseTextInput(p *xpp.XMLPullParser) (*TextInput, error) {
  521. if err := p.Expect(xpp.StartTag, "textinput"); err != nil {
  522. return nil, err
  523. }
  524. ti := &TextInput{}
  525. for {
  526. tok, err := shared.NextTag(p)
  527. if err != nil {
  528. return nil, err
  529. }
  530. if tok == xpp.EndTag {
  531. break
  532. }
  533. if tok == xpp.StartTag {
  534. name := strings.ToLower(p.Name)
  535. if name == "title" {
  536. result, err := shared.ParseText(p)
  537. if err != nil {
  538. return nil, err
  539. }
  540. ti.Title = result
  541. } else if name == "description" {
  542. result, err := shared.ParseText(p)
  543. if err != nil {
  544. return nil, err
  545. }
  546. ti.Description = result
  547. } else if name == "name" {
  548. result, err := shared.ParseText(p)
  549. if err != nil {
  550. return nil, err
  551. }
  552. ti.Name = result
  553. } else if name == "link" {
  554. result, err := shared.ParseText(p)
  555. if err != nil {
  556. return nil, err
  557. }
  558. ti.Link = result
  559. } else {
  560. p.Skip()
  561. }
  562. }
  563. }
  564. if err := p.Expect(xpp.EndTag, "textinput"); err != nil {
  565. return nil, err
  566. }
  567. return ti, nil
  568. }
  569. func (rp *Parser) parseSkipHours(p *xpp.XMLPullParser) ([]string, error) {
  570. if err := p.Expect(xpp.StartTag, "skiphours"); err != nil {
  571. return nil, err
  572. }
  573. hours := []string{}
  574. for {
  575. tok, err := shared.NextTag(p)
  576. if err != nil {
  577. return nil, err
  578. }
  579. if tok == xpp.EndTag {
  580. break
  581. }
  582. if tok == xpp.StartTag {
  583. name := strings.ToLower(p.Name)
  584. if name == "hour" {
  585. result, err := shared.ParseText(p)
  586. if err != nil {
  587. return nil, err
  588. }
  589. hours = append(hours, result)
  590. } else {
  591. p.Skip()
  592. }
  593. }
  594. }
  595. if err := p.Expect(xpp.EndTag, "skiphours"); err != nil {
  596. return nil, err
  597. }
  598. return hours, nil
  599. }
  600. func (rp *Parser) parseSkipDays(p *xpp.XMLPullParser) ([]string, error) {
  601. if err := p.Expect(xpp.StartTag, "skipdays"); err != nil {
  602. return nil, err
  603. }
  604. days := []string{}
  605. for {
  606. tok, err := shared.NextTag(p)
  607. if err != nil {
  608. return nil, err
  609. }
  610. if tok == xpp.EndTag {
  611. break
  612. }
  613. if tok == xpp.StartTag {
  614. name := strings.ToLower(p.Name)
  615. if name == "day" {
  616. result, err := shared.ParseText(p)
  617. if err != nil {
  618. return nil, err
  619. }
  620. days = append(days, result)
  621. } else {
  622. p.Skip()
  623. }
  624. }
  625. }
  626. if err := p.Expect(xpp.EndTag, "skipdays"); err != nil {
  627. return nil, err
  628. }
  629. return days, nil
  630. }
  631. func (rp *Parser) parseCloud(p *xpp.XMLPullParser) (*Cloud, error) {
  632. if err := p.Expect(xpp.StartTag, "cloud"); err != nil {
  633. return nil, err
  634. }
  635. cloud := &Cloud{}
  636. cloud.Domain = p.Attribute("domain")
  637. cloud.Port = p.Attribute("port")
  638. cloud.Path = p.Attribute("path")
  639. cloud.RegisterProcedure = p.Attribute("registerProcedure")
  640. cloud.Protocol = p.Attribute("protocol")
  641. shared.NextTag(p)
  642. if err := p.Expect(xpp.EndTag, "cloud"); err != nil {
  643. return nil, err
  644. }
  645. return cloud, nil
  646. }
  647. func (rp *Parser) parseVersion(p *xpp.XMLPullParser) (ver string) {
  648. name := strings.ToLower(p.Name)
  649. if name == "rss" {
  650. ver = p.Attribute("version")
  651. } else if name == "rdf" {
  652. ns := p.Attribute("xmlns")
  653. if ns == "http://channel.netscape.com/rdf/simple/0.9/" ||
  654. ns == "http://my.netscape.com/rdf/simple/0.9/" {
  655. ver = "0.9"
  656. } else if ns == "http://purl.org/rss/1.0/" {
  657. ver = "1.0"
  658. }
  659. }
  660. return
  661. }