parseutils.go 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  1. package shared
  2. import (
  3. "bytes"
  4. "errors"
  5. "fmt"
  6. "regexp"
  7. "strconv"
  8. "strings"
  9. "github.com/mmcdole/goxpp"
  10. )
  // Patterns tried, in this order, by ParseNameAddress.
  var (
  	// emailNameRgx matches an address followed by a parenthesised name,
  	// e.g. "example@site.com (Example Name)". Group 1 is the address,
  	// group 2 the name.
  	emailNameRgx = regexp.MustCompile(`^([^@]+@[^\s]+)\s+\(([^@]+)\)$`)

  	// nameEmailRgx matches a name followed by a parenthesised address,
  	// e.g. "Example Name (example@site.com)". Group 1 is the name,
  	// group 2 the address.
  	nameEmailRgx = regexp.MustCompile(`^([^@]+)\s+\(([^@]+@[^)]+)\)$`)

  	// nameOnlyRgx matches text containing no '@' and no parentheses.
  	nameOnlyRgx = regexp.MustCompile(`^([^@()]+)$`)

  	// emailOnlyRgx matches text with exactly one '@' and no parentheses.
  	emailOnlyRgx = regexp.MustCompile(`^([^@()]+@[^@()]+)$`)
  )

  // Sentinel errors returned by DecodeEntities.
  var (
  	// TruncatedEntity reports an '&' that is never closed by a ';'.
  	TruncatedEntity = errors.New("truncated entity")

  	// InvalidNumericReference reports a "&#...;" reference whose digits
  	// could not be parsed.
  	InvalidNumericReference = errors.New("invalid numeric reference")
  )
  19. // FindRoot iterates through the tokens of an xml document until
  20. // it encounters its first StartTag event. It returns an error
  21. // if it reaches EndDocument before finding a tag.
  22. func FindRoot(p *xpp.XMLPullParser) (event xpp.XMLEventType, err error) {
  23. for {
  24. event, err = p.Next()
  25. if err != nil {
  26. return event, err
  27. }
  28. if event == xpp.StartTag {
  29. break
  30. }
  31. if event == xpp.EndDocument {
  32. return event, fmt.Errorf("Failed to find root node before document end.")
  33. }
  34. }
  35. return
  36. }
  37. // NextTag iterates through the tokens until it reaches a StartTag or EndTag
  38. // It is similar to goxpp's NextTag method except it wont throw an error if
  39. // the next immediate token isnt a Start/EndTag. Instead, it will continue to
  40. // consume tokens until it hits a Start/EndTag or EndDocument.
  41. func NextTag(p *xpp.XMLPullParser) (event xpp.XMLEventType, err error) {
  42. for {
  43. event, err = p.Next()
  44. if err != nil {
  45. return event, err
  46. }
  47. if event == xpp.StartTag || event == xpp.EndTag {
  48. break
  49. }
  50. if event == xpp.EndDocument {
  51. return event, fmt.Errorf("Failed to find NextTag before reaching the end of the document.")
  52. }
  53. }
  54. return
  55. }
  56. // ParseText is a helper function for parsing the text
  57. // from the current element of the XMLPullParser.
  58. // This function can handle parsing naked XML text from
  59. // an element.
  60. func ParseText(p *xpp.XMLPullParser) (string, error) {
  61. var text struct {
  62. Type string `xml:"type,attr"`
  63. InnerXML string `xml:",innerxml"`
  64. }
  65. err := p.DecodeElement(&text)
  66. if err != nil {
  67. return "", err
  68. }
  69. result := text.InnerXML
  70. result = strings.TrimSpace(result)
  71. if strings.HasPrefix(result, "<![CDATA[") &&
  72. strings.HasSuffix(result, "]]>") {
  73. result = strings.TrimPrefix(result, "<![CDATA[")
  74. result = strings.TrimSuffix(result, "]]>")
  75. return result, nil
  76. }
  77. return DecodeEntities(result)
  78. }
  79. // DecodeEntities decodes escaped XML entities
  80. // in a string and returns the unescaped string
  81. func DecodeEntities(str string) (string, error) {
  82. data := []byte(str)
  83. buf := bytes.NewBuffer([]byte{})
  84. for len(data) > 0 {
  85. // Find the next entity
  86. idx := bytes.IndexByte(data, '&')
  87. if idx == -1 {
  88. buf.Write(data)
  89. break
  90. }
  91. // Write and skip everything before it
  92. buf.Write(data[:idx])
  93. data = data[idx+1:]
  94. if len(data) == 0 {
  95. return "", TruncatedEntity
  96. }
  97. // Find the end of the entity
  98. end := bytes.IndexByte(data, ';')
  99. if end == -1 {
  100. return "", TruncatedEntity
  101. }
  102. if data[0] == '#' {
  103. // Numerical character reference
  104. var str string
  105. base := 10
  106. if len(data) > 1 && data[1] == 'x' {
  107. str = string(data[2:end])
  108. base = 16
  109. } else {
  110. str = string(data[1:end])
  111. }
  112. i, err := strconv.ParseUint(str, base, 32)
  113. if err != nil {
  114. return "", InvalidNumericReference
  115. }
  116. buf.WriteRune(rune(i))
  117. } else {
  118. // Predefined entity
  119. name := string(data[:end])
  120. var c byte
  121. switch name {
  122. case "lt":
  123. c = '<'
  124. case "gt":
  125. c = '>'
  126. case "quot":
  127. c = '"'
  128. case "apos":
  129. c = '\''
  130. case "amp":
  131. c = '&'
  132. default:
  133. return "", fmt.Errorf("unknown predefined "+
  134. "entity &%s;", name)
  135. }
  136. buf.WriteByte(c)
  137. }
  138. // Skip the entity
  139. data = data[end+1:]
  140. }
  141. return buf.String(), nil
  142. }
  143. // ParseNameAddress parses name/email strings commonly
  144. // found in RSS feeds of the format "Example Name (example@site.com)"
  145. // and other variations of this format.
  146. func ParseNameAddress(nameAddressText string) (name string, address string) {
  147. if nameAddressText == "" {
  148. return
  149. }
  150. if emailNameRgx.MatchString(nameAddressText) {
  151. result := emailNameRgx.FindStringSubmatch(nameAddressText)
  152. address = result[1]
  153. name = result[2]
  154. } else if nameEmailRgx.MatchString(nameAddressText) {
  155. result := nameEmailRgx.FindStringSubmatch(nameAddressText)
  156. name = result[1]
  157. address = result[2]
  158. } else if nameOnlyRgx.MatchString(nameAddressText) {
  159. result := nameOnlyRgx.FindStringSubmatch(nameAddressText)
  160. name = result[1]
  161. } else if emailOnlyRgx.MatchString(nameAddressText) {
  162. result := emailOnlyRgx.FindStringSubmatch(nameAddressText)
  163. address = result[1]
  164. }
  165. return
  166. }