123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196 |
- package shared
- import (
- "bytes"
- "errors"
- "fmt"
- "regexp"
- "strconv"
- "strings"
- "github.com/mmcdole/goxpp"
- )
// Patterns tried, in this order, by ParseNameAddress.
var (
	// emailNameRgx matches "address (name)": capture 1 is the address,
	// capture 2 the parenthesised name.
	emailNameRgx = regexp.MustCompile(`^([^@]+@[^\s]+)\s+\(([^@]+)\)$`)

	// nameEmailRgx matches "name (address)": capture 1 is the name,
	// capture 2 the parenthesised address.
	nameEmailRgx = regexp.MustCompile(`^([^@]+)\s+\(([^@]+@[^)]+)\)$`)

	// nameOnlyRgx matches text containing no '@' and no parentheses.
	nameOnlyRgx = regexp.MustCompile(`^([^@()]+)$`)

	// emailOnlyRgx matches text with exactly one '@' and no parentheses.
	emailOnlyRgx = regexp.MustCompile(`^([^@()]+@[^@()]+)$`)
)

// Sentinel errors returned by DecodeEntities.
var (
	// TruncatedEntity reports an '&' that is never closed by a ';'.
	TruncatedEntity = errors.New("truncated entity")

	// InvalidNumericReference reports a "&#...;" reference whose digits
	// cannot be parsed as a 32-bit unsigned number.
	InvalidNumericReference = errors.New("invalid numeric reference")
)
- // FindRoot iterates through the tokens of an xml document until
- // it encounters its first StartTag event. It returns an error
- // if it reaches EndDocument before finding a tag.
- func FindRoot(p *xpp.XMLPullParser) (event xpp.XMLEventType, err error) {
- for {
- event, err = p.Next()
- if err != nil {
- return event, err
- }
- if event == xpp.StartTag {
- break
- }
- if event == xpp.EndDocument {
- return event, fmt.Errorf("Failed to find root node before document end.")
- }
- }
- return
- }
- // NextTag iterates through the tokens until it reaches a StartTag or EndTag
- // It is similar to goxpp's NextTag method except it wont throw an error if
- // the next immediate token isnt a Start/EndTag. Instead, it will continue to
- // consume tokens until it hits a Start/EndTag or EndDocument.
- func NextTag(p *xpp.XMLPullParser) (event xpp.XMLEventType, err error) {
- for {
- event, err = p.Next()
- if err != nil {
- return event, err
- }
- if event == xpp.StartTag || event == xpp.EndTag {
- break
- }
- if event == xpp.EndDocument {
- return event, fmt.Errorf("Failed to find NextTag before reaching the end of the document.")
- }
- }
- return
- }
- // ParseText is a helper function for parsing the text
- // from the current element of the XMLPullParser.
- // This function can handle parsing naked XML text from
- // an element.
- func ParseText(p *xpp.XMLPullParser) (string, error) {
- var text struct {
- Type string `xml:"type,attr"`
- InnerXML string `xml:",innerxml"`
- }
- err := p.DecodeElement(&text)
- if err != nil {
- return "", err
- }
- result := text.InnerXML
- result = strings.TrimSpace(result)
- if strings.HasPrefix(result, "<![CDATA[") &&
- strings.HasSuffix(result, "]]>") {
- result = strings.TrimPrefix(result, "<![CDATA[")
- result = strings.TrimSuffix(result, "]]>")
- return result, nil
- }
- return DecodeEntities(result)
- }
- // DecodeEntities decodes escaped XML entities
- // in a string and returns the unescaped string
- func DecodeEntities(str string) (string, error) {
- data := []byte(str)
- buf := bytes.NewBuffer([]byte{})
- for len(data) > 0 {
- // Find the next entity
- idx := bytes.IndexByte(data, '&')
- if idx == -1 {
- buf.Write(data)
- break
- }
- // Write and skip everything before it
- buf.Write(data[:idx])
- data = data[idx+1:]
- if len(data) == 0 {
- return "", TruncatedEntity
- }
- // Find the end of the entity
- end := bytes.IndexByte(data, ';')
- if end == -1 {
- return "", TruncatedEntity
- }
- if data[0] == '#' {
- // Numerical character reference
- var str string
- base := 10
- if len(data) > 1 && data[1] == 'x' {
- str = string(data[2:end])
- base = 16
- } else {
- str = string(data[1:end])
- }
- i, err := strconv.ParseUint(str, base, 32)
- if err != nil {
- return "", InvalidNumericReference
- }
- buf.WriteRune(rune(i))
- } else {
- // Predefined entity
- name := string(data[:end])
- var c byte
- switch name {
- case "lt":
- c = '<'
- case "gt":
- c = '>'
- case "quot":
- c = '"'
- case "apos":
- c = '\''
- case "amp":
- c = '&'
- default:
- return "", fmt.Errorf("unknown predefined "+
- "entity &%s;", name)
- }
- buf.WriteByte(c)
- }
- // Skip the entity
- data = data[end+1:]
- }
- return buf.String(), nil
- }
- // ParseNameAddress parses name/email strings commonly
- // found in RSS feeds of the format "Example Name (example@site.com)"
- // and other variations of this format.
- func ParseNameAddress(nameAddressText string) (name string, address string) {
- if nameAddressText == "" {
- return
- }
- if emailNameRgx.MatchString(nameAddressText) {
- result := emailNameRgx.FindStringSubmatch(nameAddressText)
- address = result[1]
- name = result[2]
- } else if nameEmailRgx.MatchString(nameAddressText) {
- result := nameEmailRgx.FindStringSubmatch(nameAddressText)
- name = result[1]
- address = result[2]
- } else if nameOnlyRgx.MatchString(nameAddressText) {
- result := nameOnlyRgx.FindStringSubmatch(nameAddressText)
- name = result[1]
- } else if emailOnlyRgx.MatchString(nameAddressText) {
- result := emailOnlyRgx.FindStringSubmatch(nameAddressText)
- address = result[1]
- }
- return
- }
|