tokenize.go

package multibayes

import (
	"bytes"
	"encoding/base64"
	"regexp"
	"sort"
	"strings"

	"github.com/blevesearch/bleve/analysis"
	regexp_tokenizer "github.com/blevesearch/bleve/analysis/tokenizer/regexp"
	"github.com/blevesearch/go-porterstemmer"
)

const (
	tokenSeparator = "_"
)

type ngram struct {
	Tokens [][]byte
}

// String encodes each token in base64 for safe comparison: tokens may
// themselves contain the separator character, and decodeNGram must be
// able to recover the raw bytes.
func (ng *ngram) String() string {
	encoded := make([]string, len(ng.Tokens))
	for i, token := range ng.Tokens {
		encoded[i] = base64.StdEncoding.EncodeToString(token)
	}
	return strings.Join(encoded, tokenSeparator)
}

// decodeNGram reverses ngram.String, base64-decoding each token.
func decodeNGram(s string) (*ngram, error) {
	encodedTokens := strings.Split(s, tokenSeparator)
	tokens := make([][]byte, len(encodedTokens))
	var err error
	for i, encodedToken := range encodedTokens {
		tokens[i], err = base64.StdEncoding.DecodeString(encodedToken)
		if err != nil {
			return nil, err
		}
	}
	return &ngram{tokens}, nil
}
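
// roundTripNGram is an illustrative sketch (a hypothetical helper, not
// part of the package's API): it shows that decodeNGram is the inverse
// of String under the base64 encoding above.
func roundTripNGram(ng *ngram) (*ngram, error) {
	return decodeNGram(ng.String())
}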

type tokenizerConf struct {
	regexp    *regexp.Regexp
	NGramSize int64
}

type tokenizer struct {
	regexp_tokenizer.RegexpTokenizer
	Conf *tokenizerConf
}

func validateConf(tc *tokenizerConf) {
	// the original pattern used the range `A-z`, which also matched the
	// punctuation between 'Z' and 'a' ([, \, ], ^, `); spell out both
	// letter ranges instead
	tc.regexp = regexp.MustCompile(`[0-9A-Za-z_'\-]+|%|\$`)

	// TODO: We force NGramSize = 1 so as to create disjoint ngrams,
	// which is necessary for the naive assumption of conditional
	// independence among tokens. It would be great to allow ngrams
	// to be greater than 1 and select only disjoint ngrams from the
	// tokenizer.
	tc.NGramSize = 1
}
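
// For illustration (a hypothetical input, not from the original file):
// tokenizing "price: $5.99, 100% cotton" with the pattern above yields
// "price", "$", "5", "99", "100", "%", "cotton" -- the colon, dot, and
// comma are dropped, while "$" and "%" survive as standalone tokens.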

func newTokenizer(tc *tokenizerConf) (*tokenizer, error) {
	validateConf(tc)
	return &tokenizer{*regexp_tokenizer.NewRegexpTokenizer(tc.regexp), tc}, nil
}

// Parse tokenizes a document and groups the surviving tokens into ngrams.
func (t *tokenizer) Parse(doc string) []ngram {
	// maybe use token types for datetimes or something instead of
	// the actual byte slice
	alltokens := t.Tokenize([]byte(strings.ToLower(doc)))

	// drop stopwords and stem the remaining terms
	filtered := make(map[int][]byte)
	for i, token := range alltokens {
		exclude := false
		for _, stop := range stopbytes {
			if bytes.Equal(token.Term, stop) {
				exclude = true
				break
			}
		}
		if exclude {
			continue
		}

		tokenString := porterstemmer.StemString(string(token.Term))
		//tokenBytes := porterstemmer.Stem(token.Term) // takes runes, not bytes
		if token.Type == analysis.Numeric {
			tokenString = "NUMBER"
		} else if token.Type == analysis.DateTime {
			tokenString = "DATE"
		}
		filtered[i] = []byte(tokenString)
	}
	// only consider sequential terms as candidates for ngrams;
	// terms separated by stopwords are ineligible. Iterate the indices
	// in ascending order -- ranging over the map directly would visit
	// keys in random order and break the adjacency check below.
	indices := make([]int, 0, len(filtered))
	for i := range filtered {
		indices = append(indices, i)
	}
	sort.Ints(indices)

	allNGrams := make([]ngram, 0, 100)
	currentTokens := make([][]byte, 0, 100)
	lastObserved := -1
	for _, i := range indices {
		if (i - 1) != lastObserved {
			// gap found: flush the current run of sequential tokens
			ngrams := t.tokensToNGrams(currentTokens)
			allNGrams = append(allNGrams, ngrams...)
			currentTokens = make([][]byte, 0, 100)
		}
		currentTokens = append(currentTokens, filtered[i])
		lastObserved = i
	}

	// bring in the last one
	if len(currentTokens) > 0 {
		ngrams := t.tokensToNGrams(currentTokens)
		allNGrams = append(allNGrams, ngrams...)
	}

	return allNGrams
}
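
// exampleParse is a minimal usage sketch (the function name and sample
// document are illustrative, not part of the package): it assumes
// stopbytes is defined elsewhere in this package, as Parse requires.
func exampleParse() []ngram {
	t, err := newTokenizer(&tokenizerConf{})
	if err != nil {
		return nil
	}
	// stopwords such as "the" are dropped, the remaining terms are
	// stemmed (e.g. "jumped" -> "jump"), and with NGramSize forced to 1
	// each returned ngram holds a single token
	return t.Parse("The quick brown fox jumped over the lazy dog")
}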

// tokensToNGrams builds every sequential ngram of size 1 through
// NGramSize from the given run of tokens.
func (t *tokenizer) tokensToNGrams(tokens [][]byte) []ngram {
	nTokens := int64(len(tokens))

	nNGrams := int64(0)
	for i := int64(1); i <= t.Conf.NGramSize; i++ {
		nNGrams += choose(nTokens, i)
	}

	ngrams := make([]ngram, 0, nNGrams)
	for ngramSize := int64(1); ngramSize <= t.Conf.NGramSize; ngramSize++ {
		nNGramsOfSize := choose(nTokens, ngramSize)
		for i := int64(0); i < nNGramsOfSize; i++ {
			ngrams = append(ngrams, ngram{tokens[i:(i + ngramSize)]})
		}
	}
	return ngrams
}
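
// Worked example (hypothetical values, assuming NGramSize were 2): for
// tokens [t0 t1 t2], tokensToNGrams would emit choose(3,1) = 3 unigrams
// and choose(3,2) = 2 sequential bigrams -- t0, t1, t2, t0_t1, t1_t2.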

// choose counts the sequential ngrams of size k among n tokens, i.e.
// n-k+1 floored at zero -- not a binomial coefficient, since the
// combinations must be sequential.
func choose(n, k int64) int64 {
	return max(n-k+int64(1), 0)
}

func max(x, y int64) int64 {
	if x > y {
		return x
	}
	return y
}