freq.go 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. // Copyright (c) 2014 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package analysis
  15. import (
  16. "reflect"
  17. "github.com/blevesearch/bleve/size"
  18. )
  19. var reflectStaticSizeTokenLocation int
  20. var reflectStaticSizeTokenFreq int
  21. func init() {
  22. var tl TokenLocation
  23. reflectStaticSizeTokenLocation = int(reflect.TypeOf(tl).Size())
  24. var tf TokenFreq
  25. reflectStaticSizeTokenFreq = int(reflect.TypeOf(tf).Size())
  26. }
  27. // TokenLocation represents one occurrence of a term at a particular location in
  28. // a field. Start, End and Position have the same meaning as in analysis.Token.
  29. // Field and ArrayPositions identify the field value in the source document.
  30. // See document.Field for details.
  31. type TokenLocation struct {
  32. Field string
  33. ArrayPositions []uint64
  34. Start int
  35. End int
  36. Position int
  37. }
  38. func (tl *TokenLocation) Size() int {
  39. rv := reflectStaticSizeTokenLocation
  40. rv += len(tl.ArrayPositions) * size.SizeOfUint64
  41. return rv
  42. }
  43. // TokenFreq represents all the occurrences of a term in all fields of a
  44. // document.
  45. type TokenFreq struct {
  46. Term []byte
  47. Locations []*TokenLocation
  48. frequency int
  49. }
  50. func (tf *TokenFreq) Size() int {
  51. rv := reflectStaticSizeTokenFreq
  52. rv += len(tf.Term)
  53. for _, loc := range tf.Locations {
  54. rv += loc.Size()
  55. }
  56. return rv
  57. }
  58. func (tf *TokenFreq) Frequency() int {
  59. return tf.frequency
  60. }
  61. // TokenFrequencies maps document terms to their combined frequencies from all
  62. // fields.
  63. type TokenFrequencies map[string]*TokenFreq
  64. func (tfs TokenFrequencies) Size() int {
  65. rv := size.SizeOfMap
  66. rv += len(tfs) * (size.SizeOfString + size.SizeOfPtr)
  67. for k, v := range tfs {
  68. rv += len(k)
  69. rv += v.Size()
  70. }
  71. return rv
  72. }
  73. func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) {
  74. // walk the new token frequencies
  75. for tfk, tf := range other {
  76. // set the remoteField value in incoming token freqs
  77. for _, l := range tf.Locations {
  78. l.Field = remoteField
  79. }
  80. existingTf, exists := tfs[tfk]
  81. if exists {
  82. existingTf.Locations = append(existingTf.Locations, tf.Locations...)
  83. existingTf.frequency = existingTf.frequency + tf.frequency
  84. } else {
  85. tfs[tfk] = &TokenFreq{
  86. Term: tf.Term,
  87. frequency: tf.frequency,
  88. Locations: make([]*TokenLocation, len(tf.Locations)),
  89. }
  90. copy(tfs[tfk].Locations, tf.Locations)
  91. }
  92. }
  93. }
  94. func TokenFrequency(tokens TokenStream, arrayPositions []uint64, includeTermVectors bool) TokenFrequencies {
  95. rv := make(map[string]*TokenFreq, len(tokens))
  96. if includeTermVectors {
  97. tls := make([]TokenLocation, len(tokens))
  98. tlNext := 0
  99. for _, token := range tokens {
  100. tls[tlNext] = TokenLocation{
  101. ArrayPositions: arrayPositions,
  102. Start: token.Start,
  103. End: token.End,
  104. Position: token.Position,
  105. }
  106. curr, ok := rv[string(token.Term)]
  107. if ok {
  108. curr.Locations = append(curr.Locations, &tls[tlNext])
  109. curr.frequency++
  110. } else {
  111. rv[string(token.Term)] = &TokenFreq{
  112. Term: token.Term,
  113. Locations: []*TokenLocation{&tls[tlNext]},
  114. frequency: 1,
  115. }
  116. }
  117. tlNext++
  118. }
  119. } else {
  120. for _, token := range tokens {
  121. curr, exists := rv[string(token.Term)]
  122. if exists {
  123. curr.frequency++
  124. } else {
  125. rv[string(token.Term)] = &TokenFreq{
  126. Term: token.Term,
  127. frequency: 1,
  128. }
  129. }
  130. }
  131. }
  132. return rv
  133. }