index.go 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369
  1. // Copyright (c) 2014 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package index
  15. import (
  16. "bytes"
  17. "encoding/json"
  18. "fmt"
  19. "reflect"
  20. "github.com/blevesearch/bleve/document"
  21. "github.com/blevesearch/bleve/index/store"
  22. "github.com/blevesearch/bleve/size"
  23. )
  24. var reflectStaticSizeTermFieldDoc int
  25. var reflectStaticSizeTermFieldVector int
  26. func init() {
  27. var tfd TermFieldDoc
  28. reflectStaticSizeTermFieldDoc = int(reflect.TypeOf(tfd).Size())
  29. var tfv TermFieldVector
  30. reflectStaticSizeTermFieldVector = int(reflect.TypeOf(tfv).Size())
  31. }
  // ErrorUnknownStorageType is a package-level error value whose message is
  // "unknown storage type".
  var ErrorUnknownStorageType = fmt.Errorf("unknown storage type")
  // Index is the top-level interface of an index. It declares lifecycle
  // methods, document and internal key/value mutations (individually or via
  // a Batch), and accessors for reading, statistics and analysis.
  type Index interface {
  	// Open and Close each report failure through the returned error.
  	// NOTE(review): presumably Open must be called before any other
  	// method - confirm against implementations.
  	Open() error
  	Close() error

  	// Update takes a document and Delete takes a document id; Batch takes
  	// a *Batch holding a set of such operations.
  	Update(doc *document.Document) error
  	Delete(id string) error
  	Batch(batch *Batch) error

  	// SetInternal and DeleteInternal operate on raw key/value bytes.
  	// NOTE(review): presumably kept apart from document data - confirm.
  	SetInternal(key, val []byte) error
  	DeleteInternal(key []byte) error

  	// Reader returns a low-level accessor on the index data. Close it to
  	// release associated resources.
  	Reader() (IndexReader, error)

  	// Stats exposes statistics as a json.Marshaler; StatsMap exposes
  	// statistics as a map keyed by string.
  	Stats() json.Marshaler
  	StatsMap() map[string]interface{}

  	// Analyze takes a document and returns an *AnalysisResult.
  	Analyze(d *document.Document) *AnalysisResult

  	// Advanced returns a store.KVStore or an error.
  	// NOTE(review): presumably the underlying key/value store - confirm.
  	Advanced() (store.KVStore, error)
  }
  // DocumentFieldTermVisitor is a callback that receives a field name and a
  // term. It is the visitor type accepted by
  // IndexReader.DocumentVisitFieldTerms and DocValueReader.VisitDocValues.
  type DocumentFieldTermVisitor func(field string, term []byte)
  // IndexReader is the read-side interface returned by Index.Reader. Call
  // Close when done with it.
  type IndexReader interface {
  	// TermFieldReader returns a TermFieldReader for the given term and
  	// field. The three flags select whether frequencies, norms and term
  	// vectors are included.
  	TermFieldReader(term []byte, field string, includeFreq, includeNorm, includeTermVectors bool) (TermFieldReader, error)

  	// DocIDReaderAll returns an iterator over all doc ids
  	// The caller must close returned instance to release associated resources.
  	DocIDReaderAll() (DocIDReader, error)

  	// DocIDReaderOnly returns a DocIDReader for the supplied list of ids.
  	// NOTE(review): presumably restricted to exactly those ids - confirm.
  	DocIDReaderOnly(ids []string) (DocIDReader, error)

  	// FieldDict returns a FieldDict for the named field.
  	FieldDict(field string) (FieldDict, error)

  	// FieldDictRange is currently defined to include the start and end terms
  	FieldDictRange(field string, startTerm []byte, endTerm []byte) (FieldDict, error)

  	// FieldDictPrefix returns a FieldDict for the named field given a
  	// term prefix.
  	FieldDictPrefix(field string, termPrefix []byte) (FieldDict, error)

  	// Document takes an external (string) id and returns a document.
  	Document(id string) (*document.Document, error)

  	// DocumentVisitFieldTerms takes an internal id, a list of fields and a
  	// visitor callback.
  	DocumentVisitFieldTerms(id IndexInternalID, fields []string, visitor DocumentFieldTermVisitor) error

  	// DocValueReader returns a DocValueReader for the listed fields.
  	DocValueReader(fields []string) (DocValueReader, error)

  	// Fields returns a list of strings.
  	// NOTE(review): presumably the names of all indexed fields - confirm.
  	Fields() ([]string, error)

  	// GetInternal returns the bytes associated with key.
  	GetInternal(key []byte) ([]byte, error)

  	// DocCount returns a count as uint64.
  	// NOTE(review): presumably the number of documents - confirm.
  	DocCount() (uint64, error)

  	// ExternalID and InternalID convert between the string id and the
  	// IndexInternalID form of a document identifier.
  	ExternalID(id IndexInternalID) (string, error)
  	InternalID(id string) (IndexInternalID, error)

  	// DumpAll, DumpDoc and DumpFields each return a channel of untyped
  	// values.
  	// NOTE(review): presumably debugging aids - confirm.
  	DumpAll() chan interface{}
  	DumpDoc(id string) chan interface{}
  	DumpFields() chan interface{}

  	Close() error
  }
  // The Regexp interface defines the subset of the regexp.Regexp API
  // methods that are used by bleve indexes, allowing callers to pass in
  // alternate implementations.
  type Regexp interface {
  	// FindStringIndex takes a string and returns a slice of ints named loc.
  	FindStringIndex(s string) (loc []int)
  	// LiteralPrefix returns a prefix string and a flag named complete.
  	LiteralPrefix() (prefix string, complete bool)
  	// String returns a string form of the expression.
  	// NOTE(review): presumably the source pattern, as with
  	// regexp.Regexp - confirm.
  	String() string
  }
  // IndexReaderRegexp declares FieldDictRegexp, which takes a field name and
  // a regular expression string and returns a FieldDict.
  // NOTE(review): presumably an optional capability that IndexReader
  // implementations may additionally provide - confirm.
  type IndexReaderRegexp interface {
  	FieldDictRegexp(field string, regex string) (FieldDict, error)
  }
  // IndexReaderFuzzy declares FieldDictFuzzy, which takes a field name, a
  // term, an integer fuzziness and a prefix string and returns a FieldDict.
  // NOTE(review): presumably an optional capability that IndexReader
  // implementations may additionally provide - confirm.
  type IndexReaderFuzzy interface {
  	FieldDictFuzzy(field string, term string, fuzziness int, prefix string) (FieldDict, error)
  }
  // IndexReaderOnly declares FieldDictOnly, which takes a field name, a list
  // of terms and an includeCount flag and returns a FieldDict.
  // NOTE(review): presumably the result is limited to onlyTerms - confirm.
  type IndexReaderOnly interface {
  	FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (FieldDict, error)
  }
  // IndexReaderContains declares FieldDictContains, which takes a field name
  // and returns a FieldDictContains (an interface with a Contains method).
  type IndexReaderContains interface {
  	FieldDictContains(field string) (FieldDictContains, error)
  }
  // FieldTerms maps a field name to the list of terms a document uses in
  // that field.
  type FieldTerms map[string][]string

  // FieldsNotYetCached returns, in input order, the entries of fields that
  // have no key in f. The result is always a non-nil slice.
  func (f FieldTerms) FieldsNotYetCached(fields []string) []string {
  	missing := make([]string, 0, len(fields))
  	for i := range fields {
  		if _, cached := f[fields[i]]; cached {
  			continue
  		}
  		missing = append(missing, fields[i])
  	}
  	return missing
  }

  // Merge copies every entry of other into f.
  // Term lists are treated as complete, so they are never combined: when a
  // field exists on both sides, the list from other replaces the one in f.
  func (f FieldTerms) Merge(other FieldTerms) {
  	for name := range other {
  		f[name] = other[name]
  	}
  }
  113. type TermFieldVector struct {
  114. Field string
  115. ArrayPositions []uint64
  116. Pos uint64
  117. Start uint64
  118. End uint64
  119. }
  120. func (tfv *TermFieldVector) Size() int {
  121. return reflectStaticSizeTermFieldVector + size.SizeOfPtr +
  122. len(tfv.Field) + len(tfv.ArrayPositions)*size.SizeOfUint64
  123. }
  // IndexInternalID is an opaque document identifier internal to the index
  // implementation.
  type IndexInternalID []byte

  // Equals reports whether id and other hold the same bytes. A nil id and an
  // empty id are considered equal.
  func (id IndexInternalID) Equals(other IndexInternalID) bool {
  	return bytes.Equal(id, other)
  }

  // Compare orders id against other lexicographically by byte, returning
  // -1, 0 or +1 as bytes.Compare does.
  func (id IndexInternalID) Compare(other IndexInternalID) int {
  	return bytes.Compare([]byte(id), []byte(other))
  }
  132. type TermFieldDoc struct {
  133. Term string
  134. ID IndexInternalID
  135. Freq uint64
  136. Norm float64
  137. Vectors []*TermFieldVector
  138. }
  139. func (tfd *TermFieldDoc) Size() int {
  140. sizeInBytes := reflectStaticSizeTermFieldDoc + size.SizeOfPtr +
  141. len(tfd.Term) + len(tfd.ID)
  142. for _, entry := range tfd.Vectors {
  143. sizeInBytes += entry.Size()
  144. }
  145. return sizeInBytes
  146. }
  147. // Reset allows an already allocated TermFieldDoc to be reused
  148. func (tfd *TermFieldDoc) Reset() *TermFieldDoc {
  149. // remember the []byte used for the ID
  150. id := tfd.ID
  151. vectors := tfd.Vectors
  152. // idiom to copy over from empty TermFieldDoc (0 allocations)
  153. *tfd = TermFieldDoc{}
  154. // reuse the []byte already allocated (and reset len to 0)
  155. tfd.ID = id[:0]
  156. tfd.Vectors = vectors[:0]
  157. return tfd
  158. }
  // TermFieldReader is the interface exposing the enumeration of documents
  // containing a given term in a given field. Documents are returned in byte
  // lexicographic order over their identifiers.
  type TermFieldReader interface {
  	// Next returns the next document containing the term in this field, or nil
  	// when it reaches the end of the enumeration. The preAlloced TermFieldDoc
  	// is optional, and when non-nil, will be used instead of allocating memory.
  	Next(preAlloced *TermFieldDoc) (*TermFieldDoc, error)

  	// Advance resets the enumeration at specified document or its immediate
  	// follower.
  	Advance(ID IndexInternalID, preAlloced *TermFieldDoc) (*TermFieldDoc, error)

  	// Count returns the number of documents containing the term in this field.
  	Count() uint64

  	// Close returns an error on failure.
  	// NOTE(review): presumably releases the reader's resources - confirm.
  	Close() error

  	// Size returns an int.
  	// NOTE(review): presumably an estimate of memory used, in bytes, like
  	// the Size methods on TermFieldDoc and TermFieldVector - confirm.
  	Size() int
  }
  // DictEntry pairs a term with a count. It is the element type returned by
  // FieldDict.Next.
  // NOTE(review): Count is presumably the number of documents containing
  // Term - confirm.
  type DictEntry struct {
  	Term  string
  	Count uint64
  }
  // FieldDict is an iterator-style interface over DictEntry values.
  type FieldDict interface {
  	// Next returns a *DictEntry or an error.
  	// NOTE(review): presumably returns a nil entry once exhausted - confirm.
  	Next() (*DictEntry, error)
  	Close() error
  }
  // FieldDictContains declares a membership test: Contains takes a key and
  // returns a bool together with an error.
  type FieldDictContains interface {
  	Contains(key []byte) (bool, error)
  }
  // DocIDReader is the interface exposing enumeration of documents identifiers.
  // Close the reader to release associated resources.
  type DocIDReader interface {
  	// Next returns the next document internal identifier in the natural
  	// index order, nil when the end of the sequence is reached.
  	Next() (IndexInternalID, error)

  	// Advance resets the iteration to the first internal identifier greater than
  	// or equal to ID. If ID is smaller than the start of the range, the iteration
  	// will start there instead. If ID is greater than or equal to the end of
  	// the range, Next() call will return io.EOF.
  	// NOTE(review): the comment on Next says a nil identifier marks the end,
  	// while this one mentions io.EOF - confirm which contract
  	// implementations actually follow.
  	Advance(ID IndexInternalID) (IndexInternalID, error)

  	// Size returns an int.
  	// NOTE(review): presumably an estimate of memory used - confirm.
  	Size() int

  	Close() error
  }
  // BatchCallback is a function that receives an error. A Batch stores one
  // via SetPersistedCallback.
  // NOTE(review): presumably invoked once the batch has been persisted, with
  // a nil error on success - confirm against the index implementations.
  type BatchCallback func(error)
  // Batch collects index operations to be applied together.
  type Batch struct {
  	// IndexOps maps a document id to the document to index; a nil value
  	// marks the id for deletion (see Delete and String).
  	IndexOps map[string]*document.Document
  	// InternalOps maps an internal key (as a string) to its value; a nil
  	// value marks the key for deletion (see DeleteInternal and String).
  	InternalOps map[string][]byte
  	// persistedCallback is set by SetPersistedCallback and cleared by Reset.
  	persistedCallback BatchCallback
  }
  206. func NewBatch() *Batch {
  207. return &Batch{
  208. IndexOps: make(map[string]*document.Document),
  209. InternalOps: make(map[string][]byte),
  210. }
  211. }
  // Update records doc in IndexOps under doc.ID, replacing any operation
  // already recorded for that id. doc must be non-nil, since doc.ID is read.
  func (b *Batch) Update(doc *document.Document) {
  	b.IndexOps[doc.ID] = doc
  }
  // Delete records a deletion for id by storing a nil document in IndexOps,
  // replacing any operation already recorded for that id.
  func (b *Batch) Delete(id string) {
  	b.IndexOps[id] = nil
  }
  // SetInternal records val in InternalOps under string(key), replacing any
  // operation already recorded for that key. val is stored as given, not
  // copied.
  func (b *Batch) SetInternal(key, val []byte) {
  	b.InternalOps[string(key)] = val
  }
  // DeleteInternal records a deletion for key by storing a nil value in
  // InternalOps under string(key).
  func (b *Batch) DeleteInternal(key []byte) {
  	b.InternalOps[string(key)] = nil
  }
  // SetPersistedCallback stores f as the batch's persisted callback,
  // replacing any previous one.
  func (b *Batch) SetPersistedCallback(f BatchCallback) {
  	b.persistedCallback = f
  }
  // PersistedCallback returns the callback stored by SetPersistedCallback,
  // or nil if none is set.
  func (b *Batch) PersistedCallback() BatchCallback {
  	return b.persistedCallback
  }
  230. func (b *Batch) String() string {
  231. rv := fmt.Sprintf("Batch (%d ops, %d internal ops)\n", len(b.IndexOps), len(b.InternalOps))
  232. for k, v := range b.IndexOps {
  233. if v != nil {
  234. rv += fmt.Sprintf("\tINDEX - '%s'\n", k)
  235. } else {
  236. rv += fmt.Sprintf("\tDELETE - '%s'\n", k)
  237. }
  238. }
  239. for k, v := range b.InternalOps {
  240. if v != nil {
  241. rv += fmt.Sprintf("\tSET INTERNAL - '%s'\n", k)
  242. } else {
  243. rv += fmt.Sprintf("\tDELETE INTERNAL - '%s'\n", k)
  244. }
  245. }
  246. return rv
  247. }
  248. func (b *Batch) Reset() {
  249. b.IndexOps = make(map[string]*document.Document)
  250. b.InternalOps = make(map[string][]byte)
  251. b.persistedCallback = nil
  252. }
  253. func (b *Batch) Merge(o *Batch) {
  254. for k, v := range o.IndexOps {
  255. b.IndexOps[k] = v
  256. }
  257. for k, v := range o.InternalOps {
  258. b.InternalOps[k] = v
  259. }
  260. }
  261. func (b *Batch) TotalDocSize() int {
  262. var s int
  263. for k, v := range b.IndexOps {
  264. if v != nil {
  265. s += v.Size() + size.SizeOfString
  266. }
  267. s += len(k)
  268. }
  269. return s
  270. }
  // Optimizable represents an optional interface that is implementable by
  // optimizable resources (e.g., TermFieldReaders, Searchers). These
  // optimizable resources are provided the same OptimizableContext
  // instance, so that they can coordinate via dynamic interface
  // casting.
  type Optimizable interface {
  	// Optimize takes the kind of optimization as a string and an
  	// OptimizableContext, and returns an OptimizableContext or an error.
  	Optimize(kind string, octx OptimizableContext) (OptimizableContext, error)
  }
  // Optimized represents a result of optimization -- see the
  // OptimizableContext.Finish() method. It is an empty interface, so it can
  // hold a value of any type.
  type Optimized interface{}
  // OptimizableContext is the value passed to and returned from
  // Optimizable.Optimize.
  type OptimizableContext interface {
  	// Once all the optimizable resources have been provided the same
  	// OptimizableContext instance, the optimization preparations are
  	// finished or completed via the Finish() method.
  	//
  	// Depending on the optimization being performed, the Finish()
  	// method might return a non-nil Optimized instance. For example,
  	// the Optimized instance might represent an optimized
  	// TermFieldReader instance.
  	Finish() (Optimized, error)
  }
  // DocValueReader is the interface returned by IndexReader.DocValueReader
  // for a list of fields.
  type DocValueReader interface {
  	// VisitDocValues takes an internal document id and a visitor callback
  	// that receives a field name and a term.
  	// NOTE(review): presumably the visitor is called once per stored
  	// (field, term) pair of the document - confirm.
  	VisitDocValues(id IndexInternalID, visitor DocumentFieldTermVisitor) error
  }