123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369 |
- // Copyright (c) 2014 Couchbase, Inc.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- package index
- import (
- "bytes"
- "encoding/json"
- "fmt"
- "reflect"
- "github.com/blevesearch/bleve/document"
- "github.com/blevesearch/bleve/index/store"
- "github.com/blevesearch/bleve/size"
- )
- var reflectStaticSizeTermFieldDoc int
- var reflectStaticSizeTermFieldVector int
- func init() {
- var tfd TermFieldDoc
- reflectStaticSizeTermFieldDoc = int(reflect.TypeOf(tfd).Size())
- var tfv TermFieldVector
- reflectStaticSizeTermFieldVector = int(reflect.TypeOf(tfv).Size())
- }
// ErrorUnknownStorageType is a package-level error value carrying the
// message "unknown storage type".
var ErrorUnknownStorageType = fmt.Errorf("unknown storage type")
// Index is the interface implemented by an index. It groups lifecycle
// methods (Open, Close), document operations (Update, Delete, Batch),
// internal key/value operations (SetInternal, DeleteInternal), read access
// (Reader), statistics (Stats, StatsMap), analysis (Analyze) and access to
// a store.KVStore (Advanced).
type Index interface {
	Open() error
	Close() error

	// Update and Delete operate on a single document, identified by the
	// document itself or by its external string id respectively.
	Update(doc *document.Document) error
	Delete(id string) error
	// Batch applies the operations collected in batch.
	Batch(batch *Batch) error

	// SetInternal and DeleteInternal operate on raw key/value pairs.
	// NOTE(review): presumably these live outside the document space —
	// confirm against an implementation.
	SetInternal(key, val []byte) error
	DeleteInternal(key []byte) error

	// Reader returns a low-level accessor on the index data. Close it to
	// release associated resources.
	Reader() (IndexReader, error)

	// Stats and StatsMap expose index statistics, as a json.Marshaler and
	// as a generic map respectively.
	Stats() json.Marshaler
	StatsMap() map[string]interface{}

	// Analyze turns a document into an AnalysisResult.
	Analyze(d *document.Document) *AnalysisResult

	// Advanced returns a store.KVStore.
	// NOTE(review): presumably the store backing this index — confirm.
	Advanced() (store.KVStore, error)
}
// DocumentFieldTermVisitor is a callback receiving a field name and a term.
// It is the visitor type accepted by IndexReader.DocumentVisitFieldTerms and
// DocValueReader.VisitDocValues.
type DocumentFieldTermVisitor func(field string, term []byte)
// IndexReader is the read-side accessor returned by Index.Reader. Close it
// to release associated resources.
type IndexReader interface {
	// TermFieldReader returns a TermFieldReader for the given term in the
	// given field. The include* flags select which optional data is wanted.
	// NOTE(review): presumably they map to TermFieldDoc.Freq, Norm and
	// Vectors — confirm against an implementation.
	TermFieldReader(term []byte, field string, includeFreq, includeNorm, includeTermVectors bool) (TermFieldReader, error)

	// DocIDReaderAll returns an iterator over all doc ids.
	// The caller must close the returned instance to release associated
	// resources.
	DocIDReaderAll() (DocIDReader, error)

	// DocIDReaderOnly returns a DocIDReader built from an explicit list of
	// external ids.
	// NOTE(review): presumably iteration is restricted to those ids — confirm.
	DocIDReaderOnly(ids []string) (DocIDReader, error)

	FieldDict(field string) (FieldDict, error)

	// FieldDictRange is currently defined to include the start and end terms
	FieldDictRange(field string, startTerm []byte, endTerm []byte) (FieldDict, error)
	FieldDictPrefix(field string, termPrefix []byte) (FieldDict, error)

	Document(id string) (*document.Document, error)
	DocumentVisitFieldTerms(id IndexInternalID, fields []string, visitor DocumentFieldTermVisitor) error

	DocValueReader(fields []string) (DocValueReader, error)

	Fields() ([]string, error)

	GetInternal(key []byte) ([]byte, error)

	DocCount() (uint64, error)

	// ExternalID and InternalID convert between the external string id and
	// the index-internal id of a document.
	ExternalID(id IndexInternalID) (string, error)
	InternalID(id string) (IndexInternalID, error)

	// DumpAll, DumpDoc and DumpFields each return a channel of untyped
	// values.
	// NOTE(review): the element types and who closes the channel are not
	// visible here — confirm against an implementation.
	DumpAll() chan interface{}
	DumpDoc(id string) chan interface{}
	DumpFields() chan interface{}

	Close() error
}
// The Regexp interface defines the subset of the regexp.Regexp API
// methods that are used by bleve indexes, allowing callers to pass in
// alternate implementations.
type Regexp interface {
	// FindStringIndex, LiteralPrefix and String carry the same signatures
	// as the regexp.Regexp methods of the same names.
	FindStringIndex(s string) (loc []int)

	LiteralPrefix() (prefix string, complete bool)

	String() string
}
// IndexReaderRegexp is an interface for readers that can build a FieldDict
// for a field from a regular expression given as a string.
// NOTE(review): presumably an optional capability probed with a type
// assertion on an IndexReader — confirm against callers.
type IndexReaderRegexp interface {
	FieldDictRegexp(field string, regex string) (FieldDict, error)
}
// IndexReaderFuzzy is an interface for readers that can build a FieldDict
// for a field from a term, a fuzziness value and a prefix.
// NOTE(review): the unit of fuzziness (presumably an edit distance) is not
// visible here — confirm against an implementation.
type IndexReaderFuzzy interface {
	FieldDictFuzzy(field string, term string, fuzziness int, prefix string) (FieldDict, error)
}
// IndexReaderOnly is an interface for readers that can build a FieldDict
// for a field from an explicit list of terms.
// NOTE(review): includeCount presumably controls whether DictEntry.Count is
// populated — confirm against an implementation.
type IndexReaderOnly interface {
	FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (FieldDict, error)
}
// IndexReaderContains is an interface for readers that can return a
// FieldDictContains (a membership test) for a field.
type IndexReaderContains interface {
	FieldDictContains(field string) (FieldDictContains, error)
}
// FieldTerms maps a field name to the list of terms a document uses in
// that field.
type FieldTerms map[string][]string

// FieldsNotYetCached filters fields down to those that have no entry in f,
// preserving the input order. The result is always a non-nil slice.
func (f FieldTerms) FieldsNotYetCached(fields []string) []string {
	missing := make([]string, 0, len(fields))
	for i := range fields {
		if _, cached := f[fields[i]]; cached {
			continue
		}
		missing = append(missing, fields[i])
	}
	return missing
}

// Merge copies every entry of other into f.
// Term lists are assumed to be complete, so they are never combined: when a
// field exists on both sides, the list from other replaces the one in f.
func (f FieldTerms) Merge(other FieldTerms) {
	for name := range other {
		f[name] = other[name]
	}
}
// TermFieldVector holds a field name, a list of array positions and three
// numeric values (Pos, Start, End). TermFieldDoc.Vectors is a slice of
// pointers to this type.
// NOTE(review): presumably Pos is the token position and Start/End are
// offsets of one term occurrence — confirm against an implementation.
type TermFieldVector struct {
	Field          string
	ArrayPositions []uint64
	Pos            uint64
	Start          uint64
	End            uint64
}
- func (tfv *TermFieldVector) Size() int {
- return reflectStaticSizeTermFieldVector + size.SizeOfPtr +
- len(tfv.Field) + len(tfv.ArrayPositions)*size.SizeOfUint64
- }
// IndexInternalID is an opaque document identifier internal to the index
// implementation.
type IndexInternalID []byte

// Equals reports whether id and other hold the same bytes. A nil id and an
// empty id are considered equal.
func (id IndexInternalID) Equals(other IndexInternalID) bool {
	return bytes.Equal(id, other)
}

// Compare orders id against other lexicographically by byte value. It
// returns -1, 0 or +1 when id is respectively less than, equal to or
// greater than other.
func (id IndexInternalID) Compare(other IndexInternalID) int {
	return bytes.Compare([]byte(id), []byte(other))
}
// TermFieldDoc is the record type returned by TermFieldReader.Next and
// TermFieldReader.Advance. It carries a term, the internal id of a
// document, and frequency, norm and term-vector data.
// NOTE(review): Freq, Norm and Vectors are presumably only populated when
// the reader was created with the matching include* flag — confirm.
type TermFieldDoc struct {
	Term    string
	ID      IndexInternalID
	Freq    uint64
	Norm    float64
	Vectors []*TermFieldVector
}
- func (tfd *TermFieldDoc) Size() int {
- sizeInBytes := reflectStaticSizeTermFieldDoc + size.SizeOfPtr +
- len(tfd.Term) + len(tfd.ID)
- for _, entry := range tfd.Vectors {
- sizeInBytes += entry.Size()
- }
- return sizeInBytes
- }
- // Reset allows an already allocated TermFieldDoc to be reused
- func (tfd *TermFieldDoc) Reset() *TermFieldDoc {
- // remember the []byte used for the ID
- id := tfd.ID
- vectors := tfd.Vectors
- // idiom to copy over from empty TermFieldDoc (0 allocations)
- *tfd = TermFieldDoc{}
- // reuse the []byte already allocated (and reset len to 0)
- tfd.ID = id[:0]
- tfd.Vectors = vectors[:0]
- return tfd
- }
// TermFieldReader is the interface exposing the enumeration of documents
// containing a given term in a given field. Documents are returned in byte
// lexicographic order over their identifiers.
type TermFieldReader interface {
	// Next returns the next document containing the term in this field, or nil
	// when it reaches the end of the enumeration. The preAlloced TermFieldDoc
	// is optional, and when non-nil, will be used instead of allocating memory.
	Next(preAlloced *TermFieldDoc) (*TermFieldDoc, error)

	// Advance resets the enumeration at the specified document or its
	// immediate follower. preAlloced has the same signature role as in Next.
	Advance(ID IndexInternalID, preAlloced *TermFieldDoc) (*TermFieldDoc, error)

	// Count returns the number of documents containing the term in this field.
	Count() uint64

	Close() error

	// Size returns an int.
	// NOTE(review): presumably a memory-size estimate in bytes, like the
	// other Size methods in this file — confirm.
	Size() int
}
// DictEntry is the element type returned by FieldDict.Next: a term together
// with a count.
// NOTE(review): Count is presumably the number of documents containing the
// term — confirm against an implementation.
type DictEntry struct {
	Term  string
	Count uint64
}
// FieldDict is an iterator over DictEntry values, as returned by the
// FieldDict* methods of IndexReader.
type FieldDict interface {
	// Next returns a DictEntry and an error.
	// NOTE(review): the end-of-iteration signal (presumably a nil entry) is
	// not visible here — confirm against an implementation.
	Next() (*DictEntry, error)
	Close() error
}
// FieldDictContains is a membership test over keys, as returned by
// IndexReaderContains.FieldDictContains.
type FieldDictContains interface {
	Contains(key []byte) (bool, error)
}
// DocIDReader is the interface exposing enumeration of document identifiers.
// Close the reader to release associated resources.
type DocIDReader interface {
	// Next returns the next document internal identifier in the natural
	// index order, nil when the end of the sequence is reached.
	Next() (IndexInternalID, error)

	// Advance resets the iteration to the first internal identifier greater than
	// or equal to ID. If ID is smaller than the start of the range, the iteration
	// will start there instead. If ID is greater than or equal to the end of
	// the range, Next() call will return io.EOF.
	// NOTE(review): the comment on Next describes end-of-sequence as a nil
	// identifier while this one mentions io.EOF — confirm which applies.
	Advance(ID IndexInternalID) (IndexInternalID, error)

	// Size returns an int.
	// NOTE(review): presumably a memory-size estimate in bytes — confirm.
	Size() int

	Close() error
}
// BatchCallback is a function taking an error. It is the type of the
// callback stored on a Batch via SetPersistedCallback.
type BatchCallback func(error)
// Batch collects document and internal key/value operations.
type Batch struct {
	// IndexOps maps a document id to the document to index. A nil value
	// marks a deletion (see Delete).
	IndexOps map[string]*document.Document
	// InternalOps maps an internal key to the value to set. A nil value
	// marks a deletion (see DeleteInternal).
	InternalOps map[string][]byte
	// persistedCallback is the optional callback set via
	// SetPersistedCallback.
	persistedCallback BatchCallback
}
- func NewBatch() *Batch {
- return &Batch{
- IndexOps: make(map[string]*document.Document),
- InternalOps: make(map[string][]byte),
- }
- }
- func (b *Batch) Update(doc *document.Document) {
- b.IndexOps[doc.ID] = doc
- }
- func (b *Batch) Delete(id string) {
- b.IndexOps[id] = nil
- }
- func (b *Batch) SetInternal(key, val []byte) {
- b.InternalOps[string(key)] = val
- }
- func (b *Batch) DeleteInternal(key []byte) {
- b.InternalOps[string(key)] = nil
- }
- func (b *Batch) SetPersistedCallback(f BatchCallback) {
- b.persistedCallback = f
- }
- func (b *Batch) PersistedCallback() BatchCallback {
- return b.persistedCallback
- }
- func (b *Batch) String() string {
- rv := fmt.Sprintf("Batch (%d ops, %d internal ops)\n", len(b.IndexOps), len(b.InternalOps))
- for k, v := range b.IndexOps {
- if v != nil {
- rv += fmt.Sprintf("\tINDEX - '%s'\n", k)
- } else {
- rv += fmt.Sprintf("\tDELETE - '%s'\n", k)
- }
- }
- for k, v := range b.InternalOps {
- if v != nil {
- rv += fmt.Sprintf("\tSET INTERNAL - '%s'\n", k)
- } else {
- rv += fmt.Sprintf("\tDELETE INTERNAL - '%s'\n", k)
- }
- }
- return rv
- }
- func (b *Batch) Reset() {
- b.IndexOps = make(map[string]*document.Document)
- b.InternalOps = make(map[string][]byte)
- b.persistedCallback = nil
- }
- func (b *Batch) Merge(o *Batch) {
- for k, v := range o.IndexOps {
- b.IndexOps[k] = v
- }
- for k, v := range o.InternalOps {
- b.InternalOps[k] = v
- }
- }
- func (b *Batch) TotalDocSize() int {
- var s int
- for k, v := range b.IndexOps {
- if v != nil {
- s += v.Size() + size.SizeOfString
- }
- s += len(k)
- }
- return s
- }
// Optimizable represents an optional interface that is implementable by
// optimizable resources (e.g., TermFieldReaders, Searchers). These
// optimizable resources are provided the same OptimizableContext
// instance, so that they can coordinate via dynamic interface
// casting.
type Optimizable interface {
	// Optimize receives the kind of optimization as a string together with
	// an OptimizableContext, and returns an OptimizableContext.
	// NOTE(review): presumably octx is nil for the first resource and the
	// returned context is passed on to the next one — confirm against
	// callers.
	Optimize(kind string, octx OptimizableContext) (OptimizableContext, error)
}
// Optimized represents a result of optimization -- see the
// OptimizableContext.Finish() method. It is an empty interface, so callers
// must type-assert it to use the concrete value.
type Optimized interface{}
// OptimizableContext is the context value passed to and returned from
// Optimizable.Optimize.
type OptimizableContext interface {
	// Once all the optimizable resources have been provided the same
	// OptimizableContext instance, the optimization preparations are
	// finished or completed via the Finish() method.
	//
	// Depending on the optimization being performed, the Finish()
	// method might return a non-nil Optimized instance. For example,
	// the Optimized instance might represent an optimized
	// TermFieldReader instance.
	Finish() (Optimized, error)
}
// DocValueReader is the reader returned by IndexReader.DocValueReader for a
// list of fields.
type DocValueReader interface {
	// VisitDocValues takes an internal document id and a visitor callback.
	// NOTE(review): presumably the visitor is invoked once per field/term
	// pair of that document, limited to the fields the reader was created
	// for — confirm against an implementation.
	VisitDocValues(id IndexInternalID, visitor DocumentFieldTermVisitor) error
}
|