type.go

// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package analysis

import (
	"fmt"
	"time"
)

// A CharFilter transforms the raw input bytes before tokenization.
type CharFilter interface {
	Filter([]byte) []byte
}

// TokenType classifies the kind of term a Token holds.
type TokenType int

const (
	AlphaNumeric TokenType = iota
	Ideographic
	Numeric
	DateTime
	Shingle
	Single
	Double
	Boolean
)

// Token represents one occurrence of a term at a particular location in a
// field.
type Token struct {
	// Start specifies the byte offset of the beginning of the term in the
	// field.
	Start int `json:"start"`

	// End specifies the byte offset of the end of the term in the field.
	End  int    `json:"end"`
	Term []byte `json:"term"`

	// Position specifies the 1-based index of the token in the sequence of
	// occurrences of its term in the field.
	Position int       `json:"position"`
	Type     TokenType `json:"type"`
	KeyWord  bool      `json:"keyword"`
}

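// As an illustration (hypothetical values, not taken from the original
// file): for the field "quick quick brown fox", the token for the second
// "quick" would be
//
//	&Token{Start: 6, End: 11, Term: []byte("quick"), Position: 2, Type: AlphaNumeric}
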
func (t *Token) String() string {
	return fmt.Sprintf("Start: %d End: %d Position: %d Token: %s Type: %d", t.Start, t.End, t.Position, string(t.Term), t.Type)
}

// A TokenStream is the ordered sequence of tokens produced for one field.
type TokenStream []*Token

// A Tokenizer splits an input string into tokens, the usual behaviour being
// to map words to tokens.
type Tokenizer interface {
	Tokenize([]byte) TokenStream
}

// A TokenFilter adds, transforms or removes tokens from a token stream.
type TokenFilter interface {
	Filter(TokenStream) TokenStream
}

// An Analyzer combines CharFilters, a Tokenizer and TokenFilters into a
// complete text-analysis pipeline.
type Analyzer struct {
	CharFilters  []CharFilter
	Tokenizer    Tokenizer
	TokenFilters []TokenFilter
}

// Analyze runs input through the char filters, the tokenizer and the token
// filters, in that order, and returns the resulting token stream.
func (a *Analyzer) Analyze(input []byte) TokenStream {
	if a.CharFilters != nil {
		for _, cf := range a.CharFilters {
			input = cf.Filter(input)
		}
	}
	tokens := a.Tokenizer.Tokenize(input)
	if a.TokenFilters != nil {
		for _, tf := range a.TokenFilters {
			tokens = tf.Filter(tokens)
		}
	}
	return tokens
}

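// What follows is an illustrative sketch, not part of the original file: a
// minimal CharFilter, Tokenizer and TokenFilter wired into an Analyzer to
// show how Analyze runs the three stages in order. All names below
// (asciiLowerCharFilter, spaceTokenizer, dropShortTokenFilter,
// exampleAnalyze) are hypothetical.

// asciiLowerCharFilter lowercases ASCII letters before tokenization.
type asciiLowerCharFilter struct{}

func (asciiLowerCharFilter) Filter(input []byte) []byte {
	out := make([]byte, len(input))
	for i, b := range input {
		if b >= 'A' && b <= 'Z' {
			b += 'a' - 'A'
		}
		out[i] = b
	}
	return out
}

// spaceTokenizer splits the input on ASCII spaces, recording byte offsets
// and 1-based positions.
type spaceTokenizer struct{}

func (spaceTokenizer) Tokenize(input []byte) TokenStream {
	var stream TokenStream
	start, position := 0, 1
	emit := func(end int) {
		if end > start {
			stream = append(stream, &Token{
				Start:    start,
				End:      end,
				Term:     input[start:end],
				Position: position,
				Type:     AlphaNumeric,
			})
			position++
		}
		start = end + 1
	}
	for i, b := range input {
		if b == ' ' {
			emit(i)
		}
	}
	emit(len(input))
	return stream
}

// dropShortTokenFilter removes tokens shorter than two bytes.
type dropShortTokenFilter struct{}

func (dropShortTokenFilter) Filter(input TokenStream) TokenStream {
	output := input[:0]
	for _, t := range input {
		if len(t.Term) >= 2 {
			output = append(output, t)
		}
	}
	return output
}

// exampleAnalyze runs the pipeline; for the input "A Quick Fox" it yields
// the tokens "quick" and "fox" ("a" is dropped by the length filter).
func exampleAnalyze() TokenStream {
	a := &Analyzer{
		CharFilters:  []CharFilter{asciiLowerCharFilter{}},
		Tokenizer:    spaceTokenizer{},
		TokenFilters: []TokenFilter{dropShortTokenFilter{}},
	}
	return a.Analyze([]byte("A Quick Fox"))
}
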
// ErrInvalidDateTime reports that none of the configured layouts matched
// the input.
var ErrInvalidDateTime = fmt.Errorf("unable to parse datetime with any of the layouts")

// A DateTimeParser converts a textual date/time into a time.Time.
type DateTimeParser interface {
	ParseDateTime(string) (time.Time, error)
}

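// An illustrative DateTimeParser (hypothetical, not part of the original
// file): it tries each layout in order with time.Parse and falls back to
// ErrInvalidDateTime when none matches.
type layoutsDateTimeParser struct {
	layouts []string
}

func (p layoutsDateTimeParser) ParseDateTime(input string) (time.Time, error) {
	for _, layout := range p.layouts {
		if t, err := time.Parse(layout, input); err == nil {
			return t, nil
		}
	}
	return time.Time{}, ErrInvalidDateTime
}
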
// A ByteArrayConverter converts a raw byte slice into a Go value.
type ByteArrayConverter interface {
	Convert([]byte) (interface{}, error)
}
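
// An illustrative ByteArrayConverter (hypothetical, not part of the
// original file): it interprets the bytes as a UTF-8 string.
type stringByteArrayConverter struct{}

func (stringByteArrayConverter) Convert(in []byte) (interface{}, error) {
	return string(in), nil
}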