-
Notifications
You must be signed in to change notification settings - Fork 62
/
Copy pathftfastconfig.go
244 lines (236 loc) · 10.7 KB
/
ftfastconfig.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
package reindexer
// FtFastFieldConfig holds the ranking settings of a single field. FtFastConfig
// carries a list of these values in its FieldsCfg member.
type FtFastFieldConfig struct {
	// FieldName is the name of the field this configuration is for.
	FieldName string `json:"field_name"`

	// Bm25Boost is the boost of the bm25 ranking. Default value: 1.
	Bm25Boost float64 `json:"bm25_boost"`

	// Bm25Weight is the weight of the bm25 rank in the final rank.
	//   0: bm25 does not change the final rank.
	//   1: bm25 affects the final rank in the 0 - 100% range.
	Bm25Weight float64 `json:"bm25_weight"`

	// TermLenBoost is the boost of the search query term length.
	// Default value: 1.
	TermLenBoost float64 `json:"term_len_boost"`

	// TermLenWeight is the weight of the search query term length in the
	// final rank.
	//   0: term length does not change the final rank.
	//   1: term length affects the final rank in the 0 - 100% range.
	TermLenWeight float64 `json:"term_len_weight"`

	// PositionBoost is the boost of the search query term position.
	// Default value: 1.
	PositionBoost float64 `json:"position_boost"`

	// PositionWeight is the weight of the search query term position in the
	// final rank.
	//   0: term position does not change the final rank.
	//   1: term position affects the final rank in the 0 - 100% range.
	PositionWeight float64 `json:"position_weight"`
}
// FtTyposDetailedConfig is the config for more precise tuning of the typos
// algorithm. FtFastConfig refers to it through its TyposDetailedConfig member.
type FtTyposDetailedConfig struct {
	// MaxTypoDistance is the maximum distance between symbols in the initial
	// and target words to perform a substitution.
	// Values range: [-1,100]
	// Default: 0
	MaxTypoDistance int `json:"max_typo_distance"`

	// MaxSymbolPermutationDistance is the maximum distance between the same
	// symbols in the initial and target words to perform a substitution
	// (handles the case when two symbols were switched with each other).
	// Values range: [-1,100]
	// Default: 1
	MaxSymbolPermutationDistance int `json:"max_symbol_permutation_distance"`

	// MaxMissingLetters is the maximum number of symbols which may be removed
	// from the initial term to transform it into the result word.
	// Values range: [-1,2]
	// Default: 2
	MaxMissingLetters int `json:"max_missing_letters"`

	// MaxExtraLetters is the maximum number of symbols which may be added to
	// the initial term to transform it into the result word.
	// Values range: [-1,2]
	// Default: 2
	MaxExtraLetters int `json:"max_extra_letters"`
}
// FtBaseRanking is the config for the subterm rank multiplier. FtFastConfig
// refers to it through its FtBaseRankingConfig member.
type FtBaseRanking struct {
	// FullMatch is the relevancy of a full word match.
	// Values range: [0,500]
	// Default: 100
	FullMatch int `json:"full_match_proc"`

	// PrefixMin is the minimum relevancy of a prefix word match.
	// Values range: [0,500]
	// Default: 50
	PrefixMin int `json:"prefix_min_proc"`

	// SuffixMin is the minimum relevancy of a suffix word match.
	// Values range: [0,500]
	// Default: 10
	SuffixMin int `json:"suffix_min_proc"`

	// Typo is the base relevancy of a typo match.
	// Values range: [0,500]
	// Default: 85
	Typo int `json:"base_typo_proc"`

	// TypoPenalty is the extra penalty for each permutation of the word
	// (addition/deletion of a symbol) in the typo algorithm.
	// Values range: [0,500]
	// Default: 15
	TypoPenalty int `json:"typo_proc_penalty"`

	// StemmerPenalty is the penalty for variants created by stemming.
	// Values range: [0,500]
	// Default: 15
	StemmerPenalty int `json:"stemmer_proc_penalty"`

	// Kblayout is the relevancy of a match in an incorrect keyboard layout.
	// Values range: [0,500]
	// Default: 90
	Kblayout int `json:"kblayout_proc"`

	// Translit is the relevancy of a match in translit.
	// Values range: [0,500]
	// Default: 90
	Translit int `json:"translit_proc"`

	// Synonyms is the relevancy of a synonym match.
	// Values range: [0,500]
	// Default: 95
	Synonyms int `json:"synonyms_proc"`
}
// StopWord is one entry of the stop words list. The StopWords member of
// FtFastConfig accepts items that are either a StopWord or a plain string.
type StopWord struct {
	// Word is serialized as "word".
	Word string `json:"word"`

	// IsMorpheme is serialized as "is_morpheme". According to the StopWords
	// documentation in FtFastConfig, a stop word with this attribute set to
	// true can still be included in search results for queries such as
	// 'word*', 'word~' etc.
	IsMorpheme bool `json:"is_morpheme"`
}
// Bm25ConfigType is the config for document ranking. FtFastConfig refers to it
// through its Bm25Config member.
type Bm25ConfigType struct {
	// Bm25k1 is the coefficient k1 in the formula for calculating bm25.
	Bm25k1 float64 `json:"bm25_k1"`

	// Bm25b is the coefficient b in the formula for calculating bm25.
	Bm25b float64 `json:"bm25_b"`

	// Bm25Type selects the formula for calculating document relevance
	// (rx, classic, word_count).
	// NOTE(review): DefaultFtFastConfig in this file sets this to "rx_bm25";
	// confirm the exact spellings accepted for this value.
	Bm25Type string `json:"bm25_type"`
}
// FtFastConfig is the configuration of a FullText search index.
type FtFastConfig struct {
	// Bm25Boost is the boost of the bm25 ranking. Default value: 1.
	Bm25Boost float64 `json:"bm25_boost"`

	// Bm25Weight is the weight of the bm25 rank in the final rank.
	//   0: bm25 does not change the final rank.
	//   1: bm25 affects the final rank in the 0 - 100% range.
	Bm25Weight float64 `json:"bm25_weight"`

	// DistanceBoost is the boost of the search query term distance in the
	// found document. Default value: 1.
	DistanceBoost float64 `json:"distance_boost"`

	// DistanceWeight is the weight of the search query terms distance in the
	// found document in the final rank.
	//   0: distance does not change the final rank.
	//   1: distance affects the final rank in the 0 - 100% range.
	DistanceWeight float64 `json:"distance_weight"`

	// TermLenBoost is the boost of the search query term length.
	// Default value: 1.
	TermLenBoost float64 `json:"term_len_boost"`

	// TermLenWeight is the weight of the search query term length in the
	// final rank.
	//   0: term length does not change the final rank.
	//   1: term length affects the final rank in the 0 - 100% range.
	TermLenWeight float64 `json:"term_len_weight"`

	// PositionBoost is the boost of the search query term position.
	// Default value: 1.
	PositionBoost float64 `json:"position_boost"`

	// PositionWeight is the weight of the search query term position in the
	// final rank.
	//   0: term position does not change the final rank.
	//   1: term position affects the final rank in the 0 - 100% range.
	PositionWeight float64 `json:"position_weight"`

	// FullMatchBoost is the boost of a full match of the search phrase with
	// the document.
	FullMatchBoost float64 `json:"full_match_boost"`

	// PartialMatchDecrease is the relevancy step of a partial match:
	//   relevancy = kFullMatchProc - partialMatchDecrease * (non matched symbols) / (matched symbols)
	// For example, with partialMatchDecrease 15, the word 'terminator' in the
	// index and the pattern 'termin': 6 symbols matched, 4 unmatched,
	// relevancy = 100 - (15*4)/6 = 80.
	PartialMatchDecrease int `json:"partial_match_decrease"`

	// MinRelevancy is the minimum rank of found documents.
	MinRelevancy float64 `json:"min_relevancy"`

	// MaxTypos is the maximum number of possible typos in a word.
	//   0: typos are disabled, words with typos will not match.
	//   N: words with N possible typos will match.
	// Values range: [0,4]
	// Default: 2
	// Setting more than 2 possible typos is not recommended: it will
	// seriously increase RAM usage and decrease search speed.
	MaxTypos int `json:"max_typos"`

	// MaxTypoLen is the maximum word length for building and matching
	// variants with typos. Default value is 15.
	MaxTypoLen int `json:"max_typo_len"`

	// TyposDetailedConfig is the config for more precise tuning of the typos
	// algorithm.
	TyposDetailedConfig *FtTyposDetailedConfig `json:"typos_detailed_config,omitempty"`

	// MaxRebuildSteps is the maximum number of commit steps. Set it to 1 to
	// always perform a full rebuild. It can be from 1 to 500.
	MaxRebuildSteps int `json:"max_rebuild_steps"`

	// MaxStepSize is the maximum number of words in one commit. It can be
	// from 5 to DOUBLE_MAX.
	MaxStepSize int `json:"max_step_size"`

	// MergeLimit is the maximum number of documents which will be processed
	// when merging query results. Default value is 20000. Increasing this
	// value may refine the ranking of queries with high frequency words.
	MergeLimit int `json:"merge_limit"`

	// Stemmers is the list of used stemmers.
	Stemmers []string `json:"stemmers"`

	// EnableTranslit enables processing of translit variants.
	EnableTranslit bool `json:"enable_translit"`

	// EnableKbLayout enables processing of wrong keyboard layout variants.
	EnableKbLayout bool `json:"enable_kb_layout"`

	// StopWords is the list of stop word objects. Words from this list are
	// ignored when building indexes, but can be included in search results
	// for queries such as 'word*', 'word~' etc. if the is_morpheme attribute
	// of the stop word is true.
	// A list item can be either a reindexer.StopWord or a string.
	StopWords []interface{} `json:"stop_words"`

	// Synonyms is the list of synonyms for replacement.
	Synonyms []struct {
		// Tokens lists the source tokens in the query which will be replaced
		// with the alternatives.
		Tokens []string `json:"tokens"`

		// Alternatives lists the alternatives which will be used to search
		// documents.
		Alternatives []string `json:"alternatives"`
	} `json:"synonyms"`

	// LogLevel is the log level of the full text search engine.
	LogLevel int `json:"log_level"`

	// EnableNumbersSearch enables search by numbers as words and backwards.
	EnableNumbersSearch bool `json:"enable_numbers_search"`

	// EnableWarmupOnNsCopy enables automatic index warmup after an atomic
	// namespace copy on transaction.
	//
	// Deprecated: all of the fulltext indexes will perform commit/warmup
	// after copying a transaction.
	EnableWarmupOnNsCopy bool `json:"enable_warmup_on_ns_copy"`

	// ExtraWordSymbols are extra symbols which will be treated as parts of a
	// word in addition to letters and digits.
	ExtraWordSymbols string `json:"extra_word_symbols"`

	// SumRanksByFieldsRatio is the ratio of summation of the ranks of a match
	// of one term in several fields.
	SumRanksByFieldsRatio float64 `json:"sum_ranks_by_fields_ratio"`

	// MaxAreasInDoc is the max number of highlighted areas for each field in
	// each document (for snippet() and highlight()). '-1' means unlimited.
	MaxAreasInDoc int `json:"max_areas_in_doc"`

	// MaxTotalAreasToCache is the max total number of highlighted areas in a
	// ft result for which the result still remains cacheable. '-1' means
	// unlimited.
	MaxTotalAreasToCache int `json:"max_total_areas_to_cache"`

	// FieldsCfg is the configuration for certain fields.
	FieldsCfg []FtFastFieldConfig `json:"fields,omitempty"`

	// Optimization selects whether the index is optimized by memory or by
	// cpu. Default 'memory'.
	//   'memory': compressed vector of document identifiers
	//   'cpu': uncompressed vector of document identifiers
	// NOTE(review): DefaultFtFastConfig in this file sets "Memory" (capital
	// M); confirm whether the value is matched case-insensitively.
	Optimization string `json:"optimization,omitempty"`

	// EnablePreselectBeforeFt enables execution of other queries before the
	// ft query.
	EnablePreselectBeforeFt bool `json:"enable_preselect_before_ft"`

	// FtBaseRankingConfig is the config for the subterm rank multiplier.
	FtBaseRankingConfig *FtBaseRanking `json:"base_ranking,omitempty"`

	// Bm25Config is the config for document ranking.
	Bm25Config *Bm25ConfigType `json:"bm25_config,omitempty"`

	// SplitterType is the text tokenization algorithm. Default 'fast'.
	//   'fast': splits text by spaces, special characters and unsupported
	//           UTF-8 symbols. Each token is a combination of letters from
	//           the supported UTF-8 subset, numbers and extra word symbols.
	//   'mmseg_cn': algorithm based on friso mmseg for Chinese and English
	SplitterType string `json:"splitter,omitempty"`
}
// DefaultFtFastConfig returns an FtFastConfig populated with the default
// settings of the FullText search index.
//
// StopWords, Synonyms, FieldsCfg, EnableNumbersSearch, EnableWarmupOnNsCopy
// and SplitterType are not assigned here and keep their zero values. Every
// call allocates fresh nested configs and a fresh Stemmers slice, so the
// returned value can be modified without affecting other callers.
func DefaultFtFastConfig() FtFastConfig {
	var cfg FtFastConfig

	// Rank boosts and their weights in the final rank.
	cfg.Bm25Boost, cfg.Bm25Weight = 1.0, 0.1
	cfg.DistanceBoost, cfg.DistanceWeight = 1.0, 0.5
	cfg.TermLenBoost, cfg.TermLenWeight = 1.0, 0.3
	cfg.PositionBoost, cfg.PositionWeight = 1.0, 0.1
	cfg.FullMatchBoost = 1.1
	cfg.PartialMatchDecrease = 15
	cfg.MinRelevancy = 0.05
	cfg.SumRanksByFieldsRatio = 0.0

	// Typos handling.
	cfg.MaxTypos = 2
	cfg.MaxTypoLen = 15
	cfg.TyposDetailedConfig = &FtTyposDetailedConfig{
		MaxTypoDistance:              0,
		MaxSymbolPermutationDistance: 1,
		MaxMissingLetters:            2,
		MaxExtraLetters:              2,
	}

	// Index build and merge limits.
	cfg.MaxRebuildSteps = 50
	cfg.MaxStepSize = 4000
	cfg.MergeLimit = 20000

	// Text processing.
	cfg.Stemmers = []string{"en", "ru"}
	cfg.EnableTranslit = true
	cfg.EnableKbLayout = true
	cfg.ExtraWordSymbols = "/-+"

	// Highlighted areas limits.
	cfg.MaxAreasInDoc = 5
	cfg.MaxTotalAreasToCache = -1

	// Engine behaviour.
	cfg.LogLevel = 0
	cfg.Optimization = "Memory"
	cfg.EnablePreselectBeforeFt = false

	// Nested ranking configs.
	cfg.FtBaseRankingConfig = &FtBaseRanking{
		FullMatch:      100,
		PrefixMin:      50,
		SuffixMin:      10,
		Typo:           85,
		TypoPenalty:    15,
		StemmerPenalty: 15,
		Kblayout:       90,
		Translit:       90,
		Synonyms:       95,
	}
	cfg.Bm25Config = &Bm25ConfigType{
		Bm25k1:   2.0,
		Bm25b:    0.75,
		Bm25Type: "rx_bm25",
	}

	return cfg
}
// DefaultFtFastFieldConfig returns an FtFastFieldConfig for the field named
// fieldName, with every boost set to 1.0 and the bm25, term length and
// position weights set to 0.1, 0.3 and 0.1 respectively.
func DefaultFtFastFieldConfig(fieldName string) FtFastFieldConfig {
	cfg := FtFastFieldConfig{FieldName: fieldName}

	// Each pair is (boost, weight) for one ranking component.
	cfg.Bm25Boost, cfg.Bm25Weight = 1.0, 0.1
	cfg.TermLenBoost, cfg.TermLenWeight = 1.0, 0.3
	cfg.PositionBoost, cfg.PositionWeight = 1.0, 0.1

	return cfg
}