-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathfulltext.go
429 lines (404 loc) · 14.7 KB
/
fulltext.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
package main
import (
"flag"
"fmt"
"os"
"path/filepath"
"regexp"
"strings"
"github.com/pkg/profile"
)
// orgPatStr is the source of one case-insensitive, multi-line regular
// expression. Each alternative matches a single org-mode element starting at
// the beginning of a line, and every match ends with the element's newline or
// at the end of a line. Alternatives are tried in the order listed.
const (
	orgPatStr = "" +
		"(?im)^(" +
		"\\*.*" + // headline
		"|((?s)[ \\t]*#\\+begin_.*?\\n#\\+end_[^\\n]*)" + // block
		"|((?s)[ \\t]*#\\+begin:.*?\\n#\\+end:[ \\t]*)" + // dynblock
		"|[ \\t]*#\\+[a-z0-9_]+:.*" + // keyword
		"|((?s)[ \\t]*:[a-z0-9_]+[ \\t]*:.*?\\n:end:[ \\t]*)" + // drawer
		"|[ \\t]*\\[fn:[^]].*" + // footnote
		"|([ \\t]*[-+]|[ \\t]+\\*|[ \\t]*([0-9]+|[a-z])\\.)([ \t].*|$)" + // list item
		"|[ \\t]*(\\|.*\\||\\+-.*\\+).*" + // table line
		"|[ \\t]*-----+" + // hr
		// The closing \end{ may be indented by any amount, including none.
		// Requiring exactly one blank here kept ordinary environments, whose
		// \end{ starts in column 0, from ever matching.
		"|((?s)[ \\t]*\\\\begin\\{.*?\\n[ \\t]*\\\\end\\{).*" + // latex env
		"|^[ \\t]*" + // blank line
		")(\\n|$)" +
		""
)
// Error codes, numbered consecutively from 1 (ERROR) to 6 (ERROR_DB_MISSING).
// NOTE(review): presumably these are process exit codes passed to exitError;
// their use is not visible in this part of the file -- confirm.
const (
ERROR = iota + 1
ERROR_FILE_MISSING
ERROR_FILE_CHANGED
ERROR_FILE_UNREADABLE
ERROR_FILE_NOT_IN_DB
ERROR_DB_MISSING
)
// Org-mode element types returned by classify and orgPart, numbered from 0
// (PARAGRAPH). The types slice lists the same names in the same order, so the
// two declarations have to be kept in step.
const (
PARAGRAPH = iota
HEADLINE
BLOCK
DYNBLOCK
KEYWORD
DRAWER
FOOTNOTE
LIST_ITEM
TABLE_LINE
HR
LATEXT_ENV
BLANK
)
// gram holds up to three characters packed as digits in base GRAM_BASE.
// Three base-37 digits fit into the two bytes of a uint16.
type gram uint16
// Digit layout used by gramForChar and charForGram: 0 stands for any character
// that is neither a letter nor a decimal digit, GRAM_ZERO..GRAM_ZERO+9 stand
// for '0'..'9' and GRAM_A..GRAM_A+25 stand for the letters, ignoring case.
const (
GRAM_ZERO = gram(1)
GRAM_A = gram(11)
GRAM_BASE = gram(GRAM_A + 26)
GRAM_2_BASE = gram(GRAM_BASE * GRAM_BASE)
GRAM_3_BASE = gram(GRAM_2_BASE * GRAM_BASE)
)
// types holds the printable name of every element type, in the same order as
// the element type constants (PARAGRAPH first, BLANK last).
var types = []string{
"PARAGRAPH",
"HEADLINE",
"BLOCK",
"DYNBLOCK",
"KEYWORD",
"DRAWER",
"FOOTNOTE",
"LIST_ITEM",
"TABLE_LINE",
"HR",
"LATEXT_ENV",
"BLANK",
}
// Patterns that recognize org-mode elements, compiled once at start-up.
// TODO: remove the leading [ \t]* from the per-line patterns; classify tests
// most of them against a line whose leading blanks were already trimmed.

// orgPat is the whole-element pattern built from orgPatStr.
var orgPat = compile(orgPatStr)
// blockPat captures the block name, including its leading underscore, in group 1.
var blockPat = compile("^(?i)[ \\t]*#\\+begin(_[a-z0-9_]+)")
var dynblockPat = compile("^(?i)[ \\t]*#\\+begin:")
var keywordPat = compile("^[ \\t]*#\\+[a-zA-Z0-9_]+:")
var drawerPat = compile("^(?i)[ \\t]*:[a-z0-9_]+[ \\t]*:")
var footnotePat = compile("^(?i)[ \\t]*\\[fn:[^]]")
// NOTE(review): the counter is [0-9]* here but [0-9]+ in orgPatStr, so a bare
// "." followed by a blank also counts as a list item -- confirm which is intended.
var listItemPat = compile("^(?i)([ \\t]*[+-]|[ \\t]+\\*|[ \\t]*([0-9]*|[a-z])\\.)([ \\t]|$)")
var tableLinePat = compile("^(?i)[ \\t]*(\\||\\+-)")
var hrPat = compile("^(?i)[ \\t]*-----+")
// NOTE(review): this pattern needs a newline to match, but classify is only
// handed single lines in the code visible here -- confirm it can ever match.
var latexEnvPat = compile("^(?i)[ \\t]*\\\\begin\\{.*?\\n[ \\t]\\\\end\\{")
// NOTE(review): blankLine is not referenced in the visible part of this file.
var blankLine = compile("^[ \\t]*\\n$")
// member is the empty value stored for every key of a gram set (see addGrams).
var member struct{}
// NOTE(review): fileID, genSQL and customFts are not referenced in the visible
// part of this file; their purpose cannot be confirmed from here.
var fileID = 0
var genSQL = true
// diag is set by the -v (verbose) command-line flag in main.
var diag = false
var customFts = false
// compile parses str as a regular expression and returns the compiled
// pattern. It panics with the parse error when str is not a valid expression,
// so it is only suitable for patterns that are fixed at build time.
func compile(str string) *regexp.Regexp {
	pat, err := regexp.Compile(str)
	if err != nil {
		panic(err)
	}
	return pat
}
// forParts walks str one org-mode element at a time, as delimited by orgPart,
// and calls code(line, typ, start, end) for each of them:
//
//	line  is the 1-based line number on which the element starts,
//	typ   is the element type constant,
//	start is the byte offset of the element in str,
//	end   is the byte offset just past the element once its trailing spaces,
//	      tabs and newlines have been dropped.
func forParts(str string, code func(int, int, int, int)) {
	line, prev := 1, 0
	for pos := 0; pos < len(str); {
		typ, start, end := orgPart(pos, str)
		// Advance the line counter by the newlines between the previous
		// element's start and this one's.
		line += strings.Count(str[prev:start], "\n")
		trimmedEnd := start + len(strings.TrimRight(str[start:end], " \t\n"))
		code(line, typ, start, trimmedEnd)
		prev = start
		pos = end
	}
}
// orgPart finds the next org-mode element of str at or after byte offset pos
// and returns (typ, start, end): the element type and the half-open byte range
// str[start:end] it occupies.
//
// Consecutive paragraph lines are merged into one PARAGRAPH, which ends at the
// start of the first line that is not paragraph text. Blank lines before an
// element are skipped. A block, dynblock or drawer extends through the line
// that equals its end marker (ignoring case and surrounding blanks) or to the
// end of input when the marker is missing. A list item also takes the blank
// lines and the more deeply indented paragraph lines that follow it; its end
// is the start of the first line that does not belong to it. Every other
// element ends at the newline that terminates its line. The returned end never
// exceeds len(str).
//
// When nothing but blank lines remain the result is BLANK, len(str), len(str).
func orgPart(pos int, str string) (int, int, int) {
	para := pos // start of the paragraph being accumulated
	lineEnd := 0
	for ; pos < len(str); pos += lineEnd + 1 {
		curStr := str[pos:]
		lineEnd = strings.IndexByte(curStr, '\n')
		if lineEnd < 0 {
			lineEnd = len(curStr)
		}
		typ, blockEnd := classify(curStr[:lineEnd])
		if typ == LIST_ITEM {
			item := curStr[:lineEnd]
			offset := len(item) - len(strings.TrimLeft(item, " \t"))
			// nextStr starts on the item's own newline, so the first pass
			// sees an empty "line" and merely steps over that newline.
			nextStr := curStr[lineEnd:]
			listEnd := lineEnd
			for {
				nextEnd := strings.IndexByte(nextStr, '\n')
				if nextEnd < 0 {
					nextEnd = len(nextStr)
				}
				nextItem := nextStr[:nextEnd]
				itemType, _ := classify(nextItem)
				if itemType == PARAGRAPH {
					// Only text indented deeper than the bullet continues the item.
					nextOffset := len(nextItem) - len(strings.TrimLeft(nextItem, " \t"))
					if nextOffset <= offset {
						break
					}
				} else if itemType != BLANK {
					break
				}
				listEnd += nextEnd + 1
				if nextEnd+1 >= len(nextStr) {
					break
				}
				nextStr = nextStr[nextEnd+1:]
			}
			// The scan counts one newline per line, including a last line that
			// has none, so clamp to keep the end inside the input.
			if listEnd > len(curStr) {
				listEnd = len(curStr)
			}
			lineEnd = listEnd
		} else if blockEnd != "" { // move lineEnd to the end of the block
			// The loop condition also covers an opening line that is the last
			// line of the input: there is nothing left to scan.
			for lineEnd < len(curStr) {
				sub := curStr[lineEnd+1:]
				subEnd := strings.IndexByte(sub, '\n')
				if subEnd < 0 {
					subEnd = len(sub)
				}
				trimmed := strings.ToLower(strings.Trim(sub[:subEnd], " \t"))
				lineEnd += 1 + subEnd
				if trimmed == blockEnd {
					break
				}
			}
		}
		if typ != PARAGRAPH {
			// A non-paragraph line closes any paragraph collected so far; the
			// element itself is returned by the next call.
			if para < pos {
				return PARAGRAPH, para, pos
			}
			if typ != BLANK {
				return typ, pos, pos + lineEnd
			}
			para = pos + lineEnd + 1
		} else if pos-para > 1 && str[pos-2:pos] == "\n\n" {
			// A paragraph line directly after an empty line starts a new paragraph.
			return PARAGRAPH, para, pos
		}
	}
	if para < len(str) {
		return PARAGRAPH, para, len(str)
	}
	return BLANK, len(str), len(str)
}
// classify determines which org-mode element the single line curStr starts.
// It returns the element type and, for elements that span several lines
// (blocks, dynblocks and drawers), the lower-case text of the line that closes
// them; the second result is "" for every other type.
//
// A line that starts with "#+" but is neither a block opener, a dynblock
// opener nor a keyword is ordinary paragraph text.
func classify(curStr string) (int, string) {
	trimmed := strings.TrimLeft(curStr, " \t")
	switch {
	case trimmed == "":
		return BLANK, ""
	case curStr[0] == '*':
		// Only a star in column 0 makes a headline; an indented star is a bullet.
		return HEADLINE, ""
	case trimmed[0] == '[' && footnotePat.MatchString(trimmed):
		return FOOTNOTE, ""
	case len(trimmed) > 4 && strings.HasPrefix(trimmed, "-----") && hrPat.MatchString(trimmed):
		return HR, ""
	case len(trimmed) > 7 && strings.HasPrefix(trimmed, "\\begin") && latexEnvPat.MatchString(trimmed):
		// NOTE(review): latexEnvPat requires a newline, which a single line
		// cannot contain, so this case looks unreachable -- confirm.
		return LATEXT_ENV, ""
	case trimmed[0] == ':' && drawerPat.MatchString(trimmed):
		return DRAWER, ":end:"
	case len(trimmed) > 2 && strings.HasPrefix(trimmed, "#+"):
		if match := blockPat.FindStringSubmatch(trimmed); match != nil {
			// match[1] already starts with the underscore ("_src"), so it is
			// appended to "#+end" directly; adding another "_" produced
			// "#+end__src", which no closing line ever equals.
			return BLOCK, "#+end" + strings.ToLower(match[1])
		}
		if dynblockPat.MatchString(trimmed) {
			return DYNBLOCK, "#+end:"
		}
		if keywordPat.MatchString(trimmed) {
			return KEYWORD, ""
		}
		return PARAGRAPH, ""
	// The list pattern is tested against the untrimmed line: its "blanks then
	// star" alternative needs the leading indentation to recognize "  * item".
	case listItemPat.MatchString(curStr):
		return LIST_ITEM, ""
	case tableLinePat.MatchString(trimmed):
		return TABLE_LINE, ""
	}
	return PARAGRAPH, ""
}
// check does nothing for a nil err. For any other error it panics with a
// message made of the error text and the non-flag command-line arguments.
func check(err error) {
	if err == nil {
		return
	}
	panic(fmt.Sprintf("Error: %s, args: %v", err.Error(), flag.Args()))
}
// grams returns the set of grams found in args. A gram is three base-37
// digits, which fits into two bytes.
//
// With partial set, every argument is scanned on its own in partial mode.
// Otherwise the arguments are joined with single spaces, padded with one space
// on each side, and scanned as a single string.
func grams(partial bool, args ...string) map[gram]struct{} {
	set := map[gram]struct{}{}
	if !partial {
		addGrams(false, " "+strings.Join(args, " ")+" ", set)
		return set
	}
	for i := range args {
		addGrams(true, args[i], set)
	}
	return set
}
// addGrams scans str and adds the grams it contains to result. A gram is three
// base-37 digits, which fits into two bytes.
//
// Each character becomes one digit through gramForChar; digit 0 is a
// separator. A rolling value keeps the three most recent digits, and it is
// restarted from the next non-separator character whenever its last digit is
// a separator, so runs of separators collapse into one.
//
// With partial set, only grams whose three digits are all non-separators are
// recorded. Otherwise every rolling value of at least GRAM_BASE is recorded.
func addGrams(partial bool, str string, result map[gram]struct{}) {
	var grm gram
	for _, c := range str {
		v := gramForChar(c)
		if grm%GRAM_BASE == 0 {
			if v == 0 {
				continue // don't append more than one space
			}
			grm = v // starting a word
		} else {
			// Shift in the new digit using int arithmetic so the product
			// cannot wrap around in uint16 before the modulus is taken.
			grm = gram(((int(grm) * int(GRAM_BASE)) + int(v)) % int(GRAM_3_BASE))
		}
		full := grm/GRAM_2_BASE != 0 && grm/GRAM_BASE%GRAM_BASE != 0 && grm%GRAM_BASE != 0
		if partial && !full {
			continue
		}
		if grm < GRAM_BASE {
			continue // don't track grams with two leading spaces
		}
		result[grm] = member
	}
}
// gramString renders grm as three characters, most significant digit first,
// using charForGram for each base-GRAM_BASE digit.
func gramString(grm gram) string {
	digits := [3]gram{
		(grm / GRAM_2_BASE) % GRAM_BASE,
		(grm / GRAM_BASE) % GRAM_BASE,
		grm % GRAM_BASE,
	}
	var out [3]byte
	for i, d := range digits {
		out[i] = charForGram(d)
	}
	return string(out[:])
}
// charForGram returns the display character for a single gram digit: '.' for
// the separator digit 0, '0'..'9' for digits below GRAM_A, and upper-case
// letters counted from 'A' for the rest.
func charForGram(grm gram) byte {
	switch {
	case grm == 0:
		return '.'
	case grm < GRAM_A:
		return '0' + byte(grm-GRAM_ZERO)
	default:
		return 'A' + byte(grm-GRAM_A)
	}
}
// gramForChar maps a character to its gram digit. Decimal digits map to
// GRAM_ZERO and up, ASCII letters of either case map to GRAM_A and up, and
// every other character maps to the separator digit 0.
func gramForChar(c rune) gram {
	switch {
	case c >= '0' && c <= '9':
		return gram(c-'0') + GRAM_ZERO
	case c >= 'A' && c <= 'Z':
		return gram(c-'A') + GRAM_A
	case c >= 'a' && c <= 'z':
		return gram(c-'a') + GRAM_A
	default:
		return 0
	}
}
// gramForUnicode converts a three-byte string into its gram value. A string of
// any other length is reported through exitError with exit code 1.
//
// Bytes are folded in with the same rule addGrams uses: a separator is
// skipped while the value's last digit is a separator, and the first
// non-separator after that restarts the value.
func gramForUnicode(str string) gram {
	if len(str) != 3 {
		exitError(fmt.Sprintf("Unicode gram is not a trigram: '%s'", str), 1)
	}
	var grm gram
	for i := 0; i < 3; i++ {
		digit := gramForChar(rune(str[i]))
		if grm%GRAM_BASE != 0 {
			grm = ((grm * GRAM_BASE) + digit) % GRAM_3_BASE
			continue
		}
		if digit != 0 { // starting a word
			grm = digit
		}
	}
	return grm
}
// exitError prints arg on its own line to standard error and ends the process
// with the given exit code. It does not return.
func exitError(arg interface{}, code int) {
	fmt.Fprintf(os.Stderr, "%v\n", arg)
	os.Exit(code)
}
// main is the command-line entry point. It prints the usage text and exits
// when no arguments are given. Otherwise it registers the flags, parses them
// from os.Args[2:] (os.Args[1] is the subcommand shown in the usage text) and
// calls runLmdb; a false result from runLmdb prints the usage text and exits.
//
// With -prof, runLmdb is run 1000 times under the profiler instead and its
// result is ignored.
func main() {
	//testGrams()
	//testNums()
	if len(os.Args) == 1 {
		usage()
	}
	flag.Usage = printUsage
	prof := false
	flag.BoolVar(&prof, "prof", false, "profile cpu")
	flag.BoolVar(&diag, "v", false, "verbose")
	flag.IntVar(&lmdbConfig.gramSize, "s", 0, "gram size")
	flag.StringVar(&lmdbConfig.delimiter, "d", ",", "delimiter for unicode tags")
	flag.BoolVar(&lmdbConfig.gramHex, "gx", false, "use hex instead of unicode for grams")
	flag.BoolVar(&lmdbConfig.gramDec, "gd", false, "use decimal instead of unicode for grams")
	flag.BoolVar(&lmdbConfig.dataHex, "dx", false, "use hex instead of unicode for object data")
	flag.StringVar(&lmdbConfig.dataString, "data", "", "data to define for object")
	flag.BoolVar(&lmdbConfig.candidates, "candidates", false, "return docs with grams for search")
	flag.BoolVar(&lmdbConfig.separate, "sep", false, "print candidates on separate lines")
	flag.BoolVar(&lmdbConfig.numbers, "n", false, "only print line numbers for search")
	flag.IntVar(&lmdbConfig.limit, "limit", maxInt, "search: limit the number of results")
	flag.BoolVar(&lmdbConfig.org, "org", false, "index org-mode chunks instead of lines")
	flag.BoolVar(&lmdbConfig.sexp, "sexp", false, `search: output matches as an s-expression ((FILE (POS LINE OFFSET chunk) ... ) ... )
POS is the 1-based character position of the chunk in the file
LINE is the 1-based line of the chunk in the file
OFFSET is the 0-based offset of the first match in the chunk`)
	flag.BoolVar(&lmdbConfig.partial, "partial", false, "search: allow partial matches in search")
	flag.BoolVar(&lmdbConfig.force, "f", false, "search: skip changed and missing files instead of exiting")
	flag.BoolVar(&lmdbConfig.test, "t", false, "update: do a test run, printing what would have happened")
	flag.BoolVar(&lmdbConfig.autoupdate, "u", false, "search: update the database before searching")
	flag.Float64Var(&lmdbConfig.fuzzy, "fuzzy", 0, "search: specify a percentage fuzzy match")
	flag.StringVar(&lmdbConfig.compression, "comp", "", "compression type to use when creating a database")
	flag.BoolVar(&lmdbConfig.groups, "groups", false, "info: display information for each group")
	flag.StringVar(&lmdbConfig.filter, "filter", "", "search: filter results that match REGEXP")
	flag.BoolVar(&lmdbConfig.chunks, "chunks", false, "info DB GROUP: display all of a group's chunks")
	flag.BoolVar(&lmdbConfig.file, "file", false, "search: display files rather than chunks")
	flag.BoolVar(&lmdbConfig.sort, "sort", false, `search -fuzzy: sort all matches
This ignores start-format and end-format because it sorts all matches, regardless of
which file they come from.`)
	flag.StringVar(&lmdbConfig.format, "format", lineFormat, `search: Go format string for each result
Args to printf are FILE POSITION LINE OFFSET PERCENTAGE CHUNK
FILE (string) is the name of the file
POSITION (int) is the 1-based character position of the chunk in the file
LINE (int) is the 1-based line of the chunk in the file
OFFSET (int) is the 0-based offset of the first match in the chunk
PERCENTAGE (float) is the percentage of a fuzzy match
Note that you can place [ARGNUM] after the % to pick a particular arg to format
The default format is %s:%[2]s:%[5]s\n
-sexp sets format to (:filename "%s" :line %[3]d :offset %[4]d :text "%[6]s" :percent %[5]f)
Note that this will cause all matches to be on one (potentially large) line of output`)
	flag.StringVar(&lmdbConfig.startFormat, "start-format", groupStart, `search: Go format string for the start of a group
Arg to printf is the FILE
The default value is ""
Not used with search -fuzzy -sort`)
	flag.StringVar(&lmdbConfig.endFormat, "end-format", groupEnd, `search: Go format string for the end of a group
Arg to printf is the FILE
The default value is ""
if -sexp is provided and -end-format is not, the default is "\n"
Not used with search -fuzzy -sort`)
	// The "get:" line previously read "specify tags for intead of text".
	flag.BoolVar(&lmdbConfig.grams, "grams", false, "get: specify tags instead of text\n"+
		"info: print gram coverage\n"+
		"search: specify grams instead of search terms")
	// Flags follow the subcommand, so parsing starts after os.Args[1].
	flag.CommandLine.Parse(os.Args[2:])
	if prof {
		defer profile.Start().Stop()
		//defer profile.Start(profile.MemProfile).Stop()
		//defer profile.Start(profile.TraceProfile).Stop()
		for i := 0; i < 1000; i++ {
			runLmdb()
		}
	} else if !runLmdb() {
		usage()
	}
}
// usage prints the usage text through printUsage and then ends the process
// with exit code 1. It does not return.
func usage() {
	printUsage()
	os.Exit(1)
}
// printUsage writes the usage text for every subcommand, followed by the
// default values of all registered flags, to the flag package's output.
//
// The program name in the text is the base name of os.Args[0]. If the
// absolute path of os.Args[0] cannot be determined, a placeholder containing
// the raw value is shown instead.
func printUsage() {
	prog, err := filepath.Abs(os.Args[0])
	if err != nil {
		prog = fmt.Sprintf("<BAD PROGRAM PATH: %s>", os.Args[0])
	} else {
		prog = filepath.Base(prog)
	}
	fmt.Fprintf(flag.CommandLine.Output(),
		`Usage:
%[1]s info -groups DB
print information about each group in the database,
whether it is missing or changed
whether it is an org-mode entry
%[1]s info [-chunks] DB GROUP
print info for a GROUP
-chunks also prints the chunks in GROUP if it has a corresponding file
%[1]s info [-grams] DB
print info for database
displays any groups which do not exist as files
displays any groups which refer to files that have changed
-grams displays distribution information about the trigram index
%[1]s create [-s GRAMSIZE] DB
create DATABASE if it does not exist
%[1]s chunk [-nx | -data D | -dx] -d DELIM DB GROUP GRAMS
%[1]s chunk [-nx | -data D | -dx] -gx DB GROUP GRAMS
ADD a chunk to GROUP with GRAMS.
-d means use DELIM to split GRAMS.
-gx means GRAMS is hex encoded with two bytes for each gram using base 37.
%[1]s grams [-gx] CHUNK
output grams for CHUNK
%[1]s input [-nx | -dx | -org] DB FILE...
For each FILE, create a group with its name and add a CHUNK for each chunk of input.
Chunk data is the line number, offset, and length for each chunk (starting at 1).
-org means chunks are org elements, otherwise chunks are lines
%[1]s delete [-nx] DB GROUP
delete GROUP, its chunks, and tag entries.
NOTE: THIS DOES NOT RECLAIM SPACE! USE COMPACT FOR THAT
%[1]s compact DB
Reclaim space for deleted groups
%[1]s search [-n | -partial | -f | -limit N | -filter REGEXP | -u] DB TEXT ...
query with TEXT for objects
-f force search to skip changed and missing files instead of exiting
-filter makes search only return chunks that match the REGEXP
REGEXP syntax is here: https://golang.org/pkg/regexp/syntax/
%[1]s search -candidates [-grams | -gx | -gd | -n | -f | -limit N | -dx | -u] DB TERM1 ...
display all candidates with the grams for TERMS without filtering
-grams indicates TERMS are grams, otherwise extract grams from TERMS
-gx: grams are in hex, -gd: grams are in decimal, otherwise they are 3-char strings
%[1]s data [-nx | -dx] DB GROUP
get data for each doc in GROUP
%[1]s update [-t] DB
reinput files that have changed
delete files that have been removed
-t means do a test run, printing what would have happened
%[1]s empty DB GROUP...
Create empty GROUPs, ignoring existing ones
%[1]s is targeted for groups of small documents, like lines in a file.
`, prog)
	flag.PrintDefaults()
}