Skip to content

Commit 3ab06f1

Browse files
committed
use bloom filters for dictionaries (close #115)
1 parent 27035b0 commit 3ab06f1

18 files changed

+227
-23
lines changed

go.mod

+9-2
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ go 1.17
55
require (
66
github.com/abadojack/whatlanggo v1.0.1
77
github.com/aclements/perflock v0.0.0-20180319185109-8402f33a418d
8+
github.com/devopsfaith/bloomfilter v1.4.0
89
github.com/gnames/bayes v0.4.0
910
github.com/gnames/gndoc v0.3.1
1011
github.com/gnames/gner v0.1.4
@@ -16,6 +17,7 @@ require (
1617
github.com/labstack/echo/v4 v4.6.3
1718
github.com/maxbrunsfeld/counterfeiter/v6 v6.4.1
1819
github.com/rendon/testcli v1.0.0
20+
github.com/rs/zerolog v1.26.1
1921
github.com/spf13/cobra v1.3.0
2022
github.com/spf13/viper v1.10.1
2123
github.com/stretchr/testify v1.7.0
@@ -41,13 +43,18 @@ require (
4143
github.com/modern-go/reflect2 v1.0.2 // indirect
4244
github.com/pelletier/go-toml v1.9.4 // indirect
4345
github.com/pmezard/go-difflib v1.0.0 // indirect
44-
github.com/rs/zerolog v1.26.1 // indirect
45-
github.com/sirupsen/logrus v1.8.1 // indirect
4646
github.com/spf13/afero v1.8.1 // indirect
4747
github.com/spf13/cast v1.4.1 // indirect
4848
github.com/spf13/jwalterweatherman v1.1.0 // indirect
4949
github.com/spf13/pflag v1.0.5 // indirect
5050
github.com/subosito/gotenv v1.2.0 // indirect
51+
github.com/tmthrgd/atomics v0.0.0-20180217065130-6910de195248 // indirect
52+
github.com/tmthrgd/go-bitset v0.0.0-20180828125936-62ad9ed7ff29 // indirect
53+
github.com/tmthrgd/go-bitwise v0.0.0-20170218093117-01bef038b6bd // indirect
54+
github.com/tmthrgd/go-byte-test v0.0.0-20170223110042-2eb5216b83f7 // indirect
55+
github.com/tmthrgd/go-hex v0.0.0-20180828131331-d1fb3dbb16a1 // indirect
56+
github.com/tmthrgd/go-memset v0.0.0-20180828131805-6f4e59bf1e1d // indirect
57+
github.com/tmthrgd/go-popcount v0.0.0-20180111143836-3918361d3e97 // indirect
5158
github.com/valyala/bytebufferpool v1.0.0 // indirect
5259
github.com/valyala/fasttemplate v1.2.1 // indirect
5360
golang.org/x/crypto v0.0.0-20220214200702-86341886e292 // indirect

go.sum

+168-21
Large diffs are not rendered by default.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

tools/bloom/README.md

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# bloom script
2+
3+
This script generates bloom filters that can be used as a smaller and faster
4+
substitute for dictionaries. Bloom filters generated with this script
5+
are then compiled into the binaries, while dictionaries themselves are
6+
discarded from the final binaries.
7+
8+
## Usage
9+
10+
```bash
11+
go run ./...
12+
```

tools/bloom/main.go

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
package main
2+
3+
import (
4+
"sync"
5+
6+
baseBloomfilter "github.com/devopsfaith/bloomfilter/bloomfilter"
7+
"github.com/rs/zerolog/log"
8+
)
9+
10+
// dataPath and bloomPath are relative directory paths under io/dict.
// NOTE(review): presumably dataPath is where the source dictionaries are read
// from and bloomPath is where generated filters are written; their use is not
// visible in this view — confirm against createFilters.
var dataPath = "../../io/dict/data"
11+
var bloomPath = "../../io/dict/bloom"
12+
13+
// filter bundles a bloom filter from the devopsfaith/bloomfilter package with
// a name, a path, a size, and a mutex.
// NOTE(review): the code that populates and uses these fields is not visible
// here; the per-field notes below are inferred from names only — confirm.
type filter struct {
14+
// presumably the dictionary identifier (e.g. "bad/species") — TODO confirm
name string
15+
// presumably a file path associated with this filter — TODO confirm
path string
16+
// the underlying bloom filter instance
filter *baseBloomfilter.Bloomfilter
17+
// presumably the number of items in the filter — TODO confirm
size int
18+
// presumably guards concurrent access to this struct — TODO confirm; a struct
// holding a sync.Mutex must not be copied, so pass it by pointer
mux sync.Mutex
19+
}
20+
21+
// main logs a start message and then calls createFilters once for each
// dictionary name in the hard-coded list below, in list order.
// NOTE(review): createFilters is defined outside this view; it returns nothing
// that is checked here, so any error handling must happen inside it — confirm.
func main() {
22+
log.Info().Msg("Creating bloom filters")
23+
// Dictionary names, each of the form "<group>/<kind>".
items := []string{
24+
"bad/uninomials",
25+
"bad/species",
26+
"common/eu",
27+
"ambig/genera",
28+
"ambig/genera_species",
29+
"ambig/species",
30+
"ambig/uninomials",
31+
"good/genera",
32+
"good/species",
33+
"good/uninomials",
34+
}
35+
// Process the dictionaries sequentially, one call per name.
for _, v := range items {
36+
createFilters(v)
37+
}
38+
}

0 commit comments

Comments
 (0)