Skip to content

Commit e2d4ea1

Browse files
committed
optional find by nom. annotation (close #143)
1 parent e3123c9 commit e2d4ea1

File tree

10 files changed

+62
-21
lines changed

10 files changed

+62
-21
lines changed

cmd/flags.go

+7
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,13 @@ func allMatchesFlag(cmd *cobra.Command) {
9595
}
9696
}
9797

98+
func findByAnnotFlag(cmd *cobra.Command) {
99+
b, _ := cmd.Flags().GetBool("find-by-annotation")
100+
if b {
101+
opts = append(opts, config.OptWithFindByAnnotation(b))
102+
}
103+
}
104+
98105
func oddsDetailsFlag(cmd *cobra.Command) {
99106
b, _ := cmd.Flags().GetBool("details-odds")
100107
if b {

cmd/gnfinder.yml

+6
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,12 @@
7171
#
7272
# WithBayesOddsDetails: false
7373

74+
# WithFindByAnnotation allows detecting names by the existence of a
75+
# nomenclatural annotation. If it is true, dictionaries do not prevent
76+
# detection of a name.
77+
#
78+
# WithFindByAnnotation: false
79+
7480
# WithOddsAdjustment can be set to true to adjust calculated odds using the
7581
# ratio of scientific names found in text to the number of capitalized
7682
# words.

cmd/root.go

+14-4
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ type cfgData struct {
6565
WithAllMatches bool
6666
WithAmbiguousNames bool
6767
WithBayesOddsDetails bool
68+
WithFindByAnnotation bool
6869
WithOddsAdjustment bool
6970
WithPlainInput bool
7071
WithPositionInBytes bool
@@ -120,6 +121,7 @@ verification results.
120121
inputOnlyFlag(cmd)
121122
langFlag(cmd)
122123
allMatchesFlag(cmd)
124+
findByAnnotFlag(cmd)
123125
oddsDetailsFlag(cmd)
124126
plainInputFlag(cmd)
125127
sourcesFlag(cmd)
@@ -194,6 +196,9 @@ func init() {
194196
"show details of odds calculation.")
195197
rootCmd.Flags().StringP("verifier-url", "e", "",
196198
"custom URL for name-verification service.")
199+
rootCmd.Flags().BoolP("find-by-annotation", "F", false,
200+
`if there is a nomenclatural annotation ('sp. nov.' etc),
201+
a name will be detected.`)
197202
rootCmd.Flags().StringP("format", "f", "",
198203
`Format of the output: "compact", "pretty", "csv".
199204
compact: compact JSON,
@@ -272,6 +277,7 @@ func initConfig() {
272277
_ = viper.BindEnv("WithAmbiguousNames", "GNF_WITH_AMBIGUOUS_NAMES")
273278
_ = viper.BindEnv("WithAllMatches", "GNF_WITH_ALL_MATCHES")
274279
_ = viper.BindEnv("WithBayesOddsDetails", "GNF_WITH_BAYES_ODDS_DETAILS")
280+
_ = viper.BindEnv("WithFindByAnnotation", "GNF_WITH_FIND_BY_ANNOTATION")
275281
_ = viper.BindEnv("WithOddsAdjustment", "GNF_WITH_ODDS_ADJUSTMENT")
276282
_ = viper.BindEnv("WithPlainInput", "GNF_WITH_PLAIN_INPUT")
277283
_ = viper.BindEnv("WithPositionInBytes", "GNF_WITH_POSITION_IN_BYTES")
@@ -362,6 +368,14 @@ func getOpts() {
362368
opts = append(opts, config.OptWithBayesOddsDetails(true))
363369
}
364370

371+
if cfgCli.WithFindByAnnotation {
372+
opts = append(opts, config.OptWithFindByAnnotation(true))
373+
}
374+
375+
if cfgCli.WithOddsAdjustment {
376+
opts = append(opts, config.OptWithOddsAdjustment(true))
377+
}
378+
365379
if cfgCli.WithPlainInput {
366380
opts = append(opts, config.OptWithPlainInput(true))
367381
}
@@ -370,10 +384,6 @@ func getOpts() {
370384
opts = append(opts, config.OptWithPositonInBytes(true))
371385
}
372386

373-
if cfgCli.WithOddsAdjustment {
374-
opts = append(opts, config.OptWithOddsAdjustment(true))
375-
}
376-
377387
if cfgCli.WithUniqueNames {
378388
opts = append(opts, config.OptWithUniqueNames(true))
379389
}

pkg/config/config.go

+13
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,11 @@ type Config struct {
7575
// WithBayesOddsDetails show in detail how odds are calculated.
7676
WithBayesOddsDetails bool
7777

78+
// WithFindByAnnotation allows detecting names by the existence of a
79+
// nomenclatural annotation. If it is true, dictionaries do not prevent
80+
// detection of a name.
81+
WithFindByAnnotation bool
82+
7883
// WithOddsAdjustment can be set to true to adjust calculated odds using the
7984
// ratio of scientific names found in text to the number of capitalized
8085
// words.
@@ -208,6 +213,14 @@ func OptWithBayesOddsDetails(b bool) Option {
208213
}
209214
}
210215

216+
// OptWithFindByAnnotation is an option that allows detecting names solely by their
217+
// nomenclatural annotation.
218+
func OptWithFindByAnnotation(b bool) Option {
219+
return func(cfg *Config) {
220+
cfg.WithFindByAnnotation = b
221+
}
222+
}
223+
211224
// OptWithOddsAdjustment is an option that triggers recalculation of prior odds
212225
// using number of found names divided by number of all name candidates.
213226
func OptWithOddsAdjustment(b bool) Option {

pkg/config/config_test.go

+17-12
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,17 @@ func TestConfig(t *testing.T) {
1616

1717
t.Run("returns new Config object", func(t *testing.T) {
1818
cfg := config.New()
19-
assert.Equal(t, cfg.Language, lang.English)
20-
assert.Equal(t, cfg.LanguageDetected, "")
21-
assert.Equal(t, cfg.TokensAround, 0)
19+
assert.Equal(t, lang.English, cfg.Language)
20+
assert.Equal(t, "", cfg.LanguageDetected)
21+
assert.Equal(t, 0, cfg.TokensAround)
2222
assert.True(t, cfg.WithBayes)
2323
assert.False(t, cfg.WithPositionInBytes)
2424
})
2525

2626
t.Run("takes language", func(t *testing.T) {
2727
cfg := config.New(config.OptLanguage(lang.English))
28-
assert.Equal(t, cfg.Language, lang.English)
29-
assert.Equal(t, cfg.LanguageDetected, "")
28+
assert.Equal(t, lang.English, cfg.Language)
29+
assert.Equal(t, "", cfg.LanguageDetected)
3030
})
3131

3232
t.Run("sets bayes", func(t *testing.T) {
@@ -41,19 +41,24 @@ func TestConfig(t *testing.T) {
4141

4242
t.Run("sets tokens number", func(t *testing.T) {
4343
cfg := config.New(config.OptTokensAround(4))
44-
assert.Equal(t, cfg.TokensAround, 4)
44+
assert.Equal(t, 4, cfg.TokensAround)
45+
})
46+
47+
t.Run("sets find by annotation", func(t *testing.T) {
48+
cfg := config.New(config.OptWithFindByAnnotation(true))
49+
assert.Equal(t, true, cfg.WithFindByAnnotation)
4550
})
4651

4752
t.Run("does not set 'bad' tokens number", func(t *testing.T) {
4853
cfg := config.New(config.OptTokensAround(-1))
49-
assert.Equal(t, cfg.TokensAround, 0)
54+
assert.Equal(t, 0, cfg.TokensAround)
5055
cfg = config.New(config.OptTokensAround(10))
51-
assert.Equal(t, cfg.TokensAround, 5)
56+
assert.Equal(t, 5, cfg.TokensAround)
5257
})
5358

5459
t.Run("sets bayes' threshold", func(t *testing.T) {
5560
cfg := config.New(config.OptBayesOddsThreshold(200))
56-
assert.Equal(t, cfg.BayesOddsThreshold, 200.0)
61+
assert.Equal(t, 200.0, cfg.BayesOddsThreshold)
5762
})
5863

5964
t.Run("sets several options", func(t *testing.T) {
@@ -62,7 +67,7 @@ func TestConfig(t *testing.T) {
6267
config.OptLanguage(lang.German),
6368
}
6469
cfg := config.New(opts...)
65-
assert.Equal(t, cfg.Language, lang.German)
70+
assert.Equal(t, lang.German, cfg.Language)
6671
assert.True(t, cfg.WithBayes)
6772
})
6873

@@ -81,11 +86,11 @@ func TestConfig(t *testing.T) {
8186

8287
for _, v := range tests {
8388
l, err := lang.New(v.lang)
84-
assert.Equal(t, err != nil, v.hasErr, v.msg)
89+
assert.Equal(t, v.hasErr, err != nil, v.msg)
8590
langOpt := config.OptLanguage(l)
8691
opts := []config.Option{langOpt}
8792
cfg := config.New(opts...)
88-
assert.Equal(t, cfg.Language, v.langCfg, v.msg)
93+
assert.Equal(t, v.langCfg, cfg.Language, v.msg)
8994
}
9095
})
9196
}

pkg/ent/heuristic/heuristic.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import (
1111
// tokens and sets up token's indices. Indices determine if a token is a
1212
// potential uninomial, binomial or trinomial. Then it fills out a significant
1313
// number of features pertaining to the token.
14-
func TagTokens(ts []token.TokenSN, d *dict.Dictionary) {
14+
func TagTokens(ts []token.TokenSN, d *dict.Dictionary, withAnnot bool) {
1515
l := len(ts)
1616

1717
for i := range ts {

pkg/ent/heuristic/heuristic_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ func TestHeuristic(t *testing.T) {
1717
randomly... Pardosa is a very nice when it is not sad. Drosophila
1818
(Sophophora) melanogaster disagrees!`)
1919
ts := token.Tokenize(txt)
20-
heuristic.TagTokens(ts, dictionary)
20+
heuristic.TagTokens(ts, dictionary, false)
2121
tests := map[int]struct {
2222
name string
2323
decision token.Decision

pkg/ent/nlp/bayes_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ cheilum, 1 5s. per doz.
2626
Conostylis americana, 2i. 6d.
2727
`)
2828
tokens := token.Tokenize(txt)
29-
heuristic.TagTokens(tokens, dictionary)
29+
heuristic.TagTokens(tokens, dictionary, false)
3030
nb := weights[lang.English]
3131

3232
tkn := tokens[10]

pkg/gnfinder.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ func (gnf gnfinder) Find(file, txt string) output.Output {
6464
gnf.Language, gnf.LanguageDetected = lang.DetectLanguage(text)
6565
}
6666

67-
heuristic.TagTokens(tokens, gnf.Dictionary)
67+
heuristic.TagTokens(tokens, gnf.Dictionary, gnf.WithFindByAnnotation)
6868
if gnf.WithBayes {
6969
nb := gnf.bayesWeights[gnf.Language]
7070
nlp.TagTokens(tokens, gnf.Dictionary, nb, gnf.BayesOddsThreshold)

tools/training/trainer.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ func processText(t *TextData, d *dict.Dictionary) []feature.ClassFeatures {
113113
var lfs, lfsText []feature.ClassFeatures
114114
var nd NameData
115115
ts := token.Tokenize(t.Text)
116-
heuristic.TagTokens(ts, d)
116+
heuristic.TagTokens(ts, d, false)
117117
l := len(t.NamesPositions)
118118
var nameIdx, i int
119119
for {

0 commit comments

Comments
 (0)