Skip to content

Commit 27035b0

Browse files
committed
show ambiguous genera if makes sense (close #113, close #114)
If ambiguous genera are found, unhide them if there are species of those genera in the text.
1 parent e8cd7a0 commit 27035b0

17 files changed

+235
-59
lines changed

CHANGELOG.md

+26-23
Original file line numberDiff line numberDiff line change
@@ -2,48 +2,51 @@
22

33
## Unreleased
44

5+
## [v0.18.1] - 2022-03-01 Tue
6+
7+
- Add [#114]: add an option to show ambiguous uninomials.
8+
- Add [#113]: show ambiguous genera, if there are species names with them.
9+
510
## [v0.18.0] - 2022-02-28 Mon
611

7-
Add [#117]: bring verificaton in sync with gnames v0.8.0
8-
Add [#116]: add --all-matches flag to show all verification results.
9-
Add: update input and output objects and REST API
12+
- Add [#117]: bring verification in sync with gnames v0.8.0
13+
- Add [#116]: add --all-matches flag to show all verification results.
14+
- Add: update input and output objects and REST API
1015
Introducing some backward incompatibility.
1116
See `https://apidoc.globalnames.org/gnfinder-beta`
1217

1318
## [v0.17.0] - 2022-01-06
1419

15-
Add [#111]: update bayes calculations.
16-
17-
Add [#110]: update verification process using most recent code.
20+
- Add [#111]: update bayes calculations.
21+
- Add [#110]: update verification process using most recent code.
1822
Stats for kingdoms distribution and the main clade that
1923
contains most of the names in the text.
2024
Verification JSON is not fully backward compatible.
21-
22-
Add [#109]: add classification path to CSV and TSV outputs.
25+
- Add [#109]: add classification path to CSV and TSV outputs.
2326

2427
## [v0.16.3] - 2021-10-31
2528

26-
Add: update dictionaries with Algaebase and fixes
29+
- Add: update dictionaries with Algaebase and fixes
2730

2831
## [v0.16.2] - 2021-10-28
2932

30-
Fix [#108]: remove confuxing red 'x' from web-UI results.
33+
- Fix [#108]: remove confusing red 'x' from web-UI results.
3134

3235
## [v0.16.1] - 2021-10-17
3336

34-
Add [#106]: Add API documentation.
37+
- Add [#106]: Add API documentation.
3538

3639
## [v0.16.0] - 2021-06-23
3740

38-
Add [#94]: Add web-based user interface.
41+
- Add [#94]: Add web-based user interface.
3942

4043
## [v0.15.5]
4144

42-
Add [#105]: Support for URL name-finding in REST API.
45+
- Add [#105]: Support for URL name-finding in REST API.
4346

4447
## [v0.15.4]
4548

46-
Add [#104]: merge petectLanguage to language. It allows to simplify logic for
49+
- Add [#104]: merge detectLanguage into language. It allows simplifying the logic for
4750
language settings. It also changes API signature for parameters.
4851
Now parameter "language" recognizes
4952

@@ -56,26 +59,26 @@ Add [#104]: merge petectLanguage to language. It allows to simplify logic for
5659

5760
## [v0.15.3]
5861

59-
Fix [#103]: remove conflict between language and detectlanguage parameters.
62+
- Fix [#103]: remove conflict between language and detectlanguage parameters.
6063

6164
## [v0.15.2]
6265

63-
Add: update Echo web framework to v4.5.0
64-
Fix [#102]: 'language' parameter for REST API.
66+
- Add: update Echo web framework to v4.5.0
67+
- Fix [#102]: 'language' parameter for REST API.
6568

6669
## [v0.15.1]
6770

68-
Fix [#101]: BOM interferes with offsets when -U flag is used.
71+
- Fix [#101]: BOM interferes with offsets when -U flag is used.
6972

7073
## [v0.15.0]
7174

72-
Add [#99]: add TSV format and make ouput format an option for REST API.
73-
Add: update modules
74-
Add: update Go to 1.17
75-
Add [#98]: an option to return names positions in bytes from the text start
75+
- Add [#99]: add TSV format and make output format an option for REST API.
76+
- Add: update modules
77+
- Add: update Go to 1.17
78+
- Add [#98]: an option to return names positions in bytes from the text start
7679
instead of UTF-8 characters.
7780

78-
Fix [#100]: fix csv/tsv fields number for verification
81+
- Fix [#100]: fix csv/tsv fields number for verification
7982

8083
## [v0.14.2]
8184

README.md

+8
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ settings in both the configuration file and from the flags.
164164
| TokensAround | GNF_TOKENS_AROUND |
165165
| VerifierURL | GNF_VERIFIER_URL |
166166
| WithAllMatches | GNF_WITH_ALL_MATCHES |
167+
| WithAmbiguousNames | GNF_WITH_AMBIGUOUS_NAMES |
167168
| WithBayesOddsDetails | GNF_WITH_BAYES_ODDS_DETAILS |
168169
| WithOddsAdjustment | GNF_WITH_ODDS_ADJUSTMENT |
169170
| WithPlainInput | GNF_WITH_PLAIN_INPUT |
@@ -253,6 +254,13 @@ echo "Pomatomus saltator and Parus major" | gnfinder -v -l eng -s "4,12"
253254
echo "Pomatomus saltator and Parus major" | gnfinder --verify --lang eng --sources "4,12"
254255
```
255256
257+
Preserve uninomial names that are also common words.
258+
259+
```bash
260+
echo "Cancer is a genus" | gnfinder -A
261+
echo "America is also a genus" | gnfinder --ambiguous-uninomials
262+
```
263+
256264
Show all matches, not only the best result.
257265
258266
```bash

config/config.go

+11
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ type Config struct {
6363
// WithAllMatches sets verification to return all found matches.
6464
WithAllMatches bool
6565

66+
// WithAmbiguousNames shows ambiguous uninomial names when true.
67+
WithAmbiguousNames bool
68+
6669
// WithBayes determines if both heuristic and Naive Bayes algorithms run
6770
// during the name-finding.
6871
// false - only heuristic algorithms run
@@ -179,6 +182,14 @@ func OptWithAllMatches(b bool) Option {
179182
}
180183
}
181184

185+
// OptWithAmbiguousNames returns an Option that sets the WithAmbiguousNames
186+
// field of Config to b, controlling whether ambiguous uninomials and genera are shown.
187+
func OptWithAmbiguousNames(b bool) Option {
188+
return func(cfg *Config) {
189+
cfg.WithAmbiguousNames = b
190+
}
191+
}
192+
182193
// OptWithBayes is an option that forces running bayes name-finding even when
183194
// the language is not supported by training sets.
184195
func OptWithBayes(b bool) Option {

ent/api/params.go

+4
Original file line numberDiff line numberDiff line change
@@ -56,4 +56,8 @@ type FinderParams struct {
5656
// WithAllMatches indicates that Verification results will return all
5757
// found results, not only the BestResult.
5858
WithAllMatches bool `json:"withAllMatches" form:"allMatches"`
59+
60+
// WithAmbiguousNames preserves detected ambiguous uninomials like `America`
61+
// or `Cancer`.
62+
WithAmbiguousNames bool `json:"withAmbiguousNames" form:"ambiguousNames"`
5963
}

ent/heuristic/heuristic.go

+25-9
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,28 @@ func exploreNameCandidate(ts []token.TokenSN, d *dict.Dictionary) bool {
2929

3030
u := ts[0]
3131

32-
if u.Features().UninomialDict == dict.WhiteUninomial ||
33-
(u.Indices().Species == 0 &&
34-
u.Features().UninomialDict == dict.WhiteGenus) {
32+
if u.Features().UninomialDict == dict.WhiteUninomial {
3533
u.SetDecision(token.Uninomial)
3634
return true
3735
}
3836

39-
if u.Indices().Species == 0 ||
40-
u.Features().UninomialDict == dict.BlackUninomial {
37+
if u.Features().UninomialDict == dict.GreyUninomial {
38+
u.SetDecision(token.PossibleUninomial)
39+
return true
40+
}
41+
42+
if u.Indices().Species == 0 {
43+
if u.Features().UninomialDict == dict.WhiteGenus {
44+
u.SetDecision(token.Uninomial)
45+
return true
46+
}
47+
if u.Features().UninomialDict == dict.GreyGenus {
48+
u.SetDecision(token.PossibleUninomial)
49+
return true
50+
}
51+
}
52+
53+
if u.Features().UninomialDict == dict.BlackUninomial {
4154
return false
4255
}
4356

@@ -53,7 +66,7 @@ func exploreNameCandidate(ts []token.TokenSN, d *dict.Dictionary) bool {
5366
return true
5467
}
5568

56-
func checkAsSpecies(t token.TokenSN, d *dict.Dictionary) bool {
69+
func checkAsSpecies(t token.TokenSN) bool {
5770
if !t.Features().IsCapitalized &&
5871
(t.Features().SpeciesDict == dict.WhiteSpecies ||
5972
t.Features().SpeciesDict == dict.GreySpecies) {
@@ -65,12 +78,15 @@ func checkAsSpecies(t token.TokenSN, d *dict.Dictionary) bool {
6578
func checkAsGenusSpecies(ts []token.TokenSN, d *dict.Dictionary) bool {
6679
g := ts[0]
6780
s := ts[g.Indices().Species]
68-
69-
if !checkAsSpecies(s, d) {
81+
if !checkAsSpecies(s) {
7082
if g.Features().UninomialDict == dict.WhiteGenus {
7183
g.SetDecision(token.Uninomial)
7284
return true
7385
}
86+
if g.Features().UninomialDict == dict.GreyGenus {
87+
g.SetDecision(token.PossibleUninomial)
88+
return true
89+
}
7490
return false
7591
}
7692

@@ -126,7 +142,7 @@ func checkInfraspecies(ts []token.TokenSN, d *dict.Dictionary) {
126142
s := ts[ts[0].Indices().Species]
127143
isp := ts[i]
128144

129-
if checkGreyGeneraIsp(g, s, isp, d) || checkAsSpecies(ts[i], d) {
145+
if checkGreyGeneraIsp(g, s, isp, d) || checkAsSpecies(ts[i]) {
130146
ts[0].SetDecision(token.Trinomial)
131147
}
132148
}

ent/output/output.go

+50-16
Original file line numberDiff line numberDiff line change
@@ -40,20 +40,6 @@ type Meta struct {
4040
// TotalSec is time spent for the whole process
4141
TotalSec float32 `json:"totalSec"`
4242

43-
// WithBayes use of bayes during name-finding
44-
WithBayes bool `json:"withBayes"`
45-
46-
// WithOddsAdjustment to adjust prior odds according to the dencity of
47-
// scientific names in the text.
48-
WithOddsAdjustment bool `json:"withOddsAdjustment"`
49-
50-
// WithPositionInBytes names get start/enc positionx in bytes
51-
// instead of UTF-8 chars.
52-
WithPositionInBytes bool `json:"withPositionInBytes"`
53-
54-
// WithVerification is true if results are checked by verification service.
55-
WithVerification bool `json:"withVerification"`
56-
5743
// WordsAround shows the number of tokens preserved before and after
5844
// a name-string candidate.
5945
WordsAround int `json:"wordsAround"`
@@ -64,8 +50,33 @@ type Meta struct {
6450
// LanguageDetected automatically for the text.
6551
LanguageDetected string `json:"languageDetected,omitempty"`
6652

53+
// WithAllMatches is true if all verification results are shown.
54+
WithAllMatches bool `json:"withAllMatches,omitempty"`
55+
56+
// WithAmbiguousNames is true if ambiguous uninomials are preserved.
57+
// Examples of ambiguous uninomial names are `Cancer`, `America`.
58+
WithAmbiguousNames bool `json:"withAmbiguousNames,omitempty"`
59+
60+
// WithUniqueNames is true when unique names are returned instead
61+
// of every occurrence of a name.
62+
WithUniqueNames bool `json:"withUniqueNames,omitempty"`
63+
64+
// WithBayes use of bayes during name-finding
65+
WithBayes bool `json:"withBayes,omitempty"`
66+
67+
// WithOddsAdjustment to adjust prior odds according to the density of
68+
// scientific names in the text.
69+
WithOddsAdjustment bool `json:"withOddsAdjustment,omitempty"`
70+
71+
// WithPositionInBytes names get start/end positions in bytes
72+
// instead of UTF-8 chars.
73+
WithPositionInBytes bool `json:"withPositionInBytes,omitempty"`
74+
75+
// WithVerification is true if results are checked by verification service.
76+
WithVerification bool `json:"withVerification,omitempty"`
77+
6778
// WithLanguageDetection sets automatic language determination.
68-
WithLanguageDetection bool `json:"withLanguageDetection"`
79+
WithLanguageDetection bool `json:"withLanguageDetection,omitempty"`
6980

7081
// TotalWords is a number of 'normalized' words in the text
7182
TotalWords int `json:"totalWords"`
@@ -115,11 +126,14 @@ type Name struct {
115126
Cardinality int `json:"cardinality"`
116127

117128
// Verbatim shows name the way it was in the text.
118-
Verbatim string `json:"verbatim"`
129+
Verbatim string `json:"verbatim,omitempty"`
119130

120131
// Name is a normalized version of a name.
121132
Name string `json:"name"`
122133

134+
// Decision about the quality of name detection.
135+
Decision token.Decision `json:"-"`
136+
123137
// Odds show a probability that name detection was correct.
124138
Odds float64 `json:"-"`
125139

@@ -183,6 +197,7 @@ func postprocessNames(
183197
// newOutput is a constructor for Output type.
184198
func newOutput(
185199
names []Name,
200+
genera map[string]struct{},
186201
ts []token.TokenSN,
187202
version string,
188203
cfg config.Config,
@@ -197,6 +212,9 @@ func newOutput(
197212
meta := Meta{
198213
Date: time.Now(),
199214
FinderVersion: version,
215+
WithAllMatches: cfg.WithAllMatches,
216+
WithAmbiguousNames: cfg.WithAmbiguousNames,
217+
WithUniqueNames: cfg.WithUniqueNames,
200218
WithBayes: cfg.WithBayes,
201219
WithOddsAdjustment: cfg.WithOddsAdjustment,
202220
WithVerification: cfg.WithVerification,
@@ -207,6 +225,9 @@ func newOutput(
207225
TotalNameCandidates: candidatesNum(ts),
208226
TotalNames: len(names),
209227
}
228+
if !cfg.WithAmbiguousNames {
229+
names = FilterNames(names, genera)
230+
}
210231

211232
if !cfg.WithBayesOddsDetails || cfg.WithOddsAdjustment {
212233
postprocessNames(names, meta.TotalNameCandidates, cfg)
@@ -216,3 +237,16 @@ func newOutput(
216237

217238
return o
218239
}
240+
241+
// FilterNames returns a new slice with the names from the input, in their
// original order, except that a name whose Decision is token.PossibleUninomial
// is dropped unless its Name is a key of genera. The input slice is not
// modified.
func FilterNames(names []Name, genera map[string]struct{}) []Name {
242+
res := make([]Name, 0, len(names))
243+
for i := range names {
244+
if names[i].Decision == token.PossibleUninomial {
245+
if _, ok := genera[names[i].Name]; !ok {
246+
continue
247+
}
248+
}
249+
res = append(res, names[i])
250+
}
251+
return res
252+
}

0 commit comments

Comments
 (0)