Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(tokenizers): discard citation from nominative reporter on overlap #237

Merged
merged 1 commit into from
Mar 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ Changes:
- None

Fixes:
- None
- Prefer the other full citation when it overlaps with a nominative reporter
citation #237


## Current
Expand Down
56 changes: 54 additions & 2 deletions eyecite/tokenizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,46 @@
EXTRACTORS = []
EDITIONS_LOOKUP = defaultdict(list)

# Short names of reporters that are matched against the reporter of a
# citation token to decide whether it came from a nominative reporter.
# Kept in alphabetical order for easier maintenance.
NOMINATIVE_REPORTER_NAMES = {
    "Bee",
    "Chase",
    "Cooke",
    "Deady",
    "Gilmer",
    "Holmes",
    "Olcott",
    "Taney",
    "Thompson",
}


def token_is_from_nominative_reporter(token: Token) -> bool:
    """Return True if the token is a citation from a nominative reporter.

    A cleaner way to do this would be via an attribute or named group from
    reporters-db (e.g. the `volume_nominative` / `reporter_nominative`
    groups). However, that tagging is currently not complete, so this check
    relies on `NOMINATIVE_REPORTER_NAMES`, a list of the most problematic
    reporter names.

    :param token: the token to inspect
    :return: True if the token is a `CitationToken` and the `short_name` of
        the reporter of its first edition (exact editions are preferred over
        variation editions) is in `NOMINATIVE_REPORTER_NAMES`; False
        otherwise, including for non-citation tokens and for tokens that
        carry no edition at all
    """
    if not isinstance(token, CitationToken):
        return False
    # Prefer exact edition matches; fall back to variation matches.
    editions = token.exact_editions or token.variation_editions
    if not editions:
        # Nothing to inspect: answer False rather than raise IndexError.
        return False
    return editions[0].reporter.short_name in NOMINATIVE_REPORTER_NAMES


def _populate_reporter_extractors():
"""Populate EXTRACTORS and EDITIONS_LOOKUP."""
Expand Down Expand Up @@ -313,8 +353,19 @@ def tokenize(self, text: str) -> Tuple[Tokens, List[Tuple[int, Token]]]:
if merged:
continue
if offset > token.start:
# skip overlaps
continue
if (
last_token
and isinstance(token, CitationToken)
and token_is_from_nominative_reporter(last_token)
):
# if a token has overlapping matches between a nominative
# reporter and another type of case citation, prefer the
# other case citation. See #221 and #174
citation_tokens.pop(-1)
all_tokens.pop(-1)
else:
# skip overlaps
continue
if offset < token.start:
# capture plain text before each match
self.append_text(all_tokens, text[offset : token.start])
Expand All @@ -326,6 +377,7 @@ def tokenize(self, text: str) -> Tuple[Tokens, List[Tuple[int, Token]]]:
# capture plain text after final match
if offset < len(text):
self.append_text(all_tokens, text[offset:])

return all_tokens, citation_tokens

def get_extractors(self, text: str):
Expand Down
49 changes: 49 additions & 0 deletions tests/test_FindTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -878,6 +878,55 @@ def test_disambiguate_citations(self):
]
self.run_test_pairs(test_pairs, "Disambiguation")

def test_nominative_reporter_overlaps(self):
    """Check full citations that follow a nominative-reporter look-alike.

    Every sample text contains a name that looks like a nominative
    reporter, followed by a citation. The first citation returned by
    `get_citations` must equal the expected citation object.
    """
    # Maps each sample text to the citation expected as the first result.
    expected_by_text = {
        "In re Cooke, 93 Wn. App. 526, 529": case_citation(
            volume="93", reporter="Wn. App.", page="526"
        ),
        "Shapiro v. Thompson, 394 U. S. 618": case_citation(
            volume="394", reporter="U. S.", page="618"
        ),
        "MacArdell v. Olcott, 82 N.E. 161": case_citation(
            volume="82", reporter="N.E.", page="161"
        ),
        "Connecticut v. Holmes, 221 A.3d 407": case_citation(
            volume="221", reporter="A.3d", page="407"
        ),
        "Kern v Taney, 11 Pa. D. & C.5th 558 [2010])": case_citation(
            volume="11", reporter="Pa. D. & C.5th", page="558"
        ),
        "Ellenburg v. Chase, 2004 MT 66": case_citation(
            volume="2004", reporter="MT", page="66"
        ),
        "Gilmer, 500 U.S. at 25;": case_citation(
            volume="500", reporter="U. S.", page="25", short=True
        ),
        "Bison Bee, 778 F. 13 App’x at 73.": case_citation(
            volume="778", reporter="F.", page="13"
        ),
    }
    for text, expected in expected_by_text.items():
        found = get_citations(text)
        parsed_cite = found[0]
        self.assertEqual(
            parsed_cite,
            expected,
            f"Nominative reporters getting in the way of parsing: {parsed_cite}",
        )

def test_custom_tokenizer(self):
extractors = []
for e in EXTRACTORS:
Expand Down
Loading