Skip to content

Commit 5c39f08

Browse files
author
Ben King
committed
Ignore rarely-occurring quotation marks during quote convention detection
1 parent 852ea41 commit 5c39f08

File tree

1 file changed

+32
-0
lines changed

1 file changed

+32
-0
lines changed

machine/punctuation_analysis/preliminary_quotation_mark_analyzer.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,31 @@
1111
from .verse import Verse
1212

1313

14+
class QuotationMarkCounter:
    """Tally how often each quotation mark occurs and flag rarely-occurring ones.

    A mark is treated as negligible when it accounts for less than
    ``_NEGLIGIBLE_PROPORTION_THRESHOLD`` of all quotation marks counted since
    the last call to ``reset()``.
    """

    # Marks that make up less than this fraction of all counted marks are negligible.
    _NEGLIGIBLE_PROPORTION_THRESHOLD = 0.01

    def __init__(self):
        self.reset()

    def reset(self) -> None:
        """Discard every count accumulated so far."""
        self._quotation_mark_counts: Dict[str, int] = defaultdict(int)
        self._total_quotation_mark_count: int = 0

    def count_quotation_marks(self, quotation_marks: List[QuotationMarkStringMatch]) -> None:
        """Add each match in ``quotation_marks`` to the per-mark and overall tallies."""
        for quotation_mark_match in quotation_marks:
            mark: str = quotation_mark_match.quotation_mark
            self._quotation_mark_counts[mark] += 1
            self._total_quotation_mark_count += 1

    def is_quotation_mark_proportion_negligible(self, quotation_mark: str) -> bool:
        """Return True if ``quotation_mark`` is too rare to be considered.

        Returns True when nothing has been counted yet, or when the mark's
        share of all counted marks is strictly below the threshold. A mark
        that has never been counted has a share of zero.
        """
        if self._total_quotation_mark_count == 0:
            return True
        # Use .get() instead of indexing: indexing a defaultdict inserts the
        # missing key, so a pure query would otherwise grow the tally table.
        occurrences = self._quotation_mark_counts.get(quotation_mark, 0)
        proportion = occurrences / self._total_quotation_mark_count
        return proportion < self._NEGLIGIBLE_PROPORTION_THRESHOLD
37+
38+
1439
class ApostropheProportionStatistics:
1540
def __init__(self):
1641
self.reset()
@@ -260,11 +285,13 @@ def __init__(self, quote_conventions: QuoteConventionSet):
260285
self._quote_conventions = quote_conventions
261286
self._apostrophe_analyzer = PreliminaryApostropheAnalyzer()
262287
self._quotation_mark_sequences = QuotationMarkSequences()
288+
self._quotation_mark_counts = QuotationMarkCounter()
263289
self.reset()
264290

265291
def reset(self) -> None:
    """Reset the apostrophe analyzer, the sequence tracker and the mark counter, in that order."""
    resettable_parts = (
        self._apostrophe_analyzer,
        self._quotation_mark_sequences,
        self._quotation_mark_counts,
    )
    for part in resettable_parts:
        part.reset()
268295

269296
def narrow_down_possible_quote_conventions(self, chapters: List[Chapter]) -> QuoteConventionSet:
270297
for chapter in chapters:
@@ -281,6 +308,7 @@ def _analyze_quotation_marks_for_verse(self, verse: Verse) -> None:
281308
).find_all_potential_quotation_marks_in_verse(verse)
282309
self._analyze_quotation_mark_sequence(quotation_marks)
283310
self._apostrophe_analyzer.process_quotation_marks(verse.text_segments, quotation_marks)
311+
self._quotation_mark_counts.count_quotation_marks(quotation_marks)
284312

285313
def _analyze_quotation_mark_sequence(self, quotation_marks: List[QuotationMarkStringMatch]) -> None:
286314
quotation_mark_grouper: QuotationMarkGrouper = QuotationMarkGrouper(quotation_marks, self._quote_conventions)
@@ -304,6 +332,8 @@ def _find_opening_quotation_marks(self) -> List[str]:
304332
]
305333

306334
def _is_opening_quotation_mark(self, quotation_mark: str) -> bool:
335+
if self._quotation_mark_counts.is_quotation_mark_proportion_negligible(quotation_mark):
336+
return False
307337
if self._apostrophe_analyzer.is_apostrophe_only(quotation_mark):
308338
return False
309339

@@ -323,6 +353,8 @@ def _find_closing_quotation_marks(self) -> List[str]:
323353
]
324354

325355
def _is_closing_quotation_mark(self, quotation_mark: str) -> bool:
356+
if self._quotation_mark_counts.is_quotation_mark_proportion_negligible(quotation_mark):
357+
return False
326358
if self._apostrophe_analyzer.is_apostrophe_only(quotation_mark):
327359
return False
328360

0 commit comments

Comments
 (0)