Skip to content

Commit

Permalink
✨ Add english addresses tagging
Browse files Browse the repository at this point in the history
  • Loading branch information
HugoPerrier committed Jan 15, 2024
1 parent e540b0f commit bc40331
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 3 deletions.
15 changes: 12 additions & 3 deletions melusine/processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1478,9 +1478,10 @@ def SIGNATURE(self) -> str | list[str] | re.Pattern:
r"Chief",
r"VP",
r"C.O",
r"(Sales)? Representative",
]
job_regex = r"\b(" + r"|".join(jobs) + r")\b"
line_with_known_job = rf"(?:^ *.{{,5}}{job_regex}( +{self.word_block(6)})?(?:\n+|$))"
line_with_known_job = rf"(?:^ *.{{,5}}{self.word_block(1)}{job_regex}( +{self.word_block(6)})?(?:\n+|$))"

# Street address regex
street_word_list = [
Expand All @@ -1500,13 +1501,21 @@ def SIGNATURE(self) -> str | list[str] | re.Pattern:
r"r[ée]sidence",
r"rue",
r"sentier",
# English
r"st\.?",
r"street",
r"ln\.?",
r"lane",
r"rd\.?",
r"road",
r"hill",
]
street_word_pattern = "(" + "|".join(street_word_list) + ")"
street_word_pattern = r"\b(" + "|".join(street_word_list) + r")\b"

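# Street address line: a house number or number range (e.g. "4-6"), optionally followed by a
# suffix such as "bis"/"ter" and a comma, then up to 2 free words, a street-type word
# (rue, allée, street, road, etc.), up to 5 more free words (space- or hyphen-separated),
# and up to 2 trailing free characters before the end of the line.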
street_address_regex = (
r"^ *\d+(?:-\d+)?(?:bis|ter)?,? +(\w+\b *){,2}\b" + street_word_pattern + r"\b *(\w+\b[ -]*){,5}.{,2}$"
r"^ *\d+(?:-\d+)?(?:bis|ter|b)?,? +(\w+\b *){,2}\b" + street_word_pattern + r"\b *(\w+\b[ -]*){,5}.{,2}$"
)

# Email address
Expand Down
23 changes: 23 additions & 0 deletions tests/processors/test_content_tagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -621,6 +621,29 @@ def test_tag_text_french(text, expected_tags):
],
id="english job signature patterns",
),
pytest.param(
(
"9 downing street\n"
"4-6 Beverly Hill\n"
"4 Abbey road W24RA\n"
"3 Ocean Rd.\n"
"5th avenue\n"
"221b Baker St.\n"
"6bis River ln.\n"
"7 Winter lane\n"
),
[
("SIGNATURE", "9 downing street"),
("SIGNATURE", "4-6 Beverly Hill"),
("SIGNATURE", "4 Abbey road W24RA"),
("SIGNATURE", "3 Ocean Rd."),
("SIGNATURE", "5th avenue"),
("SIGNATURE", "221b Baker St."),
("SIGNATURE", "6bis River ln."),
("SIGNATURE", "7 Winter lane"),
],
id="english adsress signature patterns",
),
],
)
def test_tag_text_english(text, expected_tags):
Expand Down

0 comments on commit bc40331

Please sign in to comment.