Skip to content

Commit bd2ae47

Browse files
committed
func_new as alternative, some changes there
1 parent 97f181e commit bd2ae47

File tree

2 files changed

+340
-0
lines changed

2 files changed

+340
-0
lines changed

funcs_new/clean.py

+141
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,141 @@
1+
import re
2+
3+
from . import re_patterns as repat
4+
5+
6+
class CleanText:
    """Text-cleaning steps built on translation tables and the patterns in ``re_patterns``.

    Every step takes a string and returns a new string. ``clean`` chains the
    steps in a fixed order; each step can be switched off with a flag.
    """

    # Tables
    TBL_UMLAUTE: dict = {'ä': 'ae', 'Ä': 'Ae', 'ö': 'oe', 'Ö': 'Oe', 'ü': 'ue', 'Ü': 'Ue', 'ß': 'ss'}
    TBL_ACCENTS: dict = {'à': 'a', 'á': 'a', 'â': 'a', 'ã': 'a', 'å': 'a', 'À': 'A', 'Á': 'A', 'Â': 'A',
                         'ç': 'c',
                         'è': 'e', 'é': 'e', 'ê': 'e', 'ë': 'e', 'ȩ': 'e', 'È': 'E', 'É': 'E', 'Ê': 'E', 'Ë': 'E',
                         'ì': 'i', 'í': 'i', 'î': 'i', 'ï': 'i', 'Î': 'I', 'Ï': 'I',
                         'ò': 'o', 'ó': 'o', 'ô': 'o', 'õ': 'o', 'Ô': 'O', 'Ò': 'O', 'Ó': 'O',
                         'ù': 'u', 'ú': 'u', 'û': 'u', 'Ù': 'U', 'Û': 'U'}
    TBL_CURRENCY: dict = {'€': 'EUR', '$': 'USD', '£': 'GBP', '¥': 'JPY'}
    # Code point -> code point. The low double quote „ is a *double* quotation
    # mark, so it maps to '"' like its closing partner “.
    TBL_FANCY_QUOTATION_MARKS: dict[int, int] = {
        ord(x): ord(y) for x, y in
        [("ʼ", "'"), ("‘", "'"), ("’", "'"), ("´", "'"), ("`", "'"), ("“", '"'), ("”", '"'), ("„", '"')]}

    # Substitutions: Replace special character with value in respective TBL
    def sub_umlaute(self, text: str) -> str:
        """Replace German umlauts and ß with their ASCII transliteration (``TBL_UMLAUTE``)."""
        table = str.maketrans(self.TBL_UMLAUTE)
        return text.translate(table)

    def sub_accent_chars(self, text: str) -> str:
        """Replace accented letters with their base letter (``TBL_ACCENTS``)."""
        table = str.maketrans(self.TBL_ACCENTS)
        return text.translate(table)

    def sub_currency(self, text: str) -> str:
        """ Function makes space between curr, amount and unit (i.e. EUR200M -> EUR 200 M) to improve tokenization """
        table = str.maketrans(self.TBL_CURRENCY)
        text = text.translate(table)
        return repat.RE_CURR_AMOUNT.sub(
            repl=lambda m: m.group("curr") + ' ' + m.group("amount") + ' ' + m.group("unit"),
            string=text)

    def sub_measurements(self, text: str) -> str:
        """ Function makes space between amount and unit (i.e. 200km -> 200 km) to improve tokenization """
        return repat.RE_MEASUREMENTS.sub(
            repl=lambda m: m.group("amount") + ' ' + m.group("unit"), string=text)

    def sub_fancy_quot_marks(self, text: str) -> str:
        """Replace typographic quotation marks with plain ``'`` / ``"``."""
        # The table is already keyed by code point, so it can be used directly.
        return text.translate(self.TBL_FANCY_QUOTATION_MARKS)

    def rem_quot_marks(self, text: str) -> str:
        """Remove every match of ``RE_QUOTATION_MARKS`` from the text."""
        return repat.RE_QUOTATION_MARKS.sub(repl="", string=text)

    def rem_brackets(self, text: str, curly: bool = True, round: bool = False, square: bool = True) -> str:
        """Remove bracketed spans (brackets and content) for the enabled bracket kinds.

        ``round`` shadows the builtin of the same name; the parameter name is
        kept because callers may pass it by keyword.
        """
        if curly:
            text = repat.RE_BRACKETS_CURLY_AND_CONTENT.sub(repl="", string=text)
        if round:
            text = repat.RE_BRACKETS_ROUND_AND_CONTENT.sub(repl="", string=text)
        if square:
            text = repat.RE_BRACKETS_SQUARE_AND_CONTENT.sub(repl="", string=text)
        return text

    def rem_strange_chars(self, text: str) -> str:
        """Blank out, remove or normalise characters matched by the 'CHARS' patterns."""
        text = repat.RE_SUPER_SUB_SCRIPTED_NUMBERS.sub(repl=" ", string=text)
        text = repat.RE_SUSPICIOUS_CHARS.sub(repl=" ", string=text)
        # Group 1 is the line start / newline plus indentation in front of the
        # bullet. It must survive, because RE_LISTING_SENTS later requires a
        # newline directly before the listing marker.
        text = repat.RE_BULLET_POINTS.sub(repl=lambda m: m.group(1) + "*", string=text)
        text = repat.RE_UNICODE_SYMBOLS.sub(repl="", string=text)
        text = repat.RE_STRANGE_DASHES.sub(repl="-", string=text)
        return text

    def rem_datetime_only_lines(self, text: str) -> str:
        """Remove matches of the location/date line pattern and of the date/time-only line pattern."""
        text = repat.RE_LOCATION_DATE_TIMEOPTIONAL.sub(repl="", string=text)
        text = repat.RE_DATE_TIME_ONLY_LINES.sub(repl="", string=text)
        return text

    def rem_address_info(self, text: str) -> str:
        """Remove HTTP links (with an optional 'Link:'-style prefix) and e-mail addresses."""
        text = repat.RE_HTTP_LINKS.sub(repl="", string=text)
        text = repat.RE_EMAIL.sub(repl="", string=text)
        return text

    def mark_end_of_sentence(self, text: str, period: str = ". ") -> str:
        """ An end of sentence pattern can be manifold. Multiple patterns are thus (probably) required. """
        text = repat.RE_HEADLINE.sub(repl=lambda m: m.group("headline") + period, string=text)
        text = repat.RE_SECTION_HEADER.sub(repl=lambda m: m.group("sectionheader") + period, string=text)
        return text

    def split_listed_sentences(self, text: str, end_of_sent_char: str = ". ") -> str:
        """Turn list items matched by ``RE_LISTING_SENTS`` into sentence-like chunks.

        For each match the listing marker (group ``hyphen``) is replaced by
        ``end_of_sent_char``, or by nothing when the match directly follows
        the previous one (or starts the text) or is preceded by ``.``/``:``.
        ``end_of_sent_char`` is appended unless the item already ends in
        ``.``/``:``. Best effort: on any error the input is returned unchanged.
        """
        sentence_end_chars = (".", ":")
        try:
            pieces: list[str] = []
            prev_end = 0
            for match in repat.RE_LISTING_SENTS.finditer(text):
                listing = match.group('listing')
                marker = match.group('hyphen')
                if match.start() == prev_end or match.group('beforechar') in sentence_end_chars:
                    marker_replacement = ""
                else:
                    marker_replacement = end_of_sent_char
                if match.group('afterchar') in sentence_end_chars:
                    tail = ""
                else:
                    tail = end_of_sent_char
                # Replace the marker by position. A plain str.replace would
                # also hit every identical substring inside the item, e.g.
                # the "o " in "go to" when the bullet is "o ".
                offset = match.start('hyphen') - match.start('listing')
                cleaned = listing[:offset] + marker_replacement + listing[offset + len(marker):] + tail
                # Splice by match span so identical listings elsewhere in
                # the text are not rewritten twice.
                pieces.append(text[prev_end:match.start()])
                pieces.append(cleaned)
                prev_end = match.end()
            pieces.append(text[prev_end:])
            return "".join(pieces)
        except Exception:
            # Deliberately best effort, but no longer silent and no longer
            # swallowing KeyboardInterrupt/SystemExit like a bare except.
            import logging
            logging.getLogger(__name__).exception("split_listed_sentences failed; returning text unchanged")
            return text

    def rem_repeating_chars(self, text: str) -> str:
        """Collapse each ``RE_REPEATING_CHARS`` match to its first character plus one space."""
        text = repat.RE_REPEATING_CHARS.sub(
            repl=lambda m: m.group(0)[0] + " ", string=text)
        return text

    def rem_whitespace(self, text: str) -> str:
        """
        Remove zero-width spaces, replace runs of line breaks and runs of other whitespace with a
        single space, drop whitespace directly before a period, then strip leading/trailing whitespace.
        """
        text = repat.RE_ZERO_WIIDTH_SPACE.sub(repl="", string=text)
        text = repat.RE_LINEBREAK.sub(repl=r" ", string=text)
        text = repat.RE_NONBREAKING_SPACE.sub(repl=" ", string=text)
        text = repat.RE_SPACE_BEFORE_SENT_END.sub(repl="", string=text)
        return text.strip()

    def clean(self, text: str,
              sub_umlaute: bool = True, sub_accent_chars: bool = True, sub_curr: bool = True, sub_measure: bool = True,
              sub_fancy_quot_marks: bool = True, rem_brackets: bool = True, rem_quot_marks: bool = True,
              rem_strange_chars: bool = True, mark_end_of_sent: bool = True, split_listed_sents: bool = True,
              rem_dt_only_lines: bool = True,
              rem_address_info: bool = True, rem_repeat_chars: bool = True, rem_whitespace: bool = True) -> str:
        """Run the enabled cleaning steps in a fixed order and return the result.

        The order matters: quotation marks are removed only after fancy marks
        were normalised, repeating characters are collapsed second to last and
        whitespace is normalised last.
        """
        text = self.sub_umlaute(text=text) if sub_umlaute else text
        text = self.sub_accent_chars(text=text) if sub_accent_chars else text
        text = self.sub_currency(text=text) if sub_curr else text
        text = self.sub_measurements(text=text) if sub_measure else text
        text = self.sub_fancy_quot_marks(text=text) if sub_fancy_quot_marks else text
        text = self.rem_quot_marks(text=text) if rem_quot_marks else text  # Should be after sub_fancy_quot_marks !!!
        text = self.rem_strange_chars(text=text) if rem_strange_chars else text
        text = self.rem_datetime_only_lines(text=text) if rem_dt_only_lines else text
        text = self.rem_brackets(text=text) if rem_brackets else text
        text = self.rem_address_info(text=text) if rem_address_info else text
        text = self.mark_end_of_sentence(text=text) if mark_end_of_sent else text
        text = self.split_listed_sentences(text=text) if split_listed_sents else text
        text = self.rem_repeating_chars(text=text) if rem_repeat_chars else text  # rem_repeating_chars: do second last
        text = self.rem_whitespace(text=text) if rem_whitespace else text  # rem_whitespace: do last !!!
        return text

funcs_new/re_patterns.py

+199
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,199 @@
1+
import re
2+
from typing import Pattern
3+
4+
""" BRACKETS """
5+
# "*_AND_CONTENT" patterns match one innermost bracket pair including its
# content (nested pairs need repeated substitution); the short variants match
# the bracket characters only.
RE_BRACKETS_CURLY_AND_CONTENT: Pattern = re.compile(r"\{[^{}]*?\}")
RE_BRACKETS_ROUND_AND_CONTENT: Pattern = re.compile(r"\([^()]*?\)")
RE_BRACKETS_SQUARE_AND_CONTENT: Pattern = re.compile(r"\[[^\[\]]*?\]")
RE_BRACKETS_CURLY: Pattern = re.compile(r"[{}]")
RE_BRACKETS_ROUND: Pattern = re.compile(r"[()]")
RE_BRACKETS_SQUARE: Pattern = re.compile(r"[\[\]]")
11+
12+
""" LINEBREAKS, SPACES """
13+
# One or more consecutive line breaks (CRLF, LF or vertical tab).
RE_LINEBREAK: Pattern = re.compile(r"(\r\n|[\n\v])+")
# RE_NO_PERIOD_NOR_HYPHEN_THEN_TWO_EMPTY_LINES: Pattern = re.compile(r"(?<![.-])(?:\r\n|[\n\v]){2,}")
# Despite the name this matches runs of ANY whitespace except \n and \v.
RE_NONBREAKING_SPACE: Pattern = re.compile(r"[^\S\n\v]+")
# Zero-width space, word joiner and BOM / zero-width no-break space.
RE_ZERO_WIIDTH_SPACE: Pattern = re.compile(r"[\u200B\u2060\uFEFF]+")
# Correctly spelled alias; the misspelled name above is kept for existing callers.
RE_ZERO_WIDTH_SPACE: Pattern = RE_ZERO_WIIDTH_SPACE
# Whitespace directly in front of a period.
RE_SPACE_BEFORE_SENT_END: Pattern = re.compile(r"\s+(?=\.)")

# Text from the very start (at most 200 chars) up to the first line end that
# is followed by at least two newlines.
RE_HEADLINE: Pattern = re.compile(r"(?P<headline>\A\b(?:.|\n){,200}?$)(?=\n{2,})",
                                  flags=re.MULTILINE)
# A line without ., ! or ? that has two newlines before it and at least two after it.
RE_SECTION_HEADER: Pattern = re.compile(r"(?<!\A)(?<=\n{2})(?P<sectionheader>^\b[^.!?]*?$)(?=\n{2,})",
                                        flags=re.MULTILINE)
23+
24+
""" LISTINGS """
25+
num_of_required_hyphens = 2  # NOTE(review): not referenced by the pattern below; kept for importers
hyphen_chars = '-*o'  # The chars therein must be present in function: RE_BULLET_POINTS.sub(repl="*", string=text)
# One list item: leading non-word characters (first one captured as
# "beforechar"), the listing marker at the start of a line ("hyphen"), the
# item text, and an optional closing ., ! or ? ("afterchar").
# NOTE(review): the look-ahead for the next item requires exactly one blank
# before the marker, while "hyphen" allows any number; confirm this is intended.
# (This pattern used to be compiled twice with identical text; once is enough.)
RE_LISTING_SENTS: Pattern = re.compile(
    fr"(?P<listing>(?:(?P<beforechar>\W)\W*)(?P<hyphen>(?<=\n)[^\S\r\n]*[{hyphen_chars}]\s+)[^.!?]+?(?P<afterchar>[.!?]?)(?=(?<=\n)[^\S\r\n][{hyphen_chars}]\s+|\s*\n+^\W|(?<=\.)))",
    flags=re.MULTILINE)
35+
36+
37+
38+
""" DATES AND TIMES """
39+
# NOTE: inside a character class "|" is a literal, so the former "[1|2]" /
# "[0|9]" classes also accepted a pipe character as a year digit. They are
# written as "[12]" / "[09]" here.

# Day, month name (German/English abbreviations), optional year: "24. Dez 2020".
RE_DATE_1_DAY_MONTHNAME_YEAR: Pattern = re.compile(
    r"\b(?P<day1>[0-3]?[0-9])[.\s/-]*(?P<monthname1>(?:jan|feb|m[aä]r|apr|ma[iy][^\w]|jun|jul|aug|sep|o[ck]t|nov|de[cz])\w*)(?:[,.\s/-]*(?P<year1>[12]?[09]?[0-9]{2})?)",
    flags=re.IGNORECASE)
# Month name, day, optional year: "Dec 24, 2020".
RE_DATE_2_MONTHNAME_DAY_YEAR: Pattern = re.compile(
    r"\b(?P<monthname2>(?:jan|feb|m[aä]r|apr|ma[iy][^\w]|jun|jul|aug|sep|o[ck]t|nov|de[cz])\w*)\s*(?P<day2>[0-3]?[0-9]),?\s+(?P<year2>[12]?[09]?[0-9]{2})?",
    flags=re.IGNORECASE)
# Day, two-digit month number, year: "24.12.2020".
RE_DATE_3_DAY_MONTHNUMBER_YEAR: Pattern = re.compile(
    r"\b(?P<day3>[0-3]?[0-9])[\s/.-]+(?P<monthnum3>[0-1][0-9])[\s/.-]+(?P<year3>(?:[12][09])?[0-9]{2})",
    flags=re.IGNORECASE)
# Year, month number, day: "2020-12-24".
RE_DATE_4_YEAR_MONTHNUMBER_DAY: Pattern = re.compile(
    r"\b(?P<year4>(?:[12][09])?[0-9]{2})[-/.\s]+(?P<monthnum4>[0-1]?[0-9])[-/.\s]+(?P<day4>[0-3]?[0-9])",
    flags=re.IGNORECASE)
# Any of the four date layouts above, captured as group "date".
RE_DATE_EXACT: Pattern = re.compile('(?P<date>' + '|'.join(
    [RE_DATE_1_DAY_MONTHNAME_YEAR.pattern, RE_DATE_2_MONTHNAME_DAY_YEAR.pattern, RE_DATE_3_DAY_MONTHNUMBER_YEAR.pattern,
     RE_DATE_4_YEAR_MONTHNUMBER_DAY.pattern]) + ')',
    flags=re.IGNORECASE)
# hh:mm[:ss] with optional am/pm marker, GMT/UTC offset or time-zone name, and "/CEST" suffix.
RE_TIME: Pattern = re.compile(
    r"(?P<time>(?P<hour>[0-2]?[0-9]):(?P<min>[0-5][0-9]):?(?P<sec>[0-5][0-9])?\s*(?P<pm>p\.?m\.?)?\s*(?P<am>a\.?m\.?)?\s*(?:(?P<gmtoffset>(?:GMT|UTC)(?P<offset>[+-]\d\d?))|(?P<tzname>(?:CET)|(?:EET)|(?:GMT[^\w+-])|(?:UTC[^\w+-])|(?:UCT[^\w+-])|(?:EST)|(?:WET)|(?:MET)|(?:HST)|(?:MST)))?(?P<timesuffix>(?:/CEST))?)",
    flags=re.IGNORECASE)

# A date directly followed by a time.
RE_DATE_AND_TIME = re.compile(RE_DATE_EXACT.pattern + r"[\s/-]*" + RE_TIME.pattern + r"\b", flags=re.IGNORECASE)
# Four digits starting 1 or 2 then 9 or 0, followed by a period ("1999.").
RE_YEAR_PERIOD = re.compile(r"([12][90]\d{2}\.)")
# Month name with a four-digit year and an optional day before or after the month.
RE_MONTH_YEAR: Pattern = re.compile(
    r"\b(?:(?P<day1>[0-3]?[0-9])(?:[\.\s\/\-,]*))?(?P<month>(?:jan|feb|m[aä]r|apr|ma[iy]|jun|jul|aug|sep|o[ck]t|nov|de[cz])\w*)\s*(?:(?P<day2>[0-3]?[0-9])(?:[\.\s\/\-,]*))?(?P<year>(?:[12][09])[0-9]{2})\b",
    flags=re.IGNORECASE)

# Characters allowed as decoration around a date line. With re.VERBOSE a "#"
# is only literal inside a character class, which is where these are used.
start_and_end_chars: str = "-+#"
# Line start: optional "<Location>, " / "<Location> den " prefix, a date, an
# optional time, then decoration characters or the end of the line.
RE_LOCATION_DATE_TIMEOPTIONAL: Pattern = re.compile(
    r"^"
    + r"(?P<locationbeforedate>(?:[A-Z])[^\n\d]{1,50}(?P<dateprefix>[,-]|den|der|am)\s*)" + r"?"  # Optional. Include location, then time-related sent starts
    + RE_DATE_EXACT.pattern
    + r"[\s/-]*?"
    + RE_TIME.pattern + r"?"  # Optional
    + fr"(?P<endchars>(?:([{start_and_end_chars}][^\S\n\r]*)+(?=.*))|$)",
    flags=re.MULTILINE | re.IGNORECASE | re.VERBOSE,
)

# A whole line consisting only of a date, an optional time and decoration characters.
RE_DATE_TIME_ONLY_LINES: Pattern = re.compile(
    r"^"
    + fr"(?P<startchars>(?:[^\S\n\r\w]|[{start_and_end_chars}])*)" + r"?"  # Optional
    + RE_DATE_EXACT.pattern
    + r"[\s/-]*?"
    + RE_TIME.pattern + r"?"  # Optional
    + fr"(?P<endcharsnowords>(?:[^\S\n\r\w]|[{start_and_end_chars}])*(?!\w))" + r"?"
    + r"$",
    flags=re.MULTILINE | re.IGNORECASE | re.VERBOSE,
)
86+
87+
88+
""" CHARS """
89+
# Runs of characters treated as noise: backslash # < > + | ~ ^ ° =
RE_SUSPICIOUS_CHARS: Pattern = re.compile(r"[\\#<>+|~^°=]+")
# « » and the registered-trademark sign.
RE_UNICODE_SYMBOLS: Pattern = re.compile(r"([\u00ab\u00BB\u00AE])")
# Dash-like code points (the class also contains the plain ASCII hyphen U+002D).
RE_STRANGE_DASHES: Pattern = re.compile(r"([\u2010-\u2015\uFF0D\uFE63\u2043\u1680\u002D\u2043\u1806])")
# Superscript and subscript digits 0-9.
RE_SUPER_SUB_SCRIPTED_NUMBERS: Pattern = re.compile(r"([\u2070\u00B9\u00B2\u00B3\u2074\u2075\u2076\u2077\u2078\u2079\u2080\u2081\u2082\u2083\u2084\u2085\u2086\u2087\u2088\u2089])")
RE_BULLET_POINTS: Pattern = re.compile(
    # require bullet points as first non-whitespace char on a new line, like a list to catch it later in listing
    # group 1 = line start / newline plus indentation, group 3 = the bullet character
    r"((^|\n)\s*?)"
    r"([\u2022\u2023\u2043\u204C\u204D\u2219\u25aa\u25CF\u25E6\u29BE\u29BF\u30fb])",
)

repeat_chars: str = "-.?"
# Two or more of the repeat chars, each optionally surrounded by whitespace.
# "spacebefore" is deliberately NOT wrapped in another "*": the former
# "(\s*)*" matched the same strings but backtracked exponentially on long
# whitespace runs that are not followed by a second repeat char.
RE_REPEATING_CHARS: Pattern = re.compile(
    fr"(?:(?P<spacebefore>\s*)(?P<repeatchar>[{repeat_chars}])(?P<spaceafter>\s*)){{2,}}")

# Currency symbols, including the Unicode currency block U+20A0-U+20C0.
RE_CURR_SYMBOL: Pattern = re.compile(r"[$¢£¤¥ƒ֏؋৲৳૱௹฿៛ℳ元円圆圓﷼\u20A0-\u20C0]")
# Currency code, amount and a trailing unit/word: "EUR200M".
RE_CURR_AMOUNT: Pattern = re.compile(r"(?P<curr>EUR|USD|GBP|JPY)(?:\s{0,})(?P<amount>[\,\.\d]+)(?P<unit>\s*\w{1,}\.?)",
                                     flags=re.IGNORECASE)

# Integer amount followed by a metric length unit, optionally squared/cubed: "200km".
RE_MEASUREMENTS: Pattern = re.compile(r"(?P<amount>\d+)\s*(?P<unit>(?:mm|cm|m|km|km2)(?:2|3)?)",
                                      flags=re.IGNORECASE)

# A metric length unit directly followed by a period.
RE_MEASUREMENTS_PERIOD: Pattern = re.compile(r"(?P<unit>(?:mm|cm|m|km)(?:2|3)?)\.",
                                             flags=re.IGNORECASE)
# Quotation marks
# Double quotes, and single quotes that are not followed by "s" (keeps "'s").
RE_QUOTATION_MARKS: Pattern = re.compile(r"\'(?!s)|\"")
114+
115+
""" EMOJIS """
116+
# Single code points from the listed symbol / pictograph / emoticon ranges.
RE_EMOJI: Pattern = re.compile(
    r"[\u2600-\u26FF\u2700-\u27BF\U0001F300-\U0001F5FF\U0001F600-\U0001F64F\U0001F680-\U0001F6FF\U0001F900-\U0001F9FF\U0001FA70-\U0001FAFF]",
    flags=re.IGNORECASE,
)
120+
121+
""" INTERNET, EMAIL, PHONE """
122+
# URL with an explicit scheme (http, https, ftp) or a leading "www.": either a
# public dotted IPv4 address or a host/domain/TLD name, with optional
# credentials, port and path.
RE_URL: Pattern = re.compile(
    r"(?:^|(?<![\w/.]))"
    # protocol identifier
    # r"(?:(?:https?|ftp)://)"  <-- alt?
    r"(?:(?:https?://|ftp://|www\d{0,3}\.))"
    # user:pass authentication
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host name
    r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9])"
    # domain name
    r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9])*"
    # TLD identifier
    r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
    r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path
    r"(?:/\S*)?"
    r"(?:$|(?![\w?!+&/]))",
    flags=re.IGNORECASE,
)
158+
159+
# Shortened links: optional http(s) scheme, a domain with 1-3 dotted parts,
# a slash and a 2-12 character hash ("bit.ly/abc123").
RE_SHORT_URL: Pattern = re.compile(
    r"(?:^|(?<![\w/.]))"
    # optional scheme
    r"(?:(?:https?://)?)"
    # domain
    r"(?:\w-?)*?\w+(?:\.[a-z]{2,12}){1,3}"
    r"/"
    # hash
    r"[^\s.,?!'\"|+]{2,12}"
    r"(?:$|(?![\w?!+&/]))",
    flags=re.IGNORECASE,
)
171+
172+
# Splits a URL head into optional scheme ("https"), optional sub-domain
# ("www"), the domain name and one of a fixed list of endings.
RE_URL_DOMAIN: Pattern = re.compile(
    r"(?P<https>\bhttps?://)?(?P<www>(?:www|\w+)\.)?(?P<domain>(?:\w|-)+)(?P<ending>\.(?:com|de|net|org|io|co|us|uk|au|edu|int|gov|ai|biz)/?)",
    flags=re.IGNORECASE)
175+
176+
# A URL (see RE_URL) with an optional "Link...:", "Bildlink...:" or
# "Quellen...:" label in front of it.
RE_HTTP_LINKS: Pattern = re.compile(
    r"(?P<beforelinks>(?:Link|Bildlink|Quellen)[^:]*:\s*)" + r"?"  # Optional
    + RE_URL.pattern,
    flags=re.IGNORECASE | re.MULTILINE | re.VERBOSE)
180+
181+
# E-mail address with an optional "mailto:" prefix; the domain part may have
# one to three dotted suffixes.
RE_EMAIL: Pattern = re.compile(
    r"(?:mailto:)?"
    r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(\.([a-z]{2,})){1,3}"
    r"(?:$|(?=\b))",
    flags=re.IGNORECASE,
)
187+
188+
# Phone numbers: an international form starting with "+<country code>" or a
# national form starting with "0"/"+" or a parenthesised area code.
RE_PHONE_NUMBER: Pattern = re.compile("(%s|%s)" % (
    # Intl.
    r"(?:\+(?:1|44|90)[\-\s]+[0-9]{3}([\-\s0-9]{4,11}(?:$|\s)))|(?:\+[0-9]{2,3}[\-\s]+\(?[0-9]{2,5}\)?[\/\-\s0-9]{4,11}(?:$|\s))",
    # National
    r"(?P<nat1>0|\+)?(?(nat1)[0-9]{2,5}|\(0?[0-9]{2,5}\))[\s\-\/]([\-\s\d]{3,11})($|\s+)"),
    flags=re.IGNORECASE)
194+
195+
""" Abbreviations: """
196+
# 1-7 letters followed by a period, optionally a hyphen, and up to two more
# short letter groups each ending in a period ("e.g.", "z.B.").
RE_ABBREVIATION: Pattern = re.compile(r"(?<!-)\b[a-zA-Z]{1,7}\.\-{0,1}(?:[a-zA-Z]{1,4}\.){0,2}", flags=re.IGNORECASE)
197+
198+
if __name__ == '__main__':
    # Debug aid: print the assembled listing regex when the module is run directly.
    print(RE_LISTING_SENTS.pattern)

0 commit comments

Comments
 (0)