|
| 1 | +import re |
| 2 | + |
| 3 | +from . import re_patterns as repat |
| 4 | + |
| 5 | + |
class CleanText:
    """Collection of text normalisation steps plus a `clean` pipeline that chains them.

    The `sub_*` methods substitute characters or insert spaces, the `rem_*` methods remove
    content. All regular expressions are taken from the sibling `re_patterns` module
    (imported as `repat`); their exact semantics are defined there, not in this class.
    """

    # Tables
    TBL_UMLAUTE: dict[str, str] = {'ä': 'ae', 'Ä': 'Ae', 'ö': 'oe', 'Ö': 'Oe', 'ü': 'ue', 'Ü': 'Ue', 'ß': 'ss'}
    TBL_ACCENTS: dict[str, str] = {
        'à': 'a', 'á': 'a', 'â': 'a', 'ã': 'a', 'å': 'a', 'À': 'A', 'Á': 'A', 'Â': 'A',
        'ç': 'c',
        'è': 'e', 'é': 'e', 'ê': 'e', 'ë': 'e', 'ȩ': 'e', 'È': 'E', 'É': 'E', 'Ê': 'E', 'Ë': 'E',
        'ì': 'i', 'í': 'i', 'î': 'i', 'ï': 'i', 'Î': 'I', 'Ï': 'I',
        'ò': 'o', 'ó': 'o', 'ô': 'o', 'õ': 'o', 'Ô': 'O', 'Ò': 'O', 'Ó': 'O',
        'ù': 'u', 'ú': 'u', 'û': 'u', 'Ù': 'U', 'Û': 'U',
    }
    TBL_CURRENCY: dict[str, str] = {'€': 'EUR', '$': 'USD', '£': 'GBP', '¥': 'JPY'}
    # Keyed by code point so the table can be handed to str.translate directly.
    # NOTE(review): "„" is mapped to a single quote while the other double quotation marks
    # map to '"' -- kept as-is for backward compatibility; confirm whether this is intended.
    TBL_FANCY_QUOTATION_MARKS: dict[int, int] = {
        ord(x): ord(y) for x, y in
        [("ʼ", "'"), ("‘", "'"), ("’", "'"), ("´", "'"), ("`", "'"), ("“", '"'), ("”", '"'), ("„", "'")]}

    # Substitutions: Replace special character with value in respective TBL
    def sub_umlaute(self, text: str) -> str:
        """Return `text` with every key of `TBL_UMLAUTE` replaced by its mapped value."""
        return text.translate(str.maketrans(self.TBL_UMLAUTE))

    def sub_accent_chars(self, text: str) -> str:
        """Return `text` with every key of `TBL_ACCENTS` replaced by its mapped value."""
        return text.translate(str.maketrans(self.TBL_ACCENTS))

    def sub_currency(self, text: str) -> str:
        """Replace currency symbols by their code and space out currency, amount and unit.

        Symbols are translated via `TBL_CURRENCY`; afterwards each `RE_CURR_AMOUNT` match is
        rewritten as "<curr> <amount> <unit>" (i.e. EUR200M -> EUR 200 M) to improve
        tokenization.
        """
        text = text.translate(str.maketrans(self.TBL_CURRENCY))
        return repat.RE_CURR_AMOUNT.sub(
            repl=lambda m: ' '.join(m.group("curr", "amount", "unit")),
            string=text)

    def sub_measurements(self, text: str) -> str:
        """Insert a space between amount and unit (i.e. 200km -> 200 km) to improve tokenization."""
        return repat.RE_MEASUREMENTS.sub(
            repl=lambda m: ' '.join(m.group("amount", "unit")), string=text)

    def sub_fancy_quot_marks(self, text: str) -> str:
        """Return `text` with the characters of `TBL_FANCY_QUOTATION_MARKS` replaced by plain quotes."""
        # The table is already keyed by code point, so no str.maketrans call is needed.
        return text.translate(self.TBL_FANCY_QUOTATION_MARKS)

    def rem_quot_marks(self, text: str) -> str:
        """Remove everything matched by `RE_QUOTATION_MARKS` from `text`."""
        return repat.RE_QUOTATION_MARKS.sub(repl="", string=text)

    def rem_brackets(self, text: str, curly: bool = True, round: bool = False, square: bool = True) -> str:
        """Remove bracket patterns from `text`, one bracket kind per enabled flag.

        :param text: input text
        :param curly: apply `RE_BRACKETS_CURLY_AND_CONTENT` (default on)
        :param round: apply `RE_BRACKETS_ROUND_AND_CONTENT` (default off); the name shadows the
            builtin but is part of the public signature and therefore kept
        :param square: apply `RE_BRACKETS_SQUARE_AND_CONTENT` (default on)
        :return: text with all matches of the enabled patterns removed
        """
        if curly:
            text = repat.RE_BRACKETS_CURLY_AND_CONTENT.sub(repl="", string=text)
        if round:
            text = repat.RE_BRACKETS_ROUND_AND_CONTENT.sub(repl="", string=text)
        if square:
            text = repat.RE_BRACKETS_SQUARE_AND_CONTENT.sub(repl="", string=text)
        return text

    def rem_strange_chars(self, text: str) -> str:
        """Normalise or drop unusual characters; the patterns are applied in a fixed order.

        Super-/subscripted numbers and suspicious characters become a space, bullet points
        become "*", unicode symbols are removed and strange dashes become "-".
        """
        text = repat.RE_SUPER_SUB_SCRIPTED_NUMBERS.sub(repl=" ", string=text)
        text = repat.RE_SUSPICIOUS_CHARS.sub(repl=" ", string=text)
        text = repat.RE_BULLET_POINTS.sub(repl="*", string=text)
        text = repat.RE_UNICODE_SYMBOLS.sub(repl="", string=text)
        text = repat.RE_STRANGE_DASHES.sub(repl="-", string=text)
        return text

    def rem_datetime_only_lines(self, text: str) -> str:
        """Remove matches of `RE_LOCATION_DATE_TIMEOPTIONAL`, then of `RE_DATE_TIME_ONLY_LINES`."""
        text = repat.RE_LOCATION_DATE_TIMEOPTIONAL.sub(repl="", string=text)
        text = repat.RE_DATE_TIME_ONLY_LINES.sub(repl="", string=text)
        return text

    def rem_address_info(self, text: str) -> str:
        """Remove matches of `RE_HTTP_LINKS`, then of `RE_EMAIL`."""
        text = repat.RE_HTTP_LINKS.sub(repl="", string=text)
        text = repat.RE_EMAIL.sub(repl="", string=text)
        return text

    def mark_end_of_sentence(self, text: str, period: str = ". ") -> str:
        """Append `period` to every headline and section header match.

        An end of sentence pattern can be manifold. Multiple patterns are thus (probably)
        required.

        :param text: input text
        :param period: string appended after the "headline" / "sectionheader" group
        :return: text with the marker appended to each match
        """
        text = repat.RE_HEADLINE.sub(repl=lambda m: m.group("headline") + period, string=text)
        text = repat.RE_SECTION_HEADER.sub(repl=lambda m: m.group("sectionheader") + period, string=text)
        return text

    def split_listed_sentences(self, text: str, end_of_sent_char: str = ". ") -> str:
        """Turn listing items matched by `RE_LISTING_SENTS` into separately terminated sentences.

        For every match the "hyphen" group is replaced inside the "listing" group and an
        end-of-sentence marker is appended, unless the neighbouring "beforechar" /
        "afterchar" group already is "." or ":".

        This is a best-effort step: if anything fails, the error is logged and the text is
        returned with the replacements made so far.

        :param text: input text
        :param end_of_sent_char: marker used to separate / terminate listing items
        :return: text with rewritten listings
        """
        import logging  # function-scope import: the module does not import logging itself

        sentence_end_chars = (".", ":")
        try:
            previous_match_end = 0
            # Matches are computed on the original string; rebinding `text` below does not
            # affect the iterator.
            for match in repat.RE_LISTING_SENTS.finditer(string=text):
                # No separator needed if the item directly follows the previous match (or the
                # start of the text) or the preceding character already ends a sentence.
                follows_sentence_end = (
                    match.start() == previous_match_end
                    or match.group('beforechar') in sentence_end_chars
                )
                hyphen_replacement = "" if follows_sentence_end else end_of_sent_char
                char_added_at_end = "" if match.group('afterchar') in sentence_end_chars else end_of_sent_char
                previous_match_end = match.end()

                listing = match.group('listing')
                cleaned_listing = listing.replace(match.group('hyphen'), hyphen_replacement) + char_added_at_end
                # NOTE(review): str.replace rewrites every identical occurrence of `listing`
                # in the text, not only the matched one -- confirm this is acceptable.
                text = text.replace(listing, cleaned_listing)
        except Exception:
            # Previously a bare `except: pass`; keep the best-effort behaviour but make the
            # failure visible and stop swallowing KeyboardInterrupt / SystemExit.
            logging.getLogger(__name__).exception("split_listed_sentences failed; returning best-effort text")
        return text

    def rem_repeating_chars(self, text: str) -> str:
        """Replace every `RE_REPEATING_CHARS` match by its first character followed by a space."""
        return repat.RE_REPEATING_CHARS.sub(
            repl=lambda m: m.group(0)[0] + " ", string=text)

    def rem_whitespace(self, text: str) -> str:
        """
        Remove zero-width spaces, replace line breaks and non-breaking spaces with a single space,
        remove spaces before a sentence end and then strip any leading/trailing whitespace.
        """
        # Attribute name (including its spelling) is defined in re_patterns.
        text = repat.RE_ZERO_WIIDTH_SPACE.sub(repl="", string=text)
        text = repat.RE_LINEBREAK.sub(repl=r" ", string=text)
        text = repat.RE_NONBREAKING_SPACE.sub(repl=" ", string=text)
        text = repat.RE_SPACE_BEFORE_SENT_END.sub(repl="", string=text)
        return text.strip()

    def clean(self, text: str,
              sub_umlaute: bool = True, sub_accent_chars: bool = True, sub_curr: bool = True,
              sub_measure: bool = True, sub_fancy_quot_marks: bool = True, rem_brackets: bool = True,
              rem_quot_marks: bool = True, rem_strange_chars: bool = True, mark_end_of_sent: bool = True,
              split_listed_sents: bool = True, rem_dt_only_lines: bool = True,
              rem_address_info: bool = True, rem_repeat_chars: bool = True,
              rem_whitespace: bool = True) -> str:
        """Run the enabled cleaning steps on `text` in a fixed order and return the result.

        Every flag switches one step on or off (all default to on). The order is fixed and
        does not depend on the parameter order: quotation marks are removed after fancy
        quotes were normalised, repeating characters are handled second last and whitespace
        is cleaned last.

        :param text: input text
        :return: cleaned text
        """
        if sub_umlaute:
            text = self.sub_umlaute(text=text)
        if sub_accent_chars:
            text = self.sub_accent_chars(text=text)
        if sub_curr:
            text = self.sub_currency(text=text)
        if sub_measure:
            text = self.sub_measurements(text=text)
        if sub_fancy_quot_marks:
            text = self.sub_fancy_quot_marks(text=text)
        if rem_quot_marks:  # Should be after sub_fancy_quot_marks !!!
            text = self.rem_quot_marks(text=text)
        if rem_strange_chars:
            text = self.rem_strange_chars(text=text)
        if rem_dt_only_lines:
            text = self.rem_datetime_only_lines(text=text)
        if rem_brackets:
            text = self.rem_brackets(text=text)
        if rem_address_info:
            text = self.rem_address_info(text=text)
        if mark_end_of_sent:
            text = self.mark_end_of_sentence(text=text)
        if split_listed_sents:
            text = self.split_listed_sentences(text=text)
        if rem_repeat_chars:  # rem_repeating_chars: do second last
            text = self.rem_repeating_chars(text=text)
        if rem_whitespace:  # rem_whitespace: do last !!!
            text = self.rem_whitespace(text=text)
        return text
0 commit comments