-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfull_cleanup.py
153 lines (133 loc) · 7.01 KB
/
full_cleanup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import re
from bs4 import BeautifulSoup
# Division labels that may trail a recipient name in "To:/From:/Cc:" headers,
# e.g. "To: Smith, John (Engineering)".
_DIVISIONS = (
    "AAM", "Engineering", "EO", "GBM", "Public Risk", "PS Private",
    "Treasury", "Legal", "GBM Private", "Controllers", "Rothesay",
    "AAM Private", "CHS", "GRI", "Internal Audit", "CRG", "Health Exch",
    "GSB2 Compl", "CPM", "AAM Shared", "PMM", "PS Public",
)

# Longest label first: alternation branches are tried left to right, so
# "AAM" listed ahead of "AAM Private" would match first and leave
# " Private" behind in the text.
_DIVISION_ALTERNATION = "|".join(
    re.escape(label)
    for label in sorted(_DIVISIONS, key=lambda label: (-len(label), label))
)

_WHITESPACE_RE = re.compile(r"\s+")

# Boilerplate fragments removed outright, applied in this order.
_NOISE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        # "code{...}" / "Code{...}", optionally preceded by a backslash.
        # (An escaped "\[" would only match the literal text "[cC]ode{".)
        r"\\?\b[cC]ode\{.*?\}",
        # Anything enclosed in square brackets, brackets included.
        r"\[.*?\]",
        # Mail header labels (the label only, not the text after it).
        r"Subject:",
        r"Re:",
        # ex: "18-02-2024 14:39:21 -" (the dash may be separated by a space)
        r"\d{1,2}-\d{1,2}-\d{4}\s+\d{1,2}:\d{2}:\d{2}\s*-?\s*",
        # ex: "Sent: Thu, Feb 1, 2024 at 1:24PM"
        r"Sent:\s*[A-Za-z]{3},?\s[A-Za-z]{3}\s\d{1,2},?\s\d{4}\s+at\s+\d{1,2}:\d{2}(?:AM|PM)?",
        # ex: "On Fri, Jan 5, 2024 at 12:14PM"
        r"On\s[A-Za-z]{3},?\s[A-Za-z]{3}\s\d{1,2},?\s\d{4}\s+at\s+\d{1,2}:\d{2}(?:AM|PM)?",
        # Standalone times like "12:34 PM" or "1:13 AM"
        r"\b\d{1,2}:\d{2}\s?(?:AM|PM)\b",
        # Leftover "AM" / "PM" tokens standing alone
        r"\b(?:AM|PM)\b",
        # The literal marker "(Work notes (Rich Text))"
        r"\(Work notes \(Rich Text\)\)",
        # Email addresses, with optional surrounding angle brackets
        r"<?\b[\w\.-]+@[\w\.-]+\b>?",
    )
)

# "Label: LastName, FirstName", with an optional middle initial and an
# optional division that may or may not be wrapped in parentheses.
# A single pattern is used for both capturing and removal so the middle
# initial and the division are always removed together with the name.
_RECIPIENT_HEADER_RE = re.compile(
    r"(?:To|From|Cc)\s*:\s*"
    r"(?P<last>[A-Z][a-z]+)"       # e.g. "Smith"
    r"\s*,\s*"
    r"(?P<first>[A-Z][a-z]+)"      # e.g. "John"
    r"(?:\s+[A-Z]\.)?"             # optional middle initial, e.g. " P."
    rf"(?:\s*\(?\s*(?:{_DIVISION_ALTERNATION})\b\s*\)?)?"
)

# Header labels still present after the recipient headers are gone. The colon
# is required so ordinary capitalised words ("To resolve ...") are preserved.
_LEFTOVER_LABEL_RE = re.compile(r"\b(?:From|To|Cc)\s*:")

# NOTE(review): this pattern is greedy -- after "from:", "to:" or "cc:" it
# consumes every following letter, digit, space, comma, period and
# parenthesis, which can swallow ordinary text. Because capitalised labels
# are removed just before it runs, it only fires on the lowercase forms.
# Kept unchanged; confirm the intended extent before tightening it.
_LABELLED_RUN_RE = re.compile(
    r"\b(?:[Ff]rom|[Tt]o|[Cc]c)\b:\s*[A-Za-z0-9,()\.\s]+"
)


def clean_work_notes(html_text: str) -> str:
    """Strip markup, mail boilerplate and recipient names from a work note.

    Processing steps, in order:

    1. Extract the text from the HTML with BeautifulSoup.
    2. Collapse all whitespace runs to single spaces.
    3. Remove code tags, bracketed text, "Subject:"/"Re:" labels,
       timestamps, the "(Work notes (Rich Text))" marker and email
       addresses.
    4. Find "To:/From:/Cc: LastName, FirstName" headers, remember the
       last name, first name and "FirstName LastName" of each, and remove
       the headers.
    5. Remove every remaining whole-word occurrence of the remembered
       names.

    Args:
        html_text: HTML (or plain text) of the work note. ``None`` and the
            empty string are accepted.

    Returns:
        The cleaned text with single spaces and no leading or trailing
        whitespace; ``""`` when ``html_text`` is ``None`` or empty.
    """
    if not html_text:
        return ""

    soup = BeautifulSoup(html_text, "html.parser")
    cleaned_text = soup.get_text(separator=" ")

    # A literal "&nbsp;" can remain when the entity was double-escaped in
    # the markup. Real non-breaking spaces (U+00A0), newlines and tabs are
    # all covered by ``\s`` in the collapse below.
    cleaned_text = cleaned_text.replace("&nbsp;", " ")
    cleaned_text = _WHITESPACE_RE.sub(" ", cleaned_text)

    for noise in _NOISE_PATTERNS:
        cleaned_text = noise.sub("", cleaned_text)

    # Capture names before removing the headers so they can also be removed
    # where they appear elsewhere in the text (e.g. as "John Smith").
    found_names = set()
    for match in _RECIPIENT_HEADER_RE.finditer(cleaned_text):
        last = match.group("last")
        first = match.group("first")
        found_names.update((last, first, f"{first} {last}"))

    cleaned_text = _RECIPIENT_HEADER_RE.sub("", cleaned_text)
    cleaned_text = _LEFTOVER_LABEL_RE.sub("", cleaned_text)
    cleaned_text = _LABELLED_RUN_RE.sub("", cleaned_text)

    if found_names:
        # One pass over the text; longest alternative first so a full
        # "First Last" is consumed as a unit.
        ordered_names = sorted(found_names, key=lambda name: (-len(name), name))
        names_re = re.compile(
            r"\b(?:" + "|".join(re.escape(name) for name in ordered_names) + r")\b"
        )
        cleaned_text = names_re.sub("", cleaned_text)

    # Removals leave gaps behind; normalise spacing once at the end.
    return _WHITESPACE_RE.sub(" ", cleaned_text).strip()