-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess_missions.py
More file actions
93 lines (78 loc) · 2.93 KB
/
preprocess_missions.py
File metadata and controls
93 lines (78 loc) · 2.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import re
import nltk
import spacy
from sentence_transformers import SentenceTransformer
_WHITESPACE_RUN = re.compile(r"\s+")


def _ensure_sentence_tokenizer():
    """Download NLTK's Punkt sentence-tokenizer data if it is not installed."""
    # Older NLTK releases read the "punkt" resource while newer ones read
    # "punkt_tab"; provision both so sent_tokenize works on either.
    for resource in ("punkt", "punkt_tab"):
        try:
            nltk.data.find(f"tokenizers/{resource}")
        except LookupError:
            nltk.download(resource)


def _load_spacy_pipeline(model_name):
    """Load a spaCy pipeline, installing the model package if it is missing."""
    try:
        return spacy.load(model_name)
    except OSError:
        # spaCy reports a missing model package as OSError.
        # Use a from-import here: a plain "import spacy.cli" would bind
        # "spacy" as a local name and break the spacy.load() call above.
        from spacy.cli import download as download_spacy_model

        download_spacy_model(model_name)
        return spacy.load(model_name)


def preprocess_missions(
    df,
    column_name="MISSIONS",
    *,
    language="french",
    spacy_model="fr_core_news_sm",
    embedding_model="all-MiniLM-L6-v2",
):
    """
    Preprocess a text column of a DataFrame in four steps.

    1. Normalize text (lowercase, trim, collapse whitespace runs to one
       space; punctuation is preserved).
    2. Segment the normalized text into sentences with NLTK.
    3. Lemmatize each sentence with spaCy.
    4. Encode the lemmatized sentences with SentenceTransformer.

    The DataFrame is modified in place: ``column_name`` is overwritten with
    the normalized text and three columns are added,
    ``{column_name}_sentences``, ``{column_name}_lemmatized`` and
    ``{column_name}_embeddings``.

    Missing values (``None``, NaN, ``pd.NA``) are normalized to an empty
    string, so they produce empty lists in the derived columns instead of
    being processed as the literal text "nan" / "none". Other non-string
    values are converted with ``str()``.

    The tokenizer data and both models are only loaded (and, if necessary,
    downloaded) when at least one row contains text.

    Args:
        df (pd.DataFrame): The input DataFrame containing ``column_name``.
        column_name (str): The name of the column to preprocess.
        language (str): Language name passed to NLTK's sentence tokenizer.
        spacy_model (str): Name of the spaCy pipeline used for lemmatization.
        embedding_model (str): Name of the SentenceTransformer model.

    Returns:
        pd.DataFrame: The same ``df`` object, with the columns described above.

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    # Local import: pandas is only needed here for the missing-value check.
    import pandas as pd

    sentences_col = f"{column_name}_sentences"
    lemmatized_col = f"{column_name}_lemmatized"
    embeddings_col = f"{column_name}_embeddings"

    # Step 1: Normalize text
    def normalize(text):
        """Lowercase, trim and collapse whitespace; missing values become ''."""
        if not isinstance(text, str):
            if pd.api.types.is_scalar(text) and pd.isna(text):
                return ""
            text = str(text)
        return _WHITESPACE_RUN.sub(" ", text.lower().strip())

    df[column_name] = df[column_name].apply(normalize)

    # Every later step is a no-op for empty strings, so the heavy resources
    # are only needed when some row actually has content.
    has_text = any(df[column_name])

    # Step 2: Segment text into sentences
    if has_text:
        _ensure_sentence_tokenizer()

    def segment(text):
        """Split normalized text into sentences; empty text gives []."""
        if not text:
            return []
        return nltk.tokenize.sent_tokenize(text, language=language)

    df[sentences_col] = df[column_name].apply(segment)

    # Step 3: Lemmatize sentences
    nlp = _load_spacy_pipeline(spacy_model) if has_text else None

    def lemmatize(sentences):
        """Return each sentence rewritten as its space-joined token lemmas."""
        if not sentences:
            return []
        # nlp.pipe processes the row's sentences as a batch, in order.
        return [
            " ".join(token.lemma_ for token in doc)
            for doc in nlp.pipe(sentences)
        ]

    df[lemmatized_col] = df[sentences_col].apply(lemmatize)

    # Step 4: Convert lemmatized sentences into embeddings
    model = SentenceTransformer(embedding_model) if has_text else None

    def embed(sentences):
        """Encode a list of sentences; empty or non-list input gives []."""
        if not sentences or not isinstance(sentences, list):
            return []
        return model.encode(sentences, convert_to_tensor=False)

    df[embeddings_col] = df[lemmatized_col].apply(embed)
    return df