# StaffBot/preprocess_missions.py at main · CrSamson/StaffBot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import re
import nltk
import spacy
from sentence_transformers import SentenceTransformer

def preprocess_missions(df, column_name="MISSIONS"):
    """
    Preprocess a text column of a DataFrame in four steps.

    1. Normalize text (lowercase, trim, collapse whitespace runs).
    2. Segment text into sentences (NLTK sentence tokenizer, French).
    3. Lemmatize each sentence (spaCy ``fr_core_news_sm``).
    4. Encode the lemmatized sentences (SentenceTransformer
       ``all-MiniLM-L6-v2``).

    The DataFrame is modified in place: ``column_name`` is overwritten with
    the normalized text and three columns are added
    (``<column_name>_sentences``, ``<column_name>_lemmatized``,
    ``<column_name>_embeddings``).

    Missing values (``None``, ``NaN``, ``pd.NA``) are normalized to an empty
    string instead of the literal text ``"none"`` / ``"nan"``; other
    non-string values are converted with ``str()``. Rows without text get an
    empty list in every derived column.

    The NLTK data, the spaCy model and the embedding model are only loaded
    when there is at least one row that needs them.

    Args:
        df (pd.DataFrame): The input DataFrame containing the text column.
        column_name (str): The name of the column to preprocess.

    Returns:
        pd.DataFrame: The same DataFrame object, updated with the columns
        described above.

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    # Local import: pandas is not imported at module level in this file.
    import pandas as pd

    sentences_col = f"{column_name}_sentences"
    lemmatized_col = f"{column_name}_lemmatized"
    embeddings_col = f"{column_name}_embeddings"

    # Step 1: Normalize text
    def preprocess_step1(text):
        """
        Lowercase, trim and collapse whitespace; punctuation is preserved.
        """
        if not isinstance(text, str):
            # str(None) / str(nan) would inject the fake words "none"/"nan"
            # into the corpus, so treat missing values as empty text.
            if pd.api.types.is_scalar(text) and pd.isna(text):
                return ""
            text = str(text)
        text = text.lower().strip()
        return re.sub(r"\s+", " ", text)

    # Apply Step 1
    df[column_name] = df[column_name].apply(preprocess_step1)

    # Step 2: Segment text into sentences
    has_text = bool((df[column_name] != "").any())
    if has_text:
        # Probe the tokenizer itself rather than one resource path: which
        # Punkt package is required depends on the installed NLTK version.
        try:
            nltk.tokenize.sent_tokenize("test.", language='french')
        except LookupError:
            nltk.download('punkt')
            nltk.download('punkt_tab')

    def preprocess_step2(text):
        """
        Segment normalized text into sentences using NLTK's tokenizer.
        """
        if not text:
            return []
        return nltk.tokenize.sent_tokenize(text, language='french')

    # Apply Step 2
    df[sentences_col] = df[column_name].apply(preprocess_step2)

    # Step 3: Lemmatize sentences
    has_sentences = bool(df[sentences_col].map(bool).any())
    nlp = None
    if has_sentences:
        try:
            nlp = spacy.load("fr_core_news_sm")
        except OSError:
            # Model not installed. Import only the download function: a
            # function-level ``import spacy.cli`` would rebind ``spacy`` as
            # a local name and break the ``spacy.load`` call above.
            from spacy.cli import download as spacy_download
            spacy_download("fr_core_news_sm")
            nlp = spacy.load("fr_core_news_sm")

    def preprocess_step3(sentences_list):
        """
        Lemmatize a list of sentences, returning one string per sentence.
        """
        if not sentences_list:
            return []
        # nlp.pipe processes the row's sentences as a batch.
        return [
            " ".join(token.lemma_ for token in doc)
            for doc in nlp.pipe(sentences_list)
        ]

    # Apply Step 3
    df[lemmatized_col] = df[sentences_col].apply(preprocess_step3)

    # Step 4: Convert lemmatized sentences into embeddings
    # NOTE(review): confirm this checkpoint is appropriate for French text.
    model = SentenceTransformer('all-MiniLM-L6-v2') if has_sentences else None

    def compute_embeddings(sentences_list):
        """
        Encode a list of lemmatized sentences; empty input yields ``[]``.
        """
        if not sentences_list or not isinstance(sentences_list, list):
            return []
        return model.encode(sentences_list, convert_to_tensor=False)

    # Apply Step 4
    df[embeddings_col] = df[lemmatized_col].apply(compute_embeddings)

    return df