Merge pull request #186 from MAIF/update_content_tagger

✨ Update Melusine Content Tagger
MAIF · Dec 16, 2024 · c3a6cc2 · c3a6cc2
2 parents 18ab6e6 + 8f3d702
commit c3a6cc2
Show file tree

Hide file tree

Showing 26 changed files with 1,533 additions and 1,102 deletions.
diff --git a/docs/tutorials/08_MelusineRegex.md b/docs/tutorials/08_MelusineRegex.md
@@ -17,7 +17,6 @@ from melusine.base import MelusineRegex
 
 
 class AnnoyingEmailsRegex(MelusineRegex):
-
     @property
     def positive(self) -> Union[str, Dict[str, str]]:
         return dict(
@@ -65,7 +64,6 @@ from melusine.base import MelusineRegex
 
 
 class AnnoyingEmailsRegex(MelusineRegex):
-
     @property
     def positive(self) -> Union[str, Dict[str, str]]:
         return dict(
@@ -192,7 +190,6 @@ from melusine.base import MelusineRegex
 
 
 class AnnoyingEmailsRegex(MelusineRegex):
-
     @property
     def positive(self) -> Union[str, Dict[str, str]]:
         return dict(
@@ -237,7 +234,6 @@ That is where neutral regex can be of use. Whenever a neutral regex is matched, i
 
 ```python
 class IfritAlertRegex(MelusineRegex):
-
     @property
     def positive(self) -> Union[str, Dict[str, str]]:
         return dict(

diff --git a/melusine/base.py b/melusine/base.py
@@ -25,7 +25,7 @@
 from sklearn.base import BaseEstimator, TransformerMixin
 
 from melusine.backend import backend
-from melusine.io import IoMixin
+from melusine.io_mixin import IoMixin
 
 logger = logging.getLogger(__name__)
 
@@ -99,6 +99,23 @@ def parse_column_list(columns: str | Iterable[str]) -> list[str]:
             columns = [columns]
         return list(columns)
 
+    def fit(self, X: MelusineDataset, y: Any = None) -> MelusineTransformer:
+        """A reference implementation of a fitting function.
+
+        Parameters
+        ----------
+        X : The training input samples.
+
+        y : The target values (class labels in classification, real numbers in
+            regression).
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
+        return self
+
     def transform(self, data: MelusineDataset) -> MelusineDataset:
         """
         Transform input data.
@@ -196,6 +213,23 @@ def transform_methods(self) -> list[Callable]:
             List of  methods to be called by the transform method.
         """
 
+    def fit(self, X: MelusineDataset, y: Any = None) -> MelusineTransformer:
+        """A reference implementation of a fitting function.
+
+        Parameters
+        ----------
+        X : The training input samples.
+
+        y : The target values (class labels in classification, real numbers in
+            regression).
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
+        return self
+
     def transform(self, df: MelusineDataset) -> MelusineDataset:
         """
         Re-definition of super().transform() => specific detector's implementation

diff --git a/melusine/conf/pipelines/demo_pipeline.yaml b/melusine/conf/pipelines/demo_pipeline.yaml
@@ -12,6 +12,9 @@ demo_pipeline:
   - class_name: ContentTagger
     config_key: content_tagger
     module: melusine.processors
+  - class_name: RefinedTagger
+    config_key: refined_tagger
+    module: melusine.processors
   - class_name: TextExtractor
     config_key: text_extractor
     module: melusine.processors

diff --git a/melusine/conf/pipelines/preprocessing_pipeline.yaml b/melusine/conf/pipelines/preprocessing_pipeline.yaml
@@ -12,6 +12,9 @@ preprocessing_pipeline:
   - class_name: ContentTagger
     config_key: content_tagger
     module: melusine.processors
+  - class_name: RefinedTagger
+    config_key: refined_tagger
+    module: melusine.processors
   - class_name: TransferredEmailProcessor
     config_key: transferred_email_processor
     module: melusine.processors

diff --git a/melusine/conf/processors/refined_tagger.yaml b/melusine/conf/processors/refined_tagger.yaml
@@ -0,0 +1,2 @@
+refined_tagger:
+  default_tag: BODY
diff --git a/melusine/detectors.py b/melusine/detectors.py
@@ -6,7 +6,7 @@
 
 """
 
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List
 
 from melusine.base import MelusineDetector, MelusineItem, MelusineRegex
 from melusine.message import Message
@@ -95,19 +95,12 @@ def pre_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineIte
             target_tags={self.BODY_PART}, stop_at={self.GREETINGS_PART}
         )
 
-        # Extract the THANKS part in the last message
-        thanks_parts: List[Tuple[str, str]] = row[self.messages_column][0].extract_parts(target_tags={self.THANKS_PART})
-
-        # Compute THANKS text
-        if not thanks_parts:
-            thanks_text: str = ""
-        else:
-            thanks_text = "\n".join(x[1] for x in thanks_parts)
+        # Extract the THANKS text in the last message
+        thanks_text = row[self.messages_column][0].extract_text(target_tags={self.THANKS_PART})
 
         # Save debug data
         if debug_mode:
             debug_dict = {
-                self.THANKS_PARTS_COL: thanks_parts,
                 self.THANKS_TEXT_COL: thanks_text,
                 self.HAS_BODY: has_body,
             }
@@ -236,20 +229,13 @@ def pre_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineIte
         """
         # Last message body
         last_message: Message = row[self.messages_column][0]
-        body_parts = last_message.extract_last_body()
-
-        if body_parts:
-            row[self.CONST_TEXT_COL_NAME] = "\n".join(text for tag, text in body_parts)
-        else:
-            row[self.CONST_TEXT_COL_NAME] = ""
+        row[self.CONST_TEXT_COL_NAME] = last_message.extract_text(target_tags=("BODY",), stop_at=("GREETINGS",))
 
         # Prepare and save debug data
         if debug_mode:
             debug_dict: Dict[str, Any] = {
                 self.CONST_DEBUG_TEXT_KEY: row[self.CONST_TEXT_COL_NAME],
             }
-            if self.messages_column:
-                debug_dict[self.CONST_DEBUG_PARTS_KEY] = body_parts
             row[self.debug_dict_col].update(debug_dict)
 
         return row

diff --git a/melusine/io/__init__.py → melusine/io_mixin/__init__.py b/melusine/io/__init__.py → melusine/io_mixin/__init__.py
@@ -2,6 +2,6 @@
 The melusine.io module includes classes for input/output data.
 """
 
-from melusine.io._classes import IoMixin
+from melusine.io_mixin._classes import IoMixin
 
 __all__ = ["IoMixin"]
diff --git a/melusine/io/_classes.py → melusine/io_mixin/_classes.py b/melusine/io/_classes.py → melusine/io_mixin/_classes.py
@@ -27,10 +27,6 @@ class IoMixin:
     Defines generic load methods.
     """
 
-    def __init__(self, **kwargs: Any):
-        """Initialize attribute."""
-        self.json_exclude_list: list[str] = ["_func", "json_exclude_list"]
-
     @classmethod
     def from_config(
         cls: type[T],

diff --git a/melusine/message.py b/melusine/message.py
@@ -8,7 +8,7 @@
 
 import re
 from datetime import datetime
-from typing import Iterable, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional
 
 from melusine import config
 
@@ -29,7 +29,7 @@ def __init__(
         date: Optional[datetime] = None,
         text_from: str = "",
         text_to: Optional[str] = None,
-        tags: Optional[List[Tuple[str, str]]] = None,
+        tags: Optional[List[Dict[str, Any]]] = None,
     ):
         """
         Attributes initialization.
@@ -63,6 +63,9 @@ def __init__(
         self.clean_header: str = ""
         self.clean_text: str = ""
 
+        self.effective_tag_key = "base_tag"
+        self.effective_text_key = "base_text"
+
     @property
     def str_tag_name_length(self) -> int:
         """
@@ -84,8 +87,11 @@ def str_line_length(self) -> int:
             return config["message"].get("str_line_length", self.DEFAULT_STR_LINE_LENGTH)
 
     def extract_parts(
-        self, target_tags: Optional[Iterable[str]] = None, stop_at: Optional[Iterable[str]] = None
-    ) -> List[Tuple[str, str]]:
+        self,
+        target_tags: Optional[Iterable[str]] = None,
+        stop_at: Optional[Iterable[str]] = None,
+        tag_type: Optional[str] = None,
+    ) -> List[Dict[str, Any]]:
         """
         Function to extract target tags from the message.
 
@@ -95,17 +101,21 @@ def extract_parts(
             Tags to be extracted.
         stop_at:
             Tags for which extraction should stop.
+        tag_type:
+            Type of tags to consider.
 
         Returns
         -------
-        _: List[Tuple[str, str]]
-            List of extracted tags.
+        _: List of extracted tags.
         """
         if not self.tags:
             return []
 
+        if tag_type is None:
+            tag_type = self.effective_tag_key
+
         # List of tags in the message
-        tag_name_list: List[str] = [x[0] for x in self.tags]
+        tag_name_list: List[str] = [x[tag_type] for x in self.tags]
 
         if target_tags is None:
             target_tags = tag_name_list
@@ -122,29 +132,67 @@ def extract_parts(
         else:
             effective_tags = self.tags
 
-        return [x for x in effective_tags if x[0] in target_tags]
+        return [x for x in effective_tags if x[tag_type] in target_tags]
+
+    def extract_text(
+        self,
+        target_tags: Optional[Iterable[str]] = None,
+        stop_at: Optional[Iterable[str]] = None,
+        tag_type: Optional[str] = None,
+        text_type: Optional[str] = None,
+        separator: str = "\n",
+    ) -> str:
+        """
+        Function to extract the text of target tags from the message as a single string.
+
+        Parameters
+        ----------
+        target_tags:
+            Tags to be extracted.
+        stop_at:
+            Tags for which extraction should stop.
+        tag_type:
+            Type of tags to consider.
+        text_type:
+            Type of text to consider.
+        separator:
+            Separator to join the extracted texts.
+
+        Returns
+        -------
+        _: Texts of the extracted tags joined with the separator.
+        """
+        if text_type is None:
+            text_type = self.effective_text_key
+        parts = self.extract_parts(target_tags=target_tags, stop_at=stop_at, tag_type=tag_type)
+        return separator.join([x[text_type] for x in parts])
 
     def extract_last_body(
-        self, target_tags: Iterable[str] = ("BODY",), stop_at: Iterable[str] = ("GREETINGS",)
-    ) -> List[Tuple[str, str]]:
+        self,
+        target_tags: Iterable[str] = ("BODY",),
+        stop_at: Iterable[str] = ("GREETINGS",),
+        tag_type: Optional[str] = None,
+    ) -> List[Dict[str, Any]]:
         """
         Extract the BODY parts of the last message in the email.
 
         Parameters
         ----------
         target_tags: Iterable[str]
         stop_at: Iterable[str]
+        tag_type: Type of tags to consider.
 
         Returns
         -------
         _: List[Tuple[str, str]]
         """
-        return self.extract_parts(target_tags=target_tags, stop_at=stop_at)
+        return self.extract_parts(target_tags=target_tags, stop_at=stop_at, tag_type=tag_type)
 
     def has_tags(
         self,
         target_tags: Iterable[str] = ("BODY",),
         stop_at: Optional[Iterable[str]] = None,
+        tag_type: Optional[str] = None,
     ) -> bool:
         """
         Function to check if input tags are present in the message.
@@ -155,6 +203,8 @@ def has_tags(
             Tags of interest.
         stop_at:
             Tags for which extraction should stop.
+        tag_type:
+            Type of tags to consider.
 
         Returns
         -------
@@ -164,11 +214,16 @@ def has_tags(
         if self.tags is None:
             return False
 
+        if tag_type is None:
+            tag_type = self.effective_tag_key
+
         if not stop_at:
             stop_at = set()
 
         found: bool = False
-        for tag, _ in self.tags:
+        for tag_data in self.tags:
+            tag = tag_data[tag_type]
+
             # Check if tag in tags of interest
             if tag in target_tags:
                 found = True
@@ -180,19 +235,27 @@ def has_tags(
 
         return found
 
-    def format_tags(self) -> str:
+    def format_tags(self, tag_type: Optional[str] = None, text_type: Optional[str] = None) -> str:
         """
         Create a pretty formatted representation of text and their associated tags.
 
         Returns:
             _: Pretty formatted representation of the tags and texts.
         """
+        if tag_type is None:
+            tag_type = self.effective_tag_key
+
+        if text_type is None:
+            text_type = self.effective_text_key
+
         if self.tags is None:
             return self.text
         else:
             tag_text_length = self.str_line_length - self.str_tag_name_length
             text = ""
-            for tag_name, tag_text in self.tags:
+            for tag_data in self.tags:
+                tag_name = tag_data[tag_type]
+                tag_text = tag_data[text_type]
                 text += tag_text.ljust(tag_text_length, ".") + tag_name.rjust(self.str_tag_name_length, ".") + "\n"
 
         return text.strip()

diff --git a/melusine/pipeline.py b/melusine/pipeline.py
@@ -16,7 +16,7 @@
 from melusine.backend import backend
 from melusine.backend.base_backend import Any
 from melusine.base import MelusineTransformer
-from melusine.io import IoMixin
+from melusine.io_mixin import IoMixin
 
 T = TypeVar("T")