Skip to content

Commit

Permalink
Merge pull request #186 from MAIF/update_content_tagger
Browse files Browse the repository at this point in the history
✨ Update Melusine Content Tagger
  • Loading branch information
HugoPerrier authored Dec 16, 2024
2 parents 18ab6e6 + 8f3d702 commit c3a6cc2
Show file tree
Hide file tree
Showing 26 changed files with 1,533 additions and 1,102 deletions.
4 changes: 0 additions & 4 deletions docs/tutorials/08_MelusineRegex.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ from melusine.base import MelusineRegex


class AnnoyingEmailsRegex(MelusineRegex):

@property
def positive(self) -> Union[str, Dict[str, str]]:
return dict(
Expand Down Expand Up @@ -65,7 +64,6 @@ from melusine.base import MelusineRegex


class AnnoyingEmailsRegex(MelusineRegex):

@property
def positive(self) -> Union[str, Dict[str, str]]:
return dict(
Expand Down Expand Up @@ -192,7 +190,6 @@ from melusine.base import MelusineRegex


class AnnoyingEmailsRegex(MelusineRegex):

@property
def positive(self) -> Union[str, Dict[str, str]]:
return dict(
Expand Down Expand Up @@ -237,7 +234,6 @@ That is where neutral regex can be of use. Whenever a neutral regex is matched, i

```python
class IfritAlertRegex(MelusineRegex):

@property
def positive(self) -> Union[str, Dict[str, str]]:
return dict(
Expand Down
36 changes: 35 additions & 1 deletion melusine/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from sklearn.base import BaseEstimator, TransformerMixin

from melusine.backend import backend
from melusine.io import IoMixin
from melusine.io_mixin import IoMixin

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -99,6 +99,23 @@ def parse_column_list(columns: str | Iterable[str]) -> list[str]:
columns = [columns]
return list(columns)

def fit(self, X: MelusineDataset, y: Any = None) -> MelusineTransformer:
    """
    Reference implementation of a fitting function.

    Nothing is learned here: the inputs are ignored and the instance is
    returned as-is.

    Parameters
    ----------
    X : The training input samples (not used by this implementation).
    y : The target values (class labels in classification, real numbers in
        regression). Not used by this implementation.

    Returns
    -------
    self : object
        Returns self.
    """
    return self

def transform(self, data: MelusineDataset) -> MelusineDataset:
"""
Transform input data.
Expand Down Expand Up @@ -196,6 +213,23 @@ def transform_methods(self) -> list[Callable]:
List of methods to be called by the transform method.
"""

def fit(self, X: MelusineDataset, y: Any = None) -> MelusineTransformer:
    """
    Reference implementation of a fitting function.

    Nothing is learned here: the inputs are ignored and the instance is
    returned as-is.

    Parameters
    ----------
    X : The training input samples (not used by this implementation).
    y : The target values (class labels in classification, real numbers in
        regression). Not used by this implementation.

    Returns
    -------
    self : object
        Returns self.
    """
    return self

def transform(self, df: MelusineDataset) -> MelusineDataset:
"""
Re-definition of super().transform() => specific detector's implementation
Expand Down
3 changes: 3 additions & 0 deletions melusine/conf/pipelines/demo_pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ demo_pipeline:
- class_name: ContentTagger
config_key: content_tagger
module: melusine.processors
- class_name: RefinedTagger
config_key: refined_tagger
module: melusine.processors
- class_name: TextExtractor
config_key: text_extractor
module: melusine.processors
Expand Down
3 changes: 3 additions & 0 deletions melusine/conf/pipelines/preprocessing_pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ preprocessing_pipeline:
- class_name: ContentTagger
config_key: content_tagger
module: melusine.processors
- class_name: RefinedTagger
config_key: refined_tagger
module: melusine.processors
- class_name: TransferredEmailProcessor
config_key: transferred_email_processor
module: melusine.processors
Expand Down
2 changes: 2 additions & 0 deletions melusine/conf/processors/refined_tagger.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
refined_tagger:
default_tag: BODY
22 changes: 4 additions & 18 deletions melusine/detectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""

from typing import Any, Dict, List, Tuple
from typing import Any, Dict, List

from melusine.base import MelusineDetector, MelusineItem, MelusineRegex
from melusine.message import Message
Expand Down Expand Up @@ -95,19 +95,12 @@ def pre_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineIte
target_tags={self.BODY_PART}, stop_at={self.GREETINGS_PART}
)

# Extract the THANKS part in the last message
thanks_parts: List[Tuple[str, str]] = row[self.messages_column][0].extract_parts(target_tags={self.THANKS_PART})

# Compute THANKS text
if not thanks_parts:
thanks_text: str = ""
else:
thanks_text = "\n".join(x[1] for x in thanks_parts)
# Extract the THANKS text in the last message
thanks_text = row[self.messages_column][0].extract_text(target_tags={self.THANKS_PART})

# Save debug data
if debug_mode:
debug_dict = {
self.THANKS_PARTS_COL: thanks_parts,
self.THANKS_TEXT_COL: thanks_text,
self.HAS_BODY: has_body,
}
Expand Down Expand Up @@ -236,20 +229,13 @@ def pre_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineIte
"""
# Last message body
last_message: Message = row[self.messages_column][0]
body_parts = last_message.extract_last_body()

if body_parts:
row[self.CONST_TEXT_COL_NAME] = "\n".join(text for tag, text in body_parts)
else:
row[self.CONST_TEXT_COL_NAME] = ""
row[self.CONST_TEXT_COL_NAME] = last_message.extract_text(target_tags=("BODY",), stop_at=("GREETINGS",))

# Prepare and save debug data
if debug_mode:
debug_dict: Dict[str, Any] = {
self.CONST_DEBUG_TEXT_KEY: row[self.CONST_TEXT_COL_NAME],
}
if self.messages_column:
debug_dict[self.CONST_DEBUG_PARTS_KEY] = body_parts
row[self.debug_dict_col].update(debug_dict)

return row
Expand Down
2 changes: 1 addition & 1 deletion melusine/io/__init__.py → melusine/io_mixin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
The melusine.io_mixin module includes classes for input/output data.
"""

from melusine.io._classes import IoMixin
from melusine.io_mixin._classes import IoMixin

__all__ = ["IoMixin"]
4 changes: 0 additions & 4 deletions melusine/io/_classes.py → melusine/io_mixin/_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,6 @@ class IoMixin:
Defines generic load methods.
"""

def __init__(self, **kwargs: Any):
"""Initialize attribute."""
self.json_exclude_list: list[str] = ["_func", "json_exclude_list"]

@classmethod
def from_config(
cls: type[T],
Expand Down
91 changes: 77 additions & 14 deletions melusine/message.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import re
from datetime import datetime
from typing import Iterable, List, Optional, Tuple
from typing import Any, Dict, Iterable, List, Optional

from melusine import config

Expand All @@ -29,7 +29,7 @@ def __init__(
date: Optional[datetime] = None,
text_from: str = "",
text_to: Optional[str] = None,
tags: Optional[List[Tuple[str, str]]] = None,
tags: Optional[List[Dict[str, Any]]] = None,
):
"""
Attributes initialization.
Expand Down Expand Up @@ -63,6 +63,9 @@ def __init__(
self.clean_header: str = ""
self.clean_text: str = ""

self.effective_tag_key = "base_tag"
self.effective_text_key = "base_text"

@property
def str_tag_name_length(self) -> int:
"""
Expand All @@ -84,8 +87,11 @@ def str_line_length(self) -> int:
return config["message"].get("str_line_length", self.DEFAULT_STR_LINE_LENGTH)

def extract_parts(
self, target_tags: Optional[Iterable[str]] = None, stop_at: Optional[Iterable[str]] = None
) -> List[Tuple[str, str]]:
self,
target_tags: Optional[Iterable[str]] = None,
stop_at: Optional[Iterable[str]] = None,
tag_type: Optional[str] = None,
) -> List[Dict[str, Any]]:
"""
Function to extract target tags from the message.
Expand All @@ -95,17 +101,21 @@ def extract_parts(
Tags to be extracted.
stop_at:
Tags for which extraction should stop.
tag_type:
Type of tags to consider.
Returns
-------
_: List[Tuple[str, str]]
List of extracted tags.
_: List of extracted tags.
"""
if not self.tags:
return []

if tag_type is None:
tag_type = self.effective_tag_key

# List of tags in the message
tag_name_list: List[str] = [x[0] for x in self.tags]
tag_name_list: List[str] = [x[tag_type] for x in self.tags]

if target_tags is None:
target_tags = tag_name_list
Expand All @@ -122,29 +132,67 @@ def extract_parts(
else:
effective_tags = self.tags

return [x for x in effective_tags if x[0] in target_tags]
return [x for x in effective_tags if x[tag_type] in target_tags]

def extract_text(
    self,
    target_tags: Optional[Iterable[str]] = None,
    stop_at: Optional[Iterable[str]] = None,
    tag_type: Optional[str] = None,
    text_type: Optional[str] = None,
    separator: str = "\n",
) -> str:
    """
    Extract the text of the message parts matching the target tags.

    The parts are selected with `extract_parts`, then the text of each
    selected part is joined into a single string.

    Parameters
    ----------
    target_tags:
        Tags to be extracted (forwarded to `extract_parts`).
    stop_at:
        Tags for which extraction should stop (forwarded to `extract_parts`).
    tag_type:
        Type of tags to consider (forwarded to `extract_parts`).
    text_type:
        Type of text to consider, i.e. the key read from each extracted part.
        Defaults to `self.effective_text_key` when None.
    separator:
        Separator to join the extracted texts.

    Returns
    -------
    _: Texts of the extracted parts joined with `separator`
       (an empty string when no part is extracted).
    """
    if text_type is None:
        text_type = self.effective_text_key

    parts = self.extract_parts(target_tags=target_tags, stop_at=stop_at, tag_type=tag_type)
    return separator.join(part[text_type] for part in parts)

def extract_last_body(
self, target_tags: Iterable[str] = ("BODY",), stop_at: Iterable[str] = ("GREETINGS",)
) -> List[Tuple[str, str]]:
self,
target_tags: Iterable[str] = ("BODY",),
stop_at: Iterable[str] = ("GREETINGS",),
tag_type: Optional[str] = None,
) -> List[Dict[str, Any]]:
"""
Extract the BODY parts of the last message in the email.
Parameters
----------
target_tags: Iterable[str]
stop_at: Iterable[str]
tag_type: Type of tags to consider.
Returns
-------
_: List[Dict[str, Any]]
"""
return self.extract_parts(target_tags=target_tags, stop_at=stop_at)
return self.extract_parts(target_tags=target_tags, stop_at=stop_at, tag_type=tag_type)

def has_tags(
self,
target_tags: Iterable[str] = ("BODY",),
stop_at: Optional[Iterable[str]] = None,
tag_type: Optional[str] = None,
) -> bool:
"""
Function to check if input tags are present in the message.
Expand All @@ -155,6 +203,8 @@ def has_tags(
Tags of interest.
stop_at:
Tags for which extraction should stop.
tag_type:
Type of tags to consider.
Returns
-------
Expand All @@ -164,11 +214,16 @@ def has_tags(
if self.tags is None:
return False

if tag_type is None:
tag_type = self.effective_tag_key

if not stop_at:
stop_at = set()

found: bool = False
for tag, _ in self.tags:
for tag_data in self.tags:
tag = tag_data[tag_type]

# Check if tag in tags of interest
if tag in target_tags:
found = True
Expand All @@ -180,19 +235,27 @@ def has_tags(

return found

def format_tags(self) -> str:
def format_tags(self, tag_type: Optional[str] = None, text_type: Optional[str] = None) -> str:
"""
Create a pretty formatted representation of text and their associated tags.
Returns:
_: Pretty formatted representation of the tags and texts.
"""
if tag_type is None:
tag_type = self.effective_tag_key

if text_type is None:
text_type = self.effective_text_key

if self.tags is None:
return self.text
else:
tag_text_length = self.str_line_length - self.str_tag_name_length
text = ""
for tag_name, tag_text in self.tags:
for tag_data in self.tags:
tag_name = tag_data[tag_type]
tag_text = tag_data[text_type]
text += tag_text.ljust(tag_text_length, ".") + tag_name.rjust(self.str_tag_name_length, ".") + "\n"

return text.strip()
Expand Down
2 changes: 1 addition & 1 deletion melusine/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from melusine.backend import backend
from melusine.backend.base_backend import Any
from melusine.base import MelusineTransformer
from melusine.io import IoMixin
from melusine.io_mixin import IoMixin

T = TypeVar("T")

Expand Down
Loading

0 comments on commit c3a6cc2

Please sign in to comment.