From 61fffa91ccd06c4d63f553146d05db35a1997efc Mon Sep 17 00:00:00 2001 From: MathisNcl Date: Thu, 2 May 2024 18:23:29 +0200 Subject: [PATCH 1/9] :sparkles: introduce GmailConnector to interact with mailbox (WIP) Signed-off-by: MathisNcl --- AUTHORS.rst | 1 + melusine/connectors/gmail.py | 323 +++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + 3 files changed, 325 insertions(+) create mode 100644 melusine/connectors/gmail.py diff --git a/AUTHORS.rst b/AUTHORS.rst index 1563dc3..a6672e8 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -20,5 +20,6 @@ Contributors * Sacha Samama * Antoine Simoulin * Tom Stringer +* Mathis Nicoli To be continued ... diff --git a/melusine/connectors/gmail.py b/melusine/connectors/gmail.py new file mode 100644 index 0000000..f7e8bcd --- /dev/null +++ b/melusine/connectors/gmail.py @@ -0,0 +1,323 @@ +import base64 +import logging +import mimetypes +import os +from email import message, policy +from email.parser import BytesParser +from typing import Any, Dict, List, Optional, Union + +import pandas as pd +from google.auth.transport.requests import Request +from google.oauth2.credentials import Credentials +from google_auth_oauthlib.flow import InstalledAppFlow +from googleapiclient.discovery import build +from tqdm import tqdm + +logger = logging.getLogger(__name__) + + +SCOPES: List[str] = ["https://www.googleapis.com/auth/gmail.readonly", "https://www.googleapis.com/auth/gmail.modify"] + + +class GmailConnector: + """ + Connector to Gmail Mailboxs. + This class contains methods suited for automated emails routing. 
+ """ + + def __init__( + self, + token_json_path: Optional[str] = None, + routing_label: Optional[str] = None, + correction_label: Optional[str] = None, + done_label: Optional[str] = None, + target_column: str = "target", + ): + self.target_column = target_column + + # Connect to mailbox + self.credentials: Credentials = self.get_credentials(token_json_path=token_json_path) + self.service = build("gmail", "v1", credentials=self.credentials) + self.labels: List[Dict[str, str]] = self._get_labels() + + # Setup correction folder and done folder + self.routing_label: Optional[str] = self._check_or_create_label(routing_label) + self.correction_label: Optional[str] = self._check_or_create_label(correction_label) + self.done_label: Optional[str] = self._check_or_create_label(done_label) + + self.mailbox_address = self.service.users().getProfile(userId="me").execute()["emailAddress"] + logger.info(f"Connected to mailbox: {self.mailbox_address}.") + + def __repr__(self) -> str: + return ( + f"GmailConnector(routing_label={self.routing_label}, correction_label={self.correction_label}," + + f" done_label={self.done_label}), connected to {self.mailbox_address}" + ) + + @staticmethod + def get_credentials(token_json_path: Optional[str] = None): + """TODO + + Args: + token_json_path (Optional[str], optional): _description_. Defaults to None. + + Returns: + _type_: _description_ + """ + if token_json_path is not None and os.path.exists(token_json_path): + creds: Credentials = Credentials.from_authorized_user_file("token.json", SCOPES) + if creds.valid is False: + creds.refresh(Request()) + return creds + + flow = InstalledAppFlow.from_client_secrets_file( + # your creds file here. 
Please create json file as here https://cloud.google.com/docs/authentication/getting-started + "credentials.json", + SCOPES, + ) + creds = flow.run_local_server(port=0) + # Save the credentials for the next run + with open("token.json", "w") as token: + token.write(creds.to_json()) + logger.info(f"gmail token.json saved at: {os.getcwd()}") + + def _get_labels(self) -> List[Dict]: + """Retrieve all current labels in mailbox + + Returns: + List[Dict]: List of labels dict + """ + labels = self.service.users().labels().list(userId="me").execute()["labels"] + return labels + + def _check_or_create_label(self, label_name: Optional[str] = None) -> Optional[str]: + """Check label existance, if not, ask to create it. + + Args: + label_name (Optional[str], optional): The label name to check. Defaults to None. + + Returns: + Optional[str]: The label name + """ + if label_name is None: + return None + + all_labels_upper: List[str] = [label["name"].upper() for label in self.labels] + if label_name.upper() in all_labels_upper: + return label_name + + logger.warning( + f"Label {label_name} does not exist in current labels list: {all_labels_upper}.\n" + + "Would you like to create it? 
(Y/N)" + ) + choice: str = input() + if "Y" in choice.upper(): + result: Dict[str, str] = self.create_label(label_name) + self.labels.append(result) + return label_name + raise ValueError(f"Label {label_name} does not exist.") + + def create_label(self, label_name: str) -> Dict[str, str]: + """Create a label into connected mailbox + + Args: + label_name (str): name of the new label + + Returns: + Dict[str, str]: return from the api with label and its informations + """ + label = self.service.users().labels().create(userId="me", body=dict(name=label_name)).execute() + logger.info(f"Label {label_name} has been created.") + return label + + def extract_from_parsed_mail(self, parsed_email: message.Message) -> Dict[str, Any]: + """Extract body and attachments from the parsed email + + Args: + parsed_email (message.Message): Message object containg all the email and its informations + + Returns: + Dict[str, Any]: `body` key and `attachments_list` key with value inside the parsed email + """ + body: str = "" + if parsed_email.is_multipart(): + for part in parsed_email.walk(): + content_type = part.get_content_type() + if "text/plain" in content_type: + bytes_string = part.get_payload(decode=True) + charset = part.get_content_charset("iso-8859-1") + body += bytes_string.decode(charset, "replace") + else: + bytes_string = parsed_email.get_payload(decode=True) + charset = parsed_email.get_content_charset("iso-8859-1") + body += bytes_string.decode(charset, "replace") + + attachments_list: List[Dict] = [] + for part in parsed_email.iter_attachments(): # type: ignore + attachments_list.append( + { + "filename": part.get_filename(), + "type": part.get_content_type(), + "data": part.get_payload(decode=True), + } + ) + return {"body": body, "attachments_list": attachments_list} + + def _extract_email_attributes(self, message_id: str) -> Dict: + """Return formatted attributes for the considered email + + Args: + message_id (str): id of the mail to consider + + Returns: + Dict: 
formatted output of the email + """ + msg_raw: Dict[str, Any] = ( + self.service.users().messages().get(id=message_id, userId="me", format="raw").execute() + ) + parsed_email: message.Message = BytesParser(policy=policy.default).parsebytes( + base64.urlsafe_b64decode(msg_raw["raw"]) + ) + + infos: Dict[str, Any] = self.extract_from_parsed_mail(parsed_email) + email_dict = { + "message_id": message_id, + "body": infos["body"], + "header": parsed_email["Subject"], + "date": parsed_email["Date"], + "from": parsed_email["From"], + "to": parsed_email["To"], + "attachment": infos["attachments_list"], + } + return email_dict + + def get_emails( + self, + max_emails: int = 100, + target_labels: Optional[List[str]] = None, + start_date: Optional[str] = None, + end_date: Optional[str] = None, + ) -> pd.DataFrame: + """TODO + + Args: + max_emails (int, optional): _description_. Defaults to 100. + target_labels (List[str], optional): _description_. Defaults to None. + start_date (Optional[str], optional): _description_. Defaults to None. + end_date (Optional[str], optional): _description_. Defaults to None. 
+ + Returns: + pd.DataFrame: _description_ + """ + logger.info("Reading new emails for mailbox") + if target_labels is None: + target_labels = ["INBOX"] + target_label_id: List[str] = [item["id"] for item in self.labels if item["name"] in target_labels] + q = "" + if start_date is not None: + q += f"after:{start_date} " + + if end_date is not None: + q += f"before:{end_date}" + + all_new_data = ( + self.service.users() + .messages() + .list(userId="me", maxResults=max_emails, labelIds=target_label_id, q=q) + .execute() + ) + + logger.info("Please wait while loading messages") + new_emails: List[Dict] = [self._extract_email_attributes(x["id"]) for x in tqdm(all_new_data["messages"])] + df_new_emails = pd.DataFrame(new_emails) + + logger.info(f"Read '{len(new_emails)}' new emails") + return df_new_emails + + def _move_to( + self, + emails_id: List[str], + label_to_move_on: Optional[str], + attribute_class_to_set_error: str, + func_name_error: str, + ): + """TODO + + Args: + emails_id (List[str]): _description_ + label_to_move_on (Optional[str]): _description_ + attribute_class_to_set_error (str): _description_ + func_name_error (str): _description_ + + Raises: + AttributeError: _description_ + """ + if label_to_move_on is None: + raise AttributeError( + f"You need to set the class attribute `{attribute_class_to_set_error}` to use `{func_name_error}`." 
+ ) + label_id = next((label["id"] for label in self.labels if label["name"] == label_to_move_on), None) + body = {"addLabelIds": [label_id]} + for email_id in emails_id: + self.service.users().messages().modify(id=email_id, userId="me", body=body).execute() + logger.info(f"Moved {len(emails_id)} emails to {label_to_move_on} label.") + + def move_to_done(self, emails_id: List[str]) -> None: + """Move emails to done label + + Args: + emails_id (List[str]): List of emails id to move to done label + """ + self._move_to(emails_id, self.done_label, "done_label", "move_to_done") + + def route_emails( + self, + classified_emails: pd.DataFrame, + id_column: str = "message_id", + ) -> None: + """Function to route emails to mailbox folders. + + Args: + classified_emails (pd.DataFrame): DataFrame containing emails message_id and target folder + id_column (str, optional): Name of the DataFrame column containing message ids. Defaults to "message_id". + """ + target_column = self.target_column + target_labels = classified_emails[target_column].unique().tolist() + + for label in target_labels: + mask = classified_emails[target_column] == label + mids_to_move = classified_emails[mask][id_column] + self._move_to(mids_to_move, label, label, "route_emails") + logger.info(f"Moving {mids_to_move.size} emails to folder '{label}'") + + def send_email(self, to: Union[str, List[str]], header: str, body: str, attachments: dict) -> None: + """This method sends an email from the login address (attribute login_address). + + Args: + to (Union[str, List[str]]): Address or list of addresses of email recipients + header (str): Email header + body (str): Email body + attachments (dict): Dict containing attachment names as key and attachment file contents as values. 
+ Currently, the code is tested for DataFrame attachments only + """ + if isinstance(to, str): + to = [to] + + m = message.EmailMessage() + m.set_content(body) + + m["To"] = to + m["Subject"] = header + + if attachments: + for filename, value in attachments.items(): + type_subtype, _ = mimetypes.guess_type(filename) + if type_subtype is not None: + maintype, subtype = type_subtype.split("/") + m.add_attachment(value, filename=filename, maintype=maintype, subtype=subtype) + + # encoded message + encoded_message = base64.urlsafe_b64encode(m.as_bytes()).decode() + create_message = {"raw": encoded_message} + send_message = self.service.users().messages().send(userId="me", body=create_message).execute() + logger.info(f"Email sent to {to}, Message Id: {send_message['id']}") diff --git a/pyproject.toml b/pyproject.toml index a581dc6..3120735 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ dynamic = ["version"] dev = ["tox", "pre-commit", "black", "flake8", "isort", "mypy", "pytest", "coverage", "build", "ruff"] test = ["pytest", "coverage"] transformers = ["transformers>4"] +connectors = ["exchangelib", "google-auth-oauthlib", "google-api-python-client"] docs = ["mkdocs", "markdown", "mkdocs-material", "mdx-include"] [tool.setuptools.packages.find] From 2ca9b6553748f0e4686fae9335580a8ee03ce1bf Mon Sep 17 00:00:00 2001 From: MathisNcl Date: Fri, 3 May 2024 09:59:27 +0200 Subject: [PATCH 2/9] :art: enhance GmailConnector and add docstring Signed-off-by: MathisNcl --- melusine/connectors/gmail.py | 140 +++++++++++++++++++++-------------- 1 file changed, 85 insertions(+), 55 deletions(-) diff --git a/melusine/connectors/gmail.py b/melusine/connectors/gmail.py index f7e8bcd..afc8c4d 100644 --- a/melusine/connectors/gmail.py +++ b/melusine/connectors/gmail.py @@ -16,70 +16,94 @@ logger = logging.getLogger(__name__) -SCOPES: List[str] = ["https://www.googleapis.com/auth/gmail.readonly", "https://www.googleapis.com/auth/gmail.modify"] - - class GmailConnector: 
""" Connector to Gmail Mailboxs. This class contains methods suited for automated emails routing. + A credentials.json file is needed to sign in to google. To do so, follow these steps : + https://medium.com/@preetipriyanka24/how-to-read-emails-from-gmail-using-gmail-api-in-python-20f7d9d09ae9 + Please do not forget to add the mail address to the list of allowed tester if the credentials stays in testing + + First use can only be: + ```python + # Assuming credentials.json is at root level + gc = GmailConnector() + # A pop up window will ask you to connect then the next connection will be: + gc = GmailConnector("token.json") + ``` """ + SCOPES: List[str] = [ + "https://www.googleapis.com/auth/gmail.readonly", + "https://www.googleapis.com/auth/gmail.modify", + ] + def __init__( self, token_json_path: Optional[str] = None, - routing_label: Optional[str] = None, - correction_label: Optional[str] = None, done_label: Optional[str] = None, target_column: str = "target", ): - self.target_column = target_column + """ + Args: + token_json_path (Optional[str], optional): `token.json` file path created after the first connection using + `credentials.json`. If None, looking for `credentials.json` at root and sign in. Defaults to None. + done_label (Optional[str], optional): Label name for the done situation. Defaults to None. + target_column (str, optional): Name of the DataFrame column containing target label. Defaults to "target". 
+ """ + self.target_column: str = target_column # Connect to mailbox self.credentials: Credentials = self.get_credentials(token_json_path=token_json_path) - self.service = build("gmail", "v1", credentials=self.credentials) + self.service: Any = build("gmail", "v1", credentials=self.credentials) self.labels: List[Dict[str, str]] = self._get_labels() - # Setup correction folder and done folder - self.routing_label: Optional[str] = self._check_or_create_label(routing_label) - self.correction_label: Optional[str] = self._check_or_create_label(correction_label) + # Setup done label self.done_label: Optional[str] = self._check_or_create_label(done_label) self.mailbox_address = self.service.users().getProfile(userId="me").execute()["emailAddress"] logger.info(f"Connected to mailbox: {self.mailbox_address}.") def __repr__(self) -> str: + """ + Returns: + str: Reprensentation of the object + """ return ( - f"GmailConnector(routing_label={self.routing_label}, correction_label={self.correction_label}," - + f" done_label={self.done_label}), connected to {self.mailbox_address}" + f"GmailConnector(done_label={self.done_label}, target_column={self.target_column}), " + + f"connected to {self.mailbox_address}" ) - @staticmethod - def get_credentials(token_json_path: Optional[str] = None): - """TODO + def get_credentials(self, token_json_path: Optional[str] = None) -> Credentials: + """Retrieve credentials object to connect to Gmail using the `credentials.json` and generating the `token.json` + if needed at root path. + Please create json file as here https://cloud.google.com/docs/authentication/getting-started Args: - token_json_path (Optional[str], optional): _description_. Defaults to None. + token_json_path (Optional[str], optional): `token.json` file path created after the first connection using + `credentials.json`. Defaults to None. 
Returns: - _type_: _description_ + Credentials: Credentials to connect to Gmail """ if token_json_path is not None and os.path.exists(token_json_path): - creds: Credentials = Credentials.from_authorized_user_file("token.json", SCOPES) + creds: Credentials = Credentials.from_authorized_user_file("token.json", self.SCOPES) if creds.valid is False: creds.refresh(Request()) return creds - flow = InstalledAppFlow.from_client_secrets_file( - # your creds file here. Please create json file as here https://cloud.google.com/docs/authentication/getting-started + flow: InstalledAppFlow = InstalledAppFlow.from_client_secrets_file( "credentials.json", - SCOPES, + self.SCOPES, ) creds = flow.run_local_server(port=0) + # Save the credentials for the next run with open("token.json", "w") as token: token.write(creds.to_json()) + logger.info(f"gmail token.json saved at: {os.getcwd()}") + return creds def _get_labels(self) -> List[Dict]: """Retrieve all current labels in mailbox @@ -164,7 +188,14 @@ def extract_from_parsed_mail(self, parsed_email: message.Message) -> Dict[str, A return {"body": body, "attachments_list": attachments_list} def _extract_email_attributes(self, message_id: str) -> Dict: - """Return formatted attributes for the considered email + """Return formatted attributes for the considered email such as: + - `message_id` field + - `body` field + - `header` field + - `date` field + - `from` field + - `to` field + - `attachment` field Args: message_id (str): id of the mail to consider @@ -198,16 +229,25 @@ def get_emails( start_date: Optional[str] = None, end_date: Optional[str] = None, ) -> pd.DataFrame: - """TODO + """Loads emails in descending date order based on target_labels. To see all available labels, use `self.labels`. + If two label names are defined, retrieve all emails with both labels, e.g. ["TRASH", "INBOX"] will retrieve none + These labels cannot be present simultaneously. 
+ + For example, to get first 10 mails received in inbox and unreaded: + ```python + gc = GmailConnector("token.json", done_label="test") + df = gc.get_emails(10, target_labels=["INBOX", "UNREAD"]) + df + ``` Args: - max_emails (int, optional): _description_. Defaults to 100. - target_labels (List[str], optional): _description_. Defaults to None. - start_date (Optional[str], optional): _description_. Defaults to None. - end_date (Optional[str], optional): _description_. Defaults to None. + max_emails (int, optional): Maximum number of emails to load. Defaults to 100. + target_labels (List[str], optional): Label names list to fetch. Defaults to None. If None, fetch INBOX. + start_date (Optional[str], optional): Filter date start, format YYYY/MM/DD. Defaults to None. + end_date (Optional[str], optional): Filter date end, format YYYY/MM/DD. Defaults to None. Returns: - pd.DataFrame: _description_ + pd.DataFrame: DataFrame containing emails """ logger.info("Reading new emails for mailbox") if target_labels is None: @@ -234,32 +274,24 @@ def get_emails( logger.info(f"Read '{len(new_emails)}' new emails") return df_new_emails - def _move_to( - self, - emails_id: List[str], - label_to_move_on: Optional[str], - attribute_class_to_set_error: str, - func_name_error: str, - ): - """TODO + def move_to(self, emails_id: List[str], label_to_move_on: str) -> None: + """Generic method to move emails to a specified label Args: - emails_id (List[str]): _description_ - label_to_move_on (Optional[str]): _description_ - attribute_class_to_set_error (str): _description_ - func_name_error (str): _description_ + emails_id (List[str]): List of emails id to set the label + label_to_move_on (str): Label name to set - Raises: - AttributeError: _description_ """ - if label_to_move_on is None: - raise AttributeError( - f"You need to set the class attribute `{attribute_class_to_set_error}` to use `{func_name_error}`." 
+ label_id: Optional[str] = next( + (label["id"] for label in self.labels if label["name"] == label_to_move_on), None + ) + if label_id is None: + raise ValueError( + f"Label '{label_to_move_on}' does not exist in self.labels. Make sure to specified a right label name." ) - label_id = next((label["id"] for label in self.labels if label["name"] == label_to_move_on), None) - body = {"addLabelIds": [label_id]} for email_id in emails_id: - self.service.users().messages().modify(id=email_id, userId="me", body=body).execute() + self.service.users().messages().modify(id=email_id, userId="me", body={"addLabelIds": [label_id]}).execute() + logger.info(f"Moved {len(emails_id)} emails to {label_to_move_on} label.") def move_to_done(self, emails_id: List[str]) -> None: @@ -268,13 +300,11 @@ def move_to_done(self, emails_id: List[str]) -> None: Args: emails_id (List[str]): List of emails id to move to done label """ - self._move_to(emails_id, self.done_label, "done_label", "move_to_done") + if self.done_label is None: + raise AttributeError("You need to set the class attribute `done_label` to use `move_to_done`.") + self.move_to(emails_id, self.done_label) - def route_emails( - self, - classified_emails: pd.DataFrame, - id_column: str = "message_id", - ) -> None: + def route_emails(self, classified_emails: pd.DataFrame, id_column: str = "message_id") -> None: """Function to route emails to mailbox folders. 
Args: @@ -287,7 +317,7 @@ def route_emails( for label in target_labels: mask = classified_emails[target_column] == label mids_to_move = classified_emails[mask][id_column] - self._move_to(mids_to_move, label, label, "route_emails") + self.move_to(mids_to_move, label) logger.info(f"Moving {mids_to_move.size} emails to folder '{label}'") def send_email(self, to: Union[str, List[str]], header: str, body: str, attachments: dict) -> None: From ab8246e6d8cf39a135b254fe1fded89318e291ec Mon Sep 17 00:00:00 2001 From: MathisNcl Date: Fri, 3 May 2024 16:08:32 +0200 Subject: [PATCH 3/9] :white_check_mark: add GmailConnector unit tests Signed-off-by: MathisNcl --- docs/advanced/GmailConnector.md | 1 + melusine/connectors/gmail.py | 14 +- pyproject.toml | 2 +- tests/conftest.py | 2 + tests/connectors/__init__.py | 0 tests/connectors/test_gmail.py | 256 ++++++++++++++++++++++++++++++++ tests/fixtures/basic_emails.py | 42 ++++++ tests/fixtures/connectors.py | 55 +++++++ 8 files changed, 365 insertions(+), 7 deletions(-) create mode 100644 docs/advanced/GmailConnector.md create mode 100644 tests/connectors/__init__.py create mode 100644 tests/connectors/test_gmail.py create mode 100644 tests/fixtures/connectors.py diff --git a/docs/advanced/GmailConnector.md b/docs/advanced/GmailConnector.md new file mode 100644 index 0000000..e996e79 --- /dev/null +++ b/docs/advanced/GmailConnector.md @@ -0,0 +1 @@ +# Connect melusine to a Gmail Mailbox diff --git a/melusine/connectors/gmail.py b/melusine/connectors/gmail.py index afc8c4d..a386a33 100644 --- a/melusine/connectors/gmail.py +++ b/melusine/connectors/gmail.py @@ -87,7 +87,7 @@ def get_credentials(self, token_json_path: Optional[str] = None) -> Credentials: Credentials: Credentials to connect to Gmail """ if token_json_path is not None and os.path.exists(token_json_path): - creds: Credentials = Credentials.from_authorized_user_file("token.json", self.SCOPES) + creds: Credentials = Credentials.from_authorized_user_file(token_json_path, 
self.SCOPES) if creds.valid is False: creds.refresh(Request()) return creds @@ -154,7 +154,8 @@ def create_label(self, label_name: str) -> Dict[str, str]: logger.info(f"Label {label_name} has been created.") return label - def extract_from_parsed_mail(self, parsed_email: message.Message) -> Dict[str, Any]: + @staticmethod + def extract_from_parsed_mail(parsed_email: message.Message) -> Dict[str, Any]: """Extract body and attachments from the parsed email Args: @@ -318,18 +319,19 @@ def route_emails(self, classified_emails: pd.DataFrame, id_column: str = "messag mask = classified_emails[target_column] == label mids_to_move = classified_emails[mask][id_column] self.move_to(mids_to_move, label) - logger.info(f"Moving {mids_to_move.size} emails to folder '{label}'") + logger.info(f"Moving {mids_to_move.size} emails to label '{label}'") - def send_email(self, to: Union[str, List[str]], header: str, body: str, attachments: dict) -> None: + def send_email(self, to: Union[str, List[str]], header: str, body: str, attachments: Optional[Dict] = None) -> None: """This method sends an email from the login address (attribute login_address). Args: to (Union[str, List[str]]): Address or list of addresses of email recipients header (str): Email header body (str): Email body - attachments (dict): Dict containing attachment names as key and attachment file contents as values. - Currently, the code is tested for DataFrame attachments only + attachments (Optional[Dict], optional): Dict containing attachment names as key and attachment + file contents as values. Defaults to None. 
""" + if isinstance(to, str): to = [to] diff --git a/pyproject.toml b/pyproject.toml index 3120735..443f81c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ dynamic = ["version"] [project.optional-dependencies] # Optional dev = ["tox", "pre-commit", "black", "flake8", "isort", "mypy", "pytest", "coverage", "build", "ruff"] -test = ["pytest", "coverage"] +test = ["pytest", "coverage", "pytest-cov"] transformers = ["transformers>4"] connectors = ["exchangelib", "google-auth-oauthlib", "google-api-python-client"] docs = ["mkdocs", "markdown", "mkdocs-material", "mdx-include"] diff --git a/tests/conftest.py b/tests/conftest.py index 5dd461e..678606f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,7 @@ """ Setup tests and import fixtures """ + import numpy as np import pytest @@ -10,6 +11,7 @@ pytest_plugins = [ "tests.fixtures.backend", "tests.fixtures.basic_emails", + "tests.fixtures.connectors", "tests.fixtures.docs", "tests.fixtures.pipelines", "tests.fixtures.processors", diff --git a/tests/connectors/__init__.py b/tests/connectors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/connectors/test_gmail.py b/tests/connectors/test_gmail.py new file mode 100644 index 0000000..d7de370 --- /dev/null +++ b/tests/connectors/test_gmail.py @@ -0,0 +1,256 @@ +import base64 +import logging +import os +from unittest.mock import MagicMock, patch + +import pandas as pd +import pytest +from google.oauth2.credentials import Credentials +from googleapiclient.http import HttpRequestMock + +from melusine.connectors.gmail import GmailConnector + + +def return_value(resp, content): + return content + + +@patch("melusine.connectors.gmail.build") +@patch("melusine.connectors.gmail.Credentials.from_authorized_user_file") +@patch("melusine.connectors.gmail.os.path.exists") +def test_init(mock_exists, mock_creds_from_file, mock_build, caplog): + # Mocking necessary objects and methods + mock_exists.return_value = True + 
mock_service = MagicMock() + mock_service.users().getProfile.return_value = HttpRequestMock( + None, {"emailAddress": "test@example.com"}, return_value + ) + mock_service.users().labels().list.return_value = HttpRequestMock( + None, + { + "labels": [ + {"id": "INBOX", "name": "INBOX", "type": "system"}, + { + "id": "TRASH", + "name": "TRASH", + "messageListVisibility": "hide", + "labelListVisibility": "labelHide", + "type": "system", + }, + {"id": "UNREAD", "name": "UNREAD", "type": "system"}, + ] + }, + return_value, + ) + mock_build.return_value = mock_service + mock_creds_from_file.return_value = Credentials("dummy") + + # Creating an instance of GmailConnector + with caplog.at_level(logging.DEBUG): + gc = GmailConnector(token_json_path="token.json", done_label="TRASH", target_column="target") + + # Assertions + assert len(gc.labels) == 3 + assert gc.done_label == "TRASH" + assert gc.mailbox_address == "test@example.com" + assert gc.target_column == "target" + + assert "Connected to mailbox:" in caplog.text + + assert str(gc) == "GmailConnector(done_label=TRASH, target_column=target), connected to test@example.com" + + +@patch("melusine.connectors.gmail.build") +@patch("melusine.connectors.gmail.InstalledAppFlow.from_client_secrets_file") +def test_init_without_creds(mock_flow, mock_build, caplog): + # Mocking necessary objects and methods + mock_service = MagicMock() + mock_service.users().getProfile.return_value = HttpRequestMock( + None, {"emailAddress": "test@example.com"}, return_value + ) + mock_service.users().labels().list.return_value = HttpRequestMock( + None, + { + "labels": [ + {"id": "INBOX", "name": "INBOX", "type": "system"}, + { + "id": "TRASH", + "name": "TRASH", + "messageListVisibility": "hide", + "labelListVisibility": "labelHide", + "type": "system", + }, + {"id": "UNREAD", "name": "UNREAD", "type": "system"}, + ] + }, + return_value, + ) + mock_build.return_value = mock_service + mock_flow.return_value.run_local_server.return_value = 
Credentials("dummy") + + # Creating an instance of GmailConnector + with caplog.at_level(logging.DEBUG): + gc = GmailConnector() + + # Assertions + assert len(gc.labels) == 3 + assert gc.done_label is None + assert gc.mailbox_address == "test@example.com" + assert gc.target_column == "target" + assert os.path.exists("token.json") + os.remove("token.json") + assert "gmail token.json saved at:" in caplog.text + assert "Connected to mailbox:" in caplog.text + + assert str(gc) == "GmailConnector(done_label=None, target_column=target), connected to test@example.com" + + +def test_gc_get_emails(mocked_gc, simple_email_raw, caplog): + mocked_gc.service.users().messages().list.return_value = HttpRequestMock( + None, {"messages": [{"id": "123"}]}, return_value + ) + mocked_gc.service.users().messages().get.return_value = HttpRequestMock( + None, + { + "id": "123", + "labelIds": ["INBOX"], + "snippet": "Did it worked?", + "sizeEstimate": 45200, + "raw": simple_email_raw, + }, + return_value, + ) + with caplog.at_level(logging.DEBUG): + df = mocked_gc.get_emails(1, None, "2024/01/01", "2024/05/03") + + assert "Please wait while loading messages" in caplog.text + assert "Read '1' new emails" in caplog.text + + assert isinstance(df, pd.DataFrame) + assert len(df) == 1 + assert df.iloc[0].to_dict() == { + "message_id": "123", + "body": "This body should appeared!\n", + "header": "Did it worked?", + "date": "Thu, 02 May 2024 08:53:35 -0700", + "from": "sender@example.com", + "to": "recipient@example.com", + "attachment": [], + } + + +def test_gc_get_emails_complex_mail(mocked_gc, complex_email_raw, caplog): + mocked_gc.service.users().messages().list.return_value = HttpRequestMock( + None, {"messages": [{"id": "123"}]}, return_value + ) + mocked_gc.service.users().messages().get.return_value = HttpRequestMock( + None, + { + "id": "123", + "labelIds": ["INBOX"], + "snippet": "Did it worked?", + "sizeEstimate": 45200, + "raw": complex_email_raw, + }, + return_value, + ) + with 
caplog.at_level(logging.DEBUG): + df = mocked_gc.get_emails(1, None, "2024/01/01", "2024/05/03") + + assert "Please wait while loading messages" in caplog.text + assert "Read '1' new emails" in caplog.text + + assert isinstance(df, pd.DataFrame) + assert len(df) == 1 + assert df.iloc[0].to_dict() == { + "message_id": "123", + "body": "This is the body of the email.", + "header": "Fake multipart email", + "date": "Thu, 02 May 2024 08:53:35 -0700", + "from": "sender@example.com", + "to": "recipient@example.com", + "attachment": [{"filename": "attachment.txt", "type": "application/octet-stream", "data": b"dummy text"}], + } + + +@patch("builtins.input", side_effect=["y", "n"]) +def test_gc_check_or_create_label(mock_input, mocked_gc, caplog): + mocked_gc.service.users().labels().create.return_value = HttpRequestMock( + None, + { + "id": "Label_3", + "name": "test", + "messageListVisibility": "show", + "labelListVisibility": "labelShow", + "type": "user", + }, + return_value, + ) + assert len([label for label in mocked_gc.labels if label["name"] == "test"]) == 0 + with caplog.at_level(logging.DEBUG): + label = mocked_gc._check_or_create_label("test") + assert "does not exist in current labels list" in caplog.text + + assert label == "test" + assert len([label for label in mocked_gc.labels if label["name"] == "test"]) == 1 + assert "Label test has been created" in caplog.text + + # With input to "n" + with pytest.raises(ValueError, match="Label test2 does not exist."): + mocked_gc._check_or_create_label("test2") + + +def test_gc_move_to_done(mocked_gc, caplog): + mocked_gc.service.users().messages().modify.return_value = HttpRequestMock(None, {}, return_value) + with caplog.at_level(logging.DEBUG): + mocked_gc.move_to_done(["dummy_id"]) + + assert "Moved 1 emails to TRASH label." 
in caplog.text + + mocked_gc.done_label = None + with pytest.raises(AttributeError, match="You need to set the class attribute `done_label` to use `move_to_done`."): + mocked_gc.move_to_done(["dummy_id"]) + + +def test_gc_move_to_error(mocked_gc, caplog): + with pytest.raises( + ValueError, + match="Label 'not_existing_label' does not exist in self.labels. Make sure to specified a right label name.", + ): + mocked_gc.move_to(["dummy_id"], "not_existing_label") + + +def test_gc_route_emails(mocked_gc, caplog): + mocked_gc.service.users().messages().modify.return_value = HttpRequestMock(None, {}, return_value) + + df = pd.DataFrame( + { + "message_id": ["123", "456"], + "body": ["Body1", "Body2"], + "header": ["Header1", "Header2"], + "date": ["Thu, 02 May 2024 08:53:35 -0700", "Thu, 02 May 2024 10:00:00 -0700"], + "from": ["sender2@example.com", "sender2@example.com"], + "to": ["recipient@example.com", "recipient@example.com"], + "attachment": [[], []], + "target": ["TRASH", "UNREAD"], + } + ) + with caplog.at_level(logging.DEBUG): + mocked_gc.route_emails(df) + + assert "Moving 1 emails to label 'TRASH'" in caplog.text + assert "Moving 1 emails to label 'UNREAD'" in caplog.text + + +def test_gc_send_email(mocked_gc, fake_image, caplog): + mocked_gc.service.users().messages().send.return_value = HttpRequestMock(None, {"id": "12456"}, return_value) + + with caplog.at_level(logging.DEBUG): + mocked_gc.send_email( + "melusine_testing.yopmail.com", + "Testing Header", + "Testing Body", + {"attachment.jpg": fake_image}, + ) + + assert "Email sent to melusine_testing@yopmail.com, Message Id: 12456" diff --git a/tests/fixtures/basic_emails.py b/tests/fixtures/basic_emails.py index 0ad361c..953128a 100644 --- a/tests/fixtures/basic_emails.py +++ b/tests/fixtures/basic_emails.py @@ -1,3 +1,10 @@ +import base64 +from email.message import EmailMessage +from email.mime.application import MIMEApplication +from email.mime.multipart import MIMEMultipart +from email.mime.text 
import MIMEText +from email.policy import default + import pandas as pd import pytest @@ -73,3 +80,38 @@ def email_dataframe_basic(email_basic_hello_world, email_basic_with_accent): email_basic_with_accent, ] ) + + +@pytest.fixture +def simple_email_raw(): + email_message = EmailMessage() + email_message["From"] = "sender@example.com" + email_message["To"] = "recipient@example.com" + email_message["Subject"] = "Did it worked?" + email_message["Date"] = "Thu, 02 May 2024 08:53:35 -0700" + email_message.set_content("This body should appeared!") + + email_bytes = email_message.as_bytes(policy=default) + + return base64.urlsafe_b64encode(email_bytes) + + +@pytest.fixture +def complex_email_raw(): + msg = MIMEMultipart() + msg["From"] = "sender@example.com" + msg["To"] = "recipient@example.com" + msg["Subject"] = "Fake multipart email" + msg["Date"] = "Thu, 02 May 2024 08:53:35 -0700" + + text_part = MIMEText("This is the body of the email.") + msg.attach(text_part) + + # Add attachment + attachment_part = MIMEApplication("dummy text", Name="attachment.txt") + attachment_part["Content-Disposition"] = 'attachment; filename="attachment.txt"' + msg.attach(attachment_part) + + email_bytes = msg.as_bytes(policy=default) + + return base64.urlsafe_b64encode(email_bytes).decode("utf-8") diff --git a/tests/fixtures/connectors.py b/tests/fixtures/connectors.py new file mode 100644 index 0000000..c556d91 --- /dev/null +++ b/tests/fixtures/connectors.py @@ -0,0 +1,55 @@ +from unittest.mock import MagicMock, patch + +import pytest +from google.oauth2.credentials import Credentials +from googleapiclient.http import HttpRequestMock + +from melusine.connectors.gmail import GmailConnector + + +def return_value(resp, content): + return content + + +@pytest.fixture +def mocked_gc(): + with patch("melusine.connectors.gmail.build") as mock_build: + with patch("melusine.connectors.gmail.Credentials.from_authorized_user_file") as mock_creds_from_file: + with 
patch("melusine.connectors.gmail.os.path.exists") as mock_exists: + mock_exists.return_value = True + mock_service = MagicMock() + mock_service.users().getProfile.return_value = HttpRequestMock( + None, {"emailAddress": "test@example.com"}, return_value + ) + mock_service.users().labels().list.return_value = HttpRequestMock( + None, + { + "labels": [ + {"id": "INBOX", "name": "INBOX", "type": "system"}, + { + "id": "TRASH", + "name": "TRASH", + "messageListVisibility": "hide", + "labelListVisibility": "labelHide", + "type": "system", + }, + {"id": "UNREAD", "name": "UNREAD", "type": "system"}, + ] + }, + return_value, + ) + mock_build.return_value = mock_service + mock_creds_from_file.return_value = Credentials("dummy") + + return GmailConnector(token_json_path="token.json", done_label="TRASH", target_column="target") + + +@pytest.fixture +def fake_image(): + image_data = b"" + width = height = 100 + for _ in range(height): + row_data = b"\xff" * (width * 3) + image_data += row_data + + return image_data From caf8d7729af9301c0551dd94b62b92f8cdf3499c Mon Sep 17 00:00:00 2001 From: MathisNcl Date: Fri, 3 May 2024 16:11:50 +0200 Subject: [PATCH 4/9] :arrow_up: bump melusine version Signed-off-by: MathisNcl --- melusine/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/melusine/__init__.py b/melusine/__init__.py index 5c4fd1b..81eed12 100644 --- a/melusine/__init__.py +++ b/melusine/__init__.py @@ -1,6 +1,7 @@ """ Top-level package. 
""" + import logging from ctypes import CDLL, cdll from typing import Any, Optional @@ -11,7 +12,7 @@ __all__ = ["config"] -VERSION = (3, 0, 0) +VERSION = (3, 1, 0) __version__ = ".".join(map(str, VERSION)) # ------------------------------- # From a3421d16a23ae05bc232370c5669a866f2032e34 Mon Sep 17 00:00:00 2001 From: MathisNcl Date: Fri, 3 May 2024 16:16:13 +0200 Subject: [PATCH 5/9] :wrench: update makefile Signed-off-by: MathisNcl --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index ae41e7a..fc49a8a 100644 --- a/Makefile +++ b/Makefile @@ -59,6 +59,9 @@ test: ## run tests quickly with the default Python test-all: ## run tests on every Python version with tox tox +pytest-coverage: # run pytest coverage with term-missing + pytest --cov=melusine --cov-report term-missing + coverage: ## check code coverage quickly with the default Python coverage run --source melusine -m pytest coverage report -m From 6608a34c488651c6a882f3d49d9c5a8c0e0a4a9b Mon Sep 17 00:00:00 2001 From: MathisNcl Date: Fri, 3 May 2024 16:24:50 +0200 Subject: [PATCH 6/9] :pushpin: fix deps for testing purpose Signed-off-by: MathisNcl --- pyproject.toml | 2 +- tox.ini | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 443f81c..085cb93 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ dynamic = ["version"] [project.optional-dependencies] # Optional dev = ["tox", "pre-commit", "black", "flake8", "isort", "mypy", "pytest", "coverage", "build", "ruff"] -test = ["pytest", "coverage", "pytest-cov"] +test = ["pytest", "coverage", "pytest-cov", "google-auth-oauthlib", "google-api-python-client"] transformers = ["transformers>4"] connectors = ["exchangelib", "google-auth-oauthlib", "google-api-python-client"] docs = ["mkdocs", "markdown", "mkdocs-material", "mdx-include"] diff --git a/tox.ini b/tox.ini index 0051cb7..a94c341 100644 --- a/tox.ini +++ b/tox.ini @@ -13,6 +13,8 @@ commands = pytest 
--cov --cov-append --cov-report xml deps = pytest pytest-cov + google-auth-oauthlib + google-api-python-client depends = {core38,transformers}: clean report: core38,transformers From 0a62e0b710aae583ac44ff8d4e459554219d3902 Mon Sep 17 00:00:00 2001 From: MathisNcl Date: Mon, 6 May 2024 10:38:27 +0200 Subject: [PATCH 7/9] :memo: add some comments Signed-off-by: MathisNcl --- melusine/connectors/gmail.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/melusine/connectors/gmail.py b/melusine/connectors/gmail.py index a386a33..0c3ae0c 100644 --- a/melusine/connectors/gmail.py +++ b/melusine/connectors/gmail.py @@ -56,12 +56,12 @@ def __init__( # Connect to mailbox self.credentials: Credentials = self.get_credentials(token_json_path=token_json_path) self.service: Any = build("gmail", "v1", credentials=self.credentials) - self.labels: List[Dict[str, str]] = self._get_labels() - # Setup done label + # Get and setup labels + self.labels: List[Dict[str, str]] = self._get_labels() self.done_label: Optional[str] = self._check_or_create_label(done_label) - self.mailbox_address = self.service.users().getProfile(userId="me").execute()["emailAddress"] + self.mailbox_address: str = self.service.users().getProfile(userId="me").execute()["emailAddress"] logger.info(f"Connected to mailbox: {self.mailbox_address}.") def __repr__(self) -> str: @@ -86,19 +86,21 @@ def get_credentials(self, token_json_path: Optional[str] = None) -> Credentials: Returns: Credentials: Credentials to connect to Gmail """ + # Get the token from the path if token_json_path is not None and os.path.exists(token_json_path): creds: Credentials = Credentials.from_authorized_user_file(token_json_path, self.SCOPES) if creds.valid is False: creds.refresh(Request()) return creds + # Ask for token to Google flow: InstalledAppFlow = InstalledAppFlow.from_client_secrets_file( "credentials.json", self.SCOPES, ) creds = flow.run_local_server(port=0) - # Save the credentials for the next run 
+ # Save the token for the next run with open("token.json", "w") as token: token.write(creds.to_json()) @@ -204,6 +206,8 @@ def _extract_email_attributes(self, message_id: str) -> Dict: Returns: Dict: formatted output of the email """ + + # Get the raw message and create a Message object msg_raw: Dict[str, Any] = ( self.service.users().messages().get(id=message_id, userId="me", format="raw").execute() ) @@ -261,7 +265,7 @@ def get_emails( if end_date is not None: q += f"before:{end_date}" - all_new_data = ( + all_new_data: Dict[str, Any] = ( self.service.users() .messages() .list(userId="me", maxResults=max_emails, labelIds=target_label_id, q=q) From 9f91bd29d681e6ca2bd1235b28d725597aaf6546 Mon Sep 17 00:00:00 2001 From: MathisNcl Date: Mon, 13 May 2024 11:06:23 +0200 Subject: [PATCH 8/9] :art: add credentials json path when init GmailConnector and update tests Signed-off-by: MathisNcl --- melusine/connectors/gmail.py | 22 ++++++++++++++++------ tests/connectors/test_gmail.py | 19 ++++++++++++++++--- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/melusine/connectors/gmail.py b/melusine/connectors/gmail.py index 0c3ae0c..8b31c92 100644 --- a/melusine/connectors/gmail.py +++ b/melusine/connectors/gmail.py @@ -41,20 +41,25 @@ class GmailConnector: def __init__( self, token_json_path: Optional[str] = None, + credentials_json_path: str = "credentials.json", done_label: Optional[str] = None, target_column: str = "target", ): """ Args: token_json_path (Optional[str], optional): `token.json` file path created after the first connection using - `credentials.json`. If None, looking for `credentials.json` at root and sign in. Defaults to None. + `credentials.json`. If None, looking for credentials_json_path and sign in. Defaults to None. + credentials_json_path (str, optional): file path for credentials.json delivered by google. + Defaults to credentials.json at root. done_label (Optional[str], optional): Label name for the done situation. Defaults to None. 
target_column (str, optional): Name of the DataFrame column containing target label. Defaults to "target". """ self.target_column: str = target_column # Connect to mailbox - self.credentials: Credentials = self.get_credentials(token_json_path=token_json_path) + self.credentials: Credentials = self.get_credentials( + token_json_path=token_json_path, credentials_json_path=credentials_json_path + ) self.service: Any = build("gmail", "v1", credentials=self.credentials) # Get and setup labels @@ -74,12 +79,13 @@ def __repr__(self) -> str: + f"connected to {self.mailbox_address}" ) - def get_credentials(self, token_json_path: Optional[str] = None) -> Credentials: + def get_credentials(self, credentials_json_path: str, token_json_path: Optional[str] = None) -> Credentials: """Retrieve credentials object to connect to Gmail using the `credentials.json` and generating the `token.json` if needed at root path. Please create json file as here https://cloud.google.com/docs/authentication/getting-started Args: + credentials_json_path (str): Credentials file path delivered by Google to authenticate. token_json_path (Optional[str], optional): `token.json` file path created after the first connection using `credentials.json`. Defaults to None. 
@@ -95,7 +101,7 @@ def get_credentials(self, token_json_path: Optional[str] = None) -> Credentials: # Ask for token to Google flow: InstalledAppFlow = InstalledAppFlow.from_client_secrets_file( - "credentials.json", + credentials_json_path, self.SCOPES, ) creds = flow.run_local_server(port=0) @@ -272,6 +278,11 @@ def get_emails( .execute() ) + if "messages" not in all_new_data: + logger.info( + f"No emails with filters: target_labels={target_labels}, start_date={start_date}, end_date={end_date}" + ) + return pd.DataFrame(columns=["message_id", "body", "header", "date", "from", "to", "attachment"]) logger.info("Please wait while loading messages") new_emails: List[Dict] = [self._extract_email_attributes(x["id"]) for x in tqdm(all_new_data["messages"])] df_new_emails = pd.DataFrame(new_emails) @@ -297,7 +308,7 @@ def move_to(self, emails_id: List[str], label_to_move_on: str) -> None: for email_id in emails_id: self.service.users().messages().modify(id=email_id, userId="me", body={"addLabelIds": [label_id]}).execute() - logger.info(f"Moved {len(emails_id)} emails to {label_to_move_on} label.") + logger.info(f"Moved {len(emails_id)} emails to '{label_to_move_on}' label.") def move_to_done(self, emails_id: List[str]) -> None: """Move emails to done label @@ -323,7 +334,6 @@ def route_emails(self, classified_emails: pd.DataFrame, id_column: str = "messag mask = classified_emails[target_column] == label mids_to_move = classified_emails[mask][id_column] self.move_to(mids_to_move, label) - logger.info(f"Moving {mids_to_move.size} emails to label '{label}'") def send_email(self, to: Union[str, List[str]], header: str, body: str, attachments: Optional[Dict] = None) -> None: """This method sends an email from the login address (attribute login_address). 
diff --git a/tests/connectors/test_gmail.py b/tests/connectors/test_gmail.py index d7de370..390bfa2 100644 --- a/tests/connectors/test_gmail.py +++ b/tests/connectors/test_gmail.py @@ -173,6 +173,19 @@ def test_gc_get_emails_complex_mail(mocked_gc, complex_email_raw, caplog): } +def test_gc_get_emails_none(mocked_gc, simple_email_raw, caplog): + mocked_gc.service.users().messages().list.return_value = HttpRequestMock(None, {}, return_value) + with caplog.at_level(logging.DEBUG): + df = mocked_gc.get_emails(1, None, "2024/01/01", "2024/05/03") + + assert "Please wait while loading messages" not in caplog.text + assert "No emails with filters: target_labels=" in caplog.text + + assert isinstance(df, pd.DataFrame) + assert len(df) == 0 + assert all([col in df.columns for col in ["message_id", "body", "header", "date", "from", "to", "attachment"]]) + + @patch("builtins.input", side_effect=["y", "n"]) def test_gc_check_or_create_label(mock_input, mocked_gc, caplog): mocked_gc.service.users().labels().create.return_value = HttpRequestMock( @@ -205,7 +218,7 @@ def test_gc_move_to_done(mocked_gc, caplog): with caplog.at_level(logging.DEBUG): mocked_gc.move_to_done(["dummy_id"]) - assert "Moved 1 emails to TRASH label." in caplog.text + assert "Moved 1 emails to 'TRASH' label." 
in caplog.text mocked_gc.done_label = None with pytest.raises(AttributeError, match="You need to set the class attribute `done_label` to use `move_to_done`."): @@ -238,8 +251,8 @@ def test_gc_route_emails(mocked_gc, caplog): with caplog.at_level(logging.DEBUG): mocked_gc.route_emails(df) - assert "Moving 1 emails to label 'TRASH'" in caplog.text - assert "Moving 1 emails to label 'UNREAD'" in caplog.text + assert "Moved 1 emails to 'TRASH' label" in caplog.text + assert "Moved 1 emails to 'UNREAD' label" in caplog.text def test_gc_send_email(mocked_gc, fake_image, caplog): From d6853842bea5dfccb2670975c6125f6d8ce0a608 Mon Sep 17 00:00:00 2001 From: MathisNcl Date: Mon, 13 May 2024 11:13:49 +0200 Subject: [PATCH 9/9] :memo: add documentation about GmailConnector Signed-off-by: MathisNcl --- docs/advanced/GmailConnector.md | 155 +++++++++++++++++++++++++++++++- 1 file changed, 154 insertions(+), 1 deletion(-) diff --git a/docs/advanced/GmailConnector.md b/docs/advanced/GmailConnector.md index e996e79..b987c1c 100644 --- a/docs/advanced/GmailConnector.md +++ b/docs/advanced/GmailConnector.md @@ -1 +1,154 @@ -# Connect melusine to a Gmail Mailbox +# Connect Melusine to a Gmail Mailbox + +The Gmail connector has been developed to make it easier to get started with Melusine, by connecting directly to your mailbox. +The `GmailConnector` class allows you to: + +- connect to your Gmail inbox +- retrieve the latest messages, filtering by date and labels +- extract relevant information from an e-mail, including attachments +- move messages individually into labels +- route an entire dataframe of e-mails to new labels, according to your detected use cases +- send emails + +## Installation + +First make sure to have a Gmail account and follow this [medium tutorial](https://medium.com/@preetipriyanka24/how-to-read-emails-from-gmail-using-gmail-api-in-python-20f7d9d09ae9) to get your `credentials.json` file or follow the steps below. 
+ +???note "Steps to create your credentials file from the medium post" + + - Sign in to [Google Cloud console](https://console.cloud.google.com/) and create a New Project or continue with an existing project. + - Go to **APIs and Services**. + - Go to **Enable APIs and Services**, enable Gmail API for the selected project. + - Click on **OAuth Consent Screen** to configure the consent screen. + - Enter the Application name and save it. + - Now go to **Credentials**. + - Click on Create credentials, and go to **OAuth Client ID**. + - Choose application type as Desktop Application. + - Enter the Application name, and click on the Create button. + - The Client ID will be created. Download it to your computer and save it as credentials.json + - If the App is still in Testing mode, go to **OAuth Consent Screen** and add your Gmail address to **Test users**. + +Once your `credentials.json` is created, save it to root for the first use. + +## Usage + +### First use + +For the first use, a `token.json` will be created; save it. You will reuse it to sign in. + +```Python +from melusine.connectors.gmail import GmailConnector +import logging + +# Set up logging +ch = logging.StreamHandler() +formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") +ch.setFormatter(formatter) +connector_logger = logging.getLogger("melusine.connectors.gmail") +connector_logger.addHandler(ch) +connector_logger.setLevel(logging.INFO) + +connector = GmailConnector(credentials_json_path="/Users/xxxxx/melusine/credentials.json") +# >>> 2024-05-06 11:18:58,636 - melusine.connectors.gmail - INFO - gmail token.json saved at: /Users/xxxxx/melusine +# >>> 2024-05-06 11:18:58,920 - melusine.connectors.gmail - INFO - Connected to mailbox: xxxxxxxxxx@gmail.com. + +# Next usage will then be: +connector = GmailConnector(token_json_path="/Users/xxxxx/melusine/token.json") +``` + +!!! info + A pop-up window from Google will ask you to choose your Gmail account to sign in. 
If the app is still in testing mode, click on **Continue**. + Then select all boxes to allow the credentials for read and modify rights, continue and close the window. + +### Get emails + +We have emails in the box to consider. These mails should either be put in the **Melusine** label because they ask for something or in the **TRASH** label. Let's get the last five emails. + +```Python +from melusine.connectors.gmail import GmailConnector +connector = GmailConnector(token_json_path="/Users/xxxxx/Desktop/melusine/token.json", done_label="TRASH") + + +df = connector.get_emails(max_emails=5) +# equivalent to: +# df = connector.get_emails(max_emails=5, target_labels= ["INBOX"]) +print(df) +``` + +| | message_id | body | header | date | from | to | attachment | +|---|------------|--------------------------------------------|-----------------------|---------------------------------|-----------------------|--------------------|------------| +| 1 | 12456789 | This is a an example | Demo1 | Mon, 13 May 2024 07:31:09 +0000 | | | [] | +| 2 | 987654321 | I am very happy of this Melusine connector | Awesome connector!! | Mon, 06 May 2024 10:55:22 +0000 | | | [] | +| 3 | 147258369 | Does Melusine is free ? | Impossible to believe | Thu, 02 May 2024 12:40:28 +0000 | | | [] | +| 4 | 741852963 | Hello World! | print | Mon, 29 Apr 2024 16:27:55 +0000 | | | [] | +| 5 | 951753467 | Python is lovely | PEP | Thu, 25 Apr 2024 15:28:07 +0000 | | | [] | + +And that's it, you have your last 5 mails from the INBOX! + +!!! info "Filters" + + Filters can be used to get emails from specific labels. For example, how about retrieving unread emails in the inbox from the last week? + ```python + df = connector.get_emails( + max_emails=5, + target_labels=["INBOX", "UNREAD"], + start_date="2024/05/06", + end_date="2024/05/12", + ) + ``` + +!!! warning "Date format" + When using `start_date` and `end_date` from `get_emails`, the format of the dates needs to be `YYYY/MM/DD`, e.g. **2024/03/31**. 
+ +### Create label + +To route emails to labels, the labels must exist. Using the example above, let's create the **Melusine** label. + +```Python +connector.create_label("Melusine") +# >>> 2024-05-13 10:41:56,406 - melusine.connectors.gmail - INFO - Label Melusine has been created. +# >>> {'id': 'Label_4', 'name': 'Melusine', 'messageListVisibility': 'show', 'labelListVisibility': 'labelShow'} +``` + +The label has been created and its id is `Label_4`. + +### Route emails + +There are two ways of routing emails, either individually using `move_to` or via the `route_emails` method, which takes as input a pandas data frame resulting from the application of the Melusine framework. + +#### Route using Melusine output + +Let's consider the same data frame, but with Melusine feedback to route each email via two detectors: **TRASH** and **Melusine**. +The column of interest is `target`, and `classified_df` is: + +| | message_id | body | ... | attachment | target | +|---|------------|--------------------------------------------|-----|------------|----------| +| 1 | 12456789 | This is a an example | ... | [] | TRASH | +| 2 | 987654321 | I am very happy of this Melusine connector | ... | [] | Melusine | +| 3 | 147258369 | Does Melusine is free ? | ... | [] | Melusine | +| 4 | 741852963 | Hello World! | ... | [] | TRASH | +| 5 | 951753467 | Python is lovely | ... | [] | TRASH | + +```Python +connector.route_emails(classified_emails=classified_df) +# >>> 2024-05-13 10:48:43,752 - melusine.connectors.gmail - INFO - Moved 3 emails to 'TRASH' label +# >>> 2024-05-13 10:48:44,110 - melusine.connectors.gmail - INFO - Moved 2 emails to 'Melusine' label +``` + +#### Route one by one (not recommended) + +Considering the above data frame, the first email is not related to Melusine so let's move it to trash. Conversely, the second mentions Melusine and should have the **Melusine** label. 
+ +```Python +# First email +# as done_label is "TRASH" +connector.move_to_done(emails_id=[df.iloc[0].message_id]) +# >>> 2024-05-13 10:48:58,870 - melusine.connectors.gmail - INFO - Moved 1 emails to 'TRASH' label + +# Second email +connector.move_to(emails_id=[df.iloc[1].message_id], label_to_move_on="Melusine") +# >>> 2024-05-13 10:48:59,110 - melusine.connectors.gmail - INFO - Moved 1 emails to 'Melusine' label +``` + +!!! info + You can route multiple emails at once, since the `emails_id` argument of `move_to` and `move_to_done` is a list.