Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,12 @@ pytest.local.ini
/evals
/tests/testdata/Episode_53_Answer_results.json
/tests/testdata/Episode_53_Search_results.json
/tests/testdata/email-mbox/*

# Email demo
/tools/gmail/client_secret.json
/tools/gmail/token.json
*_dump/

# Monty Python demo
/examples/testdata/MP
/examples/testdata/MP
30 changes: 29 additions & 1 deletion docs/demos.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,39 @@ We used the Gmail API to download 550 messages from Guido's Gmail
Given a folder with `*.eml` files in MIME format, we ran our email
ingestion tool, `tools/ingest_email.py`. You run it as follows:
```sh
python tools/ingest_email.py -d gmail.db email-folder/
python tools/ingest_email.py -d gmail.db --eml email-folder/
```
You can also pass individual `.eml` files instead of a directory.
Use `-v` for verbose output.

#### Filtering by date

Use `--after` and `--before` to restrict ingestion to a date range
(dates are `YYYY-MM-DD`, interpreted as local midnight):

```sh
# Ingest only January 2024 emails
python tools/ingest_email.py -d gmail.db --eml email-folder/ \
--after 2024-01-01 --before 2024-02-01
```

#### Pagination with --offset and --limit

Like SQL `OFFSET` / `LIMIT`, these flags let you paginate through
the set of qualifying (not-yet-ingested, date-filtered) emails:

```sh
# Ingest only the first 20 qualifying emails
python tools/ingest_email.py -d gmail.db --eml email-folder/ --limit 20

# Skip the first 100, then ingest the next 50
python tools/ingest_email.py -d gmail.db --eml email-folder/ \
--offset 100 --limit 50
```

All four flags can be combined. The filter pipeline is:
already-ingested → date range → offset → limit.

The process took over an hour for 500 messages. Moreover, it complained
about nearly 10% of the messages due to timeouts or just overly large
files. When an error occurs, the tool recovers and continues with the
Expand Down
32 changes: 26 additions & 6 deletions src/typeagent/emails/email_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,18 @@ def decode_encoded_words(value: str) -> str:
return str(make_header(decode_header(value)))


def _header_to_str(value: object) -> str | None:
"""Coerce an email header value to str or None.

msg.get() can return an email.header.Header object instead of a plain str
when the header contains RFC 2047 encoded words. Pydantic expects str, so
we normalise here.
"""
if value is None:
return None
return str(value)


def import_emails_from_dir(
dir_path: str, max_chunk_length: int = 4096
) -> Iterable[EmailMessage]:
Expand Down Expand Up @@ -64,14 +76,16 @@ def import_forwarded_email_string(
# Imports an email.message.Message object and returns an EmailMessage object
# If the message is a reply, returns only the latest response.
def import_email_message(msg: Message, max_chunk_length: int) -> EmailMessage:
# Extract metadata from
# Extract metadata from headers.
# msg.get() can return a Header object instead of str for encoded headers,
# so coerce all values to str.
email_meta = EmailMessageMeta(
sender=msg.get("From", ""),
sender=_header_to_str(msg.get("From", "")) or "",
recipients=_import_address_headers(msg.get_all("To", [])),
cc=_import_address_headers(msg.get_all("Cc", [])),
bcc=_import_address_headers(msg.get_all("Bcc", [])),
subject=msg.get("Subject"), # TODO: Remove newlines
id=msg.get("Message-ID", None),
subject=_header_to_str(msg.get("Subject")),
id=_header_to_str(msg.get("Message-ID")),
)
timestamp: str | None = None
timestamp_date = msg.get("Date", None)
Expand Down Expand Up @@ -175,7 +189,13 @@ def _decode_email_payload(part: Message) -> str:
return payload
return ""
if isinstance(payload, bytes):
return payload.decode(part.get_content_charset() or "utf-8", errors="replace")
charset = part.get_content_charset() or "latin-1"
try:
return payload.decode(charset, errors="replace")
except LookupError:
# Unknown encoding (e.g. iso-8859-8-i); fall back to latin-1
# which accepts all 256 byte values without loss.
return payload.decode("latin-1")
if isinstance(payload, str):
return payload
return ""
Expand All @@ -187,7 +207,7 @@ def _import_address_headers(headers: list[str]) -> list[str]:
unique_addresses: set[str] = set()
for header in headers:
if header:
addresses = _remove_empty_strings(header.split(","))
addresses = _remove_empty_strings(str(header).split(","))
for address in addresses:
unique_addresses.add(address)

Expand Down
235 changes: 235 additions & 0 deletions tests/test_mbox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Tests for email filtering logic and email parsing edge cases."""

from datetime import datetime, timezone

# Import the filtering helpers from the tool module.
import importlib
import importlib.util
from pathlib import Path
import sys

from typeagent.emails.email_import import import_email_string

# ingest_email.py lives in the "tools" directory next to this file's parent
# directory; it is loaded directly from its file path.
_tools_dir = str(Path(__file__).resolve().parent.parent / "tools")
_spec = importlib.util.spec_from_file_location(
"ingest_email", Path(_tools_dir) / "ingest_email.py"
)
# Stop immediately if no spec (or no loader) could be created for that path.
assert _spec and _spec.loader
_ingest_mod = importlib.util.module_from_spec(_spec)
# Register the module under the name "ingest_email" before executing it, so
# the `from ingest_email import ...` statement that follows finds this object.
sys.modules["ingest_email"] = _ingest_mod
_spec.loader.exec_module(_ingest_mod)

from ingest_email import ( # type: ignore[import-untyped]
_email_matches_date_filter,
)

# ===========================================================================
# Tests for _email_matches_date_filter
# ===========================================================================


class TestEmailMatchesDateFilter:
    """Exercise ``_email_matches_date_filter`` from ``ingest_email.py``.

    The helper is called as ``(timestamp, after, before)``; these tests pin
    an inclusive lower bound, an exclusive upper bound, and acceptance of
    emails whose timestamp is missing or cannot be parsed.
    """

    def _utc(self, year: int, month: int, day: int) -> datetime:
        """Return midnight UTC on the given calendar day."""
        return datetime(year=year, month=month, day=day, tzinfo=timezone.utc)

    def test_no_filters(self) -> None:
        """With neither bound set, every email is accepted."""
        accepted = _email_matches_date_filter("2024-01-15T10:00:00+00:00", None, None)
        assert accepted

    def test_none_timestamp_always_passes(self) -> None:
        """A missing timestamp is accepted even when both bounds are set."""
        lower_bound = self._utc(2024, 1, 1)
        upper_bound = self._utc(2024, 12, 31)
        assert _email_matches_date_filter(None, lower_bound, upper_bound)

    def test_invalid_timestamp_always_passes(self) -> None:
        """An unparseable timestamp is accepted even when both bounds are set."""
        lower_bound = self._utc(2024, 1, 1)
        upper_bound = self._utc(2024, 12, 31)
        assert _email_matches_date_filter("not-a-date", lower_bound, upper_bound)

    def test_after_filter_includes(self) -> None:
        """Timestamps at or later than the lower bound are accepted."""
        lower_bound = self._utc(2024, 1, 15)
        for stamp in ("2024-01-15T00:00:00+00:00", "2024-01-16T00:00:00+00:00"):
            assert _email_matches_date_filter(stamp, lower_bound, None)

    def test_after_filter_excludes(self) -> None:
        """A timestamp one second short of the lower bound is rejected."""
        lower_bound = self._utc(2024, 1, 15)
        accepted = _email_matches_date_filter(
            "2024-01-14T23:59:59+00:00", lower_bound, None
        )
        assert not accepted

    def test_before_filter_includes(self) -> None:
        """A timestamp one second short of the upper bound is accepted."""
        upper_bound = self._utc(2024, 2, 1)
        accepted = _email_matches_date_filter(
            "2024-01-31T23:59:59+00:00", None, upper_bound
        )
        assert accepted

    def test_before_filter_excludes(self) -> None:
        """The upper bound is exclusive: a timestamp equal to it is rejected."""
        upper_bound = self._utc(2024, 2, 1)
        accepted = _email_matches_date_filter(
            "2024-02-01T00:00:00+00:00", None, upper_bound
        )
        assert not accepted

    def test_date_range(self) -> None:
        """Only timestamps inside the half-open window [after, before) pass."""
        window = (self._utc(2024, 1, 1), self._utc(2024, 2, 1))
        # Strictly inside the window.
        assert _email_matches_date_filter("2024-01-15T12:00:00+00:00", *window)
        # One second ahead of the window's start.
        assert not _email_matches_date_filter("2023-12-31T23:59:59+00:00", *window)
        # Exactly on the (exclusive) end of the window.
        assert not _email_matches_date_filter("2024-02-01T00:00:00+00:00", *window)

    def test_naive_timestamp_treated_as_local(self) -> None:
        """Timestamps without a UTC offset are interpreted as local time."""
        # Express the boundary in the local zone so the test does not depend
        # on which zone the machine is configured for.
        # NOTE(review): the offset comes from the current moment, so in a zone
        # with DST it may differ from the offset in effect on 2024-01-15 --
        # confirm this matches how the helper localises naive timestamps.
        local_zone = datetime.now().astimezone().tzinfo
        lower_bound = datetime(2024, 1, 15, tzinfo=local_zone)
        assert _email_matches_date_filter("2024-01-15T00:00:00", lower_bound, None)
        assert not _email_matches_date_filter(
            "2024-01-14T23:59:59", lower_bound, None
        )

    def test_different_timezone(self) -> None:
        """Offsets other than UTC are normalised before comparison."""
        utc_bound = self._utc(2024, 1, 15)
        # 00:00 at +05:00 is 19:00 UTC on the previous day -> rejected.
        early = _email_matches_date_filter("2024-01-15T00:00:00+05:00", utc_bound, None)
        assert not early
        # 10:00 at +05:00 is 05:00 UTC on the same day -> accepted.
        late = _email_matches_date_filter("2024-01-15T10:00:00+05:00", utc_bound, None)
        assert late


# ===========================================================================
# Tests for email encoding edge cases
# ===========================================================================


# Raw message whose From and Subject headers carry RFC 2047 encoded words
# (a base64 "b" word and a quoted-printable "q" word respectively).
# Every line ends with an explicit "\r" escape before the literal newline,
# so the resulting string uses CRLF line endings.
_EMAIL_WITH_ENCODED_HEADER = """\
From: =?utf-8?b?SsO8cmdlbg==?= <juergen@example.com>\r
To: recipient@example.com\r
Subject: =?utf-8?q?M=C3=BCnchen_weather?=\r
Date: Mon, 01 Jan 2024 10:00:00 +0000\r
Message-ID: <encoded@example.com>\r
\r
Hello from Munich!\r
"""


class TestEncodingEdgeCases:
    """Headers written as RFC 2047 encoded words must surface as ``str``."""

    def test_encoded_header_sender(self) -> None:
        """An encoded From header parses without raising and yields a str."""
        parsed = import_email_string(_EMAIL_WITH_ENCODED_HEADER)
        sender = parsed.metadata.sender
        assert isinstance(sender, str)

    def test_encoded_header_subject(self) -> None:
        """An encoded Subject header yields a str."""
        parsed = import_email_string(_EMAIL_WITH_ENCODED_HEADER)
        subject = parsed.metadata.subject
        assert isinstance(subject, str)


# Raw text/plain message that declares charset "iso-8859-8-i" (used here as
# a charset the decoder is not expected to recognise) with a base64 body.
# The payload "SGVsbG8gV29ybGQ=" is the base64 encoding of "Hello World".
_EMAIL_WITH_UNKNOWN_CHARSET = """\
From: test@example.com\r
To: recipient@example.com\r
Subject: Unknown charset test\r
Date: Mon, 01 Jan 2024 10:00:00 +0000\r
Message-ID: <charset@example.com>\r
MIME-Version: 1.0\r
Content-Type: text/plain; charset="iso-8859-8-i"\r
Content-Transfer-Encoding: base64\r
\r
SGVsbG8gV29ybGQ=\r
"""


class TestUnknownCharset:
    """Messages declaring a charset the codec registry may not know."""

    def test_unknown_charset_does_not_crash(self) -> None:
        """An email with an unknown charset is decoded without raising.

        The fixture's base64 payload is the pure-ASCII text "Hello World",
        so whichever fallback decoding is used must reproduce it verbatim.
        """
        parsed = import_email_string(_EMAIL_WITH_UNKNOWN_CHARSET)
        body = " ".join(parsed.text_chunks)
        # The previous form `"Hello World" in body or len(body) > 0` reduced
        # to a non-empty check (the first operand implies the second), so
        # mis-decoded output would still have passed.  Check the content.
        assert "Hello World" in body


# ===========================================================================
# Tests for emails with a missing Date header
# ===========================================================================

# Raw message with From, To, Subject and Message-ID headers but no Date
# header.  Lines end in CRLF via the explicit "\r" escapes.
_EMAIL_NO_DATE = """\
From: test@example.com\r
To: recipient@example.com\r
Subject: No date header\r
Message-ID: <nodate@example.com>\r
\r
This email has no Date header.\r
"""


class TestMissingDate:
    """Behaviour for messages that carry no Date header."""

    def test_email_without_date_has_none_timestamp(self) -> None:
        """Parsing a message with no Date header leaves ``timestamp`` unset."""
        parsed = import_email_string(_EMAIL_NO_DATE)
        assert parsed.timestamp is None

    def test_email_without_date_passes_date_filter(self) -> None:
        """A ``None`` timestamp is accepted by the date filter."""
        lower_bound = datetime(2024, 1, 1, tzinfo=timezone.utc)
        assert _email_matches_date_filter(None, lower_bound, None)


# ===========================================================================
# Tests for import_email_string (also exercised by mbox, but directly tested)
# ===========================================================================

# Minimal single-part message with From, To, Subject, Date and Message-ID
# headers and a one-line body.  Lines end in CRLF via the "\r" escapes.
_SIMPLE_EMAIL = """\
From: alice@example.com\r
To: bob@example.com\r
Subject: Test\r
Date: Mon, 01 Jan 2024 10:00:00 +0000\r
Message-ID: <simple@example.com>\r
\r
Hello Bob!\r
"""

# multipart/alternative message carrying a text/plain part and a text/html
# part separated by the boundary string "boundary".  It has no Message-ID.
_MULTIPART_EMAIL = """\
From: alice@example.com\r
To: bob@example.com\r
Subject: Multipart\r
Date: Mon, 01 Jan 2024 10:00:00 +0000\r
MIME-Version: 1.0\r
Content-Type: multipart/alternative; boundary="boundary"\r
\r
--boundary\r
Content-Type: text/plain\r
\r
Plain text body\r
--boundary\r
Content-Type: text/html\r
\r
<p>HTML body</p>\r
--boundary--\r
"""


class TestImportEmailString:
    """Direct checks of ``import_email_string`` on well-formed messages."""

    def test_simple_email(self) -> None:
        """Sender, subject, id, timestamp and body are all populated."""
        parsed = import_email_string(_SIMPLE_EMAIL)
        meta = parsed.metadata
        assert "alice@example.com" in meta.sender
        subject = meta.subject
        assert subject is not None
        assert "Test" in subject
        assert meta.id == "<simple@example.com>"
        assert parsed.timestamp is not None
        assert len(parsed.text_chunks) > 0

    def test_multipart_email(self) -> None:
        """The text/plain alternative ends up in the extracted chunks."""
        parsed = import_email_string(_MULTIPART_EMAIL)
        joined_chunks = " ".join(parsed.text_chunks)
        assert "Plain text body" in joined_chunks
Loading