Merged
7 changes: 5 additions & 2 deletions .gitignore
@@ -33,10 +33,13 @@ pytest.local.ini
/evals
/tests/testdata/Episode_53_Answer_results.json
/tests/testdata/Episode_53_Search_results.json
+# E-Mail test data
+!/tests/testdata/email-testdata/.gitkeep
+/tests/testdata/email-testdata/*

# Email demo
-/tools/gmail/client_secret.json
-/tools/gmail/token.json
+/tools/mail/client_secret.json
+/tools/mail/token.json
*_dump/

# Monty Python demo
35 changes: 34 additions & 1 deletion docs/demos.md
@@ -18,16 +18,19 @@ We have a driver program in the repo to ingest WebVTT files into a
SQLite database.

This is `tools/ingest_vtt.py`. You run it as follows:

```sh
python tools/ingest_vtt.py FILE1.vtt ... FILEN.vtt -d mp.db
```

The process took maybe 15 minutes for 11 sketches.

The sketches can now be queried by using another tool:

```sh
python tools/query.py -d mp.db
```

(You just type questions and it prints answers.)

## How we did the Gmail demo
@@ -41,12 +44,40 @@ We used the Gmail API to download 550 messages from Guido's Gmail

Given a folder with `*.eml` files in MIME format, we ran our email
ingestion tool, `tools/ingest_email.py`. You run it as follows:

```sh
python tools/ingest_email.py -d gmail.db email-folder/
```

You can also pass individual `.eml` files instead of a directory.
Use `-v` for verbose output.

### Filtering by date

Use `--start-date` and `--stop-date` to restrict ingestion to a date range with [start, stop):

```sh
# Ingest only January 2024 emails
python tools/ingest_email.py -d gmail.db email-folder/ \
--start-date 2024-01-01 --stop-date 2024-02-01
```

### Pagination with --offset and --limit

These flags slice the input file list before any other filtering:

```sh
# Ingest only the first 20 files
python tools/ingest_email.py -d gmail.db email-folder/ --limit 20

# Skip the first 100 files, then process the next 50
python tools/ingest_email.py -d gmail.db email-folder/ \
--offset 100 --limit 50
```

All four flags can be combined. The filter pipeline is:
offset/limit → already-ingested → date range.

The process took over an hour for 500 messages. Moreover, it complained
about nearly 10% of the messages due to timeouts or just overly large
files. When an error occurs, the tool recovers and continues with the
@@ -55,13 +86,14 @@ subsequent runs.

We can then query the `gmail.db` database using the same `query.py`
tool that we used for the Monty Python demo:

```sh
python tools/query.py -d gmail.db
```

### How to use the Gmail API to download messages

-In the `tools/gmail/` folder you'll find a tool named `gmail_dump.py` which
+In the `tools/mail/` folder you'll find a tool named `gmail_dump.py` which
will download any number of messages (default 50) using the Gmail API.
In order to use the Gmail API, however, you have to create a (free)
Google Cloud app and configure it appropriately.
@@ -104,6 +136,7 @@ and saved to two files by calling the `.ingest()` method on the
returned `src/typeagent/podcasts/podcast/Podcast` object.

Here's a brief sample session:

```sh
$ python tools/query.py
1.318s -- Using Azure OpenAI
2 changes: 1 addition & 1 deletion docs/gmail.md
@@ -1,6 +1,6 @@
# Extracting GMail Messages

-There's a helper script in the repo under `tools/gmail/`.
+There's a helper script in the repo under `tools/mail/`.
It requires setting up and creating a Google API project.
Until we have time to write this up, your best bet is to
ask your favorite search engine or LLM-based chat bot for help.
72 changes: 64 additions & 8 deletions src/typeagent/emails/email_import.py
@@ -1,13 +1,14 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from datetime import datetime
from email import message_from_string
-from email.header import decode_header, make_header
+from email.header import decode_header, Header, make_header
from email.message import Message
from email.utils import parsedate_to_datetime
from pathlib import Path
import re
-from typing import Iterable
+from typing import Iterable, overload

from .email_message import EmailMessage, EmailMessageMeta

@@ -20,6 +21,27 @@ def decode_encoded_words(value: str) -> str:
    return str(make_header(decode_header(value)))


# Coerce an email header value to str or None.
# msg.get() can return an email.header.Header object instead of a plain str when the header contains RFC 2047 encoded words.
# Pydantic expects str, so we normalise here.


@overload
def _header_to_str(value: str | Header | None, default: str) -> str: ...


@overload
def _header_to_str(value: str | Header | None) -> str | None: ...


def _header_to_str(
    value: str | Header | None, default: str | None = None
) -> str | None:
    if value is None:
        return default
    return str(value)


def import_emails_from_dir(
    dir_path: str, max_chunk_length: int = 4096
) -> Iterable[EmailMessage]:
@@ -64,14 +86,16 @@ def import_forwarded_email_string(
# Imports an email.message.Message object and returns an EmailMessage object.
# If the message is a reply, returns only the latest response.
def import_email_message(msg: Message, max_chunk_length: int) -> EmailMessage:
-    # Extract metadata from
+    # Extract metadata from headers.
+    # msg.get() can return a Header object instead of str for encoded headers,
+    # so coerce all values to str.
    email_meta = EmailMessageMeta(
-        sender=msg.get("From", ""),
+        sender=_header_to_str(msg.get("From"), ""),
        recipients=_import_address_headers(msg.get_all("To", [])),
        cc=_import_address_headers(msg.get_all("Cc", [])),
        bcc=_import_address_headers(msg.get_all("Bcc", [])),
-        subject=msg.get("Subject"), # TODO: Remove newlines
-        id=msg.get("Message-ID", None),
+        subject=_header_to_str(msg.get("Subject")),
+        id=_header_to_str(msg.get("Message-ID")),
    )
    timestamp: str | None = None
    timestamp_date = msg.get("Date", None)
@@ -175,7 +199,13 @@ def _decode_email_payload(part: Message) -> str:
            return payload
        return ""
    if isinstance(payload, bytes):
-        return payload.decode(part.get_content_charset() or "utf-8", errors="replace")
+        charset = part.get_content_charset() or "latin-1"
+        try:
+            return payload.decode(charset, errors="replace")
+        except LookupError:
+            # Unknown encoding (e.g. iso-8859-8-i); fall back to latin-1
+            # which accepts all 256 byte values without loss.
+            return payload.decode("latin-1")
    if isinstance(payload, str):
        return payload
    return ""
@@ -187,7 +217,7 @@ def _import_address_headers(headers: list[str]) -> list[str]:
    unique_addresses: set[str] = set()
    for header in headers:
        if header:
-            addresses = _remove_empty_strings(header.split(","))
+            addresses = _remove_empty_strings(str(header).split(","))
            for address in addresses:
                unique_addresses.add(address)

@@ -238,3 +268,29 @@ def _merge_chunks(

    if (len(cur_chunk)) > 0:
        yield cur_chunk


def email_matches_date_filter(
    timestamp: str | None,
    start_date: datetime | None,
    stop_date: datetime | None,
) -> bool:
    """Check whether an email's ISO timestamp passes the date filters.

    The range is half-open: [start_date, stop_date).
    Emails without a parseable timestamp are always included.
    """
    if timestamp is None:
        return True
    try:
        email_dt = datetime.fromisoformat(timestamp)
    except ValueError:
        return True
    # Treat offset-naive timestamps as local time for comparison
    if email_dt.tzinfo is None:
        email_dt = email_dt.astimezone()
    if start_date and email_dt < start_date:
        return False
    if stop_date and email_dt >= stop_date:
        return False
    return True