From 95addfc9fdcd5f12202c9418642972433f78870d Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 11 Dec 2024 10:38:12 +0100
Subject: [PATCH] Add fix_text callback for custom fix-up rules

---
 README.md              | 12 ++++++++++++
 spacy_layout/layout.py |  9 +++++++--
 tests/test_general.py  |  9 +++++++++
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 287e0a1..55941c9 100644
--- a/README.md
+++ b/README.md
@@ -99,6 +99,17 @@ def display_table(df: pd.DataFrame) -> str:
 layout = spaCyLayout(nlp, display_table=display_table)
 ```
 
+### Adding custom fix-up rules
+
+As with any machine learning process, the model may make mistakes during the extraction phase. To add custom fix-up rules, you can provide a `fix_text` function that takes the extracted text as a string and returns the corrected string. It's called on the text of every non-empty text-based layout span (tables are not affected) right after extraction and before the text is tokenized.
+
+```python
+def fix_text(text: str) -> str:
+    return text.replace("ª", "ã")
+
+layout = spaCyLayout(nlp, fix_text=fix_text)
+```
+
 ## 🎛️ API
 
 ### Data and extension attributes
@@ -166,6 +177,7 @@ layout = spaCyLayout(nlp)
 | `separator` | `str` | Token used to separate sections in the created `Doc` object. The separator won't be part of the layout span. If `None`, no separator will be added. Defaults to `"\n\n"`. |
 | `attrs` | `dict[str, str]` | Override the custom spaCy attributes. Can include `"doc_layout"`, `"doc_pages"`, `"doc_tables"`, `"doc_markdown"`, `"span_layout"`, `"span_data"`, `"span_heading"` and `"span_group"`. |
 | `headings` | `list[str]` | Labels of headings to consider for `Span._.heading` detection. Defaults to `["section_header", "page_header", "title"]`. |
+| `fix_text` | `Callable[[str], str]` | Function applying fix-up rules to the text, e.g. to correct common mistakes made by the model. Called on every non-empty text-based layout span after extraction. Defaults to `None`. |
 | `display_table` | `Callable[[pandas.DataFrame], str] \| str` | Function to generate the text-based representation of the table in the `Doc.text` or placeholder text. Defaults to `"TABLE"`. |
 | `docling_options` | `dict[InputFormat, FormatOption]` | [Format options](https://ds4sd.github.io/docling/usage/#advanced-options) passed to Docling's `DocumentConverter`. |
 | **RETURNS** | `spaCyLayout` | The initialized object. |
diff --git a/spacy_layout/layout.py b/spacy_layout/layout.py
index 9ff9535..4577180 100644
--- a/spacy_layout/layout.py
+++ b/spacy_layout/layout.py
@@ -38,6 +38,7 @@ def __init__(
             DocItemLabel.PAGE_HEADER,
             DocItemLabel.TITLE,
         ],
+        fix_text: Callable[[str], str] | None = None,
         display_table: Callable[["DataFrame"], str] | str = TABLE_PLACEHOLDER,
         docling_options: dict["InputFormat", "FormatOption"] | None = None,
     ) -> None:
@@ -56,6 +57,7 @@ def __init__(
         )
         self.headings = headings
         self.display_table = display_table
+        self.fix_text = fix_text
         self.converter = DocumentConverter(format_options=docling_options)
         # Set spaCy extension attributes for custom data
         Doc.set_extension(self.attrs.doc_layout, default=None, force=True)
@@ -99,9 +101,12 @@ def _result_to_doc(self, result: "ConversionResult") -> Doc:
         for node, _ in result.document.iterate_items():
             if node.self_ref in text_items:
                 item = text_items[node.self_ref]
-                if item.text == "":
+                text = item.text
+                if text == "":
                     continue
-                inputs.append((item.text, item))
+                if self.fix_text:
+                    text = self.fix_text(text)
+                inputs.append((text, item))
             elif node.self_ref in table_items:
                 item = table_items[node.self_ref]
                 if isinstance(self.display_table, str):
diff --git a/tests/test_general.py b/tests/test_general.py
index 0601a55..9813c8b 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -57,6 +57,15 @@ def test_simple_pipe(nlp):
         assert len(doc.spans[layout.attrs.span_group]) == 4
 
 
+def test_fix_text(nlp):
+    def fix_text(text):
+        return text.replace("Lorem", "LOREM")
+
+    layout = spaCyLayout(nlp, fix_text=fix_text)
+    doc = layout(PDF_SIMPLE)
+    assert doc.text.startswith("LOREM ipsum dolor sit amet")
+
+
 def test_table(nlp):
     layout = spaCyLayout(nlp)
     doc = layout(PDF_TABLE)