From 95addfc9fdcd5f12202c9418642972433f78870d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 11 Dec 2024 10:38:12 +0100 Subject: [PATCH] Add fix_text callback for custom fix-up rules --- README.md | 12 ++++++++++++ spacy_layout/layout.py | 9 +++++++-- tests/test_general.py | 9 +++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 287e0a1..55941c9 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,17 @@ def display_table(df: pd.DataFrame) -> str: layout = spaCyLayout(nlp, display_table=display_table) ``` +### Adding custom fix-up rules + +As with any machine learning process, the model may make mistakes during the extraction phase. To add custom fix-up rules, you can provide a `fix_text` function that's called on all text-based layout spans right after extraction and before the text is tokenized. + +```python +def fix_text(text: str) -> str: + return text.replace("ª", "ã") + +layout = spaCyLayout(nlp, fix_text=fix_text) +``` + ## 🎛️ API ### Data and extension attributes @@ -166,6 +177,7 @@ layout = spaCyLayout(nlp) | `separator` | `str` | Token used to separate sections in the created `Doc` object. The separator won't be part of the layout span. If `None`, no separator will be added. Defaults to `"\n\n"`. | | `attrs` | `dict[str, str]` | Override the custom spaCy attributes. Can include `"doc_layout"`, `"doc_pages"`, `"doc_tables"`, `"doc_markdown"`, `"span_layout"`, `"span_data"`, `"span_heading"` and `"span_group"`. | | `headings` | `list[str]` | Labels of headings to consider for `Span._.heading` detection. Defaults to `["section_header", "page_header", "title"]`. | +| `fix_text` | `Callable[[str], str]` | Function applying fix-up rules to the text, e.g. to correct common mistakes made by the model. Called on every text-based layout span after extraction. | | `display_table` | `Callable[[pandas.DataFrame], str] \| str` | Function to generate the text-based representation of the table in the `Doc.text` or placeholder text. Defaults to `"TABLE"`. | | `docling_options` | `dict[InputFormat, FormatOption]` | [Format options](https://ds4sd.github.io/docling/usage/#advanced-options) passed to Docling's `DocumentConverter`. | | **RETURNS** | `spaCyLayout` | The initialized object. | diff --git a/spacy_layout/layout.py b/spacy_layout/layout.py index 9ff9535..4577180 100644 --- a/spacy_layout/layout.py +++ b/spacy_layout/layout.py @@ -38,6 +38,7 @@ def __init__( DocItemLabel.PAGE_HEADER, DocItemLabel.TITLE, ], + fix_text: Callable[[str], str] | None = None, display_table: Callable[["DataFrame"], str] | str = TABLE_PLACEHOLDER, docling_options: dict["InputFormat", "FormatOption"] | None = None, ) -> None: @@ -56,6 +57,7 @@ def __init__( ) self.headings = headings self.display_table = display_table + self.fix_text = fix_text self.converter = DocumentConverter(format_options=docling_options) # Set spaCy extension attributes for custom data Doc.set_extension(self.attrs.doc_layout, default=None, force=True) @@ -99,9 +101,12 @@ def _result_to_doc(self, result: "ConversionResult") -> Doc: for node, _ in result.document.iterate_items(): if node.self_ref in text_items: item = text_items[node.self_ref] - if item.text == "": + text = item.text + if text == "": continue - inputs.append((item.text, item)) + if self.fix_text: + text = self.fix_text(text) + inputs.append((text, item)) elif node.self_ref in table_items: item = table_items[node.self_ref] if isinstance(self.display_table, str): diff --git a/tests/test_general.py b/tests/test_general.py index 0601a55..9813c8b 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -57,6 +57,15 @@ def test_simple_pipe(nlp): assert len(doc.spans[layout.attrs.span_group]) == 4 +def test_fix_text(nlp): + def fix_text(text): + return text.replace("Lorem", "LOREM") + + layout = spaCyLayout(nlp, fix_text=fix_text) + doc = layout(PDF_SIMPLE) + assert doc.text.startswith("LOREM ipsum dolor sit amet") + + def test_table(nlp): layout = spaCyLayout(nlp) doc = layout(PDF_TABLE)