add example for inferring header levels to docstring

OGuggenbuehl · OGuggenbuehl · commit 460afc577dbc · 2025-09-25T10:35:41.000+02:00
diff --git a/haystack/components/preprocessors/markdown_header_splitter.py b/haystack/components/preprocessors/markdown_header_splitter.py
@@ -38,34 +38,20 @@ def __init__(
         Initialize the MarkdownHeaderSplitter.
 
         :param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure.
-            Useful for documents where all headers use the same level (e.g., all "#") or have inconsistent levels.
+            Useful for documents where all headers use the same level (e.g., all "##", as with PDFs parsed by Docling).
             For example, a document like:
-                "# Introduction
-                 Some text
-                 # Methods
-                 Some text
-                 # Results"
+                "## Title
+                 ## Introduction
+                 Introductory text
+                 ## Methods
+                 Method details"
             Would be normalized to:
-                "# Introduction
-                 Some text
-                 # Methods
-                 Some text
-                 # Results"
-            But a document like:
-                "# Introduction
-                 Some text
-                 # Methods
-                 # Experiment 1
-                 # Experiment 2
-                 Some text"
-            Would be normalized to:
-                "# Introduction
-                 Some text
-                 # Methods
-                 ## Experiment 1
-                 ## Experiment 2
-                 Some text"
-            This maintains proper hierarchical structure. Defaults to False.
+                "# Title
+                 ## Introduction
+                 Introductory text
+                 ## Methods
+                 Method details"
+            This attempts to maintain proper hierarchical structure. Defaults to False.
         :param page_break_character: Character used to identify page breaks. Defaults to form feed ("\f").
         :param secondary_split: Optional secondary split condition after header splitting.
             Options are "none", "word", "passage", "period", "line". Defaults to "none".
@@ -376,35 +362,21 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N
 
         :param documents: List of documents to split
         :param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure.
-            Useful for documents where all headers use the same level (e.g., all "#") or have inconsistent levels.
+            Useful for documents where all headers use the same level (e.g., all "##", as with PDFs parsed by Docling).
             For example, a document like:
-                "# Introduction
-                 Some text
-                 # Methods
-                 Some text
-                 # Results"
-            Would be normalized to:
-                "# Introduction
-                 Some text
-                 # Methods
-                 Some text
-                 # Results"
-            But a document like:
-                "# Introduction
-                 Some text
-                 # Methods
-                 # Experiment 1
-                 # Experiment 2
-                 Some text"
+                "## Title
+                 ## Introduction
+                 Introductory text
+                 ## Methods
+                 Method details"
             Would be normalized to:
-                "# Introduction
-                 Some text
-                 # Methods
-                 ## Experiment 1
-                 ## Experiment 2
-                 Some text"
-            This maintains proper hierarchical structure. Defaults to False.
-            If None, uses the value from initialization.
+                "# Title
+                 ## Introduction
+                 Introductory text
+                 ## Methods
+                 Method details"
+            This attempts to maintain proper hierarchical structure.
+            If None (the default), uses the value set at initialization, which itself defaults to False.
 
         :returns: A dictionary with the following key:
             - `documents`: List of documents with the split texts. Each document includes: