@@ -38,34 +38,20 @@ def __init__(
38
38
Initialize the MarkdownHeaderSplitter.
39
39
40
40
:param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure.
41
- Useful for documents where all headers use the same level (e.g., all "#") or have inconsistent levels .
41
+ Useful for documents where all headers use the same level (e.g., all "##", as with PDFs parsed by Docling).
42
42
For example, a document like:
43
- "# Introduction
44
- Some text
45
- # Methods
46
- Some text
47
- # Results "
43
+ "## Title
44
+ ## Introduction
45
+ Introductory text
46
+ ## Methods
47
+ Method details"
48
48
Would be normalized to:
49
- "# Introduction
50
- Some text
51
- # Methods
52
- Some text
53
- # Results"
54
- But a document like:
55
- "# Introduction
56
- Some text
57
- # Methods
58
- # Experiment 1
59
- # Experiment 2
60
- Some text"
61
- Would be normalized to:
62
- "# Introduction
63
- Some text
64
- # Methods
65
- ## Experiment 1
66
- ## Experiment 2
67
- Some text"
68
- This maintains proper hierarchical structure. Defaults to False.
49
+ "# Title
50
+ ## Introduction
51
+ Introductory text
52
+ ## Methods
53
+ Method details"
54
+ This attempts to maintain proper hierarchical structure. Defaults to False.
69
55
:param page_break_character: Character used to identify page breaks. Defaults to form feed ("\f").
70
56
:param secondary_split: Optional secondary split condition after header splitting.
71
57
Options are "none", "word", "passage", "period", "line". Defaults to "none".
@@ -376,35 +362,21 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N
376
362
377
363
:param documents: List of documents to split
378
364
:param infer_header_levels: If True, attempts to infer and rewrite header levels based on content structure.
379
- Useful for documents where all headers use the same level (e.g., all "#") or have inconsistent levels .
365
+ Useful for documents where all headers use the same level (e.g., all "##", as with PDFs parsed by Docling).
380
366
For example, a document like:
381
- "# Introduction
382
- Some text
383
- # Methods
384
- Some text
385
- # Results"
386
- Would be normalized to:
387
- "# Introduction
388
- Some text
389
- # Methods
390
- Some text
391
- # Results"
392
- But a document like:
393
- "# Introduction
394
- Some text
395
- # Methods
396
- # Experiment 1
397
- # Experiment 2
398
- Some text"
367
+ "## Title
368
+ ## Introduction
369
+ Introductory text
370
+ ## Methods
371
+ Method details"
399
372
Would be normalized to:
400
- "# Introduction
401
- Some text
402
- # Methods
403
- ## Experiment 1
404
- ## Experiment 2
405
- Some text"
406
- This maintains proper hierarchical structure. Defaults to False.
407
- If None, uses the value from initialization.
373
+ "# Title
374
+ ## Introduction
375
+ Introductory text
376
+ ## Methods
377
+ Method details"
378
+ This attempts to maintain proper hierarchical structure.
379
+ If None (the default), uses the instance's initialized infer_header_levels setting, which itself defaults to False.
408
380
409
381
:returns: A dictionary with the following key:
410
382
- `documents`: List of documents with the split texts. Each document includes:
0 commit comments