Skip to content

Commit 56dd0a0

Browse files
committed
improve empty document handling
1 parent 460afc5 commit 56dd0a0

File tree

1 file changed

+13
-2
lines changed

1 file changed

+13
-2
lines changed

haystack/components/preprocessors/markdown_header_splitter.py

Lines changed: 13 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ def __init__(
3333
split_length: int = 200,
3434
split_overlap: int = 0,
3535
split_threshold: int = 0,
36+
skip_empty_documents: bool = True,
3637
):
3738
"""
3839
Initialize the MarkdownHeaderSplitter.
@@ -59,13 +60,16 @@ def __init__(
5960
:param split_overlap: The number of overlapping units for each split when using secondary splitting.
6061
Defaults to 0.
6162
:param split_threshold: The minimum number of units per split when using secondary splitting. Defaults to 0.
63+
:param skip_empty_documents: If True, skip documents with empty content. If False, process empty documents.
64+
Defaults to True.
6265
"""
6366
self.infer_header_levels = infer_header_levels
6467
self.page_break_character = page_break_character
6568
self.secondary_split = secondary_split
6669
self.split_length = split_length
6770
self.split_overlap = split_overlap
6871
self.split_threshold = split_threshold
72+
self.skip_empty_documents = skip_empty_documents
6973
self._header_pattern = r"(?m)^(#{1,6}) (.+)$" # ATX-style .md-headers
7074

7175
# initialize secondary_splitter only if needed
@@ -393,9 +397,16 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N
393397

394398
processed_documents = []
395399
for doc in documents:
396-
# skip empty documents
400+
# handle empty documents
397401
if not doc.content or not doc.content.strip():
398-
continue
402+
if self.skip_empty_documents:
403+
logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id)
404+
continue
405+
else:
406+
# keep empty documents
407+
processed_documents.append(doc)
408+
continue
409+
399410
if infer_header_levels:
400411
content = self._infer_header_levels(doc.content, doc_id=doc.id)
401412
processed_documents.append(Document(content=content, meta=doc.meta, id=doc.id))

0 commit comments

Comments (0)