Skip to content

Commit d7d4f18

Browse files
committed
improve empty document handling
add more logging for empty documents
1 parent 460afc5 commit d7d4f18

File tree

1 file changed

+24
-2
lines changed

1 file changed

+24
-2
lines changed

haystack/components/preprocessors/markdown_header_splitter.py

Lines changed: 24 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ def __init__(
3333
split_length: int = 200,
3434
split_overlap: int = 0,
3535
split_threshold: int = 0,
36+
skip_empty_documents: bool = True,
3637
):
3738
"""
3839
Initialize the MarkdownHeaderSplitter.
@@ -59,13 +60,16 @@ def __init__(
5960
:param split_overlap: The number of overlapping units for each split when using secondary splitting.
6061
Defaults to 0.
6162
:param split_threshold: The minimum number of units per split when using secondary splitting. Defaults to 0.
63+
:param skip_empty_documents: If True, skip documents with empty content. If False, process empty documents.
64+
Defaults to True.
6265
"""
6366
self.infer_header_levels = infer_header_levels
6467
self.page_break_character = page_break_character
6568
self.secondary_split = secondary_split
6669
self.split_length = split_length
6770
self.split_overlap = split_overlap
6871
self.split_threshold = split_threshold
72+
self.skip_empty_documents = skip_empty_documents
6973
self._header_pattern = r"(?m)^(#{1,6}) (.+)$" # ATX-style .md-headers
7074

7175
# initialize secondary_splitter only if needed
@@ -386,16 +390,34 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N
386390
"""
387391
# validate input documents
388392
for doc in documents:
393+
if doc.content is None:
394+
raise ValueError(
395+
(
396+
"MarkdownHeaderSplitter only works with text documents but content for document ID"
397+
f" {doc.id} is None."
398+
)
399+
)
389400
if not isinstance(doc.content, str):
390401
raise ValueError("MarkdownHeaderSplitter only works with text documents (str content).")
391402

392403
infer_header_levels = infer_header_levels if infer_header_levels is not None else self.infer_header_levels
393404

394405
processed_documents = []
395406
for doc in documents:
396-
# skip empty documents
407+
# handle empty documents
397408
if not doc.content or not doc.content.strip():
398-
continue
409+
if self.skip_empty_documents:
410+
logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id)
411+
continue
412+
else:
413+
# keep empty documents
414+
processed_documents.append(doc)
415+
logger.warning(
416+
"Document ID {doc_id} has an empty content. Keeping this document as per configuration.",
417+
doc_id=doc.id,
418+
)
419+
continue
420+
399421
if infer_header_levels:
400422
content = self._infer_header_levels(doc.content, doc_id=doc.id)
401423
processed_documents.append(Document(content=content, meta=doc.meta, id=doc.id))

0 commit comments

Comments
 (0)