@@ -33,6 +33,7 @@ def __init__(
33
33
split_length : int = 200 ,
34
34
split_overlap : int = 0 ,
35
35
split_threshold : int = 0 ,
36
+ skip_empty_documents : bool = True ,
36
37
):
37
38
"""
38
39
Initialize the MarkdownHeaderSplitter.
@@ -59,13 +60,16 @@ def __init__(
59
60
:param split_overlap: The number of overlapping units for each split when using secondary splitting.
60
61
Defaults to 0.
61
62
:param split_threshold: The minimum number of units per split when using secondary splitting. Defaults to 0.
63
+ :param skip_empty_documents: If True, skip documents with empty content. If False, process empty documents.
64
+ Defaults to True.
62
65
"""
63
66
self .infer_header_levels = infer_header_levels
64
67
self .page_break_character = page_break_character
65
68
self .secondary_split = secondary_split
66
69
self .split_length = split_length
67
70
self .split_overlap = split_overlap
68
71
self .split_threshold = split_threshold
72
+ self .skip_empty_documents = skip_empty_documents
69
73
self ._header_pattern = r"(?m)^(#{1,6}) (.+)$" # ATX-style .md-headers
70
74
71
75
# initialize secondary_splitter only if needed
@@ -386,16 +390,34 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N
386
390
"""
387
391
# validate input documents
388
392
for doc in documents :
393
+ if doc .content is None :
394
+ raise ValueError (
395
+ (
396
+ "MarkdownHeaderSplitter only works with text documents but content for document ID"
397
+ f" { doc .id } is None."
398
+ )
399
+ )
389
400
if not isinstance (doc .content , str ):
390
401
raise ValueError ("MarkdownHeaderSplitter only works with text documents (str content)." )
391
402
392
403
infer_header_levels = infer_header_levels if infer_header_levels is not None else self .infer_header_levels
393
404
394
405
processed_documents = []
395
406
for doc in documents :
396
- # skip empty documents
407
+ # handle empty documents
397
408
if not doc .content or not doc .content .strip ():
398
- continue
409
+ if self .skip_empty_documents :
410
+ logger .warning ("Document ID {doc_id} has an empty content. Skipping this document." , doc_id = doc .id )
411
+ continue
412
+ else :
413
+ # keep empty documents
414
+ processed_documents .append (doc )
415
+ logger .warning (
416
+ "Document ID {doc_id} has an empty content. Keeping this document as per configuration." ,
417
+ doc_id = doc .id ,
418
+ )
419
+ continue
420
+
399
421
if infer_header_levels :
400
422
content = self ._infer_header_levels (doc .content , doc_id = doc .id )
401
423
processed_documents .append (Document (content = content , meta = doc .meta , id = doc .id ))
0 commit comments