@@ -33,6 +33,7 @@ def __init__(
33
33
split_length : int = 200 ,
34
34
split_overlap : int = 0 ,
35
35
split_threshold : int = 0 ,
36
+ skip_empty_documents : bool = True ,
36
37
):
37
38
"""
38
39
Initialize the MarkdownHeaderSplitter.
@@ -59,13 +60,16 @@ def __init__(
59
60
:param split_overlap: The number of overlapping units for each split when using secondary splitting.
60
61
Defaults to 0.
61
62
:param split_threshold: The minimum number of units per split when using secondary splitting. Defaults to 0.
63
+ :param skip_empty_documents: If True, skip documents with empty content. If False, process empty documents.
64
+ Defaults to True.
62
65
"""
63
66
self .infer_header_levels = infer_header_levels
64
67
self .page_break_character = page_break_character
65
68
self .secondary_split = secondary_split
66
69
self .split_length = split_length
67
70
self .split_overlap = split_overlap
68
71
self .split_threshold = split_threshold
72
+ self .skip_empty_documents = skip_empty_documents
69
73
self ._header_pattern = r"(?m)^(#{1,6}) (.+)$" # ATX-style .md-headers
70
74
71
75
# initialize secondary_splitter only if needed
@@ -393,9 +397,16 @@ def run(self, documents: list[Document], infer_header_levels: Optional[bool] = N
393
397
394
398
processed_documents = []
395
399
for doc in documents :
396
- # skip empty documents
400
+ # handle empty documents
397
401
if not doc .content or not doc .content .strip ():
398
- continue
402
+ if self .skip_empty_documents :
403
+ logger .warning ("Document ID {doc_id} has an empty content. Skipping this document." , doc_id = doc .id )
404
+ continue
405
+ else :
406
+ # keep empty documents
407
+ processed_documents .append (doc )
408
+ continue
409
+
399
410
if infer_header_levels :
400
411
content = self ._infer_header_levels (doc .content , doc_id = doc .id )
401
412
processed_documents .append (Document (content = content , meta = doc .meta , id = doc .id ))
0 commit comments