125 changes: 125 additions & 0 deletions docs/multilingual_chunking.md
@@ -0,0 +1,125 @@
# Multilingual Text Chunking Support

## Overview

YourBench now supports customizable sentence-based text chunking with configurable delimiters, enabling correct sentence segmentation for languages such as Chinese and Japanese that use punctuation marks different from English.

## Features

- **Flexible Chunking Modes**: Choose between token-based or sentence-based chunking
- **Customizable Delimiters**: Configure sentence delimiters for any language
- **Overlap Support**: Control overlap between chunks for better context preservation
- **Minimum Length Control**: Ensure chunks meet minimum length requirements

## Configuration

### Basic Configuration

In your YAML configuration file, set the chunking mode and delimiters:

```yaml
pipeline_config:
  chunking:
    run: true
    chunking_mode: "sentence" # or "token" for token-based chunking
    sentence_delimiters: "[.!?]" # English delimiters (default)
```
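
The splitting helpers live in `yourbench.utils.chunking_utils` (see the tests and pipeline changes below). As a rough mental model of what the default `[.!?]` pattern produces, here is a minimal sketch (assuming delimiter-preserving splitting, as the unit tests expect; it is not the project's actual implementation):

```python
import re


def split_sentences_sketch(text: str, delimiters: str = r"[.!?]") -> list[str]:
    """Split on the delimiter pattern, keeping each delimiter attached
    to the sentence it terminates (illustrative sketch only)."""
    if not text.strip():
        return []
    # Capturing group keeps the delimiters, so they can be re-attached.
    parts = re.split(f"({delimiters})", text)
    sentences = [
        (parts[i] + parts[i + 1]).strip()
        for i in range(0, len(parts) - 1, 2)
        if (parts[i] + parts[i + 1]).strip()
    ]
    if len(parts) % 2 == 1 and parts[-1].strip():
        # Trailing text without a terminator still counts as a sentence.
        sentences.append(parts[-1].strip())
    return sentences


print(split_sentences_sketch("This is a sentence. This is another! Is this a question?"))
# ['This is a sentence.', 'This is another!', 'Is this a question?']
```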

### Chinese Language Configuration

For Chinese text, use Chinese punctuation marks:

```yaml
pipeline_config:
  chunking:
    chunking_mode: "sentence"
    sentence_delimiters: "[\u3002\uff01\uff1f]" # 。!?
    max_sentences_per_chunk: 10
    sentence_overlap: 2
```

### Mixed Language Support

For documents containing both English and Chinese:

```yaml
pipeline_config:
  chunking:
    chunking_mode: "sentence"
    sentence_delimiters: "[.!?\u3002\uff01\uff1f]" # English and Chinese
```
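
To sanity-check a combined pattern on bilingual text, you can call the splitting helper directly; the function name and the `delimiters` keyword below are taken from the unit tests in this PR, and the printed output is what those tests imply:

```python
from yourbench.utils.chunking_utils import split_into_sentences

text = "Hello world. 你好世界。How are you? 你好吗?"
# Combined English + CJK terminators, matching the pattern in the config above.
sentences = split_into_sentences(text, delimiters=r"[.!?\u3002\uff01\uff1f]")
print(sentences)
# Expected: ['Hello world.', '你好世界。', 'How are you?', '你好吗?']
```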

## Delimiter Reference

### Common Language Delimiters

| Language | Delimiters | Unicode | Pattern |
|----------|------------|---------|---------|
| English | . ! ? | - | `[.!?]` |
| Chinese | 。!? | \u3002 \uff01 \uff1f | `[\u3002\uff01\uff1f]` |
| Japanese | 。!? | \u3002 \uff01 \uff1f | `[\u3002\uff01\uff1f]` |
| Arabic | . ! ؟ | - \u061f | `[.!\u061f]` |
| Spanish | . ! ? | - | `[.!?]` |

Note: Spanish uses ¡ and ¿ only to open exclamations and questions, so the inverted marks should not be included as sentence terminators.

### Configuration Parameters

| Parameter | Description | Default | Range |
|-----------|-------------|---------|-------|
| `chunking_mode` | Chunking strategy: "token" or "sentence" | "token" | - |
| `max_sentences_per_chunk` | Maximum sentences per chunk | 10 | 1-100 |
| `sentence_overlap` | Number of overlapping sentences | 2 | 0+ |
| `sentence_delimiters` | Regex pattern for delimiters | `[.!?]` | Any regex |
| `min_chunk_length` | Minimum characters per chunk | 100 | 10+ |
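
These YAML keys map onto the keyword arguments of `split_into_sentence_chunks`; the argument names below follow the project's unit tests, so a direct call makes the correspondence explicit:

```python
from yourbench.utils.chunking_utils import split_into_sentence_chunks

text = "Sentence one. Sentence two. Sentence three. Sentence four. Sentence five."
chunks = split_into_sentence_chunks(
    text,
    max_sentences=2,       # max_sentences_per_chunk
    overlap_sentences=0,   # sentence_overlap
    delimiters=r"[.!?]",   # sentence_delimiters
    min_chunk_length=10,   # min_chunk_length
)
print(len(chunks))  # 3 chunks of up to two sentences each
```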

## Examples

### Example 1: Processing Chinese Documentation

```yaml
dataset_config:
  dataset_name: "chinese-docs"

pipeline_config:
  chunking:
    run: true
    chunking_mode: "sentence"
    sentence_delimiters: "[\u3002\uff01\uff1f]"
    max_sentences_per_chunk: 8
    sentence_overlap: 1
    min_chunk_length: 50
```

### Example 2: Processing Mixed Content

```yaml
pipeline_config:
  chunking:
    run: true
    chunking_mode: "sentence"
    # Supports English, Chinese, Japanese punctuation
    sentence_delimiters: "[.!?\u3002\uff01\uff1f]"
    max_sentences_per_chunk: 12
    sentence_overlap: 2
```

## Backward Compatibility

Token-based chunking remains the default, so existing configurations continue to work without modification.

## Performance Considerations

- **Sentence-based chunking** skips the tokenization pass and is usually faster, but chunks may vary widely in token count
- **Token-based chunking** gives precise control over chunk token counts at the cost of running the tokenizer
- Choose based on whether chunk-size consistency or processing speed matters more for your workload (a measurement sketch follows below)
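
If throughput matters for your corpus, a quick measurement on a representative document is more reliable than a rule of thumb. A minimal sketch (the file path is a placeholder; the helper signatures follow the tests and pipeline code in this PR):

```python
import timeit

from yourbench.utils.chunking_utils import split_into_sentence_chunks, split_into_token_chunks

# Placeholder path: point this at one of your own documents.
with open("sample_document.txt", encoding="utf-8") as f:
    document = f.read()

sentence_time = timeit.timeit(
    lambda: split_into_sentence_chunks(document, max_sentences=10, overlap_sentences=2),
    number=10,
)
token_time = timeit.timeit(
    lambda: split_into_token_chunks(document, 8192, overlap=0),
    number=10,
)
print(f"sentence-based: {sentence_time:.3f}s, token-based: {token_time:.3f}s (10 runs each)")
```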

## Troubleshooting

### Issue: Sentences not splitting correctly
- Verify your delimiter pattern matches the punctuation in your text
- Check for Unicode encoding issues
- Test your pattern on a small sample before running the full pipeline (see the snippet below)
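
A quick way to check a pattern before running the pipeline is plain `re`, with no project imports needed:

```python
import re

pattern = "[\u3002\uff01\uff1f]"  # paste the delimiter pattern from your YAML config
sample = "这是第一句话。这是第二句话!这是问句吗?"

matches = re.findall(pattern, sample)
print(f"Found {len(matches)} delimiter(s): {matches}")
# 0 matches means the pattern does not cover the punctuation actually used
# in your text (often a full-width vs. half-width mismatch).
```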

### Issue: Chunks too short/long
- Adjust `max_sentences_per_chunk` parameter
- Consider using `min_chunk_length` to merge short chunks
- For token control, switch to token-based chunking mode
41 changes: 41 additions & 0 deletions examples/chinese_chunking_config.yaml
@@ -0,0 +1,41 @@
# Example configuration for Chinese language text chunking
# This demonstrates how to use customizable sentence delimiters for multilingual support

dataset_config:
  dataset_name: "my-chinese-dataset"
  output_dir: "./output"
  save_intermediate_data: true

pipeline_config:
  chunking:
    run: true

    # Use sentence-based chunking instead of token-based
    chunking_mode: "sentence" # Options: "token" or "sentence"

    # Sentence-based chunking configuration
    max_sentences_per_chunk: 10
    sentence_overlap: 2

    # Chinese sentence delimiters
    # \u3002 = Chinese full stop (。)
    # \uff01 = Chinese exclamation mark (!)
    # \uff1f = Chinese question mark (?)
    sentence_delimiters: "[\u3002\uff01\uff1f]"

    # For mixed Chinese-English text, you can use:
    # sentence_delimiters: "[.!?\u3002\uff01\uff1f]"

    min_chunk_length: 100

    # Multi-hop configuration
    h_min: 2
    h_max: 5
    num_multihops_factor: 1

    # Token-based settings (ignored when chunking_mode is "sentence")
    l_max_tokens: 8192
    token_overlap: 512
    encoding_name: "cl100k_base"

# Other pipeline stages...
102 changes: 102 additions & 0 deletions tests/unit/test_sentence_chunking.py
@@ -0,0 +1,102 @@
"""Test sentence-based chunking with customizable delimiters."""

import pytest
from yourbench.utils.chunking_utils import split_into_sentences, split_into_sentence_chunks


class TestSentenceSplitting:
    """Test sentence splitting with various delimiters."""

    def test_english_sentence_splitting(self):
        """Test splitting English text with default delimiters."""
        text = "This is a sentence. This is another! Is this a question?"
        sentences = split_into_sentences(text)
        assert len(sentences) == 3
        assert sentences[0] == "This is a sentence."
        assert sentences[1] == "This is another!"
        assert sentences[2] == "Is this a question?"

    def test_chinese_sentence_splitting(self):
        """Test splitting Chinese text with Chinese delimiters."""
        text = "这是第一句话。这是第二句话!这是问句吗?"
        sentences = split_into_sentences(text, delimiters=r"[。!?]")
        assert len(sentences) == 3
        assert sentences[0] == "这是第一句话。"
        assert sentences[1] == "这是第二句话!"
        assert sentences[2] == "这是问句吗?"

    def test_mixed_language_splitting(self):
        """Test splitting mixed Chinese-English text."""
        text = "Hello world. 你好世界。How are you? 你好吗?"
        sentences = split_into_sentences(text, delimiters=r"[.?。?]")
        assert len(sentences) == 4
        assert "Hello world." in sentences[0]
        assert "你好世界。" in sentences[1]

    def test_empty_text(self):
        """Test handling of empty text."""
        assert split_into_sentences("") == []
        assert split_into_sentences(" ") == []

    def test_text_without_delimiters(self):
        """Test text without sentence delimiters."""
        text = "This text has no sentence delimiters"
        sentences = split_into_sentences(text)
        assert len(sentences) == 1
        assert sentences[0] == text


class TestSentenceChunking:
    """Test sentence-based chunking functionality."""

    def test_basic_chunking(self):
        """Test basic sentence chunking."""
        text = "Sentence one. Sentence two. Sentence three. Sentence four. Sentence five."
        chunks = split_into_sentence_chunks(text, max_sentences=2, overlap_sentences=0, min_chunk_length=10)
        assert len(chunks) == 3
        assert "Sentence one. Sentence two." in chunks[0]
        assert "Sentence three. Sentence four." in chunks[1]
        assert "Sentence five." in chunks[2]

    def test_chunking_with_overlap(self):
        """Test sentence chunking with overlap."""
        text = "S1. S2. S3. S4. S5."
        chunks = split_into_sentence_chunks(text, max_sentences=3, overlap_sentences=1, min_chunk_length=5)
        assert len(chunks) == 2
        # First chunk: S1, S2, S3
        # Second chunk: S3, S4, S5 (overlaps with S3)
        assert "S3." in chunks[0] and "S3." in chunks[1]

    def test_chinese_chunking(self):
        """Test chunking Chinese text."""
        text = "第一句。第二句。第三句。第四句。第五句。"
        chunks = split_into_sentence_chunks(
            text,
            max_sentences=2,
            overlap_sentences=0,
            delimiters=r"[。]",
            min_chunk_length=1,
        )
        assert len(chunks) == 3
        assert "第一句。" in chunks[0]
        assert "第三句。" in chunks[1]
        assert "第五句。" in chunks[2]

    def test_min_chunk_length(self):
        """Test minimum chunk length enforcement."""
        text = "Short. Also short. This is a much longer sentence that exceeds minimum."
        chunks = split_into_sentence_chunks(
            text,
            max_sentences=1,
            overlap_sentences=0,
            min_chunk_length=50,
        )
        # Short chunks should be merged
        assert len(chunks) <= 2

    def test_single_sentence(self):
        """Test chunking with a single sentence."""
        text = "This is just one sentence."
        chunks = split_into_sentence_chunks(text, max_sentences=5)
        assert len(chunks) == 1
        assert chunks[0] == text
20 changes: 15 additions & 5 deletions yourbench/pipeline/chunking.py
@@ -7,7 +7,7 @@
 from loguru import logger
 from tqdm.auto import tqdm

-from yourbench.utils.chunking_utils import split_into_token_chunks
+from yourbench.utils.chunking_utils import split_into_token_chunks, split_into_sentence_chunks
 from yourbench.utils.dataset_engine import custom_load_dataset, custom_save_dataset
 from yourbench.utils.configuration_engine import YourbenchConfig

@@ -19,12 +19,22 @@ def _get_rng(seed: str) -> np.random.Generator:
     return np.random.default_rng(seed_int)


-def _chunk_text(text: str, doc_id: str, max_tokens: int) -> list[dict]:
-    """Split text into token-based chunks."""
+def _chunk_text(text: str, doc_id: str, cfg) -> list[dict]:
+    """Split text into chunks based on configuration."""
     if not text.strip():
         return []

-    chunks = split_into_token_chunks(text, max_tokens, overlap=0)
+    if cfg.chunking_mode == "sentence":
+        chunks = split_into_sentence_chunks(
+            text,
+            max_sentences=cfg.max_sentences_per_chunk,
+            overlap_sentences=cfg.sentence_overlap,
+            delimiters=cfg.sentence_delimiters,
+            min_chunk_length=cfg.min_chunk_length,
+        )
+    else:
+        chunks = split_into_token_chunks(text, cfg.l_max_tokens, overlap=0)

     return [{"chunk_id": f"{doc_id}_{i}", "chunk_text": chunk} for i, chunk in enumerate(chunks)]


@@ -80,7 +90,7 @@ def _process_document(row: dict, cfg) -> tuple[list[dict], list[dict]]:
     doc_id = row.get("document_id", f"doc_{hash(doc_text) % 10000}")

     # Create single-hop chunks
-    chunks = _chunk_text(doc_text, doc_id, cfg.l_max_tokens)
+    chunks = _chunk_text(doc_text, doc_id, cfg)
     if not chunks:
         return [], []
