diff --git a/docs/multilingual_chunking.md b/docs/multilingual_chunking.md
new file mode 100644
index 0000000..acfb1d9
--- /dev/null
+++ b/docs/multilingual_chunking.md
@@ -0,0 +1,125 @@
+# Multilingual Text Chunking Support
+
+## Overview
+
+YourBench now supports customizable sentence-based text chunking with configurable delimiters, enabling proper text processing for languages such as Chinese and Japanese that use different punctuation marks.
+
+## Features
+
+- **Flexible Chunking Modes**: Choose between token-based and sentence-based chunking
+- **Customizable Delimiters**: Configure sentence delimiters for any language
+- **Overlap Support**: Control overlap between chunks for better context preservation
+- **Minimum Length Control**: Ensure chunks meet minimum length requirements
+
+## Configuration
+
+### Basic Configuration
+
+In your YAML configuration file, set the chunking mode and delimiters:
+
+```yaml
+pipeline_config:
+  chunking:
+    run: true
+    chunking_mode: "sentence"  # or "token" for token-based chunking
+    sentence_delimiters: "[.!?]"  # English delimiters (default)
+```
+
+### Chinese Language Configuration
+
+For Chinese text, use Chinese punctuation marks:
+
+```yaml
+pipeline_config:
+  chunking:
+    chunking_mode: "sentence"
+    sentence_delimiters: "[\u3002\uff01\uff1f]"  # 。!?
+    max_sentences_per_chunk: 10
+    sentence_overlap: 2
+```
+
+### Mixed Language Support
+
+For documents containing both English and Chinese:
+
+```yaml
+pipeline_config:
+  chunking:
+    chunking_mode: "sentence"
+    sentence_delimiters: "[.!?\u3002\uff01\uff1f]"  # English and Chinese
+```
+
+## Delimiter Reference
+
+### Common Language Delimiters
+
+| Language | Delimiters | Unicode | Pattern |
+|----------|------------|---------|---------|
+| English | . ! ? | - | `[.!?]` |
+| Chinese | 。!? | \u3002 \uff01 \uff1f | `[\u3002\uff01\uff1f]` |
+| Japanese | 。!? | \u3002 \uff01 \uff1f | `[\u3002\uff01\uff1f]` |
+| Arabic | . ! ؟ | - \u061f | `[.!\u061f]` |
+| Spanish | . ! ? ¡ ¿ | - \u00a1 \u00bf | `[.!?\u00a1\u00bf]` |
+
+### Configuration Parameters
+
+| Parameter | Description | Default | Range |
+|-----------|-------------|---------|-------|
+| `chunking_mode` | Choose between "token" or "sentence" | "token" | - |
+| `max_sentences_per_chunk` | Maximum sentences per chunk | 10 | 1-100 |
+| `sentence_overlap` | Number of overlapping sentences | 2 | 0+ |
+| `sentence_delimiters` | Regex pattern for delimiters | `[.!?]` | Any regex |
+| `min_chunk_length` | Minimum characters per chunk | 100 | 10+ |
+
+## Examples
+
+### Example 1: Processing Chinese Documentation
+
+```yaml
+dataset_config:
+  dataset_name: "chinese-docs"
+
+pipeline_config:
+  chunking:
+    run: true
+    chunking_mode: "sentence"
+    sentence_delimiters: "[\u3002\uff01\uff1f]"
+    max_sentences_per_chunk: 8
+    sentence_overlap: 1
+    min_chunk_length: 50
+```
+
+### Example 2: Processing Mixed Content
+
+```yaml
+pipeline_config:
+  chunking:
+    run: true
+    chunking_mode: "sentence"
+    # Supports English, Chinese, Japanese punctuation
+    sentence_delimiters: "[.!?\u3002\uff01\uff1f]"
+    max_sentences_per_chunk: 12
+    sentence_overlap: 2
+```
+
+## Backward Compatibility
+
+The default configuration remains token-based chunking, so existing configurations continue to work without modification.
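+
+For example, a minimal pre-existing token-based configuration like the one below needs no changes, because `chunking_mode` defaults to `"token"` (the values shown mirror the defaults in `ChunkingConfig`):
+
+```yaml
+pipeline_config:
+  chunking:
+    run: true
+    l_max_tokens: 8192
+    token_overlap: 512
+    encoding_name: "cl100k_base"
+```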
+
+## Performance Considerations
+
+- **Sentence-based chunking** is generally faster for processing but may create chunks of varying token sizes
+- **Token-based chunking** provides precise token control but requires tokenization overhead
+- Choose based on whether chunk-size consistency or processing speed matters more for your use case
+
+## Troubleshooting
+
+### Issue: Sentences not splitting correctly
+- Verify your delimiter pattern matches the punctuation in your text
+- Check for Unicode encoding issues
+- Test your pattern with a small sample first
+
+### Issue: Chunks too short/long
+- Adjust the `max_sentences_per_chunk` parameter
+- Consider using `min_chunk_length` to merge short chunks
+- For precise token control, switch to token-based chunking mode
\ No newline at end of file
diff --git a/examples/chinese_chunking_config.yaml b/examples/chinese_chunking_config.yaml
new file mode 100644
index 0000000..d6586d3
--- /dev/null
+++ b/examples/chinese_chunking_config.yaml
@@ -0,0 +1,41 @@
+# Example configuration for Chinese language text chunking
+# This demonstrates how to use customizable sentence delimiters for multilingual support
+
+dataset_config:
+  dataset_name: "my-chinese-dataset"
+  output_dir: "./output"
+  save_intermediate_data: true
+
+pipeline_config:
+  chunking:
+    run: true
+
+    # Use sentence-based chunking instead of token-based
+    chunking_mode: "sentence"  # Options: "token" or "sentence"
+
+    # Sentence-based chunking configuration
+    max_sentences_per_chunk: 10
+    sentence_overlap: 2
+
+    # Chinese sentence delimiters
+    # \u3002 = Chinese full stop (。)
+    # \uff01 = Chinese exclamation mark (!)
+    # \uff1f = Chinese question mark (?)
+    sentence_delimiters: "[\u3002\uff01\uff1f]"
+
+    # For mixed Chinese-English text, you can use:
+    # sentence_delimiters: "[.!?\u3002\uff01\uff1f]"
+
+    min_chunk_length: 100
+
+    # Multi-hop configuration
+    h_min: 2
+    h_max: 5
+    num_multihops_factor: 1
+
+    # Token-based settings (ignored when chunking_mode is "sentence")
+    l_max_tokens: 8192
+    token_overlap: 512
+    encoding_name: "cl100k_base"
+
+# Other pipeline stages...
\ No newline at end of file
diff --git a/tests/unit/test_sentence_chunking.py b/tests/unit/test_sentence_chunking.py
new file mode 100644
index 0000000..9dc0dd3
--- /dev/null
+++ b/tests/unit/test_sentence_chunking.py
@@ -0,0 +1,102 @@
+"""Test sentence-based chunking with customizable delimiters."""
+
+import pytest
+from yourbench.utils.chunking_utils import split_into_sentences, split_into_sentence_chunks
+
+
+class TestSentenceSplitting:
+    """Test sentence splitting with various delimiters."""
+
+    def test_english_sentence_splitting(self):
+        """Test splitting English text with default delimiters."""
+        text = "This is a sentence. This is another! Is this a question?"
+        sentences = split_into_sentences(text)
+        assert len(sentences) == 3
+        assert sentences[0] == "This is a sentence."
+        assert sentences[1] == "This is another!"
+        assert sentences[2] == "Is this a question?"
+
+    def test_chinese_sentence_splitting(self):
+        """Test splitting Chinese text with Chinese delimiters."""
+        text = "这是第一句话。这是第二句话!这是问句吗?"
+        sentences = split_into_sentences(text, delimiters=r"[。!?]")
+        assert len(sentences) == 3
+        assert sentences[0] == "这是第一句话。"
+        assert sentences[1] == "这是第二句话!"
+        assert sentences[2] == "这是问句吗?"
+
+    def test_mixed_language_splitting(self):
+        """Test splitting mixed Chinese-English text."""
+        text = "Hello world. 你好世界。How are you? 你好吗?"
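+        # Delimiter set below combines ASCII punctuation with the CJK full stop so both scripts are split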
+        sentences = split_into_sentences(text, delimiters=r"[.?。?]")
+        assert len(sentences) == 4
+        assert "Hello world." in sentences[0]
+        assert "你好世界。" in sentences[1]
+
+    def test_empty_text(self):
+        """Test handling of empty text."""
+        assert split_into_sentences("") == []
+        assert split_into_sentences(" ") == []
+
+    def test_text_without_delimiters(self):
+        """Test text without sentence delimiters."""
+        text = "This text has no sentence delimiters"
+        sentences = split_into_sentences(text)
+        assert len(sentences) == 1
+        assert sentences[0] == text
+
+
+class TestSentenceChunking:
+    """Test sentence-based chunking functionality."""
+
+    def test_basic_chunking(self):
+        """Test basic sentence chunking."""
+        text = "Sentence one. Sentence two. Sentence three. Sentence four. Sentence five."
+        chunks = split_into_sentence_chunks(text, max_sentences=2, overlap_sentences=0, min_chunk_length=10)
+        assert len(chunks) == 3
+        assert "Sentence one. Sentence two." in chunks[0]
+        assert "Sentence three. Sentence four." in chunks[1]
+        assert "Sentence five." in chunks[2]
+
+    def test_chunking_with_overlap(self):
+        """Test sentence chunking with overlap."""
+        text = "S1. S2. S3. S4. S5."
+        chunks = split_into_sentence_chunks(text, max_sentences=3, overlap_sentences=1, min_chunk_length=5)
+        assert len(chunks) == 2
+        # First chunk: S1, S2, S3
+        # Second chunk: S3, S4, S5 (overlaps with S3)
+        assert "S3." in chunks[0] and "S3." in chunks[1]
+
+    def test_chinese_chunking(self):
+        """Test chunking Chinese text."""
+        text = "第一句。第二句。第三句。第四句。第五句。"
+        chunks = split_into_sentence_chunks(
+            text,
+            max_sentences=2,
+            overlap_sentences=0,
+            delimiters=r"[。]",
+            min_chunk_length=1
+        )
+        assert len(chunks) == 3
+        assert "第一句。" in chunks[0]
+        assert "第三句。" in chunks[1]
+        assert "第五句。" in chunks[2]
+
+    def test_min_chunk_length(self):
+        """Test minimum chunk length enforcement."""
+        text = "Short. Also short. This is a much longer sentence that exceeds minimum."
+        chunks = split_into_sentence_chunks(
+            text,
+            max_sentences=1,
+            overlap_sentences=0,
+            min_chunk_length=50
+        )
+        # Short chunks should be merged
+        assert len(chunks) <= 2
+
+    def test_single_sentence(self):
+        """Test chunking with a single sentence."""
+        text = "This is just one sentence."
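+        # One sentence and max_sentences=5, so the whole text should come back as a single unchanged chunk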
+        chunks = split_into_sentence_chunks(text, max_sentences=5)
+        assert len(chunks) == 1
+        assert chunks[0] == text
\ No newline at end of file
diff --git a/yourbench/pipeline/chunking.py b/yourbench/pipeline/chunking.py
index 52941e8..8b84a3a 100644
--- a/yourbench/pipeline/chunking.py
+++ b/yourbench/pipeline/chunking.py
@@ -7,7 +7,7 @@
 from loguru import logger
 from tqdm.auto import tqdm
 
-from yourbench.utils.chunking_utils import split_into_token_chunks
+from yourbench.utils.chunking_utils import split_into_token_chunks, split_into_sentence_chunks
 from yourbench.utils.dataset_engine import custom_load_dataset, custom_save_dataset
 from yourbench.utils.configuration_engine import YourbenchConfig
 
@@ -19,12 +19,22 @@ def _get_rng(seed: str) -> np.random.Generator:
     return np.random.default_rng(seed_int)
 
 
-def _chunk_text(text: str, doc_id: str, max_tokens: int) -> list[dict]:
-    """Split text into token-based chunks."""
+def _chunk_text(text: str, doc_id: str, cfg) -> list[dict]:
+    """Split text into chunks based on configuration."""
     if not text.strip():
         return []
 
-    chunks = split_into_token_chunks(text, max_tokens, overlap=0)
+    if cfg.chunking_mode == "sentence":
+        chunks = split_into_sentence_chunks(
+            text,
+            max_sentences=cfg.max_sentences_per_chunk,
+            overlap_sentences=cfg.sentence_overlap,
+            delimiters=cfg.sentence_delimiters,
+            min_chunk_length=cfg.min_chunk_length,
+        )
+    else:
+        chunks = split_into_token_chunks(text, cfg.l_max_tokens, overlap=0)
+
     return [{"chunk_id": f"{doc_id}_{i}", "chunk_text": chunk} for i, chunk in enumerate(chunks)]
 
 
@@ -80,7 +90,7 @@ def _process_document(row: dict, cfg) -> tuple[list[dict], list[dict]]:
     doc_id = row.get("document_id", f"doc_{hash(doc_text) % 10000}")
 
     # Create single-hop chunks
-    chunks = _chunk_text(doc_text, doc_id, cfg.l_max_tokens)
+    chunks = _chunk_text(doc_text, doc_id, cfg)
     if not chunks:
         return [], []
 
diff --git a/yourbench/utils/chunking_utils.py b/yourbench/utils/chunking_utils.py
index 4095343..59f8709 100644
--- a/yourbench/utils/chunking_utils.py
+++ b/yourbench/utils/chunking_utils.py
@@ -1,3 +1,4 @@
+import re
 import random
 from typing import Any, Callable, Optional
 from dataclasses import dataclass
@@ -104,3 +105,86 @@ def sample_multihop_groups(
         k = min(int(value), total)
         return safe_sample(mh_chunks, k)
     return mh_chunks
+
+
+def split_into_sentences(text: str, delimiters: str = r"[.!?]") -> list[str]:
+    """
+    Split text into sentences using customizable delimiters.
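+    Each returned sentence keeps its trailing delimiter attached.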
+
+    Args:
+        text (str): The input text to split
+        delimiters (str): Regex pattern for sentence delimiters (default: "[.!?]")
+            For Chinese: "[。!?]"
+            For mixed: "[.!?。!?]"
+
+    Returns:
+        list[str]: List of sentences
+    """
+    if not text or not text.strip():
+        return []
+
+    # Normalize text - replace newlines with spaces
+    normalized_text = text.replace("\n", " ").strip()
+
+    # Split using capturing parentheses to retain delimiters
+    pattern = f"({delimiters})"
+    segments = re.split(pattern, normalized_text)
+
+    sentences = []
+    for i in range(0, len(segments), 2):
+        if i + 1 < len(segments):
+            # Combine text and delimiter
+            sentence = (segments[i] + segments[i + 1]).strip()
+        else:
+            # Last segment without delimiter
+            sentence = segments[i].strip()
+
+        if sentence:
+            sentences.append(sentence)
+
+    return sentences
+
+
+def split_into_sentence_chunks(
+    text: str,
+    max_sentences: int = 10,
+    overlap_sentences: int = 2,
+    delimiters: str = r"[.!?]",
+    min_chunk_length: int = 100,
+) -> list[str]:
+    """
+    Split text into chunks based on sentences with customizable delimiters.
+
+    Args:
+        text (str): The input text
+        max_sentences (int): Maximum sentences per chunk
+        overlap_sentences (int): Number of overlapping sentences between chunks
+        delimiters (str): Regex pattern for sentence delimiters
+        min_chunk_length (int): Minimum character length for a chunk
+
+    Returns:
+        list[str]: List of text chunks
+    """
+    sentences = split_into_sentences(text, delimiters)
+
+    if not sentences:
+        return []
+
+    if len(sentences) <= max_sentences:
+        return [" ".join(sentences)]
+
+    chunks = []
+    stride = max(1, max_sentences - overlap_sentences)
+
+    for i in range(0, len(sentences), stride):
+        chunk_sentences = sentences[i : i + max_sentences]
+        chunk_text = " ".join(chunk_sentences)
+
+        # Only add chunks that meet minimum length requirement
+        if len(chunk_text) >= min_chunk_length:
+            chunks.append(chunk_text)
+        elif chunks:
+            # Merge short chunk with previous one
+            chunks[-1] = chunks[-1] + " " + chunk_text
+
+    return chunks
diff --git a/yourbench/utils/configuration_engine.py b/yourbench/utils/configuration_engine.py
index b32cc76..bc4e7c6 100644
--- a/yourbench/utils/configuration_engine.py
+++ b/yourbench/utils/configuration_engine.py
@@ -351,9 +351,22 @@ class ChunkingConfig(BaseModel):
     model_config = ConfigDict(validate_assignment=True)
 
     run: bool = False
+
+    # Chunking mode configuration
+    chunking_mode: str = Field(default="token", pattern="^(token|sentence)$")
+
+    # Token-based chunking settings
     l_max_tokens: int = Field(default=8192, ge=256, le=50000)
     token_overlap: int = Field(default=512, ge=0)
     encoding_name: str = "cl100k_base"
+
+    # Sentence-based chunking settings
+    max_sentences_per_chunk: int = Field(default=10, ge=1, le=100)
+    sentence_overlap: int = Field(default=2, ge=0)
+    sentence_delimiters: str = Field(default=r"[.!?]")
+    min_chunk_length: int = Field(default=100, ge=10)
+
+    # Multi-hop configuration
     h_min: int = Field(default=2, ge=1)
     h_max: int = Field(default=5, ge=1)
     num_multihops_factor: int = Field(default=1, ge=1)