Merge pull request #179 from chonkie-ai/development
[Fix] Use default model in `AutoEmbeddings` if `Error: model not found` + bad `__repr__` for `SemanticSentence`
bhavnicksm authored Feb 17, 2025
2 parents 932f484 + 4c064da commit f4e460d
Showing 5 changed files with 37 additions and 8 deletions.
8 changes: 8 additions & 0 deletions src/chonkie/chunker/sentence.py
@@ -78,6 +78,14 @@ def __init__(
if return_type not in ["chunks", "texts"]:
raise ValueError("Invalid return_type. Must be either 'chunks' or 'texts'.")

# Add chunk_overlap deprecation warning
if chunk_overlap > 0:
warnings.warn(
"chunk_overlap is getting deprecated in v0.6.0. " +
"🦛 Chonkie advises you to use OverlapRefinery instead which is more flexible and powerful!",
DeprecationWarning,
)
# Assign the values if they make sense
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.min_sentences_per_chunk = min_sentences_per_chunk
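For context, a minimal sketch of how the new warning surfaces to callers (not part of this commit). It assumes `SentenceChunker` is importable from the top-level `chonkie` package and passes only the parameters visible in this diff:

import warnings

from chonkie import SentenceChunker

# DeprecationWarning is filtered out by default for library code, so opt in to see it.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    chunker = SentenceChunker(chunk_size=512, chunk_overlap=64)

for warning in caught:
    if issubclass(warning.category, DeprecationWarning):
        print(warning.message)  # points users at OverlapRefinery instead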
14 changes: 11 additions & 3 deletions src/chonkie/chunker/token.py
@@ -1,6 +1,7 @@
"""Token-based chunking."""

from typing import Any, Generator, List, Literal, Union
import warnings

from tqdm import trange

@@ -24,7 +25,7 @@ def __init__(
self,
tokenizer: Union[str, Any] = "gpt2",
chunk_size: int = 512,
-chunk_overlap: Union[int, float] = 128,
+chunk_overlap: Union[int, float] = 0,
return_type: Literal["chunks", "texts"] = "chunks",
) -> None:
"""Initialize the TokenChunker with configuration parameters.
@@ -44,11 +45,18 @@ def __init__(
raise ValueError("chunk_size must be positive")
if isinstance(chunk_overlap, int) and chunk_overlap >= chunk_size:
raise ValueError("chunk_overlap must be less than chunk_size")
if isinstance(chunk_overlap, float) and chunk_overlap >= 1:
raise ValueError("chunk_overlap must be less than 1")
if return_type not in ["chunks", "texts"]:
raise ValueError("return_type must be either 'chunks' or 'texts'")

# Add chunk_overlap deprecation warning
if chunk_overlap > 0:
warnings.warn(
"chunk_overlap is getting deprecated in v0.6.0. " +
"🦛 Chonkie advises you to use OverlapRefinery instead which is more flexible and powerful!",
DeprecationWarning,
)

# Assign the values if they make sense
self.return_type = return_type
self.chunk_size = chunk_size
self.chunk_overlap = (
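The hunk above also tightens validation for fractional overlaps. A brief sketch of the resulting behaviour follows (an illustration, not code from this commit); it assumes `TokenChunker` is importable from `chonkie` and that a float `chunk_overlap` is treated as a fraction of `chunk_size`, which the truncated `self.chunk_overlap = (` assignment suggests but this diff does not show in full:

from chonkie import TokenChunker

# A fractional overlap below 1 is still accepted (and now triggers the DeprecationWarning).
chunker = TokenChunker(chunk_size=512, chunk_overlap=0.25)

# A float overlap of 1 or more is rejected by the new check.
try:
    TokenChunker(chunk_size=512, chunk_overlap=1.5)
except ValueError as error:
    print(error)  # "chunk_overlap must be less than 1"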
12 changes: 11 additions & 1 deletion src/chonkie/chunker/word.py
@@ -1,6 +1,7 @@
"""Word-based chunker."""

import re
import warnings
from typing import Any, Callable, List, Literal, Tuple, Union

from chonkie.types import Chunk
@@ -26,7 +27,7 @@ def __init__(
self,
tokenizer_or_token_counter: Union[str, Callable, Any] = "gpt2",
chunk_size: int = 512,
-chunk_overlap: int = 128,
+chunk_overlap: int = 0,
return_type: Literal["chunks", "texts"] = "chunks",
):
"""Initialize the WordChunker with configuration parameters.
@@ -50,6 +51,15 @@ def __init__(
if return_type not in ["chunks", "texts"]:
raise ValueError("Invalid return_type. Must be either 'chunks' or 'texts'.")

# Add chunk_overlap deprecation warning
if chunk_overlap > 0:
warnings.warn(
"chunk_overlap is getting deprecated in v0.6.0. " +
"🦛 Chonkie advises you to use OverlapRefinery instead which is more flexible and powerful!",
DeprecationWarning,
)

# Assign the values if they make sense
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.return_type = return_type
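The warning recommends `OverlapRefinery` as the replacement for `chunk_overlap`. A hypothetical sketch of that workflow is below; the `OverlapRefinery` constructor argument and the `refine` method used here are assumptions about its interface, not something shown in this PR:

from chonkie import OverlapRefinery, WordChunker

# Chunk without the deprecated overlap (its default is now 0) ...
chunker = WordChunker(chunk_size=512)
chunks = chunker.chunk("Some long document text ...")

# ... then add overlapping context as a separate refinement step.
refinery = OverlapRefinery(context_size=128)  # context_size is an assumed parameter name
chunks_with_context = refinery.refine(chunks)  # refine() is an assumed method name
print(len(chunks_with_context))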
9 changes: 6 additions & 3 deletions src/chonkie/embeddings/auto.py
@@ -1,5 +1,6 @@
"""AutoEmbeddings is a factory class for automatically loading embeddings."""

import warnings
from typing import Any, Union

from .base import BaseEmbeddings
@@ -71,10 +72,12 @@ def get_embeddings(
try:
return embeddings_cls(model, **kwargs)
except Exception as e:
-raise ValueError(
-f"Failed to load {embeddings_cls.__name__}: {e}"
+warnings.warn(
+f"Failed to load {embeddings_cls.__name__}: {e}. Falling back to default {embeddings_cls} model."
)
-except Exception:
+return embeddings_cls(**kwargs)
+except Exception as error:
+warnings.warn(f"Failed to load embeddings via registry: {error}. Falling back to SentenceTransformerEmbeddings.")
# Fall back to SentenceTransformerEmbeddings if no matching implementation is found
from .sentence_transformer import SentenceTransformerEmbeddings

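To see the new behaviour from the caller's side, a hedged sketch is below (not part of this commit). The model string is a placeholder, and whether the final fallback succeeds depends on code outside the visible hunk, so the call is wrapped defensively:

import warnings

from chonkie import AutoEmbeddings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    try:
        # Placeholder identifier: with this change, a provider-specific load failure
        # should warn and return that provider's default model instead of raising.
        embeddings = AutoEmbeddings.get_embeddings("nonexistent-model-identifier")
    except Exception as error:
        # The final SentenceTransformer fallback (outside this hunk) can still fail.
        print(f"Fallback also failed: {error}")

for warning in caught:
    print(warning.category.__name__, warning.message)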
2 changes: 1 addition & 1 deletion src/chonkie/types.py
@@ -274,7 +274,7 @@ def __repr__(self) -> str:
return (
f"SemanticSentence(text={self.text}, start_index={self.start_index}, "
f"end_index={self.end_index}, token_count={self.token_count}, "
f"sentences={self.sentences})"
f"embedding={self.embedding})"
)


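For reference, a small sketch of the repaired `__repr__` (not from this commit). It assumes `SemanticSentence` is a plain dataclass that can be constructed directly with the fields shown above; in normal use these objects come from the semantic chunker rather than being built by hand:

from chonkie.types import SemanticSentence

sentence = SemanticSentence(
    text="Chonkie chunks text.",
    start_index=0,
    end_index=20,
    token_count=5,
    embedding=None,  # would normally hold the sentence's embedding vector
)

# The old __repr__ printed `sentences`, which this type does not appear to define
# (hence the "bad __repr__" in the commit title); the fix reports `embedding` instead.
print(repr(sentence))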
