Merge pull request #179 from chonkie-ai/development
[Fix] Use default model in `AutoEmbeddings` if `Error: model not found` + bad `__repr__` for `SemanticSentence`
bhavnicksm authored Feb 17, 2025
2 parents 932f484 + 4c064da commit f4e460d
Showing 5 changed files with 37 additions and 8 deletions.
8 changes: 8 additions & 0 deletions src/chonkie/chunker/sentence.py
@@ -78,6 +78,14 @@ def __init__(
if return_type not in ["chunks", "texts"]:
raise ValueError("Invalid return_type. Must be either 'chunks' or 'texts'.")

# Add chunk_overlap deprecation warning
if chunk_overlap > 0:
warnings.warn(
"chunk_overlap is getting deprecated in v0.6.0. " +
"🦛 Chonkie advises you to use OverlapRefinery instead which is more flexible and powerful!",
DeprecationWarning,
)
# Assign the values if they make sense
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.min_sentences_per_chunk = min_sentences_per_chunk
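For context, a minimal sketch of how the new warning surfaces to callers (not part of this commit). It assumes `SentenceChunker` is importable from the top-level `chonkie` package and passes only the parameters visible in this diff:

import warnings

from chonkie import SentenceChunker

# DeprecationWarning is filtered out by default for library code, so opt in to see it.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    chunker = SentenceChunker(chunk_size=512, chunk_overlap=64)

for warning in caught:
    if issubclass(warning.category, DeprecationWarning):
        print(warning.message)  # points users at OverlapRefinery instead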
14 changes: 11 additions & 3 deletions src/chonkie/chunker/token.py
@@ -1,6 +1,7 @@
"""Token-based chunking."""

from typing import Any, Generator, List, Literal, Union
import warnings

from tqdm import trange

@@ -24,7 +25,7 @@ def __init__(
self,
tokenizer: Union[str, Any] = "gpt2",
chunk_size: int = 512,
-chunk_overlap: Union[int, float] = 128,
+chunk_overlap: Union[int, float] = 0,
return_type: Literal["chunks", "texts"] = "chunks",
) -> None:
"""Initialize the TokenChunker with configuration parameters.
@@ -44,11 +45,18 @@ def __init__(
raise ValueError("chunk_size must be positive")
if isinstance(chunk_overlap, int) and chunk_overlap >= chunk_size:
raise ValueError("chunk_overlap must be less than chunk_size")
if isinstance(chunk_overlap, float) and chunk_overlap >= 1:
raise ValueError("chunk_overlap must be less than 1")
if return_type not in ["chunks", "texts"]:
raise ValueError("return_type must be either 'chunks' or 'texts'")

# Add chunk_overlap deprecation warning
if chunk_overlap > 0:
warnings.warn(
"chunk_overlap is getting deprecated in v0.6.0. " +
"🦛 Chonkie advises you to use OverlapRefinery instead which is more flexible and powerful!",
DeprecationWarning,
)

# Assign the values if they make sense
self.return_type = return_type
self.chunk_size = chunk_size
self.chunk_overlap = (
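The hunk above also tightens validation for fractional overlaps. A brief sketch of the resulting behaviour follows (an illustration, not code from this commit); it assumes `TokenChunker` is importable from `chonkie` and that a float `chunk_overlap` is treated as a fraction of `chunk_size`, which the truncated `self.chunk_overlap = (` assignment suggests but this diff does not show in full:

from chonkie import TokenChunker

# A fractional overlap below 1 is still accepted (and now triggers the DeprecationWarning).
chunker = TokenChunker(chunk_size=512, chunk_overlap=0.25)

# A float overlap of 1 or more is rejected by the new check.
try:
    TokenChunker(chunk_size=512, chunk_overlap=1.5)
except ValueError as error:
    print(error)  # "chunk_overlap must be less than 1"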
12 changes: 11 additions & 1 deletion src/chonkie/chunker/word.py
@@ -1,6 +1,7 @@
"""Word-based chunker."""

import re
import warnings
from typing import Any, Callable, List, Literal, Tuple, Union

from chonkie.types import Chunk
@@ -26,7 +27,7 @@ def __init__(
self,
tokenizer_or_token_counter: Union[str, Callable, Any] = "gpt2",
chunk_size: int = 512,
-chunk_overlap: int = 128,
+chunk_overlap: int = 0,
return_type: Literal["chunks", "texts"] = "chunks",
):
"""Initialize the WordChunker with configuration parameters.
@@ -50,6 +51,15 @@ def __init__(
if return_type not in ["chunks", "texts"]:
raise ValueError("Invalid return_type. Must be either 'chunks' or 'texts'.")

# Add chunk_overlap deprecation warning
if chunk_overlap > 0:
warnings.warn(
"chunk_overlap is getting deprecated in v0.6.0. " +
"🦛 Chonkie advises you to use OverlapRefinery instead which is more flexible and powerful!",
DeprecationWarning,
)

# Assign the values if they make sense
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.return_type = return_type
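The warning recommends `OverlapRefinery` as the replacement for `chunk_overlap`. A hypothetical sketch of that workflow is below; the `OverlapRefinery` constructor argument and the `refine` method used here are assumptions about its interface, not something shown in this PR:

from chonkie import OverlapRefinery, WordChunker

# Chunk without the deprecated overlap (its default is now 0) ...
chunker = WordChunker(chunk_size=512)
chunks = chunker.chunk("Some long document text ...")

# ... then add overlapping context as a separate refinement step.
refinery = OverlapRefinery(context_size=128)  # context_size is an assumed parameter name
chunks_with_context = refinery.refine(chunks)  # refine() is an assumed method name
print(len(chunks_with_context))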
9 changes: 6 additions & 3 deletions src/chonkie/embeddings/auto.py
@@ -1,5 +1,6 @@
"""AutoEmbeddings is a factory class for automatically loading embeddings."""

import warnings
from typing import Any, Union

from .base import BaseEmbeddings
@@ -71,10 +72,12 @@ def get_embeddings(
try:
return embeddings_cls(model, **kwargs)
except Exception as e:
-raise ValueError(
-f"Failed to load {embeddings_cls.__name__}: {e}"
+warnings.warn(
+f"Failed to load {embeddings_cls.__name__}: {e}. Falling back to default {embeddings_cls} model."
)
-except Exception:
+return embeddings_cls(**kwargs)
+except Exception as error:
+warnings.warn(f"Failed to load embeddings via registry: {error}. Falling back to SentenceTransformerEmbeddings.")
# Fall back to SentenceTransformerEmbeddings if no matching implementation is found
from .sentence_transformer import SentenceTransformerEmbeddings

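To see the new behaviour from the caller's side, a hedged sketch is below (not part of this commit). The model string is a placeholder, and whether the final fallback succeeds depends on code outside the visible hunk, so the call is wrapped defensively:

import warnings

from chonkie import AutoEmbeddings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    try:
        # Placeholder identifier: with this change, a provider-specific load failure
        # should warn and return that provider's default model instead of raising.
        embeddings = AutoEmbeddings.get_embeddings("nonexistent-model-identifier")
    except Exception as error:
        # The final SentenceTransformer fallback (outside this hunk) can still fail.
        print(f"Fallback also failed: {error}")

for warning in caught:
    print(warning.category.__name__, warning.message)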
2 changes: 1 addition & 1 deletion src/chonkie/types.py
@@ -274,7 +274,7 @@ def __repr__(self) -> str:
return (
f"SemanticSentence(text={self.text}, start_index={self.start_index}, "
f"end_index={self.end_index}, token_count={self.token_count}, "
f"sentences={self.sentences})"
f"embedding={self.embedding})"
)


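For reference, a small sketch of the repaired `__repr__` (not from this commit). It assumes `SemanticSentence` is a plain dataclass that can be constructed directly with the fields shown above; in normal use these objects come from the semantic chunker rather than being built by hand:

from chonkie.types import SemanticSentence

sentence = SemanticSentence(
    text="Chonkie chunks text.",
    start_index=0,
    end_index=20,
    token_count=5,
    embedding=None,  # would normally hold the sentence's embedding vector
)

# The old __repr__ printed `sentences`, which this type does not appear to define
# (hence the "bad __repr__" in the commit title); the fix reports `embedding` instead.
print(repr(sentence))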
