-
-
Notifications
You must be signed in to change notification settings - Fork 28
feature:completed bpe and base #64
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 7 commits
2f71909
82b0209
71ea890
3ff1c96
9d743a2
dcdb8c4
1d049ff
088ec84
49c7ba6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -333,4 +333,5 @@ __pycache__/ | |
| *.pyd | ||
| *.bz2 | ||
|
|
||
| *.venv | ||
| *.venv | ||
| venv/ | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,13 +19,30 @@ def __init__(self, vocab_size: int, min_frequency: int): | |
|
|
||
| @abstractmethod | ||
| def train(self, text_file: Path, save_path: Path): | ||
| """Train tokenizer and save model.""" | ||
| """Train tokenizer on a text corpus and save artifacts to save_path.""" | ||
| pass | ||
|
|
||
| @abstractmethod | ||
| def encode(self, text: str) -> list[int]: | ||
| """Encode text into a list of integer token ids.""" | ||
| pass | ||
|
|
||
| @abstractmethod | ||
| def decode(self, ids: list[int]) -> str: | ||
| """Decode a list of integer token ids back into text.""" | ||
| pass | ||
|
|
||
| @abstractmethod | ||
| def load(self, tokenizer_dir: Path): | ||
| """Load a previously trained tokenizer from disk.""" | ||
|
Comment on lines
25
to
+37
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain 🏁 Script executed: #!/bin/bash
set -euo pipefail
echo "BaseTokenizer abstract method declarations:"
rg -nP '^\s*@abstractmethod\s*$|^\s*def (encode|decode|load)\(' openverifiablellm/tokenizer/base.py -C1
echo
echo "SentencePieceTokenizer implementations for the same methods:"
rg -nP '^\s*def (encode|decode|load)\(' openverifiablellm/tokenizer/sentencepiece_tokenizer.py -C1 || true
echo
echo "Factory path that instantiates SentencePieceTokenizer:"
rg -nP 'SentencePieceTokenizer|tokenizer_type == "sentencepiece"' openverifiablellm/tokenizer/factory.py -C2
Repository: AOSSIE-Org/OpenVerifiableLLM Length of output: 1296 🏁 Script executed: #!/bin/bash
set -euo pipefail
echo "=== SentencePieceTokenizer class definition and all methods ==="
cat -n openverifiablellm/tokenizer/sentencepiece_tokenizer.py
echo
echo "=== Checking parent class(es) ==="
rg -P 'class SentencePieceTokenizer' openverifiablellm/tokenizer/sentencepiece_tokenizer.py
Repository: AOSSIE-Org/OpenVerifiableLLM Length of output: 1084
Implement 🤖 Prompt for AI Agents |
||
| pass | ||
|
|
||
| @abstractmethod | ||
| def get_vocab_path(self, tokenizer_dir: Path) -> Path: | ||
| """Return path to the vocabulary file.""" | ||
| pass | ||
|
|
||
| @abstractmethod | ||
| def get_merges_path(self, tokenizer_dir: Path): | ||
| def get_merges_path(self, tokenizer_dir: Path) -> Path | None: | ||
| """Return path to the merges file, or None if not applicable.""" | ||
| pass | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,7 +8,43 @@ | |
|
|
||
|
|
||
| class BPETokenizer(BaseTokenizer): | ||
| """ | ||
| Byte-level BPE tokenizer implementation. | ||
|
|
||
| Wraps HuggingFace's ByteLevelBPETokenizer and implements | ||
| the full BaseTokenizer interface including train, encode, | ||
| decode, and load. | ||
| """ | ||
|
|
||
| def __init__(self, vocab_size: int, min_frequency: int): | ||
| super().__init__(vocab_size, min_frequency) | ||
| self._tokenizer = None | ||
|
|
||
| # ------------------------------------------------------------------ | ||
| # Training | ||
| # ------------------------------------------------------------------ | ||
|
|
||
| def train(self, text_file: Path, save_path: Path): | ||
| """ | ||
| Train BPE tokenizer on text corpus and save artifacts. | ||
|
|
||
| Args: | ||
| text_file: Path to training text corpus. | ||
| save_path: Directory to save vocab.json and merges.txt. | ||
|
|
||
| Raises: | ||
| FileNotFoundError: If text_file does not exist or is not a file. | ||
| """ | ||
|
|
||
| text_file = Path(text_file) | ||
| save_path = Path(save_path) | ||
|
|
||
| if not text_file.is_file(): | ||
| raise FileNotFoundError( | ||
| f"Training file not found at {text_file}. Please provide a valid text corpus file." | ||
| ) | ||
|
|
||
| save_path.mkdir(parents=True, exist_ok=True) | ||
|
|
||
| tokenizer = ByteLevelBPETokenizer() | ||
|
|
||
|
|
@@ -21,8 +57,106 @@ def train(self, text_file: Path, save_path: Path): | |
|
|
||
| tokenizer.save_model(str(save_path)) | ||
|
|
||
| self._tokenizer = tokenizer | ||
|
|
||
| # ------------------------------------------------------------------ | ||
| # Encode / Decode | ||
| # ------------------------------------------------------------------ | ||
|
|
||
| def encode(self, text: str) -> list[int]: | ||
| """ | ||
| Encode text into a list of token ids. | ||
|
|
||
| Args: | ||
| text: Input string to tokenize. | ||
|
|
||
| Returns: | ||
| List of integer token ids. | ||
|
|
||
| Raises: | ||
| RuntimeError: If tokenizer has not been trained or loaded. | ||
| """ | ||
|
|
||
| self._check_loaded() | ||
| return self._tokenizer.encode(text).ids | ||
|
|
||
| def decode(self, ids: list[int]) -> str: | ||
| """ | ||
| Decode a list of token ids back into text. | ||
|
|
||
| Args: | ||
| ids: List of integer token ids. | ||
|
|
||
| Returns: | ||
| Decoded string. | ||
|
|
||
| Raises: | ||
| RuntimeError: If tokenizer has not been trained or loaded. | ||
| """ | ||
|
|
||
| self._check_loaded() | ||
| return self._tokenizer.decode(ids, skip_special_tokens=False) | ||
|
|
||
| # ------------------------------------------------------------------ | ||
| # Load | ||
| # ------------------------------------------------------------------ | ||
|
|
||
| def load(self, tokenizer_dir: Path): | ||
| """ | ||
| Load a previously trained BPE tokenizer from disk. | ||
|
|
||
| Args: | ||
| tokenizer_dir: Directory containing vocab.json and merges.txt. | ||
|
|
||
| Raises: | ||
| FileNotFoundError: If vocab.json or merges.txt are not found. | ||
| """ | ||
|
|
||
| tokenizer_dir = Path(tokenizer_dir) | ||
|
|
||
| vocab_path = tokenizer_dir / "vocab.json" | ||
| merges_path = tokenizer_dir / "merges.txt" | ||
|
Comment on lines
+117
to
+118
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🧹 Nitpick | 🔵 Trivial Use path helper methods inside load(). Lines 117–118 redefine artifact paths that are already centralized in get_vocab_path() and get_merges_path(). ♻️ Proposed refactor
- vocab_path = tokenizer_dir / "vocab.json"
- merges_path = tokenizer_dir / "merges.txt"
+ vocab_path = self.get_vocab_path(tokenizer_dir)
+ merges_path = self.get_merges_path(tokenizer_dir)🤖 Prompt for AI Agents |
||
|
|
||
| if not vocab_path.is_file(): | ||
| raise FileNotFoundError( | ||
| f"vocab.json not found at {vocab_path}. Please train the tokenizer first." | ||
| ) | ||
|
|
||
| if not merges_path.is_file(): | ||
| raise FileNotFoundError( | ||
| f"merges.txt not found at {merges_path}. Please train the tokenizer first." | ||
| ) | ||
|
|
||
| self._tokenizer = ByteLevelBPETokenizer( | ||
| vocab=str(vocab_path), | ||
| merges=str(merges_path), | ||
| ) | ||
|
|
||
| # ------------------------------------------------------------------ | ||
| # Artifact paths | ||
| # ------------------------------------------------------------------ | ||
|
|
||
| def get_vocab_path(self, tokenizer_dir: Path) -> Path: | ||
| return tokenizer_dir / "vocab.json" | ||
| """Return path to vocab.json file.""" | ||
| return Path(tokenizer_dir) / "vocab.json" | ||
|
|
||
| def get_merges_path(self, tokenizer_dir: Path) -> Path: | ||
| return tokenizer_dir / "merges.txt" | ||
| """Return path to merges.txt file.""" | ||
| return Path(tokenizer_dir) / "merges.txt" | ||
|
Comment on lines
146
to
+148
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Return type mismatch with base class. The base class declares get_merges_path(self, tokenizer_dir: Path) -> Path | None, while this override annotates the return as Path. ♻️ Proposed fix
- def get_merges_path(self, tokenizer_dir: Path) -> Path:
+ def get_merges_path(self, tokenizer_dir: Path) -> Path | None:
"""Return path to merges.txt file."""
return Path(tokenizer_dir) / "merges.txt"🤖 Prompt for AI Agents |
||
|
|
||
| # ------------------------------------------------------------------ | ||
| # Internal helpers | ||
| # ------------------------------------------------------------------ | ||
|
|
||
| def _check_loaded(self): | ||
| """ | ||
| Check that tokenizer is loaded before encode/decode. | ||
|
|
||
| Raises: | ||
| RuntimeError: If tokenizer has not been trained or loaded. | ||
| """ | ||
|
|
||
| if self._tokenizer is None: | ||
| raise RuntimeError( | ||
| "BPE tokenizer is not loaded. Call train() or load() before encode/decode." | ||
| ) | ||
Uh oh!
There was an error while loading. Please reload this page.