"""
Token Translator Core Module
Pure conversion functions without file I/O or command line interface
Easy to embed in other Python projects
"""
import requests
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass
from enum import Enum


class DatabasePriority(Enum):
    """Priority of token databases"""
    SLOVA = 0
    RUSSIAN = 1
    NAMES = 2


@dataclass
class TokenInfo:
    """Information about a single token"""
    id: str
    source: Optional[DatabasePriority]
    is_full: bool


@dataclass
class TranslationResult:
    """Result of translation with statistics"""
    content: str
    total_tokens: int = 0
    slova_tokens: int = 0
    russian_tokens: int = 0
    names_tokens: int = 0
    partial_tokens: int = 0


class TokenConverter:
    """
    Core token conversion class.
    Can be easily embedded in other projects.
    """

    # Database URLs - can be customized
    DATABASE_URLS = {
        "slova.json": "https://raw.githubusercontent.com/scream-dev/Scream-Dev.ru/refs/heads/main/cdn/slova.json",
        "russian.json": "https://dl.dropboxusercontent.com/scl/fi/l7u0na2cp99btx3etxskx/russian.json?rlkey=wgi95f6vq32cpdt48nmzajxu0&st=28buk26k&dl=0",
        "russian-names.json": "https://dl.dropboxusercontent.com/scl/fi/cfogog0iaoacjw77qxr3p/russian_surnames.json?rlkey=splkwdgmrncenwekwocs1b2jw&st=sfwcna67&dl=0"
    }
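
    # Customization sketch: the class-level URLs can be swapped out before
    # instantiation, e.g. to point at a mirror (the URL below is a
    # hypothetical placeholder, not a real endpoint):
    #
    #     TokenConverter.DATABASE_URLS["slova.json"] = "https://example.com/slova.json"
    #     converter = TokenConverter()  # auto_load now fetches the mirror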

    def __init__(self,
                 token_map_slova: Optional[Dict[str, str]] = None,
                 token_map_russian: Optional[Dict[str, str]] = None,
                 token_map_names: Optional[Dict[str, str]] = None,
                 auto_load: bool = True):
        """
        Initialize token converter

        Args:
            token_map_slova: Pre-loaded slova tokens (optional)
            token_map_russian: Pre-loaded russian tokens (optional)
            token_map_names: Pre-loaded names tokens (optional)
            auto_load: Automatically load tokens from URLs if not provided
        """
        self.token_map_slova = token_map_slova or {}
        self.token_map_russian = token_map_russian or {}
        self.token_map_names = token_map_names or {}
        if auto_load and not any([token_map_slova, token_map_russian, token_map_names]):
            self.load_all_tokens()

    @staticmethod
    def download_json(url: str) -> Optional[Dict]:
        """Download JSON from a URL; return None on any failure"""
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response.json()
        except Exception:
            return None

    def load_all_tokens(self) -> bool:
        """Load all token databases from URLs"""
        # Load slova.json
        slova_data = self.download_json(self.DATABASE_URLS["slova.json"])
        if slova_data and "token_to_id" in slova_data:
            self.token_map_slova = {k: str(v) for k, v in slova_data["token_to_id"].items()}

        # Load russian.json
        russian_data = self.download_json(self.DATABASE_URLS["russian.json"])
        if russian_data and "token_to_id" in russian_data:
            self.token_map_russian = {k: str(v) for k, v in russian_data["token_to_id"].items()}

        # Load russian-names.json
        names_data = self.download_json(self.DATABASE_URLS["russian-names.json"])
        if names_data and "token_to_id" in names_data:
            self.token_map_names = {k: str(v) for k, v in names_data["token_to_id"].items()}

        return any([self.token_map_slova, self.token_map_russian, self.token_map_names])

    def set_token_maps(self,
                       slova: Optional[Dict[str, str]] = None,
                       russian: Optional[Dict[str, str]] = None,
                       names: Optional[Dict[str, str]] = None):
        """Manually set token maps (only non-None arguments are applied)"""
        if slova:
            self.token_map_slova = slova
        if russian:
            self.token_map_russian = russian
        if names:
            self.token_map_names = names

    def get_token_id(self, word: str) -> Optional[TokenInfo]:
        """Find a full token match for a word"""
        lower_word = word.lower().strip()
        if lower_word in self.token_map_slova:
            return TokenInfo(self.token_map_slova[lower_word], DatabasePriority.SLOVA, True)
        if lower_word in self.token_map_russian:
            return TokenInfo(self.token_map_russian[lower_word], DatabasePriority.RUSSIAN, True)
        if lower_word in self.token_map_names:
            return TokenInfo(self.token_map_names[lower_word], DatabasePriority.NAMES, True)
        return None

    def find_longest_token(self, word: str, start_index: int = 0) -> Optional[Tuple[str, str, DatabasePriority, int, int]]:
        """Find the longest token starting from the given position"""
        search_word = word.lower()
        best_match = None
        best_length = 0

        # Search in slova.json
        for token, token_id in self.token_map_slova.items():
            if search_word.startswith(token, start_index) and len(token) > best_length:
                best_match = (token, token_id, DatabasePriority.SLOVA, start_index, start_index + len(token))
                best_length = len(token)

        # Search in russian.json
        for token, token_id in self.token_map_russian.items():
            if search_word.startswith(token, start_index) and len(token) > best_length:
                best_match = (token, token_id, DatabasePriority.RUSSIAN, start_index, start_index + len(token))
                best_length = len(token)

        # Search in names.json
        for token, token_id in self.token_map_names.items():
            if search_word.startswith(token, start_index) and len(token) > best_length:
                best_match = (token, token_id, DatabasePriority.NAMES, start_index, start_index + len(token))
                best_length = len(token)

        return best_match
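
    # Illustrative trace of find_longest_token (hypothetical map contents):
    # with token_map_slova = {"при": "1", "привет": "5"}, calling
    # find_longest_token("привет", 0) considers both candidates and keeps
    # the longer one, returning ("привет", "5", DatabasePriority.SLOVA, 0, 6).
    # Ties between databases go to the earlier (higher-priority) map,
    # because the length comparison is strictly greater-than.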

    def tokenize_word(self, word: str) -> List[TokenInfo]:
        """Tokenize a single word into tokens"""
        result = []
        if not word:
            return result

        lower_word = word.lower().strip()
        full_match = self.get_token_id(lower_word)
        if full_match:
            return [full_match]

        current_index = 0
        while current_index < len(lower_word):
            match = self.find_longest_token(lower_word, current_index)
            if match:
                token, token_id, source, start, end = match
                result.append(TokenInfo(token_id, source, True))
                current_index = end
                # Add separator "-" if there is continuation
                if current_index < len(lower_word) and result:
                    result.append(TokenInfo("-", None, False))
            else:
                # Partial token for a single character
                partial_char = lower_word[current_index]
                result.append(TokenInfo(partial_char, None, False))
                current_index += 1
                if current_index < len(lower_word):
                    result.append(TokenInfo("-", None, False))

        return result
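
    # Illustrative trace of tokenize_word (hypothetical ids): with
    # token_map_slova = {"при": "1", "вет": "2"} and no full-word entry for
    # "привет", tokenize_word("привет") greedily matches "при" (id "1"),
    # emits a "-" separator token, then matches "вет" (id "2"), so the word
    # is encoded downstream as "1-2".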

    def text_to_tokens(self, text: str) -> TranslationResult:
        """
        Convert text to tokens

        Args:
            text: Input text string

        Returns:
            TranslationResult with tokens and statistics
        """
        if not text:
            return TranslationResult("")

        words = text.split()
        result_parts = []
        result = TranslationResult("")

        for word_idx, word in enumerate(words):
            tokenized = self.tokenize_word(word)
            if word_idx > 0 and tokenized:
                result_parts.append("+")
            for token_idx, token in enumerate(tokenized):
                if token.source is None and token.id == "-":
                    result_parts.append("-")
                elif token.source is None and token.id != "-":
                    result_parts.append(token.id)
                    result.total_tokens += 1
                    result.partial_tokens += 1
                else:
                    if token_idx > 0 and result_parts[-1] != "-":
                        if result_parts and result_parts[-1] not in ["+", "-"]:
                            result_parts.append("-")
                    result_parts.append(token.id)
                    result.total_tokens += 1
                    if token.source == DatabasePriority.SLOVA:
                        result.slova_tokens += 1
                    elif token.source == DatabasePriority.RUSSIAN:
                        result.russian_tokens += 1
                    elif token.source == DatabasePriority.NAMES:
                        result.names_tokens += 1

        result.content = "".join(result_parts)
        return result
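
    # Output format sketch (hypothetical ids): "+" separates words, "-"
    # separates tokens inside a word. With token_map_slova = {"привет": "100"}
    # and no entries covering "мир", text_to_tokens("привет мир").content is
    # "100+м-и-р": one full token plus three single-character partial tokens.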

    def tokens_to_text(self, tokens: str) -> TranslationResult:
        """
        Convert tokens back to text

        Args:
            tokens: Token string with + and - separators

        Returns:
            TranslationResult with text and statistics
        """
        if not tokens:
            return TranslationResult("")

        # Create reverse maps
        reverse_slova = {v: k for k, v in self.token_map_slova.items()}
        reverse_russian = {v: k for k, v in self.token_map_russian.items()}
        reverse_names = {v: k for k, v in self.token_map_names.items()}

        result_parts = []
        current_word = []
        result = TranslationResult("")
        current_token = ""

        for char in tokens:
            if char in ['+', '-']:
                if current_token:
                    result.total_tokens += 1
                    # Check if it's a number (token ID) or a character
                    if current_token.isdigit():
                        # Search in reverse maps
                        if current_token in reverse_slova:
                            current_word.append(reverse_slova[current_token])
                            result.slova_tokens += 1
                        elif current_token in reverse_russian:
                            current_word.append(reverse_russian[current_token])
                            result.russian_tokens += 1
                        elif current_token in reverse_names:
                            current_word.append(reverse_names[current_token])
                            result.names_tokens += 1
                    else:
                        # Partial token (single character)
                        current_word.append(current_token)
                        result.partial_tokens += 1
                    current_token = ""
                if char == '+':
                    if current_word:
                        result_parts.append("".join(current_word))
                        current_word = []
                    if result_parts and result_parts[-1] != " ":
                        result_parts.append(" ")
                # '-' means continue the word, so do nothing
            else:
                current_token += char

        # Process the last token
        if current_token:
            result.total_tokens += 1
            if current_token.isdigit():
                if current_token in reverse_slova:
                    current_word.append(reverse_slova[current_token])
                    result.slova_tokens += 1
                elif current_token in reverse_russian:
                    current_word.append(reverse_russian[current_token])
                    result.russian_tokens += 1
                elif current_token in reverse_names:
                    current_word.append(reverse_names[current_token])
                    result.names_tokens += 1
            else:
                current_word.append(current_token)
                result.partial_tokens += 1

        if current_word:
            result_parts.append("".join(current_word))

        result.content = "".join(result_parts).strip()
        return result
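
    # Round-trip sketch (hypothetical ids): with token_map_slova =
    # {"привет": "100"}, tokens_to_text("100+м-и-р").content is "привет мир":
    # "100" resolves through the reverse map, while "м", "и", "р" are
    # non-numeric partial tokens and pass through unchanged. Note that a
    # numeric token absent from every reverse map is silently dropped.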

    def get_stats_summary(self, result: TranslationResult) -> Dict[str, Any]:
        """Get a statistics summary as a dictionary"""
        return {
            'total_tokens': result.total_tokens,
            'slova_tokens': result.slova_tokens,
            'russian_tokens': result.russian_tokens,
            'names_tokens': result.names_tokens,
            'partial_tokens': result.partial_tokens,
            'content_length': len(result.content)
        }

    def calculate_compression(self, original_text: str, tokenized_text: str) -> float:
        """
        Calculate the compression percentage

        Returns:
            Positive percentage if compressed (smaller),
            negative if expanded (larger)
        """
        if not original_text:
            return 0.0
        return ((len(original_text) - len(tokenized_text)) / len(original_text)) * 100.0
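
    # Worked example: for original_text "привет мир" (10 characters) encoded
    # as "100+101" (7 characters, hypothetical ids), the result is
    # (10 - 7) / 10 * 100 = 30.0, i.e. a 30% size reduction.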


# Convenience functions for easy embedding

def create_converter(preloaded_tokens: Optional[Dict[str, Dict[str, str]]] = None) -> TokenConverter:
    """
    Create a TokenConverter instance with optional pre-loaded tokens

    Args:
        preloaded_tokens: Dictionary with 'slova', 'russian', 'names' keys

    Returns:
        TokenConverter instance
    """
    if preloaded_tokens:
        return TokenConverter(
            token_map_slova=preloaded_tokens.get('slova'),
            token_map_russian=preloaded_tokens.get('russian'),
            token_map_names=preloaded_tokens.get('names'),
            auto_load=False
        )
    return TokenConverter()


def text_to_tokens_simple(text: str, converter: Optional[TokenConverter] = None) -> str:
    """
    Simple text-to-tokens conversion

    Args:
        text: Input text
        converter: Optional TokenConverter instance

    Returns:
        Token string
    """
    if converter is None:
        converter = TokenConverter()
    return converter.text_to_tokens(text).content


def tokens_to_text_simple(tokens: str, converter: Optional[TokenConverter] = None) -> str:
    """
    Simple tokens-to-text conversion

    Args:
        tokens: Token string
        converter: Optional TokenConverter instance

    Returns:
        Text string
    """
    if converter is None:
        converter = TokenConverter()
    return converter.tokens_to_text(tokens).content


# Example of embedding in another project
"""
# Example usage in another Python file:
from tokenizer import TokenConverter, text_to_tokens_simple

# Option 1: Simple usage
tokens = text_to_tokens_simple("Привет мир")
print(tokens)

# Option 2: Full control
converter = TokenConverter()
result = converter.text_to_tokens("Hello world")
print(f"Tokens: {result.content}")
print(f"Stats: {converter.get_stats_summary(result)}")

# Option 3: With pre-loaded tokens
my_tokens = {
    'slova': {'привет': '100', 'мир': '101'},
    'russian': {'hello': '200', 'world': '201'}
}
converter = TokenConverter(
    token_map_slova=my_tokens['slova'],
    token_map_russian=my_tokens['russian'],
    auto_load=False
)
"""