-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokenizer.js
37 lines (31 loc) · 1.01 KB
/
tokenizer.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import { AutoTokenizer } from "@xenova/transformers";
/**
 * Tokenizers keyed by model identifier.
 *
 * Values are the promises returned while loading, not the resolved
 * tokenizers, so that concurrent requests for the same model share a single
 * load instead of each starting their own.
 */
const tokenizerCache = new Map();

/**
 * Returns a promise for the tokenizer of `model`, starting the load on the
 * first request and reusing the same promise afterwards.
 *
 * @param {string} model - Identifier passed to `AutoTokenizer.from_pretrained`.
 * @returns {Promise<object>} Promise resolving to the loaded tokenizer.
 */
function loadTokenizer(model) {
  let pending = tokenizerCache.get(model);
  if (pending === undefined) {
    console.log(`Loading tokenizer for model: ${model}`);
    // The async wrapper turns a synchronous throw into a rejection, so every
    // failure mode flows through the same eviction path below.
    pending = (async () => AutoTokenizer.from_pretrained(model))().catch(
      (error) => {
        // Do not keep a failed load around: a later call should retry.
        tokenizerCache.delete(model);
        throw error;
      },
    );
    tokenizerCache.set(model, pending);
  }
  return pending;
}

/**
 * Encodes `text` with the tokenizer for `model` and returns the token ids
 * together with the per-token decoded strings.
 *
 * The tokenizer is loaded on first use and cached per model; concurrent
 * calls for the same model wait on one shared load.
 *
 * @param {string} text - Text handed unchanged to the tokenizer's `encode`.
 * @param {string} model - Model identifier; also used as the cache key.
 * @returns {Promise<{tokenIds: number[], tokens: string[], tokenCount: number}>}
 *   `tokenIds` is the array returned by `encode`, `tokens` holds the result
 *   of decoding each id on its own, and `tokenCount` is the number of ids.
 * @throws {Error} If `encode` does not return an array, or if loading or
 *   using the tokenizer fails. The error is logged and then rethrown as-is.
 */
export async function tokenize(text, model) {
  try {
    const tokenizer = await loadTokenizer(model);
    const tokenIds = tokenizer.encode(text);

    if (!Array.isArray(tokenIds)) {
      throw new Error(`Unexpected encoded output type: ${typeof tokenIds}`);
    }

    return {
      tokenIds,
      // Decode one id at a time so each id gets its own string.
      tokens: tokenIds.map((id) => tokenizer.decode([id])),
      tokenCount: tokenIds.length,
    };
  } catch (error) {
    console.error(
      `Error loading or using tokenizer for model ${model}:`,
      // Anything can be thrown in JS; only Error instances carry `.message`.
      error instanceof Error ? error.message : error,
    );
    throw error;
  }
}