
Update TikToken for perfect token computation on 'o' models.
enricoros committed May 16, 2024
1 parent a9c1c34 commit 21d045b
Showing 3 changed files with 29 additions and 19 deletions.
8 changes: 4 additions & 4 deletions package-lock.json

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion package.json
@@ -62,7 +62,7 @@
"sharp": "^0.33.3",
"superjson": "^2.2.1",
"tesseract.js": "^5.1.0",
"tiktoken": "^1.0.14",
"tiktoken": "^1.0.15",
"turndown": "^7.1.3",
"uuid": "^9.0.1",
"zod": "^3.23.8",
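For context on the commit title: tiktoken 1.0.15 is the release that ships the o200k_base encoding used by OpenAI's 'o' models such as gpt-4o. A minimal sketch of what the version bump enables, assuming the tiktoken npm package's standard API (this snippet is illustrative, not part of the commit):

import { encoding_for_model } from 'tiktoken';

// With tiktoken >= 1.0.15, 'gpt-4o' resolves to the o200k_base tokenizer,
// so counts are exact; on 1.0.14 the model is unknown and code like
// token-counter.ts below would fall back to cl100k_base approximations.
const enc = encoding_for_model('gpt-4o');
console.log(enc.encode('Hello, world!').length); // exact 'o'-model token count
enc.free(); // the WASM-backed encoder must be freed explicitly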
38 changes: 24 additions & 14 deletions src/common/util/token-counter.ts
@@ -6,14 +6,17 @@ import { DLLMId, findLLMOrThrow } from '~/modules/llms/store-llms';
 // Do not set this to true in production, it's very verbose
 const DEBUG_TOKEN_COUNT = false;


-// global symbols to dynamically load the Tiktoken library
+// Global symbols to dynamically load the Tiktoken library
 let get_encoding: ((encoding: TiktokenEncoding) => Tiktoken) | null = null;
 let encoding_for_model: ((model: TiktokenModel) => Tiktoken) | null = null;
 let preloadPromise: Promise<void> | null = null;
 let informTheUser = false;

-export function preloadTiktokenLibrary() {
+/**
+ * Preloads the Tiktoken library if not already loaded.
+ * @returns {Promise<void>} A promise that resolves when the library is loaded.
+ */
+export function preloadTiktokenLibrary(): Promise<void> {
   if (!preloadPromise) {
     preloadPromise = import('tiktoken')
       .then(tiktoken => {
@@ -33,16 +36,21 @@ export function preloadTiktokenLibrary() {


 /**
- * Wrapper around the Tiktoken library, to keep tokenizers for all models in a cache
- *
- * We also preload the tokenizer for the default model, so that the first time a user types
- * a message, it doesn't stall loading the tokenizer.
+ * Wrapper around the Tiktoken library to keep tokenizers for all models in a cache.
+ * Also, preloads the tokenizer for the default model to avoid initial stall.
  */
 export const countModelTokens: (text: string, llmId: DLLMId, debugFrom: string) => number | null = (() => {
   // return () => 0;
   const tokenEncoders: { [modelId: string]: Tiktoken } = {};
-  let encodingCL100K: Tiktoken | null = null;
+  let encodingDefault: Tiktoken | null = null;

+  /**
+   * Counts the tokens in the given text for the specified model.
+   * @param {string} text - The text to tokenize.
+   * @param {DLLMId} llmId - The ID of the LLM.
+   * @param {string} debugFrom - Debug information.
+   * @returns {number | null} The token count or null if not ready.
+   */
   function _tokenCount(text: string, llmId: DLLMId, debugFrom: string): number | null {

     // The library shall have been preloaded - if not, attempt to start its loading and return null to indicate we're not ready to count
@@ -55,21 +63,23 @@ export const countModelTokens: (text: string, llmId: DLLMId, debugFrom: string)
       return null;
     }

-    const { options: { llmRef: openaiModel } } = findLLMOrThrow(llmId);
+    const openaiModel = findLLMOrThrow(llmId)?.options?.llmRef;
+    if (!openaiModel) throw new Error(`LLM ${llmId} has no LLM reference id`);

     if (!(openaiModel in tokenEncoders)) {
       try {
         tokenEncoders[openaiModel] = encoding_for_model(openaiModel as TiktokenModel);
       } catch (e) {
-        // make sure we recycle the default encoding across all models
-        if (!encodingCL100K)
-          encodingCL100K = get_encoding('cl100k_base');
-        tokenEncoders[openaiModel] = encodingCL100K;
+        // fallback to the default encoding across all models (not just OpenAI - this will be used everywhere..)
+        if (!encodingDefault)
+          encodingDefault = get_encoding('cl100k_base');
+        tokenEncoders[openaiModel] = encodingDefault;
       }
     }
-    let count: number = 0;

+    // Note: the try/catch shouldn't be necessary, but there could be corner cases where the tiktoken library throws
+    // https://github.com/enricoros/big-agi/issues/182
+    let count = 0;
     try {
       count = tokenEncoders[openaiModel]?.encode(text, 'all', [])?.length || 0;
     } catch (e) {
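For reference, a hypothetical call site consuming the contract documented in the new JSDoc (draftText and setTokenCount are illustrative names, not code from this repository):

// null means the tiktoken WASM module has not finished loading yet;
// the call itself kicks off preloadTiktokenLibrary(), so retry shortly.
const tokens = countModelTokens(draftText, llmId, 'composer');
if (tokens === null) {
  // tokenizer not ready - keep a placeholder and recount once loaded
} else {
  setTokenCount(tokens);
}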
