
Update TikToken for perfect token computation on 'o' models.
enricoros committed May 16, 2024
1 parent a9c1c34 commit 21d045b
Showing 3 changed files with 29 additions and 19 deletions.
8 changes: 4 additions & 4 deletions package-lock.json

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion package.json
@@ -62,7 +62,7 @@
"sharp": "^0.33.3",
"superjson": "^2.2.1",
"tesseract.js": "^5.1.0",
"tiktoken": "^1.0.14",
"tiktoken": "^1.0.15",
"turndown": "^7.1.3",
"uuid": "^9.0.1",
"zod": "^3.23.8",
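For context on the commit title: tiktoken 1.0.15 is the release that ships the o200k_base encoding used by OpenAI's 'o' models such as gpt-4o. A minimal sketch of what the version bump enables, assuming the tiktoken npm package's standard API (this snippet is illustrative, not part of the commit):

import { encoding_for_model } from 'tiktoken';

// With tiktoken >= 1.0.15, 'gpt-4o' resolves to the o200k_base tokenizer,
// so counts are exact; on 1.0.14 the model is unknown and code like
// token-counter.ts below would fall back to cl100k_base approximations.
const enc = encoding_for_model('gpt-4o');
console.log(enc.encode('Hello, world!').length); // exact 'o'-model token count
enc.free(); // the WASM-backed encoder must be freed explicitly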
38 changes: 24 additions & 14 deletions src/common/util/token-counter.ts
@@ -6,14 +6,17 @@ import { DLLMId, findLLMOrThrow } from '~/modules/llms/store-llms';
 // Do not set this to true in production, it's very verbose
 const DEBUG_TOKEN_COUNT = false;


-// global symbols to dynamically load the Tiktoken library
+// Global symbols to dynamically load the Tiktoken library
 let get_encoding: ((encoding: TiktokenEncoding) => Tiktoken) | null = null;
 let encoding_for_model: ((model: TiktokenModel) => Tiktoken) | null = null;
 let preloadPromise: Promise<void> | null = null;
 let informTheUser = false;

-export function preloadTiktokenLibrary() {
+/**
+ * Preloads the Tiktoken library if not already loaded.
+ * @returns {Promise<void>} A promise that resolves when the library is loaded.
+ */
+export function preloadTiktokenLibrary(): Promise<void> {
   if (!preloadPromise) {
     preloadPromise = import('tiktoken')
       .then(tiktoken => {
@@ -33,16 +36,21 @@ export function preloadTiktokenLibrary() {


 /**
- * Wrapper around the Tiktoken library, to keep tokenizers for all models in a cache
- *
- * We also preload the tokenizer for the default model, so that the first time a user types
- * a message, it doesn't stall loading the tokenizer.
+ * Wrapper around the Tiktoken library to keep tokenizers for all models in a cache.
+ * Also, preloads the tokenizer for the default model to avoid initial stall.
  */
 export const countModelTokens: (text: string, llmId: DLLMId, debugFrom: string) => number | null = (() => {
   // return () => 0;
   const tokenEncoders: { [modelId: string]: Tiktoken } = {};
-  let encodingCL100K: Tiktoken | null = null;
+  let encodingDefault: Tiktoken | null = null;

+  /**
+   * Counts the tokens in the given text for the specified model.
+   * @param {string} text - The text to tokenize.
+   * @param {DLLMId} llmId - The ID of the LLM.
+   * @param {string} debugFrom - Debug information.
+   * @returns {number | null} The token count or null if not ready.
+   */
   function _tokenCount(text: string, llmId: DLLMId, debugFrom: string): number | null {

     // The library shall have been preloaded - if not, attempt to start its loading and return null to indicate we're not ready to count
@@ -55,21 +63,23 @@ export const countModelTokens: (text: string, llmId: DLLMId, debugFrom: string)
       return null;
     }

-    const { options: { llmRef: openaiModel } } = findLLMOrThrow(llmId);
+    const openaiModel = findLLMOrThrow(llmId)?.options?.llmRef;
+    if (!openaiModel) throw new Error(`LLM ${llmId} has no LLM reference id`);

     if (!(openaiModel in tokenEncoders)) {
       try {
         tokenEncoders[openaiModel] = encoding_for_model(openaiModel as TiktokenModel);
       } catch (e) {
-        // make sure we recycle the default encoding across all models
-        if (!encodingCL100K)
-          encodingCL100K = get_encoding('cl100k_base');
-        tokenEncoders[openaiModel] = encodingCL100K;
+        // fallback to the default encoding across all models (not just OpenAI - this will be used everywhere..)
+        if (!encodingDefault)
+          encodingDefault = get_encoding('cl100k_base');
+        tokenEncoders[openaiModel] = encodingDefault;
       }
     }
-    let count: number = 0;

+    // Note: the try/catch shouldn't be necessary, but there could be corner cases where the tiktoken library throws
+    // https://github.com/enricoros/big-agi/issues/182
+    let count = 0;
     try {
       count = tokenEncoders[openaiModel]?.encode(text, 'all', [])?.length || 0;
     } catch (e) {
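For reference, a hypothetical call site consuming the contract documented in the new JSDoc (draftText and setTokenCount are illustrative names, not code from this repository):

// null means the tiktoken WASM module has not finished loading yet;
// the call itself kicks off preloadTiktokenLibrary(), so retry shortly.
const tokens = countModelTokens(draftText, llmId, 'composer');
if (tokens === null) {
  // tokenizer not ready - keep a placeholder and recount once loaded
} else {
  setTokenCount(tokens);
}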
