feat: implement 'countTokens' function

niieani · niieani · commit 2d4146a064d9 · 2024-11-27T17:38:15.000-08:00
fixes #67
diff --git a/README.md b/README.md
@@ -235,6 +235,19 @@ const tokenLimit = 10
 const withinTokenLimit = isWithinTokenLimit(text, tokenLimit)
 ```
 
+### `countTokens(input: string | Iterable<ChatMessage>): number`
+
+Counts the number of tokens in the input text or chat. Use this method when you need to determine the number of tokens without checking against a limit.
+
+Example:
+
+```typescript
+import { countTokens } from 'gpt-tokenizer'
+
+const text = 'Hello, world!'
+const tokenCount = countTokens(text)
+```
+
 ### `encodeChat(chat: ChatMessage[], model?: ModelName): number[]`
 
 Encodes the given chat into a sequence of tokens.
diff --git a/src/GptEncoding.ts b/src/GptEncoding.ts
@@ -128,6 +128,7 @@ export class GptEncoding {
     this.isWithinTokenLimit = this.isWithinTokenLimit.bind(this)
     this.encodeChat = this.encodeChat.bind(this)
     this.encodeChatGenerator = this.encodeChatGenerator.bind(this)
+    this.countTokens = this.countTokens.bind(this)
     this.modelName = modelName
   }
 
@@ -349,6 +350,22 @@ export class GptEncoding {
     return count
   }
 
+  /**
+   * Counts the tokens in the input: a string is encoded with `encodeGenerator`, any other iterable with `encodeChatGenerator`.
+   * @returns {number} The summed `length` of every value the chosen generator yields (each yield is treated as a batch of tokens).
+   */
+  countTokens(input: string | Iterable<ChatMessage>): number {
+    const tokenGenerator =
+      typeof input === 'string'
+        ? this.encodeGenerator(input)
+        : this.encodeChatGenerator(input)
+    let count = 0
+    for (const tokens of tokenGenerator) {
+      count += tokens.length
+    }
+    return count
+  }
+
   decode(inputTokensToDecode: Iterable<number>): string {
     return this.bytePairEncodingCoreProcessor.decodeNative(inputTokensToDecode)
   }
diff --git a/src/encoding/cl100k_base.ts b/src/encoding/cl100k_base.ts
@@ -13,11 +13,13 @@ const {
   encode,
   encodeGenerator,
   isWithinTokenLimit,
+  countTokens,
   encodeChat,
   encodeChatGenerator,
   vocabularySize,
 } = api
 export {
+  countTokens,
   decode,
   decodeAsyncGenerator,
   decodeGenerator,
diff --git a/src/encoding/o200k_base.ts b/src/encoding/o200k_base.ts
@@ -13,11 +13,13 @@ const {
   encode,
   encodeGenerator,
   isWithinTokenLimit,
+  countTokens,
   encodeChat,
   encodeChatGenerator,
   vocabularySize,
 } = api
 export {
+  countTokens,
   decode,
   decodeAsyncGenerator,
   decodeGenerator,
diff --git a/src/encoding/p50k_base.ts b/src/encoding/p50k_base.ts
@@ -13,9 +13,11 @@ const {
   encode,
   encodeGenerator,
   isWithinTokenLimit,
+  countTokens,
   vocabularySize,
 } = api
 export {
+  countTokens,
   decode,
   decodeAsyncGenerator,
   decodeGenerator,
diff --git a/src/encoding/p50k_edit.ts b/src/encoding/p50k_edit.ts
@@ -13,9 +13,11 @@ const {
   encode,
   encodeGenerator,
   isWithinTokenLimit,
+  countTokens,
   vocabularySize,
 } = api
 export {
+  countTokens,
   decode,
   decodeAsyncGenerator,
   decodeGenerator,
diff --git a/src/encoding/r50k_base.ts b/src/encoding/r50k_base.ts
@@ -13,9 +13,11 @@ const {
   encode,
   encodeGenerator,
   isWithinTokenLimit,
+  countTokens,
   vocabularySize,
 } = api
 export {
+  countTokens,
   decode,
   decodeAsyncGenerator,
   decodeGenerator,