Skip to content

Commit 2d4146a

Browse files
committed
feat: implement 'countTokens' function
fixes #67
1 parent 3547826 commit 2d4146a

7 files changed

+40
-0
lines changed

README.md

+13
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,19 @@ const tokenLimit = 10
235235
const withinTokenLimit = isWithinTokenLimit(text, tokenLimit)
236236
```
237237

238+
### `countTokens(input: string | Iterable<ChatMessage>): number`
239+
240+
Counts the number of tokens in the input text or chat. Use this function when you need the token count itself rather than a check against a limit.
241+
242+
Example:
243+
244+
```typescript
245+
import { countTokens } from 'gpt-tokenizer'
246+
247+
const text = 'Hello, world!'
248+
const tokenCount = countTokens(text)
249+
```
250+
238251
### `encodeChat(chat: ChatMessage[], model?: ModelName): number[]`
239252

240253
Encodes the given chat into a sequence of tokens.

src/GptEncoding.ts

+17
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ export class GptEncoding {
128128
this.isWithinTokenLimit = this.isWithinTokenLimit.bind(this)
129129
this.encodeChat = this.encodeChat.bind(this)
130130
this.encodeChatGenerator = this.encodeChatGenerator.bind(this)
131+
this.countTokens = this.countTokens.bind(this)
131132
this.modelName = modelName
132133
}
133134

@@ -349,6 +350,22 @@ export class GptEncoding {
349350
return count
350351
}
351352

353+
/**
354+
* Counts the number of tokens in the input by iterating the matching
* encoding generator and summing the length of every chunk it yields.
*
* @param input - Plain text (a string), or an iterable of chat messages.
355+
* @returns The total number of tokens across all yielded chunks.
356+
*/
357+
countTokens(input: string | Iterable<ChatMessage>): number {
358+
// Strings go through encodeGenerator; any other input is handled as a chat.
const tokenGenerator =
359+
typeof input === 'string'
360+
? this.encodeGenerator(input)
361+
: this.encodeChatGenerator(input)
362+
let count = 0
363+
for (const tokens of tokenGenerator) {
364+
// Each yielded value is a chunk of tokens, so add its length rather than 1.
count += tokens.length
365+
}
366+
return count
367+
}
368+
352369
decode(inputTokensToDecode: Iterable<number>): string {
353370
return this.bytePairEncodingCoreProcessor.decodeNative(inputTokensToDecode)
354371
}

src/encoding/cl100k_base.ts

+2
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,13 @@ const {
1313
encode,
1414
encodeGenerator,
1515
isWithinTokenLimit,
16+
countTokens,
1617
encodeChat,
1718
encodeChatGenerator,
1819
vocabularySize,
1920
} = api
2021
export {
22+
countTokens,
2123
decode,
2224
decodeAsyncGenerator,
2325
decodeGenerator,

src/encoding/o200k_base.ts

+2
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,13 @@ const {
1313
encode,
1414
encodeGenerator,
1515
isWithinTokenLimit,
16+
countTokens,
1617
encodeChat,
1718
encodeChatGenerator,
1819
vocabularySize,
1920
} = api
2021
export {
22+
countTokens,
2123
decode,
2224
decodeAsyncGenerator,
2325
decodeGenerator,

src/encoding/p50k_base.ts

+2
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,11 @@ const {
1313
encode,
1414
encodeGenerator,
1515
isWithinTokenLimit,
16+
countTokens,
1617
vocabularySize,
1718
} = api
1819
export {
20+
countTokens,
1921
decode,
2022
decodeAsyncGenerator,
2123
decodeGenerator,

src/encoding/p50k_edit.ts

+2
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,11 @@ const {
1313
encode,
1414
encodeGenerator,
1515
isWithinTokenLimit,
16+
countTokens,
1617
vocabularySize,
1718
} = api
1819
export {
20+
countTokens,
1921
decode,
2022
decodeAsyncGenerator,
2123
decodeGenerator,

src/encoding/r50k_base.ts

+2
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,11 @@ const {
1313
encode,
1414
encodeGenerator,
1515
isWithinTokenLimit,
16+
countTokens,
1617
vocabularySize,
1718
} = api
1819
export {
20+
countTokens,
1921
decode,
2022
decodeAsyncGenerator,
2123
decodeGenerator,

0 commit comments

Comments
 (0)