diff --git a/tokenizer_ts/package-lock.json b/tokenizer_ts/package-lock.json
index cee2664..b5decfc 100644
--- a/tokenizer_ts/package-lock.json
+++ b/tokenizer_ts/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "@microsoft/tiktokenizer",
-  "version": "1.0.3",
+  "version": "1.0.4",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@microsoft/tiktokenizer",
-      "version": "1.0.3",
+      "version": "1.0.4",
       "license": "MIT",
       "dependencies": {
         "lru-cache": "^9.1.1"
@@ -2061,9 +2061,9 @@
       ]
     },
     "node_modules/semver": {
-      "version": "7.5.0",
-      "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.0.tgz",
-      "integrity": "sha512-+XC0AD/R7Q2mPSRuy2Id0+CGTZ98+8f+KvwirxOKIEyid+XSx6HbC63p+O4IndTHuX5Z+JxQ0TghCkO5Cg/2HA==",
+      "version": "7.5.4",
+      "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz",
+      "integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==",
       "dev": true,
       "dependencies": {
         "lru-cache": "^6.0.0"
diff --git a/tokenizer_ts/package.json b/tokenizer_ts/package.json
index b7c3717..478dd01 100644
--- a/tokenizer_ts/package.json
+++ b/tokenizer_ts/package.json
@@ -2,7 +2,7 @@
   "name": "@microsoft/tiktokenizer",
   "displayName": "tiktokenizer",
   "description": "Tokenizer for OpenAI large language models.",
-  "version": "1.0.3",
+  "version": "1.0.4",
   "author": {
     "name": "Microsoft Corporation"
   },
diff --git a/tokenizer_ts/src/tikTokenizer.ts b/tokenizer_ts/src/tikTokenizer.ts
index 6fd2d8c..4c50bb6 100644
--- a/tokenizer_ts/src/tikTokenizer.ts
+++ b/tokenizer_ts/src/tikTokenizer.ts
@@ -241,11 +241,15 @@ export class TikTokenizer {
       const piece = match[0];
       if (this.cache.has(piece)) {
         let tokens = this.cache.get(piece);
-        tokenCount += tokens!.length;
-        if (tokenCount <= maxTokenCount) {
+        if (tokenCount + tokens!.length <= maxTokenCount) {
+          tokenCount += tokens!.length;
           encodeLength += piece.length;
           tokenIds.push(...tokens!);
         } else {
+          let remainingTokens = maxTokenCount - tokenCount;
+          tokenCount += remainingTokens;
+          encodeLength += piece.length;
+          tokenIds.push(...tokens!.slice(0, remainingTokens));
           break;
         }
       } else {
@@ -254,8 +258,8 @@
         const token = this.encoder!.get(uint8ArrayToString(bytes));
         if (token !== undefined) {
           this.cache.set(piece, [token]);
-          tokenCount++;
-          if (tokenCount <= maxTokenCount) {
+          if (tokenCount + 1 <= maxTokenCount) {
+            tokenCount++;
             encodeLength += piece.length;
             tokenIds.push(token);
           } else {
@@ -264,11 +268,15 @@ export class TikTokenizer {
         } else {
           const encodedTokens = bytePairEncode(bytes, this.encoder!);
           this.cache.set(piece, encodedTokens);
-          tokenCount += encodedTokens.length;
-          if (tokenCount <= maxTokenCount) {
+          if (tokenCount + encodedTokens.length <= maxTokenCount) {
+            tokenCount += encodedTokens.length;
             encodeLength += piece.length;
             tokenIds.push(...encodedTokens);
           } else {
+            let remainingTokens = maxTokenCount - tokenCount;
+            tokenCount += remainingTokens;
+            encodeLength += piece.length;
+            tokenIds.push(...encodedTokens.slice(0, remainingTokens));
             break;
           }
         }
@@ -443,6 +451,16 @@ export class TikTokenizer {
       }
     }
 
+    // Naive approach if chunks are incorrect
+    if (actualPrefixTokenCount > maxTokenCount) {
+      const encodedTokens = this.encode(text, allowedSpecial);
+      const slicedTokens = encodedTokens.slice(encodedTokens.length - maxTokenCount);
+      return {
+        tokenIds: slicedTokens,
+        text: this.decode(slicedTokens)
+      };
+    }
+
     return {
       tokenIds: tokenIds.slice(actualPrefixTokenCount),
       text: text.slice(actualPrefixStrLength)
diff --git a/tokenizer_ts/test/tikTokenizer.test.ts b/tokenizer_ts/test/tikTokenizer.test.ts
index 4e44d4d..10bce1b 100644
--- a/tokenizer_ts/test/tikTokenizer.test.ts
+++ b/tokenizer_ts/test/tikTokenizer.test.ts
@@ -91,7 +91,7 @@ suite("TikTokenizer Test Suite", function() {
 
   test("encode trim suffix - 2", () => {
     const str = "<|im_start|>Hello TempWorld<|im_end|>";
-    const encodedStr = "<|im_start|>Hello";
+    const encodedStr = "<|im_start|>Hello TempWorld";
     let encoded = tokenizer.encodeTrimSuffix(
       str,
       5,
@@ -125,10 +125,18 @@
       3,
       Array.from(specialTokens.keys())
     );
-    assert.deepStrictEqual(encoded.tokenIds, [100264, 9906]);
+    assert.deepStrictEqual(encoded.tokenIds, [100264, 9906, 20539]);
     assert.deepStrictEqual(encoded.text, encodedStr);
   });
 
+  test("encode trim suffix - 3", () => {
+    const str = "t".repeat(4000);
+    const encodedStr = tokenizer.encode(str);
+    let encodedTrimSuffix = tokenizer.encodeTrimSuffix(str, 5, []);
+    assert.deepStrictEqual(encodedTrimSuffix.tokenIds.length, 5);
+    assert.deepStrictEqual(encodedTrimSuffix.tokenIds, encodedStr.slice(0, 5));
+  });
+
   test("encode trim prefix", () => {
     const str = "<|im_start|>Hello World<|im_end|>";
     const encodedStr = "Hello World<|im_end|>";
@@ -197,6 +205,14 @@
     assert.deepStrictEqual(encoded.text, encodedStr);
   });
 
+  test("encode trim prefix - 3", () => {
+    const str = "t".repeat(4000);
+    const encodedStr = tokenizer.encode(str);
+    let encodedTrimSuffix = tokenizer.encodeTrimPrefix(str, 5, []);
+    assert.deepStrictEqual(encodedTrimSuffix.tokenIds.length, 5);
+    assert.deepStrictEqual(encodedTrimSuffix.tokenIds, encodedStr.slice(encodedStr.length - 5));
+  });
+
   test("tokenize source code - gpt-3.5", done => {
     const source = fs.readFileSync("test/testdata/lib.rs.txt", "utf8");
     const filePath = "test/testdata/tokens_gpt_3.5_turbo.json";
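A minimal standalone sketch of the trimming behaviour exercised by the new tests above. The construction of the tokenizer via createByModelName("gpt-3.5-turbo") and the script wrapper are assumptions for illustration and are not part of this diff; the encode, encodeTrimSuffix, and encodeTrimPrefix calls mirror the tests.

// Sketch only: createByModelName and the model name are assumptions,
// not taken from this diff. The assertions mirror the new test cases.
import * as assert from "assert";
import { createByModelName } from "@microsoft/tiktokenizer";

async function demo(): Promise<void> {
  const tokenizer = await createByModelName("gpt-3.5-turbo");

  const str = "t".repeat(4000);
  const full = tokenizer.encode(str);

  // encodeTrimSuffix now fills the budget exactly: when the next piece's
  // cached/BPE tokens would overflow maxTokenCount, only the remaining
  // number of tokens is pushed instead of dropping the whole piece.
  const suffixTrimmed = tokenizer.encodeTrimSuffix(str, 5, []);
  assert.deepStrictEqual(suffixTrimmed.tokenIds, full.slice(0, 5));

  // encodeTrimPrefix keeps the last maxTokenCount tokens; the new fallback
  // re-encodes the full text and slices whenever the chunked prefix walk
  // overshoots the budget.
  const prefixTrimmed = tokenizer.encodeTrimPrefix(str, 5, []);
  assert.deepStrictEqual(prefixTrimmed.tokenIds, full.slice(full.length - 5));
}

demo().catch(err => {
  console.error(err);
  process.exit(1);
});

The design choice in the patch is to slice the overflowing piece's token list to the remaining budget rather than discarding it, so encodeTrimSuffix returns exactly maxTokenCount tokens whenever the input encodes to at least that many; encodeTrimPrefix gains a naive fallback that re-encodes the whole text and keeps the trailing maxTokenCount tokens when the chunk-based prefix scan overshoots.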