From 6030d91cbd8a08876212e9e43d4eb7387465e5ac Mon Sep 17 00:00:00 2001 From: Bazyli Brzoska Date: Tue, 12 Nov 2024 22:02:41 -0800 Subject: [PATCH] fix: unify property and variable names across the library --- src/BytePairEncodingCore.ts | 21 +++++++++++---------- src/GptEncoding.ts | 22 +++++++++++----------- src/encodingParams/Cl100KBase.ts | 6 +++--- src/encodingParams/O200KBase.ts | 6 +++--- src/encodingParams/P50KBase.ts | 6 +++--- src/encodingParams/P50KEdit.ts | 6 +++--- src/encodingParams/R50KBase.ts | 6 +++--- src/modelParams.ts | 2 +- 8 files changed, 38 insertions(+), 37 deletions(-) diff --git a/src/BytePairEncodingCore.ts b/src/BytePairEncodingCore.ts index e265a07..0d78c29 100644 --- a/src/BytePairEncodingCore.ts +++ b/src/BytePairEncodingCore.ts @@ -6,8 +6,8 @@ import { escapeRegExp } from './util.js' export type RawBytePairRanks = readonly (string | readonly number[])[] export interface BytePairEncodingConfig { - mergeableBytePairRanks: RawBytePairRanks - specialTokenMapping?: Map + bytePairRankDecoder: RawBytePairRanks + specialTokensEncoder?: Map tokenSplitRegex: RegExp } @@ -38,18 +38,18 @@ export class BytePairEncodingCore { private textEncoder = new TextEncoder() constructor({ - mergeableBytePairRanks: bytePairEncoder, - specialTokenMapping: specialTokenEncoder, + bytePairRankDecoder, + specialTokensEncoder, tokenSplitRegex, }: BytePairEncodingConfig) { - this.bytePairRankDecoder = bytePairEncoder + this.bytePairRankDecoder = bytePairRankDecoder this.bytePairStringRankEncoder = new Map() // size without array holes (which may be present in the encoder) - this.mergeableBytePairRankCount = Object.keys(bytePairEncoder).length + this.mergeableBytePairRankCount = Object.keys(bytePairRankDecoder).length const binaryLookup: [Uint8Array, number][] = [] // forEach skips array holes: - bytePairEncoder.forEach((value, rank) => { + bytePairRankDecoder.forEach((value, rank) => { if (typeof value === 'string') { this.bytePairStringRankEncoder.set(value, rank) return @@ -61,9 +61,10 @@ export class BytePairEncodingCore { this.bytePairNonUtfSortedEncoder = binaryLookup.sort((a, b) => compareUint8Arrays(a[0], b[0]), ) - this.specialTokensEncoder = specialTokenEncoder ?? new Map() - this.specialTokensDecoder = specialTokenEncoder - ? new Map([...specialTokenEncoder].map(([key, value]) => [value, key])) + this.specialTokensEncoder = + specialTokensEncoder ?? new Map() + this.specialTokensDecoder = specialTokensEncoder + ? new Map([...specialTokensEncoder].map(([key, value]) => [value, key])) : new Map() this.tokenSplitRegex = tokenSplitRegex diff --git a/src/GptEncoding.ts b/src/GptEncoding.ts index 05ac616..a33ce0c 100644 --- a/src/GptEncoding.ts +++ b/src/GptEncoding.ts @@ -60,7 +60,7 @@ export class GptEncoding { modelName?: ModelName private bytePairEncodingCoreProcessor: BytePairEncodingCore - private specialTokenMapping: Map + private specialTokensEncoder: Map private specialTokensSet: Set private allSpecialTokenRegex: RegExp private defaultSpecialTokenConfig: SpecialTokenConfig @@ -68,31 +68,31 @@ export class GptEncoding { readonly vocabularySize: number private constructor({ - mergeableBytePairRanks, - specialTokenMapping, + bytePairRankDecoder: mergeableBytePairRanks, + specialTokensEncoder, expectedVocabularySize, modelName, ...rest }: EncodingParams) { - this.specialTokenMapping = specialTokenMapping - this.specialTokensSet = new Set(this.specialTokenMapping.keys()) + this.specialTokensEncoder = specialTokensEncoder + this.specialTokensSet = new Set(this.specialTokensEncoder.keys()) this.allSpecialTokenRegex = getSpecialTokenRegex(this.specialTokensSet) this.bytePairEncodingCoreProcessor = new BytePairEncodingCore({ - mergeableBytePairRanks, - specialTokenMapping, + bytePairRankDecoder: mergeableBytePairRanks, + specialTokensEncoder, ...rest, }) this.defaultSpecialTokenConfig = this.processSpecialTokens() const maxTokenValue = Math.max( mergeableBytePairRanks.length - 1, - getMaxValueFromMap(specialTokenMapping), + getMaxValueFromMap(specialTokensEncoder), ) this.vocabularySize = this.bytePairEncodingCoreProcessor.mergeableBytePairRankCount + - specialTokenMapping.size + specialTokensEncoder.size if (expectedVocabularySize !== undefined) { if (this.vocabularySize !== expectedVocabularySize) { @@ -245,8 +245,8 @@ export class GptEncoding { const params: ChatParameters | undefined = chatModelParams[model as ChatModelName] - const chatStartToken = this.specialTokenMapping.get(ImStart) - const chatEndToken = this.specialTokenMapping.get(ImEnd) + const chatStartToken = this.specialTokensEncoder.get(ImStart) + const chatEndToken = this.specialTokensEncoder.get(ImEnd) if (!params || chatStartToken === undefined || chatEndToken === undefined) { throw new Error(`Model '${model}' does not support chat.`) diff --git a/src/encodingParams/Cl100KBase.ts b/src/encodingParams/Cl100KBase.ts index f968ba8..98d529c 100644 --- a/src/encodingParams/Cl100KBase.ts +++ b/src/encodingParams/Cl100KBase.ts @@ -13,7 +13,7 @@ import { } from '../specialTokens.js' export function Cl100KBase( - mergeableBytePairRanks: RawBytePairRanks, + bytePairRankDecoder: RawBytePairRanks, ): EncodingParams { const specialTokenMapping = new Map([ [EndOfText, 100_257], @@ -29,7 +29,7 @@ export function Cl100KBase( return { tokenSplitRegex: /(?:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/giu, - mergeableBytePairRanks, - specialTokenMapping, + bytePairRankDecoder, + specialTokensEncoder: specialTokenMapping, } } diff --git a/src/encodingParams/O200KBase.ts b/src/encodingParams/O200KBase.ts index c0e279a..07025a8 100644 --- a/src/encodingParams/O200KBase.ts +++ b/src/encodingParams/O200KBase.ts @@ -13,7 +13,7 @@ import { } from '../specialTokens.js' export function O200KBase( - mergeableBytePairRanks: RawBytePairRanks, + bytePairRankDecoder: RawBytePairRanks, ): EncodingParams { const specialTokenMapping = new Map([ [EndOfText, 199_999], @@ -29,7 +29,7 @@ export function O200KBase( return { tokenSplitRegex: /(?:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/giu, - mergeableBytePairRanks, - specialTokenMapping, + bytePairRankDecoder, + specialTokensEncoder: specialTokenMapping, } } diff --git a/src/encodingParams/P50KBase.ts b/src/encodingParams/P50KBase.ts index 76bc20d..52b8c1b 100644 --- a/src/encodingParams/P50KBase.ts +++ b/src/encodingParams/P50KBase.ts @@ -4,12 +4,12 @@ import { type EncodingParams, tokenSplitRegex } from '../modelParams.js' import { EndOfText } from '../specialTokens.js' export function P50KBase( - mergeableBytePairRanks: RawBytePairRanks, + bytePairRankDecoder: RawBytePairRanks, ): EncodingParams { return { expectedVocabularySize: 50_281, tokenSplitRegex, - mergeableBytePairRanks, - specialTokenMapping: new Map([[EndOfText, 50_256]]), + bytePairRankDecoder, + specialTokensEncoder: new Map([[EndOfText, 50_256]]), } } diff --git a/src/encodingParams/P50KEdit.ts b/src/encodingParams/P50KEdit.ts index 68598b1..3b636af 100644 --- a/src/encodingParams/P50KEdit.ts +++ b/src/encodingParams/P50KEdit.ts @@ -4,7 +4,7 @@ import { type EncodingParams, tokenSplitRegex } from '../modelParams.js' import { EndOfText, FimMiddle, FimPrefix, FimSuffix } from '../specialTokens.js' export function P50KEdit( - mergeableBytePairRanks: RawBytePairRanks, + bytePairRankDecoder: RawBytePairRanks, ): EncodingParams { const specialTokenMapping = new Map([ [EndOfText, 50_256], @@ -15,7 +15,7 @@ export function P50KEdit( return { tokenSplitRegex, - mergeableBytePairRanks, - specialTokenMapping, + bytePairRankDecoder, + specialTokensEncoder: specialTokenMapping, } } diff --git a/src/encodingParams/R50KBase.ts b/src/encodingParams/R50KBase.ts index 2361a5b..c6d7d82 100644 --- a/src/encodingParams/R50KBase.ts +++ b/src/encodingParams/R50KBase.ts @@ -4,12 +4,12 @@ import { type EncodingParams, tokenSplitRegex } from '../modelParams.js' import { EndOfText } from '../specialTokens.js' export function R50KBase( - mergeableBytePairRanks: RawBytePairRanks, + bytePairRankDecoder: RawBytePairRanks, ): EncodingParams { return { expectedVocabularySize: 50_257, tokenSplitRegex, - mergeableBytePairRanks, - specialTokenMapping: new Map([[EndOfText, 50_256]]), + bytePairRankDecoder, + specialTokensEncoder: new Map([[EndOfText, 50_256]]), } } diff --git a/src/modelParams.ts b/src/modelParams.ts index fc736a7..320084f 100644 --- a/src/modelParams.ts +++ b/src/modelParams.ts @@ -22,7 +22,7 @@ export interface EncodingParams extends BytePairEncodingConfig { * It's complex due to its need to deal with a wide variety of cases in text processing. */ tokenSplitRegex: RegExp - specialTokenMapping: Map + specialTokensEncoder: Map modelName?: ModelName }