diff --git a/global.json b/global.json index 9156e87..1d2a4ca 100644 --- a/global.json +++ b/global.json @@ -1,7 +1,7 @@ { "sdk": { - "version": "9.0.201", - "rollForward": "disable", + "version": "9.0.205", + "rollForward": "latestMinor", "allowPrerelease": true } } diff --git a/src/FastBertTokenizer.AotCompatibility.TestApp/packages.lock.json b/src/FastBertTokenizer.AotCompatibility.TestApp/packages.lock.json index c392a5f..5d9767b 100644 --- a/src/FastBertTokenizer.AotCompatibility.TestApp/packages.lock.json +++ b/src/FastBertTokenizer.AotCompatibility.TestApp/packages.lock.json @@ -4,15 +4,15 @@ "net8.0": { "Microsoft.DotNet.ILCompiler": { "type": "Direct", - "requested": "[8.0.4, )", - "resolved": "8.0.4", - "contentHash": "bVK6XD5E/BpkPmdrx99ZkloZZX3nOvTAcW+W6jW4cs8elWlLFAc9BXHk2esxSVHMiaCLiLdIJAyhUfhV50TW+Q==" + "requested": "[8.0.22, )", + "resolved": "8.0.22", + "contentHash": "4SDEQzYp3d9yVu0iQRFMV4HJCTjDvCt2AclSxMGXZSva5VrdoatwDg5nQVSD42ZuzBY5Y+BzgaexNbwb0/3gHw==" }, "Microsoft.NET.ILLink.Tasks": { "type": "Direct", - "requested": "[8.0.4, )", - "resolved": "8.0.4", - "contentHash": "PZb5nfQ+U19nhnmnR9T1jw+LTmozhuG2eeuzuW5A7DqxD/UXW2ucjmNJqnqOuh8rdPzM3MQXoF8AfFCedJdCUw==" + "requested": "[8.0.22, )", + "resolved": "8.0.22", + "contentHash": "MhcMithKEiyyNkD2ZfbDZPmcOdi0GheGfg8saEIIEfD/fol3iHmcV8TsZkD4ZYz5gdUuoX4YtlVySUU7Sxl9SQ==" }, "Microsoft.SourceLink.GitHub": { "type": "Direct", @@ -26,9 +26,9 @@ }, "Nerdbank.GitVersioning": { "type": "Direct", - "requested": "[3.6.133, )", - "resolved": "3.6.133", - "contentHash": "VZWMd5YAeDxpjWjAP/X6bAxnRMiEf6tES/ITN0X5CHJgkWLLeHGmEALivmTAfYM6P+P/3Szy6VCITUAkqjcHVw==" + "requested": "[3.7.115, )", + "resolved": "3.7.115", + "contentHash": "EpXamaAdRfG/BMxGgvZlTM0npRnkmXUjAj8OdNKd17t4oN+2nvjdv/KnFmzOOMDqvlwB49UCwtOHJrAQTfUBtQ==" }, "StyleCop.Analyzers.Unstable": { "type": "Direct", @@ -50,20 +50,20 @@ "type": "Project" } }, - "net8.0/win-x64": { + "net8.0/linux-x64": { "Microsoft.DotNet.ILCompiler": { "type": "Direct", - "requested": "[8.0.4, )", - "resolved": "8.0.4", - "contentHash": "bVK6XD5E/BpkPmdrx99ZkloZZX3nOvTAcW+W6jW4cs8elWlLFAc9BXHk2esxSVHMiaCLiLdIJAyhUfhV50TW+Q==", + "requested": "[8.0.22, )", + "resolved": "8.0.22", + "contentHash": "4SDEQzYp3d9yVu0iQRFMV4HJCTjDvCt2AclSxMGXZSva5VrdoatwDg5nQVSD42ZuzBY5Y+BzgaexNbwb0/3gHw==", "dependencies": { - "runtime.win-x64.Microsoft.DotNet.ILCompiler": "8.0.4" + "runtime.linux-x64.Microsoft.DotNet.ILCompiler": "8.0.22" } }, - "runtime.win-x64.Microsoft.DotNet.ILCompiler": { + "runtime.linux-x64.Microsoft.DotNet.ILCompiler": { "type": "Transitive", - "resolved": "8.0.4", - "contentHash": "jOBAHB9NiCdVUrNoJMkDjRPdNTLUxTVu87D5nXjoMPrgk12QDwVFGKCFCgQTxnfY12xSg6KlW4lMG0v+PfYGwQ==" + "resolved": "8.0.22", + "contentHash": "gjjhSmxUB4wbXSqtFoVmu0cbHSPN9djaQ9cpndZSXUnMGkdbsh6GmqE1YHZ23mLYSRzrGm+sbXNhlYuYzzFxAA==" } } } diff --git a/src/FastBertTokenizer.Tests/packages.lock.json b/src/FastBertTokenizer.Tests/packages.lock.json index 032beae..a62fdf1 100644 --- a/src/FastBertTokenizer.Tests/packages.lock.json +++ b/src/FastBertTokenizer.Tests/packages.lock.json @@ -26,6 +26,15 @@ "Microsoft.CodeCoverage": "17.13.0" } }, + "Microsoft.NETFramework.ReferenceAssemblies": { + "type": "Direct", + "requested": "[1.0.3, )", + "resolved": "1.0.3", + "contentHash": "vUc9Npcs14QsyOD01tnv/m8sQUnGTGOw1BCmKcv77LBJY7OxhJ+zJF7UD/sCL3lYNFuqmQEVlkfS4Quif6FyYg==", + "dependencies": { + "Microsoft.NETFramework.ReferenceAssemblies.net48": "1.0.3" + } + }, "Microsoft.SourceLink.GitHub": { "type": "Direct", "requested": "[8.0.0, )", @@ -173,6 +182,11 @@ "resolved": "4.7.0", "contentHash": "pTj+D3uJWyN3My70i2Hqo+OXixq3Os2D1nJ2x92FFo6sk8fYS1m1WLNTs0Dc1uPaViH0YvEEwvzddQ7y4rhXmA==" }, + "Microsoft.NETFramework.ReferenceAssemblies.net48": { + "type": "Transitive", + "resolved": "1.0.3", + "contentHash": "zMk4D+9zyiEWByyQ7oPImPN/Jhpj166Ky0Nlla4eXlNL8hI/BtSJsgR8Inldd4NNpIAH3oh8yym0W2DrhXdSLQ==" + }, "Microsoft.SourceLink.Common": { "type": "Transitive", "resolved": "8.0.0", diff --git a/src/FastBertTokenizer/AddedTokens.cs b/src/FastBertTokenizer/AddedTokens.cs index d65e1cf..e1d146c 100644 --- a/src/FastBertTokenizer/AddedTokens.cs +++ b/src/FastBertTokenizer/AddedTokens.cs @@ -13,15 +13,22 @@ public AddedTokens(IEnumerable<(string Content, bool Normalize)> addedTokens) // This logic might not be perfect. Are there chars that are equal to others in an invariant case insesitive comparison // but are neither the upper nor the lower variant of the original? - var firstLettersToSearch = addedTokens - .SelectMany(x => x.Normalize - ? (IEnumerable)[x.Content[0], char.ToLowerInvariant(x.Content[0]), char.ToUpperInvariant(x.Content[0])] - : [x.Content[0]]) - .Distinct(); + var firstLettersSet = new HashSet(); + foreach (var (content, normalize) in addedTokens) + { + var firstChar = content[0]; + firstLettersSet.Add(firstChar); + if (normalize) + { + firstLettersSet.Add(char.ToLowerInvariant(firstChar)); + firstLettersSet.Add(char.ToUpperInvariant(firstChar)); + } + } + #if NET8_0_OR_GREATER - FirstLetters = SearchValues.Create([.. firstLettersToSearch]); + FirstLetters = SearchValues.Create([.. firstLettersSet]); #else - FirstLetters = [.. firstLettersToSearch]; + FirstLetters = [.. firstLettersSet]; #endif } diff --git a/src/FastBertTokenizer/BertTokenizer.Decode.cs b/src/FastBertTokenizer/BertTokenizer.Decode.cs index 8908a2a..7832dab 100644 --- a/src/FastBertTokenizer/BertTokenizer.Decode.cs +++ b/src/FastBertTokenizer/BertTokenizer.Decode.cs @@ -15,8 +15,10 @@ public string Decode(ReadOnlySpan tokenIds) _ = _prefixes ?? throw new InvalidOperationException("Vocabulary not loaded."); _ = _suffixes ?? throw new InvalidOperationException("Vocabulary not loaded."); - _decodeSuffixes ??= _suffixes.ToDictionary(x => x.Value, x => x.Key.ToString()); - _decodePrefixes ??= _prefixes.ToDictionary(x => x.Value, x => x.Key.ToString()); + if (_decodeSuffixes is null || _decodePrefixes is null) + { + InitializeDecodeDictionaries(); + } if (tokenIds.Length == 0) { @@ -72,9 +74,24 @@ public string Decode(ReadOnlySpan tokenIds) // See https://github.com/huggingface/tokenizers/blob/daf361676bdfd14088f7e0bc087effc6a9cfdf3e/tokenizers/src/decoders/wordpiece.rs#L31 private bool EmitNoSpaceBefore(string prefix) { - return ".".Equals(prefix, StringComparison.Ordinal) - || "?".Equals(prefix, StringComparison.Ordinal) - || "!".Equals(prefix, StringComparison.Ordinal) - || ",".Equals(prefix, StringComparison.Ordinal); + return prefix.Length == 1 && (prefix[0] == '.' || prefix[0] == '?' || prefix[0] == '!' || prefix[0] == ','); + } + + private void InitializeDecodeDictionaries() + { + var decodeSuffixes = new Dictionary(_suffixes!.Count); + foreach (var kvp in _suffixes!) + { + decodeSuffixes[kvp.Value] = kvp.Key.ToString(); + } + + var decodePrefixes = new Dictionary(_prefixes!.Count); + foreach (var kvp in _prefixes!) + { + decodePrefixes[kvp.Value] = kvp.Key.ToString(); + } + + _decodeSuffixes = decodeSuffixes; + _decodePrefixes = decodePrefixes; } } diff --git a/src/FastBertTokenizer/packages.lock.json b/src/FastBertTokenizer/packages.lock.json index 5304785..7e92ef8 100644 --- a/src/FastBertTokenizer/packages.lock.json +++ b/src/FastBertTokenizer/packages.lock.json @@ -124,9 +124,9 @@ "net8.0": { "Microsoft.NET.ILLink.Tasks": { "type": "Direct", - "requested": "[8.0.14, )", - "resolved": "8.0.14", - "contentHash": "4U2fd7PexNKrK5ZqfqIcXZj9/lRRjFsLgA/pxuFQTuGQuLYP/+7yACz/j7EmWbEj/fspOf4mafi/vHIy/rKDzQ==" + "requested": "[8.0.22, )", + "resolved": "8.0.22", + "contentHash": "MhcMithKEiyyNkD2ZfbDZPmcOdi0GheGfg8saEIIEfD/fol3iHmcV8TsZkD4ZYz5gdUuoX4YtlVySUU7Sxl9SQ==" }, "Microsoft.SourceLink.GitHub": { "type": "Direct", diff --git a/src/HuggingfaceTokenizer/RustLibWrapper/packages.lock.json b/src/HuggingfaceTokenizer/RustLibWrapper/packages.lock.json index 4e09d27..404de55 100644 --- a/src/HuggingfaceTokenizer/RustLibWrapper/packages.lock.json +++ b/src/HuggingfaceTokenizer/RustLibWrapper/packages.lock.json @@ -2,6 +2,15 @@ "version": 2, "dependencies": { ".NETFramework,Version=v4.8": { + "Microsoft.NETFramework.ReferenceAssemblies": { + "type": "Direct", + "requested": "[1.0.3, )", + "resolved": "1.0.3", + "contentHash": "vUc9Npcs14QsyOD01tnv/m8sQUnGTGOw1BCmKcv77LBJY7OxhJ+zJF7UD/sCL3lYNFuqmQEVlkfS4Quif6FyYg==", + "dependencies": { + "Microsoft.NETFramework.ReferenceAssemblies.net48": "1.0.3" + } + }, "Microsoft.SourceLink.GitHub": { "type": "Direct", "requested": "[8.0.0, )", @@ -46,6 +55,11 @@ "resolved": "8.0.0", "contentHash": "bZKfSIKJRXLTuSzLudMFte/8CempWjVamNUR5eHJizsy+iuOuO/k2gnh7W0dHJmYY0tBf+gUErfluCv5mySAOQ==" }, + "Microsoft.NETFramework.ReferenceAssemblies.net48": { + "type": "Transitive", + "resolved": "1.0.3", + "contentHash": "zMk4D+9zyiEWByyQ7oPImPN/Jhpj166Ky0Nlla4eXlNL8hI/BtSJsgR8Inldd4NNpIAH3oh8yym0W2DrhXdSLQ==" + }, "Microsoft.SourceLink.Common": { "type": "Transitive", "resolved": "8.0.0",