Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions global.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"sdk": {
"version": "9.0.201",
"rollForward": "disable",
"version": "9.0.205",
"rollForward": "latestMinor",
"allowPrerelease": true
}
}
34 changes: 17 additions & 17 deletions src/FastBertTokenizer.AotCompatibility.TestApp/packages.lock.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@
"net8.0": {
"Microsoft.DotNet.ILCompiler": {
"type": "Direct",
"requested": "[8.0.4, )",
"resolved": "8.0.4",
"contentHash": "bVK6XD5E/BpkPmdrx99ZkloZZX3nOvTAcW+W6jW4cs8elWlLFAc9BXHk2esxSVHMiaCLiLdIJAyhUfhV50TW+Q=="
"requested": "[8.0.22, )",
"resolved": "8.0.22",
"contentHash": "4SDEQzYp3d9yVu0iQRFMV4HJCTjDvCt2AclSxMGXZSva5VrdoatwDg5nQVSD42ZuzBY5Y+BzgaexNbwb0/3gHw=="
},
"Microsoft.NET.ILLink.Tasks": {
"type": "Direct",
"requested": "[8.0.4, )",
"resolved": "8.0.4",
"contentHash": "PZb5nfQ+U19nhnmnR9T1jw+LTmozhuG2eeuzuW5A7DqxD/UXW2ucjmNJqnqOuh8rdPzM3MQXoF8AfFCedJdCUw=="
"requested": "[8.0.22, )",
"resolved": "8.0.22",
"contentHash": "MhcMithKEiyyNkD2ZfbDZPmcOdi0GheGfg8saEIIEfD/fol3iHmcV8TsZkD4ZYz5gdUuoX4YtlVySUU7Sxl9SQ=="
},
"Microsoft.SourceLink.GitHub": {
"type": "Direct",
Expand All @@ -26,9 +26,9 @@
},
"Nerdbank.GitVersioning": {
"type": "Direct",
"requested": "[3.6.133, )",
"resolved": "3.6.133",
"contentHash": "VZWMd5YAeDxpjWjAP/X6bAxnRMiEf6tES/ITN0X5CHJgkWLLeHGmEALivmTAfYM6P+P/3Szy6VCITUAkqjcHVw=="
"requested": "[3.7.115, )",
"resolved": "3.7.115",
"contentHash": "EpXamaAdRfG/BMxGgvZlTM0npRnkmXUjAj8OdNKd17t4oN+2nvjdv/KnFmzOOMDqvlwB49UCwtOHJrAQTfUBtQ=="
},
"StyleCop.Analyzers.Unstable": {
"type": "Direct",
Expand All @@ -50,20 +50,20 @@
"type": "Project"
}
},
"net8.0/win-x64": {
"net8.0/linux-x64": {
"Microsoft.DotNet.ILCompiler": {
"type": "Direct",
"requested": "[8.0.4, )",
"resolved": "8.0.4",
"contentHash": "bVK6XD5E/BpkPmdrx99ZkloZZX3nOvTAcW+W6jW4cs8elWlLFAc9BXHk2esxSVHMiaCLiLdIJAyhUfhV50TW+Q==",
"requested": "[8.0.22, )",
"resolved": "8.0.22",
"contentHash": "4SDEQzYp3d9yVu0iQRFMV4HJCTjDvCt2AclSxMGXZSva5VrdoatwDg5nQVSD42ZuzBY5Y+BzgaexNbwb0/3gHw==",
"dependencies": {
"runtime.win-x64.Microsoft.DotNet.ILCompiler": "8.0.4"
"runtime.linux-x64.Microsoft.DotNet.ILCompiler": "8.0.22"
}
},
"runtime.win-x64.Microsoft.DotNet.ILCompiler": {
"runtime.linux-x64.Microsoft.DotNet.ILCompiler": {
"type": "Transitive",
"resolved": "8.0.4",
"contentHash": "jOBAHB9NiCdVUrNoJMkDjRPdNTLUxTVu87D5nXjoMPrgk12QDwVFGKCFCgQTxnfY12xSg6KlW4lMG0v+PfYGwQ=="
"resolved": "8.0.22",
"contentHash": "gjjhSmxUB4wbXSqtFoVmu0cbHSPN9djaQ9cpndZSXUnMGkdbsh6GmqE1YHZ23mLYSRzrGm+sbXNhlYuYzzFxAA=="
}
}
}
Expand Down
14 changes: 14 additions & 0 deletions src/FastBertTokenizer.Tests/packages.lock.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,15 @@
"Microsoft.CodeCoverage": "17.13.0"
}
},
"Microsoft.NETFramework.ReferenceAssemblies": {
"type": "Direct",
"requested": "[1.0.3, )",
"resolved": "1.0.3",
"contentHash": "vUc9Npcs14QsyOD01tnv/m8sQUnGTGOw1BCmKcv77LBJY7OxhJ+zJF7UD/sCL3lYNFuqmQEVlkfS4Quif6FyYg==",
"dependencies": {
"Microsoft.NETFramework.ReferenceAssemblies.net48": "1.0.3"
}
},
"Microsoft.SourceLink.GitHub": {
"type": "Direct",
"requested": "[8.0.0, )",
Expand Down Expand Up @@ -173,6 +182,11 @@
"resolved": "4.7.0",
"contentHash": "pTj+D3uJWyN3My70i2Hqo+OXixq3Os2D1nJ2x92FFo6sk8fYS1m1WLNTs0Dc1uPaViH0YvEEwvzddQ7y4rhXmA=="
},
"Microsoft.NETFramework.ReferenceAssemblies.net48": {
"type": "Transitive",
"resolved": "1.0.3",
"contentHash": "zMk4D+9zyiEWByyQ7oPImPN/Jhpj166Ky0Nlla4eXlNL8hI/BtSJsgR8Inldd4NNpIAH3oh8yym0W2DrhXdSLQ=="
},
"Microsoft.SourceLink.Common": {
"type": "Transitive",
"resolved": "8.0.0",
Expand Down
21 changes: 14 additions & 7 deletions src/FastBertTokenizer/AddedTokens.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,22 @@ public AddedTokens(IEnumerable<(string Content, bool Normalize)> addedTokens)

// This logic might not be perfect. Are there chars that are equal to others in an invariant case-insensitive comparison
// but are neither the upper nor the lower variant of the original?
var firstLettersToSearch = addedTokens
.SelectMany(x => x.Normalize
? (IEnumerable<char>)[x.Content[0], char.ToLowerInvariant(x.Content[0]), char.ToUpperInvariant(x.Content[0])]
: [x.Content[0]])
.Distinct();
var firstLettersSet = new HashSet<char>();
foreach (var (content, normalize) in addedTokens)
{
var firstChar = content[0];
firstLettersSet.Add(firstChar);
if (normalize)
{
firstLettersSet.Add(char.ToLowerInvariant(firstChar));
firstLettersSet.Add(char.ToUpperInvariant(firstChar));
}
}

#if NET8_0_OR_GREATER
FirstLetters = SearchValues.Create([.. firstLettersToSearch]);
FirstLetters = SearchValues.Create([.. firstLettersSet]);
#else
FirstLetters = [.. firstLettersToSearch];
FirstLetters = [.. firstLettersSet];
#endif
}

Expand Down
29 changes: 23 additions & 6 deletions src/FastBertTokenizer/BertTokenizer.Decode.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ public string Decode(ReadOnlySpan<long> tokenIds)
_ = _prefixes ?? throw new InvalidOperationException("Vocabulary not loaded.");
_ = _suffixes ?? throw new InvalidOperationException("Vocabulary not loaded.");

_decodeSuffixes ??= _suffixes.ToDictionary(x => x.Value, x => x.Key.ToString());
_decodePrefixes ??= _prefixes.ToDictionary(x => x.Value, x => x.Key.ToString());
if (_decodeSuffixes is null || _decodePrefixes is null)
{
InitializeDecodeDictionaries();
}

if (tokenIds.Length == 0)
{
Expand Down Expand Up @@ -72,9 +74,24 @@ public string Decode(ReadOnlySpan<long> tokenIds)
// See https://github.com/huggingface/tokenizers/blob/daf361676bdfd14088f7e0bc087effc6a9cfdf3e/tokenizers/src/decoders/wordpiece.rs#L31
private bool EmitNoSpaceBefore(string prefix)
{
return ".".Equals(prefix, StringComparison.Ordinal)
|| "?".Equals(prefix, StringComparison.Ordinal)
|| "!".Equals(prefix, StringComparison.Ordinal)
|| ",".Equals(prefix, StringComparison.Ordinal);
return prefix.Length == 1 && (prefix[0] == '.' || prefix[0] == '?' || prefix[0] == '!' || prefix[0] == ',');
}

/// <summary>
/// Builds the reverse lookup tables (token id to token text) from
/// <c>_suffixes</c> and <c>_prefixes</c> and stores them in
/// <c>_decodeSuffixes</c> and <c>_decodePrefixes</c>.
/// </summary>
/// <remarks>
/// Both source fields are dereferenced with the null-forgiving operator, so the
/// caller must have checked that the vocabulary is loaded before calling this.
/// Entries are written through the indexer, so if two source entries share an id
/// the one enumerated last is kept instead of an exception being thrown.
/// </remarks>
private void InitializeDecodeDictionaries()
{
    var suffixSource = _suffixes!;
    var suffixById = new Dictionary<long, string>(suffixSource.Count);
    foreach (var entry in suffixSource)
    {
        suffixById[entry.Value] = entry.Key.ToString();
    }

    var prefixSource = _prefixes!;
    var prefixById = new Dictionary<long, string>(prefixSource.Count);
    foreach (var entry in prefixSource)
    {
        prefixById[entry.Value] = entry.Key.ToString();
    }

    // Publish the fields only once both tables are fully populated.
    _decodeSuffixes = suffixById;
    _decodePrefixes = prefixById;
}
}
6 changes: 3 additions & 3 deletions src/FastBertTokenizer/packages.lock.json
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,9 @@
"net8.0": {
"Microsoft.NET.ILLink.Tasks": {
"type": "Direct",
"requested": "[8.0.14, )",
"resolved": "8.0.14",
"contentHash": "4U2fd7PexNKrK5ZqfqIcXZj9/lRRjFsLgA/pxuFQTuGQuLYP/+7yACz/j7EmWbEj/fspOf4mafi/vHIy/rKDzQ=="
"requested": "[8.0.22, )",
"resolved": "8.0.22",
"contentHash": "MhcMithKEiyyNkD2ZfbDZPmcOdi0GheGfg8saEIIEfD/fol3iHmcV8TsZkD4ZYz5gdUuoX4YtlVySUU7Sxl9SQ=="
},
"Microsoft.SourceLink.GitHub": {
"type": "Direct",
Expand Down
14 changes: 14 additions & 0 deletions src/HuggingfaceTokenizer/RustLibWrapper/packages.lock.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@
"version": 2,
"dependencies": {
".NETFramework,Version=v4.8": {
"Microsoft.NETFramework.ReferenceAssemblies": {
"type": "Direct",
"requested": "[1.0.3, )",
"resolved": "1.0.3",
"contentHash": "vUc9Npcs14QsyOD01tnv/m8sQUnGTGOw1BCmKcv77LBJY7OxhJ+zJF7UD/sCL3lYNFuqmQEVlkfS4Quif6FyYg==",
"dependencies": {
"Microsoft.NETFramework.ReferenceAssemblies.net48": "1.0.3"
}
},
"Microsoft.SourceLink.GitHub": {
"type": "Direct",
"requested": "[8.0.0, )",
Expand Down Expand Up @@ -46,6 +55,11 @@
"resolved": "8.0.0",
"contentHash": "bZKfSIKJRXLTuSzLudMFte/8CempWjVamNUR5eHJizsy+iuOuO/k2gnh7W0dHJmYY0tBf+gUErfluCv5mySAOQ=="
},
"Microsoft.NETFramework.ReferenceAssemblies.net48": {
"type": "Transitive",
"resolved": "1.0.3",
"contentHash": "zMk4D+9zyiEWByyQ7oPImPN/Jhpj166Ky0Nlla4eXlNL8hI/BtSJsgR8Inldd4NNpIAH3oh8yym0W2DrhXdSLQ=="
},
"Microsoft.SourceLink.Common": {
"type": "Transitive",
"resolved": "8.0.0",
Expand Down