-
Notifications
You must be signed in to change notification settings - Fork 642
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Optimize StemmerUtil for ReadOnlySpan<char>
/Span<char>
, #1140
#1144
base: master
Are you sure you want to change the base?
Conversation
Ran some benchmarks and this shows a notable improvement for StartsWith and EndsWith (net9.0, macOS, arm64). Delete/DeleteN didn't see a huge change. (Edit: the small improvement in the Delete(N) Span versions might be due to not having to do the Debugging.AssertsEnabled check, which could not be done from my benchmark project as that's an internal type. The benchmark results are thus effectively identical for Delete(N). The CharArray versions were testing the beta 17 NuGet package.)
using System;
using BenchmarkDotNet.Attributes;
using Lucene.Net.Analysis.Util;
namespace LuceneNetBenchmarkPlayground;
[MemoryDiagnoser]
public class BenchmarkLucene
{
/// <summary>
/// Checks whether <paramref name="s"/> begins with <paramref name="prefix"/>,
/// treating only the first <paramref name="len"/> characters as valid input.
/// The comparison is ordinal.
/// </summary>
/// <param name="s">Input buffer.</param>
/// <param name="len">Number of valid characters in <paramref name="s"/>; a prefix longer than this never matches.</param>
/// <param name="prefix">Prefix to look for.</param>
/// <returns><c>true</c> if the prefix fits within <paramref name="len"/> and matches the start of the buffer; otherwise <c>false</c>.</returns>
public static bool StartsWith(ReadOnlySpan<char> s, int len, ReadOnlySpan<char> prefix)
{
    // No slice to len is required here: once the prefix is known to fit within len,
    // the comparison only looks at the leading prefix.Length characters.
    return prefix.Length <= len && s.StartsWith(prefix, StringComparison.Ordinal);
}
/// <summary>
/// Checks whether the first <paramref name="len"/> characters of <paramref name="s"/>
/// end with <paramref name="suffix"/>. The comparison is ordinal.
/// </summary>
/// <param name="s">Input buffer.</param>
/// <param name="len">Number of valid characters in <paramref name="s"/>; anything after them is ignored.</param>
/// <param name="suffix">Suffix to look for.</param>
/// <returns><c>true</c> if the suffix fits within <paramref name="len"/> and matches the end of that window; otherwise <c>false</c>.</returns>
public static bool EndsWith(ReadOnlySpan<char> s, int len, ReadOnlySpan<char> suffix)
{
    if (suffix.Length > len)
    {
        return false;
    }

    // Unlike a prefix check, the suffix position depends on len, so the buffer
    // has to be trimmed to its valid window before comparing.
    ReadOnlySpan<char> window = s.Slice(0, len);
    return window.EndsWith(suffix, StringComparison.Ordinal);
}
/// <summary>
/// Removes the character at <paramref name="pos"/> by shifting the characters
/// that follow it (up to <paramref name="len"/>) one position to the left.
/// </summary>
/// <param name="s">Buffer to modify in place.</param>
/// <param name="pos">Index of the character to remove. Expected to be less than <paramref name="len"/>; this is not checked here.</param>
/// <param name="len">Number of valid characters in <paramref name="s"/>.</param>
/// <returns>The new valid length, <c>len - 1</c>.</returns>
public static int Delete(Span<char> s, int pos, int len)
{
    int newLength = len - 1;

    // Removing the last valid character needs no copy at all.
    if (pos >= newLength)
    {
        return newLength;
    }

    // Shift the tail left by one; source and destination overlap.
    int tailLength = newLength - pos;
    Span<char> tail = s.Slice(pos + 1, tailLength);
    tail.CopyTo(s.Slice(pos, tailLength));
    return newLength;
}
/// <summary>
/// Removes <paramref name="nChars"/> characters starting at <paramref name="pos"/>
/// by shifting the characters that follow them (up to <paramref name="len"/>) to the left.
/// </summary>
/// <param name="s">Buffer to modify in place.</param>
/// <param name="pos">Index of the first character to remove.</param>
/// <param name="len">Number of valid characters in <paramref name="s"/>.</param>
/// <param name="nChars">How many characters to remove. <c>pos + nChars</c> is expected to be at most <paramref name="len"/>; this is not checked here.</param>
/// <returns>The new valid length, <c>len - nChars</c>.</returns>
public static int DeleteN(Span<char> s, int pos, int len, int nChars)
{
    int sourceStart = pos + nChars;

    // When the removed range reaches the end of the valid window there is nothing to shift.
    if (sourceStart >= len)
    {
        return len - nChars;
    }

    // Shift the tail left by nChars; source and destination may overlap.
    int tailLength = len - sourceStart;
    Span<char> tail = s.Slice(sourceStart, tailLength);
    tail.CopyTo(s.Slice(pos, tailLength));
    return len - nChars;
}
/// <summary>
/// Baseline: calls <c>StemmerUtil.StartsWith</c> on a <c>char[]</c> of 1000 'f'
/// characters, once for every <c>len</c> from 0 to 999, with the prefix "fff".
/// The buffer is created inside the benchmark method itself.
/// </summary>
[Benchmark]
public void StartsWithCharArray()
{
    const int BufferLength = 1000;
    char[] buffer = new string('f', BufferLength).ToCharArray();

    for (int len = 0; len < BufferLength; len++)
    {
        StemmerUtil.StartsWith(buffer, len, "fff");
    }
}
/// <summary>
/// Span variant: calls the local <c>StartsWith</c> on a 1000-character span of 'f',
/// once for every <c>len</c> from 0 to 999, with the prefix "fff".
/// For <c>len</c> below 3 the prefix is longer than the window, so the call returns early.
/// </summary>
[Benchmark]
public void StartsWithSpan()
{
    const int BufferLength = 1000;
    string text = new string('f', BufferLength);
    ReadOnlySpan<char> s = text;

    for (int len = 0; len < BufferLength; len++)
    {
        StartsWith(s, len, "fff");
    }
}
/// <summary>
/// Baseline: calls <c>StemmerUtil.EndsWith</c> on a <c>char[]</c> of 1000 'f'
/// characters, once for every <c>len</c> from 0 to 999, with the suffix "fff".
/// The buffer is created inside the benchmark method itself.
/// </summary>
[Benchmark]
public void EndsWithCharArray()
{
    const int BufferLength = 1000;
    char[] buffer = new string('f', BufferLength).ToCharArray();

    for (int len = 0; len < BufferLength; len++)
    {
        StemmerUtil.EndsWith(buffer, len, "fff");
    }
}
/// <summary>
/// Span variant: calls the local <c>EndsWith</c> on a 1000-character span of 'f',
/// once for every <c>len</c> from 0 to 999, with the suffix "fff".
/// For <c>len</c> below 3 the suffix is longer than the window, so the call returns early.
/// </summary>
[Benchmark]
public void EndsWithSpan()
{
    const int BufferLength = 1000;
    string text = new string('f', BufferLength);
    ReadOnlySpan<char> s = text;

    for (int len = 0; len < BufferLength; len++)
    {
        EndsWith(s, len, "fff");
    }
}
/// <summary>
/// Baseline: calls <c>StemmerUtil.Delete</c> on a <c>char[]</c> of 1000 'f'
/// characters, once for every position from 0 to 999, always with a length of 1000.
/// The buffer is created inside the benchmark method itself.
/// </summary>
[Benchmark]
public void DeleteCharArray()
{
    const int BufferLength = 1000;
    char[] buffer = new string('f', BufferLength).ToCharArray();

    for (int pos = 0; pos < BufferLength; pos++)
    {
        StemmerUtil.Delete(buffer, pos, BufferLength);
    }
}
/// <summary>
/// Span variant: calls the local <c>Delete</c> on a 1000-character buffer of 'f',
/// once for every position from 0 to 999, always with a length of 1000.
/// The array is passed directly, so it is converted to a span on each call.
/// </summary>
[Benchmark]
public void DeleteSpan()
{
    const int BufferLength = 1000;
    char[] buffer = new string('f', BufferLength).ToCharArray();

    for (int pos = 0; pos < BufferLength; pos++)
    {
        Delete(buffer, pos, BufferLength);
    }
}
/// <summary>
/// Baseline: calls <c>StemmerUtil.DeleteN</c> on a <c>char[]</c> of 1000 'f'
/// characters, removing 2 characters at every position from 0 to 999 with a length of 1000.
/// The buffer is created inside the benchmark method itself.
/// </summary>
[Benchmark]
public void DeleteNCharArray()
{
    const int BufferLength = 1000;
    const int CharsToDelete = 2;
    char[] buffer = new string('f', BufferLength).ToCharArray();

    // Note: at pos = 999, pos + CharsToDelete exceeds BufferLength.
    for (int pos = 0; pos < BufferLength; pos++)
    {
        StemmerUtil.DeleteN(buffer, pos, BufferLength, CharsToDelete);
    }
}
/// <summary>
/// Span variant: calls the local <c>DeleteN</c> on a 1000-character buffer of 'f',
/// removing 2 characters at every position from 0 to 999 with a length of 1000.
/// The array is passed directly, so it is converted to a span on each call.
/// </summary>
[Benchmark]
public void DeleteNSpan()
{
    const int BufferLength = 1000;
    const int CharsToDelete = 2;
    char[] buffer = new string('f', BufferLength).ToCharArray();

    // Note: at pos = 999, pos + CharsToDelete exceeds BufferLength, so that call performs no copy.
    for (int pos = 0; pos < BufferLength; pos++)
    {
        DeleteN(buffer, pos, BufferLength, CharsToDelete);
    }
}
} |
Optimize StemmerUtil for
ReadOnlySpan<char>
/Span<char>
Fixes #1140
Description
This PR optimizes StemmerUtil's EndsWith and StartsWith for
ReadOnlySpan<char>
, and Delete and DeleteN for Span<char>
. Additionally, this type was missing unit tests (including in the latest Lucene, AFAICT), so this adds some lucenenet-specific unit tests for this type. Note that the
len
parameter is not quite what you might expect on a naïve reading of the XML doc comments. It is not always equal to the input buffer length (otherwise we could just use s.Length
and drop the parameter). Instead, it's Lucene's equivalent of what we would call a Slice in .NET. Any characters after len
chars in the input buffer are treated as if they aren't there. While we could update every call site of these methods to pass in a slice of the input buffer (via .AsSpan(0, len)
) and drop the parameter, there are over 200 uses of these methods that would have to be updated (and kept in sync in future ports), and the additional overhead of creating an extra slice should be negligible as it's a stack-allocated ref struct. So I figured it would be best to keep the method signatures as close to the original as possible, without removing the len
parameter.