diff --git a/.gitignore b/.gitignore index 3476138..25b76fc 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ example/testdata/ *.log *.json + +/.idea \ No newline at end of file diff --git a/go.mod b/go.mod index 772381f..fdc39a6 100644 --- a/go.mod +++ b/go.mod @@ -1,8 +1,11 @@ module github.com/sugarme/tokenizer -go 1.23 +go 1.23.0 + +toolchain go1.24.1 require ( + github.com/dlclark/regexp2 v1.11.5 github.com/emirpasic/gods v1.18.1 github.com/rivo/uniseg v0.4.7 github.com/schollz/progressbar/v2 v2.15.0 diff --git a/go.sum b/go.sum index 5e020c3..501a749 100644 --- a/go.sum +++ b/go.sum @@ -1,6 +1,8 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dlclark/regexp2 v1.11.5 h1:Q/sSnsKerHeCkc/jSTNq1oCm7KiVgUMZRDUoRu0JQZQ= +github.com/dlclark/regexp2 v1.11.5/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc= github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ= github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ= diff --git a/normalizer/pattern.go b/normalizer/pattern.go index 1a5ebe7..800fdce 100644 --- a/normalizer/pattern.go +++ b/normalizer/pattern.go @@ -1,9 +1,9 @@ package normalizer import ( + "github.com/dlclark/regexp2" "log" - // "reflect" - "regexp" + "unicode/utf8" "github.com/sugarme/tokenizer/util" ) @@ -107,16 +107,48 @@ func (s *StringPattern) FindMatches(inside string) []OffsetsMatch { } } - quoted := regexp.QuoteMeta(s.string) - - re := regexp.MustCompile(quoted) + escaped := regexp2.Escape(s.string) + re := regexp2.MustCompile(escaped, regexp2.RE2) return findMatches(re, inside) } -func findMatches(re *regexp.Regexp, inside string) []OffsetsMatch { +// convertRuneIndexToStringIndex The internals of regexp2 always operate on []rune +// so Index and Length data in a Match always reference a position in runes rather than bytes (even if the input was given as a string). +// This is a dramatic difference between regexp and regexp2. It's advisable to use the provided String() methods to avoid having to work with indices. +// Ref: https://github.com/dlclark/regexp2/issues/78#issuecomment-2131313788 +func convertRuneIndexToStringIndex(r []rune, runeIndex, runeLength int) (stringIndex, stringLength int) { + var curStrIdx, startIdx int + + // first get the start index + for i := 0; i < runeIndex; i++ { + curStrIdx += utf8.RuneLen(r[i]) + } + startIdx = curStrIdx + + // now get the length + for i := runeIndex; i < runeIndex+runeLength; i++ { + curStrIdx += utf8.RuneLen(r[i]) + } + return startIdx, curStrIdx - startIdx +} + +func regexp2FindAllStringIndex(re *regexp2.Regexp, s string) (matches [][]int) { + r := []rune(s) + // The only error that the *Match* methods should return is a Timeout if you set the re.MatchTimeout field. + // Any other error is a bug in the regexp2 package. + m, _ := re.FindRunesMatch(r) + for m != nil { + stringIndex, stringLength := convertRuneIndexToStringIndex(r, m.Index, m.Length) + matches = append(matches, []int{stringIndex, stringIndex + stringLength}) + m, _ = re.FindNextMatch(m) + } + return matches +} + +func findMatches(re *regexp2.Regexp, inside string) []OffsetsMatch { - matches := re.FindAllStringIndex(inside, -1) + matches := regexp2FindAllStringIndex(re, inside) // 0. If no matches, just return if len(matches) == 0 { @@ -185,11 +217,11 @@ func findMatches(re *regexp.Regexp, inside string) []OffsetsMatch { } type RegexpPattern struct { - re *regexp.Regexp + re *regexp2.Regexp } func NewRegexpPattern(s string) *RegexpPattern { - re := regexp.MustCompile(s) + re := regexp2.MustCompile(s, regexp2.RE2) return &RegexpPattern{ re: re, }