From cb457e52604d0f558947c9dadfe46b531a8ae548 Mon Sep 17 00:00:00 2001 From: nanmu42 Date: Sat, 12 Jul 2025 14:52:36 +0800 Subject: [PATCH 1/2] feat: support zero-width negative lookahead (?!regex) Resolves https://github.com/sugarme/tokenizer/issues/43 Ref: https://github.com/dlclark/regexp2/issues/78#issuecomment-2131313788 --- .gitignore | 2 ++ go.mod | 5 ++++- go.sum | 2 ++ normalizer/pattern.go | 49 +++++++++++++++++++++++++++++++++++-------- 4 files changed, 48 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 3476138..25b76fc 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ example/testdata/ *.log *.json + +/.idea \ No newline at end of file diff --git a/go.mod b/go.mod index 772381f..fdc39a6 100644 --- a/go.mod +++ b/go.mod @@ -1,8 +1,11 @@ module github.com/sugarme/tokenizer -go 1.23 +go 1.23.0 + +toolchain go1.24.1 require ( + github.com/dlclark/regexp2 v1.11.5 github.com/emirpasic/gods v1.18.1 github.com/rivo/uniseg v0.4.7 github.com/schollz/progressbar/v2 v2.15.0 diff --git a/go.sum b/go.sum index 5e020c3..501a749 100644 --- a/go.sum +++ b/go.sum @@ -1,6 +1,8 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dlclark/regexp2 v1.11.5 h1:Q/sSnsKerHeCkc/jSTNq1oCm7KiVgUMZRDUoRu0JQZQ= +github.com/dlclark/regexp2 v1.11.5/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc= github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ= github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ= diff --git a/normalizer/pattern.go b/normalizer/pattern.go index 1a5ebe7..1d53a0a 100644 --- a/normalizer/pattern.go +++ b/normalizer/pattern.go @@ -1,9 +1,9 @@ package normalizer import ( + "github.com/dlclark/regexp2" "log" - // "reflect" - "regexp" + "unicode/utf8" "github.com/sugarme/tokenizer/util" ) @@ -107,16 +107,47 @@ func (s *StringPattern) FindMatches(inside string) []OffsetsMatch { } } - quoted := regexp.QuoteMeta(s.string) - - re := regexp.MustCompile(quoted) + re := regexp2.MustCompile(s.string, regexp2.RE2) return findMatches(re, inside) } -func findMatches(re *regexp.Regexp, inside string) []OffsetsMatch { +// convertRuneIndexToStringIndex The internals of regexp2 always operate on []rune +// so Index and Length data in a Match always reference a position in runes rather than bytes (even if the input was given as a string). +// This is a dramatic difference between regexp and regexp2. It's advisable to use the provided String() methods to avoid having to work with indices. +// Ref: https://github.com/dlclark/regexp2/issues/78#issuecomment-2131313788 +func convertRuneIndexToStringIndex(r []rune, runeIndex, runeLength int) (stringIndex, stringLength int) { + var curStrIdx, startIdx int + + // first get the start index + for i := 0; i < runeIndex; i++ { + curStrIdx += utf8.RuneLen(r[i]) + } + startIdx = curStrIdx + + // now get the length + for i := runeIndex; i < runeIndex+runeLength; i++ { + curStrIdx += utf8.RuneLen(r[i]) + } + return startIdx, curStrIdx - startIdx +} + +func regexp2FindAllStringIndex(re *regexp2.Regexp, s string) (matches [][]int) { + r := []rune(s) + // The only error that the *Match* methods should return is a Timeout if you set the re.MatchTimeout field. + // Any other error is a bug in the regexp2 package. + m, _ := re.FindRunesMatch(r) + for m != nil { + stringIndex, stringLength := convertRuneIndexToStringIndex(r, m.Index, m.Length) + matches = append(matches, []int{stringIndex, stringIndex + stringLength}) + m, _ = re.FindNextMatch(m) + } + return matches +} + +func findMatches(re *regexp2.Regexp, inside string) []OffsetsMatch { - matches := re.FindAllStringIndex(inside, -1) + matches := regexp2FindAllStringIndex(re, inside) // 0. If no matches, just return if len(matches) == 0 { @@ -185,11 +216,11 @@ func findMatches(re *regexp.Regexp, inside string) []OffsetsMatch { } type RegexpPattern struct { - re *regexp.Regexp + re *regexp2.Regexp } func NewRegexpPattern(s string) *RegexpPattern { - re := regexp.MustCompile(s) + re := regexp2.MustCompile(s, regexp2.RE2) return &RegexpPattern{ re: re, } From 91d4cdc8204a35e50a8eddb222ef293cd2bf6980 Mon Sep 17 00:00:00 2001 From: nanmu42 Date: Sat, 12 Jul 2025 17:38:53 +0800 Subject: [PATCH 2/2] fix: escape string for regex building --- normalizer/pattern.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/normalizer/pattern.go b/normalizer/pattern.go index 1d53a0a..800fdce 100644 --- a/normalizer/pattern.go +++ b/normalizer/pattern.go @@ -107,7 +107,8 @@ func (s *StringPattern) FindMatches(inside string) []OffsetsMatch { } } - re := regexp2.MustCompile(s.string, regexp2.RE2) + escaped := regexp2.Escape(s.string) + re := regexp2.MustCompile(escaped, regexp2.RE2) return findMatches(re, inside) }