From cb457e52604d0f558947c9dadfe46b531a8ae548 Mon Sep 17 00:00:00 2001
From: nanmu42 <i@nanmu.me>
Date: Sat, 12 Jul 2025 14:52:36 +0800
Subject: [PATCH 1/2] feat: support zero-width negative lookahead (?!regex)

Resolves https://github.com/sugarme/tokenizer/issues/43

Ref: https://github.com/dlclark/regexp2/issues/78#issuecomment-2131313788
---
 .gitignore            |  2 ++
 go.mod                |  5 ++++-
 go.sum                |  2 ++
 normalizer/pattern.go | 49 +++++++++++++++++++++++++++++++++++--------
 4 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3476138..25b76fc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,5 @@ example/testdata/
 *.log
 
 *.json
+
+/.idea
\ No newline at end of file
diff --git a/go.mod b/go.mod
index 772381f..fdc39a6 100644
--- a/go.mod
+++ b/go.mod
@@ -1,8 +1,11 @@
 module github.com/sugarme/tokenizer
 
-go 1.23
+go 1.23.0
+
+toolchain go1.24.1
 
 require (
+	github.com/dlclark/regexp2 v1.11.5
 	github.com/emirpasic/gods v1.18.1
 	github.com/rivo/uniseg v0.4.7
 	github.com/schollz/progressbar/v2 v2.15.0
diff --git a/go.sum b/go.sum
index 5e020c3..501a749 100644
--- a/go.sum
+++ b/go.sum
@@ -1,6 +1,8 @@
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/dlclark/regexp2 v1.11.5 h1:Q/sSnsKerHeCkc/jSTNq1oCm7KiVgUMZRDUoRu0JQZQ=
+github.com/dlclark/regexp2 v1.11.5/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
 github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc=
 github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
 github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
diff --git a/normalizer/pattern.go b/normalizer/pattern.go
index 1a5ebe7..1d53a0a 100644
--- a/normalizer/pattern.go
+++ b/normalizer/pattern.go
@@ -1,9 +1,9 @@
 package normalizer
 
 import (
+	"github.com/dlclark/regexp2"
 	"log"
-	// "reflect"
-	"regexp"
+	"unicode/utf8"
 
 	"github.com/sugarme/tokenizer/util"
 )
@@ -107,16 +107,47 @@ func (s *StringPattern) FindMatches(inside string) []OffsetsMatch {
 		}
 	}
 
-	quoted := regexp.QuoteMeta(s.string)
-
-	re := regexp.MustCompile(quoted)
+	re := regexp2.MustCompile(s.string, regexp2.RE2)
 
 	return findMatches(re, inside)
 }
 
-func findMatches(re *regexp.Regexp, inside string) []OffsetsMatch {
+// convertRuneIndexToStringIndex converts a match position expressed in runes (runeIndex, runeLength within r) into the equivalent byte index and byte length.
+// The internals of regexp2 always operate on []rune, so the Index and Length of a Match always refer to positions in runes rather than bytes (even if the input was given as a string).
+// This is a dramatic difference between regexp and regexp2. It's advisable to use the provided String() methods to avoid having to work with indices.
+// Ref: https://github.com/dlclark/regexp2/issues/78#issuecomment-2131313788
+func convertRuneIndexToStringIndex(r []rune, runeIndex, runeLength int) (stringIndex, stringLength int) {
+	var curStrIdx, startIdx int
+
+	// first get the start index
+	for i := 0; i < runeIndex; i++ {
+		curStrIdx += utf8.RuneLen(r[i])
+	}
+	startIdx = curStrIdx
+
+	// now get the length
+	for i := runeIndex; i < runeIndex+runeLength; i++ {
+		curStrIdx += utf8.RuneLen(r[i])
+	}
+	return startIdx, curStrIdx - startIdx
+}
+
+func regexp2FindAllStringIndex(re *regexp2.Regexp, s string) (matches [][]int) {
+	r := []rune(s)
+	// The only error that the *Match* methods should return is a Timeout if you set the re.MatchTimeout field.
+	// Any other error is a bug in the regexp2 package.
+	m, _ := re.FindRunesMatch(r)
+	for m != nil {
+		stringIndex, stringLength := convertRuneIndexToStringIndex(r, m.Index, m.Length)
+		matches = append(matches, []int{stringIndex, stringIndex + stringLength})
+		m, _ = re.FindNextMatch(m)
+	}
+	return matches
+}
+
+func findMatches(re *regexp2.Regexp, inside string) []OffsetsMatch {
 
-	matches := re.FindAllStringIndex(inside, -1)
+	matches := regexp2FindAllStringIndex(re, inside)
 
 	// 0. If no matches, just return
 	if len(matches) == 0 {
@@ -185,11 +216,11 @@ func findMatches(re *regexp.Regexp, inside string) []OffsetsMatch {
 }
 
 type RegexpPattern struct {
-	re *regexp.Regexp
+	re *regexp2.Regexp
 }
 
 func NewRegexpPattern(s string) *RegexpPattern {
-	re := regexp.MustCompile(s)
+	re := regexp2.MustCompile(s, regexp2.RE2)
 	return &RegexpPattern{
 		re: re,
 	}

From 91d4cdc8204a35e50a8eddb222ef293cd2bf6980 Mon Sep 17 00:00:00 2001
From: nanmu42 <i@nanmu.me>
Date: Sat, 12 Jul 2025 17:38:53 +0800
Subject: [PATCH 2/2] fix: escape string for regex building

---
 normalizer/pattern.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/normalizer/pattern.go b/normalizer/pattern.go
index 1d53a0a..800fdce 100644
--- a/normalizer/pattern.go
+++ b/normalizer/pattern.go
@@ -107,7 +107,8 @@ func (s *StringPattern) FindMatches(inside string) []OffsetsMatch {
 		}
 	}
 
-	re := regexp2.MustCompile(s.string, regexp2.RE2)
+	escaped := regexp2.Escape(s.string)
+	re := regexp2.MustCompile(escaped, regexp2.RE2)
 
 	return findMatches(re, inside)
 }