golang · gburt · Dec 24, 2024 · Dec 30, 2024 · Jan 7, 2025 · Jan 21, 2025
diff --git a/collate/collate.go b/collate/collate.go
@@ -19,12 +19,14 @@ import (
 )
 
 // Collator provides functionality for comparing strings for a given
-// collation order.
+// collation order.  A given Collator is not safe to use concurrently.
 type Collator struct {
 	options
 
 	sorter sorter
 
+	buf Buffer
+
 	_iter [2]iter
 }
 
@@ -102,90 +104,26 @@ func (b *Buffer) Reset() {
 
 // Compare returns an integer comparing the two byte slices.
 // The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
+// Note that this is less performant than calling c.Sort() because
+// a new buffer will be allocated for each call.
 func (c *Collator) Compare(a, b []byte) int {
-	// TODO: skip identical prefixes once we have a fast way to detect if a rune is
-	// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
-	c.iter(0).SetInput(a)
-	c.iter(1).SetInput(b)
-	if res := c.compare(); res != 0 {
-		return res
-	}
-	if !c.ignore[colltab.Identity] {
-		return bytes.Compare(a, b)
-	}
-	return 0
+	kA := c.Key(&c.buf, a)
+	kB := c.Key(&c.buf, b)
+	ret := bytes.Compare(kA, kB)
+	c.buf.Reset()
+	return ret
 }
 
 // CompareString returns an integer comparing the two strings.
 // The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
+// Note that this is less performant than calling c.Sort() because
+// a new buffer will be allocated for each call.
 func (c *Collator) CompareString(a, b string) int {
-	// TODO: skip identical prefixes once we have a fast way to detect if a rune is
-	// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
-	c.iter(0).SetInputString(a)
-	c.iter(1).SetInputString(b)
-	if res := c.compare(); res != 0 {
-		return res
-	}
-	if !c.ignore[colltab.Identity] {
-		if a < b {
-			return -1
-		} else if a > b {
-			return 1
-		}
-	}
-	return 0
-}
-
-func compareLevel(f func(i *iter) int, a, b *iter) int {
-	a.pce = 0
-	b.pce = 0
-	for {
-		va := f(a)
-		vb := f(b)
-		if va != vb {
-			if va < vb {
-				return -1
-			}
-			return 1
-		} else if va == 0 {
-			break
-		}
-	}
-	return 0
-}
-
-func (c *Collator) compare() int {
-	ia, ib := c.iter(0), c.iter(1)
-	// Process primary level
-	if c.alternate != altShifted {
-		// TODO: implement script reordering
-		if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 {
-			return res
-		}
-	} else {
-		// TODO: handle shifted
-	}
-	if !c.ignore[colltab.Secondary] {
-		f := (*iter).nextSecondary
-		if c.backwards {
-			f = (*iter).prevSecondary
-		}
-		if res := compareLevel(f, ia, ib); res != 0 {
-			return res
-		}
-	}
-	// TODO: special case handling (Danish?)
-	if !c.ignore[colltab.Tertiary] || c.caseLevel {
-		if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 {
-			return res
-		}
-		if !c.ignore[colltab.Quaternary] {
-			if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 {
-				return res
-			}
-		}
-	}
-	return 0
+	kA := c.KeyFromString(&c.buf, a)
+	kB := c.KeyFromString(&c.buf, b)
+	ret := bytes.Compare(kA, kB)
+	c.buf.Reset()
+	return ret
 }
 
 // Key returns the collation key for str.

diff --git a/collate/collate_test.go b/collate/collate_test.go
@@ -6,6 +6,7 @@ package collate
 
 import (
 	"bytes"
+	"runtime"
 	"testing"
 
 	"golang.org/x/text/internal/colltab"
@@ -450,6 +451,50 @@ func TestCompare(t *testing.T) {
 			t.Errorf("%d: CompareString(%q, %q) == %d; want %d", i, tt.a, tt.b, res, tt.res)
 		}
 	}
+
+	c = New(language.MustParse("en-us-u-ka-posix-ks-level4"))
+	if c.CompareString("deluge", "de luge") != -1 {
+		t.Errorf("CompareString for 'deluge' vs 'de luge' in Shift-Trimmed mode should return -1 but returned %v", c.CompareString("deluge", "de luge"))
+	}
+}
+
+func TestKeyFromStringCompareForShiftTrimmed(t *testing.T) {
+	var (
+		c   = New(language.MustParse("en-us-u-ka-posix-ks-level4"))
+		buf Buffer
+		kA  = c.KeyFromString(&buf, "deluge")
+		kB  = c.KeyFromString(&buf, "de luge")
+	)
+
+	if bytes.Compare(kA, kB) != -1 {
+		t.Errorf("The Keys for 'deluge' should sort before the key for 'de luge' in Shift-Trimmed mode, but it compares as %v", bytes.Compare(kA, kB))
+	}
+}
+
+func totalAllocs() uint64 {
+	var mem runtime.MemStats
+	runtime.ReadMemStats(&mem)
+	return mem.TotalAlloc
+}
+
+func TestNoAllocationsForCompares(t *testing.T) {
+	var a = []byte("foo")
+	var b = []byte("bar")
+	var c = New(language.MustParse("en-us-u-ka-posix-ks-level4"))
+	var before = totalAllocs()
+
+	for i := 0; i < 100; i++ {
+		c.CompareString("foo", "bar")
+		c.Compare(a, b)
+	}
+
+	var bytesAllocated = totalAllocs() - before
+
+	// We should allocate zero additional bytes b/c the Buffer has 4k of memory pre-allocated that
+	// we should re-use for each/every key we generate for the comparisons.
+	if bytesAllocated != 0 {
+		t.Errorf("Expected CompareStrings to not allocate memory but it allocated %v", bytesAllocated)
+	}
 }
 
 func TestNumeric(t *testing.T) {

diff --git a/collate/sort_test.go b/collate/sort_test.go
@@ -5,7 +5,9 @@
 package collate_test
 
 import (
+	"bytes"
 	"fmt"
+	"strings"
 	"testing"
 
 	"golang.org/x/text/collate"
@@ -53,3 +55,100 @@ func TestSort(t *testing.T) {
 		t.Errorf("found %s; want %s", res, want)
 	}
 }
+
+func TestSortStringsAndCompareString(t *testing.T) {
+	for _, tt := range []struct {
+		name string
+		c    *collate.Collator
+		want []string
+	}{
+		{
+			name: "English default options",
+			c:    collate.New(language.English),
+			want: []string{
+				"abc",
+				"bcd",
+				"ddd",
+			},
+		},
+		{
+			// From https://www.unicode.org/reports/tr10/#Variable_Weighting_Examples
+			name: "Blanked",
+			c:    collate.New(language.MustParse("en-us-u-ka-blanked")),
+			want: []string{
+				"death",
+				"de luge",
+				"de-luge",
+				"deluge",
+				"de-luge",
+				"de Luge",
+				"de-Luge",
+				"deLuge",
+				"de-Luge",
+				"demark",
+			},
+		},
+		{
+			// From https://www.unicode.org/reports/tr10/#Variable_Weighting_Examples
+			name: "Shifted",
+			c:    collate.New(language.MustParse("en-us-u-ka-shifted")),
+			want: []string{
+				"death",
+				"de luge",
+				"de-luge",
+				"de-luge",
+				"deluge",
+				"de Luge",
+				"de-Luge",
+				"de-Luge",
+				"deLuge",
+				"demark",
+			},
+		},
+		{
+			// From https://www.unicode.org/reports/tr10/#Variable_Weighting_Examples
+			name: "Shift-Trimmed",
+			c:    collate.New(language.MustParse("en-us-u-ka-posix-ks-level4")),
+			want: []string{
+				"death",
+				"deluge",
+				"de luge",
+				"de-luge",
+				"de-luge",
+				"deLuge",
+				"de Luge",
+				"de-Luge",
+				"de-Luge",
+				"demark",
+			},
+		},
+	} {
+		t.Run(tt.name, func(t *testing.T) {
+			actual := make([]string, len(tt.want))
+			copy(actual, tt.want)
+			tt.c.SortStrings(actual)
+
+			p := func(v []string) string { return strings.Join(v, ", ") }
+			if p(tt.want) != p(actual) {
+				t.Errorf("SortStrings want: '%v'\n Got: '%v'", p(tt.want), p(actual))
+			}
+
+			buf := collate.Buffer{}
+			for i := 0; i < len(tt.want)-1; i++ {
+				a, b := tt.want[i], tt.want[i+1]
+				kA, kB := tt.c.KeyFromString(&buf, a), tt.c.KeyFromString(&buf, b)
+				if bytes.Compare(kA, kB) > 0 {
+					t.Errorf("KeyFromString for %v is bigger than for %v", a, b)
+				}
+			}
+
+			for i := 0; i < len(tt.want)-1; i++ {
+				a, b := tt.want[i], tt.want[i+1]
+				cmp := tt.c.CompareString(a, b)
+				if cmp > 0 {
+					t.Errorf("CompareString for '%v' vs '%v' is 1 when should be -1 or 0", a, b)
+				}
+			}
+		})
+	}
+}