Skip to content

collate: fix Compare[String] funcs to use key comparisons #54

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 17 additions & 79 deletions collate/collate.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,14 @@ import (
)

// Collator provides functionality for comparing strings for a given
// collation order.
// collation order. A given Collator is not safe to use concurrently.
type Collator struct {
options

sorter sorter

buf Buffer

_iter [2]iter
}

Expand Down Expand Up @@ -102,90 +104,26 @@ func (b *Buffer) Reset() {

// Compare returns an integer comparing the two byte slices.
// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
// Note that this is less performant than calling c.Sort() because
// a new buffer will be allocated for each call.
func (c *Collator) Compare(a, b []byte) int {
// TODO: skip identical prefixes once we have a fast way to detect if a rune is
// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
c.iter(0).SetInput(a)
c.iter(1).SetInput(b)
if res := c.compare(); res != 0 {
return res
}
if !c.ignore[colltab.Identity] {
return bytes.Compare(a, b)
}
return 0
kA := c.Key(&c.buf, a)
kB := c.Key(&c.buf, b)
ret := bytes.Compare(kA, kB)
c.buf.Reset()
return ret
}

// CompareString returns an integer comparing the two strings.
// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
// Note that this is less performant than calling c.Sort() because
// a new buffer will be allocated for each call.
func (c *Collator) CompareString(a, b string) int {
// TODO: skip identical prefixes once we have a fast way to detect if a rune is
// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
c.iter(0).SetInputString(a)
c.iter(1).SetInputString(b)
if res := c.compare(); res != 0 {
return res
}
if !c.ignore[colltab.Identity] {
if a < b {
return -1
} else if a > b {
return 1
}
}
return 0
}

func compareLevel(f func(i *iter) int, a, b *iter) int {
a.pce = 0
b.pce = 0
for {
va := f(a)
vb := f(b)
if va != vb {
if va < vb {
return -1
}
return 1
} else if va == 0 {
break
}
}
return 0
}

func (c *Collator) compare() int {
ia, ib := c.iter(0), c.iter(1)
// Process primary level
if c.alternate != altShifted {
// TODO: implement script reordering
if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 {
return res
}
} else {
// TODO: handle shifted
}
if !c.ignore[colltab.Secondary] {
f := (*iter).nextSecondary
if c.backwards {
f = (*iter).prevSecondary
}
if res := compareLevel(f, ia, ib); res != 0 {
return res
}
}
// TODO: special case handling (Danish?)
if !c.ignore[colltab.Tertiary] || c.caseLevel {
if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 {
return res
}
if !c.ignore[colltab.Quaternary] {
if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 {
return res
}
}
}
return 0
kA := c.KeyFromString(&c.buf, a)
kB := c.KeyFromString(&c.buf, b)
ret := bytes.Compare(kA, kB)
c.buf.Reset()
return ret
}

// Key returns the collation key for str.
Expand Down
45 changes: 45 additions & 0 deletions collate/collate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package collate

import (
"bytes"
"runtime"
"testing"

"golang.org/x/text/internal/colltab"
Expand Down Expand Up @@ -450,6 +451,50 @@ func TestCompare(t *testing.T) {
t.Errorf("%d: CompareString(%q, %q) == %d; want %d", i, tt.a, tt.b, res, tt.res)
}
}

c = New(language.MustParse("en-us-u-ka-posix-ks-level4"))
if c.CompareString("deluge", "de luge") != -1 {
t.Errorf("CompareString for 'deluge' vs 'de luge' in Shift-Trimmed mode should return -1 but returned %v", c.CompareString("deluge", "de luge"))
}
}

func TestKeyFromStringCompareForShiftTrimmed(t *testing.T) {
var (
c = New(language.MustParse("en-us-u-ka-posix-ks-level4"))
buf Buffer
kA = c.KeyFromString(&buf, "deluge")
kB = c.KeyFromString(&buf, "de luge")
)

if bytes.Compare(kA, kB) != -1 {
t.Errorf("The Keys for 'deluge' should sort before the key for 'de luge' in Shift-Trimmed mode, but it compares as %v", bytes.Compare(kA, kB))
}
}

func totalAllocs() uint64 {
var mem runtime.MemStats
runtime.ReadMemStats(&mem)
return mem.TotalAlloc
}

func TestNoAllocationsForCompares(t *testing.T) {
var a = []byte("foo")
var b = []byte("bar")
var c = New(language.MustParse("en-us-u-ka-posix-ks-level4"))
var before = totalAllocs()

for i := 0; i < 100; i++ {
c.CompareString("foo", "bar")
c.Compare(a, b)
}

var bytesAllocated = totalAllocs() - before

// We should allocate zero additional bytes b/c the Buffer has 4k of memory pre-allocated that
// we should re-use for each/every key we generate for the comparisons.
if bytesAllocated != 0 {
t.Errorf("Expected CompareStrings to not allocate memory but it allocated %v", bytesAllocated)
}
}

func TestNumeric(t *testing.T) {
Expand Down
99 changes: 99 additions & 0 deletions collate/sort_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
package collate_test

import (
"bytes"
"fmt"
"strings"
"testing"

"golang.org/x/text/collate"
Expand Down Expand Up @@ -53,3 +55,100 @@ func TestSort(t *testing.T) {
t.Errorf("found %s; want %s", res, want)
}
}

func TestSortStringsAndCompareString(t *testing.T) {
for _, tt := range []struct {
name string
c *collate.Collator
want []string
}{
{
name: "English default options",
c: collate.New(language.English),
want: []string{
"abc",
"bcd",
"ddd",
},
},
{
// From https://www.unicode.org/reports/tr10/#Variable_Weighting_Examples
name: "Blanked",
c: collate.New(language.MustParse("en-us-u-ka-blanked")),
want: []string{
"death",
"de luge",
"de-luge",
"deluge",
"de-luge",
"de Luge",
"de-Luge",
"deLuge",
"de-Luge",
"demark",
},
},
{
// From https://www.unicode.org/reports/tr10/#Variable_Weighting_Examples
name: "Shifted",
c: collate.New(language.MustParse("en-us-u-ka-shifted")),
want: []string{
"death",
"de luge",
"de-luge",
"de-luge",
"deluge",
"de Luge",
"de-Luge",
"de-Luge",
"deLuge",
"demark",
},
},
{
// From https://www.unicode.org/reports/tr10/#Variable_Weighting_Examples
name: "Shift-Trimmed",
c: collate.New(language.MustParse("en-us-u-ka-posix-ks-level4")),
want: []string{
"death",
"deluge",
"de luge",
"de-luge",
"de-luge",
"deLuge",
"de Luge",
"de-Luge",
"de-Luge",
"demark",
},
},
} {
t.Run(tt.name, func(t *testing.T) {
actual := make([]string, len(tt.want))
copy(actual, tt.want)
tt.c.SortStrings(actual)

p := func(v []string) string { return strings.Join(v, ", ") }
if p(tt.want) != p(actual) {
t.Errorf("SortStrings want: '%v'\n Got: '%v'", p(tt.want), p(actual))
}

buf := collate.Buffer{}
for i := 0; i < len(tt.want)-1; i++ {
a, b := tt.want[i], tt.want[i+1]
kA, kB := tt.c.KeyFromString(&buf, a), tt.c.KeyFromString(&buf, b)
if bytes.Compare(kA, kB) > 0 {
t.Errorf("KeyFromString for %v is bigger than for %v", a, b)
}
}

for i := 0; i < len(tt.want)-1; i++ {
a, b := tt.want[i], tt.want[i+1]
cmp := tt.c.CompareString(a, b)
if cmp > 0 {
t.Errorf("CompareString for '%v' vs '%v' is 1 when should be -1 or 0", a, b)
}
}
})
}
}