diff --git a/bitmap_opt.go b/bitmap_opt.go new file mode 100644 index 0000000..b34a1e1 --- /dev/null +++ b/bitmap_opt.go @@ -0,0 +1,135 @@ +package sroar + +// AndToSuperset calculates intersection of current and incoming bitmap +// It reuses containers present in current bitmap +// and utilize container buffer provided. +// +// CAUTION: should be used only when current bitmap contained before +// all elements present in incoming bitmap +func (dst *Bitmap) AndToSuperset(src *Bitmap, containerBuf []uint16) { + if src == nil { + for ai, an := 0, dst.keys.numKeys(); ai < an; ai++ { + off := dst.keys.val(ai) + zeroOutContainer(dst.getContainer(off)) + } + return + } + + a, b := dst, src + ai, an := 0, a.keys.numKeys() + bi, bn := 0, b.keys.numKeys() + + for ai < an && bi < bn { + ak := a.keys.key(ai) + bk := b.keys.key(bi) + if ak == bk { + off := a.keys.val(ai) + ac := a.getContainer(off) + off = b.keys.val(bi) + bc := b.getContainer(off) + + if getCardinality(bc) == 0 { + zeroOutContainer(ac) + } else { + containerAndToSuperset(ac, bc, containerBuf) + } + ai++ + bi++ + } else if ak < bk { + off := a.keys.val(ai) + zeroOutContainer(a.getContainer(off)) + ai++ + } else { + bi++ + } + } + for ; ai < an; ai++ { + off := a.keys.val(ai) + zeroOutContainer(a.getContainer(off)) + } +} + +// OrToSuperset calculates union of current and incoming bitmap +// It reuses containers present in current bitmap +// and utilize container buffer provided. 
+// +// CAUTION: should be used only when current bitmap contained before +// all elements present in incoming bitmap +func (dst *Bitmap) OrToSuperset(src *Bitmap, containerBuf []uint16) { + if src == nil { + return + } + + srcIdx, numKeys := 0, src.keys.numKeys() + for ; srcIdx < numKeys; srcIdx++ { + srcCont := src.getContainer(src.keys.val(srcIdx)) + if getCardinality(srcCont) == 0 { + continue + } + + key := src.keys.key(srcIdx) + + dstIdx := dst.keys.search(key) + if dstIdx >= dst.keys.numKeys() || dst.keys.key(dstIdx) != key { + // Container does not exist in dst. + panic("Current bitmap should have all containers of incoming bitmap") + } else { + // Container exists in dst as well. Do an inline containerOr. + offset := dst.keys.val(dstIdx) + dstCont := dst.getContainer(offset) + containerOrToSuperset(dstCont, srcCont, containerBuf) + } + } +} + +// AndNotToSuperset calculates difference between current and incoming bitmap +// It reuses containers present in current bitmap +// and utilize container buffer provided. 
+// +// CAUTION: should be used only when current bitmap contained before +// all elements present in incoming bitmap +func (dst *Bitmap) AndNotToSuperset(src *Bitmap, containerBuf []uint16) { + if src == nil { + return + } + + a, b := dst, src + ai, an := 0, a.keys.numKeys() + bi, bn := 0, b.keys.numKeys() + + for ai < an && bi < bn { + ak := a.keys.key(ai) + bk := b.keys.key(bi) + if ak == bk { + off := a.keys.val(ai) + ac := a.getContainer(off) + off = b.keys.val(bi) + bc := b.getContainer(off) + + if getCardinality(bc) != 0 { + containerAndNotToSuperset(ac, bc, containerBuf) + } + ai++ + bi++ + } else if ak < bk { + ai++ + } else { + bi++ + } + } +} + +func (ra *Bitmap) ConvertToBitmapContainers() { + for ai, an := 0, ra.keys.numKeys(); ai < an; ai++ { + ak := ra.keys.key(ai) + off := ra.keys.val(ai) + ac := ra.getContainer(off) + + if ac[indexType] == typeArray { + c := array(ac).toBitmapContainer(nil) + offset := ra.newContainer(uint16(len(c))) + copy(ra.data[offset:], c) + ra.setKey(ak, offset) + } + } +} diff --git a/bitmap_opt_test.go b/bitmap_opt_test.go new file mode 100644 index 0000000..f14dd4b --- /dev/null +++ b/bitmap_opt_test.go @@ -0,0 +1,315 @@ +package sroar + +import ( + "fmt" + "math" + "math/rand" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestMergeToSuperset(t *testing.T) { + containerThreshold := uint64(math.MaxUint16 + 1) + buf := make([]uint16, maxContainerSize) + + // containers of type array + bitmap + bitmap + superset := NewBitmap() + // containers of type array + array + bitmap + and := NewBitmap() + or := NewBitmap() + andNot := NewBitmap() + + t.Run("init bitmaps", func(t *testing.T) { + N1 := uint64(4000) // fits to array container + N2 := uint64(16000) // fits to bitmap container + + // containers of type array for all BMs + for i := uint64(0); i < N1; i++ { + val1 := i * 2 + + superset.Set(val1) + if i%3 != 0 { + and.Set(i) + } + if i < N1*3/4 { + or.Set(i) + } + if i%2 == 0 { + andNot.Set(i) + } + } + + // 
containers of type 2xbitmap for superset + // containers of type array+bitmap for subsets + for i := uint64(0); i < N2; i++ { + val2 := i*3 + containerThreshold + val3 := i*4 + 2*containerThreshold + + superset.Set(val2) + superset.Set(val3) + + if i%5 == 1 { + and.Set(val2) + } + if a := i % 11; a == 3 || a == 7 { + or.Set(val2) + } + if a := i % 23; a < 5 { + andNot.Set(val2) + } + + if a := i % 7; a > 3 { + and.Set(val3) + } + if a := i % 13; a < 10 { + or.Set(val3) + } + if a := i % 17; a > 2 && a < 15 { + andNot.Set(val3) + } + } + }) + + control := superset.Clone() + + t.Run("and", func(t *testing.T) { + control.And(and) + superset.AndToSuperset(and, buf) + + require.Equal(t, 11389, superset.GetCardinality()) + require.ElementsMatch(t, control.ToArray(), superset.ToArray()) + }) + + t.Run("or", func(t *testing.T) { + control.Or(or) + superset.OrToSuperset(or, buf) + + require.Equal(t, 22750, superset.GetCardinality()) + require.ElementsMatch(t, control.ToArray(), superset.ToArray()) + }) + + t.Run("and not", func(t *testing.T) { + control.AndNot(andNot) + superset.AndNotToSuperset(andNot, buf) + + require.Equal(t, 9911, superset.GetCardinality()) + require.ElementsMatch(t, control.ToArray(), superset.ToArray()) + }) + + t.Run("2nd or", func(t *testing.T) { + control.Or(or) + superset.OrToSuperset(or, buf) + + require.Equal(t, 20730, superset.GetCardinality()) + require.ElementsMatch(t, control.ToArray(), superset.ToArray()) + }) + + t.Run("2nd and", func(t *testing.T) { + control.And(and) + superset.AndToSuperset(and, buf) + + require.Equal(t, 10369, superset.GetCardinality()) + require.ElementsMatch(t, control.ToArray(), superset.ToArray()) + }) + + t.Run("2nd and not", func(t *testing.T) { + control.AndNot(andNot) + superset.AndNotToSuperset(andNot, buf) + + require.Equal(t, 5520, superset.GetCardinality()) + require.ElementsMatch(t, control.ToArray(), superset.ToArray()) + }) + + t.Run("merge into", func(t *testing.T) { + dst := NewBitmap() + for _, val1 
:= range []uint64{0123, 1234, 2345, 3456, 4567, 5678, 6789, 7890, 8901, 9012} { + val2 := val1 + containerThreshold + val3 := val1 + 2*containerThreshold + + superset.Set(val1) + superset.Set(val2) + superset.Set(val3) + control.Set(val1) + control.Set(val2) + control.Set(val3) + + dst.Set(val1) + dst.Set(val2) + dst.Set(val3) + } + controlDst := dst.Clone() + + require.Equal(t, 5548, superset.GetCardinality()) + require.ElementsMatch(t, control.ToArray(), superset.ToArray()) + + dst.And(superset) + controlDst.And(control) + + require.Equal(t, 30, dst.GetCardinality()) + require.ElementsMatch(t, controlDst.ToArray(), dst.ToArray()) + + dst.Or(superset) + controlDst.Or(control) + + require.Equal(t, 5548, dst.GetCardinality()) + require.ElementsMatch(t, controlDst.ToArray(), dst.ToArray()) + }) +} + +// go test -v -fuzz FuzzMergeToSuperset -fuzztime 600s -run ^$ github.com/weaviate/sroar +func FuzzMergeToSuperset(f *testing.F) { + type testCase struct { + name string + countElements int + countSubsets int + countMerges int + randSeed int64 + } + + testCases := []testCase{ + { + name: "few elements, few subsets", + countElements: 1_000, + countSubsets: 3, + countMerges: 15, + randSeed: 1724861525311406000, + }, + { + name: "few elements, many subsets", + countElements: 2_000, + countSubsets: 15, + countMerges: 14, + randSeed: 172486152531140600, + }, + { + name: "more elements, few subsets", + countElements: 5_000, + countSubsets: 4, + countMerges: 13, + randSeed: 17248615253114060, + }, + { + name: "more elements, many subsets", + countElements: 7_000, + countSubsets: 16, + countMerges: 12, + randSeed: 1724861525311406, + }, + { + name: "many elements, few subsets", + countElements: 19_000, + countSubsets: 5, + countMerges: 11, + randSeed: 172486152531140, + }, + { + name: "many elements, many subsets", + countElements: 25_000, + countSubsets: 18, + countMerges: 10, + randSeed: 17248615253114, + }, + } + + for _, tc := range testCases { + f.Add(tc.countElements, 
tc.countSubsets, tc.countMerges, tc.randSeed) + } + + f.Fuzz(runMergeToSuperSetTest) +} + +func TestMergeToSuperset_VerifyFuzzCallback(t *testing.T) { + runMergeToSuperSetTest(t, 23_456, 17, 9, 1724861525311) +} + +func runMergeToSuperSetTest(t *testing.T, + countElements int, countSubsets int, countMerges int, randSeed int64, +) { + if countElements < 100 || countElements > 50_000 { + return + } + if countSubsets < 1 || countSubsets > 25 { + return + } + if countMerges < 1 || countMerges > 50 { + return + } + + // max element is 3x bigger than capacity of single bm's container + maxX := (int(math.MaxUint16) + 1) * 3 + buffer := make([]uint16, maxContainerSize) + rnd := rand.New(rand.NewSource(randSeed)) + + superset := NewBitmap() + subsets := make([]*Bitmap, countSubsets) + var control *Bitmap + + t.Run("populate bitmaps", func(t *testing.T) { + for i := 0; i < countElements; i++ { + x := uint64(rnd.Intn(maxX)) + superset.Set(x) + } + + for i := range subsets { + subsets[i] = NewBitmap() + // each next subset bitmap contains fewer elements + // 1/2 of countElements, 1/3, 1/4, ... 
+ for j, c := 0, countElements/(i+2); j < c; j++ { + x := uint64(rnd.Intn(maxX)) + subsets[i].Set(x) + // ensure superset contains element of subset + superset.Set(x) + } + } + + control = superset.Clone() + }) + + for i := 0; i < countMerges; i++ { + t.Run("merge bitmaps", func(t *testing.T) { + id := rnd.Intn(len(subsets)) + subset := subsets[id] + + switch mergeType := rnd.Intn(3); mergeType { + case 1: + t.Run(fmt.Sprintf("AND with %d", id), func(t *testing.T) { + superset.AndToSuperset(subset, buffer) + control.And(subset) + assertMatches(t, superset, control) + }) + case 2: + t.Run(fmt.Sprintf("AND NOT with %d", id), func(t *testing.T) { + superset.AndNotToSuperset(subset, buffer) + control.AndNot(subset) + assertMatches(t, superset, control) + }) + default: + t.Run(fmt.Sprintf("OR with %d", id), func(t *testing.T) { + superset.OrToSuperset(subset, buffer) + control.Or(subset) + assertMatches(t, superset, control) + }) + } + }) + } +} + +func assertMatches(t *testing.T, bm1, bm2 *Bitmap) { + require.Equal(t, bm1.GetCardinality(), bm2.GetCardinality()) + + // check elements match using iterator as + // require.ElementsMatch(t, bm1.ToArray(), bm2.ToArray()) + // causes fuzz test to fail frequently + cit := bm1.NewIterator() + sit := bm2.NewIterator() + for { + cx := cit.Next() + sx := sit.Next() + require.Equal(t, cx, sx) + + if cx == 0 || sx == 0 { + break + } + } +} diff --git a/container_opt.go b/container_opt.go new file mode 100644 index 0000000..095d619 --- /dev/null +++ b/container_opt.go @@ -0,0 +1,150 @@ +package sroar + +import "math/bits" + +func containerAndToSuperset(ac, bc, buf []uint16) []uint16 { + at := ac[indexType] + bt := bc[indexType] + + if at == typeArray && bt == typeArray { + left := array(ac) + right := array(bc) + return left.andArrayToSuperset(right, buf) + } + if at == typeBitmap && bt == typeArray { + left := bitmap(ac) + right := array(bc) + return left.andArrayToSuperset(right, buf) + } + if at == typeBitmap && bt == typeBitmap { 
+ left := bitmap(ac) + right := bitmap(bc) + return left.andBitmapToSuperset(right) + } + panic("containerAndToSuperset: We should not reach here") +} + +func containerOrToSuperset(ac, bc, buf []uint16) []uint16 { + at := ac[indexType] + bt := bc[indexType] + + if at == typeArray && bt == typeArray { + left := array(ac) + right := array(bc) + return left.orArrayToSuperset(right, buf) + } + if at == typeBitmap && bt == typeArray { + left := bitmap(ac) + right := array(bc) + return left.orArray(right, buf, runInline) + } + if at == typeBitmap && bt == typeBitmap { + left := bitmap(ac) + right := bitmap(bc) + return left.orBitmapToSuperset(right) + } + panic("containerOrToSuperset: We should not reach here") +} + +func containerAndNotToSuperset(ac, bc, buf []uint16) []uint16 { + at := ac[indexType] + bt := bc[indexType] + + if at == typeArray && bt == typeArray { + left := array(ac) + right := array(bc) + return left.andNotArrayToSuperset(right, buf) + } + if at == typeBitmap && bt == typeArray { + left := bitmap(ac) + right := array(bc) + out := left.andNotArray(right) + return out + } + if at == typeBitmap && bt == typeBitmap { + left := bitmap(ac) + right := bitmap(bc) + return left.andNotBitmapToSuperset(right) + } + panic("containerAndNotToSuperset: We should not reach here") +} + +func (a array) andArrayToSuperset(other array, buf []uint16) []uint16 { + copy(buf, zeroContainer) + out := buf[:len(a)] + + num := intersection2by2(a.all(), other.all(), out[startIdx:]) + setCardinality(out, num) + copy(a[2:], out[2:]) + return a +} + +func (a array) orArrayToSuperset(other array, buf []uint16) []uint16 { + copy(buf, zeroContainer) + out := buf[:len(a)] + + num := union2by2(a.all(), other.all(), out[startIdx:]) + setCardinality(out, num) + copy(a[2:], out[2:]) + return a +} + +func (a array) andNotArrayToSuperset(other array, buf []uint16) []uint16 { + copy(buf, zeroContainer) + out := buf[:len(a)] + + andRes := array(a.andArray(other)).all() // TODO is andRes needed? 
+ num := difference(a.all(), andRes, out[startIdx:]) + setCardinality(out, int(num)) + copy(a[2:], out[2:]) + return a +} + +func (b bitmap) andBitmapToSuperset(other bitmap) []uint16 { + b64 := uint16To64Slice(b[startIdx:]) + o64 := uint16To64Slice(other[startIdx:]) + + var num int + for i := range b64 { + b64[i] &= o64[i] + num += bits.OnesCount64(b64[i]) + } + setCardinality(b, num) + return b +} + +func (b bitmap) orBitmapToSuperset(other bitmap) []uint16 { + if num := getCardinality(b); num == maxCardinality { + // do nothing. bitmap is already full. + return b + } + + b64 := uint16To64Slice(b[startIdx:]) + o64 := uint16To64Slice(other[startIdx:]) + + var num int + for i := range b64 { + b64[i] |= o64[i] + num += bits.OnesCount64(b64[i]) + } + setCardinality(b, num) + return b +} + +func (b bitmap) andArrayToSuperset(other array, buf []uint16) []uint16 { + otherb := other.toBitmapContainer(buf) + return b.andBitmapToSuperset(otherb) +} + +func (b bitmap) andNotBitmapToSuperset(other bitmap) []uint16 { + b64 := uint16To64Slice(b[startIdx:]) + o64 := uint16To64Slice(other[startIdx:]) + + var num int + for i := range b64 { + b64[i] &^= o64[i] + num += bits.OnesCount64(b64[i]) + } + setCardinality(b, num) + return b +} diff --git a/utils.go b/utils.go index b44bb34..eaba7f8 100644 --- a/utils.go +++ b/utils.go @@ -74,7 +74,7 @@ func toByteSlice(b []uint16) []byte { // they are pointer-based (unsafe). The caller is responsible to // ensure that the input slice does not get garbage collected, deleted // or modified while you hold the returned slince. 
// uint16To64Slice reinterprets the memory backing u16s as a []uint64 without
// copying. The result has len(u16s)/4 elements; trailing uint16 values that do
// not fill a whole uint64 are not covered. The returned slice aliases u16s, so
// a write through either slice is visible through the other.
//
// An input with fewer than four elements (including a nil or empty slice)
// yields nil instead of panicking.
//
// NOTE(review): the conversion assumes &u16s[0] is suitably aligned for
// uint64 access; how four uint16 values map onto one uint64 follows the
// machine's byte order.
func uint16To64Slice(u16s []uint16) []uint64 {
	n := len(u16s) / 4
	if n == 0 {
		// Also covers the empty input, for which &u16s[0] would panic.
		return nil
	}
	return unsafe.Slice((*uint64)(unsafe.Pointer(&u16s[0])), n)
}

// uint64To16Slice reinterprets the memory backing u64s as a []uint16 without
// copying. The result has len(u64s)*4 elements and aliases u64s, so a write
// through either slice is visible through the other.
//
// A nil or empty input yields nil instead of panicking.
func uint64To16Slice(u64s []uint64) []uint16 {
	if len(u64s) == 0 {
		// &u64s[0] would panic on an empty slice.
		return nil
	}
	return unsafe.Slice((*uint16)(unsafe.Pointer(&u64s[0])), len(u64s)*4)
}