Skip to content

Commit f7530de

Browse files
zeroshadepitrou
andauthored
GH-37712: [Go][Parquet] Fix ARM64 assembly for bitmap extract bits (#37785)
### Rationale for this change The linked issue discovered a problem with the ARM64 implementation of the extractBits functionality which was being hidden because until `go1.21`, it looks like `golang.org/x/sys/cpu` wasn't properly detecting the ASIMD bit flag on the ARM processors and so it was using the pure go implementation. So we fix the assembly and add a test which was able to reproduce the failure without the fix on an ARM64 machine. ### What changes are included in this PR? There was a problem with the assembly as it existed as the short circuit case where the selection bitmap is 0 wasn't actually setting the 0 value back onto the stack as a return. The assembly which initialized that register to 0 only occurred AFTER the `CBZ` instruction which would jump to the bottom and return. This means that we *actually* returned whatever happened to be on the stack at that location (i.e. garbage!). This only occurred if you were using the ARM64-specific assembly implementation, not the amd64 or pure go versions. We also include a test to ensure we get the correct values for a variety of bitmaps and selections, along with this ensuring there's enough data on the stack so that if this problem re-occured we would catch it in this test. ### Are these changes tested? Test is added. * Closes: #37712 Lead-authored-by: Matt Topol <[email protected]> Co-authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Matt Topol <[email protected]>
1 parent dd8734d commit f7530de

13 files changed

+7209
-3463
lines changed

go/parquet/internal/bmi/_lib/bitmap_bmi2.c

+6
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@
1717
#include <arch.h>
1818
#include <stdint.h>
1919

20+
#if !defined(__ARM_NEON) && !defined(__ARM_NEON__)
21+
// don't compile this for ARM, the pure go lookup table version
22+
// is more performant anyways since ARM doesn't have a BMI2/pext_u64
23+
// instruction we can call directly.
2024
uint64_t FULL_NAME(extract_bits)(uint64_t bitmap, uint64_t select_bitmap) {
2125
#if defined(__BMI2__)
2226
return (uint64_t)(_pext_u64(bitmap, select_bitmap));
@@ -32,6 +36,8 @@ uint64_t FULL_NAME(extract_bits)(uint64_t bitmap, uint64_t select_bitmap) {
3236
#endif
3337
}
3438

39+
#endif
40+
3541
uint64_t FULL_NAME(levels_to_bitmap)(const int16_t* levels, const int num_levels, const int16_t rhs) {
3642
uint64_t mask = 0;
3743
for (int x = 0; x < num_levels; x++) {

go/parquet/internal/bmi/_lib/bitmap_neon.s

-32
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

go/parquet/internal/bmi/bitmap_neon_arm64.go

+1-9
Original file line numberDiff line numberDiff line change
@@ -14,21 +14,13 @@
1414
// See the License for the specific language governing permissions and
1515
// limitations under the License.
1616

17+
//go:build !noasm
1718
// +build !noasm
1819

1920
package bmi
2021

2122
import "unsafe"
2223

23-
//go:noescape
24-
func _extract_bits_neon(bitmap, selectBitmap uint64) (res uint64)
25-
26-
// There is no single instruction similar with 'pext' on Arm64.
27-
// The equivalent of 'pext' was implemented with Arm64 Neon for Parallel Bits Extract
28-
func extractBitsNEON(bitmap, selectBitmap uint64) uint64 {
29-
return _extract_bits_neon(bitmap, selectBitmap)
30-
}
31-
3224
//go:noescape
3325
func _levels_to_bitmap_neon(levels unsafe.Pointer, numLevels int, rhs int16) (res uint64)
3426

go/parquet/internal/bmi/bitmap_neon_arm64.s

-30
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

go/parquet/internal/bmi/bmi_arm64.go

+28-21
Original file line numberDiff line numberDiff line change
@@ -14,44 +14,51 @@
1414
// See the License for the specific language governing permissions and
1515
// limitations under the License.
1616

17+
//go:build !noasm
1718
// +build !noasm
1819

1920
package bmi
2021

2122
import (
23+
"fmt"
2224
"os"
2325
"strings"
24-
)
25-
import (
26-
"golang.org/x/sys/cpu"
26+
27+
"github.com/klauspost/cpuid/v2"
2728
)
2829

2930
func init() {
30-
// Added ability to enable extension via environment:
31+
// Added ability to enable extension via environment:
3132
// ARM_ENABLE_EXT=NEON go test
3233
if ext, ok := os.LookupEnv("ARM_ENABLE_EXT"); ok {
33-
exts := strings.Split(ext, ",")
34-
35-
for _, x := range exts {
36-
switch x {
37-
case "NEON":
38-
cpu.ARM64.HasASIMD = true
39-
case "AES":
40-
cpu.ARM64.HasAES = true
41-
case "PMULL":
42-
cpu.ARM64.HasPMULL = true
43-
default:
44-
cpu.ARM64.HasASIMD = false
45-
cpu.ARM64.HasAES = false
46-
cpu.ARM64.HasPMULL = false
34+
if ext == "DISABLE" {
35+
cpuid.CPU.Disable(cpuid.ASIMD, cpuid.AESARM, cpuid.PMULL)
36+
} else {
37+
exts := strings.Split(ext, ",")
38+
39+
for _, x := range exts {
40+
switch x {
41+
case "NEON":
42+
cpuid.CPU.Enable(cpuid.ASIMD)
43+
case "AES":
44+
cpuid.CPU.Enable(cpuid.AESARM)
45+
case "PMULL":
46+
cpuid.CPU.Enable(cpuid.PMULL)
47+
default:
48+
fmt.Fprintln(os.Stderr, "unrecognized value for ARM_ENABLE_EXT:", x)
49+
}
4750
}
4851
}
4952
}
50-
if cpu.ARM64.HasASIMD {
51-
funclist.extractBits = extractBitsNEON
53+
54+
// after benchmarking, turns out the pure go lookup table version
55+
// is nearly twice as fast as the non-lookup table assembly
56+
// because arm doesn't have a PEXT instruction.
57+
funclist.extractBits = extractBitsGo
58+
59+
if cpuid.CPU.Has(cpuid.ASIMD) {
5260
funclist.gtbitmap = greaterThanBitmapNEON
5361
} else {
54-
funclist.extractBits = extractBitsGo
5562
funclist.gtbitmap = greaterThanBitmapGo
5663
}
5764
}

go/parquet/internal/bmi/bmi_test.go

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing, software
12+
// distributed under the License is distributed on an "AS IS" BASIS,
13+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
// See the License for the specific language governing permissions and
15+
// limitations under the License.
16+
17+
package bmi_test
18+
19+
import (
20+
"fmt"
21+
"testing"
22+
23+
"github.com/apache/arrow/go/v14/parquet/internal/bmi"
24+
"github.com/stretchr/testify/assert"
25+
)
26+
27+
// Testing the issue in GH-37712
28+
func TestBasicExtractBits(t *testing.T) {
29+
tests := []struct {
30+
bitmap, selection uint64
31+
expected uint64
32+
}{
33+
{0, 0, 0},
34+
{0xFF, 0, 0},
35+
{0xFF, ^uint64(0), 0xFF},
36+
{0xFF00FF, 0xAAAA, 0x000F},
37+
{0xFF0AFF, 0xAFAA, 0x00AF},
38+
{0xFFAAFF, 0xAFAA, 0x03AF},
39+
{0xFECBDA9876543210, 0xF00FF00FF00FF00F, 0xFBD87430},
40+
}
41+
42+
for _, tt := range tests {
43+
t.Run(fmt.Sprintf("%d-%d=>%d", tt.bitmap, tt.selection, tt.expected), func(t *testing.T) {
44+
assert.Equal(t, tt.expected, bmi.ExtractBits(tt.bitmap, tt.selection))
45+
})
46+
}
47+
}

go/parquet/internal/utils/Makefile

+12-6
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,14 @@ C_FLAGS_NEON=-O3 -fvectorize -mllvm -force-vector-width=16 -fno-asynchronous-unw
3232
GO_SOURCES := $(shell find . -path ./_lib -prune -o -name '*.go' -not -name '*_test.go')
3333
ALL_SOURCES := $(shell find . -path ./_lib -prune -o -name '*.go' -name '*.s' -not -name '*_test.go')
3434

35-
.PHONEY: assembly
35+
.PHONY: assembly
3636

3737
INTEL_SOURCES := \
38-
bit_packing_avx2.s \
39-
unpack_bool_avx2.s unpack_bool_sse4.s
38+
bit_packing_avx2_amd64.s \
39+
unpack_bool_avx2_amd64.s unpack_bool_sse4_amd64.s
40+
41+
ARM_SOURCES := \
42+
bit_packing_neon_arm64.s unpack_bool_neon_arm64.s
4043

4144
#
4245
# ARROW-15336: DO NOT add the assembly target for Arm64 (ARM_SOURCES) until c2goasm added the Arm64 support.
@@ -58,13 +61,16 @@ _lib/unpack_bool_sse4.s: _lib/unpack_bool.c
5861
_lib/unpack_bool_neon.s: _lib/unpack_bool.c
5962
$(CC) -S $(C_FLAGS_NEON) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
6063

61-
bit_packing_avx2.s: _lib/bit_packing_avx2.s
64+
_lib/bit_packing_neon.s: _lib/bit_packing_neon.c
65+
$(CC) -S $(C_FLAGS_NEON) $^ -o $@
66+
67+
bit_packing_avx2_amd64.s: _lib/bit_packing_avx2.s
6268
$(C2GOASM) -a -f $^ $@
6369

64-
unpack_bool_avx2.s: _lib/unpack_bool_avx2.s
70+
unpack_bool_avx2_amd64.s: _lib/unpack_bool_avx2.s
6571
$(C2GOASM) -a -f $^ $@
6672

67-
unpack_bool_sse4.s: _lib/unpack_bool_sse4.s
73+
unpack_bool_sse4_amd64.s: _lib/unpack_bool_sse4.s
6874
$(C2GOASM) -a -f $^ $@
6975

7076
clean:

0 commit comments

Comments
 (0)