Skip to content

Commit

Permalink
apacheGH-37712: [Go][Parquet] Fix ARM64 assembly for bitmap extract b…
Browse files Browse the repository at this point in the history
…its (apache#37785)

### Rationale for this change
The linked issue discovered a problem with the ARM64 implementation of the extractBits functionality which was being hidden because until `go1.21`, it looks like `golang.org/x/sys/cpu` wasn't properly detecting the ASIMD bit flag on the ARM processors and so it was using the pure go implementation. So we fix the assembly and add a test which was able to reproduce the failure without the fix on an ARM64 machine. 

### What changes are included in this PR?
There was a problem with the assembly as it existed as the short circuit case where the selection bitmap is 0 wasn't actually setting the 0 value back onto the stack as a return. The assembly which initialized that register to 0 only occurred AFTER the `CBZ` instruction which would jump to the bottom and return. This means that we *actually* returned whatever happened to be on the stack at that location (i.e. garbage!). This only occurred if you were using the ARM64-specific assembly implementation, not the amd64 or pure go versions.

We also include a test to ensure we get the correct values for a variety of bitmaps and selections, along with this ensuring there's enough data on the stack so that if this problem re-occured we would catch it in this test.

### Are these changes tested?
Test is added.

* Closes: apache#37712

Lead-authored-by: Matt Topol <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Matt Topol <[email protected]>
  • Loading branch information
2 people authored and loicalleyne committed Nov 13, 2023
1 parent 4503fb0 commit c227fd9
Show file tree
Hide file tree
Showing 13 changed files with 7,209 additions and 3,463 deletions.
6 changes: 6 additions & 0 deletions go/parquet/internal/bmi/_lib/bitmap_bmi2.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
#include <arch.h>
#include <stdint.h>

#if !defined(__ARM_NEON) && !defined(__ARM_NEON__)
// don't compile this for ARM, the pure go lookup table version
// is more performant anyways since ARM doesn't have a BMI2/pext_u64
// instruction we can call directly.
uint64_t FULL_NAME(extract_bits)(uint64_t bitmap, uint64_t select_bitmap) {
#if defined(__BMI2__)
return (uint64_t)(_pext_u64(bitmap, select_bitmap));
Expand All @@ -32,6 +36,8 @@ uint64_t FULL_NAME(extract_bits)(uint64_t bitmap, uint64_t select_bitmap) {
#endif
}

#endif

uint64_t FULL_NAME(levels_to_bitmap)(const int16_t* levels, const int num_levels, const int16_t rhs) {
uint64_t mask = 0;
for (int x = 0; x < num_levels; x++) {
Expand Down
32 changes: 0 additions & 32 deletions go/parquet/internal/bmi/_lib/bitmap_neon.s

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 1 addition & 9 deletions go/parquet/internal/bmi/bitmap_neon_arm64.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build !noasm
// +build !noasm

package bmi

import "unsafe"

//go:noescape
func _extract_bits_neon(bitmap, selectBitmap uint64) (res uint64)

// There is no single instruction similar with 'pext' on Arm64.
// The equivalent of 'pext' was implemented with Arm64 Neon for Parallel Bits Extract
func extractBitsNEON(bitmap, selectBitmap uint64) uint64 {
return _extract_bits_neon(bitmap, selectBitmap)
}

//go:noescape
func _levels_to_bitmap_neon(levels unsafe.Pointer, numLevels int, rhs int16) (res uint64)

Expand Down
30 changes: 0 additions & 30 deletions go/parquet/internal/bmi/bitmap_neon_arm64.s

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

49 changes: 28 additions & 21 deletions go/parquet/internal/bmi/bmi_arm64.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,44 +14,51 @@
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build !noasm
// +build !noasm

package bmi

import (
"fmt"
"os"
"strings"
)
import (
"golang.org/x/sys/cpu"

"github.com/klauspost/cpuid/v2"
)

func init() {
// Added ability to enable extension via environment:
// Added ability to enable extension via environment:
// ARM_ENABLE_EXT=NEON go test
if ext, ok := os.LookupEnv("ARM_ENABLE_EXT"); ok {
exts := strings.Split(ext, ",")

for _, x := range exts {
switch x {
case "NEON":
cpu.ARM64.HasASIMD = true
case "AES":
cpu.ARM64.HasAES = true
case "PMULL":
cpu.ARM64.HasPMULL = true
default:
cpu.ARM64.HasASIMD = false
cpu.ARM64.HasAES = false
cpu.ARM64.HasPMULL = false
if ext == "DISABLE" {
cpuid.CPU.Disable(cpuid.ASIMD, cpuid.AESARM, cpuid.PMULL)
} else {
exts := strings.Split(ext, ",")

for _, x := range exts {
switch x {
case "NEON":
cpuid.CPU.Enable(cpuid.ASIMD)
case "AES":
cpuid.CPU.Enable(cpuid.AESARM)
case "PMULL":
cpuid.CPU.Enable(cpuid.PMULL)
default:
fmt.Fprintln(os.Stderr, "unrecognized value for ARM_ENABLE_EXT:", x)
}
}
}
}
if cpu.ARM64.HasASIMD {
funclist.extractBits = extractBitsNEON

// after benchmarking, turns out the pure go lookup table version
// is nearly twice as fast as the non-lookup table assembly
// because arm doesn't have a PEXT instruction.
funclist.extractBits = extractBitsGo

if cpuid.CPU.Has(cpuid.ASIMD) {
funclist.gtbitmap = greaterThanBitmapNEON
} else {
funclist.extractBits = extractBitsGo
funclist.gtbitmap = greaterThanBitmapGo
}
}
47 changes: 47 additions & 0 deletions go/parquet/internal/bmi/bmi_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package bmi_test

import (
"fmt"
"testing"

"github.com/apache/arrow/go/v14/parquet/internal/bmi"
"github.com/stretchr/testify/assert"
)

// Testing the issue in GH-37712
func TestBasicExtractBits(t *testing.T) {
tests := []struct {
bitmap, selection uint64
expected uint64
}{
{0, 0, 0},
{0xFF, 0, 0},
{0xFF, ^uint64(0), 0xFF},
{0xFF00FF, 0xAAAA, 0x000F},
{0xFF0AFF, 0xAFAA, 0x00AF},
{0xFFAAFF, 0xAFAA, 0x03AF},
{0xFECBDA9876543210, 0xF00FF00FF00FF00F, 0xFBD87430},
}

for _, tt := range tests {
t.Run(fmt.Sprintf("%d-%d=>%d", tt.bitmap, tt.selection, tt.expected), func(t *testing.T) {
assert.Equal(t, tt.expected, bmi.ExtractBits(tt.bitmap, tt.selection))
})
}
}
18 changes: 12 additions & 6 deletions go/parquet/internal/utils/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,14 @@ C_FLAGS_NEON=-O3 -fvectorize -mllvm -force-vector-width=16 -fno-asynchronous-unw
GO_SOURCES := $(shell find . -path ./_lib -prune -o -name '*.go' -not -name '*_test.go')
ALL_SOURCES := $(shell find . -path ./_lib -prune -o -name '*.go' -name '*.s' -not -name '*_test.go')

.PHONEY: assembly
.PHONY: assembly

INTEL_SOURCES := \
bit_packing_avx2.s \
unpack_bool_avx2.s unpack_bool_sse4.s
bit_packing_avx2_amd64.s \
unpack_bool_avx2_amd64.s unpack_bool_sse4_amd64.s

ARM_SOURCES := \
bit_packing_neon_arm64.s unpack_bool_neon_arm64.s

#
# ARROW-15336: DO NOT add the assembly target for Arm64 (ARM_SOURCES) until c2goasm added the Arm64 support.
Expand All @@ -58,13 +61,16 @@ _lib/unpack_bool_sse4.s: _lib/unpack_bool.c
_lib/unpack_bool_neon.s: _lib/unpack_bool.c
$(CC) -S $(C_FLAGS_NEON) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@

bit_packing_avx2.s: _lib/bit_packing_avx2.s
_lib/bit_packing_neon.s: _lib/bit_packing_neon.c
$(CC) -S $(C_FLAGS_NEON) $^ -o $@

bit_packing_avx2_amd64.s: _lib/bit_packing_avx2.s
$(C2GOASM) -a -f $^ $@

unpack_bool_avx2.s: _lib/unpack_bool_avx2.s
unpack_bool_avx2_amd64.s: _lib/unpack_bool_avx2.s
$(C2GOASM) -a -f $^ $@

unpack_bool_sse4.s: _lib/unpack_bool_sse4.s
unpack_bool_sse4_amd64.s: _lib/unpack_bool_sse4.s
$(C2GOASM) -a -f $^ $@

clean:
Expand Down
Loading

0 comments on commit c227fd9

Please sign in to comment.