Skip to content

Commit

Permalink
Loongarch: modify lsx optimization(25215PR) for newest branch
Browse files Browse the repository at this point in the history
  • Loading branch information
pengxu committed Dec 20, 2024
1 parent 35b2c4a commit 7c35c37
Show file tree
Hide file tree
Showing 14 changed files with 110 additions and 15 deletions.
1 change: 1 addition & 0 deletions meson.options
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ option('test-simd', type: 'array',
'VSX', 'VSX2', 'VSX3', 'VSX4',
'NEON', 'ASIMD',
'VX', 'VXE', 'VXE2',
'LSX',
],
description: 'Specify a list of CPU features to be tested against NumPy SIMD interface')
option('test-simd-args', type: 'string', value: '',
Expand Down
8 changes: 8 additions & 0 deletions meson_cpu/loongarch64/meson.build
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# CPU feature definitions for the LoongArch64 family.
source_root = meson.project_source_root()
mod_features = import('features')

# LSX is enabled with the `-mlsx` compiler flag and is probed with the
# check program numpy/distutils/checks/cpu_lsx.c.
LSX = mod_features.new(
  'LSX',
  1,
  args: ['-mlsx'],
  test_code: files(source_root + '/numpy/distutils/checks/cpu_lsx.c')[0],
)

# Name -> feature object mapping for this CPU family.
LOONGARCH64_FEATURES = {
  'LSX': LSX,
}
4 changes: 4 additions & 0 deletions meson_cpu/main_config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -389,4 +389,8 @@
#ifdef @P@HAVE_RVV
#include <riscv_vector.h>
#endif

#ifdef @P@HAVE_LSX
#include <lsxintrin.h>
#endif
#endif // @P@_CPU_DISPATCHER_CONF_H_
4 changes: 4 additions & 0 deletions meson_cpu/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,15 @@ subdir('ppc64')
subdir('s390x')
subdir('arm')
subdir('riscv64')
subdir('loongarch64')

CPU_FEATURES = {}
CPU_FEATURES += ARM_FEATURES
CPU_FEATURES += X86_FEATURES
CPU_FEATURES += PPC64_FEATURES
CPU_FEATURES += S390X_FEATURES
CPU_FEATURES += RV64_FEATURES
CPU_FEATURES += LOONGARCH64_FEATURES

# Parse the requested baseline (CPU_CONF_BASELINE) and dispatch features
# (CPU_CONF_DISPATCH).
Expand All @@ -97,6 +99,7 @@ min_features = {
'aarch64': [ASIMD],
'riscv64': [],
'wasm32': [],
'loongarch64': [LSX],
}.get(cpu_family, [])
if host_machine.endian() == 'little' and cpu_family == 'ppc64'
min_features = [VSX2]
Expand All @@ -112,6 +115,7 @@ max_features_dict = {
'aarch64': ARM_FEATURES,
'riscv64': RV64_FEATURES,
'wasm32': {},
'loongarch64': LOONGARCH64_FEATURES,
}.get(cpu_family, {})
max_features = []
foreach fet_name, fet_obj : max_features_dict
Expand Down
4 changes: 2 additions & 2 deletions numpy/_core/include/numpy/npy_cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,8 @@
#elif __riscv_xlen == 32
#define NPY_CPU_RISCV32
#endif
#elif defined(__loongarch__)
#define NPY_CPU_LOONGARCH
#elif defined(__loongarch64)
#define NPY_CPU_LOONGARCH64
#elif defined(__EMSCRIPTEN__)
/* __EMSCRIPTEN__ is defined by emscripten: an LLVM-to-Web compiler */
#define NPY_CPU_WASM
Expand Down
22 changes: 19 additions & 3 deletions numpy/_core/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,10 @@ if use_svml
endif
endif

# When the host CPU family is loongarch64, define HWY_COMPILE_ONLY_SCALAR for
# every C++ compilation in the project.
# NOTE(review): presumably this limits Highway to its scalar code paths
# because no LSX-capable Highway target is available here -- confirm.
if host_machine.cpu_family() == 'loongarch64'
add_project_arguments(['-DHWY_COMPILE_ONLY_SCALAR'], language: ['cpp'])
endif

use_highway = not get_option('disable-highway')
if use_highway and not fs.exists('src/highway/README.md')
error('Missing the `highway` git submodule! Run `git submodule update --init` to fix this.')
Expand Down Expand Up @@ -880,6 +884,7 @@ foreach gen_mtargets : [
ASIMD, NEON,
VSX3, VSX2,
VXE, VX,
LSX,
]
],
[
Expand All @@ -890,6 +895,7 @@ foreach gen_mtargets : [
NEON,
VSX4, VSX2,
VX,
LSX,
]
],
[
Expand All @@ -900,6 +906,7 @@ foreach gen_mtargets : [
VSX3, VSX2,
NEON,
VXE, VX,
LSX,
]
],
[
Expand All @@ -916,7 +923,8 @@ foreach gen_mtargets : [
AVX512_SKX, [AVX2, FMA3],
VSX4, VSX2,
NEON_VFPV4,
VXE
VXE,
LSX,
]
],
[
Expand All @@ -927,6 +935,7 @@ foreach gen_mtargets : [
AVX512_SKX, AVX2, SSE2,
VSX2,
VX,
LSX,
]
],
[
Expand All @@ -937,6 +946,7 @@ foreach gen_mtargets : [
AVX512_SKX, AVX2, SSE2,
VSX2,
VXE, VX,
LSX,
]
],
[
Expand All @@ -954,6 +964,7 @@ foreach gen_mtargets : [
VSX4, VSX3, VSX2,
NEON_VFPV4,
VXE2, VXE,
LSX,
]
],
[
Expand All @@ -968,7 +979,8 @@ foreach gen_mtargets : [
ASIMD, NEON,
AVX512_SKX, AVX2, SSE2,
VSX2,
VXE, VX
VXE, VX,
LSX,
]
],
[
Expand All @@ -978,7 +990,8 @@ foreach gen_mtargets : [
SSE41, SSE2,
VSX2,
ASIMD, NEON,
VXE, VX
VXE, VX,
LSX,
]
],
[
Expand All @@ -988,6 +1001,7 @@ foreach gen_mtargets : [
SSE41, SSE2,
VSX2,
ASIMD, NEON,
LSX,
]
],
[
Expand All @@ -998,6 +1012,7 @@ foreach gen_mtargets : [
ASIMD, NEON,
VSX3, VSX2,
VXE, VX,
LSX,
]
],
[
Expand All @@ -1008,6 +1023,7 @@ foreach gen_mtargets : [
NEON,
VSX2,
VX,
LSX,
]
],
]
Expand Down
22 changes: 21 additions & 1 deletion numpy/_core/src/common/npy_cpu_features.c
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,8 @@ static struct {
{NPY_CPU_FEATURE_ASIMDDP, "ASIMDDP"},
{NPY_CPU_FEATURE_ASIMDFHM, "ASIMDFHM"},
{NPY_CPU_FEATURE_SVE, "SVE"},
{NPY_CPU_FEATURE_RVV, "RVV"}};
{NPY_CPU_FEATURE_RVV, "RVV"},
{NPY_CPU_FEATURE_LSX, "LSX"}};


NPY_VISIBILITY_HIDDEN PyObject *
Expand Down Expand Up @@ -665,6 +666,25 @@ npy__cpu_init_features(void)
npy__cpu_have[NPY_CPU_FEATURE_VX] = 1;
}

/***************** LoongArch ******************/

#elif defined(__loongarch64)

#include <sys/auxv.h>
#include <asm/hwcap.h>

/*
 * Runtime CPU feature detection for LoongArch.
 *
 * Clears the whole `npy__cpu_have` table, then reads the AT_HWCAP entry of
 * the auxiliary vector and marks NPY_CPU_FEATURE_LSX as available when the
 * HWCAP_LOONGARCH_LSX bit is set.
 */
static void
npy__cpu_init_features(void)
{
    memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX);

    // getauxval() returns `unsigned long`; keep the full width instead of
    // narrowing the capability mask to `unsigned int`.
    unsigned long hwcap = getauxval(AT_HWCAP);

    if (hwcap & HWCAP_LOONGARCH_LSX) {
        npy__cpu_have[NPY_CPU_FEATURE_LSX] = 1;
    }
}


/***************** ARM ******************/

Expand Down
11 changes: 7 additions & 4 deletions numpy/_core/src/common/npy_cpu_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ enum npy_cpu_features

// IBM/ZARCH
NPY_CPU_FEATURE_VX = 350,

// Vector-Enhancements Facility 1
NPY_CPU_FEATURE_VXE = 351,

Expand All @@ -101,6 +101,9 @@ enum npy_cpu_features
// RISC-V
NPY_CPU_FEATURE_RVV = 400,

// LOONGARCH
NPY_CPU_FEATURE_LSX = 500,

NPY_CPU_FEATURE_MAX
};

Expand All @@ -113,7 +116,7 @@ enum npy_cpu_features
* - uses 'NPY_DISABLE_CPU_FEATURES' to disable dispatchable features
* - uses 'NPY_ENABLE_CPU_FEATURES' to enable dispatchable features
*
* It will set a RuntimeError when
* It will set a RuntimeError when
* - CPU baseline features from the build are not supported at runtime
* - 'NPY_DISABLE_CPU_FEATURES' tries to disable a baseline feature
* - 'NPY_DISABLE_CPU_FEATURES' and 'NPY_ENABLE_CPU_FEATURES' are
Expand All @@ -122,14 +125,14 @@ enum npy_cpu_features
* by the machine or build
* - 'NPY_ENABLE_CPU_FEATURES' tries to enable a feature when the project was
* not built with any feature optimization support
*
*
* It will set an ImportWarning when:
* - 'NPY_DISABLE_CPU_FEATURES' tries to disable a feature that is not supported
* by the machine or build
* - 'NPY_DISABLE_CPU_FEATURES' or 'NPY_ENABLE_CPU_FEATURES' tries to
* disable/enable a feature when the project was not built with any feature
* optimization support
*
*
* return 0 on success otherwise return -1
*/
NPY_VISIBILITY_HIDDEN int
Expand Down
21 changes: 20 additions & 1 deletion numpy/_core/src/common/simd/intdiv.h
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,10 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d)
divisor.val[0] = npyv_setall_u8(m);
divisor.val[1] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh1));
divisor.val[2] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh2));
#elif defined(NPY_HAVE_LSX)
divisor.val[0] = npyv_setall_u16(m);
divisor.val[1] = npyv_setall_u8(sh1);
divisor.val[2] = npyv_setall_u8(sh2);
#else
#error "please initialize the shifting operand for the new architecture"
#endif
Expand All @@ -225,7 +229,7 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d)
NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d);
NPY_FINLINE npyv_s8x3 npyv_divisor_s8(npy_int8 d)
{
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
#if defined(NPY_HAVE_SSE2) || defined(NPY_HAVE_LSX) // SSE/AVX2/AVX512
npyv_s16x3 p = npyv_divisor_s16(d);
npyv_s8x3 r;
r.val[0] = npyv_reinterpret_s8_s16(p.val[0]);
Expand Down Expand Up @@ -291,6 +295,9 @@ NPY_FINLINE npyv_u16x3 npyv_divisor_u16(npy_uint16 d)
#elif defined(NPY_HAVE_NEON)
divisor.val[1] = npyv_reinterpret_u16_s16(npyv_setall_s16(-sh1));
divisor.val[2] = npyv_reinterpret_u16_s16(npyv_setall_s16(-sh2));
#elif defined(NPY_HAVE_LSX)
divisor.val[1] = npyv_setall_u16(sh1);
divisor.val[2] = npyv_setall_u16(sh2);
#else
#error "please initialize the shifting operand for the new architecture"
#endif
Expand Down Expand Up @@ -321,6 +328,8 @@ NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d)
divisor.val[1] = npyv_setall_s16(sh);
#elif defined(NPY_HAVE_NEON)
divisor.val[1] = npyv_setall_s16(-sh);
#elif defined(NPY_HAVE_LSX)
divisor.val[1] = npyv_setall_s16(sh);
#else
#error "please initialize the shifting operand for the new architecture"
#endif
Expand Down Expand Up @@ -358,6 +367,9 @@ NPY_FINLINE npyv_u32x3 npyv_divisor_u32(npy_uint32 d)
#elif defined(NPY_HAVE_NEON)
divisor.val[1] = npyv_reinterpret_u32_s32(npyv_setall_s32(-sh1));
divisor.val[2] = npyv_reinterpret_u32_s32(npyv_setall_s32(-sh2));
#elif defined(NPY_HAVE_LSX)
divisor.val[1] = npyv_setall_u32(sh1);
divisor.val[2] = npyv_setall_u32(sh2);
#else
#error "please initialize the shifting operand for the new architecture"
#endif
Expand Down Expand Up @@ -393,6 +405,8 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d)
divisor.val[1] = npyv_setall_s32(sh);
#elif defined(NPY_HAVE_NEON)
divisor.val[1] = npyv_setall_s32(-sh);
#elif defined(NPY_HAVE_LSX)
divisor.val[1] = npyv_setall_s32(sh);
#else
#error "please initialize the shifting operand for the new architecture"
#endif
Expand Down Expand Up @@ -427,6 +441,9 @@ NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d)
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
divisor.val[1] = npyv_set_u64(sh1);
divisor.val[2] = npyv_set_u64(sh2);
#elif defined(NPY_HAVE_LSX)
divisor.val[1] = npyv_setall_u64(sh1);
divisor.val[2] = npyv_setall_u64(sh2);
#else
#error "please initialize the shifting operand for the new architecture"
#endif
Expand Down Expand Up @@ -465,6 +482,8 @@ NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d)
divisor.val[2] = npyv_setall_s64(d < 0 ? -1 : 0); // sign of divisor
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
divisor.val[1] = npyv_set_s64(sh);
#elif defined(NPY_HAVE_LSX)
divisor.val[1] = npyv_setall_s64(sh);
#else
#error "please initialize the shifting operand for the new architecture"
#endif
Expand Down
4 changes: 2 additions & 2 deletions numpy/_core/src/umath/loops_arithmetic.dispatch.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
* q = TRUNC((n - (-dsign ) + (-nsign))/d) - (-qsign);
********************************************************************************/

#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON)
#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX)
// Due to integer 128-bit multiplication emulation, SIMD 64-bit division
// may not perform well on NEON, LSX, and VSX (up to VSX3) compared to
// scalar division.
Expand Down Expand Up @@ -452,7 +452,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed)
* Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles.
* Power10(VSX4) is an exception here since it has native support for integer vector division.
*/
#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX))
#undef TO_SIMD_SFX
#endif
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
Expand Down
2 changes: 1 addition & 1 deletion numpy/_core/tests/test_cpu_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def test_dispatcher():
"SSE2", "SSE41", "AVX2",
"VSX", "VSX2", "VSX3",
"NEON", "ASIMD", "ASIMDHP",
"VX", "VXE"
"VX", "VXE", "LSX"
)
highest_sfx = "" # no suffix for the baseline
all_sfx = []
Expand Down
9 changes: 9 additions & 0 deletions numpy/_core/tests/test_cpu_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,3 +420,12 @@ def load_flags(self):
# if the kernel reports any one of the following ARM8 features.
"ASIMD": ("AES", "SHA1", "SHA2", "PMULL", "CRC32")
}


# Match object (truthy) when `machine` starts with "loongarch", ignoring case;
# None otherwise.
is_loongarch = re.match("^(loongarch)", machine, re.IGNORECASE)


@pytest.mark.skipif(
    not (is_linux and is_loongarch),
    reason="Only for Linux and LoongArch",
)
class Test_LOONGARCH_Features(AbstractTest):
    """Feature checks for LoongArch; skipped unless on Linux + LoongArch."""

    # Feature names this test class covers.
    features = ["LSX"]

    def load_flags(self):
        """Load the flags through `load_flags_cpuinfo` using the "Features" key."""
        self.load_flags_cpuinfo("Features")
2 changes: 1 addition & 1 deletion numpy/distutils/ccompiler_opt.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ class _Config:
## ARMv8.2 dot product
ASIMDDP = dict(interest=6, implies="ASIMD"),
## ARMv8.2 Single & half-precision Multiply
ASIMDFHM = dict(interest=7, implies="ASIMDHP"),
ASIMDFHM = dict(interest=7, implies="ASIMDHP")
)
def conf_features_partial(self):
"""Return a dictionary of supported CPU features by the platform,
Expand Down
Loading

0 comments on commit 7c35c37

Please sign in to comment.