From 7c45ae85c49b3b9572d598feae19b7e1fe01106e Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Mon, 19 Aug 2024 08:24:09 +0530 Subject: [PATCH 01/16] Convert arithmetic from C universal intrinsics to C++ using Highway --- numpy/_core/meson.build | 2 +- .../src/umath/loops_arithmetic.dispatch.c.src | 521 ------------------ .../src/umath/loops_arithmetic.dispatch.cpp | 369 +++++++++++++ 3 files changed, 370 insertions(+), 522 deletions(-) delete mode 100644 numpy/_core/src/umath/loops_arithmetic.dispatch.c.src create mode 100644 numpy/_core/src/umath/loops_arithmetic.dispatch.cpp diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index b4c769810ad8..13d0702dd97c 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -932,7 +932,7 @@ foreach gen_mtargets : [ ], [ 'loops_arithmetic.dispatch.h', - src_file.process('src/umath/loops_arithmetic.dispatch.c.src'), + 'src/umath/loops_arithmetic.dispatch.cpp', [ AVX512_SKX, AVX512F, AVX2, SSE41, SSE2, NEON, diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src deleted file mode 100644 index c9efe5579e71..000000000000 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src +++ /dev/null @@ -1,521 +0,0 @@ -#define _UMATHMODULE -#define _MULTIARRAYMODULE -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#include "simd/simd.h" -#include "loops_utils.h" -#include "loops.h" -#include "lowlevel_strided_loops.h" -// Provides the various *_LOOP macros -#include "fast_loop_macros.h" - -//############################################################################### -//## Division -//############################################################################### -/******************************************************************************** - ** Defining the SIMD kernels - * - * Floor division of signed is based on T. Granlund and P. L. Montgomery - * "Division by invariant integers using multiplication(see [Figure 6.1] - * https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556)" - * For details on TRUNC division see simd/intdiv.h for more clarification - *********************************************************************************** - ** Figure 6.1: Signed division by run-time invariant divisor, rounded towards -INF - *********************************************************************************** - * For q = FLOOR(a/d), all sword: - * sword -dsign = SRL(d, N - 1); - * uword -nsign = (n < -dsign); - * uword -qsign = EOR(-nsign, -dsign); - * q = TRUNC((n - (-dsign ) + (-nsign))/d) - (-qsign); - ********************************************************************************/ - -#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX) - // Due to integer 128-bit multiplication emulation, SIMD 64-bit division - // may not perform well on both neon and up to VSX3 compared to scalar - // division. 
- #define SIMD_DISABLE_DIV64_OPT -#endif - -#if NPY_SIMD -/**begin repeat - * Signed types - * #sfx = s8, s16, s32, s64# - * #len = 8, 16, 32, 64# - */ -#if @len@ < 64 || (@len@ == 64 && !defined(SIMD_DISABLE_DIV64_OPT)) -static inline void -simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2]; - const int vstep = npyv_nlanes_@sfx@; - const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); - - if (scalar == -1) { - npyv_b@len@ noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1)); - const npyv_@sfx@ vzero = npyv_zero_@sfx@(); - const npyv_@sfx@ vmin = npyv_setall_@sfx@(NPY_MIN_INT@len@); - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_@sfx@ a = npyv_load_@sfx@(src); - npyv_b@len@ gt_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@)); - noverflow = npyv_and_b@len@(noverflow, gt_min); - npyv_@sfx@ neg = npyv_ifsub_@sfx@(gt_min, vzero, a, vmin); - npyv_store_@sfx@(dst, neg); - } - - int raise_err = npyv_tobits_b@len@(npyv_not_b@len@(noverflow)) != 0; - for (; len > 0; --len, ++src, ++dst) { - npyv_lanetype_@sfx@ a = *src; - if (a == NPY_MIN_INT@len@) { - raise_err = 1; - *dst = NPY_MIN_INT@len@; - } else { - *dst = -a; - } - } - if (raise_err) { - npy_set_floatstatus_overflow(); - } - } else { - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_@sfx@ nsign_d = npyv_setall_@sfx@(scalar < 0); - npyv_@sfx@ a = npyv_load_@sfx@(src); - npyv_@sfx@ nsign_a = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d)); - nsign_a = npyv_and_@sfx@(nsign_a, npyv_setall_@sfx@(1)); - npyv_@sfx@ diff_sign = npyv_sub_@sfx@(nsign_a, nsign_d); - npyv_@sfx@ to_ninf = npyv_xor_@sfx@(nsign_a, nsign_d); - npyv_@sfx@ trunc = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor); - npyv_@sfx@ floor = npyv_sub_@sfx@(trunc, to_ninf); - npyv_store_@sfx@(dst, floor); - } - - for (; len > 0; --len, ++src, ++dst) { - const npyv_lanetype_@sfx@ a = *src; - npyv_lanetype_@sfx@ r = a / scalar; - // Negative quotients needs to be rounded down - if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) { - r--; - } - *dst = r; - } - } - npyv_cleanup(); -} -#endif -/**end repeat**/ - -/**begin repeat - * Unsigned types - * #sfx = u8, u16, u32, u64# - * #len = 8, 16, 32, 64# - */ -#if @len@ < 64 || (@len@ == 64 && !defined(SIMD_DISABLE_DIV64_OPT)) -static inline void -simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2]; - const int vstep = npyv_nlanes_@sfx@; - const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); - - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_@sfx@ a = npyv_load_@sfx@(src); - npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor); - npyv_store_@sfx@(dst, c); - } - - for (; len > 0; --len, ++src, ++dst) { - const npyv_lanetype_@sfx@ a = *src; - *dst = a / scalar; - } - npyv_cleanup(); -} -#endif -/**end repeat**/ - -#if defined(NPY_HAVE_VSX4) - -/**begin repeat - * #t = u, s# - * #signed = 0, 1# - */ -/* - * Computes division of 2 8-bit signed/unsigned integer vectors - * - * As Power10 only supports integer vector division for data of 32 bits or - * greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer - * vector division 
instruction, and then, convert the result back to npyv_u8. - */ -NPY_FINLINE npyv_@t@8 -vsx4_div_@t@8(npyv_@t@8 a, npyv_@t@8 b) -{ -#if @signed@ - npyv_s16x2 ta, tb; - npyv_s32x2 ahi, alo, bhi, blo; - ta.val[0] = vec_unpackh(a); - ta.val[1] = vec_unpackl(a); - tb.val[0] = vec_unpackh(b); - tb.val[1] = vec_unpackl(b); - ahi.val[0] = vec_unpackh(ta.val[0]); - ahi.val[1] = vec_unpackl(ta.val[0]); - alo.val[0] = vec_unpackh(ta.val[1]); - alo.val[1] = vec_unpackl(ta.val[1]); - bhi.val[0] = vec_unpackh(tb.val[0]); - bhi.val[1] = vec_unpackl(tb.val[0]); - blo.val[0] = vec_unpackh(tb.val[1]); - blo.val[1] = vec_unpackl(tb.val[1]); -#else - npyv_u16x2 a_expand = npyv_expand_u16_u8(a); - npyv_u16x2 b_expand = npyv_expand_u16_u8(b); - npyv_u32x2 ahi = npyv_expand_u32_u16(a_expand.val[0]); - npyv_u32x2 alo = npyv_expand_u32_u16(a_expand.val[1]); - npyv_u32x2 bhi = npyv_expand_u32_u16(b_expand.val[0]); - npyv_u32x2 blo = npyv_expand_u32_u16(b_expand.val[1]); -#endif - npyv_@t@32 v1 = vec_div(ahi.val[0], bhi.val[0]); - npyv_@t@32 v2 = vec_div(ahi.val[1], bhi.val[1]); - npyv_@t@32 v3 = vec_div(alo.val[0], blo.val[0]); - npyv_@t@32 v4 = vec_div(alo.val[1], blo.val[1]); - npyv_@t@16 hi = vec_pack(v1, v2); - npyv_@t@16 lo = vec_pack(v3, v4); - return vec_pack(hi, lo); -} - -NPY_FINLINE npyv_@t@16 -vsx4_div_@t@16(npyv_@t@16 a, npyv_@t@16 b) -{ -#if @signed@ - npyv_s32x2 a_expand; - npyv_s32x2 b_expand; - a_expand.val[0] = vec_unpackh(a); - a_expand.val[1] = vec_unpackl(a); - b_expand.val[0] = vec_unpackh(b); - b_expand.val[1] = vec_unpackl(b); -#else - npyv_u32x2 a_expand = npyv_expand_@t@32_@t@16(a); - npyv_u32x2 b_expand = npyv_expand_@t@32_@t@16(b); -#endif - npyv_@t@32 v1 = vec_div(a_expand.val[0], b_expand.val[0]); - npyv_@t@32 v2 = vec_div(a_expand.val[1], b_expand.val[1]); - return vec_pack(v1, v2); -} - -#define vsx4_div_@t@32 vec_div -#define vsx4_div_@t@64 vec_div -/**end repeat**/ - -/**begin repeat - * Unsigned types - * #sfx = u8, u16, u32, u64# - * #len = 8, 16, 32, 64# - */ -static inline void -vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_@sfx@ *dst1 = (npyv_lanetype_@sfx@ *) args[2]; - const npyv_@sfx@ vzero = npyv_zero_@sfx@(); - const int vstep = npyv_nlanes_@sfx@; - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_@sfx@ a = npyv_load_@sfx@(src1); - npyv_@sfx@ b = npyv_load_@sfx@(src2); - npyv_@sfx@ c = vsx4_div_@sfx@(a, b); - npyv_store_@sfx@(dst1, c); - if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { - npy_set_floatstatus_divbyzero(); - } - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_@sfx@ a = *src1; - const npyv_lanetype_@sfx@ b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else{ - *dst1 = a / b; - } - } - npyv_cleanup(); -} -/**end repeat**/ - -/**begin repeat - * Signed types - * #sfx = s8, s16, s32, s64# - * #len = 8, 16, 32, 64# - */ -static inline void -vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_@sfx@ *dst1 = (npyv_lanetype_@sfx@ *) args[2]; - const npyv_@sfx@ vneg_one = npyv_setall_@sfx@(-1); - const npyv_@sfx@ vzero = npyv_zero_@sfx@(); - const npyv_@sfx@ vmin = npyv_setall_@sfx@(NPY_MIN_INT@len@); - npyv_b@len@ warn_zero = 
npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@()); - npyv_b@len@ warn_overflow = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@()); - const int vstep = npyv_nlanes_@sfx@; - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_@sfx@ a = npyv_load_@sfx@(src1); - npyv_@sfx@ b = npyv_load_@sfx@(src2); - npyv_@sfx@ quo = vsx4_div_@sfx@(a, b); - npyv_@sfx@ rem = npyv_sub_@sfx@(a, vec_mul(b, quo)); - // (b == 0 || (a == NPY_MIN_INT@len@ && b == -1)) - npyv_b@len@ bzero = npyv_cmpeq_@sfx@(b, vzero); - npyv_b@len@ amin = npyv_cmpeq_@sfx@(a, vmin); - npyv_b@len@ bneg_one = npyv_cmpeq_@sfx@(b, vneg_one); - npyv_b@len@ overflow = npyv_and_@sfx@(bneg_one, amin); - warn_zero = npyv_or_@sfx@(bzero, warn_zero); - warn_overflow = npyv_or_@sfx@(overflow, warn_overflow); - // handle mixed case the way Python does - // ((a > 0) == (b > 0) || rem == 0) - npyv_b@len@ a_gt_zero = npyv_cmpgt_@sfx@(a, vzero); - npyv_b@len@ b_gt_zero = npyv_cmpgt_@sfx@(b, vzero); - npyv_b@len@ ab_eq_cond = npyv_cmpeq_@sfx@(a_gt_zero, b_gt_zero); - npyv_b@len@ rem_zero = npyv_cmpeq_@sfx@(rem, vzero); - npyv_b@len@ or = npyv_or_@sfx@(ab_eq_cond, rem_zero); - npyv_@sfx@ to_sub = npyv_select_@sfx@(or, vzero, vneg_one); - quo = npyv_add_@sfx@(quo, to_sub); - // Divide by zero - quo = npyv_select_@sfx@(bzero, vzero, quo); - // Overflow - quo = npyv_select_@sfx@(overflow, vmin, quo); - npyv_store_@sfx@(dst1, quo); - } - - if (!vec_all_eq(warn_zero, vzero)) { - npy_set_floatstatus_divbyzero(); - } - if (!vec_all_eq(warn_overflow, vzero)) { - npy_set_floatstatus_overflow(); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_@sfx@ a = *src1; - const npyv_lanetype_@sfx@ b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else if (NPY_UNLIKELY((a == NPY_MIN_INT@len@) && (b == -1))) { - npy_set_floatstatus_overflow(); - *dst1 = NPY_MIN_INT@len@; - } else { - *dst1 = a / b; - if (((a > 0) != (b > 0)) && ((*dst1 * b) != a)) { - *dst1 -= 1; - } - } - } - npyv_cleanup(); -} -/**end repeat**/ -#endif // NPY_HAVE_VSX4 -#endif // NPY_SIMD - -/******************************************************************************** - ** Defining ufunc inner functions - ********************************************************************************/ - -/**begin repeat - * Signed types - * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong# - * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# - */ -#undef TO_SIMD_SFX -#if 0 -/**begin repeat1 - * #len = 8, 16, 32, 64# - */ -#elif NPY_BITSOF_@TYPE@ == @len@ - #define TO_SIMD_SFX(X) X##_s@len@ -/**end repeat1**/ -#endif -#if NPY_BITSOF_@TYPE@ == 64 && defined(SIMD_DISABLE_DIV64_OPT) - #undef TO_SIMD_SFX -#endif - -NPY_FINLINE @type@ floor_div_@TYPE@(const @type@ n, const @type@ d) -{ - /* - * FIXME: On x86 at least, dividing the smallest representable integer - * by -1 causes a SIFGPE (division overflow). We treat this case here - * (to avoid a SIGFPE crash at python level), but a good solution would - * be to treat integer division problems separately from FPU exceptions - * (i.e. a different approach than npy_set_floatstatus_divbyzero()). 
- */ - if (NPY_UNLIKELY(d == 0 || (n == NPY_MIN_@TYPE@ && d == -1))) { - if (d == 0) { - npy_set_floatstatus_divbyzero(); - return 0; - } - else { - npy_set_floatstatus_overflow(); - return NPY_MIN_@TYPE@; - } - } - @type@ r = n / d; - // Negative quotients needs to be rounded down - if (((n > 0) != (d > 0)) && ((r * d) != n)) { - r--; - } - return r; -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - io1 = floor_div_@TYPE@(io1, *(@type@*)ip2); - } - *((@type@ *)iop1) = io1; - } -#if NPY_SIMD && defined(TO_SIMD_SFX) -#if defined(NPY_HAVE_VSX4) - // both arguments are arrays of the same size - else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]); - } -#endif - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) && - (*(@type@ *)args[1]) != 0) { - TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]); - } -#endif - else { - BINARY_LOOP { - *((@type@ *)op1) = floor_div_@TYPE@(*(@type@*)ip1, *(@type@*)ip2); - } - } -} - -NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) -(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) -{ - char *ip1 = args[0]; - char *indxp = args[1]; - char *value = args[2]; - npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; - npy_intp shape = steps[3]; - npy_intp n = dimensions[0]; - npy_intp i; - @type@ *indexed; - for(i = 0; i < n; i++, indxp += isindex, value += isb) { - npy_intp indx = *(npy_intp *)indxp; - if (indx < 0) { - indx += shape; - } - indexed = (@type@ *)(ip1 + is1 * indx); - *indexed = floor_div_@TYPE@(*indexed, *(@type@ *)value); - } - return 0; -} - -/**end repeat**/ - -/**begin repeat - * Unsigned types - * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# - * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# - * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG# - */ -#undef TO_SIMD_SFX -#if 0 -/**begin repeat1 - * #len = 8, 16, 32, 64# - */ -#elif NPY_BITSOF_@STYPE@ == @len@ - #define TO_SIMD_SFX(X) X##_u@len@ -/**end repeat1**/ -#endif -/* - * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV fall-backs to the scalar division - * because emulating multiply-high on these architectures is going to be expensive comparing - * to the native scalar dividers. - * Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles. - * Power10(VSX4) is an exception here since it has native support for integer vector division. 
- */ -#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX)) - #undef TO_SIMD_SFX -#endif -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ d = *(@type@ *)ip2; - if (NPY_UNLIKELY(d == 0)) { - npy_set_floatstatus_divbyzero(); - io1 = 0; - } else { - io1 /= d; - } - } - *((@type@ *)iop1) = io1; - } -#if NPY_SIMD && defined(TO_SIMD_SFX) -#if defined(NPY_HAVE_VSX4) - // both arguments are arrays of the same size - else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]); - } -#endif - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) && - (*(@type@ *)args[1]) != 0) { - TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]); - } -#endif - else { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((@type@ *)op1) = 0; - } else{ - *((@type@ *)op1) = in1 / in2; - } - } - } -} - -NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) -(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) -{ - char *ip1 = args[0]; - char *indxp = args[1]; - char *value = args[2]; - npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; - npy_intp shape = steps[3]; - npy_intp n = dimensions[0]; - npy_intp i; - @type@ *indexed; - for(i = 0; i < n; i++, indxp += isindex, value += isb) { - npy_intp indx = *(npy_intp *)indxp; - if (indx < 0) { - indx += shape; - } - indexed = (@type@ *)(ip1 + is1 * indx); - @type@ in2 = *(@type@ *)value; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *indexed = 0; - } else { - *indexed = *indexed / in2; - } - } - return 0; -} - -/**end repeat**/ diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp new file mode 100644 index 000000000000..049ae0e99ff3 --- /dev/null +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -0,0 +1,369 @@ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "loops_utils.h" +#include "loops.h" +#include // for memcpy +#include "fast_loop_macros.h" +#include +#include "simd/simd.h" +#include "lowlevel_strided_loops.h" +#include "numpy/npy_math.h" +#include + +#include +namespace hn = hwy::HWY_NAMESPACE; + +HWY_BEFORE_NAMESPACE(); +namespace HWY_NAMESPACE { + +// Helper function to set float status +inline void set_float_status(bool overflow, bool divbyzero) { + if (overflow) { + npy_set_floatstatus_overflow(); + } + if (divbyzero) { + npy_set_floatstatus_divbyzero(); + } +} + +// Signed integer division +template +void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) { + using D = hn::ScalableTag; + const D d; + const size_t N = hn::Lanes(d); + + bool raise_overflow = false; + bool raise_divbyzero = false; + + if (scalar == 0) { + // Handle division by zero + std::fill(dst, dst + len, static_cast(0)); + raise_divbyzero = true; + } + else if (scalar == 1) { + // Special case for division by 1 + if (src != dst) { + std::memcpy(dst, src, len * sizeof(T)); + } + } + else if 
(scalar == static_cast<T>(-1)) {
+        const auto vec_min_val = hn::Set(d, std::numeric_limits<T>::min());
+        size_t i = 0;
+        for (; i + N <= static_cast<size_t>(len); i += N) {
+            const auto vec_src = hn::LoadU(d, src + i);
+            const auto is_min_val = hn::Eq(vec_src, vec_min_val);
+            const auto vec_res = hn::IfThenElse(is_min_val, vec_min_val, hn::Neg(vec_src));
+            hn::StoreU(vec_res, d, dst + i);
+            if (!raise_overflow && !hn::AllFalse(d, is_min_val)) {
+                raise_overflow = true;
+            }
+        }
+        // Handle remaining elements
+        for (; i < static_cast<size_t>(len); i++) {
+            T val = src[i];
+            if (val == std::numeric_limits<T>::min()) {
+                dst[i] = std::numeric_limits<T>::min();
+                raise_overflow = true;
+            } else {
+                dst[i] = -val;
+            }
+        }
+    }
+    else {
+        // General case with floor division semantics
+        const auto vec_scalar = hn::Set(d, scalar);
+        const auto vec_zero = hn::Zero(d);
+        size_t i = 0;
+
+        for (; i + N <= static_cast<size_t>(len); i += N) {
+            const auto vec_src = hn::LoadU(d, src + i);
+            auto vec_div = hn::Div(vec_src, vec_scalar);
+            const auto vec_mul = hn::Mul(vec_div, vec_scalar);
+            const auto has_remainder = hn::Ne(vec_src, vec_mul);
+            const auto src_sign = hn::Lt(vec_src, vec_zero);
+            const auto scalar_sign = hn::Lt(vec_scalar, vec_zero);
+            const auto different_signs = hn::Xor(src_sign, scalar_sign);
+
+            auto adjustment = hn::And(different_signs, has_remainder);
+            vec_div = hn::IfThenElse(adjustment, hn::Sub(vec_div, hn::Set(d, static_cast<T>(1))), vec_div);
+
+            hn::StoreU(vec_div, d, dst + i);
+        }
+
+        // Handle remaining elements with scalar code
+        for (; i < static_cast<size_t>(len); i++) {
+            T n = src[i];
+            T r = n / scalar;
+            if (((n > 0) != (scalar > 0)) && ((r * scalar) != n)) {
+                r--;
+            }
+            dst[i] = r;
+        }
+    }
+    set_float_status(raise_overflow, raise_divbyzero);
+}
+
+// Unsigned integer division
+template <typename T>
+void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp len) {
+    using D = hn::ScalableTag<T>;
+    const D d;
+    const size_t N = hn::Lanes(d);
+
+    bool raise_divbyzero = false;
+
+    if (scalar == 0) {
+        // Handle division by zero
+        std::fill(dst, dst + len, static_cast<T>(0));
+        raise_divbyzero = true;
+    }
+    else if (scalar == 1) {
+        // Special case for division by 1
+        if (src != dst) {
+            std::memcpy(dst, src, len * sizeof(T));
+        }
+    }
+    else {
+        const auto vec_scalar = hn::Set(d, scalar);
+        size_t i = 0;
+        for (; i + N <= static_cast<size_t>(len); i += N) {
+            const auto vec_src = hn::LoadU(d, src + i);
+            const auto vec_res = hn::Div(vec_src, vec_scalar);
+            hn::StoreU(vec_res, d, dst + i);
+        }
+        // Handle remaining elements
+        for (; i < static_cast<size_t>(len); i++) {
+            dst[i] = src[i] / scalar;
+        }
+    }
+
+    set_float_status(false, raise_divbyzero);
+}
+
+// Floor division for signed integers
+template <typename T>
+T floor_div(T n, T d) {
+    if (HWY_UNLIKELY(d == 0 || (n == std::numeric_limits<T>::min() && d == -1))) {
+        if (d == 0) {
+            npy_set_floatstatus_divbyzero();
+            return 0;
+        }
+        else {
+            npy_set_floatstatus_overflow();
+            return std::numeric_limits<T>::min();
+        }
+    }
+    T r = n / d;
+    if (((n > 0) != (d > 0)) && ((r * d) != n)) {
+        r--;
+    }
+    return r;
+}
+
+// Dispatch functions for signed integer division
+template <typename T>
+void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP(T) {
+            const T divisor = *reinterpret_cast<T*>(ip2);
+            if (HWY_UNLIKELY(divisor == 0)) {
+                npy_set_floatstatus_divbyzero();
+                io1 = 0;
+            } else if (HWY_UNLIKELY(io1 == std::numeric_limits<T>::min() && divisor == -1)) {
+                npy_set_floatstatus_overflow();
+                io1 = std::numeric_limits<T>::min();
+            }
else { + io1 = floor_div(io1, divisor); + } + } + *reinterpret_cast(iop1) = io1; + return; + } + if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && + *reinterpret_cast(args[1]) != 0) + { + bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]); + if (no_overlap) { + T* src1 = reinterpret_cast(args[0]); + T* src2 = reinterpret_cast(args[1]); + T* dst = reinterpret_cast(args[2]); + simd_divide_by_scalar_contig_signed(src1, *src2, dst, dimensions[0]); + return; + } + } + + // Fallback for non-blockable, in-place, or zero divisor cases + BINARY_LOOP { + const T dividend = *reinterpret_cast(ip1); + const T divisor = *reinterpret_cast(ip2); + T* result = reinterpret_cast(op1); + + if (HWY_UNLIKELY(divisor == 0)) { + npy_set_floatstatus_divbyzero(); + *result = 0; + } else if (HWY_UNLIKELY(dividend == std::numeric_limits::min() && divisor == -1)) { + npy_set_floatstatus_overflow(); + *result = std::numeric_limits::min(); + } else { + *result = floor_div(dividend, divisor); + } + } +} + +// Dispatch functions for unsigned integer division +template +void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { + if (IS_BINARY_REDUCE) { + BINARY_REDUCE_LOOP(T) { + const T d = *reinterpret_cast(ip2); + if (HWY_UNLIKELY(d == 0)) { + npy_set_floatstatus_divbyzero(); + io1 = 0; + } else { + io1 = io1 / d; + } + } + *reinterpret_cast(iop1) = io1; + return; + } + if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && + *reinterpret_cast(args[1]) != 0) + { + bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]); + if (no_overlap) { + T* src1 = reinterpret_cast(args[0]); + T* src2 = reinterpret_cast(args[1]); + T* dst = reinterpret_cast(args[2]); + simd_divide_by_scalar_contig_unsigned(src1, *src2, dst, dimensions[0]); + return; + } + } + // Fallback for non-blockable, in-place, or zero divisor cases + BINARY_LOOP { + const T in1 = *reinterpret_cast(ip1); + const T in2 = *reinterpret_cast(ip2); + if (HWY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *reinterpret_cast(op1) = 0; + } else { + *reinterpret_cast(op1) = in1 / in2; + } + } +} + +// Indexed division for signed integers +template +int TYPE_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char * const*args, npy_intp const *dimensions, + npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) { + char *ip1 = args[0]; + char *indxp = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp shape = steps[3]; + npy_intp n = dimensions[0]; + + for(npy_intp i = 0; i < n; i++, indxp += isindex, value += isb) { + npy_intp indx = *(npy_intp *)indxp; + if (indx < 0) { + indx += shape; + } + T* indexed = (T*)(ip1 + is1 * indx); + T divisor = *(T*)value; + *indexed = floor_div(*indexed, divisor); + } + return 0; +} + +// Indexed division for unsigned integers +template +int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char * const*args, npy_intp const *dimensions, + npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) { + char *ip1 = args[0]; + char *indxp = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp shape = steps[3]; + npy_intp n = dimensions[0]; + + for(npy_intp i = 0; i < n; i++, indxp += isindex, value += isb) { + npy_intp indx = *(npy_intp *)indxp; + if (indx < 0) { + indx += shape; + } + T* indexed = (T*)(ip1 + is1 * indx); + T divisor = *(T*)value; + + if 
(HWY_UNLIKELY(divisor == 0)) { + npy_set_floatstatus_divbyzero(); + *indexed = 0; + } else { + *indexed = *indexed / divisor; + } + } + return 0; +} + +#define DEFINE_DIVIDE_FUNCTION(TYPE, SCALAR_TYPE) \ + extern "C" { \ + NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(TYPE##_divide)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) { \ + TYPE_divide(args, dimensions, steps, func); \ + } \ + NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_divide_indexed)(PyArrayMethod_Context *context, char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *func) { \ + return TYPE_divide_indexed(context, args, dimensions, steps, func); \ + } \ + } // extern "C" + + +#ifdef NPY_CPU_DISPATCH_CURFX +// On Linux and macOS (LP64 model), long is 64 bits, but on 32-bit Windows (LLP64 model), long is 32 bits. Meanwhile, long long is guaranteed at least 64 bits +#if defined(_WIN32) || defined(__EMSCRIPTEN__) || (defined(__arm__) && !defined(__aarch64__)) || (defined(__linux__) && ((defined(__i386__) || defined(__i686__)))) + DEFINE_DIVIDE_FUNCTION(BYTE, int8_t) + DEFINE_DIVIDE_FUNCTION(SHORT, int16_t) + DEFINE_DIVIDE_FUNCTION(INT, int32_t) + DEFINE_DIVIDE_FUNCTION(LONG, int32_t) // LONG is 32-bit on 32-bit platforms + DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t) +#else + DEFINE_DIVIDE_FUNCTION(BYTE, int8_t) + DEFINE_DIVIDE_FUNCTION(SHORT, int16_t) + DEFINE_DIVIDE_FUNCTION(INT, int32_t) + DEFINE_DIVIDE_FUNCTION(LONG, int64_t) // LONG is 64-bit on 64-bit platforms + DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t) +#endif +#endif + +#define DEFINE_DIVIDE_FUNCTION_UNSIGNED(TYPE, SCALAR_TYPE) \ + extern "C" { \ + NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(TYPE##_divide)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) { \ + TYPE_divide_unsigned(args, dimensions, steps, func); \ + } \ + NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_divide_indexed)(PyArrayMethod_Context *context, char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *func) { \ + return TYPE_divide_unsigned_indexed(context, args, dimensions, steps, func); \ + } \ + } + +#ifdef NPY_CPU_DISPATCH_CURFX +#if defined(_WIN32) || defined(__EMSCRIPTEN__) || (defined(__arm__) && !defined(__aarch64__)) || (defined(__linux__) && ((defined(__i386__) || defined(__i686__)))) + DEFINE_DIVIDE_FUNCTION_UNSIGNED(UBYTE, uint8_t) + DEFINE_DIVIDE_FUNCTION_UNSIGNED(USHORT, uint16_t) + DEFINE_DIVIDE_FUNCTION_UNSIGNED(UINT, uint32_t) + DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint32_t) // ULONG is 32-bit on 32-bit platforms + DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONGLONG, uint64_t) +#else + DEFINE_DIVIDE_FUNCTION_UNSIGNED(UBYTE, uint8_t) + DEFINE_DIVIDE_FUNCTION_UNSIGNED(USHORT, uint16_t) + DEFINE_DIVIDE_FUNCTION_UNSIGNED(UINT, uint32_t) + DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint64_t) // ULONG is 64-bit on 64-bit platforms + DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONGLONG, uint64_t) +#endif +#endif + +#undef DEFINE_DIVIDE_FUNCTION +#undef DEFINE_DIVIDE_FUNCTION_UNSIGNED + +} // namespace HWY_NAMESPACE +HWY_AFTER_NAMESPACE(); From d515a55c4dfc39b6c565a928fe815da9acf1dff0 Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Mon, 5 May 2025 21:49:35 +0530 Subject: [PATCH 02/16] change dispatch logic --- .../src/umath/loops_arithmetic.dispatch.cpp | 59 +++++++++---------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 049ae0e99ff3..9174d7ef4467 100644 --- 
a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -2,14 +2,18 @@ #define _MULTIARRAYMODULE #define NPY_NO_DEPRECATED_API NPY_API_VERSION +#include "numpy/npy_common.h" +#include "numpy/npy_math.h" + #include "loops_utils.h" #include "loops.h" -#include // for memcpy #include "fast_loop_macros.h" -#include #include "simd/simd.h" #include "lowlevel_strided_loops.h" -#include "numpy/npy_math.h" +#include "common.hpp" + +#include // for memcpy +#include #include #include @@ -308,6 +312,7 @@ int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context), return 0; } +// Macro to define the dispatch functions for signed types #define DEFINE_DIVIDE_FUNCTION(TYPE, SCALAR_TYPE) \ extern "C" { \ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(TYPE##_divide)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) { \ @@ -318,24 +323,7 @@ int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context), } \ } // extern "C" - -#ifdef NPY_CPU_DISPATCH_CURFX -// On Linux and macOS (LP64 model), long is 64 bits, but on 32-bit Windows (LLP64 model), long is 32 bits. Meanwhile, long long is guaranteed at least 64 bits -#if defined(_WIN32) || defined(__EMSCRIPTEN__) || (defined(__arm__) && !defined(__aarch64__)) || (defined(__linux__) && ((defined(__i386__) || defined(__i686__)))) - DEFINE_DIVIDE_FUNCTION(BYTE, int8_t) - DEFINE_DIVIDE_FUNCTION(SHORT, int16_t) - DEFINE_DIVIDE_FUNCTION(INT, int32_t) - DEFINE_DIVIDE_FUNCTION(LONG, int32_t) // LONG is 32-bit on 32-bit platforms - DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t) -#else - DEFINE_DIVIDE_FUNCTION(BYTE, int8_t) - DEFINE_DIVIDE_FUNCTION(SHORT, int16_t) - DEFINE_DIVIDE_FUNCTION(INT, int32_t) - DEFINE_DIVIDE_FUNCTION(LONG, int64_t) // LONG is 64-bit on 64-bit platforms - DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t) -#endif -#endif - +// Macro to define the dispatch functions for unsigned types #define DEFINE_DIVIDE_FUNCTION_UNSIGNED(TYPE, SCALAR_TYPE) \ extern "C" { \ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(TYPE##_divide)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) { \ @@ -344,23 +332,32 @@ int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context), NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_divide_indexed)(PyArrayMethod_Context *context, char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *func) { \ return TYPE_divide_unsigned_indexed(context, args, dimensions, steps, func); \ } \ - } + } // extern "C" + + +#ifdef NPY_CPU_DISPATCH_CURFX + DEFINE_DIVIDE_FUNCTION(BYTE, int8_t) + DEFINE_DIVIDE_FUNCTION(SHORT, int16_t) + DEFINE_DIVIDE_FUNCTION(INT, int32_t) + #if NPY_SIZEOF_LONG == 4 + DEFINE_DIVIDE_FUNCTION(LONG, int32_t) + #elif NPY_SIZEOF_LONG == 8 + DEFINE_DIVIDE_FUNCTION(LONG, int64_t) + #endif + DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t) +#endif #ifdef NPY_CPU_DISPATCH_CURFX -#if defined(_WIN32) || defined(__EMSCRIPTEN__) || (defined(__arm__) && !defined(__aarch64__)) || (defined(__linux__) && ((defined(__i386__) || defined(__i686__)))) - DEFINE_DIVIDE_FUNCTION_UNSIGNED(UBYTE, uint8_t) - DEFINE_DIVIDE_FUNCTION_UNSIGNED(USHORT, uint16_t) - DEFINE_DIVIDE_FUNCTION_UNSIGNED(UINT, uint32_t) - DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint32_t) // ULONG is 32-bit on 32-bit platforms - DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONGLONG, uint64_t) -#else DEFINE_DIVIDE_FUNCTION_UNSIGNED(UBYTE, uint8_t) DEFINE_DIVIDE_FUNCTION_UNSIGNED(USHORT, uint16_t) DEFINE_DIVIDE_FUNCTION_UNSIGNED(UINT, 
uint32_t) - DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint64_t) // ULONG is 64-bit on 64-bit platforms + #if NPY_SIZEOF_LONG == 4 + DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint32_t) + #elif NPY_SIZEOF_LONG == 8 + DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint64_t) + #endif DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONGLONG, uint64_t) #endif -#endif #undef DEFINE_DIVIDE_FUNCTION #undef DEFINE_DIVIDE_FUNCTION_UNSIGNED From 8db182a6de5bf0875f7232bf704152a3014ba629 Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Tue, 6 May 2025 00:43:28 +0530 Subject: [PATCH 03/16] optimise further --- .../src/umath/loops_arithmetic.dispatch.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 9174d7ef4467..931ce705ce17 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -80,20 +80,18 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) // General case with floor division semantics const auto vec_scalar = hn::Set(d, scalar); const auto vec_zero = hn::Zero(d); + const auto one = hn::Set(d, static_cast(1)); size_t i = 0; for (; i + N <= static_cast(len); i += N) { const auto vec_src = hn::LoadU(d, src + i); auto vec_div = hn::Div(vec_src, vec_scalar); const auto vec_mul = hn::Mul(vec_div, vec_scalar); - const auto has_remainder = hn::Ne(vec_src, vec_mul); - const auto src_sign = hn::Lt(vec_src, vec_zero); - const auto scalar_sign = hn::Lt(vec_scalar, vec_zero); - const auto different_signs = hn::Xor(src_sign, scalar_sign); - - auto adjustment = hn::And(different_signs, has_remainder); - vec_div = hn::IfThenElse(adjustment, hn::Sub(vec_div, hn::Set(d, static_cast(1))), vec_div); + const auto eq_mask = hn::Eq(vec_src, vec_mul); + const auto diff_signs = hn::Lt(hn::Xor(vec_src, vec_scalar), vec_zero); + const auto adjust = hn::AndNot(eq_mask, diff_signs); + vec_div = hn::MaskedSubOr(vec_div, adjust, vec_div, one); hn::StoreU(vec_div, d, dst + i); } @@ -102,7 +100,7 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) T n = src[i]; T r = n / scalar; if (((n > 0) != (scalar > 0)) && ((r * scalar) != n)) { - r--; + --r; } dst[i] = r; } @@ -162,7 +160,7 @@ T floor_div(T n, T d) { } T r = n / d; if (((n > 0) != (d > 0)) && ((r * d) != n)) { - r--; + --r; } return r; } From e58409dd6672fcf22e4f4947817217728780cf90 Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Wed, 14 May 2025 14:48:27 +0530 Subject: [PATCH 04/16] add NPY_SIMD flag --- numpy/_core/src/umath/loops_arithmetic.dispatch.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 931ce705ce17..78ae7be4a873 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -31,7 +31,7 @@ inline void set_float_status(bool overflow, bool divbyzero) { npy_set_floatstatus_divbyzero(); } } - +#if NPY_SIMD // Signed integer division template void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) { @@ -144,7 +144,7 @@ void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp le set_float_status(false, raise_divbyzero); } - +#endif // NPY_SIMD // Floor division for signed integers template T floor_div(T n, T d) { @@ -183,7 +183,8 @@ void TYPE_divide(char 
**args, npy_intp const *dimensions, npy_intp const *steps, } *reinterpret_cast(iop1) = io1; return; - } + } +#if NPY_SIMD if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && *reinterpret_cast(args[1]) != 0) { @@ -196,6 +197,7 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, return; } } +#endif // NPY_SIMD // Fallback for non-blockable, in-place, or zero divisor cases BINARY_LOOP { @@ -231,6 +233,7 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons *reinterpret_cast(iop1) = io1; return; } +#if NPY_SIMD if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && *reinterpret_cast(args[1]) != 0) { @@ -243,6 +246,8 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons return; } } +#endif // NPY_SIMD + // Fallback for non-blockable, in-place, or zero divisor cases BINARY_LOOP { const T in1 = *reinterpret_cast(ip1); From 65ff928da43e6bd01dbef4a06bbf82a5af76a9cf Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Wed, 14 May 2025 17:26:54 +0530 Subject: [PATCH 05/16] fix --- numpy/_core/src/umath/loops_arithmetic.dispatch.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 78ae7be4a873..013717a159d4 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -2,6 +2,11 @@ #define _MULTIARRAYMODULE #define NPY_NO_DEPRECATED_API NPY_API_VERSION +#include "npy_cpu_dispatch.h" +#if NPY__CPU_TARGET_INDEX == 0 +#define HWY_COMPILE_ONLY_SCALAR 1 +#endif + #include "numpy/npy_common.h" #include "numpy/npy_math.h" From 693936a82eccfc622ec73f25d0176bd83b5bb80c Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Sun, 8 Jun 2025 22:37:06 +0530 Subject: [PATCH 06/16] Add support for RVV --- numpy/_core/meson.build | 1 + 1 file changed, 1 insertion(+) diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index 13d0702dd97c..24163080589c 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -939,6 +939,7 @@ foreach gen_mtargets : [ VSX4, VSX2, VX, LSX, + RVV, ] ], [ From e942017d5fe47cd65841c726616e0db4a066b77a Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Thu, 3 Jul 2025 15:24:21 +0530 Subject: [PATCH 07/16] add array-array div logic --- .../src/umath/loops_arithmetic.dispatch.cpp | 131 +++++++++++++++++- 1 file changed, 128 insertions(+), 3 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 013717a159d4..2ee537454599 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -169,10 +169,107 @@ T floor_div(T n, T d) { } return r; } +// General divide implementation for arrays +template +void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { + using D = hn::ScalableTag; + const D d; + const size_t N = hn::Lanes(d); + bool raise_overflow = false; + bool raise_divbyzero = false; + const auto vec_zero = hn::Zero(d); + const auto vec_min_val = hn::Set(d, std::numeric_limits::min()); + const auto vec_neg_one = hn::Set(d, static_cast(-1)); + + size_t i = 0; + for (; i + N <= static_cast(len); i += N) { + const auto vec_a = hn::LoadU(d, src1 + i); + const auto vec_b = hn::LoadU(d, src2 + i); + const auto b_is_zero = hn::Eq(vec_b, vec_zero); + const auto a_is_min = hn::Eq(vec_a, vec_min_val); + const auto b_is_neg_one = 
hn::Eq(vec_b, vec_neg_one); + const auto overflow_cond = hn::And(a_is_min, b_is_neg_one); + auto vec_div = hn::Div(vec_a, vec_b); + const auto vec_mul = hn::Mul(vec_div, vec_b); + const auto has_remainder = hn::Ne(vec_a, vec_mul); + const auto a_sign = hn::Lt(vec_a, vec_zero); + const auto b_sign = hn::Lt(vec_b, vec_zero); + const auto different_signs = hn::Xor(a_sign, b_sign); + auto adjustment = hn::And(different_signs, has_remainder); + vec_div = hn::IfThenElse(adjustment, + hn::Sub(vec_div, hn::Set(d, static_cast(1))), + vec_div); + vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); + vec_div = hn::IfThenElse(overflow_cond, vec_min_val, vec_div); + hn::StoreU(vec_div, d, dst + i); + if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) { + raise_divbyzero = true; + } + if (!raise_overflow && !hn::AllFalse(d, overflow_cond)) { + raise_overflow = true; + } + } + for (; i < static_cast(len); i++) { + T a = src1[i]; + T b = src2[i]; + + if (b == 0) { + dst[i] = 0; + raise_divbyzero = true; + } + else if (a == std::numeric_limits::min() && b == -1) { + dst[i] = std::numeric_limits::min(); + raise_overflow = true; + } + else { + dst[i] = floor_div(a, b); + } + } + + set_float_status(raise_overflow, raise_divbyzero); +} +// Unsigned division for arrays +template +void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { + using D = hn::ScalableTag; + const D d; + const size_t N = hn::Lanes(d); + + bool raise_divbyzero = false; + const auto vec_zero = hn::Zero(d); + + size_t i = 0; + for (; i + N <= static_cast(len); i += N) { + const auto vec_a = hn::LoadU(d, src1 + i); + const auto vec_b = hn::LoadU(d, src2 + i); + const auto b_is_zero = hn::Eq(vec_b, vec_zero); + auto vec_div = hn::Div(vec_a, vec_b); + vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); + hn::StoreU(vec_div, d, dst + i); + if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) { + raise_divbyzero = true; + } + } + for (; i < static_cast(len); i++) { + T a = src1[i]; + T b = src2[i]; + + if (b == 0) { + dst[i] = 0; + raise_divbyzero = true; + } else { + dst[i] = a / b; + } + } + + set_float_status(false, raise_divbyzero); +} + // Dispatch functions for signed integer division template void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { + npy_clear_floatstatus(); if (IS_BINARY_REDUCE) { BINARY_REDUCE_LOOP(T) { const T divisor = *reinterpret_cast(ip2); @@ -189,8 +286,22 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, *reinterpret_cast(iop1) = io1; return; } -#if NPY_SIMD - if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && +#if NPY_SIMD + // Handle array-array case + if (IS_BLOCKABLE_BINARY(sizeof(T), NPY_SIMD_WIDTH)) + { + bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]) && + nomemoverlap(args[2], steps[2], args[1], steps[1], dimensions[0]); + // Check if we can use SIMD for contiguous arrays - all steps must equal to sizeof(T) + if (steps[0] == sizeof(T) && steps[1] == sizeof(T) && steps[2] == sizeof(T) && no_overlap) { + T* src1 = (T*)args[0]; + T* src2 = (T*)args[1]; + T* dst = (T*)args[2]; + simd_divide_contig_signed(src1, src2, dst, dimensions[0]); + return; + } + } + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && *reinterpret_cast(args[1]) != 0) { bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]); @@ -225,6 +336,7 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, // Dispatch 
functions for unsigned integer division template void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { + npy_clear_floatstatus(); if (IS_BINARY_REDUCE) { BINARY_REDUCE_LOOP(T) { const T d = *reinterpret_cast(ip2); @@ -239,7 +351,20 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons return; } #if NPY_SIMD - if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && + // Handle array-array case + if (IS_BLOCKABLE_BINARY(sizeof(T), NPY_SIMD_WIDTH)) { + bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]) && + nomemoverlap(args[2], steps[2], args[1], steps[1], dimensions[0]); + // Check if we can use SIMD for contiguous arrays - all steps must equal to sizeof(T) + if (steps[0] == sizeof(T) && steps[1] == sizeof(T) && steps[2] == sizeof(T) && no_overlap) { + T* src1 = (T*)args[0]; + T* src2 = (T*)args[1]; + T* dst = (T*)args[2]; + simd_divide_contig_unsigned(src1, src2, dst, dimensions[0]); + return; + } + } + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && *reinterpret_cast(args[1]) != 0) { bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]); From 67bd0a6c0a262cc009e9a0dbb61e3c1027ed1343 Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Mon, 21 Jul 2025 17:37:32 +0530 Subject: [PATCH 08/16] update logic for array-array div --- .../src/umath/loops_arithmetic.dispatch.cpp | 48 ++++++++++++------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 2ee537454599..af32d0539fd6 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -169,6 +169,7 @@ T floor_div(T n, T d) { } return r; } + // General divide implementation for arrays template void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { @@ -178,30 +179,40 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { bool raise_overflow = false; bool raise_divbyzero = false; const auto vec_zero = hn::Zero(d); + const auto vec_one = hn::Set(d, static_cast(1)); const auto vec_min_val = hn::Set(d, std::numeric_limits::min()); const auto vec_neg_one = hn::Set(d, static_cast(-1)); - + size_t i = 0; for (; i + N <= static_cast(len); i += N) { const auto vec_a = hn::LoadU(d, src1 + i); const auto vec_b = hn::LoadU(d, src2 + i); + const auto b_is_zero = hn::Eq(vec_b, vec_zero); const auto a_is_min = hn::Eq(vec_a, vec_min_val); const auto b_is_neg_one = hn::Eq(vec_b, vec_neg_one); const auto overflow_cond = hn::And(a_is_min, b_is_neg_one); - auto vec_div = hn::Div(vec_a, vec_b); - const auto vec_mul = hn::Mul(vec_div, vec_b); + + const auto safe_div_mask = hn::Not(hn::Or(b_is_zero, overflow_cond)); + const auto safe_b = hn::IfThenElse(hn::Or(b_is_zero, overflow_cond), vec_one, vec_b); + + auto vec_div = hn::Div(vec_a, safe_b); + + const auto vec_mul = hn::Mul(vec_div, safe_b); const auto has_remainder = hn::Ne(vec_a, vec_mul); const auto a_sign = hn::Lt(vec_a, vec_zero); - const auto b_sign = hn::Lt(vec_b, vec_zero); + const auto b_sign = hn::Lt(safe_b, vec_zero); const auto different_signs = hn::Xor(a_sign, b_sign); - auto adjustment = hn::And(different_signs, has_remainder); - vec_div = hn::IfThenElse(adjustment, - hn::Sub(vec_div, hn::Set(d, static_cast(1))), - vec_div); + const auto needs_adjustment = hn::And(safe_div_mask, + 
hn::And(different_signs, has_remainder)); + + vec_div = hn::MaskedSubOr(vec_div, needs_adjustment, vec_div, vec_one); + vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); vec_div = hn::IfThenElse(overflow_cond, vec_min_val, vec_div); + hn::StoreU(vec_div, d, dst + i); + if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) { raise_divbyzero = true; } @@ -209,10 +220,11 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { raise_overflow = true; } } + for (; i < static_cast(len); i++) { T a = src1[i]; T b = src2[i]; - + if (b == 0) { dst[i] = 0; raise_divbyzero = true; @@ -225,35 +237,41 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { dst[i] = floor_div(a, b); } } - + set_float_status(raise_overflow, raise_divbyzero); } + // Unsigned division for arrays template void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { using D = hn::ScalableTag; const D d; const size_t N = hn::Lanes(d); - bool raise_divbyzero = false; const auto vec_zero = hn::Zero(d); - + const auto vec_one = hn::Set(d, static_cast(1)); + size_t i = 0; for (; i + N <= static_cast(len); i += N) { const auto vec_a = hn::LoadU(d, src1 + i); const auto vec_b = hn::LoadU(d, src2 + i); + const auto b_is_zero = hn::Eq(vec_b, vec_zero); - auto vec_div = hn::Div(vec_a, vec_b); + + const auto safe_b = hn::IfThenElse(b_is_zero, vec_one, vec_b); + auto vec_div = hn::Div(vec_a, safe_b); vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); hn::StoreU(vec_div, d, dst + i); if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) { raise_divbyzero = true; } } + + // Handle remaining elements for (; i < static_cast(len); i++) { T a = src1[i]; T b = src2[i]; - + if (b == 0) { dst[i] = 0; raise_divbyzero = true; @@ -261,11 +279,9 @@ void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { dst[i] = a / b; } } - set_float_status(false, raise_divbyzero); } - // Dispatch functions for signed integer division template void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { From 6573701b9a8332fad7208e580c65c26de1a68209 Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Tue, 22 Jul 2025 00:09:18 +0530 Subject: [PATCH 09/16] update logic for int64 array-array --- .../src/umath/loops_arithmetic.dispatch.cpp | 40 +++++++++++++------ 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index af32d0539fd6..29dde27460ea 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -198,15 +198,17 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { auto vec_div = hn::Div(vec_a, safe_b); - const auto vec_mul = hn::Mul(vec_div, safe_b); - const auto has_remainder = hn::Ne(vec_a, vec_mul); - const auto a_sign = hn::Lt(vec_a, vec_zero); - const auto b_sign = hn::Lt(safe_b, vec_zero); - const auto different_signs = hn::Xor(a_sign, b_sign); - const auto needs_adjustment = hn::And(safe_div_mask, - hn::And(different_signs, has_remainder)); - - vec_div = hn::MaskedSubOr(vec_div, needs_adjustment, vec_div, vec_one); + if (!hn::AllFalse(d, safe_div_mask)) { + const auto vec_mul = hn::Mul(vec_div, safe_b); + const auto has_remainder = hn::Ne(vec_a, vec_mul); + const auto a_sign = hn::Lt(vec_a, vec_zero); + const auto b_sign = hn::Lt(vec_b, vec_zero); + const auto different_signs = hn::Xor(a_sign, b_sign); + const auto 
needs_adjustment = hn::And(safe_div_mask, + hn::And(has_remainder, different_signs)); + + vec_div = hn::MaskedSubOr(vec_div, needs_adjustment, vec_div, vec_one); + } vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); vec_div = hn::IfThenElse(overflow_cond, vec_min_val, vec_div); @@ -221,6 +223,7 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { } } + // Handle remaining elements for (; i < static_cast(len); i++) { T a = src1[i]; T b = src2[i]; @@ -234,10 +237,15 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { raise_overflow = true; } else { - dst[i] = floor_div(a, b); + T r = a / b; + if (((a > 0) != (b > 0)) && ((r * b) != a)) { + --r; + } + dst[i] = r; } } - + + npy_clear_floatstatus(); set_float_status(raise_overflow, raise_divbyzero); } @@ -255,13 +263,17 @@ void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { for (; i + N <= static_cast(len); i += N) { const auto vec_a = hn::LoadU(d, src1 + i); const auto vec_b = hn::LoadU(d, src2 + i); - - const auto b_is_zero = hn::Eq(vec_b, vec_zero); + const auto b_is_zero = hn::Eq(vec_b, vec_zero); + const auto safe_b = hn::IfThenElse(b_is_zero, vec_one, vec_b); + auto vec_div = hn::Div(vec_a, safe_b); + vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); + hn::StoreU(vec_div, d, dst + i); + if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) { raise_divbyzero = true; } @@ -279,6 +291,8 @@ void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { dst[i] = a / b; } } + + npy_clear_floatstatus(); set_float_status(false, raise_divbyzero); } From fb01fb327d99eb1bb2e556912c7ce03b0a222723 Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Mon, 18 Aug 2025 15:16:10 +0530 Subject: [PATCH 10/16] include highway wrapper --- .../src/umath/loops_arithmetic.dispatch.cpp | 204 +++++++++--------- 1 file changed, 105 insertions(+), 99 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 29dde27460ea..849d0733a988 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -13,7 +13,7 @@ #include "loops_utils.h" #include "loops.h" #include "fast_loop_macros.h" -#include "simd/simd.h" +#include "simd/simd.hpp" #include "lowlevel_strided_loops.h" #include "common.hpp" @@ -21,11 +21,10 @@ #include #include -#include -namespace hn = hwy::HWY_NAMESPACE; - -HWY_BEFORE_NAMESPACE(); -namespace HWY_NAMESPACE { +using namespace np::simd; +#if NPY_HWY +namespace hn = np::simd::hn; +#endif // Helper function to set float status inline void set_float_status(bool overflow, bool divbyzero) { @@ -36,13 +35,15 @@ inline void set_float_status(bool overflow, bool divbyzero) { npy_set_floatstatus_divbyzero(); } } -#if NPY_SIMD -// Signed integer division +#if NPY_HWY + +// Signed integer DIVIDE by scalar + template void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) { - using D = hn::ScalableTag; + using D = _Tag; const D d; - const size_t N = hn::Lanes(d); + const size_t N = Lanes(T{}); bool raise_overflow = false; bool raise_divbyzero = false; @@ -59,13 +60,13 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) } } else if (scalar == static_cast(-1)) { - const auto vec_min_val = hn::Set(d, std::numeric_limits::min()); + const auto vec_min_val = Set(static_cast(std::numeric_limits::min())); size_t i = 0; for (; i + N <= static_cast(len); i += N) { - const auto vec_src = 
hn::LoadU(d, src + i); - const auto is_min_val = hn::Eq(vec_src, vec_min_val); + const auto vec_src = LoadU(src + i); + const auto is_min_val = Eq(vec_src, vec_min_val); const auto vec_res = hn::IfThenElse(is_min_val, vec_min_val, hn::Neg(vec_src)); - hn::StoreU(vec_res, d, dst + i); + StoreU(vec_res, dst + i); if (!raise_overflow && !hn::AllFalse(d, is_min_val)) { raise_overflow = true; } @@ -83,21 +84,21 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) } else { // General case with floor division semantics - const auto vec_scalar = hn::Set(d, scalar); - const auto vec_zero = hn::Zero(d); - const auto one = hn::Set(d, static_cast(1)); + const auto vec_scalar = Set(scalar); + const auto vec_zero = Zero(); + const auto one = Set(static_cast(1)); size_t i = 0; for (; i + N <= static_cast(len); i += N) { - const auto vec_src = hn::LoadU(d, src + i); - auto vec_div = hn::Div(vec_src, vec_scalar); - const auto vec_mul = hn::Mul(vec_div, vec_scalar); - const auto eq_mask = hn::Eq(vec_src, vec_mul); - const auto diff_signs = hn::Lt(hn::Xor(vec_src, vec_scalar), vec_zero); - const auto adjust = hn::AndNot(eq_mask, diff_signs); + const auto vec_src = LoadU(src + i); + auto vec_div = Div(vec_src, vec_scalar); + const auto vec_mul = Mul(vec_div, vec_scalar); + const auto eq_mask = Eq(vec_src, vec_mul); + const auto diff_signs = Lt(Xor(vec_src, vec_scalar), vec_zero); + const auto adjust = AndNot(eq_mask, diff_signs); vec_div = hn::MaskedSubOr(vec_div, adjust, vec_div, one); - hn::StoreU(vec_div, d, dst + i); + StoreU(vec_div, dst + i); } // Handle remaining elements with scalar code @@ -113,13 +114,12 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) set_float_status(raise_overflow, raise_divbyzero); } -// Unsigned integer division +// Unsigned integer DIVIDE by scalar + template void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp len) { - using D = hn::ScalableTag; - const D d; - const size_t N = hn::Lanes(d); + const size_t N = Lanes(T{}); bool raise_divbyzero = false; if (scalar == 0) { @@ -134,12 +134,12 @@ void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp le } } else { - const auto vec_scalar = hn::Set(d, scalar); + const auto vec_scalar = Set(scalar); size_t i = 0; for (; i + N <= static_cast(len); i += N) { - const auto vec_src = hn::LoadU(d, src + i); - const auto vec_res = hn::Div(vec_src, vec_scalar); - hn::StoreU(vec_res, d, dst + i); + const auto vec_src = LoadU(src + i); + const auto vec_res = Div(vec_src, vec_scalar); + StoreU(vec_res, dst + i); } // Handle remaining elements for (; i < static_cast(len); i++) { @@ -149,63 +149,45 @@ void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp le set_float_status(false, raise_divbyzero); } -#endif // NPY_SIMD -// Floor division for signed integers -template -T floor_div(T n, T d) { - if (HWY_UNLIKELY(d == 0 || (n == std::numeric_limits::min() && d == -1))) { - if (d == 0) { - npy_set_floatstatus_divbyzero(); - return 0; - } - else { - npy_set_floatstatus_overflow(); - return std::numeric_limits::min(); - } - } - T r = n / d; - if (((n > 0) != (d > 0)) && ((r * d) != n)) { - --r; - } - return r; -} -// General divide implementation for arrays +// Signed integer DIVIDE array / array + template void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { - using D = hn::ScalableTag; + using D = _Tag; const D d; - const size_t N = hn::Lanes(d); + const size_t N = Lanes(T{}); + bool 
raise_overflow = false; bool raise_divbyzero = false; - const auto vec_zero = hn::Zero(d); - const auto vec_one = hn::Set(d, static_cast(1)); - const auto vec_min_val = hn::Set(d, std::numeric_limits::min()); - const auto vec_neg_one = hn::Set(d, static_cast(-1)); + const auto vec_zero = Zero(); + const auto vec_one = Set(static_cast(1)); + const auto vec_min_val = Set(static_cast(std::numeric_limits::min())); + const auto vec_neg_one = Set(static_cast(-1)); size_t i = 0; for (; i + N <= static_cast(len); i += N) { - const auto vec_a = hn::LoadU(d, src1 + i); - const auto vec_b = hn::LoadU(d, src2 + i); + const auto vec_a = LoadU(src1 + i); + const auto vec_b = LoadU(src2 + i); - const auto b_is_zero = hn::Eq(vec_b, vec_zero); - const auto a_is_min = hn::Eq(vec_a, vec_min_val); - const auto b_is_neg_one = hn::Eq(vec_b, vec_neg_one); - const auto overflow_cond = hn::And(a_is_min, b_is_neg_one); + const auto b_is_zero = Eq(vec_b, vec_zero); + const auto a_is_min = Eq(vec_a, vec_min_val); + const auto b_is_neg_one = Eq(vec_b, vec_neg_one); + const auto overflow_cond = And(a_is_min, b_is_neg_one); - const auto safe_div_mask = hn::Not(hn::Or(b_is_zero, overflow_cond)); - const auto safe_b = hn::IfThenElse(hn::Or(b_is_zero, overflow_cond), vec_one, vec_b); + const auto safe_div_mask = hn::Not(Or(b_is_zero, overflow_cond)); + const auto safe_b = hn::IfThenElse(Or(b_is_zero, overflow_cond), vec_one, vec_b); - auto vec_div = hn::Div(vec_a, safe_b); + auto vec_div = Div(vec_a, safe_b); if (!hn::AllFalse(d, safe_div_mask)) { - const auto vec_mul = hn::Mul(vec_div, safe_b); + const auto vec_mul = Mul(vec_div, safe_b); const auto has_remainder = hn::Ne(vec_a, vec_mul); - const auto a_sign = hn::Lt(vec_a, vec_zero); - const auto b_sign = hn::Lt(vec_b, vec_zero); - const auto different_signs = hn::Xor(a_sign, b_sign); - const auto needs_adjustment = hn::And(safe_div_mask, - hn::And(has_remainder, different_signs)); + const auto a_sign = Lt(vec_a, vec_zero); + const auto b_sign = Lt(vec_b, vec_zero); + const auto different_signs = Xor(a_sign, b_sign); + const auto needs_adjustment = And(safe_div_mask, + And(has_remainder, different_signs)); vec_div = hn::MaskedSubOr(vec_div, needs_adjustment, vec_div, vec_one); } @@ -213,7 +195,7 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); vec_div = hn::IfThenElse(overflow_cond, vec_min_val, vec_div); - hn::StoreU(vec_div, d, dst + i); + StoreU(vec_div, dst + i); if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) { raise_divbyzero = true; @@ -249,30 +231,32 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { set_float_status(raise_overflow, raise_divbyzero); } -// Unsigned division for arrays +// Unsigned integer DIVIDE array / array + template void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { - using D = hn::ScalableTag; + using D = _Tag; const D d; - const size_t N = hn::Lanes(d); + const size_t N = Lanes(T{}); + bool raise_divbyzero = false; - const auto vec_zero = hn::Zero(d); - const auto vec_one = hn::Set(d, static_cast(1)); + const auto vec_zero = Zero(); + const auto vec_one = Set(static_cast(1)); size_t i = 0; for (; i + N <= static_cast(len); i += N) { - const auto vec_a = hn::LoadU(d, src1 + i); - const auto vec_b = hn::LoadU(d, src2 + i); + const auto vec_a = LoadU(src1 + i); + const auto vec_b = LoadU(src2 + i); - const auto b_is_zero = hn::Eq(vec_b, vec_zero); + const auto b_is_zero = Eq(vec_b, vec_zero); const auto 
safe_b = hn::IfThenElse(b_is_zero, vec_one, vec_b); - auto vec_div = hn::Div(vec_a, safe_b); + auto vec_div = Div(vec_a, safe_b); vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); - hn::StoreU(vec_div, d, dst + i); + StoreU(vec_div, dst + i); if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) { raise_divbyzero = true; @@ -296,6 +280,29 @@ void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { set_float_status(false, raise_divbyzero); } +#endif // NPY_HWY + +// Floor division for signed integers +template +T floor_div(T n, T d) { + if (NPY_UNLIKELY(d == 0 || (n == std::numeric_limits::min() && d == -1))) { + if (d == 0) { + npy_set_floatstatus_divbyzero(); + return 0; + } + else { + npy_set_floatstatus_overflow(); + return std::numeric_limits::min(); + } + } + T r = n / d; + if (((n > 0) != (d > 0)) && ((r * d) != n)) { + --r; + } + return r; +} + + // Dispatch functions for signed integer division template void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -303,10 +310,10 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, if (IS_BINARY_REDUCE) { BINARY_REDUCE_LOOP(T) { const T divisor = *reinterpret_cast(ip2); - if (HWY_UNLIKELY(divisor == 0)) { + if (NPY_UNLIKELY(divisor == 0)) { npy_set_floatstatus_divbyzero(); io1 = 0; - } else if (HWY_UNLIKELY(io1 == std::numeric_limits::min() && divisor == -1)) { + } else if (NPY_UNLIKELY(io1 == std::numeric_limits::min() && divisor == -1)) { npy_set_floatstatus_overflow(); io1 = std::numeric_limits::min(); } else { @@ -316,7 +323,8 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, *reinterpret_cast(iop1) = io1; return; } -#if NPY_SIMD + +#if NPY_HWY // Handle array-array case if (IS_BLOCKABLE_BINARY(sizeof(T), NPY_SIMD_WIDTH)) { @@ -343,18 +351,19 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, return; } } -#endif // NPY_SIMD +#endif // NPY_HWY + // Scalar fallback // Fallback for non-blockable, in-place, or zero divisor cases BINARY_LOOP { const T dividend = *reinterpret_cast(ip1); const T divisor = *reinterpret_cast(ip2); T* result = reinterpret_cast(op1); - if (HWY_UNLIKELY(divisor == 0)) { + if (NPY_UNLIKELY(divisor == 0)) { npy_set_floatstatus_divbyzero(); *result = 0; - } else if (HWY_UNLIKELY(dividend == std::numeric_limits::min() && divisor == -1)) { + } else if (NPY_UNLIKELY(dividend == std::numeric_limits::min() && divisor == -1)) { npy_set_floatstatus_overflow(); *result = std::numeric_limits::min(); } else { @@ -370,7 +379,7 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons if (IS_BINARY_REDUCE) { BINARY_REDUCE_LOOP(T) { const T d = *reinterpret_cast(ip2); - if (HWY_UNLIKELY(d == 0)) { + if (NPY_UNLIKELY(d == 0)) { npy_set_floatstatus_divbyzero(); io1 = 0; } else { @@ -380,7 +389,7 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons *reinterpret_cast(iop1) = io1; return; } -#if NPY_SIMD +#if NPY_HWY // Handle array-array case if (IS_BLOCKABLE_BINARY(sizeof(T), NPY_SIMD_WIDTH)) { bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]) && @@ -406,13 +415,13 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons return; } } -#endif // NPY_SIMD +#endif // NPY_HWY // Fallback for non-blockable, in-place, or zero divisor cases BINARY_LOOP { const T in1 = *reinterpret_cast(ip1); const T in2 = *reinterpret_cast(ip2); - if (HWY_UNLIKELY(in2 
== 0)) { + if (NPY_UNLIKELY(in2 == 0)) { npy_set_floatstatus_divbyzero(); *reinterpret_cast(op1) = 0; } else { @@ -465,7 +474,7 @@ int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context), T* indexed = (T*)(ip1 + is1 * indx); T divisor = *(T*)value; - if (HWY_UNLIKELY(divisor == 0)) { + if (NPY_UNLIKELY(divisor == 0)) { npy_set_floatstatus_divbyzero(); *indexed = 0; } else { @@ -524,6 +533,3 @@ int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context), #undef DEFINE_DIVIDE_FUNCTION #undef DEFINE_DIVIDE_FUNCTION_UNSIGNED - -} // namespace HWY_NAMESPACE -HWY_AFTER_NAMESPACE(); From fce87004a0dc93ccadaa030b7b9c103ff133b655 Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Mon, 18 Aug 2025 16:27:34 +0530 Subject: [PATCH 11/16] remove baseline HWY dispatch --- numpy/_core/src/umath/loops_arithmetic.dispatch.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 849d0733a988..1ca6172418c1 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -3,9 +3,6 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #include "npy_cpu_dispatch.h" -#if NPY__CPU_TARGET_INDEX == 0 -#define HWY_COMPILE_ONLY_SCALAR 1 -#endif #include "numpy/npy_common.h" #include "numpy/npy_math.h" From 896a08780a703b8591eefb0a6425fe2be368013f Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Mon, 18 Aug 2025 22:04:55 +0530 Subject: [PATCH 12/16] use XOR instead of Zero --- numpy/_core/src/umath/loops_arithmetic.dispatch.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 1ca6172418c1..1c0eff89c0ea 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -82,8 +82,8 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) else { // General case with floor division semantics const auto vec_scalar = Set(scalar); - const auto vec_zero = Zero(); const auto one = Set(static_cast(1)); + const auto vec_zero = Xor(one, one); size_t i = 0; for (; i + N <= static_cast(len); i += N) { @@ -157,8 +157,8 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { bool raise_overflow = false; bool raise_divbyzero = false; - const auto vec_zero = Zero(); const auto vec_one = Set(static_cast(1)); + const auto vec_zero = Xor(vec_one, vec_one); const auto vec_min_val = Set(static_cast(std::numeric_limits::min())); const auto vec_neg_one = Set(static_cast(-1)); @@ -237,8 +237,8 @@ void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { const size_t N = Lanes(T{}); bool raise_divbyzero = false; - const auto vec_zero = Zero(); const auto vec_one = Set(static_cast(1)); + const auto vec_zero = Xor(vec_one, vec_one); size_t i = 0; for (; i + N <= static_cast(len); i += N) { From 1efaa66bee3849bc875f7133ee21dbeac0e8e51b Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Tue, 19 Aug 2025 17:29:34 +0530 Subject: [PATCH 13/16] remove VSX4 --- numpy/_core/meson.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index 24163080589c..c8906679412b 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -936,7 +936,7 @@ foreach gen_mtargets : [ [ AVX512_SKX, AVX512F, AVX2, SSE41, SSE2, NEON, - 
VSX4, VSX2, + VSX2, VX, LSX, RVV, From 7a530b70d43c206ff3a985a6116a3a7b79bd9dcc Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Thu, 21 Aug 2025 15:50:50 +0530 Subject: [PATCH 14/16] fix comment --- .../src/umath/loops_arithmetic.dispatch.cpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 1c0eff89c0ea..6c0204cef01a 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -323,7 +323,7 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, #if NPY_HWY // Handle array-array case - if (IS_BLOCKABLE_BINARY(sizeof(T), NPY_SIMD_WIDTH)) + if (IS_BLOCKABLE_BINARY(sizeof(T), kMaxLanes)) { bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]) && nomemoverlap(args[2], steps[2], args[1], steps[1], dimensions[0]); @@ -336,7 +336,7 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, return; } } - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), kMaxLanes) && *reinterpret_cast(args[1]) != 0) { bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]); @@ -388,7 +388,7 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons } #if NPY_HWY // Handle array-array case - if (IS_BLOCKABLE_BINARY(sizeof(T), NPY_SIMD_WIDTH)) { + if (IS_BLOCKABLE_BINARY(sizeof(T), kMaxLanes)) { bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]) && nomemoverlap(args[2], steps[2], args[1], steps[1], dimensions[0]); // Check if we can use SIMD for contiguous arrays - all steps must equal to sizeof(T) @@ -400,7 +400,7 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons return; } } - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), kMaxLanes) && *reinterpret_cast(args[1]) != 0) { bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]); @@ -483,25 +483,21 @@ int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context), // Macro to define the dispatch functions for signed types #define DEFINE_DIVIDE_FUNCTION(TYPE, SCALAR_TYPE) \ - extern "C" { \ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(TYPE##_divide)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) { \ TYPE_divide(args, dimensions, steps, func); \ } \ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_divide_indexed)(PyArrayMethod_Context *context, char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *func) { \ return TYPE_divide_indexed(context, args, dimensions, steps, func); \ - } \ - } // extern "C" + } // Macro to define the dispatch functions for unsigned types #define DEFINE_DIVIDE_FUNCTION_UNSIGNED(TYPE, SCALAR_TYPE) \ - extern "C" { \ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(TYPE##_divide)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) { \ TYPE_divide_unsigned(args, dimensions, steps, func); \ } \ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_divide_indexed)(PyArrayMethod_Context *context, char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *func) { \ return TYPE_divide_unsigned_indexed(context, args, dimensions, steps, func); \ - } \ - } // extern "C" + } #ifdef 
NPY_CPU_DISPATCH_CURFX From c9f99b3999ea3884709144a1cf980940b4d71f9c Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Fri, 22 Aug 2025 16:47:21 +0530 Subject: [PATCH 15/16] change set --- .../_core/src/umath/loops_arithmetic.dispatch.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 6c0204cef01a..3118dd4ff599 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -57,7 +57,7 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) } } else if (scalar == static_cast(-1)) { - const auto vec_min_val = Set(static_cast(std::numeric_limits::min())); + const auto vec_min_val = Set(d, static_cast(std::numeric_limits::min())); size_t i = 0; for (; i + N <= static_cast(len); i += N) { const auto vec_src = LoadU(src + i); @@ -81,8 +81,8 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) } else { // General case with floor division semantics - const auto vec_scalar = Set(scalar); - const auto one = Set(static_cast(1)); + const auto vec_scalar = Set(d, scalar); + const auto one = Set(d, static_cast(1)); const auto vec_zero = Xor(one, one); size_t i = 0; @@ -157,10 +157,10 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { bool raise_overflow = false; bool raise_divbyzero = false; - const auto vec_one = Set(static_cast(1)); + const auto vec_one = Set(d, static_cast(1)); const auto vec_zero = Xor(vec_one, vec_one); - const auto vec_min_val = Set(static_cast(std::numeric_limits::min())); - const auto vec_neg_one = Set(static_cast(-1)); + const auto vec_min_val = Set(d, static_cast(std::numeric_limits::min())); + const auto vec_neg_one = Set(d, static_cast(-1)); size_t i = 0; for (; i + N <= static_cast(len); i += N) { @@ -237,7 +237,7 @@ void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { const size_t N = Lanes(T{}); bool raise_divbyzero = false; - const auto vec_one = Set(static_cast(1)); + const auto vec_one = Set(d, static_cast(1)); const auto vec_zero = Xor(vec_one, vec_one); size_t i = 0; From 1d5826f0d1b5833b7fd4e69fa26b4d04ad0f348b Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Fri, 22 Aug 2025 17:48:05 +0530 Subject: [PATCH 16/16] change CI config --- .github/workflows/linux_qemu.yml | 4 ++-- numpy/_core/meson.build | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/linux_qemu.yml b/.github/workflows/linux_qemu.yml index 53780ae097a3..197007845cb6 100644 --- a/.github/workflows/linux_qemu.yml +++ b/.github/workflows/linux_qemu.yml @@ -42,7 +42,7 @@ jobs: "ppc64le", "powerpc64le-linux-gnu", "ppc64le/ubuntu:22.04", - "-Dallow-noblas=true", + "-Dallow-noblas=true -Dcpu-dispatch=vsx,vsx2,vsx3", "test_kind or test_multiarray or test_simd or test_umath or test_ufunc", "ppc64le" ] @@ -50,7 +50,7 @@ jobs: "ppc64le - baseline(Power9)", "powerpc64le-linux-gnu", "ppc64le/ubuntu:22.04", - "-Dallow-noblas=true -Dcpu-baseline=vsx3", + "-Dallow-noblas=true -Dcpu-baseline=vsx3 -Dcpu-dispatch=vsx,vsx2,vsx3", "test_kind or test_multiarray or test_simd or test_umath or test_ufunc", "ppc64le" ] diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index c8906679412b..24163080589c 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -936,7 +936,7 @@ foreach gen_mtargets : [ [ AVX512_SKX, AVX512F, AVX2, SSE41, SSE2, NEON, - 
VSX2, + VSX4, VSX2, VX, LSX, RVV,