diff --git a/.github/workflows/linux_qemu.yml b/.github/workflows/linux_qemu.yml index 53780ae097a3..197007845cb6 100644 --- a/.github/workflows/linux_qemu.yml +++ b/.github/workflows/linux_qemu.yml @@ -42,7 +42,7 @@ jobs: "ppc64le", "powerpc64le-linux-gnu", "ppc64le/ubuntu:22.04", - "-Dallow-noblas=true", + "-Dallow-noblas=true -Dcpu-dispatch=vsx,vsx2,vsx3", "test_kind or test_multiarray or test_simd or test_umath or test_ufunc", "ppc64le" ] @@ -50,7 +50,7 @@ jobs: "ppc64le - baseline(Power9)", "powerpc64le-linux-gnu", "ppc64le/ubuntu:22.04", - "-Dallow-noblas=true -Dcpu-baseline=vsx3", + "-Dallow-noblas=true -Dcpu-baseline=vsx3 -Dcpu-dispatch=vsx,vsx2,vsx3", "test_kind or test_multiarray or test_simd or test_umath or test_ufunc", "ppc64le" ] diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index b4c769810ad8..24163080589c 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -932,13 +932,14 @@ foreach gen_mtargets : [ ], [ 'loops_arithmetic.dispatch.h', - src_file.process('src/umath/loops_arithmetic.dispatch.c.src'), + 'src/umath/loops_arithmetic.dispatch.cpp', [ AVX512_SKX, AVX512F, AVX2, SSE41, SSE2, NEON, VSX4, VSX2, VX, LSX, + RVV, ] ], [ diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src deleted file mode 100644 index c9efe5579e71..000000000000 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src +++ /dev/null @@ -1,521 +0,0 @@ -#define _UMATHMODULE -#define _MULTIARRAYMODULE -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#include "simd/simd.h" -#include "loops_utils.h" -#include "loops.h" -#include "lowlevel_strided_loops.h" -// Provides the various *_LOOP macros -#include "fast_loop_macros.h" - -//############################################################################### -//## Division -//############################################################################### -/******************************************************************************** - ** Defining the SIMD kernels - * - * Floor division of signed is based on T. Granlund and P. L. Montgomery - * "Division by invariant integers using multiplication(see [Figure 6.1] - * https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556)" - * For details on TRUNC division see simd/intdiv.h for more clarification - *********************************************************************************** - ** Figure 6.1: Signed division by run-time invariant divisor, rounded towards -INF - *********************************************************************************** - * For q = FLOOR(a/d), all sword: - * sword -dsign = SRL(d, N - 1); - * uword -nsign = (n < -dsign); - * uword -qsign = EOR(-nsign, -dsign); - * q = TRUNC((n - (-dsign ) + (-nsign))/d) - (-qsign); - ********************************************************************************/ - -#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX) - // Due to integer 128-bit multiplication emulation, SIMD 64-bit division - // may not perform well on both neon and up to VSX3 compared to scalar - // division. 
- #define SIMD_DISABLE_DIV64_OPT -#endif - -#if NPY_SIMD -/**begin repeat - * Signed types - * #sfx = s8, s16, s32, s64# - * #len = 8, 16, 32, 64# - */ -#if @len@ < 64 || (@len@ == 64 && !defined(SIMD_DISABLE_DIV64_OPT)) -static inline void -simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2]; - const int vstep = npyv_nlanes_@sfx@; - const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); - - if (scalar == -1) { - npyv_b@len@ noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1)); - const npyv_@sfx@ vzero = npyv_zero_@sfx@(); - const npyv_@sfx@ vmin = npyv_setall_@sfx@(NPY_MIN_INT@len@); - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_@sfx@ a = npyv_load_@sfx@(src); - npyv_b@len@ gt_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@)); - noverflow = npyv_and_b@len@(noverflow, gt_min); - npyv_@sfx@ neg = npyv_ifsub_@sfx@(gt_min, vzero, a, vmin); - npyv_store_@sfx@(dst, neg); - } - - int raise_err = npyv_tobits_b@len@(npyv_not_b@len@(noverflow)) != 0; - for (; len > 0; --len, ++src, ++dst) { - npyv_lanetype_@sfx@ a = *src; - if (a == NPY_MIN_INT@len@) { - raise_err = 1; - *dst = NPY_MIN_INT@len@; - } else { - *dst = -a; - } - } - if (raise_err) { - npy_set_floatstatus_overflow(); - } - } else { - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_@sfx@ nsign_d = npyv_setall_@sfx@(scalar < 0); - npyv_@sfx@ a = npyv_load_@sfx@(src); - npyv_@sfx@ nsign_a = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d)); - nsign_a = npyv_and_@sfx@(nsign_a, npyv_setall_@sfx@(1)); - npyv_@sfx@ diff_sign = npyv_sub_@sfx@(nsign_a, nsign_d); - npyv_@sfx@ to_ninf = npyv_xor_@sfx@(nsign_a, nsign_d); - npyv_@sfx@ trunc = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor); - npyv_@sfx@ floor = npyv_sub_@sfx@(trunc, to_ninf); - npyv_store_@sfx@(dst, floor); - } - - for (; len > 0; --len, ++src, ++dst) { - const npyv_lanetype_@sfx@ a = *src; - npyv_lanetype_@sfx@ r = a / scalar; - // Negative quotients needs to be rounded down - if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) { - r--; - } - *dst = r; - } - } - npyv_cleanup(); -} -#endif -/**end repeat**/ - -/**begin repeat - * Unsigned types - * #sfx = u8, u16, u32, u64# - * #len = 8, 16, 32, 64# - */ -#if @len@ < 64 || (@len@ == 64 && !defined(SIMD_DISABLE_DIV64_OPT)) -static inline void -simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2]; - const int vstep = npyv_nlanes_@sfx@; - const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); - - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_@sfx@ a = npyv_load_@sfx@(src); - npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor); - npyv_store_@sfx@(dst, c); - } - - for (; len > 0; --len, ++src, ++dst) { - const npyv_lanetype_@sfx@ a = *src; - *dst = a / scalar; - } - npyv_cleanup(); -} -#endif -/**end repeat**/ - -#if defined(NPY_HAVE_VSX4) - -/**begin repeat - * #t = u, s# - * #signed = 0, 1# - */ -/* - * Computes division of 2 8-bit signed/unsigned integer vectors - * - * As Power10 only supports integer vector division for data of 32 bits or - * greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer - * vector division 
instruction, and then, convert the result back to npyv_u8. - */ -NPY_FINLINE npyv_@t@8 -vsx4_div_@t@8(npyv_@t@8 a, npyv_@t@8 b) -{ -#if @signed@ - npyv_s16x2 ta, tb; - npyv_s32x2 ahi, alo, bhi, blo; - ta.val[0] = vec_unpackh(a); - ta.val[1] = vec_unpackl(a); - tb.val[0] = vec_unpackh(b); - tb.val[1] = vec_unpackl(b); - ahi.val[0] = vec_unpackh(ta.val[0]); - ahi.val[1] = vec_unpackl(ta.val[0]); - alo.val[0] = vec_unpackh(ta.val[1]); - alo.val[1] = vec_unpackl(ta.val[1]); - bhi.val[0] = vec_unpackh(tb.val[0]); - bhi.val[1] = vec_unpackl(tb.val[0]); - blo.val[0] = vec_unpackh(tb.val[1]); - blo.val[1] = vec_unpackl(tb.val[1]); -#else - npyv_u16x2 a_expand = npyv_expand_u16_u8(a); - npyv_u16x2 b_expand = npyv_expand_u16_u8(b); - npyv_u32x2 ahi = npyv_expand_u32_u16(a_expand.val[0]); - npyv_u32x2 alo = npyv_expand_u32_u16(a_expand.val[1]); - npyv_u32x2 bhi = npyv_expand_u32_u16(b_expand.val[0]); - npyv_u32x2 blo = npyv_expand_u32_u16(b_expand.val[1]); -#endif - npyv_@t@32 v1 = vec_div(ahi.val[0], bhi.val[0]); - npyv_@t@32 v2 = vec_div(ahi.val[1], bhi.val[1]); - npyv_@t@32 v3 = vec_div(alo.val[0], blo.val[0]); - npyv_@t@32 v4 = vec_div(alo.val[1], blo.val[1]); - npyv_@t@16 hi = vec_pack(v1, v2); - npyv_@t@16 lo = vec_pack(v3, v4); - return vec_pack(hi, lo); -} - -NPY_FINLINE npyv_@t@16 -vsx4_div_@t@16(npyv_@t@16 a, npyv_@t@16 b) -{ -#if @signed@ - npyv_s32x2 a_expand; - npyv_s32x2 b_expand; - a_expand.val[0] = vec_unpackh(a); - a_expand.val[1] = vec_unpackl(a); - b_expand.val[0] = vec_unpackh(b); - b_expand.val[1] = vec_unpackl(b); -#else - npyv_u32x2 a_expand = npyv_expand_@t@32_@t@16(a); - npyv_u32x2 b_expand = npyv_expand_@t@32_@t@16(b); -#endif - npyv_@t@32 v1 = vec_div(a_expand.val[0], b_expand.val[0]); - npyv_@t@32 v2 = vec_div(a_expand.val[1], b_expand.val[1]); - return vec_pack(v1, v2); -} - -#define vsx4_div_@t@32 vec_div -#define vsx4_div_@t@64 vec_div -/**end repeat**/ - -/**begin repeat - * Unsigned types - * #sfx = u8, u16, u32, u64# - * #len = 8, 16, 32, 64# - */ -static inline void -vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_@sfx@ *dst1 = (npyv_lanetype_@sfx@ *) args[2]; - const npyv_@sfx@ vzero = npyv_zero_@sfx@(); - const int vstep = npyv_nlanes_@sfx@; - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_@sfx@ a = npyv_load_@sfx@(src1); - npyv_@sfx@ b = npyv_load_@sfx@(src2); - npyv_@sfx@ c = vsx4_div_@sfx@(a, b); - npyv_store_@sfx@(dst1, c); - if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { - npy_set_floatstatus_divbyzero(); - } - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_@sfx@ a = *src1; - const npyv_lanetype_@sfx@ b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else{ - *dst1 = a / b; - } - } - npyv_cleanup(); -} -/**end repeat**/ - -/**begin repeat - * Signed types - * #sfx = s8, s16, s32, s64# - * #len = 8, 16, 32, 64# - */ -static inline void -vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_@sfx@ *dst1 = (npyv_lanetype_@sfx@ *) args[2]; - const npyv_@sfx@ vneg_one = npyv_setall_@sfx@(-1); - const npyv_@sfx@ vzero = npyv_zero_@sfx@(); - const npyv_@sfx@ vmin = npyv_setall_@sfx@(NPY_MIN_INT@len@); - npyv_b@len@ warn_zero = 
npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@()); - npyv_b@len@ warn_overflow = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@()); - const int vstep = npyv_nlanes_@sfx@; - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_@sfx@ a = npyv_load_@sfx@(src1); - npyv_@sfx@ b = npyv_load_@sfx@(src2); - npyv_@sfx@ quo = vsx4_div_@sfx@(a, b); - npyv_@sfx@ rem = npyv_sub_@sfx@(a, vec_mul(b, quo)); - // (b == 0 || (a == NPY_MIN_INT@len@ && b == -1)) - npyv_b@len@ bzero = npyv_cmpeq_@sfx@(b, vzero); - npyv_b@len@ amin = npyv_cmpeq_@sfx@(a, vmin); - npyv_b@len@ bneg_one = npyv_cmpeq_@sfx@(b, vneg_one); - npyv_b@len@ overflow = npyv_and_@sfx@(bneg_one, amin); - warn_zero = npyv_or_@sfx@(bzero, warn_zero); - warn_overflow = npyv_or_@sfx@(overflow, warn_overflow); - // handle mixed case the way Python does - // ((a > 0) == (b > 0) || rem == 0) - npyv_b@len@ a_gt_zero = npyv_cmpgt_@sfx@(a, vzero); - npyv_b@len@ b_gt_zero = npyv_cmpgt_@sfx@(b, vzero); - npyv_b@len@ ab_eq_cond = npyv_cmpeq_@sfx@(a_gt_zero, b_gt_zero); - npyv_b@len@ rem_zero = npyv_cmpeq_@sfx@(rem, vzero); - npyv_b@len@ or = npyv_or_@sfx@(ab_eq_cond, rem_zero); - npyv_@sfx@ to_sub = npyv_select_@sfx@(or, vzero, vneg_one); - quo = npyv_add_@sfx@(quo, to_sub); - // Divide by zero - quo = npyv_select_@sfx@(bzero, vzero, quo); - // Overflow - quo = npyv_select_@sfx@(overflow, vmin, quo); - npyv_store_@sfx@(dst1, quo); - } - - if (!vec_all_eq(warn_zero, vzero)) { - npy_set_floatstatus_divbyzero(); - } - if (!vec_all_eq(warn_overflow, vzero)) { - npy_set_floatstatus_overflow(); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_@sfx@ a = *src1; - const npyv_lanetype_@sfx@ b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else if (NPY_UNLIKELY((a == NPY_MIN_INT@len@) && (b == -1))) { - npy_set_floatstatus_overflow(); - *dst1 = NPY_MIN_INT@len@; - } else { - *dst1 = a / b; - if (((a > 0) != (b > 0)) && ((*dst1 * b) != a)) { - *dst1 -= 1; - } - } - } - npyv_cleanup(); -} -/**end repeat**/ -#endif // NPY_HAVE_VSX4 -#endif // NPY_SIMD - -/******************************************************************************** - ** Defining ufunc inner functions - ********************************************************************************/ - -/**begin repeat - * Signed types - * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong# - * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# - */ -#undef TO_SIMD_SFX -#if 0 -/**begin repeat1 - * #len = 8, 16, 32, 64# - */ -#elif NPY_BITSOF_@TYPE@ == @len@ - #define TO_SIMD_SFX(X) X##_s@len@ -/**end repeat1**/ -#endif -#if NPY_BITSOF_@TYPE@ == 64 && defined(SIMD_DISABLE_DIV64_OPT) - #undef TO_SIMD_SFX -#endif - -NPY_FINLINE @type@ floor_div_@TYPE@(const @type@ n, const @type@ d) -{ - /* - * FIXME: On x86 at least, dividing the smallest representable integer - * by -1 causes a SIFGPE (division overflow). We treat this case here - * (to avoid a SIGFPE crash at python level), but a good solution would - * be to treat integer division problems separately from FPU exceptions - * (i.e. a different approach than npy_set_floatstatus_divbyzero()). 
- */ - if (NPY_UNLIKELY(d == 0 || (n == NPY_MIN_@TYPE@ && d == -1))) { - if (d == 0) { - npy_set_floatstatus_divbyzero(); - return 0; - } - else { - npy_set_floatstatus_overflow(); - return NPY_MIN_@TYPE@; - } - } - @type@ r = n / d; - // Negative quotients needs to be rounded down - if (((n > 0) != (d > 0)) && ((r * d) != n)) { - r--; - } - return r; -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - io1 = floor_div_@TYPE@(io1, *(@type@*)ip2); - } - *((@type@ *)iop1) = io1; - } -#if NPY_SIMD && defined(TO_SIMD_SFX) -#if defined(NPY_HAVE_VSX4) - // both arguments are arrays of the same size - else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]); - } -#endif - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) && - (*(@type@ *)args[1]) != 0) { - TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]); - } -#endif - else { - BINARY_LOOP { - *((@type@ *)op1) = floor_div_@TYPE@(*(@type@*)ip1, *(@type@*)ip2); - } - } -} - -NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) -(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) -{ - char *ip1 = args[0]; - char *indxp = args[1]; - char *value = args[2]; - npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; - npy_intp shape = steps[3]; - npy_intp n = dimensions[0]; - npy_intp i; - @type@ *indexed; - for(i = 0; i < n; i++, indxp += isindex, value += isb) { - npy_intp indx = *(npy_intp *)indxp; - if (indx < 0) { - indx += shape; - } - indexed = (@type@ *)(ip1 + is1 * indx); - *indexed = floor_div_@TYPE@(*indexed, *(@type@ *)value); - } - return 0; -} - -/**end repeat**/ - -/**begin repeat - * Unsigned types - * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# - * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# - * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG# - */ -#undef TO_SIMD_SFX -#if 0 -/**begin repeat1 - * #len = 8, 16, 32, 64# - */ -#elif NPY_BITSOF_@STYPE@ == @len@ - #define TO_SIMD_SFX(X) X##_u@len@ -/**end repeat1**/ -#endif -/* - * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV fall-backs to the scalar division - * because emulating multiply-high on these architectures is going to be expensive comparing - * to the native scalar dividers. - * Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles. - * Power10(VSX4) is an exception here since it has native support for integer vector division. 
- */ -#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX)) - #undef TO_SIMD_SFX -#endif -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ d = *(@type@ *)ip2; - if (NPY_UNLIKELY(d == 0)) { - npy_set_floatstatus_divbyzero(); - io1 = 0; - } else { - io1 /= d; - } - } - *((@type@ *)iop1) = io1; - } -#if NPY_SIMD && defined(TO_SIMD_SFX) -#if defined(NPY_HAVE_VSX4) - // both arguments are arrays of the same size - else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]); - } -#endif - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) && - (*(@type@ *)args[1]) != 0) { - TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]); - } -#endif - else { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((@type@ *)op1) = 0; - } else{ - *((@type@ *)op1) = in1 / in2; - } - } - } -} - -NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) -(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) -{ - char *ip1 = args[0]; - char *indxp = args[1]; - char *value = args[2]; - npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; - npy_intp shape = steps[3]; - npy_intp n = dimensions[0]; - npy_intp i; - @type@ *indexed; - for(i = 0; i < n; i++, indxp += isindex, value += isb) { - npy_intp indx = *(npy_intp *)indxp; - if (indx < 0) { - indx += shape; - } - indexed = (@type@ *)(ip1 + is1 * indx); - @type@ in2 = *(@type@ *)value; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *indexed = 0; - } else { - *indexed = *indexed / in2; - } - } - return 0; -} - -/**end repeat**/ diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp new file mode 100644 index 000000000000..3118dd4ff599 --- /dev/null +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -0,0 +1,528 @@ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "npy_cpu_dispatch.h" + +#include "numpy/npy_common.h" +#include "numpy/npy_math.h" + +#include "loops_utils.h" +#include "loops.h" +#include "fast_loop_macros.h" +#include "simd/simd.hpp" +#include "lowlevel_strided_loops.h" +#include "common.hpp" + +#include // for memcpy +#include +#include + +using namespace np::simd; +#if NPY_HWY +namespace hn = np::simd::hn; +#endif + +// Helper function to set float status +inline void set_float_status(bool overflow, bool divbyzero) { + if (overflow) { + npy_set_floatstatus_overflow(); + } + if (divbyzero) { + npy_set_floatstatus_divbyzero(); + } +} +#if NPY_HWY + +// Signed integer DIVIDE by scalar + +template +void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) { + using D = _Tag; + const D d; + const size_t N = Lanes(T{}); + + bool raise_overflow = false; + bool raise_divbyzero = false; + + if (scalar == 0) { + // Handle division by zero + std::fill(dst, dst + len, static_cast(0)); + raise_divbyzero = true; + } + else if (scalar == 1) { + // Special case for division by 1 + 
if (src != dst) { + std::memcpy(dst, src, len * sizeof(T)); + } + } + else if (scalar == static_cast(-1)) { + const auto vec_min_val = Set(d, static_cast(std::numeric_limits::min())); + size_t i = 0; + for (; i + N <= static_cast(len); i += N) { + const auto vec_src = LoadU(src + i); + const auto is_min_val = Eq(vec_src, vec_min_val); + const auto vec_res = hn::IfThenElse(is_min_val, vec_min_val, hn::Neg(vec_src)); + StoreU(vec_res, dst + i); + if (!raise_overflow && !hn::AllFalse(d, is_min_val)) { + raise_overflow = true; + } + } + // Handle remaining elements + for (; i < static_cast(len); i++) { + T val = src[i]; + if (val == std::numeric_limits::min()) { + dst[i] = std::numeric_limits::min(); + raise_overflow = true; + } else { + dst[i] = -val; + } + } + } + else { + // General case with floor division semantics + const auto vec_scalar = Set(d, scalar); + const auto one = Set(d, static_cast(1)); + const auto vec_zero = Xor(one, one); + size_t i = 0; + + for (; i + N <= static_cast(len); i += N) { + const auto vec_src = LoadU(src + i); + auto vec_div = Div(vec_src, vec_scalar); + const auto vec_mul = Mul(vec_div, vec_scalar); + const auto eq_mask = Eq(vec_src, vec_mul); + const auto diff_signs = Lt(Xor(vec_src, vec_scalar), vec_zero); + const auto adjust = AndNot(eq_mask, diff_signs); + + vec_div = hn::MaskedSubOr(vec_div, adjust, vec_div, one); + StoreU(vec_div, dst + i); + } + + // Handle remaining elements with scalar code + for (; i < static_cast(len); i++) { + T n = src[i]; + T r = n / scalar; + if (((n > 0) != (scalar > 0)) && ((r * scalar) != n)) { + --r; + } + dst[i] = r; + } + } + set_float_status(raise_overflow, raise_divbyzero); +} + +// Unsigned integer DIVIDE by scalar + +template +void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp len) { + + const size_t N = Lanes(T{}); + bool raise_divbyzero = false; + + if (scalar == 0) { + // Handle division by zero + std::fill(dst, dst + len, static_cast(0)); + raise_divbyzero = true; + } + else if (scalar == 1) { + // Special case for division by 1 + if (src != dst) { + std::memcpy(dst, src, len * sizeof(T)); + } + } + else { + const auto vec_scalar = Set(scalar); + size_t i = 0; + for (; i + N <= static_cast(len); i += N) { + const auto vec_src = LoadU(src + i); + const auto vec_res = Div(vec_src, vec_scalar); + StoreU(vec_res, dst + i); + } + // Handle remaining elements + for (; i < static_cast(len); i++) { + dst[i] = src[i] / scalar; + } + } + + set_float_status(false, raise_divbyzero); +} + +// Signed integer DIVIDE array / array + +template +void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { + using D = _Tag; + const D d; + const size_t N = Lanes(T{}); + + bool raise_overflow = false; + bool raise_divbyzero = false; + const auto vec_one = Set(d, static_cast(1)); + const auto vec_zero = Xor(vec_one, vec_one); + const auto vec_min_val = Set(d, static_cast(std::numeric_limits::min())); + const auto vec_neg_one = Set(d, static_cast(-1)); + + size_t i = 0; + for (; i + N <= static_cast(len); i += N) { + const auto vec_a = LoadU(src1 + i); + const auto vec_b = LoadU(src2 + i); + + const auto b_is_zero = Eq(vec_b, vec_zero); + const auto a_is_min = Eq(vec_a, vec_min_val); + const auto b_is_neg_one = Eq(vec_b, vec_neg_one); + const auto overflow_cond = And(a_is_min, b_is_neg_one); + + const auto safe_div_mask = hn::Not(Or(b_is_zero, overflow_cond)); + const auto safe_b = hn::IfThenElse(Or(b_is_zero, overflow_cond), vec_one, vec_b); + + auto vec_div = Div(vec_a, safe_b); + + if 
(!hn::AllFalse(d, safe_div_mask)) { + const auto vec_mul = Mul(vec_div, safe_b); + const auto has_remainder = hn::Ne(vec_a, vec_mul); + const auto a_sign = Lt(vec_a, vec_zero); + const auto b_sign = Lt(vec_b, vec_zero); + const auto different_signs = Xor(a_sign, b_sign); + const auto needs_adjustment = And(safe_div_mask, + And(has_remainder, different_signs)); + + vec_div = hn::MaskedSubOr(vec_div, needs_adjustment, vec_div, vec_one); + } + + vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); + vec_div = hn::IfThenElse(overflow_cond, vec_min_val, vec_div); + + StoreU(vec_div, dst + i); + + if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) { + raise_divbyzero = true; + } + if (!raise_overflow && !hn::AllFalse(d, overflow_cond)) { + raise_overflow = true; + } + } + + // Handle remaining elements + for (; i < static_cast(len); i++) { + T a = src1[i]; + T b = src2[i]; + + if (b == 0) { + dst[i] = 0; + raise_divbyzero = true; + } + else if (a == std::numeric_limits::min() && b == -1) { + dst[i] = std::numeric_limits::min(); + raise_overflow = true; + } + else { + T r = a / b; + if (((a > 0) != (b > 0)) && ((r * b) != a)) { + --r; + } + dst[i] = r; + } + } + + npy_clear_floatstatus(); + set_float_status(raise_overflow, raise_divbyzero); +} + +// Unsigned integer DIVIDE array / array + +template +void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { + using D = _Tag; + const D d; + const size_t N = Lanes(T{}); + + bool raise_divbyzero = false; + const auto vec_one = Set(d, static_cast(1)); + const auto vec_zero = Xor(vec_one, vec_one); + + size_t i = 0; + for (; i + N <= static_cast(len); i += N) { + const auto vec_a = LoadU(src1 + i); + const auto vec_b = LoadU(src2 + i); + + const auto b_is_zero = Eq(vec_b, vec_zero); + + const auto safe_b = hn::IfThenElse(b_is_zero, vec_one, vec_b); + + auto vec_div = Div(vec_a, safe_b); + + vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); + + StoreU(vec_div, dst + i); + + if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) { + raise_divbyzero = true; + } + } + + // Handle remaining elements + for (; i < static_cast(len); i++) { + T a = src1[i]; + T b = src2[i]; + + if (b == 0) { + dst[i] = 0; + raise_divbyzero = true; + } else { + dst[i] = a / b; + } + } + + npy_clear_floatstatus(); + set_float_status(false, raise_divbyzero); +} + +#endif // NPY_HWY + +// Floor division for signed integers +template +T floor_div(T n, T d) { + if (NPY_UNLIKELY(d == 0 || (n == std::numeric_limits::min() && d == -1))) { + if (d == 0) { + npy_set_floatstatus_divbyzero(); + return 0; + } + else { + npy_set_floatstatus_overflow(); + return std::numeric_limits::min(); + } + } + T r = n / d; + if (((n > 0) != (d > 0)) && ((r * d) != n)) { + --r; + } + return r; +} + + +// Dispatch functions for signed integer division +template +void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { + npy_clear_floatstatus(); + if (IS_BINARY_REDUCE) { + BINARY_REDUCE_LOOP(T) { + const T divisor = *reinterpret_cast(ip2); + if (NPY_UNLIKELY(divisor == 0)) { + npy_set_floatstatus_divbyzero(); + io1 = 0; + } else if (NPY_UNLIKELY(io1 == std::numeric_limits::min() && divisor == -1)) { + npy_set_floatstatus_overflow(); + io1 = std::numeric_limits::min(); + } else { + io1 = floor_div(io1, divisor); + } + } + *reinterpret_cast(iop1) = io1; + return; + } + +#if NPY_HWY + // Handle array-array case + if (IS_BLOCKABLE_BINARY(sizeof(T), kMaxLanes)) + { + bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], 
dimensions[0]) && + nomemoverlap(args[2], steps[2], args[1], steps[1], dimensions[0]); + // Check if we can use SIMD for contiguous arrays - all steps must equal to sizeof(T) + if (steps[0] == sizeof(T) && steps[1] == sizeof(T) && steps[2] == sizeof(T) && no_overlap) { + T* src1 = (T*)args[0]; + T* src2 = (T*)args[1]; + T* dst = (T*)args[2]; + simd_divide_contig_signed(src1, src2, dst, dimensions[0]); + return; + } + } + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), kMaxLanes) && + *reinterpret_cast(args[1]) != 0) + { + bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]); + if (no_overlap) { + T* src1 = reinterpret_cast(args[0]); + T* src2 = reinterpret_cast(args[1]); + T* dst = reinterpret_cast(args[2]); + simd_divide_by_scalar_contig_signed(src1, *src2, dst, dimensions[0]); + return; + } + } +#endif // NPY_HWY + + // Scalar fallback + // Fallback for non-blockable, in-place, or zero divisor cases + BINARY_LOOP { + const T dividend = *reinterpret_cast(ip1); + const T divisor = *reinterpret_cast(ip2); + T* result = reinterpret_cast(op1); + + if (NPY_UNLIKELY(divisor == 0)) { + npy_set_floatstatus_divbyzero(); + *result = 0; + } else if (NPY_UNLIKELY(dividend == std::numeric_limits::min() && divisor == -1)) { + npy_set_floatstatus_overflow(); + *result = std::numeric_limits::min(); + } else { + *result = floor_div(dividend, divisor); + } + } +} + +// Dispatch functions for unsigned integer division +template +void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { + npy_clear_floatstatus(); + if (IS_BINARY_REDUCE) { + BINARY_REDUCE_LOOP(T) { + const T d = *reinterpret_cast(ip2); + if (NPY_UNLIKELY(d == 0)) { + npy_set_floatstatus_divbyzero(); + io1 = 0; + } else { + io1 = io1 / d; + } + } + *reinterpret_cast(iop1) = io1; + return; + } +#if NPY_HWY + // Handle array-array case + if (IS_BLOCKABLE_BINARY(sizeof(T), kMaxLanes)) { + bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]) && + nomemoverlap(args[2], steps[2], args[1], steps[1], dimensions[0]); + // Check if we can use SIMD for contiguous arrays - all steps must equal to sizeof(T) + if (steps[0] == sizeof(T) && steps[1] == sizeof(T) && steps[2] == sizeof(T) && no_overlap) { + T* src1 = (T*)args[0]; + T* src2 = (T*)args[1]; + T* dst = (T*)args[2]; + simd_divide_contig_unsigned(src1, src2, dst, dimensions[0]); + return; + } + } + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), kMaxLanes) && + *reinterpret_cast(args[1]) != 0) + { + bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]); + if (no_overlap) { + T* src1 = reinterpret_cast(args[0]); + T* src2 = reinterpret_cast(args[1]); + T* dst = reinterpret_cast(args[2]); + simd_divide_by_scalar_contig_unsigned(src1, *src2, dst, dimensions[0]); + return; + } + } +#endif // NPY_HWY + + // Fallback for non-blockable, in-place, or zero divisor cases + BINARY_LOOP { + const T in1 = *reinterpret_cast(ip1); + const T in2 = *reinterpret_cast(ip2); + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *reinterpret_cast(op1) = 0; + } else { + *reinterpret_cast(op1) = in1 / in2; + } + } +} + +// Indexed division for signed integers +template +int TYPE_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char * const*args, npy_intp const *dimensions, + npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) { + char *ip1 = args[0]; + char *indxp = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], 
isb = steps[2]; + npy_intp shape = steps[3]; + npy_intp n = dimensions[0]; + + for(npy_intp i = 0; i < n; i++, indxp += isindex, value += isb) { + npy_intp indx = *(npy_intp *)indxp; + if (indx < 0) { + indx += shape; + } + T* indexed = (T*)(ip1 + is1 * indx); + T divisor = *(T*)value; + *indexed = floor_div(*indexed, divisor); + } + return 0; +} + +// Indexed division for unsigned integers +template +int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char * const*args, npy_intp const *dimensions, + npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) { + char *ip1 = args[0]; + char *indxp = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp shape = steps[3]; + npy_intp n = dimensions[0]; + + for(npy_intp i = 0; i < n; i++, indxp += isindex, value += isb) { + npy_intp indx = *(npy_intp *)indxp; + if (indx < 0) { + indx += shape; + } + T* indexed = (T*)(ip1 + is1 * indx); + T divisor = *(T*)value; + + if (NPY_UNLIKELY(divisor == 0)) { + npy_set_floatstatus_divbyzero(); + *indexed = 0; + } else { + *indexed = *indexed / divisor; + } + } + return 0; +} + +// Macro to define the dispatch functions for signed types +#define DEFINE_DIVIDE_FUNCTION(TYPE, SCALAR_TYPE) \ + NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(TYPE##_divide)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) { \ + TYPE_divide(args, dimensions, steps, func); \ + } \ + NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_divide_indexed)(PyArrayMethod_Context *context, char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *func) { \ + return TYPE_divide_indexed(context, args, dimensions, steps, func); \ + } + +// Macro to define the dispatch functions for unsigned types +#define DEFINE_DIVIDE_FUNCTION_UNSIGNED(TYPE, SCALAR_TYPE) \ + NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(TYPE##_divide)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) { \ + TYPE_divide_unsigned(args, dimensions, steps, func); \ + } \ + NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_divide_indexed)(PyArrayMethod_Context *context, char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *func) { \ + return TYPE_divide_unsigned_indexed(context, args, dimensions, steps, func); \ + } + + +#ifdef NPY_CPU_DISPATCH_CURFX + DEFINE_DIVIDE_FUNCTION(BYTE, int8_t) + DEFINE_DIVIDE_FUNCTION(SHORT, int16_t) + DEFINE_DIVIDE_FUNCTION(INT, int32_t) + #if NPY_SIZEOF_LONG == 4 + DEFINE_DIVIDE_FUNCTION(LONG, int32_t) + #elif NPY_SIZEOF_LONG == 8 + DEFINE_DIVIDE_FUNCTION(LONG, int64_t) + #endif + DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t) +#endif + +#ifdef NPY_CPU_DISPATCH_CURFX + DEFINE_DIVIDE_FUNCTION_UNSIGNED(UBYTE, uint8_t) + DEFINE_DIVIDE_FUNCTION_UNSIGNED(USHORT, uint16_t) + DEFINE_DIVIDE_FUNCTION_UNSIGNED(UINT, uint32_t) + #if NPY_SIZEOF_LONG == 4 + DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint32_t) + #elif NPY_SIZEOF_LONG == 8 + DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint64_t) + #endif + DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONGLONG, uint64_t) +#endif + +#undef DEFINE_DIVIDE_FUNCTION +#undef DEFINE_DIVIDE_FUNCTION_UNSIGNED
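
For reference, the rounding rule that both the deleted `.c.src` loops and the new `floor_div<T>` in loops_arithmetic.dispatch.cpp implement is truncating division followed by a downward adjustment when the operands have opposite signs and the division is inexact, with divide-by-zero and `INT_MIN / -1` handled as special cases. The following is a minimal standalone sketch of that semantics; `floor_div_sketch` and the boolean flags are illustrative stand-ins (the real code reports errors through `npy_set_floatstatus_divbyzero()` / `npy_set_floatstatus_overflow()`).

```cpp
#include <cassert>
#include <limits>

// Sketch of NumPy's integer floor-division semantics (mirrors floor_div<T>).
template <typename T>
T floor_div_sketch(T n, T d, bool *divbyzero, bool *overflow) {
    if (d == 0) {                         // divide by zero: result 0, flag set
        *divbyzero = true;
        return 0;
    }
    if (n == std::numeric_limits<T>::min() && d == T(-1)) {
        *overflow = true;                 // INT_MIN / -1 cannot be represented
        return std::numeric_limits<T>::min();
    }
    T r = n / d;                          // C++ '/' truncates toward zero
    if (((n > 0) != (d > 0)) && r * d != n) {
        --r;                              // round toward -inf instead
    }
    return r;
}

int main() {
    bool dz = false, ov = false;
    assert(floor_div_sketch(-7, 2, &dz, &ov) == -4);  // truncation would give -3
    assert(floor_div_sketch(7, -2, &dz, &ov) == -4);
    assert(floor_div_sketch(-8, 2, &dz, &ov) == -4);  // exact: no adjustment
    assert(floor_div_sketch(1, 0, &dz, &ov) == 0 && dz);
    return 0;
}
```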
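
The signed divide-by-scalar kernel in the new file is organized as a special-case ladder: divisor 0 fills the output with zeros and raises divide-by-zero, divisor 1 is a plain copy, divisor -1 is a negation guarded against `INT_MIN`, and every other divisor goes through floor division. The scalar sketch below shows that structure only; the real `simd_divide_by_scalar_contig_signed` vectorizes each branch with Highway and handles the non-multiple-of-lane tail separately. `divide_by_scalar_sketch` and its flag parameters are illustrative names, not NumPy API.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <limits>

// Scalar sketch of the special-case ladder used for signed "array / scalar".
template <typename T>
void divide_by_scalar_sketch(const T *src, T scalar, T *dst, std::size_t len,
                             bool *divbyzero, bool *overflow) {
    if (scalar == 0) {                       // zero divisor: all results are 0
        std::fill(dst, dst + len, T(0));
        *divbyzero = true;
    } else if (scalar == 1) {                // identity: straight copy
        if (src != dst) std::memcpy(dst, src, len * sizeof(T));
    } else if (scalar == T(-1)) {            // negation, guarding INT_MIN
        for (std::size_t i = 0; i < len; ++i) {
            if (src[i] == std::numeric_limits<T>::min()) {
                dst[i] = src[i];
                *overflow = true;
            } else {
                dst[i] = T(-src[i]);
            }
        }
    } else {                                 // general case: floor division
        for (std::size_t i = 0; i < len; ++i) {
            T r = src[i] / scalar;
            if (((src[i] > 0) != (scalar > 0)) && r * scalar != src[i]) --r;
            dst[i] = r;
        }
    }
}
```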
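
The new `TYPE_divide` dispatchers only enter the Highway path when the loop is blockable, all three strides equal `sizeof(T)`, and `nomemoverlap()` confirms the output buffer does not problematically overlap the inputs; everything else falls through to the scalar `BINARY_LOOP` using `floor_div`. The predicate below is a simplified sketch of that gating under the assumption of dense buffers of `n` elements; it omits the `IS_BLOCKABLE_BINARY*` alignment/size checks and replaces `nomemoverlap()` with a plain disjointness test, so treat it as illustrative only.

```cpp
#include <cstddef>

// Simplified sketch of the "can we take the SIMD path?" gating.
template <typename T>
bool can_use_simd_path(const char *in0, const char *in1, const char *out,
                       const std::ptrdiff_t steps[3], std::ptrdiff_t n) {
    const auto elem = static_cast<std::ptrdiff_t>(sizeof(T));
    const bool contiguous =
        steps[0] == elem && steps[1] == elem && steps[2] == elem;
    // Stand-in for nomemoverlap(): require the output range to be disjoint
    // from each input range when both are viewed as n contiguous elements.
    auto disjoint = [&](const char *in) {
        return in + n * elem <= out || out + n * elem <= in;
    };
    return contiguous && disjoint(in0) && disjoint(in1);
}
```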