From 7c45ae85c49b3b9572d598feae19b7e1fe01106e Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Mon, 19 Aug 2024 08:24:09 +0530 Subject: [PATCH 01/16] Convert arithmetic from C universal intrinsics to C++ using Highway --- numpy/_core/meson.build | 2 +- .../src/umath/loops_arithmetic.dispatch.c.src | 521 ------------------ .../src/umath/loops_arithmetic.dispatch.cpp | 369 +++++++++++++ 3 files changed, 370 insertions(+), 522 deletions(-) delete mode 100644 numpy/_core/src/umath/loops_arithmetic.dispatch.c.src create mode 100644 numpy/_core/src/umath/loops_arithmetic.dispatch.cpp diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index b4c769810ad8..13d0702dd97c 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -932,7 +932,7 @@ foreach gen_mtargets : [ ], [ 'loops_arithmetic.dispatch.h', - src_file.process('src/umath/loops_arithmetic.dispatch.c.src'), + 'src/umath/loops_arithmetic.dispatch.cpp', [ AVX512_SKX, AVX512F, AVX2, SSE41, SSE2, NEON, diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src deleted file mode 100644 index c9efe5579e71..000000000000 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src +++ /dev/null @@ -1,521 +0,0 @@ -#define _UMATHMODULE -#define _MULTIARRAYMODULE -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#include "simd/simd.h" -#include "loops_utils.h" -#include "loops.h" -#include "lowlevel_strided_loops.h" -// Provides the various *_LOOP macros -#include "fast_loop_macros.h" - -//############################################################################### -//## Division -//############################################################################### -/******************************************************************************** - ** Defining the SIMD kernels - * - * Floor division of signed is based on T. Granlund and P. L. Montgomery - * "Division by invariant integers using multiplication(see [Figure 6.1] - * https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556)" - * For details on TRUNC division see simd/intdiv.h for more clarification - *********************************************************************************** - ** Figure 6.1: Signed division by run-time invariant divisor, rounded towards -INF - *********************************************************************************** - * For q = FLOOR(a/d), all sword: - * sword -dsign = SRL(d, N - 1); - * uword -nsign = (n < -dsign); - * uword -qsign = EOR(-nsign, -dsign); - * q = TRUNC((n - (-dsign ) + (-nsign))/d) - (-qsign); - ********************************************************************************/ - -#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX) - // Due to integer 128-bit multiplication emulation, SIMD 64-bit division - // may not perform well on both neon and up to VSX3 compared to scalar - // division. 
- #define SIMD_DISABLE_DIV64_OPT -#endif - -#if NPY_SIMD -/**begin repeat - * Signed types - * #sfx = s8, s16, s32, s64# - * #len = 8, 16, 32, 64# - */ -#if @len@ < 64 || (@len@ == 64 && !defined(SIMD_DISABLE_DIV64_OPT)) -static inline void -simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2]; - const int vstep = npyv_nlanes_@sfx@; - const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); - - if (scalar == -1) { - npyv_b@len@ noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1)); - const npyv_@sfx@ vzero = npyv_zero_@sfx@(); - const npyv_@sfx@ vmin = npyv_setall_@sfx@(NPY_MIN_INT@len@); - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_@sfx@ a = npyv_load_@sfx@(src); - npyv_b@len@ gt_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@)); - noverflow = npyv_and_b@len@(noverflow, gt_min); - npyv_@sfx@ neg = npyv_ifsub_@sfx@(gt_min, vzero, a, vmin); - npyv_store_@sfx@(dst, neg); - } - - int raise_err = npyv_tobits_b@len@(npyv_not_b@len@(noverflow)) != 0; - for (; len > 0; --len, ++src, ++dst) { - npyv_lanetype_@sfx@ a = *src; - if (a == NPY_MIN_INT@len@) { - raise_err = 1; - *dst = NPY_MIN_INT@len@; - } else { - *dst = -a; - } - } - if (raise_err) { - npy_set_floatstatus_overflow(); - } - } else { - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_@sfx@ nsign_d = npyv_setall_@sfx@(scalar < 0); - npyv_@sfx@ a = npyv_load_@sfx@(src); - npyv_@sfx@ nsign_a = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d)); - nsign_a = npyv_and_@sfx@(nsign_a, npyv_setall_@sfx@(1)); - npyv_@sfx@ diff_sign = npyv_sub_@sfx@(nsign_a, nsign_d); - npyv_@sfx@ to_ninf = npyv_xor_@sfx@(nsign_a, nsign_d); - npyv_@sfx@ trunc = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor); - npyv_@sfx@ floor = npyv_sub_@sfx@(trunc, to_ninf); - npyv_store_@sfx@(dst, floor); - } - - for (; len > 0; --len, ++src, ++dst) { - const npyv_lanetype_@sfx@ a = *src; - npyv_lanetype_@sfx@ r = a / scalar; - // Negative quotients needs to be rounded down - if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) { - r--; - } - *dst = r; - } - } - npyv_cleanup(); -} -#endif -/**end repeat**/ - -/**begin repeat - * Unsigned types - * #sfx = u8, u16, u32, u64# - * #len = 8, 16, 32, 64# - */ -#if @len@ < 64 || (@len@ == 64 && !defined(SIMD_DISABLE_DIV64_OPT)) -static inline void -simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2]; - const int vstep = npyv_nlanes_@sfx@; - const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); - - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_@sfx@ a = npyv_load_@sfx@(src); - npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor); - npyv_store_@sfx@(dst, c); - } - - for (; len > 0; --len, ++src, ++dst) { - const npyv_lanetype_@sfx@ a = *src; - *dst = a / scalar; - } - npyv_cleanup(); -} -#endif -/**end repeat**/ - -#if defined(NPY_HAVE_VSX4) - -/**begin repeat - * #t = u, s# - * #signed = 0, 1# - */ -/* - * Computes division of 2 8-bit signed/unsigned integer vectors - * - * As Power10 only supports integer vector division for data of 32 bits or - * greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer - * vector division 
instruction, and then, convert the result back to npyv_u8. - */ -NPY_FINLINE npyv_@t@8 -vsx4_div_@t@8(npyv_@t@8 a, npyv_@t@8 b) -{ -#if @signed@ - npyv_s16x2 ta, tb; - npyv_s32x2 ahi, alo, bhi, blo; - ta.val[0] = vec_unpackh(a); - ta.val[1] = vec_unpackl(a); - tb.val[0] = vec_unpackh(b); - tb.val[1] = vec_unpackl(b); - ahi.val[0] = vec_unpackh(ta.val[0]); - ahi.val[1] = vec_unpackl(ta.val[0]); - alo.val[0] = vec_unpackh(ta.val[1]); - alo.val[1] = vec_unpackl(ta.val[1]); - bhi.val[0] = vec_unpackh(tb.val[0]); - bhi.val[1] = vec_unpackl(tb.val[0]); - blo.val[0] = vec_unpackh(tb.val[1]); - blo.val[1] = vec_unpackl(tb.val[1]); -#else - npyv_u16x2 a_expand = npyv_expand_u16_u8(a); - npyv_u16x2 b_expand = npyv_expand_u16_u8(b); - npyv_u32x2 ahi = npyv_expand_u32_u16(a_expand.val[0]); - npyv_u32x2 alo = npyv_expand_u32_u16(a_expand.val[1]); - npyv_u32x2 bhi = npyv_expand_u32_u16(b_expand.val[0]); - npyv_u32x2 blo = npyv_expand_u32_u16(b_expand.val[1]); -#endif - npyv_@t@32 v1 = vec_div(ahi.val[0], bhi.val[0]); - npyv_@t@32 v2 = vec_div(ahi.val[1], bhi.val[1]); - npyv_@t@32 v3 = vec_div(alo.val[0], blo.val[0]); - npyv_@t@32 v4 = vec_div(alo.val[1], blo.val[1]); - npyv_@t@16 hi = vec_pack(v1, v2); - npyv_@t@16 lo = vec_pack(v3, v4); - return vec_pack(hi, lo); -} - -NPY_FINLINE npyv_@t@16 -vsx4_div_@t@16(npyv_@t@16 a, npyv_@t@16 b) -{ -#if @signed@ - npyv_s32x2 a_expand; - npyv_s32x2 b_expand; - a_expand.val[0] = vec_unpackh(a); - a_expand.val[1] = vec_unpackl(a); - b_expand.val[0] = vec_unpackh(b); - b_expand.val[1] = vec_unpackl(b); -#else - npyv_u32x2 a_expand = npyv_expand_@t@32_@t@16(a); - npyv_u32x2 b_expand = npyv_expand_@t@32_@t@16(b); -#endif - npyv_@t@32 v1 = vec_div(a_expand.val[0], b_expand.val[0]); - npyv_@t@32 v2 = vec_div(a_expand.val[1], b_expand.val[1]); - return vec_pack(v1, v2); -} - -#define vsx4_div_@t@32 vec_div -#define vsx4_div_@t@64 vec_div -/**end repeat**/ - -/**begin repeat - * Unsigned types - * #sfx = u8, u16, u32, u64# - * #len = 8, 16, 32, 64# - */ -static inline void -vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_@sfx@ *dst1 = (npyv_lanetype_@sfx@ *) args[2]; - const npyv_@sfx@ vzero = npyv_zero_@sfx@(); - const int vstep = npyv_nlanes_@sfx@; - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_@sfx@ a = npyv_load_@sfx@(src1); - npyv_@sfx@ b = npyv_load_@sfx@(src2); - npyv_@sfx@ c = vsx4_div_@sfx@(a, b); - npyv_store_@sfx@(dst1, c); - if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { - npy_set_floatstatus_divbyzero(); - } - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_@sfx@ a = *src1; - const npyv_lanetype_@sfx@ b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else{ - *dst1 = a / b; - } - } - npyv_cleanup(); -} -/**end repeat**/ - -/**begin repeat - * Signed types - * #sfx = s8, s16, s32, s64# - * #len = 8, 16, 32, 64# - */ -static inline void -vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_@sfx@ *dst1 = (npyv_lanetype_@sfx@ *) args[2]; - const npyv_@sfx@ vneg_one = npyv_setall_@sfx@(-1); - const npyv_@sfx@ vzero = npyv_zero_@sfx@(); - const npyv_@sfx@ vmin = npyv_setall_@sfx@(NPY_MIN_INT@len@); - npyv_b@len@ warn_zero = 
npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@()); - npyv_b@len@ warn_overflow = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@()); - const int vstep = npyv_nlanes_@sfx@; - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_@sfx@ a = npyv_load_@sfx@(src1); - npyv_@sfx@ b = npyv_load_@sfx@(src2); - npyv_@sfx@ quo = vsx4_div_@sfx@(a, b); - npyv_@sfx@ rem = npyv_sub_@sfx@(a, vec_mul(b, quo)); - // (b == 0 || (a == NPY_MIN_INT@len@ && b == -1)) - npyv_b@len@ bzero = npyv_cmpeq_@sfx@(b, vzero); - npyv_b@len@ amin = npyv_cmpeq_@sfx@(a, vmin); - npyv_b@len@ bneg_one = npyv_cmpeq_@sfx@(b, vneg_one); - npyv_b@len@ overflow = npyv_and_@sfx@(bneg_one, amin); - warn_zero = npyv_or_@sfx@(bzero, warn_zero); - warn_overflow = npyv_or_@sfx@(overflow, warn_overflow); - // handle mixed case the way Python does - // ((a > 0) == (b > 0) || rem == 0) - npyv_b@len@ a_gt_zero = npyv_cmpgt_@sfx@(a, vzero); - npyv_b@len@ b_gt_zero = npyv_cmpgt_@sfx@(b, vzero); - npyv_b@len@ ab_eq_cond = npyv_cmpeq_@sfx@(a_gt_zero, b_gt_zero); - npyv_b@len@ rem_zero = npyv_cmpeq_@sfx@(rem, vzero); - npyv_b@len@ or = npyv_or_@sfx@(ab_eq_cond, rem_zero); - npyv_@sfx@ to_sub = npyv_select_@sfx@(or, vzero, vneg_one); - quo = npyv_add_@sfx@(quo, to_sub); - // Divide by zero - quo = npyv_select_@sfx@(bzero, vzero, quo); - // Overflow - quo = npyv_select_@sfx@(overflow, vmin, quo); - npyv_store_@sfx@(dst1, quo); - } - - if (!vec_all_eq(warn_zero, vzero)) { - npy_set_floatstatus_divbyzero(); - } - if (!vec_all_eq(warn_overflow, vzero)) { - npy_set_floatstatus_overflow(); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_@sfx@ a = *src1; - const npyv_lanetype_@sfx@ b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else if (NPY_UNLIKELY((a == NPY_MIN_INT@len@) && (b == -1))) { - npy_set_floatstatus_overflow(); - *dst1 = NPY_MIN_INT@len@; - } else { - *dst1 = a / b; - if (((a > 0) != (b > 0)) && ((*dst1 * b) != a)) { - *dst1 -= 1; - } - } - } - npyv_cleanup(); -} -/**end repeat**/ -#endif // NPY_HAVE_VSX4 -#endif // NPY_SIMD - -/******************************************************************************** - ** Defining ufunc inner functions - ********************************************************************************/ - -/**begin repeat - * Signed types - * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong# - * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# - */ -#undef TO_SIMD_SFX -#if 0 -/**begin repeat1 - * #len = 8, 16, 32, 64# - */ -#elif NPY_BITSOF_@TYPE@ == @len@ - #define TO_SIMD_SFX(X) X##_s@len@ -/**end repeat1**/ -#endif -#if NPY_BITSOF_@TYPE@ == 64 && defined(SIMD_DISABLE_DIV64_OPT) - #undef TO_SIMD_SFX -#endif - -NPY_FINLINE @type@ floor_div_@TYPE@(const @type@ n, const @type@ d) -{ - /* - * FIXME: On x86 at least, dividing the smallest representable integer - * by -1 causes a SIFGPE (division overflow). We treat this case here - * (to avoid a SIGFPE crash at python level), but a good solution would - * be to treat integer division problems separately from FPU exceptions - * (i.e. a different approach than npy_set_floatstatus_divbyzero()). 
- */ - if (NPY_UNLIKELY(d == 0 || (n == NPY_MIN_@TYPE@ && d == -1))) { - if (d == 0) { - npy_set_floatstatus_divbyzero(); - return 0; - } - else { - npy_set_floatstatus_overflow(); - return NPY_MIN_@TYPE@; - } - } - @type@ r = n / d; - // Negative quotients needs to be rounded down - if (((n > 0) != (d > 0)) && ((r * d) != n)) { - r--; - } - return r; -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - io1 = floor_div_@TYPE@(io1, *(@type@*)ip2); - } - *((@type@ *)iop1) = io1; - } -#if NPY_SIMD && defined(TO_SIMD_SFX) -#if defined(NPY_HAVE_VSX4) - // both arguments are arrays of the same size - else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]); - } -#endif - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) && - (*(@type@ *)args[1]) != 0) { - TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]); - } -#endif - else { - BINARY_LOOP { - *((@type@ *)op1) = floor_div_@TYPE@(*(@type@*)ip1, *(@type@*)ip2); - } - } -} - -NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) -(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) -{ - char *ip1 = args[0]; - char *indxp = args[1]; - char *value = args[2]; - npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; - npy_intp shape = steps[3]; - npy_intp n = dimensions[0]; - npy_intp i; - @type@ *indexed; - for(i = 0; i < n; i++, indxp += isindex, value += isb) { - npy_intp indx = *(npy_intp *)indxp; - if (indx < 0) { - indx += shape; - } - indexed = (@type@ *)(ip1 + is1 * indx); - *indexed = floor_div_@TYPE@(*indexed, *(@type@ *)value); - } - return 0; -} - -/**end repeat**/ - -/**begin repeat - * Unsigned types - * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# - * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# - * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG# - */ -#undef TO_SIMD_SFX -#if 0 -/**begin repeat1 - * #len = 8, 16, 32, 64# - */ -#elif NPY_BITSOF_@STYPE@ == @len@ - #define TO_SIMD_SFX(X) X##_u@len@ -/**end repeat1**/ -#endif -/* - * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV fall-backs to the scalar division - * because emulating multiply-high on these architectures is going to be expensive comparing - * to the native scalar dividers. - * Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles. - * Power10(VSX4) is an exception here since it has native support for integer vector division. 
- */ -#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX)) - #undef TO_SIMD_SFX -#endif -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ d = *(@type@ *)ip2; - if (NPY_UNLIKELY(d == 0)) { - npy_set_floatstatus_divbyzero(); - io1 = 0; - } else { - io1 /= d; - } - } - *((@type@ *)iop1) = io1; - } -#if NPY_SIMD && defined(TO_SIMD_SFX) -#if defined(NPY_HAVE_VSX4) - // both arguments are arrays of the same size - else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]); - } -#endif - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) && - (*(@type@ *)args[1]) != 0) { - TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]); - } -#endif - else { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((@type@ *)op1) = 0; - } else{ - *((@type@ *)op1) = in1 / in2; - } - } - } -} - -NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) -(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) -{ - char *ip1 = args[0]; - char *indxp = args[1]; - char *value = args[2]; - npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; - npy_intp shape = steps[3]; - npy_intp n = dimensions[0]; - npy_intp i; - @type@ *indexed; - for(i = 0; i < n; i++, indxp += isindex, value += isb) { - npy_intp indx = *(npy_intp *)indxp; - if (indx < 0) { - indx += shape; - } - indexed = (@type@ *)(ip1 + is1 * indx); - @type@ in2 = *(@type@ *)value; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *indexed = 0; - } else { - *indexed = *indexed / in2; - } - } - return 0; -} - -/**end repeat**/ diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp new file mode 100644 index 000000000000..049ae0e99ff3 --- /dev/null +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -0,0 +1,369 @@ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "loops_utils.h" +#include "loops.h" +#include // for memcpy +#include "fast_loop_macros.h" +#include +#include "simd/simd.h" +#include "lowlevel_strided_loops.h" +#include "numpy/npy_math.h" +#include + +#include +namespace hn = hwy::HWY_NAMESPACE; + +HWY_BEFORE_NAMESPACE(); +namespace HWY_NAMESPACE { + +// Helper function to set float status +inline void set_float_status(bool overflow, bool divbyzero) { + if (overflow) { + npy_set_floatstatus_overflow(); + } + if (divbyzero) { + npy_set_floatstatus_divbyzero(); + } +} + +// Signed integer division +template +void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) { + using D = hn::ScalableTag; + const D d; + const size_t N = hn::Lanes(d); + + bool raise_overflow = false; + bool raise_divbyzero = false; + + if (scalar == 0) { + // Handle division by zero + std::fill(dst, dst + len, static_cast(0)); + raise_divbyzero = true; + } + else if (scalar == 1) { + // Special case for division by 1 + if (src != dst) { + std::memcpy(dst, src, len * sizeof(T)); + } + } + else if 
(scalar == static_cast<T>(-1)) {
+        const auto vec_min_val = hn::Set(d, std::numeric_limits<T>::min());
+        size_t i = 0;
+        for (; i + N <= static_cast<size_t>(len); i += N) {
+            const auto vec_src = hn::LoadU(d, src + i);
+            const auto is_min_val = hn::Eq(vec_src, vec_min_val);
+            const auto vec_res = hn::IfThenElse(is_min_val, vec_min_val, hn::Neg(vec_src));
+            hn::StoreU(vec_res, d, dst + i);
+            if (!raise_overflow && !hn::AllFalse(d, is_min_val)) {
+                raise_overflow = true;
+            }
+        }
+        // Handle remaining elements
+        for (; i < static_cast<size_t>(len); i++) {
+            T val = src[i];
+            if (val == std::numeric_limits<T>::min()) {
+                dst[i] = std::numeric_limits<T>::min();
+                raise_overflow = true;
+            } else {
+                dst[i] = -val;
+            }
+        }
+    }
+    else {
+        // General case with floor division semantics
+        const auto vec_scalar = hn::Set(d, scalar);
+        const auto vec_zero = hn::Zero(d);
+        size_t i = 0;
+
+        for (; i + N <= static_cast<size_t>(len); i += N) {
+            const auto vec_src = hn::LoadU(d, src + i);
+            auto vec_div = hn::Div(vec_src, vec_scalar);
+            const auto vec_mul = hn::Mul(vec_div, vec_scalar);
+            const auto has_remainder = hn::Ne(vec_src, vec_mul);
+            const auto src_sign = hn::Lt(vec_src, vec_zero);
+            const auto scalar_sign = hn::Lt(vec_scalar, vec_zero);
+            const auto different_signs = hn::Xor(src_sign, scalar_sign);
+
+            auto adjustment = hn::And(different_signs, has_remainder);
+            vec_div = hn::IfThenElse(adjustment, hn::Sub(vec_div, hn::Set(d, static_cast<T>(1))), vec_div);
+
+            hn::StoreU(vec_div, d, dst + i);
+        }
+
+        // Handle remaining elements with scalar code
+        for (; i < static_cast<size_t>(len); i++) {
+            T n = src[i];
+            T r = n / scalar;
+            if (((n > 0) != (scalar > 0)) && ((r * scalar) != n)) {
+                r--;
+            }
+            dst[i] = r;
+        }
+    }
+    set_float_status(raise_overflow, raise_divbyzero);
+}
+
+// Unsigned integer division
+template <typename T>
+void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp len) {
+    using D = hn::ScalableTag<T>;
+    const D d;
+    const size_t N = hn::Lanes(d);
+
+    bool raise_divbyzero = false;
+
+    if (scalar == 0) {
+        // Handle division by zero
+        std::fill(dst, dst + len, static_cast<T>(0));
+        raise_divbyzero = true;
+    }
+    else if (scalar == 1) {
+        // Special case for division by 1
+        if (src != dst) {
+            std::memcpy(dst, src, len * sizeof(T));
+        }
+    }
+    else {
+        const auto vec_scalar = hn::Set(d, scalar);
+        size_t i = 0;
+        for (; i + N <= static_cast<size_t>(len); i += N) {
+            const auto vec_src = hn::LoadU(d, src + i);
+            const auto vec_res = hn::Div(vec_src, vec_scalar);
+            hn::StoreU(vec_res, d, dst + i);
+        }
+        // Handle remaining elements
+        for (; i < static_cast<size_t>(len); i++) {
+            dst[i] = src[i] / scalar;
+        }
+    }
+
+    set_float_status(false, raise_divbyzero);
+}
+
+// Floor division for signed integers
+template <typename T>
+T floor_div(T n, T d) {
+    if (HWY_UNLIKELY(d == 0 || (n == std::numeric_limits<T>::min() && d == -1))) {
+        if (d == 0) {
+            npy_set_floatstatus_divbyzero();
+            return 0;
+        }
+        else {
+            npy_set_floatstatus_overflow();
+            return std::numeric_limits<T>::min();
+        }
+    }
+    T r = n / d;
+    if (((n > 0) != (d > 0)) && ((r * d) != n)) {
+        r--;
+    }
+    return r;
+}
+
+// Dispatch functions for signed integer division
+template <typename T>
+void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP(T) {
+            const T divisor = *reinterpret_cast<T*>(ip2);
+            if (HWY_UNLIKELY(divisor == 0)) {
+                npy_set_floatstatus_divbyzero();
+                io1 = 0;
+            } else if (HWY_UNLIKELY(io1 == std::numeric_limits<T>::min() && divisor == -1)) {
+                npy_set_floatstatus_overflow();
+                io1 = std::numeric_limits<T>::min();
+            }
else { + io1 = floor_div(io1, divisor); + } + } + *reinterpret_cast(iop1) = io1; + return; + } + if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && + *reinterpret_cast(args[1]) != 0) + { + bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]); + if (no_overlap) { + T* src1 = reinterpret_cast(args[0]); + T* src2 = reinterpret_cast(args[1]); + T* dst = reinterpret_cast(args[2]); + simd_divide_by_scalar_contig_signed(src1, *src2, dst, dimensions[0]); + return; + } + } + + // Fallback for non-blockable, in-place, or zero divisor cases + BINARY_LOOP { + const T dividend = *reinterpret_cast(ip1); + const T divisor = *reinterpret_cast(ip2); + T* result = reinterpret_cast(op1); + + if (HWY_UNLIKELY(divisor == 0)) { + npy_set_floatstatus_divbyzero(); + *result = 0; + } else if (HWY_UNLIKELY(dividend == std::numeric_limits::min() && divisor == -1)) { + npy_set_floatstatus_overflow(); + *result = std::numeric_limits::min(); + } else { + *result = floor_div(dividend, divisor); + } + } +} + +// Dispatch functions for unsigned integer division +template +void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { + if (IS_BINARY_REDUCE) { + BINARY_REDUCE_LOOP(T) { + const T d = *reinterpret_cast(ip2); + if (HWY_UNLIKELY(d == 0)) { + npy_set_floatstatus_divbyzero(); + io1 = 0; + } else { + io1 = io1 / d; + } + } + *reinterpret_cast(iop1) = io1; + return; + } + if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && + *reinterpret_cast(args[1]) != 0) + { + bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]); + if (no_overlap) { + T* src1 = reinterpret_cast(args[0]); + T* src2 = reinterpret_cast(args[1]); + T* dst = reinterpret_cast(args[2]); + simd_divide_by_scalar_contig_unsigned(src1, *src2, dst, dimensions[0]); + return; + } + } + // Fallback for non-blockable, in-place, or zero divisor cases + BINARY_LOOP { + const T in1 = *reinterpret_cast(ip1); + const T in2 = *reinterpret_cast(ip2); + if (HWY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *reinterpret_cast(op1) = 0; + } else { + *reinterpret_cast(op1) = in1 / in2; + } + } +} + +// Indexed division for signed integers +template +int TYPE_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char * const*args, npy_intp const *dimensions, + npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) { + char *ip1 = args[0]; + char *indxp = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp shape = steps[3]; + npy_intp n = dimensions[0]; + + for(npy_intp i = 0; i < n; i++, indxp += isindex, value += isb) { + npy_intp indx = *(npy_intp *)indxp; + if (indx < 0) { + indx += shape; + } + T* indexed = (T*)(ip1 + is1 * indx); + T divisor = *(T*)value; + *indexed = floor_div(*indexed, divisor); + } + return 0; +} + +// Indexed division for unsigned integers +template +int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char * const*args, npy_intp const *dimensions, + npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) { + char *ip1 = args[0]; + char *indxp = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp shape = steps[3]; + npy_intp n = dimensions[0]; + + for(npy_intp i = 0; i < n; i++, indxp += isindex, value += isb) { + npy_intp indx = *(npy_intp *)indxp; + if (indx < 0) { + indx += shape; + } + T* indexed = (T*)(ip1 + is1 * indx); + T divisor = *(T*)value; + + if 
(HWY_UNLIKELY(divisor == 0)) { + npy_set_floatstatus_divbyzero(); + *indexed = 0; + } else { + *indexed = *indexed / divisor; + } + } + return 0; +} + +#define DEFINE_DIVIDE_FUNCTION(TYPE, SCALAR_TYPE) \ + extern "C" { \ + NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(TYPE##_divide)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) { \ + TYPE_divide(args, dimensions, steps, func); \ + } \ + NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_divide_indexed)(PyArrayMethod_Context *context, char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *func) { \ + return TYPE_divide_indexed(context, args, dimensions, steps, func); \ + } \ + } // extern "C" + + +#ifdef NPY_CPU_DISPATCH_CURFX +// On Linux and macOS (LP64 model), long is 64 bits, but on 32-bit Windows (LLP64 model), long is 32 bits. Meanwhile, long long is guaranteed at least 64 bits +#if defined(_WIN32) || defined(__EMSCRIPTEN__) || (defined(__arm__) && !defined(__aarch64__)) || (defined(__linux__) && ((defined(__i386__) || defined(__i686__)))) + DEFINE_DIVIDE_FUNCTION(BYTE, int8_t) + DEFINE_DIVIDE_FUNCTION(SHORT, int16_t) + DEFINE_DIVIDE_FUNCTION(INT, int32_t) + DEFINE_DIVIDE_FUNCTION(LONG, int32_t) // LONG is 32-bit on 32-bit platforms + DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t) +#else + DEFINE_DIVIDE_FUNCTION(BYTE, int8_t) + DEFINE_DIVIDE_FUNCTION(SHORT, int16_t) + DEFINE_DIVIDE_FUNCTION(INT, int32_t) + DEFINE_DIVIDE_FUNCTION(LONG, int64_t) // LONG is 64-bit on 64-bit platforms + DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t) +#endif +#endif + +#define DEFINE_DIVIDE_FUNCTION_UNSIGNED(TYPE, SCALAR_TYPE) \ + extern "C" { \ + NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(TYPE##_divide)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) { \ + TYPE_divide_unsigned(args, dimensions, steps, func); \ + } \ + NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_divide_indexed)(PyArrayMethod_Context *context, char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *func) { \ + return TYPE_divide_unsigned_indexed(context, args, dimensions, steps, func); \ + } \ + } + +#ifdef NPY_CPU_DISPATCH_CURFX +#if defined(_WIN32) || defined(__EMSCRIPTEN__) || (defined(__arm__) && !defined(__aarch64__)) || (defined(__linux__) && ((defined(__i386__) || defined(__i686__)))) + DEFINE_DIVIDE_FUNCTION_UNSIGNED(UBYTE, uint8_t) + DEFINE_DIVIDE_FUNCTION_UNSIGNED(USHORT, uint16_t) + DEFINE_DIVIDE_FUNCTION_UNSIGNED(UINT, uint32_t) + DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint32_t) // ULONG is 32-bit on 32-bit platforms + DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONGLONG, uint64_t) +#else + DEFINE_DIVIDE_FUNCTION_UNSIGNED(UBYTE, uint8_t) + DEFINE_DIVIDE_FUNCTION_UNSIGNED(USHORT, uint16_t) + DEFINE_DIVIDE_FUNCTION_UNSIGNED(UINT, uint32_t) + DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint64_t) // ULONG is 64-bit on 64-bit platforms + DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONGLONG, uint64_t) +#endif +#endif + +#undef DEFINE_DIVIDE_FUNCTION +#undef DEFINE_DIVIDE_FUNCTION_UNSIGNED + +} // namespace HWY_NAMESPACE +HWY_AFTER_NAMESPACE(); From d515a55c4dfc39b6c565a928fe815da9acf1dff0 Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Mon, 5 May 2025 21:49:35 +0530 Subject: [PATCH 02/16] change dispatch logic --- .../src/umath/loops_arithmetic.dispatch.cpp | 59 +++++++++---------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 049ae0e99ff3..9174d7ef4467 100644 --- 
a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -2,14 +2,18 @@ #define _MULTIARRAYMODULE #define NPY_NO_DEPRECATED_API NPY_API_VERSION +#include "numpy/npy_common.h" +#include "numpy/npy_math.h" + #include "loops_utils.h" #include "loops.h" -#include // for memcpy #include "fast_loop_macros.h" -#include #include "simd/simd.h" #include "lowlevel_strided_loops.h" -#include "numpy/npy_math.h" +#include "common.hpp" + +#include // for memcpy +#include #include #include @@ -308,6 +312,7 @@ int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context), return 0; } +// Macro to define the dispatch functions for signed types #define DEFINE_DIVIDE_FUNCTION(TYPE, SCALAR_TYPE) \ extern "C" { \ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(TYPE##_divide)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) { \ @@ -318,24 +323,7 @@ int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context), } \ } // extern "C" - -#ifdef NPY_CPU_DISPATCH_CURFX -// On Linux and macOS (LP64 model), long is 64 bits, but on 32-bit Windows (LLP64 model), long is 32 bits. Meanwhile, long long is guaranteed at least 64 bits -#if defined(_WIN32) || defined(__EMSCRIPTEN__) || (defined(__arm__) && !defined(__aarch64__)) || (defined(__linux__) && ((defined(__i386__) || defined(__i686__)))) - DEFINE_DIVIDE_FUNCTION(BYTE, int8_t) - DEFINE_DIVIDE_FUNCTION(SHORT, int16_t) - DEFINE_DIVIDE_FUNCTION(INT, int32_t) - DEFINE_DIVIDE_FUNCTION(LONG, int32_t) // LONG is 32-bit on 32-bit platforms - DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t) -#else - DEFINE_DIVIDE_FUNCTION(BYTE, int8_t) - DEFINE_DIVIDE_FUNCTION(SHORT, int16_t) - DEFINE_DIVIDE_FUNCTION(INT, int32_t) - DEFINE_DIVIDE_FUNCTION(LONG, int64_t) // LONG is 64-bit on 64-bit platforms - DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t) -#endif -#endif - +// Macro to define the dispatch functions for unsigned types #define DEFINE_DIVIDE_FUNCTION_UNSIGNED(TYPE, SCALAR_TYPE) \ extern "C" { \ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(TYPE##_divide)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) { \ @@ -344,23 +332,32 @@ int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context), NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_divide_indexed)(PyArrayMethod_Context *context, char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *func) { \ return TYPE_divide_unsigned_indexed(context, args, dimensions, steps, func); \ } \ - } + } // extern "C" + + +#ifdef NPY_CPU_DISPATCH_CURFX + DEFINE_DIVIDE_FUNCTION(BYTE, int8_t) + DEFINE_DIVIDE_FUNCTION(SHORT, int16_t) + DEFINE_DIVIDE_FUNCTION(INT, int32_t) + #if NPY_SIZEOF_LONG == 4 + DEFINE_DIVIDE_FUNCTION(LONG, int32_t) + #elif NPY_SIZEOF_LONG == 8 + DEFINE_DIVIDE_FUNCTION(LONG, int64_t) + #endif + DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t) +#endif #ifdef NPY_CPU_DISPATCH_CURFX -#if defined(_WIN32) || defined(__EMSCRIPTEN__) || (defined(__arm__) && !defined(__aarch64__)) || (defined(__linux__) && ((defined(__i386__) || defined(__i686__)))) - DEFINE_DIVIDE_FUNCTION_UNSIGNED(UBYTE, uint8_t) - DEFINE_DIVIDE_FUNCTION_UNSIGNED(USHORT, uint16_t) - DEFINE_DIVIDE_FUNCTION_UNSIGNED(UINT, uint32_t) - DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint32_t) // ULONG is 32-bit on 32-bit platforms - DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONGLONG, uint64_t) -#else DEFINE_DIVIDE_FUNCTION_UNSIGNED(UBYTE, uint8_t) DEFINE_DIVIDE_FUNCTION_UNSIGNED(USHORT, uint16_t) DEFINE_DIVIDE_FUNCTION_UNSIGNED(UINT, 
uint32_t) - DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint64_t) // ULONG is 64-bit on 64-bit platforms + #if NPY_SIZEOF_LONG == 4 + DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint32_t) + #elif NPY_SIZEOF_LONG == 8 + DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint64_t) + #endif DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONGLONG, uint64_t) #endif -#endif #undef DEFINE_DIVIDE_FUNCTION #undef DEFINE_DIVIDE_FUNCTION_UNSIGNED From 8db182a6de5bf0875f7232bf704152a3014ba629 Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Tue, 6 May 2025 00:43:28 +0530 Subject: [PATCH 03/16] optimise further --- .../src/umath/loops_arithmetic.dispatch.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 9174d7ef4467..931ce705ce17 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -80,20 +80,18 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) // General case with floor division semantics const auto vec_scalar = hn::Set(d, scalar); const auto vec_zero = hn::Zero(d); + const auto one = hn::Set(d, static_cast(1)); size_t i = 0; for (; i + N <= static_cast(len); i += N) { const auto vec_src = hn::LoadU(d, src + i); auto vec_div = hn::Div(vec_src, vec_scalar); const auto vec_mul = hn::Mul(vec_div, vec_scalar); - const auto has_remainder = hn::Ne(vec_src, vec_mul); - const auto src_sign = hn::Lt(vec_src, vec_zero); - const auto scalar_sign = hn::Lt(vec_scalar, vec_zero); - const auto different_signs = hn::Xor(src_sign, scalar_sign); - - auto adjustment = hn::And(different_signs, has_remainder); - vec_div = hn::IfThenElse(adjustment, hn::Sub(vec_div, hn::Set(d, static_cast(1))), vec_div); + const auto eq_mask = hn::Eq(vec_src, vec_mul); + const auto diff_signs = hn::Lt(hn::Xor(vec_src, vec_scalar), vec_zero); + const auto adjust = hn::AndNot(eq_mask, diff_signs); + vec_div = hn::MaskedSubOr(vec_div, adjust, vec_div, one); hn::StoreU(vec_div, d, dst + i); } @@ -102,7 +100,7 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) T n = src[i]; T r = n / scalar; if (((n > 0) != (scalar > 0)) && ((r * scalar) != n)) { - r--; + --r; } dst[i] = r; } @@ -162,7 +160,7 @@ T floor_div(T n, T d) { } T r = n / d; if (((n > 0) != (d > 0)) && ((r * d) != n)) { - r--; + --r; } return r; } From e58409dd6672fcf22e4f4947817217728780cf90 Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Wed, 14 May 2025 14:48:27 +0530 Subject: [PATCH 04/16] add NPY_SIMD flag --- numpy/_core/src/umath/loops_arithmetic.dispatch.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 931ce705ce17..78ae7be4a873 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -31,7 +31,7 @@ inline void set_float_status(bool overflow, bool divbyzero) { npy_set_floatstatus_divbyzero(); } } - +#if NPY_SIMD // Signed integer division template void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) { @@ -144,7 +144,7 @@ void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp le set_float_status(false, raise_divbyzero); } - +#endif // NPY_SIMD // Floor division for signed integers template T floor_div(T n, T d) { @@ -183,7 +183,8 @@ void TYPE_divide(char 
**args, npy_intp const *dimensions, npy_intp const *steps, } *reinterpret_cast(iop1) = io1; return; - } + } +#if NPY_SIMD if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && *reinterpret_cast(args[1]) != 0) { @@ -196,6 +197,7 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, return; } } +#endif // NPY_SIMD // Fallback for non-blockable, in-place, or zero divisor cases BINARY_LOOP { @@ -231,6 +233,7 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons *reinterpret_cast(iop1) = io1; return; } +#if NPY_SIMD if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && *reinterpret_cast(args[1]) != 0) { @@ -243,6 +246,8 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons return; } } +#endif // NPY_SIMD + // Fallback for non-blockable, in-place, or zero divisor cases BINARY_LOOP { const T in1 = *reinterpret_cast(ip1); From 65ff928da43e6bd01dbef4a06bbf82a5af76a9cf Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Wed, 14 May 2025 17:26:54 +0530 Subject: [PATCH 05/16] fix --- numpy/_core/src/umath/loops_arithmetic.dispatch.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 78ae7be4a873..013717a159d4 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -2,6 +2,11 @@ #define _MULTIARRAYMODULE #define NPY_NO_DEPRECATED_API NPY_API_VERSION +#include "npy_cpu_dispatch.h" +#if NPY__CPU_TARGET_INDEX == 0 +#define HWY_COMPILE_ONLY_SCALAR 1 +#endif + #include "numpy/npy_common.h" #include "numpy/npy_math.h" From 693936a82eccfc622ec73f25d0176bd83b5bb80c Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Sun, 8 Jun 2025 22:37:06 +0530 Subject: [PATCH 06/16] Add support for RVV --- numpy/_core/meson.build | 1 + 1 file changed, 1 insertion(+) diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index 13d0702dd97c..24163080589c 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -939,6 +939,7 @@ foreach gen_mtargets : [ VSX4, VSX2, VX, LSX, + RVV, ] ], [ From e942017d5fe47cd65841c726616e0db4a066b77a Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Thu, 3 Jul 2025 15:24:21 +0530 Subject: [PATCH 07/16] add array-array div logic --- .../src/umath/loops_arithmetic.dispatch.cpp | 131 +++++++++++++++++- 1 file changed, 128 insertions(+), 3 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 013717a159d4..2ee537454599 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -169,10 +169,107 @@ T floor_div(T n, T d) { } return r; } +// General divide implementation for arrays +template +void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { + using D = hn::ScalableTag; + const D d; + const size_t N = hn::Lanes(d); + bool raise_overflow = false; + bool raise_divbyzero = false; + const auto vec_zero = hn::Zero(d); + const auto vec_min_val = hn::Set(d, std::numeric_limits::min()); + const auto vec_neg_one = hn::Set(d, static_cast(-1)); + + size_t i = 0; + for (; i + N <= static_cast(len); i += N) { + const auto vec_a = hn::LoadU(d, src1 + i); + const auto vec_b = hn::LoadU(d, src2 + i); + const auto b_is_zero = hn::Eq(vec_b, vec_zero); + const auto a_is_min = hn::Eq(vec_a, vec_min_val); + const auto b_is_neg_one = 
hn::Eq(vec_b, vec_neg_one); + const auto overflow_cond = hn::And(a_is_min, b_is_neg_one); + auto vec_div = hn::Div(vec_a, vec_b); + const auto vec_mul = hn::Mul(vec_div, vec_b); + const auto has_remainder = hn::Ne(vec_a, vec_mul); + const auto a_sign = hn::Lt(vec_a, vec_zero); + const auto b_sign = hn::Lt(vec_b, vec_zero); + const auto different_signs = hn::Xor(a_sign, b_sign); + auto adjustment = hn::And(different_signs, has_remainder); + vec_div = hn::IfThenElse(adjustment, + hn::Sub(vec_div, hn::Set(d, static_cast(1))), + vec_div); + vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); + vec_div = hn::IfThenElse(overflow_cond, vec_min_val, vec_div); + hn::StoreU(vec_div, d, dst + i); + if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) { + raise_divbyzero = true; + } + if (!raise_overflow && !hn::AllFalse(d, overflow_cond)) { + raise_overflow = true; + } + } + for (; i < static_cast(len); i++) { + T a = src1[i]; + T b = src2[i]; + + if (b == 0) { + dst[i] = 0; + raise_divbyzero = true; + } + else if (a == std::numeric_limits::min() && b == -1) { + dst[i] = std::numeric_limits::min(); + raise_overflow = true; + } + else { + dst[i] = floor_div(a, b); + } + } + + set_float_status(raise_overflow, raise_divbyzero); +} +// Unsigned division for arrays +template +void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { + using D = hn::ScalableTag; + const D d; + const size_t N = hn::Lanes(d); + + bool raise_divbyzero = false; + const auto vec_zero = hn::Zero(d); + + size_t i = 0; + for (; i + N <= static_cast(len); i += N) { + const auto vec_a = hn::LoadU(d, src1 + i); + const auto vec_b = hn::LoadU(d, src2 + i); + const auto b_is_zero = hn::Eq(vec_b, vec_zero); + auto vec_div = hn::Div(vec_a, vec_b); + vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); + hn::StoreU(vec_div, d, dst + i); + if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) { + raise_divbyzero = true; + } + } + for (; i < static_cast(len); i++) { + T a = src1[i]; + T b = src2[i]; + + if (b == 0) { + dst[i] = 0; + raise_divbyzero = true; + } else { + dst[i] = a / b; + } + } + + set_float_status(false, raise_divbyzero); +} + // Dispatch functions for signed integer division template void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { + npy_clear_floatstatus(); if (IS_BINARY_REDUCE) { BINARY_REDUCE_LOOP(T) { const T divisor = *reinterpret_cast(ip2); @@ -189,8 +286,22 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, *reinterpret_cast(iop1) = io1; return; } -#if NPY_SIMD - if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && +#if NPY_SIMD + // Handle array-array case + if (IS_BLOCKABLE_BINARY(sizeof(T), NPY_SIMD_WIDTH)) + { + bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]) && + nomemoverlap(args[2], steps[2], args[1], steps[1], dimensions[0]); + // Check if we can use SIMD for contiguous arrays - all steps must equal to sizeof(T) + if (steps[0] == sizeof(T) && steps[1] == sizeof(T) && steps[2] == sizeof(T) && no_overlap) { + T* src1 = (T*)args[0]; + T* src2 = (T*)args[1]; + T* dst = (T*)args[2]; + simd_divide_contig_signed(src1, src2, dst, dimensions[0]); + return; + } + } + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && *reinterpret_cast(args[1]) != 0) { bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]); @@ -225,6 +336,7 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, // Dispatch 
functions for unsigned integer division template void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { + npy_clear_floatstatus(); if (IS_BINARY_REDUCE) { BINARY_REDUCE_LOOP(T) { const T d = *reinterpret_cast(ip2); @@ -239,7 +351,20 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons return; } #if NPY_SIMD - if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && + // Handle array-array case + if (IS_BLOCKABLE_BINARY(sizeof(T), NPY_SIMD_WIDTH)) { + bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]) && + nomemoverlap(args[2], steps[2], args[1], steps[1], dimensions[0]); + // Check if we can use SIMD for contiguous arrays - all steps must equal to sizeof(T) + if (steps[0] == sizeof(T) && steps[1] == sizeof(T) && steps[2] == sizeof(T) && no_overlap) { + T* src1 = (T*)args[0]; + T* src2 = (T*)args[1]; + T* dst = (T*)args[2]; + simd_divide_contig_unsigned(src1, src2, dst, dimensions[0]); + return; + } + } + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && *reinterpret_cast(args[1]) != 0) { bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]); From 67bd0a6c0a262cc009e9a0dbb61e3c1027ed1343 Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Mon, 21 Jul 2025 17:37:32 +0530 Subject: [PATCH 08/16] update logic for array-array div --- .../src/umath/loops_arithmetic.dispatch.cpp | 48 ++++++++++++------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 2ee537454599..af32d0539fd6 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -169,6 +169,7 @@ T floor_div(T n, T d) { } return r; } + // General divide implementation for arrays template void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { @@ -178,30 +179,40 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { bool raise_overflow = false; bool raise_divbyzero = false; const auto vec_zero = hn::Zero(d); + const auto vec_one = hn::Set(d, static_cast(1)); const auto vec_min_val = hn::Set(d, std::numeric_limits::min()); const auto vec_neg_one = hn::Set(d, static_cast(-1)); - + size_t i = 0; for (; i + N <= static_cast(len); i += N) { const auto vec_a = hn::LoadU(d, src1 + i); const auto vec_b = hn::LoadU(d, src2 + i); + const auto b_is_zero = hn::Eq(vec_b, vec_zero); const auto a_is_min = hn::Eq(vec_a, vec_min_val); const auto b_is_neg_one = hn::Eq(vec_b, vec_neg_one); const auto overflow_cond = hn::And(a_is_min, b_is_neg_one); - auto vec_div = hn::Div(vec_a, vec_b); - const auto vec_mul = hn::Mul(vec_div, vec_b); + + const auto safe_div_mask = hn::Not(hn::Or(b_is_zero, overflow_cond)); + const auto safe_b = hn::IfThenElse(hn::Or(b_is_zero, overflow_cond), vec_one, vec_b); + + auto vec_div = hn::Div(vec_a, safe_b); + + const auto vec_mul = hn::Mul(vec_div, safe_b); const auto has_remainder = hn::Ne(vec_a, vec_mul); const auto a_sign = hn::Lt(vec_a, vec_zero); - const auto b_sign = hn::Lt(vec_b, vec_zero); + const auto b_sign = hn::Lt(safe_b, vec_zero); const auto different_signs = hn::Xor(a_sign, b_sign); - auto adjustment = hn::And(different_signs, has_remainder); - vec_div = hn::IfThenElse(adjustment, - hn::Sub(vec_div, hn::Set(d, static_cast(1))), - vec_div); + const auto needs_adjustment = hn::And(safe_div_mask, + 
hn::And(different_signs, has_remainder)); + + vec_div = hn::MaskedSubOr(vec_div, needs_adjustment, vec_div, vec_one); + vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); vec_div = hn::IfThenElse(overflow_cond, vec_min_val, vec_div); + hn::StoreU(vec_div, d, dst + i); + if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) { raise_divbyzero = true; } @@ -209,10 +220,11 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { raise_overflow = true; } } + for (; i < static_cast(len); i++) { T a = src1[i]; T b = src2[i]; - + if (b == 0) { dst[i] = 0; raise_divbyzero = true; @@ -225,35 +237,41 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { dst[i] = floor_div(a, b); } } - + set_float_status(raise_overflow, raise_divbyzero); } + // Unsigned division for arrays template void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { using D = hn::ScalableTag; const D d; const size_t N = hn::Lanes(d); - bool raise_divbyzero = false; const auto vec_zero = hn::Zero(d); - + const auto vec_one = hn::Set(d, static_cast(1)); + size_t i = 0; for (; i + N <= static_cast(len); i += N) { const auto vec_a = hn::LoadU(d, src1 + i); const auto vec_b = hn::LoadU(d, src2 + i); + const auto b_is_zero = hn::Eq(vec_b, vec_zero); - auto vec_div = hn::Div(vec_a, vec_b); + + const auto safe_b = hn::IfThenElse(b_is_zero, vec_one, vec_b); + auto vec_div = hn::Div(vec_a, safe_b); vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); hn::StoreU(vec_div, d, dst + i); if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) { raise_divbyzero = true; } } + + // Handle remaining elements for (; i < static_cast(len); i++) { T a = src1[i]; T b = src2[i]; - + if (b == 0) { dst[i] = 0; raise_divbyzero = true; @@ -261,11 +279,9 @@ void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { dst[i] = a / b; } } - set_float_status(false, raise_divbyzero); } - // Dispatch functions for signed integer division template void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { From 6573701b9a8332fad7208e580c65c26de1a68209 Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Tue, 22 Jul 2025 00:09:18 +0530 Subject: [PATCH 09/16] update logic for int64 array-array --- .../src/umath/loops_arithmetic.dispatch.cpp | 40 +++++++++++++------ 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index af32d0539fd6..29dde27460ea 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -198,15 +198,17 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { auto vec_div = hn::Div(vec_a, safe_b); - const auto vec_mul = hn::Mul(vec_div, safe_b); - const auto has_remainder = hn::Ne(vec_a, vec_mul); - const auto a_sign = hn::Lt(vec_a, vec_zero); - const auto b_sign = hn::Lt(safe_b, vec_zero); - const auto different_signs = hn::Xor(a_sign, b_sign); - const auto needs_adjustment = hn::And(safe_div_mask, - hn::And(different_signs, has_remainder)); - - vec_div = hn::MaskedSubOr(vec_div, needs_adjustment, vec_div, vec_one); + if (!hn::AllFalse(d, safe_div_mask)) { + const auto vec_mul = hn::Mul(vec_div, safe_b); + const auto has_remainder = hn::Ne(vec_a, vec_mul); + const auto a_sign = hn::Lt(vec_a, vec_zero); + const auto b_sign = hn::Lt(vec_b, vec_zero); + const auto different_signs = hn::Xor(a_sign, b_sign); + const auto 
needs_adjustment = hn::And(safe_div_mask, + hn::And(has_remainder, different_signs)); + + vec_div = hn::MaskedSubOr(vec_div, needs_adjustment, vec_div, vec_one); + } vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); vec_div = hn::IfThenElse(overflow_cond, vec_min_val, vec_div); @@ -221,6 +223,7 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { } } + // Handle remaining elements for (; i < static_cast(len); i++) { T a = src1[i]; T b = src2[i]; @@ -234,10 +237,15 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { raise_overflow = true; } else { - dst[i] = floor_div(a, b); + T r = a / b; + if (((a > 0) != (b > 0)) && ((r * b) != a)) { + --r; + } + dst[i] = r; } } - + + npy_clear_floatstatus(); set_float_status(raise_overflow, raise_divbyzero); } @@ -255,13 +263,17 @@ void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { for (; i + N <= static_cast(len); i += N) { const auto vec_a = hn::LoadU(d, src1 + i); const auto vec_b = hn::LoadU(d, src2 + i); - - const auto b_is_zero = hn::Eq(vec_b, vec_zero); + const auto b_is_zero = hn::Eq(vec_b, vec_zero); + const auto safe_b = hn::IfThenElse(b_is_zero, vec_one, vec_b); + auto vec_div = hn::Div(vec_a, safe_b); + vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); + hn::StoreU(vec_div, d, dst + i); + if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) { raise_divbyzero = true; } @@ -279,6 +291,8 @@ void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { dst[i] = a / b; } } + + npy_clear_floatstatus(); set_float_status(false, raise_divbyzero); } From fb01fb327d99eb1bb2e556912c7ce03b0a222723 Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Mon, 18 Aug 2025 15:16:10 +0530 Subject: [PATCH 10/16] include highway wrapper --- .../src/umath/loops_arithmetic.dispatch.cpp | 204 +++++++++--------- 1 file changed, 105 insertions(+), 99 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 29dde27460ea..849d0733a988 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -13,7 +13,7 @@ #include "loops_utils.h" #include "loops.h" #include "fast_loop_macros.h" -#include "simd/simd.h" +#include "simd/simd.hpp" #include "lowlevel_strided_loops.h" #include "common.hpp" @@ -21,11 +21,10 @@ #include #include -#include -namespace hn = hwy::HWY_NAMESPACE; - -HWY_BEFORE_NAMESPACE(); -namespace HWY_NAMESPACE { +using namespace np::simd; +#if NPY_HWY +namespace hn = np::simd::hn; +#endif // Helper function to set float status inline void set_float_status(bool overflow, bool divbyzero) { @@ -36,13 +35,15 @@ inline void set_float_status(bool overflow, bool divbyzero) { npy_set_floatstatus_divbyzero(); } } -#if NPY_SIMD -// Signed integer division +#if NPY_HWY + +// Signed integer DIVIDE by scalar + template void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) { - using D = hn::ScalableTag; + using D = _Tag; const D d; - const size_t N = hn::Lanes(d); + const size_t N = Lanes(T{}); bool raise_overflow = false; bool raise_divbyzero = false; @@ -59,13 +60,13 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) } } else if (scalar == static_cast(-1)) { - const auto vec_min_val = hn::Set(d, std::numeric_limits::min()); + const auto vec_min_val = Set(static_cast(std::numeric_limits::min())); size_t i = 0; for (; i + N <= static_cast(len); i += N) { - const auto vec_src = 
hn::LoadU(d, src + i); - const auto is_min_val = hn::Eq(vec_src, vec_min_val); + const auto vec_src = LoadU(src + i); + const auto is_min_val = Eq(vec_src, vec_min_val); const auto vec_res = hn::IfThenElse(is_min_val, vec_min_val, hn::Neg(vec_src)); - hn::StoreU(vec_res, d, dst + i); + StoreU(vec_res, dst + i); if (!raise_overflow && !hn::AllFalse(d, is_min_val)) { raise_overflow = true; } @@ -83,21 +84,21 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) } else { // General case with floor division semantics - const auto vec_scalar = hn::Set(d, scalar); - const auto vec_zero = hn::Zero(d); - const auto one = hn::Set(d, static_cast(1)); + const auto vec_scalar = Set(scalar); + const auto vec_zero = Zero(); + const auto one = Set(static_cast(1)); size_t i = 0; for (; i + N <= static_cast(len); i += N) { - const auto vec_src = hn::LoadU(d, src + i); - auto vec_div = hn::Div(vec_src, vec_scalar); - const auto vec_mul = hn::Mul(vec_div, vec_scalar); - const auto eq_mask = hn::Eq(vec_src, vec_mul); - const auto diff_signs = hn::Lt(hn::Xor(vec_src, vec_scalar), vec_zero); - const auto adjust = hn::AndNot(eq_mask, diff_signs); + const auto vec_src = LoadU(src + i); + auto vec_div = Div(vec_src, vec_scalar); + const auto vec_mul = Mul(vec_div, vec_scalar); + const auto eq_mask = Eq(vec_src, vec_mul); + const auto diff_signs = Lt(Xor(vec_src, vec_scalar), vec_zero); + const auto adjust = AndNot(eq_mask, diff_signs); vec_div = hn::MaskedSubOr(vec_div, adjust, vec_div, one); - hn::StoreU(vec_div, d, dst + i); + StoreU(vec_div, dst + i); } // Handle remaining elements with scalar code @@ -113,13 +114,12 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) set_float_status(raise_overflow, raise_divbyzero); } -// Unsigned integer division +// Unsigned integer DIVIDE by scalar + template void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp len) { - using D = hn::ScalableTag; - const D d; - const size_t N = hn::Lanes(d); + const size_t N = Lanes(T{}); bool raise_divbyzero = false; if (scalar == 0) { @@ -134,12 +134,12 @@ void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp le } } else { - const auto vec_scalar = hn::Set(d, scalar); + const auto vec_scalar = Set(scalar); size_t i = 0; for (; i + N <= static_cast(len); i += N) { - const auto vec_src = hn::LoadU(d, src + i); - const auto vec_res = hn::Div(vec_src, vec_scalar); - hn::StoreU(vec_res, d, dst + i); + const auto vec_src = LoadU(src + i); + const auto vec_res = Div(vec_src, vec_scalar); + StoreU(vec_res, dst + i); } // Handle remaining elements for (; i < static_cast(len); i++) { @@ -149,63 +149,45 @@ void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp le set_float_status(false, raise_divbyzero); } -#endif // NPY_SIMD -// Floor division for signed integers -template -T floor_div(T n, T d) { - if (HWY_UNLIKELY(d == 0 || (n == std::numeric_limits::min() && d == -1))) { - if (d == 0) { - npy_set_floatstatus_divbyzero(); - return 0; - } - else { - npy_set_floatstatus_overflow(); - return std::numeric_limits::min(); - } - } - T r = n / d; - if (((n > 0) != (d > 0)) && ((r * d) != n)) { - --r; - } - return r; -} -// General divide implementation for arrays +// Signed integer DIVIDE array / array + template void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { - using D = hn::ScalableTag; + using D = _Tag; const D d; - const size_t N = hn::Lanes(d); + const size_t N = Lanes(T{}); + bool 
raise_overflow = false; bool raise_divbyzero = false; - const auto vec_zero = hn::Zero(d); - const auto vec_one = hn::Set(d, static_cast(1)); - const auto vec_min_val = hn::Set(d, std::numeric_limits::min()); - const auto vec_neg_one = hn::Set(d, static_cast(-1)); + const auto vec_zero = Zero(); + const auto vec_one = Set(static_cast(1)); + const auto vec_min_val = Set(static_cast(std::numeric_limits::min())); + const auto vec_neg_one = Set(static_cast(-1)); size_t i = 0; for (; i + N <= static_cast(len); i += N) { - const auto vec_a = hn::LoadU(d, src1 + i); - const auto vec_b = hn::LoadU(d, src2 + i); + const auto vec_a = LoadU(src1 + i); + const auto vec_b = LoadU(src2 + i); - const auto b_is_zero = hn::Eq(vec_b, vec_zero); - const auto a_is_min = hn::Eq(vec_a, vec_min_val); - const auto b_is_neg_one = hn::Eq(vec_b, vec_neg_one); - const auto overflow_cond = hn::And(a_is_min, b_is_neg_one); + const auto b_is_zero = Eq(vec_b, vec_zero); + const auto a_is_min = Eq(vec_a, vec_min_val); + const auto b_is_neg_one = Eq(vec_b, vec_neg_one); + const auto overflow_cond = And(a_is_min, b_is_neg_one); - const auto safe_div_mask = hn::Not(hn::Or(b_is_zero, overflow_cond)); - const auto safe_b = hn::IfThenElse(hn::Or(b_is_zero, overflow_cond), vec_one, vec_b); + const auto safe_div_mask = hn::Not(Or(b_is_zero, overflow_cond)); + const auto safe_b = hn::IfThenElse(Or(b_is_zero, overflow_cond), vec_one, vec_b); - auto vec_div = hn::Div(vec_a, safe_b); + auto vec_div = Div(vec_a, safe_b); if (!hn::AllFalse(d, safe_div_mask)) { - const auto vec_mul = hn::Mul(vec_div, safe_b); + const auto vec_mul = Mul(vec_div, safe_b); const auto has_remainder = hn::Ne(vec_a, vec_mul); - const auto a_sign = hn::Lt(vec_a, vec_zero); - const auto b_sign = hn::Lt(vec_b, vec_zero); - const auto different_signs = hn::Xor(a_sign, b_sign); - const auto needs_adjustment = hn::And(safe_div_mask, - hn::And(has_remainder, different_signs)); + const auto a_sign = Lt(vec_a, vec_zero); + const auto b_sign = Lt(vec_b, vec_zero); + const auto different_signs = Xor(a_sign, b_sign); + const auto needs_adjustment = And(safe_div_mask, + And(has_remainder, different_signs)); vec_div = hn::MaskedSubOr(vec_div, needs_adjustment, vec_div, vec_one); } @@ -213,7 +195,7 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); vec_div = hn::IfThenElse(overflow_cond, vec_min_val, vec_div); - hn::StoreU(vec_div, d, dst + i); + StoreU(vec_div, dst + i); if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) { raise_divbyzero = true; @@ -249,30 +231,32 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { set_float_status(raise_overflow, raise_divbyzero); } -// Unsigned division for arrays +// Unsigned integer DIVIDE array / array + template void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { - using D = hn::ScalableTag; + using D = _Tag; const D d; - const size_t N = hn::Lanes(d); + const size_t N = Lanes(T{}); + bool raise_divbyzero = false; - const auto vec_zero = hn::Zero(d); - const auto vec_one = hn::Set(d, static_cast(1)); + const auto vec_zero = Zero(); + const auto vec_one = Set(static_cast(1)); size_t i = 0; for (; i + N <= static_cast(len); i += N) { - const auto vec_a = hn::LoadU(d, src1 + i); - const auto vec_b = hn::LoadU(d, src2 + i); + const auto vec_a = LoadU(src1 + i); + const auto vec_b = LoadU(src2 + i); - const auto b_is_zero = hn::Eq(vec_b, vec_zero); + const auto b_is_zero = Eq(vec_b, vec_zero); const auto 
safe_b = hn::IfThenElse(b_is_zero, vec_one, vec_b); - auto vec_div = hn::Div(vec_a, safe_b); + auto vec_div = Div(vec_a, safe_b); vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); - hn::StoreU(vec_div, d, dst + i); + StoreU(vec_div, dst + i); if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) { raise_divbyzero = true; @@ -296,6 +280,29 @@ void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { set_float_status(false, raise_divbyzero); } +#endif // NPY_HWY + +// Floor division for signed integers +template +T floor_div(T n, T d) { + if (NPY_UNLIKELY(d == 0 || (n == std::numeric_limits::min() && d == -1))) { + if (d == 0) { + npy_set_floatstatus_divbyzero(); + return 0; + } + else { + npy_set_floatstatus_overflow(); + return std::numeric_limits::min(); + } + } + T r = n / d; + if (((n > 0) != (d > 0)) && ((r * d) != n)) { + --r; + } + return r; +} + + // Dispatch functions for signed integer division template void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -303,10 +310,10 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, if (IS_BINARY_REDUCE) { BINARY_REDUCE_LOOP(T) { const T divisor = *reinterpret_cast(ip2); - if (HWY_UNLIKELY(divisor == 0)) { + if (NPY_UNLIKELY(divisor == 0)) { npy_set_floatstatus_divbyzero(); io1 = 0; - } else if (HWY_UNLIKELY(io1 == std::numeric_limits::min() && divisor == -1)) { + } else if (NPY_UNLIKELY(io1 == std::numeric_limits::min() && divisor == -1)) { npy_set_floatstatus_overflow(); io1 = std::numeric_limits::min(); } else { @@ -316,7 +323,8 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, *reinterpret_cast(iop1) = io1; return; } -#if NPY_SIMD + +#if NPY_HWY // Handle array-array case if (IS_BLOCKABLE_BINARY(sizeof(T), NPY_SIMD_WIDTH)) { @@ -343,18 +351,19 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, return; } } -#endif // NPY_SIMD +#endif // NPY_HWY + // Scalar fallback // Fallback for non-blockable, in-place, or zero divisor cases BINARY_LOOP { const T dividend = *reinterpret_cast(ip1); const T divisor = *reinterpret_cast(ip2); T* result = reinterpret_cast(op1); - if (HWY_UNLIKELY(divisor == 0)) { + if (NPY_UNLIKELY(divisor == 0)) { npy_set_floatstatus_divbyzero(); *result = 0; - } else if (HWY_UNLIKELY(dividend == std::numeric_limits::min() && divisor == -1)) { + } else if (NPY_UNLIKELY(dividend == std::numeric_limits::min() && divisor == -1)) { npy_set_floatstatus_overflow(); *result = std::numeric_limits::min(); } else { @@ -370,7 +379,7 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons if (IS_BINARY_REDUCE) { BINARY_REDUCE_LOOP(T) { const T d = *reinterpret_cast(ip2); - if (HWY_UNLIKELY(d == 0)) { + if (NPY_UNLIKELY(d == 0)) { npy_set_floatstatus_divbyzero(); io1 = 0; } else { @@ -380,7 +389,7 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons *reinterpret_cast(iop1) = io1; return; } -#if NPY_SIMD +#if NPY_HWY // Handle array-array case if (IS_BLOCKABLE_BINARY(sizeof(T), NPY_SIMD_WIDTH)) { bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]) && @@ -406,13 +415,13 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons return; } } -#endif // NPY_SIMD +#endif // NPY_HWY // Fallback for non-blockable, in-place, or zero divisor cases BINARY_LOOP { const T in1 = *reinterpret_cast(ip1); const T in2 = *reinterpret_cast(ip2); - if (HWY_UNLIKELY(in2 
== 0)) { + if (NPY_UNLIKELY(in2 == 0)) { npy_set_floatstatus_divbyzero(); *reinterpret_cast(op1) = 0; } else { @@ -465,7 +474,7 @@ int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context), T* indexed = (T*)(ip1 + is1 * indx); T divisor = *(T*)value; - if (HWY_UNLIKELY(divisor == 0)) { + if (NPY_UNLIKELY(divisor == 0)) { npy_set_floatstatus_divbyzero(); *indexed = 0; } else { @@ -524,6 +533,3 @@ int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context), #undef DEFINE_DIVIDE_FUNCTION #undef DEFINE_DIVIDE_FUNCTION_UNSIGNED - -} // namespace HWY_NAMESPACE -HWY_AFTER_NAMESPACE(); From fce87004a0dc93ccadaa030b7b9c103ff133b655 Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Mon, 18 Aug 2025 16:27:34 +0530 Subject: [PATCH 11/16] remove baseline HWY dispatch --- numpy/_core/src/umath/loops_arithmetic.dispatch.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 849d0733a988..1ca6172418c1 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -3,9 +3,6 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #include "npy_cpu_dispatch.h" -#if NPY__CPU_TARGET_INDEX == 0 -#define HWY_COMPILE_ONLY_SCALAR 1 -#endif #include "numpy/npy_common.h" #include "numpy/npy_math.h" From 896a08780a703b8591eefb0a6425fe2be368013f Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Mon, 18 Aug 2025 22:04:55 +0530 Subject: [PATCH 12/16] use XOR instead of Zero --- numpy/_core/src/umath/loops_arithmetic.dispatch.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 1ca6172418c1..1c0eff89c0ea 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -82,8 +82,8 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) else { // General case with floor division semantics const auto vec_scalar = Set(scalar); - const auto vec_zero = Zero(); const auto one = Set(static_cast(1)); + const auto vec_zero = Xor(one, one); size_t i = 0; for (; i + N <= static_cast(len); i += N) { @@ -157,8 +157,8 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { bool raise_overflow = false; bool raise_divbyzero = false; - const auto vec_zero = Zero(); const auto vec_one = Set(static_cast(1)); + const auto vec_zero = Xor(vec_one, vec_one); const auto vec_min_val = Set(static_cast(std::numeric_limits::min())); const auto vec_neg_one = Set(static_cast(-1)); @@ -237,8 +237,8 @@ void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { const size_t N = Lanes(T{}); bool raise_divbyzero = false; - const auto vec_zero = Zero(); const auto vec_one = Set(static_cast(1)); + const auto vec_zero = Xor(vec_one, vec_one); size_t i = 0; for (; i + N <= static_cast(len); i += N) { From 1efaa66bee3849bc875f7133ee21dbeac0e8e51b Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Tue, 19 Aug 2025 17:29:34 +0530 Subject: [PATCH 13/16] remove VSX4 --- numpy/_core/meson.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index 24163080589c..c8906679412b 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -936,7 +936,7 @@ foreach gen_mtargets : [ [ AVX512_SKX, AVX512F, AVX2, SSE41, SSE2, NEON, - 
VSX4, VSX2, + VSX2, VX, LSX, RVV, From 7a530b70d43c206ff3a985a6116a3a7b79bd9dcc Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Thu, 21 Aug 2025 15:50:50 +0530 Subject: [PATCH 14/16] fix comment --- .../src/umath/loops_arithmetic.dispatch.cpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 1c0eff89c0ea..6c0204cef01a 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -323,7 +323,7 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, #if NPY_HWY // Handle array-array case - if (IS_BLOCKABLE_BINARY(sizeof(T), NPY_SIMD_WIDTH)) + if (IS_BLOCKABLE_BINARY(sizeof(T), kMaxLanes)) { bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]) && nomemoverlap(args[2], steps[2], args[1], steps[1], dimensions[0]); @@ -336,7 +336,7 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, return; } } - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), kMaxLanes) && *reinterpret_cast(args[1]) != 0) { bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]); @@ -388,7 +388,7 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons } #if NPY_HWY // Handle array-array case - if (IS_BLOCKABLE_BINARY(sizeof(T), NPY_SIMD_WIDTH)) { + if (IS_BLOCKABLE_BINARY(sizeof(T), kMaxLanes)) { bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]) && nomemoverlap(args[2], steps[2], args[1], steps[1], dimensions[0]); // Check if we can use SIMD for contiguous arrays - all steps must equal to sizeof(T) @@ -400,7 +400,7 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons return; } } - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) && + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), kMaxLanes) && *reinterpret_cast(args[1]) != 0) { bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]); @@ -483,25 +483,21 @@ int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context), // Macro to define the dispatch functions for signed types #define DEFINE_DIVIDE_FUNCTION(TYPE, SCALAR_TYPE) \ - extern "C" { \ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(TYPE##_divide)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) { \ TYPE_divide(args, dimensions, steps, func); \ } \ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_divide_indexed)(PyArrayMethod_Context *context, char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *func) { \ return TYPE_divide_indexed(context, args, dimensions, steps, func); \ - } \ - } // extern "C" + } // Macro to define the dispatch functions for unsigned types #define DEFINE_DIVIDE_FUNCTION_UNSIGNED(TYPE, SCALAR_TYPE) \ - extern "C" { \ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(TYPE##_divide)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) { \ TYPE_divide_unsigned(args, dimensions, steps, func); \ } \ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_divide_indexed)(PyArrayMethod_Context *context, char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *func) { \ return TYPE_divide_unsigned_indexed(context, args, dimensions, steps, func); \ - } \ - } // extern "C" + } #ifdef 
NPY_CPU_DISPATCH_CURFX From c9f99b3999ea3884709144a1cf980940b4d71f9c Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Fri, 22 Aug 2025 16:47:21 +0530 Subject: [PATCH 15/16] change set --- .../_core/src/umath/loops_arithmetic.dispatch.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp index 6c0204cef01a..3118dd4ff599 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -57,7 +57,7 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) } } else if (scalar == static_cast(-1)) { - const auto vec_min_val = Set(static_cast(std::numeric_limits::min())); + const auto vec_min_val = Set(d, static_cast(std::numeric_limits::min())); size_t i = 0; for (; i + N <= static_cast(len); i += N) { const auto vec_src = LoadU(src + i); @@ -81,8 +81,8 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) } else { // General case with floor division semantics - const auto vec_scalar = Set(scalar); - const auto one = Set(static_cast(1)); + const auto vec_scalar = Set(d, scalar); + const auto one = Set(d, static_cast(1)); const auto vec_zero = Xor(one, one); size_t i = 0; @@ -157,10 +157,10 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { bool raise_overflow = false; bool raise_divbyzero = false; - const auto vec_one = Set(static_cast(1)); + const auto vec_one = Set(d, static_cast(1)); const auto vec_zero = Xor(vec_one, vec_one); - const auto vec_min_val = Set(static_cast(std::numeric_limits::min())); - const auto vec_neg_one = Set(static_cast(-1)); + const auto vec_min_val = Set(d, static_cast(std::numeric_limits::min())); + const auto vec_neg_one = Set(d, static_cast(-1)); size_t i = 0; for (; i + N <= static_cast(len); i += N) { @@ -237,7 +237,7 @@ void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { const size_t N = Lanes(T{}); bool raise_divbyzero = false; - const auto vec_one = Set(static_cast(1)); + const auto vec_one = Set(d, static_cast(1)); const auto vec_zero = Xor(vec_one, vec_one); size_t i = 0; From 1d5826f0d1b5833b7fd4e69fa26b4d04ad0f348b Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Fri, 22 Aug 2025 17:48:05 +0530 Subject: [PATCH 16/16] change CI config --- .github/workflows/linux_qemu.yml | 4 ++-- numpy/_core/meson.build | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/linux_qemu.yml b/.github/workflows/linux_qemu.yml index 53780ae097a3..197007845cb6 100644 --- a/.github/workflows/linux_qemu.yml +++ b/.github/workflows/linux_qemu.yml @@ -42,7 +42,7 @@ jobs: "ppc64le", "powerpc64le-linux-gnu", "ppc64le/ubuntu:22.04", - "-Dallow-noblas=true", + "-Dallow-noblas=true -Dcpu-dispatch=vsx,vsx2,vsx3", "test_kind or test_multiarray or test_simd or test_umath or test_ufunc", "ppc64le" ] @@ -50,7 +50,7 @@ jobs: "ppc64le - baseline(Power9)", "powerpc64le-linux-gnu", "ppc64le/ubuntu:22.04", - "-Dallow-noblas=true -Dcpu-baseline=vsx3", + "-Dallow-noblas=true -Dcpu-baseline=vsx3 -Dcpu-dispatch=vsx,vsx2,vsx3", "test_kind or test_multiarray or test_simd or test_umath or test_ufunc", "ppc64le" ] diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index c8906679412b..24163080589c 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -936,7 +936,7 @@ foreach gen_mtargets : [ [ AVX512_SKX, AVX512F, AVX2, SSE41, SSE2, NEON, - 
VSX2, + VSX4, VSX2, VX, LSX, RVV,