diff --git a/.github/workflows/linux_qemu.yml b/.github/workflows/linux_qemu.yml index 53780ae097a3..197007845cb6 100644 --- a/.github/workflows/linux_qemu.yml +++ b/.github/workflows/linux_qemu.yml @@ -42,7 +42,7 @@ jobs: "ppc64le", "powerpc64le-linux-gnu", "ppc64le/ubuntu:22.04", - "-Dallow-noblas=true", + "-Dallow-noblas=true -Dcpu-dispatch=vsx,vsx2,vsx3", "test_kind or test_multiarray or test_simd or test_umath or test_ufunc", "ppc64le" ] @@ -50,7 +50,7 @@ jobs: "ppc64le - baseline(Power9)", "powerpc64le-linux-gnu", "ppc64le/ubuntu:22.04", - "-Dallow-noblas=true -Dcpu-baseline=vsx3", + "-Dallow-noblas=true -Dcpu-baseline=vsx3 -Dcpu-dispatch=vsx,vsx2,vsx3", "test_kind or test_multiarray or test_simd or test_umath or test_ufunc", "ppc64le" ] diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index b4c769810ad8..24163080589c 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -932,13 +932,14 @@ foreach gen_mtargets : [ ], [ 'loops_arithmetic.dispatch.h', - src_file.process('src/umath/loops_arithmetic.dispatch.c.src'), + 'src/umath/loops_arithmetic.dispatch.cpp', [ AVX512_SKX, AVX512F, AVX2, SSE41, SSE2, NEON, VSX4, VSX2, VX, LSX, + RVV, ] ], [ diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src deleted file mode 100644 index c9efe5579e71..000000000000 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src +++ /dev/null @@ -1,521 +0,0 @@ -#define _UMATHMODULE -#define _MULTIARRAYMODULE -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#include "simd/simd.h" -#include "loops_utils.h" -#include "loops.h" -#include "lowlevel_strided_loops.h" -// Provides the various *_LOOP macros -#include "fast_loop_macros.h" - -//############################################################################### -//## Division -//############################################################################### -/******************************************************************************** - ** Defining the SIMD kernels - * - * Floor division of signed is based on T. Granlund and P. L. Montgomery - * "Division by invariant integers using multiplication(see [Figure 6.1] - * https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556)" - * For details on TRUNC division see simd/intdiv.h for more clarification - *********************************************************************************** - ** Figure 6.1: Signed division by run-time invariant divisor, rounded towards -INF - *********************************************************************************** - * For q = FLOOR(a/d), all sword: - * sword -dsign = SRL(d, N - 1); - * uword -nsign = (n < -dsign); - * uword -qsign = EOR(-nsign, -dsign); - * q = TRUNC((n - (-dsign ) + (-nsign))/d) - (-qsign); - ********************************************************************************/ - -#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX) - // Due to integer 128-bit multiplication emulation, SIMD 64-bit division - // may not perform well on both neon and up to VSX3 compared to scalar - // division. 
- #define SIMD_DISABLE_DIV64_OPT -#endif - -#if NPY_SIMD -/**begin repeat - * Signed types - * #sfx = s8, s16, s32, s64# - * #len = 8, 16, 32, 64# - */ -#if @len@ < 64 || (@len@ == 64 && !defined(SIMD_DISABLE_DIV64_OPT)) -static inline void -simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2]; - const int vstep = npyv_nlanes_@sfx@; - const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); - - if (scalar == -1) { - npyv_b@len@ noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1)); - const npyv_@sfx@ vzero = npyv_zero_@sfx@(); - const npyv_@sfx@ vmin = npyv_setall_@sfx@(NPY_MIN_INT@len@); - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_@sfx@ a = npyv_load_@sfx@(src); - npyv_b@len@ gt_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@)); - noverflow = npyv_and_b@len@(noverflow, gt_min); - npyv_@sfx@ neg = npyv_ifsub_@sfx@(gt_min, vzero, a, vmin); - npyv_store_@sfx@(dst, neg); - } - - int raise_err = npyv_tobits_b@len@(npyv_not_b@len@(noverflow)) != 0; - for (; len > 0; --len, ++src, ++dst) { - npyv_lanetype_@sfx@ a = *src; - if (a == NPY_MIN_INT@len@) { - raise_err = 1; - *dst = NPY_MIN_INT@len@; - } else { - *dst = -a; - } - } - if (raise_err) { - npy_set_floatstatus_overflow(); - } - } else { - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_@sfx@ nsign_d = npyv_setall_@sfx@(scalar < 0); - npyv_@sfx@ a = npyv_load_@sfx@(src); - npyv_@sfx@ nsign_a = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d)); - nsign_a = npyv_and_@sfx@(nsign_a, npyv_setall_@sfx@(1)); - npyv_@sfx@ diff_sign = npyv_sub_@sfx@(nsign_a, nsign_d); - npyv_@sfx@ to_ninf = npyv_xor_@sfx@(nsign_a, nsign_d); - npyv_@sfx@ trunc = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor); - npyv_@sfx@ floor = npyv_sub_@sfx@(trunc, to_ninf); - npyv_store_@sfx@(dst, floor); - } - - for (; len > 0; --len, ++src, ++dst) { - const npyv_lanetype_@sfx@ a = *src; - npyv_lanetype_@sfx@ r = a / scalar; - // Negative quotients needs to be rounded down - if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) { - r--; - } - *dst = r; - } - } - npyv_cleanup(); -} -#endif -/**end repeat**/ - -/**begin repeat - * Unsigned types - * #sfx = u8, u16, u32, u64# - * #len = 8, 16, 32, 64# - */ -#if @len@ < 64 || (@len@ == 64 && !defined(SIMD_DISABLE_DIV64_OPT)) -static inline void -simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2]; - const int vstep = npyv_nlanes_@sfx@; - const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); - - for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { - npyv_@sfx@ a = npyv_load_@sfx@(src); - npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor); - npyv_store_@sfx@(dst, c); - } - - for (; len > 0; --len, ++src, ++dst) { - const npyv_lanetype_@sfx@ a = *src; - *dst = a / scalar; - } - npyv_cleanup(); -} -#endif -/**end repeat**/ - -#if defined(NPY_HAVE_VSX4) - -/**begin repeat - * #t = u, s# - * #signed = 0, 1# - */ -/* - * Computes division of 2 8-bit signed/unsigned integer vectors - * - * As Power10 only supports integer vector division for data of 32 bits or - * greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer - * vector division 
instruction, and then, convert the result back to npyv_u8. - */ -NPY_FINLINE npyv_@t@8 -vsx4_div_@t@8(npyv_@t@8 a, npyv_@t@8 b) -{ -#if @signed@ - npyv_s16x2 ta, tb; - npyv_s32x2 ahi, alo, bhi, blo; - ta.val[0] = vec_unpackh(a); - ta.val[1] = vec_unpackl(a); - tb.val[0] = vec_unpackh(b); - tb.val[1] = vec_unpackl(b); - ahi.val[0] = vec_unpackh(ta.val[0]); - ahi.val[1] = vec_unpackl(ta.val[0]); - alo.val[0] = vec_unpackh(ta.val[1]); - alo.val[1] = vec_unpackl(ta.val[1]); - bhi.val[0] = vec_unpackh(tb.val[0]); - bhi.val[1] = vec_unpackl(tb.val[0]); - blo.val[0] = vec_unpackh(tb.val[1]); - blo.val[1] = vec_unpackl(tb.val[1]); -#else - npyv_u16x2 a_expand = npyv_expand_u16_u8(a); - npyv_u16x2 b_expand = npyv_expand_u16_u8(b); - npyv_u32x2 ahi = npyv_expand_u32_u16(a_expand.val[0]); - npyv_u32x2 alo = npyv_expand_u32_u16(a_expand.val[1]); - npyv_u32x2 bhi = npyv_expand_u32_u16(b_expand.val[0]); - npyv_u32x2 blo = npyv_expand_u32_u16(b_expand.val[1]); -#endif - npyv_@t@32 v1 = vec_div(ahi.val[0], bhi.val[0]); - npyv_@t@32 v2 = vec_div(ahi.val[1], bhi.val[1]); - npyv_@t@32 v3 = vec_div(alo.val[0], blo.val[0]); - npyv_@t@32 v4 = vec_div(alo.val[1], blo.val[1]); - npyv_@t@16 hi = vec_pack(v1, v2); - npyv_@t@16 lo = vec_pack(v3, v4); - return vec_pack(hi, lo); -} - -NPY_FINLINE npyv_@t@16 -vsx4_div_@t@16(npyv_@t@16 a, npyv_@t@16 b) -{ -#if @signed@ - npyv_s32x2 a_expand; - npyv_s32x2 b_expand; - a_expand.val[0] = vec_unpackh(a); - a_expand.val[1] = vec_unpackl(a); - b_expand.val[0] = vec_unpackh(b); - b_expand.val[1] = vec_unpackl(b); -#else - npyv_u32x2 a_expand = npyv_expand_@t@32_@t@16(a); - npyv_u32x2 b_expand = npyv_expand_@t@32_@t@16(b); -#endif - npyv_@t@32 v1 = vec_div(a_expand.val[0], b_expand.val[0]); - npyv_@t@32 v2 = vec_div(a_expand.val[1], b_expand.val[1]); - return vec_pack(v1, v2); -} - -#define vsx4_div_@t@32 vec_div -#define vsx4_div_@t@64 vec_div -/**end repeat**/ - -/**begin repeat - * Unsigned types - * #sfx = u8, u16, u32, u64# - * #len = 8, 16, 32, 64# - */ -static inline void -vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_@sfx@ *dst1 = (npyv_lanetype_@sfx@ *) args[2]; - const npyv_@sfx@ vzero = npyv_zero_@sfx@(); - const int vstep = npyv_nlanes_@sfx@; - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_@sfx@ a = npyv_load_@sfx@(src1); - npyv_@sfx@ b = npyv_load_@sfx@(src2); - npyv_@sfx@ c = vsx4_div_@sfx@(a, b); - npyv_store_@sfx@(dst1, c); - if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { - npy_set_floatstatus_divbyzero(); - } - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_@sfx@ a = *src1; - const npyv_lanetype_@sfx@ b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else{ - *dst1 = a / b; - } - } - npyv_cleanup(); -} -/**end repeat**/ - -/**begin repeat - * Signed types - * #sfx = s8, s16, s32, s64# - * #len = 8, 16, 32, 64# - */ -static inline void -vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len) -{ - npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0]; - npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1]; - npyv_lanetype_@sfx@ *dst1 = (npyv_lanetype_@sfx@ *) args[2]; - const npyv_@sfx@ vneg_one = npyv_setall_@sfx@(-1); - const npyv_@sfx@ vzero = npyv_zero_@sfx@(); - const npyv_@sfx@ vmin = npyv_setall_@sfx@(NPY_MIN_INT@len@); - npyv_b@len@ warn_zero = 
npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@()); - npyv_b@len@ warn_overflow = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@()); - const int vstep = npyv_nlanes_@sfx@; - - for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, - dst1 += vstep) { - npyv_@sfx@ a = npyv_load_@sfx@(src1); - npyv_@sfx@ b = npyv_load_@sfx@(src2); - npyv_@sfx@ quo = vsx4_div_@sfx@(a, b); - npyv_@sfx@ rem = npyv_sub_@sfx@(a, vec_mul(b, quo)); - // (b == 0 || (a == NPY_MIN_INT@len@ && b == -1)) - npyv_b@len@ bzero = npyv_cmpeq_@sfx@(b, vzero); - npyv_b@len@ amin = npyv_cmpeq_@sfx@(a, vmin); - npyv_b@len@ bneg_one = npyv_cmpeq_@sfx@(b, vneg_one); - npyv_b@len@ overflow = npyv_and_@sfx@(bneg_one, amin); - warn_zero = npyv_or_@sfx@(bzero, warn_zero); - warn_overflow = npyv_or_@sfx@(overflow, warn_overflow); - // handle mixed case the way Python does - // ((a > 0) == (b > 0) || rem == 0) - npyv_b@len@ a_gt_zero = npyv_cmpgt_@sfx@(a, vzero); - npyv_b@len@ b_gt_zero = npyv_cmpgt_@sfx@(b, vzero); - npyv_b@len@ ab_eq_cond = npyv_cmpeq_@sfx@(a_gt_zero, b_gt_zero); - npyv_b@len@ rem_zero = npyv_cmpeq_@sfx@(rem, vzero); - npyv_b@len@ or = npyv_or_@sfx@(ab_eq_cond, rem_zero); - npyv_@sfx@ to_sub = npyv_select_@sfx@(or, vzero, vneg_one); - quo = npyv_add_@sfx@(quo, to_sub); - // Divide by zero - quo = npyv_select_@sfx@(bzero, vzero, quo); - // Overflow - quo = npyv_select_@sfx@(overflow, vmin, quo); - npyv_store_@sfx@(dst1, quo); - } - - if (!vec_all_eq(warn_zero, vzero)) { - npy_set_floatstatus_divbyzero(); - } - if (!vec_all_eq(warn_overflow, vzero)) { - npy_set_floatstatus_overflow(); - } - - for (; len > 0; --len, ++src1, ++src2, ++dst1) { - const npyv_lanetype_@sfx@ a = *src1; - const npyv_lanetype_@sfx@ b = *src2; - if (NPY_UNLIKELY(b == 0)) { - npy_set_floatstatus_divbyzero(); - *dst1 = 0; - } else if (NPY_UNLIKELY((a == NPY_MIN_INT@len@) && (b == -1))) { - npy_set_floatstatus_overflow(); - *dst1 = NPY_MIN_INT@len@; - } else { - *dst1 = a / b; - if (((a > 0) != (b > 0)) && ((*dst1 * b) != a)) { - *dst1 -= 1; - } - } - } - npyv_cleanup(); -} -/**end repeat**/ -#endif // NPY_HAVE_VSX4 -#endif // NPY_SIMD - -/******************************************************************************** - ** Defining ufunc inner functions - ********************************************************************************/ - -/**begin repeat - * Signed types - * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong# - * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# - */ -#undef TO_SIMD_SFX -#if 0 -/**begin repeat1 - * #len = 8, 16, 32, 64# - */ -#elif NPY_BITSOF_@TYPE@ == @len@ - #define TO_SIMD_SFX(X) X##_s@len@ -/**end repeat1**/ -#endif -#if NPY_BITSOF_@TYPE@ == 64 && defined(SIMD_DISABLE_DIV64_OPT) - #undef TO_SIMD_SFX -#endif - -NPY_FINLINE @type@ floor_div_@TYPE@(const @type@ n, const @type@ d) -{ - /* - * FIXME: On x86 at least, dividing the smallest representable integer - * by -1 causes a SIFGPE (division overflow). We treat this case here - * (to avoid a SIGFPE crash at python level), but a good solution would - * be to treat integer division problems separately from FPU exceptions - * (i.e. a different approach than npy_set_floatstatus_divbyzero()). 
- */ - if (NPY_UNLIKELY(d == 0 || (n == NPY_MIN_@TYPE@ && d == -1))) { - if (d == 0) { - npy_set_floatstatus_divbyzero(); - return 0; - } - else { - npy_set_floatstatus_overflow(); - return NPY_MIN_@TYPE@; - } - } - @type@ r = n / d; - // Negative quotients needs to be rounded down - if (((n > 0) != (d > 0)) && ((r * d) != n)) { - r--; - } - return r; -} - -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - io1 = floor_div_@TYPE@(io1, *(@type@*)ip2); - } - *((@type@ *)iop1) = io1; - } -#if NPY_SIMD && defined(TO_SIMD_SFX) -#if defined(NPY_HAVE_VSX4) - // both arguments are arrays of the same size - else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]); - } -#endif - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) && - (*(@type@ *)args[1]) != 0) { - TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]); - } -#endif - else { - BINARY_LOOP { - *((@type@ *)op1) = floor_div_@TYPE@(*(@type@*)ip1, *(@type@*)ip2); - } - } -} - -NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) -(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) -{ - char *ip1 = args[0]; - char *indxp = args[1]; - char *value = args[2]; - npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; - npy_intp shape = steps[3]; - npy_intp n = dimensions[0]; - npy_intp i; - @type@ *indexed; - for(i = 0; i < n; i++, indxp += isindex, value += isb) { - npy_intp indx = *(npy_intp *)indxp; - if (indx < 0) { - indx += shape; - } - indexed = (@type@ *)(ip1 + is1 * indx); - *indexed = floor_div_@TYPE@(*indexed, *(@type@ *)value); - } - return 0; -} - -/**end repeat**/ - -/**begin repeat - * Unsigned types - * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# - * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# - * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG# - */ -#undef TO_SIMD_SFX -#if 0 -/**begin repeat1 - * #len = 8, 16, 32, 64# - */ -#elif NPY_BITSOF_@STYPE@ == @len@ - #define TO_SIMD_SFX(X) X##_u@len@ -/**end repeat1**/ -#endif -/* - * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV fall-backs to the scalar division - * because emulating multiply-high on these architectures is going to be expensive comparing - * to the native scalar dividers. - * Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles. - * Power10(VSX4) is an exception here since it has native support for integer vector division. 
- */ -#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX)) - #undef TO_SIMD_SFX -#endif -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ d = *(@type@ *)ip2; - if (NPY_UNLIKELY(d == 0)) { - npy_set_floatstatus_divbyzero(); - io1 = 0; - } else { - io1 /= d; - } - } - *((@type@ *)iop1) = io1; - } -#if NPY_SIMD && defined(TO_SIMD_SFX) -#if defined(NPY_HAVE_VSX4) - // both arguments are arrays of the same size - else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { - TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]); - } -#endif - // for contiguous block of memory, divisor is a scalar and not 0 - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) && - (*(@type@ *)args[1]) != 0) { - TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]); - } -#endif - else { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *((@type@ *)op1) = 0; - } else{ - *((@type@ *)op1) = in1 / in2; - } - } - } -} - -NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) -(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) -{ - char *ip1 = args[0]; - char *indxp = args[1]; - char *value = args[2]; - npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; - npy_intp shape = steps[3]; - npy_intp n = dimensions[0]; - npy_intp i; - @type@ *indexed; - for(i = 0; i < n; i++, indxp += isindex, value += isb) { - npy_intp indx = *(npy_intp *)indxp; - if (indx < 0) { - indx += shape; - } - indexed = (@type@ *)(ip1 + is1 * indx); - @type@ in2 = *(@type@ *)value; - if (NPY_UNLIKELY(in2 == 0)) { - npy_set_floatstatus_divbyzero(); - *indexed = 0; - } else { - *indexed = *indexed / in2; - } - } - return 0; -} - -/**end repeat**/ diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp new file mode 100644 index 000000000000..3118dd4ff599 --- /dev/null +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.cpp @@ -0,0 +1,528 @@ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "npy_cpu_dispatch.h" + +#include "numpy/npy_common.h" +#include "numpy/npy_math.h" + +#include "loops_utils.h" +#include "loops.h" +#include "fast_loop_macros.h" +#include "simd/simd.hpp" +#include "lowlevel_strided_loops.h" +#include "common.hpp" + +#include // for memcpy +#include +#include + +using namespace np::simd; +#if NPY_HWY +namespace hn = np::simd::hn; +#endif + +// Helper function to set float status +inline void set_float_status(bool overflow, bool divbyzero) { + if (overflow) { + npy_set_floatstatus_overflow(); + } + if (divbyzero) { + npy_set_floatstatus_divbyzero(); + } +} +#if NPY_HWY + +// Signed integer DIVIDE by scalar + +template +void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) { + using D = _Tag; + const D d; + const size_t N = Lanes(T{}); + + bool raise_overflow = false; + bool raise_divbyzero = false; + + if (scalar == 0) { + // Handle division by zero + std::fill(dst, dst + len, static_cast(0)); + raise_divbyzero = true; + } + else if (scalar == 1) { + // Special case for division by 1 + 
if (src != dst) { + std::memcpy(dst, src, len * sizeof(T)); + } + } + else if (scalar == static_cast(-1)) { + const auto vec_min_val = Set(d, static_cast(std::numeric_limits::min())); + size_t i = 0; + for (; i + N <= static_cast(len); i += N) { + const auto vec_src = LoadU(src + i); + const auto is_min_val = Eq(vec_src, vec_min_val); + const auto vec_res = hn::IfThenElse(is_min_val, vec_min_val, hn::Neg(vec_src)); + StoreU(vec_res, dst + i); + if (!raise_overflow && !hn::AllFalse(d, is_min_val)) { + raise_overflow = true; + } + } + // Handle remaining elements + for (; i < static_cast(len); i++) { + T val = src[i]; + if (val == std::numeric_limits::min()) { + dst[i] = std::numeric_limits::min(); + raise_overflow = true; + } else { + dst[i] = -val; + } + } + } + else { + // General case with floor division semantics + const auto vec_scalar = Set(d, scalar); + const auto one = Set(d, static_cast(1)); + const auto vec_zero = Xor(one, one); + size_t i = 0; + + for (; i + N <= static_cast(len); i += N) { + const auto vec_src = LoadU(src + i); + auto vec_div = Div(vec_src, vec_scalar); + const auto vec_mul = Mul(vec_div, vec_scalar); + const auto eq_mask = Eq(vec_src, vec_mul); + const auto diff_signs = Lt(Xor(vec_src, vec_scalar), vec_zero); + const auto adjust = AndNot(eq_mask, diff_signs); + + vec_div = hn::MaskedSubOr(vec_div, adjust, vec_div, one); + StoreU(vec_div, dst + i); + } + + // Handle remaining elements with scalar code + for (; i < static_cast(len); i++) { + T n = src[i]; + T r = n / scalar; + if (((n > 0) != (scalar > 0)) && ((r * scalar) != n)) { + --r; + } + dst[i] = r; + } + } + set_float_status(raise_overflow, raise_divbyzero); +} + +// Unsigned integer DIVIDE by scalar + +template +void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp len) { + + const size_t N = Lanes(T{}); + bool raise_divbyzero = false; + + if (scalar == 0) { + // Handle division by zero + std::fill(dst, dst + len, static_cast(0)); + raise_divbyzero = true; + } + else if (scalar == 1) { + // Special case for division by 1 + if (src != dst) { + std::memcpy(dst, src, len * sizeof(T)); + } + } + else { + const auto vec_scalar = Set(scalar); + size_t i = 0; + for (; i + N <= static_cast(len); i += N) { + const auto vec_src = LoadU(src + i); + const auto vec_res = Div(vec_src, vec_scalar); + StoreU(vec_res, dst + i); + } + // Handle remaining elements + for (; i < static_cast(len); i++) { + dst[i] = src[i] / scalar; + } + } + + set_float_status(false, raise_divbyzero); +} + +// Signed integer DIVIDE array / array + +template +void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) { + using D = _Tag; + const D d; + const size_t N = Lanes(T{}); + + bool raise_overflow = false; + bool raise_divbyzero = false; + const auto vec_one = Set(d, static_cast(1)); + const auto vec_zero = Xor(vec_one, vec_one); + const auto vec_min_val = Set(d, static_cast(std::numeric_limits::min())); + const auto vec_neg_one = Set(d, static_cast(-1)); + + size_t i = 0; + for (; i + N <= static_cast(len); i += N) { + const auto vec_a = LoadU(src1 + i); + const auto vec_b = LoadU(src2 + i); + + const auto b_is_zero = Eq(vec_b, vec_zero); + const auto a_is_min = Eq(vec_a, vec_min_val); + const auto b_is_neg_one = Eq(vec_b, vec_neg_one); + const auto overflow_cond = And(a_is_min, b_is_neg_one); + + const auto safe_div_mask = hn::Not(Or(b_is_zero, overflow_cond)); + const auto safe_b = hn::IfThenElse(Or(b_is_zero, overflow_cond), vec_one, vec_b); + + auto vec_div = Div(vec_a, safe_b); + + if 
(!hn::AllFalse(d, safe_div_mask)) { + const auto vec_mul = Mul(vec_div, safe_b); + const auto has_remainder = hn::Ne(vec_a, vec_mul); + const auto a_sign = Lt(vec_a, vec_zero); + const auto b_sign = Lt(vec_b, vec_zero); + const auto different_signs = Xor(a_sign, b_sign); + const auto needs_adjustment = And(safe_div_mask, + And(has_remainder, different_signs)); + + vec_div = hn::MaskedSubOr(vec_div, needs_adjustment, vec_div, vec_one); + } + + vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); + vec_div = hn::IfThenElse(overflow_cond, vec_min_val, vec_div); + + StoreU(vec_div, dst + i); + + if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) { + raise_divbyzero = true; + } + if (!raise_overflow && !hn::AllFalse(d, overflow_cond)) { + raise_overflow = true; + } + } + + // Handle remaining elements + for (; i < static_cast(len); i++) { + T a = src1[i]; + T b = src2[i]; + + if (b == 0) { + dst[i] = 0; + raise_divbyzero = true; + } + else if (a == std::numeric_limits::min() && b == -1) { + dst[i] = std::numeric_limits::min(); + raise_overflow = true; + } + else { + T r = a / b; + if (((a > 0) != (b > 0)) && ((r * b) != a)) { + --r; + } + dst[i] = r; + } + } + + npy_clear_floatstatus(); + set_float_status(raise_overflow, raise_divbyzero); +} + +// Unsigned integer DIVIDE array / array + +template +void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) { + using D = _Tag; + const D d; + const size_t N = Lanes(T{}); + + bool raise_divbyzero = false; + const auto vec_one = Set(d, static_cast(1)); + const auto vec_zero = Xor(vec_one, vec_one); + + size_t i = 0; + for (; i + N <= static_cast(len); i += N) { + const auto vec_a = LoadU(src1 + i); + const auto vec_b = LoadU(src2 + i); + + const auto b_is_zero = Eq(vec_b, vec_zero); + + const auto safe_b = hn::IfThenElse(b_is_zero, vec_one, vec_b); + + auto vec_div = Div(vec_a, safe_b); + + vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div); + + StoreU(vec_div, dst + i); + + if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) { + raise_divbyzero = true; + } + } + + // Handle remaining elements + for (; i < static_cast(len); i++) { + T a = src1[i]; + T b = src2[i]; + + if (b == 0) { + dst[i] = 0; + raise_divbyzero = true; + } else { + dst[i] = a / b; + } + } + + npy_clear_floatstatus(); + set_float_status(false, raise_divbyzero); +} + +#endif // NPY_HWY + +// Floor division for signed integers +template +T floor_div(T n, T d) { + if (NPY_UNLIKELY(d == 0 || (n == std::numeric_limits::min() && d == -1))) { + if (d == 0) { + npy_set_floatstatus_divbyzero(); + return 0; + } + else { + npy_set_floatstatus_overflow(); + return std::numeric_limits::min(); + } + } + T r = n / d; + if (((n > 0) != (d > 0)) && ((r * d) != n)) { + --r; + } + return r; +} + + +// Dispatch functions for signed integer division +template +void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { + npy_clear_floatstatus(); + if (IS_BINARY_REDUCE) { + BINARY_REDUCE_LOOP(T) { + const T divisor = *reinterpret_cast(ip2); + if (NPY_UNLIKELY(divisor == 0)) { + npy_set_floatstatus_divbyzero(); + io1 = 0; + } else if (NPY_UNLIKELY(io1 == std::numeric_limits::min() && divisor == -1)) { + npy_set_floatstatus_overflow(); + io1 = std::numeric_limits::min(); + } else { + io1 = floor_div(io1, divisor); + } + } + *reinterpret_cast(iop1) = io1; + return; + } + +#if NPY_HWY + // Handle array-array case + if (IS_BLOCKABLE_BINARY(sizeof(T), kMaxLanes)) + { + bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], 
dimensions[0]) && + nomemoverlap(args[2], steps[2], args[1], steps[1], dimensions[0]); + // Check if we can use SIMD for contiguous arrays - all steps must equal to sizeof(T) + if (steps[0] == sizeof(T) && steps[1] == sizeof(T) && steps[2] == sizeof(T) && no_overlap) { + T* src1 = (T*)args[0]; + T* src2 = (T*)args[1]; + T* dst = (T*)args[2]; + simd_divide_contig_signed(src1, src2, dst, dimensions[0]); + return; + } + } + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), kMaxLanes) && + *reinterpret_cast(args[1]) != 0) + { + bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]); + if (no_overlap) { + T* src1 = reinterpret_cast(args[0]); + T* src2 = reinterpret_cast(args[1]); + T* dst = reinterpret_cast(args[2]); + simd_divide_by_scalar_contig_signed(src1, *src2, dst, dimensions[0]); + return; + } + } +#endif // NPY_HWY + + // Scalar fallback + // Fallback for non-blockable, in-place, or zero divisor cases + BINARY_LOOP { + const T dividend = *reinterpret_cast(ip1); + const T divisor = *reinterpret_cast(ip2); + T* result = reinterpret_cast(op1); + + if (NPY_UNLIKELY(divisor == 0)) { + npy_set_floatstatus_divbyzero(); + *result = 0; + } else if (NPY_UNLIKELY(dividend == std::numeric_limits::min() && divisor == -1)) { + npy_set_floatstatus_overflow(); + *result = std::numeric_limits::min(); + } else { + *result = floor_div(dividend, divisor); + } + } +} + +// Dispatch functions for unsigned integer division +template +void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { + npy_clear_floatstatus(); + if (IS_BINARY_REDUCE) { + BINARY_REDUCE_LOOP(T) { + const T d = *reinterpret_cast(ip2); + if (NPY_UNLIKELY(d == 0)) { + npy_set_floatstatus_divbyzero(); + io1 = 0; + } else { + io1 = io1 / d; + } + } + *reinterpret_cast(iop1) = io1; + return; + } +#if NPY_HWY + // Handle array-array case + if (IS_BLOCKABLE_BINARY(sizeof(T), kMaxLanes)) { + bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]) && + nomemoverlap(args[2], steps[2], args[1], steps[1], dimensions[0]); + // Check if we can use SIMD for contiguous arrays - all steps must equal to sizeof(T) + if (steps[0] == sizeof(T) && steps[1] == sizeof(T) && steps[2] == sizeof(T) && no_overlap) { + T* src1 = (T*)args[0]; + T* src2 = (T*)args[1]; + T* dst = (T*)args[2]; + simd_divide_contig_unsigned(src1, src2, dst, dimensions[0]); + return; + } + } + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), kMaxLanes) && + *reinterpret_cast(args[1]) != 0) + { + bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]); + if (no_overlap) { + T* src1 = reinterpret_cast(args[0]); + T* src2 = reinterpret_cast(args[1]); + T* dst = reinterpret_cast(args[2]); + simd_divide_by_scalar_contig_unsigned(src1, *src2, dst, dimensions[0]); + return; + } + } +#endif // NPY_HWY + + // Fallback for non-blockable, in-place, or zero divisor cases + BINARY_LOOP { + const T in1 = *reinterpret_cast(ip1); + const T in2 = *reinterpret_cast(ip2); + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *reinterpret_cast(op1) = 0; + } else { + *reinterpret_cast(op1) = in1 / in2; + } + } +} + +// Indexed division for signed integers +template +int TYPE_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char * const*args, npy_intp const *dimensions, + npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) { + char *ip1 = args[0]; + char *indxp = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], 
isb = steps[2]; + npy_intp shape = steps[3]; + npy_intp n = dimensions[0]; + + for(npy_intp i = 0; i < n; i++, indxp += isindex, value += isb) { + npy_intp indx = *(npy_intp *)indxp; + if (indx < 0) { + indx += shape; + } + T* indexed = (T*)(ip1 + is1 * indx); + T divisor = *(T*)value; + *indexed = floor_div(*indexed, divisor); + } + return 0; +} + +// Indexed division for unsigned integers +template +int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char * const*args, npy_intp const *dimensions, + npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) { + char *ip1 = args[0]; + char *indxp = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp shape = steps[3]; + npy_intp n = dimensions[0]; + + for(npy_intp i = 0; i < n; i++, indxp += isindex, value += isb) { + npy_intp indx = *(npy_intp *)indxp; + if (indx < 0) { + indx += shape; + } + T* indexed = (T*)(ip1 + is1 * indx); + T divisor = *(T*)value; + + if (NPY_UNLIKELY(divisor == 0)) { + npy_set_floatstatus_divbyzero(); + *indexed = 0; + } else { + *indexed = *indexed / divisor; + } + } + return 0; +} + +// Macro to define the dispatch functions for signed types +#define DEFINE_DIVIDE_FUNCTION(TYPE, SCALAR_TYPE) \ + NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(TYPE##_divide)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) { \ + TYPE_divide(args, dimensions, steps, func); \ + } \ + NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_divide_indexed)(PyArrayMethod_Context *context, char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *func) { \ + return TYPE_divide_indexed(context, args, dimensions, steps, func); \ + } + +// Macro to define the dispatch functions for unsigned types +#define DEFINE_DIVIDE_FUNCTION_UNSIGNED(TYPE, SCALAR_TYPE) \ + NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(TYPE##_divide)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) { \ + TYPE_divide_unsigned(args, dimensions, steps, func); \ + } \ + NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_divide_indexed)(PyArrayMethod_Context *context, char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *func) { \ + return TYPE_divide_unsigned_indexed(context, args, dimensions, steps, func); \ + } + + +#ifdef NPY_CPU_DISPATCH_CURFX + DEFINE_DIVIDE_FUNCTION(BYTE, int8_t) + DEFINE_DIVIDE_FUNCTION(SHORT, int16_t) + DEFINE_DIVIDE_FUNCTION(INT, int32_t) + #if NPY_SIZEOF_LONG == 4 + DEFINE_DIVIDE_FUNCTION(LONG, int32_t) + #elif NPY_SIZEOF_LONG == 8 + DEFINE_DIVIDE_FUNCTION(LONG, int64_t) + #endif + DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t) +#endif + +#ifdef NPY_CPU_DISPATCH_CURFX + DEFINE_DIVIDE_FUNCTION_UNSIGNED(UBYTE, uint8_t) + DEFINE_DIVIDE_FUNCTION_UNSIGNED(USHORT, uint16_t) + DEFINE_DIVIDE_FUNCTION_UNSIGNED(UINT, uint32_t) + #if NPY_SIZEOF_LONG == 4 + DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint32_t) + #elif NPY_SIZEOF_LONG == 8 + DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint64_t) + #endif + DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONGLONG, uint64_t) +#endif + +#undef DEFINE_DIVIDE_FUNCTION +#undef DEFINE_DIVIDE_FUNCTION_UNSIGNED
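
For reference, the rounding rule that both the deleted `.c.src` loops and the new `floor_div<T>` in loops_arithmetic.dispatch.cpp implement is truncating division followed by a downward adjustment when the operands have opposite signs and the division is inexact, with divide-by-zero and `INT_MIN / -1` handled as special cases. The following is a minimal standalone sketch of that semantics; `floor_div_sketch` and the boolean flags are illustrative stand-ins (the real code reports errors through `npy_set_floatstatus_divbyzero()` / `npy_set_floatstatus_overflow()`).

```cpp
#include <cassert>
#include <limits>

// Sketch of NumPy's integer floor-division semantics (mirrors floor_div<T>).
template <typename T>
T floor_div_sketch(T n, T d, bool *divbyzero, bool *overflow) {
    if (d == 0) {                         // divide by zero: result 0, flag set
        *divbyzero = true;
        return 0;
    }
    if (n == std::numeric_limits<T>::min() && d == T(-1)) {
        *overflow = true;                 // INT_MIN / -1 cannot be represented
        return std::numeric_limits<T>::min();
    }
    T r = n / d;                          // C++ '/' truncates toward zero
    if (((n > 0) != (d > 0)) && r * d != n) {
        --r;                              // round toward -inf instead
    }
    return r;
}

int main() {
    bool dz = false, ov = false;
    assert(floor_div_sketch(-7, 2, &dz, &ov) == -4);  // truncation would give -3
    assert(floor_div_sketch(7, -2, &dz, &ov) == -4);
    assert(floor_div_sketch(-8, 2, &dz, &ov) == -4);  // exact: no adjustment
    assert(floor_div_sketch(1, 0, &dz, &ov) == 0 && dz);
    return 0;
}
```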
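
The signed divide-by-scalar kernel in the new file is organized as a special-case ladder: divisor 0 fills the output with zeros and raises divide-by-zero, divisor 1 is a plain copy, divisor -1 is a negation guarded against `INT_MIN`, and every other divisor goes through floor division. The scalar sketch below shows that structure only; the real `simd_divide_by_scalar_contig_signed` vectorizes each branch with Highway and handles the non-multiple-of-lane tail separately. `divide_by_scalar_sketch` and its flag parameters are illustrative names, not NumPy API.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <limits>

// Scalar sketch of the special-case ladder used for signed "array / scalar".
template <typename T>
void divide_by_scalar_sketch(const T *src, T scalar, T *dst, std::size_t len,
                             bool *divbyzero, bool *overflow) {
    if (scalar == 0) {                       // zero divisor: all results are 0
        std::fill(dst, dst + len, T(0));
        *divbyzero = true;
    } else if (scalar == 1) {                // identity: straight copy
        if (src != dst) std::memcpy(dst, src, len * sizeof(T));
    } else if (scalar == T(-1)) {            // negation, guarding INT_MIN
        for (std::size_t i = 0; i < len; ++i) {
            if (src[i] == std::numeric_limits<T>::min()) {
                dst[i] = src[i];
                *overflow = true;
            } else {
                dst[i] = T(-src[i]);
            }
        }
    } else {                                 // general case: floor division
        for (std::size_t i = 0; i < len; ++i) {
            T r = src[i] / scalar;
            if (((src[i] > 0) != (scalar > 0)) && r * scalar != src[i]) --r;
            dst[i] = r;
        }
    }
}
```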
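
The new `TYPE_divide` dispatchers only enter the Highway path when the loop is blockable, all three strides equal `sizeof(T)`, and `nomemoverlap()` confirms the output buffer does not problematically overlap the inputs; everything else falls through to the scalar `BINARY_LOOP` using `floor_div`. The predicate below is a simplified sketch of that gating under the assumption of dense buffers of `n` elements; it omits the `IS_BLOCKABLE_BINARY*` alignment/size checks and replaces `nomemoverlap()` with a plain disjointness test, so treat it as illustrative only.

```cpp
#include <cstddef>

// Simplified sketch of the "can we take the SIMD path?" gating.
template <typename T>
bool can_use_simd_path(const char *in0, const char *in1, const char *out,
                       const std::ptrdiff_t steps[3], std::ptrdiff_t n) {
    const auto elem = static_cast<std::ptrdiff_t>(sizeof(T));
    const bool contiguous =
        steps[0] == elem && steps[1] == elem && steps[2] == elem;
    // Stand-in for nomemoverlap(): require the output range to be disjoint
    // from each input range when both are viewed as n contiguous elements.
    auto disjoint = [&](const char *in) {
        return in + n * elem <= out || out + n * elem <= in;
    };
    return contiguous && disjoint(in0) && disjoint(in1);
}
```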