LoongArch: modify LSX optimization (PR 25215) for newest branch

abhishek-iitmadras · Dec 20, 2024 · 7c35c37 · 7c35c37
1 parent 35b2c4a
commit 7c35c37
Show file tree

Hide file tree

Showing 14 changed files with 110 additions and 15 deletions.
diff --git a/meson.options b/meson.options
@@ -35,6 +35,7 @@ option('test-simd', type: 'array',
           'VSX', 'VSX2', 'VSX3', 'VSX4',
           'NEON', 'ASIMD',
           'VX', 'VXE', 'VXE2',
+          'LSX',
         ],
         description: 'Specify a list of CPU features to be tested against NumPy SIMD interface')
 option('test-simd-args', type: 'string', value: '',

diff --git a/meson_cpu/loongarch64/meson.build b/meson_cpu/loongarch64/meson.build
@@ -0,0 +1,8 @@
+source_root = meson.project_source_root()
+mod_features = import('features')
+# LSX: built with '-mlsx', uses checks/cpu_lsx.c as test_code; the `1` is presumably its interest rank (TODO confirm).
+LSX = mod_features.new(
+  'LSX', 1, args: ['-mlsx'],
+  test_code: files(source_root + '/numpy/distutils/checks/cpu_lsx.c')[0]
+)
+LOONGARCH64_FEATURES = {'LSX': LSX}
diff --git a/meson_cpu/main_config.h.in b/meson_cpu/main_config.h.in
@@ -389,4 +389,8 @@
 #ifdef @P@HAVE_RVV
     #include <riscv_vector.h>
 #endif
+
+#ifdef @P@HAVE_LSX
+    #include <lsxintrin.h>
+#endif
 #endif // @P@_CPU_DISPATCHER_CONF_H_
diff --git a/meson_cpu/meson.build b/meson_cpu/meson.build
@@ -76,13 +76,15 @@ subdir('ppc64')
 subdir('s390x')
 subdir('arm')
 subdir('riscv64')
+subdir('loongarch64')
 
 CPU_FEATURES = {}
 CPU_FEATURES += ARM_FEATURES
 CPU_FEATURES += X86_FEATURES
 CPU_FEATURES += PPC64_FEATURES
 CPU_FEATURES += S390X_FEATURES
 CPU_FEATURES += RV64_FEATURES
+CPU_FEATURES += LOONGARCH64_FEATURES
 
 # Parse the requested baseline (CPU_CONF_BASELINE) and dispatch features
 # (CPU_CONF_DISPATCH).
@@ -97,6 +99,7 @@ min_features = {
   'aarch64': [ASIMD],
   'riscv64': [],
   'wasm32': [],
+  'loongarch64': [LSX],
 }.get(cpu_family, [])
 if host_machine.endian() == 'little' and cpu_family == 'ppc64'
   min_features = [VSX2]
@@ -112,6 +115,7 @@ max_features_dict = {
   'aarch64': ARM_FEATURES,
   'riscv64': RV64_FEATURES,
   'wasm32': {},
+  'loongarch64': LOONGARCH64_FEATURES,
 }.get(cpu_family, {})
 max_features = []
 foreach fet_name, fet_obj : max_features_dict

diff --git a/numpy/_core/include/numpy/npy_cpu.h b/numpy/_core/include/numpy/npy_cpu.h
@@ -109,8 +109,8 @@
     #elif __riscv_xlen == 32
 	#define NPY_CPU_RISCV32
     #endif
-#elif defined(__loongarch__)
-    #define NPY_CPU_LOONGARCH
+#elif defined(__loongarch64)
+    #define NPY_CPU_LOONGARCH64
 #elif defined(__EMSCRIPTEN__)
     /* __EMSCRIPTEN__ is defined by emscripten: an LLVM-to-Web compiler */
     #define NPY_CPU_WASM

diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build
@@ -97,6 +97,10 @@ if use_svml
   endif
 endif
 
+if host_machine.cpu_family() == 'loongarch64'  # defines HWY_COMPILE_ONLY_SCALAR for every C++ source; presumably keeps Highway scalar-only here (confirm)
+  add_project_arguments(['-DHWY_COMPILE_ONLY_SCALAR'], language: ['cpp'])
+endif
+
 use_highway = not get_option('disable-highway')
 if use_highway and not fs.exists('src/highway/README.md')
   error('Missing the `highway` git submodule! Run `git submodule update --init` to fix this.')
@@ -880,6 +884,7 @@ foreach gen_mtargets : [
       ASIMD, NEON,
       VSX3, VSX2,
       VXE, VX,
+      LSX,
     ]
   ],
   [
@@ -890,6 +895,7 @@ foreach gen_mtargets : [
       NEON,
       VSX4, VSX2,
       VX,
+      LSX,
     ]
   ],
   [
@@ -900,6 +906,7 @@ foreach gen_mtargets : [
       VSX3, VSX2,
       NEON,
       VXE, VX,
+      LSX,
     ]
   ],
   [
@@ -916,7 +923,8 @@ foreach gen_mtargets : [
       AVX512_SKX, [AVX2, FMA3],
       VSX4, VSX2,
       NEON_VFPV4,
-      VXE
+      VXE,
+      LSX,
     ]
   ],
   [
@@ -927,6 +935,7 @@ foreach gen_mtargets : [
       AVX512_SKX, AVX2, SSE2,
       VSX2,
       VX,
+      LSX,
     ]
   ],
   [
@@ -937,6 +946,7 @@ foreach gen_mtargets : [
       AVX512_SKX, AVX2, SSE2,
       VSX2,
       VXE, VX,
+      LSX,
     ]
   ],
   [
@@ -954,6 +964,7 @@ foreach gen_mtargets : [
       VSX4, VSX3, VSX2,
       NEON_VFPV4,
       VXE2, VXE,
+      LSX,
     ]
   ],
   [
@@ -968,7 +979,8 @@ foreach gen_mtargets : [
       ASIMD, NEON,
       AVX512_SKX, AVX2, SSE2,
       VSX2,
-      VXE, VX
+      VXE, VX,
+      LSX,
     ]
   ],
   [
@@ -978,7 +990,8 @@ foreach gen_mtargets : [
       SSE41, SSE2,
       VSX2,
       ASIMD, NEON,
-      VXE, VX
+      VXE, VX,
+      LSX,
     ]
   ],
   [
@@ -988,6 +1001,7 @@ foreach gen_mtargets : [
       SSE41, SSE2,
       VSX2,
       ASIMD, NEON,
+      LSX,
     ]
   ],
   [
@@ -998,6 +1012,7 @@ foreach gen_mtargets : [
       ASIMD, NEON,
       VSX3, VSX2,
       VXE, VX,
+      LSX,
     ]
   ],
   [
@@ -1008,6 +1023,7 @@ foreach gen_mtargets : [
       NEON,
       VSX2,
       VX,
+      LSX,
     ]
   ],
 ]

diff --git a/numpy/_core/src/common/npy_cpu_features.c b/numpy/_core/src/common/npy_cpu_features.c
@@ -125,7 +125,8 @@ static struct {
                 {NPY_CPU_FEATURE_ASIMDDP, "ASIMDDP"},
                 {NPY_CPU_FEATURE_ASIMDFHM, "ASIMDFHM"},
                 {NPY_CPU_FEATURE_SVE, "SVE"},
-                {NPY_CPU_FEATURE_RVV, "RVV"}};
+                {NPY_CPU_FEATURE_RVV, "RVV"},
+                {NPY_CPU_FEATURE_LSX, "LSX"}};
 
 
 NPY_VISIBILITY_HIDDEN PyObject *
@@ -665,6 +666,25 @@ npy__cpu_init_features(void)
     npy__cpu_have[NPY_CPU_FEATURE_VX]  = 1;
 }
 
+/***************** LoongArch ******************/
+
+#elif defined(__loongarch64)
+
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+
+/* LoongArch64: clear the feature table, then flag LSX if AT_HWCAP reports it. */
+static void
+npy__cpu_init_features(void)
+{
+    memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX);
+    /* getauxval() returns unsigned long; keep the full width instead of narrowing. */
+    unsigned long hwcap = getauxval(AT_HWCAP);
+    if (hwcap & HWCAP_LOONGARCH_LSX) {
+        npy__cpu_have[NPY_CPU_FEATURE_LSX] = 1;
+    }
+}
+
 
 /***************** ARM ******************/
 

diff --git a/numpy/_core/src/common/npy_cpu_features.h b/numpy/_core/src/common/npy_cpu_features.h
@@ -91,7 +91,7 @@ enum npy_cpu_features
 
     // IBM/ZARCH
     NPY_CPU_FEATURE_VX                = 350,
- 
+
     // Vector-Enhancements Facility 1
     NPY_CPU_FEATURE_VXE               = 351,
 
@@ -101,6 +101,9 @@ enum npy_cpu_features
     // RISC-V
     NPY_CPU_FEATURE_RVV               = 400,
 
+    // LOONGARCH
+    NPY_CPU_FEATURE_LSX               = 500,
+
     NPY_CPU_FEATURE_MAX
 };
 
@@ -113,7 +116,7 @@ enum npy_cpu_features
  *  - uses 'NPY_DISABLE_CPU_FEATURES' to disable dispatchable features
  *  - uses 'NPY_ENABLE_CPU_FEATURES' to enable dispatchable features
  *
- * It will set a RuntimeError when 
+ * It will set a RuntimeError when
  *  - CPU baseline features from the build are not supported at runtime
  *  - 'NPY_DISABLE_CPU_FEATURES' tries to disable a baseline feature
  *  - 'NPY_DISABLE_CPU_FEATURES' and 'NPY_ENABLE_CPU_FEATURES' are
@@ -122,14 +125,14 @@ enum npy_cpu_features
  *    by the machine or build
  *  - 'NPY_ENABLE_CPU_FEATURES' tries to enable a feature when the project was
  *    not built with any feature optimization support
- *  
+ *
  * It will set an ImportWarning when:
  *  - 'NPY_DISABLE_CPU_FEATURES' tries to disable a feature that is not supported
  *    by the machine or build
  *  - 'NPY_DISABLE_CPU_FEATURES' or 'NPY_ENABLE_CPU_FEATURES' tries to
  *    disable/enable a feature when the project was not built with any feature
  *    optimization support
- * 
+ *
  * return 0 on success otherwise return -1
  */
 NPY_VISIBILITY_HIDDEN int

diff --git a/numpy/_core/src/common/simd/intdiv.h b/numpy/_core/src/common/simd/intdiv.h
@@ -216,6 +216,10 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d)
     divisor.val[0] = npyv_setall_u8(m);
     divisor.val[1] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh1));
     divisor.val[2] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh2));
+#elif defined(NPY_HAVE_LSX)
+    divisor.val[0] = npyv_setall_u16(m);
+    divisor.val[1] = npyv_setall_u8(sh1);
+    divisor.val[2] = npyv_setall_u8(sh2);
 #else
     #error "please initialize the shifting operand for the new architecture"
 #endif
@@ -225,7 +229,7 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d)
 NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d);
 NPY_FINLINE npyv_s8x3 npyv_divisor_s8(npy_int8 d)
 {
-#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
+#if defined(NPY_HAVE_SSE2) || defined(NPY_HAVE_LSX) // SSE/AVX2/AVX512/LSX
     npyv_s16x3 p = npyv_divisor_s16(d);
     npyv_s8x3 r;
     r.val[0] = npyv_reinterpret_s8_s16(p.val[0]);
@@ -291,6 +295,9 @@ NPY_FINLINE npyv_u16x3 npyv_divisor_u16(npy_uint16 d)
 #elif defined(NPY_HAVE_NEON)
     divisor.val[1] = npyv_reinterpret_u16_s16(npyv_setall_s16(-sh1));
     divisor.val[2] = npyv_reinterpret_u16_s16(npyv_setall_s16(-sh2));
+#elif defined(NPY_HAVE_LSX)
+    divisor.val[1] = npyv_setall_u16(sh1);
+    divisor.val[2] = npyv_setall_u16(sh2);
 #else
     #error "please initialize the shifting operand for the new architecture"
 #endif
@@ -321,6 +328,8 @@ NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d)
     divisor.val[1] = npyv_setall_s16(sh);
 #elif defined(NPY_HAVE_NEON)
     divisor.val[1] = npyv_setall_s16(-sh);
+#elif defined(NPY_HAVE_LSX)
+    divisor.val[1] = npyv_setall_s16(sh);
 #else
     #error "please initialize the shifting operand for the new architecture"
 #endif
@@ -358,6 +367,9 @@ NPY_FINLINE npyv_u32x3 npyv_divisor_u32(npy_uint32 d)
 #elif defined(NPY_HAVE_NEON)
     divisor.val[1] = npyv_reinterpret_u32_s32(npyv_setall_s32(-sh1));
     divisor.val[2] = npyv_reinterpret_u32_s32(npyv_setall_s32(-sh2));
+#elif defined(NPY_HAVE_LSX)
+    divisor.val[1] = npyv_setall_u32(sh1);
+    divisor.val[2] = npyv_setall_u32(sh2);
 #else
     #error "please initialize the shifting operand for the new architecture"
 #endif
@@ -393,6 +405,8 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d)
     divisor.val[1] = npyv_setall_s32(sh);
 #elif defined(NPY_HAVE_NEON)
     divisor.val[1] = npyv_setall_s32(-sh);
+#elif defined(NPY_HAVE_LSX)
+    divisor.val[1] = npyv_setall_s32(sh);
 #else
     #error "please initialize the shifting operand for the new architecture"
 #endif
@@ -427,6 +441,9 @@ NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d)
     #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
         divisor.val[1] = npyv_set_u64(sh1);
         divisor.val[2] = npyv_set_u64(sh2);
+    #elif defined(NPY_HAVE_LSX)
+        divisor.val[1] = npyv_setall_u64(sh1);
+        divisor.val[2] = npyv_setall_u64(sh2);
     #else
         #error "please initialize the shifting operand for the new architecture"
     #endif
@@ -465,6 +482,8 @@ NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d)
     divisor.val[2] = npyv_setall_s64(d < 0 ? -1 : 0);  // sign of divisor
     #ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
     divisor.val[1] = npyv_set_s64(sh);
+    #elif defined(NPY_HAVE_LSX)
+    divisor.val[1] = npyv_setall_s64(sh);
     #else
         #error "please initialize the shifting operand for the new architecture"
     #endif

diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src
@@ -36,7 +36,7 @@
  *     q = TRUNC((n - (-dsign ) + (-nsign))/d) - (-qsign);
  ********************************************************************************/
 
-#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON)
+#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX)
     // Due to integer 128-bit multiplication emulation, SIMD 64-bit division
     // may not perform well on both neon and up to VSX3 compared to scalar
     // division.
@@ -452,7 +452,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed)
  * Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles.
  * Power10(VSX4) is an exception here since it has native support for integer vector division.
  */
-#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX))
     #undef TO_SIMD_SFX
 #endif
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)

diff --git a/numpy/_core/tests/test_cpu_dispatcher.py b/numpy/_core/tests/test_cpu_dispatcher.py
@@ -12,7 +12,7 @@ def test_dispatcher():
         "SSE2", "SSE41", "AVX2",
         "VSX", "VSX2", "VSX3",
         "NEON", "ASIMD", "ASIMDHP",
-        "VX", "VXE"
+        "VX", "VXE", "LSX"
     )
     highest_sfx = ""  # no suffix for the baseline
     all_sfx = []

diff --git a/numpy/_core/tests/test_cpu_features.py b/numpy/_core/tests/test_cpu_features.py
@@ -420,3 +420,12 @@ def load_flags(self):
                 # if the kernel reports any one of the following ARM8 features.
                 "ASIMD": ("AES", "SHA1", "SHA2", "PMULL", "CRC32")
             }
+
+
+is_loongarch = re.match("^(loongarch)", machine, re.IGNORECASE)  # truthy when `machine` starts with "loongarch" (any case)
+@pytest.mark.skipif(not is_linux or not is_loongarch, reason="Only for Linux and LoongArch")
+class Test_LOONGARCH_Features(AbstractTest):  # skipped unless both is_linux and is_loongarch are truthy
+    features = ["LSX"]  # presumably the feature names AbstractTest verifies -- confirm against the base class
+
+    def load_flags(self):
+        self.load_flags_cpuinfo("Features")  # calls load_flags_cpuinfo (not defined in this class) with the "Features" key
diff --git a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py
@@ -325,7 +325,7 @@ class _Config:
         ## ARMv8.2 dot product
         ASIMDDP = dict(interest=6, implies="ASIMD"),
         ## ARMv8.2 Single & half-precision Multiply
-        ASIMDFHM = dict(interest=7, implies="ASIMDHP"),
+        ASIMDFHM = dict(interest=7, implies="ASIMDHP")
     )
     def conf_features_partial(self):
         """Return a dictionary of supported CPU features by the platform,