From 644ea59847f51ac9fef512196a8a0431ab1c54a7 Mon Sep 17 00:00:00 2001 From: Naveen Regulla Date: Thu, 9 Apr 2026 14:02:39 +0530 Subject: [PATCH] Fix Windows ARM64EC build issues and correct SIMD ARM NEON path Signed-off-by: nregulla --- .gitignore | 1 + external/OpenJPH/src/core/openjph/ojph_arch.h | 2 +- .../OpenJPH/src/core/others/ojph_arch.cpp | 2 +- src/lib/OpenEXR/ImfSimd.h | 14 +++-- src/lib/OpenEXR/ImfZip.cpp | 6 +-- src/lib/OpenEXRCore/internal_cpuid.h | 4 +- src/lib/OpenEXRCore/internal_dwa_simd.h | 52 +++++++++++++++---- src/lib/OpenEXRCore/internal_zip.c | 25 +++++++-- src/test/OpenEXRCoreTest/base_units.cpp | 2 +- 9 files changed, 81 insertions(+), 27 deletions(-) diff --git a/.gitignore b/.gitignore index 1824615e58..c30331da02 100644 --- a/.gitignore +++ b/.gitignore @@ -64,3 +64,4 @@ __pycache__/ # Ignore Bazel generated files bazel-* MODULE.bazel.lock +*.vsidx diff --git a/external/OpenJPH/src/core/openjph/ojph_arch.h b/external/OpenJPH/src/core/openjph/ojph_arch.h index 01cae776aa..097c46f15c 100644 --- a/external/OpenJPH/src/core/openjph/ojph_arch.h +++ b/external/OpenJPH/src/core/openjph/ojph_arch.h @@ -67,7 +67,7 @@ // preprocessor directives for architecture /////////////////////////////////////////////////////////////////////////////// #if defined(__arm__) || defined(__TARGET_ARCH_ARM) \ - || defined(__aarch64__) || defined(_M_ARM64) + || defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) #define OJPH_ARCH_ARM #elif defined(__i386) || defined(__i386__) || defined(_M_IX86) #define OJPH_ARCH_I386 diff --git a/external/OpenJPH/src/core/others/ojph_arch.cpp b/external/OpenJPH/src/core/others/ojph_arch.cpp index 018e461c95..b44aa1cdb8 100644 --- a/external/OpenJPH/src/core/others/ojph_arch.cpp +++ b/external/OpenJPH/src/core/others/ojph_arch.cpp @@ -170,7 +170,7 @@ namespace ojph { #else // Linux/FreeBSD/OpenBSD - #if defined(__aarch64__) || defined(_M_ARM64) // 64-bit ARM + #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) // 64-bit ARM #include #ifdef OJPH_OS_LINUX diff --git a/src/lib/OpenEXR/ImfSimd.h b/src/lib/OpenEXR/ImfSimd.h index 56060c545a..98cc594d48 100644 --- a/src/lib/OpenEXR/ImfSimd.h +++ b/src/lib/OpenEXR/ImfSimd.h @@ -13,11 +13,11 @@ // // GCC and Visual Studio SSE2 compiler flags -#if defined __SSE2__ || (_MSC_VER && (_M_IX86 || _M_X64)) +#if defined __SSE2__ || (_MSC_VER && (_M_IX86 || _M_X64) && !defined(_M_ARM64EC)) # define IMF_HAVE_SSE2 1 #endif -#if defined __SSE4_1__ || (_MSC_VER && (_M_IX86 || _M_X64)) +#if defined __SSE4_1__ || (_MSC_VER && (_M_IX86 || _M_X64) && !defined(_M_ARM64EC)) # define IMF_HAVE_SSE4_1 1 #endif @@ -42,7 +42,7 @@ # define IMF_HAVE_F16C 1 #endif -#if defined(__ARM_NEON) +#if defined(__ARM_NEON) || defined(_M_ARM64) || defined(_M_ARM64EC) # define IMF_HAVE_NEON #endif @@ -50,6 +50,14 @@ # define IMF_HAVE_NEON_AARCH64 1 #endif +#if defined(_M_ARM64) || defined(_M_ARM64EC) +# define IMF_HAVE_NEON_WINDOWS_ARM64 1 +#endif + +#if defined(IMF_HAVE_NEON_AARCH64) || defined(IMF_HAVE_NEON_WINDOWS_ARM64) +# define IMF_HAVE_NEON_ARM64 1 +#endif + extern "C" { #ifdef IMF_HAVE_SSE2 # include diff --git a/src/lib/OpenEXR/ImfZip.cpp b/src/lib/OpenEXR/ImfZip.cpp index 7b209fb3d8..499beaddce 100644 --- a/src/lib/OpenEXR/ImfZip.cpp +++ b/src/lib/OpenEXR/ImfZip.cpp @@ -157,7 +157,7 @@ reconstruct_sse41 (char* buf, size_t outSize) #endif -#ifdef IMF_HAVE_NEON_AARCH64 +#ifdef IMF_HAVE_NEON_ARM64 void reconstruct_neon (char* buf, size_t outSize) @@ -258,7 +258,7 @@ interleave_sse2 (const char* source, size_t outSize, char* out) #endif -#ifdef IMF_HAVE_NEON_AARCH64 +#ifdef IMF_HAVE_NEON_ARM64 void interleave_neon (const char* source, size_t outSize, char* out) @@ -370,7 +370,7 @@ Zip::initializeFuncs () if (cpuId.sse2) { interleave = interleave_sse2; } #endif -#ifdef IMF_HAVE_NEON_AARCH64 +#ifdef IMF_HAVE_NEON_ARM64 reconstruct = reconstruct_neon; interleave = interleave_neon; #endif diff --git a/src/lib/OpenEXRCore/internal_cpuid.h b/src/lib/OpenEXRCore/internal_cpuid.h index 90ce6ae361..f48534e747 100644 --- a/src/lib/OpenEXRCore/internal_cpuid.h +++ b/src/lib/OpenEXRCore/internal_cpuid.h @@ -9,7 +9,7 @@ #include "OpenEXRConfigInternal.h" #if defined(i386) || defined(__i386__) || defined(__i386) || \ - defined(_M_X86) || defined(__x86_64__) || defined(_M_X64) + defined(_M_X86) || defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)) # define OPENEXR_ENABLE_X86_SIMD_CHECK 1 #else # define OPENEXR_ENABLE_X86_SIMD_CHECK 0 @@ -132,7 +132,7 @@ has_native_half (void) int sse2, avx, f16c; check_for_x86_simd (&f16c, &avx, &sse2); return avx && f16c; -#elif defined(__aarch64__) +#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) return 1; #else return 0; diff --git a/src/lib/OpenEXRCore/internal_dwa_simd.h b/src/lib/OpenEXRCore/internal_dwa_simd.h index d031a87a03..dfb15c4f1e 100644 --- a/src/lib/OpenEXRCore/internal_dwa_simd.h +++ b/src/lib/OpenEXRCore/internal_dwa_simd.h @@ -18,7 +18,7 @@ // aligned. Unaligned pointers may risk seg-faulting. // -#if defined __SSE2__ || (_MSC_VER && (_M_IX86 || _M_X64)) +#if defined __SSE2__ || (_MSC_VER && (_M_IX86 || _M_X64) && !defined(_M_ARM64EC)) # define IMF_HAVE_SSE2 1 # include # include @@ -35,6 +35,22 @@ # define IMF_HAVE_NEON_AARCH64 1 #endif +#if defined(_M_ARM64) || defined(_M_ARM64EC) +# define IMF_HAVE_NEON_WINDOWS_ARM64 1 +#endif + +#if defined(IMF_HAVE_NEON_AARCH64) || defined(IMF_HAVE_NEON_WINDOWS_ARM64) +# define IMF_HAVE_NEON_ARM64 1 +#endif + +#if defined(IMF_HAVE_NEON_ARM64) +# if defined(_MSC_VER) +# define NEON_RESTRICT __restrict +# else +# define NEON_RESTRICT __restrict__ +# endif +#endif + #include "internal_coding.h" #if defined(OPENEXR_IMF_HAVE_GCC_INLINE_ASM_AVX) && \ @@ -357,7 +373,7 @@ convertFloatToHalf64_scalar (uint16_t* dst, float* src) dst[i] = float_to_half (src[i]); } -#ifdef IMF_HAVE_NEON_AARCH64 +#ifdef IMF_HAVE_NEON_ARM64 void convertFloatToHalf64_neon (uint16_t* dst, float* src) @@ -786,36 +802,50 @@ fromHalfZigZag_f16c (uint16_t* src, float* dst) #endif /* defined IMF_HAVE_GCC_INLINEASM_X86_64 */ } -#ifdef IMF_HAVE_NEON_AARCH64 +#ifdef IMF_HAVE_NEON_ARM64 void -fromHalfZigZag_neon (uint16_t* __restrict__ src, float* __restrict__ dst) +fromHalfZigZag_neon (uint16_t* NEON_RESTRICT src, float* NEON_RESTRICT dst) { +# if defined(_MSC_VER) + static const uint8_t res_tbl_data[4][16] = { + {0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42}, + {3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53}, + {10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60}, + {21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63}}; + + uint8x16_t res_tbl[4]; + for (int i = 0; i < 4; i++) + { + res_tbl[i] = vld1q_u8 (res_tbl_data[i]); + } + +# else uint8x16_t res_tbl[4] = { {0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42}, {3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53}, {10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60}, {21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63}}; +# endif uint8x16x4_t vec_input_l, vec_input_h; - for (int i = 0; i < 4; i++) { uint8x16x2_t vec_in_u8 = vld2q_u8 ((uint8_t*) (src + 16 * i)); vec_input_l.val[i] = vec_in_u8.val[0]; vec_input_h.val[i] = vec_in_u8.val[1]; } - # pragma unroll(4) for (int i = 0; i < 4; i++) { - uint8x16_t res_vec_l, res_vec_h; - res_vec_l = vqtbl4q_u8 (vec_input_l, res_tbl[i]); - res_vec_h = vqtbl4q_u8 (vec_input_h, res_tbl[i]); + uint8x16_t res_vec_l = vqtbl4q_u8 (vec_input_l, res_tbl[i]); + uint8x16_t res_vec_h = vqtbl4q_u8 (vec_input_h, res_tbl[i]); + float16x8_t res_vec_l_f16 = vreinterpretq_f16_u8 (vzip1q_u8 (res_vec_l, res_vec_h)); float16x8_t res_vec_h_f16 = vreinterpretq_f16_u8 (vzip2q_u8 (res_vec_l, res_vec_h)); + vst1q_f32 (dst + i * 16, vcvt_f32_f16 (vget_low_f16 (res_vec_l_f16))); vst1q_f32 (dst + i * 16 + 4, vcvt_high_f32_f16 (res_vec_l_f16)); vst1q_f32 ( @@ -824,7 +854,7 @@ fromHalfZigZag_neon (uint16_t* __restrict__ src, float* __restrict__ dst) } } -#endif // IMF_HAVE_NEON_AARCH64 +#endif //IMF_HAVE_NEON_ARM64 // // Inverse 8x8 DCT, only inverting the DC. This assumes that @@ -2315,7 +2345,7 @@ initializeFuncs (void) if (done) return; done = 1; -#ifdef IMF_HAVE_NEON_AARCH64 +#ifdef IMF_HAVE_NEON_ARM64 { convertFloatToHalf64 = convertFloatToHalf64_neon; fromHalfZigZag = fromHalfZigZag_neon; diff --git a/src/lib/OpenEXRCore/internal_zip.c b/src/lib/OpenEXRCore/internal_zip.c index d94f3d89fa..f7aa6433f3 100644 --- a/src/lib/OpenEXRCore/internal_zip.c +++ b/src/lib/OpenEXRCore/internal_zip.c @@ -15,18 +15,33 @@ #include "openexr_compression.h" -#if defined __SSE2__ || (_MSC_VER >= 1300 && (_M_IX86 || _M_X64)) +#if defined __SSE2__ || (_MSC_VER >= 1300 && (_M_IX86 || _M_X64) && !defined(_M_ARM64EC)) # define IMF_HAVE_SSE2 1 # include # include #endif -#if defined __SSE4_1__ || (_MSC_VER >= 1300 && (_M_IX86 || _M_X64)) +#if defined __SSE4_1__ || (_MSC_VER >= 1300 && (_M_IX86 || _M_X64) && !defined(_M_ARM64EC)) # define IMF_HAVE_SSE4_1 1 # include #endif #if defined(__aarch64__) # define IMF_HAVE_NEON_AARCH64 1 -# include +#endif + +#if defined(_M_ARM64) || defined(_M_ARM64EC) +# define IMF_HAVE_NEON_WINDOWS_ARM64 1 +#endif + +#if defined(IMF_HAVE_NEON_AARCH64) || defined(IMF_HAVE_NEON_WINDOWS_ARM64) +# define IMF_HAVE_NEON_ARM64 1 +#endif + +#if defined(IMF_HAVE_NEON_ARM64) +# if defined(_MSC_VER) +# include +# else +# include +# endif #endif /**************************************/ @@ -77,7 +92,7 @@ reconstruct (uint8_t* buf, const uint64_t outSize) prev = d; } } -#elif defined(IMF_HAVE_NEON_AARCH64) +#elif defined(IMF_HAVE_NEON_ARM64) static void reconstruct (uint8_t* buf, const uint64_t outSize) { @@ -174,7 +189,7 @@ interleave (uint8_t* out, const uint8_t* const source, const uint64_t outSize) *(sOut++) = (i % 2 == 0) ? *(t1++) : *(t2++); } -#elif defined(IMF_HAVE_NEON_AARCH64) +#elif defined(IMF_HAVE_NEON_ARM64) static void interleave (uint8_t* out, const uint8_t* const source, const uint64_t outSize) { diff --git a/src/test/OpenEXRCoreTest/base_units.cpp b/src/test/OpenEXRCoreTest/base_units.cpp index d64ead785d..6658844069 100644 --- a/src/test/OpenEXRCoreTest/base_units.cpp +++ b/src/test/OpenEXRCoreTest/base_units.cpp @@ -396,7 +396,7 @@ testCPUIdent (const std::string& tempdir) EXRCORE_TEST (false); } -#if defined(__x86_64__) || defined(_M_X64) +#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)) if (has_native_half () != (hf16c && havx)) { std::cerr << "CPU Id test has native half mismatch" << std::endl;