Skip to content

Commit 436fcd2

Browse files
Enable fast Huffman & Huffman zig-zag transform for Arm Neon (#1323)
* Enable fast Huffman decoding on macOS Enable fast Huffman decoding for macOS (x86 and Apple silicon) Signed-off-by: Developer Ecosystem Engineering <[email protected]> * Implement Huffman zig-zag transform Implements Huffman zig-zag transform and 32 to 16 bit floating point Signed-off-by: Developer Ecosystem Engineering <[email protected]> Signed-off-by: Developer Ecosystem Engineering <[email protected]>
1 parent 71bffa3 commit 436fcd2

File tree

5 files changed

+184
-34
lines changed

5 files changed

+184
-34
lines changed

src/lib/OpenEXR/ImfDwaCompressor.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2786,6 +2786,13 @@ DwaCompressor::initializeFuncs ()
27862786
fromHalfZigZag = fromHalfZigZag_f16c;
27872787
}
27882788

2789+
#ifdef IMF_HAVE_NEON
2790+
{
2791+
convertFloatToHalf64 = convertFloatToHalf64_neon;
2792+
fromHalfZigZag = fromHalfZigZag_neon;
2793+
}
2794+
#endif
2795+
27892796
//
27902797
// Setup inverse DCT implementations
27912798
//

src/lib/OpenEXR/ImfDwaCompressorSimd.h

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,18 @@ convertFloatToHalf64_scalar (unsigned short* dst, float* src)
395395
dst[i] = ((half) src[i]).bits ();
396396
}
397397

398+
#ifdef IMF_HAVE_NEON
399+
400+
void
401+
convertFloatToHalf64_neon (unsigned short* dst, float* src)
402+
{
403+
for (int i = 0; i < 64; i += 8) {
404+
float32x4x2_t vec_fp32 = vld1q_f32_x2 (src + i);
405+
vst1q_u16 (dst + i, vcombine_u16(vreinterpret_u16_f16(vcvt_f16_f32(vec_fp32.val[0])),vreinterpret_u16_f16(vcvt_f16_f32(vec_fp32.val[1]))));
406+
}
407+
}
408+
#endif
409+
398410
//
399411
// F16C conversion - Assumes aligned src and dst
400412
//
@@ -809,6 +821,43 @@ fromHalfZigZag_f16c (unsigned short* src, float* dst)
809821
#endif /* defined IMF_HAVE_GCC_INLINEASM_X86_64 */
810822
}
811823

824+
#ifdef IMF_HAVE_NEON
825+
826+
827+
void
828+
fromHalfZigZag_neon(unsigned short* __restrict__ src, float* __restrict__ dst)
829+
{
830+
uint8x16_t res_tbl[4] = {
831+
{0, 1, 5, 6, 14, 15, 27, 28, 2 , 4 , 7 ,13, 16, 26, 29, 42},
832+
{3 , 8 ,12 ,17, 25, 30, 41, 43,9 ,11 ,18 ,24, 31, 40, 44, 53},
833+
{10 ,19 ,23 ,32, 39, 45, 52, 54,20 ,22 ,33 ,38, 46, 51, 55, 60},
834+
{21 ,34 ,37 ,47, 50, 56, 59, 61,35 ,36 ,48 ,49, 57, 58, 62, 63}};
835+
836+
uint8x16x4_t vec_input_l,vec_input_h;
837+
838+
for (int i = 0; i < 4; i++)
839+
{
840+
uint8x16x2_t vec_in_u8 = vld2q_u8 ((unsigned char*)(src + 16 * i));
841+
vec_input_l.val[i] = vec_in_u8.val[0];
842+
vec_input_h.val[i] = vec_in_u8.val[1];
843+
}
844+
845+
#pragma unroll(4)
846+
for (int i = 0; i < 4 ; i++) {
847+
uint8x16_t res_vec_l,res_vec_h;
848+
res_vec_l = vqtbl4q_u8(vec_input_l,res_tbl[i]);
849+
res_vec_h = vqtbl4q_u8(vec_input_h,res_tbl[i]);
850+
float16x8_t res_vec_l_f16 = vreinterpretq_f16_u8(vzip1q_u8(res_vec_l,res_vec_h));
851+
float16x8_t res_vec_h_f16 = vreinterpretq_f16_u8(vzip2q_u8(res_vec_l,res_vec_h));
852+
vst1q_f32(dst + i*16, vcvt_f32_f16(vget_low_f16(res_vec_l_f16)));
853+
vst1q_f32(dst + i*16+4, vcvt_high_f32_f16(res_vec_l_f16));
854+
vst1q_f32(dst + i*16+8, vcvt_f32_f16(vget_low_f16(res_vec_h_f16)));
855+
vst1q_f32(dst + i*16+12, vcvt_high_f32_f16(res_vec_h_f16));
856+
}
857+
}
858+
859+
#endif // IMF_HAVE_NEON
860+
812861
//
813862
// Inverse 8x8 DCT, only inverting the DC. This assumes that
814863
// all AC frequencies are 0.

src/lib/OpenEXR/ImfFastHuf.cpp

Lines changed: 49 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,40 @@
1111
#include <string.h>
1212
#include <vector>
1313

14+
// Static enabling/disabling the fast huffman decode
15+
16+
17+
#if defined(__clang__)
18+
//
19+
// Enabled for clang on Apple platforms (tested):
20+
//
21+
22+
# if defined(__APPLE__)
23+
# define OPENEXR_IMF_ENABLE_FAST_HUF_DECODER
24+
# endif
25+
26+
#elif defined(__INTEL_COMPILER) || defined(__GNUC__)
27+
//
28+
// Enabled for ICC, GCC:
29+
// __i386__ -> x86
30+
// __x86_64__ -> 64-bit x86
31+
// __e2k__ -> e2k (MCST Elbrus 2000)
32+
33+
# if defined(__i386__) || defined(__x86_64__) || defined(__e2k__)
34+
# define OPENEXR_IMF_ENABLE_FAST_HUF_DECODER
35+
# endif
36+
37+
#elif defined(_MSC_VER)
38+
//
39+
// Enabled for Visual Studio:
40+
// _M_IX86 -> x86
41+
// _M_X64 -> 64bit x86
42+
43+
# if defined(_M_IX86) || defined(_M_X64)
44+
# define OPENEXR_IMF_ENABLE_FAST_HUF_DECODER
45+
# endif
46+
#endif
47+
1448
OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_ENTER
1549

1650
//
@@ -274,50 +308,31 @@ FastHufDecoder::~FastHufDecoder ()
274308
((uint64_t) (c)[4] << 24) | ((uint64_t) (c)[5] << 16) | \
275309
((uint64_t) (c)[6] << 8) | ((uint64_t) (c)[7])
276310

277-
#ifdef __INTEL_COMPILER // ICC built-in swap for LE hosts
278-
# if defined(__i386__) || defined(__x86_64__)
279-
# undef READ64
280-
# define READ64(c) _bswap64 (*(const uint64_t*) (c))
311+
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
312+
# ifdef __INTEL_COMPILER // ICC built-in swap for LE hosts
313+
# if defined(__i386__) || defined(__x86_64__)
314+
# undef READ64
315+
# define READ64(c) _bswap64 (*(const uint64_t*) (c))
316+
# endif
317+
318+
# else
319+
# ifdef __has_builtin
320+
# if __has_builtin(__builtin_bswap64)
321+
# undef READ64
322+
# define READ64(c) __builtin_bswap64 (*(const uint64_t*) (c))
323+
# endif
324+
# endif
281325
# endif
282326
#endif
283327

284328
bool
285329
FastHufDecoder::enabled ()
286330
{
287-
#if defined(__INTEL_COMPILER) || defined(__GNUC__)
288-
289-
//
290-
// Enabled for ICC, GCC:
291-
// __i386__ -> x86
292-
// __x86_64__ -> 64-bit x86
293-
// __e2k__ -> e2k (MCST Elbrus 2000)
294-
295-
# if defined(__i386__) || defined(__x86_64__) || defined(__e2k__)
296-
return true;
297-
# else
298-
return false;
299-
# endif
300-
301-
#elif defined(_MSC_VER)
302-
303-
//
304-
// Enabled for Visual Studio:
305-
// _M_IX86 -> x86
306-
// _M_X64 -> 64bit x86
307-
308-
# if defined(_M_IX86) || defined(_M_X64)
331+
# ifdef OPENEXR_IMF_ENABLE_FAST_HUF_DECODER
309332
return true;
310333
# else
311334
return false;
312335
# endif
313-
314-
#else
315-
316-
//
317-
// Unknown compiler - Be safe and disable.
318-
//
319-
return false;
320-
#endif
321336
}
322337

323338
//

src/lib/OpenEXR/ImfSimd.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,10 @@
4242
# define IMF_HAVE_F16C 1
4343
#endif
4444

45+
#if defined(__ARM_NEON)
46+
# define IMF_HAVE_NEON
47+
#endif
48+
4549
extern "C" {
4650
#ifdef IMF_HAVE_SSE2
4751
# include <emmintrin.h>
@@ -51,6 +55,11 @@ extern "C" {
5155
#ifdef IMF_HAVE_SSE4_1
5256
# include <smmintrin.h>
5357
#endif
58+
59+
#ifdef IMF_HAVE_NEON
60+
# include <arm_neon.h>
61+
#endif
62+
5463
}
5564

5665
#endif

src/test/OpenEXRTest/testDwaCompressorSimd.cpp

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,37 @@ testFloatToHalf ()
407407
}
408408
}
409409
}
410+
411+
#ifdef IMF_HAVE_NEON
412+
{
413+
cout << " convertFloatToHalf64_neon()" << endl;
414+
for (int iter = 0; iter < numIter; ++iter)
415+
{
416+
for (int i = 0; i < 64; ++i)
417+
{
418+
if (i < 32)
419+
{
420+
src._buffer[i] = (float) 140000 * (rand48.nextf () - .5);
421+
}
422+
else { src._buffer[i] = (float) (rand48.nextf () - .5); }
423+
dst._buffer[i] = 0;
424+
}
425+
426+
convertFloatToHalf64_neon (dst._buffer, src._buffer);
427+
428+
for (int i = 0; i < 64; ++i)
429+
{
430+
half value = (half) src._buffer[i];
431+
if (value.bits () != dst._buffer[i])
432+
{
433+
cout << src._buffer[i] << " -> " << dst._buffer[i]
434+
<< " expected " << value.bits () << endl;
435+
assert (false);
436+
}
437+
}
438+
}
439+
}
440+
#endif // IMF_HAVE_NEON
410441
}
411442

412443
//
@@ -488,6 +519,45 @@ testFromHalfZigZag ()
488519
}
489520
} // iter
490521
} // f16c
522+
523+
#ifdef IMF_HAVE_NEON
524+
{
525+
const int numIter = 1000000;
526+
Rand48 rand48 (0);
527+
half h;
528+
SimdAlignedBuffer64f dstF16c;
529+
530+
cout << " fromHalfZigZag_neon()" << endl;
531+
532+
for (int iter = 0; iter < numIter; ++iter)
533+
{
534+
for (int i = 0; i < 64; ++i)
535+
{
536+
if (i < 32) { h = (half) (140000. * (rand48.nextf () - .5)); }
537+
else
538+
{
539+
h = (half) (rand48.nextf () - .5);
540+
}
541+
src._buffer[i] = h.bits ();
542+
}
543+
544+
fromHalfZigZag_scalar (src._buffer, dst._buffer);
545+
fromHalfZigZag_neon (src._buffer, dstF16c._buffer);
546+
547+
for (int i = 0; i < 64; ++i)
548+
{
549+
if (fabsf (dst._buffer[i] - dstF16c._buffer[i]) > 1e-5)
550+
{
551+
cout << "At index " << i << ": ";
552+
cout << "expecting " << dst._buffer[i] << "; got "
553+
<< dstF16c._buffer[i] << endl;
554+
assert (false);
555+
}
556+
}
557+
} // iter
558+
} // neon
559+
560+
#endif // IMF_HAVE_NEON
491561
}
492562

493563
} // namespace

0 commit comments

Comments
 (0)