From a1726e08f41620937f6aaa181a99bf9467684ad6 Mon Sep 17 00:00:00 2001
From: aliakseis <aliaksei.sanko@gmail.com>
Date: Wed, 23 Oct 2019 11:02:21 +0300
Subject: [PATCH] changes reapplied

---
 ffmpeg-3.3.3-experimental-patch/common.h      |   8 +-
 .../hevcdsp_template.c                        | 475 ++++++++++++------
 2 files changed, 322 insertions(+), 161 deletions(-)

diff --git a/ffmpeg-3.3.3-experimental-patch/common.h b/ffmpeg-3.3.3-experimental-patch/common.h
index 8142b31..f473b68 100644
--- a/ffmpeg-3.3.3-experimental-patch/common.h
+++ b/ffmpeg-3.3.3-experimental-patch/common.h
@@ -191,8 +191,8 @@ static av_always_inline av_const uint16_t av_clip_uint16_c(int a)
  */
 static av_always_inline av_const int16_t av_clip_int16_c(int a)
 {
-    if ((a+0x8000U) & ~0xFFFF) return (a>>31) ^ 0x7FFF;
-    else                      return a;
+    const int16_t noOverflowCandidate = a;
+    return (noOverflowCandidate == a) ? noOverflowCandidate : ((noOverflowCandidate < a) ? INT16_MAX : INT16_MIN);
 }
 
 /**
@@ -228,8 +228,8 @@ static av_always_inline av_const int av_clip_intp2_c(int a, int p)
  */
 static av_always_inline av_const unsigned av_clip_uintp2_c(int a, int p)
 {
-    if (a & ~((1<<p) - 1)) return -a >> 31 & ((1<<p) - 1);
-    else                   return  a;
+    const unsigned int bits = ((1 << p) - 1);
+    return (((unsigned int)a) <= bits) ? a : ((a < 0) ? 0 : bits);
 }
 
 /**
diff --git a/ffmpeg-3.3.3-experimental-patch/hevcdsp_template.c b/ffmpeg-3.3.3-experimental-patch/hevcdsp_template.c
index 25f1a81..54452c4 100644
--- a/ffmpeg-3.3.3-experimental-patch/hevcdsp_template.c
+++ b/ffmpeg-3.3.3-experimental-patch/hevcdsp_template.c
@@ -20,6 +20,10 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#ifdef _MSC_VER
+#include <emmintrin.h>
+#endif
+
 #include "get_bits.h"
 #include "hevcdec.h"
 
@@ -186,8 +190,8 @@ static void FUNC(transform_4x4_luma)(int16_t *coeffs)
         int i, j;                                                 \
         int e_8[4];                                               \
         int o_8[4] = { 0 };                                       \
-        for (i = 0; i < 4; i++)                                   \
-            for (j = 1; j < end; j += 2)                          \
+        for (j = 1; j < end; j += 2)                              \
+            for (i = 0; i < 4; i++)                               \
                 o_8[i] += transform[4 * j][i] * src[j * sstep];   \
         TR_4(e_8, src, 1, 2 * sstep, SET, 4);                     \
                                                                   \
@@ -197,13 +201,82 @@ static void FUNC(transform_4x4_luma)(int16_t *coeffs)
         }                                                         \
     } while (0)
 
+#ifdef _MSC_VER
+
+#define TR_16(dst, src, dstep, sstep, assign, end)                  \
+    do {                                                            \
+        int i, j;                                                   \
+        int e_16[8];                                                \
+        int o_16[8];                                                \
+        __m128i o_0, o_1;                                           \
+        o_0 =  o_1 = _mm_setzero_si128();                           \
+        for (j = 1; j < end; j += 2) {                              \
+            __m128i vhi, vlo;                                       \
+            const short multiplier = src[j * sstep];                \
+            __m128i coeffs = _mm_set1_epi16(multiplier);            \
+            __m128i buf = _mm_castpd_si128(_mm_load_sd((const double*) transform[2 * j])); \
+            buf = _mm_srai_epi16(_mm_unpacklo_epi8(buf, buf), 8);   \
+            vhi = _mm_mulhi_epi16(buf, coeffs);                     \
+            vlo = _mm_mullo_epi16(buf, coeffs);                     \
+            o_0 = _mm_add_epi32(o_0, _mm_unpacklo_epi16(vlo, vhi)); \
+            o_1 = _mm_add_epi32(o_1, _mm_unpackhi_epi16(vlo, vhi)); \
+        }                                                           \
+        ((__m128i*) o_16)[0] = o_0;                                 \
+        ((__m128i*) o_16)[1] = o_1;                                 \
+        TR_8(e_16, src, 1, 2 * sstep, SET, 8);                      \
+                                                                    \
+        for (i = 0; i < 8; i++) {                                   \
+            assign(dst[i * dstep], e_16[i] + o_16[i]);              \
+            assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);       \
+        }                                                           \
+    } while (0)
+
+
+#define TR_32(dst, src, dstep, sstep, assign, end)                  \
+    do {                                                            \
+        int i, j;                                                   \
+        int e_32[16];                                               \
+        int o_32[16];                                               \
+        __m128i o_0, o_1, o_2, o_3;                                 \
+        o_0 =  o_1 =  o_2 = o_3 = _mm_setzero_si128();              \
+        for (j = 1; j < end; j += 2) {                              \
+            __m128i vhi, vlo;                                       \
+            const short multiplier = src[j * sstep];                \
+            __m128i coeffs = _mm_set1_epi16(multiplier);            \
+            __m128i buf = _mm_castpd_si128(_mm_load_sd((const double*) transform[j])); \
+            buf = _mm_srai_epi16(_mm_unpacklo_epi8(buf, buf), 8);   \
+            vhi = _mm_mulhi_epi16(buf, coeffs);                     \
+            vlo = _mm_mullo_epi16(buf, coeffs);                     \
+            o_0 = _mm_add_epi32(o_0, _mm_unpacklo_epi16(vlo, vhi)); \
+            o_1 = _mm_add_epi32(o_1, _mm_unpackhi_epi16(vlo, vhi)); \
+            buf = _mm_castpd_si128(_mm_load_sd((const double*) &transform[j][8])); \
+            buf = _mm_srai_epi16(_mm_unpacklo_epi8(buf, buf), 8);   \
+            vhi = _mm_mulhi_epi16(buf, coeffs);                     \
+            vlo = _mm_mullo_epi16(buf, coeffs);                     \
+            o_2 = _mm_add_epi32(o_2, _mm_unpacklo_epi16(vlo, vhi)); \
+            o_3 = _mm_add_epi32(o_3, _mm_unpackhi_epi16(vlo, vhi)); \
+        }                                                           \
+        ((__m128i*) o_32)[0] = o_0;                                 \
+        ((__m128i*) o_32)[1] = o_1;                                 \
+        ((__m128i*) o_32)[2] = o_2;                                 \
+        ((__m128i*) o_32)[3] = o_3;                                 \
+        TR_16(e_32, src, 1, 2 * sstep, SET, end / 2);               \
+                                                                    \
+        for (i = 0; i < 16; i++) {                                  \
+            assign(dst[i * dstep], e_32[i] + o_32[i]);              \
+            assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);       \
+        }                                                           \
+    } while (0)
+
+#else
+
 #define TR_16(dst, src, dstep, sstep, assign, end)                \
     do {                                                          \
         int i, j;                                                 \
         int e_16[8];                                              \
         int o_16[8] = { 0 };                                      \
-        for (i = 0; i < 8; i++)                                   \
-            for (j = 1; j < end; j += 2)                          \
+        for (j = 1; j < end; j += 2)                              \
+            for (i = 0; i < 8; i++)                               \
                 o_16[i] += transform[2 * j][i] * src[j * sstep];  \
         TR_8(e_16, src, 1, 2 * sstep, SET, 8);                    \
                                                                   \
@@ -218,8 +291,8 @@ static void FUNC(transform_4x4_luma)(int16_t *coeffs)
         int i, j;                                                 \
         int e_32[16];                                             \
         int o_32[16] = { 0 };                                     \
-        for (i = 0; i < 16; i++)                                  \
-            for (j = 1; j < end; j += 2)                          \
+        for (j = 1; j < end; j += 2)                              \
+            for (i = 0; i < 16; i++)                              \
                 o_32[i] += transform[j][i] * src[j * sstep];      \
         TR_16(e_32, src, 1, 2 * sstep, SET, end / 2);             \
                                                                   \
@@ -229,6 +302,8 @@ static void FUNC(transform_4x4_luma)(int16_t *coeffs)
         }                                                         \
     } while (0)
 
+#endif
+
 #define IDCT_VAR4(H)                                              \
     int limit2 = FFMIN(col_limit + 4, H)
 #define IDCT_VAR8(H)                                              \
@@ -587,13 +662,14 @@ static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride,
     ptrdiff_t dststride = _dststride / sizeof(pixel);
 
     int shift = 14  + 1 - BIT_DEPTH;
-    int log2Wd = denom + shift - 1;
+    int log2Wd = denom + shift;
+
+    int bias = ((ox0 + ox1) * (1 << (BIT_DEPTH - 8)) + 1) << (log2Wd - 1);
+    wx1 <<= (14 - BIT_DEPTH);
 
-    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
-    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
     for (y = 0; y < height; y++) {
         for (x = 0; x < width; x++) {
-            dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
+            dst[x] = av_clip_pixel(( src[x] * wx1 + src2[x] * wx0 + bias) >> log2Wd);
         }
         src  += srcstride;
         dst  += dststride;
@@ -604,15 +680,40 @@ static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride,
 ////////////////////////////////////////////////////////////////////////////////
 //
 ////////////////////////////////////////////////////////////////////////////////
-#define QPEL_FILTER(src, stride)                                               \
-    (filter[0] * src[x - 3 * stride] +                                         \
-     filter[1] * src[x - 2 * stride] +                                         \
-     filter[2] * src[x -     stride] +                                         \
-     filter[3] * src[x             ] +                                         \
-     filter[4] * src[x +     stride] +                                         \
-     filter[5] * src[x + 2 * stride] +                                         \
-     filter[6] * src[x + 3 * stride] +                                         \
-     filter[7] * src[x + 4 * stride])
+
+#define QPEL_FILTER_0(src, stride) \
+    (-1 * src[x - 3 * stride] +    \
+    4 * src[x - 2 * stride] +      \
+    -10 * src[x - stride] +        \
+    58 * src[x] +                  \
+    17 * src[x + stride] +         \
+    -5 * src[x + 2 * stride] +     \
+    1 * src[x + 3 * stride])
+
+#define QPEL_FILTER_1(src, stride) \
+    (-1 * src[x - 3 * stride] +    \
+    4 * src[x - 2 * stride] +      \
+    -11 * src[x - stride] +        \
+    40 * src[x] +                  \
+    40 * src[x + stride] +         \
+    -11 * src[x + 2 * stride] +    \
+    4 * src[x + 3 * stride] +      \
+    -1 * src[x + 4 * stride])
+
+#define QPEL_FILTER_2(src, stride) \
+    (1 * src[x - 2 * stride] +     \
+    -5 * src[x - stride] +         \
+    17 * src[x] +                  \
+    58 * src[x + stride] +         \
+    -10 * src[x + 2 * stride] +    \
+    4 * src[x + 3 * stride] +      \
+    -1 * src[x + 4 * stride])
+
+#define QPEL_MACRO_DISPATCH(idx, macro)           \
+    if ((idx) < 1) { macro(QPEL_FILTER_0) }       \
+    else if ((idx) == 1) { macro(QPEL_FILTER_1) } \
+    else { macro(QPEL_FILTER_2) }
+
 
 static void FUNC(put_hevc_qpel_h)(int16_t *dst,
                                   uint8_t *_src, ptrdiff_t _srcstride,
@@ -621,13 +722,17 @@ static void FUNC(put_hevc_qpel_h)(int16_t *dst,
     int x, y;
     pixel        *src       = (pixel*)_src;
     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
-    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
-    for (y = 0; y < height; y++) {
-        for (x = 0; x < width; x++)
-            dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-        src += srcstride;
-        dst += MAX_PB_SIZE;
+
+#define QPEL_BODY(QPEL_FILTER)                                  \
+    for (y = 0; y < height; y++) {                              \
+        for (x = 0; x < width; x++)                             \
+            dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);    \
+        src += srcstride;                                       \
+        dst += MAX_PB_SIZE;                                     \
     }
+
+    QPEL_MACRO_DISPATCH(mx - 1, QPEL_BODY)
+#undef QPEL_BODY
 }
 
 static void FUNC(put_hevc_qpel_v)(int16_t *dst,
@@ -637,13 +742,17 @@ static void FUNC(put_hevc_qpel_v)(int16_t *dst,
     int x, y;
     pixel        *src       = (pixel*)_src;
     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
-    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
-    for (y = 0; y < height; y++)  {
-        for (x = 0; x < width; x++)
-            dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
-        src += srcstride;
-        dst += MAX_PB_SIZE;
+
+#define QPEL_BODY(QPEL_FILTER)                                          \
+    for (y = 0; y < height; y++)  {                                     \
+        for (x = 0; x < width; x++)                                     \
+            dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);    \
+        src += srcstride;                                               \
+        dst += MAX_PB_SIZE;                                             \
     }
+
+    QPEL_MACRO_DISPATCH(my - 1, QPEL_BODY)
+#undef QPEL_BODY
 }
 
 static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
@@ -653,29 +762,36 @@ static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
                                    intptr_t my, int width)
 {
     int x, y;
-    const int8_t *filter;
     pixel *src = (pixel*)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
     int16_t *tmp = tmp_array;
 
     src   -= QPEL_EXTRA_BEFORE * srcstride;
-    filter = ff_hevc_qpel_filters[mx - 1];
-    for (y = 0; y < height + QPEL_EXTRA; y++) {
-        for (x = 0; x < width; x++)
-            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-        src += srcstride;
-        tmp += MAX_PB_SIZE;
+
+#define QPEL_BODY(QPEL_FILTER)                                  \
+    for (y = 0; y < height + QPEL_EXTRA; y++) {                 \
+        for (x = 0; x < width; x++)                             \
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);    \
+        src += srcstride;                                       \
+        tmp += MAX_PB_SIZE;                                     \
     }
 
+    QPEL_MACRO_DISPATCH(mx - 1, QPEL_BODY)
+#undef QPEL_BODY
+
     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-    filter = ff_hevc_qpel_filters[my - 1];
-    for (y = 0; y < height; y++) {
-        for (x = 0; x < width; x++)
-            dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
-        tmp += MAX_PB_SIZE;
-        dst += MAX_PB_SIZE;
+
+#define QPEL_BODY(QPEL_FILTER)                                  \
+    for (y = 0; y < height; y++) {                              \
+        for (x = 0; x < width; x++)                             \
+            dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;        \
+        tmp += MAX_PB_SIZE;                                     \
+        dst += MAX_PB_SIZE;                                     \
     }
+
+    QPEL_MACRO_DISPATCH(my - 1, QPEL_BODY)
+#undef QPEL_BODY
 }
 
 static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst,  ptrdiff_t _dststride,
@@ -687,7 +803,6 @@ static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst,  ptrdiff_t _dststride,
     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
     pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
-    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
     int shift = 14 - BIT_DEPTH;
 
 #if BIT_DEPTH < 14
@@ -696,12 +811,16 @@ static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst,  ptrdiff_t _dststride,
     int offset = 0;
 #endif
 
-    for (y = 0; y < height; y++) {
-        for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
-        src += srcstride;
-        dst += dststride;
+#define QPEL_BODY(QPEL_FILTER)                                                                      \
+    for (y = 0; y < height; y++) {                                                                  \
+        for (x = 0; x < width; x++)                                                                 \
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);   \
+        src += srcstride;                                                                           \
+        dst += dststride;                                                                           \
     }
+
+    QPEL_MACRO_DISPATCH(mx - 1, QPEL_BODY)
+#undef QPEL_BODY
 }
 
 static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
@@ -714,8 +833,6 @@ static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_
     pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
 
-    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
-
     int shift = 14  + 1 - BIT_DEPTH;
 #if BIT_DEPTH < 14
     int offset = 1 << (shift - 1);
@@ -723,13 +840,17 @@ static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_
     int offset = 0;
 #endif
 
-    for (y = 0; y < height; y++) {
-        for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
-        src  += srcstride;
-        dst  += dststride;
-        src2 += MAX_PB_SIZE;
+#define QPEL_BODY(QPEL_FILTER)                                                                              \
+    for (y = 0; y < height; y++) {                                                                          \
+        for (x = 0; x < width; x++)                                                                         \
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); \
+        src  += srcstride;                                                                                  \
+        dst  += dststride;                                                                                  \
+        src2 += MAX_PB_SIZE;                                                                                \
     }
+
+    QPEL_MACRO_DISPATCH(mx - 1, QPEL_BODY)
+#undef QPEL_BODY
 }
 
 static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst,  ptrdiff_t _dststride,
@@ -741,7 +862,6 @@ static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst,  ptrdiff_t _dststride,
     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
     pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
-    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
     int shift = 14 - BIT_DEPTH;
 
 #if BIT_DEPTH < 14
@@ -750,12 +870,16 @@ static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst,  ptrdiff_t _dststride,
     int offset = 0;
 #endif
 
-    for (y = 0; y < height; y++) {
-        for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
-        src += srcstride;
-        dst += dststride;
+#define QPEL_BODY(QPEL_FILTER)                                                                              \
+    for (y = 0; y < height; y++) {                                                                          \
+        for (x = 0; x < width; x++)                                                                         \
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);   \
+        src += srcstride;                                                                                   \
+        dst += dststride;                                                                                   \
     }
+
+    QPEL_MACRO_DISPATCH(my - 1, QPEL_BODY)
+#undef QPEL_BODY
 }
 
 
@@ -769,8 +893,6 @@ static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_
     pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
 
-    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
-
     int shift = 14 + 1 - BIT_DEPTH;
 #if BIT_DEPTH < 14
     int offset = 1 << (shift - 1);
@@ -778,13 +900,17 @@ static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_
     int offset = 0;
 #endif
 
-    for (y = 0; y < height; y++) {
-        for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
-        src  += srcstride;
-        dst  += dststride;
-        src2 += MAX_PB_SIZE;
+#define QPEL_BODY(QPEL_FILTER)                                                                                      \
+    for (y = 0; y < height; y++) {                                                                                  \
+        for (x = 0; x < width; x++)                                                                                 \
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); \
+        src  += srcstride;                                                                                          \
+        dst  += dststride;                                                                                          \
+        src2 += MAX_PB_SIZE;                                                                                        \
     }
+
+    QPEL_MACRO_DISPATCH(my - 1, QPEL_BODY)
+#undef QPEL_BODY
 }
 
 static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
@@ -792,7 +918,6 @@ static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
                                        int height, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
-    const int8_t *filter;
     pixel *src = (pixel*)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
     pixel *dst          = (pixel *)_dst;
@@ -808,23 +933,31 @@ static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
 #endif
 
     src   -= QPEL_EXTRA_BEFORE * srcstride;
-    filter = ff_hevc_qpel_filters[mx - 1];
-    for (y = 0; y < height + QPEL_EXTRA; y++) {
-        for (x = 0; x < width; x++)
-            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-        src += srcstride;
-        tmp += MAX_PB_SIZE;
+
+#define QPEL_BODY(QPEL_FILTER)                                  \
+    for (y = 0; y < height + QPEL_EXTRA; y++) {                 \
+        for (x = 0; x < width; x++)                             \
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);    \
+        src += srcstride;                                       \
+        tmp += MAX_PB_SIZE;                                     \
     }
 
+    QPEL_MACRO_DISPATCH(mx - 1, QPEL_BODY)
+#undef QPEL_BODY
+
+
     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-    filter = ff_hevc_qpel_filters[my - 1];
 
-    for (y = 0; y < height; y++) {
-        for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
-        tmp += MAX_PB_SIZE;
-        dst += dststride;
+#define QPEL_BODY(QPEL_FILTER)                                                                  \
+    for (y = 0; y < height; y++) {                                                              \
+        for (x = 0; x < width; x++)                                                             \
+            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);   \
+        tmp += MAX_PB_SIZE;                                                                     \
+        dst += dststride;                                                                       \
     }
+
+    QPEL_MACRO_DISPATCH(my - 1, QPEL_BODY)
+#undef QPEL_BODY
 }
 
 static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
@@ -832,7 +965,6 @@ static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8
                                       int height, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
-    const int8_t *filter;
     pixel *src = (pixel*)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
     pixel *dst          = (pixel *)_dst;
@@ -847,24 +979,31 @@ static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8
 #endif
 
     src   -= QPEL_EXTRA_BEFORE * srcstride;
-    filter = ff_hevc_qpel_filters[mx - 1];
-    for (y = 0; y < height + QPEL_EXTRA; y++) {
-        for (x = 0; x < width; x++)
-            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-        src += srcstride;
-        tmp += MAX_PB_SIZE;
+
+#define QPEL_BODY(QPEL_FILTER)                                  \
+    for (y = 0; y < height + QPEL_EXTRA; y++) {                 \
+        for (x = 0; x < width; x++)                             \
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);    \
+        src += srcstride;                                       \
+        tmp += MAX_PB_SIZE;                                     \
     }
 
+    QPEL_MACRO_DISPATCH(mx - 1, QPEL_BODY)
+#undef QPEL_BODY
+
     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-    filter = ff_hevc_qpel_filters[my - 1];
 
-    for (y = 0; y < height; y++) {
-        for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
-        tmp  += MAX_PB_SIZE;
-        dst  += dststride;
-        src2 += MAX_PB_SIZE;
+#define QPEL_BODY(QPEL_FILTER)                                                                          \
+    for (y = 0; y < height; y++) {                                                                      \
+        for (x = 0; x < width; x++)                                                                     \
+            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift); \
+        tmp  += MAX_PB_SIZE;                                                                            \
+        dst  += dststride;                                                                              \
+        src2 += MAX_PB_SIZE;                                                                            \
     }
+
+    QPEL_MACRO_DISPATCH(my - 1, QPEL_BODY)
+#undef QPEL_BODY
 }
 
 static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst,  ptrdiff_t _dststride,
@@ -877,7 +1016,6 @@ static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst,  ptrdiff_t _dststride,
     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
     pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
-    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
     int shift = denom + 14 - BIT_DEPTH;
 #if BIT_DEPTH < 14
     int offset = 1 << (shift - 1);
@@ -886,12 +1024,16 @@ static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst,  ptrdiff_t _dststride,
 #endif
 
     ox = ox * (1 << (BIT_DEPTH - 8));
-    for (y = 0; y < height; y++) {
-        for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
-        src += srcstride;
-        dst += dststride;
+#define QPEL_BODY(QPEL_FILTER)                                                                                  \
+    for (y = 0; y < height; y++) {                                                                              \
+        for (x = 0; x < width; x++)                                                                             \
+            dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);   \
+        src += srcstride;                                                                                       \
+        dst += dststride;                                                                                       \
     }
+
+    QPEL_MACRO_DISPATCH(mx - 1, QPEL_BODY)
+#undef QPEL_BODY
 }
 
 static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
@@ -905,21 +1047,23 @@ static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint
     pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
 
-    const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
-
     int shift = 14  + 1 - BIT_DEPTH;
     int log2Wd = denom + shift - 1;
 
     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
-    for (y = 0; y < height; y++) {
-        for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
-                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
-        src  += srcstride;
-        dst  += dststride;
-        src2 += MAX_PB_SIZE;
+#define QPEL_BODY(QPEL_FILTER)                                                                          \
+    for (y = 0; y < height; y++) {                                                                      \
+        for (x = 0; x < width; x++)                                                                     \
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +    \
+                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));                      \
+        src  += srcstride;                                                                              \
+        dst  += dststride;                                                                              \
+        src2 += MAX_PB_SIZE;                                                                            \
     }
+
+    QPEL_MACRO_DISPATCH(mx - 1, QPEL_BODY)
+#undef QPEL_BODY
 }
 
 static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst,  ptrdiff_t _dststride,
@@ -932,7 +1076,6 @@ static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst,  ptrdiff_t _dststride,
     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
     pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
-    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
     int shift = denom + 14 - BIT_DEPTH;
 #if BIT_DEPTH < 14
     int offset = 1 << (shift - 1);
@@ -941,12 +1084,16 @@ static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst,  ptrdiff_t _dststride,
 #endif
 
     ox = ox * (1 << (BIT_DEPTH - 8));
-    for (y = 0; y < height; y++) {
-        for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
-        src += srcstride;
-        dst += dststride;
+#define QPEL_BODY(QPEL_FILTER)                                                                                          \
+    for (y = 0; y < height; y++) {                                                                                      \
+        for (x = 0; x < width; x++)                                                                                     \
+            dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);   \
+        src += srcstride;                                                                                               \
+        dst += dststride;                                                                                               \
     }
+
+    QPEL_MACRO_DISPATCH(my - 1, QPEL_BODY)
+#undef QPEL_BODY
 }
 
 static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
@@ -960,21 +1107,23 @@ static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint
     pixel *dst          = (pixel *)_dst;
     ptrdiff_t dststride = _dststride / sizeof(pixel);
 
-    const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
-
     int shift = 14 + 1 - BIT_DEPTH;
     int log2Wd = denom + shift - 1;
 
     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
-    for (y = 0; y < height; y++) {
-        for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
-                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
-        src  += srcstride;
-        dst  += dststride;
-        src2 += MAX_PB_SIZE;
+#define QPEL_BODY(QPEL_FILTER)                                                                                  \
+    for (y = 0; y < height; y++) {                                                                              \
+        for (x = 0; x < width; x++)                                                                             \
+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +    \
+                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));                              \
+        src  += srcstride;                                                                                      \
+        dst  += dststride;                                                                                      \
+        src2 += MAX_PB_SIZE;                                                                                    \
     }
+
+    QPEL_MACRO_DISPATCH(my - 1, QPEL_BODY)
+#undef QPEL_BODY
 }
 
 static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
@@ -983,7 +1132,6 @@ static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
                                          intptr_t mx, intptr_t my, int width)
 {
     int x, y;
-    const int8_t *filter;
     pixel *src = (pixel*)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
     pixel *dst          = (pixel *)_dst;
@@ -998,24 +1146,32 @@ static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
 #endif
 
     src   -= QPEL_EXTRA_BEFORE * srcstride;
-    filter = ff_hevc_qpel_filters[mx - 1];
-    for (y = 0; y < height + QPEL_EXTRA; y++) {
-        for (x = 0; x < width; x++)
-            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-        src += srcstride;
-        tmp += MAX_PB_SIZE;
+
+#define QPEL_BODY(QPEL_FILTER)                                  \
+    for (y = 0; y < height + QPEL_EXTRA; y++) {                 \
+        for (x = 0; x < width; x++)                             \
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);    \
+        src += srcstride;                                       \
+        tmp += MAX_PB_SIZE;                                     \
     }
 
+    QPEL_MACRO_DISPATCH(mx - 1, QPEL_BODY)
+#undef QPEL_BODY
+
     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-    filter = ff_hevc_qpel_filters[my - 1];
 
     ox = ox * (1 << (BIT_DEPTH - 8));
-    for (y = 0; y < height; y++) {
-        for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
-        tmp += MAX_PB_SIZE;
-        dst += dststride;
+
+#define QPEL_BODY(QPEL_FILTER)                                                                              \
+    for (y = 0; y < height; y++) {                                                                          \
+        for (x = 0; x < width; x++)                                                                         \
+            dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);   \
+        tmp += MAX_PB_SIZE;                                                                                 \
+        dst += dststride;                                                                                   \
     }
+
+    QPEL_MACRO_DISPATCH(my - 1, QPEL_BODY)
+#undef QPEL_BODY
 }
 
 static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
@@ -1024,7 +1180,6 @@ static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uin
                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 {
     int x, y;
-    const int8_t *filter;
     pixel *src = (pixel*)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
     pixel *dst          = (pixel *)_dst;
@@ -1035,27 +1190,33 @@ static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uin
     int log2Wd = denom + shift - 1;
 
     src   -= QPEL_EXTRA_BEFORE * srcstride;
-    filter = ff_hevc_qpel_filters[mx - 1];
-    for (y = 0; y < height + QPEL_EXTRA; y++) {
-        for (x = 0; x < width; x++)
-            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-        src += srcstride;
-        tmp += MAX_PB_SIZE;
+#define QPEL_BODY(QPEL_FILTER)                                  \
+    for (y = 0; y < height + QPEL_EXTRA; y++) {                 \
+        for (x = 0; x < width; x++)                             \
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);    \
+        src += srcstride;                                       \
+        tmp += MAX_PB_SIZE;                                     \
     }
 
+    QPEL_MACRO_DISPATCH(mx - 1, QPEL_BODY)
+#undef QPEL_BODY
+
     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-    filter = ff_hevc_qpel_filters[my - 1];
 
     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
-    for (y = 0; y < height; y++) {
-        for (x = 0; x < width; x++)
-            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
-                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
-        tmp  += MAX_PB_SIZE;
-        dst  += dststride;
-        src2 += MAX_PB_SIZE;
+#define QPEL_BODY(QPEL_FILTER)                                                                      \
+    for (y = 0; y < height; y++) {                                                                  \
+        for (x = 0; x < width; x++)                                                                 \
+            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +    \
+                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));                  \
+        tmp  += MAX_PB_SIZE;                                                                        \
+        dst  += dststride;                                                                          \
+        src2 += MAX_PB_SIZE;                                                                        \
     }
+
+    QPEL_MACRO_DISPATCH(my - 1, QPEL_BODY)
+#undef QPEL_BODY
 }
 
 ////////////////////////////////////////////////////////////////////////////////