[x265] [PATCH 03/10] AArch64: Optimise HBD interp_horiz_ps_neon

Gerda Zsejke More gerdazsejke.more at arm.com
Fri Feb 21 16:06:27 UTC 2025


Optimise the HBD 4-tap and 8-tap Neon implementations of
interp_horiz_ps_neon and extend these functions to support all CHROMA
and LUMA block sizes respectively.

The new 4-tap filter implementation is up to 34% faster when
coeffIdx==4 and up to 11% faster for the other filter values compared
to the existing Neon implementation.

The new 8-tap filter implementation is up to 34% faster when
coeffIdx==1, 48% faster when it is 2, and 40% faster when it is 3,
compared to the existing Neon implementation.
---
 source/common/aarch64/filter-prim.cpp | 497 +++++++++++++++++++++-----
 1 file changed, 413 insertions(+), 84 deletions(-)

diff --git a/source/common/aarch64/filter-prim.cpp b/source/common/aarch64/filter-prim.cpp
index f691b2c36..0ed3fb78c 100644
--- a/source/common/aarch64/filter-prim.cpp
+++ b/source/common/aarch64/filter-prim.cpp
@@ -32,6 +32,11 @@
 #include <arm_neon.h>
 
 namespace {
+
+#if HIGH_BIT_DEPTH
+#define SHIFT_INTERP_PS (IF_FILTER_PREC - (IF_INTERNAL_PREC - X265_DEPTH))
+#endif
+
 void inline filter4_s16x8(int coeffIdx, const int16x8_t *s, const int16x4_t f,
                           const int32x4_t c, int32x4_t &d0, int32x4_t &d1)
 {
@@ -2707,121 +2712,378 @@ void inline interp8_horiz_pp_neon(const pixel *src, intptr_t srcStride,
     }
 }
 
-#endif // !HIGH_BIT_DEPTH
+template<int coeff4>
+void inline filter4_ps_u16x4(const uint16x4_t *s, const uint16x4_t f,
+                             const uint32x4_t offset, int16x4_t &d)
+{
+    if (coeff4)
+    {
+        // { -4, 36, 36, -4 }
+        // Filter values are divisible by 4, factor that out in order to only
+        // need a multiplication by 9 and a subtraction (which is a
+        // multiplication by -1).
+        uint16x4_t sum03 = vadd_u16(s[0], s[3]);
+        uint16x4_t sum12 = vadd_u16(s[1], s[2]);
+
+        int32x4_t sum = vreinterpretq_s32_u32(vmlal_n_u16(offset, sum12, 9));
+        sum = vsubw_s16(sum, vreinterpret_s16_u16(sum03));
+
+        // We divided filter values by 4 so -2 from right shift.
+        d = vshrn_n_s32(sum, SHIFT_INTERP_PS - 2);
+    }
+    else
+    {
+        uint32x4_t sum = vmlsl_lane_u16(offset, s[0], f, 0);
+        sum = vmlal_lane_u16(sum, s[1], f, 1);
+        sum = vmlal_lane_u16(sum, s[2], f, 2);
+        sum = vmlsl_lane_u16(sum, s[3], f, 3);
+
+        d = vshrn_n_s32(vreinterpretq_s32_u32(sum), SHIFT_INTERP_PS);
+    }
 }
 
-namespace X265_NS
+template<bool coeff4>
+void inline filter4_ps_u16x8(const uint16x8_t *s, const uint16x4_t f,
+                             const uint32x4_t offset, int16x8_t &d)
 {
+    if (coeff4)
+    {
+        // { -4, 36, 36, -4 }
+        // Filter values are divisible by 4, factor that out in order to only
+        // need a multiplication by 9 and a subtraction (which is a
+        // multiplication by -1).
+        uint16x8_t sum03 = vaddq_u16(s[0], s[3]);
+        uint16x8_t sum12 = vaddq_u16(s[1], s[2]);
 
-#if HIGH_BIT_DEPTH
-#define SHIFT_INTERP_PS (IF_FILTER_PREC - (IF_INTERNAL_PREC - X265_DEPTH))
-#endif
+        int32x4_t sum_lo = vreinterpretq_s32_u32(
+            vmlal_n_u16(offset, vget_low_u16(sum12), 9));
+        int32x4_t sum_hi = vreinterpretq_s32_u32(
+            vmlal_n_u16(offset, vget_high_u16(sum12), 9));
+        sum_lo = vsubw_s16(sum_lo, vreinterpret_s16_u16(vget_low_u16(sum03)));
+        sum_hi = vsubw_s16(sum_hi, vreinterpret_s16_u16(vget_high_u16(sum03)));
 
-template<int N, int width, int height>
-void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst,
-                          intptr_t dstStride, int coeffIdx)
+        // We divided filter values by 4 so -2 from right shift.
+        int16x4_t d0 = vshrn_n_s32(sum_lo, SHIFT_INTERP_PS - 2);
+        int16x4_t d1 = vshrn_n_s32(sum_hi, SHIFT_INTERP_PS - 2);
+        d = vcombine_s16(d0, d1);
+    }
+    else
+    {
+        uint32x4_t sum_lo = vmlsl_lane_u16(offset, vget_low_u16(s[0]), f, 0);
+        sum_lo = vmlal_lane_u16(sum_lo, vget_low_u16(s[1]), f, 1);
+        sum_lo = vmlal_lane_u16(sum_lo, vget_low_u16(s[2]), f, 2);
+        sum_lo = vmlsl_lane_u16(sum_lo, vget_low_u16(s[3]), f, 3);
+
+        uint32x4_t sum_hi = vmlsl_lane_u16(offset, vget_high_u16(s[0]), f, 0);
+        sum_hi = vmlal_lane_u16(sum_hi, vget_high_u16(s[1]), f, 1);
+        sum_hi = vmlal_lane_u16(sum_hi, vget_high_u16(s[2]), f, 2);
+        sum_hi = vmlsl_lane_u16(sum_hi, vget_high_u16(s[3]), f, 3);
+
+        int16x4_t d0 = vshrn_n_s32(vreinterpretq_s32_u32(sum_lo),
+                                   SHIFT_INTERP_PS);
+        int16x4_t d1 = vshrn_n_s32(vreinterpretq_s32_u32(sum_hi),
+                                   SHIFT_INTERP_PS);
+        d = vcombine_s16(d0, d1);
+    }
+}
+
+template<int coeff4, int width, int height>
+void interp4_horiz_ps_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
+                           intptr_t dstStride, int coeffIdx, int isRowExt)
 {
-    if (N == 8)
+    const int N_TAPS = 4;
+    int blkheight = height;
+    const uint16x4_t filter = vreinterpret_u16_s16(
+        vabs_s16(vld1_s16(x265::g_chromaFilter[coeffIdx])));
+    uint32x4_t offset;
+
+    if (coeff4)
     {
-        switch (coeffIdx)
-        {
-        case 1:
-            return interp8_horiz_pp_neon<1, width, height>(src, srcStride, dst,
-                                                           dstStride);
-        case 2:
-            return interp8_horiz_pp_neon<2, width, height>(src, srcStride, dst,
-                                                           dstStride);
-        case 3:
-            return interp8_horiz_pp_neon<3, width, height>(src, srcStride, dst,
-                                                           dstStride);
-        }
+        // The -2 is needed because we will divide the filter values by 4.
+        offset = vdupq_n_u32((unsigned)-IF_INTERNAL_OFFS << (SHIFT_INTERP_PS - 2));
     }
     else
     {
-        switch (coeffIdx)
+        offset = vdupq_n_u32((unsigned)-IF_INTERNAL_OFFS << SHIFT_INTERP_PS);
+    }
+
+    if (isRowExt)
+    {
+        src -= (N_TAPS / 2 - 1) * srcStride;
+        blkheight += N_TAPS - 1;
+    }
+
+    src -= N_TAPS / 2 - 1;
+
+    for (int row = 0; row < blkheight; row++)
+    {
+        if (width % 16 == 0)
         {
-        case 4:
-            return interp4_horiz_pp_neon<true, width, height>(src, srcStride,
-                                                              dst, dstStride,
-                                                              coeffIdx);
-        default:
-            return interp4_horiz_pp_neon<false, width, height>(src, srcStride,
-                                                               dst, dstStride,
-                                                               coeffIdx);
+            for (int col = 0; col < width; col += 16)
+            {
+                uint16x8_t s0[N_TAPS], s1[N_TAPS];
+                load_u16x8xn<4>(src + col + 0, 1, s0);
+                load_u16x8xn<4>(src + col + 8, 1, s1);
+
+                int16x8_t d0, d1;
+                filter4_ps_u16x8<coeff4>(s0, filter, offset, d0);
+                filter4_ps_u16x8<coeff4>(s1, filter, offset, d1);
+
+                vst1q_s16(dst + col + 0, d0);
+                vst1q_s16(dst + col + 8, d1);
+            }
+        }
+        else
+        {
+            int col = 0;
+            for (; col + 8 <= width; col += 8)
+            {
+                uint16x8_t s0[N_TAPS];
+                load_u16x8xn<4>(src + col, 1, s0);
+
+                int16x8_t d0;
+                filter4_ps_u16x8<coeff4>(s0, filter, offset, d0);
+
+                vst1q_s16(dst + col, d0);
+            }
+
+            if (width == 6)
+            {
+                uint16x8_t s0[N_TAPS];
+                load_u16x8xn<4>(src, 1, s0);
+
+                int16x8_t d0;
+                filter4_ps_u16x8<coeff4>(s0, filter, offset, d0);
+
+                store_s16x6xn<1>(dst, dstStride, &d0);
+            }
+            else if (width % 8 != 0)
+            {
+                uint16x4_t s0[N_TAPS];
+                load_u16x4xn<4>(src + col, 1, s0);
+
+                int16x4_t d0;
+                filter4_ps_u16x4<coeff4>(s0, filter, offset, d0);
+
+                if (width == 2)
+                {
+                    store_s16x2xn<1>(dst + col, dstStride, &d0);
+                }
+                else
+                {
+                    vst1_s16(dst + col, d0);
+                }
+            }
         }
+
+        src += srcStride;
+        dst += dstStride;
     }
 }
 
-#if HIGH_BIT_DEPTH
+template<int coeffIdx>
+void inline filter8_ps_u16x4(const uint16x4_t *s, int16x4_t &d,
+                             uint32x4_t offset, uint16x8_t filter)
+{
+    if (coeffIdx == 1)
+    {
+        // { -1, 4, -10, 58, 17, -5, 1, 0 }
+        uint16x4_t sum0156 = vsub_u16(s[6], s[0]);
+        sum0156 = vmla_laneq_u16(sum0156, s[1], filter, 1);
+        sum0156 = vmls_laneq_u16(sum0156, s[5], filter, 5);
 
-template<int N, int width, int height>
-void interp_horiz_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx,
-                          int isRowExt)
+        uint32x4_t sum234 = vmlal_laneq_u16(offset, s[3], filter, 3);
+        sum234 = vmlsl_laneq_u16(sum234, s[2], filter, 2);
+        sum234 = vmlal_laneq_u16(sum234, s[4], filter, 4);
+
+        int32x4_t sum = vaddw_s16(vreinterpretq_s32_u32(sum234),
+                                  vreinterpret_s16_u16(sum0156));
+
+        d = vshrn_n_s32(sum, SHIFT_INTERP_PS);
+    }
+    else if (coeffIdx == 2)
+    {
+        // { -1, 4, -11, 40, 40, -11, 4, -1 }
+        uint16x4_t sum07 = vadd_u16(s[0], s[7]);
+        uint16x4_t sum16 = vadd_u16(s[1], s[6]);
+        uint16x4_t sum25 = vadd_u16(s[2], s[5]);
+        uint16x4_t sum34 = vadd_u16(s[3], s[4]);
+
+        uint16x4_t sum0167 = vshl_n_u16(sum16, 2);
+        sum0167 = vsub_u16(sum0167, sum07);
+
+        uint32x4_t sum2345 = vmlal_laneq_u16(offset, sum34, filter, 3);
+        sum2345 = vmlsl_laneq_u16(sum2345, sum25, filter, 2);
+
+        int32x4_t sum = vaddw_s16(vreinterpretq_s32_u32(sum2345),
+                                  vreinterpret_s16_u16(sum0167));
+
+        d = vshrn_n_s32(sum, SHIFT_INTERP_PS);
+    }
+    else
+    {
+        // { 0, 1, -5, 17, 58, -10, 4, -1 }
+        uint16x4_t sum1267 = vsub_u16(s[1], s[7]);
+        sum1267 = vmls_laneq_u16(sum1267, s[2], filter, 2);
+        sum1267 = vmla_laneq_u16(sum1267, s[6], filter, 6);
+
+        uint32x4_t sum345 = vmlal_laneq_u16(offset, s[3], filter, 3);
+        sum345 = vmlal_laneq_u16(sum345, s[4], filter, 4);
+        sum345 = vmlsl_laneq_u16(sum345, s[5], filter, 5);
+
+        int32x4_t sum = vaddw_s16(vreinterpretq_s32_u32(sum345),
+                                  vreinterpret_s16_u16(sum1267));
+
+        d = vshrn_n_s32(sum, SHIFT_INTERP_PS);
+    }
+}
+
+template<int coeffIdx>
+void inline filter8_ps_u16x8(const uint16x8_t *s, int16x8_t &d,
+                             uint32x4_t offset, uint16x8_t filter)
 {
-    const int16_t *coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
-    const int offset = (unsigned) - IF_INTERNAL_OFFS << SHIFT_INTERP_PS;
+    if (coeffIdx == 1)
+    {
+        // { -1, 4, -10, 58, 17, -5, 1, 0 }
+        uint16x8_t sum0156 = vsubq_u16(s[6], s[0]);
+        sum0156 = vmlaq_laneq_u16(sum0156, s[1], filter, 1);
+        sum0156 = vmlsq_laneq_u16(sum0156, s[5], filter, 5);
+
+        uint32x4_t sum234_lo = vmlal_laneq_u16(offset, vget_low_u16(s[3]), filter, 3);
+        sum234_lo = vmlsl_laneq_u16(sum234_lo, vget_low_u16(s[2]), filter, 2);
+        sum234_lo = vmlal_laneq_u16(sum234_lo, vget_low_u16(s[4]), filter, 4);
 
+        uint32x4_t sum234_hi = vmlal_laneq_u16(offset, vget_high_u16(s[3]), filter, 3);
+        sum234_hi = vmlsl_laneq_u16(sum234_hi, vget_high_u16(s[2]), filter, 2);
+        sum234_hi = vmlal_laneq_u16(sum234_hi, vget_high_u16(s[4]), filter, 4);
+
+        int32x4_t sum_lo = vaddw_s16(vreinterpretq_s32_u32(sum234_lo),
+                                     vget_low_s16(vreinterpretq_s16_u16(sum0156)));
+        int32x4_t sum_hi = vaddw_s16(vreinterpretq_s32_u32(sum234_hi),
+                                     vget_high_s16(vreinterpretq_s16_u16(sum0156)));
+
+        int16x4_t d_lo = vshrn_n_s32(sum_lo, SHIFT_INTERP_PS);
+        int16x4_t d_hi = vshrn_n_s32(sum_hi, SHIFT_INTERP_PS);
+        d = vcombine_s16(d_lo, d_hi);
+    }
+    else if (coeffIdx == 2)
+    {
+        // { -1, 4, -11, 40, 40, -11, 4, -1 }
+        uint16x8_t sum07 = vaddq_u16(s[0], s[7]);
+        uint16x8_t sum16 = vaddq_u16(s[1], s[6]);
+        uint16x8_t sum25 = vaddq_u16(s[2], s[5]);
+        uint16x8_t sum34 = vaddq_u16(s[3], s[4]);
+
+        uint16x8_t sum0167 = vshlq_n_u16(sum16, 2);
+        sum0167 = vsubq_u16(sum0167, sum07);
+
+        uint32x4_t sum2345_lo = vmlal_laneq_u16(offset, vget_low_u16(sum34),
+                                                filter, 3);
+        sum2345_lo = vmlsl_laneq_u16(sum2345_lo, vget_low_u16(sum25),
+                                     filter, 2);
+
+        uint32x4_t sum2345_hi = vmlal_laneq_u16(offset, vget_high_u16(sum34),
+                                                filter, 3);
+        sum2345_hi = vmlsl_laneq_u16(sum2345_hi, vget_high_u16(sum25),
+                                     filter, 2);
+
+        int32x4_t sum_lo = vaddw_s16(vreinterpretq_s32_u32(sum2345_lo),
+                                     vget_low_s16(vreinterpretq_s16_u16(sum0167)));
+        int32x4_t sum_hi = vaddw_s16(vreinterpretq_s32_u32(sum2345_hi),
+                                     vget_high_s16(vreinterpretq_s16_u16(sum0167)));
+
+        int16x4_t d_lo = vshrn_n_s32(sum_lo, SHIFT_INTERP_PS);
+        int16x4_t d_hi = vshrn_n_s32(sum_hi, SHIFT_INTERP_PS);
+        d = vcombine_s16(d_lo, d_hi);
+    }
+    else
+    {
+        // { 0, 1, -5, 17, 58, -10, 4, -1 }
+        uint16x8_t sum1267 = vsubq_u16(s[1], s[7]);
+        sum1267 = vmlsq_laneq_u16(sum1267, s[2], filter, 2);
+        sum1267 = vmlaq_laneq_u16(sum1267, s[6], filter, 6);
+
+        uint32x4_t sum345_lo = vmlal_laneq_u16(offset, vget_low_u16(s[3]), filter, 3);
+        sum345_lo = vmlal_laneq_u16(sum345_lo, vget_low_u16(s[4]), filter, 4);
+        sum345_lo = vmlsl_laneq_u16(sum345_lo, vget_low_u16(s[5]), filter, 5);
+
+        uint32x4_t sum345_hi = vmlal_laneq_u16(offset, vget_high_u16(s[3]), filter, 3);
+        sum345_hi = vmlal_laneq_u16(sum345_hi, vget_high_u16(s[4]), filter, 4);
+        sum345_hi = vmlsl_laneq_u16(sum345_hi, vget_high_u16(s[5]), filter, 5);
+
+        int32x4_t sum_lo = vaddw_s16(vreinterpretq_s32_u32(sum345_lo),
+                                     vget_low_s16(vreinterpretq_s16_u16(sum1267)));
+        int32x4_t sum_hi = vaddw_s16(vreinterpretq_s32_u32(sum345_hi),
+                                     vget_high_s16(vreinterpretq_s16_u16(sum1267)));
+
+        int16x4_t d_lo = vshrn_n_s32(sum_lo, SHIFT_INTERP_PS);
+        int16x4_t d_hi = vshrn_n_s32(sum_hi, SHIFT_INTERP_PS);
+
+        d = vcombine_s16(d_lo, d_hi);
+    }
+}
+
+template<int coeffIdx, int width, int height>
+void interp8_horiz_ps_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
+                           intptr_t dstStride, int isRowExt)
+{
+    const int N_TAPS = 8;
     int blkheight = height;
-    src -= N / 2 - 1;
+    const uint16x8_t filter =
+        vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(x265::g_lumaFilter[coeffIdx])));
+    uint32x4_t offset =
+        vdupq_n_u32((unsigned)-IF_INTERNAL_OFFS << SHIFT_INTERP_PS);
 
     if (isRowExt)
     {
-        src -= (N / 2 - 1) * srcStride;
-        blkheight += N - 1;
+        src -= (N_TAPS / 2 - 1) * srcStride;
+        blkheight += N_TAPS - 1;
     }
-    int16x8_t vc3 = vld1q_s16(coeff);
-    const int32x4_t voffset = vdupq_n_s32(offset);
 
-    int row, col;
-    for (row = 0; row < blkheight; row++)
+    src -= N_TAPS / 2 - 1;
+
+    for (int row = 0; row < blkheight; row++)
     {
-        for (col = 0; col < width; col += 8)
+        if (width % 16 == 0)
         {
-            int32x4_t vsum, vsum2;
-
-            int16x8_t input[N];
-            for (int i = 0; i < N; i++)
+            for (int col = 0; col < width; col += 16)
             {
-                input[i] = vreinterpretq_s16_u16(vld1q_u16(src + col + i));
-            }
-
-            vsum = voffset;
-            vsum2 = voffset;
+                uint16x8_t s0[N_TAPS], s1[N_TAPS];
+                load_u16x8xn<8>(src + col + 0, 1, s0);
+                load_u16x8xn<8>(src + col + 8, 1, s1);
 
-            vsum = vmlal_lane_s16(vsum, vget_low_s16(input[0]),
-                                  vget_low_s16(vc3), 0);
-            vsum2 = vmlal_high_lane_s16(vsum2, input[0], vget_low_s16(vc3), 0);
+                int16x8_t d0, d1;
+                filter8_ps_u16x8<coeffIdx>(s0, d0, offset, filter);
+                filter8_ps_u16x8<coeffIdx>(s1, d1, offset, filter);
 
-            vsum = vmlal_lane_s16(vsum, vget_low_s16(input[1]),
-                                  vget_low_s16(vc3), 1);
-            vsum2 = vmlal_high_lane_s16(vsum2, input[1], vget_low_s16(vc3), 1);
+                vst1q_s16(dst + col + 0, d0);
+                vst1q_s16(dst + col + 8, d1);
+            }
+        }
+        else
+        {
+            int col = 0;
+            for (; col + 8 <= width; col += 8)
+            {
+                uint16x8_t s0[N_TAPS];
+                load_u16x8xn<8>(src + col, 1, s0);
 
-            vsum = vmlal_lane_s16(vsum, vget_low_s16(input[2]),
-                                  vget_low_s16(vc3), 2);
-            vsum2 = vmlal_high_lane_s16(vsum2, input[2], vget_low_s16(vc3), 2);
+                int16x8_t d0;
+                filter8_ps_u16x8<coeffIdx>(s0, d0, offset, filter);
 
-            vsum = vmlal_lane_s16(vsum, vget_low_s16(input[3]),
-                                  vget_low_s16(vc3), 3);
-            vsum2 = vmlal_high_lane_s16(vsum2, input[3], vget_low_s16(vc3), 3);
+                vst1q_s16(dst + col, d0);
+            }
 
-            if (N == 8)
+            if (width % 8 == 4)
             {
-                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[4]), vget_high_s16(vc3), 0);
-                vsum2 = vmlal_high_lane_s16(vsum2, input[4], vget_high_s16(vc3), 0);
-
-                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[5]), vget_high_s16(vc3), 1);
-                vsum2 = vmlal_high_lane_s16(vsum2, input[5], vget_high_s16(vc3), 1);
+                uint16x4_t s0[N_TAPS];
+                load_u16x4xn<8>(src + col, 1, s0);
 
-                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[6]), vget_high_s16(vc3), 2);
-                vsum2 = vmlal_high_lane_s16(vsum2, input[6], vget_high_s16(vc3), 2);
+                int16x4_t d0;
+                filter8_ps_u16x4<coeffIdx>(s0, d0, offset, filter);
 
-                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[7]), vget_high_s16(vc3), 3);
-                vsum2 = vmlal_high_lane_s16(vsum2, input[7], vget_high_s16(vc3), 3);
+                vst1_s16(dst + col, d0);
             }
-
-            int16x4_t res_lo = vshrn_n_s32(vsum, SHIFT_INTERP_PS);
-            int16x4_t res_hi = vshrn_n_s32(vsum2, SHIFT_INTERP_PS);
-            vst1q_s16(dst + col, vcombine_s16(res_lo, res_hi));
         }
 
         src += srcStride;
@@ -2829,10 +3091,50 @@ void interp_horiz_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst,
     }
 }
 
-#else // HIGH_BIT_DEPTH
+#endif // !HIGH_BIT_DEPTH
+}
+
+namespace X265_NS
+{
+
+template<int N, int width, int height>
+void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst,
+                          intptr_t dstStride, int coeffIdx)
+{
+    if (N == 8)
+    {
+        switch (coeffIdx)
+        {
+        case 1:
+            return interp8_horiz_pp_neon<1, width, height>(src, srcStride, dst,
+                                                           dstStride);
+        case 2:
+            return interp8_horiz_pp_neon<2, width, height>(src, srcStride, dst,
+                                                           dstStride);
+        case 3:
+            return interp8_horiz_pp_neon<3, width, height>(src, srcStride, dst,
+                                                           dstStride);
+        }
+    }
+    else
+    {
+        switch (coeffIdx)
+        {
+        case 4:
+            return interp4_horiz_pp_neon<true, width, height>(src, srcStride,
+                                                              dst, dstStride,
+                                                              coeffIdx);
+        default:
+            return interp4_horiz_pp_neon<false, width, height>(src, srcStride,
+                                                               dst, dstStride,
+                                                               coeffIdx);
+        }
+    }
+}
+
 template<int N, int width, int height>
-void interp_horiz_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx,
-                          int isRowExt)
+void interp_horiz_ps_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
+                          intptr_t dstStride, int coeffIdx, int isRowExt)
 {
     if (N == 8)
     {
@@ -2867,8 +3169,6 @@ void interp_horiz_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst,
     }
 }
 
-#endif // HIGH_BIT_DEPTH
-
 template<int N, int width, int height>
 void interp_vert_ss_neon(const int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 {
@@ -3440,6 +3740,35 @@ void setupFilterPrimitives_neon(EncoderPrimitives &p)
     p.pu[LUMA_4x8].luma_hpp                                 = interp_horiz_pp_neon<8, 4, 8>;
     p.pu[LUMA_4x16].luma_hpp                                = interp_horiz_pp_neon<8, 4, 16>;
     p.pu[LUMA_12x16].luma_hpp                               = interp_horiz_pp_neon<8, 12, 16>;
+
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_hps   = interp_horiz_ps_neon<4, 2, 4>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_hps   = interp_horiz_ps_neon<4, 2, 8>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hps   = interp_horiz_ps_neon<4, 4, 2>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_hps   = interp_horiz_ps_neon<4, 4, 4>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hps   = interp_horiz_ps_neon<4, 4, 8>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_hps  = interp_horiz_ps_neon<4, 4, 16>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_hps   = interp_horiz_ps_neon<4, 6, 8>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_hps = interp_horiz_ps_neon<4, 12, 16>;
+
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_hps   = interp_horiz_ps_neon<4, 2, 8>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_hps  = interp_horiz_ps_neon<4, 2, 16>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_hps   = interp_horiz_ps_neon<4, 4, 4>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_hps   = interp_horiz_ps_neon<4, 4, 8>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_hps  = interp_horiz_ps_neon<4, 4, 16>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_hps  = interp_horiz_ps_neon<4, 4, 32>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].filter_hps  = interp_horiz_ps_neon<4, 6, 16>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_hps = interp_horiz_ps_neon<4, 12, 32>;
+
+    p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_hps         = interp_horiz_ps_neon<4, 4, 4>;
+    p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_hps         = interp_horiz_ps_neon<4, 4, 8>;
+    p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_hps        = interp_horiz_ps_neon<4, 4, 16>;
+    p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_hps       = interp_horiz_ps_neon<4, 12, 16>;
+
+    p.pu[LUMA_4x4].luma_hps                                 = interp_horiz_ps_neon<8, 4, 4>;
+    p.pu[LUMA_4x8].luma_hps                                 = interp_horiz_ps_neon<8, 4, 8>;
+    p.pu[LUMA_4x16].luma_hps                                = interp_horiz_ps_neon<8, 4, 16>;
+    p.pu[LUMA_12x16].luma_hps                               = interp_horiz_ps_neon<8, 12, 16>;
+
 #endif // HIGH_BIT_DEPTH
 }
 
-- 
2.39.5 (Apple Git-154)

-------------- next part --------------
>From adb7a96b61dff73da23666a3283378cd26aeb1ed Mon Sep 17 00:00:00 2001
Message-Id: <adb7a96b61dff73da23666a3283378cd26aeb1ed.1740153395.git.gerdazsejke.more at arm.com>
In-Reply-To: <cover.1740153395.git.gerdazsejke.more at arm.com>
References: <cover.1740153395.git.gerdazsejke.more at arm.com>
From: Gerda Zsejke More <gerdazsejke.more at arm.com>
Date: Sun, 19 Jan 2025 10:26:48 +0100
Subject: [PATCH 03/10] AArch64: Optimise HBD interp_horiz_ps_neon

Optimise the HBD 4-tap and 8-tap Neon implementations of
interp_horiz_ps_neon and extend these functions to support all CHROMA
and LUMA block sizes respectively.

The new 4-tap filter implementation is up to 34% faster when
coeffIdx==4 and up to 11% faster for the other filter values compared
to the existing Neon implementation.

The new 8-tap filter implementation is up to 34% faster when
coeffIdx==1, 48% faster when it is 2, and 40% faster when it is 3,
compared to the existing Neon implementation.
---
 source/common/aarch64/filter-prim.cpp | 497 +++++++++++++++++++++-----
 1 file changed, 413 insertions(+), 84 deletions(-)

diff --git a/source/common/aarch64/filter-prim.cpp b/source/common/aarch64/filter-prim.cpp
index f691b2c36..0ed3fb78c 100644
--- a/source/common/aarch64/filter-prim.cpp
+++ b/source/common/aarch64/filter-prim.cpp
@@ -32,6 +32,11 @@
 #include <arm_neon.h>
 
 namespace {
+
+#if HIGH_BIT_DEPTH
+#define SHIFT_INTERP_PS (IF_FILTER_PREC - (IF_INTERNAL_PREC - X265_DEPTH))
+#endif
+
 void inline filter4_s16x8(int coeffIdx, const int16x8_t *s, const int16x4_t f,
                           const int32x4_t c, int32x4_t &d0, int32x4_t &d1)
 {
@@ -2707,121 +2712,378 @@ void inline interp8_horiz_pp_neon(const pixel *src, intptr_t srcStride,
     }
 }
 
-#endif // !HIGH_BIT_DEPTH
+template<int coeff4>
+void inline filter4_ps_u16x4(const uint16x4_t *s, const uint16x4_t f,
+                             const uint32x4_t offset, int16x4_t &d)
+{
+    if (coeff4)
+    {
+        // { -4, 36, 36, -4 }
+        // Filter values are divisible by 4, factor that out in order to only
+        // need a multiplication by 9 and a subtraction (which is a
+        // multiplication by -1).
+        uint16x4_t sum03 = vadd_u16(s[0], s[3]);
+        uint16x4_t sum12 = vadd_u16(s[1], s[2]);
+
+        int32x4_t sum = vreinterpretq_s32_u32(vmlal_n_u16(offset, sum12, 9));
+        sum = vsubw_s16(sum, vreinterpret_s16_u16(sum03));
+
+        // We divided filter values by 4 so -2 from right shift.
+        d = vshrn_n_s32(sum, SHIFT_INTERP_PS - 2);
+    }
+    else
+    {
+        uint32x4_t sum = vmlsl_lane_u16(offset, s[0], f, 0);
+        sum = vmlal_lane_u16(sum, s[1], f, 1);
+        sum = vmlal_lane_u16(sum, s[2], f, 2);
+        sum = vmlsl_lane_u16(sum, s[3], f, 3);
+
+        d = vshrn_n_s32(vreinterpretq_s32_u32(sum), SHIFT_INTERP_PS);
+    }
 }
 
-namespace X265_NS
+template<bool coeff4>
+void inline filter4_ps_u16x8(const uint16x8_t *s, const uint16x4_t f,
+                             const uint32x4_t offset, int16x8_t &d)
 {
+    if (coeff4)
+    {
+        // { -4, 36, 36, -4 }
+        // Filter values are divisible by 4, factor that out in order to only
+        // need a multiplication by 9 and a subtraction (which is a
+        // multiplication by -1).
+        uint16x8_t sum03 = vaddq_u16(s[0], s[3]);
+        uint16x8_t sum12 = vaddq_u16(s[1], s[2]);
 
-#if HIGH_BIT_DEPTH
-#define SHIFT_INTERP_PS (IF_FILTER_PREC - (IF_INTERNAL_PREC - X265_DEPTH))
-#endif
+        int32x4_t sum_lo = vreinterpretq_s32_u32(
+            vmlal_n_u16(offset, vget_low_u16(sum12), 9));
+        int32x4_t sum_hi = vreinterpretq_s32_u32(
+            vmlal_n_u16(offset, vget_high_u16(sum12), 9));
+        sum_lo = vsubw_s16(sum_lo, vreinterpret_s16_u16(vget_low_u16(sum03)));
+        sum_hi = vsubw_s16(sum_hi, vreinterpret_s16_u16(vget_high_u16(sum03)));
 
-template<int N, int width, int height>
-void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst,
-                          intptr_t dstStride, int coeffIdx)
+        // We divided filter values by 4 so -2 from right shift.
+        int16x4_t d0 = vshrn_n_s32(sum_lo, SHIFT_INTERP_PS - 2);
+        int16x4_t d1 = vshrn_n_s32(sum_hi, SHIFT_INTERP_PS - 2);
+        d = vcombine_s16(d0, d1);
+    }
+    else
+    {
+        uint32x4_t sum_lo = vmlsl_lane_u16(offset, vget_low_u16(s[0]), f, 0);
+        sum_lo = vmlal_lane_u16(sum_lo, vget_low_u16(s[1]), f, 1);
+        sum_lo = vmlal_lane_u16(sum_lo, vget_low_u16(s[2]), f, 2);
+        sum_lo = vmlsl_lane_u16(sum_lo, vget_low_u16(s[3]), f, 3);
+
+        uint32x4_t sum_hi = vmlsl_lane_u16(offset, vget_high_u16(s[0]), f, 0);
+        sum_hi = vmlal_lane_u16(sum_hi, vget_high_u16(s[1]), f, 1);
+        sum_hi = vmlal_lane_u16(sum_hi, vget_high_u16(s[2]), f, 2);
+        sum_hi = vmlsl_lane_u16(sum_hi, vget_high_u16(s[3]), f, 3);
+
+        int16x4_t d0 = vshrn_n_s32(vreinterpretq_s32_u32(sum_lo),
+                                   SHIFT_INTERP_PS);
+        int16x4_t d1 = vshrn_n_s32(vreinterpretq_s32_u32(sum_hi),
+                                   SHIFT_INTERP_PS);
+        d = vcombine_s16(d0, d1);
+    }
+}
+
+template<int coeff4, int width, int height>
+void interp4_horiz_ps_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
+                           intptr_t dstStride, int coeffIdx, int isRowExt)
 {
-    if (N == 8)
+    const int N_TAPS = 4;
+    int blkheight = height;
+    const uint16x4_t filter = vreinterpret_u16_s16(
+        vabs_s16(vld1_s16(x265::g_chromaFilter[coeffIdx])));
+    uint32x4_t offset;
+
+    if (coeff4)
     {
-        switch (coeffIdx)
-        {
-        case 1:
-            return interp8_horiz_pp_neon<1, width, height>(src, srcStride, dst,
-                                                           dstStride);
-        case 2:
-            return interp8_horiz_pp_neon<2, width, height>(src, srcStride, dst,
-                                                           dstStride);
-        case 3:
-            return interp8_horiz_pp_neon<3, width, height>(src, srcStride, dst,
-                                                           dstStride);
-        }
+        // The -2 is needed because we will divide the filter values by 4.
+        offset = vdupq_n_u32((unsigned)-IF_INTERNAL_OFFS << (SHIFT_INTERP_PS - 2));
     }
     else
     {
-        switch (coeffIdx)
+        offset = vdupq_n_u32((unsigned)-IF_INTERNAL_OFFS << SHIFT_INTERP_PS);
+    }
+
+    if (isRowExt)
+    {
+        src -= (N_TAPS / 2 - 1) * srcStride;
+        blkheight += N_TAPS - 1;
+    }
+
+    src -= N_TAPS / 2 - 1;
+
+    for (int row = 0; row < blkheight; row++)
+    {
+        if (width % 16 == 0)
         {
-        case 4:
-            return interp4_horiz_pp_neon<true, width, height>(src, srcStride,
-                                                              dst, dstStride,
-                                                              coeffIdx);
-        default:
-            return interp4_horiz_pp_neon<false, width, height>(src, srcStride,
-                                                               dst, dstStride,
-                                                               coeffIdx);
+            for (int col = 0; col < width; col += 16)
+            {
+                uint16x8_t s0[N_TAPS], s1[N_TAPS];
+                load_u16x8xn<4>(src + col + 0, 1, s0);
+                load_u16x8xn<4>(src + col + 8, 1, s1);
+
+                int16x8_t d0, d1;
+                filter4_ps_u16x8<coeff4>(s0, filter, offset, d0);
+                filter4_ps_u16x8<coeff4>(s1, filter, offset, d1);
+
+                vst1q_s16(dst + col + 0, d0);
+                vst1q_s16(dst + col + 8, d1);
+            }
+        }
+        else
+        {
+            int col = 0;
+            for (; col + 8 <= width; col += 8)
+            {
+                uint16x8_t s0[N_TAPS];
+                load_u16x8xn<4>(src + col, 1, s0);
+
+                int16x8_t d0;
+                filter4_ps_u16x8<coeff4>(s0, filter, offset, d0);
+
+                vst1q_s16(dst + col, d0);
+            }
+
+            if (width == 6)
+            {
+                uint16x8_t s0[N_TAPS];
+                load_u16x8xn<4>(src, 1, s0);
+
+                int16x8_t d0;
+                filter4_ps_u16x8<coeff4>(s0, filter, offset, d0);
+
+                store_s16x6xn<1>(dst, dstStride, &d0);
+            }
+            else if (width % 8 != 0)
+            {
+                uint16x4_t s0[N_TAPS];
+                load_u16x4xn<4>(src + col, 1, s0);
+
+                int16x4_t d0;
+                filter4_ps_u16x4<coeff4>(s0, filter, offset, d0);
+
+                if (width == 2)
+                {
+                    store_s16x2xn<1>(dst + col, dstStride, &d0);
+                }
+                else
+                {
+                    vst1_s16(dst + col, d0);
+                }
+            }
         }
+
+        src += srcStride;
+        dst += dstStride;
     }
 }
 
-#if HIGH_BIT_DEPTH
+template<int coeffIdx>
+void inline filter8_ps_u16x4(const uint16x4_t *s, int16x4_t &d,
+                             uint32x4_t offset, uint16x8_t filter)
+{
+    if (coeffIdx == 1)
+    {
+        // { -1, 4, -10, 58, 17, -5, 1, 0 }
+        uint16x4_t sum0156 = vsub_u16(s[6], s[0]);
+        sum0156 = vmla_laneq_u16(sum0156, s[1], filter, 1);
+        sum0156 = vmls_laneq_u16(sum0156, s[5], filter, 5);
 
-template<int N, int width, int height>
-void interp_horiz_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx,
-                          int isRowExt)
+        uint32x4_t sum234 = vmlal_laneq_u16(offset, s[3], filter, 3);
+        sum234 = vmlsl_laneq_u16(sum234, s[2], filter, 2);
+        sum234 = vmlal_laneq_u16(sum234, s[4], filter, 4);
+
+        int32x4_t sum = vaddw_s16(vreinterpretq_s32_u32(sum234),
+                                  vreinterpret_s16_u16(sum0156));
+
+        d = vshrn_n_s32(sum, SHIFT_INTERP_PS);
+    }
+    else if (coeffIdx == 2)
+    {
+        // { -1, 4, -11, 40, 40, -11, 4, -1 }
+        uint16x4_t sum07 = vadd_u16(s[0], s[7]);
+        uint16x4_t sum16 = vadd_u16(s[1], s[6]);
+        uint16x4_t sum25 = vadd_u16(s[2], s[5]);
+        uint16x4_t sum34 = vadd_u16(s[3], s[4]);
+
+        uint16x4_t sum0167 = vshl_n_u16(sum16, 2);
+        sum0167 = vsub_u16(sum0167, sum07);
+
+        uint32x4_t sum2345 = vmlal_laneq_u16(offset, sum34, filter, 3);
+        sum2345 = vmlsl_laneq_u16(sum2345, sum25, filter, 2);
+
+        int32x4_t sum = vaddw_s16(vreinterpretq_s32_u32(sum2345),
+                                  vreinterpret_s16_u16(sum0167));
+
+        d = vshrn_n_s32(sum, SHIFT_INTERP_PS);
+    }
+    else
+    {
+        // { 0, 1, -5, 17, 58, -10, 4, -1 }
+        uint16x4_t sum1267 = vsub_u16(s[1], s[7]);
+        sum1267 = vmls_laneq_u16(sum1267, s[2], filter, 2);
+        sum1267 = vmla_laneq_u16(sum1267, s[6], filter, 6);
+
+        uint32x4_t sum345 = vmlal_laneq_u16(offset, s[3], filter, 3);
+        sum345 = vmlal_laneq_u16(sum345, s[4], filter, 4);
+        sum345 = vmlsl_laneq_u16(sum345, s[5], filter, 5);
+
+        int32x4_t sum = vaddw_s16(vreinterpretq_s32_u32(sum345),
+                                  vreinterpret_s16_u16(sum1267));
+
+        d = vshrn_n_s32(sum, SHIFT_INTERP_PS);
+    }
+}
+
+template<int coeffIdx>
+void inline filter8_ps_u16x8(const uint16x8_t *s, int16x8_t &d,
+                             uint32x4_t offset, uint16x8_t filter)
 {
-    const int16_t *coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
-    const int offset = (unsigned) - IF_INTERNAL_OFFS << SHIFT_INTERP_PS;
+    if (coeffIdx == 1)
+    {
+        // { -1, 4, -10, 58, 17, -5, 1, 0 }
+        uint16x8_t sum0156 = vsubq_u16(s[6], s[0]);
+        sum0156 = vmlaq_laneq_u16(sum0156, s[1], filter, 1);
+        sum0156 = vmlsq_laneq_u16(sum0156, s[5], filter, 5);
+
+        uint32x4_t sum234_lo = vmlal_laneq_u16(offset, vget_low_u16(s[3]), filter, 3);
+        sum234_lo = vmlsl_laneq_u16(sum234_lo, vget_low_u16(s[2]), filter, 2);
+        sum234_lo = vmlal_laneq_u16(sum234_lo, vget_low_u16(s[4]), filter, 4);
 
+        uint32x4_t sum234_hi = vmlal_laneq_u16(offset, vget_high_u16(s[3]), filter, 3);
+        sum234_hi = vmlsl_laneq_u16(sum234_hi, vget_high_u16(s[2]), filter, 2);
+        sum234_hi = vmlal_laneq_u16(sum234_hi, vget_high_u16(s[4]), filter, 4);
+
+        int32x4_t sum_lo = vaddw_s16(vreinterpretq_s32_u32(sum234_lo),
+                                     vget_low_s16(vreinterpretq_s16_u16(sum0156)));
+        int32x4_t sum_hi = vaddw_s16(vreinterpretq_s32_u32(sum234_hi),
+                                     vget_high_s16(vreinterpretq_s16_u16(sum0156)));
+
+        int16x4_t d_lo = vshrn_n_s32(sum_lo, SHIFT_INTERP_PS);
+        int16x4_t d_hi = vshrn_n_s32(sum_hi, SHIFT_INTERP_PS);
+        d = vcombine_s16(d_lo, d_hi);
+    }
+    else if (coeffIdx == 2)
+    {
+        // { -1, 4, -11, 40, 40, -11, 4, -1 }
+        uint16x8_t sum07 = vaddq_u16(s[0], s[7]);
+        uint16x8_t sum16 = vaddq_u16(s[1], s[6]);
+        uint16x8_t sum25 = vaddq_u16(s[2], s[5]);
+        uint16x8_t sum34 = vaddq_u16(s[3], s[4]);
+
+        uint16x8_t sum0167 = vshlq_n_u16(sum16, 2);
+        sum0167 = vsubq_u16(sum0167, sum07);
+
+        uint32x4_t sum2345_lo = vmlal_laneq_u16(offset, vget_low_u16(sum34),
+                                                filter, 3);
+        sum2345_lo = vmlsl_laneq_u16(sum2345_lo, vget_low_u16(sum25),
+                                     filter, 2);
+
+        uint32x4_t sum2345_hi = vmlal_laneq_u16(offset, vget_high_u16(sum34),
+                                                filter, 3);
+        sum2345_hi = vmlsl_laneq_u16(sum2345_hi, vget_high_u16(sum25),
+                                     filter, 2);
+
+        int32x4_t sum_lo = vaddw_s16(vreinterpretq_s32_u32(sum2345_lo),
+                                     vget_low_s16(vreinterpretq_s16_u16(sum0167)));
+        int32x4_t sum_hi = vaddw_s16(vreinterpretq_s32_u32(sum2345_hi),
+                                     vget_high_s16(vreinterpretq_s16_u16(sum0167)));
+
+        int16x4_t d_lo = vshrn_n_s32(sum_lo, SHIFT_INTERP_PS);
+        int16x4_t d_hi = vshrn_n_s32(sum_hi, SHIFT_INTERP_PS);
+        d = vcombine_s16(d_lo, d_hi);
+    }
+    else
+    {
+        // { 0, 1, -5, 17, 58, -10, 4, -1 }
+        uint16x8_t sum1267 = vsubq_u16(s[1], s[7]);
+        sum1267 = vmlsq_laneq_u16(sum1267, s[2], filter, 2);
+        sum1267 = vmlaq_laneq_u16(sum1267, s[6], filter, 6);
+
+        uint32x4_t sum345_lo = vmlal_laneq_u16(offset, vget_low_u16(s[3]), filter, 3);
+        sum345_lo = vmlal_laneq_u16(sum345_lo, vget_low_u16(s[4]), filter, 4);
+        sum345_lo = vmlsl_laneq_u16(sum345_lo, vget_low_u16(s[5]), filter, 5);
+
+        uint32x4_t sum345_hi = vmlal_laneq_u16(offset, vget_high_u16(s[3]), filter, 3);
+        sum345_hi = vmlal_laneq_u16(sum345_hi, vget_high_u16(s[4]), filter, 4);
+        sum345_hi = vmlsl_laneq_u16(sum345_hi, vget_high_u16(s[5]), filter, 5);
+
+        int32x4_t sum_lo = vaddw_s16(vreinterpretq_s32_u32(sum345_lo),
+                                     vget_low_s16(vreinterpretq_s16_u16(sum1267)));
+        int32x4_t sum_hi = vaddw_s16(vreinterpretq_s32_u32(sum345_hi),
+                                     vget_high_s16(vreinterpretq_s16_u16(sum1267)));
+
+        int16x4_t d_lo = vshrn_n_s32(sum_lo, SHIFT_INTERP_PS);
+        int16x4_t d_hi = vshrn_n_s32(sum_hi, SHIFT_INTERP_PS);
+
+        d = vcombine_s16(d_lo, d_hi);
+    }
+}
+
+template<int coeffIdx, int width, int height>
+void interp8_horiz_ps_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
+                           intptr_t dstStride, int isRowExt)
+{
+    const int N_TAPS = 8;
     int blkheight = height;
-    src -= N / 2 - 1;
+    const uint16x8_t filter =
+        vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(x265::g_lumaFilter[coeffIdx])));
+    uint32x4_t offset =
+        vdupq_n_u32((unsigned)-IF_INTERNAL_OFFS << SHIFT_INTERP_PS);
 
     if (isRowExt)
     {
-        src -= (N / 2 - 1) * srcStride;
-        blkheight += N - 1;
+        src -= (N_TAPS / 2 - 1) * srcStride;
+        blkheight += N_TAPS - 1;
     }
-    int16x8_t vc3 = vld1q_s16(coeff);
-    const int32x4_t voffset = vdupq_n_s32(offset);
 
-    int row, col;
-    for (row = 0; row < blkheight; row++)
+    src -= N_TAPS / 2 - 1;
+
+    for (int row = 0; row < blkheight; row++)
     {
-        for (col = 0; col < width; col += 8)
+        if (width % 16 == 0)
         {
-            int32x4_t vsum, vsum2;
-
-            int16x8_t input[N];
-            for (int i = 0; i < N; i++)
+            for (int col = 0; col < width; col += 16)
             {
-                input[i] = vreinterpretq_s16_u16(vld1q_u16(src + col + i));
-            }
-
-            vsum = voffset;
-            vsum2 = voffset;
+                uint16x8_t s0[N_TAPS], s1[N_TAPS];
+                load_u16x8xn<8>(src + col + 0, 1, s0);
+                load_u16x8xn<8>(src + col + 8, 1, s1);
 
-            vsum = vmlal_lane_s16(vsum, vget_low_s16(input[0]),
-                                  vget_low_s16(vc3), 0);
-            vsum2 = vmlal_high_lane_s16(vsum2, input[0], vget_low_s16(vc3), 0);
+                int16x8_t d0, d1;
+                filter8_ps_u16x8<coeffIdx>(s0, d0, offset, filter);
+                filter8_ps_u16x8<coeffIdx>(s1, d1, offset, filter);
 
-            vsum = vmlal_lane_s16(vsum, vget_low_s16(input[1]),
-                                  vget_low_s16(vc3), 1);
-            vsum2 = vmlal_high_lane_s16(vsum2, input[1], vget_low_s16(vc3), 1);
+                vst1q_s16(dst + col + 0, d0);
+                vst1q_s16(dst + col + 8, d1);
+            }
+        }
+        else
+        {
+            int col = 0;
+            for (; col + 8 <= width; col += 8)
+            {
+                uint16x8_t s0[N_TAPS];
+                load_u16x8xn<8>(src + col, 1, s0);
 
-            vsum = vmlal_lane_s16(vsum, vget_low_s16(input[2]),
-                                  vget_low_s16(vc3), 2);
-            vsum2 = vmlal_high_lane_s16(vsum2, input[2], vget_low_s16(vc3), 2);
+                int16x8_t d0;
+                filter8_ps_u16x8<coeffIdx>(s0, d0, offset, filter);
 
-            vsum = vmlal_lane_s16(vsum, vget_low_s16(input[3]),
-                                  vget_low_s16(vc3), 3);
-            vsum2 = vmlal_high_lane_s16(vsum2, input[3], vget_low_s16(vc3), 3);
+                vst1q_s16(dst + col, d0);
+            }
 
-            if (N == 8)
+            if (width % 8 == 4)
             {
-                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[4]), vget_high_s16(vc3), 0);
-                vsum2 = vmlal_high_lane_s16(vsum2, input[4], vget_high_s16(vc3), 0);
-
-                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[5]), vget_high_s16(vc3), 1);
-                vsum2 = vmlal_high_lane_s16(vsum2, input[5], vget_high_s16(vc3), 1);
+                uint16x4_t s0[N_TAPS];
+                load_u16x4xn<8>(src + col, 1, s0);
 
-                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[6]), vget_high_s16(vc3), 2);
-                vsum2 = vmlal_high_lane_s16(vsum2, input[6], vget_high_s16(vc3), 2);
+                int16x4_t d0;
+                filter8_ps_u16x4<coeffIdx>(s0, d0, offset, filter);
 
-                vsum = vmlal_lane_s16(vsum, vget_low_s16(input[7]), vget_high_s16(vc3), 3);
-                vsum2 = vmlal_high_lane_s16(vsum2, input[7], vget_high_s16(vc3), 3);
+                vst1_s16(dst + col, d0);
             }
-
-            int16x4_t res_lo = vshrn_n_s32(vsum, SHIFT_INTERP_PS);
-            int16x4_t res_hi = vshrn_n_s32(vsum2, SHIFT_INTERP_PS);
-            vst1q_s16(dst + col, vcombine_s16(res_lo, res_hi));
         }
 
         src += srcStride;
@@ -2829,10 +3091,50 @@ void interp_horiz_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst,
     }
 }
 
-#else // HIGH_BIT_DEPTH
+#endif // !HIGH_BIT_DEPTH
+}
+
+namespace X265_NS
+{
+
+template<int N, int width, int height>
+void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst,
+                          intptr_t dstStride, int coeffIdx)
+{
+    if (N == 8)
+    {
+        switch (coeffIdx)
+        {
+        case 1:
+            return interp8_horiz_pp_neon<1, width, height>(src, srcStride, dst,
+                                                           dstStride);
+        case 2:
+            return interp8_horiz_pp_neon<2, width, height>(src, srcStride, dst,
+                                                           dstStride);
+        case 3:
+            return interp8_horiz_pp_neon<3, width, height>(src, srcStride, dst,
+                                                           dstStride);
+        }
+    }
+    else
+    {
+        switch (coeffIdx)
+        {
+        case 4:
+            return interp4_horiz_pp_neon<true, width, height>(src, srcStride,
+                                                              dst, dstStride,
+                                                              coeffIdx);
+        default:
+            return interp4_horiz_pp_neon<false, width, height>(src, srcStride,
+                                                               dst, dstStride,
+                                                               coeffIdx);
+        }
+    }
+}
+
 template<int N, int width, int height>
-void interp_horiz_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx,
-                          int isRowExt)
+void interp_horiz_ps_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
+                          intptr_t dstStride, int coeffIdx, int isRowExt)
 {
     if (N == 8)
     {
@@ -2867,8 +3169,6 @@ void interp_horiz_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst,
     }
 }
 
-#endif // HIGH_BIT_DEPTH
-
 template<int N, int width, int height>
 void interp_vert_ss_neon(const int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 {
@@ -3440,6 +3740,35 @@ void setupFilterPrimitives_neon(EncoderPrimitives &p)
     p.pu[LUMA_4x8].luma_hpp                                 = interp_horiz_pp_neon<8, 4, 8>;
     p.pu[LUMA_4x16].luma_hpp                                = interp_horiz_pp_neon<8, 4, 16>;
     p.pu[LUMA_12x16].luma_hpp                               = interp_horiz_pp_neon<8, 12, 16>;
+
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_hps   = interp_horiz_ps_neon<4, 2, 4>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_hps   = interp_horiz_ps_neon<4, 2, 8>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hps   = interp_horiz_ps_neon<4, 4, 2>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_hps   = interp_horiz_ps_neon<4, 4, 4>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hps   = interp_horiz_ps_neon<4, 4, 8>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_hps  = interp_horiz_ps_neon<4, 4, 16>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_hps   = interp_horiz_ps_neon<4, 6, 8>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_hps = interp_horiz_ps_neon<4, 12, 16>;
+
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_hps   = interp_horiz_ps_neon<4, 2, 8>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_hps  = interp_horiz_ps_neon<4, 2, 16>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_hps   = interp_horiz_ps_neon<4, 4, 4>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_hps   = interp_horiz_ps_neon<4, 4, 8>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_hps  = interp_horiz_ps_neon<4, 4, 16>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_hps  = interp_horiz_ps_neon<4, 4, 32>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].filter_hps  = interp_horiz_ps_neon<4, 6, 16>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_hps = interp_horiz_ps_neon<4, 12, 32>;
+
+    p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_hps         = interp_horiz_ps_neon<4, 4, 4>;
+    p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_hps         = interp_horiz_ps_neon<4, 4, 8>;
+    p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_hps        = interp_horiz_ps_neon<4, 4, 16>;
+    p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_hps       = interp_horiz_ps_neon<4, 12, 16>;
+
+    p.pu[LUMA_4x4].luma_hps                                 = interp_horiz_ps_neon<8, 4, 4>;
+    p.pu[LUMA_4x8].luma_hps                                 = interp_horiz_ps_neon<8, 4, 8>;
+    p.pu[LUMA_4x16].luma_hps                                = interp_horiz_ps_neon<8, 4, 16>;
+    p.pu[LUMA_12x16].luma_hps                               = interp_horiz_ps_neon<8, 12, 16>;
+
 #endif // HIGH_BIT_DEPTH
 }
 
-- 
2.39.5 (Apple Git-154)



More information about the x265-devel mailing list