[x265] [PATCH v2 01/10] AArch64: Optimise HBD interp_horiz_pp_neon

Gerda Zsejke More gerdazsejke.more at arm.com
Tue Mar 11 19:45:11 UTC 2025


Optimise the HBD 4-tap and 8-tap Neon implementations of
interp_horiz_pp_neon and extend these functions to support all CHROMA
and LUMA block sizes respectively.

The new 4-tap filter implementation is up to 37% faster when
coeffIdx==4 and up to 20% faster for the other filter values, compared
to the existing Neon implementation.

The new 8-tap filter implementation is up to 42% faster when
coeffIdx==1, 51% faster when it is 2, and 44% faster when it is 3,
compared to the existing Neon implementation.
---
 source/common/aarch64/filter-prim.cpp | 546 +++++++++++++++++++++-----
 source/common/aarch64/mem-neon.h      |  48 ++-
 2 files changed, 497 insertions(+), 97 deletions(-)
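
For reference, a minimal scalar sketch (plain C++, not part of the patch) of
the coeff4 4-tap path: because the g_chromaFilter[4] taps { -4, 36, 36, -4 }
are all divisible by 4, the filter reduces to 9 * (s1 + s2) - (s0 + s3) with
the final right shift reduced by 2, and an offset of
1 << (IF_FILTER_PREC - 1 - 2) lets a plain (non-rounding) shift round to
nearest. The function name and bitDepth parameter are illustrative, and
IF_FILTER_PREC == 6 is assumed as in HEVC.

    #include <algorithm>
    #include <cstdint>

    // Scalar model of the coeff4 path: taps { -4, 36, 36, -4 } divided by 4
    // become { -1, 9, 9, -1 }, so only one real multiply (by 9) is needed.
    static inline uint16_t chroma4tap_coeff4_scalar(const uint16_t *s, int bitDepth)
    {
        const int IF_FILTER_PREC = 6;            // HEVC interpolation filter precision
        const int maxVal = (1 << bitDepth) - 1;

        int sum = 9 * (s[1] + s[2]) - (s[0] + s[3]);

        // Offset and shift are both reduced by 2 because the taps were divided
        // by 4; adding half the divisor makes the plain shift round to nearest.
        sum += 1 << (IF_FILTER_PREC - 1 - 2);
        sum >>= IF_FILTER_PREC - 2;

        // Clamp to the valid pixel range, mirroring vqshrun + vmin.
        return (uint16_t)std::min(std::max(sum, 0), maxVal);
    }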

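A similar scalar sketch (again illustrative only, same assumptions) of the
symmetric 8-tap coeffIdx == 2 case { -1, 4, -11, 40, 40, -11, 4, -1 }: the
symmetry lets the taps be applied to pairwise sums, and the +4/-1 taps need
only a shift and a subtraction, mirroring the vshl/vsub plus widening
multiply-accumulate sequence in filter8_u16x8.

    #include <algorithm>
    #include <cstdint>

    // Scalar model of the 8-tap coeffIdx == 2 path; s points at the first of
    // the eight input samples contributing to one output pixel.
    static inline uint16_t luma8tap_mid_scalar(const uint16_t *s, int bitDepth)
    {
        const int IF_FILTER_PREC = 6;
        const int maxVal = (1 << bitDepth) - 1;

        // Pairwise sums exploiting the symmetric taps.
        int sum07 = s[0] + s[7];
        int sum16 = s[1] + s[6];
        int sum25 = s[2] + s[5];
        int sum34 = s[3] + s[4];

        // 4 * sum16 - sum07 needs no multiply; the remaining taps do.
        int sum = (sum16 << 2) - sum07 + 40 * sum34 - 11 * sum25;

        // Rounding shift and clamp (vqrshrun + vmin in the Neon code).
        sum = (sum + (1 << (IF_FILTER_PREC - 1))) >> IF_FILTER_PREC;
        return (uint16_t)std::min(std::max(sum, 0), maxVal);
    }
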
diff --git a/source/common/aarch64/filter-prim.cpp b/source/common/aarch64/filter-prim.cpp
index 71dfc0d63..ecf0dc141 100644
--- a/source/common/aarch64/filter-prim.cpp
+++ b/source/common/aarch64/filter-prim.cpp
@@ -1,3 +1,29 @@
+/*****************************************************************************
+ * Copyright (C) 2021-2025 MulticoreWare, Inc
+ *
+ * Authors: Liwei Wang <liwei at multicorewareinc.com>
+ *          Jonathan Swinney <jswinney at amazon.com>
+ *          Hari Limaye <hari.limaye at arm.com>
+ *          Gerda Zsejke More <gerdazsejke.more at arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
 #if HAVE_NEON
 
 #include "filter-prim.h"
@@ -2049,66 +2075,172 @@ void interp8_vert_sp_neon(const int16_t *src, intptr_t srcStride, pixel *dst,
     }
 }
 
-#endif // !HIGH_BIT_DEPTH
+#else // !HIGH_BIT_DEPTH
+
+template<bool coeff4>
+void inline filter4_u16x4(const uint16x4_t *s, const uint16x4_t f,
+                          const uint32x4_t offset, const uint16x4_t maxVal,
+                          uint16x4_t &d)
+{
+    if (coeff4)
+    {
+        // { -4, 36, 36, -4 }
+        // Filter values are divisible by 4, factor that out in order to only
+        // need a multiplication by 9 and a subtraction (which is a
+        // multiplication by -1).
+        uint16x4_t sum03 = vadd_u16(s[0], s[3]);
+        uint16x4_t sum12 = vadd_u16(s[1], s[2]);
+
+        int32x4_t sum = vreinterpretq_s32_u32(vmlal_n_u16(offset, sum12, 9));
+        sum = vsubw_s16(sum, vreinterpret_s16_u16(sum03));
+
+        // We divided filter values by 4 so -2 from right shift.
+        d = vqshrun_n_s32(sum, IF_FILTER_PREC - 2);
+        d = vmin_u16(d, maxVal);
+    }
+    else
+    {
+        uint32x4_t sum = vmlsl_lane_u16(offset, s[0], f, 0);
+        sum = vmlal_lane_u16(sum, s[1], f, 1);
+        sum = vmlal_lane_u16(sum, s[2], f, 2);
+        sum = vmlsl_lane_u16(sum, s[3], f, 3);
+
+        d = vqshrun_n_s32(vreinterpretq_s32_u32(sum), IF_FILTER_PREC);
+        d = vmin_u16(d, maxVal);
+    }
 }
 
-namespace X265_NS
+template<bool coeff4>
+void inline filter4_u16x8(const uint16x8_t *s, const uint16x4_t f,
+                          const uint32x4_t offset, const uint16x8_t maxVal,
+                          uint16x8_t &d)
 {
+    if (coeff4)
+    {
+        // { -4, 36, 36, -4 }
+        // Filter values are divisible by 4, factor that out in order to only
+        // need a multiplication by 9 and a subtraction (which is a
+        // multiplication by -1).
+        uint16x8_t sum03 = vaddq_u16(s[0], s[3]);
+        uint16x8_t sum12 = vaddq_u16(s[1], s[2]);
+
+        int32x4_t sum_lo = vreinterpretq_s32_u32(
+            vmlal_n_u16(offset, vget_low_u16(sum12), 9));
+        int32x4_t sum_hi = vreinterpretq_s32_u32(
+            vmlal_n_u16(offset, vget_high_u16(sum12), 9));
+        sum_lo = vsubw_s16(sum_lo, vreinterpret_s16_u16(vget_low_u16(sum03)));
+        sum_hi = vsubw_s16(sum_hi, vreinterpret_s16_u16(vget_high_u16(sum03)));
+
+        // We divided filter values by 4 so -2 from right shift.
+        uint16x4_t d0 = vqshrun_n_s32(sum_lo, IF_FILTER_PREC - 2);
+        uint16x4_t d1 = vqshrun_n_s32(sum_hi, IF_FILTER_PREC - 2);
+        d = vminq_u16(vcombine_u16(d0, d1), maxVal);
+    }
+    else
+    {
+        uint32x4_t sum_lo = vmlsl_lane_u16(offset, vget_low_u16(s[0]), f, 0);
+        sum_lo = vmlal_lane_u16(sum_lo, vget_low_u16(s[1]), f, 1);
+        sum_lo = vmlal_lane_u16(sum_lo, vget_low_u16(s[2]), f, 2);
+        sum_lo = vmlsl_lane_u16(sum_lo, vget_low_u16(s[3]), f, 3);
 
-#if HIGH_BIT_DEPTH
-#define SHIFT_INTERP_PS (IF_FILTER_PREC - (IF_INTERNAL_PREC - X265_DEPTH))
-#endif
+        uint32x4_t sum_hi = vmlsl_lane_u16(offset, vget_high_u16(s[0]), f, 0);
+        sum_hi = vmlal_lane_u16(sum_hi, vget_high_u16(s[1]), f, 1);
+        sum_hi = vmlal_lane_u16(sum_hi, vget_high_u16(s[2]), f, 2);
+        sum_hi = vmlsl_lane_u16(sum_hi, vget_high_u16(s[3]), f, 3);
 
-template<int width, int height>
-void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
+        uint16x4_t d0 = vqshrun_n_s32(vreinterpretq_s32_u32(sum_lo),
+                                      IF_FILTER_PREC);
+        uint16x4_t d1 = vqshrun_n_s32(vreinterpretq_s32_u32(sum_hi),
+                                      IF_FILTER_PREC);
+        d = vminq_u16(vcombine_u16(d0, d1), maxVal);
+    }
+}
+
+template<bool coeff4, int width, int height>
+void inline interp4_horiz_pp_neon(const pixel *src, intptr_t srcStride,
+                                  pixel *dst, intptr_t dstStride,
+                                  const int16_t coeffIdx)
 {
-    const int shift = IF_INTERNAL_PREC - X265_DEPTH;
-    const int16x8_t off = vdupq_n_s16(IF_INTERNAL_OFFS);
-    for (int row = 0; row < height; row++)
+    const int N_TAPS = 4;
+    const uint16x8_t maxVal = vdupq_n_u16((1 << X265_DEPTH) - 1);
+    const uint16x4_t filter = vreinterpret_u16_s16(
+        vabs_s16(vld1_s16(X265_NS::g_chromaFilter[coeffIdx])));
+    uint32x4_t offset;
+
+    // A shim of 1 << (IF_FILTER_PREC - 1) enables us to use non-rounding
+    // shifts - which are generally faster than rounding shifts on modern CPUs.
+    if (coeff4)
     {
+        // The outermost -2 is needed because we will divide the filter values by 4.
+        offset = vdupq_n_u32(1 << (IF_FILTER_PREC - 1 - 2));
+    }
+    else
+    {
+        offset = vdupq_n_u32(1 << (IF_FILTER_PREC - 1));
+    }
 
-        int col = 0;
-        for (; col + 8 <= width; col += 8)
+    src -= N_TAPS / 2 - 1;
+
+    for (int row = 0; row < height; row++)
+    {
+        if (width % 16 == 0)
         {
-            uint16x8_t in;
+            for (int col = 0; col < width; col += 16)
+            {
+                uint16x8_t s0[N_TAPS], s1[N_TAPS];
+                load_u16x8xn<4>(src + col + 0, 1, s0);
+                load_u16x8xn<4>(src + col + 8, 1, s1);
 
-#if HIGH_BIT_DEPTH
-            in = vld1q_u16(src + col);
-#else
-            in = vmovl_u8(vld1_u8(src + col));
-#endif
+                uint16x8_t d0, d1;
+                filter4_u16x8<coeff4>(s0, filter, offset, maxVal, d0);
+                filter4_u16x8<coeff4>(s1, filter, offset, maxVal, d1);
 
-            int16x8_t tmp = vreinterpretq_s16_u16(vshlq_n_u16(in, shift));
-            tmp = vsubq_s16(tmp, off);
-            vst1q_s16(dst + col, tmp);
+                vst1q_u16(dst + col + 0, d0);
+                vst1q_u16(dst + col + 8, d1);
+            }
         }
-        for (; col + 4 <= width; col += 4)
+        else
         {
-            uint16x4_t in;
+            int col = 0;
+            for (; col + 8 <= width; col += 8)
+            {
+                uint16x8_t s0[N_TAPS];
+                load_u16x8xn<4>(src + col, 1, s0);
 
-#if HIGH_BIT_DEPTH
-            in = vld1_u16(src + col);
-#else
-            in = vget_low_u16(vmovl_u8(vld1_u8(src + col)));
-#endif
+                uint16x8_t d0;
+                filter4_u16x8<coeff4>(s0, filter, offset, maxVal, d0);
 
-            int16x4_t tmp = vreinterpret_s16_u16(vshl_n_u16(in, shift));
-            tmp = vsub_s16(tmp, vget_low_s16(off));
-            vst1_s16(dst + col, tmp);
-        }
-        for (; col < width; col += 2)
-        {
-            uint16x4_t in;
+                vst1q_u16(dst + col, d0);
+            }
 
-#if HIGH_BIT_DEPTH
-            in = vld1_u16(src + col);
-#else
-            in = vget_low_u16(vmovl_u8(vld1_u8(src + col)));
-#endif
+            if (width == 6)
+            {
+                uint16x8_t s0[N_TAPS];
+                load_u16x8xn<4>(src, 1, s0);
 
-            int16x4_t tmp = vreinterpret_s16_u16(vshl_n_u16(in, shift));
-            tmp = vsub_s16(tmp, vget_low_s16(off));
-            store_s16x2xn<1>(dst + col, dstStride, &tmp);
+                uint16x8_t d0;
+                filter4_u16x8<coeff4>(s0, filter, offset, maxVal, d0);
+
+                store_u16x6xn<1>(dst, dstStride, &d0);
+            }
+            else if (width % 8 != 0)
+            {
+                uint16x4_t s0[N_TAPS];
+                load_u16x4xn<4>(src + col, 1, s0);
+
+                uint16x4_t d0;
+                filter4_u16x4<coeff4>(s0, filter, offset,
+                                      vget_low_u16(maxVal), d0);
+
+                if (width == 2)
+                {
+                    store_u16x2xn<1>(dst + col, dstStride, &d0);
+                }
+                else
+                {
+                    vst1_u16(dst + col, d0);
+                }
+            }
         }
 
         src += srcStride;
@@ -2116,73 +2248,205 @@ void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
     }
 }
 
-#if HIGH_BIT_DEPTH
-template<int N, int width, int height>
-void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+template<int coeffIdx>
+void inline filter8_u16x4(const uint16x4_t *s, uint16x4_t &d,
+                          uint16x8_t filter, uint16x4_t maxVal)
 {
-    const int16_t *coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
-    int headRoom = IF_FILTER_PREC;
-    int offset = (1 << (headRoom - 1));
-    uint16_t maxVal = (1 << X265_DEPTH) - 1;
-    int cStride = 1;
+    if (coeffIdx == 1)
+    {
+        // { -1, 4, -10, 58, 17, -5, 1, 0 }
+        uint16x4_t sum0156 = vsub_u16(s[6], s[0]);
+        sum0156 = vmla_laneq_u16(sum0156, s[1], filter, 1);
+        sum0156 = vmls_laneq_u16(sum0156, s[5], filter, 5);
 
-    src -= (N / 2 - 1) * cStride;
-    int16x8_t vc = vld1q_s16(coeff);
-    int16x4_t low_vc = vget_low_s16(vc);
-    int16x4_t high_vc = vget_high_s16(vc);
+        uint32x4_t sum234 = vmull_laneq_u16(s[3], filter, 3);
+        sum234 = vmlsl_laneq_u16(sum234, s[2], filter, 2);
+        sum234 = vmlal_laneq_u16(sum234, s[4], filter, 4);
 
-    const int32x4_t voffset = vdupq_n_s32(offset);
-    const int32x4_t vhr = vdupq_n_s32(-headRoom);
+        int32x4_t sum = vaddw_s16(vreinterpretq_s32_u32(sum234),
+                                  vreinterpret_s16_u16(sum0156));
 
-    int row, col;
-    for (row = 0; row < height; row++)
+        d = vqrshrun_n_s32(sum, IF_FILTER_PREC);
+        d = vmin_u16(d, maxVal);
+    }
+    else if (coeffIdx == 2)
     {
-        for (col = 0; col < width; col += 8)
-        {
-            int32x4_t vsum1, vsum2;
+        // { -1, 4, -11, 40, 40, -11, 4, -1 }
+        uint16x4_t sum07 = vadd_u16(s[0], s[7]);
+        uint16x4_t sum16 = vadd_u16(s[1], s[6]);
+        uint16x4_t sum25 = vadd_u16(s[2], s[5]);
+        uint16x4_t sum34 = vadd_u16(s[3], s[4]);
 
-            int16x8_t input[N];
+        uint16x4_t sum0167 = vshl_n_u16(sum16, 2);
+        sum0167 = vsub_u16(sum0167, sum07);
 
-            for (int i = 0; i < N; i++)
-            {
-                input[i] = vreinterpretq_s16_u16(vld1q_u16(src + col + i));
-            }
-            vsum1 = voffset;
-            vsum2 = voffset;
+        uint32x4_t sum2345 = vmull_laneq_u16(sum34, filter, 3);
+        sum2345 = vmlsl_laneq_u16(sum2345, sum25, filter, 2);
 
-            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[0]), low_vc, 0);
-            vsum2 = vmlal_high_lane_s16(vsum2, input[0], low_vc, 0);
+        int32x4_t sum = vaddw_s16(vreinterpretq_s32_u32(sum2345),
+                                  vreinterpret_s16_u16(sum0167));
 
-            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[1]), low_vc, 1);
-            vsum2 = vmlal_high_lane_s16(vsum2, input[1], low_vc, 1);
+        d = vqrshrun_n_s32(sum, IF_FILTER_PREC);
+        d = vmin_u16(d, maxVal);
+    }
+    else
+    {
+        // { 0, 1, -5, 17, 58, -10, 4, -1 }
+        uint16x4_t sum1267 = vsub_u16(s[1], s[7]);
+        sum1267 = vmls_laneq_u16(sum1267, s[2], filter, 2);
+        sum1267 = vmla_laneq_u16(sum1267, s[6], filter, 6);
 
-            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[2]), low_vc, 2);
-            vsum2 = vmlal_high_lane_s16(vsum2, input[2], low_vc, 2);
+        uint32x4_t sum345 = vmull_laneq_u16(s[3], filter, 3);
+        sum345 = vmlal_laneq_u16(sum345, s[4], filter, 4);
+        sum345 = vmlsl_laneq_u16(sum345, s[5], filter, 5);
 
-            vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[3]), low_vc, 3);
-            vsum2 = vmlal_high_lane_s16(vsum2, input[3], low_vc, 3);
+        int32x4_t sum = vaddw_s16(vreinterpretq_s32_u32(sum345),
+                                  vreinterpret_s16_u16(sum1267));
 
-            if (N == 8)
+        d = vqrshrun_n_s32(sum, IF_FILTER_PREC);
+        d = vmin_u16(d, maxVal);
+    }
+}
+
+template<int coeffIdx>
+void inline filter8_u16x8(const uint16x8_t *s, uint16x8_t &d, uint16x8_t filter,
+                          uint16x8_t maxVal)
+{
+    if (coeffIdx == 1)
+    {
+        // { -1, 4, -10, 58, 17, -5, 1, 0 }
+        uint16x8_t sum0156 = vsubq_u16(s[6], s[0]);
+        sum0156 = vmlaq_laneq_u16(sum0156, s[1], filter, 1);
+        sum0156 = vmlsq_laneq_u16(sum0156, s[5], filter, 5);
+
+        uint32x4_t sum234_lo = vmull_laneq_u16(vget_low_u16(s[3]), filter, 3);
+        sum234_lo = vmlsl_laneq_u16(sum234_lo, vget_low_u16(s[2]), filter, 2);
+        sum234_lo = vmlal_laneq_u16(sum234_lo, vget_low_u16(s[4]), filter, 4);
+
+        uint32x4_t sum234_hi = vmull_laneq_u16(vget_high_u16(s[3]), filter, 3);
+        sum234_hi = vmlsl_laneq_u16(sum234_hi, vget_high_u16(s[2]), filter, 2);
+        sum234_hi = vmlal_laneq_u16(sum234_hi, vget_high_u16(s[4]), filter, 4);
+
+        int32x4_t sum_lo = vaddw_s16(vreinterpretq_s32_u32(sum234_lo),
+                                     vget_low_s16(vreinterpretq_s16_u16(sum0156)));
+        int32x4_t sum_hi = vaddw_s16(vreinterpretq_s32_u32(sum234_hi),
+                                     vget_high_s16(vreinterpretq_s16_u16(sum0156)));
+
+        uint16x4_t d_lo = vqrshrun_n_s32(sum_lo, IF_FILTER_PREC);
+        uint16x4_t d_hi = vqrshrun_n_s32(sum_hi, IF_FILTER_PREC);
+        d = vminq_u16(vcombine_u16(d_lo, d_hi), maxVal);
+    }
+    else if (coeffIdx == 2)
+    {
+        // { -1, 4, -11, 40, 40, -11, 4, -1 }
+        uint16x8_t sum07 = vaddq_u16(s[0], s[7]);
+        uint16x8_t sum16 = vaddq_u16(s[1], s[6]);
+        uint16x8_t sum25 = vaddq_u16(s[2], s[5]);
+        uint16x8_t sum34 = vaddq_u16(s[3], s[4]);
+
+        uint16x8_t sum0167 = vshlq_n_u16(sum16, 2);
+        sum0167 = vsubq_u16(sum0167, sum07);
+
+        uint32x4_t sum2345_lo = vmull_laneq_u16(vget_low_u16(sum34),
+                                                filter, 3);
+        sum2345_lo = vmlsl_laneq_u16(sum2345_lo, vget_low_u16(sum25),
+                                     filter, 2);
+
+        uint32x4_t sum2345_hi = vmull_laneq_u16(vget_high_u16(sum34),
+                                                filter, 3);
+        sum2345_hi = vmlsl_laneq_u16(sum2345_hi, vget_high_u16(sum25),
+                                     filter, 2);
+
+        int32x4_t sum_lo = vaddw_s16(vreinterpretq_s32_u32(sum2345_lo),
+                                     vget_low_s16(vreinterpretq_s16_u16(sum0167)));
+        int32x4_t sum_hi = vaddw_s16(vreinterpretq_s32_u32(sum2345_hi),
+                                     vget_high_s16(vreinterpretq_s16_u16(sum0167)));
+
+        uint16x4_t d_lo = vqrshrun_n_s32(sum_lo, IF_FILTER_PREC);
+        uint16x4_t d_hi = vqrshrun_n_s32(sum_hi, IF_FILTER_PREC);
+        d = vminq_u16(vcombine_u16(d_lo, d_hi), maxVal);
+    }
+    else
+    {
+        // { 0, 1, -5, 17, 58, -10, 4, -1 }
+        uint16x8_t sum1267 = vsubq_u16(s[1], s[7]);
+        sum1267 = vmlsq_laneq_u16(sum1267, s[2], filter, 2);
+        sum1267 = vmlaq_laneq_u16(sum1267, s[6], filter, 6);
+
+        uint32x4_t sum345_lo = vmull_laneq_u16(vget_low_u16(s[3]), filter, 3);
+        sum345_lo = vmlal_laneq_u16(sum345_lo, vget_low_u16(s[4]), filter, 4);
+        sum345_lo = vmlsl_laneq_u16(sum345_lo, vget_low_u16(s[5]), filter, 5);
+
+        uint32x4_t sum345_hi = vmull_laneq_u16(vget_high_u16(s[3]), filter, 3);
+        sum345_hi = vmlal_laneq_u16(sum345_hi, vget_high_u16(s[4]), filter, 4);
+        sum345_hi = vmlsl_laneq_u16(sum345_hi, vget_high_u16(s[5]), filter, 5);
+
+        int32x4_t sum_lo = vaddw_s16(vreinterpretq_s32_u32(sum345_lo),
+                                     vget_low_s16(vreinterpretq_s16_u16(sum1267)));
+        int32x4_t sum_hi = vaddw_s16(vreinterpretq_s32_u32(sum345_hi),
+                                     vget_high_s16(vreinterpretq_s16_u16(sum1267)));
+
+        uint16x4_t d_lo = vqrshrun_n_s32(sum_lo, IF_FILTER_PREC);
+        uint16x4_t d_hi = vqrshrun_n_s32(sum_hi, IF_FILTER_PREC);
+
+        d = vminq_u16(vcombine_u16(d_lo, d_hi), maxVal);
+    }
+}
+
+template<int coeffIdx, int width, int height>
+void inline interp8_horiz_pp_neon(const pixel *src, intptr_t srcStride,
+                                  pixel *dst, intptr_t dstStride)
+{
+    const int N_TAPS = 8;
+    const uint16x8_t maxVal = vdupq_n_u16((1 << X265_DEPTH) - 1);
+
+    const uint16x8_t filter =
+        vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(X265_NS::g_lumaFilter[coeffIdx])));
+
+    src -= N_TAPS / 2 - 1;
+
+    for (int row = 0; row < height; row++)
+    {
+        if (width % 16 == 0)
+        {
+            for (int col = 0; col < width; col += 16)
             {
-                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[4]), high_vc, 0);
-                vsum2 = vmlal_high_lane_s16(vsum2, input[4], high_vc, 0);
-                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[5]), high_vc, 1);
-                vsum2 = vmlal_high_lane_s16(vsum2, input[5], high_vc, 1);
-                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[6]), high_vc, 2);
-                vsum2 = vmlal_high_lane_s16(vsum2, input[6], high_vc, 2);
-                vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[7]), high_vc, 3);
-                vsum2 = vmlal_high_lane_s16(vsum2, input[7], high_vc, 3);
+                uint16x8_t s0[N_TAPS], s1[N_TAPS];
+                load_u16x8xn<8>(src + col + 0, 1, s0);
+                load_u16x8xn<8>(src + col + 8, 1, s1);
+
+                uint16x8_t d0, d1;
+                filter8_u16x8<coeffIdx>(s0, d0, filter, maxVal);
+                filter8_u16x8<coeffIdx>(s1, d1, filter, maxVal);
+
+                vst1q_u16(dst + col + 0, d0);
+                vst1q_u16(dst + col + 8, d1);
+            }
+        }
+        else
+        {
+            int col = 0;
+            for (; col + 8 <= width; col += 8)
+            {
+                uint16x8_t s0[N_TAPS];
+                load_u16x8xn<8>(src + col, 1, s0);
+
+                uint16x8_t d0;
+                filter8_u16x8<coeffIdx>(s0, d0, filter, maxVal);
 
+                vst1q_u16(dst + col, d0);
             }
 
-            vsum1 = vshlq_s32(vsum1, vhr);
-            vsum2 = vshlq_s32(vsum2, vhr);
+            if (width % 8 == 4)
+            {
+                uint16x4_t s0[N_TAPS];
+                load_u16x4xn<8>(src + col, 1, s0);
 
-            int16x8_t vsum = vuzp1q_s16(vreinterpretq_s16_s32(vsum1),
-                                        vreinterpretq_s16_s32(vsum2));
-            vsum = vminq_s16(vsum, vdupq_n_s16(maxVal));
-            vsum = vmaxq_s16(vsum, vdupq_n_s16(0));
-            vst1q_u16(dst + col, vreinterpretq_u16_s16(vsum));
+                uint16x4_t d0;
+                filter8_u16x4<coeffIdx>(s0, d0, filter, vget_low_u16(maxVal));
+
+                vst1_u16(dst + col, d0);
+            }
         }
 
         src += srcStride;
@@ -2190,7 +2454,16 @@ void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intp
     }
 }
 
-#else // HIGH_BIT_DEPTH
+#endif // !HIGH_BIT_DEPTH
+}
+
+namespace X265_NS
+{
+
+#if HIGH_BIT_DEPTH
+#define SHIFT_INTERP_PS (IF_FILTER_PREC - (IF_INTERNAL_PREC - X265_DEPTH))
+#endif
+
 template<int N, int width, int height>
 void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst,
                           intptr_t dstStride, int coeffIdx)
@@ -2226,8 +2499,6 @@ void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst,
     }
 }
 
-#endif // HIGH_BIT_DEPTH
-
 #if HIGH_BIT_DEPTH
 
 template<int N, int width, int height>
@@ -2676,11 +2947,64 @@ void interp_hv_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intptr_
     interp_vert_sp_neon<N, width, height>(immed + (N / 2 - 1) * width, width, dst, dstStride, idxY);
 }
 
+template<int width, int height>
+void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
+{
+    const int shift = IF_INTERNAL_PREC - X265_DEPTH;
+    const int16x8_t off = vdupq_n_s16(IF_INTERNAL_OFFS);
 
+    for (int row = 0; row < height; row++)
+    {
+        int col = 0;
+        for (; col + 8 <= width; col += 8)
+        {
+            uint16x8_t in;
 
+#if HIGH_BIT_DEPTH
+            in = vld1q_u16(src + col);
+#else
+            in = vmovl_u8(vld1_u8(src + col));
+#endif
 
+            int16x8_t tmp = vreinterpretq_s16_u16(vshlq_n_u16(in, shift));
+            tmp = vsubq_s16(tmp, off);
+            vst1q_s16(dst + col, tmp);
+        }
 
+        for (; col + 4 <= width; col += 4)
+        {
+            uint16x4_t in;
+
+#if HIGH_BIT_DEPTH
+            in = vld1_u16(src + col);
+#else
+            in = vget_low_u16(vmovl_u8(vld1_u8(src + col)));
+#endif
+
+            int16x4_t tmp = vreinterpret_s16_u16(vshl_n_u16(in, shift));
+            tmp = vsub_s16(tmp, vget_low_s16(off));
+            vst1_s16(dst + col, tmp);
+        }
+
+        for (; col < width; col += 2)
+        {
+            uint16x4_t in;
+
+#if HIGH_BIT_DEPTH
+            in = vld1_u16(src + col);
+#else
+            in = vget_low_u16(vmovl_u8(vld1_u8(src + col)));
+#endif
 
+            int16x4_t tmp = vreinterpret_s16_u16(vshl_n_u16(in, shift));
+            tmp = vsub_s16(tmp, vget_low_s16(off));
+            store_s16x2xn<1>(dst + col, dstStride, &tmp);
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
 
 #define CHROMA_420(W, H) \
     p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hpp = interp_horiz_pp_neon<4, W, H>; \
@@ -2834,6 +3158,36 @@ void setupFilterPrimitives_neon(EncoderPrimitives &p)
     CHROMA_444(64, 32);
     CHROMA_444(64, 48);
     CHROMA_444(64, 64);
+
+#if HIGH_BIT_DEPTH
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_hpp   = interp_horiz_pp_neon<4, 2, 4>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_hpp   = interp_horiz_pp_neon<4, 2, 8>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hpp   = interp_horiz_pp_neon<4, 4, 2>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_hpp   = interp_horiz_pp_neon<4, 4, 4>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hpp   = interp_horiz_pp_neon<4, 4, 8>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_hpp  = interp_horiz_pp_neon<4, 4, 16>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_hpp   = interp_horiz_pp_neon<4, 6, 8>;
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_hpp = interp_horiz_pp_neon<4, 12, 16>;
+
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_hpp   = interp_horiz_pp_neon<4, 2, 8>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_hpp  = interp_horiz_pp_neon<4, 2, 16>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_hpp   = interp_horiz_pp_neon<4, 4, 4>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_hpp   = interp_horiz_pp_neon<4, 4, 8>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_hpp  = interp_horiz_pp_neon<4, 4, 16>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_hpp  = interp_horiz_pp_neon<4, 4, 32>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].filter_hpp  = interp_horiz_pp_neon<4, 6, 16>;
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_hpp = interp_horiz_pp_neon<4, 12, 32>;
+
+    p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_hpp         = interp_horiz_pp_neon<4, 4, 4>;
+    p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_hpp         = interp_horiz_pp_neon<4, 4, 8>;
+    p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_hpp        = interp_horiz_pp_neon<4, 4, 16>;
+    p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_hpp       = interp_horiz_pp_neon<4, 12, 16>;
+
+    p.pu[LUMA_4x4].luma_hpp                                 = interp_horiz_pp_neon<8, 4, 4>;
+    p.pu[LUMA_4x8].luma_hpp                                 = interp_horiz_pp_neon<8, 4, 8>;
+    p.pu[LUMA_4x16].luma_hpp                                = interp_horiz_pp_neon<8, 4, 16>;
+    p.pu[LUMA_12x16].luma_hpp                               = interp_horiz_pp_neon<8, 12, 16>;
+#endif // HIGH_BIT_DEPTH
 }
 
 };
diff --git a/source/common/aarch64/mem-neon.h b/source/common/aarch64/mem-neon.h
index 90788a938..2c6edfccb 100644
--- a/source/common/aarch64/mem-neon.h
+++ b/source/common/aarch64/mem-neon.h
@@ -1,7 +1,8 @@
 /*****************************************************************************
- * Copyright (C) 2024 MulticoreWare, Inc
+ * Copyright (C) 2024-2025 MulticoreWare, Inc
  *
  * Authors: Hari Limaye <hari.limaye at arm.com>
+ *          Gerda Zsejke More <gerdazsejke.more at arm.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -185,6 +186,51 @@ static void inline load_s16x8xn(const int16_t *src, const intptr_t stride,
     }
 }
 
+template<int N>
+static void inline load_u16x4xn(const uint16_t *src, const intptr_t stride,
+                                uint16x4_t *dst)
+{
+    for (int i = 0; i < N; ++i)
+    {
+        dst[i] = vld1_u16(src);
+        src += stride;
+    }
+}
+
+template<int N>
+static void inline load_u16x8xn(const uint16_t *src, const intptr_t stride,
+                                uint16x8_t *dst)
+{
+    for (int i = 0; i < N; ++i)
+    {
+        dst[i] = vld1q_u16(src);
+        src += stride;
+    }
+}
+
+template<int N>
+static void inline store_u16x2xn(uint16_t *dst, intptr_t dst_stride,
+                                 const uint16x4_t *src)
+{
+    for (int i = 0; i < N; ++i)
+    {
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u16(src[i]), 0);
+        dst += dst_stride;
+    }
+}
+
+template<int N>
+static void inline store_u16x6xn(uint16_t *dst, intptr_t dst_stride,
+                                 const uint16x8_t *src)
+{
+    for (int i = 0; i < N; ++i)
+    {
+        vst1_u16(dst, vget_low_u16(src[i]));
+        vst1q_lane_u32((uint32_t *)(dst + 4), vreinterpretq_u32_u16(src[i]), 2);
+        dst += dst_stride;
+    }
+}
+
 template<int N>
 static void inline store_s16x2xn(int16_t *dst, intptr_t dst_stride,
                                  const int16x4_t *src)
-- 
2.39.5 (Apple Git-154)
