[x265] [PATCH 1/4] AArch64: Add SVE implementation of HBD interp_horiz_pp

Gerda Zsejke More gerdazsejke.more at arm.com
Tue Apr 15 09:37:02 UTC 2025


Add an SVE implementation of HBD interp_horiz_pp for luma filtering.
The SVE path is enabled for blocks of width 4 in both the 10-bit and
12-bit builds; for larger block sizes it is enabled only in the 12-bit
build.

This implementation gives up to a 9% uplift compared to the existing
Neon implementation.
---
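
Note (illustrative sketch, not part of the patch): the coeff2 template
parameter below exploits the symmetry of the half-pel luma filter
(coeffIdx 2), whose taps are { -1, 4, -11, 40, 40, -11, 4, -1 }. Since
c[i] == c[7 - i], mirrored source samples can be summed first, so each
output needs only four multiplies instead of eight. A scalar equivalent
of the computation (the helper name is ours, for exposition only):

    // dst = clamp((sum + 32) >> 6), where sum = c[0]*x[0] + ... + c[7]*x[7]
    // and IF_FILTER_PREC == 6.
    static inline uint16_t filter8_symmetric_scalar(const uint16_t *x,
                                                    const int16_t *c,
                                                    uint16_t maxVal)
    {
        int64_t sum = 0;
        for (int i = 0; i < 4; i++)
            sum += (int64_t)c[i] * (x[i] + x[7 - i]); // c[i] == c[7 - i]
        int64_t d = (sum + 32) >> 6;                  // IF_FILTER_PREC == 6
        d = d < 0 ? 0 : d;                            // unsigned saturation
        return (uint16_t)(d > maxVal ? maxVal : d);   // clamp to bit depth
    }
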
 source/common/CMakeLists.txt              |   2 +-
 source/common/aarch64/asm-primitives.cpp  |   2 +
 source/common/aarch64/filter-prim-sve.cpp | 325 ++++++++++++++++++++++
 source/common/aarch64/filter-prim-sve.h   |  37 +++
 source/common/aarch64/neon-sve-bridge.h   |  14 +
 5 files changed, 379 insertions(+), 1 deletion(-)
 create mode 100644 source/common/aarch64/filter-prim-sve.cpp
 create mode 100644 source/common/aarch64/filter-prim-sve.h
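
A note on the Neon-SVE bridge helpers added below (a summary of the SVE
instruction semantics as used here, for review convenience):
x265_sdotq_lane_s16() places fixed 128-bit Neon vectors into SVE
registers so that the indexed SDOT instruction with 64-bit accumulators
can be applied to them. Per 64-bit accumulator lane j (two lanes per
128-bit vector), with 16-bit inputs:

    acc[j] += x[4*j+0]*f[4*lane+0] + x[4*j+1]*f[4*lane+1]
            + x[4*j+2]*f[4*lane+2] + x[4*j+3]*f[4*lane+3]

so lane 0 applies filter taps 0-3 and lane 1 taps 4-7. x265_tblq_u16()
exposes a 128-bit TBL on 16-bit elements (Neon TBL only indexes bytes),
which is what builds the sliding and mirrored source windows from
dotprod_h_permute_tbl.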

diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 744fc21de..a6f56c8c8 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -107,7 +107,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
     set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp  mem-neon.h)
     set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp)
     set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp)
-    set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp)
+    set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp filter-prim-sve.h filter-prim-sve.cpp)
     set(C_SRCS_SVE2 sao-prim-sve2.cpp)
     enable_language(ASM)
 
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index c1317eb74..b295d419b 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -380,6 +380,7 @@ extern "C" {
 
 #include "pixel-prim.h"
 #include "filter-prim.h"
+#include "filter-prim-sve.h"
 #include "dct-prim.h"
 #include "loopfilter-prim.h"
 #include "intrapred-prim.h"
@@ -1075,6 +1076,7 @@ void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
 #if defined(HAVE_SVE) && HAVE_SVE_BRIDGE
     if (cpuMask & X265_CPU_SVE)
     {
+        setupFilterPrimitives_sve(p);
         setupSaoPrimitives_sve(p);
         setupDCTPrimitives_sve(p);
     }
diff --git a/source/common/aarch64/filter-prim-sve.cpp b/source/common/aarch64/filter-prim-sve.cpp
new file mode 100644
index 000000000..ddc9f3f08
--- /dev/null
+++ b/source/common/aarch64/filter-prim-sve.cpp
@@ -0,0 +1,325 @@
+/*****************************************************************************
+ * Copyright (C) 2025 MulticoreWare, Inc
+ *
+ * Authors: Gerda Zsejke More <gerdazsejke.more at arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "filter-prim-sve.h"
+#include "mem-neon.h"
+#include "neon-sve-bridge.h"
+
+#include <arm_neon.h>
+
+#if HIGH_BIT_DEPTH
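+
+// Permute indices for building 4-tap dot-product windows from the loaded
+// source vectors: the first two rows produce the forward sliding windows
+// x[i..i+3], the last two produce mirrored windows so that forward +
+// mirrored sums pair up the taps of a symmetric filter.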
+static const uint16_t dotprod_h_permute_tbl[32] = {
+    // clang-format off
+    0, 1, 2, 3, 1, 2, 3, 4,
+    2, 3, 4, 5, 3, 4, 5, 6,
+    3, 2, 1, 0, 4, 3, 2, 1,
+    5, 4, 3, 2, 6, 5, 4, 3,
+    // clang-format on
+};
+
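+// Filter four output pixels with the 8-tap luma filter. When coeff2 is
+// true the filter is symmetric (coeffIdx 2): mirrored source pairs have
+// already been summed, so only the first four taps (lane 0) are applied.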
+template<bool coeff2>
+void inline filter8_u16x4(const uint16x8_t *s, uint16x4_t &d, int16x8_t filter,
+                          uint16x4_t maxVal)
+{
+    if (coeff2)
+    {
+        int16x8_t sum01 = vreinterpretq_s16_u16(vaddq_u16(s[0], s[1]));
+        int16x8_t sum23 = vreinterpretq_s16_u16(vaddq_u16(s[2], s[3]));
+
+        int64x2_t sum_lo = x265_sdotq_lane_s16(vdupq_n_s64(0), sum01, filter, 0);
+        int64x2_t sum_hi = x265_sdotq_lane_s16(vdupq_n_s64(0), sum23, filter, 0);
+
+        int32x4_t sum = vcombine_s32(vmovn_s64(sum_lo), vmovn_s64(sum_hi));
+
+        d = vqrshrun_n_s32(sum, IF_FILTER_PREC);
+        d = vmin_u16(d, maxVal);
+    }
+    else
+    {
+        int64x2_t sum_lo =
+            x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s[0]), filter, 0);
+        int64x2_t sum_hi =
+            x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s[2]), filter, 0);
+
+        sum_lo = x265_sdotq_lane_s16(sum_lo, vreinterpretq_s16_u16(s[1]), filter, 1);
+        sum_hi = x265_sdotq_lane_s16(sum_hi, vreinterpretq_s16_u16(s[3]), filter, 1);
+
+        int32x4_t sum = vcombine_s32(vmovn_s64(sum_lo), vmovn_s64(sum_hi));
+
+        d = vqrshrun_n_s32(sum, IF_FILTER_PREC);
+        d = vmin_u16(d, maxVal);
+    }
+}
+
+template<bool coeff2>
+void inline filter8_u16x8(uint16x8_t *s, uint16x8_t &d, int16x8_t filter,
+                          uint16x8_t maxVal)
+{
+    if (coeff2)
+    {
+        int16x8_t sum01 = vreinterpretq_s16_u16(vaddq_u16(s[0], s[1]));
+        int16x8_t sum23 = vreinterpretq_s16_u16(vaddq_u16(s[2], s[3]));
+        int16x8_t sum45 = vreinterpretq_s16_u16(vaddq_u16(s[4], s[5]));
+        int16x8_t sum67 = vreinterpretq_s16_u16(vaddq_u16(s[6], s[7]));
+
+        int64x2_t sum0 = x265_sdotq_lane_s16(vdupq_n_s64(0), sum01, filter, 0);
+        int64x2_t sum1 = x265_sdotq_lane_s16(vdupq_n_s64(0), sum23, filter, 0);
+        int64x2_t sum2 = x265_sdotq_lane_s16(vdupq_n_s64(0), sum45, filter, 0);
+        int64x2_t sum3 = x265_sdotq_lane_s16(vdupq_n_s64(0), sum67, filter, 0);
+
+        int32x4_t sum_lo = vcombine_s32(vmovn_s64(sum0), vmovn_s64(sum1));
+        int32x4_t sum_hi = vcombine_s32(vmovn_s64(sum2), vmovn_s64(sum3));
+
+        uint16x4_t d_lo = vqrshrun_n_s32(sum_lo, IF_FILTER_PREC);
+        uint16x4_t d_hi = vqrshrun_n_s32(sum_hi, IF_FILTER_PREC);
+
+        d = vminq_u16(vcombine_u16(d_lo, d_hi), maxVal);
+    }
+    else
+    {
+        int64x2_t sum0 =
+            x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s[0]), filter, 0);
+        int64x2_t sum1 =
+            x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s[1]), filter, 0);
+        int64x2_t sum2 =
+            x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s[2]), filter, 0);
+        int64x2_t sum3 =
+            x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s[3]), filter, 0);
+
+        sum0 = x265_sdotq_lane_s16(sum0, vreinterpretq_s16_u16(s[4]), filter, 1);
+        sum1 = x265_sdotq_lane_s16(sum1, vreinterpretq_s16_u16(s[5]), filter, 1);
+        sum2 = x265_sdotq_lane_s16(sum2, vreinterpretq_s16_u16(s[6]), filter, 1);
+        sum3 = x265_sdotq_lane_s16(sum3, vreinterpretq_s16_u16(s[7]), filter, 1);
+
+        int32x4_t sum_lo = vcombine_s32(vmovn_s64(sum0), vmovn_s64(sum2));
+        int32x4_t sum_hi = vcombine_s32(vmovn_s64(sum1), vmovn_s64(sum3));
+
+        uint16x4_t d_lo = vqrshrun_n_s32(sum_lo, IF_FILTER_PREC);
+        uint16x4_t d_hi = vqrshrun_n_s32(sum_hi, IF_FILTER_PREC);
+
+        d = vminq_u16(vcombine_u16(d_lo, d_hi), maxVal);
+    }
+}
+
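+// Gather per-output source windows: forward windows only for a general
+// filter, interleaved forward and mirrored windows when it is symmetric.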
+template<bool coeff2>
+void inline setup_s_hpp_x4(uint16x8_t *d, uint16x8_t s0, uint16x8_t s1, uint16x8_t *idx)
+{
+    if (coeff2)
+    {
+        d[0] = x265_tblq_u16(s0, idx[0]);
+        d[1] = x265_tblq_u16(s1, idx[2]);
+        d[2] = x265_tblq_u16(s0, idx[1]);
+        d[3] = x265_tblq_u16(s1, idx[3]);
+    }
+    else
+    {
+        d[0] = x265_tblq_u16(s0, idx[0]);
+        d[1] = x265_tblq_u16(s1, idx[0]);
+        d[2] = x265_tblq_u16(s0, idx[1]);
+        d[3] = x265_tblq_u16(s1, idx[1]);
+    }
+}
+
+template<bool coeff2>
+void inline setup_s_hpp_x8(uint16x8_t *d, uint16x8_t s0, uint16x8_t s1, uint16x8_t s2,
+                           uint16x8_t *idx)
+{
+    if (coeff2)
+    {
+        d[0] = x265_tblq_u16(s0, idx[0]);
+        d[1] = x265_tblq_u16(s1, idx[2]);
+        d[2] = x265_tblq_u16(s0, idx[1]);
+        d[3] = x265_tblq_u16(s1, idx[3]);
+        d[4] = x265_tblq_u16(s1, idx[0]);
+        d[5] = x265_tblq_u16(s2, idx[2]);
+        d[6] = x265_tblq_u16(s1, idx[1]);
+        d[7] = x265_tblq_u16(s2, idx[3]);
+    }
+    else
+    {
+        d[0] = x265_tblq_u16(s0, idx[0]);
+        d[1] = x265_tblq_u16(s1, idx[0]);
+        d[2] = x265_tblq_u16(s0, idx[1]);
+        d[3] = x265_tblq_u16(s1, idx[1]);
+        d[4] = d[1];
+        d[5] = x265_tblq_u16(s2, idx[0]);
+        d[6] = d[3];
+        d[7] = x265_tblq_u16(s2, idx[1]);
+    }
+}
+
+template<bool coeff2, int width, int height>
+void inline interp8_hpp_sve(const pixel *src, intptr_t srcStride,
+                            pixel *dst, intptr_t dstStride, int coeffIdx)
+{
+    const int N_TAPS = 8;
+    const uint16x8_t maxVal = vdupq_n_u16((1 << X265_DEPTH) - 1);
+    const int16x8_t filter = vld1q_s16(X265_NS::g_lumaFilter[coeffIdx]);
+    uint16x8_t idx[4];
+
+    idx[0] = vld1q_u16(dotprod_h_permute_tbl + 0);
+    idx[1] = vld1q_u16(dotprod_h_permute_tbl + 8);
+    idx[2] = vld1q_u16(dotprod_h_permute_tbl + 16);
+    idx[3] = vld1q_u16(dotprod_h_permute_tbl + 24);
+
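+    // Point src at the first tap of the window for the first output pixel.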
+    src -= N_TAPS / 2 - 1;
+
+    for (int row = 0; row < height; row++)
+    {
+        if (width % 16 == 0 || width == 24)
+        {
+            int col = 0;
+            for (; col <= width - 16; col += 16)
+            {
+                uint16x8_t s[5];
+                load_u16x8xn<5>(src + col, 4, s);
+
+                uint16x8_t s0[N_TAPS], s1[N_TAPS];
+                setup_s_hpp_x8<coeff2>(s0, s[0], s[1], s[2], idx);
+                setup_s_hpp_x8<coeff2>(s1, s[2], s[3], s[4], idx);
+
+                uint16x8_t d0, d1;
+                filter8_u16x8<coeff2>(s0, d0, filter, maxVal);
+                filter8_u16x8<coeff2>(s1, d1, filter, maxVal);
+
+                vst1q_u16(dst + col + 0, d0);
+                vst1q_u16(dst + col + 8, d1);
+            }
+
+            if (width == 24)
+            {
+                uint16x8_t s[3];
+                load_u16x8xn<3>(src + col, 4, s);
+
+                uint16x8_t s0[N_TAPS];
+                setup_s_hpp_x8<coeff2>(s0, s[0], s[1], s[2], idx);
+
+                uint16x8_t d0;
+                filter8_u16x8<coeff2>(s0, d0, filter, maxVal);
+
+                vst1q_u16(dst + col, d0);
+            }
+        }
+        else if (width == 4)
+        {
+            uint16x8_t s[2];
+            load_u16x8xn<2>(src, 4, s);
+
+            uint16x8_t s0[N_TAPS];
+            setup_s_hpp_x4<coeff2>(s0, s[0], s[1], idx);
+
+            uint16x4_t d0;
+            filter8_u16x4<coeff2>(s0, d0, filter, vget_low_u16(maxVal));
+
+            vst1_u16(dst, d0);
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
+namespace X265_NS {
+// Declaration for use in interp8_horiz_pp_sve().
+template<int N, int width, int height>
+void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst,
+                          intptr_t dstStride, int coeffIdx);
+
+template<int width, int height>
+void interp8_horiz_pp_sve(const pixel *src, intptr_t srcStride, pixel *dst,
+                          intptr_t dstStride, int coeffIdx)
+{
+    switch (coeffIdx)
+    {
+    case 1:
+        if (width <= 16)
+        {
+            return interp_horiz_pp_neon<8, width, height>(src, srcStride, dst,
+                                                          dstStride, coeffIdx);
+        }
+        else
+        {
+            return interp8_hpp_sve<false, width, height>(src, srcStride, dst,
+                                                         dstStride, coeffIdx);
+        }
+    case 2:
+        if (width > 4)
+        {
+            return interp_horiz_pp_neon<8, width, height>(src, srcStride, dst,
+                                                          dstStride, coeffIdx);
+        }
+        else
+        {
+            return interp8_hpp_sve<true, width, height>(src, srcStride, dst,
+                                                        dstStride, coeffIdx);
+        }
+    case 3:
+        return interp_horiz_pp_neon<8, width, height>(src, srcStride, dst,
+                                                      dstStride, coeffIdx);
+    }
+}
+
+void setupFilterPrimitives_sve(EncoderPrimitives &p)
+{
+    p.pu[LUMA_4x4].luma_hpp    = interp8_horiz_pp_sve<4, 4>;
+    p.pu[LUMA_4x8].luma_hpp    = interp8_horiz_pp_sve<4, 8>;
+    p.pu[LUMA_4x16].luma_hpp   = interp8_horiz_pp_sve<4, 16>;
+#if X265_DEPTH == 12
+    p.pu[LUMA_16x4].luma_hpp   = interp8_horiz_pp_sve<16, 4>;
+    p.pu[LUMA_16x8].luma_hpp   = interp8_horiz_pp_sve<16, 8>;
+    p.pu[LUMA_16x12].luma_hpp  = interp8_horiz_pp_sve<16, 12>;
+    p.pu[LUMA_16x16].luma_hpp  = interp8_horiz_pp_sve<16, 16>;
+    p.pu[LUMA_16x32].luma_hpp  = interp8_horiz_pp_sve<16, 32>;
+    p.pu[LUMA_16x64].luma_hpp  = interp8_horiz_pp_sve<16, 64>;
+    p.pu[LUMA_24x32].luma_hpp  = interp8_horiz_pp_sve<24, 32>;
+    p.pu[LUMA_32x8].luma_hpp   = interp8_horiz_pp_sve<32, 8>;
+    p.pu[LUMA_32x16].luma_hpp  = interp8_horiz_pp_sve<32, 16>;
+    p.pu[LUMA_32x24].luma_hpp  = interp8_horiz_pp_sve<32, 24>;
+    p.pu[LUMA_32x32].luma_hpp  = interp8_horiz_pp_sve<32, 32>;
+    p.pu[LUMA_32x64].luma_hpp  = interp8_horiz_pp_sve<32, 64>;
+    p.pu[LUMA_48x64].luma_hpp  = interp8_horiz_pp_sve<48, 64>;
+    p.pu[LUMA_64x16].luma_hpp  = interp8_horiz_pp_sve<64, 16>;
+    p.pu[LUMA_64x32].luma_hpp  = interp8_horiz_pp_sve<64, 32>;
+    p.pu[LUMA_64x48].luma_hpp  = interp8_horiz_pp_sve<64, 48>;
+    p.pu[LUMA_64x64].luma_hpp  = interp8_horiz_pp_sve<64, 64>;
+#endif // X265_DEPTH == 12
+}
+} // namespace X265_NS
+#else // !HIGH_BIT_DEPTH
+namespace X265_NS {
+void setupFilterPrimitives_sve(EncoderPrimitives &)
+{
+}
+}
+#endif // HIGH_BIT_DEPTH
diff --git a/source/common/aarch64/filter-prim-sve.h b/source/common/aarch64/filter-prim-sve.h
new file mode 100644
index 000000000..382a1adbd
--- /dev/null
+++ b/source/common/aarch64/filter-prim-sve.h
@@ -0,0 +1,37 @@
+/*****************************************************************************
+ * Copyright (C) 2025 MulticoreWare, Inc
+ *
+ * Authors: Gerda Zsejke More <gerdazsejke.more at arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_FILTER_PRIM_SVE_H
+#define X265_FILTER_PRIM_SVE_H
+
+#if defined(HAVE_SVE)
+
+#include "primitives.h"
+
+namespace X265_NS {
+void setupFilterPrimitives_sve(EncoderPrimitives &p);
+}
+
+#endif // defined(HAVE_SVE)
+
+#endif // X265_FILTER_PRIM_SVE_H
diff --git a/source/common/aarch64/neon-sve-bridge.h b/source/common/aarch64/neon-sve-bridge.h
index dad5fa909..48f89ea6e 100644
--- a/source/common/aarch64/neon-sve-bridge.h
+++ b/source/common/aarch64/neon-sve-bridge.h
@@ -3,6 +3,7 @@
  *
  * Authors: Hari Limaye <hari.limaye at arm.com>
  *          Jonathan Wright <jonathan.wright at arm.com>
+ *          Gerda Zsejke More <gerdazsejke.more at arm.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -52,6 +53,19 @@ static inline int64x2_t x265_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y)
                                      svset_neonq_s16(svundef_s16(), y)));
 }
 
+// Implemented as a macro so that `lane` stays a constant expression, as
+// svdot_lane_s64() requires an immediate lane index.
+#define x265_sdotq_lane_s16(sum, s0, f, lane)                               \
+        svget_neonq_s64(svdot_lane_s64(svset_neonq_s64(svundef_s64(), sum), \
+                                       svset_neonq_s16(svundef_s16(), s0),  \
+                                       svset_neonq_s16(svundef_s16(), f), lane))
+
+static inline uint16x8_t x265_tblq_u16(uint16x8_t x, uint16x8_t idx)
+{
+    return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), x),
+                                     svset_neonq_u16(svundef_u16(), idx)));
+}
+
 static inline int8x16_t x265_sve_mask(const int x, const int endX,
                                       const int8x16_t in)
 {
-- 
2.39.5 (Apple Git-154)
