[x265] [PATCH 2/4] AArch64: Add SVE implementation of HBD interp_horiz_ps

Tue Apr 15 09:37:22 UTC 2025

Add SVE implementation of HBD interp_horiz_ps for block sizes of
width equal to 4 for LUMA filtering.

This implementation gives up to 5% uplift compared to the existing
Neon implementation.
---
 source/common/aarch64/filter-prim-sve.cpp | 87 +++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/source/common/aarch64/filter-prim-sve.cpp b/source/common/aarch64/filter-prim-sve.cpp
index ddc9f3f08..b710cd238 100644
--- a/source/common/aarch64/filter-prim-sve.cpp
+++ b/source/common/aarch64/filter-prim-sve.cpp
@@ -28,6 +28,8 @@
 #include <arm_neon.h>
 
 #if HIGH_BIT_DEPTH
+#define SHIFT_INTERP_PS (IF_FILTER_PREC - (IF_INTERNAL_PREC - X265_DEPTH))
+
 static const uint16_t dotprod_h_permute_tbl[32] = {
     // clang-format off
     0, 1, 2, 3, 1, 2, 3, 4,
@@ -239,6 +241,64 @@ void inline interp8_hpp_sve(const pixel *src, intptr_t srcStride,
     }
 }
 
+void inline filter8_ps_u16x4(const uint16x8_t *s, int16x4_t &d, int16x8_t filter,
+                             int64x2_t offset)
+{
+    int16x8_t sum01 = vreinterpretq_s16_u16(vaddq_u16(s[0], s[1]));
+    int16x8_t sum23 = vreinterpretq_s16_u16(vaddq_u16(s[2], s[3]));
+
+    int64x2_t sum_lo = x265_sdotq_lane_s16(offset, sum01, filter, 0);
+    int64x2_t sum_hi = x265_sdotq_lane_s16(offset, sum23, filter, 0);
+
+    int32x4_t sum = vcombine_s32(vmovn_s64(sum_lo), vmovn_s64(sum_hi));
+
+    d = vshrn_n_s32(sum, SHIFT_INTERP_PS);
+}
+
+template<int width, int height>
+void inline interp8_hps_sve(const pixel *src, intptr_t srcStride,
+                            int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+{
+    const int N_TAPS = 8;
+    int blkheight = height;
+    const int16x8_t filter = vld1q_s16(X265_NS::g_lumaFilter[coeffIdx]);
+    const int64x2_t offset =
+        vdupq_n_s64((unsigned)-IF_INTERNAL_OFFS << SHIFT_INTERP_PS);
+
+    uint16x8_t idx[4];
+
+    idx[0] = vld1q_u16(dotprod_h_permute_tbl + 0);
+    idx[1] = vld1q_u16(dotprod_h_permute_tbl + 8);
+    idx[2] = vld1q_u16(dotprod_h_permute_tbl + 16);
+    idx[3] = vld1q_u16(dotprod_h_permute_tbl + 24);
+
+    if (isRowExt)
+    {
+        src -= (N_TAPS / 2 - 1) * srcStride;
+        blkheight += N_TAPS - 1;
+    }
+
+    src -= N_TAPS / 2 - 1;
+
+    for (int row = 0; row < blkheight; row++)
+    {
+        uint16x8_t s[2];
+        s[0] = vld1q_u16(src);
+        s[1] = vld1q_u16(src + 4);
+
+        uint16x8_t s0[N_TAPS];
+        setup_s_hpp_x4<true>(s0, s[0], s[1], idx);
+
+        int16x4_t d0;
+        filter8_ps_u16x4(s0, d0, filter, offset);
+
+        vst1_s16(dst, d0);
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
 namespace X265_NS {
 // Declaration for use in interp8_horiz_pp_sve().
 template<int N, int width, int height>
@@ -279,6 +339,29 @@ void interp8_horiz_pp_sve(const pixel *src, intptr_t srcStride, pixel *dst,
     }
 }
 
+// Declaration for use in interp8_horiz_ps_sve().
+template<int N, int width, int height>
+void interp_horiz_ps_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
+                          intptr_t dstStride, int coeffIdx, int isRowExt);
+
+template<int width, int height>
+void interp8_horiz_ps_sve(const pixel *src, intptr_t srcStride, int16_t *dst,
+                          intptr_t dstStride, int coeffIdx, int isRowExt)
+{
+    switch (coeffIdx)
+    {
+    case 1:
+        return interp_horiz_ps_neon<8, width, height>(src, srcStride, dst, dstStride,
+                                                      coeffIdx, isRowExt);
+    case 2:
+        return interp8_hps_sve<width, height>(src, srcStride, dst, dstStride,
+                                              coeffIdx, isRowExt);
+    case 3:
+        return interp_horiz_ps_neon<8, width, height>(src, srcStride, dst, dstStride,
+                                                      coeffIdx, isRowExt);
+    }
+}
+
 void setupFilterPrimitives_sve(EncoderPrimitives &p)
 {
     p.pu[LUMA_4x4].luma_hpp    = interp8_horiz_pp_sve<4, 4>;
@@ -303,6 +386,10 @@ void setupFilterPrimitives_sve(EncoderPrimitives &p)
     p.pu[LUMA_64x48].luma_hpp  = interp8_horiz_pp_sve<64, 48>;
     p.pu[LUMA_64x64].luma_hpp  = interp8_horiz_pp_sve<64, 64>;
 #endif // X265_DEPTH == 12
+
+    p.pu[LUMA_4x4].luma_hps   = interp8_horiz_ps_sve<4, 4>;
+    p.pu[LUMA_4x8].luma_hps   = interp8_horiz_ps_sve<4, 8>;
+    p.pu[LUMA_4x16].luma_hps  = interp8_horiz_ps_sve<4, 16>;
 }
 } // namespace X265_NS
 #else // !HIGH_BIT_DEPTH
-- 
2.39.5 (Apple Git-154)

-------------- next part --------------
>From daa0a0ddbefd544f89fd70b83f78d1bd8f516bd7 Mon Sep 17 00:00:00 2001
Message-Id: <daa0a0ddbefd544f89fd70b83f78d1bd8f516bd7.1744709613.git.gerdazsejke.more at arm.com>
In-Reply-To: <cover.1744709613.git.gerdazsejke.more at arm.com>
References: <cover.1744709613.git.gerdazsejke.more at arm.com>
From: Gerda Zsejke More <gerdazsejke.more at arm.com>
Date: Thu, 20 Mar 2025 16:06:00 +0100
Subject: [PATCH 2/4] AArch64: Add SVE implementation of HBD interp_horiz_ps

Add SVE implementation of HBD interp_horiz_ps for block sizes of
width equal to 4 for LUMA filtering.

This implementation gives up to 5% uplift compared to the existing
Neon implementation.
---
 source/common/aarch64/filter-prim-sve.cpp | 87 +++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/source/common/aarch64/filter-prim-sve.cpp b/source/common/aarch64/filter-prim-sve.cpp
index ddc9f3f08..b710cd238 100644
--- a/source/common/aarch64/filter-prim-sve.cpp
+++ b/source/common/aarch64/filter-prim-sve.cpp
@@ -28,6 +28,8 @@
 #include <arm_neon.h>
 
 #if HIGH_BIT_DEPTH
+#define SHIFT_INTERP_PS (IF_FILTER_PREC - (IF_INTERNAL_PREC - X265_DEPTH))
+
 static const uint16_t dotprod_h_permute_tbl[32] = {
     // clang-format off
     0, 1, 2, 3, 1, 2, 3, 4,
@@ -239,6 +241,64 @@ void inline interp8_hpp_sve(const pixel *src, intptr_t srcStride,
     }
 }
 
+void inline filter8_ps_u16x4(const uint16x8_t *s, int16x4_t &d, int16x8_t filter,
+                             int64x2_t offset)
+{
+    int16x8_t sum01 = vreinterpretq_s16_u16(vaddq_u16(s[0], s[1]));
+    int16x8_t sum23 = vreinterpretq_s16_u16(vaddq_u16(s[2], s[3]));
+
+    int64x2_t sum_lo = x265_sdotq_lane_s16(offset, sum01, filter, 0);
+    int64x2_t sum_hi = x265_sdotq_lane_s16(offset, sum23, filter, 0);
+
+    int32x4_t sum = vcombine_s32(vmovn_s64(sum_lo), vmovn_s64(sum_hi));
+
+    d = vshrn_n_s32(sum, SHIFT_INTERP_PS);
+}
+
+template<int width, int height>
+void inline interp8_hps_sve(const pixel *src, intptr_t srcStride,
+                            int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+{
+    const int N_TAPS = 8;
+    int blkheight = height;
+    const int16x8_t filter = vld1q_s16(X265_NS::g_lumaFilter[coeffIdx]);
+    const int64x2_t offset =
+        vdupq_n_s64((unsigned)-IF_INTERNAL_OFFS << SHIFT_INTERP_PS);
+
+    uint16x8_t idx[4];
+
+    idx[0] = vld1q_u16(dotprod_h_permute_tbl + 0);
+    idx[1] = vld1q_u16(dotprod_h_permute_tbl + 8);
+    idx[2] = vld1q_u16(dotprod_h_permute_tbl + 16);
+    idx[3] = vld1q_u16(dotprod_h_permute_tbl + 24);
+
+    if (isRowExt)
+    {
+        src -= (N_TAPS / 2 - 1) * srcStride;
+        blkheight += N_TAPS - 1;
+    }
+
+    src -= N_TAPS / 2 - 1;
+
+    for (int row = 0; row < blkheight; row++)
+    {
+        uint16x8_t s[2];
+        s[0] = vld1q_u16(src);
+        s[1] = vld1q_u16(src + 4);
+
+        uint16x8_t s0[N_TAPS];
+        setup_s_hpp_x4<true>(s0, s[0], s[1], idx);
+
+        int16x4_t d0;
+        filter8_ps_u16x4(s0, d0, filter, offset);
+
+        vst1_s16(dst, d0);
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
 namespace X265_NS {
 // Declaration for use in interp8_horiz_pp_sve().
 template<int N, int width, int height>
@@ -279,6 +339,29 @@ void interp8_horiz_pp_sve(const pixel *src, intptr_t srcStride, pixel *dst,
     }
 }
 
+// Declaration for use in interp8_horiz_ps_sve().
+template<int N, int width, int height>
+void interp_horiz_ps_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
+                          intptr_t dstStride, int coeffIdx, int isRowExt);
+
+template<int width, int height>
+void interp8_horiz_ps_sve(const pixel *src, intptr_t srcStride, int16_t *dst,
+                          intptr_t dstStride, int coeffIdx, int isRowExt)
+{
+    switch (coeffIdx)
+    {
+    case 1:
+        return interp_horiz_ps_neon<8, width, height>(src, srcStride, dst, dstStride,
+                                                      coeffIdx, isRowExt);
+    case 2:
+        return interp8_hps_sve<width, height>(src, srcStride, dst, dstStride,
+                                              coeffIdx, isRowExt);
+    case 3:
+        return interp_horiz_ps_neon<8, width, height>(src, srcStride, dst, dstStride,
+                                                      coeffIdx, isRowExt);
+    }
+}
+
 void setupFilterPrimitives_sve(EncoderPrimitives &p)
 {
     p.pu[LUMA_4x4].luma_hpp    = interp8_horiz_pp_sve<4, 4>;
@@ -303,6 +386,10 @@ void setupFilterPrimitives_sve(EncoderPrimitives &p)
     p.pu[LUMA_64x48].luma_hpp  = interp8_horiz_pp_sve<64, 48>;
     p.pu[LUMA_64x64].luma_hpp  = interp8_horiz_pp_sve<64, 64>;
 #endif // X265_DEPTH == 12
+
+    p.pu[LUMA_4x4].luma_hps   = interp8_horiz_ps_sve<4, 4>;
+    p.pu[LUMA_4x8].luma_hps   = interp8_horiz_ps_sve<4, 8>;
+    p.pu[LUMA_4x16].luma_hps  = interp8_horiz_ps_sve<4, 16>;
 }
 } // namespace X265_NS
 #else // !HIGH_BIT_DEPTH
-- 
2.39.5 (Apple Git-154)