[x265] [PATCH v2 1/4] AArch64: Add SVE implementation of HBD interp_horiz_pp

Fri Apr 25 05:23:18 UTC 2025

Hi Gerda,

Thanks for the patches. I have some comments.

At 2025-04-24 18:01:02, "Gerda Zsejke More" <gerdazsejke.more at arm.com> wrote:
>Add SVE implementation of HBD interp_horiz_pp for LUMA filtering.
>An implementation was added for block sizes with width equal to 4 for
>both 10-bit and 12-bit build, but for bigger block sizes the SVE
>implementation was only enabled for 12-bit build.
>
>This implementation gives up to 9% uplift compared to the existing
>Neon implementation.
>---
> source/common/CMakeLists.txt              |   2 +-
> source/common/aarch64/asm-primitives.cpp  |   2 +
> source/common/aarch64/filter-prim-sve.cpp | 314 ++++++++++++++++++++++
> source/common/aarch64/filter-prim-sve.h   |  37 +++
> source/common/aarch64/neon-sve-bridge.h   |  12 +
> 5 files changed, 366 insertions(+), 1 deletion(-)
> create mode 100644 source/common/aarch64/filter-prim-sve.cpp
> create mode 100644 source/common/aarch64/filter-prim-sve.h
>
>+#if HIGH_BIT_DEPTH
>+static const uint16_t dotprod_h_permute_tbl[32] = {
>+    // clang-format off
>+    0, 1, 2, 3, 1, 2, 3, 4,
>+    2, 3, 4, 5, 3, 4, 5, 6,
>+    3, 2, 1, 0, 4, 3, 2, 1,

Is this result obtained from dotprod_h_permute_tbl[0] with "REV64 V.8H"?

>+    5, 4, 3, 2, 6, 5, 4, 3,
>+    // clang-format on
>+};
>+

>+template<bool coeff2>
>+void inline setup_s_hpp_x4(uint16x8_t *d, uint16x8_t s0, uint16x8_t s1, uint16x8_t *idx)
>+{
>+    if (coeff2)
>+    {
>+        d[0] = x265_tblq_u16(s0, idx[0]);
>+        d[1] = x265_tblq_u16(s1, idx[2]);
>+        d[2] = x265_tblq_u16(s0, idx[1]);
>+        d[3] = x265_tblq_u16(s1, idx[3]);
>+    }
>+    else
>+    {
>+        d[0] = x265_tblq_u16(s0, idx[0]);
>+        d[1] = x265_tblq_u16(s1, idx[0]);
>+        d[2] = x265_tblq_u16(s0, idx[1]);
>+        d[3] = x265_tblq_u16(s1, idx[1]);
>+    }
>+}
>+
>+template<bool coeff2>
>+void inline setup_s_hpp_x8(uint16x8_t *d, uint16x8_t s0, uint16x8_t s1, uint16x8_t s2,
>+                           uint16x8_t *idx)
>+{
>+    if (coeff2)
>+    {
>+        d[0] = x265_tblq_u16(s0, idx[0]);
>+        d[1] = x265_tblq_u16(s1, idx[2]);
>+        d[2] = x265_tblq_u16(s0, idx[1]);
>+        d[3] = x265_tblq_u16(s1, idx[3]);

>+        d[4] = x265_tblq_u16(s1, idx[0]);
As above, could the "REV64 V.8H" method be used here?

>+        d[5] = x265_tblq_u16(s2, idx[2]);
>+        d[6] = x265_tblq_u16(s1, idx[1]);
>+        d[7] = x265_tblq_u16(s2, idx[3]);
>+    }
>+    else
>+    {
>+        d[0] = x265_tblq_u16(s0, idx[0]);
>+        d[1] = x265_tblq_u16(s1, idx[0]);
>+        d[2] = x265_tblq_u16(s0, idx[1]);
>+        d[3] = x265_tblq_u16(s1, idx[1]);
>+        d[4] = d[1];
>+        d[5] = x265_tblq_u16(s2, idx[0]);
>+        d[6] = d[3];
>+        d[7] = x265_tblq_u16(s2, idx[1]);
>+    }
>+}
>+
>+template<bool coeff2, int width, int height>
>+void inline interp8_hpp_sve(const pixel *src, intptr_t srcStride,
>+                            pixel *dst, intptr_t dstStride, int coeffIdx)
>+{
>+    const int N_TAPS = 8;
>+    const uint16x8_t maxVal = vdupq_n_u16((1 << X265_DEPTH) - 1);
>+    const int16x8_t filter = vld1q_s16(X265_NS::g_lumaFilter[coeffIdx]);
>+    uint16x8_t idx[4];
>+
>+    idx[0] = vld1q_u16(dotprod_h_permute_tbl + 0);
>+    idx[1] = vld1q_u16(dotprod_h_permute_tbl + 8);
>+    idx[2] = vld1q_u16(dotprod_h_permute_tbl + 16);

>+    idx[3] = vld1q_u16(dotprod_h_permute_tbl + 24);
idx[2] and idx[3] are needed only for the <coeff2> case.

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250425/a5711816/attachment.htm>