[x265] [PATCH 9/9] AArch64: Add SVE implementation of 32x32 DCT

Jonathan Wright jonathan.wright at arm.com
Thu Aug 22 15:20:04 UTC 2024


The widening 16-bit multiply + pairwise add pattern in the Neon DCT
paths is a good fit for the SVE 16-bit dot-product instructions. This
patch adds an SVE implementation of the 32x32 DCT path.
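
For illustration, the kernel of the transform loops can be thought of as
the following pair of helpers (a minimal sketch, not the exact code in
this patch: neon_dot_s16 and sve_dot_s16 are hypothetical names used
only here, x265_sdotq_s16 is the existing helper from
neon-sve-bridge.h, and <arm_neon.h> is assumed to be included):

    // Neon: widening 16-bit multiplies, then pairwise additions to
    // reach two 64-bit sums of four products each.
    static inline int64x2_t neon_dot_s16(int16x8_t x, int16x8_t y)
    {
        int32x4_t lo = vmull_s16(vget_low_s16(x), vget_low_s16(y));
        int32x4_t hi = vmull_s16(vget_high_s16(x), vget_high_s16(y));
        return vpaddq_s64(vpaddlq_s32(lo), vpaddlq_s32(hi));
    }

    // SVE: a single SDOT computes the same two sums of four products
    // and accumulates them directly into the 64-bit lanes of acc.
    static inline int64x2_t sve_dot_s16(int64x2_t acc, int16x8_t x,
                                        int16x8_t y)
    {
        return x265_sdotq_s16(acc, x, y);
    }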

Relative performance compared to the Neon implementation:

  Neoverse-V1: 1.13x
  Neoverse-V2: 1.44x
  Neoverse-N2: 1.55x
---
 source/common/aarch64/dct-prim-sve.cpp  | 203 ++++++++++++++++++++++++
 source/common/aarch64/neon-sve-bridge.h |   5 +
 2 files changed, 208 insertions(+)

diff --git a/source/common/aarch64/dct-prim-sve.cpp b/source/common/aarch64/dct-prim-sve.cpp
index f5a49b457..8a02ada35 100644
--- a/source/common/aarch64/dct-prim-sve.cpp
+++ b/source/common/aarch64/dct-prim-sve.cpp
@@ -249,6 +249,191 @@ static void partialButterfly16_sve(const int16_t *src, int16_t *dst)
     }
 }
 
+template<int shift>
+static void partialButterfly32_sve(const int16_t *src, int16_t *dst)
+{
+    const int line = 32;
+
+    int16x8_t O[line][2];
+    int16x8_t EO[line];
+    int32x4_t EEO[line];
+    int32x4_t EEEE[line / 2];
+    int32x4_t EEEO[line / 2];
+
+    for (int i = 0; i < line; i += 2)
+    {
+        int16x8x4_t in_lo = vld1q_s16_x4(src + (i + 0) * line);
+        in_lo.val[2] = rev16(in_lo.val[2]);
+        in_lo.val[3] = rev16(in_lo.val[3]);
+
+        int16x8x4_t in_hi = vld1q_s16_x4(src + (i + 1) * line);
+        in_hi.val[2] = rev16(in_hi.val[2]);
+        in_hi.val[3] = rev16(in_hi.val[3]);
+
+        int32x4_t E0[4];
+        E0[0] = vaddl_s16(vget_low_s16(in_lo.val[0]),
+                          vget_low_s16(in_lo.val[3]));
+        E0[1] = vaddl_s16(vget_high_s16(in_lo.val[0]),
+                          vget_high_s16(in_lo.val[3]));
+        E0[2] = vaddl_s16(vget_low_s16(in_lo.val[1]),
+                          vget_low_s16(in_lo.val[2]));
+        E0[3] = vaddl_s16(vget_high_s16(in_lo.val[1]),
+                          vget_high_s16(in_lo.val[2]));
+
+        int32x4_t E1[4];
+        E1[0] = vaddl_s16(vget_low_s16(in_hi.val[0]),
+                          vget_low_s16(in_hi.val[3]));
+        E1[1] = vaddl_s16(vget_high_s16(in_hi.val[0]),
+                          vget_high_s16(in_hi.val[3]));
+        E1[2] = vaddl_s16(vget_low_s16(in_hi.val[1]),
+                          vget_low_s16(in_hi.val[2]));
+        E1[3] = vaddl_s16(vget_high_s16(in_hi.val[1]),
+                          vget_high_s16(in_hi.val[2]));
+
+        O[i + 0][0] = vsubq_s16(in_lo.val[0], in_lo.val[3]);
+        O[i + 0][1] = vsubq_s16(in_lo.val[1], in_lo.val[2]);
+
+        O[i + 1][0] = vsubq_s16(in_hi.val[0], in_hi.val[3]);
+        O[i + 1][1] = vsubq_s16(in_hi.val[1], in_hi.val[2]);
+
+        int32x4_t EE0[2];
+        E0[3] = rev32(E0[3]);
+        E0[2] = rev32(E0[2]);
+        EE0[0] = vaddq_s32(E0[0], E0[3]);
+        EE0[1] = vaddq_s32(E0[1], E0[2]);
+        EO[i + 0] = vcombine_s16(vmovn_s32(vsubq_s32(E0[0], E0[3])),
+                                 vmovn_s32(vsubq_s32(E0[1], E0[2])));
+
+        int32x4_t EE1[2];
+        E1[3] = rev32(E1[3]);
+        E1[2] = rev32(E1[2]);
+        EE1[0] = vaddq_s32(E1[0], E1[3]);
+        EE1[1] = vaddq_s32(E1[1], E1[2]);
+        EO[i + 1] = vcombine_s16(vmovn_s32(vsubq_s32(E1[0], E1[3])),
+                                 vmovn_s32(vsubq_s32(E1[1], E1[2])));
+
+        int32x4_t EEE0;
+        EE0[1] = rev32(EE0[1]);
+        EEE0 = vaddq_s32(EE0[0], EE0[1]);
+        EEO[i + 0] = vsubq_s32(EE0[0], EE0[1]);
+
+        int32x4_t EEE1;
+        EE1[1] = rev32(EE1[1]);
+        EEE1 = vaddq_s32(EE1[0], EE1[1]);
+        EEO[i + 1] = vsubq_s32(EE1[0], EE1[1]);
+
+        int32x4_t t0 = vreinterpretq_s32_s64(
+            vzip1q_s64(vreinterpretq_s64_s32(EEE0),
+                       vreinterpretq_s64_s32(EEE1)));
+        int32x4_t t1 = vrev64q_s32(vreinterpretq_s32_s64(
+            vzip2q_s64(vreinterpretq_s64_s32(EEE0),
+                       vreinterpretq_s64_s32(EEE1))));
+
+        EEEE[i / 2] = vaddq_s32(t0, t1);
+        EEEO[i / 2] = vsubq_s32(t0, t1);
+    }
+
+    for (int k = 1; k < 32; k += 2)
+    {
+        int16_t *d = dst + k * line;
+
+        int16x8_t c0_c1 = vld1q_s16(&g_t32[k][0]);
+        int16x8_t c2_c3 = vld1q_s16(&g_t32[k][8]);
+
+        for (int i = 0; i < line; i += 4)
+        {
+            int64x2_t t0 = x265_sdotq_s16(vdupq_n_s64(0), c0_c1, O[i + 0][0]);
+            int64x2_t t1 = x265_sdotq_s16(vdupq_n_s64(0), c0_c1, O[i + 1][0]);
+            int64x2_t t2 = x265_sdotq_s16(vdupq_n_s64(0), c0_c1, O[i + 2][0]);
+            int64x2_t t3 = x265_sdotq_s16(vdupq_n_s64(0), c0_c1, O[i + 3][0]);
+            t0 = x265_sdotq_s16(t0, c2_c3, O[i + 0][1]);
+            t1 = x265_sdotq_s16(t1, c2_c3, O[i + 1][1]);
+            t2 = x265_sdotq_s16(t2, c2_c3, O[i + 2][1]);
+            t3 = x265_sdotq_s16(t3, c2_c3, O[i + 3][1]);
+
+            int32x4_t t01 = vcombine_s32(vmovn_s64(t0), vmovn_s64(t1));
+            int32x4_t t23 = vcombine_s32(vmovn_s64(t2), vmovn_s64(t3));
+            int16x4_t res = vrshrn_n_s32(vpaddq_s32(t01, t23), shift);
+            vst1_s16(d, res);
+
+            d += 4;
+        }
+    }
+
+    for (int k = 2; k < 32; k += 4)
+    {
+        int16_t *d = dst + k * line;
+
+        int16x8_t c0 = vld1q_s16(&g_t32[k][0]);
+
+        for (int i = 0; i < line; i += 4)
+        {
+            int64x2_t t0 = x265_sdotq_s16(vdupq_n_s64(0), c0, EO[i + 0]);
+            int64x2_t t1 = x265_sdotq_s16(vdupq_n_s64(0), c0, EO[i + 1]);
+            int64x2_t t2 = x265_sdotq_s16(vdupq_n_s64(0), c0, EO[i + 2]);
+            int64x2_t t3 = x265_sdotq_s16(vdupq_n_s64(0), c0, EO[i + 3]);
+
+            int32x4_t t01 = vcombine_s32(vmovn_s64(t0), vmovn_s64(t1));
+            int32x4_t t23 = vcombine_s32(vmovn_s64(t2), vmovn_s64(t3));
+            int16x4_t res = vrshrn_n_s32(vpaddq_s32(t01, t23), shift);
+            vst1_s16(d, res);
+
+            d += 4;
+        }
+    }
+
+    for (int k = 4; k < 32; k += 8)
+    {
+        int16_t *d = dst + k * line;
+
+        int32x4_t c = x265_vld1sh_s32(&g_t32[k][0]);
+
+        for (int i = 0; i < line; i += 4)
+        {
+            int32x4_t t0 = vmulq_s32(c, EEO[i + 0]);
+            int32x4_t t1 = vmulq_s32(c, EEO[i + 1]);
+            int32x4_t t2 = vmulq_s32(c, EEO[i + 2]);
+            int32x4_t t3 = vmulq_s32(c, EEO[i + 3]);
+
+            int32x4_t t = vpaddq_s32(vpaddq_s32(t0, t1), vpaddq_s32(t2, t3));
+            int16x4_t res = vrshrn_n_s32(t, shift);
+            vst1_s16(d, res);
+
+            d += 4;
+        }
+    }
+
+    int32x4_t c0 = vld1q_s32(t8_even[0]);
+    int32x4_t c8 = vld1q_s32(t8_even[1]);
+    int32x4_t c16 = vld1q_s32(t8_even[2]);
+    int32x4_t c24 = vld1q_s32(t8_even[3]);
+
+    for (int i = 0; i < line; i += 4)
+    {
+        int32x4_t t0 = vpaddq_s32(EEEE[i / 2 + 0], EEEE[i / 2 + 1]);
+        int32x4_t t1 = vmulq_s32(c0, t0);
+        int16x4_t res0 = vrshrn_n_s32(t1, shift);
+        vst1_s16(dst + 0 * line, res0);
+
+        int32x4_t t2 = vmulq_s32(c8, EEEO[i / 2 + 0]);
+        int32x4_t t3 = vmulq_s32(c8, EEEO[i / 2 + 1]);
+        int16x4_t res8 = vrshrn_n_s32(vpaddq_s32(t2, t3), shift);
+        vst1_s16(dst + 8 * line, res8);
+
+        int32x4_t t4 = vmulq_s32(c16, EEEE[i / 2 + 0]);
+        int32x4_t t5 = vmulq_s32(c16, EEEE[i / 2 + 1]);
+        int16x4_t res16 = vrshrn_n_s32(vpaddq_s32(t4, t5), shift);
+        vst1_s16(dst + 16 * line, res16);
+
+        int32x4_t t6 = vmulq_s32(c24, EEEO[i / 2 + 0]);
+        int32x4_t t7 = vmulq_s32(c24, EEEO[i / 2 + 1]);
+        int16x4_t res24 = vrshrn_n_s32(vpaddq_s32(t6, t7), shift);
+        vst1_s16(dst + 24 * line, res24);
+
+        dst += 4;
+    }
+}
+
 }
 
 
@@ -289,10 +474,28 @@ void dct16_sve(const int16_t *src, int16_t *dst, intptr_t srcStride)
     partialButterfly16_sve<shift_pass2>(coef, dst);
 }
 
+void dct32_sve(const int16_t *src, int16_t *dst, intptr_t srcStride)
+{
+    const int shift_pass1 = 4 + X265_DEPTH - 8;
+    const int shift_pass2 = 11;
+
+    ALIGN_VAR_32(int16_t, coef[32 * 32]);
+    ALIGN_VAR_32(int16_t, block[32 * 32]);
+
+    for (int i = 0; i < 32; i++)
+    {
+        memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t));
+    }
+
+    partialButterfly32_sve<shift_pass1>(block, coef);
+    partialButterfly32_sve<shift_pass2>(coef, dst);
+}
+
 void setupDCTPrimitives_sve(EncoderPrimitives &p)
 {
     p.cu[BLOCK_8x8].dct   = dct8_sve;
     p.cu[BLOCK_16x16].dct = dct16_sve;
+    p.cu[BLOCK_32x32].dct = dct32_sve;
 }
 
 };
diff --git a/source/common/aarch64/neon-sve-bridge.h b/source/common/aarch64/neon-sve-bridge.h
index 59ca8aab7..dad5fa909 100644
--- a/source/common/aarch64/neon-sve-bridge.h
+++ b/source/common/aarch64/neon-sve-bridge.h
@@ -40,6 +40,11 @@
  * remainder of the vector is unused - this approach is still beneficial when
  * compared to a Neon-only implementation. */
 
+static inline int32x4_t x265_vld1sh_s32(const int16_t *ptr)
+{
+    return svget_neonq_s32(svld1sh_s32(svptrue_pat_b32(SV_VL4), ptr));
+}
+
 static inline int64x2_t x265_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y)
 {
     return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc),
-- 
2.42.1
