[x265] [PATCH v3 5/7] AArch64: Add Neon implementation of 8x8 IDCT

Micro Daryl Robles microdaryl.robles at arm.com
Fri Dec 13 23:17:53 UTC 2024


Also add a new helper function transpose_4x8_s16.
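It zips eight int16x4_t vectors (an 8x4 tile of per-column results) into four
int16x8_t vectors holding the 4x8 transpose, so each butterfly pass can store
its output in the transposed layout that the scalar code produces.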

Relative performance compared to scalar C:

 Neoverse N1: 3.58x
 Neoverse V1: 5.32x
 Neoverse V2: 5.59x
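
For context, the Neon routine follows the standard 8-point inverse partial
butterfly: odd input rows 1, 3, 5 and 7 are accumulated against the g_t8 odd
coefficients {89, 75, 50, 18} into O[0..3]; even rows 2 and 6 form EO[0..1]
with {83, 36}; rows 0 and 4 form EE[0..1] with the multiply by 64 replaced by
a left shift; each output is (E[k] + O[k]) or (E[3-k] - O[3-k]), rounded and
narrowed by shift. Rows 3, 5 and 7 are skipped when they are all zero, which
is common for quantized coefficients. A scalar sketch of one input column
follows (illustrative only; idct8_col_sketch, clip16 and round are
placeholder names, not part of this patch):

    /* Illustrative sketch: scalar 8-point inverse partial butterfly for one
     * input column j, mirroring the structure of the Neon code. Assumes the
     * g_t8 coefficient table and a clip16() helper that clamps to int16_t. */
    static void idct8_col_sketch(const int16_t *src, int16_t *dst,
                                 intptr_t dstStride, int shift, int j)
    {
        const int line = 8;
        const int round = 1 << (shift - 1);
        int O[4], E[4], EE[2], EO[2];

        /* Odd part: rows 1, 3, 5, 7 against coefficients 89/75/50/18. */
        for (int k = 0; k < 4; k++)
            O[k] = g_t8[1][k] * src[1 * line + j] + g_t8[3][k] * src[3 * line + j] +
                   g_t8[5][k] * src[5 * line + j] + g_t8[7][k] * src[7 * line + j];

        /* Even part: rows 2 and 6 (83/36); rows 0 and 4 use 64, i.e. << 6. */
        EO[0] = g_t8[2][0] * src[2 * line + j] + g_t8[6][0] * src[6 * line + j];
        EO[1] = g_t8[2][1] * src[2 * line + j] + g_t8[6][1] * src[6 * line + j];
        EE[0] = (src[0 * line + j] + src[4 * line + j]) << 6;
        EE[1] = (src[0 * line + j] - src[4 * line + j]) << 6;

        E[0] = EE[0] + EO[0];  E[3] = EE[0] - EO[0];
        E[1] = EE[1] + EO[1];  E[2] = EE[1] - EO[1];

        /* Row j of dst receives input column j, i.e. the pass stores its
         * result transposed, matching the scalar partial butterfly. */
        for (int k = 0; k < 4; k++)
        {
            dst[j * dstStride + k]     = clip16((E[k] + O[k] + round) >> shift);
            dst[j * dstStride + 4 + k] = clip16((E[3 - k] - O[3 - k] + round) >> shift);
        }
    }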
---
 source/common/aarch64/dct-prim.cpp | 168 ++++++++++++++++++++++++++++-
 1 file changed, 167 insertions(+), 1 deletion(-)

diff --git a/source/common/aarch64/dct-prim.cpp b/source/common/aarch64/dct-prim.cpp
index 08816f593..812837b82 100644
--- a/source/common/aarch64/dct-prim.cpp
+++ b/source/common/aarch64/dct-prim.cpp
@@ -39,6 +39,32 @@ static inline void transpose_4x4_s16(int16x4_t &s0, int16x4_t &s1, int16x4_t &s2
     s3 = vget_high_s16(s0123.val[1]);
 }
 
+static inline void transpose_4x8_s16(int16x4_t s0, int16x4_t s1, int16x4_t s2, int16x4_t s3,
+                                     int16x4_t s4, int16x4_t s5, int16x4_t s6, int16x4_t s7,
+                                     int16x8_t &d0, int16x8_t &d1, int16x8_t &d2, int16x8_t &d3)
+{
+    int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0));
+    int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0));
+    int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0));
+    int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0));
+    int16x8_t s4q = vcombine_s16(s4, vdup_n_s16(0));
+    int16x8_t s5q = vcombine_s16(s5, vdup_n_s16(0));
+    int16x8_t s6q = vcombine_s16(s6, vdup_n_s16(0));
+    int16x8_t s7q = vcombine_s16(s7, vdup_n_s16(0));
+
+    int16x8_t s04 = vzip1q_s16(s0q, s4q);
+    int16x8_t s15 = vzip1q_s16(s1q, s5q);
+    int16x8_t s26 = vzip1q_s16(s2q, s6q);
+    int16x8_t s37 = vzip1q_s16(s3q, s7q);
+
+    int16x8x2_t s0246 = vzipq_s16(s04, s26);
+    int16x8x2_t s1357 = vzipq_s16(s15, s37);
+
+    d0 = vzip1q_s16(s0246.val[0], s1357.val[0]);
+    d1 = vzip2q_s16(s0246.val[0], s1357.val[0]);
+    d2 = vzip1q_s16(s0246.val[1], s1357.val[1]);
+    d3 = vzip2q_s16(s0246.val[1], s1357.val[1]);
+}
 
 static int scanPosLast_opt(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag,
                            uint8_t *coeffNum, int numSig, const uint16_t * /*scanCG4x4*/, const int /*trSize*/)
@@ -779,7 +805,136 @@ static inline void partialButterflyInverse4_neon(const int16_t *src, int16_t *ds
     vst1_s16(dst + 3 * dstStride, d3);
 }
 
+template<int shift>
+static inline void partialButterflyInverse8_neon(const int16_t *src, int16_t *dst,
+                                                 intptr_t dstStride)
+{
+    const int line = 8;
+
+    const int16x8_t s0 = vld1q_s16(src + 0 * line);
+    const int16x8_t s1 = vld1q_s16(src + 1 * line);
+    const int16x8_t s2 = vld1q_s16(src + 2 * line);
+    const int16x8_t s3 = vld1q_s16(src + 3 * line);
+    const int16x8_t s4 = vld1q_s16(src + 4 * line);
+    const int16x8_t s5 = vld1q_s16(src + 5 * line);
+    const int16x8_t s6 = vld1q_s16(src + 6 * line);
+    const int16x8_t s7 = vld1q_s16(src + 7 * line);
+
+    int32x4_t O_lo[4], O_hi[4];
+    const int16x4_t c_odd = vld1_s16(g_t8[1]);
+    O_lo[0] = vmull_lane_s16(vget_low_s16(s1), c_odd, 0); // 89
+    O_lo[1] = vmull_lane_s16(vget_low_s16(s1), c_odd, 1); // 75
+    O_lo[2] = vmull_lane_s16(vget_low_s16(s1), c_odd, 2); // 50
+    O_lo[3] = vmull_lane_s16(vget_low_s16(s1), c_odd, 3); // 18
+
+    O_hi[0] = vmull_lane_s16(vget_high_s16(s1), c_odd, 0); // 89
+    O_hi[1] = vmull_lane_s16(vget_high_s16(s1), c_odd, 1); // 75
+    O_hi[2] = vmull_lane_s16(vget_high_s16(s1), c_odd, 2); // 50
+    O_hi[3] = vmull_lane_s16(vget_high_s16(s1), c_odd, 3); // 18
+
+    if (vaddlvq_u32(vreinterpretq_u32_s16(s3)) != 0)
+    {
+        O_lo[0] = vmlal_lane_s16(O_lo[0], vget_low_s16(s3), c_odd, 1); //  75
+        O_lo[1] = vmlsl_lane_s16(O_lo[1], vget_low_s16(s3), c_odd, 3); // -18
+        O_lo[2] = vmlsl_lane_s16(O_lo[2], vget_low_s16(s3), c_odd, 0); // -89
+        O_lo[3] = vmlsl_lane_s16(O_lo[3], vget_low_s16(s3), c_odd, 2); // -50
+
+        O_hi[0] = vmlal_lane_s16(O_hi[0], vget_high_s16(s3), c_odd, 1); //  75
+        O_hi[1] = vmlsl_lane_s16(O_hi[1], vget_high_s16(s3), c_odd, 3); // -18
+        O_hi[2] = vmlsl_lane_s16(O_hi[2], vget_high_s16(s3), c_odd, 0); // -89
+        O_hi[3] = vmlsl_lane_s16(O_hi[3], vget_high_s16(s3), c_odd, 2); // -50
+    }
+
+    if (vaddlvq_u32(vreinterpretq_u32_s16(s5)) != 0)
+    {
+        O_lo[0] = vmlal_lane_s16(O_lo[0], vget_low_s16(s5), c_odd, 2); //  50
+        O_lo[1] = vmlsl_lane_s16(O_lo[1], vget_low_s16(s5), c_odd, 0); // -89
+        O_lo[2] = vmlal_lane_s16(O_lo[2], vget_low_s16(s5), c_odd, 3); //  18
+        O_lo[3] = vmlal_lane_s16(O_lo[3], vget_low_s16(s5), c_odd, 1); //  75
+
+        O_hi[0] = vmlal_lane_s16(O_hi[0], vget_high_s16(s5), c_odd, 2); //  50
+        O_hi[1] = vmlsl_lane_s16(O_hi[1], vget_high_s16(s5), c_odd, 0); // -89
+        O_hi[2] = vmlal_lane_s16(O_hi[2], vget_high_s16(s5), c_odd, 3); //  18
+        O_hi[3] = vmlal_lane_s16(O_hi[3], vget_high_s16(s5), c_odd, 1); //  75
+    }
+
+    if (vaddlvq_u32(vreinterpretq_u32_s16(s7)) != 0)
+    {
+        O_lo[0] = vmlal_lane_s16(O_lo[0], vget_low_s16(s7), c_odd, 3); //  18
+        O_lo[1] = vmlsl_lane_s16(O_lo[1], vget_low_s16(s7), c_odd, 2); // -50
+        O_lo[2] = vmlal_lane_s16(O_lo[2], vget_low_s16(s7), c_odd, 1); //  75
+        O_lo[3] = vmlsl_lane_s16(O_lo[3], vget_low_s16(s7), c_odd, 0); // -89
+
+        O_hi[0] = vmlal_lane_s16(O_hi[0], vget_high_s16(s7), c_odd, 3); //  18
+        O_hi[1] = vmlsl_lane_s16(O_hi[1], vget_high_s16(s7), c_odd, 2); // -50
+        O_hi[2] = vmlal_lane_s16(O_hi[2], vget_high_s16(s7), c_odd, 1); //  75
+        O_hi[3] = vmlsl_lane_s16(O_hi[3], vget_high_s16(s7), c_odd, 0); // -89
+    }
+
+    int32x4_t EO_lo[2], EO_hi[2];
+    const int16x4_t c_even = vld1_s16(g_t8[2]);
+    EO_lo[0] = vmull_lane_s16(vget_low_s16(s2), c_even, 0); // 83
+    EO_lo[1] = vmull_lane_s16(vget_low_s16(s2), c_even, 1); // 36
+
+    EO_hi[0] = vmull_lane_s16(vget_high_s16(s2), c_even, 0); // 83
+    EO_hi[1] = vmull_lane_s16(vget_high_s16(s2), c_even, 1); // 36
 
+    EO_lo[0] = vmlal_lane_s16(EO_lo[0], vget_low_s16(s6), c_even, 1); //  36
+    EO_lo[1] = vmlsl_lane_s16(EO_lo[1], vget_low_s16(s6), c_even, 0); // -83
+
+    EO_hi[0] = vmlal_lane_s16(EO_hi[0], vget_high_s16(s6), c_even, 1); //  36
+    EO_hi[1] = vmlsl_lane_s16(EO_hi[1], vget_high_s16(s6), c_even, 0); // -83
+
+    // Replace multiply by 64 with left shift by 6.
+    int32x4_t EE_lo[2], EE_hi[2];
+    EE_lo[0] = vshlq_n_s32(vaddl_s16(vget_low_s16(s0), vget_low_s16(s4)), 6);
+    EE_hi[0] = vshlq_n_s32(vaddl_s16(vget_high_s16(s0), vget_high_s16(s4)), 6);
+
+    EE_lo[1] = vshll_n_s16(vget_low_s16(vsubq_s16(s0, s4)), 6);
+    EE_hi[1] = vshll_n_s16(vget_high_s16(vsubq_s16(s0, s4)), 6);
+
+    int32x4_t E_lo[4], E_hi[4];
+    E_lo[0] = vaddq_s32(EE_lo[0], EO_lo[0]);
+    E_lo[1] = vaddq_s32(EE_lo[1], EO_lo[1]);
+    E_lo[2] = vsubq_s32(EE_lo[1], EO_lo[1]);
+    E_lo[3] = vsubq_s32(EE_lo[0], EO_lo[0]);
+
+    E_hi[0] = vaddq_s32(EE_hi[0], EO_hi[0]);
+    E_hi[1] = vaddq_s32(EE_hi[1], EO_hi[1]);
+    E_hi[2] = vsubq_s32(EE_hi[1], EO_hi[1]);
+    E_hi[3] = vsubq_s32(EE_hi[0], EO_hi[0]);
+
+    int16x4_t d_lo[8], d_hi[8];
+
+    for (int i = 0; i < 4; i++)
+    {
+        int32x4_t t_lo = vaddq_s32(E_lo[i], O_lo[i]);
+        int32x4_t t_hi = vaddq_s32(E_hi[i], O_hi[i]);
+        d_lo[i + 0] = vqrshrn_n_s32(t_lo, shift);
+        d_hi[i + 0] = vqrshrn_n_s32(t_hi, shift);
+
+        t_lo = vsubq_s32(E_lo[3 - i], O_lo[3 - i]);
+        t_hi = vsubq_s32(E_hi[3 - i], O_hi[3 - i]);
+        d_lo[i + 4] = vqrshrn_n_s32(t_lo, shift);
+        d_hi[i + 4] = vqrshrn_n_s32(t_hi, shift);
+    }
+
+    int16x8_t d0, d1, d2, d3, d4, d5, d6, d7;
+    transpose_4x8_s16(d_lo[0], d_lo[1], d_lo[2], d_lo[3], d_lo[4], d_lo[5], d_lo[6], d_lo[7],
+                      d0, d1, d2, d3);
+
+    transpose_4x8_s16(d_hi[0], d_hi[1], d_hi[2], d_hi[3], d_hi[4], d_hi[5], d_hi[6], d_hi[7],
+                      d4, d5, d6, d7);
+
+    vst1q_s16(dst + 0 * dstStride, d0);
+    vst1q_s16(dst + 1 * dstStride, d1);
+    vst1q_s16(dst + 2 * dstStride, d2);
+    vst1q_s16(dst + 3 * dstStride, d3);
+    vst1q_s16(dst + 4 * dstStride, d4);
+    vst1q_s16(dst + 5 * dstStride, d5);
+    vst1q_s16(dst + 6 * dstStride, d6);
+    vst1q_s16(dst + 7 * dstStride, d7);
+}
 
 static void partialButterflyInverse16_neon(const int16_t *src, int16_t *orig_dst, int shift, int line)
 {
@@ -1240,6 +1395,17 @@ void idct4_neon(const int16_t *src, int16_t *dst, intptr_t dstStride)
     partialButterflyInverse4_neon<shift_pass2>(coef, dst, dstStride);
 }
 
+void idct8_neon(const int16_t *src, int16_t *dst, intptr_t dstStride)
+{
+    const int shift_pass1 = 7;
+    const int shift_pass2 = 12 - (X265_DEPTH - 8);
+
+    ALIGN_VAR_32(int16_t, coef[8 * 8]);
+
+    partialButterflyInverse8_neon<shift_pass1>(src, coef, 8);
+    partialButterflyInverse8_neon<shift_pass2>(coef, dst, dstStride);
+}
+
 void idct16_neon(const int16_t *src, int16_t *dst, intptr_t dstStride)
 {
     const int shift_1st = 7;
@@ -1291,6 +1457,7 @@ void setupDCTPrimitives_neon(EncoderPrimitives &p)
     p.cu[BLOCK_32x32].dct = dct32_neon;
     p.idst4x4 = idst4_neon;
     p.cu[BLOCK_4x4].idct   = idct4_neon;
+    p.cu[BLOCK_8x8].idct   = idct8_neon;
     p.cu[BLOCK_16x16].idct = PFX(idct16_neon);
     p.cu[BLOCK_32x32].idct = idct32_neon;
     p.cu[BLOCK_4x4].count_nonzero = count_nonzero_neon<4>;
@@ -1318,5 +1485,4 @@ void setupDCTPrimitives_neon(EncoderPrimitives &p)
 };
 
 
-
 #endif
-- 
2.34.1
