[x265] [PATCH 03/18] AArch64: Use transpose helpers in pixel-prim.cpp
Hari Limaye
hari.limaye at arm.com
Tue Aug 13 15:19:17 UTC 2024
Refactor duplicated sequences of intrinsic calls to use the appropriate
helper function, and rename these helper functions to reflect the vector
type they take as input.
---
source/common/aarch64/pixel-prim.cpp | 92 +++++++++++++---------------
1 file changed, 43 insertions(+), 49 deletions(-)
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 2d7cc42ed..9598f5498 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -26,26 +26,26 @@ static inline void SUMSUB_AB(int16x8_t &sum, int16x8_t &sub, const int16x8_t a,
sub = vsubq_s16(a, b);
}
-static inline void transpose_8h(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
+static inline void transpose_8h_8h(int16x8_t &t1, int16x8_t &t2,
+ const int16x8_t s1, const int16x8_t s2)
{
t1 = vtrn1q_s16(s1, s2);
t2 = vtrn2q_s16(s1, s2);
}
-static inline void transpose_4s(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
+static inline void transpose_4s_8h(int16x8_t &t1, int16x8_t &t2,
+ const int16x8_t s1, const int16x8_t s2)
{
t1 = vtrn1q_s32(s1, s2);
t2 = vtrn2q_s32(s1, s2);
}
-#if (X265_DEPTH <= 10)
-static inline void transpose_2d(int16x8_t &t1, int16x8_t &t2, const int16x8_t s1, const int16x8_t s2)
+static inline void transpose_2d_8h(int16x8_t &t1, int16x8_t &t2,
+ const int16x8_t s1, const int16x8_t s2)
{
t1 = vtrn1q_s64(s1, s2);
t2 = vtrn2q_s64(s1, s2);
}
-#endif
-
static inline void SUMSUB_ABCD(int16x8_t &s1, int16x8_t &d1, int16x8_t &s2, int16x8_t &d2,
int16x8_t a, int16x8_t b, int16x8_t c, int16x8_t d)
@@ -75,18 +75,14 @@ static int _satd_4x8_8x4_end_neon(int16x8_t v0, int16x8_t v1, int16x8_t v2, int1
SUMSUB_AB(v4 , v6 , v16, v18);
SUMSUB_AB(v5 , v7 , v17, v19);
- v0 = vtrn1q_s16(v4, v5);
- v1 = vtrn2q_s16(v4, v5);
- v2 = vtrn1q_s16(v6, v7);
- v3 = vtrn2q_s16(v6, v7);
+ transpose_8h_8h(v0, v1, v4, v5);
+ transpose_8h_8h(v2, v3, v6, v7);
SUMSUB_AB(v16, v17, v0, v1);
SUMSUB_AB(v18, v19, v2, v3);
- v0 = vtrn1q_s32(v16, v18);
- v1 = vtrn2q_s32(v16, v18);
- v2 = vtrn1q_s32(v17, v19);
- v3 = vtrn2q_s32(v17, v19);
+ transpose_4s_8h(v0, v1, v16, v18);
+ transpose_4s_8h(v2, v3, v17, v19);
v0 = vabsq_s16(v0);
v1 = vabsq_s16(v1);
@@ -105,16 +101,13 @@ static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
int16x8_t v2, v3;
SUMSUB_AB(v2, v3, v0, v1);
- v0 = vzip1q_s64(v2, v3);
- v1 = vzip2q_s64(v2, v3);
+ transpose_2d_8h(v0, v1, v2, v3);
SUMSUB_AB(v2, v3, v0, v1);
- v0 = vtrn1q_s16(v2, v3);
- v1 = vtrn2q_s16(v2, v3);
+ transpose_8h_8h(v0, v1, v2, v3);
SUMSUB_AB(v2, v3, v0, v1);
- v0 = vtrn1q_s32(v2, v3);
- v1 = vtrn2q_s32(v2, v3);
+ transpose_4s_8h(v0, v1, v2, v3);
v0 = vabsq_s16(v0);
v1 = vabsq_s16(v1);
@@ -133,20 +126,20 @@ static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, in
HADAMARD4_V(v20, v21, v22, v23, v0, v1, v2, v3);
- transpose_8h(v0, v1, v16, v17);
- transpose_8h(v2, v3, v18, v19);
- transpose_8h(v4, v5, v20, v21);
- transpose_8h(v6, v7, v22, v23);
+ transpose_8h_8h(v0, v1, v16, v17);
+ transpose_8h_8h(v2, v3, v18, v19);
+ transpose_8h_8h(v4, v5, v20, v21);
+ transpose_8h_8h(v6, v7, v22, v23);
SUMSUB_AB(v16, v17, v0, v1);
SUMSUB_AB(v18, v19, v2, v3);
SUMSUB_AB(v20, v21, v4, v5);
SUMSUB_AB(v22, v23, v6, v7);
- transpose_4s(v0, v2, v16, v18);
- transpose_4s(v1, v3, v17, v19);
- transpose_4s(v4, v6, v20, v22);
- transpose_4s(v5, v7, v21, v23);
+ transpose_4s_8h(v0, v2, v16, v18);
+ transpose_4s_8h(v1, v3, v17, v19);
+ transpose_4s_8h(v4, v6, v20, v22);
+ transpose_4s_8h(v5, v7, v21, v23);
v0 = vabsq_s16(v0);
v1 = vabsq_s16(v1);
@@ -167,7 +160,8 @@ static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, in
#if HIGH_BIT_DEPTH
#if (X265_DEPTH > 10)
-static inline void transpose_2d(int32x4_t &t1, int32x4_t &t2, const int32x4_t s1, const int32x4_t s2)
+static inline void transpose_2d_4s(int32x4_t &t1, int32x4_t &t2,
+ const int32x4_t s1, const int32x4_t s2)
{
t1 = vtrn1q_s64(s1, s2);
t2 = vtrn2q_s64(s1, s2);
@@ -523,10 +517,10 @@ static inline void _sa8d_8x8_neon_end(int16x8_t v0, int16x8_t v1, int16x8_t v2,
SUMSUB_AB(v2, v18, v18, v22);
SUMSUB_AB(v3, v19, v19, v23);
- transpose_8h(v20, v21, v16, v17);
- transpose_8h(v4, v5, v0, v1);
- transpose_8h(v22, v23, v18, v19);
- transpose_8h(v6, v7, v2, v3);
+ transpose_8h_8h(v20, v21, v16, v17);
+ transpose_8h_8h(v4, v5, v0, v1);
+ transpose_8h_8h(v22, v23, v18, v19);
+ transpose_8h_8h(v6, v7, v2, v3);
#if (X265_DEPTH <= 10)
@@ -537,20 +531,20 @@ static inline void _sa8d_8x8_neon_end(int16x8_t v0, int16x8_t v1, int16x8_t v2,
SUMSUB_AB(v0, v1, v22, v23);
SUMSUB_AB(v4, v5, v6, v7);
- transpose_4s(v20, v22, v2, v0);
- transpose_4s(v21, v23, v3, v1);
- transpose_4s(v16, v18, v24, v4);
- transpose_4s(v17, v19, v25, v5);
+ transpose_4s_8h(v20, v22, v2, v0);
+ transpose_4s_8h(v21, v23, v3, v1);
+ transpose_4s_8h(v16, v18, v24, v4);
+ transpose_4s_8h(v17, v19, v25, v5);
SUMSUB_AB(v0, v2, v20, v22);
SUMSUB_AB(v1, v3, v21, v23);
SUMSUB_AB(v4, v6, v16, v18);
SUMSUB_AB(v5, v7, v17, v19);
- transpose_2d(v16, v20, v0, v4);
- transpose_2d(v17, v21, v1, v5);
- transpose_2d(v18, v22, v2, v6);
- transpose_2d(v19, v23, v3, v7);
+ transpose_2d_8h(v16, v20, v0, v4);
+ transpose_2d_8h(v17, v21, v1, v5);
+ transpose_2d_8h(v18, v22, v2, v6);
+ transpose_2d_8h(v19, v23, v3, v7);
v16 = vabsq_s16(v16);
@@ -609,15 +603,15 @@ static inline void _sa8d_8x8_neon_end(int16x8_t v0, int16x8_t v1, int16x8_t v2,
ISUMSUB_AB(v4l, v5l, v6l, v7l);
ISUMSUB_AB(v4h, v5h, v6h, v7h);
- transpose_2d(v20l, v22l, v2l, v0l);
- transpose_2d(v21l, v23l, v3l, v1l);
- transpose_2d(v16l, v18l, v24l, v4l);
- transpose_2d(v17l, v19l, v25l, v5l);
+ transpose_2d_4s(v20l, v22l, v2l, v0l);
+ transpose_2d_4s(v21l, v23l, v3l, v1l);
+ transpose_2d_4s(v16l, v18l, v24l, v4l);
+ transpose_2d_4s(v17l, v19l, v25l, v5l);
- transpose_2d(v20h, v22h, v2h, v0h);
- transpose_2d(v21h, v23h, v3h, v1h);
- transpose_2d(v16h, v18h, v24h, v4h);
- transpose_2d(v17h, v19h, v25h, v5h);
+ transpose_2d_4s(v20h, v22h, v2h, v0h);
+ transpose_2d_4s(v21h, v23h, v3h, v1h);
+ transpose_2d_4s(v16h, v18h, v24h, v4h);
+ transpose_2d_4s(v17h, v19h, v25h, v5h);
ISUMSUB_AB(v0l, v2l, v20l, v22l);
ISUMSUB_AB(v1l, v3l, v21l, v23l);
--
2.42.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0003-AArch64-Use-transpose-helpers-in-pixel-prim.cpp.patch
Type: text/x-patch
Size: 7441 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240813/b0931df6/attachment.bin>
More information about the x265-devel
mailing list