[x265] [PATCH 02/18] AArch64: Refactor output variables in Neon sa8d helper

Tue Aug 13 15:19:08 UTC 2024

Refactor the helper function _sa8d_8x8_neon_end to use separate input
and output parameters, and declare the output parameters with the
correct element bit-width (16 or 32) depending on the value of
HIGH_BIT_DEPTH.
---
 source/common/aarch64/pixel-prim.cpp | 72 ++++++++++++++++------------
 1 file changed, 41 insertions(+), 31 deletions(-)

diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index de9ddfe8a..2d7cc42ed 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -498,9 +498,17 @@ int pixel_satd_16x16_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8
 }
 #endif      //HIGH_BIT_DEPTH
 
+#if HIGH_BIT_DEPTH
+typedef int32x4_t sa8d_out_type;
+#else
+typedef int16x8_t sa8d_out_type;
+#endif
 
-static inline void _sa8d_8x8_neon_end(int16x8_t &v0, int16x8_t &v1, int16x8_t v2, int16x8_t v3,
-                                      int16x8_t v20, int16x8_t v21, int16x8_t v22, int16x8_t v23)
+static inline void _sa8d_8x8_neon_end(int16x8_t v0, int16x8_t v1, int16x8_t v2,
+                                      int16x8_t v3, int16x8_t v20,
+                                      int16x8_t v21, int16x8_t v22,
+                                      int16x8_t v23, sa8d_out_type &out0,
+                                      sa8d_out_type &out1)
 {
     int16x8_t v16, v17, v18, v19;
     int16x8_t v4, v5, v6, v7;
@@ -560,15 +568,15 @@ static inline void _sa8d_8x8_neon_end(int16x8_t &v0, int16x8_t &v1, int16x8_t v2
     v19 = vmaxq_u16(v19, v23);
 
 #if HIGH_BIT_DEPTH
-    v0 = vpaddlq_u16(v16);
-    v1 = vpaddlq_u16(v17);
-    v0 = vpadalq_u16(v0, v18);
-    v1 = vpadalq_u16(v1, v19);
+    out0 = vpaddlq_u16(v16);
+    out1 = vpaddlq_u16(v17);
+    out0 = vpadalq_u16(out0, v18);
+    out1 = vpadalq_u16(out1, v19);
 
 #else //HIGH_BIT_DEPTH
 
-    v0 = vaddq_u16(v16, v17);
-    v1 = vaddq_u16(v18, v19);
+    out0 = vaddq_u16(v16, v17);
+    out1 = vaddq_u16(v18, v19);
 
 #endif //HIGH_BIT_DEPTH
 
@@ -674,8 +682,8 @@ static inline void _sa8d_8x8_neon_end(int16x8_t &v0, int16x8_t &v1, int16x8_t v2
     v18l = vaddq_u32(v18l, v18h);
     v19l = vaddq_u32(v19l, v19h);
 
-    v0 = vaddq_u32(v16l, v17l);
-    v1 = vaddq_u32(v18l, v19l);
+    out0 = vaddq_u32(v16l, v17l);
+    out1 = vaddq_u32(v18l, v19l);
 
 
 #endif
@@ -724,15 +732,16 @@ int pixel_sa8d_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pi
 {
     int16x8_t v0, v1, v2, v3;
     int16x8_t v20, v21, v22, v23;
+    sa8d_out_type res0, res1;
 
     _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
-    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
+    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
 
 #if HIGH_BIT_DEPTH
-    int32x4_t s = vaddq_u32(v0, v1);
+    int32x4_t s = vaddq_u32(res0, res1);
     return (vaddvq_u32(s) + 1) >> 1;
 #else
-    return (vaddlvq_s16(vaddq_u16(v0, v1)) + 1) >> 1;
+    return (vaddlvq_s16(vaddq_u16(res0, res1)) + 1) >> 1;
 #endif
 }
 
@@ -744,51 +753,52 @@ int pixel_sa8d_16x16_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *
 {
     int16x8_t v0, v1, v2, v3;
     int16x8_t v20, v21, v22, v23;
+    sa8d_out_type res0, res1;
     int32x4_t v30, v31;
 
     _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
-    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
+    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
 
 #if !(HIGH_BIT_DEPTH)
-    v30 = vpaddlq_u16(v0);
-    v31 = vpaddlq_u16(v1);
+    v30 = vpaddlq_u16(res0);
+    v31 = vpaddlq_u16(res1);
 #else
-    v30 = vaddq_s32(v0, v1);
+    v30 = vaddq_s32(res0, res1);
 #endif
 
     _sub_8x8_fly(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
-    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
+    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
 
 #if !(HIGH_BIT_DEPTH)
-    v30 = vpadalq_u16(v30, v0);
-    v31 = vpadalq_u16(v31, v1);
+    v30 = vpadalq_u16(v30, res0);
+    v31 = vpadalq_u16(v31, res1);
 #else
-    v31 = vaddq_s32(v0, v1);
+    v31 = vaddq_s32(res0, res1);
 #endif
 
 
     _sub_8x8_fly(pix1 + 8 * stride_pix1, stride_pix1, pix2 + 8 * stride_pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22,
                  v23);
-    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
+    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
 
 #if !(HIGH_BIT_DEPTH)
-    v30 = vpadalq_u16(v30, v0);
-    v31 = vpadalq_u16(v31, v1);
+    v30 = vpadalq_u16(v30, res0);
+    v31 = vpadalq_u16(v31, res1);
 #else
-    v30 = vaddq_s32(v30, v0);
-    v31 = vaddq_s32(v31, v1);
+    v30 = vaddq_s32(v30, res0);
+    v31 = vaddq_s32(v31, res1);
 #endif
 
     _sub_8x8_fly(pix1 + 8 * stride_pix1 + 8, stride_pix1, pix2 + 8 * stride_pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21,
                  v22, v23);
-    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23);
+    _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
 
 #if !(HIGH_BIT_DEPTH)
-    v30 = vpadalq_u16(v30, v0);
-    v31 = vpadalq_u16(v31, v1);
+    v30 = vpadalq_u16(v30, res0);
+    v31 = vpadalq_u16(v31, res1);
 #else
-    v30 = vaddq_s32(v30, v0);
-    v31 = vaddq_s32(v31, v1);
+    v30 = vaddq_s32(v30, res0);
+    v31 = vaddq_s32(v31, res1);
 #endif
 
     v30 = vaddq_u32(v30, v31);
-- 
2.42.1

-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0002-AArch64-Refactor-output-variables-in-Neon-sa8d-helpe.patch
Type: text/x-patch
Size: 5808 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240813/3dc3a813/attachment.bin>