[x265] [PATCH v2 4/7] AArch64: Add Neon saoCuStats primitives for high bitdepth

Wed May 22 19:07:23 UTC 2024

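Add Neon-optimised implementations of the saoCuStats primitives for
high bit-depth builds, as none currently exist. As part of this,
sao-prim.cpp is now compiled, and setupSaoPrimitives_neon() is now
called, regardless of HIGH_BIT_DEPTH.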

Results of speed tests, relative to the existing C code, when compiled
with LLVM 17 (speedup factor; higher is better):

Neoverse V1:
    saoCuStatsBO |  1.09x
    saoCuStatsE0 |  2.39x
    saoCuStatsE1 |  2.67x
    saoCuStatsE2 |  2.47x
    saoCuStatsE3 |  2.86x
---
 source/common/CMakeLists.txt             |  7 +-----
 source/common/aarch64/asm-primitives.cpp |  2 --
 source/common/aarch64/sao-prim.cpp       | 32 +++++++++++++++++++++---
 3 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 4b00a521c..1d7cd3cd0 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -103,12 +103,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
         add_definitions(-DAUTO_VECTORIZE=1)
     endif()
 
-    set(C_SRCS asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h)
-
-    if(NOT HIGH_BIT_DEPTH)
-        list(APPEND C_SRCS sao-prim.cpp)
-    endif()
-
+    set(C_SRCS asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp)
     enable_language(ASM)
 
     # add ARM assembly/intrinsic files here
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 907ad3a16..afcab7ad7 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -1950,9 +1950,7 @@ void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
         setupDCTPrimitives_neon(p);
         setupLoopFilterPrimitives_neon(p);
         setupIntraPrimitives_neon(p);
-#if !HIGH_BIT_DEPTH
         setupSaoPrimitives_neon(p);
-#endif
     }
 }
 
diff --git a/source/common/aarch64/sao-prim.cpp b/source/common/aarch64/sao-prim.cpp
index 4f70d76f7..d4133196f 100644
--- a/source/common/aarch64/sao-prim.cpp
+++ b/source/common/aarch64/sao-prim.cpp
@@ -27,12 +27,28 @@
 
 static inline int8x16_t signOf_neon(const pixel *a, const pixel *b)
 {
+#if HIGH_BIT_DEPTH
+    uint16x8_t s0_lo = vld1q_u16(a);
+    uint16x8_t s0_hi = vld1q_u16(a + 8);
+    uint16x8_t s1_lo = vld1q_u16(b);
+    uint16x8_t s1_hi = vld1q_u16(b + 8);
+
+    // signOf(a - b) = -(a > b ? -1 : 0) | (a < b ? -1 : 0)
+    int16x8_t cmp0_lo = vreinterpretq_s16_u16(vcgtq_u16(s0_lo, s1_lo));
+    int16x8_t cmp0_hi = vreinterpretq_s16_u16(vcgtq_u16(s0_hi, s1_hi));
+    int16x8_t cmp1_lo = vreinterpretq_s16_u16(vcgtq_u16(s1_lo, s0_lo));
+    int16x8_t cmp1_hi = vreinterpretq_s16_u16(vcgtq_u16(s1_hi, s0_hi));
+
+    int8x16_t cmp0 = vcombine_s8(vmovn_s16(cmp0_lo), vmovn_s16(cmp0_hi));
+    int8x16_t cmp1 = vcombine_s8(vmovn_s16(cmp1_lo), vmovn_s16(cmp1_hi));
+#else // HIGH_BIT_DEPTH
     uint8x16_t s0 = vld1q_u8(a);
     uint8x16_t s1 = vld1q_u8(b);
 
     // signOf(a - b) = -(a > b ? -1 : 0) | (a < b ? -1 : 0)
     int8x16_t cmp0 = vreinterpretq_s8_u8(vcgtq_u8(s0, s1));
     int8x16_t cmp1 = vreinterpretq_s8_u8(vcgtq_u8(s1, s0));
+#endif // HIGH_BIT_DEPTH
     return vorrq_s8(vnegq_s8(cmp0), cmp1);
 }
 
@@ -138,6 +154,14 @@ namespace X265_NS {
 void saoCuStatsBO_neon(const int16_t *diff, const pixel *rec, intptr_t stride,
                        int endX, int endY, int32_t *stats, int32_t *count)
 {
+#if HIGH_BIT_DEPTH
+    const int n_elem = 4;
+    const int elem_width = 16;
+#else
+    const int n_elem = 8;
+    const int elem_width = 8;
+#endif
+
     // Additional temporary buffer for accumulation.
     int32_t stats_tmp[32] = { 0 };
     int32_t count_tmp[32] = { 0 };
@@ -163,15 +187,15 @@ void saoCuStatsBO_neon(const int16_t *diff, const pixel *rec, intptr_t stride,
     for (int y = 0; y < endY; y++)
     {
         int x = 0;
-        for (; x + 8 < endX; x += 8)
+        for (; x + n_elem < endX; x += n_elem)
         {
             uint64_t class_idx_64 =
                 *reinterpret_cast<const uint64_t *>(rec + x) >> shift;
 
-            for (int i = 0; i < 8; ++i)
+            for (int i = 0; i < n_elem; ++i)
             {
                 const int idx = i & 1;
-                const int off  = (class_idx_64 >> (i * 8)) & mask;
+                const int off  = (class_idx_64 >> (i * elem_width)) & mask;
                 *reinterpret_cast<uint32_t*>(stats_b[idx] + off) += diff[x + i];
                 *reinterpret_cast<uint32_t*>(count_b[idx] + off) += 1;
             }
@@ -185,7 +209,7 @@ void saoCuStatsBO_neon(const int16_t *diff, const pixel *rec, intptr_t stride,
             for (int i = 0; (i + x) < endX; ++i)
             {
                 const int idx = i & 1;
-                const int off  = (class_idx_64 >> (i * 8)) & mask;
+                const int off  = (class_idx_64 >> (i * elem_width)) & mask;
                 *reinterpret_cast<uint32_t*>(stats_b[idx] + off) += diff[x + i];
                 *reinterpret_cast<uint32_t*>(count_b[idx] + off) += 1;
             }
-- 
2.42.1

-------------- next part --------------
A non-text attachment was scrubbed...
Name: v2-0004-AArch64-Add-Neon-saoCuStats-primitives-for-high-b.patch
Type: text/x-patch
Size: 5553 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240522/f8f7c384/attachment.bin>