[x265] [PATCH v2 4/7] AArch64: Add Neon saoCuStats primitives for high bitdepth
Hari Limaye
hari.limaye at arm.com
Wed May 22 19:07:23 UTC 2024
Add Neon-optimised implementations of the saoCuStats primitives for
high bit depth builds, as none currently exist.
Speed test results, expressed as speed-up relative to the existing C
code, when compiled with LLVM 17 (higher is better):
Neoverse V1:
saoCuStatsBO | 1.09x
saoCuStatsE0 | 2.39x
saoCuStatsE1 | 2.67x
saoCuStatsE2 | 2.47x
saoCuStatsE3 | 2.86x
---
source/common/CMakeLists.txt | 7 +-----
source/common/aarch64/asm-primitives.cpp | 2 --
source/common/aarch64/sao-prim.cpp | 32 +++++++++++++++++++++---
3 files changed, 29 insertions(+), 12 deletions(-)
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 4b00a521c..1d7cd3cd0 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -103,12 +103,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
add_definitions(-DAUTO_VECTORIZE=1)
endif()
- set(C_SRCS asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h)
-
- if(NOT HIGH_BIT_DEPTH)
- list(APPEND C_SRCS sao-prim.cpp)
- endif()
-
+ set(C_SRCS asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp)
enable_language(ASM)
# add ARM assembly/intrinsic files here
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 907ad3a16..afcab7ad7 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -1950,9 +1950,7 @@ void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
setupDCTPrimitives_neon(p);
setupLoopFilterPrimitives_neon(p);
setupIntraPrimitives_neon(p);
-#if !HIGH_BIT_DEPTH
setupSaoPrimitives_neon(p);
-#endif
}
}
diff --git a/source/common/aarch64/sao-prim.cpp b/source/common/aarch64/sao-prim.cpp
index 4f70d76f7..d4133196f 100644
--- a/source/common/aarch64/sao-prim.cpp
+++ b/source/common/aarch64/sao-prim.cpp
@@ -27,12 +27,28 @@
static inline int8x16_t signOf_neon(const pixel *a, const pixel *b)
{
+#if HIGH_BIT_DEPTH
+ uint16x8_t s0_lo = vld1q_u16(a);
+ uint16x8_t s0_hi = vld1q_u16(a + 8);
+ uint16x8_t s1_lo = vld1q_u16(b);
+ uint16x8_t s1_hi = vld1q_u16(b + 8);
+
+ // signOf(a - b) = -(a > b ? -1 : 0) | (a < b ? -1 : 0)
+ int16x8_t cmp0_lo = vreinterpretq_s16_u16(vcgtq_u16(s0_lo, s1_lo));
+ int16x8_t cmp0_hi = vreinterpretq_s16_u16(vcgtq_u16(s0_hi, s1_hi));
+ int16x8_t cmp1_lo = vreinterpretq_s16_u16(vcgtq_u16(s1_lo, s0_lo));
+ int16x8_t cmp1_hi = vreinterpretq_s16_u16(vcgtq_u16(s1_hi, s0_hi));
+
+ int8x16_t cmp0 = vcombine_s8(vmovn_s16(cmp0_lo), vmovn_s16(cmp0_hi));
+ int8x16_t cmp1 = vcombine_s8(vmovn_s16(cmp1_lo), vmovn_s16(cmp1_hi));
+#else // HIGH_BIT_DEPTH
uint8x16_t s0 = vld1q_u8(a);
uint8x16_t s1 = vld1q_u8(b);
// signOf(a - b) = -(a > b ? -1 : 0) | (a < b ? -1 : 0)
int8x16_t cmp0 = vreinterpretq_s8_u8(vcgtq_u8(s0, s1));
int8x16_t cmp1 = vreinterpretq_s8_u8(vcgtq_u8(s1, s0));
+#endif // HIGH_BIT_DEPTH
return vorrq_s8(vnegq_s8(cmp0), cmp1);
}
@@ -138,6 +154,14 @@ namespace X265_NS {
void saoCuStatsBO_neon(const int16_t *diff, const pixel *rec, intptr_t stride,
int endX, int endY, int32_t *stats, int32_t *count)
{
+#if HIGH_BIT_DEPTH
+ const int n_elem = 4;
+ const int elem_width = 16;
+#else
+ const int n_elem = 8;
+ const int elem_width = 8;
+#endif
+
// Additional temporary buffer for accumulation.
int32_t stats_tmp[32] = { 0 };
int32_t count_tmp[32] = { 0 };
@@ -163,15 +187,15 @@ void saoCuStatsBO_neon(const int16_t *diff, const pixel *rec, intptr_t stride,
for (int y = 0; y < endY; y++)
{
int x = 0;
- for (; x + 8 < endX; x += 8)
+ for (; x + n_elem < endX; x += n_elem)
{
uint64_t class_idx_64 =
*reinterpret_cast<const uint64_t *>(rec + x) >> shift;
- for (int i = 0; i < 8; ++i)
+ for (int i = 0; i < n_elem; ++i)
{
const int idx = i & 1;
- const int off = (class_idx_64 >> (i * 8)) & mask;
+ const int off = (class_idx_64 >> (i * elem_width)) & mask;
*reinterpret_cast<uint32_t*>(stats_b[idx] + off) += diff[x + i];
*reinterpret_cast<uint32_t*>(count_b[idx] + off) += 1;
}
@@ -185,7 +209,7 @@ void saoCuStatsBO_neon(const int16_t *diff, const pixel *rec, intptr_t stride,
for (int i = 0; (i + x) < endX; ++i)
{
const int idx = i & 1;
- const int off = (class_idx_64 >> (i * 8)) & mask;
+ const int off = (class_idx_64 >> (i * elem_width)) & mask;
*reinterpret_cast<uint32_t*>(stats_b[idx] + off) += diff[x + i];
*reinterpret_cast<uint32_t*>(count_b[idx] + off) += 1;
}
--
2.42.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: v2-0004-AArch64-Add-Neon-saoCuStats-primitives-for-high-b.patch
Type: text/x-patch
Size: 5553 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240522/f8f7c384/attachment.bin>
More information about the x265-devel mailing list