[x265] [PATCH 1/2] AArch64: Add Neon implementation of findPosFirstLast
Micro Daryl Robles
microdaryl.robles at arm.com
Tue Apr 8 15:13:37 UTC 2025
Relative performance compared to scalar C:
Neoverse N1: 4.97-5.87x
Neoverse N2: 3.80-4.87x
Neoverse V1: 4.70-5.41x
Neoverse V2: 3.79-4.91x
---
source/common/aarch64/dct-prim.cpp | 55 +++++++++++++++++++++++++++++-
source/common/threading.h | 6 ++++
source/test/pixelharness.cpp | 12 +++++--
3 files changed, 70 insertions(+), 3 deletions(-)
diff --git a/source/common/aarch64/dct-prim.cpp b/source/common/aarch64/dct-prim.cpp
index dea20e522..6a3d95e91 100644
--- a/source/common/aarch64/dct-prim.cpp
+++ b/source/common/aarch64/dct-prim.cpp
@@ -1862,6 +1862,59 @@ void idct32_neon(const int16_t *src, int16_t *dst, intptr_t dstStride)
partialButterflyInverse32_neon<shift_pass2>(coef, dst, dstStride);
}
+uint32_t findPosFirstLast_neon(const int16_t *coeff, const intptr_t trSize,
+ const uint16_t scanTbl[16])
+{
+ X265_CHECK(SCAN_SET_SIZE == 16, "SCAN_SET_SIZE must be 16\n");
+ X265_CHECK(MLS_CG_SIZE == 4, "MLS_CG_SIZE must be 4\n");
+ X265_CHECK(scanTbl[2] == 1 || scanTbl[2] == 2 || scanTbl[2] == 8,
+ "scanTbl is invalid\n");
+
+ int16x4_t c0 = vld1_s16(&coeff[0 * trSize]); // Four rows of four coeffs, trSize elements apart.
+ int16x4_t c1 = vld1_s16(&coeff[1 * trSize]);
+ int16x4_t c2 = vld1_s16(&coeff[2 * trSize]);
+ int16x4_t c3 = vld1_s16(&coeff[3 * trSize]);
+ int16x8_t coeff01 = vcombine_s16(c0, c1);
+ int16x8_t coeff23 = vcombine_s16(c2, c3);
+
+ // vtst(x, x) sets all bits of a lane when that coeff is non-zero, else clears them.
+ uint16x8_t cmp01 = vtstq_s16(coeff01, coeff01);
+ uint16x8_t cmp23 = vtstq_s16(coeff23, coeff23);
+ uint8x16_t cmp_8bit = vcombine_u8(vmovn_u16(cmp01), vmovn_u16(cmp23)); // One 0xFF/0x00 byte per coeff.
+
+ if (scanTbl[2] != 2) // scanTbl[2] == 2 identifies SCAN_HOR: no reordering is applied for it.
+ {
+ // Load the 16 scan indices and narrow them to bytes to form a TBL index vector.
+ uint16x8_t t0 = vld1q_u16(scanTbl + 0);
+ uint16x8_t t1 = vld1q_u16(scanTbl + 8);
+ uint8x16_t scan_tbl = vcombine_u8(vmovn_u16(t0), vmovn_u16(t1));
+
+ cmp_8bit = vqtbl1q_u8(cmp_8bit, scan_tbl); // Byte n becomes the mask of coeff scanTbl[n].
+ }
+
+ // Shift-right-narrow by 4 packs each byte mask into a nibble: 16 x 4-bit masks in 64 bits.
+ uint64_t cmp_4bit = vget_lane_u64(
+ vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp_8bit), 4)), 0);
+
+ // NOTE: If the coeff block is all zeros, lastNZPosInCG is undefined (returned as
+ // all ones above bit 8) and firstNZPosInCG is 16.
+ if (cmp_4bit == 0)
+ {
+ return (uint32_t)-1 << 8 | SCAN_SET_SIZE;
+ }
+
+ unsigned long id_first, id_last;
+ CTZ64(id_first, cmp_4bit); // Bit index of the lowest set bit.
+ uint32_t firstNZPosInCG = (uint32_t)id_first >> 2; // Four mask bits per scan position.
+ CLZ64(id_last, cmp_4bit); // Bit index of the highest set bit.
+ uint32_t lastNZPosInCG = (uint32_t)id_last >> 2;
+
+ // A widening (long) add is not needed: only the LSB of the sum is kept below.
+ uint32_t absSumSign = (uint32_t)vaddvq_s16(vaddq_s16(coeff01, coeff23));
+
+ return (absSumSign << 31) | (lastNZPosInCG << 8) | firstNZPosInCG; // Bit 31: sum LSB; bits 8+: last; low bits: first.
+}
+
void setupDCTPrimitives_neon(EncoderPrimitives &p)
{
p.cu[BLOCK_4x4].nonPsyRdoQuant = nonPsyRdoQuant_neon<2>;
@@ -1901,7 +1954,7 @@ void setupDCTPrimitives_neon(EncoderPrimitives &p)
p.cu[BLOCK_32x32].psyRdoQuant_2p = psyRdoQuant_neon<5>;
p.scanPosLast = scanPosLast_opt;
-
+ p.findPosFirstLast = findPosFirstLast_neon;
}
};
diff --git a/source/common/threading.h b/source/common/threading.h
index 8a5c39cf0..2a1743738 100644
--- a/source/common/threading.h
+++ b/source/common/threading.h
@@ -60,6 +60,8 @@ int no_atomic_add(int* ptr, int val);
#define CLZ(id, x) id = (unsigned long)__builtin_clz(x) ^ 31
#define CTZ(id, x) id = (unsigned long)__builtin_ctz(x)
+#define CLZ64(id, x) id = (unsigned long)__builtin_clzll(x) ^ 63
+#define CTZ64(id, x) id = (unsigned long)__builtin_ctzll(x)
#define ATOMIC_OR(ptr, mask) no_atomic_or((int*)ptr, mask)
#define ATOMIC_AND(ptr, mask) no_atomic_and((int*)ptr, mask)
#define ATOMIC_INC(ptr) no_atomic_inc((int*)ptr)
@@ -74,6 +76,8 @@ int no_atomic_add(int* ptr, int val);
#define CLZ(id, x) id = (unsigned long)__builtin_clz(x) ^ 31
#define CTZ(id, x) id = (unsigned long)__builtin_ctz(x)
+#define CLZ64(id, x) id = (unsigned long)__builtin_clzll(x) ^ 63
+#define CTZ64(id, x) id = (unsigned long)__builtin_ctzll(x)
#define ATOMIC_OR(ptr, mask) __sync_fetch_and_or(ptr, mask)
#define ATOMIC_AND(ptr, mask) __sync_fetch_and_and(ptr, mask)
#define ATOMIC_INC(ptr) __sync_add_and_fetch((volatile int32_t*)ptr, 1)
@@ -87,6 +91,8 @@ int no_atomic_add(int* ptr, int val);
#define CLZ(id, x) _BitScanReverse(&id, x)
#define CTZ(id, x) _BitScanForward(&id, x)
+#define CLZ64(id, x) _BitScanReverse64(&id, x)
+#define CTZ64(id, x) _BitScanForward64(&id, x)
#define ATOMIC_INC(ptr) InterlockedIncrement((volatile LONG*)ptr)
#define ATOMIC_DEC(ptr) InterlockedDecrement((volatile LONG*)ptr)
#define ATOMIC_ADD(ptr, val) InterlockedExchangeAdd((volatile LONG*)ptr, val)
diff --git a/source/test/pixelharness.cpp b/source/test/pixelharness.cpp
index 10f66cda1..380390e1a 100644
--- a/source/test/pixelharness.cpp
+++ b/source/test/pixelharness.cpp
@@ -3697,7 +3697,6 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
if (opt.findPosFirstLast)
{
- HEADER0("findPosFirstLast");
coeff_t coefBuf[32 * MLS_CG_SIZE];
memset(coefBuf, 0, sizeof(coefBuf));
// every CG can't be all zeros!
@@ -3705,7 +3704,16 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
coefBuf[3 + 1 * 32] = 0x0BAD;
coefBuf[3 + 2 * 32] = 0x0BAD;
coefBuf[3 + 3 * 32] = 0x0BAD;
- REPORT_SPEEDUP(opt.findPosFirstLast, ref.findPosFirstLast, coefBuf, 32, g_scan4x4[SCAN_DIAG]);
+ const intptr_t trSize = 32;
+ HEADER0("findPosFirstLast[SCAN_DIAG]");
+ REPORT_SPEEDUP(opt.findPosFirstLast, ref.findPosFirstLast, coefBuf, trSize,
+ g_scan4x4[SCAN_DIAG]);
+ HEADER0("findPosFirstLast[SCAN_HOR]");
+ REPORT_SPEEDUP(opt.findPosFirstLast, ref.findPosFirstLast, coefBuf, trSize,
+ g_scan4x4[SCAN_HOR]);
+ HEADER0("findPosFirstLast[SCAN_VER]");
+ REPORT_SPEEDUP(opt.findPosFirstLast, ref.findPosFirstLast, coefBuf, trSize,
+ g_scan4x4[SCAN_VER]);
}
if (opt.costCoeffNxN)
--
2.34.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-AArch64-Add-Neon-implementation-of-findPosFirstLast.patch
Type: text/x-diff
Size: 6744 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250408/66795c2f/attachment-0001.patch>
More information about the x265-devel
mailing list