[x265] [PATCH 1/2] AArch64: Fix Neon asm implementation of dequant_normal for HBD
Micro Daryl Robles
microdaryl.robles at arm.com
Mon Apr 7 13:28:50 UTC 2025
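The existing Neon assembly implementation of dequant_normal only works for
standard bit-depth (SBD) builds, so add the missing code to make it work for
high bit-depth (HBD) builds as well.
The scale and shift values passed to REPORT_SPEEDUP are also updated so that
the benchmark fully exercises the high bit-depth code path.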
---
source/common/aarch64/asm-primitives.cpp | 2 +-
source/common/aarch64/pixel-util.S | 13 +++++++++++--
source/test/mbdstharness.cpp | 6 ++++--
3 files changed, 16 insertions(+), 5 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index c1317eb74..a8560d269 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -714,7 +714,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
// dequant_scaling
p.dequant_scaling = PFX(dequant_scaling_neon);
- p.dequant_normal = PFX(dequant_normal_neon);
// ssim_4x4x2_core
p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
@@ -743,6 +742,7 @@ void setupNeonPrimitives(EncoderPrimitives &p)
#endif
// quant
+ p.dequant_normal = PFX(dequant_normal_neon);
p.quant = PFX(quant_neon);
p.nquant = PFX(nquant_neon);
}
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 1825466ea..495bac1fa 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -1626,7 +1626,16 @@ endfunc
// void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
function PFX(dequant_normal_neon)
- lsr w2, w2, #4 // num / 16
+// X265_CHECK(num >= 4 * 4, "dequant num %d too small\n", num);
+// X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
+// X265_CHECK((num % 16) == 0, "dequant num %d not multiple of 16\n", num);
+#if HIGH_BIT_DEPTH
+ cmp w3, #32768
+ blt .dqn_skip
+ lsr w3, w3, #(BIT_DEPTH - 8)
+ sub w4, w4, #(BIT_DEPTH - 8)
+.dqn_skip:
+#endif
neg w4, w4
dup v0.8h, w3
dup v1.4s, w4
@@ -1648,7 +1657,7 @@ function PFX(dequant_normal_neon)
sqxtn v3.4h, v18.4s
sqxtn2 v3.8h, v19.4s
- sub w2, w2, #1
+ sub w2, w2, #16
st1 {v2.8h, v3.8h}, [x1], #32
cbnz w2, .dqn_loop1
ret
diff --git a/source/test/mbdstharness.cpp b/source/test/mbdstharness.cpp
index cceadd833..05027d109 100644
--- a/source/test/mbdstharness.cpp
+++ b/source/test/mbdstharness.cpp
@@ -524,7 +524,7 @@ bool MBDstHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr
{
if (!check_dequant_primitive(ref.dequant_normal, opt.dequant_normal))
{
- printf("dequant: Failed!\n");
+ printf("dequant_normal: Failed!\n");
return false;
}
}
@@ -655,8 +655,10 @@ void MBDstHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
if (opt.dequant_normal)
{
+ int scale = 72 << X265_DEPTH;
+ int shift = X265_DEPTH - 4;
printf("dequant_normal\t");
- REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, 70, 1);
+ REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, scale, shift);
}
if (opt.dequant_scaling)
--
2.34.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-AArch64-Fix-Neon-asm-implementation-of-dequant_norma.patch
Type: text/x-diff
Size: 3866 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250407/f4e31bda/attachment.patch>
More information about the x265-devel mailing list