[x265] [PATCH 2/2] AArch64: Fix SVE2 asm implementation of dequant_normal for HBD
Micro Daryl Robles
microdaryl.robles at arm.com
Mon Apr 7 13:28:57 UTC 2025
The existing SVE2 assembly implementation of dequant_normal only works for
standard bit depth (SBD) builds, so add the missing code to make it work for
high bit depth (HBD) builds as well.
---
source/common/aarch64/asm-primitives.cpp | 13 ++++---------
source/common/aarch64/pixel-util-sve2.S | 17 +++++++++++++----
2 files changed, 17 insertions(+), 13 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index a8560d269..f9f03f423 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -899,9 +899,9 @@ void setupSvePrimitives(EncoderPrimitives &p)
#endif // defined(HAVE_SVE2) || defined(HAVE_SVE)
#if defined(HAVE_SVE2)
-#if !HIGH_BIT_DEPTH
void setupSve2Primitives(EncoderPrimitives &p)
{
+#if !HIGH_BIT_DEPTH
// pixel_avg_pp
LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_pp[NONALIGNED], pixel_avg_pp, sve2);
LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_pp[ALIGNED], pixel_avg_pp, sve2);
@@ -971,9 +971,6 @@ void setupSve2Primitives(EncoderPrimitives &p)
p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_sve2);
p.scale1D_128to64[ALIGNED] = PFX(scale1D_128to64_sve2);
- // dequant_normal
- p.dequant_normal = PFX(dequant_normal_sve2);
-
// ssim_4x4x2_core
p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_sve2);
@@ -989,12 +986,10 @@ void setupSve2Primitives(EncoderPrimitives &p)
p.cu[BLOCK_16x16].normFact = PFX(normFact16_sve2);
p.cu[BLOCK_32x32].normFact = PFX(normFact32_sve2);
p.cu[BLOCK_64x64].normFact = PFX(normFact64_sve2);
-}
-#else // !HIGH_BIT_DEPTH
-void setupSve2Primitives(EncoderPrimitives &)
-{
-}
#endif // !HIGH_BIT_DEPTH
+
+ p.dequant_normal = PFX(dequant_normal_sve2);
+}
#endif // defined(HAVE_SVE2)
#ifdef HAVE_NEON_DOTPROD
diff --git a/source/common/aarch64/pixel-util-sve2.S b/source/common/aarch64/pixel-util-sve2.S
index c7ff0b35e..56a2253ea 100644
--- a/source/common/aarch64/pixel-util-sve2.S
+++ b/source/common/aarch64/pixel-util-sve2.S
@@ -869,7 +869,16 @@ endfunc
// void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
function PFX(dequant_normal_sve2)
- lsr w2, w2, #4 // num / 16
+// X265_CHECK(num >= 4 * 4, "dequant num %d too small\n", num);
+// X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
+// X265_CHECK((num % 16) == 0, "dequant num %d not multiple of 16\n", num);
+#if HIGH_BIT_DEPTH
+ cmp w3, #32768
+ blt .dqn_skip
+ lsr w3, w3, #(BIT_DEPTH - 8)
+ sub w4, w4, #(BIT_DEPTH - 8)
+.dqn_skip:
+#endif
neg w4, w4
mov z0.h, w3
mov z1.s, w4
@@ -893,7 +902,7 @@ function PFX(dequant_normal_sve2)
sqxtn v3.4h, v18.4s
sqxtn2 v3.8h, v19.4s
- sub w2, w2, #1
+ sub w2, w2, #16
st1 {v2.8h, v3.8h}, [x1], #32
cbnz w2, .dqn_loop1_sve2
ret
@@ -910,8 +919,8 @@ function PFX(dequant_normal_sve2)
sqxtnb z2.h, z16.s
sqxtnt z2.h, z17.s
-
- sub w2, w2, #1
+
+ sub w2, w2, #16
st1h {z2.h}, p0, [x1]
add x1, x1, #32
cbnz w2, .gt_16_dqn_loop1_sve2
--
2.34.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0002-AArch64-Fix-SVE2-asm-implementation-of-dequant_norma.patch
Type: text/x-diff
Size: 3752 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250407/2b8886d6/attachment.patch>
More information about the x265-devel mailing list