[x265] [PATCH 4/6] asm-primitives.cpp: Delete dequant_scaling SVE2 implementation
George Steed
george.steed at arm.com
Mon Jan 6 17:17:21 UTC 2025
The implementation makes use of Neon instructions, which only operate on
the low 128 bits of an SVE register, meaning that for longer vectors the
function gives incorrect output. We could maintain the kernel and only
use it if the vector length is 128 bits; however, it appears to give no
benefit in this case, so simply remove it instead.
---
source/common/aarch64/asm-primitives.cpp | 3 +-
source/common/aarch64/pixel-util-sve2.S | 79 ------------------------
2 files changed, 1 insertion(+), 81 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 478b6943a..c1f41cb3c 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -969,8 +969,7 @@ void setupSve2Primitives(EncoderPrimitives &p)
p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_sve2);
p.scale1D_128to64[ALIGNED] = PFX(scale1D_128to64_sve2);
- // dequant_scaling
- p.dequant_scaling = PFX(dequant_scaling_sve2);
+ // dequant_normal
p.dequant_normal = PFX(dequant_normal_sve2);
// ssim_4x4x2_core
diff --git a/source/common/aarch64/pixel-util-sve2.S b/source/common/aarch64/pixel-util-sve2.S
index b2b4d24c1..5b872f437 100644
--- a/source/common/aarch64/pixel-util-sve2.S
+++ b/source/common/aarch64/pixel-util-sve2.S
@@ -867,85 +867,6 @@ function PFX(scale1D_128to64_sve2)
ret
endfunc
-/***** dequant_scaling*****/
-// void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
-function PFX(dequant_scaling_sve2)
- ptrue p0.h, vl8
- add x5, x5, #4 // shift + 4
- lsr x3, x3, #3 // num / 8
- cmp x5, x4
- blt .dequant_skip_sve2
-
- mov x12, #1
- sub x6, x5, x4 // shift - per
- sub x6, x6, #1 // shift - per - 1
- lsl x6, x12, x6 // 1 << shift - per - 1 (add)
- mov z0.s, w6
- sub x7, x4, x5 // per - shift
- mov z3.s, w7
-
-.dequant_loop1_sve2:
- ld1h {z19.h}, p0/z, [x0]
- ld1w {z2.s}, p0/z, [x1]
- add x1, x1, #16
- ld1w {z20.s}, p0/z, [x1]
- add x0, x0, #16
- add x1, x1, #16
-
- sub x3, x3, #1
- sunpklo z1.s, z19.h
- sunpkhi z19.s, z19.h
-
- mul z1.s, z1.s, z2.s // quantCoef * deQuantCoef
- mul z19.s, z19.s, z20.s
- add z1.s, z1.s, z0.s // quantCoef * deQuantCoef + add
- add z19.s, z19.s, z0.s
-
- // No equivalent instructions in SVE2 for sshl
- // as sqshl has double latency
- sshl v1.4s, v1.4s, v3.4s
- sshl v19.4s, v19.4s, v3.4s
-
- sqxtnb z16.h, z1.s
- sqxtnb z17.h, z19.s
- st1h {z16.s}, p0, [x2]
- st1h {z17.s}, p0, [x2, #1, mul vl]
- add x2, x2, #16
- cbnz x3, .dequant_loop1_sve2
- ret
-
-.dequant_skip_sve2:
- sub x6, x4, x5 // per - shift
- mov z0.h, w6
-
-.dequant_loop2_sve2:
- ld1h {z19.h}, p0/z, [x0]
- ld1w {z2.s}, p0/z, [x1]
- add x1, x1, #16
- ld1w {z20.s}, p0/z, [x1]
- add x0, x0, #16
- add x1, x1, #16
-
-
- sub x3, x3, #1
- sunpklo z1.s, z19.h
- sunpkhi z19.s, z19.h
-
- mul z1.s, z1.s, z2.s // quantCoef * deQuantCoef
- mul z19.s, z19.s, z20.s
-
- // Keeping NEON instructions here in order to have
- // one sqshl later
- sqxtn v16.4h, v1.4s // x265_clip3
- sqxtn2 v16.8h, v19.4s
-
- sqshl z16.h, p0/m, z16.h, z0.h // coefQ << per - shift
- st1h {z16.h}, p0, [x2]
- add x2, x2, #16
- cbnz x3, .dequant_loop2_sve2
- ret
-endfunc
-
// void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
function PFX(dequant_normal_sve2)
lsr w2, w2, #4 // num / 16
--
2.34.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0004-asm-primitives.cpp-Delete-dequant_scaling-SVE2-imple.patch
Type: text/x-diff
Size: 4720 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250106/d42c9ad7/attachment-0001.patch>
More information about the x265-devel
mailing list