[x265] [PATCH 4/4] AArch64: Optimise nquant_neon
Hari Limaye
hari.limaye at arm.com
Mon Aug 12 21:16:22 UTC 2024
Unroll nquant_neon by a factor of 2, performing operations on 16-bit
elements where possible, and use the value of numCoeff directly to
calculate the number of non-zero elements rather than recomputing the
coefficient count with an add instruction in the loop.
Also avoid using a zeroed vector register for the compare-against-zero
by using the form of the instruction that takes an immediate value instead.
Relative performance observed compared to the existing Neon
implementation:
Neoverse N1: 1.79x
Neoverse V1: 1.77x
Neoverse N2: 1.70x
Neoverse V2: 1.73x
---
source/common/aarch64/pixel-util.S | 50 ++++++++++++++++--------------
1 file changed, 26 insertions(+), 24 deletions(-)
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index e56b6d8cf..5d8cc8c8e 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -1854,41 +1854,43 @@ function PFX(nquant_neon)
dup v0.4s, w12 // q0= -qbits
dup v1.4s, w4 // add
- lsr w5, w5, #2
+ lsr w6, w5, #3
movi v4.4s, #0 // v4= accumulate numsig
- mov x4, #0
- movi v22.4s, #0
+ movi v5.2d, #0 // v5= zero-vector for SABDL(2)
.Loop_nquant:
- ld1 {v16.4h}, [x0], #8
- sub w5, w5, #1
- sxtl v19.4s, v16.4h // v19 = coef[blockpos]
+ ld1 {v16.8h}, [x0], #16
+ sub w6, w6, #1
- cmlt v18.4s, v19.4s, #0 // v18 = sign
+ sabdl v17.4s, v16.4h, v5.4h // v17 = level=abs(coef[blockpos])
+ sabdl2 v18.4s, v16.8h, v5.8h // v18 = level=abs(coef[blockpos])
- abs v19.4s, v19.4s // v19 = level=abs(coef[blockpos])
- ld1 {v20.4s}, [x1], #16 // v20 = quantCoeff[blockpos]
- mul v19.4s, v19.4s, v20.4s // v19 = tmplevel = abs(level) * quantCoeff[blockpos];
+ ld1 {v19.4s, v20.4s}, [x1], #32 // v19, v20 = quantCoeff[blockpos]
+ mul v17.4s, v17.4s, v19.4s // v17 = tmplevel = abs(level) * quantCoeff[blockpos];
+ mul v18.4s, v18.4s, v20.4s // v18 = tmplevel = abs(level) * quantCoeff[blockpos];
- add v20.4s, v19.4s, v1.4s // v20 = tmplevel+add
- sshl v20.4s, v20.4s, v0.4s // v20 = level =(tmplevel+add) >> qbits
+ add v19.4s, v17.4s, v1.4s // v19 = tmplevel+add
+ add v20.4s, v18.4s, v1.4s // v20 = tmplevel+add
+ sshl v19.4s, v19.4s, v0.4s // v19 = level =(tmplevel+add) >> qbits
+ sshl v20.4s, v20.4s, v0.4s // v20 = level =(tmplevel+add) >> qbits
// numsig
- cmeq v21.4s, v20.4s, v22.4s
- add v4.4s, v4.4s, v21.4s
- add x4, x4, #4
+ uzp1 v19.8h, v19.8h, v20.8h
+ cmeq v20.8h, v19.8h, #0
+ add v4.8h, v4.8h, v20.8h
- eor v21.16b, v20.16b, v18.16b
- sub v21.4s, v21.4s, v18.4s
- sqxtn v16.4h, v21.4s
- abs v17.4h, v16.4h
- st1 {v17.4h}, [x2], #8
+ // level *= sign
+ cmlt v16.8h, v16.8h, #0
+ eor v19.16b, v19.16b, v16.16b
+ sub v19.8h, v19.8h, v16.8h
+ abs v19.8h, v19.8h
+ st1 {v19.8h}, [x2], #16
- cbnz w5, .Loop_nquant
+ cbnz w6, .Loop_nquant
- uaddlv d4, v4.4s
- fmov x12, d4
- add x0, x4, x12
+ saddlv s4, v4.8h
+ fmov w9, s4
+ add w0, w5, w9
ret
endfunc
--
2.42.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0004-AArch64-Optimise-nquant_neon.patch
Type: text/x-patch
Size: 4119 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240812/db80e008/attachment-0001.bin>
More information about the x265-devel
mailing list