[x265] [PATCH 2/4] AArch64: Optimise quant_neon
Hari Limaye
hari.limaye at arm.com
Mon Aug 12 21:15:31 UTC 2024
Unroll quant_neon by a factor of 2, performing operations on 16-bit
elements where possible. Use the value of numCoeff directly when
calculating the number of non-zero elements, rather than recomputing
the element count with an add instruction in the loop.
Also remove an unnecessary zeroed vector register by using the form of
the compare instruction that takes an immediate zero instead.
Relative performance observed compared to the existing Neon
implementation:
Neoverse N1: 1.57x
Neoverse V1: 1.59x
Neoverse N2: 1.54x
Neoverse V2: 1.59x
---
source/common/aarch64/pixel-util.S | 53 +++++++++++++++---------------
1 file changed, 27 insertions(+), 26 deletions(-)
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 1df49ba6e..e56b6d8cf 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -1795,55 +1795,56 @@ endfunc
function PFX(quant_neon)
mov w9, #1
lsl w9, w9, w4
- dup v0.2s, w9
+ dup v0.4s, w9
neg w9, w4
dup v1.4s, w9
add w9, w9, #8
dup v2.4s, w9
dup v3.4s, w5
+ movi v31.2d, #0
- lsr w6, w6, #2
- eor v4.16b, v4.16b, v4.16b
- eor w10, w10, w10
- eor v17.16b, v17.16b, v17.16b
+ lsr w7, w6, #3
+ movi v4.2d, #0
.Loop_quant:
- ld1 {v18.4h}, [x0], #8
- ld1 {v7.4s}, [x1], #16
- sxtl v6.4s, v18.4h
+ ld1 {v18.8h}, [x0], #16
+ ld1 {v20.4s, v21.4s}, [x1], #32
- cmlt v5.4s, v6.4s, #0
+ sabdl v6.4s, v18.4h, v31.4h
+ sabdl2 v26.4s, v18.8h, v31.8h
- abs v6.4s, v6.4s
-
-
- mul v6.4s, v6.4s, v7.4s
+ mul v6.4s, v6.4s, v20.4s
+ mul v26.4s, v26.4s, v21.4s
add v7.4s, v6.4s, v3.4s
+ add v27.4s, v26.4s, v3.4s
sshl v7.4s, v7.4s, v1.4s
+ sshl v27.4s, v27.4s, v1.4s
- mls v6.4s, v7.4s, v0.s[0]
+ mls v6.4s, v7.4s, v0.4s
+ mls v26.4s, v27.4s, v0.4s
sshl v16.4s, v6.4s, v2.4s
- st1 {v16.4s}, [x2], #16
+ sshl v17.4s, v26.4s, v2.4s
+ st1 {v16.4s, v17.4s}, [x2], #32
// numsig
- cmeq v16.4s, v7.4s, v17.4s
- add v4.4s, v4.4s, v16.4s
- add w10, w10, #4
+ uzp1 v7.8h, v7.8h, v27.8h
+ cmeq v16.8h, v7.8h, #0
+ add v4.8h, v4.8h, v16.8h
// level *= sign
+ cmlt v5.8h, v18.8h, #0
eor v16.16b, v7.16b, v5.16b
- sub v16.4s, v16.4s, v5.4s
- sqxtn v5.4h, v16.4s
- st1 {v5.4h}, [x3], #8
+ sub v5.8h, v16.8h, v5.8h
+ st1 {v5.8h}, [x3], #16
- subs w6, w6, #1
- b.ne .Loop_quant
+ subs w7, w7, #1
+ b.ne .Loop_quant
- addv s4, v4.4s
- mov w9, v4.s[0]
- add w0, w10, w9
+ saddlv s4, v4.8h
+ fmov w9, s4
+ add w0, w6, w9
ret
endfunc
--
2.42.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0002-AArch64-Optimise-quant_neon.patch
Type: text/x-patch
Size: 3758 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240812/ca9a8b3c/attachment.bin>
More information about the x265-devel
mailing list