[x265] [PATCH v2] aarch64/pixel-util.S: Optimize scanPosLast_neon
chen
chenm003 at 163.com
Tue Mar 11 02:16:53 UTC 2025
Hi George,
Thanks for the revised patch, it looks good now.
btw: the instruction below was kept unchanged, but the new patch could use w5 instead. Since it does not affect performance, there is no need to modify it in this version; we can do it in a future optimization patch.
cbnz x5, .Loop_spl
Regards,
Chen
At 2025-03-10 18:47:05, "George Steed" <george.steed at arm.com> wrote:
>Hi Chen,
>
>Thank you for the comments, please see v2 patch below/attached.
>
>Note that I have kept the original local labels for ".Loop_spl" and
>".Loop_spl_1", however ".pext_end" is not a local label since it does
>not start with ".L", so I have changed that one to ".Lpext_end".
>
>Thanks,
>George
>
>-- >8 --
>Improve the implementation and clean up comments, in particular:
>
>* The existing code makes use of XTN + XTN2 to narrow data from 16-bits
>  to 8-bits. UZP1 does this in a single instruction.
>
>* The existing code makes use of v31 as a zero vector to allow using
> CMHI for a != 0 comparison. Use CMEQ as a == 0 comparison instead and
> just adjust the surrounding logic to work with the negated condition
> to avoid the need for a zero vector.
>
>* The existing code makes use of repeated ADDV + vector MOV to reduce
>  sums. We can use ADDP to incrementally sum both masks simultaneously
>  which performs better on some micro-architectures.
>
>* Rather than calculating the popcount of the reduced mask, we can
> instead just sum the mask. This takes advantage of the fact that CMEQ
> sets "true" elements to all 1s, which is equivalent to -1 in binary.
> This means that a single ADDV on the mask gives us the negated
> popcount directly.
>
>Taken together these changes reduce the runtime of the scanPosLast_neon
>kernel by 20-35% depending on the micro-architecture and the parameters
>the function is called with.
>---
> source/common/aarch64/pixel-util.S | 85 +++++++++++++-----------------
> 1 file changed, 37 insertions(+), 48 deletions(-)
>
>diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
>index d8b3f4365..1825466ea 100644
>--- a/source/common/aarch64/pixel-util.S
>+++ b/source/common/aarch64/pixel-util.S
>@@ -2213,27 +2213,25 @@ endfunc
> // const uint16_t* scanCG4x4, // x6
> // const int trSize) // x7
> function PFX(scanPosLast_neon)
>- // convert unit of Stride(trSize) to int16_t
>+ // Convert unit of trSize stride from elements (int16) to bytes.
> add x7, x7, x7
>
>- // load scan table and convert to Byte
>+ // Load scan table and convert to bytes.
> ldp q0, q1, [x6]
>- xtn v0.8b, v0.8h
>- xtn2 v0.16b, v1.8h // v0 - Zigzag scan table
>+ uzp1 v0.16b, v0.16b, v1.16b // v0 - Zigzag scan table
>
> movrel x10, g_SPL_and_mask
>- ldr q28, [x10] // v28 = mask for pmovmskb
>- movi v31.16b, #0 // v31 = {0, ..., 0}
>- add x10, x7, x7 // 2*x7
>- add x11, x10, x7 // 3*x7
>- add x9, x4, #1 // CG count
>+ ldr q28, [x10] // v28 = mask for pmovmskb
>+ add x10, x7, x7 // 2*x7
>+ add x11, x7, x7, lsl #1 // 3*x7
>+ add x9, x4, #1 // CG count
>
> .Loop_spl:
>- // position of current CG
>+ // Position of current CG.
> ldrh w6, [x0], #32
> add x6, x1, x6, lsl #1
>
>- // loading current CG
>+ // Loading current CG and saturate to bytes.
> ldr d2, [x6]
> ldr d3, [x6, x7]
> ldr d4, [x6, x10]
>@@ -2243,69 +2241,60 @@ function PFX(scanPosLast_neon)
> sqxtn v2.8b, v2.8h
> sqxtn2 v2.16b, v4.8h
>
>- // Zigzag
>+ // Apply zigzag.
> tbl v3.16b, {v2.16b}, v0.16b
>
>- // get sign
>- cmhi v5.16b, v3.16b, v31.16b // v5 = non-zero
>- cmlt v3.16b, v3.16b, #0 // v3 = negative
>+ // Get zero/sign.
>+ cmeq v5.16b, v3.16b, #0 // v5 = zero
>+ cmlt v3.16b, v3.16b, #0 // v3 = negative
>
>- // val - w13 = pmovmskb(v3)
>+ // val: w13 = pmovmskb(v3)
>+ // mask: w15 = pmovmskb(v4)
> and v3.16b, v3.16b, v28.16b
>- mov d4, v3.d[1]
>- addv b23, v3.8b
>- addv b24, v4.8b
>- mov v23.b[1], v24.b[0]
>- fmov w13, s23
>-
>- // mask - w15 = pmovmskb(v5)
>- and v5.16b, v5.16b, v28.16b
>- mov d6, v5.d[1]
>- addv b25, v5.8b
>- addv b26, v6.8b
>- mov v25.b[1], v26.b[0]
>- fmov w15, s25
>+ bic v4.16b, v28.16b, v5.16b
>+ addp v3.16b, v3.16b, v4.16b
>+ addp v3.16b, v3.16b, v3.16b
>+ addp v3.16b, v3.16b, v3.16b
>+ fmov w13, s3
>+ lsr w15, w13, #16
>+
>+ // coeffNum = addv(v3 != 0) = 16 - addv(v5)
>+ addv b5, v5.16b
>+ smov w6, v5.b[0]
>+ add w6, w6, #16
>+ sub w5, w5, w6
>+ strb w6, [x4], #1
>
> // coeffFlag = reverse_bit(w15) in 16-bit
>- rbit w12, w15
>- lsr w12, w12, #16
>- fmov s30, w12
>+ rbit w12, w13
> strh w12, [x3], #2
>
>- // accelerate by preparing w13 = w13 & w15
>+ // Pack bits from w13 into w14, based on w15 mask.
> and w13, w13, w15
> mov x14, xzr
>+ cbz w15, .Lpext_end
> .Loop_spl_1:
>- cbz w15, .pext_end
> clz w6, w15
> lsl w13, w13, w6
> lsl w15, w15, w6
> extr w14, w14, w13, #31
>- bfm w15, wzr, #1, #0
>- b .Loop_spl_1
>-.pext_end:
>+ bfc w15, #31, #1
>+ cbnz w15, .Loop_spl_1
>+.Lpext_end:
> strh w14, [x2], #2
>
>- // compute coeffNum = popcount(coeffFlag)
>- cnt v30.8b, v30.8b
>- addp v30.8b, v30.8b, v30.8b
>- fmov w6, s30
>- sub x5, x5, x6
>- strb w6, [x4], #1
>-
> cbnz x5, .Loop_spl
>
>- // count trailing zeros
>+ // Count trailing zeros.
> rbit w13, w12
> clz w13, w13
> lsr w12, w12, w13
> strh w12, [x3, #-2]
>
>- // get last pos
>+ // Get last pos.
> sub x9, x4, x9
>- lsl x0, x9, #4
> eor w13, w13, #15
>- add x0, x0, x13
>+ add x0, x13, x9, lsl #4
> ret
> endfunc
>
>--
>2.34.1
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250311/d158901f/attachment-0001.htm>
More information about the x265-devel
mailing list