Hi George,

Thanks for the revised patch, it looks good now.

By the way: the instruction below is kept unchanged from the original, while the new patch uses w5 for the counter. This does not affect performance, so there is no need to modify it in this version; we can do it in the next optimization patch.

cbnz x5, .Loop_spl

Regards,
Chen

At 2025-03-10 18:47:05, "George Steed" <george.steed@arm.com> wrote:
>Hi Chen,
>
>Thank you for the comments, please see v2 patch below/attached.
>
>Note that I have kept the original local labels ".Loop_spl" and
>".Loop_spl_1"; however, ".pext_end" is not a local label since it does
>not start with ".L", so I have changed that one to ".Lpext_end".
>
>Thanks,
>George
>
>-- >8 --
>Improve the implementation and clean up comments, in particular:
>
>* The existing code makes use of XTN + XTN2 to narrow data from 16 bits
> to 8 bits. UZP1 does this in a single instruction.
>
>* The existing code makes use of v31 as a zero vector to allow using
> CMHI for a != 0 comparison. Use CMEQ as a == 0 comparison instead and
> just adjust the surrounding logic to work with the negated condition
> to avoid the need for a zero vector.
>
>* The existing code makes use of repeated ADDV + vector MOV to reduce
> sums. We can use ADDP to incrementally sum both masks simultaneously,
> which performs better on some micro-architectures.
>
>* Rather than calculating the popcount of the reduced mask, we can
> instead just sum the mask. This takes advantage of the fact that CMEQ
> sets "true" elements to all 1s, which is equivalent to -1 in binary.
> This means that a single ADDV on the mask gives us the negated
> popcount directly (see the illustrative C sketch after this list).
>
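>As a side note for reviewers, the equivalences described in the list
>above can be sketched in plain C with ACLE NEON intrinsics. This is
>only an illustration and not part of the patch (the file name and
>sample values are made up); it shows the UZP1-style narrowing, the
>CMEQ zero test, and how ADDV over an all-ones comparison mask yields
>a negated count:
>
>    // demo.c -- illustrative only; build on an AArch64 host.
>    #include <arm_neon.h>
>    #include <stdio.h>
>
>    int main(void)
>    {
>        // 16 coefficients, already small enough to fit in a byte.
>        int16_t coeffs[16] = { 0, 3, 0, -7, 0, 0, 1, 0,
>                               0, 0, 0,  2, 0, 5, 0, 0 };
>        int16x8_t lo = vld1q_s16(coeffs);
>        int16x8_t hi = vld1q_s16(coeffs + 8);
>
>        // UZP1 on the byte view keeps the even-indexed bytes, i.e. the
>        // low byte of each 16-bit lane on little-endian: one
>        // instruction instead of XTN + XTN2.
>        int8x16_t narrowed = vuzp1q_s8(vreinterpretq_s8_s16(lo),
>                                       vreinterpretq_s8_s16(hi));
>
>        // CMEQ against zero: "true" lanes become 0xFF, which reads as
>        // -1 in a signed byte. No separate zero vector is needed.
>        uint8x16_t is_zero = vceqq_s8(narrowed, vdupq_n_s8(0));
>
>        // ADDV across the mask adds one -1 per zero lane, so it gives
>        // the negated count of zero coefficients directly.
>        int neg_zero_count = vaddvq_s8(vreinterpretq_s8_u8(is_zero));
>        int non_zero = 16 + neg_zero_count;
>
>        printf("non-zero coefficients: %d\n", non_zero); // prints 5
>        return 0;
>    }
>
>(The ADDP-based reduction of the sign and non-zero masks is not shown;
>it follows the same idea but keeps both masks in one vector. NEON is
>mandatory on AArch64, so a plain "cc demo.c" is enough to build this.)
>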
>Taken together, these changes reduce the runtime of the scanPosLast_neon
>kernel by 20-35% depending on the micro-architecture and the parameters
>the function is called with.
>---
> source/common/aarch64/pixel-util.S | 85 +++++++++++++-----------------
> 1 file changed, 37 insertions(+), 48 deletions(-)
>
>diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
>index d8b3f4365..1825466ea 100644
>--- a/source/common/aarch64/pixel-util.S
>+++ b/source/common/aarch64/pixel-util.S
>@@ -2213,27 +2213,25 @@ endfunc
> // const uint16_t* scanCG4x4, // x6
> // const int trSize) // x7
> function PFX(scanPosLast_neon)
>- // convert unit of Stride(trSize) to int16_t
>+ // Convert unit of trSize stride from elements (int16) to bytes.
> add x7, x7, x7
>
>- // load scan table and convert to Byte
>+ // Load scan table and convert to bytes.
> ldp q0, q1, [x6]
>- xtn v0.8b, v0.8h
>- xtn2 v0.16b, v1.8h // v0 - Zigzag scan table
>+ uzp1 v0.16b, v0.16b, v1.16b // v0 - Zigzag scan table
>
> movrel x10, g_SPL_and_mask
>- ldr q28, [x10] // v28 = mask for pmovmskb
>- movi v31.16b, #0 // v31 = {0, ..., 0}
>- add x10, x7, x7 // 2*x7
>- add x11, x10, x7 // 3*x7
>- add x9, x4, #1 // CG count
>+ ldr q28, [x10] // v28 = mask for pmovmskb
>+ add x10, x7, x7 // 2*x7
>+ add x11, x7, x7, lsl #1 // 3*x7
>+ add x9, x4, #1 // CG count
>
> .Loop_spl:
>- // position of current CG
>+ // Position of current CG.
> ldrh w6, [x0], #32
> add x6, x1, x6, lsl #1
>
>- // loading current CG
>+ // Load the current CG and saturate to bytes.
> ldr d2, [x6]
> ldr d3, [x6, x7]
> ldr d4, [x6, x10]
>@@ -2243,69 +2241,60 @@ function PFX(scanPosLast_neon)
> sqxtn v2.8b, v2.8h
> sqxtn2 v2.16b, v4.8h
>
>- // Zigzag
>+ // Apply zigzag.
> tbl v3.16b, {v2.16b}, v0.16b
>
>- // get sign
>- cmhi v5.16b, v3.16b, v31.16b // v5 = non-zero
>- cmlt v3.16b, v3.16b, #0 // v3 = negative
>+ // Get zero/sign.
>+ cmeq v5.16b, v3.16b, #0 // v5 = zero
>+ cmlt v3.16b, v3.16b, #0 // v3 = negative
>
>- // val - w13 = pmovmskb(v3)
>+ // val: w13 = pmovmskb(v3)
>+ // mask: w15 = pmovmskb(v4)
> and v3.16b, v3.16b, v28.16b
>- mov d4, v3.d[1]
>- addv b23, v3.8b
>- addv b24, v4.8b
>- mov v23.b[1], v24.b[0]
>- fmov w13, s23
>-
>- // mask - w15 = pmovmskb(v5)
>- and v5.16b, v5.16b, v28.16b
>- mov d6, v5.d[1]
>- addv b25, v5.8b
>- addv b26, v6.8b
>- mov v25.b[1], v26.b[0]
>- fmov w15, s25
>+ bic v4.16b, v28.16b, v5.16b
>+ addp v3.16b, v3.16b, v4.16b
>+ addp v3.16b, v3.16b, v3.16b
>+ addp v3.16b, v3.16b, v3.16b
>+ fmov w13, s3
>+ lsr w15, w13, #16
>+
>+ // coeffNum = addv(v3 != 0) = 16 - addv(v5)
>+ addv b5, v5.16b
>+ smov w6, v5.b[0]
>+ add w6, w6, #16
>+ sub w5, w5, w6
>+ strb w6, [x4], #1
>
> // coeffFlag = reverse_bit(w15) in 16-bit
>- rbit w12, w15
>- lsr w12, w12, #16
>- fmov s30, w12
>+ rbit w12, w13
> strh w12, [x3], #2
>
>- // accelerate by preparing w13 = w13 & w15
>+ // Pack bits from w13 into w14, based on w15 mask.
> and w13, w13, w15
> mov x14, xzr
>+ cbz w15, .Lpext_end
> .Loop_spl_1:
>- cbz w15, .pext_end
> clz w6, w15
> lsl w13, w13, w6
> lsl w15, w15, w6
> extr w14, w14, w13, #31
>- bfm w15, wzr, #1, #0
>- b .Loop_spl_1
>-.pext_end:
>+ bfc w15, #31, #1
>+ cbnz w15, .Loop_spl_1
>+.Lpext_end:
> strh w14, [x2], #2
>
>- // compute coeffNum = popcount(coeffFlag)
>- cnt v30.8b, v30.8b
>- addp v30.8b, v30.8b, v30.8b
>- fmov w6, s30
>- sub x5, x5, x6
>- strb w6, [x4], #1
>-
> cbnz x5, .Loop_spl
>
>- // count trailing zeros
>+ // Count trailing zeros.
> rbit w13, w12
> clz w13, w13
> lsr w12, w12, w13
> strh w12, [x3, #-2]
>
>- // get last pos
>+ // Get last pos.
> sub x9, x4, x9
>- lsl x0, x9, #4
> eor w13, w13, #15
>- add x0, x0, x13
>+ add x0, x13, x9, lsl #4
> ret
> endfunc
>
>--
>2.34.1
>