[x265] [PATCH] aarch64/pixel-util.S: Optimize scanPosLast_neon
George Steed
george.steed at arm.com
Fri Mar 7 16:41:05 UTC 2025
Improve the implementation and clean up comments, in particular:
* The existing code makes use of XTN + XTN2 to narrow data from 16 bits
to 8 bits. UZP1 does this in a single instruction.
* The existing code makes use of v31 as a zero vector to allow using
CMHI for a != 0 comparison. Use CMEQ as a == 0 comparison instead and
just adjust the surrounding logic to work with the negated condition
to avoid the need for a zero vector.
* The existing code makes use of repeated ADDV + vector MOV to reduce
sums. We can use ADDP to incrementally sum both masks simultaneously
which performs better on some micro-architectures.
* Rather than calculating the popcount of the reduced mask, we can
instead just sum the mask. This takes advantage of the fact that CMEQ
sets "true" elements to all 1s, which is -1 as a two's-complement value.
This means that a single ADDV on the mask gives us the negated
popcount directly.
Taken together these changes reduce the runtime of the scanPosLast_neon
kernel by 20-35% depending on the micro-architecture and the parameters
the function is called with.
---
source/common/aarch64/pixel-util.S | 94 +++++++++++++-----------------
1 file changed, 42 insertions(+), 52 deletions(-)
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index d8b3f4365..6635e52b1 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -2213,27 +2213,25 @@ endfunc
// const uint16_t* scanCG4x4, // x6
// const int trSize) // x7
function PFX(scanPosLast_neon)
- // convert unit of Stride(trSize) to int16_t
+ // Convert unit of trSize stride from elements (int16) to bytes.
add x7, x7, x7
- // load scan table and convert to Byte
+ // Load scan table and convert to bytes.
ldp q0, q1, [x6]
- xtn v0.8b, v0.8h
- xtn2 v0.16b, v1.8h // v0 - Zigzag scan table
+ uzp1 v0.16b, v0.16b, v1.16b // v0 - Zigzag scan table
movrel x10, g_SPL_and_mask
- ldr q28, [x10] // v28 = mask for pmovmskb
- movi v31.16b, #0 // v31 = {0, ..., 0}
- add x10, x7, x7 // 2*x7
- add x11, x10, x7 // 3*x7
- add x9, x4, #1 // CG count
-
-.Loop_spl:
- // position of current CG
+ ldr q28, [x10] // v28 = mask for pmovmskb
+ add x10, x7, x7 // 2*x7
+ add x11, x7, x7, lsl #1 // 3*x7
+ add x9, x4, #1 // CG count
+
+1:
+ // Position of current CG.
ldrh w6, [x0], #32
add x6, x1, x6, lsl #1
- // loading current CG
+ // Loading current CG and saturate to bytes.
ldr d2, [x6]
ldr d3, [x6, x7]
ldr d4, [x6, x10]
@@ -2243,69 +2241,61 @@ function PFX(scanPosLast_neon)
sqxtn v2.8b, v2.8h
sqxtn2 v2.16b, v4.8h
- // Zigzag
+ // Apply zigzag.
tbl v3.16b, {v2.16b}, v0.16b
- // get sign
- cmhi v5.16b, v3.16b, v31.16b // v5 = non-zero
- cmlt v3.16b, v3.16b, #0 // v3 = negative
+ // Get zero/sign.
+ cmeq v5.16b, v3.16b, #0 // v5 = zero
+ cmlt v3.16b, v3.16b, #0 // v3 = negative
- // val - w13 = pmovmskb(v3)
+ // val: w13 = pmovmskb(v3)
+ // mask: w15 = pmovmskb(v4)
and v3.16b, v3.16b, v28.16b
- mov d4, v3.d[1]
- addv b23, v3.8b
- addv b24, v4.8b
- mov v23.b[1], v24.b[0]
- fmov w13, s23
-
- // mask - w15 = pmovmskb(v5)
- and v5.16b, v5.16b, v28.16b
- mov d6, v5.d[1]
- addv b25, v5.8b
- addv b26, v6.8b
- mov v25.b[1], v26.b[0]
- fmov w15, s25
+ bic v4.16b, v28.16b, v5.16b
+ addp v3.16b, v3.16b, v4.16b
+ addp v3.16b, v3.16b, v3.16b
+ addp v3.16b, v3.16b, v3.16b
+ fmov w13, s3
+ lsr w15, w13, #16
+
+ // coeffNum = addv(v3 != 0) = 16 - addv(v5)
+ addv b5, v5.16b
+ smov w6, v5.b[0]
+ add w6, w6, #16
+ sub x5, x5, x6
+ strb w6, [x4], #1
// coeffFlag = reverse_bit(w15) in 16-bit
- rbit w12, w15
- lsr w12, w12, #16
- fmov s30, w12
+ rbit w12, w13
+ and w12, w12, #0xffff
strh w12, [x3], #2
- // accelerate by preparing w13 = w13 & w15
+ // Pack bits from w13 into w14, based on w15 mask.
and w13, w13, w15
mov x14, xzr
-.Loop_spl_1:
- cbz w15, .pext_end
+ cbz w15, 3f
+2:
clz w6, w15
lsl w13, w13, w6
lsl w15, w15, w6
extr w14, w14, w13, #31
- bfm w15, wzr, #1, #0
- b .Loop_spl_1
-.pext_end:
+ bfc w15, #31, #1
+ cbnz w15, 2b
+3:
strh w14, [x2], #2
- // compute coeffNum = popcount(coeffFlag)
- cnt v30.8b, v30.8b
- addp v30.8b, v30.8b, v30.8b
- fmov w6, s30
- sub x5, x5, x6
- strb w6, [x4], #1
-
- cbnz x5, .Loop_spl
+ cbnz x5, 1b
- // count trailing zeros
+ // Count trailing zeros.
rbit w13, w12
clz w13, w13
lsr w12, w12, w13
strh w12, [x3, #-2]
- // get last pos
+ // Get last pos.
sub x9, x4, x9
- lsl x0, x9, #4
eor w13, w13, #15
- add x0, x0, x13
+ add x0, x13, x9, lsl #4
ret
endfunc
--
2.34.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-aarch64-pixel-util.S-Optimize-scanPosLast_neon.patch
Type: text/x-diff
Size: 6185 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250307/9c0db634/attachment.patch>
More information about the x265-devel
mailing list