[x265] [PATCH] aarch64/pixel-util.S: Optimize scanPosLast_neon

Tue Mar 11 07:33:16 UTC 2025

> Il giorno 7 mar 2025, alle ore 17:41, George Steed <george.steed at arm.com> ha scritto:
> 
> Improve the implementation and clean up comments, in particular:

Hi, does it work now on Apple Silicon? The existing version used to crash on Apple M* processors,
and it has been disabled (see arm-primitives.cpp:739).

> 
> * The existing code makes use of XTN + XTN2 to narrow data from 16 bits
>  to 8 bits. UZP1 does this in a single instruction.
> 
> * The existing code makes use of v31 as a zero vector to allow using
>  CMHI for a != 0 comparison. Use CMEQ as a == 0 comparison instead and
>  just adjust the surrounding logic to work with the negated condition
>  to avoid the need for a zero vector.
> 
> * The existing code makes use of repeated ADDV + vector MOV to reduce
>  sums. We can use ADDP to incrementally sum both masks simultaneously
>  which performs better on some micro-architectures.
> 
> * Rather than calculating the popcount of the reduced mask, we can
>  instead just sum the mask. This takes advantage of the fact that CMEQ
>  sets "true" elements to all 1s, which is -1 in two's complement.
>  This means that a single ADDV on the mask gives us the negated
>  popcount directly.
> 
> Taken together these changes reduce the runtime of the scanPosLast_neon
> kernel by 20-35% depending on the micro-architecture and the parameters
> the function is called with.
> ---
> source/common/aarch64/pixel-util.S | 94 +++++++++++++-----------------
> 1 file changed, 42 insertions(+), 52 deletions(-)
> 
> diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
> index d8b3f4365..6635e52b1 100644
> --- a/source/common/aarch64/pixel-util.S
> +++ b/source/common/aarch64/pixel-util.S
> @@ -2213,27 +2213,25 @@ endfunc
> //     const uint16_t* scanCG4x4, // x6
> //     const int trSize)          // x7
> function PFX(scanPosLast_neon)
> -    // convert unit of Stride(trSize) to int16_t
> +    // Convert unit of trSize stride from elements (int16) to bytes.
>     add             x7, x7, x7
> 
> -    // load scan table and convert to Byte
> +    // Load scan table and convert to bytes.
>     ldp             q0, q1, [x6]
> -    xtn             v0.8b, v0.8h
> -    xtn2            v0.16b, v1.8h   // v0 - Zigzag scan table
> +    uzp1            v0.16b, v0.16b, v1.16b  // v0 - Zigzag scan table
> 
>     movrel          x10, g_SPL_and_mask
> -    ldr             q28, [x10]      // v28 = mask for pmovmskb
> -    movi            v31.16b, #0     // v31 = {0, ..., 0}
> -    add             x10, x7, x7     // 2*x7
> -    add             x11, x10, x7    // 3*x7
> -    add             x9, x4, #1      // CG count
> -
> -.Loop_spl:
> -    // position of current CG
> +    ldr             q28, [x10]              // v28 = mask for pmovmskb
> +    add             x10, x7, x7             // 2*x7
> +    add             x11, x7, x7, lsl #1     // 3*x7
> +    add             x9, x4, #1              // CG count
> +
> +1:
> +    // Position of current CG.
>     ldrh            w6, [x0], #32
>     add             x6, x1, x6, lsl #1
> 
> -    // loading current CG
> +    // Load current CG and saturate to bytes.
>     ldr             d2, [x6]
>     ldr             d3, [x6, x7]
>     ldr             d4, [x6, x10]
> @@ -2243,69 +2241,61 @@ function PFX(scanPosLast_neon)
>     sqxtn           v2.8b, v2.8h
>     sqxtn2          v2.16b, v4.8h
> 
> -    // Zigzag
> +    // Apply zigzag.
>     tbl             v3.16b, {v2.16b}, v0.16b
> 
> -    // get sign
> -    cmhi            v5.16b, v3.16b, v31.16b   // v5 = non-zero
> -    cmlt            v3.16b, v3.16b, #0        // v3 = negative
> +    // Get zero/sign.
> +    cmeq            v5.16b, v3.16b, #0   // v5 = zero
> +    cmlt            v3.16b, v3.16b, #0   // v3 = negative
> 
> -    // val - w13 = pmovmskb(v3)
> +    //  val: w13 = pmovmskb(v3)
> +    // mask: w15 = pmovmskb(v4)
>     and             v3.16b, v3.16b, v28.16b
> -    mov             d4, v3.d[1]
> -    addv            b23, v3.8b
> -    addv            b24, v4.8b
> -    mov             v23.b[1], v24.b[0]
> -    fmov            w13, s23
> -
> -    // mask - w15 = pmovmskb(v5)
> -    and             v5.16b, v5.16b, v28.16b
> -    mov             d6, v5.d[1]
> -    addv            b25, v5.8b
> -    addv            b26, v6.8b
> -    mov             v25.b[1], v26.b[0]
> -    fmov            w15, s25
> +    bic             v4.16b, v28.16b, v5.16b
> +    addp            v3.16b, v3.16b, v4.16b
> +    addp            v3.16b, v3.16b, v3.16b
> +    addp            v3.16b, v3.16b, v3.16b
> +    fmov            w13, s3
> +    lsr             w15, w13, #16
> +
> +    // coeffNum = count(coeff != 0) = 16 + addv(v5); v5 is -1 per zero coeff.
> +    addv            b5, v5.16b
> +    smov            w6, v5.b[0]
> +    add             w6, w6, #16
> +    sub             x5, x5, x6
> +    strb            w6, [x4], #1
> 
>     // coeffFlag = reverse_bit(w15) in 16-bit
> -    rbit            w12, w15
> -    lsr             w12, w12, #16
> -    fmov            s30, w12
> +    rbit            w12, w13
> +    and             w12, w12, #0xffff
>     strh            w12, [x3], #2
> 
> -    // accelerate by preparing w13 = w13 & w15
> +    // Pack bits from w13 into w14, based on w15 mask.
>     and             w13, w13, w15
>     mov             x14, xzr
> -.Loop_spl_1:
> -    cbz             w15, .pext_end
> +    cbz             w15, 3f
> +2:
>     clz             w6, w15
>     lsl             w13, w13, w6
>     lsl             w15, w15, w6
>     extr            w14, w14, w13, #31
> -    bfm             w15, wzr, #1, #0
> -    b               .Loop_spl_1
> -.pext_end:
> +    bfc             w15, #31, #1
> +    cbnz            w15, 2b
> +3:
>     strh            w14, [x2], #2
> 
> -    // compute coeffNum = popcount(coeffFlag)
> -    cnt             v30.8b, v30.8b
> -    addp            v30.8b, v30.8b, v30.8b
> -    fmov            w6, s30
> -    sub             x5, x5, x6
> -    strb            w6, [x4], #1
> -
> -    cbnz            x5, .Loop_spl
> +    cbnz            x5, 1b
> 
> -    // count trailing zeros
> +    // Count trailing zeros.
>     rbit            w13, w12
>     clz             w13, w13
>     lsr             w12, w12, w13
>     strh            w12, [x3, #-2]
> 
> -    // get last pos
> +    // Get last pos.
>     sub             x9, x4, x9
> -    lsl             x0, x9, #4
>     eor             w13, w13, #15
> -    add             x0, x0, x13
> +    add             x0, x13, x9, lsl #4
>     ret
> endfunc
> 
> -- 
> 2.34.1
> 
> <0001-aarch64-pixel-util.S-Optimize-scanPosLast_neon.patch>_______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel