[x265] [PATCH] AArch64: Remove costCoeffNxN Neon assembly

Micro Daryl Robles microdaryl.robles at arm.com
Wed Apr 16 10:54:45 UTC 2025


Remove the handwritten Neon assembly implementation of costCoeffNxN
because it performs worse on Neoverse CPUs than the code generated by
modern auto-vectorizing compilers.
---
 source/common/aarch64/asm-primitives.cpp |   4 +-
 source/common/aarch64/fun-decls.h        |   1 -
 source/common/aarch64/pixel-util.S       | 136 -----------------------
 3 files changed, 2 insertions(+), 139 deletions(-)

diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 046dac25c..47388cf6c 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -734,8 +734,8 @@ void setupNeonPrimitives(EncoderPrimitives &p)
 #if !defined(__APPLE__)
     p.scanPosLast = PFX(scanPosLast_neon);
 #endif
-    p.costCoeffNxN = PFX(costCoeffNxN_neon);
-#endif
+
+#endif // !HIGH_BIT_DEPTH
 
     // quant
     p.dequant_normal = PFX(dequant_normal_neon);
diff --git a/source/common/aarch64/fun-decls.h b/source/common/aarch64/fun-decls.h
index 12383b573..f33076c56 100644
--- a/source/common/aarch64/fun-decls.h
+++ b/source/common/aarch64/fun-decls.h
@@ -228,7 +228,6 @@ int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* re
 int PFX(psyCost_8x8_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
 void PFX(weight_sp_neon)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
 int PFX(scanPosLast_neon)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
-uint32_t PFX(costCoeffNxN_neon)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
 
 uint64_t PFX(pixel_var_8x8_sve2(const pixel* pix, intptr_t stride));
 uint64_t PFX(pixel_var_16x16_sve2(const pixel* pix, intptr_t stride));
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 26fdbac6c..f559df582 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -2163,142 +2163,6 @@ function PFX(scanPosLast_neon)
     ret
 endfunc
 
-// uint32_t costCoeffNxN(
-//    uint16_t *scan,        // x0
-//    coeff_t *coeff,        // x1
-//    intptr_t trSize,       // x2
-//    uint16_t *absCoeff,    // x3
-//    uint8_t *tabSigCtx,    // x4
-//    uint16_t scanFlagMask, // x5
-//    uint8_t *baseCtx,      // x6
-//    int offset,            // x7
-//    int scanPosSigOff,     // sp
-//    int subPosBase)        // sp + 8, or sp + 4 on APPLE
-function PFX(costCoeffNxN_neon)
-    // abs(coeff)
-    add             x2, x2, x2
-    ld1             {v1.d}[0], [x1], x2
-    ld1             {v1.d}[1], [x1], x2
-    ld1             {v2.d}[0], [x1], x2
-    ld1             {v2.d}[1], [x1], x2
-    abs             v1.8h, v1.8h
-    abs             v2.8h, v2.8h
-
-    // WARNING: beyond-bound read here!
-    // loading scan table
-    ldr             w2, [sp]
-    eor             w15, w2, #15
-    add             x1, x0, x15, lsl #1
-    ldp             q20, q21, [x1]
-    uzp1            v20.16b, v20.16b, v21.16b
-    movi            v21.16b, #15
-    eor             v0.16b, v20.16b, v21.16b
-
-    // reorder coeff
-    uzp1           v22.16b, v1.16b, v2.16b
-    uzp2           v23.16b, v1.16b, v2.16b
-    tbl            v24.16b, {v22.16b}, v0.16b
-    tbl            v25.16b, {v23.16b}, v0.16b
-    zip1           v2.16b, v24.16b, v25.16b
-    zip2           v3.16b, v24.16b, v25.16b
-
-    // loading tabSigCtx (+offset)
-    ldr             q1, [x4]
-    tbl             v1.16b, {v1.16b}, v0.16b
-    dup             v4.16b, w7
-    movi            v5.16b, #0
-    tbl             v4.16b, {v4.16b}, v5.16b
-    add             v1.16b, v1.16b, v4.16b
-
-    // register mapping
-    // x0 - sum
-    // x1 - entropyStateBits
-    // v1 - sigCtx
-    // {v3,v2} - abs(coeff)
-    // x2 - scanPosSigOff
-    // x3 - absCoeff
-    // x4 - numNonZero
-    // x5 - scanFlagMask
-    // x6 - baseCtx
-    mov             x0, #0
-    movrel          x1, PFX_C(entropyStateBits)
-    mov             x4, #0
-    mov             x11, #0
-    movi            v31.16b, #0
-    cbz             x2, .idx_zero
-.Loop_ccnn:
-//   {
-//        const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset;
-//        ctxSig = cnt & posZeroMask;
-//        const uint32_t mstate = baseCtx[ctxSig];
-//        const uint32_t mps = mstate & 1;
-//        const uint32_t stateBits = x265_entropyStateBits[mstate ^ sig];
-//        uint32_t nextState = (stateBits >> 24) + mps;
-//        if ((mstate ^ sig) == 1)
-//            nextState = sig;
-//        baseCtx[ctxSig] = (uint8_t)nextState;
-//        sum += stateBits;
-//    }
-//    absCoeff[numNonZero] = tmpCoeff[blkPos];
-//    numNonZero += sig;
-//    scanPosSigOff--;
-
-    add             x13, x3, x4, lsl #1
-    sub             x2, x2, #1
-    str             h2, [x13]             // absCoeff[numNonZero] = tmpCoeff[blkPos]
-    fmov            w14, s1               // x14 = ctxSig
-    uxtb            w14, w14
-    ubfx            w11, w5, #0, #1       // x11 = sig
-    lsr             x5, x5, #1
-    add             x4, x4, x11           // numNonZero += sig
-    ext             v1.16b, v1.16b, v31.16b, #1
-    ext             v2.16b, v2.16b, v3.16b, #2
-    ext             v3.16b, v3.16b, v31.16b, #2
-    ldrb            w9, [x6, x14]         // mstate = baseCtx[ctxSig]
-    and             w10, w9, #1           // mps = mstate & 1
-    eor             w9, w9, w11           // x9 = mstate ^ sig
-    add             x12, x1, x9, lsl #2
-    ldr             w13, [x12]
-    add             w0, w0, w13           // sum += x265_entropyStateBits[mstate ^ sig]
-    ldrb            w13, [x12, #3]
-    add             w10, w10, w13         // nextState = (stateBits >> 24) + mps
-    cmp             w9, #1
-    csel            w10, w11, w10, eq
-    strb            w10, [x6, x14]
-    cbnz            x2, .Loop_ccnn
-.idx_zero:
-
-    add             x13, x3, x4, lsl #1
-    add             x4, x4, x15
-    str             h2, [x13]              // absCoeff[numNonZero] = tmpCoeff[blkPos]
-
-    ldr             x9, [sp, #STACK_ARG_OFFSET(1)]           // subPosBase
-    uxth            w9, w9
-    cmp             w9, #0
-    cset            x2, eq
-    add             x4, x4, x2
-    cbz             x4, .exit_ccnn
-
-    sub             w2, w2, #1
-    uxtb            w2, w2
-    fmov            w3, s1
-    and             w2, w2, w3
-
-    ldrb            w3, [x6, x2]         // mstate = baseCtx[ctxSig]
-    eor             w4, w5, w3            // x5 = mstate ^ sig
-    and             w3, w3, #1            // mps = mstate & 1
-    add             x1, x1, x4, lsl #2
-    ldr             w11, [x1]
-    ldrb            w12, [x1, #3]
-    add             w0, w0, w11           // sum += x265_entropyStateBits[mstate ^ sig]
-    add             w3, w3, w12           // nextState = (stateBits >> 24) + mps
-    cmp             w4, #1
-    csel            w3, w5, w3, eq
-    strb            w3, [x6, x2]
-.exit_ccnn:
-    ubfx            w0, w0, #0, #24
-    ret
-endfunc
 
 const g_SPL_and_mask, align=8
 .byte 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
-- 
2.34.1
