[x265] [PATCH] AArch64: Remove costCoeffNxN Neon assembly
Micro Daryl Robles
microdaryl.robles at arm.com
Wed Apr 16 10:54:45 UTC 2025
Remove the handwritten Neon assembly implementation of costCoeffNxN
because it performs worse on Neoverse CPUs than the code generated by
modern auto-vectorizing compilers.
---
source/common/aarch64/asm-primitives.cpp | 4 +-
source/common/aarch64/fun-decls.h | 1 -
source/common/aarch64/pixel-util.S | 136 -----------------------
3 files changed, 2 insertions(+), 139 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 046dac25c..47388cf6c 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -734,8 +734,8 @@ void setupNeonPrimitives(EncoderPrimitives &p)
#if !defined(__APPLE__)
p.scanPosLast = PFX(scanPosLast_neon);
#endif
- p.costCoeffNxN = PFX(costCoeffNxN_neon);
-#endif
+
+#endif // !HIGH_BIT_DEPTH
// quant
p.dequant_normal = PFX(dequant_normal_neon);
diff --git a/source/common/aarch64/fun-decls.h b/source/common/aarch64/fun-decls.h
index 12383b573..f33076c56 100644
--- a/source/common/aarch64/fun-decls.h
+++ b/source/common/aarch64/fun-decls.h
@@ -228,7 +228,6 @@ int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* re
int PFX(psyCost_8x8_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
void PFX(weight_sp_neon)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
int PFX(scanPosLast_neon)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
-uint32_t PFX(costCoeffNxN_neon)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
uint64_t PFX(pixel_var_8x8_sve2(const pixel* pix, intptr_t stride));
uint64_t PFX(pixel_var_16x16_sve2(const pixel* pix, intptr_t stride));
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 26fdbac6c..f559df582 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -2163,142 +2163,6 @@ function PFX(scanPosLast_neon)
ret
endfunc
-// uint32_t costCoeffNxN(
-// uint16_t *scan, // x0
-// coeff_t *coeff, // x1
-// intptr_t trSize, // x2
-// uint16_t *absCoeff, // x3
-// uint8_t *tabSigCtx, // x4
-// uint16_t scanFlagMask, // x5
-// uint8_t *baseCtx, // x6
-// int offset, // x7
-// int scanPosSigOff, // sp
-// int subPosBase) // sp + 8, or sp + 4 on APPLE
-function PFX(costCoeffNxN_neon)
- // abs(coeff)
- add x2, x2, x2
- ld1 {v1.d}[0], [x1], x2
- ld1 {v1.d}[1], [x1], x2
- ld1 {v2.d}[0], [x1], x2
- ld1 {v2.d}[1], [x1], x2
- abs v1.8h, v1.8h
- abs v2.8h, v2.8h
-
- // WARNING: beyond-bound read here!
- // loading scan table
- ldr w2, [sp]
- eor w15, w2, #15
- add x1, x0, x15, lsl #1
- ldp q20, q21, [x1]
- uzp1 v20.16b, v20.16b, v21.16b
- movi v21.16b, #15
- eor v0.16b, v20.16b, v21.16b
-
- // reorder coeff
- uzp1 v22.16b, v1.16b, v2.16b
- uzp2 v23.16b, v1.16b, v2.16b
- tbl v24.16b, {v22.16b}, v0.16b
- tbl v25.16b, {v23.16b}, v0.16b
- zip1 v2.16b, v24.16b, v25.16b
- zip2 v3.16b, v24.16b, v25.16b
-
- // loading tabSigCtx (+offset)
- ldr q1, [x4]
- tbl v1.16b, {v1.16b}, v0.16b
- dup v4.16b, w7
- movi v5.16b, #0
- tbl v4.16b, {v4.16b}, v5.16b
- add v1.16b, v1.16b, v4.16b
-
- // register mapping
- // x0 - sum
- // x1 - entropyStateBits
- // v1 - sigCtx
- // {v3,v2} - abs(coeff)
- // x2 - scanPosSigOff
- // x3 - absCoeff
- // x4 - numNonZero
- // x5 - scanFlagMask
- // x6 - baseCtx
- mov x0, #0
- movrel x1, PFX_C(entropyStateBits)
- mov x4, #0
- mov x11, #0
- movi v31.16b, #0
- cbz x2, .idx_zero
-.Loop_ccnn:
-// {
-// const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset;
-// ctxSig = cnt & posZeroMask;
-// const uint32_t mstate = baseCtx[ctxSig];
-// const uint32_t mps = mstate & 1;
-// const uint32_t stateBits = x265_entropyStateBits[mstate ^ sig];
-// uint32_t nextState = (stateBits >> 24) + mps;
-// if ((mstate ^ sig) == 1)
-// nextState = sig;
-// baseCtx[ctxSig] = (uint8_t)nextState;
-// sum += stateBits;
-// }
-// absCoeff[numNonZero] = tmpCoeff[blkPos];
-// numNonZero += sig;
-// scanPosSigOff--;
-
- add x13, x3, x4, lsl #1
- sub x2, x2, #1
- str h2, [x13] // absCoeff[numNonZero] = tmpCoeff[blkPos]
- fmov w14, s1 // x14 = ctxSig
- uxtb w14, w14
- ubfx w11, w5, #0, #1 // x11 = sig
- lsr x5, x5, #1
- add x4, x4, x11 // numNonZero += sig
- ext v1.16b, v1.16b, v31.16b, #1
- ext v2.16b, v2.16b, v3.16b, #2
- ext v3.16b, v3.16b, v31.16b, #2
- ldrb w9, [x6, x14] // mstate = baseCtx[ctxSig]
- and w10, w9, #1 // mps = mstate & 1
- eor w9, w9, w11 // x9 = mstate ^ sig
- add x12, x1, x9, lsl #2
- ldr w13, [x12]
- add w0, w0, w13 // sum += x265_entropyStateBits[mstate ^ sig]
- ldrb w13, [x12, #3]
- add w10, w10, w13 // nextState = (stateBits >> 24) + mps
- cmp w9, #1
- csel w10, w11, w10, eq
- strb w10, [x6, x14]
- cbnz x2, .Loop_ccnn
-.idx_zero:
-
- add x13, x3, x4, lsl #1
- add x4, x4, x15
- str h2, [x13] // absCoeff[numNonZero] = tmpCoeff[blkPos]
-
- ldr x9, [sp, #STACK_ARG_OFFSET(1)] // subPosBase
- uxth w9, w9
- cmp w9, #0
- cset x2, eq
- add x4, x4, x2
- cbz x4, .exit_ccnn
-
- sub w2, w2, #1
- uxtb w2, w2
- fmov w3, s1
- and w2, w2, w3
-
- ldrb w3, [x6, x2] // mstate = baseCtx[ctxSig]
- eor w4, w5, w3 // x5 = mstate ^ sig
- and w3, w3, #1 // mps = mstate & 1
- add x1, x1, x4, lsl #2
- ldr w11, [x1]
- ldrb w12, [x1, #3]
- add w0, w0, w11 // sum += x265_entropyStateBits[mstate ^ sig]
- add w3, w3, w12 // nextState = (stateBits >> 24) + mps
- cmp w4, #1
- csel w3, w5, w3, eq
- strb w3, [x6, x2]
-.exit_ccnn:
- ubfx w0, w0, #0, #24
- ret
-endfunc
const g_SPL_and_mask, align=8
.byte 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
--
2.34.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-AArch64-Remove-costCoeffNxN-Neon-assembly.patch
Type: text/x-diff
Size: 7710 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250416/6b186b9c/attachment.patch>
More information about the x265-devel mailing list