[x265] [PATCH] asm: AVX2 version costCoeffNxN, 1367c -> 1186c
Min Chen
chenm003 at 163.com
Fri Nov 20 23:03:27 CET 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1448057004 21600
# Node ID 178fe732b4b15fb419ce71ad44fdaa97ed5b0d45
# Parent c303685732c784a11665218ca24643b769638511
asm: AVX2 version costCoeffNxN, 1367c -> 1186c
---
source/common/x86/asm-primitives.cpp | 6 +
source/common/x86/const-a.asm | 1 +
source/common/x86/pixel-util.h | 1 +
source/common/x86/pixel-util8.asm | 173 ++++++++++++++++++++++++++++++++++
source/encoder/entropy.cpp | 2 +-
source/test/pixelharness.cpp | 22 ++++-
6 files changed, 199 insertions(+), 6 deletions(-)
diff -r c303685732c7 -r 178fe732b4b1 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Nov 19 17:29:39 2015 -0600
+++ b/source/common/x86/asm-primitives.cpp Fri Nov 20 16:03:24 2015 -0600
@@ -2156,7 +2156,10 @@
p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>; // ALL_LUMA_PU_T has declared all sizes except 4x4, hence calling luma_hvpp[4x4]
if (cpuMask & X265_CPU_BMI2)
+ {
p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
+ p.costCoeffNxN = PFX(costCoeffNxN_avx2_bmi2);
+ }
}
}
#else // if HIGH_BIT_DEPTH
@@ -3653,7 +3656,10 @@
p.propagateCost = PFX(mbtree_propagate_cost_avx2);
if (cpuMask & X265_CPU_BMI2)
+ {
p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
+ p.costCoeffNxN = PFX(costCoeffNxN_avx2_bmi2);
+ }
p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx2);
p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx2);
p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx2);
diff -r c303685732c7 -r 178fe732b4b1 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Thu Nov 19 17:29:39 2015 -0600
+++ b/source/common/x86/const-a.asm Fri Nov 20 16:03:24 2015 -0600
@@ -114,6 +114,7 @@
times 7 dw 0xff
const hmul_16p, times 16 db 1
times 8 db 1, -1
+const pw_exp2_0_15, dw 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768 ; word lane i holds the single-bit mask 1 << i (i = 0..15)
;; 32-bit constants
diff -r c303685732c7 -r 178fe732b4b1 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Thu Nov 19 17:29:39 2015 -0600
+++ b/source/common/x86/pixel-util.h Fri Nov 20 16:03:24 2015 -0600
@@ -56,5 +56,6 @@
int PFX(scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize));
uint32_t PFX(findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]));
uint32_t PFX(costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
+uint32_t PFX(costCoeffNxN_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
#endif // ifndef X265_PIXEL_UTIL_H
diff -r c303685732c7 -r 178fe732b4b1 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Thu Nov 19 17:29:39 2015 -0600
+++ b/source/common/x86/pixel-util8.asm Fri Nov 20 16:03:24 2015 -0600
@@ -49,6 +49,7 @@
mask_ff: times 16 db 0xff
times 16 db 0
deinterleave_shuf: times 2 db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+interleave_shuf: times 2 db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 ; pshufb control: out[2k] = in[k], out[2k+1] = in[8+k], i.e. the inverse of deinterleave_shuf
deinterleave_word_shuf: times 2 db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
hmulw_16p: times 8 dw 1
times 4 dw 1, -1
@@ -78,6 +79,7 @@
cextern trans8_shuf
cextern_naked private_prefix %+ _entropyStateBits
cextern pb_movemask
+cextern pw_exp2_0_15
;-----------------------------------------------------------------------------
; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
@@ -6874,6 +6876,177 @@
%endif
and eax, 0xFFFFFF
RET
+
+
+; uint32_t costCoeffNxN(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)
+INIT_YMM avx2,bmi2
+cglobal costCoeffNxN, 6,10,5 ; r0-r5 = scan, coeff, trSize, absCoeff, tabSigCtx, scanFlagMask; the remaining args are read through r6mp/r7m/r8m/r9m below
+ add r2d, r2d ; r2 = row stride in bytes (coefficients are handled as 16-bit words, cf. pabsw)
+
+ ; abs(coeff): gather four rows of four words each into m1, then take absolute values
+ movq xm1, [r1] ; row 0
+ movhps xm1, [r1 + r2] ; row 1
+ movq xm2, [r1 + r2 * 2] ; row 2
+ lea r2, [r2 * 3] ; r2 = 3 * stride
+ movhps xm2, [r1 + r2] ; row 3
+ vinserti128 m1, m1, xm2, 1 ; m1 = rows 0-1 (low lane), rows 2-3 (high lane)
+ pabsw m1, m1
+ ; r[1-2] free here
+
+ ; loading tabSigCtx (16 bytes, aligned load)
+ mova xm2, [r4]
+ ; r[4] free here
+
+ ; WARNING: beyond-bound read here! (unaligned 32-byte load starting at scan[scanPosSigOff ^ 15])
+ ; loading scan table
+ mov r2d, r8m ; r2 = scanPosSigOff
+ bzhi r4d, r5d, r2d ; clear non-scan mask bits: r4 = scanFlagMask with bits >= scanPosSigOff zeroed
+ mov r6d, r2d ; keep a copy of scanPosSigOff
+ xor r2d, 15 ; r2 = scanPosSigOff ^ 15
+ movu m0, [r0 + r2 * 2]
+ packuswb m0, m0 ; words -> bytes (unsigned saturation), separately in each 128-bit lane
+ pxor m0, [pb_15] ; flip every index: idx ^ 15
+ vpermq m0, m0, q3120 ; qword order 0,2,1,3: both lanes now hold the same 16 packed indices
+ add r4d, r2d ; r4d == 0 only when scanPosSigOff == 15 and no masked sig bit is set (numNonZero == 0)
+ mov r2d, r6d ; restore r2 = scanPosSigOff
+
+ ; reorder tabSigCtx (+offset)
+ pshufb xm2, xm0
+ vpbroadcastb xm3, r7m ; offset replicated into every byte
+ paddb xm2, xm3
+ ; r[0-1] free here
+
+ ; reorder coeff: split words into byte planes, permute both planes with m0, rebuild words
+ pshufb m1, [deinterleave_shuf] ; per lane: low bytes first, high bytes second
+ vpermq m1, m1, q3120 ; low lane = all low bytes, high lane = all high bytes
+ pshufb m1, m0 ; same byte permutation applied to both planes
+ vpermq m1, m1, q3120 ; regroup: each lane holds low+high bytes of the same 8 coeffs
+ pshufb m1, [interleave_shuf] ; back to 16-bit words
+ ; r[0-1], m[2-3] free here
+
+ ; sig mask: word lane i becomes 0xFFFF when bit i of scanFlagMask is set, else 0
+ pxor xm3, xm3
+ movd xm4, r5d
+ vpbroadcastw m4, xm4
+ pandn m4, m4, [pw_exp2_0_15] ; ~scanFlagMask & (1 << i) per lane
+ pcmpeqw m4, m3 ; zero result <=> bit i was set
+
+ ; absCoeff[numNonZero] = tmpCoeff[blkPos]
+ ; [0-3]
+ movq r0, xm4
+ movq r1, xm1
+ pext r6, r1, r0 ; pack the selected words contiguously into the low bits
+ mov qword [r3], r6 ; always stores 8 bytes, even if fewer words were selected
+ popcnt r0, r0 ; 16 mask bits per selected word
+ shr r0, 3 ; -> bytes written that are valid
+ add r3, r0 ; advance absCoeff by the number of selected words
+
+ ; [4-7]
+ pextrq r0, xm4, 1
+ pextrq r1, xm1, 1
+ pext r6, r1, r0
+ mov qword [r3], r6
+ popcnt r0, r0
+ shr r0, 3
+ add r3, r0
+
+ ; [8-B]
+ vextracti128 xm4, m4, 1
+ movq r0, xm4
+ vextracti128 xm1, m1, 1
+ movq r1, xm1
+ pext r6, r1, r0
+ mov qword [r3], r6
+ popcnt r0, r0
+ shr r0, 3
+ add r3, r0
+
+ ; [C-F]
+ pextrq r0, xm4, 1
+ pextrq r1, xm1, 1
+ pext r6, r1, r0
+ mov qword [r3], r6 ; last 8-byte store; the pointer is not advanced afterwards
+ ; r[0-1,3] free here
+
+ ; register mapping
+ ; m0 - Zigzag
+ ; xm2 - sigCtx bytes (lowest byte belongs to the current scan position)
+ ; r0 - x265_entropyStateBits
+ ; r1 - baseCtx
+ ; r2 - scanPosSigOff
+ ; r5 - scanFlagMask
+ ; r6 - sum
+ ; r3, r7, r8, r9 - loop temporaries: nextState, ctxSig, sig, mstate ^ sig
+ ; r4 - (numNonZero != 0) || (subPosBase == 0), complete after the add in .idx_zero
+ lea r0, [private_prefix %+ _entropyStateBits]
+ mov r1, r6mp
+ xor r6d, r6d ; sum = 0
+ xor r8d, r8d ; upper bits of r8 stay zero so setc r8b leaves 0/1 in r8d
+
+ test r2d, r2d
+ jz .idx_zero ; scanPosSigOff == 0: only the final position is left
+
+; {
+; const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset;
+; ctxSig = cnt & posZeroMask;
+; const uint32_t mstate = baseCtx[ctxSig];
+; const uint32_t mps = mstate & 1;
+; const uint32_t stateBits = x265_entropyStateBits[mstate ^ sig];
+; uint32_t nextState = (stateBits >> 24) + mps;
+; if ((mstate ^ sig) == 1)
+; nextState = sig;
+; baseCtx[ctxSig] = (uint8_t)nextState;
+; sum += stateBits;
+; }
+; absCoeff[numNonZero] = tmpCoeff[blkPos];
+; numNonZero += sig;
+; scanPosSigOff--;
+.loop:
+ shr r5d, 1
+ setc r8b ; r8 = sig
+ movd r7d, xm2 ; r7 = ctxSig
+ movzx r7d, r7b
+ psrldq xm2, 1 ; next position's ctxSig moves into the low byte
+ movzx r9d, byte [r1 + r7] ; mstate = baseCtx[ctxSig]
+ mov r3d, r9d
+ and r3b, 1 ; mps = mstate & 1
+ xor r9d, r8d ; r9 = mstate ^ sig
+ add r6d, [r0 + r9 * 4] ; sum += entropyStateBits[mstate ^ sig]
+ add r3b, byte [r0 + r9 * 4 + 3] ; nextState = (stateBits >> 24) + mps
+ cmp r9d, 1
+ cmove r3d, r8d ; if ((mstate ^ sig) == 1) nextState = sig
+ mov byte [r1 + r7], r3b ; baseCtx[ctxSig] = nextState
+
+ dec r2d ; scanPosSigOff--
+ jg .loop
+
+.idx_zero:
+ xor r2d, r2d
+ cmp word r9m, 0 ; subPosBase == 0 ? (only the low 16 bits are compared)
+ sete r2b
+ add r4d, r2d ; (numNonZero != 0) || (subPosBase == 0)
+ jz .exit ; condition false: the final position is not coded
+
+ dec r2b ; posZeroMask: 0x00 when subPosBase == 0, otherwise 0xFF
+ movd r3d, xm2
+ and r2d, r3d ; r2 = ctxSig = cnt & posZeroMask
+
+ movzx r3d, byte [r1 + r2] ; mstate = baseCtx[ctxSig]
+ mov r4d, r5d ; r4 = sig (what is left of scanFlagMask)
+ xor r5d, r3d ; r5 = mstate ^ sig
+ and r3b, 1 ; mps = mstate & 1
+ add r6d, [r0 + r5 * 4] ; sum += x265_entropyStateBits[mstate ^ sig]
+ add r3b, [r0 + r5 * 4 + 3] ; nextState = (stateBits >> 24) + mps
+ cmp r5b, 1
+ cmove r3d, r4d ; if ((mstate ^ sig) == 1) nextState = sig
+ mov byte [r1 + r2], r3b ; baseCtx[ctxSig] = nextState
+
+.exit:
+%ifnidn eax,r6d
+ mov eax, r6d
+%endif
+ and eax, 0xFFFFFF ; return sum & 0xFFFFFF
+ RET
%endif ; ARCH_X86_64
diff -r c303685732c7 -r 178fe732b4b1 source/encoder/entropy.cpp
--- a/source/encoder/entropy.cpp Thu Nov 19 17:29:39 2015 -0600
+++ b/source/encoder/entropy.cpp Fri Nov 20 16:03:24 2015 -0600
@@ -1566,7 +1566,7 @@
uint8_t * const baseCtx = bIsLuma ? &m_contextState[OFF_SIG_FLAG_CTX] : &m_contextState[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
uint32_t c1 = 1;
int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1;
- ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE)]);
+ ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE) + 1]); // extra 2 bytes space for AVX2 assembly
uint32_t numNonZero = 1;
unsigned long lastNZPosInCG;
unsigned long firstNZPosInCG;
diff -r c303685732c7 -r 178fe732b4b1 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Thu Nov 19 17:29:39 2015 -0600
+++ b/source/test/pixelharness.cpp Fri Nov 20 16:03:24 2015 -0600
@@ -1590,8 +1590,8 @@
bool PixelHarness::check_costCoeffNxN(costCoeffNxN_t ref, costCoeffNxN_t opt)
{
ALIGN_VAR_16(coeff_t, ref_src[32 * 32 + ITERS * 3]);
- ALIGN_VAR_32(uint16_t, ref_absCoeff[1 << MLS_CG_SIZE]);
- ALIGN_VAR_32(uint16_t, opt_absCoeff[1 << MLS_CG_SIZE]);
+ ALIGN_VAR_32(uint16_t, ref_absCoeff[(1 << MLS_CG_SIZE)]);
+ ALIGN_VAR_32(uint16_t, opt_absCoeff[(1 << MLS_CG_SIZE) + 4]);
memset(ref_absCoeff, 0xCD, sizeof(ref_absCoeff));
memset(opt_absCoeff, 0xCD, sizeof(opt_absCoeff));
@@ -1617,6 +1617,12 @@
ref_src[32 * 32 + i] = 0x1234;
}
+ // Guard values placed after the 16 valid entries to detect out-of-bounds writes
+ opt_absCoeff[(1 << MLS_CG_SIZE) + 0] = 0x0123;
+ opt_absCoeff[(1 << MLS_CG_SIZE) + 1] = 0x4567;
+ opt_absCoeff[(1 << MLS_CG_SIZE) + 2] = 0xBA98;
+ opt_absCoeff[(1 << MLS_CG_SIZE) + 3] = 0xFEDC;
+
// generate CABAC context table
uint8_t m_contextState_ref[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
uint8_t m_contextState_opt[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
@@ -1707,8 +1713,8 @@
continue;
const uint32_t blkPosBase = scanTbl[subPosBase];
- uint32_t ref_sum = ref(scanTblCG4x4, &ref_src[blkPosBase + i], trSize, ref_absCoeff + numNonZero, rand_tabSigCtx, scanFlagMask, (uint8_t*)ref_baseCtx, offset, rand_scanPosSigOff, subPosBase);
- uint32_t opt_sum = (uint32_t)checked(opt, scanTblCG4x4, &ref_src[blkPosBase + i], trSize, opt_absCoeff + numNonZero, rand_tabSigCtx, scanFlagMask, (uint8_t*)opt_baseCtx, offset, rand_scanPosSigOff, subPosBase);
+ uint32_t ref_sum = ref(scanTblCG4x4, &ref_src[blkPosBase + i], (intptr_t)trSize, ref_absCoeff + numNonZero, rand_tabSigCtx, scanFlagMask, (uint8_t*)ref_baseCtx, offset, rand_scanPosSigOff, subPosBase);
+ uint32_t opt_sum = (uint32_t)checked(opt, scanTblCG4x4, &ref_src[blkPosBase + i], (intptr_t)trSize, opt_absCoeff + numNonZero, rand_tabSigCtx, scanFlagMask, (uint8_t*)opt_baseCtx, offset, rand_scanPosSigOff, subPosBase);
if (ref_sum != opt_sum)
return false;
@@ -1716,7 +1722,13 @@
return false;
// NOTE: just first rand_numCoeff valid, but I check full buffer for confirm no overwrite bug
- if (memcmp(ref_absCoeff, opt_absCoeff, sizeof(ref_absCoeff)))
+ if (memcmp(ref_absCoeff, opt_absCoeff, rand_numCoeff * sizeof(ref_absCoeff[0])))
+ return false;
+
+ // Check for out-of-bounds writes (only guard words +1..+3 are verified; +0 is not)
+ if ( opt_absCoeff[(1 << MLS_CG_SIZE) + 1] != 0x4567
+ || opt_absCoeff[(1 << MLS_CG_SIZE) + 2] != 0xBA98
+ || opt_absCoeff[(1 << MLS_CG_SIZE) + 3] != 0xFEDC)
return false;
reportfail();
More information about the x265-devel
mailing list