<div dir="ltr"><div>Thanks, Min.<br><br></div>Sumalatha - can you please work on adding testbench support for the new costCoeffNxN primitive that this patch adds? <br></div><div class="gmail_extra"><br><div class="gmail_quote">On Fri, Jun 5, 2015 at 12:43 AM, Min Chen <span dir="ltr"><<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Min Chen <<a href="mailto:chenm003@163.com">chenm003@163.com</a>><br>
# Date 1433445185 25200<br>
# Node ID 24f347c00df01352fa6860e05b376846d8d8cc74<br>
# Parent 093618ce0b26ea4703b5928f618d2895cf6daf32<br>
asm: sse4 version of costCoeffGroupNxN in codeCoeffNxN<br>
---<br>
source/common/constants.cpp | 5 +-<br>
source/common/constants.h | 2 +-<br>
source/common/contexts.h | 3 +-<br>
source/common/dct.cpp | 57 +++++++++++<br>
source/common/primitives.h | 4 +<br>
source/common/x86/asm-primitives.cpp | 2 +<br>
source/common/x86/pixel-util.h | 3 +<br>
source/common/x86/pixel-util8.asm | 176 ++++++++++++++++++++++++++++++++++<br>
source/encoder/entropy.cpp | 113 ++++++++++------------<br>
9 files changed, 297 insertions(+), 68 deletions(-)<br>
<br>
diff -r 093618ce0b26 -r 24f347c00df0 source/common/constants.cpp<br>
--- a/source/common/constants.cpp Tue Jun 02 17:21:24 2015 +0800<br>
+++ b/source/common/constants.cpp Thu Jun 04 12:13:05 2015 -0700<br>
@@ -324,11 +324,12 @@<br>
4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31, 36, 44, 52, 60, 37, 45, 53, 61, 38, 46, 54, 62, 39, 47, 55, 63 }<br>
};<br>
<br>
-ALIGN_VAR_16(const uint16_t, g_scan4x4[NUM_SCAN_TYPE][4 * 4]) =<br>
+ALIGN_VAR_16(const uint16_t, g_scan4x4[NUM_SCAN_TYPE + 1][4 * 4]) =<br>
{<br>
{ 0, 4, 1, 8, 5, 2, 12, 9, 6, 3, 13, 10, 7, 14, 11, 15 },<br>
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },<br>
- { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }<br>
+ { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 },<br>
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }<br>
};<br>
<br>
const uint16_t g_scan16x16[16 * 16] =<br>
diff -r 093618ce0b26 -r 24f347c00df0 source/common/constants.h<br>
--- a/source/common/constants.h Tue Jun 02 17:21:24 2015 +0800<br>
+++ b/source/common/constants.h Thu Jun 04 12:13:05 2015 -0700<br>
@@ -83,7 +83,7 @@<br>
extern const uint16_t* const g_scanOrder[NUM_SCAN_TYPE][NUM_SCAN_SIZE];<br>
extern const uint16_t* const g_scanOrderCG[NUM_SCAN_TYPE][NUM_SCAN_SIZE];<br>
extern const uint16_t g_scan8x8diag[8 * 8];<br>
-extern const uint16_t g_scan4x4[NUM_SCAN_TYPE][4 * 4];<br>
+extern const uint16_t g_scan4x4[NUM_SCAN_TYPE + 1][4 * 4]; // +1: extra padding row, because the codeCoeffNxN assembly optimization may read up to 15 bytes beyond the last scan table<br>
<br>
extern const uint8_t g_lastCoeffTable[32];<br>
extern const uint8_t g_goRiceRange[5]; // maximum value coded with Rice codes<br>
diff -r 093618ce0b26 -r 24f347c00df0 source/common/contexts.h<br>
--- a/source/common/contexts.h Tue Jun 02 17:21:24 2015 +0800<br>
+++ b/source/common/contexts.h Thu Jun 04 12:13:05 2015 -0700<br>
@@ -102,11 +102,12 @@<br>
#define OFF_TQUANT_BYPASS_FLAG_CTX (OFF_TRANSFORMSKIP_FLAG_CTX + 2 * NUM_TRANSFORMSKIP_FLAG_CTX)<br>
#define MAX_OFF_CTX_MOD (OFF_TQUANT_BYPASS_FLAG_CTX + NUM_TQUANT_BYPASS_FLAG_CTX)<br>
<br>
+extern "C" const uint32_t g_entropyStateBits[128];<br>
+<br>
namespace x265 {<br>
// private namespace<br>
<br>
extern const uint32_t g_entropyBits[128];<br>
-extern const uint32_t g_entropyStateBits[128];<br>
extern const uint8_t g_nextState[128][2];<br>
<br>
#define sbacGetMps(S) ((S) & 1)<br>
diff -r 093618ce0b26 -r 24f347c00df0 source/common/dct.cpp<br>
--- a/source/common/dct.cpp Tue Jun 02 17:21:24 2015 +0800<br>
+++ b/source/common/dct.cpp Thu Jun 04 12:13:05 2015 -0700<br>
@@ -29,6 +29,7 @@<br>
<br>
#include "common.h"<br>
#include "primitives.h"<br>
+#include "contexts.h" // costCoeffNxN_c<br>
<br>
using namespace x265;<br>
<br>
@@ -817,6 +818,61 @@<br>
return ((lastNZPosInCG << 16) | firstNZPosInCG);<br>
}<br>
<br>
+<br>
+uint32_t costCoeffNxN_c(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)<br>
+{<br>
+ ALIGN_VAR_32(uint16_t, tmpCoeff[SCAN_SET_SIZE]);<br>
+ uint32_t numNonZero = (scanPosSigOff < (SCAN_SET_SIZE - 1) ? 1 : 0);<br>
+ uint32_t sum = 0;<br>
+<br>
+ // correct offset to match assembly<br>
+ absCoeff -= numNonZero;<br>
+<br>
+ for (int i = 0; i < MLS_CG_SIZE; i++)<br>
+ {<br>
+ tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[i * trSize + 0]);<br>
+ tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[i * trSize + 1]);<br>
+ tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[i * trSize + 2]);<br>
+ tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[i * trSize + 3]);<br>
+ }<br>
+<br>
+ do<br>
+ {<br>
+ uint32_t blkPos, sig, ctxSig;<br>
+ blkPos = scan[scanPosSigOff];<br>
+ const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? ~0 : 0;<br>
+ sig = scanFlagMask & 1;<br>
+ scanFlagMask >>= 1;<br>
+ X265_CHECK((uint32_t)(tmpCoeff[blkPos] != 0) == sig, "sign bit mistake\n");<br>
+ if ((scanPosSigOff != 0) || (subPosBase == 0) || numNonZero)<br>
+ {<br>
+ const uint32_t cnt = tabSigCtx[blkPos] + offset;<br>
+ ctxSig = cnt & posZeroMask;<br>
+<br>
+ //X265_CHECK(ctxSig == Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize, codingParameters.scan[subPosBase + scanPosSigOff], bIsLuma, codingParameters.firstSignificanceMapContext), "sigCtx mistake!\n");;<br>
+ //encodeBin(sig, baseCtx[ctxSig]);<br>
+ const uint32_t mstate = baseCtx[ctxSig];<br>
+ const uint32_t mps = mstate & 1;<br>
+ const uint32_t stateBits = g_entropyStateBits[mstate ^ sig];<br>
+ uint32_t nextState = (stateBits >> 24) + mps;<br>
+ if ((mstate ^ sig) == 1)<br>
+ nextState = sig;<br>
+ X265_CHECK(sbacNext(mstate, sig) == nextState, "nextState check failure\n");<br>
+ X265_CHECK(sbacGetEntropyBits(mstate, sig) == (stateBits & 0xFFFFFF), "entropyBits check failure\n");<br>
+ baseCtx[ctxSig] = (uint8_t)nextState;<br>
+ sum += stateBits;<br>
+ }<br>
+ assert(numNonZero <= 15);<br>
+ assert(blkPos <= 15);<br>
+ absCoeff[numNonZero] = tmpCoeff[blkPos];<br>
+ numNonZero += sig;<br>
+ scanPosSigOff--;<br>
+ }<br>
+ while(scanPosSigOff >= 0);<br>
+<br>
+ return (sum & 0xFFFFFF);<br>
+}<br>
+<br>
} // closing - anonymous file-static namespace<br>
<br>
namespace x265 {<br>
@@ -851,5 +907,6 @@<br>
<br>
p.scanPosLast = scanPosLast_c;<br>
p.findPosFirstLast = findPosFirstLast_c;<br>
+ p.costCoeffNxN = costCoeffNxN_c;<br>
}<br>
}<br>
diff -r 093618ce0b26 -r 24f347c00df0 source/common/primitives.h<br>
--- a/source/common/primitives.h Tue Jun 02 17:21:24 2015 +0800<br>
+++ b/source/common/primitives.h Thu Jun 04 12:13:05 2015 -0700<br>
@@ -186,6 +186,8 @@<br>
typedef int (*scanPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);<br>
typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);<br>
<br>
+typedef uint32_t (*costCoeffNxN_t)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);<br>
+<br>
/* Function pointers to optimized encoder primitives. Each pointer can reference<br>
* either an assembly routine, a SIMD intrinsic primitive, or a C function */<br>
struct EncoderPrimitives<br>
@@ -310,6 +312,8 @@<br>
scanPosLast_t scanPosLast;<br>
findPosFirstLast_t findPosFirstLast;<br>
<br>
+ costCoeffNxN_t costCoeffNxN;<br>
+<br>
/* There is one set of chroma primitives per color space. An encoder will<br>
* have just a single color space and thus it will only ever use one entry<br>
* in this array. However we always fill all entries in the array in case<br>
diff -r 093618ce0b26 -r 24f347c00df0 source/common/x86/asm-primitives.cpp<br>
--- a/source/common/x86/asm-primitives.cpp Tue Jun 02 17:21:24 2015 +0800<br>
+++ b/source/common/x86/asm-primitives.cpp Thu Jun 04 12:13:05 2015 -0700<br>
@@ -2048,6 +2048,9 @@<br>
<br>
ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);<br>
ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);<br>
+<br>
+ // TODO: this passes the smoke test, but it still needs a testbench, so it is temporarily disabled<br>
+ //p.costCoeffNxN = x265_costCoeffNxN_sse4;<br>
#endif<br>
}<br>
if (cpuMask & X265_CPU_AVX)<br>
diff -r 093618ce0b26 -r 24f347c00df0 source/common/x86/pixel-util.h<br>
--- a/source/common/x86/pixel-util.h Tue Jun 02 17:21:24 2015 +0800<br>
+++ b/source/common/x86/pixel-util.h Thu Jun 04 12:13:05 2015 -0700<br>
@@ -82,6 +82,9 @@<br>
int x265_scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);<br>
uint32_t x265_findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);<br>
<br>
+uint32_t x265_costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);<br>
+<br>
+<br>
#define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \<br>
void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t* dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \<br>
void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t* src1, intptr_t srcStride0, intptr_t srcStride1);<br>
diff -r 093618ce0b26 -r 24f347c00df0 source/common/x86/pixel-util8.asm<br>
--- a/source/common/x86/pixel-util8.asm Tue Jun 02 17:21:24 2015 +0800<br>
+++ b/source/common/x86/pixel-util8.asm Thu Jun 04 12:13:05 2015 -0700<br>
@@ -71,6 +71,7 @@<br>
cextern pb_64<br>
cextern hmul_16p<br>
cextern trans8_shuf<br>
+cextern_naked g_entropyStateBits<br>
<br>
;-----------------------------------------------------------------------------<br>
; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)<br>
@@ -6362,3 +6363,178 @@<br>
add [r1 + 4 * 4], r6d<br>
RET<br>
%endif ; ARCH_X86_64<br>
+<br>
+<br>
+; uint32_t costCoeffNxN(uint16_t *scan, coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, uint8_t *tabSigCtx, uint16_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)<br>
+;for (int i = 0; i < MLS_CG_SIZE; i++)<br>
+;{<br>
+; tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 0]);<br>
+; tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 1]);<br>
+; tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 2]);<br>
+; tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 3]);<br>
+;}<br>
+;do<br>
+;{<br>
+; uint32_t blkPos, sig, ctxSig;<br>
+; blkPos = g_scan4x4[codingParameters.scanType][scanPosSigOff];<br>
+; const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? ~0 : 0;<br>
+; sig = scanFlagMask & 1;<br>
+; scanFlagMask >>= 1;<br>
+; if (scanPosSigOff + (subSet == 0) + numNonZero)<br>
+; {<br>
+; const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset;<br>
+; ctxSig = cnt & posZeroMask;<br>
+;<br>
+; const uint32_t mstate = baseCtx[ctxSig];<br>
+; const uint32_t mps = mstate & 1;<br>
+; const uint32_t stateBits = g_entropyStateBits[mstate ^ sig];<br>
+; uint32_t nextState = (stateBits >> 24) + mps;<br>
+; if ((mstate ^ sig) == 1)<br>
+; nextState = sig;<br>
+; baseCtx[ctxSig] = (uint8_t)nextState;<br>
+; sum += stateBits;<br>
+; }<br>
+; absCoeff[numNonZero] = tmpCoeff[blkPos];<br>
+; numNonZero += sig;<br>
+; scanPosSigOff--;<br>
+;}<br>
+;while(scanPosSigOff >= 0);<br>
+; sum &= 0xFFFFFF<br>
+<br>
+%if ARCH_X86_64<br>
+; uint32_t costCoeffNxN(uint16_t *scan, coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, uint8_t *tabSigCtx, uint16_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)<br>
+INIT_XMM sse4<br>
+cglobal costCoeffNxN, 6,11,5<br>
+ add r2d, r2d<br>
+<br>
+ ; abs(coeff)<br>
+ movh m1, [r1]<br>
+ movhps m1, [r1 + r2]<br>
+ movh m2, [r1 + r2 * 2]<br>
+ lea r2, [r2 * 3]<br>
+ movhps m2, [r1 + r2]<br>
+ pabsw m1, m1<br>
+ pabsw m2, m2<br>
+ ; r[1-2] free here<br>
+<br>
+ ; WARNING: beyond-bound read here!<br>
+ ; loading scan table<br>
+ mov r2d, r8m<br>
+ xor r2d, 15<br>
+ movu m0, [r0 + r2 * 2]<br>
+ movu m3, [r0 + r2 * 2 + mmsize]<br>
+ packuswb m0, m3<br>
+ pxor m0, [pb_15]<br>
+ xchg r2d, r8m<br>
+ ; r[0-1] free here<br>
+<br>
+ ; reorder coeff<br>
+ mova m3, [deinterleave_shuf]<br>
+ pshufb m1, m3<br>
+ pshufb m2, m3<br>
+ punpcklqdq m3, m1, m2<br>
+ punpckhqdq m1, m2<br>
+ pshufb m3, m0<br>
+ pshufb m1, m0<br>
+ punpcklbw m2, m3, m1<br>
+ punpckhbw m3, m1<br>
+ ; r[0-1], m[1] free here<br>
+<br>
+ ; loading tabSigCtx (+offset)<br>
+ mova m1, [r4]<br>
+ pshufb m1, m0<br>
+ movd m4, r7m<br>
+ pxor m5, m5<br>
+ pshufb m4, m5<br>
+ paddb m1, m4<br>
+<br>
+ ; register mapping<br>
+ ; m0 - Zigzag<br>
+ ; m1 - sigCtx<br>
+ ; {m3,m2} - abs(coeff)<br>
+ ; r0 - g_entropyStateBits<br>
+ ; r1 - baseCtx<br>
+ ; r2 - scanPosSigOff<br>
+ ; r3 - absCoeff<br>
+ ; r4 - nonZero<br>
+ ; r5 - scanFlagMask<br>
+ ; r6 - sum<br>
+ lea r0, [g_entropyStateBits]<br>
+ mov r1, r6mp<br>
+ xor r6d, r6d<br>
+ xor r4d, r4d<br>
+ xor r8d, r8d<br>
+<br>
+ test r2d, r2d<br>
+ jz .idx_zero<br>
+<br>
+.loop:<br>
+; {<br>
+; const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset;<br>
+; ctxSig = cnt & posZeroMask;<br>
+; const uint32_t mstate = baseCtx[ctxSig];<br>
+; const uint32_t mps = mstate & 1;<br>
+; const uint32_t stateBits = g_entropyStateBits[mstate ^ sig];<br>
+; uint32_t nextState = (stateBits >> 24) + mps;<br>
+; if ((mstate ^ sig) == 1)<br>
+; nextState = sig;<br>
+; baseCtx[ctxSig] = (uint8_t)nextState;<br>
+; sum += stateBits;<br>
+; }<br>
+; absCoeff[numNonZero] = tmpCoeff[blkPos];<br>
+; numNonZero += sig;<br>
+; scanPosSigOff--;<br>
+<br>
+ pextrw [r3 + r4 * 2], m2, 0 ; absCoeff[numNonZero] = tmpCoeff[blkPos]<br>
+ shr r5d, 1<br>
+ setc r8b ; r8 = sig<br>
+ add r4d, r8d ; numNonZero += sig<br>
+ palignr m4, m3, m2, 2<br>
+ psrldq m3, 2<br>
+ mova m2, m4<br>
+ movd r7d, m1 ; r7 = ctxSig<br>
+ movzx r7d, r7b<br>
+ psrldq m1, 1<br>
+ movzx r9d, byte [r1 + r7] ; mstate = baseCtx[ctxSig]<br>
+ mov r10d, r9d<br>
+ and r10d, 1 ; mps = mstate & 1<br>
+ xor r9d, r8d ; r9 = mstate ^ sig<br>
+ add r6d, [r0 + r9 * 4] ; sum += g_entropyStateBits[mstate ^ sig]<br>
+ add r10b, byte [r0 + r9 * 4 + 3] ; nextState = (stateBits >> 24) + mps<br>
+ cmp r9b, 1<br>
+ cmove r10d, r8d<br>
+ mov byte [r1 + r7], r10b<br>
+<br>
+ dec r2d<br>
+ jg .loop<br>
+<br>
+.idx_zero:<br>
+ pextrw [r3 + r4 * 2], m2, 0 ; absCoeff[numNonZero] = tmpCoeff[blkPos]<br>
+ add r4b, r8m<br>
+ xor r2d, r2d<br>
+ cmp word r9m, 0<br>
+ sete r2b<br>
+ add r4b, r2b<br>
+ jz .exit<br>
+<br>
+ dec r2b<br>
+ movd r3d, m1<br>
+ and r2d, r3d<br>
+<br>
+ movzx r3d, byte [r1 + r2] ; mstate = baseCtx[ctxSig]<br>
+ mov r4d, r5d<br>
+ xor r5d, r3d ; r5 = mstate ^ sig<br>
+ and r3d, 1 ; mps = mstate & 1<br>
+ add r6d, [r0 + r5 * 4] ; sum += g_entropyStateBits[mstate ^ sig]<br>
+ add r3b, [r0 + r5 * 4 + 3] ; nextState = (stateBits >> 24) + mps<br>
+ cmp r5b, 1<br>
+ cmove r3d, r4d<br>
+ mov byte [r1 + r2], r3b<br>
+<br>
+.exit:<br>
+%ifnidn eax,r6d<br>
+ mov eax, r6d<br>
+%endif<br>
+ and eax, 0xFFFFFF<br>
+ RET<br>
+%endif ; ARCH_X86_64<br>
diff -r 093618ce0b26 -r 24f347c00df0 source/encoder/entropy.cpp<br>
--- a/source/encoder/entropy.cpp Tue Jun 02 17:21:24 2015 +0800<br>
+++ b/source/encoder/entropy.cpp Thu Jun 04 12:13:05 2015 -0700<br>
@@ -1517,12 +1517,12 @@<br>
uint8_t * const baseCtx = bIsLuma ? &m_contextState[OFF_SIG_FLAG_CTX] : &m_contextState[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];<br>
uint32_t c1 = 1;<br>
int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1;<br>
- int absCoeff[1 << MLS_CG_SIZE];<br>
+ ALIGN_VAR_32(uint16_t, absCoeff[1 << MLS_CG_SIZE]);<br>
uint32_t numNonZero = 1;<br>
unsigned long lastNZPosInCG;<br>
unsigned long firstNZPosInCG;<br>
<br>
- absCoeff[0] = int(abs(coeff[posLast]));<br>
+ absCoeff[0] = (uint16_t)abs(coeff[posLast]);<br>
<br>
for (int subSet = lastScanSet; subSet >= 0; subSet--)<br>
{<br>
@@ -1600,19 +1600,20 @@<br>
<br>
const int offset = codingParameters.firstSignificanceMapContext;<br>
ALIGN_VAR_32(uint16_t, tmpCoeff[SCAN_SET_SIZE]);<br>
- // TODO: accelerate by PABSW<br>
const uint32_t blkPosBase = codingParameters.scan[subPosBase];<br>
- for (int i = 0; i < MLS_CG_SIZE; i++)<br>
- {<br>
- tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 0]);<br>
- tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 1]);<br>
- tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 2]);<br>
- tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 3]);<br>
- }<br>
<br>
X265_CHECK(scanPosSigOff >= 0, "scanPosSigOff check failure\n");<br>
if (m_bitIf)<br>
{<br>
+ // TODO: accelerate by PABSW<br>
+ for (int i = 0; i < MLS_CG_SIZE; i++)<br>
+ {<br>
+ tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 0]);<br>
+ tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 1]);<br>
+ tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 2]);<br>
+ tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 3]);<br>
+ }<br>
+<br>
if (log2TrSize == 2)<br>
{<br>
do<br>
@@ -1667,6 +1668,15 @@<br>
uint32_t sum = 0;<br>
if (log2TrSize == 2)<br>
{<br>
+ // TODO: accelerate by PABSW<br>
+ for (int i = 0; i < MLS_CG_SIZE; i++)<br>
+ {<br>
+ tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 0]);<br>
+ tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 1]);<br>
+ tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 2]);<br>
+ tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 3]);<br>
+ }<br>
+<br>
do<br>
{<br>
uint32_t blkPos, sig, ctxSig;<br>
@@ -1681,7 +1691,7 @@<br>
const uint32_t mstate = baseCtx[ctxSig];<br>
const uint32_t mps = mstate & 1;<br>
const uint32_t stateBits = g_entropyStateBits[mstate ^ sig];<br>
- uint32_t nextState = (stateBits >> 23) + mps;<br>
+ uint32_t nextState = (stateBits >> 24) + mps;<br>
if ((mstate ^ sig) == 1)<br>
nextState = sig;<br>
X265_CHECK(sbacNext(mstate, sig) == nextState, "nextState check failure\n");<br>
@@ -1698,39 +1708,13 @@<br>
else<br>
{<br>
X265_CHECK((log2TrSize > 2), "log2TrSize must be more than 2 in this path!\n");<br>
+ const uint8_t *tabSigCtx = table_cnt[(uint32_t)patternSigCtx];<br>
<br>
- const uint8_t *tabSigCtx = table_cnt[(uint32_t)patternSigCtx];<br>
- do<br>
- {<br>
- uint32_t blkPos, sig, ctxSig;<br>
- blkPos = g_scan4x4[codingParameters.scanType][scanPosSigOff];<br>
- const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? ~0 : 0;<br>
- sig = scanFlagMask & 1;<br>
- scanFlagMask >>= 1;<br>
- X265_CHECK((uint32_t)(tmpCoeff[blkPos] != 0) == sig, "sign bit mistake\n");<br>
- if (scanPosSigOff != 0 || subSet == 0 || numNonZero)<br>
- {<br>
- const uint32_t cnt = tabSigCtx[blkPos] + offset;<br>
- ctxSig = (cnt + posOffset) & posZeroMask;<br>
+ sum = primitives.costCoeffNxN(g_scan4x4[codingParameters.scanType], &coeff[blkPosBase], (intptr_t)trSize, absCoeff + numNonZero, tabSigCtx, scanFlagMask, baseCtx, offset + posOffset, scanPosSigOff, subPosBase);<br>
<br>
- X265_CHECK(ctxSig == Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize, codingParameters.scan[subPosBase + scanPosSigOff], bIsLuma, codingParameters.firstSignificanceMapContext), "sigCtx mistake!\n");;<br>
- //encodeBin(sig, baseCtx[ctxSig]);<br>
- const uint32_t mstate = baseCtx[ctxSig];<br>
- const uint32_t mps = mstate & 1;<br>
- const uint32_t stateBits = g_entropyStateBits[mstate ^ sig];<br>
- uint32_t nextState = (stateBits >> 23) + mps;<br>
- if ((mstate ^ sig) == 1)<br>
- nextState = sig;<br>
- X265_CHECK(sbacNext(mstate, sig) == nextState, "nextState check failure\n");<br>
- X265_CHECK(sbacGetEntropyBits(mstate, sig) == (stateBits & 0xFFFFFF), "entropyBits check failure\n");<br>
- baseCtx[ctxSig] = (uint8_t)nextState;<br>
- sum += stateBits;<br>
- }<br>
- absCoeff[numNonZero] = tmpCoeff[blkPos];<br>
- numNonZero += sig;<br>
- scanPosSigOff--;<br>
- }<br>
- while(scanPosSigOff >= 0);<br>
+#if CHECKED_BUILD || _DEBUG<br>
+ numNonZero = coeffNum[subSet];<br>
+#endif<br>
} // end of non 4x4 path<br>
sum &= 0xFFFFFF;<br>
<br>
@@ -2271,28 +2255,6 @@<br>
0x0050c, 0x29bab, 0x004c1, 0x2a674, 0x004a7, 0x2aa5e, 0x0046f, 0x2b32f, 0x0041f, 0x2c0ad, 0x003e7, 0x2ca8d, 0x003ba, 0x2d323, 0x0010c, 0x3bfbb<br>
};<br>
<br>
-// [8 24] --> [stateMPS BitCost], [stateLPS BitCost]<br>
-const uint32_t g_entropyStateBits[128] =<br>
-{<br>
- // Corrected table, most notably for last state<br>
- 0x01007b23, 0x000085f9, 0x020074a0, 0x00008cbc, 0x03006ee4, 0x01009354, 0x040067f4, 0x02009c1b,<br>
- 0x050060b0, 0x0200a62a, 0x06005a9c, 0x0400af5b, 0x0700548d, 0x0400b955, 0x08004f56, 0x0500c2a9,<br>
- 0x09004a87, 0x0600cbf7, 0x0a0045d6, 0x0700d5c3, 0x0b004144, 0x0800e01b, 0x0c003d88, 0x0900e937,<br>
- 0x0d0039e0, 0x0900f2cd, 0x0e003663, 0x0b00fc9e, 0x0f003347, 0x0b010600, 0x10003050, 0x0c010f95,<br>
- 0x11002d4d, 0x0d011a02, 0x12002ad3, 0x0d012333, 0x1300286e, 0x0f012cad, 0x14002604, 0x0f0136df,<br>
- 0x15002425, 0x10013f48, 0x160021f4, 0x100149c4, 0x1700203e, 0x1201527b, 0x18001e4d, 0x12015d00,<br>
- 0x19001c99, 0x130166de, 0x1a001b18, 0x13017017, 0x1b0019a5, 0x15017988, 0x1c001841, 0x15018327,<br>
- 0x1d0016df, 0x16018d50, 0x1e0015d9, 0x16019547, 0x1f00147c, 0x1701a083, 0x2000138e, 0x1801a8a3,<br>
- 0x21001251, 0x1801b418, 0x22001166, 0x1901bd27, 0x23001068, 0x1a01c77b, 0x24000f7f, 0x1a01d18e,<br>
- 0x25000eda, 0x1b01d91a, 0x26000e19, 0x1b01e254, 0x27000d4f, 0x1c01ec9a, 0x28000c90, 0x1d01f6e0,<br>
- 0x29000c01, 0x1d01fef8, 0x2a000b5f, 0x1e0208b1, 0x2b000ab6, 0x1e021362, 0x2c000a15, 0x1e021e46,<br>
- 0x2d000988, 0x1f02285d, 0x2e000934, 0x20022ea8, 0x2f0008a8, 0x200239b2, 0x3000081d, 0x21024577,<br>
- 0x310007c9, 0x21024ce6, 0x32000763, 0x21025663, 0x33000710, 0x22025e8f, 0x340006a0, 0x22026a26,<br>
- 0x35000672, 0x23026f23, 0x360005e8, 0x23027ef8, 0x370005ba, 0x230284b5, 0x3800055e, 0x24029057,<br>
- 0x3900050c, 0x24029bab, 0x3a0004c1, 0x2402a674, 0x3b0004a7, 0x2502aa5e, 0x3c00046f, 0x2502b32f,<br>
- 0x3d00041f, 0x2502c0ad, 0x3e0003e7, 0x2602ca8d, 0x3e0003ba, 0x2602d323, 0x3f00010c, 0x3f03bfbb,<br>
-};<br>
-<br>
const uint8_t g_nextState[128][2] =<br>
{<br>
{ 2, 1 }, { 0, 3 }, { 4, 0 }, { 1, 5 }, { 6, 2 }, { 3, 7 }, { 8, 4 }, { 5, 9 },<br>
@@ -2314,3 +2276,26 @@<br>
};<br>
<br>
}<br>
+<br>
+// [8 24] --> [stateMPS BitCost], [stateLPS BitCost]<br>
+extern "C" const uint32_t g_entropyStateBits[128] =<br>
+{<br>
+ // Corrected table, most notably for last state<br>
+ 0x02007B23, 0x000085F9, 0x040074A0, 0x00008CBC, 0x06006EE4, 0x02009354, 0x080067F4, 0x04009C1B,<br>
+ 0x0A0060B0, 0x0400A62A, 0x0C005A9C, 0x0800AF5B, 0x0E00548D, 0x0800B955, 0x10004F56, 0x0A00C2A9,<br>
+ 0x12004A87, 0x0C00CBF7, 0x140045D6, 0x0E00D5C3, 0x16004144, 0x1000E01B, 0x18003D88, 0x1200E937,<br>
+ 0x1A0039E0, 0x1200F2CD, 0x1C003663, 0x1600FC9E, 0x1E003347, 0x16010600, 0x20003050, 0x18010F95,<br>
+ 0x22002D4D, 0x1A011A02, 0x24002AD3, 0x1A012333, 0x2600286E, 0x1E012CAD, 0x28002604, 0x1E0136DF,<br>
+ 0x2A002425, 0x20013F48, 0x2C0021F4, 0x200149C4, 0x2E00203E, 0x2401527B, 0x30001E4D, 0x24015D00,<br>
+ 0x32001C99, 0x260166DE, 0x34001B18, 0x26017017, 0x360019A5, 0x2A017988, 0x38001841, 0x2A018327,<br>
+ 0x3A0016DF, 0x2C018D50, 0x3C0015D9, 0x2C019547, 0x3E00147C, 0x2E01A083, 0x4000138E, 0x3001A8A3,<br>
+ 0x42001251, 0x3001B418, 0x44001166, 0x3201BD27, 0x46001068, 0x3401C77B, 0x48000F7F, 0x3401D18E,<br>
+ 0x4A000EDA, 0x3601D91A, 0x4C000E19, 0x3601E254, 0x4E000D4F, 0x3801EC9A, 0x50000C90, 0x3A01F6E0,<br>
+ 0x52000C01, 0x3A01FEF8, 0x54000B5F, 0x3C0208B1, 0x56000AB6, 0x3C021362, 0x58000A15, 0x3C021E46,<br>
+ 0x5A000988, 0x3E02285D, 0x5C000934, 0x40022EA8, 0x5E0008A8, 0x400239B2, 0x6000081D, 0x42024577,<br>
+ 0x620007C9, 0x42024CE6, 0x64000763, 0x42025663, 0x66000710, 0x44025E8F, 0x680006A0, 0x44026A26,<br>
+ 0x6A000672, 0x46026F23, 0x6C0005E8, 0x46027EF8, 0x6E0005BA, 0x460284B5, 0x7000055E, 0x48029057,<br>
+ 0x7200050C, 0x48029BAB, 0x740004C1, 0x4802A674, 0x760004A7, 0x4A02AA5E, 0x7800046F, 0x4A02B32F,<br>
+ 0x7A00041F, 0x4A02C0AD, 0x7C0003E7, 0x4C02CA8D, 0x7C0003BA, 0x4C02D323, 0x7E00010C, 0x7E03BFBB,<br>
+};<br>
+<br>
<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br></div>