[x265] [PATCH 1 of 6] asm: sse4 version of costCoeffGroupNxN in codeCoeffNxN
Sumalatha Polureddy
sumalatha at multicorewareinc.com
Fri Jun 5 09:20:43 CEST 2015
Will work on this
Regards
Sumalatha
On Fri, Jun 5, 2015 at 12:40 PM, Deepthi Nandakumar <
deepthi at multicorewareinc.com> wrote:
> Thanks, Min.
>
> Sumalatha - can you please work on adding testbench support for the new
> codeCoeff primitive that has been added?
>
> On Fri, Jun 5, 2015 at 12:43 AM, Min Chen <chenm003 at 163.com> wrote:
>
>> # HG changeset patch
>> # User Min Chen <chenm003 at 163.com>
>> # Date 1433445185 25200
>> # Node ID 24f347c00df01352fa6860e05b376846d8d8cc74
>> # Parent 093618ce0b26ea4703b5928f618d2895cf6daf32
>> asm: sse4 version of costCoeffGroupNxN in codeCoeffNxN
>> ---
>> source/common/constants.cpp | 5 +-
>> source/common/constants.h | 2 +-
>> source/common/contexts.h | 3 +-
>> source/common/dct.cpp | 57 +++++++++++
>> source/common/primitives.h | 4 +
>> source/common/x86/asm-primitives.cpp | 2 +
>> source/common/x86/pixel-util.h | 3 +
>> source/common/x86/pixel-util8.asm | 176
>> ++++++++++++++++++++++++++++++++++
>> source/encoder/entropy.cpp | 113 ++++++++++------------
>> 9 files changed, 297 insertions(+), 68 deletions(-)
>>
>> diff -r 093618ce0b26 -r 24f347c00df0 source/common/constants.cpp
>> --- a/source/common/constants.cpp Tue Jun 02 17:21:24 2015 +0800
>> +++ b/source/common/constants.cpp Thu Jun 04 12:13:05 2015 -0700
>> @@ -324,11 +324,12 @@
>> 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
>> 36, 44, 52, 60, 37, 45, 53, 61, 38, 46, 54, 62, 39, 47, 55, 63 }
>> };
>>
>> -ALIGN_VAR_16(const uint16_t, g_scan4x4[NUM_SCAN_TYPE][4 * 4]) =
>> +ALIGN_VAR_16(const uint16_t, g_scan4x4[NUM_SCAN_TYPE + 1][4 * 4]) =
>> {
>> { 0, 4, 1, 8, 5, 2, 12, 9, 6, 3, 13, 10, 7, 14, 11, 15 },
>> { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
>> - { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }
>> + { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 },
>> + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
>> };
>>
>> const uint16_t g_scan16x16[16 * 16] =
>> diff -r 093618ce0b26 -r 24f347c00df0 source/common/constants.h
>> --- a/source/common/constants.h Tue Jun 02 17:21:24 2015 +0800
>> +++ b/source/common/constants.h Thu Jun 04 12:13:05 2015 -0700
>> @@ -83,7 +83,7 @@
>> extern const uint16_t* const g_scanOrder[NUM_SCAN_TYPE][NUM_SCAN_SIZE];
>> extern const uint16_t* const g_scanOrderCG[NUM_SCAN_TYPE][NUM_SCAN_SIZE];
>> extern const uint16_t g_scan8x8diag[8 * 8];
>> -extern const uint16_t g_scan4x4[NUM_SCAN_TYPE][4 * 4];
>> +extern const uint16_t g_scan4x4[NUM_SCAN_TYPE + 1][4 * 4]; // +1 row
>> as a safe padding area for the codeCoeffNxN assembly optimization, which
>> may read up to 15 bytes beyond the end of the real tables
>>
>> extern const uint8_t g_lastCoeffTable[32];
>> extern const uint8_t g_goRiceRange[5]; // maximum value coded with Rice
>> codes
>> diff -r 093618ce0b26 -r 24f347c00df0 source/common/contexts.h
>> --- a/source/common/contexts.h Tue Jun 02 17:21:24 2015 +0800
>> +++ b/source/common/contexts.h Thu Jun 04 12:13:05 2015 -0700
>> @@ -102,11 +102,12 @@
>> #define OFF_TQUANT_BYPASS_FLAG_CTX (OFF_TRANSFORMSKIP_FLAG_CTX + 2 *
>> NUM_TRANSFORMSKIP_FLAG_CTX)
>> #define MAX_OFF_CTX_MOD (OFF_TQUANT_BYPASS_FLAG_CTX +
>> NUM_TQUANT_BYPASS_FLAG_CTX)
>>
>> +extern "C" const uint32_t g_entropyStateBits[128];
>> +
>> namespace x265 {
>> // private namespace
>>
>> extern const uint32_t g_entropyBits[128];
>> -extern const uint32_t g_entropyStateBits[128];
>> extern const uint8_t g_nextState[128][2];
>>
>> #define sbacGetMps(S) ((S) & 1)
>> diff -r 093618ce0b26 -r 24f347c00df0 source/common/dct.cpp
>> --- a/source/common/dct.cpp Tue Jun 02 17:21:24 2015 +0800
>> +++ b/source/common/dct.cpp Thu Jun 04 12:13:05 2015 -0700
>> @@ -29,6 +29,7 @@
>>
>> #include "common.h"
>> #include "primitives.h"
>> +#include "contexts.h" // costCoeffNxN_c
>>
>> using namespace x265;
>>
>> @@ -817,6 +818,61 @@
>> return ((lastNZPosInCG << 16) | firstNZPosInCG);
>> }
>>
>> +
>> +uint32_t costCoeffNxN_c(const uint16_t *scan, const coeff_t *coeff,
>> intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t
>> scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int
>> subPosBase)
>> +{
>> + ALIGN_VAR_32(uint16_t, tmpCoeff[SCAN_SET_SIZE]);
>> + uint32_t numNonZero = (scanPosSigOff < (SCAN_SET_SIZE - 1) ? 1 : 0);
>> + uint32_t sum = 0;
>> +
>> + // correct offset to match assembly
>> + absCoeff -= numNonZero;
>> +
>> + for (int i = 0; i < MLS_CG_SIZE; i++)
>> + {
>> + tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[i * trSize +
>> 0]);
>> + tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[i * trSize +
>> 1]);
>> + tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[i * trSize +
>> 2]);
>> + tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[i * trSize +
>> 3]);
>> + }
>> +
>> + do
>> + {
>> + uint32_t blkPos, sig, ctxSig;
>> + blkPos = scan[scanPosSigOff];
>> + const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? ~0 :
>> 0;
>> + sig = scanFlagMask & 1;
>> + scanFlagMask >>= 1;
>> + X265_CHECK((uint32_t)(tmpCoeff[blkPos] != 0) == sig, "sign bit
>> mistake\n");
>> + if ((scanPosSigOff != 0) || (subPosBase == 0) || numNonZero)
>> + {
>> + const uint32_t cnt = tabSigCtx[blkPos] + offset;
>> + ctxSig = cnt & posZeroMask;
>> +
>> + //X265_CHECK(ctxSig == Quant::getSigCtxInc(patternSigCtx,
>> log2TrSize, trSize, codingParameters.scan[subPosBase + scanPosSigOff],
>> bIsLuma, codingParameters.firstSignificanceMapContext), "sigCtx
>> mistake!\n");;
>> + //encodeBin(sig, baseCtx[ctxSig]);
>> + const uint32_t mstate = baseCtx[ctxSig];
>> + const uint32_t mps = mstate & 1;
>> + const uint32_t stateBits = g_entropyStateBits[mstate ^ sig];
>> + uint32_t nextState = (stateBits >> 24) + mps;
>> + if ((mstate ^ sig) == 1)
>> + nextState = sig;
>> + X265_CHECK(sbacNext(mstate, sig) == nextState, "nextState
>> check failure\n");
>> + X265_CHECK(sbacGetEntropyBits(mstate, sig) == (stateBits &
>> 0xFFFFFF), "entropyBits check failure\n");
>> + baseCtx[ctxSig] = (uint8_t)nextState;
>> + sum += stateBits;
>> + }
>> + assert(numNonZero <= 15);
>> + assert(blkPos <= 15);
>> + absCoeff[numNonZero] = tmpCoeff[blkPos];
>> + numNonZero += sig;
>> + scanPosSigOff--;
>> + }
>> + while(scanPosSigOff >= 0);
>> +
>> + return (sum & 0xFFFFFF);
>> +}
>> +
>> } // closing - anonymous file-static namespace
>>
>> namespace x265 {
>> @@ -851,5 +907,6 @@
>>
>> p.scanPosLast = scanPosLast_c;
>> p.findPosFirstLast = findPosFirstLast_c;
>> + p.costCoeffNxN = costCoeffNxN_c;
>> }
>> }
>> diff -r 093618ce0b26 -r 24f347c00df0 source/common/primitives.h
>> --- a/source/common/primitives.h Tue Jun 02 17:21:24 2015 +0800
>> +++ b/source/common/primitives.h Thu Jun 04 12:13:05 2015 -0700
>> @@ -186,6 +186,8 @@
>> typedef int (*scanPosLast_t)(const uint16_t *scan, const coeff_t *coeff,
>> uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig,
>> const uint16_t* scanCG4x4, const int trSize);
>> typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const
>> intptr_t trSize, const uint16_t scanTbl[16]);
>>
>> +typedef uint32_t (*costCoeffNxN_t)(const uint16_t *scan, const coeff_t
>> *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx,
>> uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int
>> subPosBase);
>> +
>> /* Function pointers to optimized encoder primitives. Each pointer can
>> reference
>> * either an assembly routine, a SIMD intrinsic primitive, or a C
>> function */
>> struct EncoderPrimitives
>> @@ -310,6 +312,8 @@
>> scanPosLast_t scanPosLast;
>> findPosFirstLast_t findPosFirstLast;
>>
>> + costCoeffNxN_t costCoeffNxN;
>> +
>> /* There is one set of chroma primitives per color space. An encoder
>> will
>> * have just a single color space and thus it will only ever use one
>> entry
>> * in this array. However we always fill all entries in the array in
>> case
>> diff -r 093618ce0b26 -r 24f347c00df0 source/common/x86/asm-primitives.cpp
>> --- a/source/common/x86/asm-primitives.cpp Tue Jun 02 17:21:24 2015
>> +0800
>> +++ b/source/common/x86/asm-primitives.cpp Thu Jun 04 12:13:05 2015
>> -0700
>> @@ -2048,6 +2048,9 @@
>>
>> ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
>> ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
>> +
>> + // TODO: this passes the smoke test, but we still need testbench
>> coverage, so it is temporarily disabled
>> + //p.costCoeffNxN = x265_costCoeffNxN_sse4;
>> #endif
>> }
>> if (cpuMask & X265_CPU_AVX)
>> diff -r 093618ce0b26 -r 24f347c00df0 source/common/x86/pixel-util.h
>> --- a/source/common/x86/pixel-util.h Tue Jun 02 17:21:24 2015 +0800
>> +++ b/source/common/x86/pixel-util.h Thu Jun 04 12:13:05 2015 -0700
>> @@ -82,6 +82,9 @@
>> int x265_scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t
>> *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int
>> numSig, const uint16_t* scanCG4x4, const int trSize);
>> uint32_t x265_findPosFirstLast_ssse3(const int16_t *dstCoeff, const
>> intptr_t trSize, const uint16_t scanTbl[16]);
>>
>> +uint32_t x265_costCoeffNxN_sse4(const uint16_t *scan, const coeff_t
>> *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx,
>> uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int
>> subPosBase);
>> +
>> +
>> #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
>> void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t* dest,
>> intptr_t destride, const pixel* src0, const pixel* src1, intptr_t
>> srcstride0, intptr_t srcstride1); \
>> void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t
>> destride, const pixel* src0, const int16_t* src1, intptr_t srcStride0,
>> intptr_t srcStride1);
>> diff -r 093618ce0b26 -r 24f347c00df0 source/common/x86/pixel-util8.asm
>> --- a/source/common/x86/pixel-util8.asm Tue Jun 02 17:21:24 2015 +0800
>> +++ b/source/common/x86/pixel-util8.asm Thu Jun 04 12:13:05 2015 -0700
>> @@ -71,6 +71,7 @@
>> cextern pb_64
>> cextern hmul_16p
>> cextern trans8_shuf
>> +cextern_naked g_entropyStateBits
>>
>>
>> ;-----------------------------------------------------------------------------
>> ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t
>> stride)
>> @@ -6362,3 +6363,178 @@
>> add [r1 + 4 * 4], r6d
>> RET
>> %endif ; ARCH_X86_64
>> +
>> +
>> +; uint32_t costCoeffNxN(uint16_t *scan, coeff_t *coeff, intptr_t trSize,
>> uint16_t *absCoeff, uint8_t *tabSigCtx, uint16_t scanFlagMask, uint8_t
>> *baseCtx, int offset, int scanPosSigOff, int subPosBase)
>> +;for (int i = 0; i < MLS_CG_SIZE; i++)
>> +;{
>> +; tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[blkPosBase + i
>> * trSize + 0]);
>> +; tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[blkPosBase + i
>> * trSize + 1]);
>> +; tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[blkPosBase + i
>> * trSize + 2]);
>> +; tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[blkPosBase + i
>> * trSize + 3]);
>> +;}
>> +;do
>> +;{
>> +; uint32_t blkPos, sig, ctxSig;
>> +; blkPos = g_scan4x4[codingParameters.scanType][scanPosSigOff];
>> +; const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? ~0 : 0;
>> +; sig = scanFlagMask & 1;
>> +; scanFlagMask >>= 1;
>> +; if (scanPosSigOff + (subSet == 0) + numNonZero)
>> +; {
>> +; const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset;
>> +; ctxSig = cnt & posZeroMask;
>> +;
>> +; const uint32_t mstate = baseCtx[ctxSig];
>> +; const uint32_t mps = mstate & 1;
>> +; const uint32_t stateBits = g_entropyStateBits[mstate ^ sig];
>> +; uint32_t nextState = (stateBits >> 24) + mps;
>> +; if ((mstate ^ sig) == 1)
>> +; nextState = sig;
>> +; baseCtx[ctxSig] = (uint8_t)nextState;
>> +; sum += stateBits;
>> +; }
>> +; absCoeff[numNonZero] = tmpCoeff[blkPos];
>> +; numNonZero += sig;
>> +; scanPosSigOff--;
>> +;}
>> +;while(scanPosSigOff >= 0);
>> +; sum &= 0xFFFFFF
>> +
>> +%if ARCH_X86_64
>> +; uint32_t costCoeffNxN(uint16_t *scan, coeff_t *coeff, intptr_t trSize,
>> uint16_t *absCoeff, uint8_t *tabSigCtx, uint16_t scanFlagMask, uint8_t
>> *baseCtx, int offset, int scanPosSigOff, int subPosBase)
>> +INIT_XMM sse4
>> +cglobal costCoeffNxN, 6,11,5
>> + add r2d, r2d
>> +
>> + ; abs(coeff)
>> + movh m1, [r1]
>> + movhps m1, [r1 + r2]
>> + movh m2, [r1 + r2 * 2]
>> + lea r2, [r2 * 3]
>> + movhps m2, [r1 + r2]
>> + pabsw m1, m1
>> + pabsw m2, m2
>> + ; r[1-2] free here
>> +
>> + ; WARNING: beyond-bound read here!
>> + ; loading scan table
>> + mov r2d, r8m
>> + xor r2d, 15
>> + movu m0, [r0 + r2 * 2]
>> + movu m3, [r0 + r2 * 2 + mmsize]
>> + packuswb m0, m3
>> + pxor m0, [pb_15]
>> + xchg r2d, r8m
>> + ; r[0-1] free here
>> +
>> + ; reorder coeff
>> + mova m3, [deinterleave_shuf]
>> + pshufb m1, m3
>> + pshufb m2, m3
>> + punpcklqdq m3, m1, m2
>> + punpckhqdq m1, m2
>> + pshufb m3, m0
>> + pshufb m1, m0
>> + punpcklbw m2, m3, m1
>> + punpckhbw m3, m1
>> + ; r[0-1], m[1] free here
>> +
>> + ; loading tabSigCtx (+offset)
>> + mova m1, [r4]
>> + pshufb m1, m0
>> + movd m4, r7m
>> + pxor m5, m5
>> + pshufb m4, m5
>> + paddb m1, m4
>> +
>> + ; register mapping
>> + ; m0 - Zigzag
>> + ; m1 - sigCtx
>> + ; {m3,m2} - abs(coeff)
>> + ; r0 - g_entropyStateBits
>> + ; r1 - baseCtx
>> + ; r2 - scanPosSigOff
>> + ; r3 - absCoeff
>> + ; r4 - nonZero
>> + ; r5 - scanFlagMask
>> + ; r6 - sum
>> + lea r0, [g_entropyStateBits]
>> + mov r1, r6mp
>> + xor r6d, r6d
>> + xor r4d, r4d
>> + xor r8d, r8d
>> +
>> + test r2d, r2d
>> + jz .idx_zero
>> +
>> +.loop:
>> +; {
>> +; const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset;
>> +; ctxSig = cnt & posZeroMask;
>> +; const uint32_t mstate = baseCtx[ctxSig];
>> +; const uint32_t mps = mstate & 1;
>> +; const uint32_t stateBits = g_entropyStateBits[mstate ^ sig];
>> +; uint32_t nextState = (stateBits >> 24) + mps;
>> +; if ((mstate ^ sig) == 1)
>> +; nextState = sig;
>> +; baseCtx[ctxSig] = (uint8_t)nextState;
>> +; sum += stateBits;
>> +; }
>> +; absCoeff[numNonZero] = tmpCoeff[blkPos];
>> +; numNonZero += sig;
>> +; scanPosSigOff--;
>> +
>> + pextrw [r3 + r4 * 2], m2, 0 ; absCoeff[numNonZero] =
>> tmpCoeff[blkPos]
>> + shr r5d, 1
>> + setc r8b ; r8 = sig
>> + add r4d, r8d ; numNonZero += sig
>> + palignr m4, m3, m2, 2
>> + psrldq m3, 2
>> + mova m2, m4
>> + movd r7d, m1 ; r7 = ctxSig
>> + movzx r7d, r7b
>> + psrldq m1, 1
>> + movzx r9d, byte [r1 + r7] ; mstate =
>> baseCtx[ctxSig]
>> + mov r10d, r9d
>> + and r10d, 1 ; mps = mstate & 1
>> + xor r9d, r8d ; r9 = mstate ^ sig
>> + add r6d, [r0 + r9 * 4] ; sum +=
>> g_entropyStateBits[mstate ^ sig]
>> + add r10b, byte [r0 + r9 * 4 + 3] ; nextState = (stateBits
>> >> 24) + mps
>> + cmp r9b, 1
>> + cmove r10d, r8d
>> + mov byte [r1 + r7], r10b
>> +
>> + dec r2d
>> + jg .loop
>> +
>> +.idx_zero:
>> + pextrw [r3 + r4 * 2], m2, 0 ; absCoeff[numNonZero] =
>> tmpCoeff[blkPos]
>> + add r4b, r8m
>> + xor r2d, r2d
>> + cmp word r9m, 0
>> + sete r2b
>> + add r4b, r2b
>> + jz .exit
>> +
>> + dec r2b
>> + movd r3d, m1
>> + and r2d, r3d
>> +
>> + movzx r3d, byte [r1 + r2] ; mstate =
>> baseCtx[ctxSig]
>> + mov r4d, r5d
>> + xor r5d, r3d ; r5 = mstate ^ sig
>> + and r3d, 1 ; mps = mstate & 1
>> + add r6d, [r0 + r5 * 4] ; sum +=
>> g_entropyStateBits[mstate ^ sig]
>> + add r3b, [r0 + r5 * 4 + 3] ; nextState = (stateBits
>> >> 24) + mps
>> + cmp r5b, 1
>> + cmove r3d, r4d
>> + mov byte [r1 + r2], r3b
>> +
>> +.exit:
>> +%ifnidn eax,r6d
>> + mov eax, r6d
>> +%endif
>> + and eax, 0xFFFFFF
>> + RET
>> +%endif ; ARCH_X86_64
>> diff -r 093618ce0b26 -r 24f347c00df0 source/encoder/entropy.cpp
>> --- a/source/encoder/entropy.cpp Tue Jun 02 17:21:24 2015 +0800
>> +++ b/source/encoder/entropy.cpp Thu Jun 04 12:13:05 2015 -0700
>> @@ -1517,12 +1517,12 @@
>> uint8_t * const baseCtx = bIsLuma ?
>> &m_contextState[OFF_SIG_FLAG_CTX] : &m_contextState[OFF_SIG_FLAG_CTX +
>> NUM_SIG_FLAG_CTX_LUMA];
>> uint32_t c1 = 1;
>> int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1;
>> - int absCoeff[1 << MLS_CG_SIZE];
>> + ALIGN_VAR_32(uint16_t, absCoeff[1 << MLS_CG_SIZE]);
>> uint32_t numNonZero = 1;
>> unsigned long lastNZPosInCG;
>> unsigned long firstNZPosInCG;
>>
>> - absCoeff[0] = int(abs(coeff[posLast]));
>> + absCoeff[0] = (uint16_t)abs(coeff[posLast]);
>>
>> for (int subSet = lastScanSet; subSet >= 0; subSet--)
>> {
>> @@ -1600,19 +1600,20 @@
>>
>> const int offset =
>> codingParameters.firstSignificanceMapContext;
>> ALIGN_VAR_32(uint16_t, tmpCoeff[SCAN_SET_SIZE]);
>> - // TODO: accelerate by PABSW
>> const uint32_t blkPosBase =
>> codingParameters.scan[subPosBase];
>> - for (int i = 0; i < MLS_CG_SIZE; i++)
>> - {
>> - tmpCoeff[i * MLS_CG_SIZE + 0] =
>> (uint16_t)abs(coeff[blkPosBase + i * trSize + 0]);
>> - tmpCoeff[i * MLS_CG_SIZE + 1] =
>> (uint16_t)abs(coeff[blkPosBase + i * trSize + 1]);
>> - tmpCoeff[i * MLS_CG_SIZE + 2] =
>> (uint16_t)abs(coeff[blkPosBase + i * trSize + 2]);
>> - tmpCoeff[i * MLS_CG_SIZE + 3] =
>> (uint16_t)abs(coeff[blkPosBase + i * trSize + 3]);
>> - }
>>
>> X265_CHECK(scanPosSigOff >= 0, "scanPosSigOff check
>> failure\n");
>> if (m_bitIf)
>> {
>> + // TODO: accelerate by PABSW
>> + for (int i = 0; i < MLS_CG_SIZE; i++)
>> + {
>> + tmpCoeff[i * MLS_CG_SIZE + 0] =
>> (uint16_t)abs(coeff[blkPosBase + i * trSize + 0]);
>> + tmpCoeff[i * MLS_CG_SIZE + 1] =
>> (uint16_t)abs(coeff[blkPosBase + i * trSize + 1]);
>> + tmpCoeff[i * MLS_CG_SIZE + 2] =
>> (uint16_t)abs(coeff[blkPosBase + i * trSize + 2]);
>> + tmpCoeff[i * MLS_CG_SIZE + 3] =
>> (uint16_t)abs(coeff[blkPosBase + i * trSize + 3]);
>> + }
>> +
>> if (log2TrSize == 2)
>> {
>> do
>> @@ -1667,6 +1668,15 @@
>> uint32_t sum = 0;
>> if (log2TrSize == 2)
>> {
>> + // TODO: accelerate by PABSW
>> + for (int i = 0; i < MLS_CG_SIZE; i++)
>> + {
>> + tmpCoeff[i * MLS_CG_SIZE + 0] =
>> (uint16_t)abs(coeff[blkPosBase + i * trSize + 0]);
>> + tmpCoeff[i * MLS_CG_SIZE + 1] =
>> (uint16_t)abs(coeff[blkPosBase + i * trSize + 1]);
>> + tmpCoeff[i * MLS_CG_SIZE + 2] =
>> (uint16_t)abs(coeff[blkPosBase + i * trSize + 2]);
>> + tmpCoeff[i * MLS_CG_SIZE + 3] =
>> (uint16_t)abs(coeff[blkPosBase + i * trSize + 3]);
>> + }
>> +
>> do
>> {
>> uint32_t blkPos, sig, ctxSig;
>> @@ -1681,7 +1691,7 @@
>> const uint32_t mstate = baseCtx[ctxSig];
>> const uint32_t mps = mstate & 1;
>> const uint32_t stateBits =
>> g_entropyStateBits[mstate ^ sig];
>> - uint32_t nextState = (stateBits >> 23) + mps;
>> + uint32_t nextState = (stateBits >> 24) + mps;
>> if ((mstate ^ sig) == 1)
>> nextState = sig;
>> X265_CHECK(sbacNext(mstate, sig) ==
>> nextState, "nextState check failure\n");
>> @@ -1698,39 +1708,13 @@
>> else
>> {
>> X265_CHECK((log2TrSize > 2), "log2TrSize must be
>> more than 2 in this path!\n");
>> + const uint8_t *tabSigCtx =
>> table_cnt[(uint32_t)patternSigCtx];
>>
>> - const uint8_t *tabSigCtx =
>> table_cnt[(uint32_t)patternSigCtx];
>> - do
>> - {
>> - uint32_t blkPos, sig, ctxSig;
>> - blkPos =
>> g_scan4x4[codingParameters.scanType][scanPosSigOff];
>> - const uint32_t posZeroMask = (subPosBase +
>> scanPosSigOff) ? ~0 : 0;
>> - sig = scanFlagMask & 1;
>> - scanFlagMask >>= 1;
>> - X265_CHECK((uint32_t)(tmpCoeff[blkPos] != 0) ==
>> sig, "sign bit mistake\n");
>> - if (scanPosSigOff != 0 || subSet == 0 ||
>> numNonZero)
>> - {
>> - const uint32_t cnt = tabSigCtx[blkPos] +
>> offset;
>> - ctxSig = (cnt + posOffset) & posZeroMask;
>> + sum =
>> primitives.costCoeffNxN(g_scan4x4[codingParameters.scanType],
>> &coeff[blkPosBase], (intptr_t)trSize, absCoeff + numNonZero, tabSigCtx,
>> scanFlagMask, baseCtx, offset + posOffset, scanPosSigOff, subPosBase);
>>
>> - X265_CHECK(ctxSig ==
>> Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize,
>> codingParameters.scan[subPosBase + scanPosSigOff], bIsLuma,
>> codingParameters.firstSignificanceMapContext), "sigCtx mistake!\n");;
>> - //encodeBin(sig, baseCtx[ctxSig]);
>> - const uint32_t mstate = baseCtx[ctxSig];
>> - const uint32_t mps = mstate & 1;
>> - const uint32_t stateBits =
>> g_entropyStateBits[mstate ^ sig];
>> - uint32_t nextState = (stateBits >> 23) + mps;
>> - if ((mstate ^ sig) == 1)
>> - nextState = sig;
>> - X265_CHECK(sbacNext(mstate, sig) ==
>> nextState, "nextState check failure\n");
>> - X265_CHECK(sbacGetEntropyBits(mstate, sig)
>> == (stateBits & 0xFFFFFF), "entropyBits check failure\n");
>> - baseCtx[ctxSig] = (uint8_t)nextState;
>> - sum += stateBits;
>> - }
>> - absCoeff[numNonZero] = tmpCoeff[blkPos];
>> - numNonZero += sig;
>> - scanPosSigOff--;
>> - }
>> - while(scanPosSigOff >= 0);
>> +#if CHECKED_BUILD || _DEBUG
>> + numNonZero = coeffNum[subSet];
>> +#endif
>> } // end of non 4x4 path
>> sum &= 0xFFFFFF;
>>
>> @@ -2271,28 +2255,6 @@
>> 0x0050c, 0x29bab, 0x004c1, 0x2a674, 0x004a7, 0x2aa5e, 0x0046f,
>> 0x2b32f, 0x0041f, 0x2c0ad, 0x003e7, 0x2ca8d, 0x003ba, 0x2d323, 0x0010c,
>> 0x3bfbb
>> };
>>
>> -// [8 24] --> [stateMPS BitCost], [stateLPS BitCost]
>> -const uint32_t g_entropyStateBits[128] =
>> -{
>> - // Corrected table, most notably for last state
>> - 0x01007b23, 0x000085f9, 0x020074a0, 0x00008cbc, 0x03006ee4,
>> 0x01009354, 0x040067f4, 0x02009c1b,
>> - 0x050060b0, 0x0200a62a, 0x06005a9c, 0x0400af5b, 0x0700548d,
>> 0x0400b955, 0x08004f56, 0x0500c2a9,
>> - 0x09004a87, 0x0600cbf7, 0x0a0045d6, 0x0700d5c3, 0x0b004144,
>> 0x0800e01b, 0x0c003d88, 0x0900e937,
>> - 0x0d0039e0, 0x0900f2cd, 0x0e003663, 0x0b00fc9e, 0x0f003347,
>> 0x0b010600, 0x10003050, 0x0c010f95,
>> - 0x11002d4d, 0x0d011a02, 0x12002ad3, 0x0d012333, 0x1300286e,
>> 0x0f012cad, 0x14002604, 0x0f0136df,
>> - 0x15002425, 0x10013f48, 0x160021f4, 0x100149c4, 0x1700203e,
>> 0x1201527b, 0x18001e4d, 0x12015d00,
>> - 0x19001c99, 0x130166de, 0x1a001b18, 0x13017017, 0x1b0019a5,
>> 0x15017988, 0x1c001841, 0x15018327,
>> - 0x1d0016df, 0x16018d50, 0x1e0015d9, 0x16019547, 0x1f00147c,
>> 0x1701a083, 0x2000138e, 0x1801a8a3,
>> - 0x21001251, 0x1801b418, 0x22001166, 0x1901bd27, 0x23001068,
>> 0x1a01c77b, 0x24000f7f, 0x1a01d18e,
>> - 0x25000eda, 0x1b01d91a, 0x26000e19, 0x1b01e254, 0x27000d4f,
>> 0x1c01ec9a, 0x28000c90, 0x1d01f6e0,
>> - 0x29000c01, 0x1d01fef8, 0x2a000b5f, 0x1e0208b1, 0x2b000ab6,
>> 0x1e021362, 0x2c000a15, 0x1e021e46,
>> - 0x2d000988, 0x1f02285d, 0x2e000934, 0x20022ea8, 0x2f0008a8,
>> 0x200239b2, 0x3000081d, 0x21024577,
>> - 0x310007c9, 0x21024ce6, 0x32000763, 0x21025663, 0x33000710,
>> 0x22025e8f, 0x340006a0, 0x22026a26,
>> - 0x35000672, 0x23026f23, 0x360005e8, 0x23027ef8, 0x370005ba,
>> 0x230284b5, 0x3800055e, 0x24029057,
>> - 0x3900050c, 0x24029bab, 0x3a0004c1, 0x2402a674, 0x3b0004a7,
>> 0x2502aa5e, 0x3c00046f, 0x2502b32f,
>> - 0x3d00041f, 0x2502c0ad, 0x3e0003e7, 0x2602ca8d, 0x3e0003ba,
>> 0x2602d323, 0x3f00010c, 0x3f03bfbb,
>> -};
>> -
>> const uint8_t g_nextState[128][2] =
>> {
>> { 2, 1 }, { 0, 3 }, { 4, 0 }, { 1, 5 }, { 6, 2 }, { 3, 7 }, { 8, 4
>> }, { 5, 9 },
>> @@ -2314,3 +2276,26 @@
>> };
>>
>> }
>> +
>> +// [8 24] --> [stateMPS BitCost], [stateLPS BitCost]
>> +extern "C" const uint32_t g_entropyStateBits[128] =
>> +{
>> + // Corrected table, most notably for last state
>> + 0x02007B23, 0x000085F9, 0x040074A0, 0x00008CBC, 0x06006EE4,
>> 0x02009354, 0x080067F4, 0x04009C1B,
>> + 0x0A0060B0, 0x0400A62A, 0x0C005A9C, 0x0800AF5B, 0x0E00548D,
>> 0x0800B955, 0x10004F56, 0x0A00C2A9,
>> + 0x12004A87, 0x0C00CBF7, 0x140045D6, 0x0E00D5C3, 0x16004144,
>> 0x1000E01B, 0x18003D88, 0x1200E937,
>> + 0x1A0039E0, 0x1200F2CD, 0x1C003663, 0x1600FC9E, 0x1E003347,
>> 0x16010600, 0x20003050, 0x18010F95,
>> + 0x22002D4D, 0x1A011A02, 0x24002AD3, 0x1A012333, 0x2600286E,
>> 0x1E012CAD, 0x28002604, 0x1E0136DF,
>> + 0x2A002425, 0x20013F48, 0x2C0021F4, 0x200149C4, 0x2E00203E,
>> 0x2401527B, 0x30001E4D, 0x24015D00,
>> + 0x32001C99, 0x260166DE, 0x34001B18, 0x26017017, 0x360019A5,
>> 0x2A017988, 0x38001841, 0x2A018327,
>> + 0x3A0016DF, 0x2C018D50, 0x3C0015D9, 0x2C019547, 0x3E00147C,
>> 0x2E01A083, 0x4000138E, 0x3001A8A3,
>> + 0x42001251, 0x3001B418, 0x44001166, 0x3201BD27, 0x46001068,
>> 0x3401C77B, 0x48000F7F, 0x3401D18E,
>> + 0x4A000EDA, 0x3601D91A, 0x4C000E19, 0x3601E254, 0x4E000D4F,
>> 0x3801EC9A, 0x50000C90, 0x3A01F6E0,
>> + 0x52000C01, 0x3A01FEF8, 0x54000B5F, 0x3C0208B1, 0x56000AB6,
>> 0x3C021362, 0x58000A15, 0x3C021E46,
>> + 0x5A000988, 0x3E02285D, 0x5C000934, 0x40022EA8, 0x5E0008A8,
>> 0x400239B2, 0x6000081D, 0x42024577,
>> + 0x620007C9, 0x42024CE6, 0x64000763, 0x42025663, 0x66000710,
>> 0x44025E8F, 0x680006A0, 0x44026A26,
>> + 0x6A000672, 0x46026F23, 0x6C0005E8, 0x46027EF8, 0x6E0005BA,
>> 0x460284B5, 0x7000055E, 0x48029057,
>> + 0x7200050C, 0x48029BAB, 0x740004C1, 0x4802A674, 0x760004A7,
>> 0x4A02AA5E, 0x7800046F, 0x4A02B32F,
>> + 0x7A00041F, 0x4A02C0AD, 0x7C0003E7, 0x4C02CA8D, 0x7C0003BA,
>> 0x4C02D323, 0x7E00010C, 0x7E03BFBB,
>> +};
>> +
>>
>> _______________________________________________
>> x265-devel mailing list
>> x265-devel at videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150605/d3cecbb6/attachment-0001.html>
More information about the x265-devel
mailing list