[x265] [PATCH 1 of 6] asm: sse4 version of costCoeffGroupNxN in codeCoeffNxN

Min Chen chenm003 at 163.com
Thu Jun 4 21:13:43 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1433445185 25200
# Node ID 24f347c00df01352fa6860e05b376846d8d8cc74
# Parent  093618ce0b26ea4703b5928f618d2895cf6daf32
asm: sse4 version of costCoeffGroupNxN in codeCoeffNxN
---
 source/common/constants.cpp          |    5 +-
 source/common/constants.h            |    2 +-
 source/common/contexts.h             |    3 +-
 source/common/dct.cpp                |   57 +++++++++++
 source/common/primitives.h           |    4 +
 source/common/x86/asm-primitives.cpp |    2 +
 source/common/x86/pixel-util.h       |    3 +
 source/common/x86/pixel-util8.asm    |  176 ++++++++++++++++++++++++++++++++++
 source/encoder/entropy.cpp           |  113 ++++++++++------------
 9 files changed, 297 insertions(+), 68 deletions(-)

diff -r 093618ce0b26 -r 24f347c00df0 source/common/constants.cpp
--- a/source/common/constants.cpp	Tue Jun 02 17:21:24 2015 +0800
+++ b/source/common/constants.cpp	Thu Jun 04 12:13:05 2015 -0700
@@ -324,11 +324,12 @@
       4,  12, 20, 28,  5, 13, 21, 29,  6, 14, 22, 30,  7, 15, 23, 31, 36, 44, 52, 60, 37, 45, 53, 61, 38, 46, 54, 62, 39, 47, 55, 63 }
 };
 
-ALIGN_VAR_16(const uint16_t, g_scan4x4[NUM_SCAN_TYPE][4 * 4]) =
+ALIGN_VAR_16(const uint16_t, g_scan4x4[NUM_SCAN_TYPE + 1][4 * 4]) =
 {
     { 0,  4,  1,  8,  5,  2, 12,  9,  6,  3, 13, 10,  7, 14, 11, 15 },
     { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
-    { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 }
+    { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 },
+    { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 }  // padding row only: the costCoeffNxN assembly loads 32 bytes starting at row[15 - scanPosSigOff] and may run past the last real row
 };
 
 const uint16_t g_scan16x16[16 * 16] =
diff -r 093618ce0b26 -r 24f347c00df0 source/common/constants.h
--- a/source/common/constants.h	Tue Jun 02 17:21:24 2015 +0800
+++ b/source/common/constants.h	Thu Jun 04 12:13:05 2015 -0700
@@ -83,7 +83,7 @@
 extern const uint16_t* const g_scanOrder[NUM_SCAN_TYPE][NUM_SCAN_SIZE];
 extern const uint16_t* const g_scanOrderCG[NUM_SCAN_TYPE][NUM_SCAN_SIZE];
 extern const uint16_t g_scan8x8diag[8 * 8];
-extern const uint16_t g_scan4x4[NUM_SCAN_TYPE][4 * 4];
+extern const uint16_t g_scan4x4[NUM_SCAN_TYPE + 1][4 * 4];  // +1 row is a safety pad for the codeCoeffNxN assembly, which may read up to 15 entries (30 bytes) past the end of a scan row
 
 extern const uint8_t g_lastCoeffTable[32];
 extern const uint8_t g_goRiceRange[5]; // maximum value coded with Rice codes
diff -r 093618ce0b26 -r 24f347c00df0 source/common/contexts.h
--- a/source/common/contexts.h	Tue Jun 02 17:21:24 2015 +0800
+++ b/source/common/contexts.h	Thu Jun 04 12:13:05 2015 -0700
@@ -102,11 +102,12 @@
 #define OFF_TQUANT_BYPASS_FLAG_CTX (OFF_TRANSFORMSKIP_FLAG_CTX + 2 * NUM_TRANSFORMSKIP_FLAG_CTX)
 #define MAX_OFF_CTX_MOD            (OFF_TQUANT_BYPASS_FLAG_CTX +     NUM_TQUANT_BYPASS_FLAG_CTX)
 
+extern "C" const uint32_t g_entropyStateBits[128];
+
 namespace x265 {
 // private namespace
 
 extern const uint32_t g_entropyBits[128];
-extern const uint32_t g_entropyStateBits[128];
 extern const uint8_t g_nextState[128][2];
 
 #define sbacGetMps(S)            ((S) & 1)
diff -r 093618ce0b26 -r 24f347c00df0 source/common/dct.cpp
--- a/source/common/dct.cpp	Tue Jun 02 17:21:24 2015 +0800
+++ b/source/common/dct.cpp	Thu Jun 04 12:13:05 2015 -0700
@@ -29,6 +29,7 @@
 
 #include "common.h"
 #include "primitives.h"
+#include "contexts.h"   // costCoeffNxN_c
 
 using namespace x265;
 
@@ -817,6 +818,61 @@
     return ((lastNZPosInCG << 16) | firstNZPosInCG);
 }
 
+// C reference for the costCoeffNxN primitive: walks one 4x4 coefficient group from scanPosSigOff down to 0, stores abs(coeff) in scan order into absCoeff, updates the significance contexts in baseCtx and returns the accumulated bit cost (low 24 bits of the sum).
+uint32_t costCoeffNxN_c(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)
+{
+    ALIGN_VAR_32(uint16_t, tmpCoeff[SCAN_SET_SIZE]);
+    uint32_t numNonZero = (scanPosSigOff < (SCAN_SET_SIZE - 1) ? 1 : 0);   // starts at 1 unless the walk begins at the last position of the group
+    uint32_t sum = 0;
+
+    // the entropy coder passes absCoeff + numNonZero; step back so absCoeff[numNonZero] below hits the same slot (matches the assembly)
+    absCoeff -= numNonZero;
+
+    for (int i = 0; i < MLS_CG_SIZE; i++)
+    {
+        tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[i * trSize + 0]);
+        tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[i * trSize + 1]);
+        tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[i * trSize + 2]);
+        tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[i * trSize + 3]);
+    }
+
+    do
+    {
+        uint32_t blkPos, sig, ctxSig;
+        blkPos = scan[scanPosSigOff];
+        const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? ~0 : 0;
+        sig     = scanFlagMask & 1;
+        scanFlagMask >>= 1;
+        X265_CHECK((uint32_t)(tmpCoeff[blkPos] != 0) == sig, "sign bit mistake\n");
+        if ((scanPosSigOff != 0) || (subPosBase == 0) || numNonZero)
+        {
+            const uint32_t cnt = tabSigCtx[blkPos] + offset;
+            ctxSig = cnt & posZeroMask;
+
+            //X265_CHECK(ctxSig == Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize, codingParameters.scan[subPosBase + scanPosSigOff], bIsLuma, codingParameters.firstSignificanceMapContext), "sigCtx mistake!\n");;
+            //encodeBin(sig, baseCtx[ctxSig]);
+            const uint32_t mstate = baseCtx[ctxSig];
+            const uint32_t mps = mstate & 1;
+            const uint32_t stateBits = g_entropyStateBits[mstate ^ sig];
+            uint32_t nextState = (stateBits >> 24) + mps;
+            if ((mstate ^ sig) == 1)
+                nextState = sig;
+            X265_CHECK(sbacNext(mstate, sig) == nextState, "nextState check failure\n");
+            X265_CHECK(sbacGetEntropyBits(mstate, sig) == (stateBits & 0xFFFFFF), "entropyBits check failure\n");
+            baseCtx[ctxSig] = (uint8_t)nextState;
+            sum += stateBits;
+        }
+        X265_CHECK(numNonZero <= 15, "numNonZero check failure\n");
+        X265_CHECK(blkPos <= 15, "blkPos check failure\n");
+        absCoeff[numNonZero] = tmpCoeff[blkPos];
+        numNonZero += sig;
+        scanPosSigOff--;
+    }
+    while(scanPosSigOff >= 0);
+
+    return (sum & 0xFFFFFF);   // the top 8 bits of each table entry carry next-state data, not cost
+}
+
 }  // closing - anonymous file-static namespace
 
 namespace x265 {
@@ -851,5 +907,6 @@
 
     p.scanPosLast = scanPosLast_c;
     p.findPosFirstLast = findPosFirstLast_c;
+    p.costCoeffNxN = costCoeffNxN_c;
 }
 }
diff -r 093618ce0b26 -r 24f347c00df0 source/common/primitives.h
--- a/source/common/primitives.h	Tue Jun 02 17:21:24 2015 +0800
+++ b/source/common/primitives.h	Thu Jun 04 12:13:05 2015 -0700
@@ -186,6 +186,8 @@
 typedef int (*scanPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
 typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
 
+typedef uint32_t (*costCoeffNxN_t)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
+
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
 struct EncoderPrimitives
@@ -310,6 +312,8 @@
     scanPosLast_t         scanPosLast;
     findPosFirstLast_t    findPosFirstLast;
 
+    costCoeffNxN_t        costCoeffNxN;
+
     /* There is one set of chroma primitives per color space. An encoder will
      * have just a single color space and thus it will only ever use one entry
      * in this array. However we always fill all entries in the array in case
diff -r 093618ce0b26 -r 24f347c00df0 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jun 02 17:21:24 2015 +0800
+++ b/source/common/x86/asm-primitives.cpp	Thu Jun 04 12:13:05 2015 -0700
@@ -2048,6 +2048,9 @@
 
         ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
         ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
+
+        // TODO: this passes the smoke test, but it stays disabled until a testbench exists
+        //p.costCoeffNxN = x265_costCoeffNxN_sse4;
 #endif
     }
     if (cpuMask & X265_CPU_AVX)
diff -r 093618ce0b26 -r 24f347c00df0 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Tue Jun 02 17:21:24 2015 +0800
+++ b/source/common/x86/pixel-util.h	Thu Jun 04 12:13:05 2015 -0700
@@ -82,6 +82,9 @@
 int x265_scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
 uint32_t x265_findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
 
+uint32_t x265_costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
+
+
 #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
     void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t*  dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \
     void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t*  src1, intptr_t srcStride0, intptr_t srcStride1);
diff -r 093618ce0b26 -r 24f347c00df0 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Tue Jun 02 17:21:24 2015 +0800
+++ b/source/common/x86/pixel-util8.asm	Thu Jun 04 12:13:05 2015 -0700
@@ -71,6 +71,7 @@
 cextern pb_64
 cextern hmul_16p
 cextern trans8_shuf
+cextern_naked g_entropyStateBits
 
 ;-----------------------------------------------------------------------------
 ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
@@ -6362,3 +6363,178 @@
     add         [r1 + 4 * 4], r6d
     RET
 %endif ; ARCH_X86_64
+
+
+; uint32_t costCoeffNxN(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)
+;for (int i = 0; i < MLS_CG_SIZE; i++)
+;{
+;    tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 0]);
+;    tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 1]);
+;    tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 2]);
+;    tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 3]);
+;}
+;do
+;{
+;    uint32_t blkPos, sig, ctxSig;
+;    blkPos = g_scan4x4[codingParameters.scanType][scanPosSigOff];
+;    const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? ~0 : 0;
+;    sig     = scanFlagMask & 1;
+;    scanFlagMask >>= 1;
+;    if (scanPosSigOff + (subSet == 0) + numNonZero)
+;    {
+;        const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset;
+;        ctxSig = cnt & posZeroMask;
+;
+;        const uint32_t mstate = baseCtx[ctxSig];
+;        const uint32_t mps = mstate & 1;
+;        const uint32_t stateBits = g_entropyStateBits[mstate ^ sig];
+;        uint32_t nextState = (stateBits >> 24) + mps;
+;        if ((mstate ^ sig) == 1)
+;            nextState = sig;
+;        baseCtx[ctxSig] = (uint8_t)nextState;
+;        sum += stateBits;
+;    }
+;    absCoeff[numNonZero] = tmpCoeff[blkPos];
+;    numNonZero += sig;
+;    scanPosSigOff--;
+;}
+;while(scanPosSigOff >= 0);
+; sum &= 0xFFFFFF
+
+%if ARCH_X86_64
+; uint32_t costCoeffNxN(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)
+INIT_XMM sse4
+cglobal costCoeffNxN, 6,11,5
+    add         r2d, r2d                        ; r2 = trSize * 2 (row stride in bytes; coefficients are handled as 16-bit words)
+
+    ; abs(coeff)
+    movh        m1, [r1]
+    movhps      m1, [r1 + r2]
+    movh        m2, [r1 + r2 * 2]
+    lea         r2, [r2 * 3]
+    movhps      m2, [r1 + r2]
+    pabsw       m1, m1
+    pabsw       m2, m2
+    ; r[1-2] free here
+
+    ; WARNING: beyond-bound read here! 32 bytes are loaded starting at scan[15 - scanPosSigOff]
+    ; loading scan table
+    mov         r2d, r8m                        ; r2 = scanPosSigOff
+    xor         r2d, 15                         ; r2 = scanPosSigOff ^ 15
+    movu        m0, [r0 + r2 * 2]
+    movu        m3, [r0 + r2 * 2 + mmsize]
+    packuswb    m0, m3
+    pxor        m0, [pb_15]
+    xchg        r2d, r8m                        ; r2 = scanPosSigOff, r8m = scanPosSigOff ^ 15
+    ; r[0-1] free here
+
+    ; reorder coeff
+    mova        m3, [deinterleave_shuf]
+    pshufb      m1, m3
+    pshufb      m2, m3
+    punpcklqdq  m3, m1, m2
+    punpckhqdq  m1, m2
+    pshufb      m3, m0
+    pshufb      m1, m0
+    punpcklbw   m2, m3, m1
+    punpckhbw   m3, m1
+    ; r[0-1], m[1] free here
+
+    ; loading tabSigCtx (+offset)
+    mova        m1, [r4]
+    pshufb      m1, m0
+    movd        m4, r7m
+    pxor        m5, m5                          ; NOTE(review): m5 is used here but cglobal declares only 5 XMM registers (m0-m4) - confirm the count
+    pshufb      m4, m5
+    paddb       m1, m4
+
+    ; register mapping
+    ; m0 - Zigzag
+    ; m1 - sigCtx
+    ; {m3,m2} - abs(coeff)
+    ; r0 - g_entropyStateBits
+    ; r1 - baseCtx
+    ; r2 - scanPosSigOff
+    ; r3 - absCoeff
+    ; r4 - nonZero
+    ; r5 - scanFlagMask
+    ; r6 - sum
+    lea         r0, [g_entropyStateBits]
+    mov         r1, r6mp
+    xor         r6d, r6d
+    xor         r4d, r4d
+    xor         r8d, r8d
+
+    test        r2d, r2d
+    jz         .idx_zero
+
+.loop:
+;   {
+;        const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset;
+;        ctxSig = cnt & posZeroMask;
+;        const uint32_t mstate = baseCtx[ctxSig];
+;        const uint32_t mps = mstate & 1;
+;        const uint32_t stateBits = g_entropyStateBits[mstate ^ sig];
+;        uint32_t nextState = (stateBits >> 24) + mps;
+;        if ((mstate ^ sig) == 1)
+;            nextState = sig;
+;        baseCtx[ctxSig] = (uint8_t)nextState;
+;        sum += stateBits;
+;    }
+;    absCoeff[numNonZero] = tmpCoeff[blkPos];
+;    numNonZero += sig;
+;    scanPosSigOff--;
+
+    pextrw      [r3 + r4 * 2], m2, 0            ; absCoeff[numNonZero] = tmpCoeff[blkPos]
+    shr         r5d, 1
+    setc        r8b                             ; r8 = sig
+    add         r4d, r8d                        ; numNonZero += sig
+    palignr     m4, m3, m2, 2
+    psrldq      m3, 2
+    mova        m2, m4
+    movd        r7d, m1                         ; r7 = ctxSig
+    movzx       r7d, r7b
+    psrldq      m1, 1
+    movzx       r9d, byte [r1 + r7]             ; mstate = baseCtx[ctxSig]
+    mov         r10d, r9d
+    and         r10d, 1                         ; mps = mstate & 1
+    xor         r9d, r8d                        ; r9 = mstate ^ sig
+    add         r6d, [r0 + r9 * 4]              ; sum += g_entropyStateBits[mstate ^ sig]
+    add         r10b, byte [r0 + r9 * 4 + 3]    ; nextState = (stateBits >> 24) + mps
+    cmp         r9b, 1
+    cmove       r10d, r8d
+    mov    byte [r1 + r7], r10b
+
+    dec         r2d
+    jg         .loop
+
+.idx_zero:
+    pextrw      [r3 + r4 * 2], m2, 0            ; absCoeff[numNonZero] = tmpCoeff[blkPos]
+    add         r4b, r8m                        ; r8m holds scanPosSigOff ^ 15 (stored by the xchg above)
+    xor         r2d, r2d
+    cmp    word r9m, 0
+    sete        r2b                             ; r2 = (subPosBase == 0)
+    add         r4b, r2b
+    jz         .exit                            ; skip the last position when the byte sum is zero
+
+    dec         r2b                             ; r2 = 0xFF when subPosBase != 0, else 0 (posZeroMask)
+    movd        r3d, m1
+    and         r2d, r3d                        ; r2 = ctxSig
+
+    movzx       r3d, byte [r1 + r2]             ; mstate = baseCtx[ctxSig]
+    mov         r4d, r5d                        ; r4 = sig (what is left of scanFlagMask)
+    xor         r5d, r3d                        ; r5 = mstate ^ sig
+    and         r3d, 1                          ; mps = mstate & 1
+    add         r6d, [r0 + r5 * 4]              ; sum += g_entropyStateBits[mstate ^ sig]
+    add         r3b, [r0 + r5 * 4 + 3]          ; nextState = (stateBits >> 24) + mps
+    cmp         r5b, 1
+    cmove       r3d, r4d
+    mov    byte [r1 + r2], r3b
+
+.exit:
+%ifnidn eax,r6d
+    mov         eax, r6d
+%endif
+    and         eax, 0xFFFFFF
+    RET
+%endif ; ARCH_X86_64
diff -r 093618ce0b26 -r 24f347c00df0 source/encoder/entropy.cpp
--- a/source/encoder/entropy.cpp	Tue Jun 02 17:21:24 2015 +0800
+++ b/source/encoder/entropy.cpp	Thu Jun 04 12:13:05 2015 -0700
@@ -1517,12 +1517,12 @@
     uint8_t * const baseCtx = bIsLuma ? &m_contextState[OFF_SIG_FLAG_CTX] : &m_contextState[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
     uint32_t c1 = 1;
     int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1;
-    int absCoeff[1 << MLS_CG_SIZE];
+    ALIGN_VAR_32(uint16_t, absCoeff[1 << MLS_CG_SIZE]);
     uint32_t numNonZero = 1;
     unsigned long lastNZPosInCG;
     unsigned long firstNZPosInCG;
 
-    absCoeff[0] = int(abs(coeff[posLast]));
+    absCoeff[0] = (uint16_t)abs(coeff[posLast]);
 
     for (int subSet = lastScanSet; subSet >= 0; subSet--)
     {
@@ -1600,19 +1600,20 @@
 
             const int offset = codingParameters.firstSignificanceMapContext;
             ALIGN_VAR_32(uint16_t, tmpCoeff[SCAN_SET_SIZE]);
-            // TODO: accelerate by PABSW
             const uint32_t blkPosBase  = codingParameters.scan[subPosBase];
-            for (int i = 0; i < MLS_CG_SIZE; i++)
-            {
-                tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 0]);
-                tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 1]);
-                tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 2]);
-                tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 3]);
-            }
 
             X265_CHECK(scanPosSigOff >= 0, "scanPosSigOff check failure\n");
             if (m_bitIf)
             {
+                // TODO: accelerate by PABSW
+                for (int i = 0; i < MLS_CG_SIZE; i++)
+                {
+                    tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 0]);
+                    tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 1]);
+                    tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 2]);
+                    tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 3]);
+                }
+
                 if (log2TrSize == 2)
                 {
                     do
@@ -1667,6 +1668,15 @@
                 uint32_t sum = 0;
                 if (log2TrSize == 2)
                 {
+                    // TODO: accelerate by PABSW
+                    for (int i = 0; i < MLS_CG_SIZE; i++)
+                    {
+                        tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 0]);
+                        tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 1]);
+                        tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 2]);
+                        tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 3]);
+                    }
+
                     do
                     {
                         uint32_t blkPos, sig, ctxSig;
@@ -1681,7 +1691,7 @@
                             const uint32_t mstate = baseCtx[ctxSig];
                             const uint32_t mps = mstate & 1;
                             const uint32_t stateBits = g_entropyStateBits[mstate ^ sig];
-                            uint32_t nextState = (stateBits >> 23) + mps;
+                            uint32_t nextState = (stateBits >> 24) + mps;
                             if ((mstate ^ sig) == 1)
                                 nextState = sig;
                             X265_CHECK(sbacNext(mstate, sig) == nextState, "nextState check failure\n");
@@ -1698,39 +1708,13 @@
                 else
                 {
                     X265_CHECK((log2TrSize > 2), "log2TrSize must be more than 2 in this path!\n");
+                    const uint8_t *tabSigCtx = table_cnt[(uint32_t)patternSigCtx];
 
-                    const uint8_t *tabSigCtx = table_cnt[(uint32_t)patternSigCtx];
-                    do
-                    {
-                        uint32_t blkPos, sig, ctxSig;
-                        blkPos = g_scan4x4[codingParameters.scanType][scanPosSigOff];
-                        const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? ~0 : 0;
-                        sig     = scanFlagMask & 1;
-                        scanFlagMask >>= 1;
-                        X265_CHECK((uint32_t)(tmpCoeff[blkPos] != 0) == sig, "sign bit mistake\n");
-                        if (scanPosSigOff != 0 || subSet == 0 || numNonZero)
-                        {
-                            const uint32_t cnt = tabSigCtx[blkPos] + offset;
-                            ctxSig = (cnt + posOffset) & posZeroMask;
+                    sum = primitives.costCoeffNxN(g_scan4x4[codingParameters.scanType], &coeff[blkPosBase], (intptr_t)trSize, absCoeff + numNonZero, tabSigCtx, scanFlagMask, baseCtx, offset + posOffset, scanPosSigOff, subPosBase);
 
-                            X265_CHECK(ctxSig == Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize, codingParameters.scan[subPosBase + scanPosSigOff], bIsLuma, codingParameters.firstSignificanceMapContext), "sigCtx mistake!\n");;
-                            //encodeBin(sig, baseCtx[ctxSig]);
-                            const uint32_t mstate = baseCtx[ctxSig];
-                            const uint32_t mps = mstate & 1;
-                            const uint32_t stateBits = g_entropyStateBits[mstate ^ sig];
-                            uint32_t nextState = (stateBits >> 23) + mps;
-                            if ((mstate ^ sig) == 1)
-                                nextState = sig;
-                            X265_CHECK(sbacNext(mstate, sig) == nextState, "nextState check failure\n");
-                            X265_CHECK(sbacGetEntropyBits(mstate, sig) == (stateBits & 0xFFFFFF), "entropyBits check failure\n");
-                            baseCtx[ctxSig] = (uint8_t)nextState;
-                            sum += stateBits;
-                        }
-                        absCoeff[numNonZero] = tmpCoeff[blkPos];
-                        numNonZero += sig;
-                        scanPosSigOff--;
-                    }
-                    while(scanPosSigOff >= 0);
+#if CHECKED_BUILD || _DEBUG
+                    numNonZero = coeffNum[subSet];
+#endif
                 } // end of non 4x4 path
                 sum &= 0xFFFFFF;
 
@@ -2271,28 +2255,6 @@
     0x0050c, 0x29bab, 0x004c1, 0x2a674, 0x004a7, 0x2aa5e, 0x0046f, 0x2b32f, 0x0041f, 0x2c0ad, 0x003e7, 0x2ca8d, 0x003ba, 0x2d323, 0x0010c, 0x3bfbb
 };
 
-// [8 24] --> [stateMPS BitCost], [stateLPS BitCost]
-const uint32_t g_entropyStateBits[128] =
-{
-    // Corrected table, most notably for last state
-    0x01007b23, 0x000085f9, 0x020074a0, 0x00008cbc, 0x03006ee4, 0x01009354, 0x040067f4, 0x02009c1b,
-    0x050060b0, 0x0200a62a, 0x06005a9c, 0x0400af5b, 0x0700548d, 0x0400b955, 0x08004f56, 0x0500c2a9,
-    0x09004a87, 0x0600cbf7, 0x0a0045d6, 0x0700d5c3, 0x0b004144, 0x0800e01b, 0x0c003d88, 0x0900e937,
-    0x0d0039e0, 0x0900f2cd, 0x0e003663, 0x0b00fc9e, 0x0f003347, 0x0b010600, 0x10003050, 0x0c010f95,
-    0x11002d4d, 0x0d011a02, 0x12002ad3, 0x0d012333, 0x1300286e, 0x0f012cad, 0x14002604, 0x0f0136df,
-    0x15002425, 0x10013f48, 0x160021f4, 0x100149c4, 0x1700203e, 0x1201527b, 0x18001e4d, 0x12015d00,
-    0x19001c99, 0x130166de, 0x1a001b18, 0x13017017, 0x1b0019a5, 0x15017988, 0x1c001841, 0x15018327,
-    0x1d0016df, 0x16018d50, 0x1e0015d9, 0x16019547, 0x1f00147c, 0x1701a083, 0x2000138e, 0x1801a8a3,
-    0x21001251, 0x1801b418, 0x22001166, 0x1901bd27, 0x23001068, 0x1a01c77b, 0x24000f7f, 0x1a01d18e,
-    0x25000eda, 0x1b01d91a, 0x26000e19, 0x1b01e254, 0x27000d4f, 0x1c01ec9a, 0x28000c90, 0x1d01f6e0,
-    0x29000c01, 0x1d01fef8, 0x2a000b5f, 0x1e0208b1, 0x2b000ab6, 0x1e021362, 0x2c000a15, 0x1e021e46,
-    0x2d000988, 0x1f02285d, 0x2e000934, 0x20022ea8, 0x2f0008a8, 0x200239b2, 0x3000081d, 0x21024577,
-    0x310007c9, 0x21024ce6, 0x32000763, 0x21025663, 0x33000710, 0x22025e8f, 0x340006a0, 0x22026a26,
-    0x35000672, 0x23026f23, 0x360005e8, 0x23027ef8, 0x370005ba, 0x230284b5, 0x3800055e, 0x24029057,
-    0x3900050c, 0x24029bab, 0x3a0004c1, 0x2402a674, 0x3b0004a7, 0x2502aa5e, 0x3c00046f, 0x2502b32f,
-    0x3d00041f, 0x2502c0ad, 0x3e0003e7, 0x2602ca8d, 0x3e0003ba, 0x2602d323, 0x3f00010c, 0x3f03bfbb,
-};
-
 const uint8_t g_nextState[128][2] =
 {
     { 2, 1 }, { 0, 3 }, { 4, 0 }, { 1, 5 }, { 6, 2 }, { 3, 7 }, { 8, 4 }, { 5, 9 },
@@ -2314,3 +2276,26 @@
 };
 
 }
+
+// [8 24] --> [stateMPS BitCost], [stateLPS BitCost]; indexed by (mstate ^ sig): bits 31..24 are added to mps to form the next state, bits 23..0 are the bit cost
+extern "C" const uint32_t g_entropyStateBits[128] =
+{
+    // Corrected table, most notably for last state
+    0x02007B23, 0x000085F9, 0x040074A0, 0x00008CBC, 0x06006EE4, 0x02009354, 0x080067F4, 0x04009C1B,
+    0x0A0060B0, 0x0400A62A, 0x0C005A9C, 0x0800AF5B, 0x0E00548D, 0x0800B955, 0x10004F56, 0x0A00C2A9,
+    0x12004A87, 0x0C00CBF7, 0x140045D6, 0x0E00D5C3, 0x16004144, 0x1000E01B, 0x18003D88, 0x1200E937,
+    0x1A0039E0, 0x1200F2CD, 0x1C003663, 0x1600FC9E, 0x1E003347, 0x16010600, 0x20003050, 0x18010F95,
+    0x22002D4D, 0x1A011A02, 0x24002AD3, 0x1A012333, 0x2600286E, 0x1E012CAD, 0x28002604, 0x1E0136DF,
+    0x2A002425, 0x20013F48, 0x2C0021F4, 0x200149C4, 0x2E00203E, 0x2401527B, 0x30001E4D, 0x24015D00,
+    0x32001C99, 0x260166DE, 0x34001B18, 0x26017017, 0x360019A5, 0x2A017988, 0x38001841, 0x2A018327,
+    0x3A0016DF, 0x2C018D50, 0x3C0015D9, 0x2C019547, 0x3E00147C, 0x2E01A083, 0x4000138E, 0x3001A8A3,
+    0x42001251, 0x3001B418, 0x44001166, 0x3201BD27, 0x46001068, 0x3401C77B, 0x48000F7F, 0x3401D18E,
+    0x4A000EDA, 0x3601D91A, 0x4C000E19, 0x3601E254, 0x4E000D4F, 0x3801EC9A, 0x50000C90, 0x3A01F6E0,
+    0x52000C01, 0x3A01FEF8, 0x54000B5F, 0x3C0208B1, 0x56000AB6, 0x3C021362, 0x58000A15, 0x3C021E46,
+    0x5A000988, 0x3E02285D, 0x5C000934, 0x40022EA8, 0x5E0008A8, 0x400239B2, 0x6000081D, 0x42024577,
+    0x620007C9, 0x42024CE6, 0x64000763, 0x42025663, 0x66000710, 0x44025E8F, 0x680006A0, 0x44026A26,
+    0x6A000672, 0x46026F23, 0x6C0005E8, 0x46027EF8, 0x6E0005BA, 0x460284B5, 0x7000055E, 0x48029057,
+    0x7200050C, 0x48029BAB, 0x740004C1, 0x4802A674, 0x760004A7, 0x4A02AA5E, 0x7800046F, 0x4A02B32F,
+    0x7A00041F, 0x4A02C0AD, 0x7C0003E7, 0x4C02CA8D, 0x7C0003BA, 0x4C02D323, 0x7E00010C, 0x7E03BFBB,
+};
+



More information about the x265-devel mailing list