[x265] [PATCH 6 of 6] asm: improve costCoeffRemain by bypass uncoded coeff

Tue Jun 9 20:06:06 CEST 2015

# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1433872879 25200
# Node ID e5b6f0a984bdd8ab16b63fb1c11a508a444515ec
# Parent  134670771e0c1dd0800c3e9db0a1f9f69c467e36
asm: improve costCoeffRemain by bypass uncoded coeff
---
 source/common/dct.cpp             |   17 ++++----
 source/common/primitives.h        |    2 +-
 source/common/x86/pixel-util.h    |    2 +-
 source/common/x86/pixel-util8.asm |   41 +++++++++----------
 source/encoder/entropy.cpp        |   82 ++++++++++++++++++++++++++++++++-----
 5 files changed, 101 insertions(+), 43 deletions(-)

diff -r 134670771e0c -r e5b6f0a984bd source/common/dct.cpp

--- a/source/common/dct.cpp	Tue Jun 09 11:01:15 2015 -0700
+++ b/source/common/dct.cpp	Tue Jun 09 11:01:19 2015 -0700
@@ -874,19 +874,19 @@
     return (sum & 0xFFFFFF);
 }
 
-uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero)
+uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero, int idx)
 {
     uint32_t goRiceParam = 0;
-    int firstCoeff2 = 1;
-    uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode format baseLevel
 
     uint32_t sum = 0;
-    int idx = 0;
+    int baseLevel = 3;
     do
     {
-        int baseLevel = (baseLevelN & 3) | firstCoeff2;
-        X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
-        baseLevelN >>= 2;
+        if (idx >= C1FLAG_NUMBER)
+            baseLevel = 1;
+
+        // TODO: idx is no longer a zero-based loop counter (it starts at the caller-supplied offset), so this check is disabled
+        //X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
         int codeNumber = absCoeff[idx] - baseLevel;
 
         if (codeNumber >= 0)
@@ -912,8 +912,7 @@
                 goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
             X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
         }
-        if (absCoeff[idx] >= 2)
-            firstCoeff2 = 0;
+        baseLevel = 2;
         idx++;
     }
     while(idx < numNonZero);
diff -r 134670771e0c -r e5b6f0a984bd source/common/primitives.h
--- a/source/common/primitives.h	Tue Jun 09 11:01:15 2015 -0700
+++ b/source/common/primitives.h	Tue Jun 09 11:01:19 2015 -0700
@@ -187,7 +187,7 @@
 typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
 
 typedef uint32_t (*costCoeffNxN_t)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
-typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero);
+typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero, int idx);
 
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
diff -r 134670771e0c -r e5b6f0a984bd source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Tue Jun 09 11:01:15 2015 -0700
+++ b/source/common/x86/pixel-util.h	Tue Jun 09 11:01:19 2015 -0700
@@ -83,7 +83,7 @@
 uint32_t x265_findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
 
 uint32_t x265_costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
-uint32_t x265_costCoeffRemain_sse4(uint16_t *absCoeff, int numNonZero);
+uint32_t x265_costCoeffRemain_sse4(uint16_t *absCoeff, int numNonZero, int idx);
 
 
 #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
diff -r 134670771e0c -r e5b6f0a984bd source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Tue Jun 09 11:01:15 2015 -0700
+++ b/source/common/x86/pixel-util8.asm	Tue Jun 09 11:01:19 2015 -0700
@@ -6572,7 +6572,7 @@
 ;}
 ;while(idx < numNonZero);
 
-; uint32_t costCoeffRemain(uint16_t *absCoeff, int numNonZero)
+; uint32_t costCoeffRemain(uint16_t *absCoeff, int numNonZero, int idx)
 INIT_XMM sse4
 cglobal costCoeffRemain, 0,7,1
     ; assign RCX to R3
@@ -6580,48 +6580,43 @@
   %if WIN64
     DECLARE_REG_TMP 3,1,2,0
     mov         t0, r0
+    mov         r4d, r2d
   %elif ARCH_X86_64
     ; *nix x64 didn't do anything
     DECLARE_REG_TMP 0,1,2,3
+    mov         r4d, r2d
   %else ; X86_32
     DECLARE_REG_TMP 6,3,2,1
     mov         t0, r0m
+    mov         r4d, r2m
   %endif
 
-    mova        m0, [t0]
-    packsswb    m0, [t0 + mmsize]
-    pcmpgtb     m0, [pb_1]
-    pmovmskb    r2d, m0
-    bsf         r2d, r2d
-    lea         r2d, [r2 * 2 + 1]
-    xor         r4d, r4d
-    bts         r4d, r2d
-    dec         r4d
-    and         r4d, 0x55555555
-    or          r4d, 0x5555AAAA
-
     xor         t3d, t3d
     xor         r5d, r5d
 
+    lea         t0, [t0 + r4 * 2]
+    mov         r2d, 3
+
     ; register mapping
-    ; r4d - baseLevelN
-    ; r2  - tmp
+    ; r2d - baseLevel & tmp
+    ; r4d - idx
     ; t3  - goRiceParam
-    ; eax - tmp - absCoeff[idx]
+    ; eax - absCoeff[idx] & tmp
     ; r5  - sum
 
 .loop:
+    mov         eax, 1
+    cmp         r4d, 8
+    cmovge      r2d, eax
+
     movzx       eax, word [t0]
     add         t0, 2
-    mov         r2d, r4d
-    and         r2d, 3
-    shr         r4d, 2
     sub         eax, r2d                ; codeNumber = absCoeff[idx] - baseLevel
     jl         .next
 
     shr         eax, t3b                ; codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION
 
-    lea         r2d, [eax - 3 + 1]      ; CLZ(cidx, codeNumber + 1);
+    lea         r2d, [rax - 3 + 1]      ; CLZ(cidx, codeNumber + 1);
     bsr         r2d, r2d
     add         r2d, r2d                ; codeNumber = (length + length)
 
@@ -6644,8 +6639,10 @@
     add         t3b, al
 
 .next:
-    dec   dword r1m
-    jnz        .loop
+    inc         r4d
+    mov         r2d, 2
+    cmp         r4d, r1m
+    jl         .loop
 
     mov         eax, r5d
     RET
diff -r 134670771e0c -r e5b6f0a984bd source/encoder/entropy.cpp
--- a/source/encoder/entropy.cpp	Tue Jun 09 11:01:15 2015 -0700
+++ b/source/encoder/entropy.cpp	Tue Jun 09 11:01:19 2015 -0700
@@ -1431,6 +1431,55 @@
         encodeBin(cu.getCbf(absPartIdx, ttype, lowestTUDepth), m_contextState[OFF_QT_CBF_CTX + ctx]);
 }
 
+#if CHECKED_BUILD || _DEBUG
+uint32_t costCoeffRemain_c0(uint16_t *absCoeff, int numNonZero)
+{
+    uint32_t goRiceParam = 0;
+    int firstCoeff2 = 1;
+    uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode format baseLevel
+
+    uint32_t sum = 0;
+    int idx = 0;
+    do
+    {
+        int baseLevel = (baseLevelN & 3) | firstCoeff2;
+        X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
+        baseLevelN >>= 2;
+        int codeNumber = absCoeff[idx] - baseLevel;
+
+        if (codeNumber >= 0)
+        {
+            //writeCoefRemainExGolomb(absCoeff[idx] - baseLevel, goRiceParam);
+            uint32_t length = 0;
+
+            codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION;
+            if (codeNumber >= 0)
+            {
+                {
+                    unsigned long cidx;
+                    CLZ(cidx, codeNumber + 1);
+                    length = cidx;
+                }
+                X265_CHECK((codeNumber != 0) || (length == 0), "length check failure\n");
+
+                codeNumber = (length + length);
+            }
+            sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber);
+
+            if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))
+                goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
+            X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
+        }
+        if (absCoeff[idx] >= 2)
+            firstCoeff2 = 0;
+        idx++;
+    }
+    while(idx < numNonZero);
+
+    return sum;
+}
+#endif // debug only code
+
 void Entropy::codeCoeffNxN(const CUData& cu, const coeff_t* coeff, uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype)
 {
     uint32_t trSize = 1 << log2TrSize;
@@ -1519,7 +1568,7 @@
     uint8_t * const baseCtx = bIsLuma ? &m_contextState[OFF_SIG_FLAG_CTX] : &m_contextState[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
     uint32_t c1 = 1;
     int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1;
-    ALIGN_VAR_32(uint16_t, absCoeff[1 << MLS_CG_SIZE]);
+    ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE)]);
     uint32_t numNonZero = 1;
     unsigned long lastNZPosInCG;
     unsigned long firstNZPosInCG;
@@ -1700,6 +1749,7 @@
             uint32_t numC1Flag = X265_MIN(numNonZero, C1FLAG_NUMBER);
             X265_CHECK(numC1Flag > 0, "numC1Flag check failure\n");
 
+            uint32_t firstC2Idx = 8;
             uint32_t firstC2Flag = 2;
             uint32_t c1Next = 0xFFFFFFFE;
             if (!m_bitIf)
@@ -1720,9 +1770,13 @@
 
                     if (symbol1)
                         c1Next = 0;
+
                     if (symbol1 + firstC2Flag == 3)
                         firstC2Flag = symbol2;
 
+                    if (symbol1 + firstC2Idx == 9)
+                        firstC2Idx  = idx;
+
                     c1 = (c1Next & 3);
                     c1Next >>= 2;
                     X265_CHECK(c1 <= 3, "c1 check failure\n");
@@ -1749,9 +1803,10 @@
                 //encodeBinsEP((coeffSigns >> hiddenShift), numNonZero - hiddenShift);
                 m_fracBits += (numNonZero - hiddenShift) << 15;
 
-                if (!c1 || numNonZero > C1FLAG_NUMBER)
+                if (numNonZero > firstC2Idx)
                 {
-                    uint32_t sum = primitives.costCoeffRemain(absCoeff, numNonZero);
+                    sum = primitives.costCoeffRemain(absCoeff, numNonZero, firstC2Idx);
+                    X265_CHECK(sum == costCoeffRemain_c0(absCoeff, numNonZero), "costCoeffRemain check failure\n");
                     m_fracBits += ((uint64_t)sum << 15);
                 }
             }
@@ -1771,6 +1826,9 @@
                     if (symbol1 + firstC2Flag == 3)
                         firstC2Flag = symbol2;
 
+                    if (symbol1 + firstC2Idx == 9)
+                        firstC2Idx  = idx;
+
                     c1 = (c1Next & 3);
                     c1Next >>= 2;
                     X265_CHECK(c1 <= 3, "c1 check failure\n");
@@ -1793,15 +1851,17 @@
                 {
                     // Standard path
                     uint32_t goRiceParam = 0;
+                    int baseLevel = 3;
+#if CHECKED_BUILD || _DEBUG
                     int firstCoeff2 = 1;
-                    uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode format baseLevel
-
-                    idx = 0;
+#endif
+                    idx = firstC2Idx;
                     do
                     {
-                        int baseLevel = (baseLevelN & 3) | firstCoeff2;
+                        if (idx >= C1FLAG_NUMBER)
+                            baseLevel = 1;
+                        // TODO: the fast algorithm may have broken this check's logic
                         X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
-                        baseLevelN >>= 2;
 
                         if (absCoeff[idx] >= baseLevel)
                         {
@@ -1810,8 +1870,10 @@
                                 goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
                             X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
                         }
-                        if (absCoeff[idx] >= 2)
-                            firstCoeff2 = 0;
+#if CHECKED_BUILD || _DEBUG
+                        firstCoeff2 = 0;
+#endif
+                        baseLevel = 2;
                         idx++;
                     }
                     while(idx < numNonZero);