<div dir="ltr">Min, thanks for this series of patches, all pushed. <br></div><div class="gmail_extra"><br><div class="gmail_quote">On Tue, Jun 9, 2015 at 11:36 PM, Min Chen <span dir="ltr">&lt;<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Min Chen &lt;<a href="mailto:chenm003@163.com">chenm003@163.com</a>&gt;<br>
# Date 1433872879 25200<br>
# Node ID e5b6f0a984bdd8ab16b63fb1c11a508a444515ec<br>
# Parent  134670771e0c1dd0800c3e9db0a1f9f69c467e36<br>
asm: improve costCoeffRemain by bypassing uncoded coefficients<br>
---<br>
 source/common/dct.cpp             |   17 ++++----<br>
 source/common/primitives.h        |    2 +-<br>
 source/common/x86/pixel-util.h    |    2 +-<br>
 source/common/x86/pixel-util8.asm |   41 +++++++++----------<br>
 source/encoder/entropy.cpp        |   82 ++++++++++++++++++++++++++++++++-----<br>
 5 files changed, 101 insertions(+), 43 deletions(-)<br>
<br>
diff -r 134670771e0c -r e5b6f0a984bd source/common/dct.cpp<br>
--- a/source/common/dct.cpp     Tue Jun 09 11:01:15 2015 -0700<br>
+++ b/source/common/dct.cpp     Tue Jun 09 11:01:19 2015 -0700<br>
@@ -874,19 +874,19 @@<br>
     return (sum & 0xFFFFFF);<br>
 }<br>
<br>
-uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero)<br>
+uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero, int idx)<br>
 {<br>
     uint32_t goRiceParam = 0;<br>
-    int firstCoeff2 = 1;<br>
-    uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode format baseLevel<br>
<br>
     uint32_t sum = 0;<br>
-    int idx = 0;<br>
+    int baseLevel = 3;<br>
     do<br>
     {<br>
-        int baseLevel = (baseLevelN & 3) | firstCoeff2;<br>
-        X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");<br>
-        baseLevelN >>= 2;<br>
+        if (idx >= C1FLAG_NUMBER)<br>
+            baseLevel = 1;<br>
+<br>
+        // TODO: idx no longer starts from 0 here, so this check is inactive<br>
+        //X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");<br>
         int codeNumber = absCoeff[idx] - baseLevel;<br>
<br>
         if (codeNumber >= 0)<br>
@@ -912,8 +912,7 @@<br>
                 goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);<br>
             X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");<br>
         }<br>
-        if (absCoeff[idx] >= 2)<br>
-            firstCoeff2 = 0;<br>
+        baseLevel = 2;<br>
         idx++;<br>
     }<br>
     while(idx < numNonZero);<br>
diff -r 134670771e0c -r e5b6f0a984bd source/common/primitives.h<br>
--- a/source/common/primitives.h        Tue Jun 09 11:01:15 2015 -0700<br>
+++ b/source/common/primitives.h        Tue Jun 09 11:01:19 2015 -0700<br>
@@ -187,7 +187,7 @@<br>
 typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);<br>
<br>
 typedef uint32_t (*costCoeffNxN_t)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);<br>
-typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero);<br>
+typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero, int idx);<br>
<br>
 /* Function pointers to optimized encoder primitives. Each pointer can reference<br>
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */<br>
diff -r 134670771e0c -r e5b6f0a984bd source/common/x86/pixel-util.h<br>
--- a/source/common/x86/pixel-util.h    Tue Jun 09 11:01:15 2015 -0700<br>
+++ b/source/common/x86/pixel-util.h    Tue Jun 09 11:01:19 2015 -0700<br>
@@ -83,7 +83,7 @@<br>
 uint32_t x265_findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);<br>
<br>
 uint32_t x265_costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);<br>
-uint32_t x265_costCoeffRemain_sse4(uint16_t *absCoeff, int numNonZero);<br>
+uint32_t x265_costCoeffRemain_sse4(uint16_t *absCoeff, int numNonZero, int idx);<br>
<br>
<br>
 #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \<br>
diff -r 134670771e0c -r e5b6f0a984bd source/common/x86/pixel-util8.asm<br>
--- a/source/common/x86/pixel-util8.asm Tue Jun 09 11:01:15 2015 -0700<br>
+++ b/source/common/x86/pixel-util8.asm Tue Jun 09 11:01:19 2015 -0700<br>
@@ -6572,7 +6572,7 @@<br>
 ;}<br>
 ;while(idx < numNonZero);<br>
<br>
-; uint32_t costCoeffRemain(uint16_t *absCoeff, int numNonZero)<br>
+; uint32_t costCoeffRemain(uint16_t *absCoeff, int numNonZero, int idx)<br>
 INIT_XMM sse4<br>
 cglobal costCoeffRemain, 0,7,1<br>
     ; assign RCX to R3<br>
@@ -6580,48 +6580,43 @@<br>
   %if WIN64<br>
     DECLARE_REG_TMP 3,1,2,0<br>
     mov         t0, r0<br>
+    mov         r4d, r2d<br>
   %elif ARCH_X86_64<br>
     ; *nix x64 didn't do anything<br>
     DECLARE_REG_TMP 0,1,2,3<br>
+    mov         r4d, r2d<br>
   %else ; X86_32<br>
     DECLARE_REG_TMP 6,3,2,1<br>
     mov         t0, r0m<br>
+    mov         r4d, r2m<br>
   %endif<br>
<br>
-    mova        m0, [t0]<br>
-    packsswb    m0, [t0 + mmsize]<br>
-    pcmpgtb     m0, [pb_1]<br>
-    pmovmskb    r2d, m0<br>
-    bsf         r2d, r2d<br>
-    lea         r2d, [r2 * 2 + 1]<br>
-    xor         r4d, r4d<br>
-    bts         r4d, r2d<br>
-    dec         r4d<br>
-    and         r4d, 0x55555555<br>
-    or          r4d, 0x5555AAAA<br>
-<br>
     xor         t3d, t3d<br>
     xor         r5d, r5d<br>
<br>
+    lea         t0, [t0 + r4 * 2]<br>
+    mov         r2d, 3<br>
+<br>
     ; register mapping<br>
-    ; r4d - baseLevelN<br>
-    ; r2  - tmp<br>
+    ; r2d - baseLevel & tmp<br>
+    ; r4d - idx<br>
     ; t3  - goRiceParam<br>
-    ; eax - tmp - absCoeff[idx]<br>
+    ; eax - absCoeff[idx] & tmp<br>
     ; r5  - sum<br>
<br>
 .loop:<br>
+    mov         eax, 1<br>
+    cmp         r4d, 8<br>
+    cmovge      r2d, eax<br>
+<br>
     movzx       eax, word [t0]<br>
     add         t0, 2<br>
-    mov         r2d, r4d<br>
-    and         r2d, 3<br>
-    shr         r4d, 2<br>
     sub         eax, r2d                ; codeNumber = absCoeff[idx] - baseLevel<br>
     jl         .next<br>
<br>
     shr         eax, t3b                ; codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION<br>
<br>
-    lea         r2d, [eax - 3 + 1]      ; CLZ(cidx, codeNumber + 1);<br>
+    lea         r2d, [rax - 3 + 1]      ; CLZ(cidx, codeNumber + 1);<br>
     bsr         r2d, r2d<br>
     add         r2d, r2d                ; codeNumber = (length + length)<br>
<br>
@@ -6644,8 +6639,10 @@<br>
     add         t3b, al<br>
<br>
 .next:<br>
-    dec   dword r1m<br>
-    jnz        .loop<br>
+    inc         r4d<br>
+    mov         r2d, 2<br>
+    cmp         r4d, r1m<br>
+    jl         .loop<br>
<br>
     mov         eax, r5d<br>
     RET<br>
diff -r 134670771e0c -r e5b6f0a984bd source/encoder/entropy.cpp<br>
--- a/source/encoder/entropy.cpp        Tue Jun 09 11:01:15 2015 -0700<br>
+++ b/source/encoder/entropy.cpp        Tue Jun 09 11:01:19 2015 -0700<br>
@@ -1431,6 +1431,55 @@<br>
         encodeBin(cu.getCbf(absPartIdx, ttype, lowestTUDepth), m_contextState[OFF_QT_CBF_CTX + ctx]);<br>
 }<br>
<br>
+#if CHECKED_BUILD || _DEBUG<br>
+uint32_t costCoeffRemain_c0(uint16_t *absCoeff, int numNonZero)<br>
+{<br>
+    uint32_t goRiceParam = 0;<br>
+    int firstCoeff2 = 1;<br>
+    uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode format baseLevel<br>
+<br>
+    uint32_t sum = 0;<br>
+    int idx = 0;<br>
+    do<br>
+    {<br>
+        int baseLevel = (baseLevelN & 3) | firstCoeff2;<br>
+        X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");<br>
+        baseLevelN >>= 2;<br>
+        int codeNumber = absCoeff[idx] - baseLevel;<br>
+<br>
+        if (codeNumber >= 0)<br>
+        {<br>
+            //writeCoefRemainExGolomb(absCoeff[idx] - baseLevel, goRiceParam);<br>
+            uint32_t length = 0;<br>
+<br>
+            codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION;<br>
+            if (codeNumber >= 0)<br>
+            {<br>
+                {<br>
+                    unsigned long cidx;<br>
+                    CLZ(cidx, codeNumber + 1);<br>
+                    length = cidx;<br>
+                }<br>
+                X265_CHECK((codeNumber != 0) || (length == 0), "length check failure\n");<br>
+<br>
+                codeNumber = (length + length);<br>
+            }<br>
+            sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber);<br>
+<br>
+            if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))<br>
+                goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);<br>
+            X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");<br>
+        }<br>
+        if (absCoeff[idx] >= 2)<br>
+            firstCoeff2 = 0;<br>
+        idx++;<br>
+    }<br>
+    while(idx < numNonZero);<br>
+<br>
+    return sum;<br>
+}<br>
+#endif // debug only code<br>
+<br>
 void Entropy::codeCoeffNxN(const CUData& cu, const coeff_t* coeff, uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype)<br>
 {<br>
     uint32_t trSize = 1 << log2TrSize;<br>
@@ -1519,7 +1568,7 @@<br>
     uint8_t * const baseCtx = bIsLuma ? &m_contextState[OFF_SIG_FLAG_CTX] : &m_contextState[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];<br>
     uint32_t c1 = 1;<br>
     int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1;<br>
-    ALIGN_VAR_32(uint16_t, absCoeff[1 << MLS_CG_SIZE]);<br>
+    ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE)]);<br>
     uint32_t numNonZero = 1;<br>
     unsigned long lastNZPosInCG;<br>
     unsigned long firstNZPosInCG;<br>
@@ -1700,6 +1749,7 @@<br>
             uint32_t numC1Flag = X265_MIN(numNonZero, C1FLAG_NUMBER);<br>
             X265_CHECK(numC1Flag > 0, "numC1Flag check failure\n");<br>
<br>
+            uint32_t firstC2Idx = 8;<br>
             uint32_t firstC2Flag = 2;<br>
             uint32_t c1Next = 0xFFFFFFFE;<br>
             if (!m_bitIf)<br>
@@ -1720,9 +1770,13 @@<br>
<br>
                     if (symbol1)<br>
                         c1Next = 0;<br>
+<br>
                     if (symbol1 + firstC2Flag == 3)<br>
                         firstC2Flag = symbol2;<br>
<br>
+                    if (symbol1 + firstC2Idx == 9)<br>
+                        firstC2Idx  = idx;<br>
+<br>
                     c1 = (c1Next & 3);<br>
                     c1Next >>= 2;<br>
                     X265_CHECK(c1 <= 3, "c1 check failure\n");<br>
@@ -1749,9 +1803,10 @@<br>
                 //encodeBinsEP((coeffSigns >> hiddenShift), numNonZero - hiddenShift);<br>
                 m_fracBits += (numNonZero - hiddenShift) << 15;<br>
<br>
-                if (!c1 || numNonZero > C1FLAG_NUMBER)<br>
+                if (numNonZero > firstC2Idx)<br>
                 {<br>
-                    uint32_t sum = primitives.costCoeffRemain(absCoeff, numNonZero);<br>
+                    sum = primitives.costCoeffRemain(absCoeff, numNonZero, firstC2Idx);<br>
+                    X265_CHECK(sum == costCoeffRemain_c0(absCoeff, numNonZero), "costCoeffRemain check failure\n");<br>
                     m_fracBits += ((uint64_t)sum << 15);<br>
                 }<br>
             }<br>
@@ -1771,6 +1826,9 @@<br>
                     if (symbol1 + firstC2Flag == 3)<br>
                         firstC2Flag = symbol2;<br>
<br>
+                    if (symbol1 + firstC2Idx == 9)<br>
+                        firstC2Idx  = idx;<br>
+<br>
                     c1 = (c1Next & 3);<br>
                     c1Next >>= 2;<br>
                     X265_CHECK(c1 <= 3, "c1 check failure\n");<br>
@@ -1793,15 +1851,17 @@<br>
                 {<br>
                     // Standard path<br>
                     uint32_t goRiceParam = 0;<br>
+                    int baseLevel = 3;<br>
+#if CHECKED_BUILD || _DEBUG<br>
                     int firstCoeff2 = 1;<br>
-                    uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode format baseLevel<br>
-<br>
-                    idx = 0;<br>
+#endif<br>
+                    idx = firstC2Idx;<br>
                     do<br>
                     {<br>
-                        int baseLevel = (baseLevelN & 3) | firstCoeff2;<br>
+                        if (idx >= C1FLAG_NUMBER)<br>
+                            baseLevel = 1;<br>
+                        // TODO: the fast algorithm may have broken this check logic<br>
                         X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");<br>
-                        baseLevelN >>= 2;<br>
<br>
                         if (absCoeff[idx] >= baseLevel)<br>
                         {<br>
@@ -1810,8 +1870,10 @@<br>
                                 goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);<br>
                             X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");<br>
                         }<br>
-                        if (absCoeff[idx] >= 2)<br>
-                            firstCoeff2 = 0;<br>
+#if CHECKED_BUILD || _DEBUG<br>
+                        firstCoeff2 = 0;<br>
+#endif<br>
+                        baseLevel = 2;<br>
                         idx++;<br>
                     }<br>
                     while(idx < numNonZero);<br>
<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br></div>