[x265] [PATCH 08 of 16] modify findPosFirstLast API to output absSumSign

Wed Oct 7 00:55:19 CEST 2015

# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1444148069 18000
# Node ID 777136c4c5ff37a5af7371bdd139c808be5af078
# Parent  87b6363c2832a5339e75866fa685a5b39f2b0512
modify findPosFirstLast API to output absSumSign
---
 source/common/dct.cpp             |   14 ++++++++++++--
 source/common/quant.cpp           |   36 +++++++++++++++++++++---------------
 source/common/x86/pixel-util8.asm |   23 ++++++++++++++++++-----
 source/test/pixelharness.cpp      |    4 ++--
 4 files changed, 53 insertions(+), 24 deletions(-)

diff -r 87b6363c2832 -r 777136c4c5ff source/common/dct.cpp

--- a/source/common/dct.cpp	Tue Oct 06 11:14:27 2015 -0500
+++ b/source/common/dct.cpp	Tue Oct 06 11:14:29 2015 -0500
@@ -787,11 +787,12 @@
     return scanPosLast - 1;
 }
 
+// NOTE: lastNZPosInCG and absSumSign have no defined value when the input block is ALL ZEROS
 static uint32_t findPosFirstLast_c(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
 {
     int n;
 
-    for (n = SCAN_SET_SIZE - 1; n >= 0; --n)
+    for (n = SCAN_SET_SIZE - 1; n >= 0; n--)
     {
         const uint32_t idx = scanTbl[n];
         const uint32_t idxY = idx / MLS_CG_SIZE;
@@ -815,8 +816,17 @@
 
     uint32_t firstNZPosInCG = (uint32_t)n;
 
+    uint32_t absSumSign = 0;
+    for (n = firstNZPosInCG; n <= (int)lastNZPosInCG; n++)
+    {
+        const uint32_t idx = scanTbl[n];
+        const uint32_t idxY = idx / MLS_CG_SIZE;
+        const uint32_t idxX = idx % MLS_CG_SIZE;
+        absSumSign += dstCoeff[idxY * trSize + idxX];
+    }
+
     // NOTE: when coeff block all ZERO, the lastNZPosInCG is undefined and firstNZPosInCG is 16
-    return ((lastNZPosInCG << 16) | firstNZPosInCG);
+    return ((absSumSign << 31) | (lastNZPosInCG << 8) | firstNZPosInCG);
 }
 
 
diff -r 87b6363c2832 -r 777136c4c5ff source/common/quant.cpp
--- a/source/common/quant.cpp	Tue Oct 06 11:14:27 2015 -0500
+++ b/source/common/quant.cpp	Tue Oct 06 11:14:29 2015 -0500
@@ -1232,7 +1232,8 @@
 
     // Average 49.62 pixels
     /* clean uncoded coefficients */
-    for (int pos = bestLastIdx; pos <= fastMin(lastScanPos, (bestLastIdx | (SCAN_SET_SIZE - 1))); pos++)
+    X265_CHECK((uint32_t)(fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)) < trSize * trSize, "array beyond bound\n");
+    for (int pos = bestLastIdx; pos <= (fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)); pos++)
     {
         dstCoeff[codeParams.scan[pos]] = 0;
     }
@@ -1262,19 +1263,23 @@
             /* measure distance between first and last non-zero coef in this
              * coding group */
             const uint32_t posFirstLast = primitives.findPosFirstLast(&dstCoeff[codeParams.scan[subPos]], trSize, g_scan4x4[codeParams.scanType]);
-            int firstNZPosInCG = (uint16_t)posFirstLast;
-            int lastNZPosInCG = posFirstLast >> 16;
-
+            const int firstNZPosInCG = (uint8_t)posFirstLast;
+            const int lastNZPosInCG = (int8_t)(posFirstLast >> 8);
+            const uint32_t absSumSign = posFirstLast;
 
             if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD)
             {
-                int absSum = dstCoeff[codeParams.scan[subPos + firstNZPosInCG]];
-                const uint32_t signbit = ((uint32_t)absSum >> 31);
+                const int32_t signbit = ((int32_t)dstCoeff[codeParams.scan[subPos + firstNZPosInCG]]);
 
-                for (n = firstNZPosInCG + 1; n <= lastNZPosInCG; n++)
-                    absSum += dstCoeff[codeParams.scan[n + subPos]];
+#if CHECKED_BUILD || _DEBUG
+                int32_t absSum_dummy = 0;
+                for (n = firstNZPosInCG; n <= lastNZPosInCG; n++)
+                    absSum_dummy += dstCoeff[codeParams.scan[n + subPos]];
+                X265_CHECK(((uint32_t)absSum_dummy & 1) == (absSumSign >> 31), "absSumSign check failure\n");
+#endif
 
-                if (signbit != (absSum & 1U))
+                //if (signbit != absSumSign)
+                if (((int32_t)(signbit ^ absSumSign)) < 0)
                 {
                     /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff
                      * is properly implied. Note dstCoeff[] are signed by this point but curChange and
@@ -1284,13 +1289,13 @@
                     uint32_t minPos = 0;
                     int8_t finalChange = 0;
                     int curChange = 0;
-                    uint32_t lastCoeff = lastCG;
+                    uint32_t lastCoeffAdjust = (lastCG & (abs(dstCoeff[codeParams.scan[lastNZPosInCG + subPos]]) == 1)) * 4 * IEP_RATE;
 
                     for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n)
                     {
                         const uint32_t blkPos = codeParams.scan[n + subPos];
-                        const int signCoef    = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */
-                        const int absLevel    = abs(dstCoeff[blkPos]);
+                        const int32_t signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */
+                        const int absLevel = abs(dstCoeff[blkPos]);
                         // TODO: this is constant in non-scaling mode
                         const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per);
                         const uint32_t unQuantLevel = (absLevel * (unquantScale[blkPos] << per) + unquantRound);
@@ -1316,13 +1321,14 @@
                             int downBits = rateIncDown[blkPos] - (isOne ? (IEP_RATE + sigRateDelta[blkPos]) : 0);
                             int64_t costDown = DELTARDCOST(origDist, d, downBits);
 
-                            costDown -= (lastCoeff & isOne) * 4 * IEP_RATE;
+                            costDown -= lastCoeffAdjust;
                             curCost = ((n == firstNZPosInCG) & isOne) ? MAX_INT64 : costDown;
 
                             curChange = 2 * (costUp < costDown) - 1;
                             curCost = (costUp < costDown) ? costUp : curCost;
                         }
-                        else if ((n < firstNZPosInCG) & (signbit != ((uint32_t)signCoef >> 31)))
+                        //else if ((n < firstNZPosInCG) & (signbit != ((uint32_t)signCoef >> 31)))
+                        else if ((n < firstNZPosInCG) & ((signbit ^ signCoef) < 0))
                         {
                             /* don't try to make a new coded coeff before the first coeff if its
                              * sign would be different than the first coeff, the inferred sign would
@@ -1344,7 +1350,7 @@
                             finalChange = (int8_t)curChange;
                             minPos = blkPos + (absLevel << 16);
                         }
-                        lastCoeff = 0;
+                        lastCoeffAdjust = 0;
                     }
 
                     const int absInMinPos = (minPos >> 16);
diff -r 87b6363c2832 -r 777136c4c5ff source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Tue Oct 06 11:14:27 2015 -0500
+++ b/source/common/x86/pixel-util8.asm	Tue Oct 06 11:14:29 2015 -0500
@@ -6653,10 +6653,10 @@
 
 
 ;-----------------------------------------------------------------------------
-; uint32_t[last first] findPosFirstAndLast(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
+; uint32_t[sumSign last first] findPosFirstLast(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
 ;-----------------------------------------------------------------------------
 INIT_XMM ssse3
-cglobal findPosFirstLast, 3,3,3
+cglobal findPosFirstLast, 3,3,4
     ; convert stride to int16_t
     add         r1d, r1d
 
@@ -6668,10 +6668,22 @@
     movh        m1, [r0]
     movhps      m1, [r0 + r1]
     movh        m2, [r0 + r1 * 2]
-    lea         r1, [r1 * 3]
+    lea         r1d, [r1 * 3]
     movhps      m2, [r0 + r1]
+    pxor        m3, m1, m2
     packsswb    m1, m2
 
+    ; get absSum
+    movhlps     m2, m3
+    pxor        m3, m2
+    pshufd      m2, m3, q2301
+    pxor        m3, m2
+    movd        r0d, m3
+    mov         r2d, r0d
+    shr         r2d, 16
+    xor         r2d, r0d
+    shl         r2d, 31
+
     ; get non-zero mask
     pxor        m2, m2
     pcmpeqb     m1, m2
@@ -6684,8 +6696,9 @@
     not         r0d
     bsr         r1w, r0w
     bsf         eax, r0d    ; side effect: clear AH to Zero
-    shl         r1d, 16
-    or          eax, r1d
+    shl         r1d, 8
+    or          eax, r2d    ; merge absSumSign
+    or          eax, r1d    ; merge lastNZPosInCG
     RET
 
 
diff -r 87b6363c2832 -r 777136c4c5ff source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Tue Oct 06 11:14:27 2015 -0500
+++ b/source/test/pixelharness.cpp	Tue Oct 06 11:14:29 2015 -0500
@@ -1571,8 +1571,8 @@
         // specially case: all coeff group are zero
         if (j >= SCAN_SET_SIZE)
         {
-            // all zero block the high 16-bits undefined
-            if ((uint16_t)ref_scanPos != (uint16_t)opt_scanPos)
+            // for an all-zero block the high 24 bits are undefined
+            if ((uint8_t)ref_scanPos != (uint8_t)opt_scanPos)
                 return false;
         }
         else if (ref_scanPos != opt_scanPos)