[x265] add primitives.nquant for RDOQ

Wed Jul 2 09:41:32 CEST 2014

# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1404286661 -32400
#      Wed Jul 02 16:37:41 2014 +0900
# Node ID 3f25ca9b5addda057040a5e1a544b9ede9afc509
# Parent  a18972fd05b1d6242a881bef979b9e1ff17543d9
add primitives.nquant for RDOQ

diff -r a18972fd05b1 -r 3f25ca9b5add source/Lib/TLibCommon/TComTrQuant.cpp

--- a/source/Lib/TLibCommon/TComTrQuant.cpp	Tue Jul 01 14:58:35 2014 -0500
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp	Wed Jul 02 16:37:41 2014 +0900
@@ -508,23 +508,30 @@
 uint32_t TComTrQuant::xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t trSize,
                                         TextType ttype, uint32_t absPartIdx, int32_t *lastPos)
 {
-    x265_emms();
-    selectLambda(ttype);
-
     const uint32_t log2TrSize = g_convertToBit[trSize] + 2;
-    uint32_t absSum = 0;
     int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform
-    uint32_t goRiceParam = 0;
-    double blockUncodedCost = 0;
     int scalingListType = (cu->isIntra(absPartIdx) ? 0 : 3) + ttype;
 
     X265_CHECK(scalingListType < 6, "scaling list type out of range\n");
 
     int qbits = QUANT_SHIFT + m_qpParam.m_per + transformShift; // Right shift of non-RDOQ quantizer;  level = (coeff*Q + offset)>>q_bits
     int add = (1 << (qbits - 1));
-    double *errScale = getErrScaleCoeff(scalingListType, log2TrSize - 2, m_qpParam.m_rem);
     int32_t *qCoef = getQuantCoeff(scalingListType, m_qpParam.m_rem, log2TrSize - 2);
 
+    int numCoeff = 1 << log2TrSize * 2;
+    int scaledCoeff[32 * 32];
+    uint32_t numSig = primitives.nquant(srcCoeff, qCoef, scaledCoeff, dstCoeff, qbits, add, numCoeff);
+
+    X265_CHECK(numSig == primitives.count_nonzero(dstCoeff, numCoeff), "numSig differ\n");
+    if (numSig == 0)
+        return 0;
+
+    x265_emms();
+    selectLambda(ttype);
+
+    double *errScale = getErrScaleCoeff(scalingListType, log2TrSize - 2, m_qpParam.m_rem);
+
+    double blockUncodedCost = 0;
     double costCoeff[32 * 32];
     double costSig[32 * 32];
     double costCoeff0[32 * 32];
@@ -544,6 +551,7 @@
     int    c2            = 0;
     double baseCost      = 0;
     int    lastScanPos   = -1;
+    uint32_t goRiceParam = 0;
     uint32_t c1Idx       = 0;
     uint32_t c2Idx       = 0;
     int cgLastScanPos    = -1;
@@ -567,16 +575,13 @@
             //===== quantization =====
             uint32_t blkPos = codingParameters.scan[scanPos];
             // set coeff
-            int Q = qCoef[blkPos];
             double scaleFactor = errScale[blkPos];
-            int levelDouble    = srcCoeff[blkPos];
-            levelDouble        = (int)std::min<int64_t>((int64_t)abs((int)levelDouble) * Q, MAX_INT - add);
 
-            uint32_t maxAbsLevel = (levelDouble + add) >> qbits;
+            int levelDouble      = scaledCoeff[blkPos];
+            uint32_t maxAbsLevel = abs(dstCoeff[blkPos]);
 
             costCoeff0[scanPos] = ((uint64_t)levelDouble * levelDouble) * scaleFactor;
             blockUncodedCost   += costCoeff0[scanPos];
-            dstCoeff[blkPos]    = maxAbsLevel;
 
             if (maxAbsLevel > 0 && lastScanPos < 0)
             {
@@ -776,7 +781,7 @@
     //===== estimate last position =====
     if (lastScanPos < 0)
     {
-        return absSum;
+        return 0;
     }
 
     double bestCost = 0;
@@ -840,6 +845,7 @@
         } // end if (sigCoeffGroupFlag[ cgBlkPos ])
     } // end for
 
+    uint32_t absSum = 0;
     for (int pos = 0; pos < bestLastIdxp1; pos++)
     {
         int blkPos = codingParameters.scan[pos];
diff -r a18972fd05b1 -r 3f25ca9b5add source/common/dct.cpp
--- a/source/common/dct.cpp	Tue Jul 01 14:58:35 2014 -0500
+++ b/source/common/dct.cpp	Wed Jul 02 16:37:41 2014 +0900
@@ -780,10 +780,8 @@
 
     for (int blockpos = 0; blockpos < numCoeff; blockpos++)
     {
-        int level;
-        int sign;
-        level = coef[blockpos];
-        sign  = (level < 0 ? -1 : 1);
+        int level = coef[blockpos];
+        int sign  = (level < 0 ? -1 : 1);
 
         int tmplevel = abs(level) * quantCoeff[blockpos];
         level = ((tmplevel + add) >> qBits);
@@ -798,6 +796,27 @@
     return acSum;
 }
 
+uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int32_t* scaledCoeff, int32_t* qCoef, int qBits, int add, int numCoeff) // quantizes numCoeff coefficients: writes the raw products to scaledCoeff and the signed levels to qCoef, returns the number of nonzero levels
+{
+    uint32_t numSig = 0; // running count of nonzero quantized levels
+
+    for (int blockpos = 0; blockpos < numCoeff; blockpos++)
+    {
+        int level = coef[blockpos];
+        int sign  = (level < 0 ? -1 : 1); // sign is kept aside; the magnitude is quantized and the sign re-applied below
+
+        int tmplevel = abs(level) * quantCoeff[blockpos]; // NOTE(review): plain int multiply with no 64-bit widening or clamp - confirm the product cannot overflow for the callers' inputs
+        scaledCoeff[blockpos] = tmplevel; // the product before rounding and shifting is handed back to the caller
+        level = ((tmplevel + add) >> qBits); // add the rounding offset, then shift right by qBits
+        if (level)
+            ++numSig;
+        level *= sign;
+        qCoef[blockpos] = Clip3(-32768, 32767, level); // signed level limited to the int16 range before being stored as int32
+    }
+
+    return numSig;
+}
+
 int  count_nonzero_c(const int32_t *quantCoeff, int numCoeff)
 {
     X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
@@ -822,6 +841,7 @@
     p.dequant_scaling = dequant_scaling_c;
     p.dequant_normal = dequant_normal_c;
     p.quant = quant_c;
+    p.nquant = nquant_c;
     p.dct[DST_4x4] = dst4_c;
     p.dct[DCT_4x4] = dct4_c;
     p.dct[DCT_8x8] = dct8_c;
diff -r a18972fd05b1 -r 3f25ca9b5add source/common/primitives.h
--- a/source/common/primitives.h	Tue Jul 01 14:58:35 2014 -0500
+++ b/source/common/primitives.h	Wed Jul 02 16:37:41 2014 +0900
@@ -147,6 +147,7 @@
 typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
 typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
 typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
+typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *scaledCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
 typedef void (*dequant_scaling_t)(const int32_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
 typedef void (*dequant_normal_t)(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
 typedef int  (*count_nonzero_t)(const int32_t *quantCoeff, int numCoeff);
@@ -242,6 +243,7 @@
     dct_t           dct[NUM_DCTS];
     idct_t          idct[NUM_IDCTS];
     quant_t         quant;
+    nquant_t        nquant;
     dequant_scaling_t dequant_scaling;
     dequant_normal_t dequant_normal;
     count_nonzero_t count_nonzero;
diff -r a18972fd05b1 -r 3f25ca9b5add source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jul 01 14:58:35 2014 -0500
+++ b/source/common/x86/asm-primitives.cpp	Wed Jul 02 16:37:41 2014 +0900
@@ -1061,6 +1061,7 @@
 
         p.dct[DCT_8x8] = x265_dct8_sse4;
         p.quant = x265_quant_sse4;
+        p.nquant = x265_nquant_sse4;
         p.dequant_normal = x265_dequant_normal_sse4;
         p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
         p.intra_pred[BLOCK_4x4][0] = x265_intra_pred_planar4_sse4;
@@ -1257,6 +1258,7 @@
         p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;
         p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse4;
         p.quant = x265_quant_sse4;
+        p.nquant = x265_nquant_sse4;
         p.dequant_normal = x265_dequant_normal_sse4;
         p.weight_pp = x265_weight_pp_sse4;
         p.weight_sp = x265_weight_sp_sse4;
diff -r a18972fd05b1 -r 3f25ca9b5add source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Tue Jul 01 14:58:35 2014 -0500
+++ b/source/common/x86/pixel-util.h	Wed Jul 02 16:37:41 2014 +0900
@@ -45,6 +45,7 @@
 void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);
 
 uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
+uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *scaledCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
 void x265_dequant_normal_sse4(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
 int x265_count_nonzero_ssse3(const int32_t *quantCoeff, int numCoeff);
 
diff -r a18972fd05b1 -r 3f25ca9b5add source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Tue Jul 01 14:58:35 2014 -0500
+++ b/source/common/x86/pixel-util8.asm	Wed Jul 02 16:37:41 2014 +0900
@@ -879,7 +879,7 @@
   %define qbits8    [rsp + 2 * mmsize]
 %endif
 
-    ; fill qbits-8
+    ; fill qbits
     movd        m0, r4d
     mova        qbits, m0
 
@@ -979,6 +979,81 @@
 
 
 ;-----------------------------------------------------------------------------
+; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int32_t *scaledCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal nquant, 5,6,8           ; NOTE(review): presumably x86inc "args, gprs, xmm regs" - confirm against x86inc.asm
+
+    ; fill qbits
+    movd        m5, r4d         ; m5 = qbits
+
+    ; fill offset
+    movd        m6, r5m
+    pshufd      m6, m6, 0       ; m6 = add
+
+    mov         r4d, r6m        ; r4d = numCoeff
+    shr         r4d, 3          ; r4d = numCoeff / 8 = loop count (8 coefficients per iteration)
+    pxor        m7, m7          ; m7 = numZero
+.loop:
+    ; 4 coeff
+    movu        m0, [r0]        ; m0 = level
+    pxor        m1, m1
+    pcmpgtd     m1, m0          ; m1 = sign
+    movu        m2, [r1]        ; m2 = qcoeff
+    pabsd       m0, m0          ; m0 = abs(level)
+    pmulld      m0, m2          ; m0 = tmpLevel1
+    movu        [r2], m0        ; store tmpLevel1 as scaledCoeff
+    paddd       m2, m0, m6      ; m2 = tmpLevel1 + add
+    psrad       m2, m5          ; m2 = level1
+    pxor        m4, m4
+    pcmpeqd     m4, m2          ; m4 = mask4
+
+    pxor        m2, m1          ; with the psubd below: (level1 ^ sign) - sign negates the lanes whose input was negative
+    psubd       m2, m1
+    packssdw    m2, m2          ; saturate to int16 ...
+    pmovsxwd    m2, m2          ; ... and sign-extend the low 4 words back to int32
+    movu        [r3], m2        ; store the 4 signed levels
+    ; 4 coeff
+    movu        m0, [r0 + 16]   ; m0 = level
+    pxor        m1, m1
+    pcmpgtd     m1, m0          ; m1 = sign
+    movu        m2, [r1 + 16]   ; m2 = qcoeff
+    pabsd       m0, m0          ; m0 = abs(level)
+    pmulld      m0, m2          ; m0 = tmpLevel1
+    movu        [r2 + 16], m0   ; store tmpLevel1 as scaledCoeff
+    paddd       m2, m0, m6      ; m2 = tmpLevel1 + add
+    psrad       m2, m5          ; m2 = level1
+    pxor        m0, m0
+    pcmpeqd     m0, m2          ; m0 = mask4
+
+    pxor        m2, m1          ; apply the sign as in the first group
+    psubd       m2, m1
+    packssdw    m2, m2          ; saturate to int16 ...
+    pmovsxwd    m2, m2          ; ... and sign-extend the low 4 words back to int32
+    movu        [r3 + 16], m2   ; store the 4 signed levels
+
+    packssdw    m4, m0          ; m4 = mask8
+    psubw       m7, m4          ; m7 = numZero
+
+    add         r0, 32          ; advance all four pointers by 8 int32 elements
+    add         r1, 32
+    add         r2, 32
+    add         r3, 32
+
+    dec         r4d
+    jnz        .loop
+
+    packuswb    m7, m7          ; narrow the 8 word counters to bytes; NOTE(review): saturates at 255 per lane - confirm numCoeff / 8 never exceeds that
+    pxor        m0, m0
+    psadbw      m0, m7          ; sum of the byte counters = total numZero
+    mov         eax, r6m        ; eax = numCoeff
+    movd        r4d, m0         ; r4d = numZero
+    sub         eax, r4d        ; numSig
+
+    RET
+
+
+;-----------------------------------------------------------------------------
 ; void dequant_normal(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
diff -r a18972fd05b1 -r 3f25ca9b5add source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp	Tue Jul 01 14:58:35 2014 -0500
+++ b/source/test/mbdstharness.cpp	Wed Jul 02 16:37:41 2014 +0900
@@ -327,6 +327,50 @@
     return true;
 }
 
+bool MBDstHarness::check_nquant_primitive(nquant_t ref, nquant_t opt) // runs ref and opt on the same random inputs; returns false at the first mismatch in either output buffer or in the return value
+{
+    int j = 0; // element offset into the input test buffers, advanced by 16 after each iteration
+
+    for (int i = 0; i <= ITERS; i++) // i runs from 0 through ITERS inclusive
+    {
+        int width = (rand() % 4 + 1) * 4; // one of 4, 8, 12, 16
+
+        if (width == 12)
+        {
+            width = 32; // 12 is swapped for 32, so the widths exercised are 4, 8, 16 and 32
+        }
+        int height = width; // square blocks only
+
+        uint32_t optReturnValue = 0;
+        uint32_t refReturnValue = 0;
+
+        int bits = rand() % 32; // random shift amount in [0, 31]
+        int valueToAdd = rand() % (32 * 1024); // random rounding offset in [0, 32767]
+        int cmp_size = sizeof(int) * height * width; // number of bytes compared in each output buffer
+        int numCoeff = height * width;
+
+        int index1 = rand() % TEST_CASES; // input rows for the coefficients and the quantizer table are picked independently
+        int index2 = rand() % TEST_CASES;
+
+        refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf5, mintbuf6, bits, valueToAdd, numCoeff); // reference outputs go to mintbuf5 / mintbuf6
+        optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mintbuf4, bits, valueToAdd, numCoeff); // optimized outputs go to mintbuf3 / mintbuf4
+
+        if (memcmp(mintbuf3, mintbuf5, cmp_size)) // third-argument output (scaledCoeff in the nquant_t signature)
+            return false;
+
+        if (memcmp(mintbuf4, mintbuf6, cmp_size)) // fourth-argument output (qCoef in the nquant_t signature)
+            return false;
+
+        if (optReturnValue != refReturnValue)
+            return false;
+
+        reportfail(); // NOTE(review): defined outside this view; presumably surfaces a failure recorded by checked() - confirm
+        j += 16;
+    }
+
+    return true;
+}
+
 bool MBDstHarness::check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt)
 {
     ALIGN_VAR_32(int32_t, qcoeff[32 * 32]);
@@ -409,6 +453,15 @@
         }
     }
 
+    if (opt.nquant)
+    {
+        if (!check_nquant_primitive(ref.nquant, opt.nquant))
+        {
+            printf("nquant: Failed!\n");
+            return false;
+        }
+    }
+
     if (opt.count_nonzero)
     {
         if (!check_count_nonzero_primitive(ref.count_nonzero, opt.count_nonzero))
@@ -460,6 +513,12 @@
         REPORT_SPEEDUP(opt.quant, ref.quant, mintbuf1, mintbuf2, mintbuf3, mintbuf4, 23, 23785, 32 * 32, &dummy);
     }
 
+    if (opt.nquant)
+    {
+        printf("nquant\t\t");
+        REPORT_SPEEDUP(opt.nquant, ref.nquant, mintbuf1, mintbuf2, mintbuf3, mintbuf4, 23, 23785, 32 * 32);
+    }
+
     if (opt.count_nonzero)
     {
         for (int i = 4; i <= 32; i <<= 1)
diff -r a18972fd05b1 -r 3f25ca9b5add source/test/mbdstharness.h
--- a/source/test/mbdstharness.h	Tue Jul 01 14:58:35 2014 -0500
+++ b/source/test/mbdstharness.h	Wed Jul 02 16:37:41 2014 +0900
@@ -44,6 +44,7 @@
     bool check_dequant_primitive(dequant_scaling_t ref, dequant_scaling_t opt);
     bool check_dequant_primitive(dequant_normal_t ref, dequant_normal_t opt);
     bool check_quant_primitive(quant_t ref, quant_t opt);
+    bool check_nquant_primitive(nquant_t ref, nquant_t opt);
     bool check_dct_primitive(dct_t ref, dct_t opt, intptr_t width);
     bool check_idct_primitive(idct_t ref, idct_t opt, intptr_t width);
     bool check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt);