[x265] [PATCH] Enable PropagateCost assembly primitive

Wed Aug 31 11:03:46 CEST 2016

# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1472623041 -19800
#      Wed Aug 31 11:27:21 2016 +0530
# Node ID 3c0561db9c3c0e7dd80e8df18813e2c88407378b
# Parent  9e624761d74e996f885efb7f6ba51b6a6c691482
Enable PropagateCost assembly primitive

diff -r 9e624761d74e -r 3c0561db9c3c source/common/lowres.cpp

--- a/source/common/lowres.cpp	Mon Aug 29 13:49:09 2016 +0530
+++ b/source/common/lowres.cpp	Wed Aug 31 11:27:21 2016 +0530
@@ -60,6 +60,8 @@
         CHECKED_MALLOC(invQscaleFactor, int, cuCountFullRes);
         CHECKED_MALLOC(qpCuTreeOffset, double, cuCountFullRes);
         CHECKED_MALLOC(blockVariance, uint32_t, cuCountFullRes);
+        if (qgSize == 8)
+            CHECKED_MALLOC(invQscaleFactor8x8, int, cuCountFullRes);
     }
     CHECKED_MALLOC(propagateCost, uint16_t, cuCount);
 
@@ -129,6 +131,7 @@
     X265_FREE(qpCuTreeOffset);
     X265_FREE(propagateCost);
     X265_FREE(blockVariance);
+    X265_FREE(invQscaleFactor8x8);
 }
 
 // (re) initialize lowres state
diff -r 9e624761d74e -r 3c0561db9c3c source/common/lowres.h
--- a/source/common/lowres.h	Mon Aug 29 13:49:09 2016 +0530
+++ b/source/common/lowres.h	Wed Aug 31 11:27:21 2016 +0530
@@ -145,6 +145,7 @@
     double*   qpAqOffset;      // AQ QP offset values for each 16x16 CU
     double*   qpCuTreeOffset;  // cuTree QP offset values for each 16x16 CU
     int*      invQscaleFactor; // qScale values for qp Aq Offsets
+    int*      invQscaleFactor8x8; // temporary buffer for qg-size 8
     uint32_t* blockVariance;
     uint64_t  wp_ssd[3];       // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
     uint64_t  wp_sum[3];
diff -r 9e624761d74e -r 3c0561db9c3c source/common/pixel.cpp
--- a/source/common/pixel.cpp	Mon Aug 29 13:49:09 2016 +0530
+++ b/source/common/pixel.cpp	Wed Aug 31 11:27:21 2016 +0530
@@ -845,58 +845,31 @@
 /* Estimate the total amount of influence on future quality that could be had if we
  * were to improve the reference samples used to inter predict any given CU. */
 static void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
-                                    const int32_t* invQscales, const double* fpsFactor, int len, uint32_t qgSize)
+                                    const int32_t* invQscales, const double* fpsFactor, int len)
 {
     double fps = *fpsFactor / 256;  // range[0.01, 1.00]
-    if (qgSize == 8)
+    for (int i = 0; i < len; i++)
     {
-        for (int i = 0; i < len; i++)
-        {
-            int intraCost = intraCosts[i];
-            int interCost = X265_MIN(intraCosts[i], interCosts[i] & LOWRES_COST_MASK);
-            int invQscaleFactor = (invQscales[i * 2] + invQscales[i * 2 + 1] + invQscales[i * 2 + len * 2] + invQscales[i * 2 + len * 2 + 1]) / 4;
-            double propagateIntra = intraCost * invQscaleFactor; // Q16 x Q8.8 = Q24.8
-            double propagateAmount = (double)propagateIn[i] + propagateIntra * fps; // Q16.0 + Q24.8 x Q0.x = Q25.0
-            double propagateNum = (double)(intraCost - interCost); // Q32 - Q32 = Q33.0
+        int intraCost = intraCosts[i];
+        int interCost = X265_MIN(intraCosts[i], interCosts[i] & LOWRES_COST_MASK);
+        double propagateIntra = intraCost * invQscales[i]; // Q16 x Q8.8 = Q24.8
+        double propagateAmount = (double)propagateIn[i] + propagateIntra * fps; // Q16.0 + Q24.8 x Q0.x = Q25.0
+        double propagateNum = (double)(intraCost - interCost); // Q32 - Q32 = Q33.0
 
 #if 0
-            // algorithm that output match to asm
-            float intraRcp = (float)1.0f / intraCost;   // VC can't mapping this into RCPPS
-            float intraRcpError1 = (float)intraCost * (float)intraRcp;
-            intraRcpError1 *= (float)intraRcp;
-            float intraRcpError2 = intraRcp + intraRcp;
-            float propagateDenom = intraRcpError2 - intraRcpError1;
-            dst[i] = (int)(propagateAmount * propagateNum * (double)propagateDenom + 0.5);
+        // algorithm whose output matches the asm
+        float intraRcp = (float)1.0f / intraCost;   // VC can't mapping this into RCPPS
+        float intraRcpError1 = (float)intraCost * (float)intraRcp;
+        intraRcpError1 *= (float)intraRcp;
+        float intraRcpError2 = intraRcp + intraRcp;
+        float propagateDenom = intraRcpError2 - intraRcpError1;
+        dst[i] = (int)(propagateAmount * propagateNum * (double)propagateDenom + 0.5);
 #else
-            double propagateDenom = (double)intraCost;             // Q32
-            dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
+        double propagateDenom = (double)intraCost;             // Q32
+        dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
 #endif
         }
-    }
-    else
-    {
-        for (int i = 0; i < len; i++)
-        {
-            int intraCost = intraCosts[i];
-            int interCost = X265_MIN(intraCosts[i], interCosts[i] & LOWRES_COST_MASK);
-            double propagateIntra = intraCost * invQscales[i]; // Q16 x Q8.8 = Q24.8
-            double propagateAmount = (double)propagateIn[i] + propagateIntra * fps; // Q16.0 + Q24.8 x Q0.x = Q25.0
-            double propagateNum = (double)(intraCost - interCost); // Q32 - Q32 = Q33.0
-
-#if 0
-            // algorithm that output match to asm
-            float intraRcp = (float)1.0f / intraCost;   // VC can't mapping this into RCPPS
-            float intraRcpError1 = (float)intraCost * (float)intraRcp;
-            intraRcpError1 *= (float)intraRcp;
-            float intraRcpError2 = intraRcp + intraRcp;
-            float propagateDenom = intraRcpError2 - intraRcpError1;
-            dst[i] = (int)(propagateAmount * propagateNum * (double)propagateDenom + 0.5);
-#else
-            double propagateDenom = (double)intraCost;             // Q32
-            dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
-#endif
-        }
-    }
+    //}
 }
 
 /* Conversion between double and Q8.8 fixed point (big-endian) for storage */
diff -r 9e624761d74e -r 3c0561db9c3c source/common/primitives.h
--- a/source/common/primitives.h	Mon Aug 29 13:49:09 2016 +0530
+++ b/source/common/primitives.h	Wed Aug 31 11:27:21 2016 +0530
@@ -187,7 +187,7 @@
 typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
 typedef pixel (*planeClipAndMax_t)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
 
-typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len, uint32_t qgSize);
+typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
 
 typedef void (*cutree_fix8_unpack)(double *dst, uint16_t *src, int count);
 typedef void (*cutree_fix8_pack)(uint16_t *dst, double *src, int count);
diff -r 9e624761d74e -r 3c0561db9c3c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Aug 29 13:49:09 2016 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Aug 31 11:27:21 2016 +0530
@@ -1027,7 +1027,7 @@
         ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
         ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
         ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
-        //p.propagateCost = PFX(mbtree_propagate_cost_sse2);
+        p.propagateCost = PFX(mbtree_propagate_cost_sse2);
     }
     if (cpuMask & X265_CPU_SSE3)
     {
@@ -1312,7 +1312,7 @@
         p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx);
         p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
         p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
-        //p.propagateCost = PFX(mbtree_propagate_cost_avx);
+        p.propagateCost = PFX(mbtree_propagate_cost_avx);
     }
     if (cpuMask & X265_CPU_XOP)
     {
@@ -2153,7 +2153,7 @@
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx2);
 
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
-        //p.propagateCost = PFX(mbtree_propagate_cost_avx2);
+        p.propagateCost = PFX(mbtree_propagate_cost_avx2);
         p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
         p.fix8Pack = PFX(cutree_fix8_pack_avx2);
 
@@ -2356,7 +2356,7 @@
         ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
         ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
         ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
-        //p.propagateCost = PFX(mbtree_propagate_cost_sse2);
+        p.propagateCost = PFX(mbtree_propagate_cost_sse2);
     }
     if (cpuMask & X265_CPU_SSE3)
     {
@@ -2670,7 +2670,7 @@
         p.pu[LUMA_48x64].copy_pp = PFX(blockcopy_pp_48x64_avx);
 
         p.frameInitLowres = PFX(frame_init_lowres_core_avx);
-        //p.propagateCost = PFX(mbtree_propagate_cost_avx);
+        p.propagateCost = PFX(mbtree_propagate_cost_avx);
     }
     if (cpuMask & X265_CPU_XOP)
     {
@@ -3666,7 +3666,7 @@
         p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx2);
 
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
-        //p.propagateCost = PFX(mbtree_propagate_cost_avx2);
+        p.propagateCost = PFX(mbtree_propagate_cost_avx2);
         p.saoCuStatsE0 = PFX(saoCuStatsE0_avx2);
         p.saoCuStatsE1 = PFX(saoCuStatsE1_avx2);
         p.saoCuStatsE2 = PFX(saoCuStatsE2_avx2);
diff -r 9e624761d74e -r 3c0561db9c3c source/common/x86/mc-a2.asm
--- a/source/common/x86/mc-a2.asm	Mon Aug 29 13:49:09 2016 +0530
+++ b/source/common/x86/mc-a2.asm	Wed Aug 31 11:27:21 2016 +0530
@@ -994,7 +994,7 @@
 
 ;-----------------------------------------------------------------------------
 ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs,
-;                             uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len, uint32_t qgSize)
+;                             uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len )
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal mbtree_propagate_cost, 7,7,7
diff -r 9e624761d74e -r 3c0561db9c3c source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Mon Aug 29 13:49:09 2016 +0530
+++ b/source/encoder/slicetype.cpp	Wed Aug 31 11:27:21 2016 +0530
@@ -237,6 +237,21 @@
         }
     }
 
+    if (param->rc.qgSize == 8)
+    {
+        for (int cuY = 0; cuY < heightInCU; cuY++)
+        {
+            for (int cuX = 0; cuX < widthInCU; cuX++)
+            {
+                const int cuXY = cuX + cuY * widthInCU;
+                curFrame->m_lowres.invQscaleFactor8x8[cuXY] = (curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4] +
+                                                               curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + 1] +
+                                                               curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + curFrame->m_lowres.maxBlocksInRowFullRes] +
+                                                               curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + curFrame->m_lowres.maxBlocksInRowFullRes + 1]) / 4;
+            }
+        }
+    }
+
     if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
     {
         int hShift = CHROMA_H_SHIFT(param->internalCsp);
@@ -347,15 +362,9 @@
             frame cost estimates, they are not very accurate */
             const bool bFrameScoreCU = (cuX > 0 && cuX < widthInCU - 1 &&
                                         cuY > 0 && cuY < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
-            int invQscaleFactor, icostAq;
+            int icostAq;
             if (qgSize == 8)
-            {
-                invQscaleFactor = (fenc.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4] +
-                                   fenc.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + 1] +
-                                   fenc.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + fenc.maxBlocksInRowFullRes] +
-                                   fenc.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + fenc.maxBlocksInRowFullRes + 1]) / 4;
-                icostAq = (bFrameScoreCU && fenc.invQscaleFactor) ? ((icost * invQscaleFactor + 128) >> 8) : icost;
-            }
+                icostAq = (bFrameScoreCU && fenc.invQscaleFactor) ? ((icost * fenc.invQscaleFactor8x8[cuXY] + 128) >> 8) : icost;
             else
                 icostAq = (bFrameScoreCU && fenc.invQscaleFactor) ? ((icost * fenc.invQscaleFactor[cuXY] +128) >> 8) : icost;
 
@@ -1816,11 +1825,11 @@
         if (m_param->rc.qgSize == 8)
             primitives.propagateCost(m_scratch, propagateCost,
                        frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
-                       frames[b]->invQscaleFactor + (cuIndex * 4), &fpsFactor, m_8x8Width, m_param->rc.qgSize);
+                       frames[b]->invQscaleFactor8x8 + cuIndex, &fpsFactor, m_8x8Width);
         else
             primitives.propagateCost(m_scratch, propagateCost,
                        frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
-                       frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_8x8Width, m_param->rc.qgSize);
+                       frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_8x8Width);
 
         if (referenced)
             propagateCost += m_8x8Width;
@@ -1913,12 +1922,7 @@
             for (int cuX = 0; cuX < m_8x8Width; cuX++)
             {
                 const int cuXY = cuX + cuY * m_8x8Width;
-                int invQscaleFactor = (frame->invQscaleFactor[cuX * 2 + cuY * m_8x8Width * 4] +
-                                       frame->invQscaleFactor[cuX * 2 + cuY * m_8x8Width * 4 + 1] +
-                                       frame->invQscaleFactor[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes] +
-                                       frame->invQscaleFactor[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes + 1]) / 4;
-
-                int intracost = ((frame->intraCost[cuXY]) / 4 * invQscaleFactor + 128) >> 8;
+                int intracost = ((frame->intraCost[cuXY]) / 4 * frame->invQscaleFactor8x8[cuXY] + 128) >> 8;
                 if (intracost)
                 {
                     int propagateCost = ((frame->propagateCost[cuXY]) / 4 * fpsFactor + 128) >> 8;
@@ -1928,7 +1932,6 @@
                     frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes] - m_cuTreeStrength * (log2_ratio);
                     frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes + 1] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes + 1] - m_cuTreeStrength * (log2_ratio);
                 }
-
             }
         }
     }
@@ -2292,15 +2295,9 @@
     /* do not include edge blocks in the frame cost estimates, they are not very accurate */
     const bool bFrameScoreCU = (cuX > 0 && cuX < widthInCU - 1 &&
                                 cuY > 0 && cuY < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
-    int invQscaleFactor, bcostAq;
+    int bcostAq;
     if (m_lookahead.m_param->rc.qgSize == 8)
-    {
-        invQscaleFactor = (fenc->invQscaleFactor[cuX * 2 + cuY * widthInCU * 4] +
-                           fenc->invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + 1] +
-                           fenc->invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + fenc->maxBlocksInRowFullRes] +
-                           fenc->invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + fenc->maxBlocksInRowFullRes + 1]) / 4;
-        bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * invQscaleFactor + 128) >> 8) : bcost;
-    }
+        bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * fenc->invQscaleFactor8x8[cuXY] + 128) >> 8) : bcost;
     else
         bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * fenc->invQscaleFactor[cuXY] +128) >> 8) : bcost;
 
diff -r 9e624761d74e -r 3c0561db9c3c source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Mon Aug 29 13:49:09 2016 +0530
+++ b/source/test/pixelharness.cpp	Wed Aug 31 11:27:21 2016 +0530
@@ -1387,8 +1387,8 @@
     {
         int width = 16 + rand() % 64;
         int index = i % TEST_CASES;
-        checked(opt, opt_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width, 32);
-        ref(ref_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width, 32);
+        checked(opt, opt_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
+        ref(ref_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
 
         if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
             return false;
@@ -3102,7 +3102,7 @@
     if (opt.propagateCost)
     {
         HEADER0("propagateCost");
-        REPORT_SPEEDUP(opt.propagateCost, ref.propagateCost, ibuf1, ushort_test_buff[0], int_test_buff[0], ushort_test_buff[0], int_test_buff[0], double_test_buff[0], 80, 32);
+        REPORT_SPEEDUP(opt.propagateCost, ref.propagateCost, ibuf1, ushort_test_buff[0], int_test_buff[0], ushort_test_buff[0], int_test_buff[0], double_test_buff[0], 80);
     }
 
     if (opt.fix8Pack)
diff -r 9e624761d74e -r 3c0561db9c3c source/x265.h
--- a/source/x265.h	Mon Aug 29 13:49:09 2016 +0530
+++ b/source/x265.h	Wed Aug 31 11:27:21 2016 +0530
@@ -263,8 +263,8 @@
     /* An array of quantizer offsets to be applied to this image during encoding.
      * These are added on top of the decisions made by rateControl.
      * Adaptive quantization must be enabled to use this feature. These quantizer
-     * offsets should be given for each 16x16 block. Behavior if quant
-     * offsets differ between encoding passes is undefined. */
+     * offsets should be given for each 16x16 block (8x8 block, when qg-size is 8).
+     * Behavior if quant offsets differ between encoding passes is undefined. */
     float            *quantOffsets;
 
     /* Frame level statistics */