[x265] [PATCH] primitives: add sa8d and sse_pp aliases for chroma square blocks

Mon Jan 5 05:23:48 CET 2015

# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1420431805 -19800
#      Mon Jan 05 09:53:25 2015 +0530
# Node ID c781e15eb4d5146efa115f1ab170ca673440baf6
# Parent  f255e8d06423231cb8c58ab5d3b10de7fb27b424
primitives: add sa8d and sse_pp aliases for chroma square blocks

This avoids the need to call partitionFromSizes() in some key analysis
functions.

diff -r f255e8d06423 -r c781e15eb4d5 source/common/primitives.cpp

--- a/source/common/primitives.cpp	Fri Jan 02 18:22:38 2015 +0530
+++ b/source/common/primitives.cpp	Mon Jan 05 09:53:25 2015 +0530
@@ -81,8 +81,10 @@
 
     for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
     {
-        p.chroma[X265_CSP_I444].add_ps[i]  = p.luma_add_ps[i];
-        p.chroma[X265_CSP_I444].sub_ps[i]  = p.luma_sub_ps[i];
+        p.chroma[X265_CSP_I444].sa8d[i]   = p.sa8d[i];
+        p.chroma[X265_CSP_I444].sse_pp[i] = p.sse_pp[i];
+        p.chroma[X265_CSP_I444].add_ps[i] = p.luma_add_ps[i];
+        p.chroma[X265_CSP_I444].sub_ps[i] = p.luma_sub_ps[i];
     }
 
     primitives.sa8d[BLOCK_4x4]   = primitives.sa8d_inter[LUMA_4x4];
@@ -145,6 +147,28 @@
     //p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>;
     p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = primitives.satd[LUMA_32x16];
     //p.chroma[X265_CSP_I422].satd[CHROMA422_8x64]  = satd8<8, 64>;
+
+    p.chroma[X265_CSP_I420].sa8d[BLOCK_4x4] = NULL;
+    p.chroma[X265_CSP_I422].sa8d[BLOCK_4x4] = NULL;
+    p.chroma[X265_CSP_I420].sa8d[BLOCK_8x8] = p.satd[LUMA_4x4];
+    p.chroma[X265_CSP_I422].sa8d[BLOCK_8x8] = p.satd[LUMA_4x8];
+    p.chroma[X265_CSP_I420].sa8d[BLOCK_16x16] = p.sa8d[LUMA_8x8];
+    p.chroma[X265_CSP_I422].sa8d[BLOCK_16x16] = p.sa8d_inter[LUMA_8x16];
+    p.chroma[X265_CSP_I420].sa8d[BLOCK_32x32] = p.sa8d[LUMA_16x16];
+    p.chroma[X265_CSP_I422].sa8d[BLOCK_32x32] = p.sa8d_inter[LUMA_16x32];
+    p.chroma[X265_CSP_I420].sa8d[BLOCK_64x64] = p.sa8d[LUMA_32x32];
+    p.chroma[X265_CSP_I422].sa8d[BLOCK_64x64] = p.sa8d_inter[LUMA_32x64];
+
+    p.chroma[X265_CSP_I420].sse_pp[BLOCK_4x4] = NULL;
+    p.chroma[X265_CSP_I422].sse_pp[BLOCK_4x4] = NULL;
+    p.chroma[X265_CSP_I420].sse_pp[BLOCK_8x8] = p.sse_pp[LUMA_4x4];
+    p.chroma[X265_CSP_I422].sse_pp[BLOCK_8x8] = p.sse_pp[LUMA_4x8];
+    p.chroma[X265_CSP_I420].sse_pp[BLOCK_16x16] = p.sse_pp[LUMA_8x8];
+    p.chroma[X265_CSP_I422].sse_pp[BLOCK_16x16] = p.sse_pp[LUMA_8x16];
+    p.chroma[X265_CSP_I420].sse_pp[BLOCK_32x32] = p.sse_pp[LUMA_16x16];
+    p.chroma[X265_CSP_I422].sse_pp[BLOCK_32x32] = p.sse_pp[LUMA_16x32];
+    p.chroma[X265_CSP_I420].sse_pp[BLOCK_64x64] = p.sse_pp[LUMA_32x32];
+    p.chroma[X265_CSP_I422].sse_pp[BLOCK_64x64] = p.sse_pp[LUMA_32x64];
 }
 }
 using namespace x265;
diff -r f255e8d06423 -r c781e15eb4d5 source/common/primitives.h
--- a/source/common/primitives.h	Fri Jan 02 18:22:38 2015 +0530
+++ b/source/common/primitives.h	Mon Jan 05 09:53:25 2015 +0530
@@ -274,6 +274,12 @@
 
     struct
     {
+        pixelcmp_t      sa8d[NUM_SQUARE_BLOCKS];
+        pixelcmp_t      sse_pp[NUM_SQUARE_BLOCKS];
+        pixel_sub_ps_t  sub_ps[NUM_SQUARE_BLOCKS];
+        pixel_add_ps_t  add_ps[NUM_SQUARE_BLOCKS];
+        filter_p2s_t    p2s;
+
         pixelcmp_t      satd[NUM_LUMA_PARTITIONS];
         filter_pp_t     filter_vpp[NUM_LUMA_PARTITIONS];
         filter_ps_t     filter_vps[NUM_LUMA_PARTITIONS];
@@ -286,9 +292,6 @@
         copy_sp_t       copy_sp[NUM_LUMA_PARTITIONS];
         copy_ps_t       copy_ps[NUM_LUMA_PARTITIONS];
         copy_ss_t       copy_ss[NUM_LUMA_PARTITIONS];
-        pixel_sub_ps_t  sub_ps[NUM_SQUARE_BLOCKS];
-        pixel_add_ps_t  add_ps[NUM_SQUARE_BLOCKS];
-        filter_p2s_t    p2s;
     } chroma[X265_CSP_COUNT];
 };
 
diff -r f255e8d06423 -r c781e15eb4d5 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp	Fri Jan 02 18:22:38 2015 +0530
+++ b/source/encoder/analysis.cpp	Mon Jan 05 09:53:25 2015 +0530
@@ -1228,12 +1228,8 @@
 
     bestPred->sa8dCost = MAX_INT64;
     int bestSadCand = -1;
-    int cpart, sizeIdx = cuGeom.log2CUSize - 2;
-    if (m_bChromaSa8d)
-    {
-        int cuSize = 1 << cuGeom.log2CUSize;
-        cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
-    }
+    int sizeIdx = cuGeom.log2CUSize - 2;
+
     for (uint32_t i = 0; i < maxNumMergeCand; ++i)
     {
         if (m_bFrameParallel &&
@@ -1255,8 +1251,8 @@
         tempPred->distortion = primitives.sa8d[sizeIdx](fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
         if (m_bChromaSa8d)
         {
-            tempPred->distortion += primitives.sa8d_inter[cpart](fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
-            tempPred->distortion += primitives.sa8d_inter[cpart](fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
+            tempPred->distortion += primitives.chroma[m_csp].sa8d[sizeIdx](fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
+            tempPred->distortion += primitives.chroma[m_csp].sa8d[sizeIdx](fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
         }
         tempPred->sa8dCost = m_rdCost.calcRdSADCost(tempPred->distortion, tempPred->sa8dBits);
 
@@ -1450,10 +1446,8 @@
         interMode.distortion = primitives.sa8d[part](fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
         if (m_bChromaSa8d)
         {
-            uint32_t cuSize = 1 << cuGeom.log2CUSize;
-            int cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
-            interMode.distortion += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
-            interMode.distortion += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
+            interMode.distortion += primitives.chroma[m_csp].sa8d[part](fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
+            interMode.distortion += primitives.chroma[m_csp].sa8d[part](fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
         }
         interMode.sa8dCost = m_rdCost.calcRdSADCost(interMode.distortion, interMode.sa8dBits);
 
@@ -1534,13 +1528,7 @@
 
     const Yuv& fencYuv = *bidir2Nx2N.fencYuv;
     MV   mvzero(0, 0);
-    int  cpart, partEnum = cuGeom.log2CUSize - 2;
-
-    if (m_bChromaSa8d)
-    {
-        int cuSize = 1 << cuGeom.log2CUSize;
-        cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
-    }
+    int  partEnum = cuGeom.log2CUSize - 2;
 
     bidir2Nx2N.bestME[0][0] = inter2Nx2N.bestME[0][0];
     bidir2Nx2N.bestME[0][1] = inter2Nx2N.bestME[0][1];
@@ -1576,8 +1564,8 @@
     if (m_bChromaSa8d)
     {
         /* Add in chroma distortion */
-        sa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
-        sa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[2], bidir2Nx2N.predYuv.m_csize);
+        sa8d += primitives.chroma[m_csp].sa8d[partEnum](fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
+        sa8d += primitives.chroma[m_csp].sa8d[partEnum](fencYuv.m_buf[2], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[2], bidir2Nx2N.predYuv.m_csize);
     }
     bidir2Nx2N.sa8dBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
     bidir2Nx2N.sa8dCost = sa8d + m_rdCost.getCost(bidir2Nx2N.sa8dBits);
@@ -1613,8 +1601,8 @@
             motionCompensation(tmpPredYuv, true, true);
 
             zsa8d  = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
-            zsa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
-            zsa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);
+            zsa8d += primitives.chroma[m_csp].sa8d[partEnum](fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
+            zsa8d += primitives.chroma[m_csp].sa8d[partEnum](fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);
         }
         else
         {
diff -r f255e8d06423 -r c781e15eb4d5 source/encoder/search.cpp
--- a/source/encoder/search.cpp	Fri Jan 02 18:22:38 2015 +0530
+++ b/source/encoder/search.cpp	Mon Jan 05 09:53:25 2015 +0530
@@ -2510,11 +2510,8 @@
     X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
 
     uint32_t log2CUSize = cu.m_log2CUSize[0];
-    uint32_t cuSize = 1 << log2CUSize;
-    uint32_t depth  = cu.m_cuDepth[0];
-
-    int part = partitionFromLog2Size(log2CUSize);
-    int cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
+    uint32_t depth = cu.m_cuDepth[0];
+    int sizeIdx = log2CUSize - 2;
 
     m_quant.setQPforQuant(interMode.cu);
 
@@ -2530,9 +2527,9 @@
 
     if (!cu.m_tqBypass[0])
     {
-        uint32_t cbf0Dist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
-        cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.sse_pp[cpart](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
-        cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.sse_pp[cpart](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
+        uint32_t cbf0Dist = primitives.sse_pp[sizeIdx](fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
+        cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].sse_pp[sizeIdx](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
+        cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].sse_pp[sizeIdx](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
 
         /* Consider the RD cost of not signaling any residual */
         m_entropyCoder.load(m_rqt[depth].cur);
@@ -2603,9 +2600,9 @@
         reconYuv->copyFromYuv(*predYuv);
 
     // update with clipped distortion and cost (qp estimation loop uses unclipped values)
-    uint32_t bestDist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
-    bestDist += m_rdCost.scaleChromaDist(1, primitives.sse_pp[cpart](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
-    bestDist += m_rdCost.scaleChromaDist(2, primitives.sse_pp[cpart](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
+    uint32_t bestDist = primitives.sse_pp[sizeIdx](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
+    bestDist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].sse_pp[sizeIdx](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
+    bestDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].sse_pp[sizeIdx](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
     if (m_rdCost.m_psyRd)
         interMode.psyEnergy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);