[x265] [PATCH 2 of 3] motion: chroma ME [CHANGES OUTPUTS]

Steve Borho steve at borho.org
Wed Dec 10 19:26:17 CET 2014


# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1418086408 21600
#      Mon Dec 08 18:53:28 2014 -0600
# Node ID afd5620c77a4729f4c599f9ad69000082693a32e
# Parent  4c97d85c8488c2b9199c457f136891005abfa232
motion: chroma ME [CHANGES OUTPUTS]

Include chroma distortion in satd decisions when --subme > 2 and the chroma
blocks are multiples of 4x4.
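
(For illustration only, not part of the patch: a minimal standalone sketch of
the gating rule, assuming 4:2:0 subsampling; hasChromaSatd() is a hypothetical
stand-in for the real check, which is the non-NULL chromaSatd primitive pointer
tested in MotionEstimate::setSourcePU() in the motion.cpp hunk below.)

    // sketch: chroma SATD is only enabled when --subme > 2 and the subsampled
    // chroma block is still a multiple of 4x4, mirroring
    // "bChromaSATD = subpelRefine > 2 && chromaSatd" in the patch
    #include <cstdio>

    static bool hasChromaSatd(int lumaW, int lumaH, int hShift, int vShift)
    {
        int cw = lumaW >> hShift, ch = lumaH >> vShift;
        return cw >= 4 && ch >= 4 && !(cw & 3) && !(ch & 3);
    }

    int main()
    {
        const int subme = 3;              // e.g. the value used by preset slow
        const int hShift = 1, vShift = 1; // 4:2:0
        const int pu[][2] = { { 8, 8 }, { 16, 12 }, { 8, 4 }, { 64, 64 } };

        for (int i = 0; i < 4; i++)
        {
            bool bChromaSATD = subme > 2 && hasChromaSatd(pu[i][0], pu[i][1], hShift, vShift);
            printf("%dx%d luma PU -> chroma SATD %s\n", pu[i][0], pu[i][1],
                   bChromaSATD ? "enabled" : "disabled");
        }
        return 0;
    }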

This required making the MotionEstimate class more aware of PicYuv and its
indexing scheme so that it could find the correct chroma pixels to interpolate.
This also allowed setSourcePlane() to be removed; its work is now folded into
the lookahead's overload of setSourcePU().
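
The two setSourcePU() overloads end up being called as shown here (both call
sites are taken from the slicetype.cpp and search.cpp hunks below); the
lookahead passes a raw plane pointer and stride and stays luma-only, while
predInterSearch() passes CTU/PU addressing so the estimator can locate the
matching chroma pixels in the reference's reconstructed PicYuv:

    /* lookahead path (slicetype.cpp), luma only */
    m_me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize);

    /* main encoder path (search.cpp), may enable chroma SATD */
    m_me.setSourcePU(*interMode.fencYuv, cu.m_cuAddr, cuGeom.encodeIdx, m_puAbsPartIdx,
                     m_puWidth, m_puHeight);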

Further work is still required: the Reference class needs to generate weighted
chroma planes when subpel refine uses chroma residual cost. Until that is
fixed, the chroma subpel steps use unweighted reference pixels.

diff -r 4c97d85c8488 -r afd5620c77a4 doc/reST/cli.rst
--- a/doc/reST/cli.rst	Tue Dec 09 15:31:50 2014 -0600
+++ b/doc/reST/cli.rst	Mon Dec 08 18:53:28 2014 -0600
@@ -392,7 +392,7 @@
 	+-------+---------------------------------------------------------------+
 	| 2     | RDO splits and merge/skip selection                           |
 	+-------+---------------------------------------------------------------+
-	| 3     | RDO mode and split decisions                                  |
+	| 3     | RDO mode and split decisions, chroma residual used for sa8d   |
 	+-------+---------------------------------------------------------------+
 	| 4     | Adds RDO Quant                                                |
 	+-------+---------------------------------------------------------------+
@@ -589,6 +589,13 @@
 	|  7 | 2          | 8         | 2          | 8         | true      |
 	+----+------------+-----------+------------+-----------+-----------+
 
+	At --subme values larger than 2, chroma residual cost is included
+	in all subpel refinement steps and chroma residual is included in
+	all motion estimation decisions (selecting the best reference
+	picture in each list, and choosing between merge, uni-directional
+	motion and bi-directional motion). The 'slow' preset is the first
+	preset to enable the use of chroma residual.
+
 .. option:: --merange <integer>
 
 	Motion search range. Default 57
diff -r 4c97d85c8488 -r afd5620c77a4 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp	Tue Dec 09 15:31:50 2014 -0600
+++ b/source/encoder/analysis.cpp	Mon Dec 08 18:53:28 2014 -0600
@@ -61,9 +61,12 @@
  *
  *   RDO selection between merge and skip
  *   sa8d selection of best inter mode
+ *   sa8d decisions include chroma residual cost
  *   RDO selection between (merge/skip) / best inter mode / intra / split
  *
  * rd-level 4 enables RDOQuant
+ *   chroma residual cost included in satd decisions, including subpel refine
+ *    (as a result of --subme 3 being used by preset slow)
  *
  * rd-level 5,6 does RDO for each inter mode
  */
@@ -358,11 +361,7 @@
         slave->m_slice = m_slice;
         slave->m_frame = m_frame;
 
-        PicYuv* fencPic = m_frame->m_fencPic;
-        pixel* pu = fencPic->getLumaAddr(m_curInterMode->cu.m_cuAddr, m_curGeom->encodeIdx + m_puAbsPartIdx);
-        slave->m_me.setSourcePlane(fencPic->m_picOrg[0], fencPic->m_stride);
-        slave->m_me.setSourcePU(*m_curInterMode->fencYuv, m_puAbsPartIdx, pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight);
-
+        slave->m_me.setSourcePU(*m_curInterMode->fencYuv, m_curInterMode->cu.m_cuAddr, m_curGeom->encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
         slave->prepMotionCompensation(m_curInterMode->cu, *m_curGeom, m_curPart);
     }
 
@@ -385,8 +384,6 @@
         slave->m_frame = m_frame;
         slave->setQP(*m_slice, m_rdCost.m_qp);
         slave->invalidateContexts(0);
-        if (jobId)
-            slave->m_me.setSourcePlane(m_frame->m_fencPic->m_picOrg[0], m_frame->m_fencPic->m_stride);
     }
 
     ModeDepth& md = m_modeDepth[m_curGeom->depth];
diff -r 4c97d85c8488 -r afd5620c77a4 source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp	Tue Dec 09 15:31:50 2014 -0600
+++ b/source/encoder/frameencoder.cpp	Mon Dec 08 18:53:28 2014 -0600
@@ -722,9 +722,6 @@
     Entropy& rowCoder = m_param->bEnableWavefront ? m_rows[row].rowGoOnCoder : m_rows[0].rowGoOnCoder;
     FrameData& curEncData = *m_frame->m_encData;
     Slice *slice = curEncData.m_slice;
-    PicYuv* fencPic = m_frame->m_fencPic;
-
-    tld.analysis.m_me.setSourcePlane(fencPic->m_picOrg[0], fencPic->m_stride);
 
     int64_t startTime = x265_mdate();
     const uint32_t numCols = m_numCols;
diff -r 4c97d85c8488 -r afd5620c77a4 source/encoder/motion.cpp
--- a/source/encoder/motion.cpp	Tue Dec 09 15:31:50 2014 -0600
+++ b/source/encoder/motion.cpp	Mon Dec 08 18:53:28 2014 -0600
@@ -91,6 +91,53 @@
 #undef SETUP_SCALE
 }
 
+/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
+const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
+const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 };  /* (x-1)%6 */
+const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) };
+const MV hex4[16] =
+{
+    MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3),
+    MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1),
+    MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1),
+    MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3),
+};
+const MV offsets[] =
+{
+    MV(-1, 0), MV(0, -1),
+    MV(-1, -1), MV(1, -1),
+    MV(-1, 0), MV(1, 0),
+    MV(-1, 1), MV(-1, -1),
+    MV(1, -1), MV(1, 1),
+    MV(-1, 0), MV(0, 1),
+    MV(-1, 1), MV(1, 1),
+    MV(1, 0), MV(0, 1),
+}; // offsets for Two Point Search
+
+/* sum of absolute differences between MV candidates, used for adaptive ME range */
+inline int predictorDifference(const MV *mvc, intptr_t numCandidates)
+{
+    int sum = 0;
+
+    for (int i = 0; i < numCandidates - 1; i++)
+    {
+        sum += abs(mvc[i].x - mvc[i + 1].x)
+            +  abs(mvc[i].y - mvc[i + 1].y);
+    }
+
+    return sum;
+}
+
+}
+
+MotionEstimate::MotionEstimate()
+{
+    ctuAddr = -1;
+    absPartIdx = -1;
+    searchMethod = X265_HEX_SEARCH;
+    subpelRefine = 2;
+    bChromaSATD = false;
+    chromaSatd = NULL;
 }
 
 void MotionEstimate::init(int method, int refine, int csp)
@@ -108,7 +155,8 @@
     fencPUYuv.destroy();
 }
 
-void MotionEstimate::setSourcePU(intptr_t offset, int pwidth, int pheight)
+/* Called by lookahead, luma only, no use of PicYuv */
+void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight)
 {
     partEnum = partitionFromSizes(pwidth, pheight);
     X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
@@ -119,12 +167,15 @@
 
     blockwidth = pwidth;
     blockOffset = offset;
+    absPartIdx = ctuAddr = -1;
 
     /* copy PU block into cache */
-    primitives.luma_copy_pp[partEnum](fencPUYuv.m_buf[0], FENC_STRIDE, fencplane + offset, fencLumaStride);
+    primitives.luma_copy_pp[partEnum](fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
+    X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
 }
 
-void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int partOffset, intptr_t offset, int pwidth, int pheight)
+/* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
+void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight)
 {
     partEnum = partitionFromSizes(pwidth, pheight);
     X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
@@ -132,48 +183,20 @@
     satd = primitives.satd[partEnum];
     sad_x3 = primitives.sad_x3[partEnum];
     sad_x4 = primitives.sad_x4[partEnum];
+    chromaSatd = primitives.chroma[fencPUYuv.m_csp].satd[partEnum];
 
+    /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
+     * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */
+    bChromaSATD = subpelRefine > 2 && chromaSatd;
+    X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");
+
+    ctuAddr = _ctuAddr;
+    absPartIdx = cuPartIdx + puPartIdx;
     blockwidth = pwidth;
-    blockOffset = offset;
+    blockOffset = 0;
 
-    fencPUYuv.copyPUFromYuv(srcFencYuv, partOffset, partEnum, false);
-}
-
-/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
-static const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
-static const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 };  /* (x-1)%6 */
-static const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) };
-static const MV hex4[16] =
-{
-    MV(0, -4),  MV(0, 4),  MV(-2, -3), MV(2, -3),
-    MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1),
-    MV(-4, 0),  MV(4, 0),  MV(-4, 1),  MV(4, 1),
-    MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3),
-};
-static const MV offsets[] =
-{
-    MV(-1, 0), MV(0, -1),
-    MV(-1, -1), MV(1, -1),
-    MV(-1, 0), MV(1, 0),
-    MV(-1, 1), MV(-1, -1),
-    MV(1, -1), MV(1, 1),
-    MV(-1, 0), MV(0, 1),
-    MV(-1, 1), MV(1, 1),
-    MV(1, 0), MV(0, 1),
-}; // offsets for Two Point Search
-
-/* sum of absolute differences between MV candidates */
-static inline int x265_predictor_difference(const MV *mvc, intptr_t numCandidates)
-{
-    int sum = 0;
-
-    for (int i = 0; i < numCandidates - 1; i++)
-    {
-        sum += abs(mvc[i].x - mvc[i + 1].x)
-            +  abs(mvc[i].y - mvc[i + 1].y);
-    }
-
-    return sum;
+    /* copy PU from CU Yuv */
+    fencPUYuv.copyPUFromYuv(srcFencYuv, puPartIdx, partEnum, bChromaSATD);
 }
 
 #define COST_MV_PT_DIST(mx, my, point, dist) \
@@ -548,9 +571,11 @@
                                    MV &             outQMv)
 {
     ALIGN_VAR_16(int, costs[16]);
+    if (ctuAddr >= 0)
+        blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
+    intptr_t stride = ref->lumaStride;
     pixel* fenc = fencPUYuv.m_buf[0];
     pixel* fref = ref->fpelPlane + blockOffset;
-    intptr_t stride = ref->lumaStride;
 
     setMVP(qmvp);
 
@@ -809,7 +834,7 @@
                     mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
                     denom++;
                 }
-                mvd += x265_predictor_difference(mvc, numCandidates);
+                mvd += predictorDifference(mvc, numCandidates);
             }
 
             sad_ctx = SAD_THRESH(1000) ? 0
@@ -1159,34 +1184,100 @@
 
 int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp)
 {
-    intptr_t stride = ref->lumaStride;
-    pixel *fref = ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * stride;
+    intptr_t refStride = ref->lumaStride;
+    pixel *fref = ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * refStride;
     int xFrac = qmv.x & 0x3;
     int yFrac = qmv.y & 0x3;
+    int cost;
+    intptr_t lclStride = fencPUYuv.m_size;
+    X265_CHECK(lclStride == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n");
 
-    if ((yFrac | xFrac) == 0)
-        return cmp(fencPUYuv.m_buf[0], FENC_STRIDE, fref, stride);
+    if (!(yFrac | xFrac))
+        cost = cmp(fencPUYuv.m_buf[0], lclStride, fref, refStride);
     else
     {
-        /* We are taking a short-cut here if the reference is weighted. To be
+        /* we are taking a short-cut here if the reference is weighted. To be
          * accurate we should be interpolating unweighted pixels and weighting
-         * the final 16bit values prior to rounding and downshifting. Instead we
+         * the final 16bit values prior to rounding and down shifting. Instead we
          * are simply interpolating the weighted full-pel pixels. Not 100%
          * accurate but good enough for fast qpel ME */
         ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
-        if (yFrac == 0)
-            primitives.luma_hpp[partEnum](fref, stride, subpelbuf, FENC_STRIDE, xFrac);
-        else if (xFrac == 0)
-            primitives.luma_vpp[partEnum](fref, stride, subpelbuf, FENC_STRIDE, yFrac);
+        if (!yFrac)
+            primitives.luma_hpp[partEnum](fref, refStride, subpelbuf, lclStride, xFrac);
+        else if (!xFrac)
+            primitives.luma_vpp[partEnum](fref, refStride, subpelbuf, lclStride, yFrac);
         else
         {
-            ALIGN_VAR_32(int16_t, immed[64 * (64 + 8)]);
+            ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_LUMA)]);
 
             int filterSize = NTAPS_LUMA;
             int halfFilterSize = filterSize >> 1;
-            primitives.luma_hps[partEnum](fref, stride, immed, blockwidth, xFrac, 1);
-            primitives.luma_vsp[partEnum](immed + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, FENC_STRIDE, yFrac);
+            primitives.luma_hps[partEnum](fref, refStride, immed, blockwidth, xFrac, 1);
+            primitives.luma_vsp[partEnum](immed + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, lclStride, yFrac);
         }
-        return cmp(fencPUYuv.m_buf[0], FENC_STRIDE, subpelbuf, FENC_STRIDE);
+        cost = cmp(fencPUYuv.m_buf[0], lclStride, subpelbuf, lclStride);
     }
+
+    if (bChromaSATD)
+    {
+        int csp    = fencPUYuv.m_csp;
+        int hshift = fencPUYuv.m_hChromaShift;
+        int vshift = fencPUYuv.m_vChromaShift;
+        int shiftHor = (2 + hshift);
+        int shiftVer = (2 + vshift);
+        lclStride = fencPUYuv.m_csize;
+
+        intptr_t refStrideC = ref->reconPic->m_strideC;
+        intptr_t refOffset = (qmv.x >> shiftHor) + (qmv.y >> shiftVer) * refStrideC;
+
+        const pixel* refCb = ref->reconPic->getCbAddr(ctuAddr, absPartIdx) + refOffset;
+        const pixel* refCr = ref->reconPic->getCrAddr(ctuAddr, absPartIdx) + refOffset;
+
+        xFrac = qmv.x & ((1 << shiftHor) - 1);
+        yFrac = qmv.y & ((1 << shiftVer) - 1);
+
+        if (!(yFrac | xFrac))
+        {
+            cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, refCb, refStrideC);
+            cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, refCr, refStrideC);
+        }
+        else
+        {
+            ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
+            if (!yFrac)
+            {
+                primitives.chroma[csp].filter_hpp[partEnum](refCb, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
+                cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
+
+                primitives.chroma[csp].filter_hpp[partEnum](refCr, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
+                cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
+            }
+            else if (!xFrac)
+            {
+                primitives.chroma[csp].filter_vpp[partEnum](refCb, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
+                cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
+
+                primitives.chroma[csp].filter_vpp[partEnum](refCr, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
+                cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
+            }
+            else
+            {
+                ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_CHROMA)]);
+
+                int extStride = blockwidth >> hshift;
+                int filterSize = NTAPS_CHROMA;
+                int halfFilterSize = (filterSize >> 1);
+
+                primitives.chroma[csp].filter_hps[partEnum](refCb, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
+                primitives.chroma[csp].filter_vsp[partEnum](immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
+                cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
+
+                primitives.chroma[csp].filter_hps[partEnum](refCr, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
+                primitives.chroma[csp].filter_vsp[partEnum](immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
+                cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
+            }
+        }
+    }
+
+    return cost;
 }
diff -r 4c97d85c8488 -r afd5620c77a4 source/encoder/motion.h
--- a/source/encoder/motion.h	Tue Dec 09 15:31:50 2014 -0600
+++ b/source/encoder/motion.h	Mon Dec 08 18:53:28 2014 -0600
@@ -37,21 +37,22 @@
 {
 protected:
 
-    /* Aligned copy of original pixels, extra room for manual alignment */
-    pixel*   fencplane;
-    intptr_t fencLumaStride;
+    intptr_t blockOffset;
+
+    int ctuAddr;
+    int absPartIdx;  // part index of PU, including CU offset within CTU
 
-    intptr_t blockOffset;
     int searchMethod;
     int subpelRefine;
 
     int blockwidth;
-    int partEnum;
+    int blockheight;
 
     pixelcmp_t sad;
-    pixelcmp_t satd;
     pixelcmp_x3_t sad_x3;
     pixelcmp_x4_t sad_x4;
+    pixelcmp_t satd;
+    pixelcmp_t chromaSatd;
 
     MotionEstimate& operator =(const MotionEstimate&);
 
@@ -60,33 +61,35 @@
     static const int COST_MAX = 1 << 28;
 
     Yuv fencPUYuv;
+    int partEnum;
+    bool bChromaSATD;
 
-    MotionEstimate() {}
+    MotionEstimate();
     ~MotionEstimate();
 
     void init(int method, int refine, int csp);
 
     /* Methods called at slice setup */
 
-    void setSourcePlane(pixel *Y, intptr_t luma)
-    {
-        fencplane = Y;
-        fencLumaStride = luma;
-    }
-
-    void setSourcePU(intptr_t offset, int pwidth, int pheight);
-    void setSourcePU(const Yuv& srcFencYuv, int partOffset, intptr_t offset, int pwidth, int pheight);
+    void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight);
+    void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight);
 
     /* buf*() and motionEstimate() methods all use cached fenc pixels and thus
      * require setSourcePU() to be called prior. */
 
-    inline int bufSAD(pixel *fref, intptr_t stride)  { return sad(fencPUYuv.m_buf[0], FENC_STRIDE, fref, stride); }
+    inline int bufSAD(const pixel* fref, intptr_t stride)  { return sad(fencPUYuv.m_buf[0], FENC_STRIDE, fref, stride); }
 
-    inline int bufSATD(pixel *fref, intptr_t stride) { return satd(fencPUYuv.m_buf[0], FENC_STRIDE, fref, stride); }
+    inline int bufSATD(const pixel* fref, intptr_t stride) { return satd(fencPUYuv.m_buf[0], FENC_STRIDE, fref, stride); }
 
-    int motionEstimate(ReferencePlanes *ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv);
+    inline int bufChromaSATD(const Yuv& refYuv, int puPartIdx)
+    {
+        return chromaSatd(refYuv.getCbAddr(puPartIdx), refYuv.m_csize, fencPUYuv.m_buf[1], fencPUYuv.m_csize) +
+               chromaSatd(refYuv.getCrAddr(puPartIdx), refYuv.m_csize, fencPUYuv.m_buf[2], fencPUYuv.m_csize);
+    }
 
-    int subpelCompare(ReferencePlanes * ref, const MV &qmv, pixelcmp_t);
+    int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv);
+
+    int subpelCompare(ReferencePlanes* ref, const MV &qmv, pixelcmp_t);
 
 protected:
 
diff -r 4c97d85c8488 -r afd5620c77a4 source/encoder/search.cpp
--- a/source/encoder/search.cpp	Tue Dec 09 15:31:50 2014 -0600
+++ b/source/encoder/search.cpp	Mon Dec 08 18:53:28 2014 -0600
@@ -1796,8 +1796,12 @@
         cu.m_refIdx[1][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][1].refIdx;
 
         prepMotionCompensation(cu, cuGeom, puIdx);
-        motionCompensation(tempYuv, true, false);
+        motionCompensation(tempYuv, true, m_me.bChromaSATD);
+
         uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(m.absPartIdx), tempYuv.m_size);
+        if (m_me.bChromaSATD)
+            costCand += m_me.bufChromaSATD(tempYuv, m.absPartIdx);
+
         uint32_t bitsCand = getTUBits(mergeCand, m.maxNumMergeCand);
         costCand = costCand + m_rdCost.getCost(bitsCand);
         if (costCand < outCost)
@@ -1883,7 +1887,7 @@
 
 /* search of the best candidate for inter prediction
  * returns true if predYuv was filled with a motion compensated prediction */
-bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma)
+bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChromaSA8D)
 {
     CUData& cu = interMode.cu;
     Yuv* predYuv = &interMode.predYuv;
@@ -1891,7 +1895,6 @@
     MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
 
     const Slice *slice = m_slice;
-    PicYuv* fencPic = m_frame->m_fencPic;
     int numPart     = cu.getNumPartInter();
     int numPredDir  = slice->isInterP() ? 1 : 2;
     const int* numRefIdx = slice->m_numRefIdx;
@@ -1911,12 +1914,11 @@
         /* sets m_puAbsPartIdx, m_puWidth, m_puHeight */
         initMotionCompensation(cu, cuGeom, puIdx);
 
-        pixel* pu = fencPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
-        m_me.setSourcePU(*interMode.fencYuv, m_puAbsPartIdx, pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight);
+        m_me.setSourcePU(*interMode.fencYuv, cu.m_cuAddr, cuGeom.encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
 
         uint32_t mrgCost = MAX_UINT;
 
-        /* find best cost merge candidate */
+        /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
         if (cu.m_partSize[0] != SIZE_2Nx2N)
         {
             merge.absPartIdx = m_puAbsPartIdx;
@@ -1924,7 +1926,7 @@
             merge.height     = m_puHeight;
             mrgCost = mergeEstimation(cu, cuGeom, puIdx, merge);
 
-            if (bMergeOnly && cu.m_log2CUSize[0] > 3)
+            if (bMergeOnly)
             {
                 if (mrgCost == MAX_UINT)
                 {
@@ -1943,7 +1945,7 @@
                 totalmebits += merge.bits;
 
                 prepMotionCompensation(cu, cuGeom, puIdx);
-                motionCompensation(*predYuv, true, bChroma);
+                motionCompensation(*predYuv, true, bChromaSA8D);
                 continue;
             }
         }
@@ -2142,19 +2144,35 @@
             bidir[0] = bestME[0];
             bidir[1] = bestME[1];
 
-            /* Generate reference subpels */
-            PicYuv* refPic0  = slice->m_refPicList[0][bestME[0].ref]->m_reconPic;
-            PicYuv* refPic1  = slice->m_refPicList[1][bestME[1].ref]->m_reconPic;
-            Yuv*    bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
-            predInterLumaPixel(bidirYuv[0], *refPic0, bestME[0].mv);
-            predInterLumaPixel(bidirYuv[1], *refPic1, bestME[1].mv);
-
-            pixel* pred0 = bidirYuv[0].getLumaAddr(m_puAbsPartIdx);
-            pixel* pred1 = bidirYuv[1].getLumaAddr(m_puAbsPartIdx);
-
-            int partEnum = partitionFromSizes(m_puWidth, m_puHeight);
-            primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, pred0, bidirYuv[0].m_size, pred1, bidirYuv[1].m_size, 32);
-            int satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+            int satdCost;
+
+            if (m_me.bChromaSATD)
+            {
+                cu.m_mv[0][m_puAbsPartIdx] = bidir[0].mv;
+                cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
+                cu.m_mv[1][m_puAbsPartIdx] = bidir[1].mv;
+                cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
+
+                prepMotionCompensation(cu, cuGeom, puIdx);
+                motionCompensation(tmpPredYuv, true, true);
+
+                satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
+                           m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
+            }
+            else
+            {
+                PicYuv* refPic0 = slice->m_refPicList[0][bestME[0].ref]->m_reconPic;
+                PicYuv* refPic1 = slice->m_refPicList[1][bestME[1].ref]->m_reconPic;
+                Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
+
+                /* Generate reference subpels */
+                predInterLumaPixel(bidirYuv[0], *refPic0, bestME[0].mv);
+                predInterLumaPixel(bidirYuv[1], *refPic1, bestME[1].mv);
+
+                primitives.pixelavg_pp[m_me.partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(m_puAbsPartIdx), bidirYuv[0].m_size,
+                                                                                              bidirYuv[1].getLumaAddr(m_puAbsPartIdx), bidirYuv[1].m_size, 32);
+                satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+            }
 
             bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
             bidirCost = satdCost + m_rdCost.getCost(bidirBits);
@@ -2177,12 +2195,28 @@
             if (bTryZero)
             {
                 /* coincident blocks of the two reference pictures */
-                const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
-                const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
-                intptr_t refStride = slice->m_mref[0][0].lumaStride;
-
-                primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
-                satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+                if (m_me.bChromaSATD)
+                {
+                    cu.m_mv[0][m_puAbsPartIdx] = mvzero;
+                    cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
+                    cu.m_mv[1][m_puAbsPartIdx] = mvzero;
+                    cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
+
+                    prepMotionCompensation(cu, cuGeom, puIdx);
+                    motionCompensation(tmpPredYuv, true, true);
+
+                    satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
+                               m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
+                }
+                else
+                {
+                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
+                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
+                    intptr_t refStride = slice->m_mref[0][0].lumaStride;
+
+                    primitives.pixelavg_pp[m_me.partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
+                    satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+                }
 
                 MV mvp0 = bestME[0].mvp;
                 int mvpIdx0 = bestME[0].mvpIdx;
@@ -2277,7 +2311,7 @@
         }
 
         prepMotionCompensation(cu, cuGeom, puIdx);
-        motionCompensation(*predYuv, true, bChroma);
+        motionCompensation(*predYuv, true, bChromaSA8D);
     }
 
     interMode.sa8dBits += totalmebits;
diff -r 4c97d85c8488 -r afd5620c77a4 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Tue Dec 09 15:31:50 2014 -0600
+++ b/source/encoder/slicetype.cpp	Mon Dec 08 18:53:28 2014 -0600
@@ -1302,7 +1302,6 @@
         for (int i = 0; i < m_heightInCU; i++)
         {
             m_rows[i].init();
-            m_rows[i].m_me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride);
             if (!fenc->bIntraCalculated)
                 fenc->rowSatds[0][0][i] = 0;
             fenc->rowSatds[b - p0][p1 - b][i] = 0;
@@ -1543,7 +1542,7 @@
     const bool bFrameScoreCU = (cux > 0 && cux < m_widthInCU - 1 &&
                                 cuy > 0 && cuy < m_heightInCU - 1) || m_widthInCU <= 2 || m_heightInCU <= 2;
 
-    m_me.setSourcePU(pelOffset, cuSize, cuSize);
+    m_me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize);
 
     /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
     int lowresPenalty = 4;

