[x265] [PATCH] Reduce half HPEL interpolate works by merge nest point

Wed Sep 25 07:29:22 CEST 2013

# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1380086952 -28800
# Node ID 57efca19f5b8d8b5bdc22a0bb9fbfc6169724266
# Parent  bdd26fd0325acf0f36409e994bdc262b11fa70f4
Reduce HPEL interpolation work by half by merging neighboring candidate points

The candidate positions in square1[9] are laid out as follows (x is the center, square1[0]):

 5 1 7
 3 x 4
 6 2 8

The main idea is that in the HPEL motion search, the merged candidates are exactly one full pixel (unit '1') apart,
so we can merge 1-2, 3-4, and 5-6-7-8 into a single interpolation each by interpolating one extra row/column.

diff -r bdd26fd0325a -r 57efca19f5b8 source/common/vec/ipfilter8.inc

--- a/source/common/vec/ipfilter8.inc	Tue Sep 24 15:21:06 2013 -0500
+++ b/source/common/vec/ipfilter8.inc	Wed Sep 25 13:29:12 2013 +0800
@@ -679,7 +679,8 @@
 
     int row, col;
 
-    assert(height % 2 == 0);
+    if (N == 4)
+        assert(height % 2 == 0);
 
     uint32_t leftCols = (8 - (width & 7)) * 8;
     uint32_t mask_shift = ((uint32_t)~0 >> leftCols);
diff -r bdd26fd0325a -r 57efca19f5b8 source/encoder/motion.cpp
--- a/source/encoder/motion.cpp	Tue Sep 24 15:21:06 2013 -0500
+++ b/source/encoder/motion.cpp	Wed Sep 25 13:29:12 2013 +0800
@@ -87,8 +87,8 @@
         init_scales();
 
     fenc = (pixel*)X265_MALLOC(pixel, MAX_CU_SIZE * MAX_CU_SIZE);
-    subpelbuf = (pixel*)X265_MALLOC(pixel, MAX_CU_SIZE * MAX_CU_SIZE);
-    immedVal = (short*)X265_MALLOC(short, MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1));
+    subpelbuf = (pixel*)X265_MALLOC(pixel, (MAX_CU_SIZE + 1) * (MAX_CU_SIZE + 1));
+    immedVal = (short*)X265_MALLOC(short, (MAX_CU_SIZE + 1) * (MAX_CU_SIZE + 1 + NTAPS_LUMA - 1));
 }
 
 MotionEstimate::~MotionEstimate()
@@ -122,6 +122,7 @@
 static const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
 static const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 };  /* (x-1)%6 */
 static const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) };
+static const int square1_dir[9] = { 0, 1, 1, 2, 2, 1, 1, 1, 1 };
 static const MV hex4[16] =
 {
     MV(0, -4),  MV(0, 4),  MV(-2, -3), MV(2, -3),
@@ -793,17 +794,55 @@
     else
         hpelcomp = sad;
 
-    for (int iter = 0; iter < wl.hpel_iters; iter++)
+    if (ref->isLowres)
     {
-        int bdir = 0, cost;
-        for (int i = 1; i <= wl.hpel_dirs; i++)
+        for (int iter = 0; iter < wl.hpel_iters; iter++)
         {
-            MV qmv = bmv + square1[i] * 2;
-            cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
-            COPY2_IF_LT(bcost, cost, bdir, i);
+            int bdir = 0, cost;
+            for (int i = 1; i <= wl.hpel_dirs; i++)
+            {
+                MV qmv = bmv + square1[i] * 2;
+                cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
+                COPY2_IF_LT(bcost, cost, bdir, i+0);
+            }
+            bmv += square1[bdir] * 2;
         }
+    }
+    else
+    {
+        for (int iter = 0; iter < wl.hpel_iters; iter++)
+        {
+            int bdir = 0, cost0, cost1;
+            for (int i = 1; i <= wl.hpel_dirs; i+=2)
+            {
+                MV qmv0 = bmv + square1[i  ] * 2;
+                MV qmv1 = bmv + square1[i+1] * 2;
+                int mvcost0 = mvcost(qmv0);
+                int mvcost1 = mvcost(qmv1);
+                int dir = square1_dir[i];
 
-        bmv += square1[bdir] * 2;
+                pixel *fref = ref->fpelPlane + blockOffset + (qmv0.x >> 2) + (qmv0.y >> 2) * ref->lumaStride;
+                int xFrac = qmv0.x & 0x3;
+                int yFrac = qmv0.y & 0x3;
+
+                // TODO: sad_x2
+                if (xFrac == 0 && yFrac == 0)
+                {
+                    intptr_t offset = (dir == 2) + (dir == 1 ? ref->lumaStride : 0);
+                    cost0 = hpelcomp(fenc, FENC_STRIDE, fref, ref->lumaStride) + mvcost0;
+                    cost1 = hpelcomp(fenc, FENC_STRIDE, fref + offset, ref->lumaStride) + mvcost1;
+                }
+                else
+                {
+                    subpelInterpolate2(fref, ref->lumaStride, xFrac, yFrac, dir);
+                    cost0 = hpelcomp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE + (dir == 2)) + mvcost0;
+                    cost1 = hpelcomp(fenc, FENC_STRIDE, subpelbuf + (dir == 2) + (dir == 1 ? FENC_STRIDE : 0), FENC_STRIDE + (dir == 2)) + mvcost1;
+                }
+                COPY2_IF_LT(bcost, cost0, bdir, i+0);
+                COPY2_IF_LT(bcost, cost1, bdir, i+1);
+            }
+            bmv += square1[bdir] * 2;
+        }
     }
     /* if HPEL search used SAD, remeasure with SATD before QPEL */
     if (!wl.hpel_satd)
@@ -1125,3 +1164,28 @@
         return cmp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE);
     }
 }
+
+void MotionEstimate::subpelInterpolate2(pixel *fref, intptr_t lumaStride, int xFrac, int yFrac, int dir) // fills subpelbuf with an interpolated block enlarged by one column (dir == 2) or one row (dir == 1)
+{
+    assert(yFrac | xFrac); // at least one fraction must be non-zero; the caller compares full-pel positions directly without interpolating
+
+    int realWidth = blockwidth + (dir == 2); // one extra column when dir == 2
+    int realHeight = blockheight + (dir == 1); // one extra row when dir == 1
+    intptr_t realStride = FENC_STRIDE + (dir == 2); // output stride grows with the extra column; the caller reads subpelbuf with this same stride
+
+    if (yFrac == 0)
+    {
+        primitives.ipfilter_pp[FILTER_H_P_P_8](fref, lumaStride, subpelbuf, realStride, realWidth, realHeight, g_lumaFilter[xFrac]); // single pass using the xFrac coefficients (presumably horizontal-only, per the FILTER_H name)
+    }
+    else if (xFrac == 0)
+    {
+        primitives.ipfilter_pp[FILTER_V_P_P_8](fref, lumaStride, subpelbuf, realStride, realWidth, realHeight, g_lumaFilter[yFrac]); // single pass using the yFrac coefficients (presumably vertical-only, per the FILTER_V name)
+    }
+    else
+    {
+        int filterSize = NTAPS_LUMA;
+        int halfFilterSize = (filterSize >> 1);
+        primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * lumaStride, lumaStride, immedVal, realWidth, realWidth, realHeight + filterSize - 1, g_lumaFilter[xFrac]); // first pass into immedVal: starts (halfFilterSize - 1) rows above fref and produces filterSize - 1 extra rows
+        primitives.ipfilter_sp[FILTER_V_S_P_8](immedVal + (halfFilterSize - 1) * realWidth, realWidth, subpelbuf, realStride, realWidth, realHeight, g_lumaFilter[yFrac]); // second pass from immedVal into subpelbuf, skipping the (halfFilterSize - 1) leading rows added by the first pass
+    }
+}
diff -r bdd26fd0325a -r 57efca19f5b8 source/encoder/motion.h
--- a/source/encoder/motion.h	Tue Sep 24 15:21:06 2013 -0500
+++ b/source/encoder/motion.h	Wed Sep 25 13:29:12 2013 +0800
@@ -95,6 +95,7 @@
     int motionEstimate(ReferencePlanes *ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv);
 
     int subpelCompare(ReferencePlanes *ref, const MV & qmv, pixelcmp_t);
+    void subpelInterpolate2(pixel *fref, intptr_t lumaStride, int xFrac, int yFrac, int dir);
 
 protected: