[x265] [PATCH] motion: use fast weighted subpel refine

Fri Nov 22 00:30:09 CET 2013

# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1385076566 21600
#      Thu Nov 21 17:29:26 2013 -0600
# Node ID 8f156b97360be563a52743826bded075bd98b267
# Parent  b172259c07f1b7a04deadfeb89469a700f644feb
motion: use fast weighted subpel refine

Don't do the full-blown weighted motion compensation for ME.  Just interpolate
the weighted full-pel pixels. The result is not 100% accurate to the pixels that
will be used to encode the final prediction, but it is close enough for ME.

Testing with sintel_trailer_2k_720p24.y4m at medium preset and all defaults
x265 [info]: 651 of 1124 (57.92%) P frames weighted

before: 1253 frames in 512.74s (2.44 fps), 223.51 kb/s, Global PSNR: 50.552
after:  1253 frames in 410.25s (3.05 fps), 223.59 kb/s, Global PSNR: 50.589

diff -r b172259c07f1 -r 8f156b97360b source/encoder/motion.cpp

--- a/source/encoder/motion.cpp	Thu Nov 21 17:07:34 2013 -0600
+++ b/source/encoder/motion.cpp	Thu Nov 21 17:29:26 2013 -0600
@@ -104,17 +104,11 @@
         init_scales();
 
     fenc = (pixel*)X265_MALLOC(pixel, MAX_CU_SIZE * MAX_CU_SIZE);
-    subpelbuf = (pixel*)X265_MALLOC(pixel, (MAX_CU_SIZE + 1) * (MAX_CU_SIZE + 1));
-    immedVal = (int16_t*)X265_MALLOC(int16_t, (MAX_CU_SIZE + 1) * (MAX_CU_SIZE + 1 + NTAPS_LUMA - 1));
-    immedVal2 = (int16_t*)X265_MALLOC(int16_t, (MAX_CU_SIZE + 1) * (MAX_CU_SIZE + 1 + NTAPS_LUMA - 1));
 }
 
 MotionEstimate::~MotionEstimate()
 {
     X265_FREE(fenc);
-    X265_FREE(subpelbuf);
-    X265_FREE(immedVal);
-    X265_FREE(immedVal2);
 }
 
 void MotionEstimate::setSourcePU(int offset, int width, int height)
@@ -1137,50 +1131,32 @@
     }
     else
     {
-        if (ref->isWeighted)
+        /* We are taking a short-cut here if the reference is weighted. To be
+         * accurate we should be interpolating unweighted pixels and weighting
+         * the final 16bit values prior to rounding and downshifting. Instead we
+         * are simply interpolating the weighted full-pel pixels. Not 100%
+         * accurate but good enough for fast qpel ME */
+        ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
+        pixel *fref = ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride;
+        if (yFrac == 0)
         {
-            int shiftNum = IF_INTERNAL_PREC - X265_DEPTH;
-            int shift = ref->shift + shiftNum;
-            int round = shift ? (1 << (shift - 1)) : 0;
-            pixel *fref = ref->unweightedFPelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride;
-
-            if (yFrac == 0)
-            {
-                primitives.ipfilter_ps[FILTER_H_P_S_8](fref, ref->lumaStride, immedVal, FENC_STRIDE, blockwidth, blockheight, g_lumaFilter[xFrac]);
-                primitives.weight_sp(immedVal, subpelbuf, FENC_STRIDE, FENC_STRIDE, blockwidth, blockheight, ref->weight, round, shift, ref->offset);
-            }
-            else if (xFrac == 0)
-            {
-                primitives.ipfilter_ps[FILTER_V_P_S_8](fref, ref->lumaStride, immedVal, FENC_STRIDE, blockwidth, blockheight, g_lumaFilter[yFrac]);
-                primitives.weight_sp(immedVal, subpelbuf, FENC_STRIDE, FENC_STRIDE, blockwidth, blockheight, ref->weight, round, shift, ref->offset);
-            }
-            else
-            {
-                int filterSize = NTAPS_LUMA;
-                int halfFilterSize = (filterSize >> 1);
-                primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * ref->lumaStride, ref->lumaStride, immedVal, blockwidth, blockwidth, blockheight + filterSize - 1, g_lumaFilter[xFrac]);
-                primitives.ipfilter_ss[FILTER_V_S_S_8](immedVal + (halfFilterSize - 1) * blockwidth, blockwidth, immedVal2, FENC_STRIDE, blockwidth, blockheight, yFrac);
-                primitives.weight_sp(immedVal2, subpelbuf, FENC_STRIDE, FENC_STRIDE, blockwidth, blockheight, ref->weight, round, shift, ref->offset);
-            }
+            primitives.luma_hpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, xFrac);
+        }
+        else if (xFrac == 0)
+        {
+            primitives.luma_vpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, yFrac);
         }
         else
         {
-            pixel *fref = ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride;
-            if (yFrac == 0)
-            {
-                primitives.luma_hpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, xFrac);
-            }
-            else if (xFrac == 0)
-            {
-                primitives.luma_vpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, yFrac);
-            }
-            else
-            {
-                int filterSize = NTAPS_LUMA;
-                int halfFilterSize = (filterSize >> 1);
-                primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * ref->lumaStride, ref->lumaStride, immedVal, blockwidth, blockwidth, blockheight + filterSize - 1, g_lumaFilter[xFrac]);
-                primitives.luma_vsp[partEnum](immedVal + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, FENC_STRIDE, yFrac);
-            }
+            ALIGN_VAR_32(int16_t, immed[64 * (64 + 8)]);
+
+            int filterSize = NTAPS_LUMA;
+            int halfFilterSize = filterSize >> 1;
+            primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * ref->lumaStride, ref->lumaStride,
+                                                   immed, blockwidth,
+                                                   blockwidth, blockheight + filterSize - 1,
+                                                   g_lumaFilter[xFrac]);
+            primitives.luma_vsp[partEnum](immed + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, FENC_STRIDE, yFrac);
         }
         return cmp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE);
     }
diff -r b172259c07f1 -r 8f156b97360b source/encoder/motion.h
--- a/source/encoder/motion.h	Thu Nov 21 17:07:34 2013 -0600
+++ b/source/encoder/motion.h	Thu Nov 21 17:29:26 2013 -0600
@@ -52,9 +52,6 @@
     int subpelRefine;
 
     /* subpel generation buffers */
-    pixel *subpelbuf;
-    int16_t *immedVal;
-    int16_t *immedVal2;
     int blockwidth;
     int blockheight;