[x265] [PATCH 2 of 4] disable interpolate horizontal merge

Min Chen chenm003 at 163.com
Mon Oct 28 15:24:34 CET 2013


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1382970193 -28800
# Node ID 6bc4a571ed625cd23503810cfaa5256a3f32a20d
# Parent  4a14bd24b6572ed1ffcf9dcfa9f4c841adc62211
disable interpolate horizontal merge

The asm code needs the width to be a multiple of 4, and the maskmovq is very expensive.

diff -r 4a14bd24b657 -r 6bc4a571ed62 source/encoder/motion.cpp
--- a/source/encoder/motion.cpp	Mon Oct 28 22:22:11 2013 +0800
+++ b/source/encoder/motion.cpp	Mon Oct 28 22:23:13 2013 +0800
@@ -854,9 +854,19 @@
                 }
                 else
                 {
-                    subpelInterpolate(ref, qmv0, dir);
-                    cost0 = hpelcomp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE + (dir == 2)) + mvcost0;
-                    cost1 = hpelcomp(fenc, FENC_STRIDE, subpelbuf + (dir == 2) + (dir == 1 ? FENC_STRIDE : 0), FENC_STRIDE + (dir == 2)) + mvcost1;
+                    if (dir == 1)
+                    {
+                        subpelInterpolate(ref, qmv0, 1);
+                        cost0 = hpelcomp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE + (dir == 2)) + mvcost0;
+                        cost1 = hpelcomp(fenc, FENC_STRIDE, subpelbuf + (dir == 2) + (dir == 1 ? FENC_STRIDE : 0), FENC_STRIDE + (dir == 2)) + mvcost1;
+                    }
+                    else
+                    {
+                        subpelInterpolate(ref, qmv0, 0);
+                        cost0 = hpelcomp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE) + mvcost0;
+                        subpelInterpolate(ref, qmv1, 0);
+                        cost1 = hpelcomp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE) + mvcost1;
+                    }
                 }
                 COPY2_IF_LT(bcost, cost0, bdir, i + 0);
                 COPY2_IF_LT(bcost, cost1, bdir, i + 1);
@@ -1179,9 +1189,9 @@
     int yFrac = qmv.y & 0x3;
 
     assert(yFrac | xFrac);
-    int realWidth = blockwidth + (dir == 2);
+    assert(dir != 2);
+    assert((blockwidth % 4) == 0);
     int realHeight = blockheight + (dir == 1);
-    intptr_t realStride = FENC_STRIDE + (dir == 2);
     pixel *fref = ref->unweightedFPelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride;
     int shiftNum = IF_INTERNAL_PREC - X265_DEPTH;
     int local_shift = ref->shift + shiftNum;
@@ -1190,39 +1200,39 @@
     {
         if (yFrac == 0)
         {
-            primitives.ipfilter_ps[FILTER_H_P_S_8](fref, ref->lumaStride, immedVal, realStride, realWidth, realHeight, g_lumaFilter[xFrac]);
-            primitives.weightpUni(immedVal, subpelbuf, realStride, realStride, realWidth, realHeight, ref->weight, local_round, local_shift, ref->offset);
+            primitives.ipfilter_ps[FILTER_H_P_S_8](fref, ref->lumaStride, immedVal, FENC_STRIDE, blockwidth, realHeight, g_lumaFilter[xFrac]);
+            primitives.weightpUni(immedVal, subpelbuf, FENC_STRIDE, FENC_STRIDE, blockwidth, realHeight, ref->weight, local_round, local_shift, ref->offset);
         }
         else if (xFrac == 0)
         {
-            primitives.ipfilter_ps[FILTER_V_P_S_8](fref, ref->lumaStride, immedVal, realStride, realWidth, realHeight, g_lumaFilter[yFrac]);
-            primitives.weightpUni(immedVal, subpelbuf, realStride, realStride, realWidth, realHeight, ref->weight, local_round, local_shift, ref->offset);
+            primitives.ipfilter_ps[FILTER_V_P_S_8](fref, ref->lumaStride, immedVal, FENC_STRIDE, blockwidth, realHeight, g_lumaFilter[yFrac]);
+            primitives.weightpUni(immedVal, subpelbuf, FENC_STRIDE, FENC_STRIDE, blockwidth, realHeight, ref->weight, local_round, local_shift, ref->offset);
         }
         else
         {
             int filterSize = NTAPS_LUMA;
             int halfFilterSize = (filterSize >> 1);
-            primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * ref->lumaStride, ref->lumaStride, immedVal, realWidth, realWidth, realHeight + filterSize - 1, g_lumaFilter[xFrac]);
-            primitives.ipfilter_ss[FILTER_V_S_S_8](immedVal + (halfFilterSize - 1) * realWidth, realWidth, immedVal2, realStride, realWidth, realHeight, g_lumaFilter[yFrac]);
-            primitives.weightpUni(immedVal2, subpelbuf, realStride, realStride, realWidth, realHeight, ref->weight, local_round, local_shift, ref->offset);
+            primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * ref->lumaStride, ref->lumaStride, immedVal, blockwidth, blockwidth, realHeight + filterSize - 1, g_lumaFilter[xFrac]);
+            primitives.ipfilter_ss[FILTER_V_S_S_8](immedVal + (halfFilterSize - 1) * blockwidth, blockwidth, immedVal2, FENC_STRIDE, blockwidth, realHeight, g_lumaFilter[yFrac]);
+            primitives.weightpUni(immedVal2, subpelbuf, FENC_STRIDE, FENC_STRIDE, blockwidth, realHeight, ref->weight, local_round, local_shift, ref->offset);
         }
     }
     else
     {
         if (yFrac == 0)
         {
-            primitives.ipfilter_pp[FILTER_H_P_P_8](fref, ref->lumaStride, subpelbuf, realStride, realWidth, realHeight, g_lumaFilter[xFrac]);
+            primitives.ipfilter_pp[FILTER_H_P_P_8](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, blockwidth, realHeight, g_lumaFilter[xFrac]);
         }
         else if (xFrac == 0)
         {
-            primitives.ipfilter_pp[FILTER_V_P_P_8](fref, ref->lumaStride, subpelbuf, realStride, realWidth, realHeight, g_lumaFilter[yFrac]);
+            primitives.ipfilter_pp[FILTER_V_P_P_8](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, blockwidth, realHeight, g_lumaFilter[yFrac]);
         }
         else
         {
             int filterSize = NTAPS_LUMA;
             int halfFilterSize = (filterSize >> 1);
-            primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * ref->lumaStride, ref->lumaStride, immedVal, realWidth, realWidth, realHeight + filterSize - 1, g_lumaFilter[xFrac]);
-            primitives.ipfilter_sp[FILTER_V_S_P_8](immedVal + (halfFilterSize - 1) * realWidth, realWidth, subpelbuf, realStride, realWidth, realHeight, g_lumaFilter[yFrac]);
+            primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * ref->lumaStride, ref->lumaStride, immedVal, blockwidth, blockwidth, realHeight + filterSize - 1, g_lumaFilter[xFrac]);
+            primitives.ipfilter_sp[FILTER_V_S_P_8](immedVal + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, FENC_STRIDE, blockwidth, realHeight, g_lumaFilter[yFrac]);
         }
     }
 }



More information about the x265-devel mailing list