[x264-devel] [PATCH] subpel refinement

Sat Sep 11 21:50:36 CEST 2004

This patch improves subpel motion estimation: previously, MVs with both
coordinates at half-pel positions, or both at quarter-pel positions, were unreachable.

i_subpel_refine values (maximum number of search iterations at each subpel step size):
1 => same as before (default)
2 => all MVs reachable, ~10% slower, 2-3% better compression
>2 => more iterations, only a small further improvement

Further optimization may be possible: I'll look into caching of 
motion-compensated pixels or satd results to avoid duplicate 
h->mc[MC_LUMA]() calls.

Benchmarks:
mencoder tng.yuv -o x1_subq1.avi -ovc x264 -x264encopts 
qp_constant=22:fullinter:cabac=1:iframe=100:psnr:subq=1

==> x1_subq1.log <==
x264 [info]: PSNR Global:45.32 kb/s:1161.8 fps:15.747
69.970u 2.930s 1:20.18 90.9%    0+0k 0+0io 1094pf+0w

==> x1_subq2.log <==
x264 [info]: PSNR Global:45.35 kb/s:1135.5 fps:14.531
75.220u 3.150s 1:25.22 91.9%    0+0k 0+0io 1094pf+0w

==> x1_subq3.log <==
x264 [info]: PSNR Global:45.35 kb/s:1130.8 fps:14.110
77.420u 3.150s 1:29.73 89.7%    0+0k 0+0io 1094pf+0w

==> x1_subq4.log <==
x264 [info]: PSNR Global:45.35 kb/s:1128.0 fps:14.144
77.810u 3.020s 1:28.06 91.7%    0+0k 0+0io 1094pf+0w

==> x1_subq6.log <==
x264 [info]: PSNR Global:45.35 kb/s:1126.8 fps:14.198
77.890u 3.220s 1:28.68 91.4%    0+0k 0+0io 1094pf+0w


--Loren Merritt
-------------- next part --------------
Index: encoder/me.c
===================================================================

--- encoder/me.c	(revision 46)
+++ encoder/me.c	(working copy)
@@ -124,6 +124,7 @@
 {
     const int bw = x264_pixel_size[m->i_pixel].w;
     const int bh = x264_pixel_size[m->i_pixel].h;
+    int step, iter;
 
     DECLARE_ALIGNED( uint8_t, pix[4][16*16], 16 );
     int cost[4];
@@ -132,60 +133,39 @@
     int bmx = m->mv[0];
     int bmy = m->mv[1];
 
-    h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[0], 16, bmx + 0, bmy - 2, bw, bh );
-    h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[1], 16, bmx + 0, bmy + 2, bw, bh );
-    h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[2], 16, bmx - 2, bmy + 0, bw, bh );
-    h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[3], 16, bmx + 2, bmy + 0, bw, bh );
-
-    cost[0] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[0], 16 ) +
-              m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy - 2 - m->mvp[1] ) );
-    cost[1] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[1], 16 ) +
-              m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy + 2 - m->mvp[1] ) );
-    cost[2] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[2], 16 ) +
-              m->lm * ( bs_size_se( bmx - 2 - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
-    cost[3] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[3], 16 ) +
-              m->lm * ( bs_size_se( bmx + 2 - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
-
-    best = 0;
-    if( cost[1] < cost[0] )    best = 1;
-    if( cost[2] < cost[best] ) best = 2;
-    if( cost[3] < cost[best] ) best = 3;
-
-    if( cost[best] < m->cost )
+    for( step = 2; step >= 1; step-- )
     {
-        m->cost = cost[best];
-        if( best == 0 )      bmy -= 2;
-        else if( best == 1 ) bmy += 2;
-        else if( best == 2 ) bmx -= 2;
-        else if( best == 3 ) bmx += 2;
-    }
+        for( iter = 0; iter < h->param.analyse.i_subpel_refine; iter++ )
+        {
+            h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[0], 16, bmx + 0, bmy - step, bw, bh );
+            h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[1], 16, bmx + 0, bmy + step, bw, bh );
+            h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[2], 16, bmx - step, bmy + 0, bw, bh );
+            h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[3], 16, bmx + step, bmy + 0, bw, bh );
 
-    h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[0], 16, bmx + 0, bmy - 1, bw, bh );
-    h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[1], 16, bmx + 0, bmy + 1, bw, bh );
-    h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[2], 16, bmx - 1, bmy + 0, bw, bh );
-    h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[3], 16, bmx + 1, bmy + 0, bw, bh );
+            cost[0] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[0], 16 ) +
+                      m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy - step - m->mvp[1] ) );
+            cost[1] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[1], 16 ) +
+                      m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy + step - m->mvp[1] ) );
+            cost[2] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[2], 16 ) +
+                      m->lm * ( bs_size_se( bmx - step - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
+            cost[3] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[3], 16 ) +
+                      m->lm * ( bs_size_se( bmx + step - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
 
-    cost[0] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[0], 16 ) +
-              m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy - 1 - m->mvp[1] ) );
-    cost[1] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[1], 16 ) +
-              m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy + 1 - m->mvp[1] ) );
-    cost[2] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[2], 16 ) +
-              m->lm * ( bs_size_se( bmx - 1 - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
-    cost[3] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[3], 16 ) +
-              m->lm * ( bs_size_se( bmx + 1 - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
+            best = 0;
+            if( cost[1] < cost[0] )    best = 1;
+            if( cost[2] < cost[best] ) best = 2;
+            if( cost[3] < cost[best] ) best = 3;
 
-    best = 0;
-    if( cost[1] < cost[0] )    best = 1;
-    if( cost[2] < cost[best] ) best = 2;
-    if( cost[3] < cost[best] ) best = 3;
-
-    if( cost[best] < m->cost )
-    {
-        m->cost = cost[best];
-        if( best == 0 )      bmy--;
-        else if( best == 1 ) bmy++;
-        else if( best == 2 ) bmx--;
-        else if( best == 3 ) bmx++;
+            if( cost[best] < m->cost )
+            {
+                m->cost = cost[best];
+                if( best == 0 )      bmy -= step;
+                else if( best == 1 ) bmy += step;
+                else if( best == 2 ) bmx -= step;
+                else if( best == 3 ) bmx += step;
+            }
+            else break;
+        }
     }
 
     m->mv[0] = bmx;
Index: x264.c
===================================================================
--- x264.c	(revision 46)
+++ x264.c	(working copy)
@@ -132,6 +132,7 @@
              "                                  - i4x4\n"
              "                                  - psub16x16,psub8x8\n"
              "                                  - none, all\n"
+             "      --subq <integer>            Subpixel motion estimation quality\n"
              "\n"
              "  -s, --sar width:height      Specify Sample Aspect Ratio\n"
              "  -o, --output                Specify output file\n"
@@ -176,6 +177,7 @@
 #define OPT_QCOMP 266
 #define OPT_NOPSNR 267
 #define OPT_QUIET 268
+#define OPT_SUBQ 269
 
         static struct option long_options[] =
         {
@@ -196,6 +198,7 @@
             { "sar",     required_argument, NULL, 's' },
             { "output",  required_argument, NULL, 'o' },
             { "analyse", required_argument, NULL, 'A' },
+            { "subq",    required_argument, NULL, OPT_SUBQ },
             { "rcsens",  required_argument, NULL, OPT_RCSENS },
             { "rcbuf",   required_argument, NULL, OPT_RCBUF },
             { "rcinitbuf",required_argument, NULL, OPT_RCIBUF },
@@ -304,6 +307,9 @@
                 if( strstr( optarg, "psub16x16" ) ) param->analyse.inter |= X264_ANALYSE_PSUB16x16;
                 if( strstr( optarg, "psub8x8" ) )   param->analyse.inter |= X264_ANALYSE_PSUB8x8;
                 break;
+            case OPT_SUBQ:
+                param->analyse.i_subpel_refine = atoi(optarg);
+                break;
             case OPT_RCBUF:
                 param->rc.i_rc_buffer_size = atoi(optarg);
                 break;
Index: core/common.c
===================================================================
--- core/common.c	(revision 46)
+++ core/common.c	(working copy)
@@ -96,6 +96,7 @@
     /* */
     param->analyse.intra = X264_ANALYSE_I4x4;
     param->analyse.inter = X264_ANALYSE_I4x4 | X264_ANALYSE_PSUB16x16;
+    param->analyse.i_subpel_refine = 1;
     param->analyse.b_psnr = 1;
 }
 
Index: x264.h
===================================================================
--- x264.h	(revision 46)
+++ x264.h	(working copy)
@@ -124,6 +124,8 @@
         unsigned int intra;     /* intra flags */
         unsigned int inter;     /* inter flags */
 
+        int          i_subpel_refine;  /* number of EPZS diamond iterations */
+
         int          b_psnr;    /* Do we compute PSNR stats (save a few % of cpu) */
     } analyse;