[x264-devel] commit: Faster qpel-RD (Jason Garrett-Glaser )

Fri Oct 31 16:57:51 CET 2008

x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Wed Oct 22 02:20:06 2008 -0700| [d0add77f5f084253202747266f85daa65f7fc9cc] | committer: Jason Garrett-Glaser 

Faster qpel-RD
3-4% faster qpel-RD; avoid re-checking bmv/pmv during the hex search.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=d0add77f5f084253202747266f85daa65f7fc9cc
---

 encoder/me.c |   39 ++++++++++++++++++++++++++-------------
 1 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/encoder/me.c b/encoder/me.c
index 25e1195..12c20b0 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -942,18 +942,23 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei
 }
 
 #undef COST_MV_SATD
-#define COST_MV_SATD( mx, my, dst ) \
+#define COST_MV_SATD( mx, my, dst, avoid_mvp ) \
 { \
-    int stride = 16; \
-    uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw*4, bh*4 ); \
-    dst = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
-        + p_cost_mvx[mx] + p_cost_mvy[my]; \
-    COPY1_IF_LT( bsatd, dst ); \
+    if( !avoid_pmv || !(mx == pmx && my == pmy) ) \
+    { \
+        int stride = 16; \
+        uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw*4, bh*4 ); \
+        dst = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+            + p_cost_mvx[mx] + p_cost_mvy[my]; \
+        COPY1_IF_LT( bsatd, dst ); \
+    } \
+    else \
+        dst = COST_MAX; \
 }
 
 #define COST_MV_RD( mx, my, satd, do_dir, mdir ) \
 { \
-    if( satd <= bsatd * SATD_THRESH )\
+    if( satd <= bsatd * SATD_THRESH ) \
     { \
         uint64_t cost; \
         *(uint32_t*)cache_mv = *(uint32_t*)cache_mv2 = pack16to32_mask(mx,my); \
@@ -991,7 +996,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8, int
     pmy = m->mvp[1];
     p_cost_mvx = m->p_cost_mv - pmx;
     p_cost_mvy = m->p_cost_mv - pmy;
-    COST_MV_SATD( bmx, bmy, bsatd );
+    COST_MV_SATD( bmx, bmy, bsatd, 0 );
     COST_MV_RD( bmx, bmy, 0, 0, 0 );
 
     /* check the predicted mv */
@@ -999,16 +1004,24 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8, int
         && pmx >= h->mb.mv_min_spel[0] && pmx <= h->mb.mv_max_spel[0]
         && pmy >= h->mb.mv_min_spel[1] && pmy <= h->mb.mv_max_spel[1] )
     {
-        COST_MV_SATD( pmx, pmy, satd );
+        COST_MV_SATD( pmx, pmy, satd, 0 );
         COST_MV_RD( pmx, pmy, satd, 0,0 );
+        /* The hex motion search is guaranteed to not repeat the center candidate,
+         * so if pmv is chosen, set the "MV to avoid checking" to bmv instead. */
+        if( bmx == pmx && bmy == pmy )
+        {
+            pmx = m->mv[0];
+            pmy = m->mv[1];
+        }
     }
 
     /* subpel hex search, same pattern as ME HEX. */
     dir = -2;
     omx = bmx;
     omy = bmy;
-    for( j=0; j<6; j++ ) COST_MV_SATD( omx + hex2[j+1][0], omy + hex2[j+1][1], satds[j] );
+    for( j=0; j<6; j++ ) COST_MV_SATD( omx + hex2[j+1][0], omy + hex2[j+1][1], satds[j], 1 );
     for( j=0; j<6; j++ ) COST_MV_RD  ( omx + hex2[j+1][0], omy + hex2[j+1][1], satds[j], 1,j );
+
     if( dir != -2 )
     {
         /* half hexagon, not overlapping the previous iteration */
@@ -1021,7 +1034,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8, int
             dir = -2;
             omx = bmx;
             omy = bmy;
-            for( j=0; j<3; j++ ) COST_MV_SATD( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satds[j] );
+            for( j=0; j<3; j++ ) COST_MV_SATD( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satds[j], 1 );
             for( j=0; j<3; j++ ) COST_MV_RD  ( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satds[j], 1, odir-1+j );
             if( dir == -2 )
                 break;
@@ -1031,7 +1044,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8, int
     /* square refine, same as pattern as ME HEX. */
     omx = bmx;
     omy = bmy;
-    for( i=0; i<8; i++ ) COST_MV_SATD( omx + square1[i][0], omy  + square1[i][1], satds[i] );
+    for( i=0; i<8; i++ ) COST_MV_SATD( omx + square1[i][0], omy  + square1[i][1], satds[i], 1 );
     for( i=0; i<8; i++ ) COST_MV_RD  ( omx + square1[i][0], omy  + square1[i][1], satds[i], 0,0 );
 
     bmy = x264_clip3( bmy, h->mb.mv_min_spel[1],  h->mb.mv_max_spel[1] );
@@ -1039,6 +1052,6 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8, int
     m->mv[0] = bmx;
     m->mv[1] = bmy;
     x264_macroblock_cache_mv ( h, 2*(i8&1), i8&2, bw, bh, i_list, pack16to32_mask(bmx, bmy) );
-    x264_macroblock_cache_mvd( h, 2*(i8&1), i8&2, bw, bh, i_list, pack16to32_mask(bmx - pmx, bmy - pmy) );
+    x264_macroblock_cache_mvd( h, 2*(i8&1), i8&2, bw, bh, i_list, pack16to32_mask(bmx - m->mvp[0], bmy - m->mvp[1]) );
 }