[x264-devel] commit: Further reduce code size in bime (Jason Garrett-Glaser)

Fri Oct 30 03:13:34 CET 2009

x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Thu Oct 29 12:28:37 2009 -0700| [fe83a906ee1bb5170b112de717818e278ff59ddb] | committer: Jason Garrett-Glaser 

Further reduce code size in bime
~7-8 kilobytes saved; subme 9 is ~0.6% faster.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=fe83a906ee1bb5170b112de717818e278ff59ddb
---

 encoder/me.c |  124 ++++++++++++++++++++++------------------------------------
 encoder/me.h |   14 ++++++-
 2 files changed, 59 insertions(+), 79 deletions(-)

diff --git a/encoder/me.c b/encoder/me.c
index e7dc007..6690ce0 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -866,72 +866,16 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
     src##list[i] = h->mc.get_ref( pixy_buf[list][i], &stride##list[i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh ); \
     if( rd )\
     {\
-        if( h->mb.b_interlaced & ref##list )\
-            mvy += (h->mb.i_mb_y & 1)*4 - 2;\
-        h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy, bw>>1, bh>>1 );\
-        h->mc.mc_chroma( pixv_buf[list][i], 8, m->p_fref[5], m->i_stride[1], mvx, mvy, bw>>1, bh>>1 );\
+        h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
+        h->mc.mc_chroma( pixv_buf[list][i], 8, m->p_fref[5], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
     }\
 }
 
-#define BIME_CACHE2(a,b,list) \
-    BIME_CACHE(a,b,list) \
-    BIME_CACHE(-(a),-(b),list)
-
-#define BIME_CACHE8(list) \
-{\
-    BIME_CACHE2( 1, 0, list );\
-    BIME_CACHE2( 0, 1, list );\
-    BIME_CACHE2( 1, 1, list );\
-    BIME_CACHE2( 1,-1, list );\
-}
-
 #define SATD_THRESH 17/16
 
-#define COST_BIMV_SATD( m0x, m0y, m1x, m1y ) \
-if( pass == 0 || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) ) \
-{ \
-    int cost; \
-    int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y); \
-    int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y); \
-    visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));\
-    h->mc.avg[i_pixel]( pix, FDEC_STRIDE, src0[i0], stride0[i0], src1[i1], stride1[i1], i_weight ); \
-    cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ) \
-         + p_cost_m0x[ m0x ] + p_cost_m0y[ m0y ] \
-         + p_cost_m1x[ m1x ] + p_cost_m1y[ m1y ]; \
-    if( rd ) \
-    { \
-        if( cost < bcost * SATD_THRESH ) \
-        { \
-            uint64_t costrd; \
-            if( cost < bcost ) \
-                bcost = cost; \
-            *(uint32_t*)cache0_mv = *(uint32_t*)cache0_mv2 = pack16to32_mask(m0x,m0y); \
-            *(uint32_t*)cache1_mv = *(uint32_t*)cache1_mv2 = pack16to32_mask(m1x,m1y); \
-            h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );\
-            h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );\
-            costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel ); \
-            if( costrd < bcostrd ) \
-            {\
-                bcostrd = costrd;\
-                bm0x = m0x;      \
-                bm0y = m0y;      \
-                bm1x = m1x;      \
-                bm1y = m1y;      \
-            }\
-        } \
-    } \
-    else if( cost < bcost ) \
-    {                  \
-        bcost = cost;  \
-        bm0x = m0x;    \
-        bm0y = m0y;    \
-        bm1x = m1x;    \
-        bm1y = m1y;    \
-    } \
-}
-
-#define CHECK_BIDIR(a,b,c,d) \
-    COST_BIMV_SATD(om0x+a, om0y+b, om1x+c, om1y+d)
+/* Don't unroll the BIME_CACHE loop. I couldn't find any way to force this
+ * other than making its iteration count not a compile-time constant. */
+int x264_iter_kludge = 0;
 
 static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2, int rd )
 {
@@ -955,8 +899,10 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
     uint8_t *pix  = &h->mb.pic.p_fdec[0][(i8>>1)*8*FDEC_STRIDE+(i8&1)*8];
     uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
     uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
-    int ref0 = h->mb.cache.ref[0][x264_scan8[i8*4]];
-    int ref1 = h->mb.cache.ref[1][x264_scan8[i8*4]];
+    const int ref0 = h->mb.cache.ref[0][x264_scan8[i8*4]];
+    const int ref1 = h->mb.cache.ref[1][x264_scan8[i8*4]];
+    const int mv0y_offset = h->mb.b_interlaced & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+    const int mv1y_offset = h->mb.b_interlaced & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
     int stride0[9];
     int stride1[9];
     int bm0x = m0->mv[0], om0x = bm0x;
@@ -971,7 +917,8 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
     /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
     ALIGNED_ARRAY_16( uint8_t, visited,[8],[8][8] );
     /* all permutations of an offset in up to 2 of the dimensions */
-    static const int8_t dia4d[32][4] = {
+    static const int8_t dia4d[33][4] = {
+        {0,0,0,0},
         {0,0,0,1}, {0,0,0,-1}, {0,0,1,0}, {0,0,-1,0},
         {0,1,0,0}, {0,-1,0,0}, {1,0,0,0}, {-1,0,0,0},
         {0,0,1,1}, {0,0,-1,-1},{0,1,1,0}, {0,-1,-1,0},
@@ -988,10 +935,6 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
 
     h->mc.memzero_aligned( visited, sizeof(uint8_t[8][8][8]) );
 
-    BIME_CACHE( 0, 0, 0 );
-    BIME_CACHE( 0, 0, 1 );
-    CHECK_BIDIR( 0, 0, 0, 0 );
-
     for( pass = 0; pass < 8; pass++ )
     {
         /* check all mv pairs that differ in at most 2 components from the current mvs. */
@@ -999,12 +942,44 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
          * from bidir ME are the same with and without chroma ME. */
 
         if( mc_list0 )
-            BIME_CACHE8( 0 );
+            for( j = x264_iter_kludge; j < 9; j++ )
+                BIME_CACHE( square1[j][0], square1[j][1], 0 );
+
         if( mc_list1 )
-            BIME_CACHE8( 1 );
+            for( j = x264_iter_kludge; j < 9; j++ )
+                BIME_CACHE( square1[j][0], square1[j][1], 1 );
 
-        for( j=0; j<32; j++ )
-            CHECK_BIDIR( dia4d[j][0], dia4d[j][1], dia4d[j][2], dia4d[j][3] );
+        for( j = !!pass; j < 33; j++ )
+        {
+            int m0x = dia4d[j][0] + om0x;
+            int m0y = dia4d[j][1] + om0y;
+            int m1x = dia4d[j][2] + om1x;
+            int m1y = dia4d[j][3] + om1y;
+            if( !pass || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) )
+            {
+                int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y);
+                int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y);
+                visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));
+                h->mc.avg[i_pixel]( pix, FDEC_STRIDE, src0[i0], stride0[i0], src1[i1], stride1[i1], i_weight );
+                int cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE )
+                         + p_cost_m0x[m0x] + p_cost_m0y[m0y] + p_cost_m1x[m1x] + p_cost_m1y[m1y];
+                if( rd )
+                {
+                    if( cost < bcost * SATD_THRESH )
+                    {
+                        bcost = X264_MIN( cost, bcost );
+                        *(uint32_t*)cache0_mv = *(uint32_t*)cache0_mv2 = pack16to32_mask(m0x,m0y);
+                        *(uint32_t*)cache1_mv = *(uint32_t*)cache1_mv2 = pack16to32_mask(m1x,m1y);
+                        h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );
+                        h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );
+                        uint64_t costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel );
+                        COPY5_IF_LT( bcostrd, costrd, bm0x, m0x, bm0y, m0y, bm1x, m1x, bm1y, m1y );
+                    }
+                }
+                else
+                    COPY5_IF_LT( bcost, cost, bm0x, m0x, bm0y, m0y, bm1x, m1x, bm1y, m1y );
+            }
+        }
 
         mc_list0 = (om0x-bm0x)|(om0y-bm0y);
         mc_list1 = (om1x-bm1x)|(om1y-bm1y);
@@ -1015,11 +990,6 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
         om0y = bm0y;
         om1x = bm1x;
         om1y = bm1y;
-
-        if( mc_list0 )
-            BIME_CACHE( 0, 0, 0 );
-        if( mc_list1 )
-            BIME_CACHE( 0, 0, 1 );
     }
 
     m0->mv[0] = bm0x;
diff --git a/encoder/me.h b/encoder/me.h
index 0122b8b..ed75da3 100644
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -84,13 +84,23 @@ if((y)<(x))\
     (c)=(d);\
 }
 
-#define COPY4_IF_LT(x,y,a,b,c,d,f,e)\
+#define COPY4_IF_LT(x,y,a,b,c,d,e,f)\
 if((y)<(x))\
 {\
     (x)=(y);\
     (a)=(b);\
     (c)=(d);\
-    (f)=(e);\
+    (e)=(f);\
+}
+
+#define COPY5_IF_LT(x,y,a,b,c,d,e,f,g,h)\
+if((y)<(x))\
+{\
+    (x)=(y);\
+    (a)=(b);\
+    (c)=(d);\
+    (e)=(f);\
+    (g)=(h);\
 }
 
 #define COPY2_IF_GT(x,y,a,b)\