[x264-devel] commit: Further reduce code size in bime (Jason Garrett-Glaser)
git version control
git at videolan.org
Fri Oct 30 03:13:34 CET 2009
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Thu Oct 29 12:28:37 2009 -0700| [fe83a906ee1bb5170b112de717818e278ff59ddb] | committer: Jason Garrett-Glaser
Further reduce code size in bime
~7-8 kilobytes saved, ~0.6% faster subme 9.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=fe83a906ee1bb5170b112de717818e278ff59ddb
---
encoder/me.c | 124 ++++++++++++++++++++++------------------------------------
encoder/me.h | 14 ++++++-
2 files changed, 59 insertions(+), 79 deletions(-)
diff --git a/encoder/me.c b/encoder/me.c
index e7dc007..6690ce0 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -866,72 +866,16 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
src##list[i] = h->mc.get_ref( pixy_buf[list][i], &stride##list[i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh ); \
if( rd )\
{\
- if( h->mb.b_interlaced & ref##list )\
- mvy += (h->mb.i_mb_y & 1)*4 - 2;\
- h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy, bw>>1, bh>>1 );\
- h->mc.mc_chroma( pixv_buf[list][i], 8, m->p_fref[5], m->i_stride[1], mvx, mvy, bw>>1, bh>>1 );\
+ h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
+ h->mc.mc_chroma( pixv_buf[list][i], 8, m->p_fref[5], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
}\
}
-#define BIME_CACHE2(a,b,list) \
- BIME_CACHE(a,b,list) \
- BIME_CACHE(-(a),-(b),list)
-
-#define BIME_CACHE8(list) \
-{\
- BIME_CACHE2( 1, 0, list );\
- BIME_CACHE2( 0, 1, list );\
- BIME_CACHE2( 1, 1, list );\
- BIME_CACHE2( 1,-1, list );\
-}
-
#define SATD_THRESH 17/16
-#define COST_BIMV_SATD( m0x, m0y, m1x, m1y ) \
-if( pass == 0 || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) ) \
-{ \
- int cost; \
- int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y); \
- int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y); \
- visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));\
- h->mc.avg[i_pixel]( pix, FDEC_STRIDE, src0[i0], stride0[i0], src1[i1], stride1[i1], i_weight ); \
- cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ) \
- + p_cost_m0x[ m0x ] + p_cost_m0y[ m0y ] \
- + p_cost_m1x[ m1x ] + p_cost_m1y[ m1y ]; \
- if( rd ) \
- { \
- if( cost < bcost * SATD_THRESH ) \
- { \
- uint64_t costrd; \
- if( cost < bcost ) \
- bcost = cost; \
- *(uint32_t*)cache0_mv = *(uint32_t*)cache0_mv2 = pack16to32_mask(m0x,m0y); \
- *(uint32_t*)cache1_mv = *(uint32_t*)cache1_mv2 = pack16to32_mask(m1x,m1y); \
- h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );\
- h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );\
- costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel ); \
- if( costrd < bcostrd ) \
- {\
- bcostrd = costrd;\
- bm0x = m0x; \
- bm0y = m0y; \
- bm1x = m1x; \
- bm1y = m1y; \
- }\
- } \
- } \
- else if( cost < bcost ) \
- { \
- bcost = cost; \
- bm0x = m0x; \
- bm0y = m0y; \
- bm1x = m1x; \
- bm1y = m1y; \
- } \
-}
-
-#define CHECK_BIDIR(a,b,c,d) \
- COST_BIMV_SATD(om0x+a, om0y+b, om1x+c, om1y+d)
+/* Don't unroll the BIME_CACHE loop. I couldn't find any way to force this
+ * other than making its iteration count not a compile-time constant. */
+int x264_iter_kludge = 0;
static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2, int rd )
{
@@ -955,8 +899,10 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
uint8_t *pix = &h->mb.pic.p_fdec[0][(i8>>1)*8*FDEC_STRIDE+(i8&1)*8];
uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
- int ref0 = h->mb.cache.ref[0][x264_scan8[i8*4]];
- int ref1 = h->mb.cache.ref[1][x264_scan8[i8*4]];
+ const int ref0 = h->mb.cache.ref[0][x264_scan8[i8*4]];
+ const int ref1 = h->mb.cache.ref[1][x264_scan8[i8*4]];
+ const int mv0y_offset = h->mb.b_interlaced & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ const int mv1y_offset = h->mb.b_interlaced & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
int stride0[9];
int stride1[9];
int bm0x = m0->mv[0], om0x = bm0x;
@@ -971,7 +917,8 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
/* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
ALIGNED_ARRAY_16( uint8_t, visited,[8],[8][8] );
/* all permutations of an offset in up to 2 of the dimensions */
- static const int8_t dia4d[32][4] = {
+ static const int8_t dia4d[33][4] = {
+ {0,0,0,0},
{0,0,0,1}, {0,0,0,-1}, {0,0,1,0}, {0,0,-1,0},
{0,1,0,0}, {0,-1,0,0}, {1,0,0,0}, {-1,0,0,0},
{0,0,1,1}, {0,0,-1,-1},{0,1,1,0}, {0,-1,-1,0},
@@ -988,10 +935,6 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
h->mc.memzero_aligned( visited, sizeof(uint8_t[8][8][8]) );
- BIME_CACHE( 0, 0, 0 );
- BIME_CACHE( 0, 0, 1 );
- CHECK_BIDIR( 0, 0, 0, 0 );
-
for( pass = 0; pass < 8; pass++ )
{
/* check all mv pairs that differ in at most 2 components from the current mvs. */
@@ -999,12 +942,44 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
* from bidir ME are the same with and without chroma ME. */
if( mc_list0 )
- BIME_CACHE8( 0 );
+ for( j = x264_iter_kludge; j < 9; j++ )
+ BIME_CACHE( square1[j][0], square1[j][1], 0 );
+
if( mc_list1 )
- BIME_CACHE8( 1 );
+ for( j = x264_iter_kludge; j < 9; j++ )
+ BIME_CACHE( square1[j][0], square1[j][1], 1 );
- for( j=0; j<32; j++ )
- CHECK_BIDIR( dia4d[j][0], dia4d[j][1], dia4d[j][2], dia4d[j][3] );
+ for( j = !!pass; j < 33; j++ )
+ {
+ int m0x = dia4d[j][0] + om0x;
+ int m0y = dia4d[j][1] + om0y;
+ int m1x = dia4d[j][2] + om1x;
+ int m1y = dia4d[j][3] + om1y;
+ if( !pass || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) )
+ {
+ int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y);
+ int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y);
+ visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));
+ h->mc.avg[i_pixel]( pix, FDEC_STRIDE, src0[i0], stride0[i0], src1[i1], stride1[i1], i_weight );
+ int cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE )
+ + p_cost_m0x[m0x] + p_cost_m0y[m0y] + p_cost_m1x[m1x] + p_cost_m1y[m1y];
+ if( rd )
+ {
+ if( cost < bcost * SATD_THRESH )
+ {
+ bcost = X264_MIN( cost, bcost );
+ *(uint32_t*)cache0_mv = *(uint32_t*)cache0_mv2 = pack16to32_mask(m0x,m0y);
+ *(uint32_t*)cache1_mv = *(uint32_t*)cache1_mv2 = pack16to32_mask(m1x,m1y);
+ h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );
+ h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );
+ uint64_t costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel );
+ COPY5_IF_LT( bcostrd, costrd, bm0x, m0x, bm0y, m0y, bm1x, m1x, bm1y, m1y );
+ }
+ }
+ else
+ COPY5_IF_LT( bcost, cost, bm0x, m0x, bm0y, m0y, bm1x, m1x, bm1y, m1y );
+ }
+ }
mc_list0 = (om0x-bm0x)|(om0y-bm0y);
mc_list1 = (om1x-bm1x)|(om1y-bm1y);
@@ -1015,11 +990,6 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
om0y = bm0y;
om1x = bm1x;
om1y = bm1y;
-
- if( mc_list0 )
- BIME_CACHE( 0, 0, 0 );
- if( mc_list1 )
- BIME_CACHE( 0, 0, 1 );
}
m0->mv[0] = bm0x;
diff --git a/encoder/me.h b/encoder/me.h
index 0122b8b..ed75da3 100644
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -84,13 +84,23 @@ if((y)<(x))\
(c)=(d);\
}
-#define COPY4_IF_LT(x,y,a,b,c,d,f,e)\
+#define COPY4_IF_LT(x,y,a,b,c,d,e,f)\
if((y)<(x))\
{\
(x)=(y);\
(a)=(b);\
(c)=(d);\
- (f)=(e);\
+ (e)=(f);\
+}
+
+#define COPY5_IF_LT(x,y,a,b,c,d,e,f,g,h)\
+if((y)<(x))\
+{\
+ (x)=(y);\
+ (a)=(b);\
+ (c)=(d);\
+ (e)=(f);\
+ (g)=(h);\
}
#define COPY2_IF_GT(x,y,a,b)\
More information about the x264-devel mailing list