[x264-devel] commit: Motion compensation optimizations (Jason Garrett-Glaser)
git version control
git at videolan.org
Fri Oct 30 03:13:33 CET 2009
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Mon Oct 26 12:53:07 2009 -0700| [1f9c733dc27ae279943a3ddd7eaf8a50cbf4dc2d] | committer: Jason Garrett-Glaser
Motion compensation optimizations
Turning off inlining saves a whole boatload of code size for near-zero speed cost.
Simplify offset calculation.
Various other optimizations.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=1f9c733dc27ae279943a3ddd7eaf8a50cbf4dc2d
---
common/macroblock.c | 46 +++++++++++++++++++++++-----------------------
encoder/analyse.c | 24 +++++++++++-------------
2 files changed, 34 insertions(+), 36 deletions(-)
diff --git a/common/macroblock.c b/common/macroblock.c
index b6d9b31..56e771f 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -462,62 +462,62 @@ static void setup_inverse_delta_pocs( x264_t *h )
}
}
-static inline void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
+static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
{
const int i8 = x264_scan8[0]+x+8*y;
const int i_ref = h->mb.cache.ref[0][i8];
- const int mvx = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
- int mvy = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
+ const int mvx = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+ int mvy = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0],
- mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height );
+ mvx, mvy, 4*width, 4*height );
// chroma is offset if MCing from a field of opposite parity
if( h->mb.b_interlaced & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2;
h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- &h->mb.pic.p_fref[0][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+ h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
mvx, mvy, 2*width, 2*height );
h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- &h->mb.pic.p_fref[0][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+ h->mb.pic.p_fref[0][i_ref][5], h->mb.pic.i_stride[2],
mvx, mvy, 2*width, 2*height );
}
-static inline void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
+static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
{
const int i8 = x264_scan8[0]+x+8*y;
const int i_ref = h->mb.cache.ref[1][i8];
- const int mvx = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
- int mvy = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
+ const int mvx = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+ int mvy = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
h->mb.pic.p_fref[1][i_ref], h->mb.pic.i_stride[0],
- mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height );
+ mvx, mvy, 4*width, 4*height );
if( h->mb.b_interlaced & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2;
h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- &h->mb.pic.p_fref[1][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+ h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1],
mvx, mvy, 2*width, 2*height );
h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- &h->mb.pic.p_fref[1][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+ h->mb.pic.p_fref[1][i_ref][5], h->mb.pic.i_stride[2],
mvx, mvy, 2*width, 2*height );
}
-static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
+static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
{
const int i8 = x264_scan8[0]+x+8*y;
const int i_ref0 = h->mb.cache.ref[0][i8];
const int i_ref1 = h->mb.cache.ref[1][i8];
const int weight = h->mb.bipred_weight[i_ref0][i_ref1];
- const int mvx0 = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
- const int mvx1 = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
- int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
- int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
+ const int mvx0 = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+ const int mvx1 = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+ int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
+ int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
int i_mode = x264_size2pixel[height][width];
int i_stride0 = 16, i_stride1 = 16;
ALIGNED_ARRAY_16( uint8_t, tmp0,[16*16] );
@@ -525,9 +525,9 @@ static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int he
uint8_t *src0, *src1;
src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
- mvx0 + 4*4*x, mvy0 + 4*4*y, 4*width, 4*height );
+ mvx0, mvy0, 4*width, 4*height );
src1 = h->mc.get_ref( tmp1, &i_stride1, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
- mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
+ mvx1, mvy1, 4*width, 4*height );
h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
src0, i_stride0, src1, i_stride1, weight );
@@ -536,14 +536,14 @@ static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int he
if( h->mb.b_interlaced & i_ref1 )
mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
- h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+ h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1],
mvx0, mvy0, 2*width, 2*height );
- h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+ h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1],
mvx1, mvy1, 2*width, 2*height );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
- h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+ h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][5], h->mb.pic.i_stride[2],
mvx0, mvy0, 2*width, 2*height );
- h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+ h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][5], h->mb.pic.i_stride[2],
mvx1, mvy1, 2*width, 2*height );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
}
diff --git a/encoder/analyse.c b/encoder/analyse.c
index c2a4e1b..dec5f93 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -1488,20 +1488,23 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a,
if( pixel == PIXEL_4x4 )
{
- CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][0], 0,0 );
- CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][1], 2,0 );
- CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][2], 0,2 );
- CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][3], 2,2 );
+ x264_me_t *m = a->l0.me4x4[i8x8];
+ CHROMA4x4MC( 2,2, m[0], 0,0 );
+ CHROMA4x4MC( 2,2, m[1], 2,0 );
+ CHROMA4x4MC( 2,2, m[2], 0,2 );
+ CHROMA4x4MC( 2,2, m[3], 2,2 );
}
else if( pixel == PIXEL_8x4 )
{
- CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][0], 0,0 );
- CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][1], 0,2 );
+ x264_me_t *m = a->l0.me8x4[i8x8];
+ CHROMA4x4MC( 4,2, m[0], 0,0 );
+ CHROMA4x4MC( 4,2, m[1], 0,2 );
}
else
{
- CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][0], 0,0 );
- CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 );
+ x264_me_t *m = a->l0.me4x8[i8x8];
+ CHROMA4x4MC( 2,4, m[0], 0,0 );
+ CHROMA4x4MC( 2,4, m[1], 2,0 );
}
return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
@@ -1645,11 +1648,6 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
}
}
-#define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \
-{ \
- h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
-}
-
static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
More information about the x264-devel mailing list