[x264-devel] commit: Use aligned memcpy for x264_me_t struct and cosmetics ( Jason Garrett-Glaser )
git version control
git at videolan.org
Mon Jun 16 03:02:27 CEST 2008
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Sun Jun 15 11:51:36 2008 -0600| [63657b5a1fe05034846631377fbd584fd3d5def6]
Use aligned memcpy for x264_me_t struct and cosmetics
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=63657b5a1fe05034846631377fbd584fd3d5def6
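
[Editor's note: for context, the point of this change is that x264_me_t becomes a 16-byte-aligned type (via DECLARE_ALIGNED_16), so plain struct assignment can be replaced by the encoder's SIMD-friendly h->mc.memcpy_aligned routine. A minimal standalone sketch of the idea follows; the struct fields and the SSE2 copy are illustrative assumptions, not x264's actual implementation.]

#include <emmintrin.h>  /* SSE2 intrinsics */
#include <stddef.h>
#include <stdint.h>

/* Illustrative only: a struct forced to 16-byte alignment, roughly what
 * DECLARE_ALIGNED_16( x264_me_t ) achieves (on GCC it expands to
 * __attribute__((aligned(16)))). sizeof such a struct is a multiple of 16. */
typedef struct
{
    int cost_mv;
    int cost;
    int16_t mv[2];
} __attribute__((aligned(16))) me_sketch_t;

/* Sketch of an aligned copy: both pointers are 16-byte aligned and n is a
 * multiple of 16, so aligned SSE2 loads/stores (movdqa) are legal and avoid
 * the overhead of a generic memcpy or field-by-field struct assignment. */
static void memcpy_aligned_sketch( void *dst, const void *src, size_t n )
{
    size_t i;
    for( i = 0; i < n; i += 16 )
        _mm_store_si128( (__m128i*)((char*)dst + i),
                         _mm_load_si128( (const __m128i*)((const char*)src + i) ) );
}
/* usage: memcpy_aligned_sketch( &a, &b, sizeof(me_sketch_t) ); */
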
---
encoder/analyse.c | 96 +++++++++++++++++++++++++-------------------------
encoder/me.h | 2 +-
encoder/slicetype.c | 20 ++++------
3 files changed, 57 insertions(+), 61 deletions(-)
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 17efe03..9200ace 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -1010,7 +1010,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
i_halfpel_thresh += i_ref_cost;
if( m.cost < a->l0.me16x16.cost )
- a->l0.me16x16 = m;
+ h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
/* save mv for predicting neighbors */
*(uint32_t*)a->l0.mvc[i_ref][0] =
@@ -1072,22 +1072,22 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
l0m->cost = INT_MAX;
for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
{
- const int i_ref_cost = REF_COST( 0, i_ref );
- i_halfpel_thresh -= i_ref_cost;
- m.i_ref_cost = i_ref_cost;
- m.i_ref = i_ref;
-
- LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
- x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
- x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
- x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
-
- m.cost += i_ref_cost;
- i_halfpel_thresh += i_ref_cost;
- *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
-
- if( m.cost < l0m->cost )
- *l0m = m;
+ const int i_ref_cost = REF_COST( 0, i_ref );
+ i_halfpel_thresh -= i_ref_cost;
+ m.i_ref_cost = i_ref_cost;
+ m.i_ref = i_ref;
+
+ LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
+ x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
+ x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
+ x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
+
+ m.cost += i_ref_cost;
+ i_halfpel_thresh += i_ref_cost;
+ *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
+
+ if( m.cost < l0m->cost )
+ h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
}
x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
@@ -1176,25 +1176,25 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
l0m->cost = INT_MAX;
for( j = 0; j < i_ref8s; j++ )
{
- const int i_ref = ref8[j];
- const int i_ref_cost = REF_COST( 0, i_ref );
- m.i_ref_cost = i_ref_cost;
- m.i_ref = i_ref;
+ const int i_ref = ref8[j];
+ const int i_ref_cost = REF_COST( 0, i_ref );
+ m.i_ref_cost = i_ref_cost;
+ m.i_ref = i_ref;
- /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
- *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
- *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
- *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
+ /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
+ *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
+ *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
+ *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
- LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
- x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
- x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
- x264_me_search( h, &m, mvc, 3 );
+ LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
+ x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
+ x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
+ x264_me_search( h, &m, mvc, 3 );
- m.cost += i_ref_cost;
+ m.cost += i_ref_cost;
- if( m.cost < l0m->cost )
- *l0m = m;
+ if( m.cost < l0m->cost )
+ h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
}
x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
@@ -1226,24 +1226,24 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
l0m->cost = INT_MAX;
for( j = 0; j < i_ref8s; j++ )
{
- const int i_ref = ref8[j];
- const int i_ref_cost = REF_COST( 0, i_ref );
- m.i_ref_cost = i_ref_cost;
- m.i_ref = i_ref;
+ const int i_ref = ref8[j];
+ const int i_ref_cost = REF_COST( 0, i_ref );
+ m.i_ref_cost = i_ref_cost;
+ m.i_ref = i_ref;
- *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
- *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
- *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
+ *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
+ *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
+ *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
- LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
- x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
- x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
- x264_me_search( h, &m, mvc, 3 );
+ LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
+ x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
+ x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
+ x264_me_search( h, &m, mvc, 3 );
- m.cost += i_ref_cost;
+ m.cost += i_ref_cost;
- if( m.cost < l0m->cost )
- *l0m = m;
+ if( m.cost < l0m->cost )
+ h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
}
x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
@@ -1467,7 +1467,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
if( m.cost < a->l0.me16x16.cost )
{
a->l0.i_ref = i_ref;
- a->l0.me16x16 = m;
+ h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
}
/* save mv for predicting neighbors */
@@ -1494,7 +1494,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
if( m.cost < a->l1.me16x16.cost )
{
a->l1.i_ref = i_ref;
- a->l1.me16x16 = m;
+ h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
}
/* save mv for predicting neighbors */
diff --git a/encoder/me.h b/encoder/me.h
index 96135c9..6775a97 100644
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -45,7 +45,7 @@ typedef struct
int cost_mv; /* lambda * nbits for the chosen mv */
int cost; /* satd + lambda * nbits */
DECLARE_ALIGNED_4( int16_t mv[2] );
-} x264_me_t;
+} DECLARE_ALIGNED_16( x264_me_t );
void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
static inline void x264_me_search( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc )
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index fff7bc4..d72e40a 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -89,13 +89,9 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
}
#define SAVE_MVS( mv0, mv1 ) \
{ \
- fenc->mv[0][i_mb_xy][0] = mv0[0]; \
- fenc->mv[0][i_mb_xy][1] = mv0[1]; \
+ *(uint32_t*)fenc->mv[0][i_mb_xy] = *(uint32_t*)mv0; \
if( b_bidir ) \
- { \
- fenc->mv[1][i_mb_xy][0] = mv1[0]; \
- fenc->mv[1][i_mb_xy][1] = mv1[1]; \
- } \
+ *(uint32_t*)fenc->mv[1][i_mb_xy] = *(uint32_t*)mv1; \
}
#define CLIP_MV( mv ) \
{ \
@@ -133,7 +129,7 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
int dmv[2][2];
int mv0[2] = {0,0};
- m[1] = m[0];
+ h->mc.memcpy_aligned( &m[1], &m[0], sizeof(x264_me_t) );
LOAD_HPELS_LUMA( m[1].p_fref, fref1->lowres );
dmv[0][0] = ( mvr[0] * dist_scale_factor + 128 ) >> 8;
@@ -144,7 +140,7 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
CLIP_MV( dmv[1] );
TRY_BIDIR( dmv[0], dmv[1], 0 );
- if( dmv[0][0] || dmv[0][1] || dmv[1][0] || dmv[1][1] )
+ if( dmv[0][0] | dmv[0][1] | dmv[1][0] | dmv[1][1] )
TRY_BIDIR( mv0, mv0, 0 );
// if( i_bcost < 60 ) // arbitrary threshold
// return i_bcost;
@@ -153,10 +149,10 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
i_cost_bak = i_bcost;
for( l = 0; l < 1 + b_bidir; l++ )
{
- int16_t mvc[4][2] = {{0}};
+ DECLARE_ALIGNED_4(int16_t mvc[4][2]) = {{0}};
int i_mvc = 0;
int16_t (*fenc_mv)[2] = &fenc->mv[l][i_mb_xy];
-#define MVC(mv) { mvc[i_mvc][0] = mv[0]; mvc[i_mvc][1] = mv[1]; i_mvc++; }
+#define MVC(mv) { *(uint32_t*)mvc[i_mvc] = *(uint32_t*)mv; i_mvc++; }
if( i_mb_x > 0 )
MVC(fenc_mv[-1]);
if( i_mb_y > 0 )
@@ -172,12 +168,12 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
x264_me_search( h, &m[l], mvc, i_mvc );
m[l].cost -= 2; // remove mvcost from skip mbs
- if( m[l].mv[0] || m[l].mv[1] )
+ if( *(uint32_t*)m[l].mv )
m[l].cost += 5;
i_bcost = X264_MIN( i_bcost, m[l].cost );
}
- if( b_bidir && (m[0].mv[0] || m[0].mv[1] || m[1].mv[0] || m[1].mv[1]) )
+ if( b_bidir && ( *(uint32_t*)m[0].mv || *(uint32_t*)m[1].mv ) )
TRY_BIDIR( m[0].mv, m[1].mv, 5 );
if( i_bcost < i_cost_bak )
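
[Editor's note: two of the slicetype.c cosmetics rely on the same layout trick. A motion vector is two contiguous int16_t components, so a pair can be copied or tested for zero as one uint32_t, and replacing || with | in the zero test removes short-circuit branches. A small sketch of the trick under those assumptions; the union and function names are illustrative, not x264's.]

#include <stdint.h>

/* Two 16-bit mv components viewed as one packed 32-bit word,
 * as in x264's DECLARE_ALIGNED_4( int16_t mv[2] ). */
typedef union { int16_t c[2]; uint32_t d; } mv_sketch_t;

static int mv_pair_nonzero( const mv_sketch_t *a, const mv_sketch_t *b )
{
    /* Bitwise OR of the packed words: one test, no short-circuit branches. */
    return ( a->d | b->d ) != 0;
}
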