[x264-devel] commit: force unroll macroblock_load_pic_pointers (Jason Garrett-Glaser)
git version control
git at videolan.org
Sat May 17 08:57:02 CEST 2008
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Thu May 15 05:41:43 2008 -0600| [b9073299fd9dbc116c0c47ddba925162fe0d1f83]
force unroll macroblock_load_pic_pointers
and a few other minor optimizations
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=b9073299fd9dbc116c0c47ddba925162fe0d1f83
---
common/macroblock.c | 78 +++++++++++++++++++++++++-------------------------
encoder/macroblock.c | 20 ++++---------
2 files changed, 45 insertions(+), 53 deletions(-)
diff --git a/common/macroblock.c b/common/macroblock.c
index a68c815..cd1f9cc 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -1011,6 +1011,42 @@ static NOINLINE void copy_column8( uint8_t *dst, uint8_t *src )
dst[i*FDEC_STRIDE] = src[i*FDEC_STRIDE];
}
+static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb_x, int i_mb_y, int i)
+{
+ const int w = (i == 0 ? 16 : 8);
+ const int i_stride = h->fdec->i_stride[i];
+ const int i_stride2 = i_stride << h->mb.b_interlaced;
+ const int i_pix_offset = h->mb.b_interlaced
+ ? w * (i_mb_x + (i_mb_y&~1) * i_stride) + (i_mb_y&1) * i_stride
+ : w * (i_mb_x + i_mb_y * i_stride);
+ int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
+ const uint8_t *intra_fdec = &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
+ x264_frame_t **fref[2] = { h->fref0, h->fref1 };
+ int j, k, l;
+ if( h->mb.b_interlaced )
+ ref_pix_offset[1] += (1-2*(i_mb_y&1)) * i_stride;
+ h->mb.pic.i_stride[i] = i_stride2;
+ h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
+ &h->fenc->plane[i][i_pix_offset], i_stride2, w );
+ memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
+ if( h->mb.b_interlaced )
+ {
+ const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
+ for( j = 0; j < w; j++ )
+ h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
+ }
+ for( l=0; l<2; l++ )
+ {
+ for( j=0; j<h->mb.pic.i_fref[l]; j++ )
+ {
+ h->mb.pic.p_fref[l][j][i==0 ? 0:i+3] = &fref[l][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]];
+ if( i == 0 )
+ for( k = 1; k < 4; k++ )
+ h->mb.pic.p_fref[l][j][k] = &fref[l][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]];
+ }
+ }
+}
+
void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
{
int i_mb_xy = i_mb_y * h->mb.i_mb_stride + i_mb_x;
@@ -1189,45 +1225,9 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
}
/* load picture pointers */
- for( i = 0; i < 3; i++ )
- {
- const int w = (i == 0 ? 16 : 8);
- const int i_stride = h->fdec->i_stride[i];
- const int i_stride2 = i_stride << h->mb.b_interlaced;
- const int i_pix_offset = h->mb.b_interlaced
- ? w * (i_mb_x + (i_mb_y&~1) * i_stride) + (i_mb_y&1) * i_stride
- : w * (i_mb_x + i_mb_y * i_stride);
- int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
- const uint8_t *intra_fdec = &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
- x264_frame_t **fref[2] = { h->fref0, h->fref1 };
- int j, k, l;
-
- if( h->mb.b_interlaced )
- ref_pix_offset[1] += (1-2*(i_mb_y&1)) * i_stride;
-
- h->mb.pic.i_stride[i] = i_stride2;
-
- h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
- &h->fenc->plane[i][i_pix_offset], i_stride2, w );
- memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
- if( h->mb.b_interlaced )
- {
- const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
- for( j = 0; j < w; j++ )
- h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
- }
-
- for( l=0; l<2; l++ )
- {
- for( j=0; j<h->mb.pic.i_fref[l]; j++ )
- {
- h->mb.pic.p_fref[l][j][i==0 ? 0:i+3] = &fref[l][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]];
- if( i == 0 )
- for( k = 1; k < 4; k++ )
- h->mb.pic.p_fref[l][j][k] = &fref[l][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]];
- }
- }
- }
+ x264_macroblock_load_pic_pointers( h, i_mb_x, i_mb_y, 0 );
+ x264_macroblock_load_pic_pointers( h, i_mb_x, i_mb_y, 1 );
+ x264_macroblock_load_pic_pointers( h, i_mb_x, i_mb_y, 2 );
if( h->fdec->integral )
{
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 01d09bc..3354714 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -64,7 +64,7 @@ static int x264_mb_decimate_score( int16_t *dct, int i_max )
{
int i_run;
- if( abs( dct[idx--] ) > 1 )
+ if( (unsigned)(dct[idx--] + 1) > 2 )
return 9;
i_run = 0;
@@ -273,15 +273,9 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
static void x264_macroblock_encode_skip( x264_t *h )
{
- int i;
h->mb.i_cbp_luma = 0x00;
h->mb.i_cbp_chroma = 0x00;
-
- for( i = 0; i < 16+8; i++ )
- {
- h->mb.cache.non_zero_count[x264_scan8[i]] = 0;
- }
-
+ memset( h->mb.cache.non_zero_count, 0, X264_SCAN8_SIZE );
/* store cbp */
h->mb.cbp[h->mb.i_mb_xy] = 0;
}
@@ -500,8 +494,8 @@ void x264_macroblock_encode( x264_t *h )
h->quantf.quant_4x4( dct4x4[idx], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
-
- if( b_decimate )
+
+ if( b_decimate && i_decimate_8x8 <= 6 )
i_decimate_8x8 += x264_mb_decimate_score( h->dct.luma4x4[idx], 16 );
}
@@ -799,10 +793,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
int i4;
DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
- h->quantf.quant_4x4( dct4x4[0], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
- h->quantf.quant_4x4( dct4x4[1], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
- h->quantf.quant_4x4( dct4x4[2], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
- h->quantf.quant_4x4( dct4x4[3], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
+ for( i4 = 0; i4 < 4; i4++ )
+ h->quantf.quant_4x4( dct4x4[i4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
for( i4 = 0; i4 < 4; i4++ )
h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] );
More information about the x264-devel
mailing list