[x264-devel] commit: Various CABAC optimizations (Jason Garrett-Glaser)
git version control
git at videolan.org
Thu Apr 9 11:16:37 CEST 2009
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Thu Apr 9 02:14:41 2009 -0700| [1024283b0321e53a3e08fddb1411429330bf1731] | committer: Jason Garrett-Glaser
Various CABAC optimizations
Move calculation of b_intra out of the core residual loop and hardcode it where applicable.
Inlining x264_cabac_mb_mvd was unnecessary and wasted a tremendous amount of code size. Inlining only the x264_macroblock_cache_mvd call is faster and produces significantly smaller code.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=1024283b0321e53a3e08fddb1411429330bf1731
---
encoder/cabac.c | 55 ++++++++++++++++++++++++++++++++-----------------------
1 files changed, 32 insertions(+), 23 deletions(-)
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 5be20e7..80db2df 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -444,7 +444,7 @@ static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_lis
}
}
-static inline void x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width, int height )
+static NOINLINE uint32_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width, int height )
{
DECLARE_ALIGNED_4( int16_t mvp[2] );
int mdx, mdy;
@@ -458,8 +458,13 @@ static inline void x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, i
x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 0, mdx );
x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy );
- /* save value */
- x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, pack16to32_mask(mdx,mdy) );
+ return pack16to32_mask(mdx,mdy);
+}
+
+#define x264_cabac_mb_mvd(h,cb,i_list,idx,width,height)\
+{\
+ uint32_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width,height);\
+ x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mvd );\
}
static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int i )
@@ -505,11 +510,10 @@ static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i_list
* 5-> Luma8x8 i_idx = luma8x8idx
*/
-static int ALWAYS_INLINE x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx )
+static int ALWAYS_INLINE x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx, int b_intra )
{
int i_nza;
int i_nzb;
- int b_intra = IS_INTRA( h->mb.i_type );
switch( i_cat )
{
@@ -672,7 +676,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
x264_cabac_encode_bypass( cb, i_coeff_sign[i_coeff] );
} while( i_coeff > 0 );
}
-#define block_residual_write_cabac_8x8( h, cb, idx, l ) block_residual_write_cabac( h, cb, DCT_LUMA_8x8, idx, l, 64 )
+#define block_residual_write_cabac_8x8( h, cb, idx, l ) block_residual_write_cabac( h, cb, DCT_LUMA_8x8, idx, l, 64, 0 )
#else
@@ -784,9 +788,9 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
}
#endif
-#define block_residual_write_cabac( h, cb, i_ctxBlockCat, i_idx, l, i_count ) \
+#define block_residual_write_cabac( h, cb, i_ctxBlockCat, i_idx, l, i_count, b_intra ) \
{ \
- int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx); \
+ int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx, b_intra ); \
block_residual_write_cabac( h, cb, i_ctxBlockCat, i_idx, l, i_count, ctxidxinc ); \
}
@@ -990,18 +994,19 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
if( h->mb.i_cbp_luma > 0 || h->mb.i_cbp_chroma > 0 || i_mb_type == I_16x16 )
{
+ const int b_intra = IS_INTRA( i_mb_type );
x264_cabac_mb_qp_delta( h, cb );
/* write residual */
if( i_mb_type == I_16x16 )
{
/* DC Luma */
- block_residual_write_cabac( h, cb, DCT_LUMA_DC, 24, h->dct.luma16x16_dc, 16 );
+ block_residual_write_cabac( h, cb, DCT_LUMA_DC, 24, h->dct.luma16x16_dc, 16, 1 );
/* AC Luma */
if( h->mb.i_cbp_luma != 0 )
for( i = 0; i < 16; i++ )
- block_residual_write_cabac( h, cb, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1, 15 );
+ block_residual_write_cabac( h, cb, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1, 15, 1 );
}
else if( h->mb.b_transform_8x8 )
{
@@ -1013,18 +1018,18 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
{
for( i = 0; i < 16; i++ )
if( h->mb.i_cbp_luma & ( 1 << ( i / 4 ) ) )
- block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i, h->dct.luma4x4[i], 16 );
+ block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i, h->dct.luma4x4[i], 16, b_intra );
}
if( h->mb.i_cbp_chroma &0x03 ) /* Chroma DC residual present */
{
- block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4 );
- block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4 );
+ block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4, b_intra );
+ block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4, b_intra );
}
if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
{
for( i = 16; i < 24; i++ )
- block_residual_write_cabac( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15 );
+ block_residual_write_cabac( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15, b_intra );
}
}
@@ -1050,7 +1055,9 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
if( i_mb_type == P_8x8 )
x264_cabac_mb8x8_mvd( h, cb, 0, i8 );
else if( i_mb_type == P_L0 )
+ {
x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
+ }
else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
{
if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
@@ -1077,12 +1084,12 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
{
int i4;
for( i4 = 0; i4 < 4; i4++ )
- block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
+ block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16, 0 );
}
}
- block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
- block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
+ block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 15, 0 );
+ block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1, 15, 0 );
i8 += x264_pixel_size[i_pixel].h >> 3;
}
@@ -1091,13 +1098,15 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
static void x264_subpartition_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_pixel )
{
int b_8x4 = i_pixel == PIXEL_8x4;
- block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
+ block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16, 0 );
if( i_pixel == PIXEL_4x4 )
+ {
x264_cabac_mb_mvd( h, cb, 0, i4, 1, 1 );
+ }
else
{
x264_cabac_mb_mvd( h, cb, 0, i4, 1+b_8x4, 2-b_8x4 );
- block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+2-b_8x4, h->dct.luma4x4[i4+2-b_8x4], 16 );
+ block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+2-b_8x4, h->dct.luma4x4[i4+2-b_8x4], 16, 0 );
}
}
@@ -1116,7 +1125,7 @@ static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4,
const int i_pred = x264_mb_predict_intra4x4_mode( h, i4 );
i_mode = x264_mb_pred_mode4x4_fix( i_mode );
x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode );
- block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
+ block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16, 1 );
}
static void x264_i8x8_chroma_size_cabac( x264_t *h, x264_cabac_t *cb )
@@ -1125,14 +1134,14 @@ static void x264_i8x8_chroma_size_cabac( x264_t *h, x264_cabac_t *cb )
x264_cabac_mb_cbp_chroma( h, cb );
if( h->mb.i_cbp_chroma > 0 )
{
- block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4 );
- block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4 );
+ block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4, 1 );
+ block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4, 1 );
if( h->mb.i_cbp_chroma == 2 )
{
int i;
for( i = 16; i < 24; i++ )
- block_residual_write_cabac( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15 );
+ block_residual_write_cabac( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15, 1 );
}
}
}
More information about the x264-devel mailing list