[x264-devel] commit: Much faster CABAC residual context selection (Jason Garrett-Glaser )
git version control
git at videolan.org
Mon Feb 9 11:32:49 CET 2009
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Sat Feb 7 02:27:16 2009 -0800| [9bf45f6d397559486b5fe038c3847b0d35c61728] | committer: Jason Garrett-Glaser
Much faster CABAC residual context selection
Up to ~17% faster CABAC RDO, ~36% faster intra-only CABAC RDO.
Up to 7% faster overall in extreme cases.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=9bf45f6d397559486b5fe038c3847b0d35c61728
---
encoder/cabac.c | 121 ++++++++++++++++++-------------------------------------
1 files changed, 39 insertions(+), 82 deletions(-)
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 94c9a83..62280c1 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -505,92 +505,43 @@ static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i_list
}
}
-static int x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx )
+/* i_ctxBlockCat: 0-> DC 16x16 i_idx = 0
+ * 1-> AC 16x16 i_idx = luma4x4idx
+ * 2-> Luma4x4 i_idx = luma4x4idx
+ * 3-> DC Chroma i_idx = iCbCr
+ * 4-> AC Chroma i_idx = 4 * iCbCr + chroma4x4idx
+ * 5-> Luma8x8 i_idx = luma8x8idx
+ */
+
+static int ALWAYS_INLINE x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx )
{
- /* i_ctxBlockCat: 0-> DC 16x16 i_idx = 0
- * 1-> AC 16x16 i_idx = luma4x4idx
- * 2-> Luma4x4 i_idx = luma4x4idx
- * 3-> DC Chroma i_idx = iCbCr
- * 4-> AC Chroma i_idx = 4 * iCbCr + chroma4x4idx
- * 5-> Luma8x8 i_idx = luma8x8idx
- */
-
- int i_mba_xy = -1;
- int i_mbb_xy = -1;
- int i_nza = 0;
- int i_nzb = 0;
+ int i_nza;
+ int i_nzb;
+ int b_intra = IS_INTRA( h->mb.i_type );
switch( i_cat )
{
- case DCT_LUMA_DC:
- if( h->mb.i_neighbour & MB_LEFT )
- {
- i_mba_xy = h->mb.i_mb_xy - 1;
- i_nza = h->mb.cbp[i_mba_xy] & 0x100;
- }
- if( h->mb.i_neighbour & MB_TOP )
- {
- i_mbb_xy = h->mb.i_mb_top_xy;
- i_nzb = h->mb.cbp[i_mbb_xy] & 0x100;
- }
- break;
case DCT_LUMA_AC:
case DCT_LUMA_4x4:
- if( i_idx & ~10 ) // block_idx_x > 0
- i_mba_xy = h->mb.i_mb_xy;
- else if( h->mb.i_neighbour & MB_LEFT )
- i_mba_xy = h->mb.i_mb_xy - 1;
-
- if( i_idx & ~5 ) // block_idx_y > 0
- i_mbb_xy = h->mb.i_mb_xy;
- else if( h->mb.i_neighbour & MB_TOP )
- i_mbb_xy = h->mb.i_mb_top_xy;
-
+ case DCT_CHROMA_AC:
/* no need to test for skip/pcm */
- if( i_mba_xy >= 0 )
- i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1];
- if( i_mbb_xy >= 0 )
- i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8];
- break;
+ i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1];
+ i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8];
+ i_nza &= 0x7f + (b_intra << 7);
+ i_nzb &= 0x7f + (b_intra << 7);
+ return 4*i_cat + 2*!!i_nzb + !!i_nza;
+ case DCT_LUMA_DC:
+ /* Note: this depends on the exact values of MB_LEFT and MB_TOP enums */
+ i_nza = ((h->mb.cbp[h->mb.i_mb_xy - 1] >> 8) | ~h->mb.i_neighbour) & 1;
+ i_nzb = ((h->mb.cbp[h->mb.i_mb_top_xy] >> 7) | ~h->mb.i_neighbour) & 2;
+ return 4*i_cat + i_nzb + i_nza;
case DCT_CHROMA_DC:
/* no need to test skip/pcm */
i_idx -= 25;
- if( h->mb.i_neighbour & MB_LEFT )
- {
- i_mba_xy = h->mb.i_mb_xy - 1;
- i_nza = h->mb.cbp[i_mba_xy] & (0x200 << i_idx);
- }
- if( h->mb.i_neighbour & MB_TOP )
- {
- i_mbb_xy = h->mb.i_mb_top_xy;
- i_nzb = h->mb.cbp[i_mbb_xy] & (0x200 << i_idx);
- }
- break;
- case DCT_CHROMA_AC:
- if( i_idx & 1 )
- i_mba_xy = h->mb.i_mb_xy;
- else if( h->mb.i_neighbour & MB_LEFT )
- i_mba_xy = h->mb.i_mb_xy - 1;
-
- if( i_idx & 2 )
- i_mbb_xy = h->mb.i_mb_xy;
- else if( h->mb.i_neighbour & MB_TOP )
- i_mbb_xy = h->mb.i_mb_top_xy;
-
- /* no need to test skip/pcm */
- if( i_mba_xy >= 0 )
- i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1];
- if( i_mbb_xy >= 0 )
- i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8];
+ i_nza = h->mb.i_neighbour & MB_LEFT ? (h->mb.cbp[h->mb.i_mb_xy - 1] >> (9 + i_idx)) & 1 : b_intra;
+ i_nzb = h->mb.i_neighbour & MB_TOP ? (h->mb.cbp[h->mb.i_mb_top_xy] >> (9 + i_idx)) & 1 : b_intra;
+ return 4*i_cat + 2*i_nzb + i_nza;
}
-
- if( IS_INTRA( h->mb.i_type ) )
- {
- i_nza |= i_mba_xy < 0;
- i_nzb |= i_mbb_xy < 0;
- }
-
- return 4*i_cat + 2*!!i_nzb + !!i_nza;
}
@@ -637,7 +588,7 @@ static const uint8_t coeff_abs_level_transition[2][8] = {
};
#if !RDO_SKIP_BS
-static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
+static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int ctxidxinc )
{
const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
@@ -655,7 +606,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
if( i_count != 64 )
{
/* coded block flag */
- int ctx = 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx );
+ int ctx = 85 + ctxidxinc;
if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )
x264_cabac_encode_decision( cb, ctx, 1 );
else
@@ -736,7 +687,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
* this is slightly incorrect because the sigmap is not reversible
* (contexts are repeated). However, there is nearly no quality penalty
* for this (~0.001db) and the speed boost (~30%) is worth it. */
-static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int b_8x8 )
+static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int ctxidxinc, int b_8x8 )
{
const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
@@ -747,7 +698,7 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c
if( !b_8x8 )
{
/* coded block flag */
- ctx = 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx );
+ ctx = 85 + ctxidxinc;
if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )
x264_cabac_encode_decision( cb, ctx, 1 );
else
@@ -820,14 +771,20 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c
static void block_residual_write_cabac_8x8( x264_t *h, x264_cabac_t *cb, int i_idx, int16_t *l )
{
- block_residual_write_cabac_internal( h, cb, DCT_LUMA_8x8, i_idx, l, 64, 1 );
+ block_residual_write_cabac_internal( h, cb, DCT_LUMA_8x8, i_idx, l, 64, 0, 1 );
}
-static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
+static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int ctxidxinc )
{
- block_residual_write_cabac_internal( h, cb, i_ctxBlockCat, i_idx, l, i_count, 0 );
+ block_residual_write_cabac_internal( h, cb, i_ctxBlockCat, i_idx, l, i_count, ctxidxinc, 0 );
}
#endif
+#define block_residual_write_cabac( h, cb, i_ctxBlockCat, i_idx, l, i_count ) \
+{ \
+ int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx); \
+ block_residual_write_cabac( h, cb, i_ctxBlockCat, i_idx, l, i_count, ctxidxinc ); \
+}
+
void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
{
const int i_mb_type = h->mb.i_type;
More information about the x264-devel
mailing list