[x264-devel] commit: Much faster CABAC RDO (Jason Garrett-Glaser)
git version control
git at videolan.org
Sun Dec 28 16:33:19 CET 2008
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Sat Dec 27 21:36:14 2008 -0500| [84a1ca6ce70fe7bad4922ddc5a72c2e9cd73703b] | committer: Jason Garrett-Glaser
Much faster CABAC RDO
Since RDO doesn't care about the order in which bit costs are calculated, merge sigmap and level coding into a single loop in the RDO path.
This is bit-exact for 4x4dct but slightly incorrect for 8x8dct, because the 8x8 sigmap contains duplicated contexts.
However, the PSNR penalty is extremely small (~0.001 dB).
The speed benefit is about 15% for 4x4dct and 30% for 8x8dct residual bit cost calculation at QP20.
The overall encoding speed benefit is up to 5%, depending on encoding settings.
Also remove an old, unnecessary CABAC table that hasn't been used for years.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=84a1ca6ce70fe7bad4922ddc5a72c2e9cd73703b
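A note on how the RDO path stays cheap, with a small illustrative sketch (the table shapes and values below are invented for illustration and are not x264's real tables): under RDO_SKIP_BS the encoder never emits real bits; it charges a coefficient's whole unary level prefix with one lookup into precomputed tables (cabac_size_unary / cabac_transition_unary in the patch) and advances the context state in one step. Since only the total cost matters, not the order of the bits, the significance-map and level bins can be costed in the same pass, which is what this commit does.

#include <stdint.h>
#include <stdio.h>

#define FIX8(f) ((int)((f)*256.0 + 0.5))   /* 8.8 fixed-point bits, mirroring x264's FIX8 */

/* size_unary[prefix][state]: approximate cost, in 1/256-bit units, of coding a
 * unary prefix of the given length starting from context 'state'.
 * transition_unary[prefix][state]: the context state after coding it.
 * Real tables would be generated offline by walking the CABAC state machine;
 * these tiny two-state tables are made up purely for this example. */
static const uint16_t size_unary[3][2] = {
    { FIX8(0.30), FIX8(0.50) },   /* prefix 0: a single '0' bin          */
    { FIX8(2.10), FIX8(1.60) },   /* prefix 1: '1' then terminating '0'  */
    { FIX8(3.80), FIX8(2.90) },   /* prefix 2: '1','1' then '0'          */
};
static const uint8_t transition_unary[3][2] = {
    { 0, 1 },
    { 1, 1 },
    { 1, 1 },
};

int main(void)
{
    int i;
    int state = 0;                       /* context state carried across coefficients */
    uint32_t bits = 0;                   /* accumulated cost, 1/256-bit units         */
    const int prefixes[3] = { 2, 0, 1 }; /* clipped |level|-1 per coefficient         */

    for( i = 0; i < 3; i++ )
    {
        int p = prefixes[i];
        bits  += size_unary[p][state];        /* whole prefix costed in one lookup     */
        state  = transition_unary[p][state];  /* context still adapts between coeffs   */
    }
    printf( "estimated residual cost: %.2f bits\n", bits / 256.0 );
    return 0;
}

Compiled on its own, the sketch just prints an estimated cost in bits; in the real encoder the accumulated f8_bits_encoded feeds the rate term of the rate-distortion decision.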
---
common/cabac.c | 35 ----------------
encoder/cabac.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++-------
2 files changed, 101 insertions(+), 50 deletions(-)
diff --git a/common/cabac.c b/common/cabac.c
index 722451b..7a2e94d 100644
--- a/common/cabac.c
+++ b/common/cabac.c
@@ -742,41 +742,6 @@ const uint8_t x264_cabac_renorm_shift[64]= {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
};
-static const uint8_t x264_cabac_probability[128] =
-{
- FIX8(0.9812), FIX8(0.9802), FIX8(0.9792), FIX8(0.9781),
- FIX8(0.9769), FIX8(0.9757), FIX8(0.9744), FIX8(0.9730),
- FIX8(0.9716), FIX8(0.9700), FIX8(0.9684), FIX8(0.9667),
- FIX8(0.9650), FIX8(0.9631), FIX8(0.9611), FIX8(0.9590),
- FIX8(0.9568), FIX8(0.9545), FIX8(0.9521), FIX8(0.9495),
- FIX8(0.9468), FIX8(0.9440), FIX8(0.9410), FIX8(0.9378),
- FIX8(0.9345), FIX8(0.9310), FIX8(0.9273), FIX8(0.9234),
- FIX8(0.9193), FIX8(0.9150), FIX8(0.9105), FIX8(0.9057),
- FIX8(0.9006), FIX8(0.8953), FIX8(0.8897), FIX8(0.8838),
- FIX8(0.8776), FIX8(0.8710), FIX8(0.8641), FIX8(0.8569),
- FIX8(0.8492), FIX8(0.8411), FIX8(0.8326), FIX8(0.8237),
- FIX8(0.8143), FIX8(0.8043), FIX8(0.7938), FIX8(0.7828),
- FIX8(0.7712), FIX8(0.7590), FIX8(0.7461), FIX8(0.7325),
- FIX8(0.7182), FIX8(0.7031), FIX8(0.6872), FIX8(0.6705),
- FIX8(0.6528), FIX8(0.6343), FIX8(0.6147), FIX8(0.5941),
- FIX8(0.5724), FIX8(0.5495), FIX8(0.5254), FIX8(0.5000),
- FIX8(0.5000), FIX8(0.4746), FIX8(0.4505), FIX8(0.4276),
- FIX8(0.4059), FIX8(0.3853), FIX8(0.3657), FIX8(0.3472),
- FIX8(0.3295), FIX8(0.3128), FIX8(0.2969), FIX8(0.2818),
- FIX8(0.2675), FIX8(0.2539), FIX8(0.2410), FIX8(0.2288),
- FIX8(0.2172), FIX8(0.2062), FIX8(0.1957), FIX8(0.1857),
- FIX8(0.1763), FIX8(0.1674), FIX8(0.1589), FIX8(0.1508),
- FIX8(0.1431), FIX8(0.1359), FIX8(0.1290), FIX8(0.1224),
- FIX8(0.1162), FIX8(0.1103), FIX8(0.1047), FIX8(0.0994),
- FIX8(0.0943), FIX8(0.0895), FIX8(0.0850), FIX8(0.0807),
- FIX8(0.0766), FIX8(0.0727), FIX8(0.0690), FIX8(0.0655),
- FIX8(0.0622), FIX8(0.0590), FIX8(0.0560), FIX8(0.0532),
- FIX8(0.0505), FIX8(0.0479), FIX8(0.0455), FIX8(0.0432),
- FIX8(0.0410), FIX8(0.0389), FIX8(0.0369), FIX8(0.0350),
- FIX8(0.0333), FIX8(0.0316), FIX8(0.0300), FIX8(0.0284),
- FIX8(0.0270), FIX8(0.0256), FIX8(0.0243), FIX8(0.0231),
- FIX8(0.0219), FIX8(0.0208), FIX8(0.0198), FIX8(0.0187)
-};
/* -ln2(probability) */
#define F(a,b) {FIX8(a),FIX8(b)}
const uint16_t x264_cabac_entropy[128][2] =
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 420a4ba..ea57b32 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -636,6 +636,7 @@ static const uint8_t coeff_abs_level_transition[2][8] = {
{ 4, 4, 4, 4, 5, 6, 7, 7 }
};
+#if !RDO_SKIP_BS
static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
{
const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
@@ -692,9 +693,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
if( i == i_last )
{
i_coeff_abs_m1[i_coeff] = abs(l[i]) - 1;
-#if !RDO_SKIP_BS
i_coeff_sign[i_coeff] = l[i] < 0;
-#endif
i_coeff++;
}
@@ -711,15 +710,10 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
{
x264_cabac_encode_decision( cb, ctx, 1 );
ctx = coeff_abs_levelgt1_ctx[node_ctx] + i_ctx_level;
-#if RDO_SKIP_BS
- cb->f8_bits_encoded += cabac_size_unary[i_prefix][cb->state[ctx]];
- cb->state[ctx] = cabac_transition_unary[i_prefix][cb->state[ctx]];
-#else
for( i = 0; i < i_prefix - 1; i++ )
x264_cabac_encode_decision( cb, ctx, 1 );
if( i_prefix < 14 )
x264_cabac_encode_decision( cb, ctx, 0 );
-#endif
if( i_prefix >= 14 )
x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs_m1[i_coeff] - 14 );
@@ -729,18 +723,110 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
{
x264_cabac_encode_decision( cb, ctx, 0 );
node_ctx = coeff_abs_level_transition[0][node_ctx];
-#if RDO_SKIP_BS
- x264_cabac_encode_bypass( cb, 0 ); // sign
-#endif
}
-#if !RDO_SKIP_BS
x264_cabac_encode_bypass( cb, i_coeff_sign[i_coeff] );
-#endif
} while( i_coeff > 0 );
}
+#define block_residual_write_cabac_8x8( h, cb, idx, l ) block_residual_write_cabac( h, cb, DCT_LUMA_8x8, idx, l, 64 )
+
+#else
+
+/* Faster RDO by merging sigmap and level coding. Note that for 8x8dct
+ * this is slightly incorrect because the sigmap is not reversible
+ * (contexts are repeated). However, there is nearly no quality penalty
+ * for this (~0.001db) and the speed boost (~30%) is worth it. */
+static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int b_8x8 )
+{
+ const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
+ const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
+ const int i_ctx_level = coeff_abs_level_m1_offset[i_ctxBlockCat];
+ const uint8_t *significant_coeff_flag_offset = significant_coeff_flag_offset_8x8[h->mb.b_interlaced];
+ int i_last, i_coeff_abs_m1, ctx, i_prefix, i, node_ctx;
+
+ if( !b_8x8 )
+ {
+ /* coded block flag */
+ ctx = 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx );
+ if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )
+ x264_cabac_encode_decision( cb, ctx, 1 );
+ else
+ {
+ x264_cabac_encode_decision( cb, ctx, 0 );
+ return;
+ }
+ }
+
+ i_last = h->quantf.coeff_last[i_ctxBlockCat](l);
+ i_coeff_abs_m1 = abs(l[i_last]) - 1;
+ i_prefix = X264_MIN( i_coeff_abs_m1, 14 );
+ ctx = coeff_abs_level1_ctx[0] + i_ctx_level;
+ if( i_last != i_count - 1 )
+ {
+ x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?significant_coeff_flag_offset[i_last]:i_last), 1 );
+ x264_cabac_encode_decision( cb, i_ctx_last + (b_8x8?last_coeff_flag_offset_8x8[i_last]:i_last), 1 );
+ }
+
+ if( i_prefix )
+ {
+ x264_cabac_encode_decision( cb, ctx, 1 );
+ ctx = coeff_abs_levelgt1_ctx[0] + i_ctx_level;
+ cb->f8_bits_encoded += cabac_size_unary[i_prefix][cb->state[ctx]];
+ cb->state[ctx] = cabac_transition_unary[i_prefix][cb->state[ctx]];
+ if( i_prefix >= 14 )
+ x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs_m1 - 14 );
+ node_ctx = coeff_abs_level_transition[1][0];
+ }
+ else
+ {
+ x264_cabac_encode_decision( cb, ctx, 0 );
+ node_ctx = coeff_abs_level_transition[0][0];
+ x264_cabac_encode_bypass( cb, 0 ); // sign
+ }
+
+ for( i = i_last-1 ; i >= 0; i-- )
+ {
+ if( l[i] )
+ {
+ x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?significant_coeff_flag_offset[i]:i), 1 );
+ x264_cabac_encode_decision( cb, i_ctx_last + (b_8x8?last_coeff_flag_offset_8x8[i]:i), 0 );
+ ctx = coeff_abs_level1_ctx[node_ctx] + i_ctx_level;
+
+ if( (unsigned)(l[i]+1) > 2 )
+ {
+ i_coeff_abs_m1 = abs(l[i]) - 1;
+ i_prefix = X264_MIN( i_coeff_abs_m1, 14 );
+ x264_cabac_encode_decision( cb, ctx, 1 );
+ ctx = coeff_abs_levelgt1_ctx[node_ctx] + i_ctx_level;
+ cb->f8_bits_encoded += cabac_size_unary[i_prefix][cb->state[ctx]];
+ cb->state[ctx] = cabac_transition_unary[i_prefix][cb->state[ctx]];
+ if( i_prefix >= 14 )
+ x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs_m1 - 14 );
+ node_ctx = coeff_abs_level_transition[1][node_ctx];
+ }
+ else
+ {
+ x264_cabac_encode_decision( cb, ctx, 0 );
+ node_ctx = coeff_abs_level_transition[0][node_ctx];
+ x264_cabac_encode_bypass( cb, 0 );
+ }
+ }
+ else
+ x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?significant_coeff_flag_offset[i]:i), 0 );
+ }
+}
+
+static void block_residual_write_cabac_8x8( x264_t *h, x264_cabac_t *cb, int i_idx, int16_t *l )
+{
+ block_residual_write_cabac_internal( h, cb, DCT_LUMA_8x8, i_idx, l, 64, 1 );
+}
+static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
+{
+ block_residual_write_cabac_internal( h, cb, i_ctxBlockCat, i_idx, l, i_count, 0 );
+}
+#endif
void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
{
@@ -959,7 +1045,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
{
for( i = 0; i < 4; i++ )
if( h->mb.i_cbp_luma & ( 1 << i ) )
- block_residual_write_cabac( h, cb, DCT_LUMA_8x8, i, h->dct.luma8x8[i], 64 );
+ block_residual_write_cabac_8x8( h, cb, i, h->dct.luma8x8[i] );
}
else
{
@@ -1024,7 +1110,7 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
if( h->mb.i_cbp_luma & (1 << i8) )
{
if( h->mb.b_transform_8x8 )
- block_residual_write_cabac( h, cb, DCT_LUMA_8x8, i8, h->dct.luma8x8[i8], 64 );
+ block_residual_write_cabac_8x8( h, cb, i8, h->dct.luma8x8[i8] );
else
{
int i4;
@@ -1063,7 +1149,7 @@ static void x264_partition_i8x8_size_cabac( x264_t *h, x264_cabac_t *cb, int i8,
{
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4]] = 0x0101;
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101;
- block_residual_write_cabac( h, cb, DCT_LUMA_8x8, 4*i8, h->dct.luma8x8[i8], 64 );
+ block_residual_write_cabac_8x8( h, cb, 4*i8, h->dct.luma8x8[i8] );
}
else
{
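Background on the table that survives the removal: the /* -ln2(probability) */ comment above x264_cabac_entropy[128][2] indicates that each entry is the fractional bit cost, -log2(p), of coding one bin in a given CABAC state, stored in 8.8 fixed point, while the deleted x264_cabac_probability table held the probabilities themselves and was no longer referenced. A minimal sketch of that relationship, using the 0.9812 value from the first entry of the removed table (this only illustrates the idea; it is not x264's actual table generator):

#include <math.h>
#include <stdio.h>

#define FIX8(f) ((int)((f)*256.0 + 0.5))   /* 8.8 fixed point, mirroring x264's FIX8 */

int main(void)   /* link with -lm */
{
    double p_mps = 0.9812;                    /* example MPS probability (first removed entry) */
    int cost_mps = FIX8(-log2(p_mps));        /* likely bin:   ~0.027 bits -> 7/256   */
    int cost_lps = FIX8(-log2(1.0 - p_mps));  /* unlikely bin: ~5.7 bits  -> 1468/256 */
    printf( "MPS costs %d/256 bits, LPS costs %d/256 bits\n", cost_mps, cost_lps );
    return 0;
}

This is why the RDO code above can simply add table entries into f8_bits_encoded: summing -log2 probabilities in fixed point approximates the number of bits the arithmetic coder would actually have produced.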