[x264-devel] commit: use lookup tables instead of actual exp/pow for AQ ( U-TACHIKOMA\Jason )
git version control
git at videolan.org
Thu Dec 11 07:44:38 CET 2008
x264 | branch: master | U-TACHIKOMA\Jason <Jason at Tachikoma.(none)> | Wed Dec 10 20:54:17 2008 -0800| [ec3ee90261bc5576d511211e593f7c9bcf6d5b4f] | committer: U-TACHIKOMA\Jason
use lookup tables instead of actual exp/pow for AQ
Significant speed boost, especially on CPUs with atrociously slow floating point units (e.g. Pentium 4 saves 800 clocks per MB with this change).
Add x264_clz function as part of the LUT system: this may be useful later.
Note this changes output somewhat as the numbers from the lookup table are not exact.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=ec3ee90261bc5576d511211e593f7c9bcf6d5b4f
---
common/osdep.h | 16 +++++++++++++
encoder/ratecontrol.c | 59 +++++++++++++++++++++++++++++++++++++++---------
2 files changed, 64 insertions(+), 11 deletions(-)
diff --git a/common/osdep.h b/common/osdep.h
index 416a76e..25bb138 100644
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -169,4 +169,20 @@ static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
}
#endif
+#ifdef __GNUC__
+#define x264_clz(x) __builtin_clz(x)
+#else
+/* Count the leading zero bits of a 32-bit value; this fallback returns 32
+ * for x == 0 (the __builtin_clz variant above leaves that case undefined).
+ * A branchless binary search narrows x down to its highest non-zero nibble,
+ * then a 16-entry table finishes the count. */
+static int ALWAYS_INLINE x264_clz( uint32_t x )
+{
+    static const uint8_t lut[16] = {4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0};
+    /* Probe the upper half with (x>>16)-1 instead of x-0x10000: the latter
+     * still has bit 31 set for x >= 0x80010000, which wrongly picked z = 16,
+     * left x unshifted and ended up indexing lut[] far out of bounds. */
+    int y, z = (((x >> 16) - 1) >> 27) & 16;
+    x >>= z^16;                         /* x < 0x10000 from here on */
+    z += y = ((x - 0x100) >> 28) & 8;   /* y = 8 when the top byte is empty */
+    x >>= y^8;                          /* x < 0x100 */
+    z += y = ((x - 0x10) >> 29) & 4;    /* y = 4 when the top nibble is empty */
+    x >>= y^4;                          /* x < 0x10, safe as a lut index */
+    return z + lut[x];
+}
+#endif
+
#endif /* X264_OSDEP_H */
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 6932388..712a721 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -191,21 +191,62 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *f
return var;
}
+static const float log2_lut[128] = { /* log2_lut[i] ~= log2(1 + i/128): log2 of a [1,2) mantissa, indexed by the 7 bits below its leading one */
+ 0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
+ 0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
+ 0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
+ 0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
+ 0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
+ 0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
+ 0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
+ 0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
+ 0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
+ 0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
+ 0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
+ 0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
+ 0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
+ 0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
+ 0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
+ 0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
+};
+
+static const uint8_t exp2_lut[64] = { /* entries track 256*(2^((i+0.5)/64) - 1): the 8-bit fraction of 2^f at the midpoint of bin i; x264_exp2fix8 adds back the implicit 256 */
+ 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 44, 47,
+ 50, 53, 57, 60, 64, 67, 71, 74, 78, 81, 85, 89, 93, 96, 100, 104,
+ 108, 112, 116, 120, 124, 128, 132, 137, 141, 145, 150, 154, 159, 163, 168, 172,
+ 177, 182, 186, 191, 196, 201, 206, 211, 216, 221, 226, 232, 237, 242, 248, 253,
+};
+
+/* Table-driven approximation of 2^x in 8.8 fixed point (about 256 * 2^x).
+ * Inputs at or below -8 yield 0; inputs at or above 8 saturate to 0xffff. */
+static int x264_exp2fix8( float x )
+{
+    int whole, frac;
+    float biased = x + 8;   /* bias so the integer part doubles as a shift count */
+    if( biased <= 0 )
+        return 0;
+    if( biased >= 16 )
+        return 0xffff;
+    whole = biased;                   /* truncates: 0..15 */
+    frac = (biased - whole) * 64;     /* fractional part in 1/64 steps: 0..63 */
+    /* exp2_lut holds only the fraction; +256 restores the implicit leading one. */
+    return ((exp2_lut[frac] + 256) << whole) >> 8;
+}
+
void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
{
+ /* constants chosen to result in approximately the same overall bitrate as without AQ.
+ * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
+ float strength = h->param.rc.f_aq_strength * 1.0397; /* 1.0397 ~= 1.5*ln(2): the old 1.5*logf() scale re-expressed for a base-2 log */
int mb_x, mb_y;
- for( mb_y=0; mb_y<h->sps->i_mb_height; mb_y++ )
- for( mb_x=0; mb_x<h->sps->i_mb_width; mb_x++ )
+ for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
+ for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
{
- int energy = ac_energy_mb( h, mb_x, mb_y, frame );
- /* 10 constant chosen to result in approximately the same overall bitrate as without AQ. */
- float qp_adj = h->param.rc.f_aq_strength * 1.5 * (logf(energy) - 10.0);
+ uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
+ int lz = x264_clz( energy ); /* NOTE(review): if ac_energy_mb can return 0, __builtin_clz(0) is undefined and energy<<lz below would shift by 32 -- confirm energy >= 1 */
+ float qp_adj = strength * (log2_lut[(energy<<lz>>24)&0x7f] - lz + 16.573f); /* log2(energy) ~= (31 - lz) + log2_lut[7 bits below the leading one]; 16.573 - 31 = -14.427 ~= -10/ln(2), the old logf() offset in log2 units */
frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
if( h->frames.b_have_lowres )
- frame->i_inv_qscale_factor[mb_x+mb_y*h->mb.i_mb_stride] = FIX8(pow(2.0,-qp_adj/6.0));
+ frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj*(-1.f/6.f)); /* table-based stand-in for the removed FIX8(pow(2.0,-qp_adj/6.0)) */
}
}
+
/*****************************************************************************
* x264_adaptive_quant:
* adjust macroblock QP based on variance (AC energy) of the MB.
@@ -215,16 +256,12 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
*****************************************************************************/
void x264_adaptive_quant( x264_t *h )
{
- float qp, qp_adj;
x264_emms();
- qp = h->rc->f_qpm;
- qp_adj = h->fenc->f_qp_offset[h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride];
- h->mb.i_qp = x264_clip3( qp + qp_adj + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
+ h->mb.i_qp = x264_clip3( h->rc->f_qpm + h->fenc->f_qp_offset[h->mb.i_mb_xy] + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max ); /* same f_qpm + f_qp_offset sum as the removed lines, now indexed by i_mb_xy (NOTE(review): presumably equals i_mb_x + i_mb_y*i_mb_stride -- confirm); the +.5 presumably rounds to nearest before clipping to [i_qp_min, i_qp_max] */
/* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
* to lower the bit cost of the qp_delta. */
if( abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
h->mb.i_qp = h->mb.i_last_qp;
- h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
}
int x264_ratecontrol_new( x264_t *h )
More information about the x264-devel
mailing list