[x264-devel] Use a large LUT for CAVLC zero-run bit codes
Jason Garrett-Glaser
git at videolan.org
Mon Jan 16 02:11:58 CET 2012
x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Thu Dec 8 13:45:41 2011 -0800| [c032fbaa3801fb4cf8dd1dd95a6479ca5bd262e2] | committer: Jason Garrett-Glaser
Use a large LUT for CAVLC zero-run bit codes
Helps the most with trellis and RD, but also helps with bitstream writing.
Seems at worst neutral even in the extreme case of a CPU with a small L2 cache (e.g. ARM Cortex A8).
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=c032fbaa3801fb4cf8dd1dd95a6479ca5bd262e2
---
common/bitstream.h | 7 ++++++-
common/common.h | 2 +-
common/quant.c | 3 +++
common/vlc.c | 28 ++++++++++++++++++++++++++--
common/x86/quant-a.asm | 20 ++++++++++++++------
encoder/cavlc.c | 9 +++------
encoder/encoder.c | 8 ++++----
tools/checkasm.c | 1 +
8 files changed, 58 insertions(+), 20 deletions(-)
diff --git a/common/bitstream.h b/common/bitstream.h
index 1a15338..f407e1d 100644
--- a/common/bitstream.h
+++ b/common/bitstream.h
@@ -56,6 +56,7 @@ typedef struct bs_s
typedef struct
{
int last;
+ int mask;
dctcoef level[16];
uint8_t run[16];
} x264_run_level_t;
@@ -65,7 +66,6 @@ extern const vlc_t x264_coeff_token[6][16][4];
extern const vlc_t x264_total_zeros[15][16];
extern const vlc_t x264_total_zeros_2x2_dc[3][4];
extern const vlc_t x264_total_zeros_2x4_dc[7][8];
-extern const vlc_t x264_run_before[7][16];
typedef struct
{
@@ -82,6 +82,11 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );
#define LEVEL_TABLE_SIZE 128
extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
+/* The longest possible set of zero run codes sums to 25 bits. This leaves
+ * plenty of room for both the code (25 bits) and size (5 bits) in a uint32_t. */
+
+extern uint32_t x264_run_before[1<<16];
+
static inline void bs_init( bs_t *s, void *p_data, int i_data )
{
int offset = ((intptr_t)p_data & 3);
diff --git a/common/common.h b/common/common.h
index 2704f29..b6cec65 100644
--- a/common/common.h
+++ b/common/common.h
@@ -236,7 +236,7 @@ void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
void x264_reduce_fraction( uint32_t *n, uint32_t *d );
void x264_reduce_fraction64( uint64_t *n, uint64_t *d );
-void x264_cavlc_init( void );
+void x264_cavlc_init( x264_t *h );
void x264_cabac_init( x264_t *h );
static ALWAYS_INLINE pixel x264_clip_pixel( int x )
diff --git a/common/quant.c b/common/quant.c
index 9b6b6d8..3897f53 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -373,14 +373,17 @@ static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel )
{\
int i_last = runlevel->last = x264_coeff_last##num(dct);\
int i_total = 0;\
+ int mask = 0;\
do\
{\
int r = 0;\
runlevel->level[i_total] = dct[i_last];\
+ mask |= 1 << (i_last);\
while( --i_last >= 0 && dct[i_last] == 0 )\
r++;\
runlevel->run[i_total++] = r;\
} while( i_last >= 0 );\
+ runlevel->mask = mask;\
return i_total;\
}
diff --git a/common/vlc.c b/common/vlc.c
index c4c3ad3..9adcc89 100644
--- a/common/vlc.c
+++ b/common/vlc.c
@@ -738,7 +738,7 @@ const vlc_t x264_total_zeros_2x4_dc[7][8] =
};
/* [MIN( i_zero_left-1, 6 )][run_before] */
-const vlc_t x264_run_before[7][16] =
+static const vlc_t run_before[7][16] =
{
{ /* i_zero_left 1 */
{ 0x1, 1 }, /* str=1 */
@@ -799,8 +799,9 @@ const vlc_t x264_run_before[7][16] =
};
vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
+uint32_t x264_run_before[1<<16];
-void x264_cavlc_init( void )
+void x264_cavlc_init( x264_t *h )
{
for( int i_suffix = 0; i_suffix < 7; i_suffix++ )
for( int16_t level = -LEVEL_TABLE_SIZE/2; level < LEVEL_TABLE_SIZE/2; level++ )
@@ -840,4 +841,27 @@ void x264_cavlc_init( void )
i_next++;
vlc->i_next = i_next;
}
+
+ for( int i = 1; i < (1<<16); i++ )
+ {
+ x264_run_level_t runlevel;
+ ALIGNED_ARRAY_16( dctcoef, dct, [16] );
+ int size = 0;
+ int bits = 0;
+ for( int j = 0; j < 16; j++ )
+ dct[j] = i&(1<<j);
+ int total = h->quantf.coeff_level_run[DCT_LUMA_4x4]( dct, &runlevel );
+ int zeros = runlevel.last + 1 - total;
+ for( int j = 0; j < total-1 && zeros > 0; j++ )
+ {
+ int idx = X264_MIN(zeros, 7) - 1;
+ int run = runlevel.run[j];
+ int len = run_before[idx][run].i_size;
+ size += len;
+ bits <<= len;
+ bits |= run_before[idx][run].i_bits;
+ zeros -= run;
+ }
+ x264_run_before[i] = (bits << 5) + size;
+ }
}
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 7d33a5e..7a4a9c0 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -1352,8 +1352,16 @@ cglobal coeff_level_run%1,0,7
movifnidn t1, r1mp
pxor m2, m2
LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
- not t5d
- shl t5d, 32-((%1+1)&~1)
+%if %1==15
+ shr t5d, 1
+%elif %1==8
+ and t5d, 0xff
+%elif %1==4
+ and t5d, 0xf
+%endif
+ xor t5d, (1<<%1)-1
+ mov [t1+4], t5d
+ shl t5d, 32-%1
mov t4d, %1-1
LZCOUNT t3d, t5d, 0x1f
xor t6d, t6d
@@ -1365,12 +1373,12 @@ cglobal coeff_level_run%1,0,7
LZCOUNT t3d, t5d, 0x1f
%ifdef HIGH_BIT_DEPTH
mov t2d, [t0+t4*4]
- mov [t1+t6 +4+16*4], t3b
- mov [t1+t6*4+ 4], t2d
+ mov [t1+t6+8+16*4], t3b
+ mov [t1+t6*4+ 8], t2d
%else
mov t2w, [t0+t4*2]
- mov [t1+t6 +4+16*2], t3b
- mov [t1+t6*2+ 4], t2w
+ mov [t1+t6+8+16*2], t3b
+ mov [t1+t6*2+ 8], t2w
%endif
inc t3d
shl t5d, t3b
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index 26af61f..29ed0b0 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -132,6 +132,7 @@ static int x264_cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dct
runlevel.level[1] = 2;
runlevel.level[2] = 2;
i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel );
+ x264_prefetch( &x264_run_before[runlevel.mask] );
i_total_zero = runlevel.last + 1 - i_total;
i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
@@ -188,12 +189,8 @@ static int x264_cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dct
else if( (uint8_t)i_total < count_cat[ctx_block_cat] )
bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
- for( int i = 0; i < i_total-1 && i_total_zero > 0; i++ )
- {
- int i_zl = X264_MIN( i_total_zero, 7 );
- bs_write_vlc( s, x264_run_before[i_zl-1][runlevel.run[i]] );
- i_total_zero -= runlevel.run[i];
- }
+ int zero_run_code = x264_run_before[runlevel.mask];
+ bs_write( s, zero_run_code&0x1f, zero_run_code>>5 );
return i_total;
}
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 607ece1..253aabb 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1173,10 +1173,6 @@ x264_t *x264_encoder_open( x264_param_t *param )
x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c );
x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter );
x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
- if( h->param.b_cabac )
- x264_cabac_init( h );
- else
- x264_cavlc_init();
x264_pixel_init( h->param.cpu, &h->pixf );
x264_dct_init( h->param.cpu, &h->dctf );
x264_zigzag_init( h->param.cpu, &h->zigzagf_progressive, &h->zigzagf_interlaced );
@@ -1186,6 +1182,10 @@ x264_t *x264_encoder_open( x264_param_t *param )
x264_deblock_init( h->param.cpu, &h->loopf, PARAM_INTERLACED );
x264_bitstream_init( h->param.cpu, &h->bsf );
x264_dct_init_weights();
+ if( h->param.b_cabac )
+ x264_cabac_init( h );
+ else
+ x264_cavlc_init( h );
mbcmp_init( h );
chroma_dsp_init( h );
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 249a6bb..b97b001 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -2013,6 +2013,7 @@ static int check_quant( int cpu_ref, int cpu_new )
int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \
int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \
if( result_c != result_a || runlevel_c.last != runlevel_a.last || \
+ runlevel_c.mask != runlevel_a.mask || \
memcmp(runlevel_c.level, runlevel_a.level, sizeof(dctcoef)*result_c) || \
memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \
{ \
More information about the x264-devel
mailing list