[x264-devel] Use a large LUT for CAVLC zero-run bit codes
David Sze
sze.david at gmail.com
Wed Feb 22 20:26:10 CET 2012
Does the call to x264_cavlc_init need to be wrapped in x264_stack_align?
Without it I get crashes in my app on Win32 x86 when linked with MSVC2010
against the libx264 DLL (compiled with MinGW gcc 4.6.2).
On Sun, Jan 15, 2012 at 8:11 PM, Jason Garrett-Glaser <git at videolan.org> wrote:
> x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Thu Dec 8 13:45:41 2011 -0800| [c032fbaa3801fb4cf8dd1dd95a6479ca5bd262e2] | committer: Jason Garrett-Glaser
>
> Use a large LUT for CAVLC zero-run bit codes
> Helps the most with trellis and RD, but also helps with bitstream writing.
> Seems at worst neutral even in the extreme case of a CPU with small L2
> cache (e.g. ARM Cortex A8).
>
> > http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=c032fbaa3801fb4cf8dd1dd95a6479ca5bd262e2
> ---
>
> common/bitstream.h | 7 ++++++-
> common/common.h | 2 +-
> common/quant.c | 3 +++
> common/vlc.c | 28 ++++++++++++++++++++++++++--
> common/x86/quant-a.asm | 20 ++++++++++++++------
> encoder/cavlc.c | 9 +++------
> encoder/encoder.c | 8 ++++----
> tools/checkasm.c | 1 +
> 8 files changed, 58 insertions(+), 20 deletions(-)
>
> diff --git a/common/bitstream.h b/common/bitstream.h
> index 1a15338..f407e1d 100644
> --- a/common/bitstream.h
> +++ b/common/bitstream.h
> @@ -56,6 +56,7 @@ typedef struct bs_s
> typedef struct
> {
> int last;
> + int mask;
> dctcoef level[16];
> uint8_t run[16];
> } x264_run_level_t;
> @@ -65,7 +66,6 @@ extern const vlc_t x264_coeff_token[6][16][4];
> extern const vlc_t x264_total_zeros[15][16];
> extern const vlc_t x264_total_zeros_2x2_dc[3][4];
> extern const vlc_t x264_total_zeros_2x4_dc[7][8];
> -extern const vlc_t x264_run_before[7][16];
>
> typedef struct
> {
> @@ -82,6 +82,11 @@ void x264_bitstream_init( int cpu,
> x264_bitstream_function_t *pf );
> #define LEVEL_TABLE_SIZE 128
> extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
>
> +/* The longest possible set of zero run codes sums to 25 bits. This leaves
> + * plenty of room for both the code (25 bits) and size (5 bits) in a uint32_t. */
> +
> +extern uint32_t x264_run_before[1<<16];
> +
> static inline void bs_init( bs_t *s, void *p_data, int i_data )
> {
> int offset = ((intptr_t)p_data & 3);
> diff --git a/common/common.h b/common/common.h
> index 2704f29..b6cec65 100644
> --- a/common/common.h
> +++ b/common/common.h
> @@ -236,7 +236,7 @@ void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
>
> void x264_reduce_fraction( uint32_t *n, uint32_t *d );
> void x264_reduce_fraction64( uint64_t *n, uint64_t *d );
> -void x264_cavlc_init( void );
> +void x264_cavlc_init( x264_t *h );
> void x264_cabac_init( x264_t *h );
>
> static ALWAYS_INLINE pixel x264_clip_pixel( int x )
> diff --git a/common/quant.c b/common/quant.c
> index 9b6b6d8..3897f53 100644
> --- a/common/quant.c
> +++ b/common/quant.c
> @@ -373,14 +373,17 @@ static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel )
> {\
> int i_last = runlevel->last = x264_coeff_last##num(dct);\
> int i_total = 0;\
> + int mask = 0;\
> do\
> {\
> int r = 0;\
> runlevel->level[i_total] = dct[i_last];\
> + mask |= 1 << (i_last);\
> while( --i_last >= 0 && dct[i_last] == 0 )\
> r++;\
> runlevel->run[i_total++] = r;\
> } while( i_last >= 0 );\
> + runlevel->mask = mask;\
> return i_total;\
> }
>
> diff --git a/common/vlc.c b/common/vlc.c
> index c4c3ad3..9adcc89 100644
> --- a/common/vlc.c
> +++ b/common/vlc.c
> @@ -738,7 +738,7 @@ const vlc_t x264_total_zeros_2x4_dc[7][8] =
> };
>
> /* [MIN( i_zero_left-1, 6 )][run_before] */
> -const vlc_t x264_run_before[7][16] =
> +static const vlc_t run_before[7][16] =
> {
> { /* i_zero_left 1 */
> { 0x1, 1 }, /* str=1 */
> @@ -799,8 +799,9 @@ const vlc_t x264_run_before[7][16] =
> };
>
> vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
> +uint32_t x264_run_before[1<<16];
>
> -void x264_cavlc_init( void )
> +void x264_cavlc_init( x264_t *h )
> {
> for( int i_suffix = 0; i_suffix < 7; i_suffix++ )
> for( int16_t level = -LEVEL_TABLE_SIZE/2; level < LEVEL_TABLE_SIZE/2; level++ )
> @@ -840,4 +841,27 @@ void x264_cavlc_init( void )
> i_next++;
> vlc->i_next = i_next;
> }
> +
> + for( int i = 1; i < (1<<16); i++ )
> + {
> + x264_run_level_t runlevel;
> + ALIGNED_ARRAY_16( dctcoef, dct, [16] );
> + int size = 0;
> + int bits = 0;
> + for( int j = 0; j < 16; j++ )
> + dct[j] = i&(1<<j);
> + int total = h->quantf.coeff_level_run[DCT_LUMA_4x4]( dct, &runlevel );
> + int zeros = runlevel.last + 1 - total;
> + for( int j = 0; j < total-1 && zeros > 0; j++ )
> + {
> + int idx = X264_MIN(zeros, 7) - 1;
> + int run = runlevel.run[j];
> + int len = run_before[idx][run].i_size;
> + size += len;
> + bits <<= len;
> + bits |= run_before[idx][run].i_bits;
> + zeros -= run;
> + }
> + x264_run_before[i] = (bits << 5) + size;
> + }
> }
> diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
> index 7d33a5e..7a4a9c0 100644
> --- a/common/x86/quant-a.asm
> +++ b/common/x86/quant-a.asm
> @@ -1352,8 +1352,16 @@ cglobal coeff_level_run%1,0,7
> movifnidn t1, r1mp
> pxor m2, m2
> LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
> - not t5d
> - shl t5d, 32-((%1+1)&~1)
> +%if %1==15
> + shr t5d, 1
> +%elif %1==8
> + and t5d, 0xff
> +%elif %1==4
> + and t5d, 0xf
> +%endif
> + xor t5d, (1<<%1)-1
> + mov [t1+4], t5d
> + shl t5d, 32-%1
> mov t4d, %1-1
> LZCOUNT t3d, t5d, 0x1f
> xor t6d, t6d
> @@ -1365,12 +1373,12 @@ cglobal coeff_level_run%1,0,7
> LZCOUNT t3d, t5d, 0x1f
> %ifdef HIGH_BIT_DEPTH
> mov t2d, [t0+t4*4]
> - mov [t1+t6 +4+16*4], t3b
> - mov [t1+t6*4+ 4], t2d
> + mov [t1+t6+8+16*4], t3b
> + mov [t1+t6*4+ 8], t2d
> %else
> mov t2w, [t0+t4*2]
> - mov [t1+t6 +4+16*2], t3b
> - mov [t1+t6*2+ 4], t2w
> + mov [t1+t6+8+16*2], t3b
> + mov [t1+t6*2+ 8], t2w
> %endif
> inc t3d
> shl t5d, t3b
> diff --git a/encoder/cavlc.c b/encoder/cavlc.c
> index 26af61f..29ed0b0 100644
> --- a/encoder/cavlc.c
> +++ b/encoder/cavlc.c
> @@ -132,6 +132,7 @@ static int x264_cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dct
> runlevel.level[1] = 2;
> runlevel.level[2] = 2;
> i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel );
> + x264_prefetch( &x264_run_before[runlevel.mask] );
> i_total_zero = runlevel.last + 1 - i_total;
>
> i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
> @@ -188,12 +189,8 @@ static int x264_cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dct
> else if( (uint8_t)i_total < count_cat[ctx_block_cat] )
> bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
>
> - for( int i = 0; i < i_total-1 && i_total_zero > 0; i++ )
> - {
> - int i_zl = X264_MIN( i_total_zero, 7 );
> - bs_write_vlc( s, x264_run_before[i_zl-1][runlevel.run[i]] );
> - i_total_zero -= runlevel.run[i];
> - }
> + int zero_run_code = x264_run_before[runlevel.mask];
> + bs_write( s, zero_run_code&0x1f, zero_run_code>>5 );
>
> return i_total;
> }
> diff --git a/encoder/encoder.c b/encoder/encoder.c
> index 607ece1..253aabb 100644
> --- a/encoder/encoder.c
> +++ b/encoder/encoder.c
> @@ -1173,10 +1173,6 @@ x264_t *x264_encoder_open( x264_param_t *param )
> x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c );
> x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter );
> x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
> - if( h->param.b_cabac )
> - x264_cabac_init( h );
> - else
> - x264_cavlc_init();
> x264_pixel_init( h->param.cpu, &h->pixf );
> x264_dct_init( h->param.cpu, &h->dctf );
> x264_zigzag_init( h->param.cpu, &h->zigzagf_progressive, &h->zigzagf_interlaced );
> @@ -1186,6 +1182,10 @@ x264_t *x264_encoder_open( x264_param_t *param )
> x264_deblock_init( h->param.cpu, &h->loopf, PARAM_INTERLACED );
> x264_bitstream_init( h->param.cpu, &h->bsf );
> x264_dct_init_weights();
> + if( h->param.b_cabac )
> + x264_cabac_init( h );
> + else
> + x264_cavlc_init( h );
>
> mbcmp_init( h );
> chroma_dsp_init( h );
> diff --git a/tools/checkasm.c b/tools/checkasm.c
> index 249a6bb..b97b001 100644
> --- a/tools/checkasm.c
> +++ b/tools/checkasm.c
> @@ -2013,6 +2013,7 @@ static int check_quant( int cpu_ref, int cpu_new )
> int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \
> int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \
> if( result_c != result_a || runlevel_c.last != runlevel_a.last || \
> + runlevel_c.mask != runlevel_a.mask || \
> memcmp(runlevel_c.level, runlevel_a.level, sizeof(dctcoef)*result_c) || \
> memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \
> { \
>
> _______________________________________________
> x264-devel mailing list
> x264-devel at videolan.org
> http://mailman.videolan.org/listinfo/x264-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x264-devel/attachments/20120222/9648581f/attachment-0001.html>
More information about the x264-devel mailing list