[x264-devel] Use a large LUT for CAVLC zero-run bit codes

Wed Feb 22 20:26:10 CET 2012

Does the call to x264_cavlc_init need to be wrapped in x264_stack_align?

Without it, I get crashes in my app on Win32 x86 when it is built with
MSVC 2010 and linked against the libx264 DLL (compiled with MinGW GCC 4.6.2).

On Sun, Jan 15, 2012 at 8:11 PM, Jason Garrett-Glaser <git at videolan.org> wrote:

> x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Thu Dec
>  8 13:45:41 2011 -0800| [c032fbaa3801fb4cf8dd1dd95a6479ca5bd262e2] |
> committer: Jason Garrett-Glaser
>
> Use a large LUT for CAVLC zero-run bit codes
> Helps the most with trellis and RD, but also helps with bitstream writing.
> Seems at worst neutral even in the extreme case of a CPU with small L2
> cache (e.g. ARM Cortex A8).
>
> > http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=c032fbaa3801fb4cf8dd1dd95a6479ca5bd262e2
> ---
>
>  common/bitstream.h     |    7 ++++++-
>  common/common.h        |    2 +-
>  common/quant.c         |    3 +++
>  common/vlc.c           |   28 ++++++++++++++++++++++++++--
>  common/x86/quant-a.asm |   20 ++++++++++++++------
>  encoder/cavlc.c        |    9 +++------
>  encoder/encoder.c      |    8 ++++----
>  tools/checkasm.c       |    1 +
>  8 files changed, 58 insertions(+), 20 deletions(-)
>
> diff --git a/common/bitstream.h b/common/bitstream.h
> index 1a15338..f407e1d 100644
> --- a/common/bitstream.h
> +++ b/common/bitstream.h
> @@ -56,6 +56,7 @@ typedef struct bs_s
>  typedef struct
>  {
>     int     last;
> +    int     mask;
>     dctcoef level[16];
>     uint8_t run[16];
>  } x264_run_level_t;
> @@ -65,7 +66,6 @@ extern const vlc_t x264_coeff_token[6][16][4];
>  extern const vlc_t x264_total_zeros[15][16];
>  extern const vlc_t x264_total_zeros_2x2_dc[3][4];
>  extern const vlc_t x264_total_zeros_2x4_dc[7][8];
> -extern const vlc_t x264_run_before[7][16];
>
>  typedef struct
>  {
> @@ -82,6 +82,11 @@ void x264_bitstream_init( int cpu,
> x264_bitstream_function_t *pf );
>  #define LEVEL_TABLE_SIZE 128
>  extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
>
> +/* The longest possible set of zero run codes sums to 25 bits.  This
> leaves
> + * plenty of room for both the code (25 bits) and size (5 bits) in a
> uint32_t. */
> +
> +extern uint32_t x264_run_before[1<<16];
> +
>  static inline void bs_init( bs_t *s, void *p_data, int i_data )
>  {
>     int offset = ((intptr_t)p_data & 3);
> diff --git a/common/common.h b/common/common.h
> index 2704f29..b6cec65 100644
> --- a/common/common.h
> +++ b/common/common.h
> @@ -236,7 +236,7 @@ void x264_log( x264_t *h, int i_level, const char
> *psz_fmt, ... );
>
>  void x264_reduce_fraction( uint32_t *n, uint32_t *d );
>  void x264_reduce_fraction64( uint64_t *n, uint64_t *d );
> -void x264_cavlc_init( void );
> +void x264_cavlc_init( x264_t *h );
>  void x264_cabac_init( x264_t *h );
>
>  static ALWAYS_INLINE pixel x264_clip_pixel( int x )
> diff --git a/common/quant.c b/common/quant.c
> index 9b6b6d8..3897f53 100644
> --- a/common/quant.c
> +++ b/common/quant.c
> @@ -373,14 +373,17 @@ static int x264_coeff_level_run##num( dctcoef *dct,
> x264_run_level_t *runlevel )
>  {\
>     int i_last = runlevel->last = x264_coeff_last##num(dct);\
>     int i_total = 0;\
> +    int mask = 0;\
>     do\
>     {\
>         int r = 0;\
>         runlevel->level[i_total] = dct[i_last];\
> +        mask |= 1 << (i_last);\
>         while( --i_last >= 0 && dct[i_last] == 0 )\
>             r++;\
>         runlevel->run[i_total++] = r;\
>     } while( i_last >= 0 );\
> +    runlevel->mask = mask;\
>     return i_total;\
>  }
>
> diff --git a/common/vlc.c b/common/vlc.c
> index c4c3ad3..9adcc89 100644
> --- a/common/vlc.c
> +++ b/common/vlc.c
> @@ -738,7 +738,7 @@ const vlc_t x264_total_zeros_2x4_dc[7][8] =
>  };
>
>  /* [MIN( i_zero_left-1, 6 )][run_before] */
> -const vlc_t x264_run_before[7][16] =
> +static const vlc_t run_before[7][16] =
>  {
>     { /* i_zero_left 1 */
>         { 0x1, 1 }, /* str=1 */
> @@ -799,8 +799,9 @@ const vlc_t x264_run_before[7][16] =
>  };
>
>  vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
> +uint32_t x264_run_before[1<<16];
>
> -void x264_cavlc_init( void )
> +void x264_cavlc_init( x264_t *h )
>  {
>     for( int i_suffix = 0; i_suffix < 7; i_suffix++ )
>         for( int16_t level = -LEVEL_TABLE_SIZE/2; level <
> LEVEL_TABLE_SIZE/2; level++ )
> @@ -840,4 +841,27 @@ void x264_cavlc_init( void )
>                 i_next++;
>             vlc->i_next = i_next;
>         }
> +
> +    for( int i = 1; i < (1<<16); i++ )
> +    {
> +        x264_run_level_t runlevel;
> +        ALIGNED_ARRAY_16( dctcoef, dct, [16] );
> +        int size = 0;
> +        int bits = 0;
> +        for( int j = 0; j < 16; j++ )
> +            dct[j] = i&(1<<j);
> +        int total = h->quantf.coeff_level_run[DCT_LUMA_4x4]( dct,
> &runlevel );
> +        int zeros = runlevel.last + 1 - total;
> +        for( int j = 0; j < total-1 && zeros > 0; j++ )
> +        {
> +            int idx = X264_MIN(zeros, 7) - 1;
> +            int run = runlevel.run[j];
> +            int len = run_before[idx][run].i_size;
> +            size += len;
> +            bits <<= len;
> +            bits |= run_before[idx][run].i_bits;
> +            zeros -= run;
> +        }
> +        x264_run_before[i] = (bits << 5) + size;
> +    }
>  }
> diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
> index 7d33a5e..7a4a9c0 100644
> --- a/common/x86/quant-a.asm
> +++ b/common/x86/quant-a.asm
> @@ -1352,8 +1352,16 @@ cglobal coeff_level_run%1,0,7
>     movifnidn t1, r1mp
>     pxor    m2, m2
>     LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
> -    not    t5d
> -    shl    t5d, 32-((%1+1)&~1)
> +%if %1==15
> +    shr   t5d, 1
> +%elif %1==8
> +    and   t5d, 0xff
> +%elif %1==4
> +    and   t5d, 0xf
> +%endif
> +    xor   t5d, (1<<%1)-1
> +    mov   [t1+4], t5d
> +    shl    t5d, 32-%1
>     mov    t4d, %1-1
>     LZCOUNT t3d, t5d, 0x1f
>     xor    t6d, t6d
> @@ -1365,12 +1373,12 @@ cglobal coeff_level_run%1,0,7
>     LZCOUNT t3d, t5d, 0x1f
>  %ifdef HIGH_BIT_DEPTH
>     mov    t2d, [t0+t4*4]
> -    mov   [t1+t6  +4+16*4], t3b
> -    mov   [t1+t6*4+ 4], t2d
> +    mov   [t1+t6+8+16*4], t3b
> +    mov   [t1+t6*4+ 8], t2d
>  %else
>     mov    t2w, [t0+t4*2]
> -    mov   [t1+t6  +4+16*2], t3b
> -    mov   [t1+t6*2+ 4], t2w
> +    mov   [t1+t6+8+16*2], t3b
> +    mov   [t1+t6*2+ 8], t2w
>  %endif
>     inc    t3d
>     shl    t5d, t3b
> diff --git a/encoder/cavlc.c b/encoder/cavlc.c
> index 26af61f..29ed0b0 100644
> --- a/encoder/cavlc.c
> +++ b/encoder/cavlc.c
> @@ -132,6 +132,7 @@ static int x264_cavlc_block_residual_internal( x264_t
> *h, int ctx_block_cat, dct
>     runlevel.level[1] = 2;
>     runlevel.level[2] = 2;
>     i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel );
> +    x264_prefetch( &x264_run_before[runlevel.mask] );
>     i_total_zero = runlevel.last + 1 - i_total;
>
>     i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31)
> & 1) // abs(runlevel.level[0])>1
> @@ -188,12 +189,8 @@ static int x264_cavlc_block_residual_internal( x264_t
> *h, int ctx_block_cat, dct
>     else if( (uint8_t)i_total < count_cat[ctx_block_cat] )
>         bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
>
> -    for( int i = 0; i < i_total-1 && i_total_zero > 0; i++ )
> -    {
> -        int i_zl = X264_MIN( i_total_zero, 7 );
> -        bs_write_vlc( s, x264_run_before[i_zl-1][runlevel.run[i]] );
> -        i_total_zero -= runlevel.run[i];
> -    }
> +    int zero_run_code = x264_run_before[runlevel.mask];
> +    bs_write( s, zero_run_code&0x1f, zero_run_code>>5 );
>
>     return i_total;
>  }
> diff --git a/encoder/encoder.c b/encoder/encoder.c
> index 607ece1..253aabb 100644
> --- a/encoder/encoder.c
> +++ b/encoder/encoder.c
> @@ -1173,10 +1173,6 @@ x264_t *x264_encoder_open( x264_param_t *param )
>     x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c );
>     x264_predict_8x8_init( h->param.cpu, h->predict_8x8,
> &h->predict_8x8_filter );
>     x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
> -    if( h->param.b_cabac )
> -        x264_cabac_init( h );
> -    else
> -        x264_cavlc_init();
>     x264_pixel_init( h->param.cpu, &h->pixf );
>     x264_dct_init( h->param.cpu, &h->dctf );
>     x264_zigzag_init( h->param.cpu, &h->zigzagf_progressive,
> &h->zigzagf_interlaced );
> @@ -1186,6 +1182,10 @@ x264_t *x264_encoder_open( x264_param_t *param )
>     x264_deblock_init( h->param.cpu, &h->loopf, PARAM_INTERLACED );
>     x264_bitstream_init( h->param.cpu, &h->bsf );
>     x264_dct_init_weights();
> +    if( h->param.b_cabac )
> +        x264_cabac_init( h );
> +    else
> +        x264_cavlc_init( h );
>
>     mbcmp_init( h );
>     chroma_dsp_init( h );
> diff --git a/tools/checkasm.c b/tools/checkasm.c
> index 249a6bb..b97b001 100644
> --- a/tools/checkasm.c
> +++ b/tools/checkasm.c
> @@ -2013,6 +2013,7 @@ static int check_quant( int cpu_ref, int cpu_new )
>             int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \
>             int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \
>             if( result_c != result_a || runlevel_c.last != runlevel_a.last
> || \
> +                runlevel_c.mask != runlevel_a.mask || \
>                 memcmp(runlevel_c.level, runlevel_a.level,
> sizeof(dctcoef)*result_c) || \
>                 memcmp(runlevel_c.run, runlevel_a.run,
> sizeof(uint8_t)*(result_c-1)) ) \
>             { \
>
> _______________________________________________
> x264-devel mailing list
> x264-devel at videolan.org
> http://mailman.videolan.org/listinfo/x264-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x264-devel/attachments/20120222/9648581f/attachment-0001.html>