Does the call to x264_cavlc_init need to be wrapped in x264_stack_align?<div><br></div><div>Without it, I get crashes in my app on Win32 x86 when it is built with MSVC2010 and linked against the libx264 DLL (compiled with MinGW gcc 4.6.2).</div>


<div><br><br><div class="gmail_quote">On Sun, Jan 15, 2012 at 8:11 PM, Jason Garrett-Glaser <span dir="ltr">&lt;<a href="mailto:git@videolan.org">git@videolan.org</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


x264 | branch: master | Jason Garrett-Glaser &lt;<a href="mailto:jason@x264.com">jason@x264.com</a>&gt; | Thu Dec  8 13:45:41 2011 -0800| [c032fbaa3801fb4cf8dd1dd95a6479ca5bd262e2] | committer: Jason Garrett-Glaser<br>

<br>

Use a large LUT for CAVLC zero-run bit codes<br>

Helps the most with trellis and RD, but also helps with bitstream writing.<br>

Seems at worst neutral even in the extreme case of a CPU with small L2 cache (e.g. ARM Cortex A8).<br>

<br>

> <a href="http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=c032fbaa3801fb4cf8dd1dd95a6479ca5bd262e2" target="_blank">http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=c032fbaa3801fb4cf8dd1dd95a6479ca5bd262e2</a><br>


---<br>

<br>

 common/bitstream.h     |    7 ++++++-<br>

 common/common.h        |    2 +-<br>

 common/quant.c         |    3 +++<br>

 common/vlc.c           |   28 ++++++++++++++++++++++++++--<br>

 common/x86/quant-a.asm |   20 ++++++++++++++------<br>

 encoder/cavlc.c        |    9 +++------<br>

 encoder/encoder.c      |    8 ++++----<br>

 tools/checkasm.c       |    1 +<br>

 8 files changed, 58 insertions(+), 20 deletions(-)<br>

<br>

diff --git a/common/bitstream.h b/common/bitstream.h<br>

index 1a15338..f407e1d 100644<br>

--- a/common/bitstream.h<br>

+++ b/common/bitstream.h<br>

@@ -56,6 +56,7 @@ typedef struct bs_s<br>

 typedef struct<br>

 {<br>

     int     last;<br>

+    int     mask;<br>

     dctcoef level[16];<br>

     uint8_t run[16];<br>

 } x264_run_level_t;<br>

@@ -65,7 +66,6 @@ extern const vlc_t x264_coeff_token[6][16][4];<br>

 extern const vlc_t x264_total_zeros[15][16];<br>

 extern const vlc_t x264_total_zeros_2x2_dc[3][4];<br>

 extern const vlc_t x264_total_zeros_2x4_dc[7][8];<br>

-extern const vlc_t x264_run_before[7][16];<br>

<br>

 typedef struct<br>

 {<br>

@@ -82,6 +82,11 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );<br>

 #define LEVEL_TABLE_SIZE 128<br>

 extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];<br>

<br>

+/* The longest possible set of zero run codes sums to 25 bits.  This leaves<br>

+ * plenty of room for both the code (25 bits) and size (5 bits) in a uint32_t. */<br>

+<br>

+extern uint32_t x264_run_before[1<<16];<br>

+<br>

 static inline void bs_init( bs_t *s, void *p_data, int i_data )<br>

 {<br>

     int offset = ((intptr_t)p_data & 3);<br>

diff --git a/common/common.h b/common/common.h<br>

index 2704f29..b6cec65 100644<br>

--- a/common/common.h<br>

+++ b/common/common.h<br>

@@ -236,7 +236,7 @@ void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );<br>

<br>

 void x264_reduce_fraction( uint32_t *n, uint32_t *d );<br>

 void x264_reduce_fraction64( uint64_t *n, uint64_t *d );<br>

-void x264_cavlc_init( void );<br>

+void x264_cavlc_init( x264_t *h );<br>

 void x264_cabac_init( x264_t *h );<br>

<br>

 static ALWAYS_INLINE pixel x264_clip_pixel( int x )<br>

diff --git a/common/quant.c b/common/quant.c<br>

index 9b6b6d8..3897f53 100644<br>

--- a/common/quant.c<br>

+++ b/common/quant.c<br>

@@ -373,14 +373,17 @@ static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel )<br>

 {\<br>

     int i_last = runlevel->last = x264_coeff_last##num(dct);\<br>

     int i_total = 0;\<br>

+    int mask = 0;\<br>

     do\<br>

     {\<br>

         int r = 0;\<br>

         runlevel->level[i_total] = dct[i_last];\<br>

+        mask |= 1 << (i_last);\<br>

         while( --i_last >= 0 && dct[i_last] == 0 )\<br>

             r++;\<br>

         runlevel->run[i_total++] = r;\<br>

     } while( i_last >= 0 );\<br>

+    runlevel->mask = mask;\<br>

     return i_total;\<br>

 }<br>

<br>

diff --git a/common/vlc.c b/common/vlc.c<br>

index c4c3ad3..9adcc89 100644<br>

--- a/common/vlc.c<br>

+++ b/common/vlc.c<br>

@@ -738,7 +738,7 @@ const vlc_t x264_total_zeros_2x4_dc[7][8] =<br>

 };<br>

<br>

 /* [MIN( i_zero_left-1, 6 )][run_before] */<br>

-const vlc_t x264_run_before[7][16] =<br>

+static const vlc_t run_before[7][16] =<br>

 {<br>

     { /* i_zero_left 1 */<br>

         { 0x1, 1 }, /* str=1 */<br>

@@ -799,8 +799,9 @@ const vlc_t x264_run_before[7][16] =<br>

 };<br>

<br>

 vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];<br>

+uint32_t x264_run_before[1<<16];<br>

<br>

-void x264_cavlc_init( void )<br>

+void x264_cavlc_init( x264_t *h )<br>

 {<br>

     for( int i_suffix = 0; i_suffix < 7; i_suffix++ )<br>

         for( int16_t level = -LEVEL_TABLE_SIZE/2; level < LEVEL_TABLE_SIZE/2; level++ )<br>

@@ -840,4 +841,27 @@ void x264_cavlc_init( void )<br>

                 i_next++;<br>

             vlc->i_next = i_next;<br>

         }<br>

+<br>

+    for( int i = 1; i < (1<<16); i++ )<br>

+    {<br>

+        x264_run_level_t runlevel;<br>

+        ALIGNED_ARRAY_16( dctcoef, dct, [16] );<br>

+        int size = 0;<br>

+        int bits = 0;<br>

+        for( int j = 0; j < 16; j++ )<br>

+            dct[j] = i&(1<<j);<br>

+        int total = h->quantf.coeff_level_run[DCT_LUMA_4x4]( dct, &runlevel );<br>

+        int zeros = runlevel.last + 1 - total;<br>

+        for( int j = 0; j < total-1 && zeros > 0; j++ )<br>

+        {<br>

+            int idx = X264_MIN(zeros, 7) - 1;<br>

+            int run = runlevel.run[j];<br>

+            int len = run_before[idx][run].i_size;<br>

+            size += len;<br>

+            bits <<= len;<br>

+            bits |= run_before[idx][run].i_bits;<br>

+            zeros -= run;<br>

+        }<br>

+        x264_run_before[i] = (bits << 5) + size;<br>

+    }<br>

 }<br>

diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm<br>

index 7d33a5e..7a4a9c0 100644<br>

--- a/common/x86/quant-a.asm<br>

+++ b/common/x86/quant-a.asm<br>

@@ -1352,8 +1352,16 @@ cglobal coeff_level_run%1,0,7<br>

     movifnidn t1, r1mp<br>

     pxor    m2, m2<br>

     LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d<br>

-    not    t5d<br>

-    shl    t5d, 32-((%1+1)&~1)<br>

+%if %1==15<br>

+    shr   t5d, 1<br>

+%elif %1==8<br>

+    and   t5d, 0xff<br>

+%elif %1==4<br>

+    and   t5d, 0xf<br>

+%endif<br>

+    xor   t5d, (1<<%1)-1<br>

+    mov   [t1+4], t5d<br>

+    shl    t5d, 32-%1<br>

     mov    t4d, %1-1<br>

     LZCOUNT t3d, t5d, 0x1f<br>

     xor    t6d, t6d<br>

@@ -1365,12 +1373,12 @@ cglobal coeff_level_run%1,0,7<br>

     LZCOUNT t3d, t5d, 0x1f<br>

 %ifdef HIGH_BIT_DEPTH<br>

     mov    t2d, [t0+t4*4]<br>

-    mov   [t1+t6  +4+16*4], t3b<br>

-    mov   [t1+t6*4+ 4], t2d<br>

+    mov   [t1+t6+8+16*4], t3b<br>

+    mov   [t1+t6*4+ 8], t2d<br>

 %else<br>

     mov    t2w, [t0+t4*2]<br>

-    mov   [t1+t6  +4+16*2], t3b<br>

-    mov   [t1+t6*2+ 4], t2w<br>

+    mov   [t1+t6+8+16*2], t3b<br>

+    mov   [t1+t6*2+ 8], t2w<br>

 %endif<br>

     inc    t3d<br>

     shl    t5d, t3b<br>

diff --git a/encoder/cavlc.c b/encoder/cavlc.c<br>

index 26af61f..29ed0b0 100644<br>

--- a/encoder/cavlc.c<br>

+++ b/encoder/cavlc.c<br>

@@ -132,6 +132,7 @@ static int x264_cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dct<br>

     runlevel.level[1] = 2;<br>

     runlevel.level[2] = 2;<br>

     i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel );<br>

+    x264_prefetch( &x264_run_before[runlevel.mask] );<br>

     i_total_zero = runlevel.last + 1 - i_total;<br>

<br>

     i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1<br>

@@ -188,12 +189,8 @@ static int x264_cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dct<br>

     else if( (uint8_t)i_total < count_cat[ctx_block_cat] )<br>

         bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );<br>

<br>

-    for( int i = 0; i < i_total-1 && i_total_zero > 0; i++ )<br>

-    {<br>

-        int i_zl = X264_MIN( i_total_zero, 7 );<br>

-        bs_write_vlc( s, x264_run_before[i_zl-1][runlevel.run[i]] );<br>

-        i_total_zero -= runlevel.run[i];<br>

-    }<br>

+    int zero_run_code = x264_run_before[runlevel.mask];<br>

+    bs_write( s, zero_run_code&0x1f, zero_run_code>>5 );<br>

<br>

     return i_total;<br>

 }<br>

diff --git a/encoder/encoder.c b/encoder/encoder.c<br>

index 607ece1..253aabb 100644<br>

--- a/encoder/encoder.c<br>

+++ b/encoder/encoder.c<br>

@@ -1173,10 +1173,6 @@ x264_t *x264_encoder_open( x264_param_t *param )<br>

     x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c );<br>

     x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter );<br>

     x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );<br>

-    if( h->param.b_cabac )<br>

-        x264_cabac_init( h );<br>

-    else<br>

-        x264_cavlc_init();<br>

     x264_pixel_init( h->param.cpu, &h->pixf );<br>

     x264_dct_init( h->param.cpu, &h->dctf );<br>

     x264_zigzag_init( h->param.cpu, &h->zigzagf_progressive, &h->zigzagf_interlaced );<br>

@@ -1186,6 +1182,10 @@ x264_t *x264_encoder_open( x264_param_t *param )<br>

     x264_deblock_init( h->param.cpu, &h->loopf, PARAM_INTERLACED );<br>

     x264_bitstream_init( h->param.cpu, &h->bsf );<br>

     x264_dct_init_weights();<br>

+    if( h->param.b_cabac )<br>

+        x264_cabac_init( h );<br>

+    else<br>

+        x264_cavlc_init( h );<br>

<br>

     mbcmp_init( h );<br>

     chroma_dsp_init( h );<br>

diff --git a/tools/checkasm.c b/tools/checkasm.c<br>

index 249a6bb..b97b001 100644<br>

--- a/tools/checkasm.c<br>

+++ b/tools/checkasm.c<br>

@@ -2013,6 +2013,7 @@ static int check_quant( int cpu_ref, int cpu_new )<br>

             int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \<br>

             int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \<br>

             if( result_c != result_a || runlevel_c.last != runlevel_a.last || \<br>

+                runlevel_c.mask != runlevel_a.mask || \<br>

                 memcmp(runlevel_c.level, runlevel_a.level, sizeof(dctcoef)*result_c) || \<br>

                 memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \<br>

             { \<br>

<br>

_______________________________________________<br>

x264-devel mailing list<br>

<a href="mailto:x264-devel@videolan.org">x264-devel@videolan.org</a><br>

<a href="http://mailman.videolan.org/listinfo/x264-devel" target="_blank">http://mailman.videolan.org/listinfo/x264-devel</a><br>

</blockquote></div><br></div>