Does the call to x264_cavlc_init in x264_encoder_open need to be wrapped in x264_stack_align?<div><br></div><div>Without it, my app crashes on Win32 x86 when it is built with MSVC 2010 and linked against the libx264 DLL (compiled with MinGW GCC 4.6.2).</div>
<div><br><br><div class="gmail_quote">On Sun, Jan 15, 2012 at 8:11 PM, Jason Garrett-Glaser <span dir="ltr">&lt;<a href="mailto:git@videolan.org">git@videolan.org</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
x264 | branch: master | Jason Garrett-Glaser &lt;<a href="mailto:jason@x264.com">jason@x264.com</a>&gt; | Thu Dec 8 13:45:41 2011 -0800| [c032fbaa3801fb4cf8dd1dd95a6479ca5bd262e2] | committer: Jason Garrett-Glaser<br>
<br>
Use a large LUT for CAVLC zero-run bit codes<br>
Helps the most with trellis and RD, but also helps with bitstream writing.<br>
Seems at worst neutral even in the extreme case of a CPU with small L2 cache (e.g. ARM Cortex A8).<br>
<br>
> <a href="http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=c032fbaa3801fb4cf8dd1dd95a6479ca5bd262e2" target="_blank">http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=c032fbaa3801fb4cf8dd1dd95a6479ca5bd262e2</a><br>
---<br>
<br>
common/bitstream.h | 7 ++++++-<br>
common/common.h | 2 +-<br>
common/quant.c | 3 +++<br>
common/vlc.c | 28 ++++++++++++++++++++++++++--<br>
common/x86/quant-a.asm | 20 ++++++++++++++------<br>
encoder/cavlc.c | 9 +++------<br>
encoder/encoder.c | 8 ++++----<br>
tools/checkasm.c | 1 +<br>
8 files changed, 58 insertions(+), 20 deletions(-)<br>
<br>
diff --git a/common/bitstream.h b/common/bitstream.h<br>
index 1a15338..f407e1d 100644<br>
--- a/common/bitstream.h<br>
+++ b/common/bitstream.h<br>
@@ -56,6 +56,7 @@ typedef struct bs_s<br>
typedef struct<br>
{<br>
int last;<br>
+ int mask;<br>
dctcoef level[16];<br>
uint8_t run[16];<br>
} x264_run_level_t;<br>
@@ -65,7 +66,6 @@ extern const vlc_t x264_coeff_token[6][16][4];<br>
extern const vlc_t x264_total_zeros[15][16];<br>
extern const vlc_t x264_total_zeros_2x2_dc[3][4];<br>
extern const vlc_t x264_total_zeros_2x4_dc[7][8];<br>
-extern const vlc_t x264_run_before[7][16];<br>
<br>
typedef struct<br>
{<br>
@@ -82,6 +82,11 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );<br>
#define LEVEL_TABLE_SIZE 128<br>
extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];<br>
<br>
+/* The longest possible set of zero run codes sums to 25 bits. This leaves<br>
+ * plenty of room for both the code (25 bits) and size (5 bits) in a uint32_t. */<br>
+<br>
+extern uint32_t x264_run_before[1<<16];<br>
+<br>
static inline void bs_init( bs_t *s, void *p_data, int i_data )<br>
{<br>
int offset = ((intptr_t)p_data & 3);<br>
diff --git a/common/common.h b/common/common.h<br>
index 2704f29..b6cec65 100644<br>
--- a/common/common.h<br>
+++ b/common/common.h<br>
@@ -236,7 +236,7 @@ void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );<br>
<br>
void x264_reduce_fraction( uint32_t *n, uint32_t *d );<br>
void x264_reduce_fraction64( uint64_t *n, uint64_t *d );<br>
-void x264_cavlc_init( void );<br>
+void x264_cavlc_init( x264_t *h );<br>
void x264_cabac_init( x264_t *h );<br>
<br>
static ALWAYS_INLINE pixel x264_clip_pixel( int x )<br>
diff --git a/common/quant.c b/common/quant.c<br>
index 9b6b6d8..3897f53 100644<br>
--- a/common/quant.c<br>
+++ b/common/quant.c<br>
@@ -373,14 +373,17 @@ static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel )<br>
{\<br>
int i_last = runlevel->last = x264_coeff_last##num(dct);\<br>
int i_total = 0;\<br>
+ int mask = 0;\<br>
do\<br>
{\<br>
int r = 0;\<br>
runlevel->level[i_total] = dct[i_last];\<br>
+ mask |= 1 << (i_last);\<br>
while( --i_last >= 0 && dct[i_last] == 0 )\<br>
r++;\<br>
runlevel->run[i_total++] = r;\<br>
} while( i_last >= 0 );\<br>
+ runlevel->mask = mask;\<br>
return i_total;\<br>
}<br>
<br>
diff --git a/common/vlc.c b/common/vlc.c<br>
index c4c3ad3..9adcc89 100644<br>
--- a/common/vlc.c<br>
+++ b/common/vlc.c<br>
@@ -738,7 +738,7 @@ const vlc_t x264_total_zeros_2x4_dc[7][8] =<br>
};<br>
<br>
/* [MIN( i_zero_left-1, 6 )][run_before] */<br>
-const vlc_t x264_run_before[7][16] =<br>
+static const vlc_t run_before[7][16] =<br>
{<br>
{ /* i_zero_left 1 */<br>
{ 0x1, 1 }, /* str=1 */<br>
@@ -799,8 +799,9 @@ const vlc_t x264_run_before[7][16] =<br>
};<br>
<br>
vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];<br>
+uint32_t x264_run_before[1<<16];<br>
<br>
-void x264_cavlc_init( void )<br>
+void x264_cavlc_init( x264_t *h )<br>
{<br>
for( int i_suffix = 0; i_suffix < 7; i_suffix++ )<br>
for( int16_t level = -LEVEL_TABLE_SIZE/2; level < LEVEL_TABLE_SIZE/2; level++ )<br>
@@ -840,4 +841,27 @@ void x264_cavlc_init( void )<br>
i_next++;<br>
vlc->i_next = i_next;<br>
}<br>
+<br>
+ for( int i = 1; i < (1<<16); i++ )<br>
+ {<br>
+ x264_run_level_t runlevel;<br>
+ ALIGNED_ARRAY_16( dctcoef, dct, [16] );<br>
+ int size = 0;<br>
+ int bits = 0;<br>
+ for( int j = 0; j < 16; j++ )<br>
+ dct[j] = i&(1<<j);<br>
+ int total = h->quantf.coeff_level_run[DCT_LUMA_4x4]( dct, &runlevel );<br>
+ int zeros = runlevel.last + 1 - total;<br>
+ for( int j = 0; j < total-1 && zeros > 0; j++ )<br>
+ {<br>
+ int idx = X264_MIN(zeros, 7) - 1;<br>
+ int run = runlevel.run[j];<br>
+ int len = run_before[idx][run].i_size;<br>
+ size += len;<br>
+ bits <<= len;<br>
+ bits |= run_before[idx][run].i_bits;<br>
+ zeros -= run;<br>
+ }<br>
+ x264_run_before[i] = (bits << 5) + size;<br>
+ }<br>
}<br>
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm<br>
index 7d33a5e..7a4a9c0 100644<br>
--- a/common/x86/quant-a.asm<br>
+++ b/common/x86/quant-a.asm<br>
@@ -1352,8 +1352,16 @@ cglobal coeff_level_run%1,0,7<br>
movifnidn t1, r1mp<br>
pxor m2, m2<br>
LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d<br>
- not t5d<br>
- shl t5d, 32-((%1+1)&~1)<br>
+%if %1==15<br>
+ shr t5d, 1<br>
+%elif %1==8<br>
+ and t5d, 0xff<br>
+%elif %1==4<br>
+ and t5d, 0xf<br>
+%endif<br>
+ xor t5d, (1<<%1)-1<br>
+ mov [t1+4], t5d<br>
+ shl t5d, 32-%1<br>
mov t4d, %1-1<br>
LZCOUNT t3d, t5d, 0x1f<br>
xor t6d, t6d<br>
@@ -1365,12 +1373,12 @@ cglobal coeff_level_run%1,0,7<br>
LZCOUNT t3d, t5d, 0x1f<br>
%ifdef HIGH_BIT_DEPTH<br>
mov t2d, [t0+t4*4]<br>
- mov [t1+t6 +4+16*4], t3b<br>
- mov [t1+t6*4+ 4], t2d<br>
+ mov [t1+t6+8+16*4], t3b<br>
+ mov [t1+t6*4+ 8], t2d<br>
%else<br>
mov t2w, [t0+t4*2]<br>
- mov [t1+t6 +4+16*2], t3b<br>
- mov [t1+t6*2+ 4], t2w<br>
+ mov [t1+t6+8+16*2], t3b<br>
+ mov [t1+t6*2+ 8], t2w<br>
%endif<br>
inc t3d<br>
shl t5d, t3b<br>
diff --git a/encoder/cavlc.c b/encoder/cavlc.c<br>
index 26af61f..29ed0b0 100644<br>
--- a/encoder/cavlc.c<br>
+++ b/encoder/cavlc.c<br>
@@ -132,6 +132,7 @@ static int x264_cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dct<br>
runlevel.level[1] = 2;<br>
runlevel.level[2] = 2;<br>
i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel );<br>
+ x264_prefetch( &x264_run_before[runlevel.mask] );<br>
i_total_zero = runlevel.last + 1 - i_total;<br>
<br>
i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1<br>
@@ -188,12 +189,8 @@ static int x264_cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dct<br>
else if( (uint8_t)i_total < count_cat[ctx_block_cat] )<br>
bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );<br>
<br>
- for( int i = 0; i < i_total-1 && i_total_zero > 0; i++ )<br>
- {<br>
- int i_zl = X264_MIN( i_total_zero, 7 );<br>
- bs_write_vlc( s, x264_run_before[i_zl-1][runlevel.run[i]] );<br>
- i_total_zero -= runlevel.run[i];<br>
- }<br>
+ int zero_run_code = x264_run_before[runlevel.mask];<br>
+ bs_write( s, zero_run_code&0x1f, zero_run_code>>5 );<br>
<br>
return i_total;<br>
}<br>
diff --git a/encoder/encoder.c b/encoder/encoder.c<br>
index 607ece1..253aabb 100644<br>
--- a/encoder/encoder.c<br>
+++ b/encoder/encoder.c<br>
@@ -1173,10 +1173,6 @@ x264_t *x264_encoder_open( x264_param_t *param )<br>
x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c );<br>
x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter );<br>
x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );<br>
- if( h->param.b_cabac )<br>
- x264_cabac_init( h );<br>
- else<br>
- x264_cavlc_init();<br>
x264_pixel_init( h->param.cpu, &h->pixf );<br>
x264_dct_init( h->param.cpu, &h->dctf );<br>
x264_zigzag_init( h->param.cpu, &h->zigzagf_progressive, &h->zigzagf_interlaced );<br>
@@ -1186,6 +1182,10 @@ x264_t *x264_encoder_open( x264_param_t *param )<br>
x264_deblock_init( h->param.cpu, &h->loopf, PARAM_INTERLACED );<br>
x264_bitstream_init( h->param.cpu, &h->bsf );<br>
x264_dct_init_weights();<br>
+ if( h->param.b_cabac )<br>
+ x264_cabac_init( h );<br>
+ else<br>
+ x264_cavlc_init( h );<br>
<br>
mbcmp_init( h );<br>
chroma_dsp_init( h );<br>
diff --git a/tools/checkasm.c b/tools/checkasm.c<br>
index 249a6bb..b97b001 100644<br>
--- a/tools/checkasm.c<br>
+++ b/tools/checkasm.c<br>
@@ -2013,6 +2013,7 @@ static int check_quant( int cpu_ref, int cpu_new )<br>
int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \<br>
int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \<br>
if( result_c != result_a || runlevel_c.last != runlevel_a.last || \<br>
+ runlevel_c.mask != runlevel_a.mask || \<br>
memcmp(runlevel_c.level, runlevel_a.level, sizeof(dctcoef)*result_c) || \<br>
memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \<br>
{ \<br>
<br>
_______________________________________________<br>
x264-devel mailing list<br>
<a href="mailto:x264-devel@videolan.org">x264-devel@videolan.org</a><br>
<a href="http://mailman.videolan.org/listinfo/x264-devel" target="_blank">http://mailman.videolan.org/listinfo/x264-devel</a><br>
</blockquote></div><br></div>