[x264-devel] commit: Faster 8x8dct+CAVLC interleave (Jason Garrett-Glaser)
git version control
git at videolan.org
Tue Feb 3 06:22:03 CET 2009
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Sat Jan 31 05:00:39 2009 -0800| [c0be8106d40b2ccbfec37229afaecf236b03762c] | committer: Jason Garrett-Glaser
Faster 8x8dct+CAVLC interleave
Integrate array_non_zero with the CAVLC 8x8dct interleave function.
Roughly 1.5-2x faster than the original separate array_non_zero method.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=c0be8106d40b2ccbfec37229afaecf236b03762c
---
common/dct.c | 9 ++++++-
common/dct.h | 2 +-
common/x86/dct-a.asm | 59 +++++++++++++++++++++++++++++++++++++------------
common/x86/dct.h | 2 +-
encoder/cavlc.c | 6 +----
tools/checkasm.c | 25 ++++++++++++++++++++-
6 files changed, 79 insertions(+), 24 deletions(-)
diff --git a/common/dct.c b/common/dct.c
index 5f9f0fb..f609540 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -608,12 +608,19 @@ static void zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8
#undef ZIG
#undef COPY4x4
-static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src )
+static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
{
int i,j;
for( i=0; i<4; i++ )
+ {
+ int nz = 0;
for( j=0; j<16; j++ )
+ {
+ nz |= src[i+j*4];
dst[i*16+j] = src[i+j*4];
+ }
+ nnz[(i&1) + (i>>1)*8] = !!nz;
+ }
}
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
diff --git a/common/dct.h b/common/dct.h
index 71951f9..3819ce1 100644
--- a/common/dct.h
+++ b/common/dct.h
@@ -119,7 +119,7 @@ typedef struct
void (*scan_4x4)( int16_t level[16], int16_t dct[4][4] );
void (*sub_8x8)( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst );
void (*sub_4x4)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst );
- void (*interleave_8x8_cavlc)( int16_t *dst, int16_t *src );
+ void (*interleave_8x8_cavlc)( int16_t *dst, int16_t *src, uint8_t *nnz );
} x264_zigzag_function_t;
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 156a7ae..b660497 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -34,6 +34,7 @@ pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
+pb_1: times 8 db 1
SECTION .text
@@ -737,19 +738,47 @@ cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
movdqa [r0+16], xmm1
RET
-INIT_MMX
-cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 2,3
- mov r2d, 24
-.loop:
- movq m0, [r1+r2*4+ 0]
- movq m1, [r1+r2*4+ 8]
- movq m2, [r1+r2*4+16]
- movq m3, [r1+r2*4+24]
+;-----------------------------------------------------------------------------
+; void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz )
+;-----------------------------------------------------------------------------
+
+%macro INTERLEAVE 1
+ movq m0, [r1+%1*4+ 0]
+ movq m1, [r1+%1*4+ 8]
+ movq m2, [r1+%1*4+16]
+ movq m3, [r1+%1*4+24]
TRANSPOSE4x4W 0,1,2,3,4
- movq [r0+r2+ 0], m0
- movq [r0+r2+32], m1
- movq [r0+r2+64], m2
- movq [r0+r2+96], m3
- sub r2d, 8
- jge .loop
- REP_RET
+ movq [r0+%1+ 0], m0
+ movq [r0+%1+32], m1
+ movq [r0+%1+64], m2
+ movq [r0+%1+96], m3
+%if %1
+ packsswb m0, m1
+ por m6, m2
+ por m7, m3
+ por m5, m0
+%else
+ packsswb m0, m1
+ SWAP m5, m0
+ SWAP m6, m2
+ SWAP m7, m3
+%endif
+%endmacro
+
+INIT_MMX
+cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3
+ INTERLEAVE 0
+ INTERLEAVE 8
+ INTERLEAVE 16
+ INTERLEAVE 24
+ packsswb m6, m7
+ packsswb m5, m6
+ packsswb m5, m5
+ pxor m0, m0
+ pcmpeqb m5, m0
+ paddb m5, [pb_1 GLOBAL]
+ movd r0d, m5
+ mov [r2+0], r0w
+ shr r0d, 16
+ mov [r2+8], r0w
+ RET
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 9939276..7617ea5 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -61,6 +61,6 @@ void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
-void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src );
+void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz );
#endif
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index 4644f2e..052a16c 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -273,11 +273,7 @@ static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8s
/* shuffle 8x8 dct coeffs into 4x4 lists */
for( i8 = i8start; i8 <= i8end; i8++ )
if( h->mb.i_cbp_luma & (1 << i8) )
- {
- h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8] );
- for( i4 = 0; i4 < 4; i4++ )
- h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero( h->dct.luma4x4[i4+i8*4] );
- }
+ h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8], &h->mb.cache.non_zero_count[x264_scan8[i8*4]] );
}
for( i8 = i8start; i8 <= i8end; i8++ )
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 71fcb19..5e6ade8 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -635,6 +635,26 @@ static int check_dct( int cpu_ref, int cpu_new )
call_a2( zigzag_asm.name, t2, buf2, buf4 ); \
}
+#define TEST_INTERLEAVE( name, t1, t2, dct, size ) \
+ if( zigzag_asm.name != zigzag_ref.name ) \
+ { \
+ for( j=0; j<100; j++ ) \
+ { \
+ set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
+ used_asm = 1; \
+ memcpy(dct, buf1, size*sizeof(int16_t));\
+ for( i=0; i<size; i++ ) \
+ dct[i] = rand()&0x1F ? 0 : dct[i]; \
+ memcpy(buf3, buf4, 10*sizeof(uint8_t)); \
+ call_c( zigzag_c.name, t1, dct, buf3 ); \
+ call_a( zigzag_asm.name, t2, dct, buf4 ); \
+ if( memcmp( t1, t2, size*sizeof(int16_t) ) || memcmp( buf3, buf4, 10*sizeof(uint8_t) ) ) \
+ { \
+ ok = 0; \
+ } \
+ } \
+ }
+
interlace = 0;
x264_zigzag_init( 0, &zigzag_c, 0 );
x264_zigzag_init( cpu_ref, &zigzag_ref, 0 );
@@ -643,7 +663,6 @@ static int check_dct( int cpu_ref, int cpu_new )
ok = 1; used_asm = 0;
TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 );
- TEST_ZIGZAG_SCAN( interleave_8x8_cavlc, level1, level2, (void*)dct1, 64 );
TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
report( "zigzag_frame :" );
@@ -657,6 +676,10 @@ static int check_dct( int cpu_ref, int cpu_new )
TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 );
TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
report( "zigzag_field :" );
+
+ ok = 1; used_asm = 0;
+ TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct1[0][0], 64 );
+ report( "zigzag_interleave :" );
#undef TEST_ZIGZAG_SCAN
#undef TEST_ZIGZAG_SUB
More information about the x264-devel mailing list