[x264-devel] commit: Add assembly version of CAVLC 8x8dct interleave (Loren Merritt)
git version control
git at videolan.org
Fri Oct 31 16:57:55 CET 2008
x264 | branch: master | Loren Merritt <pengvado at akuvian.org> | Thu Oct 23 13:45:04 2008 -0700| [990274cd5fd276bb26ac0fa13fc9bc1cbcf7acbc] | committer: Jason Garrett-Glaser
Add assembly version of CAVLC 8x8dct interleave
Faster CAVLC encoding and RDO with 8x8dct
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=990274cd5fd276bb26ac0fa13fc9bc1cbcf7acbc
---
common/dct.c | 14 ++++++++++++++
common/dct.h | 1 +
common/x86/dct-a.asm | 17 +++++++++++++++++
common/x86/dct.h | 1 +
encoder/cavlc.c | 6 ++----
tools/checkasm.c | 1 +
6 files changed, 36 insertions(+), 4 deletions(-)
diff --git a/common/dct.c b/common/dct.c
index 3655296..b05f604 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -582,6 +582,14 @@ static void zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8
#undef ZIG
#undef COPY4x4
+static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src )
+{
+ int i,j;
+ for( i=0; i<4; i++ )
+ for( j=0; j<16; j++ )
+ dst[i*16+j] = src[i+j*4];
+}
+
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
{
if( b_interlaced )
@@ -627,4 +635,10 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
pf->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
#endif
}
+
+ pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
+#ifdef HAVE_MMX
+ if( cpu&X264_CPU_MMX )
+ pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
+#endif
}
diff --git a/common/dct.h b/common/dct.h
index 8c1ee40..3e26175 100644
--- a/common/dct.h
+++ b/common/dct.h
@@ -120,6 +120,7 @@ typedef struct
void (*scan_4x4)( int16_t level[16], int16_t dct[4][4] );
void (*sub_8x8)( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst );
void (*sub_4x4)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst );
+ void (*interleave_8x8_cavlc)( int16_t *dst, int16_t *src );
} x264_zigzag_function_t;
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 33d1836..a2c09d9 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -548,3 +548,20 @@ cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
movdqa [r0], xmm0
movdqa [r0+16], xmm1
RET
+
+INIT_MMX
+cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 2,3
+ mov r2d, 24
+.loop:
+ movq m0, [r1+r2*4+ 0]
+ movq m1, [r1+r2*4+ 8]
+ movq m2, [r1+r2*4+16]
+ movq m3, [r1+r2*4+24]
+ TRANSPOSE4x4W 0,1,2,3,4
+ movq [r0+r2+ 0], m0
+ movq [r0+r2+32], m1
+ movq [r0+r2+64], m2
+ movq [r0+r2+96], m3
+ sub r2d, 8
+ jge .loop
+ REP_RET
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 46c9871..4617f97 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -56,5 +56,6 @@ void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
+void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src );
#endif
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index 7f3588d..1d1f356 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -273,15 +273,13 @@ static void cavlc_mb8x8_mvd( x264_t *h, bs_t *s, int i_list, int i )
static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8start, int i8end )
{
- int i8, i4, i;
+ int i8, i4;
if( h->mb.b_transform_8x8 )
{
/* shuffle 8x8 dct coeffs into 4x4 lists */
for( i8 = i8start; i8 <= i8end; i8++ )
if( h->mb.i_cbp_luma & (1 << i8) )
- for( i4 = 0; i4 < 4; i4++ )
- for( i = 0; i < 16; i++ )
- h->dct.luma4x4[i4+i8*4][i] = h->dct.luma8x8[i8][i4+i*4];
+ h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8] );
}
for( i8 = i8start; i8 <= i8end; i8++ )
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 295c25f..2ba4267 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -645,6 +645,7 @@ static int check_dct( int cpu_ref, int cpu_new )
ok = 1; used_asm = 0;
TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 );
+ TEST_ZIGZAG_SCAN( interleave_8x8_cavlc, level1, level2, (void*)dct1, 64 );
TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
report( "zigzag_frame :" );
More information about the x264-devel mailing list