[x264-devel] commit: dequant_4x4_dc assembly (Jason Garrett-Glaser)
git version control
git at videolan.org
Thu Nov 27 08:57:01 CET 2008
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Wed Nov 26 23:42:55 2008 -0800 | [1591275a92faa3d63186e6de1e9022956113bc1d] | committer: Jason Garrett-Glaser
dequant_4x4_dc assembly
About 3.5x faster DC dequant on Conroe
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=1591275a92faa3d63186e6de1e9022956113bc1d
---
common/quant.c | 5 ++-
common/quant.h | 5 +--
common/x86/quant-a.asm | 78 +++++++++++++++++++++++++++++++++++++++++++----
common/x86/quant.h | 2 +
encoder/macroblock.c | 2 +-
tools/checkasm.c | 27 ++++++++++++++++-
6 files changed, 106 insertions(+), 13 deletions(-)
diff --git a/common/quant.c b/common/quant.c
index 348274e..42244e3 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -139,7 +139,7 @@ static void dequant_8x8( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp )
}
}
-void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+static void dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
{
const int i_qbits = i_qp/6 - 6;
int y;
@@ -253,6 +253,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_2x2_dc = quant_2x2_dc;
pf->dequant_4x4 = dequant_4x4;
+ pf->dequant_4x4_dc = dequant_4x4_dc;
pf->dequant_8x8 = dequant_8x8;
pf->denoise_dct = x264_denoise_dct;
@@ -267,6 +268,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4 = x264_quant_4x4_mmx;
pf->quant_8x8 = x264_quant_8x8_mmx;
pf->dequant_4x4 = x264_dequant_4x4_mmx;
+ pf->dequant_4x4_dc = x264_dequant_4x4dc_mmxext;
pf->dequant_8x8 = x264_dequant_8x8_mmx;
if( h->param.i_cqm_preset == X264_CQM_FLAT )
{
@@ -294,6 +296,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4 = x264_quant_4x4_sse2;
pf->quant_8x8 = x264_quant_8x8_sse2;
pf->dequant_4x4 = x264_dequant_4x4_sse2;
+ pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
pf->dequant_8x8 = x264_dequant_8x8_sse2;
if( h->param.i_cqm_preset == X264_CQM_FLAT )
{
diff --git a/common/quant.h b/common/quant.h
index 1024b4f..3b128e6 100644
--- a/common/quant.h
+++ b/common/quant.h
@@ -30,8 +30,9 @@ typedef struct
void (*quant_4x4_dc)( int16_t dct[4][4], int mf, int bias );
void (*quant_2x2_dc)( int16_t dct[2][2], int mf, int bias );
- void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
+ void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
+ void (*dequant_4x4_dc)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void (*denoise_dct)( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
@@ -42,6 +43,4 @@ typedef struct
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
-void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qscale );
-
#endif
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index d55d17e..0abb906 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -255,26 +255,30 @@ QUANT_DC x264_quant_2x2_dc_ssse3, 1
%define t2d r1d
%endif
-;-----------------------------------------------------------------------------
-; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
-;-----------------------------------------------------------------------------
-%macro DEQUANT 4
-cglobal x264_dequant_%2x%2_%1, 0,3
+%macro DEQUANT_START 2
movifnidn t2d, r2m
imul t0d, t2d, 0x2b
shr t0d, 8 ; i_qbits = i_qp / 6
lea t1, [t0*3]
sub t2d, t1d
sub t2d, t1d ; i_mf = i_qp % 6
- shl t2d, %3+2
+ shl t2d, %1
%ifdef ARCH_X86_64
add r1, t2 ; dequant_mf[i_mf]
%else
add r1, r1m ; dequant_mf[i_mf]
mov r0, r0m ; dct
%endif
- sub t0d, %3
+ sub t0d, %2
jl .rshift32 ; negative qbits => rightshift
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+;-----------------------------------------------------------------------------
+%macro DEQUANT 4
+cglobal x264_dequant_%2x%2_%1, 0,3
+ DEQUANT_START %3+2, %3
.lshift:
movd m5, t0d
@@ -339,7 +343,67 @@ INIT_XMM
DEQUANT sse2, 4, 4, 2
DEQUANT sse2, 8, 6, 2
+%macro DEQUANT_DC 1
+cglobal x264_dequant_4x4dc_%1, 0,3
+ DEQUANT_START 6, 6
+
+.lshift:
+ movd m6, [r1]
+ movd m5, t0d
+ pslld m6, m5
+%if mmsize==16
+ pshuflw m6, m6, 0
+ punpcklqdq m6, m6
+%else
+ pshufw m6, m6, 0
+%endif
+%assign x 0
+%rep 16/mmsize
+ mova m0, [r0+mmsize*0+x]
+ mova m1, [r0+mmsize*1+x]
+ pmullw m0, m6
+ pmullw m1, m6
+ mova [r0+mmsize*0+x], m0
+ mova [r0+mmsize*1+x], m1
+%assign x x+mmsize*2
+%endrep
+ RET
+.rshift32:
+ neg t0d
+ movd m5, t0d
+ mova m6, [pw_1 GLOBAL]
+ mova m7, m6
+ pslld m6, m5
+ psrld m6, 1
+ movd m4, [r1]
+%if mmsize==8
+ punpcklwd m4, m4
+%else
+ pshuflw m4, m4, 0
+%endif
+ punpcklwd m4, m6
+%assign x 0
+%rep 32/mmsize
+ mova m0, [r0+x]
+ mova m1, m0
+ punpcklwd m0, m7
+ punpckhwd m1, m7
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ psrad m0, m5
+ psrad m1, m5
+ packssdw m0, m1
+ mova [r0+x], m0
+%assign x x+mmsize
+%endrep
+ RET
+%endmacro
+
+INIT_MMX
+DEQUANT_DC mmxext
+INIT_XMM
+DEQUANT_DC sse2
;-----------------------------------------------------------------------------
; void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
diff --git a/common/x86/quant.h b/common/x86/quant.h
index ed64d60..29cb76d 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -36,8 +36,10 @@ void x264_quant_4x4_dc_ssse3( int16_t dct[4][4], int mf, int bias );
void x264_quant_4x4_ssse3( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
void x264_quant_8x8_ssse3( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
+void x264_dequant_4x4dc_mmxext( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_8x8_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
void x264_dequant_4x4_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
+void x264_dequant_4x4dc_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_8x8_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
void x264_dequant_4x4_flat16_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_8x8_flat16_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 68ce833..17ca313 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -188,7 +188,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
/* output samples to fdec */
h->dctf.idct4x4dc( dct_dc4x4 );
- x264_mb_dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp ); /* XXX not inversed */
+ h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp ); /* XXX not inversed */
/* calculate dct coeffs */
for( i = 0; i < 16; i++ )
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 12110a5..df58c48 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1050,7 +1050,7 @@ static int check_quant( int cpu_ref, int cpu_new )
for( qp = 51; qp > 0; qp-- ) \
{ \
INIT_QUANT##w() \
- call_c( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+ call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
memcpy( dct2, dct1, w*w*2 ); \
call_c1( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
call_a1( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
@@ -1070,6 +1070,31 @@ static int check_quant( int cpu_ref, int cpu_new )
TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4IY, 4 );
TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4PY, 4 );
+#define TEST_DEQUANT_DC( qname, dqname, block, w ) \
+ if( qf_a.dqname != qf_ref.dqname ) \
+ { \
+ set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \
+ used_asms[1] = 1; \
+ for( qp = 51; qp > 0; qp-- ) \
+ { \
+ for( i = 0; i < 16; i++ ) \
+ dct1[i] = rand(); \
+ call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
+ memcpy( dct2, dct1, w*w*2 ); \
+ call_c1( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
+ call_a1( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
+ if( memcmp( dct1, dct2, w*w*2 ) ) \
+ { \
+ oks[1] = 0; \
+ fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
+ } \
+ call_c2( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
+ call_a2( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
+ } \
+ }
+
+ TEST_DEQUANT_DC( quant_4x4_dc, dequant_4x4_dc, CQM_4IY, 4 );
+
x264_cqm_delete( h );
}
More information about the x264-devel mailing list