[x264-devel] x86: dct2x4dc asm
Henrik Gramner
git at videolan.org
Tue Apr 12 20:36:16 CEST 2016
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sat Feb 27 20:34:39 2016 +0100| [eeb9b66ddb0f27d8baaa8efa9597613e61140836] | committer: Henrik Gramner
x86: dct2x4dc asm
Only used in 4:2:2. MMX2 version implemented for 8-bit, SSE2 and AVX
versions implemented for high bit-depth.
2.5x faster on 32-bit and 1.6x faster on 64-bit compared to C on Ivy Bridge.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=eeb9b66ddb0f27d8baaa8efa9597613e61140836
---
common/dct.c | 3 +++
common/x86/dct-a.asm | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++
common/x86/dct.h | 4 +++
3 files changed, 79 insertions(+)
diff --git a/common/dct.c b/common/dct.c
index 2740a31..9e2e955 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -576,6 +576,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add4x4_idct = x264_add4x4_idct_sse2;
dctf->dct4x4dc = x264_dct4x4dc_sse2;
dctf->idct4x4dc = x264_idct4x4dc_sse2;
+ dctf->dct2x4dc = x264_dct2x4dc_sse2;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
dctf->add8x8_idct = x264_add8x8_idct_sse2;
@@ -597,6 +598,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add4x4_idct = x264_add4x4_idct_avx;
dctf->dct4x4dc = x264_dct4x4dc_avx;
dctf->idct4x4dc = x264_idct4x4dc_avx;
+ dctf->dct2x4dc = x264_dct2x4dc_avx;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
dctf->add8x8_idct = x264_add8x8_idct_avx;
@@ -633,6 +635,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
if( cpu&X264_CPU_MMX2 )
{
dctf->dct4x4dc = x264_dct4x4dc_mmx2;
+ dctf->dct2x4dc = x264_dct2x4dc_mmx2;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx2;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
}
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 004014f..454f53f 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -209,6 +209,78 @@ cglobal idct4x4dc, 1,1
RET
%endif ; HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
+;-----------------------------------------------------------------------------
+%if WIN64
+ DECLARE_REG_TMP 6 ; Avoid some REX prefixes to reduce code size
+%else ; !WIN64
+ DECLARE_REG_TMP 2 ; presumably binds t0 (the scratch GPR used by INSERT_COEFF/DCT2x4DC below) to r2 - confirm against x86inc
+%endif ; WIN64
+
+%macro INSERT_COEFF 3 ; dst, src, imm: load the coefficient at memory operand src into lane imm of dst, then overwrite src with t0 (zeroed by the caller)
+ %if %3
+ %if HIGH_BIT_DEPTH
+ %if cpuflag(sse4)
+ pinsrd %1, %2, %3 ; direct dword insert into lane imm
+ %elif %3 == 2
+ movd m2, %2 ; no pinsrd: lane 2 is parked in m2 (dst untouched) until the imm=3 call merges it; clobbers m2
+ %elif %3 == 1
+ punpckldq %1, %2 ; interleave low dwords: dst lane 1 = coefficient, lane 0 kept
+ %else
+ punpckldq m2, %2 ; imm=3: m2 = { lane-2 coefficient from the earlier imm=2 call, this coefficient }
+ punpcklqdq %1, m2 ; low qword of m2 becomes lanes 2-3 of dst; relies on calls being issued in 0,2,1,3 order
+ %endif
+ %else ; !HIGH_BIT_DEPTH
+ %if %3 == 2
+ punpckldq %1, %2 ; 32 bits from src become words 2-3 of dst; word 2 is the coefficient, word 3 is replaced by the later imm=3 pinsrw
+ %else
+ pinsrw %1, %2, %3 ; direct word insert into lane imm
+ %endif
+ %endif
+ %else ; imm == 0
+ movd %1, %2 ; 32-bit load into the low lane(s) of dst, upper bits zeroed
+ %endif
+ %if HIGH_BIT_DEPTH
+ mov %2, t0d ; clear the 32-bit source coefficient (t0 holds zero when called from DCT2x4DC)
+ %else ; !HIGH_BIT_DEPTH
+ mov %2, t0w ; clear the 16-bit source coefficient
+ %endif
+%endmacro
+
+%macro DCT2x4DC 2 ; %1 = element suffix passed to SUMSUB_BA and pshuf (w or d), %2 = suffix passed to SBUTTERFLY (wd or dq)
+cglobal dct2x4dc, 2,3 ; r0 = dct[8] output, r1 = dct4x4[8][16] input (see prototype comment above); presumably 2 args / 3 GPRs per x86inc
+ xor t0d, t0d ; t0 = 0, stored over every coefficient that INSERT_COEFF reads
+ INSERT_COEFF m0, [r1+0*16*SIZEOF_DCTCOEF], 0 ; first coefficient of block 0 -> m0 lane 0
+ INSERT_COEFF m0, [r1+1*16*SIZEOF_DCTCOEF], 2 ; block 1 -> m0 lane 2
+ add r1, 4*16*SIZEOF_DCTCOEF ; rebase r1 on block 4; presumably to keep the remaining displacements small - confirm
+ INSERT_COEFF m0, [r1-2*16*SIZEOF_DCTCOEF], 1 ; block 2 -> m0 lane 1
+ INSERT_COEFF m0, [r1-1*16*SIZEOF_DCTCOEF], 3 ; block 3 -> m0 lane 3
+ INSERT_COEFF m1, [r1+0*16*SIZEOF_DCTCOEF], 0 ; block 4 -> m1 lane 0
+ INSERT_COEFF m1, [r1+1*16*SIZEOF_DCTCOEF], 2 ; block 5 -> m1 lane 2
+ INSERT_COEFF m1, [r1+2*16*SIZEOF_DCTCOEF], 1 ; block 6 -> m1 lane 1
+ INSERT_COEFF m1, [r1+3*16*SIZEOF_DCTCOEF], 3 ; block 7 -> m1 lane 3
+ SUMSUB_BA %1, 1, 0, 2 ; butterfly stage 1 on m1/m0 with m2 as scratch (helper defined outside this hunk; presumably sum/difference)
+ SBUTTERFLY %2, 1, 0, 2 ; regroup elements between m1 and m0 (helper defined outside this hunk; presumably an interleave)
+ SUMSUB_BA %1, 0, 1, 2 ; butterfly stage 2
+ SBUTTERFLY %2, 0, 1, 2 ; regroup again before the last stage
+ SUMSUB_BA %1, 1, 0, 2 ; butterfly stage 3
+ pshuf%1 m0, m0, q1032 ; reorder m0's elements before the store; q1032 presumably swaps the low and high element pairs - confirm
+ mova [r0], m1 ; first mmsize bytes of the output
+ mova [r0+mmsize], m0 ; second mmsize bytes of the output
+ RET
+%endmacro
+
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+DCT2x4DC d, dq ; dword elements, XMM registers
+INIT_XMM avx
+DCT2x4DC d, dq ; same macro body, instantiated again for the avx cpuflag
+%else ; !HIGH_BIT_DEPTH
+INIT_MMX mmx2
+DCT2x4DC w, wd ; word elements, MMX registers
+%endif ; HIGH_BIT_DEPTH
+
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 35b3384..ded790f 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -77,6 +77,10 @@ void x264_idct4x4dc_mmx ( int16_t d[16] );
void x264_idct4x4dc_sse2 ( int32_t d[16] );
void x264_idct4x4dc_avx ( int32_t d[16] );
+void x264_dct2x4dc_mmx2( dctcoef dct[8], dctcoef dct4x4[8][16] );
+void x264_dct2x4dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16] );
+void x264_dct2x4dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16] );
+
void x264_sub8x8_dct8_mmx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_mmx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct8_sse2 ( dctcoef dct [64], pixel *pix1, pixel *pix2 );
More information about the x264-devel
mailing list