[x264-devel] x86: SSE2/AVX idct_dequant_2x4_(dc|dconly)
Henrik Gramner
git at videolan.org
Tue Apr 12 20:36:16 CEST 2016
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sat Feb 20 20:31:22 2016 +0100| [23d1d8e89be2d99f5c6924a6055fc80d69429503] | committer: Henrik Gramner
x86: SSE2/AVX idct_dequant_2x4_(dc|dconly)
Only used in 4:2:2. Both 8-bit and high bit-depth implemented.
Approximate performance improvement compared to C on Ivy Bridge:
                         x86-32  x86-64
idct_dequant_2x4_dc       2.1x    1.7x
idct_dequant_2x4_dconly   2.7x    2.0x
Helps more on 32-bit due to the C versions being register-starved.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=23d1d8e89be2d99f5c6924a6055fc80d69429503
---
common/quant.c | 8 +++
common/x86/quant-a.asm | 144 ++++++++++++++++++++++++++++++++++++++++++++++++
common/x86/quant.h | 4 ++
3 files changed, 156 insertions(+)
diff --git a/common/quant.c b/common/quant.c
index 75325d0..312f7cd 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -486,6 +486,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_4x4 = x264_dequant_4x4_sse2;
pf->dequant_8x8 = x264_dequant_8x8_sse2;
pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
+ pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_sse2;
+ pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_sse2;
pf->denoise_dct = x264_denoise_dct_sse2;
pf->decimate_score15 = x264_decimate_score15_sse2;
pf->decimate_score16 = x264_decimate_score16_sse2;
@@ -532,6 +534,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
}
if( cpu&X264_CPU_AVX )
{
+ pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_avx;
+ pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_avx;
pf->denoise_dct = x264_denoise_dct_avx;
}
if( cpu&X264_CPU_XOP )
@@ -618,6 +622,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
}
+ pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_sse2;
+ pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_sse2;
pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse2;
pf->denoise_dct = x264_denoise_dct_sse2;
pf->decimate_score15 = x264_decimate_score15_sse2;
@@ -680,6 +686,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_4x4 = x264_dequant_4x4_avx;
pf->dequant_8x8 = x264_dequant_8x8_avx;
}
+ pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_avx;
+ pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_avx;
pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_avx;
pf->denoise_dct = x264_denoise_dct_avx;
}
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index f7c7cc0..2dc0249 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -829,6 +829,150 @@ INIT_YMM avx2
DEQUANT_DC w, pmullw
%endif
+%macro PEXTRW 4
+ %if cpuflag(sse4)
+ pextrw %1, %2, %3
+ %else
+ ; pextrw with a memory destination requires SSE4.1, go through a GPR as a fallback
+ %if %3
+ pextrw %4d, %2, %3
+ %else
+ movd %4d, %2
+ %endif
+ mov %1, %4w
+ %endif
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp )
+; void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp )
+;-----------------------------------------------------------------------------
+
+%macro DEQUANT_2x4_DC 1
+%ifidn %1, dconly
+ DECLARE_REG_TMP 6,3,2
+ %define %%args dct, dmf, qp
+%else
+ DECLARE_REG_TMP 6,4,3
+ %define %%args dct, dct4x4, dmf, qp
+%endif
+
+%if ARCH_X86_64 == 0
+ DECLARE_REG_TMP 2,0,1
+%endif
+
+cglobal idct_dequant_2x4_%1, 0,3,5, %%args
+ movifnidn t2d, qpm
+ imul t0d, t2d, 0x2b
+ shr t0d, 8 ; qp / 6
+ lea t1d, [t0*5]
+ sub t2d, t0d
+ sub t2d, t1d ; qp % 6
+ shl t2d, 6 ; 16 * sizeof(int)
+%if ARCH_X86_64
+ imul t2d, [dmfq+t2], -0xffff ; (-dmf) << 16 | dmf
+%else
+ mov dctq, dctmp
+ add t2, dmfmp
+ imul t2d, [t2], -0xffff
+%endif
+%if HIGH_BIT_DEPTH
+ mova m0, [dctq]
+ mova m1, [dctq+16]
+ SUMSUB_BA d, 1, 0, 2 ; 16-bit intermediate precision is enough for the first two sumsub steps,
+ packssdw m1, m0 ; and by packing to words we can use pmaddwd instead of pmulld later.
+%else
+ movq m0, [dctq]
+ movq m1, [dctq+8]
+ SUMSUB_BA w, 1, 0, 2
+ punpcklqdq m1, m0 ; a0 a1 a2 a3 a4 a5 a6 a7
+%endif
+ pshufd m0, m1, q2301 ; a2 a3 a0 a1 a6 a7 a4 a5
+ movd m3, t2d
+ pshuflw m3, m3, q1000 ; + + + -
+ SUMSUB_BA w, 0, 1, 2
+ punpcklqdq m3, m3 ; + + + - + + + -
+ pshufd m1, m1, q0022
+ sub t0d, 6
+ jl .rshift
+ movd m2, t0d
+ psllw m3, m2
+ pmaddwd m0, m3
+ pmaddwd m1, m3
+ jmp .end
+.rshift:
+ neg t0d
+ movd m2, t0d
+ pcmpeqd m4, m4
+ pmaddwd m0, m3
+ pmaddwd m1, m3
+ pslld m4, m2
+ psrad m4, 1
+ psubd m0, m4 ; + 1 << (qp/6-1)
+ psubd m1, m4
+ psrad m0, m2
+ psrad m1, m2
+.end:
+%ifidn %1, dconly
+%if HIGH_BIT_DEPTH
+ mova [dctq], m0
+ mova [dctq+16], m1
+%else
+ packssdw m0, m1
+ mova [dctq], m0
+%endif
+%else
+ movifnidn dct4x4q, dct4x4mp
+%if HIGH_BIT_DEPTH
+ movd [dct4x4q+0*64], m0
+%if cpuflag(sse4)
+ pextrd [dct4x4q+1*64], m0, 1
+ add dct4x4q, 4*64
+ pextrd [dct4x4q-2*64], m0, 2
+ pextrd [dct4x4q-1*64], m0, 3
+ movd [dct4x4q+0*64], m1
+ pextrd [dct4x4q+1*64], m1, 1
+ pextrd [dct4x4q+2*64], m1, 2
+ pextrd [dct4x4q+3*64], m1, 3
+%else
+ MOVHL m2, m0
+ psrlq m0, 32
+ movd [dct4x4q+1*64], m0
+ add dct4x4q, 4*64
+ movd [dct4x4q-2*64], m2
+ psrlq m2, 32
+ movd [dct4x4q-1*64], m2
+ movd [dct4x4q+0*64], m1
+ MOVHL m2, m1
+ psrlq m1, 32
+ movd [dct4x4q+1*64], m1
+ movd [dct4x4q+2*64], m2
+ psrlq m2, 32
+ movd [dct4x4q+3*64], m2
+%endif
+%else
+ PEXTRW [dct4x4q+0*32], m0, 0, eax
+ PEXTRW [dct4x4q+1*32], m0, 2, eax
+ PEXTRW [dct4x4q+2*32], m0, 4, eax
+ PEXTRW [dct4x4q+3*32], m0, 6, eax
+ add dct4x4q, 4*32
+ PEXTRW [dct4x4q+0*32], m1, 0, eax
+ PEXTRW [dct4x4q+1*32], m1, 2, eax
+ PEXTRW [dct4x4q+2*32], m1, 4, eax
+ PEXTRW [dct4x4q+3*32], m1, 6, eax
+%endif
+%endif
+ RET
+%endmacro
+
+; sse4 reduces code size compared to sse2 but isn't any faster, so just go with sse2+avx
+INIT_XMM sse2
+DEQUANT_2x4_DC dc
+DEQUANT_2x4_DC dconly
+INIT_XMM avx
+DEQUANT_2x4_DC dc
+DEQUANT_2x4_DC dconly
+
; t4 is eax for return value.
%if ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
diff --git a/common/x86/quant.h b/common/x86/quant.h
index 91a4dcb..c8c4c86 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -72,6 +72,10 @@ void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i
void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_avx2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_avx2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+void x264_idct_dequant_2x4_dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
+void x264_idct_dequant_2x4_dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
+void x264_idct_dequant_2x4_dconly_sse2( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
+void x264_idct_dequant_2x4_dconly_avx ( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
int x264_optimize_chroma_2x2_dc_sse2( dctcoef dct[4], int dequant_mf );
int x264_optimize_chroma_2x2_dc_ssse3( dctcoef dct[4], int dequant_mf );
int x264_optimize_chroma_2x2_dc_sse4( dctcoef dct[4], int dequant_mf );
More information about the x264-devel
mailing list