[x264-devel] x86: AVX2 dequant_4x4_dc
Henrik Gramner
git at videolan.org
Mon May 20 23:06:51 CEST 2013
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Tue May 14 18:57:40 2013 +0200| [b547a4ea1169411610855002db9a8182b1e73314] | committer: Jason Garrett-Glaser
x86: AVX2 dequant_4x4_dc
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=b547a4ea1169411610855002db9a8182b1e73314
---
common/quant.c | 5 ++--
common/x86/quant-a.asm | 75 +++++++++++++++++++++++++++---------------------
common/x86/quant.h | 1 +
3 files changed, 47 insertions(+), 34 deletions(-)
diff --git a/common/quant.c b/common/quant.c
index 5d37f07..7aa851e 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -545,6 +545,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4x4 = x264_quant_4x4x4_avx2;
pf->dequant_4x4 = x264_dequant_4x4_avx2;
pf->dequant_8x8 = x264_dequant_8x8_avx2;
+ pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2;
pf->denoise_dct = x264_denoise_dct_avx2;
}
#endif // HAVE_MMX
@@ -691,10 +692,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4_dc = x264_quant_4x4_dc_avx2;
pf->quant_8x8 = x264_quant_8x8_avx2;
pf->quant_4x4x4 = x264_quant_4x4x4_avx2;
- if( cpu&X264_CPU_LZCNT )
- pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt;
pf->dequant_4x4 = x264_dequant_4x4_avx2;
pf->dequant_8x8 = x264_dequant_8x8_avx2;
+ pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2;
if( h->param.i_cqm_preset == X264_CQM_FLAT )
{
pf->dequant_4x4 = x264_dequant_4x4_flat16_avx2;
@@ -704,6 +704,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->denoise_dct = x264_denoise_dct_avx2;
if( cpu&X264_CPU_LZCNT )
{
+ pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2_lzcnt;
}
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 8a7f8f9..4b72886 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -757,55 +757,62 @@ cglobal dequant_4x4dc, 0,3,6
DEQUANT_START 6, 6
.lshift:
- movd m3, [r1]
- movd m2, t0d
- pslld m3, m2
- SPLAT%1 m3, m3, 0
-%assign x 0
-%rep SIZEOF_PIXEL*16/mmsize
- mova m0, [r0+mmsize*0+x]
- mova m1, [r0+mmsize*1+x]
- %2 m0, m3
- %2 m1, m3
- mova [r0+mmsize*0+x], m0
- mova [r0+mmsize*1+x], m1
-%assign x x+mmsize*2
+%if cpuflag(avx2)
+ vpbroadcastdct m3, [r1]
+%else
+ movd xm3, [r1]
+ SPLAT%1 m3, xm3
+%endif
+ movd xm2, t0d
+ pslld m3, xm2
+%assign %%x 0
+%rep SIZEOF_PIXEL*32/mmsize
+ %2 m0, m3, [r0+%%x]
+ mova [r0+%%x], m0
+%assign %%x %%x+mmsize
%endrep
RET
.rshift32:
- neg t0d
- movd m3, t0d
- mova m4, [p%1_1]
- mova m5, m4
- pslld m4, m3
- psrld m4, 1
- movd m2, [r1]
-%assign x 0
+ neg t0d
+%if cpuflag(avx2)
+ vpbroadcastdct m2, [r1]
+%else
+ movd xm2, [r1]
+%endif
+ mova m5, [p%1_1]
+ movd xm3, t0d
+ pslld m4, m5, xm3
+ psrld m4, 1
%if HIGH_BIT_DEPTH
- pshufd m2, m2, 0
+%if notcpuflag(avx2)
+ pshufd m2, m2, 0
+%endif
+%assign %%x 0
%rep SIZEOF_PIXEL*32/mmsize
- mova m0, [r0+x]
- pmadcswd m0, m0, m2, m4
- psrad m0, m3
- mova [r0+x], m0
-%assign x x+mmsize
+ pmadcswd m0, m2, [r0+%%x], m4
+ psrad m0, xm3
+ mova [r0+%%x], m0
+%assign %%x %%x+mmsize
%endrep
%else ; !HIGH_BIT_DEPTH
+%if notcpuflag(avx2)
PSHUFLW m2, m2, 0
+%endif
punpcklwd m2, m4
+%assign %%x 0
%rep SIZEOF_PIXEL*32/mmsize
- mova m0, [r0+x]
+ mova m0, [r0+%%x]
punpckhwd m1, m0, m5
punpcklwd m0, m5
pmaddwd m0, m2
pmaddwd m1, m2
- psrad m0, m3
- psrad m1, m3
+ psrad m0, xm3
+ psrad m1, xm3
packssdw m0, m1
- mova [r0+x], m0
-%assign x x+mmsize
+ mova [r0+%%x], m0
+%assign %%x %%x+mmsize
%endrep
%endif ; !HIGH_BIT_DEPTH
RET
@@ -816,6 +823,8 @@ INIT_XMM sse2
DEQUANT_DC d, pmaddwd
INIT_XMM xop
DEQUANT_DC d, pmaddwd
+INIT_YMM avx2
+DEQUANT_DC d, pmaddwd
%else
%if ARCH_X86_64 == 0
INIT_MMX mmx2
@@ -825,6 +834,8 @@ INIT_XMM sse2
DEQUANT_DC w, pmullw
INIT_XMM avx
DEQUANT_DC w, pmullw
+INIT_YMM avx2
+DEQUANT_DC w, pmullw
%endif
; t4 is eax for return value.
diff --git a/common/x86/quant.h b/common/x86/quant.h
index b23589d..9ebbc27 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -65,6 +65,7 @@ void x264_dequant_4x4_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
More information about the x264-devel mailing list