[x264-devel] x86: AVX-512 dequant_8x8
Henrik Gramner
git at videolan.org
Mon May 22 00:03:31 CEST 2017
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Tue Apr 4 20:54:12 2017 +0200| [40aca29a164d5e5e6589d507bdcae6717d72f6bf] | committer: Henrik Gramner
x86: AVX-512 dequant_8x8
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=40aca29a164d5e5e6589d507bdcae6717d72f6bf
---
common/quant.c | 4 +++
common/x86/quant-a.asm | 79 +++++++++++++++++++++++++++++++++++++++++++++++++-
common/x86/quant.h | 1 +
3 files changed, 83 insertions(+), 1 deletion(-)
diff --git a/common/quant.c b/common/quant.c
index 5710356a..37ae7b92 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -561,6 +561,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
if( cpu&X264_CPU_AVX512 )
{
pf->dequant_4x4 = x264_dequant_4x4_avx512;
+ pf->dequant_8x8 = x264_dequant_8x8_avx512;
pf->coeff_last4 = x264_coeff_last4_avx512;
pf->coeff_last8 = x264_coeff_last8_avx512;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
@@ -729,7 +730,10 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
if( cpu&X264_CPU_AVX512 )
{
if( h->param.i_cqm_preset != X264_CQM_FLAT )
+ {
pf->dequant_4x4 = x264_dequant_4x4_avx512;
+ pf->dequant_8x8 = x264_dequant_8x8_avx512;
+ }
pf->coeff_last8 = x264_coeff_last8_avx512;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index b797e418..0803b6aa 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -30,7 +30,12 @@
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA 32
+SECTION_RODATA 64
+
+%if HIGH_BIT_DEPTH == 0
+dequant_shuf_avx512: dw 0, 2, 4, 6, 8,10,12,14,16,18,20,22,24,26,28,30 ; vpermt2w indices: even words (low half of each dword) of the first table register...
+ dw 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62 ; ...then even words of the second table register (indices >= 32)
+%endif
%macro DQM4 3
dw %1, %2, %1, %2, %2, %3, %2, %3
@@ -800,6 +805,78 @@ cglobal dequant_4x4, 0,3
%endif
RET
+cglobal dequant_8x8, 0,3 ; AVX-512 dequant of an 8x8 block: 64 coefficients read from and written back to [r0]
+ DEQUANT_START_AVX512 8 ; macro defined elsewhere; presumably sets up dmf and t0d from i_qp -- TODO confirm
+ mova m0, [dmf+0*64] ; m0-m3 = 4*64 bytes of dequant multipliers (64 dwords)
+ mova m1, [dmf+1*64]
+ mova m2, [dmf+2*64]
+ mova m3, [dmf+3*64]
+%if HIGH_BIT_DEPTH
+ pmaddwd m0, [r0+0*64] ; high bit depth: multiply multipliers with the coefficients straight from memory
+ pmaddwd m1, [r0+1*64]
+ pmaddwd m2, [r0+2*64]
+ pmaddwd m3, [r0+3*64]
+%else
+ mova m6, [dequant_shuf_avx512] ; word-index table used by the vpermt2w packs below
+%endif
+ sub t0d, 6 ; t0d now holds i_qbits (see the shrx comment below)
+ jl .rshift ; negative i_qbits -> rounded right shift path
+%if HIGH_BIT_DEPTH
+ vpbroadcastd m4, t0d ; left shift path: same shift count in every dword lane
+ vpsllvd m0, m4
+ vpsllvd m1, m4
+ vpsllvd m2, m4
+ vpsllvd m3, m4
+ jmp .end
+.rshift:
+%else
+ vpbroadcastw m4, t0d ; left shift path: same shift count in every word lane
+ vpermt2w m0, m6, m1 ; pack the low words of the 32 dwords in m0:m1 into m0
+ vpermt2w m2, m6, m3 ; likewise m2:m3 -> m2
+ pmullw m0, [r0] ; 16-bit multiply with coefficients 0-31
+ pmullw m2, [r0+64] ; 16-bit multiply with coefficients 32-63
+ vpsllvw m0, m4
+ vpsllvw m2, m4
+ mova [r0], m0
+ mova [r0+64], m2
+ RET
+.rshift:
+ pmovzxwd m4, [r0+0*32] ; zero-extend 16 word coefficients to dwords
+ pmovzxwd m5, [r0+1*32]
+ pmaddwd m0, m4 ; upper word of each extended coefficient is 0, so only the low-word product contributes
+ pmaddwd m1, m5
+ pmovzxwd m4, [r0+2*32]
+ pmovzxwd m5, [r0+3*32]
+ pmaddwd m2, m4
+ pmaddwd m3, m5
+%endif
+ mov r1d, 1<<31
+ shrx r1d, r1d, t0d ; 1 << (-i_qbits-1)
+ neg t0d ; t0d = -i_qbits, the positive right-shift count
+ vpbroadcastd m4, r1d ; m4 = rounding term in every dword lane
+ vpbroadcastd m5, t0d ; m5 = shift count in every dword lane
+ paddd m0, m4 ; add rounding term, then arithmetic shift right
+ paddd m1, m4
+ vpsravd m0, m5
+ vpsravd m1, m5
+ paddd m2, m4
+ paddd m3, m4
+ vpsravd m2, m5
+ vpsravd m3, m5
+%if HIGH_BIT_DEPTH
+.end:
+ mova [r0+0*64], m0 ; store the 64 dword results
+ mova [r0+1*64], m1
+ mova [r0+2*64], m2
+ mova [r0+3*64], m3
+%else
+ vpermt2w m0, m6, m1 ; pack the low word of each dword result back to 16 bits
+ vpermt2w m2, m6, m3
+ mova [r0], m0
+ mova [r0+64], m2
+%endif
+ RET
+
%undef dmf
%macro DEQUANT_DC 2
diff --git a/common/x86/quant.h b/common/x86/quant.h
index 67cc1e6b..7d28670a 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -67,6 +67,7 @@ void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_avx512( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_avx512( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
More information about the x264-devel
mailing list