[x264-devel] x86: AVX-512 dequant_4x4

Mon May 22 00:03:27 CEST 2017

x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Tue Apr  4 20:01:26 2017 +0200| [74f7802bb7bd301299f8229a0552a7caf2b55434] | committer: Henrik Gramner

x86: AVX-512 dequant_4x4

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=74f7802bb7bd301299f8229a0552a7caf2b55434
---

 common/quant.c         |  3 +++
 common/x86/quant-a.asm | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++
 common/x86/quant.h     |  1 +
 tools/checkasm.c       |  2 +-
 4 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/common/quant.c b/common/quant.c
index 8350fa7a..5710356a 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -560,6 +560,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     }
     if( cpu&X264_CPU_AVX512 )
     {
+        pf->dequant_4x4 = x264_dequant_4x4_avx512;
         pf->coeff_last4 = x264_coeff_last4_avx512;
         pf->coeff_last8 = x264_coeff_last8_avx512;
         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
@@ -727,6 +728,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     }
     if( cpu&X264_CPU_AVX512 )
     {
+        if( h->param.i_cqm_preset != X264_CQM_FLAT )
+            pf->dequant_4x4 = x264_dequant_4x4_avx512;
         pf->coeff_last8 = x264_coeff_last8_avx512;
         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
         pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index d281f02d..b797e418 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -743,6 +743,65 @@ DEQUANT 4, 4, 4
 DEQUANT 8, 6, 4
 %endif
 
+%macro DEQUANT_START_AVX512 1 ; %1 = left shift turning i_mf into a byte offset; leaves i_qbits in t0d and defines dmf = address of dequant_mf[i_mf]
+    movifnidn t2d, r2m  ; t2d = i_qp (argument 2), moved only if not already in that register
+    imul t0d, t2d, 0x2b  ; i_qp * 43; together with the >>8 below a fixed-point approximation of /6
+    shr  t0d, 8     ; i_qbits = i_qp / 6
+    lea  t1d, [t0*5]  ; t1d = 5 * i_qbits
+    sub  t2d, t0d  ; i_qp - i_qbits
+    sub  t2d, t1d   ; i_mf = i_qp % 6
+    shl  t2d, %1  ; i_mf << %1 = byte offset of the selected dequant_mf row
+%if ARCH_X86_64
+%define dmf r1+t2
+%else
+%define dmf r1
+    add  r1, r1mp   ; dequant_mf[i_mf]; NOTE(review): assumes t2 aliases r1 on x86-32 so r1 already holds the row offset - confirm against DECLARE_REG_TMP
+    mov  r0, r0mp   ; dct
+%endif
+%endmacro
+
+INIT_ZMM avx512  ; m# = 512-bit zmm#, ym# = 256-bit ymm#; functions get the _avx512 suffix
+cglobal dequant_4x4, 0,3  ; dequant_4x4( dct, dequant_mf, i_qp ): dequantizes dct[16] in place; no arguments auto-loaded, 3 GPRs used
+    DEQUANT_START_AVX512 6  ; t0d = i_qp/6, dmf = &dequant_mf[i_qp%6] (shift 6 = 64 bytes per row)
+    mova          m0, [dmf]  ; m0 = the 16 32-bit factors of the selected row
+%if HIGH_BIT_DEPTH
+    pmaddwd       m0, [r0]  ; per dword: mf.lo16*dct.lo16 + mf.hi16*dct.hi16; NOTE(review): equals mf*dct only if both fit in 16 bits - confirm
+%endif
+    sub          t0d, 4  ; i_qbits -= 4; the sign flag selects the shift direction
+    jl .rshift  ; negative => right shift with rounding
+%if HIGH_BIT_DEPTH
+    vpbroadcastd  m1, t0d  ; left-shift count in every dword lane
+    vpsllvd       m0, m1  ; (dct*mf) << i_qbits
+    mova        [r0], m0  ; store 16 dwords back to dct
+%else
+    vpbroadcastw ym1, t0d  ; left-shift count in every word lane
+    vpmovsdw     ym0, m0  ; narrow the factors to 16 bits with signed saturation
+    pmullw       ym0, [r0]  ; low 16 bits of dct*mf
+    vpsllvw      ym0, ym1  ; (dct*mf) << i_qbits
+    mova        [r0], ym0  ; store 16 words back to dct
+%endif
+    RET
+.rshift:  ; t0d = i_qbits < 0 here
+%if HIGH_BIT_DEPTH == 0
+    pmovzxwd      m1, [r0]  ; zero-extend the 16 dct words to dwords
+    pmaddwd       m0, m1  ; upper word of each m1 dword is 0, so each result is mf.lo16 * dct as a 32-bit product
+%endif
+    mov          r1d, 1<<31  ; seed bit for the rounding constant (r1 is free: dmf was already loaded)
+    shrx         r1d, r1d, t0d ; 1 << (-i_qbits-1) (shrx masks the count to 5 bits, so the negative count acts as 32+i_qbits)
+    neg          t0d  ; t0d = -i_qbits = right-shift amount
+    vpbroadcastd  m1, r1d  ; rounding constant in every dword lane
+    vpbroadcastd  m2, t0d  ; right-shift amount in every dword lane
+    paddd         m0, m1  ; add the rounding term
+    vpsravd       m0, m2  ; arithmetic right shift by -i_qbits
+%if HIGH_BIT_DEPTH
+    mova        [r0], m0  ; store 16 dwords back to dct
+%else
+    vpmovsdw    [r0], m0  ; narrow to 16 bits with signed saturation and store to dct
+%endif
+    RET
+
+%undef dmf
+
 %macro DEQUANT_DC 2
 cglobal dequant_4x4dc, 0,3,6
     DEQUANT_START 6, 6
diff --git a/common/x86/quant.h b/common/x86/quant.h
index 4c6ba337..67cc1e6b 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -66,6 +66,7 @@ void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
 void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_4x4_avx512( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
 void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 7ada2791..03cac3a5 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -2009,7 +2009,7 @@ static int check_quant( int cpu_ref, int cpu_new )
     x264_quant_function_t qf_ref;
     x264_quant_function_t qf_a;
     ALIGNED_ARRAY_64( dctcoef, dct1,[64] );
-    ALIGNED_ARRAY_32( dctcoef, dct2,[64] );
+    ALIGNED_ARRAY_64( dctcoef, dct2,[64] );
     ALIGNED_ARRAY_32( dctcoef, dct3,[8],[16] );
     ALIGNED_ARRAY_32( dctcoef, dct4,[8],[16] );
     ALIGNED_ARRAY_32( uint8_t, cqm_buf,[64] );