[x264-devel] x86: AVX-512 coeff_last

Mon May 22 00:03:20 CEST 2017

x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Mon Mar 27 18:19:53 2017 +0200| [75f6f9b228c3498b8c9b0d97fc925c0a7e6e6f43] | committer: Henrik Gramner

x86: AVX-512 coeff_last

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=75f6f9b228c3498b8c9b0d97fc925c0a7e6e6f43
---

 common/quant.c         | 15 ++++++++++++
 common/x86/quant-a.asm | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++
 common/x86/quant.h     |  5 ++++
 tools/checkasm.c       |  4 ++--
 4 files changed, 86 insertions(+), 2 deletions(-)

diff --git a/common/quant.c b/common/quant.c
index d1414445..8350fa7a 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -558,6 +558,14 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->denoise_dct = x264_denoise_dct_avx2;
         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2;
     }
+    if( cpu&X264_CPU_AVX512 )
+    {
+        pf->coeff_last4 = x264_coeff_last4_avx512;
+        pf->coeff_last8 = x264_coeff_last8_avx512;
+        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
+        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
+        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512;
+    }
 #endif // HAVE_MMX
 #else // !HIGH_BIT_DEPTH
 #if HAVE_MMX
@@ -717,6 +725,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2;
         pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2;
     }
+    if( cpu&X264_CPU_AVX512 )
+    {
+        pf->coeff_last8 = x264_coeff_last8_avx512;
+        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
+        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
+        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512;
+    }
 #endif // HAVE_MMX
 
 #if HAVE_ALTIVEC
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index a20d1cd9..d281f02d 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -1756,6 +1756,70 @@ cglobal coeff_last64, 1,3
     RET
 %endif
 
+%macro COEFF_LAST_AVX512 2 ; num, w/d
+cglobal coeff_last%1, 1,2
+    mova         m0, [r0-(%1&1)*SIZEOF_DCTCOEF]
+    vptestm%2    k0, m0, m0
+%if %1 == 15
+    mov         eax, 30
+    kmovw       r1d, k0
+    lzcnt       r1d, r1d
+    sub         eax, r1d
+%else
+    kmovw       eax, k0
+    lzcnt       eax, eax
+    xor         eax, 31
+%endif
+    RET
+%endmacro
+
+%macro COEFF_LAST64_AVX512 1 ; w/d
+cglobal coeff_last64, 1,2
+    pxor        xm0, xm0
+    vpcmp%1      k0, m0, [r0+0*64], 4
+    vpcmp%1      k1, m0, [r0+1*64], 4
+%if HIGH_BIT_DEPTH
+    vpcmp%1      k2, m0, [r0+2*64], 4
+    vpcmp%1      k3, m0, [r0+3*64], 4
+    kunpckwd     k0, k1, k0
+    kunpckwd     k1, k3, k2
+%endif
+%if ARCH_X86_64
+    kunpckdq     k0, k1, k0
+    kmovq       rax, k0
+    lzcnt       rax, rax
+    xor         eax, 63
+%else
+    kmovd       r1d, k1
+    kmovd       eax, k0
+    lzcnt       r1d, r1d
+    lzcnt       eax, eax
+    xor         r1d, 32
+    cmovnz      eax, r1d
+    xor         eax, 31
+%endif
+    RET
+%endmacro
+
+%if HIGH_BIT_DEPTH
+INIT_XMM avx512
+COEFF_LAST_AVX512  4, d
+INIT_YMM avx512
+COEFF_LAST_AVX512  8, d
+INIT_ZMM avx512
+COEFF_LAST_AVX512 15, d
+COEFF_LAST_AVX512 16, d
+COEFF_LAST64_AVX512 d
+%else ; !HIGH_BIT_DEPTH
+INIT_XMM avx512
+COEFF_LAST_AVX512  8, w
+INIT_YMM avx512
+COEFF_LAST_AVX512 15, w
+COEFF_LAST_AVX512 16, w
+INIT_ZMM avx512
+COEFF_LAST64_AVX512 w
+%endif ; !HIGH_BIT_DEPTH
+
 ;-----------------------------------------------------------------------------
 ; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
 ;-----------------------------------------------------------------------------
diff --git a/common/x86/quant.h b/common/x86/quant.h
index e0ce0f23..4c6ba337 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -110,6 +110,11 @@ int x264_coeff_last15_lzcnt( dctcoef *dct );
 int x264_coeff_last16_lzcnt( dctcoef *dct );
 int x264_coeff_last64_lzcnt( dctcoef *dct );
 int x264_coeff_last64_avx2 ( dctcoef *dct );
+int x264_coeff_last4_avx512( int32_t *dct );
+int x264_coeff_last8_avx512( dctcoef *dct );
+int x264_coeff_last15_avx512( dctcoef *dct );
+int x264_coeff_last16_avx512( dctcoef *dct );
+int x264_coeff_last64_avx512( dctcoef *dct );
 int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run16_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
diff --git a/tools/checkasm.c b/tools/checkasm.c
index d09a06a4..7575d3f4 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -2008,7 +2008,7 @@ static int check_quant( int cpu_ref, int cpu_new )
     x264_quant_function_t qf_c;
     x264_quant_function_t qf_ref;
     x264_quant_function_t qf_a;
-    ALIGNED_ARRAY_32( dctcoef, dct1,[64] );
+    ALIGNED_ARRAY_64( dctcoef, dct1,[64] );
     ALIGNED_ARRAY_32( dctcoef, dct2,[64] );
     ALIGNED_ARRAY_32( dctcoef, dct3,[8],[16] );
     ALIGNED_ARRAY_32( dctcoef, dct4,[8],[16] );
@@ -2631,7 +2631,7 @@ static int check_cabac( int cpu_ref, int cpu_new )
             {\
                 for( int j = 0; j < 256; j++ )\
                 {\
-                    ALIGNED_ARRAY_32( dctcoef, dct, [2],[64] );\
+                    ALIGNED_ARRAY_64( dctcoef, dct, [2],[64] );\
                     uint8_t bitstream[2][1<<16];\
                     static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\
                     int ac = ctx_ac[ctx_block_cat];\