[x264-devel] x86: AVX-512 coeff_last
Henrik Gramner
git at videolan.org
Mon May 22 00:03:20 CEST 2017
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Mon Mar 27 18:19:53 2017 +0200| [75f6f9b228c3498b8c9b0d97fc925c0a7e6e6f43] | committer: Henrik Gramner
x86: AVX-512 coeff_last
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=75f6f9b228c3498b8c9b0d97fc925c0a7e6e6f43
---
common/quant.c | 15 ++++++++++++
common/x86/quant-a.asm | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++
common/x86/quant.h | 5 ++++
tools/checkasm.c | 4 ++--
4 files changed, 86 insertions(+), 2 deletions(-)
diff --git a/common/quant.c b/common/quant.c
index d1414445..8350fa7a 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -558,6 +558,14 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->denoise_dct = x264_denoise_dct_avx2;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2;
}
+ if( cpu&X264_CPU_AVX512 )
+ {
+ pf->coeff_last4 = x264_coeff_last4_avx512;
+ pf->coeff_last8 = x264_coeff_last8_avx512;
+ pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
+ pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
+ pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512;
+ }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
@@ -717,6 +725,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2;
}
+ if( cpu&X264_CPU_AVX512 )
+ {
+ pf->coeff_last8 = x264_coeff_last8_avx512;
+ pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
+ pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
+ pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512;
+ }
#endif // HAVE_MMX
#if HAVE_ALTIVEC
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index a20d1cd9..d281f02d 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -1756,6 +1756,70 @@ cglobal coeff_last64, 1,3
RET
%endif
+%macro COEFF_LAST_AVX512 2 ; num, w/d -- %1 is pasted into the function name (coeff_last%1) and selects the 15 path; %2 is the vptestm element-size suffix
+cglobal coeff_last%1, 1,2
+ mova m0, [r0-(%1&1)*SIZEOF_DCTCOEF] ; for odd num (15) the load starts one dctcoef before r0 (presumably to keep the load aligned -- confirm)
+ vptestm%2 k0, m0, m0 ; k0 bit i is set where element i of m0 is non-zero (m0 AND m0 != 0)
+%if %1 == 15
+ mov eax, 30
+ kmovw r1d, k0
+ lzcnt r1d, r1d
+ sub eax, r1d ; eax = 30 - lzcnt(mask): highest set bit (31 - lzcnt) minus 1 for the one-element load offset above
+%else
+ kmovw eax, k0
+ lzcnt eax, eax
+ xor eax, 31 ; for a non-zero mask, lzcnt ^ 31 == 31 - lzcnt, i.e. the bit index of the highest set mask bit
+%endif
+ RET
+%endmacro
+
+%macro COEFF_LAST64_AVX512 1 ; w/d -- %1 is the vpcmp element-size suffix
+cglobal coeff_last64, 1,2
+ pxor xm0, xm0 ; zero register; m0 is the comparison operand for the vpcmp instructions below
+ vpcmp%1 k0, m0, [r0+0*64], 4 ; predicate 4 = not-equal: mask bit set where the element differs from m0
+ vpcmp%1 k1, m0, [r0+1*64], 4
+%if HIGH_BIT_DEPTH
+ vpcmp%1 k2, m0, [r0+2*64], 4 ; high bit depth reads four 64-byte chunks instead of two
+ vpcmp%1 k3, m0, [r0+3*64], 4
+ kunpckwd k0, k1, k0 ; k0 = k1[15:0]:k0[15:0] (mask bits for the first two chunks)
+ kunpckwd k1, k3, k2 ; k1 = k3[15:0]:k2[15:0] (mask bits for the last two chunks)
+%endif
+%if ARCH_X86_64
+ kunpckdq k0, k1, k0 ; k0 = k1[31:0]:k0[31:0], a single 64-bit mask
+ kmovq rax, k0
+ lzcnt rax, rax
+ xor eax, 63 ; for a non-zero mask, lzcnt ^ 63 == 63 - lzcnt, the index of the highest set bit
+%else
+ kmovd r1d, k1 ; no 64-bit GPR here: handle the upper (k1) and lower (k0) 32 mask bits separately
+ kmovd eax, k0
+ lzcnt r1d, r1d
+ lzcnt eax, eax
+ xor r1d, 32 ; ZF is set only when lzcnt(upper) == 32, i.e. the upper half is all zero; otherwise r1d = lzcnt(upper) + 32
+ cmovnz eax, r1d ; upper half non-zero: take its count (with bit 5 set) instead of the lower half's
+ xor eax, 31 ; flips the low 5 bits: 63 - lzcnt(upper) via the upper path, 31 - lzcnt(lower) via the lower path
+%endif
+ RET
+%endmacro
+
+%if HIGH_BIT_DEPTH ; instantiate with the d suffix: coeff_last4/8/15/16/64
+INIT_XMM avx512
+COEFF_LAST_AVX512 4, d
+INIT_YMM avx512
+COEFF_LAST_AVX512 8, d
+INIT_ZMM avx512
+COEFF_LAST_AVX512 15, d
+COEFF_LAST_AVX512 16, d
+COEFF_LAST64_AVX512 d
+%else ; !HIGH_BIT_DEPTH -- instantiate with the w suffix: coeff_last8/15/16/64 (no coeff_last4 here)
+INIT_XMM avx512
+COEFF_LAST_AVX512 8, w
+INIT_YMM avx512
+COEFF_LAST_AVX512 15, w
+COEFF_LAST_AVX512 16, w
+INIT_ZMM avx512
+COEFF_LAST64_AVX512 w
+%endif ; !HIGH_BIT_DEPTH
+
;-----------------------------------------------------------------------------
; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
;-----------------------------------------------------------------------------
diff --git a/common/x86/quant.h b/common/x86/quant.h
index e0ce0f23..4c6ba337 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -110,6 +110,11 @@ int x264_coeff_last15_lzcnt( dctcoef *dct );
int x264_coeff_last16_lzcnt( dctcoef *dct );
int x264_coeff_last64_lzcnt( dctcoef *dct );
int x264_coeff_last64_avx2 ( dctcoef *dct );
+int x264_coeff_last4_avx512( int32_t *dct );
+int x264_coeff_last8_avx512( dctcoef *dct );
+int x264_coeff_last15_avx512( dctcoef *dct );
+int x264_coeff_last16_avx512( dctcoef *dct );
+int x264_coeff_last64_avx512( dctcoef *dct );
int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
diff --git a/tools/checkasm.c b/tools/checkasm.c
index d09a06a4..7575d3f4 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -2008,7 +2008,7 @@ static int check_quant( int cpu_ref, int cpu_new )
x264_quant_function_t qf_c;
x264_quant_function_t qf_ref;
x264_quant_function_t qf_a;
- ALIGNED_ARRAY_32( dctcoef, dct1,[64] );
+ ALIGNED_ARRAY_64( dctcoef, dct1,[64] );
ALIGNED_ARRAY_32( dctcoef, dct2,[64] );
ALIGNED_ARRAY_32( dctcoef, dct3,[8],[16] );
ALIGNED_ARRAY_32( dctcoef, dct4,[8],[16] );
@@ -2631,7 +2631,7 @@ static int check_cabac( int cpu_ref, int cpu_new )
{\
for( int j = 0; j < 256; j++ )\
{\
- ALIGNED_ARRAY_32( dctcoef, dct, [2],[64] );\
+ ALIGNED_ARRAY_64( dctcoef, dct, [2],[64] );\
uint8_t bitstream[2][1<<16];\
static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\
int ac = ctx_ac[ctx_block_cat];\
More information about the x264-devel mailing list