[x264-devel] x86: AVX-512 sub8x8_dct
Henrik Gramner
git at videolan.org
Mon Jun 26 21:58:50 CEST 2017
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Wed Jun 7 16:55:48 2017 +0200| [774c6c76d081305d9c891091e1d4694acb3f8a68] | committer: Henrik Gramner
x86: AVX-512 sub8x8_dct
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=774c6c76d081305d9c891091e1d4694acb3f8a68
---
common/dct.c | 1 +
common/x86/dct-a.asm | 47 ++++++++++++++++++++++++++++++++++++++++++++++-
common/x86/dct.h | 1 +
tools/checkasm.c | 2 +-
4 files changed, 49 insertions(+), 2 deletions(-)
diff --git a/common/dct.c b/common/dct.c
index a24e7a31..3e7aa39a 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -715,6 +715,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
if( cpu&X264_CPU_AVX512 )
{
dctf->sub4x4_dct = x264_sub4x4_dct_avx512;
+ dctf->sub8x8_dct = x264_sub8x8_dct_avx512;
}
#endif //HAVE_MMX
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index a5011e3f..31897d9a 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -31,7 +31,7 @@
%include "x86util.asm"
SECTION_RODATA 64
-; Permutation indices for AVX-512 zigzags are bit-packed to save cache
+; AVX-512 permutation indices are bit-packed to save cache
%if HIGH_BIT_DEPTH
scan_frame_avx512: dd 0x00bf0200, 0x00fd7484, 0x0033a611, 0x0069d822 ; bits 0-3: 4x4_frame
dd 0x00a3ca95, 0x00dd8d08, 0x00e75b8c, 0x00a92919 ; bits 4-8: 8x8_frame1
@@ -47,6 +47,10 @@ cavlc_shuf_avx512: dd 0x00018820, 0x000398a4, 0x0005a928, 0x0007b9ac ; bits 0-4:
dd 0x00010c01, 0x00031c85, 0x00052d09, 0x00073d8d ; bits 10-14: interleave3
dd 0x00094e11, 0x000b5e95, 0x000d6f19, 0x000f7f9d ; bits 15-19: interleave4
%else
+dct_avx512: dd 0x00000000, 0x00000104, 0x0000014c, 0x00000048 ; bits 0-4: dct8x8_fenc
+ dd 0x00000210, 0x00000314, 0x0000035c, 0x00000258 ; bits 5-9: dct8x8_fdec
+ dd 0x00000021, 0x00000125, 0x0000016d, 0x00000069 ; (continuation: 16 dwords in total; each packs both 5-bit index fields)
+ dd 0x00000231, 0x00000335, 0x0000037d, 0x00000279 ; sub8x8_dct loads the whole table and uses it as the vpermt2d index vector
scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3: 4x4_frame
dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9: 8x8_frame1
dw 0x5e10, 0x7da0, 0x9930, 0xb4c0, 0xd050, 0xec60, 0xf0d0, 0xd540 ; bits 10-15: 8x8_frame2
@@ -654,6 +658,47 @@ cglobal sub4x4_dct
mova [r0], m2
mova [r0+16], m0
RET
+
+INIT_ZMM avx512 ; code below is assembled for zmm registers; symbols get the _avx512 suffix (see the call in sub8x8_dct)
+cglobal dct4x4x4_internal ; shared helper: expects packed bytes in m1 and m3 plus m4 and masks k2/k3 as prepared by sub8x8_dct
+ punpcklbw m0, m1, m4 ; interleave the low bytes of m1 with m4 (caller clears xm4 so bytes widen to words)
+ punpcklbw m2, m3, m4 ; same for the low bytes of m3
+ punpckhbw m1, m4 ; high bytes of m1 interleaved with m4
+ punpckhbw m3, m4 ; high bytes of m3 interleaved with m4
+ DCT4x4_AVX512 ; transform macro defined outside this hunk. NOTE(review): it may rename registers - confirm before relying on names below
+ mova m1, m2 ; keep a copy of m2 because the next instruction partially overwrites it
+ vshufi32x4 m2 {k2}, m0, m0, q2200 ; m0 (merge-masked: only dwords selected by k2 are replaced with shuffled 128-bit lanes)
+ vshufi32x4 m0 {k3}, m1, m1, q3311 ; m1 (merge-masked by k3 which the caller derives from k2 with knotw)
+ ret ; plain ret: the caller made a direct call and performs the stores itself
+
+%macro DCT8x8_LOAD_FENC_AVX512 4 ; dst, perm, row1, row2 - gather dwords from two unaligned loads off r1 into dst
+ movu %1, [r1+%3*FENC_STRIDE] ; unaligned full-register load starting at row1
+ vpermt2d %1, %2, [r1+%4*FENC_STRIDE] ; pick each dword of dst from dst or from the load at row2 using the indices in perm
+%endmacro
+
+%macro DCT8x8_LOAD_FDEC_AVX512 5 ; dst, perm, tmp, row1, row2 - like the FENC loader but reads r2 and needs two loads per source
+ movu %1, [r2+(%4 )*FDEC_STRIDE] ; unaligned load starting at row1
+ vmovddup %1 {k1}, [r2+(%4+2)*FDEC_STRIDE] ; under k1 merge in duplicated even qwords read from row1+2 (caller sets k1 = 0xaaaaaaaa i.e. odd qwords)
+ movu %3, [r2+(%5 )*FDEC_STRIDE] ; same pair of loads for row2 into tmp
+ vmovddup %3 {k1}, [r2+(%5+2)*FDEC_STRIDE] ; merge duplicated qwords from row2+2 into tmp under k1
+ vpermt2d %1, %2, %3 ; pick each dword of dst from dst or tmp using the indices in perm
+%endmacro
+
+cglobal sub8x8_dct, 3,3 ; args per the dct.h prototype: r0 = int16_t dct[4][16] output; r1 = pix1 (FENC_STRIDE rows); r2 = pix2 (FDEC_STRIDE rows)
+ mova m0, [dct_avx512] ; load the bit-packed permutation indices
+ DCT8x8_LOAD_FENC_AVX512 m1, m0, 0, 4 ; 0 2 1 3
+ mov r1d, 0xaaaaaaaa ; r1 is not read again after the load above so it is reused as a scratch register
+ kmovd k1, r1d ; k1 = alternating-bit mask consumed by the vmovddup merges in the FDEC loader
+ psrld m0, 5 ; drop the fenc field so the fdec indices (bits 5-9 of the table) become the low bits
+ DCT8x8_LOAD_FDEC_AVX512 m3, m0, m2, 0, 4 ; m3 = gathered pix2 data; m2 is only a temporary here
+ mov r1d, 0xf0f0f0f0
+ kmovd k2, r1d ; k2 = write mask for the first vshufi32x4 in the helper
+ pxor xm4, xm4 ; clear xm4; the helper unpacks bytes against m4
+ knotw k3, k2 ; k3 = bitwise NOT of the low 16 bits of k2; used by the second vshufi32x4
+ call dct4x4x4_internal_avx512 ; widens the loaded bytes and runs DCT4x4_AVX512; results are stored below
+ mova [r0], m0 ; aligned 64-byte store of the first half of dct
+ mova [r0+64], m1 ; aligned 64-byte store of the second half (2 x 64 bytes = int16_t[4][16])
+ RET
%endif ; HIGH_BIT_DEPTH
INIT_MMX
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 41920f76..f605437a 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -42,6 +42,7 @@ void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2
void x264_sub8x8_dct_xop ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_avx512 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 20819cfd..e25a45cf 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -843,7 +843,7 @@ static int check_dct( int cpu_ref, int cpu_new )
x264_quant_function_t qf;
int ret = 0, ok, used_asm, interlace = 0;
ALIGNED_ARRAY_64( dctcoef, dct1, [16],[16] );
- ALIGNED_ARRAY_32( dctcoef, dct2, [16],[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct2, [16],[16] );
ALIGNED_ARRAY_64( dctcoef, dct4, [16],[16] );
ALIGNED_ARRAY_64( dctcoef, dct8, [4],[64] );
ALIGNED_16( dctcoef dctdc[2][8] );
More information about the x264-devel mailing list