[x264-devel] x86: AVX-512 sub8x16_dct_dc
Henrik Gramner
git at videolan.org
Mon Jun 26 21:59:05 CEST 2017
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Thu Jun 22 19:51:28 2017 +0200| [1d9dee2e9be717fcde416854f902db776312f141] | committer: Henrik Gramner
x86: AVX-512 sub8x16_dct_dc
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=1d9dee2e9be717fcde416854f902db776312f141
---
common/dct.c | 1 +
common/x86/dct-a.asm | 34 ++++++++++++++++++++++++++++++++++
common/x86/dct.h | 7 ++++---
3 files changed, 39 insertions(+), 3 deletions(-)
diff --git a/common/dct.c b/common/dct.c
index f70bf81c..70853bf9 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -718,6 +718,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub8x8_dct = x264_sub8x8_dct_avx512;
dctf->sub16x16_dct = x264_sub16x16_dct_avx512;
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_avx512;
+ dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx512;
dctf->add8x8_idct = x264_add8x8_idct_avx512;
}
#endif //HAVE_MMX
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 3d28b4d1..33ed0618 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -749,6 +749,40 @@ cglobal sub8x8_dct_dc, 3,3
movhps [r0], xmm0
RET
+cglobal sub8x16_dct_dc, 3,3 ; C prototype: ( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 ); the eight results are stored through r0
+ mova m5, [dct_avx512] ; m5 = constant table handed to the load macros below (table is defined outside this hunk)
+ DCT8x8_LOAD_FENC_AVX512 m0, m5, 0, 8 ; 0 4 1 5
+ DCT8x8_LOAD_FENC_AVX512 m1, m5, 4, 12 ; 2 6 3 7
+ mov r1d, 0xaa ; r1 is overwritten here and only used as scratch afterwards (presumably the pix1 argument is no longer needed - confirm against the FENC macro)
+ kmovb k1, r1d ; k1 = 0xaa, i.e. the odd-numbered mask bits; presumably consumed inside the FDEC load macro - confirm
+ psrld m5, 5 ; shift every dword of the table right by 5 before the FDEC loads; presumably exposes a second index set packed in the same constant - confirm
+ DCT8x8_LOAD_FDEC_AVX512 m2, m5, m4, 0, 8 ; m4 is passed as an extra register operand (looks like a temporary - confirm against the macro)
+ DCT8x8_LOAD_FDEC_AVX512 m3, m5, m4, 4, 12 ; same row offsets (4, 12) as the second FENC load
+ pxor xm4, xm4 ; zero register used as the second psadbw operand below
+ psadbw m0, m4 ; SAD against zero = plain sum of each group of 8 bytes, one sum per qword
+ psadbw m1, m4
+ psadbw m2, m4
+ psadbw m3, m4
+ psubw m0, m2 ; FENC sums minus FDEC sums (word arithmetic; only the low word of each qword is kept by vpmovqw below)
+ psubw m1, m3
+ SBUTTERFLY qdq, 0, 1, 2 ; interleave m0/m1 with m2 as the third operand (macro defined outside this hunk; presumably m2 is a temporary)
+ paddw m0, m1 ; combine the two interleaved halves into one vector of partial sums
+ vpmovqw xmm0, m0 ; 0 2 4 6 1 3 5 7
+ psrlq xmm2, xmm0, 32 ; bring the upper dword of each qword down so it lines up with the lower one
+ psubw xmm1, xmm0, xmm2 ; 0-4 2-6 1-5 3-7
+ paddw xmm0, xmm2 ; 0+4 2+6 1+5 3+7
+ punpckhdq xmm2, xmm0, xmm1 ; regroup sums and differences so the next add/sub pairs the remaining terms
+ punpckldq xmm0, xmm1
+ psubw xmm1, xmm0, xmm2 ; 0-1+4-5 2-3+6-7 0-1-4+5 2-3-6+7
+ paddw xmm0, xmm2 ; 0+1+4+5 2+3+6+7 0+1-4-5 2+3-6-7
+ punpcklwd xmm0, xmm1 ; interleave sum and difference words ahead of the final butterfly stage
+ psrlq xmm2, xmm0, 32 ; same shift-by-one-dword pairing as in the first stage
+ psubw xmm1, xmm0, xmm2 ; 0+1-2-3+4+5-6-7 0-1-2+3+4-5-6+7 0+1-2-3-4-5+6+7 0-1-2+3-4+5+6-7
+ paddw xmm0, xmm2 ; 0+1+2+3+4+5+6+7 0-1+2-3+4-5+6-7 0+1+2+3-4-5-6-7 0-1+2-3-4+5-6+7
+ shufps xmm0, xmm1, q0220 ; select dwords from the sum (xmm0) and difference (xmm1) vectors into the output order
+ mova [r0], xmm0 ; aligned 16-byte store: the eight 16-bit results
+ RET
+
%macro SARSUMSUB 3 ; a, b, tmp
mova m%3, m%1
vpsraw m%1 {k1}, 1
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 179369b8..20a65c53 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -48,9 +48,10 @@ void x264_sub16x16_dct_avx512( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2
void x264_sub8x8_dct_dc_mmx2 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x8_dct_dc_avx512 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
-void x264_sub8x16_dct_dc_ssse3( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
+void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 8], pixel *pix1, pixel *pix2 );
+void x264_sub8x16_dct_dc_ssse3 ( int16_t dct [ 8], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 8], pixel *pix1, pixel *pix2 );
+void x264_sub8x16_dct_dc_avx512( int16_t dct [ 8], uint8_t *pix1, uint8_t *pix2 );
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] );
void x264_add4x4_idct_sse2 ( uint16_t *p_dst, int32_t dct [16] );
More information about the x264-devel
mailing list