[x264-devel] x86: AVX-512 sub8x8_dct_dc
Henrik Gramner
git at videolan.org
Mon Jun 26 21:59:01 CEST 2017
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Thu Jun 22 11:26:21 2017 +0200| [f672795407bf90045e399eb057e5b2426d79f961] | committer: Henrik Gramner
x86: AVX-512 sub8x8_dct_dc
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=f672795407bf90045e399eb057e5b2426d79f961
---
common/dct.c | 1 +
common/x86/dct-a.asm | 24 ++++++++++++++++++++++++
common/x86/dct.h | 5 +++--
3 files changed, 28 insertions(+), 2 deletions(-)
diff --git a/common/dct.c b/common/dct.c
index 1b2a2ea6..f70bf81c 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -717,6 +717,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub4x4_dct = x264_sub4x4_dct_avx512;
dctf->sub8x8_dct = x264_sub8x8_dct_avx512;
dctf->sub16x16_dct = x264_sub16x16_dct_avx512;
+ dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_avx512;
dctf->add8x8_idct = x264_add8x8_idct_avx512;
}
#endif //HAVE_MMX
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index dd8e357d..3d28b4d1 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -725,6 +725,30 @@ cglobal sub16x16_dct
SUB4x16_DCT_AVX512 5, 3
RET
+cglobal sub8x8_dct_dc, 3,3 ; writes 4 int16 DC terms to [r0]; C prototype in dct.h: (int16_t dct[4], uint8_t *pix1, uint8_t *pix2)
+ mova m3, [dct_avx512] ; constant table defined elsewhere in this file; consumed by both load macros
+ DCT8x8_LOAD_FENC_AVX512 m0, m3, 0, 4 ; 0 2 1 3
+ mov r1d, 0xaa ; 0b10101010; reuses r1 as scratch (NOTE(review): presumably pix1 is no longer needed after the FENC load -- confirm against the macro)
+ kmovb k1, r1d ; k1 selects the odd-numbered elements; consumed by the masked psubw below
+ psrld m3, 5 ; shift every dword of the table right by 5 before handing it to the FDEC load
+ DCT8x8_LOAD_FDEC_AVX512 m1, m3, m2, 0, 4 ; NOTE(review): presumably loads the fdec pixels into m1 with m2 as scratch -- macro body not visible here
+ pxor xm3, xm3 ; zero: second operand for psadbw and the minuend of the masked negate
+ psadbw m0, m3 ; SAD against zero = sum of each group of 8 bytes, one result per qword
+ psadbw m1, m3 ; same byte sums for m1
+ psubw m0, m1 ; per-qword difference of the two byte sums (m0 - m1)
+ vpmovqw xmm0, m0 ; truncate the 8 qwords to 8 packed words
+ vprold xmm1, xmm0, 16 ; rotate each dword by 16 bits, i.e. swap the two words of every pair
+ paddw xmm0, xmm1 ; 0 0 2 2 1 1 3 3
+ punpckhqdq xmm2, xmm0, xmm0 ; duplicate the high qword (1 1 3 3) into both halves
+ psubw xmm1, xmm0, xmm2 ; 0-1 0-1 2-3 2-3
+ paddw xmm0, xmm2 ; 0+1 0+1 2+3 2+3
+ punpckldq xmm0, xmm1 ; 0+1 0+1 0-1 0-1 2+3 2+3 2-3 2-3
+ punpcklqdq xmm1, xmm0, xmm0 ; duplicate the low qword (0+1 0+1 0-1 0-1) into both halves
+ psubw xmm0 {k1}, xm3, xmm0 ; masked 0 - x: negate the odd words, even words keep their value
+ paddw xmm0, xmm1 ; 0+1+2+3 0+1-2-3 0-1+2-3 0-1-2+3
+ movhps [r0], xmm0 ; only the high 64 bits (the four words listed above) are stored
+ RET
+
%macro SARSUMSUB 3 ; a, b, tmp
mova m%3, m%1
vpsraw m%1 {k1}, 1
diff --git a/common/x86/dct.h b/common/x86/dct.h
index e173c1fd..179369b8 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -45,8 +45,9 @@ void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2
void x264_sub8x8_dct_avx512 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_avx512( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
+void x264_sub8x8_dct_dc_mmx2 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
+void x264_sub8x8_dct_dc_avx512 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_ssse3( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
More information about the x264-devel mailing list