[x264-devel] x86: AVX-512 sub8x8_dct_dc

Henrik Gramner git at videolan.org
Mon Jun 26 21:59:01 CEST 2017


x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Thu Jun 22 11:26:21 2017 +0200| [f672795407bf90045e399eb057e5b2426d79f961] | committer: Henrik Gramner

x86: AVX-512 sub8x8_dct_dc

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=f672795407bf90045e399eb057e5b2426d79f961
---

 common/dct.c         |  1 +
 common/x86/dct-a.asm | 24 ++++++++++++++++++++++++
 common/x86/dct.h     |  5 +++--
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/common/dct.c b/common/dct.c
index 1b2a2ea6..f70bf81c 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -717,6 +717,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->sub4x4_dct       = x264_sub4x4_dct_avx512;
         dctf->sub8x8_dct       = x264_sub8x8_dct_avx512;
         dctf->sub16x16_dct     = x264_sub16x16_dct_avx512;
+        dctf->sub8x8_dct_dc    = x264_sub8x8_dct_dc_avx512;
         dctf->add8x8_idct      = x264_add8x8_idct_avx512;
     }
 #endif //HAVE_MMX
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index dd8e357d..3d28b4d1 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -725,6 +725,30 @@ cglobal sub16x16_dct
     SUB4x16_DCT_AVX512 5, 3
     RET
 
+cglobal sub8x8_dct_dc, 3,3 ; stores 4 words to [r0]: sum/difference combinations of four (m0 - m1) partial sums; r1/r2 presumably feed the load macros (not shown here)
+    mova         m3, [dct_avx512] ; constant table handed to the load macros below
+    DCT8x8_LOAD_FENC_AVX512 m0, m3, 0, 4 ; 0 2 1 3
+    mov         r1d, 0xaa ; r1d is overwritten here with the mask constant
+    kmovb        k1, r1d ; k1 = 10101010b: selects the odd word lanes for the masked psubw below
+    psrld        m3, 5 ; shift every dword of the table right by 5 for the second load (presumably a second set of packed indices -- confirm against dct_avx512)
+    DCT8x8_LOAD_FDEC_AVX512 m1, m3, m2, 0, 4 ; m1 is consumed below; m2 is passed too (likely a temporary -- confirm in the macro)
+    pxor        xm3, xm3 ; zero operand for psadbw and for the negation; the m3-wide use below needs the upper lanes cleared as well (holds if this assembles to the VEX form -- confirm)
+    psadbw       m0, m3 ; SAD against zero = sum of each group of 8 bytes, one result per qword
+    psadbw       m1, m3 ; same byte sums for the second operand
+    psubw        m0, m1 ; per-qword difference of the byte sums (result in the low word)
+    vpmovqw    xmm0, m0 ; truncate the 8 qwords down to 8 words
+    vprold     xmm1, xmm0, 16 ; rotate each dword by 16 = swap its two words
+    paddw      xmm0, xmm1       ; 0 0 2 2 1 1 3 3
+    punpckhqdq xmm2, xmm0, xmm0 ; high qword copied to both halves: 1 1 3 3 1 1 3 3
+    psubw      xmm1, xmm0, xmm2 ; 0-1 0-1 2-3 2-3 (low half; the high half is not used)
+    paddw      xmm0, xmm2       ; 0+1 0+1 2+3 2+3 (low half; the high half is not used)
+    punpckldq  xmm0, xmm1       ; 0+1 0+1 0-1 0-1 2+3 2+3 2-3 2-3
+    punpcklqdq xmm1, xmm0, xmm0 ; low qword copied to both halves: 0+1 0+1 0-1 0-1 twice
+    psubw      xmm0 {k1}, xm3, xmm0 ; odd words become 0 - x (negated); even words keep their value
+    paddw      xmm0, xmm1       ; 0+1+2+3 0+1-2-3 0-1+2-3 0-1-2+3 (in the high half)
+    movhps     [r0], xmm0 ; only the upper 4 words hold the final values, so store just the high 64 bits
+    RET
+
 %macro SARSUMSUB 3 ; a, b, tmp
     mova    m%3, m%1
     vpsraw  m%1 {k1}, 1
diff --git a/common/x86/dct.h b/common/x86/dct.h
index e173c1fd..179369b8 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -45,8 +45,9 @@ void x264_sub8x8_dct_avx2   ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2
 void x264_sub8x8_dct_avx512 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct_avx512( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_dc_mmx2( int16_t dct    [ 4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_dc_sse2( dctcoef dct    [ 4], pixel   *pix1, pixel   *pix2 );
+void x264_sub8x8_dct_dc_mmx2   ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_sse2   ( dctcoef dct [ 4], pixel   *pix1, pixel   *pix2 );
+void x264_sub8x8_dct_dc_avx512 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub8x16_dct_dc_sse2 ( dctcoef dct  [ 4], pixel   *pix1, pixel   *pix2 );
 void x264_sub8x16_dct_dc_ssse3( int16_t dct  [ 4], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub8x16_dct_dc_avx  ( dctcoef dct  [ 4], pixel   *pix1, pixel   *pix2 );



More information about the x264-devel mailing list