[x264-devel] x86: AVX-512 sub8x16_dct_dc

Henrik Gramner git at videolan.org
Mon Jun 26 21:59:05 CEST 2017


x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Thu Jun 22 19:51:28 2017 +0200| [1d9dee2e9be717fcde416854f902db776312f141] | committer: Henrik Gramner

x86: AVX-512 sub8x16_dct_dc

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=1d9dee2e9be717fcde416854f902db776312f141
---

 common/dct.c         |  1 +
 common/x86/dct-a.asm | 34 ++++++++++++++++++++++++++++++++++
 common/x86/dct.h     |  7 ++++---
 3 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/common/dct.c b/common/dct.c
index f70bf81c..70853bf9 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -718,6 +718,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->sub8x8_dct       = x264_sub8x8_dct_avx512;
         dctf->sub16x16_dct     = x264_sub16x16_dct_avx512;
         dctf->sub8x8_dct_dc    = x264_sub8x8_dct_dc_avx512;
+        dctf->sub8x16_dct_dc   = x264_sub8x16_dct_dc_avx512;
         dctf->add8x8_idct      = x264_add8x8_idct_avx512;
     }
 #endif //HAVE_MMX
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 3d28b4d1..33ed0618 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -749,6 +749,40 @@ cglobal sub8x8_dct_dc, 3,3
     movhps     [r0], xmm0
     RET
 
+cglobal sub8x16_dct_dc, 3,3                ; args: dct (r0), pix1, pix2 -- prototype x264_sub8x16_dct_dc_avx512 in dct.h
+    mova         m5, [dct_avx512]          ; m5 = constant table handed to the load macros below
+    DCT8x8_LOAD_FENC_AVX512 m0, m5, 0, 8  ; 0 4 1 5
+    DCT8x8_LOAD_FENC_AVX512 m1, m5, 4, 12 ; 2 6 3 7
+    mov         r1d, 0xaa                  ; 0xaa = 10101010b (bits 1,3,5,7 set); r1 is reused as scratch here
+    kmovb        k1, r1d                   ; k1 = that mask; NOTE(review): presumably consumed by the FDEC load macro -- confirm
+    psrld        m5, 5                     ; shift every dword of m5 right by 5; NOTE(review): presumably selects the FDEC fields of dct_avx512 -- confirm
+    DCT8x8_LOAD_FDEC_AVX512 m2, m5, m4, 0, 8  ; NOTE(review): expected to mirror m0's 0 4 1 5 layout (m4 passed as extra reg) -- macro not shown
+    DCT8x8_LOAD_FDEC_AVX512 m3, m5, m4, 4, 12 ; NOTE(review): expected to mirror m1's 2 6 3 7 layout -- macro not shown
+    pxor        xm4, xm4                   ; zero operand for the psadbw sums below
+    psadbw       m0, m4                    ; SAD against zero = sum of each group of 8 bytes, one result per qword
+    psadbw       m1, m4                    ; same for the second FENC register
+    psadbw       m2, m4                    ; same for the first FDEC register
+    psadbw       m3, m4                    ; same for the second FDEC register
+    psubw        m0, m2                    ; FENC sums - FDEC sums
+    psubw        m1, m3                    ; FENC sums - FDEC sums
+    SBUTTERFLY  qdq, 0, 1, 2               ; interleave qwords of m0 and m1 (m2 = temp)
+    paddw        m0, m1                    ; combine the two partial sums held in matching qwords
+    vpmovqw    xmm0, m0         ; 0 2 4 6 1 3 5 7
+    psrlq      xmm2, xmm0, 32              ; move the upper dword of each qword down to pair it with the lower one
+    psubw      xmm1, xmm0, xmm2 ; 0-4 2-6 1-5 3-7
+    paddw      xmm0, xmm2       ; 0+4 2+6 1+5 3+7
+    punpckhdq  xmm2, xmm0, xmm1            ; xmm2 = interleaved high dwords of the sums and differences
+    punpckldq  xmm0, xmm1                  ; xmm0 = interleaved low dwords of the sums and differences
+    psubw      xmm1, xmm0, xmm2 ; 0-1+4-5 2-3+6-7 0-1-4+5 2-3-6+7
+    paddw      xmm0, xmm2       ; 0+1+4+5 2+3+6+7 0+1-4-5 2+3-6-7
+    punpcklwd  xmm0, xmm1                  ; interleave the low words of the sum and difference vectors
+    psrlq      xmm2, xmm0, 32              ; again pair the upper dword of each qword with the lower one
+    psubw      xmm1, xmm0, xmm2 ; 0+1-2-3+4+5-6-7 0-1-2+3+4-5-6+7 0+1-2-3-4-5+6+7 0-1-2+3-4+5+6-7
+    paddw      xmm0, xmm2       ; 0+1+2+3+4+5+6+7 0-1+2-3+4-5+6-7 0+1+2+3-4-5-6-7 0-1+2-3-4+5-6+7
+    shufps     xmm0, xmm1, q0220           ; result dwords = xmm0[0], xmm0[2], xmm1[2], xmm1[0]
+    mova       [r0], xmm0                  ; aligned 16-byte store: the 8 int16 outputs written to dct[]
+    RET
+
 %macro SARSUMSUB 3 ; a, b, tmp
     mova    m%3, m%1
     vpsraw  m%1 {k1}, 1
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 179369b8..20a65c53 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -48,9 +48,10 @@ void x264_sub16x16_dct_avx512( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2
 void x264_sub8x8_dct_dc_mmx2   ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub8x8_dct_dc_sse2   ( dctcoef dct [ 4], pixel   *pix1, pixel   *pix2 );
 void x264_sub8x8_dct_dc_avx512 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x16_dct_dc_sse2 ( dctcoef dct  [ 4], pixel   *pix1, pixel   *pix2 );
-void x264_sub8x16_dct_dc_ssse3( int16_t dct  [ 4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x16_dct_dc_avx  ( dctcoef dct  [ 4], pixel   *pix1, pixel   *pix2 );
+void x264_sub8x16_dct_dc_sse2  ( dctcoef dct [ 8], pixel   *pix1, pixel   *pix2 );
+void x264_sub8x16_dct_dc_ssse3 ( int16_t dct [ 8], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x16_dct_dc_avx   ( dctcoef dct [ 8], pixel   *pix1, pixel   *pix2 );
+void x264_sub8x16_dct_dc_avx512( int16_t dct [ 8], uint8_t *pix1, uint8_t *pix2 );
 
 void x264_add4x4_idct_mmx       ( uint8_t *p_dst, int16_t dct    [16] );
 void x264_add4x4_idct_sse2     ( uint16_t *p_dst, int32_t dct    [16] );



More information about the x264-devel mailing list