[x264-devel] x86: AVX-512 sub16x16_dct

Mon Jun 26 21:58:54 CEST 2017

x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sat Jun 10 16:01:53 2017 +0200| [9034085265e5ca56e801c3efbf5c538fcc17c82b] | committer: Henrik Gramner

x86: AVX-512 sub16x16_dct

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=9034085265e5ca56e801c3efbf5c538fcc17c82b
---

 common/common.h      |  2 +-
 common/dct.c         |  1 +
 common/macroblock.c  |  5 ++---
 common/x86/dct-a.asm | 33 +++++++++++++++++++++++++++++----
 common/x86/dct.h     |  1 +
 5 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/common/common.h b/common/common.h
index 33319838..867b2073 100644
--- a/common/common.h
+++ b/common/common.h
@@ -778,7 +778,7 @@ struct x264_t
 #define FENC_STRIDE 16
 #define FDEC_STRIDE 32
             ALIGNED_64( pixel fenc_buf[48*FENC_STRIDE] );
-            ALIGNED_64( pixel fdec_buf[52*FDEC_STRIDE] );
+            ALIGNED_64( pixel fdec_buf[54*FDEC_STRIDE] );
 
             /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
             ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
diff --git a/common/dct.c b/common/dct.c
index 3e7aa39a..0d7f96de 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -716,6 +716,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
     {
         dctf->sub4x4_dct       = x264_sub4x4_dct_avx512;
         dctf->sub8x8_dct       = x264_sub8x8_dct_avx512;
+        dctf->sub16x16_dct     = x264_sub16x16_dct_avx512;
     }
 #endif //HAVE_MMX
 
diff --git a/common/macroblock.c b/common/macroblock.c
index 8dc9f975..61686715 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -532,16 +532,15 @@ void x264_macroblock_thread_init( x264_t *h )
     h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
     h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
     h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
+    h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE;
     if( CHROMA444 )
     {
         h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE;
-        h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
-        h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 36*FDEC_STRIDE;
+        h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 38*FDEC_STRIDE;
     }
     else
     {
         h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
-        h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE;
         h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE + 16;
     }
 }
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 31897d9a..42af7c63 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -47,10 +47,10 @@ cavlc_shuf_avx512: dd 0x00018820, 0x000398a4, 0x0005a928, 0x0007b9ac ; bits 0-4:
                    dd 0x00010c01, 0x00031c85, 0x00052d09, 0x00073d8d ; bits 10-14: interleave3
                    dd 0x00094e11, 0x000b5e95, 0x000d6f19, 0x000f7f9d ; bits 15-19: interleave4
 %else
-dct_avx512:        dd 0x00000000, 0x00000104, 0x0000014c, 0x00000048 ; bits 0-4:   dct8x8_fenc
-                   dd 0x00000210, 0x00000314, 0x0000035c, 0x00000258 ; bits 5-9:   dct8x8_fdec
-                   dd 0x00000021, 0x00000125, 0x0000016d, 0x00000069
-                   dd 0x00000231, 0x00000335, 0x0000037d, 0x00000279
+dct_avx512:        dd 0x00000000, 0x00021104, 0x0006314c, 0x00042048 ; bits 0-4:   dct8x8_fenc
+                   dd 0x00008a10, 0x00029b14, 0x0006bb5c, 0x0004aa58 ; bits 5-9:   dct8x8_fdec
+                   dd 0x00004421, 0x00025525, 0x0006756d, 0x00046469 ; bits 10-13: dct16x16_fenc
+                   dd 0x0000ce31, 0x0002df35, 0x0006ff7d, 0x0004ee79 ; bits 14-18: dct16x16_fdec
 scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3:   4x4_frame
                    dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9:   8x8_frame1
                    dw 0x5e10, 0x7da0, 0x9930, 0xb4c0, 0xd050, 0xec60, 0xf0d0, 0xd540 ; bits 10-15: 8x8_frame2
@@ -699,6 +699,31 @@ cglobal sub8x8_dct, 3,3
     mova     [r0], m0
     mova  [r0+64], m1
     RET
+
+%macro SUB4x16_DCT_AVX512 2 ; dst, src
+    vpermd   m1, m5, [r1+1*%2*64]
+    mova     m3,     [r2+2*%2*64]
+    vpermt2d m3, m6, [r2+2*%2*64+64]
+    call dct4x4x4_internal_avx512
+    mova [r0+%1*64    ], m0
+    mova [r0+%1*64+128], m1
+%endmacro
+
+cglobal sub16x16_dct
+    psrld    m5, [dct_avx512], 10
+    mov     eax, 0xaaaaaaaa
+    kmovd    k1, eax
+    mov     eax, 0xf0f0f0f0
+    kmovd    k2, eax
+    PROLOGUE 3,3
+    pxor    xm4, xm4
+    knotw    k3, k2
+    psrld    m6, m5, 4
+    SUB4x16_DCT_AVX512 0, 0
+    SUB4x16_DCT_AVX512 1, 1
+    SUB4x16_DCT_AVX512 4, 2
+    SUB4x16_DCT_AVX512 5, 3
+    RET
 %endif ; HIGH_BIT_DEPTH
 
 INIT_MMX
diff --git a/common/x86/dct.h b/common/x86/dct.h
index f605437a..c30b0daa 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -44,6 +44,7 @@ void x264_sub16x16_dct_xop  ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2
 void x264_sub8x8_dct_avx2   ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub8x8_dct_avx512 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_avx512( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub8x8_dct_dc_mmx2( int16_t dct    [ 4], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub8x8_dct_dc_sse2( dctcoef dct    [ 4], pixel   *pix1, pixel   *pix2 );
 void x264_sub8x16_dct_dc_sse2 ( dctcoef dct  [ 4], pixel   *pix1, pixel   *pix2 );