[x264-devel] x86: AVX-512 zigzag_scan_8x8_frame

Mon May 22 00:03:03 CEST 2017

x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sat Mar 25 19:14:28 2017 +0100| [724a577237f27cdb0c0fd18ef8ed32d39430796b] | committer: Henrik Gramner

x86: AVX-512 zigzag_scan_8x8_frame

The vperm* instructions ignores unused bits, so we can pack the permutation
indices together to save cache and just use a shift to get the right values.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=724a577237f27cdb0c0fd18ef8ed32d39430796b
---

 common/common.h      |  2 +-
 common/dct.c         |  2 ++
 common/x86/dct-a.asm | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 common/x86/dct.h     | 11 ++++++-----
 encoder/macroblock.c |  4 ++--
 encoder/macroblock.h |  2 +-
 tools/checkasm.c     |  2 +-
 7 files changed, 66 insertions(+), 12 deletions(-)

diff --git a/common/common.h b/common/common.h
index e14dec7d..b49506c2 100644
--- a/common/common.h
+++ b/common/common.h
@@ -638,7 +638,7 @@ struct x264_t
         ALIGNED_64( dctcoef luma16x16_dc[3][16] );
         ALIGNED_16( dctcoef chroma_dc[2][8] );
         // FIXME share memory?
-        ALIGNED_32( dctcoef luma8x8[12][64] );
+        ALIGNED_64( dctcoef luma8x8[12][64] );
         ALIGNED_64( dctcoef luma4x4[16*3][16] );
     } dct;
 
diff --git a/common/dct.c b/common/dct.c
index 8ebb9ba5..5c1b8b5c 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -989,6 +989,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
     if( cpu&X264_CPU_AVX512 )
     {
         pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
+        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
     }
 #endif // HAVE_MMX
 #else
@@ -1033,6 +1034,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
     if( cpu&X264_CPU_AVX512 )
     {
         pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
+        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
     }
 #endif // HAVE_MMX
 #if HAVE_ALTIVEC
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index ad457237..a9b853c4 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -31,10 +31,18 @@
 %include "x86util.asm"
 
 SECTION_RODATA 64
+; Permutation indices for AVX-512 zigzags are bit-packed to save cache
 %if HIGH_BIT_DEPTH
-scan_frame_avx512: dd 0, 4, 1, 2, 5, 8,12, 9, 6, 3, 7,10,13,14,11,15
+scan_frame_avx512: dd 0x00bf0200, 0x00fd7484, 0x0033a611, 0x0069d822 ; bits 0-3:   4x4_frame
+                   dd 0x00a3ca95, 0x00dd8d08, 0x00e75b8c, 0x00a92919 ; bits 4-8:   8x8_frame1
+                   dd 0x0072f6a6, 0x003c8433, 0x007e5247, 0x00b6a0ba ; bits 9-13:  8x8_frame2
+                   dd 0x00ecf12d, 0x00f3239e, 0x00b9540b, 0x00ff868f ; bits 14-18: 8x8_frame3
+                                                                     ; bits 19-23: 8x8_frame4
 %else
-scan_frame_avx512: dw 0, 4, 1, 2, 5, 8,12, 9, 6, 3, 7,10,13,14,11,15
+scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3:   4x4_frame
+                   dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9:   8x8_frame1
+                   dw 0x5e10, 0x7da0, 0x9930, 0xb4c0, 0xd050, 0xec60, 0xf0d0, 0xd540 ; bits 10-15: 8x8_frame2
+                   dw 0xb9b0, 0x9e20, 0xbe90, 0xdb00, 0xf780, 0xfb10, 0xdea0, 0xfe30
 %endif
 
 pw_ppmmmmpp:    dw 1,1,-1,-1,-1,-1,1,1
@@ -1897,6 +1905,37 @@ cglobal zigzag_scan_4x4_frame, 2,2
     vpermd      m0, m0, [r1]
     mova      [r0], m0
     RET
+
+cglobal zigzag_scan_8x8_frame, 2,2
+    psrld       m0, [scan_frame_avx512], 4
+    mova        m1, [r1+0*64]
+    mova        m2, [r1+1*64]
+    mova        m3, [r1+2*64]
+    mova        m4, [r1+3*64]
+    mov        r1d, 0x01fe7f80
+    kmovd       k1, r1d
+    kshiftrd    k2, k1, 16
+    vpermd      m5, m0, m3  ; __ __ __ __ __ __ __ __ __ __ __ __ __ __ 32 40
+    psrld       m6, m0, 5
+    vpermi2d    m0, m1, m2  ;  0  8  1  2  9 16 24 17 10  3  4 11 18 25 __ __
+    vmovdqa64   m0 {k1}, m5
+    mova [r0+0*64], m0
+    mova        m5, m1
+    vpermt2d    m1, m6, m2  ; __ 26 19 12  5  6 13 20 27 __ __ __ __ __ __ __
+    psrld       m0, m6, 5
+    vpermi2d    m6, m3, m4  ; 33 __ __ __ __ __ __ __ __ 34 41 48 56 49 42 35
+    vmovdqa32   m6 {k2}, m1
+    mova [r0+1*64], m6
+    vpermt2d    m5, m0, m2  ; 28 21 14  7 15 22 29 __ __ __ __ __ __ __ __ 30
+    psrld       m1, m0, 5
+    vpermi2d    m0, m3, m4  ; __ __ __ __ __ __ __ 36 43 50 57 58 51 44 37 __
+    vmovdqa32   m5 {k1}, m0
+    mova [r0+2*64], m5
+    vpermt2d    m3, m1, m4  ; __ __ 38 45 52 59 60 53 46 39 47 54 61 62 55 63
+    vpermd      m2, m1, m2  ; 23 31 __ __ __ __ __ __ __ __ __ __ __ __ __ __
+    vmovdqa64   m2 {k2}, m3
+    mova [r0+3*64], m2
+    RET
 %else ; !HIGH_BIT_DEPTH
 INIT_YMM avx512
 cglobal zigzag_scan_4x4_frame, 2,2
@@ -1904,4 +1943,16 @@ cglobal zigzag_scan_4x4_frame, 2,2
     vpermw      m0, m0, [r1]
     mova      [r0], m0
     RET
+
+INIT_ZMM avx512
+cglobal zigzag_scan_8x8_frame, 2,2
+    psrlw       m0, [scan_frame_avx512], 4
+    mova        m1, [r1]
+    mova        m2, [r1+64]
+    psrlw       m3, m0, 6
+    vpermi2w    m0, m1, m2
+    vpermt2w    m1, m3, m2
+    mova      [r0], m0
+    mova   [r0+64], m1
+    RET
 %endif ; !HIGH_BIT_DEPTH
diff --git a/common/x86/dct.h b/common/x86/dct.h
index ce88c7e4..6254368b 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -101,11 +101,12 @@ void x264_add16x16_idct8_sse2( pixel *dst, dctcoef dct[4][64] );
 void x264_add8x8_idct8_avx   ( pixel *dst, dctcoef dct   [64] );
 void x264_add16x16_idct8_avx ( pixel *dst, dctcoef dct[4][64] );
 
-void x264_zigzag_scan_8x8_frame_xop  ( int16_t level[64], int16_t dct[64] );
-void x264_zigzag_scan_8x8_frame_avx  ( dctcoef level[64], dctcoef dct[64] );
-void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[64] );
-void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
-void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_mmx2  ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_sse2  ( dctcoef level[64], dctcoef dct[64] );
+void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_avx   ( dctcoef level[64], dctcoef dct[64] );
+void x264_zigzag_scan_8x8_frame_xop   ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_avx512( dctcoef level[64], dctcoef dct[64] );
 void x264_zigzag_scan_4x4_frame_mmx   ( int16_t level[16], int16_t dct[16] );
 void x264_zigzag_scan_4x4_frame_sse2  ( int32_t level[16], int32_t dct[16] );
 void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] );
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 3684e257..af8462a7 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -780,7 +780,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
         }
         else if( h->mb.b_transform_8x8 )
         {
-            ALIGNED_ARRAY_32( dctcoef, dct8x8,[4],[64] );
+            ALIGNED_ARRAY_64( dctcoef, dct8x8,[4],[64] );
             b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
 
             for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
@@ -1219,7 +1219,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
                 int quant_cat = p ? CQM_8PC : CQM_8PY;
                 pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
                 pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
-                ALIGNED_ARRAY_32( dctcoef, dct8x8,[64] );
+                ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] );
 
                 h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
                 int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index db85539f..1c901a89 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -154,7 +154,7 @@ static ALWAYS_INLINE void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_
     int nz;
     pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
     pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
-    ALIGNED_ARRAY_32( dctcoef, dct8x8,[64] );
+    ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] );
     ALIGNED_ARRAY_32( pixel, edge_buf,[36] );
 
     if( b_predict )
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 02c84989..d09a06a4 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -842,7 +842,7 @@ static int check_dct( int cpu_ref, int cpu_new )
     ALIGNED_ARRAY_64( dctcoef, dct1, [16],[16] );
     ALIGNED_ARRAY_32( dctcoef, dct2, [16],[16] );
     ALIGNED_ARRAY_64( dctcoef, dct4, [16],[16] );
-    ALIGNED_ARRAY_32( dctcoef, dct8, [4],[64] );
+    ALIGNED_ARRAY_64( dctcoef, dct8, [4],[64] );
     ALIGNED_16( dctcoef dctdc[2][8] );
     x264_t h_buf;
     x264_t *h = &h_buf;