[x264-devel] x86: AVX-512 plane_copy_deinterleave_v210

Mon May 22 00:03:47 CEST 2017

x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Wed Apr 12 16:21:09 2017 +0200| [3081ffa1c540d1df05123e0fab1937985573ac78] | committer: Henrik Gramner

x86: AVX-512 plane_copy_deinterleave_v210

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=3081ffa1c540d1df05123e0fab1937985573ac78
---

 common/frame.c       |   2 +-
 common/x86/mc-a2.asm | 110 +++++++++++++++++++++++++++++++--------------------
 common/x86/mc-c.c    |  26 +++++++-----
 tools/checkasm.c     |   2 +-
 4 files changed, 87 insertions(+), 53 deletions(-)

diff --git a/common/frame.c b/common/frame.c
index a81e9b10..4d80cbb0 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -78,7 +78,7 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
     int i_padv = PADV << PARAM_INTERLACED;
     int align = 16;
 #if ARCH_X86 || ARCH_X86_64
-    if( h->param.cpu&X264_CPU_CACHELINE_64 )
+    if( h->param.cpu&X264_CPU_CACHELINE_64 || h->param.cpu&X264_CPU_AVX512 )
         align = 64;
     else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX )
         align = 32;
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 45692ff5..5bf452ba 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -30,18 +30,15 @@
 %include "x86inc.asm"
 %include "x86util.asm"
 
-SECTION_RODATA 32
-
-pw_1024: times 16 dw 1024
-filt_mul20: times 32 db 20
-filt_mul15: times 16 db 1, -5
-filt_mul51: times 16 db -5, 1
-hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
+SECTION_RODATA 64
 
 %if HIGH_BIT_DEPTH
-v210_mask: times 4 dq 0xc00ffc003ff003ff
-v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15
-v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14
+v210_shuf_avx512: db  0, 0,34, 1,35,34, 4, 4,38, 5,39,38, 8, 8,42, 9, ; luma, chroma
+                  db 43,42,12,12,46,13,47,46,16,16,50,17,51,50,20,20,
+                  db 54,21,55,54,24,24,58,25,59,58,28,28,62,29,63,62
+v210_mask:        dd 0x3ff003ff, 0xc00ffc00, 0x3ff003ff, 0xc00ffc00
+v210_luma_shuf:   db  1, 2, 4, 5, 6, 7, 9,10,12,13,14,15,12,13,14,15
+v210_chroma_shuf: db  0, 1, 2, 3, 5, 6, 8, 9,10,11,13,14,10,11,13,14
 ; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register
 v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800
            dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800
@@ -58,6 +55,12 @@ deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
 %endif ; !HIGH_BIT_DEPTH
 
+pw_1024: times 16 dw 1024
+filt_mul20: times 32 db 20
+filt_mul15: times 16 db 1, -5
+filt_mul51: times 16 db -5, 1
+hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
+
 mbtree_fix8_unpack_shuf: db -1,-1, 1, 0,-1,-1, 3, 2,-1,-1, 5, 4,-1,-1, 7, 6
                          db -1,-1, 9, 8,-1,-1,11,10,-1,-1,13,12,-1,-1,15,14
 mbtree_fix8_pack_shuf:   db  1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14
@@ -1400,43 +1403,64 @@ cglobal plane_copy_deinterleave_v210, 7,7,7
 %define org_w r6m
 %define h     dword r7m
 %endif
-    FIX_STRIDES r1, r3, r6d
-    shl    r5, 2
-    add    r0, r6
-    add    r2, r6
-    neg    r6
-    mov   src, r4
-    mov org_w, r6
-    mova   m2, [v210_mask]
-    mova   m3, [v210_luma_shuf]
-    mova   m4, [v210_chroma_shuf]
-    mova   m5, [v210_mult] ; also functions as vpermd index for avx2
-    pshufd m6, m5, q1102
-
+    FIX_STRIDES  r1, r3, r6d
+    shl          r5, 2
+    add          r0, r6
+    add          r2, r6
+    neg          r6
+    mov         src, r4
+    mov       org_w, r6
+%if cpuflag(avx512)
+    vpbroadcastd m2, [v210_mask]
+    vpbroadcastd m3, [v210_shuf_avx512]
+    psrlw        m3, 6                  ; dw 0, 4
+    mova         m4, [v210_shuf_avx512] ; luma
+    psrlw        m5, m4, 8              ; chroma
+%else
+%if mmsize == 32
+    vbroadcasti128 m2, [v210_mask]
+    vbroadcasti128 m3, [v210_luma_shuf]
+    vbroadcasti128 m4, [v210_chroma_shuf]
+%else
+    mova         m2, [v210_mask]
+    mova         m3, [v210_luma_shuf]
+    mova         m4, [v210_chroma_shuf]
+%endif
+    mova         m5, [v210_mult] ; also functions as vpermd index for avx2
+    pshufd       m6, m5, q1102
+%endif
 ALIGN 16
 .loop:
-    movu   m1, [r4]
-    pandn  m0, m2, m1
-    pand   m1, m2
-    pshufb m0, m3
-    pshufb m1, m4
-    pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
-    pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __
+    movu         m1, [r4]
+    pandn        m0, m2, m1
+    pand         m1, m2
+%if cpuflag(avx512)
+    psrld        m0, 10
+    vpsrlvw      m1, m3
+    mova         m6, m0
+    vpermt2w     m0, m4, m1
+    vpermt2w     m1, m5, m6
+%else
+    pshufb       m0, m3
+    pshufb       m1, m4
+    pmulhrsw     m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
+    pmulhrsw     m1, m6 ; u0 v0 u1 v1 u2 v2 __ __
 %if mmsize == 32
-    vpermd m0, m5, m0
-    vpermd m1, m5, m1
+    vpermd       m0, m5, m0
+    vpermd       m1, m5, m1
 %endif
-    movu [r0+r6], m0
-    movu [r2+r6], m1
-    add    r4, mmsize
-    add    r6, 3*mmsize/4
+%endif
+    movu    [r0+r6], m0
+    movu    [r2+r6], m1
+    add          r4, mmsize
+    add          r6, mmsize*3/4
     jl .loop
-    add    r0, r1
-    add    r2, r3
-    add   src, r5
-    mov    r4, src
-    mov    r6, org_w
-    dec     h
+    add          r0, r1
+    add          r2, r3
+    add         src, r5
+    mov          r4, src
+    mov          r6, org_w
+    dec           h
     jg .loop
     RET
 %endmacro ; PLANE_DEINTERLEAVE_V210
@@ -1461,6 +1485,8 @@ PLANE_DEINTERLEAVE_V210
 INIT_YMM avx2
 LOAD_DEINTERLEAVE_CHROMA
 PLANE_DEINTERLEAVE_V210
+INIT_ZMM avx512
+PLANE_DEINTERLEAVE_V210
 %else
 INIT_XMM sse2
 PLANE_DEINTERLEAVE_RGB
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index b7f508a2..a89a139d 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -123,15 +123,18 @@ void x264_plane_copy_deinterleave_rgb_avx2 ( pixel *dsta, intptr_t i_dsta,
                                              pixel *dstb, intptr_t i_dstb,
                                              pixel *dstc, intptr_t i_dstc,
                                              pixel *src,  intptr_t i_src, int pw, int w, int h );
-void x264_plane_copy_deinterleave_v210_ssse3( uint16_t *dstu, intptr_t i_dstu,
-                                              uint16_t *dstv, intptr_t i_dstv,
-                                              uint32_t *src,  intptr_t i_src, int w, int h );
-void x264_plane_copy_deinterleave_v210_avx  ( uint16_t *dstu, intptr_t i_dstu,
-                                              uint16_t *dstv, intptr_t i_dstv,
-                                              uint32_t *src,  intptr_t i_src, int w, int h );
-void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu,
-                                              uint16_t *dstv, intptr_t i_dstv,
-                                              uint32_t *src,  intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_v210_ssse3 ( uint16_t *dstu, intptr_t i_dstu,
+                                               uint16_t *dstv, intptr_t i_dstv,
+                                               uint32_t *src,  intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_v210_avx   ( uint16_t *dstu, intptr_t i_dstu,
+                                               uint16_t *dstv, intptr_t i_dstv,
+                                               uint32_t *src,  intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_v210_avx2  ( uint16_t *dstu, intptr_t i_dstu,
+                                               uint16_t *dstv, intptr_t i_dstv,
+                                               uint32_t *src,  intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_v210_avx512( uint16_t *dstu, intptr_t i_dstu,
+                                               uint16_t *dstv, intptr_t i_dstv,
+                                               uint32_t *src,  intptr_t i_src, int w, int h );
 void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
 void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
 void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
@@ -689,6 +692,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx2;
         pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
     }
+
+    if( cpu&X264_CPU_AVX512 )
+    {
+        pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx512;
+    }
 #else // !HIGH_BIT_DEPTH
 
 #if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 4c84ade9..d963f774 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1605,7 +1605,7 @@ static int check_mc( int cpu_ref, int cpu_new )
         {
             int w = (plane_specs[i].w + 1) >> 1;
             int h = plane_specs[i].h;
-            intptr_t dst_stride = ALIGN( w, 16 );
+            intptr_t dst_stride = ALIGN( w, 32 );
             intptr_t src_stride = (w + 47) / 48 * 128 / sizeof(uint32_t);
             intptr_t offv = dst_stride*h + 32;
             memset( pbuf3, 0, 0x1000 );