[x264-devel] x86: AVX-512 pixel_avg_weight_w16

Mon Jun 26 21:59:10 CEST 2017

x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sat Jun 24 14:26:25 2017 +0200| [d3214e6b102701911fc9d5fc92435e79e8b49100] | committer: Henrik Gramner

x86: AVX-512 pixel_avg_weight_w16

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=d3214e6b102701911fc9d5fc92435e79e8b49100
---

 common/x86/mc-a.asm | 62 ++++++++++++++++++++++++++++++++++++++++++-----------
 common/x86/mc-c.c   |  9 +++++++-
 2 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 92029ade..2dbdee5d 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -83,11 +83,11 @@ cextern deinterleave_shufd
     %endmacro
 %endif
 
-%macro AVG_END 0
-    lea  t4, [t4+t5*2*SIZEOF_PIXEL]
+%macro AVG_END 0-1 2 ; rows: amount subtracted from eax per loop iteration (optional, default 2)
     lea  t2, [t2+t3*2*SIZEOF_PIXEL]
+    lea  t4, [t4+t5*2*SIZEOF_PIXEL]
     lea  t0, [t0+t1*2*SIZEOF_PIXEL]
-    sub eax, 2
+    sub eax, %1 ; only the counter honours the rows argument; the three lea steps above stay fixed at 2x
     jg .height_loop
     RET
 %endmacro
@@ -147,17 +147,24 @@ cextern deinterleave_shufd
 %endmacro
 
 %macro BIWEIGHT_START_SSSE3 0
-    movzx  t6d, byte r6m ; FIXME x86_64
-    mov    t7d, 64
-    sub    t7d, t6d
-    shl    t7d, 8
-    add    t6d, t7d
-    mova    m4, [pw_512]
-    movd   xm3, t6d
+    movzx         t6d, byte r6m ; FIXME x86_64
+%if mmsize > 16
+    vbroadcasti128 m4, [pw_512] ; registers wider than 16 bytes: repeat the 128-bit pw_512 constant in every lane
+%else
+    mova           m4, [pw_512]
+%endif
+    lea           t7d, [t6+(64<<8)] ; t7d = w + (64<<8), where w is the byte loaded into t6d
+    shl           t6d, 8 ; t6d = w<<8
+    sub           t7d, t6d ; t7d = ((64-w)<<8) + w, the same value the removed sequence left in t6d
+%if cpuflag(avx512)
+    vpbroadcastw   m3, t7d ; broadcast the low word directly from the general-purpose register
+%else
+    movd          xm3, t7d ; otherwise move it into xm3 first, then splat
 %if cpuflag(avx2)
-    vpbroadcastw m3, xm3
+    vpbroadcastw   m3, xm3
 %else
-    SPLATW  m3, m3   ; weight_dst,src
+    SPLATW         m3, m3   ; weight_dst,src
+%endif
 %endif
 %endmacro
 
@@ -268,6 +275,34 @@ cglobal pixel_avg_weight_w16
     mova    [t0], xm0
     vextracti128 [t0+t1], m0, 1
     AVG_END
+
+INIT_ZMM avx512
+cglobal pixel_avg_weight_w16
+    BIWEIGHT_START ; NOTE(review): defined elsewhere; presumably sets up m3 (packed weights) and m4 (pw_512) used below - confirm
+    AVG_START 5
+.height_loop: ; each iteration does four 16-byte loads from t2 and from t4 and four 16-byte stores to t0
+    movu        xm0, [t2] ; lane 0 of m0 <- [t2]
+    movu        xm1, [t4] ; lane 0 of m1 <- [t4]
+    vinserti128 ym0, [t2+t3], 1 ; lane 1 <- [t2+t3]
+    vinserti128 ym1, [t4+t5], 1 ; lane 1 <- [t4+t5]
+    lea          t2, [t2+t3*2] ; step t2 by 2*t3 so lanes 2-3 reuse the same addressing
+    lea          t4, [t4+t5*2] ; step t4 by 2*t5 likewise
+    vinserti32x4 m0, [t2], 2 ; lane 2
+    vinserti32x4 m1, [t4], 2 ; lane 2
+    vinserti32x4 m0, [t2+t3], 3 ; lane 3
+    vinserti32x4 m1, [t4+t5], 3 ; lane 3
+    SBUTTERFLY   bw, 0, 1, 2 ; NOTE(review): SBUTTERFLY is defined elsewhere; presumably byte-interleaves m0/m1 with m2 as scratch - confirm
+    pmaddubsw    m0, m3 ; unsigned bytes of m0 times signed bytes of m3, adjacent products summed into words
+    pmaddubsw    m1, m3 ; same for m1
+    pmulhrsw     m0, m4 ; rounded high multiply by m4; equals (x+32)>>6 if pw_512 is words of 512 - TODO confirm
+    pmulhrsw     m1, m4 ; same for m1
+    packuswb     m0, m1 ; saturate the words back to unsigned bytes
+    mova       [t0], xm0 ; lane 0 -> [t0]
+    vextracti128 [t0+t1], ym0, 1 ; lane 1 -> [t0+t1]
+    lea          t0, [t0+t1*2] ; step t0 by 2*t1 for lanes 2-3
+    vextracti32x4 [t0], m0, 2 ; lane 2
+    vextracti32x4 [t0+t1], m0, 3 ; lane 3
+    AVG_END 4 ; AVG_END adds another 2x step (scaled by SIZEOF_PIXEL) to t0/t2/t4 and subtracts 4 from eax
 %endif ;HIGH_BIT_DEPTH
 
 ;=============================================================================
@@ -738,6 +773,9 @@ INIT_XMM avx2
 AVG_FUNC 16, movdqu, movdqa
 AVGH 16, 16
 AVGH 16,  8
+INIT_XMM avx512
+AVGH 16, 16 ; NOTE(review): AVGH is defined elsewhere; presumably emits the pixel_avg_16x16 avx512 symbol that mc-c.c assigns - confirm
+AVGH 16,  8 ; likewise for the 16x8 variant
 
 %endif ;HIGH_BIT_DEPTH
 
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 74cfffec..0a7e414c 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -32,7 +32,8 @@
     void func##_mmx2 args;\
     void func##_sse2 args;\
     void func##_ssse3 args;\
-    void func##_avx2 args;
+    void func##_avx2 args;\
+    void func##_avx512 args;
 
 DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
 DECL_SUF( x264_pixel_avg_16x8,  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
@@ -865,6 +866,12 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
         pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2;
     }
+
+    if( cpu&X264_CPU_AVX512 )
+    {
+        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx512;
+        pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_avx512;
+    }
 #endif // HIGH_BIT_DEPTH
 
     if( !(cpu&X264_CPU_AVX) )