[x265] [PATCH 005 of 307] x86: AVX-512 pixel_avg_weight_w16

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:03 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1498474278 -19800
#      Mon Jun 26 16:21:18 2017 +0530
# Node ID 5309fe76c442d720d2d3419eefab11f2a1f9731a
# Parent  2e5128235d577806f16e5cf93266dcd7f4155a63
x86: AVX-512 pixel_avg_weight_w16

diff -r 2e5128235d57 -r 5309fe76c442 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Jun 23 17:25:27 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Jun 26 16:21:18 2017 +0530
@@ -3754,6 +3754,8 @@
 
         p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx512);
         p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512);
+        p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx512);
+        p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx512);
     }
 #endif
 }
diff -r 2e5128235d57 -r 5309fe76c442 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Fri Jun 23 17:25:27 2017 +0530
+++ b/source/common/x86/mc-a.asm	Mon Jun 26 16:21:18 2017 +0530
@@ -3367,11 +3367,11 @@
     %endmacro
 %endif
 
-%macro AVG_END 0
+%macro AVG_END 0-1 2;rows
+    lea  t2, [t2+t3*2*SIZEOF_PIXEL]
     lea  t4, [t4+t5*2*SIZEOF_PIXEL]
-    lea  t2, [t2+t3*2*SIZEOF_PIXEL]
     lea  t0, [t0+t1*2*SIZEOF_PIXEL]
-    sub eax, 2
+    sub eax, %1
     jg .height_loop
  %ifidn movu,movq ; detect MMX
     EMMS
@@ -3434,17 +3434,24 @@
 %endmacro
 
 %macro BIWEIGHT_START_SSSE3 0
-    movzx  t6d, byte r6m ; FIXME x86_64
-    mov    t7d, 64
-    sub    t7d, t6d
-    shl    t7d, 8
-    add    t6d, t7d
-    mova    m4, [pw_512]
-    movd   xm3, t6d
+    movzx         t6d, byte r6m ; t6 = weight byte w (zero-extended); FIXME x86_64
+%if mmsize > 16
+    vbroadcasti128 m4, [pw_512] ; load 128 bits of pw_512 and repeat them in every 128-bit lane
+%else
+    mova           m4, [pw_512] ; m4 = pw_512 (the pmulhrsw multiplier in pixel_avg_weight_w16)
+%endif
+    lea           t7d, [t6+(64<<8)] ; t7 = w + (64 << 8)
+    shl           t6d, 8 ; t6 = w << 8
+    sub           t7d, t6d ; t7 = ((64 - w) << 8) + w, the value the replaced sequence left in t6
+%if cpuflag(avx512)
+    vpbroadcastw   m3, t7d ; avx512: replicate the low word of the GPR into every word of m3
+%else
+    movd          xm3, t7d ; older ISAs: move to a vector register first, then splat below
 %if cpuflag(avx2)
-    vpbroadcastw m3, xm3
+    vpbroadcastw   m3, xm3
 %else
-    SPLATW  m3, m3   ; weight_dst,src
+    SPLATW         m3, m3   ; weight_dst,src
+%endif
 %endif
 %endmacro
 
@@ -3586,6 +3593,42 @@
     vextracti128 [t0+t1], m0, 1
     AVG_END
 
+;-----------------------------------------------------------------------------
+; pixel_avg_weight_w16, AVX-512: weighted average of two 16-byte-wide sources,
+; four rows per loop pass (one row in each 128-bit lane of a ZMM register).
+; Sources are read through t2/t3 and t4/t5 and the result is stored through
+; t0/t1; m3 (weight pair) and m4 (rounding multiplier) are expected to come
+; from BIWEIGHT_START. AVG_END 4 steps the pointers and subtracts 4 from eax.
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal pixel_avg_weight_w16
+    BIWEIGHT_START
+    AVG_START 5
+.height_loop:
+    movu        xm0, [t2]               ; row 0 of each source -> lane 0
+    movu        xm1, [t4]
+    vinserti128 ym0, [t2+t3], 1         ; row 1 -> lane 1
+    vinserti128 ym1, [t4+t5], 1
+    lea          t2, [t2+t3*2]          ; step two rows here; AVG_END adds the other two
+    lea          t4, [t4+t5*2]
+    vinserti32x4 m0, [t2], 2            ; row 2 -> lane 2
+    vinserti32x4 m1, [t4], 2
+    vinserti32x4 m0, [t2+t3], 3         ; row 3 -> lane 3
+    vinserti32x4 m1, [t4+t5], 3
+    SBUTTERFLY   bw, 0, 1, 2            ; byte-interleave the two sources (m2 is the temporary)
+    pmaddubsw    m0, m3                 ; per word: byte pair multiplied by the weight pair and summed
+    pmaddubsw    m1, m3
+    pmulhrsw     m0, m4                 ; with m4 = 512 per word this is (x + 32) >> 6
+    pmulhrsw     m1, m4
+    packuswb     m0, m1                 ; back to bytes with unsigned saturation
+    mova       [t0], xm0                ; lane 0 -> destination row 0
+    vextracti128 [t0+t1], ym0, 1        ; lane 1 -> row 1
+    lea          t0, [t0+t1*2]          ; step two rows here; AVG_END adds the other two
+    vextracti32x4 [t0], m0, 2           ; lane 2 -> row 2
+    vextracti32x4 [t0+t1], m0, 3        ; lane 3 -> row 3
+    AVG_END 4
+
+INIT_YMM avx2 ; NOTE(review): w32 below has no INIT of its own; without this it inherits INIT_ZMM avx512 -- confirm avx2 was the prior mode
 cglobal pixel_avg_weight_w32
     BIWEIGHT_START
     AVG_START 5
@@ -4345,6 +4388,10 @@
 AVGH 16, 8
 AVGH 16, 4
 
+INIT_XMM avx512
+AVGH 16, 16 ; NOTE(review): presumably emits pixel_avg_16x16_avx512, which asm-primitives.cpp installs -- confirm against AVGH
+AVGH 16,  8
+
 %endif ;HIGH_BIT_DEPTH
 
 ;-------------------------------------------------------------------------------------------------------------------------------


More information about the x265-devel mailing list