[x265] [PATCH 209 of 307] x86: AVX512 optimise scale1D128to64 code

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:33:27 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1511864533 -19800
#      Tue Nov 28 15:52:13 2017 +0530
# Node ID af867976d51969b1770e6bcffd80e0389c88b561
# Parent  651bf679ed5c7ec6b68714e81d3c24664f08ec6a
x86: AVX512 optimise scale1D128to64 code

Previous performance           : 16.10x
Performance after optimisation : 20.71x

diff -r 651bf679ed5c -r af867976d519 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Tue Nov 28 15:09:00 2017 +0530
+++ b/source/common/x86/pixel-util8.asm	Tue Nov 28 15:52:13 2017 +0530
@@ -26,7 +26,7 @@
 %include "x86inc.asm"
 %include "x86util.asm"
 
-SECTION_RODATA 32
+SECTION_RODATA 64
 
 var_shuf_avx512: db 0,-1, 1,-1, 2,-1, 3,-1, 4,-1, 5,-1, 6,-1, 7,-1
                  db 8,-1, 9,-1,10,-1,11,-1,12,-1,13,-1,14,-1,15,-1
@@ -4713,65 +4713,63 @@
 
 %if HIGH_BIT_DEPTH == 0
 INIT_ZMM avx512
-cglobal scale1D_128to64, 2, 2, 6
+cglobal scale1D_128to64, 2, 2, 7  ; vector register count raised from 6 to 7: m6 now holds the vpermq index vector
     pxor            m4, m4
+    mova            m6, [dequant_shuf1_avx512]  ; qword index table for the vpermq below; defined outside this hunk, presumably the 0,2,4,6,1,3,5,7 order the removed shuffle pair produced - TODO confirm
     vbroadcasti32x8 m5, [pb_1]
 
     ;Top pixel
     movu            m0, [r1]
-    movu            m1, [r1 + 64]
-    movu            m2, [r1 + 128]
-    movu            m3, [r1 + 192]
-
-    pmaddubsw       m0, m0, m5
+    movu            m1, [r1 + 1 * mmsize]  ; mmsize replaces the literal 64 (assumes mmsize == 64 under INIT_ZMM; set in x86inc.asm, not shown)
+    movu            m2, [r1 + 2 * mmsize]  ; was r1 + 128
+    movu            m3, [r1 + 3 * mmsize]  ; was r1 + 192
+
+    pmaddubsw       m0, m5  ; two-operand spelling of the removed "m0, m0, m5"; sums adjacent byte pairs into words if pb_1 is all ones (table not shown)
     pavgw           m0, m4
-    pmaddubsw       m1, m1, m5
+    pmaddubsw       m1, m5  ; same pair-sum for the second source vector
     pavgw           m1, m4
     packuswb        m0, m1
-    vpermq          m0, m0, q3120
-    vshufi64x2      m0, m0, q3120
+    vpermq          m0, m6, m0  ; one index-driven qword permute replaces the vpermq q3120 + vshufi64x2 q3120 pair that reordered packuswb's per-lane output
     movu            [r0], m0
 
     ;Left pixel
-    pmaddubsw       m2, m2, m5
+    pmaddubsw       m2, m5  ; pair-sum of the third source vector
     pavgw           m2, m4
-    pmaddubsw       m3, m3, m5
+    pmaddubsw       m3, m5  ; pair-sum of the fourth source vector
     pavgw           m3, m4
     packuswb        m2, m3
-    vpermq          m2, m2, q3120
-    vshufi64x2      m2, m2, q3120
-    movu            [r0 + 64], m2
+    vpermq          m2, m6, m2  ; same single-instruction reorder as for m0, reusing the indices in m6
+    movu            [r0 + mmsize], m2  ; second output vector stored directly after the first (was r0 + 64)
     RET
 
 INIT_ZMM avx512
-cglobal scale1D_128to64_aligned, 2, 2, 6
+cglobal scale1D_128to64_aligned, 2, 2, 7  ; vector register count raised from 6 to 7: m6 now holds the vpermq index vector
     pxor            m4, m4
+    mova            m6, [dequant_shuf1_avx512]  ; qword index table for the vpermq below; defined outside this hunk, presumably the 0,2,4,6,1,3,5,7 order the removed shuffle pair produced - TODO confirm
     vbroadcasti32x8 m5, [pb_1]
 
     ;Top pixel
     mova            m0, [r1]
-    mova            m1, [r1 + 64]
-    mova            m2, [r1 + 128]
-    mova            m3, [r1 + 192]
-
-    pmaddubsw       m0, m0, m5
+    mova            m1, [r1 + 1 * mmsize]  ; aligned load; mmsize replaces the literal 64 (assumes mmsize == 64 under INIT_ZMM; set in x86inc.asm, not shown)
+    mova            m2, [r1 + 2 * mmsize]  ; was r1 + 128
+    mova            m3, [r1 + 3 * mmsize]  ; was r1 + 192
+
+    pmaddubsw       m0, m5  ; two-operand spelling of the removed "m0, m0, m5"; sums adjacent byte pairs into words if pb_1 is all ones (table not shown)
     pavgw           m0, m4
-    pmaddubsw       m1, m1, m5
+    pmaddubsw       m1, m5  ; same pair-sum for the second source vector
     pavgw           m1, m4
     packuswb        m0, m1
-    vpermq          m0, m0, q3120
-    vshufi64x2      m0, m0, q3120
+    vpermq          m0, m6, m0  ; one index-driven qword permute replaces the vpermq q3120 + vshufi64x2 q3120 pair that reordered packuswb's per-lane output
     mova            [r0], m0
 
     ;Left pixel
-    pmaddubsw       m2, m2, m5
+    pmaddubsw       m2, m5  ; pair-sum of the third source vector
     pavgw           m2, m4
-    pmaddubsw       m3, m3, m5
+    pmaddubsw       m3, m5  ; pair-sum of the fourth source vector
     pavgw           m3, m4
     packuswb        m2, m3
-    vpermq          m2, m2, q3120
-    vshufi64x2      m2, m2, q3120
-    mova            [r0 + 64], m2
+    vpermq          m2, m6, m2  ; same single-instruction reorder as for m0, reusing the indices in m6
+    mova            [r0 + mmsize], m2  ; aligned store of the second output vector directly after the first (was r0 + 64)
     RET
 %endif
 


More information about the x265-devel mailing list