[x265] [PATCH 081 of 307] x86: AVX512 cleanup blockcopy_sp_64x64

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:31:19 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1502709712 -19800
#      Mon Aug 14 16:51:52 2017 +0530
# Node ID 5c18b655a88a739b87c6b071d186a2b9286b8266
# Parent  4a643ecb8c3bcc4dab96bfe56217d4449564bae0
x86: AVX512 cleanup blockcopy_sp_64x64

diff -r 4a643ecb8c3b -r 5c18b655a88a source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Tue Aug 08 17:01:50 2017 +0530
+++ b/source/common/x86/blockcopy8.asm	Mon Aug 14 16:51:52 2017 +0530
@@ -26,7 +26,10 @@
 %include "x86inc.asm"
 %include "x86util.asm"
 
-SECTION_RODATA 32
+SECTION_RODATA 64
+
+ALIGN 64
+const shuf1_avx512,  dq 0, 2, 4, 6, 1, 3, 5, 7    ; qword source indices for vpermq: even qwords (0,2,4,6) go to the low half, odd qwords (1,3,5,7) to the high half
 
 cextern pb_4
 cextern pb_1
@@ -2162,7 +2165,7 @@
 
 BLOCKCOPY_SP_W64_H4_avx2 64, 64
 
-%macro PROCESS_BLOCKCOPY_SP_64x8_AVX512 0
+%macro PROCESS_BLOCKCOPY_SP_64x4_AVX512 0
     movu               m0,             [r2]
     movu               m1,             [r2 + 64]
     movu               m2,             [r2 + r3]
@@ -2170,10 +2173,8 @@
 
     packuswb           m0,             m1
     packuswb           m2,             m3
-    vpermq             m0,             m0,                 11011000b
-    vpermq             m2,             m2,                 11011000b
-    vshufi64x2         m0,             m0,                 11011000b
-    vshufi64x2         m2,             m2,                 11011000b
+    vpermq             m0,             m4,         m0
+    vpermq             m2,             m4,         m2
     movu               [r0],           m0
     movu               [r0 + r1],      m2
 
@@ -2184,73 +2185,25 @@
 
     packuswb           m0,             m1
     packuswb           m2,             m3
-    vpermq             m0,             m0,                 11011000b
-    vpermq             m2,             m2,                 11011000b
-    vshufi64x2         m0,             m0,                 11011000b
-    vshufi64x2         m2,             m2,                 11011000b
-    movu               [r0 + 2 * r1],  m0
-    movu               [r0 + r5],      m2
-
-    lea    r0, [r0 + 4 * r1]
-    lea    r2, [r2 + 4 * r3]
-
-    movu               m0,             [r2]
-    movu               m1,             [r2 + 64]
-    movu               m2,             [r2 + r3]
-    movu               m3,             [r2 + r3 + 64]
-
-    packuswb           m0,             m1
-    packuswb           m2,             m3
-    vpermq             m0,             m0,                 11011000b
-    vpermq             m2,             m2,                 11011000b
-    vshufi64x2         m0,             m0,                 11011000b
-    vshufi64x2         m2,             m2,                 11011000b
-    movu               [r0],           m0
-    movu               [r0 + r1],      m2
-
-    movu               m0,             [r2 + 2 * r3]
-    movu               m1,             [r2 + 2 * r3 + 64]
-    movu               m2,             [r2 + r4]
-    movu               m3,             [r2 + r4 + 64]
-
-    packuswb           m0,             m1
-    packuswb           m2,             m3
-    vpermq             m0,             m0,                 11011000b
-    vpermq             m2,             m2,                 11011000b
-    vshufi64x2         m0,             m0,                 11011000b
-    vshufi64x2         m2,             m2,                 11011000b
+    vpermq             m0,             m4,         m0
+    vpermq             m2,             m4,         m2
     movu               [r0 + 2 * r1],  m0
     movu               [r0 + r5],      m2
 %endmacro
 
 INIT_ZMM avx512
-cglobal blockcopy_sp_64x64, 4, 6, 4
+cglobal blockcopy_sp_64x64, 4, 6, 5               ; vector register count raised from 4 to 5: m4 now holds the vpermq index table
+    mova   m4, [shuf1_avx512]                      ; loaded once up front; every macro expansion below reuses m4 (table is 64-byte aligned)
     add    r3,  r3
     lea    r4,  [3 * r3]
     lea    r5,  [3 * r1]
 
-    PROCESS_BLOCKCOPY_SP_64x8_AVX512
+%rep 15
+    PROCESS_BLOCKCOPY_SP_64x4_AVX512               ; stores 4 destination rows; the two lea lines that follow advance r0/r2 by 4 rows
     lea    r0, [r0 + 4 * r1]
     lea    r2, [r2 + 4 * r3]
-    PROCESS_BLOCKCOPY_SP_64x8_AVX512
-    lea    r0, [r0 + 4 * r1]
-    lea    r2, [r2 + 4 * r3]
-    PROCESS_BLOCKCOPY_SP_64x8_AVX512
-    lea    r0, [r0 + 4 * r1]
-    lea    r2, [r2 + 4 * r3]
-    PROCESS_BLOCKCOPY_SP_64x8_AVX512
-    lea    r0, [r0 + 4 * r1]
-    lea    r2, [r2 + 4 * r3]
-    PROCESS_BLOCKCOPY_SP_64x8_AVX512
-    lea    r0, [r0 + 4 * r1]
-    lea    r2, [r2 + 4 * r3]
-    PROCESS_BLOCKCOPY_SP_64x8_AVX512
-    lea    r0, [r0 + 4 * r1]
-    lea    r2, [r2 + 4 * r3]
-    PROCESS_BLOCKCOPY_SP_64x8_AVX512
-    lea    r0, [r0 + 4 * r1]
-    lea    r2, [r2 + 4 * r3]
-    PROCESS_BLOCKCOPY_SP_64x8_AVX512
+%endrep
+    PROCESS_BLOCKCOPY_SP_64x4_AVX512               ; 16th and last expansion (15 + 1 = 16, 16 x 4 = 64 rows); no pointer advance before RET
     RET
 
 ;-----------------------------------------------------------------------------


More information about the x265-devel mailing list