[x265] [PATCH 095 of 307] x86: AVX512 copy_pp_32xN

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:31:33 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1503462961 -19800
#      Wed Aug 23 10:06:01 2017 +0530
# Node ID 31a180bcef33fae436ad7e3aa4378b283a86d56a
# Parent  7868f1cb521d554dc77d768ec1f838e0b29824e4
x86: AVX512 copy_pp_32xN

Size  |  AVX2 performance | AVX512 performance
----------------------------------------------
32x16 |      1.63x       |      2.58x
32x24 |      2.51x       |      2.87x
32x32 |      2.48x       |      2.95x
32x64 |      2.03x       |      2.53x

This patch also cleans up the code for 64xN

diff -r 7868f1cb521d -r 31a180bcef33 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Aug 22 13:51:33 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Aug 23 10:06:01 2017 +0530
@@ -3965,6 +3965,18 @@
         p.pu[LUMA_64x32].copy_pp = PFX(blockcopy_pp_64x32_avx512);
         p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_avx512);
         p.pu[LUMA_64x16].copy_pp = PFX(blockcopy_pp_64x16_avx512);
+        p.pu[LUMA_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx512);
+        p.pu[LUMA_32x24].copy_pp = PFX(blockcopy_pp_32x24_avx512);
+        p.pu[LUMA_32x32].copy_pp  = PFX(blockcopy_pp_32x32_avx512);
+        p.pu[LUMA_32x64].copy_pp = PFX(blockcopy_pp_32x64_avx512);
+
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = PFX(blockcopy_pp_32x24_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = PFX(blockcopy_pp_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = PFX(blockcopy_pp_32x48_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_avx512);
 
         p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512);
         p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx512);
diff -r 7868f1cb521d -r 31a180bcef33 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Tue Aug 22 13:51:33 2017 +0530
+++ b/source/common/x86/blockcopy8.asm	Wed Aug 23 10:06:01 2017 +0530
@@ -1107,7 +1107,7 @@
 BLOCKCOPY_PP_W64_H4_avx 64, 64
 
 ;----------------------------------------------------------------------------------------------
-; Macro to calculate blockcopy_pp_64x4_avx512
+; blockcopy_pp avx512 code start
 ;----------------------------------------------------------------------------------------------
 %macro PROCESS_BLOCKCOPY_PP_64X4_avx512 0
 movu    m0, [r2]
@@ -1121,16 +1121,28 @@
 movu    [r0 + r5] , m3
 %endmacro
 
+%macro PROCESS_BLOCKCOPY_PP_32X4_avx512 0 ; copy 4 rows of 32 bytes from [r2] to [r0]; expects r4 = 3 * r3 and r5 = 3 * r1
+movu           ym0, [r2]                  ; row 0 -> low 256 bits of m0
+vinserti32x8   m0,  [r2 + r3],     1      ; row 1 -> high 256 bits of m0
+movu           ym1, [r2 + 2 * r3]         ; row 2 -> low 256 bits of m1
+vinserti32x8   m1,  [r2 + r4],     1      ; row 3 -> high 256 bits of m1 (r4 is used as the 3-row source offset)
+
+movu           [r0] ,              ym0    ; store row 0
+vextracti32x8  [r0 + r1] ,         m0,    1 ; store row 1 from the high half of m0
+movu           [r0 + 2 * r1]  ,    ym1    ; store row 2
+vextracti32x8  [r0 + r5] ,         m1,    1 ; store row 3 from the high half of m1 (r5 is used as the 3-row destination offset)
+%endmacro
+
 ;----------------------------------------------------------------------------------------------
 ; void blockcopy_pp_64x%1(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;----------------------------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W64_H4_avx512 1
 INIT_ZMM avx512
-cglobal blockcopy_pp_64x%1, 4, 4, 6
+cglobal blockcopy_pp_64x%1, 4, 6, 4
 lea    r4,  [3 * r3]
 lea    r5,  [3 * r1]
 
-%rep %1/4 - 1         
+%rep %1/4 - 1
 PROCESS_BLOCKCOPY_PP_64X4_avx512
 lea     r2, [r2 + 4 * r3]
 lea     r0, [r0 + 4 * r1] 
@@ -1145,7 +1157,30 @@
 BLOCKCOPY_PP_W64_H4_avx512 48
 BLOCKCOPY_PP_W64_H4_avx512 64
 
-
+%macro BLOCKCOPY_PP_W32_H4_avx512 1 ; %1 = block height in rows; emits blockcopy_pp_32x%1, fully unrolled in groups of 4 rows
+INIT_ZMM avx512
+cglobal blockcopy_pp_32x%1, 4, 6, 2
+    lea    r4,  [3 * r3]                   ; r4 = 3 * r3, offset of the 4th row relative to r2
+    lea    r5,  [3 * r1]                   ; r5 = 3 * r1, offset of the 4th row relative to r0
+
+%rep %1/4 - 1
+    PROCESS_BLOCKCOPY_PP_32X4_avx512
+    lea     r2, [r2 + 4 * r3]              ; advance the load pointer by 4 rows
+    lea     r0, [r0 + 4 * r1] ; advance the store pointer by 4 rows
+%endrep
+    PROCESS_BLOCKCOPY_PP_32X4_avx512
+    RET                                    ; the final 4-row group above is emitted without pointer updates
+%endmacro
+
+BLOCKCOPY_PP_W32_H4_avx512 8
+BLOCKCOPY_PP_W32_H4_avx512 16
+BLOCKCOPY_PP_W32_H4_avx512 24
+BLOCKCOPY_PP_W32_H4_avx512 32
+BLOCKCOPY_PP_W32_H4_avx512 48
+BLOCKCOPY_PP_W32_H4_avx512 64
+;----------------------------------------------------------------------------------------------
+; blockcopy_pp avx512 code end
+;----------------------------------------------------------------------------------------------
 
 ;-----------------------------------------------------------------------------
 ; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)


More information about the x265-devel mailing list