[x265] [PATCH 082 of 307] x86: AVX512 blockcopy_sp_32xN

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:31:20 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1502711388 -19800
#      Mon Aug 14 17:19:48 2017 +0530
# Node ID b30539ebe5c9b2d9412d3a39458a90a7574ac744
# Parent  5c18b655a88a739b87c6b071d186a2b9286b8266
x86: AVX512 blockcopy_sp_32xN

Size       | AVX2 performance | AVX512 performance
----------------------------------------------
32x32      |      6.77x       |      11.27x
i420 32x32 |      6.73x       |      11.43x
i422 32x64 |      6.68x       |      12.19x

diff -r 5c18b655a88a -r b30539ebe5c9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Aug 14 16:51:52 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Aug 14 17:19:48 2017 +0530
@@ -3948,6 +3948,10 @@
         p.pu[LUMA_64x16].copy_pp = PFX(blockcopy_pp_64x16_avx512);
 
         p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512);
+        p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx512);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx512);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_avx512);
+
         p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
         p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
         p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx512);
diff -r 5c18b655a88a -r b30539ebe5c9 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Mon Aug 14 16:51:52 2017 +0530
+++ b/source/common/x86/blockcopy8.asm	Mon Aug 14 17:19:48 2017 +0530
@@ -2191,6 +2191,25 @@
     movu               [r0 + r5],      m2
 %endmacro
 
+%macro PROCESS_BLOCKCOPY_SP_32x4_AVX512 0 ; pack 4 source rows (one full vector register each) at r2 down to bytes and store 4 half-width rows at r0
+    movu               m0,             [r2]                ; m0 = source row 0
+    movu               m1,             [r2 + r3]           ; m1 = source row 1 (r3 is used as the source row offset)
+    movu               m2,             [r2 + 2 * r3]       ; m2 = source row 2
+    movu               m3,             [r2 + r4]           ; m3 = source row 3 (expects the caller to have set r4 = 3 * r3)
+
+    packuswb           m0,             m1                  ; words -> unsigned-saturated bytes; rows 0 and 1 end up interleaved per 128-bit lane
+    packuswb           m2,             m3                  ; same for rows 2 and 3
+    vpermq             m0,             m4,         m0      ; reorder qwords using the indices in m4 (loaded by the caller) - presumably row 0 -> low half, row 1 -> high half
+    vpermq             m2,             m4,         m2      ; same reordering for the rows 2/3 pair
+    movu               [r0],           ym0                 ; dst row 0 = low 256 bits of m0
+    vextracti32x8      [r0 + r1],      m0,         1       ; dst row 1 = high 256 bits of m0
+    movu               [r0 + 2 * r1],  ym2                 ; dst row 2 = low 256 bits of m2
+    vextracti32x8      [r0 + r5],      m2,         1       ; dst row 3 = high 256 bits of m2 (expects the caller to have set r5 = 3 * r1)
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
 INIT_ZMM avx512
 cglobal blockcopy_sp_64x64, 4, 6, 5
     mova   m4, [shuf1_avx512]
@@ -2206,6 +2225,26 @@
     PROCESS_BLOCKCOPY_SP_64x4_AVX512
     RET
 
+%macro BLOCKCOPY_SP_32xN_AVX512 1 ; %1 = block height in rows; emits the function blockcopy_sp_32x%1
+INIT_ZMM avx512
+cglobal blockcopy_sp_32x%1, 4, 6, 5 ; per the prototype comment earlier in this file: r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
+    mova   m4, [shuf1_avx512] ; qword index vector consumed by vpermq in the row macro (table is defined outside this view)
+    add    r3,  r3 ; double srcStride - source is int16_t, so presumably converting elements to bytes
+    lea    r4,  [3 * r3] ; r4 = 3 * source row offset, addresses row 3 of each group
+    lea    r5,  [3 * r1] ; r5 = 3 * dstStride, addresses destination row 3 of each group
+
+%rep %1/4 - 1
+    PROCESS_BLOCKCOPY_SP_32x4_AVX512 ; 4 rows per expansion; the %rep unrolls every 4-row group except the last
+    lea    r0, [r0 + 4 * r1] ; advance dst by 4 rows
+    lea    r2, [r2 + 4 * r3] ; advance src by 4 rows
+%endrep
+    PROCESS_BLOCKCOPY_SP_32x4_AVX512 ; final 4-row group; pointers are not advanced afterwards
+    RET
+%endmacro
+
+BLOCKCOPY_SP_32xN_AVX512 32
+BLOCKCOPY_SP_32xN_AVX512 64
+
 ;-----------------------------------------------------------------------------
 ; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val)
 ;-----------------------------------------------------------------------------


More information about the x265-devel mailing list