[x265] [PATCH 011 of 307] x86: AVX512 blockcopy_sp_64x64

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:09 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499320932 -19800
#      Thu Jul 06 11:32:12 2017 +0530
# Node ID 1321369efdf990d960db9a6fbe0181f086ba90f9
# Parent  328d10aa0ff4d3097ff4941e224d2cdf6774a7c8
x86: AVX512 blockcopy_sp_64x64

AVX2 performance over C code   : 6.77x
AVX512 performance over C code : 8.46x

diff -r 328d10aa0ff4 -r 1321369efdf9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Jul 06 17:03:15 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Jul 06 11:32:12 2017 +0530
@@ -3777,6 +3777,8 @@
         p.pu[LUMA_16x8].sad = PFX(pixel_sad_16x8_avx512);
         p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx512);
 
+        p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512);
+
     }
 #endif
 }
diff -r 328d10aa0ff4 -r 1321369efdf9 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Thu Jul 06 17:03:15 2017 +0530
+++ b/source/common/x86/blockcopy8.asm	Thu Jul 06 11:32:12 2017 +0530
@@ -2121,6 +2121,53 @@
 
 BLOCKCOPY_SP_W64_H4_avx2 64, 64
 
+%macro BLOCKCOPY_SP_W64_H4_avx512 2 ; %1 = width, %2 = height (used in the symbol name and row count); body is hard-wired to 64 columns, 4 rows per loop pass
+INIT_ZMM avx512 ; x86inc: mN below are 512-bit zmm registers, symbol gets the _avx512 suffix
+cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride ; x86inc: 4 args, 7 GPRs, 4 vector regs; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
+    mov    r4d, %2/4 ; r4d = loop counter, one pass per 4 rows
+    add    r3,  r3 ; srcStride *= 2: byte stride for 16-bit source samples (assumes the caller passes it in int16_t elements)
+    lea    r5,  [3 * r3] ; r5 = 3 * source byte stride (offset of row 3)
+    lea    r6,  [3 * r1] ; r6 = 3 * dstStride (offset of row 3); not doubled, so dst is addressed as 1-byte samples
+
+.loop:
+    movu               m0,             [r2] ; row 0, words 0..31
+    movu               m1,             [r2 + 64] ; row 0, words 32..63
+    movu               m2,             [r2 + r3] ; row 1, words 0..31
+    movu               m3,             [r2 + r3 + 64] ; row 1, words 32..63
+
+    packuswb           m0,             m1 ; words -> unsigned-saturated bytes, packed per 128-bit lane: qwords = A0 B0 A1 B1 A2 B2 A3 B3 (A from m0, B from m1)
+    packuswb           m2,             m3 ; same for row 1
+    vpermq             m0,             m0,                 11011000b ; qword order 0,2,1,3 inside each 256-bit half -> A0 A1 B0 B1 | A2 A3 B2 B3
+    vpermq             m2,             m2,                 11011000b
+    vshufi64x2         m0,             m0,                 11011000b ; 128-bit lane order 0,2,1,3 -> A0 A1 A2 A3 B0 B1 B2 B3 (bytes 0..63 in order)
+    vshufi64x2         m2,             m2,                 11011000b
+    movu               [r0],           m0 ; store 64 bytes of row 0
+    movu               [r0 + r1],      m2 ; store 64 bytes of row 1
+
+    movu               m0,             [r2 + 2 * r3] ; row 2, words 0..31
+    movu               m1,             [r2 + 2 * r3 + 64] ; row 2, words 32..63
+    movu               m2,             [r2 + r5] ; row 3, words 0..31
+    movu               m3,             [r2 + r5 + 64] ; row 3, words 32..63
+
+    packuswb           m0,             m1 ; same pack + two-step reorder as rows 0/1
+    packuswb           m2,             m3
+    vpermq             m0,             m0,                 11011000b
+    vpermq             m2,             m2,                 11011000b
+    vshufi64x2         m0,             m0,                 11011000b
+    vshufi64x2         m2,             m2,                 11011000b
+    movu               [r0 + 2 * r1],  m0 ; store 64 bytes of row 2
+    movu               [r0 + r6],      m2 ; store 64 bytes of row 3
+
+    lea    r0, [r0 + 4 * r1] ; dst += 4 rows
+    lea    r2, [r2 + 4 * r3] ; src += 4 rows (r3 is already a byte stride)
+
+    dec    r4d ; one 4-row group done
+    jnz    .loop ; repeat until all %2/4 groups are copied
+    RET ; x86inc return macro
+%endmacro
+
+BLOCKCOPY_SP_W64_H4_avx512 64, 64 ; instantiate blockcopy_sp_64x64 for avx512
+
 ;-----------------------------------------------------------------------------
 ; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val)
 ;-----------------------------------------------------------------------------
diff -r 328d10aa0ff4 -r 1321369efdf9 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h	Thu Jul 06 17:03:15 2017 +0530
+++ b/source/common/x86/blockcopy8.h	Thu Jul 06 11:32:12 2017 +0530
@@ -57,6 +57,7 @@
 FUNCDEF_PU(void, blockcopy_sp, sse2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
 FUNCDEF_PU(void, blockcopy_sp, sse4, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
 FUNCDEF_PU(void, blockcopy_sp, avx2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+FUNCDEF_PU(void, blockcopy_sp, avx512, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
 FUNCDEF_PU(void, blockcopy_ps, sse2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 FUNCDEF_PU(void, blockcopy_ps, sse4, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 FUNCDEF_PU(void, blockcopy_ps, avx2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);


More information about the x265-devel mailing list