[x265] [PATCH 012 of 307] x86: AVX512 blockcopy_ps_32xN

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:10 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499340573 -19800
#      Thu Jul 06 16:59:33 2017 +0530
# Node ID f5c54a1c4a550e9c6df6a1ef0a4462fd23c4a530
# Parent  1321369efdf990d960db9a6fbe0181f086ba90f9
x86: AVX512 blockcopy_ps_32xN

AVX2 performance over C code:    2.39x
AVX512 performance over C code:  3.62x

diff -r 1321369efdf9 -r f5c54a1c4a55 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Jul 06 11:32:12 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Jul 06 16:59:33 2017 +0530
@@ -3778,6 +3778,9 @@
         p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx512);
 
         p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512);
+        p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
+        p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
+        p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx512);
 
     }
 #endif
diff -r 1321369efdf9 -r f5c54a1c4a55 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Thu Jul 06 11:32:12 2017 +0530
+++ b/source/common/x86/blockcopy8.asm	Thu Jul 06 16:59:33 2017 +0530
@@ -3124,6 +3124,36 @@
 BLOCKCOPY_PS_W32_H4_avx2 32, 32
 BLOCKCOPY_PS_W32_H4_avx2 32, 64
 
+%macro BLOCKCOPY_PS_W32_H4_avx512 2       ; params: width and height (form the function name; height sets the loop count)
+INIT_ZMM avx512                           ; m0-m3 below are ZMM registers
+cglobal blockcopy_ps_%1x%2, 4, 7, 4       ; 4 args, 7 GPRs, 4 vector regs; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
+    add     r1, r1                        ; r1 = dst stride in bytes (dst is int16_t, 2 bytes per element)
+    mov     r4d, %2/8                     ; r4d = loop passes; each pass copies 8 rows
+    lea     r5, [3 * r3]                  ; r5 = 3 * srcStride, offset of the 4th source row
+    lea     r6, [3 * r1]                  ; r6 = 3 * dst byte stride, offset of the 4th dst row
+.loop:
+%rep 2                                    ; body emitted twice: 2 groups of 4 rows per pass
+    pmovzxbw      m0, [r2]                ; row 0: zero-extend source bytes to 16-bit words
+    pmovzxbw      m1, [r2 + r3]           ; row 1
+    pmovzxbw      m2, [r2 + r3 * 2]       ; row 2
+    pmovzxbw      m3, [r2 + r5]           ; row 3
+
+    movu          [r0], m0                ; unaligned stores of the widened rows to dst
+    movu          [r0 + r1], m1
+    movu          [r0 + r1 * 2], m2
+    movu          [r0 + r6], m3
+
+    lea           r0, [r0 + 4 * r1]       ; advance dst by 4 rows
+    lea           r2, [r2 + 4 * r3]       ; advance src by 4 rows
+%endrep
+    dec           r4d                     ; one pass (8 rows) done
+    jnz           .loop                   ; repeat until all rows are copied
+    RET
+%endmacro
+
+BLOCKCOPY_PS_W32_H4_avx512 32, 32         ; emits blockcopy_ps_32x32 (4 loop passes)
+BLOCKCOPY_PS_W32_H4_avx512 32, 64         ; emits blockcopy_ps_32x64 (8 loop passes)
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
diff -r 1321369efdf9 -r f5c54a1c4a55 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h	Thu Jul 06 11:32:12 2017 +0530
+++ b/source/common/x86/blockcopy8.h	Thu Jul 06 16:59:33 2017 +0530
@@ -61,5 +61,6 @@
 FUNCDEF_PU(void, blockcopy_ps, sse2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 FUNCDEF_PU(void, blockcopy_ps, sse4, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 FUNCDEF_PU(void, blockcopy_ps, avx2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+FUNCDEF_PU(void, blockcopy_ps, avx512, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 
 #endif // ifndef X265_I386_PIXEL_H


More information about the x265-devel mailing list