[x265] [PATCH 013 of 307] x86: AVX512 blockcopy_ps_64x64

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:11 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499340609 -19800
#      Thu Jul 06 17:00:09 2017 +0530
# Node ID e59a457cfe6c0e2cd4137bf3337a2a2d0a815850
# Parent  f5c54a1c4a550e9c6df6a1ef0a4462fd23c4a530
x86: AVX512 blockcopy_ps_64x64

AVX2 performance over C code:    1.82x
AVX512 performance over C code:  3.51x

diff -r f5c54a1c4a55 -r e59a457cfe6c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Jul 06 16:59:33 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Jul 06 17:00:09 2017 +0530
@@ -3781,6 +3781,7 @@
         p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
         p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
         p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx512);
+        p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx512);
 
     }
 #endif
diff -r f5c54a1c4a55 -r e59a457cfe6c source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Thu Jul 06 16:59:33 2017 +0530
+++ b/source/common/x86/blockcopy8.asm	Thu Jul 06 17:00:09 2017 +0530
@@ -3340,6 +3340,42 @@
     RET
 
 ;-----------------------------------------------------------------------------
+; void blockcopy_ps_64x64(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal blockcopy_ps_64x64, 4, 7, 4       ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride; uses r0-r6 and m0-m3
+    add     r1, r1                        ; double dstStride (presumably int16_t elements -> bytes; confirm with callers)
+    mov     r4d, 64/8                     ; r4d = loop counter: 8 passes, 8 rows per pass (2 reps x 4 rows)
+    lea     r5, [3 * r3]                  ; r5 = 3 * srcStride, offset of the 4th source row in a group
+    lea     r6, [3 * r1]                  ; r6 = 3 * doubled dstStride, offset of the 4th destination row
+.loop:
+%rep 2
+    pmovzxbw      m0, [r2]                ; row 0, first half: zero-extend source bytes to 16-bit words
+    pmovzxbw      m1, [r2 + 32]           ; row 0, second half (source offset 32 bytes)
+    pmovzxbw      m2, [r2 + r3]           ; row 1, first half
+    pmovzxbw      m3, [r2 + r3 + 32]      ; row 1, second half
+    movu          [r0], m0                ; widened data takes twice the bytes, so halves land 64 bytes apart
+    movu          [r0 + 64], m1
+    movu          [r0 + r1], m2
+    movu          [r0 + r1 + 64], m3
+
+    pmovzxbw      m0, [r2 + r3 * 2]       ; row 2, first half
+    pmovzxbw      m1, [r2 + r3 * 2 + 32]  ; row 2, second half
+    pmovzxbw      m2, [r2 + r5]           ; row 3, first half
+    pmovzxbw      m3, [r2 + r5 + 32]      ; row 3, second half
+    movu          [r0 + r1 * 2], m0
+    movu          [r0 + r1 * 2 + 64], m1
+    movu          [r0 + r6], m2
+    movu          [r0 + r6 + 64], m3
+
+    lea           r0, [r0 + 4 * r1]       ; advance dst by 4 rows
+    lea           r2, [r2 + 4 * r3]       ; advance src by 4 rows
+%endrep
+    dec           r4d                     ; one 8-row pass done
+    jnz           .loop                   ; repeat until all 64 rows are copied
+    RET
+
+;-----------------------------------------------------------------------------
 ; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2


More information about the x265-devel mailing list