[x265] [PATCH 013 of 307] x86: AVX512 blockcopy_ps_64x64
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:11 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499340609 -19800
# Thu Jul 06 17:00:09 2017 +0530
# Node ID e59a457cfe6c0e2cd4137bf3337a2a2d0a815850
# Parent f5c54a1c4a550e9c6df6a1ef0a4462fd23c4a530
x86: AVX512 blockcopy_ps_64x64
AVX2 performance over C code: 1.82x
AVX512 performance over C code: 3.51x
diff -r f5c54a1c4a55 -r e59a457cfe6c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jul 06 16:59:33 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Jul 06 17:00:09 2017 +0530
@@ -3781,6 +3781,7 @@
p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx512);
+ p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx512);
}
#endif
diff -r f5c54a1c4a55 -r e59a457cfe6c source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Thu Jul 06 16:59:33 2017 +0530
+++ b/source/common/x86/blockcopy8.asm Thu Jul 06 17:00:09 2017 +0530
@@ -3340,6 +3340,42 @@
RET
;-----------------------------------------------------------------------------
+; void blockcopy_ps_64x64(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal blockcopy_ps_64x64, 4, 7, 4
+ add r1, r1 ; r1 = 2 * dstStride (dst holds int16_t; presumably converts an element stride to bytes)
+ mov r4d, 64/8 ; loop counter: 8 iterations, each covering 8 rows (2 unrolled groups of 4)
+ lea r5, [3 * r3] ; r5 = 3 * srcStride, offset of the 4th source row in a group
+ lea r6, [3 * r1] ; r6 = 3 * doubled dstStride, offset of the 4th destination row in a group
+.loop:
+%rep 2
+ pmovzxbw m0, [r2] ; row 0: zero-extend source bytes 0-31 to words
+ pmovzxbw m1, [r2 + 32] ; row 0: zero-extend source bytes 32-63 to words
+ pmovzxbw m2, [r2 + r3] ; row 1, bytes 0-31
+ pmovzxbw m3, [r2 + r3 + 32] ; row 1, bytes 32-63
+ movu [r0], m0 ; row 0: first 64 bytes of output (words 0-31)
+ movu [r0 + 64], m1 ; row 0: second 64 bytes of output (words 32-63)
+ movu [r0 + r1], m2 ; row 1, words 0-31
+ movu [r0 + r1 + 64], m3 ; row 1, words 32-63
+ ; rows 2 and 3 of the group follow the same load/widen/store pattern
+ pmovzxbw m0, [r2 + r3 * 2] ; row 2, bytes 0-31
+ pmovzxbw m1, [r2 + r3 * 2 + 32] ; row 2, bytes 32-63
+ pmovzxbw m2, [r2 + r5] ; row 3, bytes 0-31
+ pmovzxbw m3, [r2 + r5 + 32] ; row 3, bytes 32-63
+ movu [r0 + r1 * 2], m0 ; row 2, words 0-31
+ movu [r0 + r1 * 2 + 64], m1 ; row 2, words 32-63
+ movu [r0 + r6], m2 ; row 3, words 0-31
+ movu [r0 + r6 + 64], m3 ; row 3, words 32-63
+ ; step both pointers past the four rows just copied
+ lea r0, [r0 + 4 * r1] ; dst += 4 rows
+ lea r2, [r2 + 4 * r3] ; src += 4 rows
+%endrep
+ dec r4d ; one fewer 8-row iteration remaining
+ jnz .loop ; repeat until all 64 rows are copied
+ RET
+
+;-----------------------------------------------------------------------------
; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
More information about the x265-devel
mailing list