[x265] [PATCH 012 of 307] x86: AVX512 blockcopy_ps_32xN
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:10 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499340573 -19800
# Thu Jul 06 16:59:33 2017 +0530
# Node ID f5c54a1c4a550e9c6df6a1ef0a4462fd23c4a530
# Parent 1321369efdf990d960db9a6fbe0181f086ba90f9
x86: AVX512 blockcopy_ps_32xN
AVX2 performance over C code: 2.39x
AVX512 performance over C code: 3.62x
diff -r 1321369efdf9 -r f5c54a1c4a55 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jul 06 11:32:12 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Jul 06 16:59:33 2017 +0530
@@ -3778,6 +3778,9 @@
p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx512);
p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512);
+ p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
+ p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
+ p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx512);
}
#endif
diff -r 1321369efdf9 -r f5c54a1c4a55 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Thu Jul 06 11:32:12 2017 +0530
+++ b/source/common/x86/blockcopy8.asm Thu Jul 06 16:59:33 2017 +0530
@@ -3124,6 +3124,36 @@
BLOCKCOPY_PS_W32_H4_avx2 32, 32
BLOCKCOPY_PS_W32_H4_avx2 32, 64
+%macro BLOCKCOPY_PS_W32_H4_avx512 2 ; %1, %2 form the function name; %2 is also the row count (presumably %1 = width, %2 = height)
+INIT_ZMM avx512 ; x86inc: m0..m3 below refer to 512-bit ZMM registers
+cglobal blockcopy_ps_%1x%2, 4, 7, 4 ; 4 args, 7 GPRs, 4 vector regs; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
+ add r1, r1 ; double dstStride: dst elements are int16_t (2 bytes each)
+ mov r4d, %2/8 ; loop counter: each pass below handles 8 rows (2 reps x 4 rows)
+ lea r5, [3 * r3] ; r5 = 3 * srcStride, offset of the 4th source row
+ lea r6, [3 * r1] ; r6 = 3 * (doubled) dstStride, offset of the 4th destination row
+.loop:
+%rep 2 ; unroll: two groups of 4 rows per loop pass
+ pmovzxbw m0, [r2] ; row 0: load source bytes, zero-extend each to a 16-bit word
+ pmovzxbw m1, [r2 + r3] ; row 1
+ pmovzxbw m2, [r2 + r3 * 2] ; row 2
+ pmovzxbw m3, [r2 + r5] ; row 3
+
+ movu [r0], m0 ; store widened row 0 (unaligned store)
+ movu [r0 + r1], m1 ; row 1
+ movu [r0 + r1 * 2], m2 ; row 2
+ movu [r0 + r6], m3 ; row 3
+
+ lea r0, [r0 + 4 * r1] ; advance dst by 4 rows
+ lea r2, [r2 + 4 * r3] ; advance src by 4 rows
+%endrep
+ dec r4d ; one 8-row pass done
+ jnz .loop ; repeat until all %2 rows are copied
+ RET
+%endmacro
+
+BLOCKCOPY_PS_W32_H4_avx512 32, 32
+BLOCKCOPY_PS_W32_H4_avx512 32, 64
+
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
diff -r 1321369efdf9 -r f5c54a1c4a55 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Thu Jul 06 11:32:12 2017 +0530
+++ b/source/common/x86/blockcopy8.h Thu Jul 06 16:59:33 2017 +0530
@@ -61,5 +61,6 @@
FUNCDEF_PU(void, blockcopy_ps, sse2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_ps, sse4, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_ps, avx2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+FUNCDEF_PU(void, blockcopy_ps, avx512, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
#endif // ifndef X265_I386_PIXEL_H
More information about the x265-devel
mailing list