[x265] [PATCH 046 of 307] x86:AVX-512 blockfill_s_32x32
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:44 CEST 2018
# HG changeset patch
# User Kalyan Goswami<kalyan at multicorewareinc.com>
# Date 1500980022 -19800
# Tue Jul 25 16:23:42 2017 +0530
# Node ID 9e1401dcdfc3c9fb633d81b7b39321ac5969a245
# Parent 723c72ffe3eacba3db73eb46332f7cf5c97efa8a
x86:AVX-512 blockfill_s_32x32
Size | AVX2 performance | AVX512 performance
------------------------------------------------
32x32 | 4.58x | 9.73x
diff -r 723c72ffe3ea -r 9e1401dcdfc3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jul 25 16:17:13 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jul 25 16:23:42 2017 +0530
@@ -3866,6 +3866,8 @@
p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512);
p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512);
+ p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_avx512);
+
p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512);
p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
diff -r 723c72ffe3ea -r 9e1401dcdfc3 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Tue Jul 25 16:17:13 2017 +0530
+++ b/source/common/x86/blockcopy8.asm Tue Jul 25 16:23:42 2017 +0530
@@ -2484,6 +2484,25 @@
movu [r0 + r3 + 32], m0
RET
+;--------------------------------------------------------------------
; void blockfill_s_32x32(int16_t* dst, intptr_t dstride, int16_t val)
;--------------------------------------------------------------------
INIT_ZMM avx512                     ; x86inc: m0 below names a 512-bit ZMM register (64 bytes = 32 int16_t)
cglobal blockfill_s_32x32, 3, 4, 1  ; fill a 32x32 int16_t block with val; 3 args (r0=dst, r1=dstride, r2=val), 4 GPRs, 1 vector reg
add r1, r1                          ; r1 = dstride * 2: stride from int16_t elements to bytes
lea r3, [3 * r1]                    ; r3 = 3 * byte stride, used to address the 4th row of each group
movd xm0, r2d                       ; low 32 bits of val -> xm0 (only the low word is used next)
vpbroadcastw m0, xm0                ; replicate that 16-bit word into every word lane of m0 (one full 32-element row)
+
%rep 8                              ; assemble-time unroll: 8 groups x 4 rows = 32 rows, no runtime loop
movu [r0], m0                       ; row 0 of the group (unaligned 64-byte store)
movu [r0 + r1], m0                  ; row 1
movu [r0 + 2 * r1], m0              ; row 2
movu [r0 + r3], m0                  ; row 3
lea r0, [r0 + 4 * r1]               ; advance dst by 4 rows; lea leaves flags untouched
%endrep
RET                                 ; x86inc return macro (emits the epilogue matching cglobal)
+
;-----------------------------------------------------------------------------
; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
diff -r 723c72ffe3ea -r 9e1401dcdfc3 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Tue Jul 25 16:17:13 2017 +0530
+++ b/source/common/x86/blockcopy8.h Tue Jul 25 16:23:42 2017 +0530
@@ -47,6 +47,7 @@
FUNCDEF_TU(void, blockfill_s, sse2, int16_t* dst, intptr_t dstride, int16_t val);
FUNCDEF_TU(void, blockfill_s, avx2, int16_t* dst, intptr_t dstride, int16_t val);
+FUNCDEF_TU(void, blockfill_s, avx512, int16_t* dst, intptr_t dstride, int16_t val);
FUNCDEF_CHROMA_PU(void, blockcopy_ss, sse2, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
More information about the x265-devel
mailing list