[x265] [PATCH 046 of 307] x86:AVX-512 blockfill_s_32x32

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:44 CEST 2018


# HG changeset patch
# User Kalyan Goswami<kalyan at multicorewareinc.com>
# Date 1500980022 -19800
#      Tue Jul 25 16:23:42 2017 +0530
# Node ID 9e1401dcdfc3c9fb633d81b7b39321ac5969a245
# Parent  723c72ffe3eacba3db73eb46332f7cf5c97efa8a
x86:AVX-512 blockfill_s_32x32

Size    | AVX2 performance | AVX512 performance
------------------------------------------------
32x32   |     4.58x        |     9.73x

diff -r 723c72ffe3ea -r 9e1401dcdfc3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jul 25 16:17:13 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jul 25 16:23:42 2017 +0530
@@ -3866,6 +3866,8 @@
         p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512);
         p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512);
 
+        p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_avx512);
+
         p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512);
         p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
         p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
diff -r 723c72ffe3ea -r 9e1401dcdfc3 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Tue Jul 25 16:17:13 2017 +0530
+++ b/source/common/x86/blockcopy8.asm	Tue Jul 25 16:23:42 2017 +0530
@@ -2484,6 +2484,25 @@
 movu       [r0 + r3 + 32], m0
 RET
 
+;--------------------------------------------------------------------
+; void blockfill_s_32x32(int16_t* dst, intptr_t dstride, int16_t val)
+;--------------------------------------------------------------------
+INIT_ZMM avx512                      ; select ZMM (64-byte) vectors: m0 is a ZMM register, xm0 its XMM alias
+cglobal blockfill_s_32x32, 3, 4, 1   ; 3 args (r0 = dst, r1 = dstride, r2 = val), 4 GPRs (r3 is scratch), 1 vector reg
+add          r1, r1                  ; r1 *= 2: byte stride, assuming dstride is given in int16_t elements
+lea          r3, [3 * r1]            ; r3 = 3 * byte stride, used to address the 4th row of each group
+movd         xm0, r2d                ; copy the low dword of val into xm0; only its low word is used below
+vpbroadcastw m0, xm0                 ; replicate that 16-bit word into every word lane of m0 (32 lanes)
+
+%rep 8                               ; fully unrolled: 8 groups x 4 rows = 32 rows, no loop counter needed
+movu       [r0], m0                  ; row 0 of the group: one 64-byte store fills 32 int16_t
+movu       [r0 + r1], m0             ; row 1
+movu       [r0 + 2 * r1], m0         ; row 2
+movu       [r0 + r3], m0             ; row 3
+lea        r0, [r0 + 4 * r1]         ; advance dst by 4 rows (lea leaves flags untouched)
+%endrep
+RET
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
diff -r 723c72ffe3ea -r 9e1401dcdfc3 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h	Tue Jul 25 16:17:13 2017 +0530
+++ b/source/common/x86/blockcopy8.h	Tue Jul 25 16:23:42 2017 +0530
@@ -47,6 +47,7 @@
 
 FUNCDEF_TU(void, blockfill_s, sse2, int16_t* dst, intptr_t dstride, int16_t val);
 FUNCDEF_TU(void, blockfill_s, avx2, int16_t* dst, intptr_t dstride, int16_t val);
+FUNCDEF_TU(void, blockfill_s, avx512, int16_t* dst, intptr_t dstride, int16_t val);
 
 FUNCDEF_CHROMA_PU(void, blockcopy_ss, sse2, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
 FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);


More information about the x265-devel mailing list