[x265] [PATCH 082 of 307] x86: AVX512 blockcopy_sp_32xN
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:20 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1502711388 -19800
# Mon Aug 14 17:19:48 2017 +0530
# Node ID b30539ebe5c9b2d9412d3a39458a90a7574ac744
# Parent 5c18b655a88a739b87c6b071d186a2b9286b8266
x86: AVX512 blockcopy_sp_32xN
Size | AVX2 performance | AVX512 performance
----------------------------------------------
32x32 | 6.77x | 11.27x
i420 32x32 | 6.73x | 11.43x
i422 32x64 | 6.68x | 12.19x
diff -r 5c18b655a88a -r b30539ebe5c9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Aug 14 16:51:52 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Aug 14 17:19:48 2017 +0530
@@ -3948,6 +3948,10 @@
p.pu[LUMA_64x16].copy_pp = PFX(blockcopy_pp_64x16_avx512);
p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512);
+ p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx512);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx512);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_avx512);
+
p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx512);
diff -r 5c18b655a88a -r b30539ebe5c9 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Mon Aug 14 16:51:52 2017 +0530
+++ b/source/common/x86/blockcopy8.asm Mon Aug 14 17:19:48 2017 +0530
@@ -2191,6 +2191,25 @@
movu [r0 + r5], m2
%endmacro
+%macro PROCESS_BLOCKCOPY_SP_32x4_AVX512 0 ; copy one 4-row group: 16-bit words at r2 -> saturated bytes at r0; expects r4 = 3*r3, r5 = 3*r1 and m4 = shuf1_avx512 (set up by BLOCKCOPY_SP_32xN_AVX512); clobbers m0-m3
+ movu m0, [r2] ; row 0 (per the prototype comment in this file r2 = src, r3 = srcStride; r3 is doubled by the caller)
+ movu m1, [r2 + r3] ; row 1
+ movu m2, [r2 + 2 * r3] ; row 2
+ movu m3, [r2 + r4] ; row 3, r4 = 3 * r3
+; packuswb narrows signed words to unsigned bytes with saturation, independently in each 128-bit lane
+ packuswb m0, m1 ; m0 = rows 0 and 1, interleaved per 128-bit lane
+ packuswb m2, m3 ; m2 = rows 2 and 3, interleaved per 128-bit lane
+ vpermq m0, m4, m0 ; reorder qwords using the indices in m4; presumably regroups each row into one 256-bit half (shuf1_avx512 is defined outside this view -- confirm)
+ vpermq m2, m4, m2 ; same reordering for rows 2 and 3
+ movu [r0], ym0 ; low 256 bits of m0 -> destination row 0
+ vextracti32x8 [r0 + r1], m0, 1 ; high 256 bits of m0 -> destination row 1 (r1 = dstStride per the prototype comment)
+ movu [r0 + 2 * r1], ym2 ; low 256 bits of m2 -> destination row 2
+ vextracti32x8 [r0 + r5], m2, 1 ; high 256 bits of m2 -> destination row 3, r5 = 3 * r1
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
INIT_ZMM avx512
cglobal blockcopy_sp_64x64, 4, 6, 5
mova m4, [shuf1_avx512]
@@ -2206,6 +2225,26 @@
PROCESS_BLOCKCOPY_SP_64x4_AVX512
RET
+%macro BLOCKCOPY_SP_32xN_AVX512 1 ; %1 = block height in rows (used as a multiple of 4); defines blockcopy_sp_32x%1, fully unrolled
+INIT_ZMM avx512
+cglobal blockcopy_sp_32x%1, 4, 6, 5 ; body uses r0-r5 and m0-m4; operands presumably follow x86inc's cglobal order (args, GPRs, vector regs) -- confirm against x86inc.asm
+ mova m4, [shuf1_avx512] ; qword index vector consumed by vpermq in PROCESS_BLOCKCOPY_SP_32x4_AVX512
+ add r3, r3 ; r3 *= 2; src is int16_t per the prototype comment, so presumably converts srcStride from elements to bytes
+ lea r4, [3 * r3] ; r4 = 3 * doubled r3: offset of the 4th source row in a group
+ lea r5, [3 * r1] ; r5 = 3 * r1: offset of the 4th destination row in a group
+
+%rep %1/4 - 1 ; assembly-time unroll: every 4-row group except the last
+ PROCESS_BLOCKCOPY_SP_32x4_AVX512
+ lea r0, [r0 + 4 * r1] ; advance the destination pointer by 4 rows
+ lea r2, [r2 + 4 * r3] ; advance the source pointer by 4 rows
+%endrep
+ PROCESS_BLOCKCOPY_SP_32x4_AVX512 ; last group, emitted outside %rep so no pointer advance follows it
+ RET
+%endmacro
+
+BLOCKCOPY_SP_32xN_AVX512 32 ; instantiates blockcopy_sp_32x32
+BLOCKCOPY_SP_32xN_AVX512 64 ; instantiates blockcopy_sp_32x64
+
;-----------------------------------------------------------------------------
; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val)
;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list