[x265] [PATCH 095 of 307] x86: AVX512 copy_pp_32xN
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:33 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1503462961 -19800
# Wed Aug 23 10:06:01 2017 +0530
# Node ID 31a180bcef33fae436ad7e3aa4378b283a86d56a
# Parent 7868f1cb521d554dc77d768ec1f838e0b29824e4
x86: AVX512 copy_pp_32xN
Size | AVX2 performance | AVX512 performance
----------------------------------------------
32x16 | 1.63x | 2.58x
32x24 | 2.51x | 2.87x
32x32 | 2.48x | 2.95x
32x64 | 2.03x | 2.53x
This patch also cleans up the code for 64xN.
diff -r 7868f1cb521d -r 31a180bcef33 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Aug 22 13:51:33 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Aug 23 10:06:01 2017 +0530
@@ -3965,6 +3965,18 @@
p.pu[LUMA_64x32].copy_pp = PFX(blockcopy_pp_64x32_avx512);
p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_avx512);
p.pu[LUMA_64x16].copy_pp = PFX(blockcopy_pp_64x16_avx512);
+ p.pu[LUMA_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx512);
+ p.pu[LUMA_32x24].copy_pp = PFX(blockcopy_pp_32x24_avx512);
+ p.pu[LUMA_32x32].copy_pp = PFX(blockcopy_pp_32x32_avx512);
+ p.pu[LUMA_32x64].copy_pp = PFX(blockcopy_pp_32x64_avx512);
+
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = PFX(blockcopy_pp_32x24_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = PFX(blockcopy_pp_32x32_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = PFX(blockcopy_pp_32x48_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_avx512);
p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512);
p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx512);
diff -r 7868f1cb521d -r 31a180bcef33 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Tue Aug 22 13:51:33 2017 +0530
+++ b/source/common/x86/blockcopy8.asm Wed Aug 23 10:06:01 2017 +0530
@@ -1107,7 +1107,7 @@
BLOCKCOPY_PP_W64_H4_avx 64, 64
;----------------------------------------------------------------------------------------------
-; Macro to calculate blockcopy_pp_64x4_avx512
+; blockcopy_pp avx512 code start
;----------------------------------------------------------------------------------------------
%macro PROCESS_BLOCKCOPY_PP_64X4_avx512 0
movu m0, [r2]
@@ -1121,16 +1121,28 @@
movu [r0 + r5] , m3
%endmacro
+%macro PROCESS_BLOCKCOPY_PP_32X4_avx512 0 ; copy four 32-byte rows from r2 (stride r3) to r0 (stride r1); r4/r5 are set to 3*r3 / 3*r1 by BLOCKCOPY_PP_W32_H4_avx512
+movu ym0, [r2] ; low 256 bits of m0 <- source row 0
+vinserti32x8 m0, [r2 + r3], 1 ; high 256 bits of m0 <- source row 1
+movu ym1, [r2 + 2 * r3] ; low 256 bits of m1 <- source row 2
+vinserti32x8 m1, [r2 + r4], 1 ; high 256 bits of m1 <- source row 3 (offset r4)
+
+movu [r0] , ym0 ; destination row 0 <- low half of m0
+vextracti32x8 [r0 + r1] , m0, 1 ; destination row 1 <- high half of m0
+movu [r0 + 2 * r1] , ym1 ; destination row 2 <- low half of m1
+vextracti32x8 [r0 + r5] , m1, 1 ; destination row 3 (offset r5) <- high half of m1
+%endmacro
+
;----------------------------------------------------------------------------------------------
; void blockcopy_pp_64x%1(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;----------------------------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W64_H4_avx512 1
INIT_ZMM avx512
-cglobal blockcopy_pp_64x%1, 4, 4, 6
+cglobal blockcopy_pp_64x%1, 4, 6, 4
lea r4, [3 * r3]
lea r5, [3 * r1]
-%rep %1/4 - 1
+%rep %1/4 - 1
PROCESS_BLOCKCOPY_PP_64X4_avx512
lea r2, [r2 + 4 * r3]
lea r0, [r0 + 4 * r1]
@@ -1145,7 +1157,30 @@
BLOCKCOPY_PP_W64_H4_avx512 48
BLOCKCOPY_PP_W64_H4_avx512 64
-
+%macro BLOCKCOPY_PP_W32_H4_avx512 1 ; defines blockcopy_pp_32x%1; %1 = block height, consumed four rows at a time
+INIT_ZMM avx512 ; same setup as BLOCKCOPY_PP_W64_H4_avx512
+cglobal blockcopy_pp_32x%1, 4, 6, 2 ; x86inc cglobal: 4 args, 6 GPRs (r0-r5), 2 vector regs (m0-m1); presumably (dst, dstStride, src, srcStride) like blockcopy_pp_64x%1 - confirm
+ lea r4, [3 * r3] ; r4 = 3 * r3, used as the source offset of the fourth row
+ lea r5, [3 * r1] ; r5 = 3 * r1, used as the destination offset of the fourth row
+
+%rep %1/4 - 1
+ PROCESS_BLOCKCOPY_PP_32X4_avx512 ; copy one group of four rows
+ lea r2, [r2 + 4 * r3] ; advance the r2 (source) pointer by four rows
+ lea r0, [r0 + 4 * r1] ; advance the r0 (destination) pointer by four rows
+%endrep
+ PROCESS_BLOCKCOPY_PP_32X4_avx512 ; final group of four rows; no pointer advance is needed afterwards
+ RET
+%endmacro
+
+BLOCKCOPY_PP_W32_H4_avx512 8 ; blockcopy_pp_32x8 (not referenced by the asm-primitives.cpp hunk shown in this patch)
+BLOCKCOPY_PP_W32_H4_avx512 16 ; blockcopy_pp_32x16
+BLOCKCOPY_PP_W32_H4_avx512 24 ; blockcopy_pp_32x24
+BLOCKCOPY_PP_W32_H4_avx512 32 ; blockcopy_pp_32x32
+BLOCKCOPY_PP_W32_H4_avx512 48 ; blockcopy_pp_32x48
+BLOCKCOPY_PP_W32_H4_avx512 64 ; blockcopy_pp_32x64
+;----------------------------------------------------------------------------------------------
+; blockcopy_pp avx512 code end
+;----------------------------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
More information about the x265-devel
mailing list