[x265] [PATCH 045 of 307] x86:AVX-512 blockcopy_pp_64xN
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:43 CEST 2018
# HG changeset patch
# User Kalyan Goswami<kalyan at multicorewareinc.com>
# Date 1500979633 -19800
# Tue Jul 25 16:17:13 2017 +0530
# Node ID 723c72ffe3eacba3db73eb46332f7cf5c97efa8a
# Parent 4978d583e2e82aec1f09d94ecdf52191eac7ceb5
x86:AVX-512 blockcopy_pp_64xN
Size | AVX2 performance | AVX512 performance
------------------------------------------------
64x64 | 1.54x | 3.22x
64x48 | 1.74x | 3.29x
64x32 | 1.65x | 3.96x
64x16 | 1.69x | 3.79x
diff -r 4978d583e2e8 -r 723c72ffe3ea source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jul 25 12:03:26 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jul 25 16:17:13 2017 +0530
@@ -3848,6 +3848,11 @@
p.pu[LUMA_16x8].sad = PFX(pixel_sad_16x8_avx512);
p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx512);
+ p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_avx512);
+ p.pu[LUMA_64x32].copy_pp = PFX(blockcopy_pp_64x32_avx512);
+ p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_avx512);
+ p.pu[LUMA_64x16].copy_pp = PFX(blockcopy_pp_64x16_avx512);
+
p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512);
p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
diff -r 4978d583e2e8 -r 723c72ffe3ea source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Tue Jul 25 12:03:26 2017 +0530
+++ b/source/common/x86/blockcopy8.asm Tue Jul 25 16:17:13 2017 +0530
@@ -1103,6 +1103,47 @@
BLOCKCOPY_PP_W64_H4_avx 64, 48
BLOCKCOPY_PP_W64_H4_avx 64, 64
+;----------------------------------------------------------------------------------------------
+; Macro to copy four rows (one vector register per row) from [r2] to [r0]; expects r4 = 3 * r3, r5 = 3 * r1
+;----------------------------------------------------------------------------------------------
+%macro PROCESS_BLOCKCOPY_PP_64X4_avx512 0
+movu m0, [r2]                  ; source row 0
+movu m1, [r2 + r3]             ; source row 1 (r3 = source stride)
+movu m2, [r2 + 2 * r3]         ; source row 2
+movu m3, [r2 + r4]             ; source row 3 (r4 must already hold 3 * r3)
+
+movu [r0] , m0                 ; destination row 0
+movu [r0 + r1] , m1            ; destination row 1 (r1 = destination stride)
+movu [r0 + 2 * r1] , m2        ; destination row 2
+movu [r0 + r5] , m3            ; destination row 3 (r5 must already hold 3 * r1)
+%endmacro
+
+;----------------------------------------------------------------------------------------------
+; void blockcopy_pp_64x%1(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
+;----------------------------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W64_H4_avx512 1
+INIT_ZMM avx512
+cglobal blockcopy_pp_64x%1, 4, 6, 4    ; 4 args, 6 GPRs (r0-r5), 4 vector regs (m0-m3)
+lea r4, [3 * r3]                       ; r4 = 3 * srcStride, offset of the 4th source row
+lea r5, [3 * r1]                       ; r5 = 3 * dstStride, offset of the 4th destination row
+
+%rep %1/4 - 1                          ; %1 = block height; unrolled in groups of 4 rows
+PROCESS_BLOCKCOPY_PP_64X4_avx512
+lea r2, [r2 + 4 * r3]                  ; advance src by 4 rows
+lea r0, [r0 + 4 * r1]                  ; advance dst by 4 rows
+%endrep
+
+PROCESS_BLOCKCOPY_PP_64X4_avx512       ; last 4 rows; no pointer advance needed afterwards
+RET
+%endmacro
+
+BLOCKCOPY_PP_W64_H4_avx512 16
+BLOCKCOPY_PP_W64_H4_avx512 32
+BLOCKCOPY_PP_W64_H4_avx512 48
+BLOCKCOPY_PP_W64_H4_avx512 64
+
+
+
;-----------------------------------------------------------------------------
; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
diff -r 4978d583e2e8 -r 723c72ffe3ea source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Tue Jul 25 12:03:26 2017 +0530
+++ b/source/common/x86/blockcopy8.h Tue Jul 25 16:17:13 2017 +0530
@@ -54,6 +54,7 @@
FUNCDEF_CHROMA_PU(void, blockcopy_pp, sse2, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
FUNCDEF_CHROMA_PU(void, blockcopy_pp, avx, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+FUNCDEF_CHROMA_PU(void, blockcopy_pp, avx512, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_sp, sse2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_sp, sse4, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
More information about the x265-devel
mailing list