[x265] [PATCH 045 of 307] x86:AVX-512 blockcopy_pp_64xN

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:43 CEST 2018


# HG changeset patch
# User Kalyan Goswami<kalyan at multicorewareinc.com>
# Date 1500979633 -19800
#      Tue Jul 25 16:17:13 2017 +0530
# Node ID 723c72ffe3eacba3db73eb46332f7cf5c97efa8a
# Parent  4978d583e2e82aec1f09d94ecdf52191eac7ceb5
x86:AVX-512 blockcopy_pp_64xN

Size    | AVX2 performance | AVX512 performance
-----------------------------------------------
64x64   |      1.54x       |       3.22x
64x48   |      1.74x       |       3.29x
64x32   |      1.65x       |       3.96x
64x16   |      1.69x       |       3.79x

diff -r 4978d583e2e8 -r 723c72ffe3ea source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jul 25 12:03:26 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jul 25 16:17:13 2017 +0530
@@ -3848,6 +3848,11 @@
         p.pu[LUMA_16x8].sad = PFX(pixel_sad_16x8_avx512);
         p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx512);
 
+        p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_avx512);
+        p.pu[LUMA_64x32].copy_pp = PFX(blockcopy_pp_64x32_avx512);
+        p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_avx512);
+        p.pu[LUMA_64x16].copy_pp = PFX(blockcopy_pp_64x16_avx512);
+
         p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512);
         p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
         p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
diff -r 4978d583e2e8 -r 723c72ffe3ea source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Tue Jul 25 12:03:26 2017 +0530
+++ b/source/common/x86/blockcopy8.asm	Tue Jul 25 16:17:13 2017 +0530
@@ -1103,6 +1103,47 @@
 BLOCKCOPY_PP_W64_H4_avx 64, 48
 BLOCKCOPY_PP_W64_H4_avx 64, 64
 
+;----------------------------------------------------------------------------------------------
+; Copy 4 rows, one vector register per row: src = r2 (stride r3), dst = r0 (stride r1); caller must set r4 = 3 * r3, r5 = 3 * r1
+;----------------------------------------------------------------------------------------------
+%macro PROCESS_BLOCKCOPY_PP_64X4_avx512 0
+movu    m0, [r2]                 ; row 0 of src (mmsize bytes; 64 when expanded under INIT_ZMM)
+movu    m1, [r2 + r3]            ; row 1
+movu    m2, [r2 + 2 * r3]        ; row 2
+movu    m3, [r2 + r4]            ; row 3 (r4 holds 3 * r3)
+; all four rows are loaded before any store is issued; r0/r2 are left unchanged by this macro
+movu    [r0] , m0                ; row 0 of dst
+movu    [r0 + r1] , m1           ; row 1
+movu    [r0 + 2 * r1]  , m2      ; row 2
+movu    [r0 + r5] , m3           ; row 3 (r5 holds 3 * r1)
+%endmacro
+
+;----------------------------------------------------------------------------------------------
+; void blockcopy_pp_64x%1(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
+;----------------------------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W64_H4_avx512 1
+INIT_ZMM avx512
+cglobal blockcopy_pp_64x%1, 4, 6, 4         ; 4 args, 6 GPRs (r0-r5 are used), 4 vector regs (m0-m3)
+    lea     r4, [3 * r3]                    ; r4 = 3 * srcStride, row-3 offset read by the 64x4 macro
+    lea     r5, [3 * r1]                    ; r5 = 3 * dstStride, row-3 offset written by the 64x4 macro
+; %1 is the block height (multiple of 4): unroll all but the last 4-row group, advancing both pointers
+%rep %1/4 - 1
+    PROCESS_BLOCKCOPY_PP_64X4_avx512
+    lea     r2, [r2 + 4 * r3]               ; src += 4 rows
+    lea     r0, [r0 + 4 * r1]               ; dst += 4 rows
+%endrep
+; last 4-row group: the pointers are not needed afterwards, so they are not advanced
+    PROCESS_BLOCKCOPY_PP_64X4_avx512
+    RET
+%endmacro
+; Instantiate blockcopy_pp_64x16, 64x32, 64x48 and 64x64
+BLOCKCOPY_PP_W64_H4_avx512 16
+BLOCKCOPY_PP_W64_H4_avx512 32
+BLOCKCOPY_PP_W64_H4_avx512 48
+BLOCKCOPY_PP_W64_H4_avx512 64
+
+
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
diff -r 4978d583e2e8 -r 723c72ffe3ea source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h	Tue Jul 25 12:03:26 2017 +0530
+++ b/source/common/x86/blockcopy8.h	Tue Jul 25 16:17:13 2017 +0530
@@ -54,6 +54,7 @@
 
 FUNCDEF_CHROMA_PU(void, blockcopy_pp, sse2, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 FUNCDEF_CHROMA_PU(void, blockcopy_pp, avx, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+FUNCDEF_CHROMA_PU(void, blockcopy_pp, avx512, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 
 FUNCDEF_PU(void, blockcopy_sp, sse2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
 FUNCDEF_PU(void, blockcopy_sp, sse4, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);


More information about the x265-devel mailing list