[x265] [PATCH 036 of 307] x86: AVX512 blockcopy_ss_64x64
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:34 CEST 2018
# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1499162011 -19800
# Tue Jul 04 15:23:31 2017 +0530
# Node ID 3e3a44c6d77c0c0a7b3a084127a0dc6c835ff392
# Parent 2eda6628c75302a10d59918a58740d6e27434293
x86: AVX512 blockcopy_ss_64x64
AVX2 performance over C code : 1.32x
AVX512 performance over C code : 3.00x
diff -r 2eda6628c753 -r 3e3a44c6d77c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jul 20 16:59:52 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jul 04 15:23:31 2017 +0530
@@ -3854,6 +3854,8 @@
p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
+ p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
+
}
#endif
}
diff -r 2eda6628c753 -r 3e3a44c6d77c source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Thu Jul 20 16:59:52 2017 +0530
+++ b/source/common/x86/blockcopy8.asm Tue Jul 04 15:23:31 2017 +0530
@@ -4462,6 +4462,154 @@
BLOCKCOPY_SS_W64_H4_avx 64, 48
BLOCKCOPY_SS_W64_H4_avx 64, 64
+%macro PROCESS_BLOCKCOPY_SS_W64_H8_avx512 0 ; copy 8 rows of 2 * mmsize bytes from [r2] to [r0], then leave r2/r0 advanced by 8 rows
+ movu m0, [r2] ; row 0, first mmsize bytes
+ movu m1, [r2 + mmsize] ; row 0, second mmsize bytes
+ movu m2, [r2 + r3] ; row 1 (visible callers pass r3 = source stride in bytes)
+ movu m3, [r2 + r3 + mmsize]
+
+ movu [r0], m0 ; store row 0
+ movu [r0 + mmsize], m1
+ movu [r0 + r1], m2 ; store row 1 (visible callers pass r1 = destination stride in bytes)
+ movu [r0 + r1 + mmsize], m3
+
+ movu m0, [r2 + 2 * r3] ; row 2
+ movu m1, [r2 + 2 * r3 + mmsize]
+ movu m2, [r2 + r6] ; row 3 (visible callers set r6 = 3 * r3)
+ movu m3, [r2 + r6 + mmsize]
+ lea r2, [r2 + 4 * r3] ; advance source pointer by 4 rows
+
+ movu [r0 + 2 * r1], m0 ; store row 2
+ movu [r0 + 2 * r1 + mmsize], m1
+ movu [r0 + r5], m2 ; store row 3 (visible callers set r5 = 3 * r1)
+ movu [r0 + r5 + mmsize], m3
+ lea r0, [r0 + 4 * r1] ; advance destination pointer by 4 rows
+
+ movu m0, [r2] ; row 4 (relative to macro entry)
+ movu m1, [r2 + mmsize]
+ movu m2, [r2 + r3] ; row 5
+ movu m3, [r2 + r3 + mmsize]
+
+ movu [r0], m0 ; store row 4
+ movu [r0 + mmsize], m1
+ movu [r0 + r1], m2 ; store row 5
+ movu [r0 + r1 + mmsize], m3
+
+ movu m0, [r2 + 2 * r3] ; row 6
+ movu m1, [r2 + 2 * r3 + mmsize]
+ movu m2, [r2 + r6] ; row 7
+ movu m3, [r2 + r6 + mmsize]
+ lea r2, [r2 + 4 * r3] ; source pointer now 8 rows past its value at macro entry
+
+ movu [r0 + 2 * r1], m0 ; store row 6
+ movu [r0 + 2 * r1 + mmsize], m1
+ movu [r0 + r5], m2 ; store row 7
+ movu [r0 + r5 + mmsize], m3
+ lea r0, [r0 + 4 * r1] ; destination pointer now 8 rows past its value at macro entry
+%endmacro
+
+%macro PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 0 ; copy 8 rows of 2 * mmsize bytes from [r2] to [r0]; unlike the non-LAST variant, skips the final pointer advance
+ movu m0, [r2] ; row 0, first mmsize bytes
+ movu m1, [r2 + mmsize] ; row 0, second mmsize bytes
+ movu m2, [r2 + r3] ; row 1 (visible callers pass r3 = source stride in bytes)
+ movu m3, [r2 + r3 + mmsize]
+
+ movu [r0], m0 ; store row 0
+ movu [r0 + mmsize], m1
+ movu [r0 + r1], m2 ; store row 1 (visible callers pass r1 = destination stride in bytes)
+ movu [r0 + r1 + mmsize], m3
+
+ movu m0, [r2 + 2 * r3] ; row 2
+ movu m1, [r2 + 2 * r3 + mmsize]
+ movu m2, [r2 + r6] ; row 3 (visible callers set r6 = 3 * r3)
+ movu m3, [r2 + r6 + mmsize]
+ lea r2, [r2 + 4 * r3] ; advance source pointer by 4 rows
+
+ movu [r0 + 2 * r1], m0 ; store row 2
+ movu [r0 + 2 * r1 + mmsize], m1
+ movu [r0 + r5], m2 ; store row 3 (visible callers set r5 = 3 * r1)
+ movu [r0 + r5 + mmsize], m3
+ lea r0, [r0 + 4 * r1] ; advance destination pointer by 4 rows
+
+ movu m0, [r2] ; row 4 (relative to macro entry)
+ movu m1, [r2 + mmsize]
+ movu m2, [r2 + r3] ; row 5
+ movu m3, [r2 + r3 + mmsize]
+
+ movu [r0], m0 ; store row 4
+ movu [r0 + mmsize], m1
+ movu [r0 + r1], m2 ; store row 5
+ movu [r0 + r1 + mmsize], m3
+
+ movu m0, [r2 + 2 * r3] ; row 6
+ movu m1, [r2 + 2 * r3 + mmsize]
+ movu m2, [r2 + r6] ; row 7; r2 is not advanced afterwards
+ movu m3, [r2 + r6 + mmsize]
+
+ movu [r0 + 2 * r1], m0 ; store row 6
+ movu [r0 + 2 * r1 + mmsize], m1
+ movu [r0 + r5], m2 ; store row 7; r0 is not advanced afterwards
+ movu [r0 + r5 + mmsize], m3
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_64xN(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride), N = 16, 32, 48, 64
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512 ; assemble the following function for ZMM registers with the avx512 name suffix
+cglobal blockcopy_ss_64x16, 4, 7, 4 ; copies 16 rows via two 8-row macros; 4 args, 7 GPRs, 4 vector registers
+ add r1, r1 ; double dstStride so it can be used as a byte offset for int16_t rows
+ add r3, r3 ; double srcStride so it can be used as a byte offset for int16_t rows
+ lea r5, [3 * r1] ; r5 = 3 * destination byte stride (row-3 offset used by the macros)
+ lea r6, [3 * r3] ; r6 = 3 * source byte stride (row-3 offset used by the macros)
+
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512 ; rows 0-7, advances r0/r2 by 8 rows
+ PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 ; rows 8-15, no trailing pointer advance
+ RET
+
+INIT_ZMM avx512 ; assemble the following function for ZMM registers with the avx512 name suffix
+cglobal blockcopy_ss_64x32, 4, 7, 4 ; copies 32 rows via four 8-row macros; 4 args, 7 GPRs, 4 vector registers
+ add r1, r1 ; double dstStride so it can be used as a byte offset for int16_t rows
+ add r3, r3 ; double srcStride so it can be used as a byte offset for int16_t rows
+ lea r5, [3 * r1] ; r5 = 3 * destination byte stride (row-3 offset used by the macros)
+ lea r6, [3 * r3] ; r6 = 3 * source byte stride (row-3 offset used by the macros)
+
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512 ; rows 0-7
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512 ; rows 8-15
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512 ; rows 16-23
+ PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 ; rows 24-31, no trailing pointer advance
+ RET
+
+INIT_ZMM avx512 ; assemble the following function for ZMM registers with the avx512 name suffix
+cglobal blockcopy_ss_64x48, 4, 7, 4 ; copies 48 rows via six 8-row macros; 4 args, 7 GPRs, 4 vector registers
+ add r1, r1 ; double dstStride so it can be used as a byte offset for int16_t rows
+ add r3, r3 ; double srcStride so it can be used as a byte offset for int16_t rows
+ lea r5, [3 * r1] ; r5 = 3 * destination byte stride (row-3 offset used by the macros)
+ lea r6, [3 * r3] ; r6 = 3 * source byte stride (row-3 offset used by the macros)
+
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512 ; rows 0-7
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512 ; rows 8-15
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512 ; rows 16-23
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512 ; rows 24-31
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512 ; rows 32-39
+ PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 ; rows 40-47, no trailing pointer advance
+ RET
+
+INIT_ZMM avx512 ; assemble the following function for ZMM registers with the avx512 name suffix
+cglobal blockcopy_ss_64x64, 4, 7, 4 ; copies 64 rows via eight 8-row macros; 4 args, 7 GPRs, 4 vector registers
+ add r1, r1 ; double dstStride so it can be used as a byte offset for int16_t rows
+ add r3, r3 ; double srcStride so it can be used as a byte offset for int16_t rows
+ lea r5, [3 * r1] ; r5 = 3 * destination byte stride (row-3 offset used by the macros)
+ lea r6, [3 * r3] ; r6 = 3 * source byte stride (row-3 offset used by the macros)
+
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512 ; rows 0-7
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512 ; rows 8-15
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512 ; rows 16-23
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512 ; rows 24-31
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512 ; rows 32-39
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512 ; rows 40-47
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512 ; rows 48-55
+ PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 ; rows 56-63, no trailing pointer advance
+ RET
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
diff -r 2eda6628c753 -r 3e3a44c6d77c source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Thu Jul 20 16:59:52 2017 +0530
+++ b/source/common/x86/blockcopy8.h Tue Jul 04 15:23:31 2017 +0530
@@ -50,6 +50,7 @@
FUNCDEF_CHROMA_PU(void, blockcopy_ss, sse2, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx512, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
FUNCDEF_CHROMA_PU(void, blockcopy_pp, sse2, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
FUNCDEF_CHROMA_PU(void, blockcopy_pp, avx, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
More information about the x265-devel
mailing list