[x265] [PATCH 011 of 307] x86: AVX512 blockcopy_sp_64x64
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:09 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499320932 -19800
# Thu Jul 06 11:32:12 2017 +0530
# Node ID 1321369efdf990d960db9a6fbe0181f086ba90f9
# Parent 328d10aa0ff4d3097ff4941e224d2cdf6774a7c8
x86: AVX512 blockcopy_sp_64x64
AVX2 performance over C code : 6.77x
AVX512 performance over C code : 8.46x
diff -r 328d10aa0ff4 -r 1321369efdf9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jul 06 17:03:15 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Jul 06 11:32:12 2017 +0530
@@ -3777,6 +3777,8 @@
p.pu[LUMA_16x8].sad = PFX(pixel_sad_16x8_avx512);
p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx512);
+ p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512);
+
}
#endif
}
diff -r 328d10aa0ff4 -r 1321369efdf9 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Thu Jul 06 17:03:15 2017 +0530
+++ b/source/common/x86/blockcopy8.asm Thu Jul 06 11:32:12 2017 +0530
@@ -2121,6 +2121,53 @@
BLOCKCOPY_SP_W64_H4_avx2 64, 64
+%macro BLOCKCOPY_SP_W64_H4_avx512 2 ; args: block width, block height. Narrows int16 samples to bytes, 4 rows of 64 samples per loop pass
+INIT_ZMM avx512 ; x86inc: m0..m3 below name 512-bit zmm registers
+cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride ; x86inc: 4 args loaded into r0..r3, 7 GPRs and 4 vector regs used
+ mov r4d, %2/4 ; r4d = loop count: height / 4, since each pass handles four rows
+ add r3, r3 ; src is int16_t (2 bytes per sample): double the stride to use it as a byte offset
+ lea r5, [3 * r3] ; r5 = byte offset of the 4th source row in a group
+ lea r6, [3 * r1] ; r6 = offset of the 4th destination row in a group
+
+.loop:
+ movu m0, [r2] ; row 0, samples 0..31 (64 bytes)
+ movu m1, [r2 + 64] ; row 0, samples 32..63
+ movu m2, [r2 + r3] ; row 1, samples 0..31
+ movu m3, [r2 + r3 + 64] ; row 1, samples 32..63
+
+ packuswb m0, m1 ; words -> bytes with unsigned saturation; per 128-bit lane: 8 bytes from m0, then 8 bytes from m1
+ packuswb m2, m3 ; same for row 1
+ vpermq m0, m0, 11011000b ; qword order 0,2,1,3 inside each 256-bit half: pairs up the m0-derived and the m1-derived qwords
+ vpermq m2, m2, 11011000b
+ vshufi64x2 m0, m0, 11011000b ; 128-bit lane order 0,2,1,3: bytes are now in left-to-right sample order
+ vshufi64x2 m2, m2, 11011000b
+ movu [r0], m0 ; store 64 output bytes of row 0
+ movu [r0 + r1], m2 ; store 64 output bytes of row 1
+
+ movu m0, [r2 + 2 * r3] ; row 2, samples 0..31
+ movu m1, [r2 + 2 * r3 + 64] ; row 2, samples 32..63
+ movu m2, [r2 + r5] ; row 3, samples 0..31
+ movu m3, [r2 + r5 + 64] ; row 3, samples 32..63
+
+ packuswb m0, m1 ; same pack + reorder sequence as for rows 0 and 1
+ packuswb m2, m3
+ vpermq m0, m0, 11011000b
+ vpermq m2, m2, 11011000b
+ vshufi64x2 m0, m0, 11011000b
+ vshufi64x2 m2, m2, 11011000b
+ movu [r0 + 2 * r1], m0 ; store row 2
+ movu [r0 + r6], m2 ; store row 3
+
+ lea r0, [r0 + 4 * r1] ; advance dst by four rows (lea leaves flags untouched)
+ lea r2, [r2 + 4 * r3] ; advance src by four rows
+
+ dec r4d ; one group of four rows done
+ jnz .loop ; repeat until the row-group counter reaches zero
+ RET
+%endmacro
+
+BLOCKCOPY_SP_W64_H4_avx512 64, 64 ; instantiate the 64x64 variant (cglobal name: blockcopy_sp_64x64)
+
;-----------------------------------------------------------------------------
; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val)
;-----------------------------------------------------------------------------
diff -r 328d10aa0ff4 -r 1321369efdf9 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Thu Jul 06 17:03:15 2017 +0530
+++ b/source/common/x86/blockcopy8.h Thu Jul 06 11:32:12 2017 +0530
@@ -57,6 +57,7 @@
FUNCDEF_PU(void, blockcopy_sp, sse2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_sp, sse4, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_sp, avx2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+FUNCDEF_PU(void, blockcopy_sp, avx512, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_ps, sse2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_ps, sse4, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_ps, avx2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
More information about the x265-devel
mailing list