[x265] [PATCH 073 of 307] x86: AVX512 cpy1Dto2D_shl_32
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:11 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1502186111 -19800
# Tue Aug 08 15:25:11 2017 +0530
# Node ID 7d7f2a4e771c7c2b573db9bc298d1a35bb72f32d
# Parent ce93c1b1894ae7d789e451f65479f018ba90ec76
x86: AVX512 cpy1Dto2D_shl_32
Size | BitDepth | AVX2 performance | AVX512 performance
-------------------------------------------------------
32x32| 8 | 16.03x | 28.94x
32x32| 10 | 14.12x | 24.99x
diff -r ce93c1b1894a -r 7d7f2a4e771c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Aug 02 14:11:31 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Aug 08 15:25:11 2017 +0530
@@ -2311,6 +2311,8 @@
p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
+ p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
+
}
}
#else // if HIGH_BIT_DEPTH
@@ -3992,6 +3994,7 @@
p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512);
p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
+ p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
}
#endif
diff -r ce93c1b1894a -r 7d7f2a4e771c source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Wed Aug 02 14:11:31 2017 +0530
+++ b/source/common/x86/blockcopy8.asm Tue Aug 08 15:25:11 2017 +0530
@@ -5513,7 +5513,62 @@
jnz .loop
RET
-
+;--------------------------------------------------------------------------------------
+; cpy_1Dto2D_shl avx512 code start
+;--------------------------------------------------------------------------------------
+%macro PROCESS_CPY1Dto2D_SHL_32x8_AVX512 0 ; copy 8 rows: 8 vector loads from r1, left-shift by xm0, stores to r0 (row pitch r2; caller must set r3 = 3 * r2)
+ movu m1, [r1 + 0 * mmsize] ; rows 0-3: four back-to-back vectors from the packed source at r1
+ movu m2, [r1 + 1 * mmsize] ; mmsize is presumably 64 under INIT_ZMM (32 int16 values per vector) - confirm in x86inc
+ movu m3, [r1 + 2 * mmsize]
+ movu m4, [r1 + 3 * mmsize]
+ psllw m1, xm0 ; shift every 16-bit lane left by the count held in xm0
+ psllw m2, xm0
+ psllw m3, xm0
+ psllw m4, xm0
+ movu [r0], m1 ; row 0
+ movu [r0 + r2], m2 ; row 1: one pitch further
+ movu [r0 + 2 * r2], m3 ; row 2
+ movu [r0 + r3], m4 ; row 3: relies on r3 == 3 * r2
+
+ add r1, 4 * mmsize ; advance source past the four vectors just consumed
+ lea r0, [r0 + r2 * 4] ; advance destination by four rows
+
+ movu m1, [r1 + 0 * mmsize] ; rows 4-7: same load / shift / store sequence
+ movu m2, [r1 + 1 * mmsize]
+ movu m3, [r1 + 2 * mmsize]
+ movu m4, [r1 + 3 * mmsize]
+ psllw m1, xm0
+ psllw m2, xm0
+ psllw m3, xm0
+ psllw m4, xm0
+ movu [r0], m1
+ movu [r0 + r2], m2
+ movu [r0 + 2 * r2], m3
+ movu [r0 + r3], m4 ; note: r0/r1 are not advanced after this second group; the caller has to do it
+%endmacro
+;--------------------------------------------------------------------------------------
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
+;--------------------------------------------------------------------------------------
+INIT_ZMM avx512 ; build the following function with ZMM registers and the _avx512 name suffix
+cglobal cpy1Dto2D_shl_32, 4, 4, 5 ; 4 args in r0-r3 (dst, src, dstStride, shift per the prototype comment), 4 GPRs, 5 vector regs (m0-m4)
+ add r2d, r2d ; double the stride - presumably int16 elements -> bytes; NOTE(review): 32-bit add, assumes a non-negative stride that fits in 32 bits
+ movd xm0, r3d ; shift count into xm0 for the psllw instructions in the macro
+ lea r3, [3 * r2] ; r3 = 3 * stride for the macro's fourth-row store (shift already saved in xm0)
+
+ PROCESS_CPY1Dto2D_SHL_32x8_AVX512 ; rows 0-7
+ add r1, 4 * mmsize ; macro stops four vectors / four rows short of the next group, so step both pointers here
+ lea r0, [r0 + r2 * 4]
+ PROCESS_CPY1Dto2D_SHL_32x8_AVX512 ; rows 8-15
+ add r1, 4 * mmsize
+ lea r0, [r0 + r2 * 4]
+ PROCESS_CPY1Dto2D_SHL_32x8_AVX512 ; rows 16-23
+ add r1, 4 * mmsize
+ lea r0, [r0 + r2 * 4]
+ PROCESS_CPY1Dto2D_SHL_32x8_AVX512 ; rows 24-31; no pointer update needed after the last group
+ RET
+;--------------------------------------------------------------------------------------
+; cpy_1Dto2D_shl avx512 code end
+;--------------------------------------------------------------------------------------
;--------------------------------------------------------------------------------------
; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;--------------------------------------------------------------------------------------
diff -r ce93c1b1894a -r 7d7f2a4e771c source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Wed Aug 02 14:11:31 2017 +0530
+++ b/source/common/x86/blockcopy8.h Tue Aug 08 15:25:11 2017 +0530
@@ -37,6 +37,7 @@
FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
More information about the x265-devel
mailing list