[x265] [PATCH 072 of 307] x86: AVX512 cpy2Dto1D_shl_32 and cpy2Dto1D_shl_16
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:10 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1501663291 -19800
# Wed Aug 02 14:11:31 2017 +0530
# Node ID ce93c1b1894ae7d789e451f65479f018ba90ec76
# Parent aac415b7223acced7fc844c4a07225704b811df0
x86: AVX512 cpy2Dto1D_shl_32 and cpy2Dto1D_shl_16
Size  | BitDepth | AVX2 performance | AVX512 performance
-------------------------------------------------------
16x16 | 8        | 15.09x           | 21.16x
16x16 | 10       | 16.05x           | 17.86x
32x32 | 8        | 13.90x           | 25.62x
32x32 | 10       | 11.69x           | 23.24x
diff -r aac415b7223a -r ce93c1b1894a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Aug 01 17:37:05 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Aug 02 14:11:31 2017 +0530
@@ -2309,6 +2309,8 @@
p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512);
p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512);
+ p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
+ p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
}
}
#else // if HIGH_BIT_DEPTH
@@ -3988,6 +3990,8 @@
p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512);
+ p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
+ p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
}
#endif
diff -r aac415b7223a -r ce93c1b1894a source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Tue Aug 01 17:37:05 2017 +0530
+++ b/source/common/x86/blockcopy8.asm Wed Aug 02 14:11:31 2017 +0530
@@ -6140,6 +6140,102 @@
RET
;--------------------------------------------------------------------------------------
+; cpy2Dto1D_shl avx512 code start
+;--------------------------------------------------------------------------------------
+%macro PROCESS_CPY2Dto1D_SHL_16x8_AVX512 0 ; in: r0 = dst, r1 = src, r2 = row pitch, r3 = 3 * r2, xm0 = shift count
+    movu            ym1, [r1]               ; row 0: 16 int16 coeffs = 32 bytes; a zmm load would fetch 32 bytes the insert below overwrites
+    vinserti32x8    m1, [r1 + r2], 1        ; row 1 -> upper 256 bits of m1
+    movu            ym2, [r1 + 2 * r2]      ; row 2 -> lower 256 bits of m2
+    vinserti32x8    m2, [r1 + r3], 1        ; row 3 -> upper 256 bits of m2
+
+    psllw           m1, xm0                 ; every 16-bit lane shifted left by the same count
+    psllw           m2, xm0
+    movu            [r0], m1                ; rows 0-1 stored back to back (packed 1D output)
+    movu            [r0 + mmsize], m2       ; rows 2-3
+
+    add             r0, 2 * mmsize          ; dst advances by the four rows just written
+    lea             r1, [r1 + r2 * 4]       ; src advances by four rows
+
+    movu            ym1, [r1]               ; row 4
+    vinserti32x8    m1, [r1 + r2], 1        ; row 5
+    movu            ym2, [r1 + 2 * r2]      ; row 6
+    vinserti32x8    m2, [r1 + r3], 1        ; row 7
+
+    psllw           m1, xm0
+    psllw           m2, xm0
+    movu            [r0], m1                ; rows 4-5
+    movu            [r0 + mmsize], m2       ; rows 6-7; r0/r1 are NOT advanced past this group
+%endmacro
+
+; Helper: shift-copy four rows of 32 int16 coefficients (one zmm register per row).
+; in:  r0 = dst, r1 = src, r2 = row pitch used for addressing, r3 = 3 * r2, xm0 = shift count
+; out: 4 * mmsize bytes written at r0; r0 and r1 unchanged; m1-m4 overwritten
+%macro CPY2Dto1D_SHL_32x4_AVX512 0
+    movu    m1, [r1]
+    movu    m2, [r1 + r2]
+    movu    m3, [r1 + 2 * r2]
+    movu    m4, [r1 + r3]
+
+    psllw   m1, xm0
+    psllw   m2, xm0
+    psllw   m3, xm0
+    psllw   m4, xm0
+
+    movu    [r0], m1
+    movu    [r0 + mmsize], m2
+    movu    [r0 + 2 * mmsize], m3
+    movu    [r0 + 3 * mmsize], m4
+%endmacro
+
+; Shift-copy eight rows of 32 coefficients as two groups of four rows.
+; Register contract is the same as for the 4-row helper above.
+; On exit r0/r1 still address the second group (they were advanced by four
+; rows only), so the caller must step past it before processing more rows.
+%macro PROCESS_CPY2Dto1D_SHL_32x8_AVX512 0
+    CPY2Dto1D_SHL_32x4_AVX512               ; rows 0-3
+
+    add     r0, 4 * mmsize                  ; dst: skip the four rows just stored
+    lea     r1, [r1 + r2 * 4]               ; src: move down four rows
+
+    CPY2Dto1D_SHL_32x4_AVX512               ; rows 4-7
+%endmacro
+
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+; 32x32 variant: copies 32 rows of 32 coefficients, shifting each left by 'shift'.
+; r0 = dst, r1 = src, r2 = srcStride (elements on entry, bytes after doubling),
+; r3 = shift on entry, reused as 3 * stride once the count has been moved to xm0.
+INIT_ZMM avx512
+cglobal cpy2Dto1D_shl_32, 4, 4, 5
+    add     r2, r2                          ; int16_t elements -> bytes; full-width add keeps the whole intptr_t stride
+    movd    xm0, r3d                        ; shift count for psllw, saved before r3 is overwritten
+    lea     r3, [3 * r2]                    ; r3 = 3 * stride in bytes (address of the 4th row in a group)
+
+%rep 3                                      ; rows 0-23, eight rows per iteration
+    PROCESS_CPY2Dto1D_SHL_32x8_AVX512
+    add     r0, 4 * mmsize                  ; macro leaves r0/r1 on its second 4-row group; step past it
+    lea     r1, [r1 + r2 * 4]
+%endrep
+
+    PROCESS_CPY2Dto1D_SHL_32x8_AVX512       ; rows 24-31, no advance needed afterwards
+    RET
+
+INIT_ZMM avx512
+cglobal cpy2Dto1D_shl_16, 4, 4, 3           ; 16x16 variant: r0 = dst, r1 = src, r2 = srcStride, r3 = shift
+    add     r2, r2                          ; int16_t elements -> bytes; full-width add keeps the whole intptr_t stride
+    movd    xm0, r3d                        ; shift count for psllw, saved before r3 is overwritten
+    lea     r3, [3 * r2]                    ; r3 = 3 * stride in bytes (address of the 4th row in a group)
+
+    PROCESS_CPY2Dto1D_SHL_16x8_AVX512       ; rows 0-7
+    add     r0, 2 * mmsize                  ; macro leaves r0/r1 on its second 4-row group; step past it
+    lea     r1, [r1 + r2 * 4]
+    PROCESS_CPY2Dto1D_SHL_16x8_AVX512       ; rows 8-15
+    RET
+;--------------------------------------------------------------------------------------
+; cpy2Dto1D_shl avx512 code end
+;--------------------------------------------------------------------------------------
+;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
diff -r aac415b7223a -r ce93c1b1894a source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Tue Aug 01 17:37:05 2017 +0530
+++ b/source/common/x86/blockcopy8.h Wed Aug 02 14:11:31 2017 +0530
@@ -28,6 +28,7 @@
FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy2Dto1D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+FUNCDEF_TU_S(void, cpy2Dto1D_shl, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
More information about the x265-devel
mailing list