[x265] [PATCH 284 of 307] x86: AVX512 cpy2Dto1D_shr_16
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:42 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1515409019 -19800
# Mon Jan 08 16:26:59 2018 +0530
# Node ID a5d29083237f28a944143862f980960c3f2b15ff
# Parent 6d1c8baabf78dba44a500f2039eace3e31abf69c
x86: AVX512 cpy2Dto1D_shr_16
AVX2 performance : 21.14x
AVX512 performance : 28.50x
diff -r 6d1c8baabf78 -r a5d29083237f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jan 08 16:14:29 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Jan 08 16:26:59 2018 +0530
@@ -2559,6 +2559,7 @@
p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16_avx512);
p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32_avx512);
+ p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16_avx512);
p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx512);
p.weight_pp = PFX(weight_pp_avx512);
@@ -4914,6 +4915,7 @@
p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16_avx512);
p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32_avx512);
+ p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16_avx512);
p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx512);
p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
diff -r 6d1c8baabf78 -r a5d29083237f source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Mon Jan 08 16:14:29 2018 +0530
+++ b/source/common/x86/blockcopy8.asm Mon Jan 08 16:26:59 2018 +0530
@@ -5199,6 +5199,53 @@
jnz .loop
RET
+INIT_ZMM avx512 ; AVX-512 variant: m# = 512-bit regs, ym#/xm# = their 256/128-bit low parts
+cglobal cpy2Dto1D_shr_16, 4, 5, 4 ; 16x16 copy of int16 rows to a packed buffer with rounding right shift; r0..r3 presumably = dst, src, srcStride, shift (cpy2Dto1D_shr prototype) - confirm x86inc arg mapping
+ shl r2d, 1 ; stride *= 2 (int16 elements -> bytes); NOTE(review): 32-bit op clears upper half of r2, so stride is assumed non-negative and < 2^31
+ movd xm0, r3d ; xm0 = shift count, used by the vector shifts below
+ pcmpeqw ymm1, ymm1 ; every word = 0xFFFF (-1)
+ psllw ym1, ymm1, xm0 ; every word = -1 << shift
+ psraw ym1, 1 ; every word = -(1 << (shift - 1)) for shift >= 1, i.e. the negated rounding offset
+ vinserti32x8 m1, ym1, 1 ; replicate the low 256 bits into the high half: all 32 words of m1 hold the constant
+ lea r3, [r2 * 3] ; r3 = 3 * stride in bytes (shift already lives in xm0, so r3 is free)
+ mov r4d, 2 ; loop counter: 2 iterations x 8 rows = 16 rows
+
+.loop: ; each pass reads 8 source rows (32 bytes each) and writes 4 * mmsize contiguous bytes to dst
+ ; Rows 0-1 of the current 8-row group: two 32-byte rows packed into one 512-bit register
+ movu ym2, [r1] ; low half  = row 0
+ vinserti32x8 m2, [r1 + r2], 1 ; high half = row 1
+ psubw m2, m1 ; subtracting the negated offset adds the rounding term
+ psraw m2, xm0 ; arithmetic right shift of every word by 'shift'
+ movu [r0], m2 ; store both rows back-to-back (unaligned store)
+
+ ; Rows 2-3: same sequence, source offsets 2*stride and 3*stride
+ movu ym2, [r1 + 2 * r2]
+ vinserti32x8 m2, [r1 + r3], 1
+ psubw m2, m1
+ psraw m2, xm0
+ movu [r0 + mmsize], m2 ; second 64-byte slot of the output
+
+ lea r1, [r1 + 4 * r2] ; advance src by 4 rows so the same offsets address rows 4-7
+ ; Rows 4-5 (addressed relative to the advanced src pointer)
+
+ movu ym2, [r1]
+ vinserti32x8 m2, [r1 + r2], 1
+ psubw m2, m1
+ psraw m2, xm0
+ movu [r0 + 2 * mmsize], m2 ; third 64-byte slot of the output
+
+ ; Rows 6-7
+ movu ym2, [r1 + 2 * r2]
+ vinserti32x8 m2, [r1 + r3], 1
+ psubw m2, m1
+ psraw m2, xm0
+ movu [r0 + 3 * mmsize], m2 ; fourth 64-byte slot of the output
+
+ add r0, 4 * mmsize ; dst += 8 packed rows
+ lea r1, [r1 + 4 * r2] ; src += 4 more rows (8 total for this pass)
+ dec r4d ; one 8-row group done
+ jnz .loop ; repeat until both groups are processed
+ RET ; x86inc return macro
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
More information about the x265-devel
mailing list