[x265] [PATCH 283 of 307] x86: AVX512 cpy2Dto1D_shr_32
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:41 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1515408269 -19800
# Mon Jan 08 16:14:29 2018 +0530
# Node ID 6d1c8baabf78dba44a500f2039eace3e31abf69c
# Parent 48917be3e409f917468ff2f73302b62afef492fb
x86: AVX512 cpy2Dto1D_shr_32
AVX2 performance : 21.17x
AVX512 performance : 34.33x
diff -r 48917be3e409 -r 6d1c8baabf78 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jan 08 16:01:42 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Jan 08 16:14:29 2018 +0530
@@ -2559,6 +2559,8 @@
p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16_avx512);
p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32_avx512);
+ p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx512);
+
p.weight_pp = PFX(weight_pp_avx512);
p.weight_sp = PFX(weight_sp_avx512);
p.dequant_normal = PFX(dequant_normal_avx512);
@@ -4912,6 +4914,8 @@
p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16_avx512);
p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32_avx512);
+ p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx512);
+
p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
diff -r 48917be3e409 -r 6d1c8baabf78 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Mon Jan 08 16:01:42 2018 +0530
+++ b/source/common/x86/blockcopy8.asm Mon Jan 08 16:14:29 2018 +0530
@@ -5302,6 +5302,48 @@
jnz .loop
RET
+INIT_ZMM avx512 ; select 512-bit (zmm) vector registers for the code that follows
+cglobal cpy2Dto1D_shr_32, 4, 5, 4 ; rounds and right-shifts 32 strided rows of int16_t into a contiguous dst; presumably r0=dst, r1=src, r2=srcStride, r3=shift per the C prototype -- confirm against x86inc
+ shl r2d, 1 ; stride * 2, presumably int16_t elements -> bytes. NOTE(review): 32-bit op zero-extends into r2, so this assumes a non-negative stride that fits in 32 bits
+ movd xm0, r3d ; xm0 = shift count consumed by the vector shifts below
+ pcmpeqw ymm1, ymm1 ; all bits set, i.e. every 16-bit word = -1. NOTE(review): literal ymm1 here but alias ym1 below -- confirm this is intentional
+ psllw ym1, ymm1, xm0 ; each word = -1 << shift = -(1 << shift)
+ psraw ym1, 1 ; each word = -(1 << (shift - 1)): the negated rounding offset (assumes shift >= 1)
+ vinserti32x8 m1, ym1, 1 ; presumably duplicates the low 256 bits into the high half so every word of m1 holds the offset
+ lea r3, [r2 * 3] ; r3 = 3 * byte stride, used to address row 3 (shift was already copied to xm0)
+ mov r4d, 8 ; loop counter: 8 iterations x 4 rows = 32 rows
+
+.loop: ; each pass reads 4 strided source rows and writes 4 * mmsize contiguous bytes
+ ; Row 0: load, add rounding offset, shift right, store
+ movu m2, [r1]
+ psubw m2, m1 ; subtracting the negated offset adds the rounding term
+ psraw m2, xm0 ; arithmetic (sign-preserving) shift right by the shift count
+ movu [r0], m2
+
+ ; Row 1: same sequence, source at one stride, destination one register width further
+ movu m2, [r1 + r2]
+ psubw m2, m1
+ psraw m2, xm0
+ movu [r0 + mmsize], m2
+
+ ; Row 2: source at two strides
+ movu m2, [r1 + 2 * r2]
+ psubw m2, m1
+ psraw m2, xm0
+ movu [r0 + 2 * mmsize], m2
+
+ ; Row 3: source at three strides (precomputed in r3)
+ movu m2, [r1 + r3]
+ psubw m2, m1
+ psraw m2, xm0
+ movu [r0 + 3 * mmsize], m2
+
+ add r0, 4 * mmsize ; output is packed: step past the 4 rows just written
+ lea r1, [r1 + 4 * r2] ; input steps forward by 4 strided rows
+ dec r4d ; one group of 4 rows done
+ jnz .loop ; repeat until all 8 groups are processed
+ RET
+
;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
diff -r 48917be3e409 -r 6d1c8baabf78 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Mon Jan 08 16:01:42 2018 +0530
+++ b/source/common/x86/blockcopy8.h Mon Jan 08 16:14:29 2018 +0530
@@ -33,6 +33,7 @@
FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy2Dto1D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+FUNCDEF_TU_S(void, cpy2Dto1D_shr, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
More information about the x265-devel
mailing list