[x265] [PATCH 282 of 307] x86: AVX512 cpy1Dto2D_shr_32
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:40 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1515407502 -19800
# Mon Jan 08 16:01:42 2018 +0530
# Node ID 48917be3e409f917468ff2f73302b62afef492fb
# Parent c9f8c315a900c488e41bb39955a1c893e35a66d4
x86: AVX512 cpy1Dto2D_shr_32
AVX2 performance   : 21.03x speed-up over the C primitive
AVX512 performance : 34.55x speed-up over the C primitive
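
For reference, cpy1Dto2D_shr copies a contiguous 1D buffer of int16_t coefficients into a strided 2D block, adding a rounding offset and arithmetically shifting each value right; the new kernel does this for 32x32 blocks with 512-bit registers, one full 32-coefficient row per load. A minimal scalar sketch of that behaviour, assuming the usual (dst, src, dstStride, shift) primitive signature (the _c-suffixed name is illustrative, not the exact x265 symbol):

    #include <stdint.h>

    /* Illustrative scalar sketch of cpy1Dto2D_shr for a 32x32 block.
     * Each coefficient gets a rounding offset added, is shifted right
     * arithmetically, and is copied from the linear source buffer into
     * the strided destination block. */
    static void cpy1Dto2D_shr_32_c(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
    {
        const int16_t round = (int16_t)(1 << (shift - 1));

        for (int i = 0; i < 32; i++)
        {
            for (int j = 0; j < 32; j++)
                dst[j] = (int16_t)((src[j] + round) >> shift);
            src += 32;
            dst += dstStride;
        }
    }
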
diff -r c9f8c315a900 -r 48917be3e409 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jan 08 15:55:34 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Jan 08 16:01:42 2018 +0530
@@ -2557,6 +2557,8 @@
         p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32_avx512);
         p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_aligned_32_avx512);
         p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16_avx512);
+        p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32_avx512);
+
         p.weight_pp = PFX(weight_pp_avx512);
         p.weight_sp = PFX(weight_sp_avx512);
         p.dequant_normal = PFX(dequant_normal_avx512);
@@ -4908,6 +4910,7 @@
         p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32_avx512);
         p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_aligned_32_avx512);
         p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16_avx512);
+        p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32_avx512);
         p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
         p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
diff -r c9f8c315a900 -r 48917be3e409 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Mon Jan 08 15:55:34 2018 +0530
+++ b/source/common/x86/blockcopy8.asm Mon Jan 08 16:01:42 2018 +0530
@@ -6809,3 +6809,30 @@
     dec             r3d
     jnz             .loop
     RET
+
+INIT_ZMM avx512
+cglobal cpy1Dto2D_shr_32, 3, 4, 6
+    shl             r2d, 1              ; stride is in int16_t units, convert to bytes
+    movd            xm0, r3m            ; xm0 = shift
+    pcmpeqw         ymm1, ymm1          ; every word = -1
+    psllw           ym1, ymm1, xm0      ; every word = -(1 << shift)
+    psraw           ym1, 1              ; every word = -(1 << (shift - 1)), negated rounding offset
+    vinserti32x8    m1, ym1, 1          ; replicate the constant into the high 256 bits of m1
+    mov             r3d, 16             ; 16 iterations x 2 rows = 32 rows
+
+.loop:
+    ; Row 0-1
+    movu            m2, [r1]            ; one 32-coefficient row per register
+    movu            m3, [r1 + mmsize]
+    psubw           m2, m1              ; src + (1 << (shift - 1))
+    psubw           m3, m1
+    psraw           m2, xm0             ; arithmetic shift right by the shift amount
+    psraw           m3, xm0
+    movu            [r0], m2
+    movu            [r0 + r2], m3
+
+    add             r1, 2 * mmsize      ; advance src by two rows (mmsize = 64 bytes)
+    lea             r0, [r0 + r2 * 2]   ; advance dst by two strided rows
+    dec             r3d
+    jnz             .loop
+    RET
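
A note on the constant setup in the kernel above: pcmpeqw fills every 16-bit lane with -1, psllw by the shift count turns that into -(1 << shift), and psraw by 1 halves it to -(1 << (shift - 1)); psubw then subtracts this negative value, which is exactly what adds the rounding offset before the final psraw by the shift amount. A small standalone C check of that identity (illustrative sketch, not x265 code):

    #include <assert.h>
    #include <stdint.h>

    /* Check: an all-ones 16-bit word shifted left by `shift` and then
     * arithmetically right by 1 equals -(1 << (shift - 1)), so subtracting
     * it from a coefficient adds the rounding offset. */
    int main(void)
    {
        for (int shift = 1; shift <= 14; shift++)
        {
            int16_t word = (int16_t)(0xFFFFu << shift); /* pcmpeqw + psllw */
            word = (int16_t)(word >> 1);                /* psraw by 1      */
            assert(word == (int16_t)-(1 << (shift - 1)));
        }
        return 0;
    }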