[x265] [PATCH 108 of 307] [x265-avx512]x86: AVX512 dequant_normal
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:46 CEST 2018
# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1507109312 -19800
# Wed Oct 04 14:58:32 2017 +0530
# Node ID 2221c70ef3b9b416b0ad491cd2325ccb595df8bb
# Parent c726239a07580fd13c4177f0206d615ee02c5975
[x265-avx512]x86: AVX512 dequant_normal
AVX2 Performance : 9.81x
AVX512 Performance : 15.37x
diff -r c726239a0758 -r 2221c70ef3b9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Aug 31 15:21:25 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Oct 04 14:58:32 2017 +0530
@@ -2350,6 +2350,7 @@
p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
p.weight_pp = PFX(weight_pp_avx512);
+ p.dequant_normal = PFX(dequant_normal_avx512);
p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
@@ -4132,6 +4133,8 @@
p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
+ p.dequant_normal = PFX(dequant_normal_avx512);
+
//i444 chroma_hpp
p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512);
diff -r c726239a0758 -r 2221c70ef3b9 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Thu Aug 31 15:21:25 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Wed Oct 04 14:58:32 2017 +0530
@@ -1237,6 +1237,58 @@
jnz .loop
RET
+INIT_ZMM avx512
+cglobal dequant_normal, 5,5,7 ; dst[i] = clip16((src[i] * scale + add) >> shift); r0 = src, r1 = dst, r2 = coef count, r3 = scale, r4 = shift
+ vpbroadcastd m2, [pw_1] ; m2 = word [1], multiplied with the rounding term inside pmaddwd
+ vpbroadcastd m5, [pd_32767] ; m5 = dword [32767], upper clamp bound
+ vpbroadcastd m6, [pd_n32768] ; m6 = dword [-32768], lower clamp bound
+%if HIGH_BIT_DEPTH
+ cmp r3d, 32767 ; does scale already fit in a positive signed word?
+ jle .skip ; yes: use scale and shift as passed in
+ shr r3d, (BIT_DEPTH - 8) ; no: reduce scale by (BIT_DEPTH - 8) bits (presumably so it fits pmaddwd's signed word operand)
+ sub r4d, (BIT_DEPTH - 8) ; and reduce shift by the same amount
+.skip:
+%endif
+ movd xm0, r4d ; xm0 = shift count used by psrad
+ add r4d, -1+16 ; r4d = 16 + (shift - 1): bit index of the rounding term within the high word
+ bts r3d, r4d ; set that bit: high word = add = 1 << (shift - 1), low word = scale
+
+ movd xm1, r3d
+ vpbroadcastd m1, xm1 ; m1 = dword [add scale], i.e. word pairs (scale, add)
+
+ ; xm0 = shift (psrad count)
+ ; m1 = word pairs (scale, add), m5/m6 = dword clamp bounds 32767 / -32768
+ ; m2 = word [1]; NOTE(review): count is assumed to be 16 or a non-zero multiple of 32 - confirm against callers
+ mov r3d, r2d ; keep the original count for the count == 16 check (scale is already in m1)
+ shr r2d, 5 ; r2d = count / 32 = number of full 64-byte iterations (0 when count == 16)
+.loop:
+ movu m3, [r0] ; load 32 words; NOTE(review): also runs when count == 16, reading 32 bytes past the 16 coefs - confirm buffers allow it
+ punpckhwd m4, m3, m2 ; m4 = word pairs (coef, 1) from the high half of each 128-bit lane
+ punpcklwd m3, m2 ; m3 = word pairs (coef, 1) from the low half of each 128-bit lane
+ pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add)
+ pmaddwd m4, m1 ; m4 = dword (clipQCoef * scale + add)
+ psrad m3, xm0 ; arithmetic shift right by shift
+ psrad m4, xm0
+ pminsd m3, m5 ; clamp to at most 32767
+ pmaxsd m3, m6 ; clamp to at least -32768
+ pminsd m4, m5
+ pmaxsd m4, m6
+ packssdw m3, m4 ; pack dwords back to words per 128-bit lane, restoring the original coefficient order
+
+ mova [r1 + 0 * mmsize/2], ym3 ; store the low 16 results (aligned store; presumably dst is 32-byte aligned - verify against callers)
+ cmp r3d, 16 ; original count == 16?
+ je .num16 ; then only the low half is valid output: skip the second store and return
+ vextracti32x8 [r1 + 1 * mmsize/2], m3, 1 ; store the high 16 results
+
+ add r0, mmsize ; advance src by 32 coefficients
+ add r1, mmsize ; advance dst by 32 coefficients
+
+ dec r2d ; one fewer 32-coefficient group remaining
+ jnz .loop
+ RET
+.num16:
+ RET
+
;-----------------------------------------------------------------------------
; int x265_count_nonzero_4x4_sse2(const int16_t *quantCoeff);
More information about the x265-devel
mailing list