[x265] [PATCH 109 of 307] [x265-avx512]x86: AVX512 dequant_scaling
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:47 CEST 2018
# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1504246803 -19800
# Fri Sep 01 11:50:03 2017 +0530
# Node ID e1348316cd4431a5d39c8a9457d865f0f9d546cc
# Parent 2221c70ef3b9b416b0ad491cd2325ccb595df8bb
[x265-avx512]x86: AVX512 dequant_scaling
AVX2 Performance : 8.65x
AVX512 Performance : 15.55x
diff -r 2221c70ef3b9 -r e1348316cd44 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Oct 04 14:58:32 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Sep 01 11:50:03 2017 +0530
@@ -2351,7 +2351,7 @@
p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
p.weight_pp = PFX(weight_pp_avx512);
p.dequant_normal = PFX(dequant_normal_avx512);
-
+ p.dequant_scaling = PFX(dequant_scaling_avx512);
p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
@@ -4134,7 +4134,7 @@
p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
p.dequant_normal = PFX(dequant_normal_avx512);
-
+ p.dequant_scaling = PFX(dequant_scaling_avx512);
//i444 chroma_hpp
p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512);
diff -r 2221c70ef3b9 -r e1348316cd44 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Oct 04 14:58:32 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Fri Sep 01 11:50:03 2017 +0530
@@ -30,6 +30,10 @@
var_shuf_avx512: db 0,-1, 1,-1, 2,-1, 3,-1, 4,-1, 5,-1, 6,-1, 7,-1
db 8,-1, 9,-1,10,-1,11,-1,12,-1,13,-1,14,-1,15,-1
+ALIGN 64 ; dequant_scaling (INIT_ZMM) reads both tables below with mova, i.e. aligned 64-byte loads
+const dequant_shuf1_avx512, dq 0, 2, 4, 6, 1, 3, 5, 7 ; vpermq qword indices (m6): restore linear order after one packssdw
+const dequant_shuf2_avx512, dq 0, 4, 1, 5, 2, 6, 3, 7 ; vpermq qword indices (m7): restore linear order after two packssdw
+
%if BIT_DEPTH == 12
ssim_c1: times 4 dd 107321.76 ; .01*.01*4095*4095*64
ssim_c2: times 4 dd 60851437.92 ; .03*.03*4095*4095*64*63
@@ -1237,6 +1241,90 @@
jnz .loop
RET
+;----------------------------------------------------------------------------------------------------------------------
+;void dequant_scaling(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)
+; r0 = src, r1 = dequantCoef, r2 = dst, r3 = num, r4 = mcqp_miper ("per" below), r5 = shift, r6 = copy of num
+; shift + 4 >  per: dst[i] = sat16((src[i] * dequantCoef[i] + (1 << (shift + 4 - per - 1))) >> (shift + 4 - per))
+; shift + 4 <= per: dst[i] = sat16(sat16(src[i] * dequantCoef[i]) << (per - shift - 4))
+; 32 coefficients per pass, num / 32 passes; num == 16 does one pass, loading 32 inputs but storing only 16 results
+;----------------------------------------------------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal dequant_scaling, 6,7,8
+    mova        m6, [dequant_shuf1_avx512]  ; qword order fix-up after one packssdw (.part0)
+    mova        m7, [dequant_shuf2_avx512]  ; qword order fix-up after two packssdw (.part1)
+    add         r5d, 4                      ; r5 = shift + 4; this sum is what "shift" means in the comments below
+    mov         r6d, r3d                    ; keep the original num for the num == 16 test
+    shr         r3d, 5                      ; num/32
+    cmp         r5d, r4d
+    jle         .skip                       ; shift <= per: take the left-shift path
+    sub         r5d, r4d
+    vpbroadcastd m0, [pd_1]
+    movd        xm1, r5d                    ; shift - per
+    dec         r5d
+    movd        xm2, r5d                    ; shift - per - 1
+    pslld       m0, xm2                     ; rounding term: 1 << (shift - per - 1)
+
+.part0:
+    pmovsxwd    m2, [r0]                    ; first 16 int16 inputs, sign-extended to int32
+    pmovsxwd    m4, [r0 + 32]               ; next 16 int16 inputs, sign-extended to int32
+    movu        m3, [r1]
+    movu        m5, [r1 + 64]
+    pmulld      m2, m3
+    pmulld      m4, m5
+    paddd       m2, m0                      ; add the rounding term before the arithmetic right shift
+    paddd       m4, m0
+    psrad       m2, xm1
+    psrad       m4, xm1
+    packssdw    m2, m4                      ; saturate to int16; results end up interleaved per 128-bit lane
+    vpermq      m2, m6, m2                  ; put the qwords back into coefficient order
+    cmp         r6d, 16
+    je          .num16part0
+    movu        [r2], m2
+    add         r0, 64
+    add         r1, 128
+    add         r2, 64
+    dec         r3d
+    jnz         .part0
+    jmp         .end
+
+.num16part0:
+    movu        [r2], ym2                   ; num == 16: store only the low 16 results
+    jmp         .end
+
+.skip:
+    sub         r4d, r5d                    ; per - shift
+    movd        xm0, r4d
+
+.part1:
+    pmovsxwd    m2, [r0]
+    pmovsxwd    m4, [r0 + 32]
+    movu        m3, [r1]
+    movu        m5, [r1 + 64]
+    pmulld      m2, m3
+    pmulld      m4, m5
+    packssdw    m2, m4                      ; saturate the products to int16 before shifting
+    vextracti32x8 ym4, m2, 1
+    pmovsxwd    m1, ym2                     ; widen again so the left shift is done on 32-bit lanes
+    pmovsxwd    m2, ym4
+    pslld       m1, xm0
+    pslld       m2, xm0
+    packssdw    m1, m2                      ; saturate the shifted values to int16
+    vpermq      m1, m7, m1                  ; undo the qword interleave left by the two packs
+    cmp         r6d, 16
+    je          .num16part1
+    movu        [r2], m1
+    add         r0, 64
+    add         r1, 128
+    add         r2, 64
+    dec         r3d
+    jnz         .part1
+    jmp         .end                        ; must not fall into .num16part1: r2 already points past the last output
+
+.num16part1:
+    movu        [r2], ym1                   ; num == 16: store only the low 16 results
+.end:
+    RET
+
INIT_ZMM avx512
cglobal dequant_normal, 5,5,7
vpbroadcastd m2, [pw_1] ; m2 = word [1]
More information about the x265-devel
mailing list