[x265] [PATCH 2 of 2] asm: avx2 code for dequant_scaling, improved 11097c->6860c, 38% over SSE4
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Thu Jun 18 07:03:02 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1434602517 -19800
# Thu Jun 18 10:11:57 2015 +0530
# Node ID 2b69ad2ac441d0782fef529d5c412edd43a0093e
# Parent b977f03d9f0fb0811facc9faf926668a031b3105
asm: avx2 code for dequant_scaling, improved 11097c->6860c, 38% over SSE4
diff -r b977f03d9f0f -r 2b69ad2ac441 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Jun 17 17:45:35 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Jun 18 10:11:57 2015 +0530
@@ -1460,6 +1460,7 @@
p.quant = PFX(quant_avx2);
p.nquant = PFX(nquant_avx2);
p.dequant_normal = PFX(dequant_normal_avx2);
+ p.dequant_scaling = PFX(dequant_scaling_avx2);
p.dst4x4 = PFX(dst4_avx2);
p.idst4x4 = PFX(idst4_avx2);
p.denoiseDct = PFX(denoise_dct_avx2);
@@ -2768,6 +2769,7 @@
p.quant = PFX(quant_avx2);
p.nquant = PFX(nquant_avx2);
p.dequant_normal = PFX(dequant_normal_avx2);
+ p.dequant_scaling = PFX(dequant_scaling_avx2);
p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);
p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2);
diff -r b977f03d9f0f -r 2b69ad2ac441 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Jun 17 17:45:35 2015 +0530
+++ b/source/common/x86/pixel-util8.asm Thu Jun 18 10:11:57 2015 +0530
@@ -969,6 +969,72 @@
.end:
RET
+;----------------------------------------------------------------------------------------------------------------------
+;void dequant_scaling(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)
+;----------------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal dequant_scaling, 6,6,6 ; dst[i] = sat16(src[i] * dequantCoef[i]) scaled by per vs. shift + 4; r0=src r1=dequantCoef r2=dst r3=num r4=per r5=shift
+ add r5d, 4 ; shift += 4
+ shr r3d, 4 ; num/16 - loop count, each pass handles 16 coefficients
+ cmp r5d, r4d ; signed compare of (shift + 4) against per
+ jle .skip ; shift + 4 <= per: take the left-shift path
+ sub r5d, r4d ; r5d = shift - per, strictly positive on this path
+ mova m0, [pd_1] ; pd_1 is defined outside this hunk - presumably packed dword 1s, TODO confirm
+ movd xm1, r5d ; shift - per (right-shift count)
+ dec r5d
+ movd xm2, r5d ; shift - per - 1
+ pslld m0, xm2 ; m0 = rounding offset, pd_1 << (shift - per - 1)
+
+.part0: ; rounding right-shift loop
+ pmovsxwd m2, [r0] ; sign-extend src[0..7] to dwords
+ pmovsxwd m4, [r0 + 16] ; sign-extend src[8..15] to dwords
+ movu m3, [r1] ; dequantCoef[0..7]
+ movu m5, [r1 + 32] ; dequantCoef[8..15]
+ pmulld m2, m3 ; low 32 bits of each product
+ pmulld m4, m5
+ paddd m2, m0 ; add rounding offset
+ paddd m4, m0
+ psrad m2, xm1 ; arithmetic >> (shift - per)
+ psrad m4, xm1
+ packssdw m2, m4 ; saturate to int16; packs per 128-bit lane, so qwords come out as 0,2,1,3
+ vpermq m2, m2, 11011000b ; swap the two middle qwords to restore element order
+ movu [r2], m2 ; store 16 results
+
+ add r0, 32 ; 16 x int16 of src consumed
+ add r1, 64 ; 16 x int32 of dequantCoef consumed
+ add r2, 32 ; 16 x int16 of dst written
+ dec r3d ; NOTE(review): a count of 0 (num < 16) would wrap here - callers presumably pass a non-zero multiple of 16
+ jnz .part0
+ jmp .end
+
+.skip: ; left-shift path, r5d already holds shift + 4
+ sub r4d, r5d ; per - shift (left-shift count, >= 0 here)
+ movd xm0, r4d
+
+.part1: ; saturate, left-shift, saturate loop
+ pmovsxwd m2, [r0] ; sign-extend src[0..7] to dwords
+ pmovsxwd m4, [r0 + 16] ; sign-extend src[8..15] to dwords
+ movu m3, [r1] ; dequantCoef[0..7]
+ movu m5, [r1 + 32] ; dequantCoef[8..15]
+ pmulld m2, m3 ; low 32 bits of each product
+ pmulld m4, m5
+ packssdw m2, m4 ; first saturation to int16 (lane-interleaved order)
+ vextracti128 xm4, m2, 1 ; xm4 = upper 128-bit lane of the packed words
+ pmovsxwd m1, xm2 ; widen lower lane back to dwords
+ pmovsxwd m2, xm4 ; widen upper lane back to dwords
+ pslld m1, xm0 ; << (per - shift)
+ pslld m2, xm0
+ packssdw m1, m2 ; second saturation; this lane-wise pack also restores element order
+ movu [r2], m1 ; store 16 results
+
+ add r0, 32 ; 16 x int16 of src consumed
+ add r1, 64 ; 16 x int32 of dequantCoef consumed
+ add r2, 32 ; 16 x int16 of dst written
+ dec r3d ; same zero-count caveat as the .part0 loop
+ jnz .part1
+.end:
+ RET
+
INIT_YMM avx2
cglobal dequant_normal, 5,5,7
vpbroadcastd m2, [pw_1] ; m2 = word [1]
More information about the x265-devel
mailing list