[x265] [PATCH 1 of 2] asm: dequant_scaling asm code, improved 12668c->11097c, 12% over intrinsic
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Thu Jun 18 07:03:01 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1434543335 -19800
# Wed Jun 17 17:45:35 2015 +0530
# Node ID b977f03d9f0fb0811facc9faf926668a031b3105
# Parent d6c32960b5df5b150569d03eb985d9772e494d13
asm: dequant_scaling asm code, improved 12668c->11097c, 12% over intrinsic
diff -r d6c32960b5df -r b977f03d9f0f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Jun 17 22:16:03 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Jun 17 17:45:35 2015 +0530
@@ -1104,6 +1104,7 @@
p.quant = PFX(quant_sse4);
p.nquant = PFX(nquant_sse4);
p.dequant_normal = PFX(dequant_normal_sse4);
+ p.dequant_scaling = PFX(dequant_scaling_sse4);
// p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_sse4); fails tests
ALL_LUMA_PU(satd, pixel_satd, sse4);
@@ -2396,6 +2397,7 @@
p.quant = PFX(quant_sse4);
p.nquant = PFX(nquant_sse4);
p.dequant_normal = PFX(dequant_normal_sse4);
+ p.dequant_scaling = PFX(dequant_scaling_sse4);
p.weight_pp = PFX(weight_pp_sse4);
p.weight_sp = PFX(weight_sp_sse4);
diff -r d6c32960b5df -r b977f03d9f0f source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Wed Jun 17 22:16:03 2015 +0530
+++ b/source/common/x86/pixel-util.h Wed Jun 17 17:45:35 2015 +0530
@@ -31,6 +31,7 @@
uint32_t PFX(quant_ ## cpu(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)); \
uint32_t PFX(nquant_ ## cpu(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)); \
void PFX(dequant_normal_ ## cpu(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)); \
+ void PFX(dequant_scaling_## cpu(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)); \
void PFX(weight_pp_ ## cpu(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)); \
void PFX(weight_sp_ ## cpu(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)); \
void PFX(scale1D_128to64_ ## cpu(pixel*, const pixel*)); \
diff -r d6c32960b5df -r b977f03d9f0f source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Jun 17 22:16:03 2015 +0530
+++ b/source/common/x86/pixel-util8.asm Wed Jun 17 17:45:35 2015 +0530
@@ -904,6 +904,70 @@
jnz .loop
RET
+;----------------------------------------------------------------------------------------------------------------------
+;void dequant_scaling(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)
+;----------------------------------------------------------------------------------------------------------------------
+INIT_XMM sse4 ; dst[i] = saturated int16 of (src[i] * dequantCoef[i]) rescaled by shift/mcqp_miper; 8 coefficients per loop pass
+cglobal dequant_scaling, 6,6,6 ; 6 args, 6 GPRs, 6 XMM regs: r0=src r1=dequantCoef r2=dst r3=num r4=mcqp_miper ("per") r5=shift
+ add r5d, 4 ; shift += 4
+ shr r3d, 3 ; r3 = num/8 = loop trip count (each pass handles 8 coefficients)
+ cmp r5d, r4d
+ jle .skip ; signed: shift <= per -> left-shift path (.part1)
+ sub r5d, r4d ; r5 = shift - per (> 0 on this path)
+ mova m0, [pd_1] ; pd_1 is defined outside this view; presumably packed dword 1s - verify
+ movd m1, r5d ; m1 = right-shift count (shift - per)
+ dec r5d
+ movd m2, r5d ; m2 = shift - per - 1
+ pslld m0, m2 ; m0 = rounding offset: 1 << (shift - per - 1) in every dword
+
+.part0: ; right-shift path. NOTE(review): count is tested after the body, so this assumes num >= 8 - confirm with callers
+ pmovsxwd m2, [r0] ; src[0..3] sign-extended to 32 bits
+ pmovsxwd m4, [r0 + 8] ; src[4..7] sign-extended to 32 bits
+ movu m3, [r1] ; dequantCoef[0..3]
+ movu m5, [r1 + 16] ; dequantCoef[4..7]
+ pmulld m2, m3 ; low 32 bits of src * dequantCoef
+ pmulld m4, m5
+ paddd m2, m0 ; add rounding offset
+ paddd m4, m0
+ psrad m2, m1 ; arithmetic >> (shift - per)
+ psrad m4, m1
+ packssdw m2, m4 ; signed-saturate the 8 dwords to int16
+ movu [r2], m2 ; store dst[0..7]
+
+ add r0, 16 ; src += 8 int16
+ add r1, 32 ; dequantCoef += 8 int32
+ add r2, 16 ; dst += 8 int16
+ dec r3d
+ jnz .part0
+ jmp .end
+
+.skip: ; reached when shift <= per
+ sub r4d, r5d ; r4 = per - shift (>= 0 on this path)
+ movd m0, r4d ; m0 = left-shift count
+
+.part1: ; left-shift path. NOTE(review): same post-tested loop, assumes num >= 8 - confirm with callers
+ pmovsxwd m2, [r0] ; src[0..3] sign-extended to 32 bits
+ pmovsxwd m4, [r0 + 8] ; src[4..7] sign-extended to 32 bits
+ movu m3, [r1] ; dequantCoef[0..3]
+ movu m5, [r1 + 16] ; dequantCoef[4..7]
+ pmulld m2, m3 ; low 32 bits of src * dequantCoef
+ pmulld m4, m5
+ packssdw m2, m4 ; first saturation: clamp the products to int16 before shifting
+ pmovsxwd m1, m2 ; widen clamped values 0..3 back to 32 bits
+ psrldq m2, 8 ; bring clamped values 4..7 down to the low 8 bytes
+ pmovsxwd m2, m2 ; widen clamped values 4..7 back to 32 bits
+ pslld m1, m0 ; << (per - shift)
+ pslld m2, m0
+ packssdw m1, m2 ; second saturation: clamp the shifted values to int16
+ movu [r2], m1 ; store dst[0..7]
+
+ add r0, 16 ; src += 8 int16
+ add r1, 32 ; dequantCoef += 8 int32
+ add r2, 16 ; dst += 8 int16
+ dec r3d
+ jnz .part1
+.end: ; common exit for both paths
+ RET
INIT_YMM avx2
cglobal dequant_normal, 5,5,7
More information about the x265-devel
mailing list