[x265] [PATCH 2 of 2] asm: rewrite and enable SSE2 version of propagateCost, 9325c -> 5564c (1.68x)
Min Chen
chenm003 at 163.com
Thu Oct 8 22:44:31 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1444336054 18000
# Node ID 9a51ae2c8d47ab3480e1674ec95804b8f12affd4
# Parent b26659ff36fdf43e96477e7e0e10f9b63313bb72
asm: rewrite and enable SSE2 version of propagateCost, 9325c -> 5564c (1.68x)
TODO: the optimized reference code is kept but disabled; it is faster but changes the output. (A per-element sketch of the computation follows the diffstat.)
---
source/common/pixel.cpp | 26 ++++++--
source/common/x86/asm-primitives.cpp | 2 +
source/common/x86/mc-a2.asm | 121 +++++++++++++++------------------
source/common/x86/mc.h | 9 +++
4 files changed, 86 insertions(+), 72 deletions(-)
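For reference, the per-element quantity propagateCost computes (as the C model below implements it, including the new clamp of the inter cost against the intra cost) can be sketched in scalar form. The helper name propagateCostOne is made up for illustration and is not part of the patch:

    // Minimal scalar sketch of one element of propagateCost (illustrative only;
    // intra is assumed non-zero, exactly as in the C model).
    #include <algorithm>
    #include <cstdint>

    static inline int propagateCostOne(uint16_t propIn, int32_t intra, uint16_t inter,
                                       int32_t invQscale, double fpsFactor)
    {
        double  fps    = fpsFactor / 256;                                     // scaled fps factor
        int32_t interC = std::min(intra, (int32_t)(inter & ((1 << 14) - 1))); // LOWRES_COST_MASK
        double  amount = (double)propIn + (double)intra * invQscale * fps;    // propagate_in + intra contribution
        return (int)(amount * (double)(intra - interC) / (double)intra + 0.5);
    }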
diff -r b26659ff36fd -r 9a51ae2c8d47 source/common/pixel.cpp
--- a/source/common/pixel.cpp Thu Oct 08 15:27:31 2015 -0500
+++ b/source/common/pixel.cpp Thu Oct 08 15:27:34 2015 -0500
@@ -25,6 +25,7 @@
*****************************************************************************/
#include "common.h"
+#include "slicetype.h" // LOWRES_COST_MASK
#include "primitives.h"
#include "x265.h"
@@ -960,17 +961,30 @@
/* Estimate the total amount of influence on future quality that could be had if we
* were to improve the reference samples used to inter predict any given CU. */
static void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
- const int32_t* invQscales, const double* fpsFactor, int len)
+ const int32_t* invQscales, const double* fpsFactor, int len)
{
- double fps = *fpsFactor / 256;
+ double fps = *fpsFactor / 256; // range[0.01, 1.00]
for (int i = 0; i < len; i++)
{
- double intraCost = intraCosts[i] * invQscales[i];
- double propagateAmount = (double)propagateIn[i] + intraCost * fps;
- double propagateNum = (double)intraCosts[i] - (interCosts[i] & ((1 << 14) - 1));
- double propagateDenom = (double)intraCosts[i];
+ int intraCost = intraCosts[i];
+ int interCost = X265_MIN(intraCosts[i], interCosts[i] & LOWRES_COST_MASK);
+ double propagateIntra = intraCost * invQscales[i]; // Q16 x Q8.8 = Q24.8
+ double propagateAmount = (double)propagateIn[i] + propagateIntra * fps; // Q16.0 + Q24.8 x Q0.x = Q25.0
+ double propagateNum = (double)(intraCost - interCost); // Q32 - Q32 = Q33.0
+
+#if 0
+ // algorithm whose output matches the asm
+ float intraRcp = (float)1.0f / intraCost; // VC can't map this to RCPPS
+ float intraRcpError1 = (float)intraCost * (float)intraRcp;
+ intraRcpError1 *= (float)intraRcp;
+ float intraRcpError2 = intraRcp + intraRcp;
+ float propagateDenom = intraRcpError2 - intraRcpError1;
+ dst[i] = (int)(propagateAmount * propagateNum * (double)propagateDenom + 0.5);
+#else
+ double propagateDenom = (double)intraCost; // Q32
dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
+#endif
}
}
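The disabled #if 0 branch above mirrors the RCPPS-plus-Newton-Raphson reciprocal that the previous asm used for 1/intraCost (the same sequence is kept, commented out, in mc-a2.asm below). A minimal scalar sketch of that refinement, illustrative only and not part of the patch:

    // One Newton-Raphson step refining a reciprocal estimate x0 ~= 1/a:
    //   x1 = x0 * (2 - a*x0)  ==  2*x0 - a*x0*x0
    // Each step roughly doubles the number of accurate bits, so the ~12-bit
    // RCPPS estimate comes out close to full single precision.
    static inline float refineRcp(float a, float x0)
    {
        return (x0 + x0) - (a * x0) * x0;
    }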
diff -r b26659ff36fd -r 9a51ae2c8d47 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Oct 08 15:27:31 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Thu Oct 08 15:27:34 2015 -0500
@@ -1030,6 +1030,7 @@
ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
+ p.propagateCost = PFX(mbtree_propagate_cost_sse2);
}
if (cpuMask & X265_CPU_SSE3)
{
@@ -2355,6 +2356,7 @@
ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
+ p.propagateCost = PFX(mbtree_propagate_cost_sse2);
}
if (cpuMask & X265_CPU_SSE3)
{
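With the entry added to both SSE2 blocks above, callers reach the new kernel through the primitives table. A hedged sketch of such a call; the buffer names are illustrative and the actual call site in the lookahead is not part of this patch:

    // Illustrative only: invoking the primitive through a populated
    // EncoderPrimitives table p (dst, propagateIn, intraCosts, interCosts,
    // invQscales and len are assumed to be valid lookahead buffers).
    double fpsFactor = 256.0;  // placeholder value
    p.propagateCost(dst, propagateIn, intraCosts, interCosts,
                    invQscales, &fpsFactor, len);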
diff -r b26659ff36fd -r 9a51ae2c8d47 source/common/x86/mc-a2.asm
--- a/source/common/x86/mc-a2.asm Thu Oct 08 15:27:31 2015 -0500
+++ b/source/common/x86/mc-a2.asm Thu Oct 08 15:27:34 2015 -0500
@@ -48,6 +48,8 @@
pd_16: times 4 dd 16
pd_0f: times 4 dd 0xffff
pf_inv256: times 8 dd 0.00390625
+const pd_inv256, times 4 dq 0.00390625
+const pd_0_5, times 4 dq 0.5
SECTION .text
@@ -989,75 +991,62 @@
%endif
;-----------------------------------------------------------------------------
-; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
+; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs,
+; uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len )
;-----------------------------------------------------------------------------
-%macro MBTREE 0
-cglobal mbtree_propagate_cost, 7,7,7
- add r6d, r6d
- lea r0, [r0+r6*2]
- add r1, r6
- add r2, r6
- add r3, r6
- add r4, r6
- neg r6
- pxor xmm4, xmm4
- movss xmm6, [r5]
- shufps xmm6, xmm6, 0
- mulps xmm6, [pf_inv256]
- movdqa xmm5, [pw_3fff]
+INIT_XMM sse2
+cglobal mbtree_propagate_cost, 6,6,7
+ movsd m6, [r5]
+ mulpd m6, [pd_inv256]
+ xor r5d, r5d
+ lea r0, [r0+r5*2]
+ pxor m4, m4
+ movlhps m6, m6
+ mova m5, [pw_3fff]
.loop:
- movq xmm2, [r2+r6] ; intra
- movq xmm0, [r4+r6] ; invq
- movq xmm3, [r3+r6] ; inter
- movq xmm1, [r1+r6] ; prop
- punpcklwd xmm2, xmm4
- punpcklwd xmm0, xmm4
- pmaddwd xmm0, xmm2
- pand xmm3, xmm5
- punpcklwd xmm1, xmm4
- punpcklwd xmm3, xmm4
-%if cpuflag(fma4)
- cvtdq2ps xmm0, xmm0
- cvtdq2ps xmm1, xmm1
- fmaddps xmm0, xmm0, xmm6, xmm1
- cvtdq2ps xmm1, xmm2
- psubd xmm2, xmm3
- cvtdq2ps xmm2, xmm2
- rcpps xmm3, xmm1
- mulps xmm1, xmm3
- mulps xmm0, xmm2
- addps xmm2, xmm3, xmm3
- fnmaddps xmm3, xmm1, xmm3, xmm2
- mulps xmm0, xmm3
-%else
- cvtdq2ps xmm0, xmm0
- mulps xmm0, xmm6 ; intra*invq*fps_factor>>8
- cvtdq2ps xmm1, xmm1 ; prop
- addps xmm0, xmm1 ; prop + (intra*invq*fps_factor>>8)
- cvtdq2ps xmm1, xmm2 ; intra
- psubd xmm2, xmm3 ; intra - inter
- cvtdq2ps xmm2, xmm2 ; intra - inter
- rcpps xmm3, xmm1 ; 1 / intra 1st approximation
- mulps xmm1, xmm3 ; intra * (1/intra 1st approx)
- mulps xmm1, xmm3 ; intra * (1/intra 1st approx)^2
- mulps xmm0, xmm2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
- addps xmm3, xmm3 ; 2 * (1/intra 1st approx)
- subps xmm3, xmm1 ; 2nd approximation for 1/intra
- mulps xmm0, xmm3 ; / intra
-%endif
- cvtps2dq xmm0, xmm0
- movdqa [r0+r6*2], xmm0
- add r6, 8
- jl .loop
+ movh m2, [r2+r5*4] ; intra
+ movh m0, [r4+r5*4] ; invq
+ movd m3, [r3+r5*2] ; inter
+ pand m3, m5
+ punpcklwd m3, m4
+
+ ; PMINSD emulation (SSE2 has no packed signed dword min)
+ pcmpgtd m1, m2, m3
+ pand m3, m1
+ pandn m1, m2
+ por m3, m1
+
+ movd m1, [r1+r5*2] ; prop
+ pmaddwd m0, m2
+ punpcklwd m1, m4
+ cvtdq2pd m0, m0
+ mulpd m0, m6 ; intra*invq*fps_factor>>8
+ cvtdq2pd m1, m1 ; prop
+ addpd m0, m1 ; prop + (intra*invq*fps_factor>>8)
+ ;cvtdq2ps m1, m2 ; intra
+ cvtdq2pd m1, m2 ; intra
+ psubd m2, m3 ; intra - inter
+ cvtdq2pd m2, m2 ; intra - inter
+ ;rcpps m3, m1
+ ;mulps m1, m3 ; intra * (1/intra 1st approx)
+ ;mulps m1, m3 ; intra * (1/intra 1st approx)^2
+ ;addps m3, m3 ; 2 * (1/intra 1st approx)
+ ;subps m3, m1 ; 2nd approximation for 1/intra
+ ;cvtps2pd m3, m3 ; 1 / intra 1st approximation
+ mulpd m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+ ;mulpd m0, m3 ; / intra
+
+ ; TODO: DIVPD is slow but matches the C model output; since this is not a bottleneck function, the faster code above is left commented out
+ divpd m0, m1
+ addpd m0, [pd_0_5]
+ cvttpd2dq m0, m0
+
+ movh [r0+r5*4], m0
+ add r5d, 2
+ cmp r5d, r6m
+ jl .loop
RET
-%endmacro
-INIT_XMM sse2
-MBTREE
-; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
-INIT_XMM fma4
-MBTREE
%macro INT16_UNPACK 1
vpunpckhwd xm4, xm%1, xm7
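Two notes on the loop above: SSE2 has no packed signed 32-bit minimum (PMINSD is SSE4.1), so the kernel selects min(intra, inter) with a compare mask, and the divide uses DIVPD rather than the RCPPS approximation so that the output bit-matches the C model. A scalar sketch of the mask-based min, illustrative only:

    // Scalar equivalent of the pcmpgtd/pand/pandn/por sequence used in place
    // of PMINSD: take inter where intra > inter, otherwise keep intra.
    static inline int32_t minViaMask(int32_t intra, int32_t inter)
    {
        int32_t mask = -(int32_t)(intra > inter);  // all-ones where intra > inter, else zero
        return (inter & mask) | (intra & ~mask);   // == std::min(intra, inter)
    }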
diff -r b26659ff36fd -r 9a51ae2c8d47 source/common/x86/mc.h
--- a/source/common/x86/mc.h Thu Oct 08 15:27:31 2015 -0500
+++ b/source/common/x86/mc.h Thu Oct 08 15:27:34 2015 -0500
@@ -36,4 +36,13 @@
#undef LOWRES
+#define PROPAGATE_COST(cpu) \
+ void PFX(mbtree_propagate_cost_ ## cpu)(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, \
+ const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
+
+PROPAGATE_COST(sse2)
+PROPAGATE_COST(avx)
+
+#undef PROPAGATE_COST
+
#endif // ifndef X265_MC_H
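Note that a prototype for an avx version is declared above as well, but this patch only wires the sse2 symbol into the dispatch tables. Written out, PROPAGATE_COST(sse2) simply declares (PFX() adds the build's symbol prefix):

    void PFX(mbtree_propagate_cost_sse2)(int* dst, const uint16_t* propagateIn,
                                         const int32_t* intraCosts, const uint16_t* interCosts,
                                         const int32_t* invQscales, const double* fpsFactor, int len);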