[x265] [PATCH] asm: rewrote and enable AVX/AVX2 version of propagateCost, 10378c -> 5392c (1.92x)
Min Chen
chenm003 at 163.com
Fri Oct 9 19:27:58 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1444410026 18000
# Node ID 733f32812fe80dbd348bf89848ce14313e0fd583
# Parent b6156a08b1def3584647f26096866c1a0c11e54a
asm: rewrote and enable AVX/AVX2 version of propagateCost, 10378c -> 5392c (1.92x)
---
source/common/x86/asm-primitives.cpp | 4 +
source/common/x86/mc-a2.asm | 115 +++++++++++++--------------------
source/common/x86/mc.h | 1 +
3 files changed, 50 insertions(+), 70 deletions(-)
diff -r b6156a08b1de -r 733f32812fe8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Oct 09 20:45:59 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Oct 09 12:00:26 2015 -0500
@@ -1306,6 +1306,7 @@
p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx);
p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
+ p.propagateCost = PFX(mbtree_propagate_cost_avx);
}
if (cpuMask & X265_CPU_XOP)
{
@@ -2157,6 +2158,7 @@
p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx2);
p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
+ p.propagateCost = PFX(mbtree_propagate_cost_avx2);
// TODO: depends on hps and vsp
ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu); // calling luma_hvpp for all sizes
@@ -2653,6 +2655,7 @@
p.pu[LUMA_48x64].copy_pp = PFX(blockcopy_pp_48x64_avx);
p.frameInitLowres = PFX(frame_init_lowres_core_avx);
+ p.propagateCost = PFX(mbtree_propagate_cost_avx);
}
if (cpuMask & X265_CPU_XOP)
{
@@ -3649,6 +3652,7 @@
p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx2);
p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
+ p.propagateCost = PFX(mbtree_propagate_cost_avx2);
if (cpuMask & X265_CPU_BMI2)
p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
diff -r b6156a08b1de -r 733f32812fe8 source/common/x86/mc-a2.asm
--- a/source/common/x86/mc-a2.asm Fri Oct 09 20:45:59 2015 +0530
+++ b/source/common/x86/mc-a2.asm Fri Oct 09 12:00:26 2015 -0500
@@ -1003,6 +1003,7 @@
pxor m4, m4
movlhps m6, m6
mova m5, [pw_3fff]
+
.loop:
movh m2, [r2+r5*4] ; intra
movh m0, [r4+r5*4] ; invq
@@ -1048,81 +1049,55 @@
RET
-%macro INT16_UNPACK 1
- vpunpckhwd xm4, xm%1, xm7
- vpunpcklwd xm%1, xm7
- vinsertf128 m%1, m%1, xm4, 1
-%endmacro
-
+;-----------------------------------------------------------------------------
+; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs,
+; uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len )
+;-----------------------------------------------------------------------------
; FIXME: align loads/stores to 16 bytes
%macro MBTREE_AVX 0
-cglobal mbtree_propagate_cost, 7,7,8
- add r6d, r6d
- lea r0, [r0+r6*2]
- add r1, r6
- add r2, r6
- add r3, r6
- add r4, r6
- neg r6
- mova xm5, [pw_3fff]
- vbroadcastss m6, [r5]
- mulps m6, [pf_inv256]
-%if notcpuflag(avx2)
- pxor xm7, xm7
+cglobal mbtree_propagate_cost, 6,6,7
+ vbroadcastsd m6, [r5]
+ mulpd m6, [pd_inv256]
+ xor r5d, r5d
+ lea r0, [r0+r5*2]
+ mova m5, [pw_3fff]
+
+.loop:
+ movu xm2, [r2+r5*4] ; intra
+ movu xm0, [r4+r5*4] ; invq
+ pmovzxwd xm3, [r3+r5*2] ; inter
+ pand xm3, xm5
+ pminsd xm3, xm2
+
+ pmovzxwd xm1, [r1+r5*2] ; prop
+ pmaddwd xm0, xm2
+ cvtdq2pd m0, xm0
+ cvtdq2pd m1, xm1 ; prop
+%if cpuflag(avx2)
+ fmaddpd m0, m0, m6, m1
+%else
+ mulpd m0, m6 ; intra*invq*fps_factor>>8
+ addpd m0, m1 ; prop + (intra*invq*fps_factor>>8)
%endif
-.loop:
-%if cpuflag(avx2)
- pmovzxwd m0, [r2+r6] ; intra
- pmovzxwd m1, [r4+r6] ; invq
- pmovzxwd m2, [r1+r6] ; prop
- pand xm3, xm5, [r3+r6] ; inter
- pmovzxwd m3, xm3
- pmaddwd m1, m0
- psubd m4, m0, m3
- cvtdq2ps m0, m0
- cvtdq2ps m1, m1
- cvtdq2ps m2, m2
- cvtdq2ps m4, m4
- fmaddps m1, m1, m6, m2
- rcpps m3, m0
- mulps m2, m0, m3
- mulps m1, m4
- addps m4, m3, m3
- fnmaddps m4, m2, m3, m4
- mulps m1, m4
-%else
- movu xm0, [r2+r6]
- movu xm1, [r4+r6]
- movu xm2, [r1+r6]
- pand xm3, xm5, [r3+r6]
- INT16_UNPACK 0
- INT16_UNPACK 1
- INT16_UNPACK 2
- INT16_UNPACK 3
- cvtdq2ps m0, m0
- cvtdq2ps m1, m1
- cvtdq2ps m2, m2
- cvtdq2ps m3, m3
- mulps m1, m0
- subps m4, m0, m3
- mulps m1, m6 ; intra*invq*fps_factor>>8
- addps m1, m2 ; prop + (intra*invq*fps_factor>>8)
- rcpps m3, m0 ; 1 / intra 1st approximation
- mulps m2, m0, m3 ; intra * (1/intra 1st approx)
- mulps m2, m3 ; intra * (1/intra 1st approx)^2
- mulps m1, m4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
- addps m3, m3 ; 2 * (1/intra 1st approx)
- subps m3, m2 ; 2nd approximation for 1/intra
- mulps m1, m3 ; / intra
-%endif
- vcvtps2dq m1, m1
- movu [r0+r6*2], m1
- add r6, 16
- jl .loop
+ cvtdq2pd m1, xm2 ; intra
+ psubd m2, m3 ; intra - inter
+ cvtdq2pd m2, xm2 ; intra - inter
+ mulpd m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+
+ ; TODO: DIVPD is very slow, but it matches the C model output exactly; since this is not a bottleneck function, the faster reciprocal-approximation code has been removed
+ divpd m0, m1
+ addpd m0, [pd_0_5]
+ cvttpd2dq xm0, m0
+
+ movu [r0+r5*4], xm0
+ add r5d, 4
+ cmp r5d, r6m
+ jl .loop
RET
%endmacro
INIT_YMM avx
MBTREE_AVX
-INIT_YMM avx2,fma3
+
+INIT_YMM avx2
MBTREE_AVX
diff -r b6156a08b1de -r 733f32812fe8 source/common/x86/mc.h
--- a/source/common/x86/mc.h Fri Oct 09 20:45:59 2015 +0530
+++ b/source/common/x86/mc.h Fri Oct 09 12:00:26 2015 -0500
@@ -42,6 +42,7 @@
PROPAGATE_COST(sse2)
PROPAGATE_COST(avx)
+PROPAGATE_COST(avx2)
#undef PROPAGATE_COST
More information about the x265-devel
mailing list