[x264-devel] x86: Minor mbtree_propagate_cost improvements
Henrik Gramner
git at videolan.org
Thu Mar 13 21:23:52 CET 2014
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sun Feb 16 21:24:54 2014 +0100| [438a7e20a55177fdd5e285646cfdb5e96106a2eb] | committer: Jason Garrett-Glaser
x86: Minor mbtree_propagate_cost improvements
Reduce the number of general-purpose registers used from 7 to 6.
Reduce the number of vector registers used by the AVX2 implementation from 8 to 7.
Multiply fps_factor by 1/256 once per frame instead of once per macroblock row.
Use mova instead of movu for dst since it's guaranteed to be aligned.
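Some cosmetic changes (e.g. using the m#/xm# register aliases instead of explicit xmm# names, and dropping redundant v-prefixes on mnemonics).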
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=438a7e20a55177fdd5e285646cfdb5e96106a2eb
---
common/mc.c | 2 +-
common/x86/mc-a2.asm | 163 +++++++++++++++++++++++++-------------------------
encoder/slicetype.c | 2 +-
tools/checkasm.c | 2 +-
4 files changed, 84 insertions(+), 85 deletions(-)
diff --git a/common/mc.c b/common/mc.c
index c7a544f..7147496 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -486,7 +486,7 @@ static void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel
static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
{
- float fps = *fps_factor / 256.f;
+ float fps = *fps_factor;
for( int i = 0; i < len; i++ )
{
float intra_cost = intra_costs[i] * inv_qscales[i];
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 04872cf..89d6854 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -56,7 +56,6 @@ pw_1024: times 16 dw 1024
pd_16: times 4 dd 16
pd_0f: times 4 dd 0xffff
-pf_inv256: times 8 dd 0.00390625
pad10: times 8 dw 10*PIXEL_MAX
pad20: times 8 dw 20*PIXEL_MAX
@@ -1881,62 +1880,62 @@ FRAME_INIT_LOWRES
; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
%macro MBTREE 0
-cglobal mbtree_propagate_cost, 7,7,7
- add r6d, r6d
- lea r0, [r0+r6*2]
- add r1, r6
- add r2, r6
- add r3, r6
- add r4, r6
- neg r6
- pxor xmm4, xmm4
- movss xmm6, [r5]
- shufps xmm6, xmm6, 0
- mulps xmm6, [pf_inv256]
- movdqa xmm5, [pw_3fff]
+cglobal mbtree_propagate_cost, 6,6,7
+ movss m6, [r5]
+ mov r5d, r6m
+ lea r0, [r0+r5*4]
+ add r5d, r5d
+ add r1, r5
+ add r2, r5
+ add r3, r5
+ add r4, r5
+ neg r5
+ pxor m4, m4
+ shufps m6, m6, 0
+ mova m5, [pw_3fff]
.loop:
- movq xmm2, [r2+r6] ; intra
- movq xmm0, [r4+r6] ; invq
- movq xmm3, [r3+r6] ; inter
- movq xmm1, [r1+r6] ; prop
- punpcklwd xmm2, xmm4
- punpcklwd xmm0, xmm4
- pmaddwd xmm0, xmm2
- pand xmm3, xmm5
- punpcklwd xmm1, xmm4
- punpcklwd xmm3, xmm4
+ movq m2, [r2+r5] ; intra
+ movq m0, [r4+r5] ; invq
+ movq m3, [r3+r5] ; inter
+ movq m1, [r1+r5] ; prop
+ punpcklwd m2, m4
+ punpcklwd m0, m4
+ pmaddwd m0, m2
+ pand m3, m5
+ punpcklwd m1, m4
+ punpcklwd m3, m4
%if cpuflag(fma4)
- cvtdq2ps xmm0, xmm0
- cvtdq2ps xmm1, xmm1
- fmaddps xmm0, xmm0, xmm6, xmm1
- cvtdq2ps xmm1, xmm2
- psubd xmm2, xmm3
- cvtdq2ps xmm2, xmm2
- rcpps xmm3, xmm1
- mulps xmm1, xmm3
- mulps xmm0, xmm2
- addps xmm2, xmm3, xmm3
- fnmaddps xmm3, xmm1, xmm3, xmm2
- mulps xmm0, xmm3
+ cvtdq2ps m0, m0
+ cvtdq2ps m1, m1
+ fmaddps m0, m0, m6, m1
+ cvtdq2ps m1, m2
+ psubd m2, m3
+ cvtdq2ps m2, m2
+ rcpps m3, m1
+ mulps m1, m3
+ mulps m0, m2
+ addps m2, m3, m3
+ fnmaddps m3, m1, m3, m2
+ mulps m0, m3
%else
- cvtdq2ps xmm0, xmm0
- mulps xmm0, xmm6 ; intra*invq*fps_factor>>8
- cvtdq2ps xmm1, xmm1 ; prop
- addps xmm0, xmm1 ; prop + (intra*invq*fps_factor>>8)
- cvtdq2ps xmm1, xmm2 ; intra
- psubd xmm2, xmm3 ; intra - inter
- cvtdq2ps xmm2, xmm2 ; intra - inter
- rcpps xmm3, xmm1 ; 1 / intra 1st approximation
- mulps xmm1, xmm3 ; intra * (1/intra 1st approx)
- mulps xmm1, xmm3 ; intra * (1/intra 1st approx)^2
- mulps xmm0, xmm2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
- addps xmm3, xmm3 ; 2 * (1/intra 1st approx)
- subps xmm3, xmm1 ; 2nd approximation for 1/intra
- mulps xmm0, xmm3 ; / intra
+ cvtdq2ps m0, m0
+ mulps m0, m6 ; intra*invq*fps_factor>>8
+ cvtdq2ps m1, m1 ; prop
+ addps m0, m1 ; prop + (intra*invq*fps_factor>>8)
+ cvtdq2ps m1, m2 ; intra
+ psubd m2, m3 ; intra - inter
+ cvtdq2ps m2, m2 ; intra - inter
+ rcpps m3, m1 ; 1 / intra 1st approximation
+ mulps m1, m3 ; intra * (1/intra 1st approx)
+ mulps m1, m3 ; intra * (1/intra 1st approx)^2
+ mulps m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+ addps m3, m3 ; 2 * (1/intra 1st approx)
+ subps m3, m1 ; 2nd approximation for 1/intra
+ mulps m0, m3 ; / intra
%endif
- cvtps2dq xmm0, xmm0
- movdqa [r0+r6*2], xmm0
- add r6, 8
+ cvtps2dq m0, m0
+ mova [r0+r5*2], m0
+ add r5, 8
jl .loop
RET
%endmacro
@@ -1948,33 +1947,33 @@ INIT_XMM fma4
MBTREE
%macro INT16_UNPACK 1
- vpunpckhwd xm4, xm%1, xm7
- vpunpcklwd xm%1, xm7
- vinsertf128 m%1, m%1, xm4, 1
+ punpckhwd xm4, xm%1, xm7
+ punpcklwd xm%1, xm7
+ vinsertf128 m%1, m%1, xm4, 1
%endmacro
-; FIXME: align loads/stores to 16 bytes
-%macro MBTREE_AVX 0
-cglobal mbtree_propagate_cost, 7,7,8
- add r6d, r6d
- lea r0, [r0+r6*2]
- add r1, r6
- add r2, r6
- add r3, r6
- add r4, r6
- neg r6
- mova xm5, [pw_3fff]
- vbroadcastss m6, [r5]
- mulps m6, [pf_inv256]
+; FIXME: align loads to 16 bytes
+%macro MBTREE_AVX 1
+cglobal mbtree_propagate_cost, 6,6,%1
+ vbroadcastss m6, [r5]
+ mov r5d, r6m
+ lea r0, [r0+r5*4]
+ add r5d, r5d
+ add r1, r5
+ add r2, r5
+ add r3, r5
+ add r4, r5
+ neg r5
+ mova xm5, [pw_3fff]
%if notcpuflag(avx2)
- pxor xm7, xm7
+ pxor xm7, xm7
%endif
.loop:
%if cpuflag(avx2)
- pmovzxwd m0, [r2+r6] ; intra
- pmovzxwd m1, [r4+r6] ; invq
- pmovzxwd m2, [r1+r6] ; prop
- pand xm3, xm5, [r3+r6] ; inter
+ pmovzxwd m0, [r2+r5] ; intra
+ pmovzxwd m1, [r4+r5] ; invq
+ pmovzxwd m2, [r1+r5] ; prop
+ pand xm3, xm5, [r3+r5] ; inter
pmovzxwd m3, xm3
pmaddwd m1, m0
psubd m4, m0, m3
@@ -1990,10 +1989,10 @@ cglobal mbtree_propagate_cost, 7,7,8
fnmaddps m4, m2, m3, m4
mulps m1, m4
%else
- movu xm0, [r2+r6]
- movu xm1, [r4+r6]
- movu xm2, [r1+r6]
- pand xm3, xm5, [r3+r6]
+ movu xm0, [r2+r5]
+ movu xm1, [r4+r5]
+ movu xm2, [r1+r5]
+ pand xm3, xm5, [r3+r5]
INT16_UNPACK 0
INT16_UNPACK 1
INT16_UNPACK 2
@@ -2015,13 +2014,13 @@ cglobal mbtree_propagate_cost, 7,7,8
mulps m1, m3 ; / intra
%endif
vcvtps2dq m1, m1
- movu [r0+r6*2], m1
- add r6, 16
+ mova [r0+r5*2], m1
+ add r5, 16
jl .loop
RET
%endmacro
INIT_YMM avx
-MBTREE_AVX
+MBTREE_AVX 8
INIT_YMM avx2,fma3
-MBTREE_AVX
+MBTREE_AVX 7
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index f112a8f..b11172d 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -1055,7 +1055,7 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, fl
uint16_t *propagate_cost = frames[b]->i_propagate_cost;
x264_emms();
- float fps_factor = CLIP_DURATION(frames[b]->f_duration) / CLIP_DURATION(average_duration);
+ float fps_factor = CLIP_DURATION(frames[b]->f_duration) / (CLIP_DURATION(average_duration) * 256.0f);
/* For non-reffed frames the source costs are always zero, so just memset one row and re-use it. */
if( !referenced )
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 7805c01..2a2d1dd 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1573,7 +1573,7 @@ static int check_mc( int cpu_ref, int cpu_new )
x264_emms();
for( int i = 0; i < 10; i++ )
{
- float fps_factor = (rand()&65535) / 256.;
+ float fps_factor = (rand()&65535) / 65535.0f;
set_func_name( "mbtree_propagate" );
int *dsta = (int*)buf3;
int *dstc = dsta+400;
More information about the x264-devel mailing list