[x264-devel] x86: AVX-512 mbtree_propagate_cost
Henrik Gramner
git at videolan.org
Mon May 22 00:03:24 CEST 2017
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Tue Mar 28 22:59:56 2017 +0200| [3451ba3af49e58a720277615df3d8e4a4171986f] | committer: Henrik Gramner
x86: AVX-512 mbtree_propagate_cost
Also make the AVX and AVX2 implementations slightly faster.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=3451ba3af49e58a720277615df3d8e4a4171986f
---
common/frame.c | 6 ++++--
common/macroblock.c | 2 +-
common/x86/mc-a2.asm | 49 ++++++++++++++++++++++++++++++++++++++++---------
common/x86/mc-c.c | 22 ++++++++++++++--------
tools/checkasm.c | 2 +-
5 files changed, 60 insertions(+), 21 deletions(-)
diff --git a/common/frame.c b/common/frame.c
index 2cbcf1e5..a81e9b10 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -223,11 +223,13 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
PREALLOC( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
PREALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
}
- PREALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) );
+ PREALLOC( frame->i_propagate_cost, i_mb_count * sizeof(uint16_t) );
for( int j = 0; j <= h->param.i_bframe+1; j++ )
for( int i = 0; i <= h->param.i_bframe+1; i++ )
- PREALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
+ PREALLOC( frame->lowres_costs[j][i], i_mb_count * sizeof(uint16_t) );
+ /* mbtree asm can overread the input buffers, make sure we don't read outside of allocated memory. */
+ prealloc_size += NATIVE_ALIGN;
}
if( h->param.rc.i_aq_mode )
{
diff --git a/common/macroblock.c b/common/macroblock.c
index e5097a6d..0aa09de4 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -388,7 +388,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
}
- int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int16_t);
+ int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+15)&~15) * sizeof(int16_t);
scratch_size = X264_MAX( scratch_size, buf_mbtree );
if( scratch_size )
CHECKED_MALLOC( h->scratch_buffer, scratch_size );
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index b3cb4634..e0b4a0cd 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -2147,13 +2147,13 @@ MBTREE
cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
vbroadcastss m5, [r5]
mov r5d, r6m
- lea r0, [r0+r5*2]
+ lea r2, [r2+r5*2]
add r5d, r5d
- add r1, r5
- add r2, r5
- add r3, r5
add r4, r5
neg r5
+ sub r1, r5
+ sub r3, r5
+ sub r0, r5
mova xm4, [pw_3fff]
%if notcpuflag(avx2)
pxor xm7, xm7
@@ -2165,9 +2165,8 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
pmovzxwd m2, [r1+r5] ; prop
pand xm3, xm4, [r3+r5] ; inter
pmovzxwd m3, xm3
- pminsd m3, m0
pmaddwd m1, m0
- psubd m3, m0, m3
+ psubusw m3, m0, m3
cvtdq2ps m0, m0
cvtdq2ps m1, m1
cvtdq2ps m2, m2
@@ -2184,7 +2183,7 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
movu xm1, [r4+r5]
movu xm2, [r1+r5]
pand xm3, xm4, [r3+r5]
- pminsw xm3, xm0
+ psubusw xm3, xm0, xm3
INT16_UNPACK 0
INT16_UNPACK 1
INT16_UNPACK 2
@@ -2194,7 +2193,6 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
cvtdq2ps m2, m2
cvtdq2ps m3, m3
mulps m1, m0
- subps m3, m0, m3
mulps m1, m5 ; intra*invq*fps_factor>>8
addps m1, m2 ; prop + (intra*invq*fps_factor>>8)
rcpps m2, m0 ; 1 / intra 1st approximation
@@ -2205,7 +2203,7 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
subps m2, m0 ; 2nd approximation for 1/intra
mulps m1, m2 ; / intra
%endif
- vcvtps2dq m1, m1
+ cvtps2dq m1, m1
vextractf128 xm2, m1, 1
packssdw xm1, xm2
mova [r0+r5], xm1
@@ -2219,6 +2217,39 @@ MBTREE_AVX
INIT_YMM avx2
MBTREE_AVX
+INIT_ZMM avx512
+cglobal mbtree_propagate_cost, 6,6
+ vbroadcastss m5, [r5]
+ mov r5d, 0x3fff3fff
+ vpbroadcastd ym4, r5d
+ mov r5d, r6m
+ lea r2, [r2+r5*2]
+ add r5d, r5d
+ add r1, r5
+ neg r5
+ sub r4, r5
+ sub r3, r5
+ sub r0, r5
+.loop:
+ pmovzxwd m0, [r2+r5] ; intra
+ pmovzxwd m1, [r1+r5] ; prop
+ pmovzxwd m2, [r4+r5] ; invq
+ pand ym3, ym4, [r3+r5] ; inter
+ pmovzxwd m3, ym3
+ psubusw m3, m0, m3
+ cvtdq2ps m0, m0
+ cvtdq2ps m1, m1
+ cvtdq2ps m2, m2
+ cvtdq2ps m3, m3
+ vdivps m1, m0, {rn-sae}
+ fmaddps m1, m2, m5, m1
+ mulps m1, m3
+ cvtps2dq m1, m1
+ vpmovsdw [r0+r5], m1
+ add r5, 32
+ jl .loop
+ RET
+
%macro MBTREE_PROPAGATE_LIST 0
;-----------------------------------------------------------------------------
; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs,
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 90a6cc19..f6e349a9 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -160,14 +160,16 @@ void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride
void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride );
-void x264_mbtree_propagate_cost_sse2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
- uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
- uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
- uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
- uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_sse2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_fma4 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_avx2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_avx512( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_fix8_pack_ssse3( uint16_t *dst, float *src, int count );
void x264_mbtree_fix8_pack_avx2 ( uint16_t *dst, float *src, int count );
void x264_mbtree_fix8_unpack_ssse3( float *dst, uint16_t *src, int count );
@@ -864,4 +866,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx2;
pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_avx2;
pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx2;
+
+ if( !(cpu&X264_CPU_AVX512) )
+ return;
+ pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx512;
}
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 7575d3f4..7ada2791 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1743,7 +1743,7 @@ static int check_mc( int cpu_ref, int cpu_new )
{
ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4;
if( !ok )
- fprintf( stderr, "mbtree_propagate_cost FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] );
+ fprintf( stderr, "mbtree_propagate_cost FAILED: %d !~= %d\n", dstc[j], dsta[j] );
}
}
}
More information about the x264-devel mailing list