[x264-devel] commit: simd part of x264_macroblock_tree_propagate. (Loren Merritt)
git version control
git at videolan.org
Sun Aug 9 11:48:43 CEST 2009
x264 | branch: master | Loren Merritt <pengvado at akuvian.org> | Sun Aug 9 04:00:36 2009 +0000| [01a693d0c9483e2589b34e7c2cf2b59dae5c1ec2] | committer: Loren Merritt
simd part of x264_macroblock_tree_propagate.
1.6x faster on conroe.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=01a693d0c9483e2589b34e7c2cf2b59dae5c1ec2
---
common/macroblock.c | 3 ++-
common/mc.c | 29 +++++++++++++++++++++++++++++
common/mc.h | 3 +++
common/x86/mc-a2.asm | 41 +++++++++++++++++++++++++++++++++++++++++
common/x86/mc-c.c | 3 +++
encoder/encoder.c | 1 +
encoder/slicetype.c | 13 ++++++-------
tools/checkasm.c | 26 ++++++++++++++++++++++++++
8 files changed, 111 insertions(+), 8 deletions(-)
diff --git a/common/macroblock.c b/common/macroblock.c
index 836d203..16bb689 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -743,7 +743,8 @@ int x264_macroblock_cache_init( x264_t *h )
int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
- CHECKED_MALLOC( h->scratch_buffer, X264_MAX3( buf_hpel, buf_ssim, buf_tesa ) );
+ int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
+ CHECKED_MALLOC( h->scratch_buffer, X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_mbtree ) );
return 0;
fail: return -1;
diff --git a/common/mc.c b/common/mc.c
index e5d6cc8..2d5f66d 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -356,6 +356,33 @@ static void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth,
}
}
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+// 64-bit by 32-bit signed divide done with a single "idiv"; gcc isn't smart enough to use that instruction for the plain C expression
+static ALWAYS_INLINE int32_t div_64_32(int64_t x, int32_t y) {
+ int32_t quotient, remainder; // remainder only gives the asm a "d" output operand; it is never read
+ asm("idiv %4" // %4 is the fifth operand listed below, i.e. the divisor y
+ :"=a"(quotient), "=d"(remainder) // outputs: quotient from the "a" register, remainder from "d"
+ :"a"((uint32_t)x), "d"((int32_t)(x>>32)), "r"(y) // inputs: low 32 bits of x in "a", high 32 bits in "d"
+ );
+ return quotient; // NOTE(review): nothing guards against y == 0 or a quotient that does not fit in 32 bits -- confirm callers avoid both
+}
+#else // other architectures: ordinary C division
+#define div_64_32(x,y) ((x)/(y))
+#endif
+
+/* Estimate the total amount of influence on future quality that could be had if we
+ * were to improve the reference samples used to inter predict any given macroblock. */
+static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, // dst receives len results
+ uint16_t *inter_costs, uint16_t *inv_qscales, int len ) // every input array is read at indices 0..len-1
+{
+ int i;
+ for( i=0; i<len; i++ )
+ {
+ int propagate_amount = propagate_in[i] + ((intra_costs[i] * inv_qscales[i] + 128)>>8); // incoming cost plus intra*inv_qscale, rounded and scaled down by 256
+ dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - inter_costs[i]), intra_costs[i]); // scale by (intra-inter)/intra; NOTE(review): intra_costs[i] == 0 would divide by zero
+ }
+}
+
void x264_mc_init( int cpu, x264_mc_functions_t *pf )
{
pf->mc_luma = mc_luma;
@@ -392,6 +419,8 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
pf->integral_init4v = integral_init4v;
pf->integral_init8v = integral_init8v;
+ pf->mbtree_propagate_cost = mbtree_propagate_cost;
+
#ifdef HAVE_MMX
x264_mc_init_mmx( cpu, pf );
#endif
diff --git a/common/mc.h b/common/mc.h
index 594940f..556ae83 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -74,6 +74,9 @@ typedef struct
void (*frame_init_lowres_core)( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
int src_stride, int dst_stride, int width, int height );
+
+ void (*mbtree_propagate_cost)( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, int len );
} x264_mc_functions_t;
void x264_mc_init( int cpu, x264_mc_functions_t *pf );
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 9745ac6..e79be5e 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -34,6 +34,7 @@ filt_mul51: times 8 db 1, -5
pw_1: times 8 dw 1
pw_16: times 8 dw 16
pw_32: times 8 dw 32
+pd_128: times 4 dd 128
SECTION .text
@@ -1081,3 +1082,43 @@ INIT_XMM
FRAME_INIT_LOWRES sse2, 12
%define PALIGNR PALIGNR_SSSE3
FRAME_INIT_LOWRES ssse3, 12
+
+;-----------------------------------------------------------------------------
+; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+; uint16_t *inter_costs, uint16_t *inv_qscales, int len )
+;-----------------------------------------------------------------------------
+cglobal x264_mbtree_propagate_cost_sse2, 6,6
+ shl r5d, 1 ; len*2 = byte length of each uint16_t input array
+ lea r0, [r0+r5*2] ; point past the end of dst (4 bytes per element)
+ lea r1, [r1+r5] ; ... and past the end of propagate_in
+ lea r2, [r2+r5] ; ... intra_costs
+ lea r3, [r3+r5] ; ... inter_costs
+ lea r4, [r4+r5] ; ... inv_qscales
+ neg r5 ; negative byte offset that counts up towards zero
+ pxor xmm5, xmm5 ; all zeros, interleaved below to widen words to dwords
+ movdqa xmm4, [pd_128 GLOBAL] ; rounding term 128 in each dword
+.loop:
+ movq xmm2, [r2+r5] ; intra
+ movq xmm0, [r4+r5] ; invq
+ punpcklwd xmm2, xmm5 ; zero-extend 4 words to 4 dwords
+ punpcklwd xmm0, xmm5
+ pmaddwd xmm0, xmm2 ; intra*invq; NOTE(review): pmaddwd is a signed word multiply, so words above 0x7fff would presumably differ from the C version -- confirm
+ paddd xmm0, xmm4 ; + 128
+ psrld xmm0, 8 ; intra*invq>>8
+ movq xmm1, [r1+r5] ; prop
+ movq xmm3, [r3+r5] ; inter
+ punpcklwd xmm1, xmm5
+ punpcklwd xmm3, xmm5
+ paddd xmm0, xmm1 ; prop + (intra*invq>>8)
+ cvtdq2ps xmm1, xmm2 ; intra
+ psubd xmm2, xmm3 ; intra - inter
+ cvtdq2ps xmm0, xmm0 ; the remaining arithmetic is done in single-precision float
+ cvtdq2ps xmm2, xmm2
+ mulps xmm0, xmm2 ; (prop + (intra*invq>>8)) * (intra - inter)
+ divps xmm0, xmm1 ; / intra
+ cvttps2dq xmm0, xmm0 ; truncation isn't really desired, but matches the integer implementation
+ movdqa [r0+r5*2], xmm0 ; aligned store of 4 results; a len that is not a multiple of 4 is rounded up, so dst needs that padding
+ add r5, 8 ; 4 uint16_t elements consumed per iteration
+ jl .loop
+ REP_RET
+
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index f69b99c..06ef579 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -74,6 +74,8 @@ extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int strid
extern void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
extern void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
+extern void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, int len );
#define LOWRES(cpu) \
extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
int src_stride, int dst_stride, int width, int height );
@@ -303,6 +305,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->integral_init4v = x264_integral_init4v_sse2;
pf->integral_init8v = x264_integral_init8v_sse2;
pf->hpel_filter = x264_hpel_filter_sse2_amd;
+ pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
if( cpu&X264_CPU_SSE2_IS_SLOW )
return;
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 96ed35f..6de7346 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -648,6 +648,7 @@ static int x264_validate_parameters( x264_t *h )
BOOLIFY( analyse.b_fast_pskip );
BOOLIFY( rc.b_stat_write );
BOOLIFY( rc.b_stat_read );
+ BOOLIFY( rc.b_mb_tree );
#undef BOOLIFY
return 0;
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index f0fe2b4..207995c 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -406,22 +406,21 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in
int dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32;
int16_t (*mvs[2])[2] = { frames[b]->lowres_mvs[0][b-p0-1], frames[b]->lowres_mvs[1][p1-b-1] };
+ int *buf = h->scratch_buffer;
for( h->mb.i_mb_y = 0; h->mb.i_mb_y < h->sps->i_mb_height; h->mb.i_mb_y++ )
{
int mb_index = h->mb.i_mb_y*h->mb.i_mb_stride;
+ h->mc.mbtree_propagate_cost( buf, frames[b]->i_propagate_cost+mb_index,
+ frames[b]->i_intra_cost+mb_index, frames[b]->lowres_costs[b-p0][p1-b]+mb_index,
+ frames[b]->i_inv_qscale_factor+mb_index, h->sps->i_mb_width );
for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->sps->i_mb_width; h->mb.i_mb_x++, mb_index++ )
{
- int inter_cost = frames[b]->lowres_costs[b-p0][p1-b][mb_index];
- int intra_cost = frames[b]->i_intra_cost[mb_index];
-
+ int propagate_amount = buf[h->mb.i_mb_x];
/* Don't propagate for an intra block. */
- if( inter_cost < intra_cost )
+ if( propagate_amount > 0 )
{
int lists_used = frames[b]->lowres_inter_types[b-p0][p1-b][mb_index];
- /* The approximate amount of data that this block contains. */
- int propagate_amount = frames[b]->i_propagate_cost[mb_index] + ((intra_cost * frames[b]->i_inv_qscale_factor[mb_index] + 128)>>8);
- propagate_amount = ((uint64_t)propagate_amount*(intra_cost-inter_cost)) / intra_cost;
int list;
/* Follow the MVs to the previous frame(s). */
for( list = 0; list < 2; list++ )
diff --git a/tools/checkasm.c b/tools/checkasm.c
index d65221b..2a05392 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -960,6 +960,32 @@ static int check_mc( int cpu_ref, int cpu_new )
INTEGRAL_INIT( integral_init8v, 9, sum, stride );
report( "integral init :" );
+ if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost )
+ {
+ ok = 1; used_asm = 1;
+ set_func_name( "mbtree_propagate" );
+ int *dsta = (int*)buf3;
+ int *dstc = dsta+400;
+ uint16_t *prop = (uint16_t*)buf1;
+ uint16_t *intra = (uint16_t*)buf4;
+ uint16_t *inter = intra+400;
+ uint16_t *qscale = inter+400;
+ uint16_t *rand = (uint16_t*)buf2;
+ for( i=0; i<400; i++ )
+ {
+ intra[i] = *rand++ & 0x7fff;
+ intra[i] += !intra[i];
+ inter[i] = *rand++ & 0x7fff;
+ qscale[i] = *rand++ & 0x7fff;
+ }
+ call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, 400 );
+ call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, 400 );
+ // I don't care about exact rounding, this is just how close the floating-point implementation happens to be
+ for( i=0; i<400; i++ )
+ ok &= abs(dstc[i]-dsta[i]) <= (abs(dstc[i])>512) || fabs((double)dstc[i]/dsta[i]-1) < 1e-6;
+ report( "mbtree propagate :" );
+ }
+
return ret;
}
More information about the x264-devel
mailing list