[x264-devel] commit: Reduce lookahead memory usage, cache misses (Jason Garrett-Glaser )
git at videolan.org
git at videolan.org
Thu Apr 29 19:58:08 CEST 2010
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Mon Apr 26 15:10:11 2010 -0700| [41b877e0ab14a77a211e0cd087bf9c3726b2c2f6] | committer: Jason Garrett-Glaser
Reduce lookahead memory usage, cache misses
Merge lowres_types with lowres_costs.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=41b877e0ab14a77a211e0cd087bf9c3726b2c2f6
---
common/frame.c | 6 ------
common/frame.h | 9 +++++++--
common/mc.c | 2 +-
common/x86/mc-a2.asm | 4 +++-
encoder/slicetype.c | 16 ++++++++--------
5 files changed, 19 insertions(+), 18 deletions(-)
diff --git a/common/frame.c b/common/frame.c
index 90783cd..fa8eec0 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -148,10 +148,7 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
for( int j = 0; j <= h->param.i_bframe+1; j++ )
for( int i = 0; i <= h->param.i_bframe+1; i++ )
- {
CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
- CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
- }
frame->i_intra_cost = frame->lowres_costs[0][0];
memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
}
@@ -199,10 +196,7 @@ void x264_frame_delete( x264_frame_t *frame )
x264_free( frame->i_propagate_cost );
for( int j = 0; j <= X264_BFRAME_MAX+1; j++ )
for( int i = 0; i <= X264_BFRAME_MAX+1; i++ )
- {
x264_free( frame->lowres_costs[j][i] );
- x264_free( frame->lowres_inter_types[j][i] );
- }
x264_free( frame->f_qp_offset );
x264_free( frame->f_qp_offset_aq );
x264_free( frame->i_inv_qscale_factor );
diff --git a/common/frame.h b/common/frame.h
index 419da08..2d6ea0b 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -84,9 +84,14 @@ typedef struct x264_frame
uint8_t *mb_partition;
int16_t (*mv[2])[2];
int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
+
+ /* Stored as (lists_used << LOWRES_COST_SHIFT) + (cost).
+ * Doesn't need special addressing for intra cost because
+ * lists_used is guaranteed to be zero in that case. */
uint16_t (*lowres_costs[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
- /* Actually a width-2 bitfield with 4 values per uint8_t. */
- uint8_t (*lowres_inter_types[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
+ #define LOWRES_COST_MASK ((1<<14)-1)
+ #define LOWRES_COST_SHIFT 14
+
int *lowres_mv_costs[2][X264_BFRAME_MAX+1];
int8_t *ref[2];
int i_ref[2];
diff --git a/common/mc.c b/common/mc.c
index 859e5fc..ad7fe79 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -427,7 +427,7 @@ static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *i
for( int i = 0; i < len; i++ )
{
int propagate_amount = propagate_in[i] + ((intra_costs[i] * inv_qscales[i] + 128)>>8);
- dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - inter_costs[i]), intra_costs[i]);
+ dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK)), intra_costs[i]);
}
}
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 20ef5d7..33659a3 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -37,6 +37,7 @@ pw_1: times 8 dw 1
pw_16: times 8 dw 16
pw_32: times 8 dw 32
pd_128: times 4 dd 128
+pw_0x3fff: times 4 dw 0x3fff
SECTION .text
@@ -1132,8 +1133,9 @@ cglobal x264_mbtree_propagate_cost_sse2, 6,6
pmaddwd xmm0, xmm2
paddd xmm0, xmm4
psrld xmm0, 8 ; intra*invq>>8
- movq xmm1, [r1+r5] ; prop
movq xmm3, [r3+r5] ; inter
+ movq xmm1, [r1+r5] ; prop
+ pand xmm3, [pw_0x3fff]
punpcklwd xmm1, xmm5
punpcklwd xmm3, xmm5
paddd xmm0, xmm1 ; prop + (intra*invq>>8)
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 120a38a..afe14bd 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -416,10 +416,6 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
if( b_bidir && ( M32( m[0].mv ) || M32( m[1].mv ) ) )
TRY_BIDIR( m[0].mv, m[1].mv, 5 );
- /* Store to width-2 bitfield. */
- frames[b]->lowres_inter_types[b-p0][p1-b][i_mb_xy>>2] &= ~(3<<((i_mb_xy&3)*2));
- frames[b]->lowres_inter_types[b-p0][p1-b][i_mb_xy>>2] |= list_used<<((i_mb_xy&3)*2);
-
lowres_intra_mb:
if( !fenc->b_intra_calculated )
{
@@ -481,7 +477,10 @@ lowres_intra_mb:
int i_icost = fenc->i_intra_cost[i_mb_xy];
int b_intra = i_icost < i_bcost;
if( b_intra )
+ {
i_bcost = i_icost;
+ list_used = 0;
+ }
if( b_frame_score_mb )
fenc->i_intra_mbs[b-p0] += b_intra;
}
@@ -501,7 +500,8 @@ lowres_intra_mb:
}
}
- fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = i_bcost;
+ assert(i_bcost < (1<<14));
+ fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = i_bcost + (list_used << LOWRES_COST_SHIFT);
}
#undef TRY_BIDIR
@@ -615,7 +615,7 @@ static int x264_slicetype_frame_cost_recalculate( x264_t *h, x264_frame_t **fram
for( h->mb.i_mb_x = h->sps->i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
{
int i_mb_xy = h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride;
- int i_mb_cost = frames[b]->lowres_costs[b-p0][p1-b][i_mb_xy];
+ int i_mb_cost = frames[b]->lowres_costs[b-p0][p1-b][i_mb_xy] & LOWRES_COST_MASK;
float qp_adj = qp_offset[i_mb_xy];
i_mb_cost = (i_mb_cost * x264_exp2fix8(qp_adj) + 128) >> 8;
row_satd[ h->mb.i_mb_y ] += i_mb_cost;
@@ -681,7 +681,7 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in
if( propagate_amount > 0 )
{
/* Read lists_used from the bits above LOWRES_COST_SHIFT in the packed cost. */
- int lists_used = (frames[b]->lowres_inter_types[b-p0][p1-b][mb_index>>2] >> ((mb_index&3)*2))&3;
+ int lists_used = frames[b]->lowres_costs[b-p0][p1-b][mb_index] >> LOWRES_COST_SHIFT;
/* Follow the MVs to the previous frame(s). */
for( int list = 0; list < 2; list++ )
if( (lists_used >> list)&1 )
@@ -1490,7 +1490,7 @@ int x264_rc_analyse_slice( x264_t *h )
for( int x = h->fdec->i_pir_start_col; x <= h->fdec->i_pir_end_col; x++, mb_xy++ )
{
int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor + 128) >> 8;
- int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy];
+ int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy] & LOWRES_COST_MASK;
int diff = intra_cost - inter_cost;
if( h->param.rc.i_aq_mode )
h->fdec->i_row_satd[y] += (diff * frames[b]->i_inv_qscale_factor[mb_xy] + 128) >> 8;
More information about the x264-devel
mailing list