[x264-devel] commit: Reduce lookahead memory usage, cache misses (Jason Garrett-Glaser )

git at videolan.org git at videolan.org
Thu Apr 29 19:58:08 CEST 2010


x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Mon Apr 26 15:10:11 2010 -0700| [41b877e0ab14a77a211e0cd087bf9c3726b2c2f6] | committer: Jason Garrett-Glaser 

Reduce lookahead memory usage, cache misses
Merge lowres_types with lowres_costs.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=41b877e0ab14a77a211e0cd087bf9c3726b2c2f6
---

 common/frame.c       |    6 ------
 common/frame.h       |    9 +++++++--
 common/mc.c          |    2 +-
 common/x86/mc-a2.asm |    4 +++-
 encoder/slicetype.c  |   16 ++++++++--------
 5 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/common/frame.c b/common/frame.c
index 90783cd..fa8eec0 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -148,10 +148,7 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
             CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
             for( int j = 0; j <= h->param.i_bframe+1; j++ )
                 for( int i = 0; i <= h->param.i_bframe+1; i++ )
-                {
                     CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
-                    CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
-                }
             frame->i_intra_cost = frame->lowres_costs[0][0];
             memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
         }
@@ -199,10 +196,7 @@ void x264_frame_delete( x264_frame_t *frame )
         x264_free( frame->i_propagate_cost );
         for( int j = 0; j <= X264_BFRAME_MAX+1; j++ )
             for( int i = 0; i <= X264_BFRAME_MAX+1; i++ )
-            {
                 x264_free( frame->lowres_costs[j][i] );
-                x264_free( frame->lowres_inter_types[j][i] );
-            }
         x264_free( frame->f_qp_offset );
         x264_free( frame->f_qp_offset_aq );
         x264_free( frame->i_inv_qscale_factor );
diff --git a/common/frame.h b/common/frame.h
index 419da08..2d6ea0b 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -84,9 +84,14 @@ typedef struct x264_frame
     uint8_t *mb_partition;
     int16_t (*mv[2])[2];
     int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
+
+    /* Stored as (lists_used << LOWRES_COST_SHIFT) + (cost).
+     * Doesn't need special addressing for intra cost because
+     * lists_used is guaranteed to be zero in that case. */
     uint16_t (*lowres_costs[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
-    /* Actually a width-2 bitfield with 4 values per uint8_t. */
-    uint8_t  (*lowres_inter_types[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
+    #define LOWRES_COST_MASK ((1<<14)-1)
+    #define LOWRES_COST_SHIFT 14
+
     int     *lowres_mv_costs[2][X264_BFRAME_MAX+1];
     int8_t  *ref[2];
     int     i_ref[2];
diff --git a/common/mc.c b/common/mc.c
index 859e5fc..ad7fe79 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -427,7 +427,7 @@ static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *i
     for( int i = 0; i < len; i++ )
     {
         int propagate_amount = propagate_in[i] + ((intra_costs[i] * inv_qscales[i] + 128)>>8);
-        dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - inter_costs[i]), intra_costs[i]);
+        dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK)), intra_costs[i]);
     }
 }
 
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 20ef5d7..33659a3 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -37,6 +37,7 @@ pw_1:  times 8 dw 1
 pw_16: times 8 dw 16
 pw_32: times 8 dw 32
 pd_128: times 4 dd 128
+pw_0x3fff: times 4 dw 0x3fff
 
 SECTION .text
 
@@ -1132,8 +1133,9 @@ cglobal x264_mbtree_propagate_cost_sse2, 6,6
     pmaddwd   xmm0, xmm2
     paddd     xmm0, xmm4
     psrld     xmm0, 8       ; intra*invq>>8
-    movq      xmm1, [r1+r5] ; prop
     movq      xmm3, [r3+r5] ; inter
+    movq      xmm1, [r1+r5] ; prop
+    pand      xmm3, [pw_0x3fff]
     punpcklwd xmm1, xmm5
     punpcklwd xmm3, xmm5
     paddd     xmm0, xmm1    ; prop + (intra*invq>>8)
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 120a38a..afe14bd 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -416,10 +416,6 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
     if( b_bidir && ( M32( m[0].mv ) || M32( m[1].mv ) ) )
         TRY_BIDIR( m[0].mv, m[1].mv, 5 );
 
-    /* Store to width-2 bitfield. */
-    frames[b]->lowres_inter_types[b-p0][p1-b][i_mb_xy>>2] &= ~(3<<((i_mb_xy&3)*2));
-    frames[b]->lowres_inter_types[b-p0][p1-b][i_mb_xy>>2] |= list_used<<((i_mb_xy&3)*2);
-
 lowres_intra_mb:
     if( !fenc->b_intra_calculated )
     {
@@ -481,7 +477,10 @@ lowres_intra_mb:
         int i_icost = fenc->i_intra_cost[i_mb_xy];
         int b_intra = i_icost < i_bcost;
         if( b_intra )
+        {
             i_bcost = i_icost;
+            list_used = 0;
+        }
         if( b_frame_score_mb )
             fenc->i_intra_mbs[b-p0] += b_intra;
     }
@@ -501,7 +500,8 @@ lowres_intra_mb:
         }
     }
 
-    fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = i_bcost;
+    assert(i_bcost < (1<<14));
+    fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = i_bcost + (list_used << LOWRES_COST_SHIFT);
 }
 #undef TRY_BIDIR
 
@@ -615,7 +615,7 @@ static int x264_slicetype_frame_cost_recalculate( x264_t *h, x264_frame_t **fram
         for( h->mb.i_mb_x = h->sps->i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
         {
             int i_mb_xy = h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride;
-            int i_mb_cost = frames[b]->lowres_costs[b-p0][p1-b][i_mb_xy];
+            int i_mb_cost = frames[b]->lowres_costs[b-p0][p1-b][i_mb_xy] & LOWRES_COST_MASK;
             float qp_adj = qp_offset[i_mb_xy];
             i_mb_cost = (i_mb_cost * x264_exp2fix8(qp_adj) + 128) >> 8;
             row_satd[ h->mb.i_mb_y ] += i_mb_cost;
@@ -681,7 +681,7 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in
             if( propagate_amount > 0 )
             {
                 /* Access width-2 bitfield. */
-                int lists_used = (frames[b]->lowres_inter_types[b-p0][p1-b][mb_index>>2] >> ((mb_index&3)*2))&3;
+                int lists_used = frames[b]->lowres_costs[b-p0][p1-b][mb_index] >> LOWRES_COST_SHIFT;
                 /* Follow the MVs to the previous frame(s). */
                 for( int list = 0; list < 2; list++ )
                     if( (lists_used >> list)&1 )
@@ -1490,7 +1490,7 @@ int x264_rc_analyse_slice( x264_t *h )
             for( int x = h->fdec->i_pir_start_col; x <= h->fdec->i_pir_end_col; x++, mb_xy++ )
             {
                 int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor + 128) >> 8;
-                int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy];
+                int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy] & LOWRES_COST_MASK;
                 int diff = intra_cost - inter_cost;
                 if( h->param.rc.i_aq_mode )
                     h->fdec->i_row_satd[y] += (diff * frames[b]->i_inv_qscale_factor[mb_xy] + 128) >> 8;



More information about the x264-devel mailing list