[x264-devel] Make ref and i4x4_mode costs global instead of static

Mon Dec 25 20:39:53 CET 2017

x264 | branch: master | Anton Mitrofanov <BugMaster at narod.ru> | Fri Sep 22 17:18:55 2017 +0300| [bdf27e783a8eb4a5bcae0cd0a950d6dc3d995bfe] | committer: Anton Mitrofanov

Make ref and i4x4_mode costs global instead of static

Addresses some thread-safety concerns and makes the code cleaner.
Downside: slightly higher memory usage when running multiple encoder instances in the same application, since each encoder now allocates its own cost table.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=bdf27e783a8eb4a5bcae0cd0a950d6dc3d995bfe
---

 common/common.h   |  7 ++++++-
 encoder/analyse.c | 16 +++++-----------
 encoder/encoder.c |  2 ++
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/common/common.h b/common/common.h
index 162cfb4d..fe2b1c7f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -343,9 +343,14 @@ struct x264_t
     udctcoef        (*quant8_bias0[4])[64];  /* [4][QP_MAX_SPEC+1][64] */
     udctcoef        (*nr_offset_emergency)[4][64];
 
-    /* mv/ref cost arrays. */
+    /* mv/ref/mode cost arrays. */
     uint16_t *cost_mv[QP_MAX+1];
     uint16_t *cost_mv_fpel[QP_MAX+1][4];
+    struct
+    {
+        uint16_t ref[QP_MAX+1][3][33];
+        ALIGNED_64( uint16_t i4x4_mode[QP_MAX+1][32] );
+    } *cost_table;
 
     const uint8_t   *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */
 
diff --git a/encoder/analyse.c b/encoder/analyse.c
index b8b29d9b..a289b242 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -140,10 +140,6 @@ static const uint8_t i_sub_mb_p_cost_table[4] =
 
 static void analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
 
-static uint16_t x264_cost_ref[QP_MAX+1][3][33];
-static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
-static uint16_t x264_cost_i4x4_mode[(QP_MAX+2)*32];
-
 static int init_costs( x264_t *h, float *logs, int qp )
 {
     if( h->cost_mv[qp] )
@@ -159,11 +155,9 @@ static int init_costs( x264_t *h, float *logs, int qp )
         h->cost_mv[qp][-i] =
         h->cost_mv[qp][i]  = X264_MIN( (int)(lambda * logs[i] + .5f), UINT16_MAX );
     }
-    x264_pthread_mutex_lock( &cost_ref_mutex );
     for( int i = 0; i < 3; i++ )
         for( int j = 0; j < 33; j++ )
-            x264_cost_ref[qp][i][j] = i ? X264_MIN( lambda * bs_size_te( i, j ), UINT16_MAX ) : 0;
-    x264_pthread_mutex_unlock( &cost_ref_mutex );
+            h->cost_table->ref[qp][i][j] = i ? X264_MIN( lambda * bs_size_te( i, j ), UINT16_MAX ) : 0;
     if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] )
     {
         for( int j = 0; j < 4; j++ )
@@ -174,7 +168,7 @@ static int init_costs( x264_t *h, float *logs, int qp )
                 h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j];
         }
     }
-    uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + qp*32;
+    uint16_t *cost_i4x4_mode = h->cost_table->i4x4_mode[qp];
     for( int i = 0; i < 17; i++ )
         cost_i4x4_mode[i] = 3*lambda*(i!=8);
     return 0;
@@ -252,8 +246,8 @@ void x264_analyse_weight_frame( x264_t *h, int end )
 static void mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
 {
     a->p_cost_mv = h->cost_mv[a->i_qp];
-    a->p_cost_ref[0] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
-    a->p_cost_ref[1] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
+    a->p_cost_ref[0] = h->cost_table->ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
+    a->p_cost_ref[1] = h->cost_table->ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
 }
 
 static void mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int qp )
@@ -749,7 +743,7 @@ static void mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter
             return;
     }
 
-    uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + a->i_qp*32 + 8;
+    uint16_t *cost_i4x4_mode = h->cost_table->i4x4_mode[a->i_qp] + 8;
     /* 8x8 prediction selection */
     if( flags & X264_ANALYSE_I8x8 )
     {
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 088f5411..ff18054d 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1527,6 +1527,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
     h->frames.i_largest_pts = h->frames.i_second_largest_pts = -1;
     h->frames.i_poc_last_open_gop = -1;
 
+    CHECKED_MALLOCZERO( h->cost_table, sizeof(*h->cost_table) );
     CHECKED_MALLOCZERO( h->frames.unused[0], (h->frames.i_delay + 3) * sizeof(x264_frame_t *) );
     /* Allocate room for max refs plus a few extra just in case. */
     CHECKED_MALLOCZERO( h->frames.unused[1], (h->i_thread_frames + X264_REF_MAX + 4) * sizeof(x264_frame_t *) );
@@ -4364,6 +4365,7 @@ void    x264_encoder_close  ( x264_t *h )
     x264_free( h->nal_buffer );
     x264_free( h->reconfig_h );
     x264_analyse_free_costs( h );
+    x264_free( h->cost_table );
 
     if( h->i_thread_frames > 1 )
         h = h->thread[h->i_thread_phase];