[x264-devel] commit: Optimizations and cosmetics in macroblock.c (Jason Garrett-Glaser)

Wed Jul 2 07:41:49 CEST 2008

x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Tue Jul  1 23:42:39 2008 -0600| [1c8f807054a0308482d53d4c58ba1d5f5d2ae263]

Optimizations and cosmetics in macroblock.c
If a quantized i4x4 dct block has no nonzero coefficients, don't bother with dequant/zigzag/idct; just zero the stored coefficients.  Not useful for larger sizes because the odds of an empty block are much lower.
Cosmetics in i16x16 to be more consistent with other similar functions.
Add an SSD threshold for chroma in probe_skip to improve speed and minimize time spent on chroma skip analysis.
Rename lambda arrays to lambda_tab for consistency.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=1c8f807054a0308482d53d4c58ba1d5f5d2ae263
---

 encoder/analyse.c    |   10 ++++----
 encoder/macroblock.c |   56 ++++++++++++++++++++++++++++++-------------------
 encoder/macroblock.h |    3 ++
 encoder/slicetype.c  |    2 +-
 4 files changed, 43 insertions(+), 28 deletions(-)

diff --git a/encoder/analyse.c b/encoder/analyse.c
index 9200ace..d9ff0bc 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -128,7 +128,7 @@ typedef struct
 } x264_mb_analysis_t;
 
 /* lambda = pow(2,qp/6-2) */
-static const int i_qp0_cost_table[52] = {
+const int x264_lambda_tab[52] = {
    1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
    1, 1, 1, 1,              /*  8-11 */
    1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
@@ -139,7 +139,7 @@ static const int i_qp0_cost_table[52] = {
 };
 
 /* lambda2 = pow(lambda,2) * .9 * 256 */
-static const int i_qp0_cost2_table[52] = {
+const int x264_lambda2_tab[52] = {
     14,      18,      22,      28,     36,     45,     57,     72, /*  0 -  7 */
     91,     115,     145,     182,    230,    290,    365,    460, /*  8 - 15 */
    580,     731,     921,    1161,   1462,   1843,   2322,   2925, /* 16 - 23 */
@@ -205,8 +205,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
     /* conduct the analysis using this lamda and QP */
     a->i_qp = h->mb.i_qp = i_qp;
     h->mb.i_chroma_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
-    a->i_lambda = i_qp0_cost_table[i_qp];
-    a->i_lambda2 = i_qp0_cost2_table[i_qp];
+    a->i_lambda = x264_lambda_tab[i_qp];
+    a->i_lambda2 = x264_lambda2_tab[i_qp];
     a->b_mbrd = h->param.analyse.i_subpel_refine >= 6 &&
                 ( h->sh.i_type != SLICE_TYPE_B || h->param.analyse.b_bframe_rdo );
 
@@ -924,7 +924,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
 
         if( i_max > 0 )
         {
-            int i_chroma_lambda = i_qp0_cost2_table[h->mb.i_chroma_qp];
+            int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
             /* the previous thing encoded was x264_intra_rd(), so the pixels and
              * coefs for the current chroma mode are still around, so we only
              * have to recount the bits. */
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 7d03e41..66d034c 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -100,11 +100,16 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
     else
         h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
 
-    h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4 );
-    h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
+    if( array_non_zero( dct4x4 ) )
+    {
+        h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4 );
+        h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
 
-    /* output samples to fdec */
-    h->dctf.add4x4_idct( p_dst, dct4x4 );
+        /* output samples to fdec */
+        h->dctf.add4x4_idct( p_dst, dct4x4 );
+    }
+    else
+        memset( h->dct.luma4x4[idx], 0, sizeof(h->dct.luma4x4[idx]));
 }
 
 void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale )
@@ -132,7 +137,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
     uint8_t  *p_src = h->mb.pic.p_fenc[0];
     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 
-    DECLARE_ALIGNED_16( int16_t dct4x4[16+1][4][4] );
+    DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
+    DECLARE_ALIGNED_16( int16_t dct_dc4x4[4][4] );
 
     int i;
 
@@ -143,46 +149,46 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
             int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
             int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
             h->zigzagf.sub_4x4( h->dct.luma4x4[i], p_src+oe, p_dst+od );
-            dct4x4[0][block_idx_x[i]][block_idx_y[i]] = h->dct.luma4x4[i][0];
+            dct_dc4x4[block_idx_x[i]][block_idx_y[i]] = h->dct.luma4x4[i][0];
             h->dct.luma4x4[i][0] = 0;
         }
-        h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct4x4[0] );
+        h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
         return;
     }
 
-    h->dctf.sub16x16_dct( &dct4x4[1], p_src, p_dst );
+    h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );
     for( i = 0; i < 16; i++ )
     {
         /* copy dc coeff */
-        dct4x4[0][block_idx_y[i]][block_idx_x[i]] = dct4x4[1+i][0][0];
-        dct4x4[1+i][0][0] = 0;
+        dct_dc4x4[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
+        dct4x4[i][0][0] = 0;
 
         /* quant/scan/dequant */
         if( h->mb.b_trellis )
-            x264_quant_4x4_trellis( h, dct4x4[1+i], CQM_4IY, i_qscale, DCT_LUMA_AC, 1 );
+            x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IY, i_qscale, DCT_LUMA_AC, 1 );
         else
-            h->quantf.quant_4x4( dct4x4[1+i], h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
+            h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
 
-        h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[1+i] );
-        h->quantf.dequant_4x4( dct4x4[1+i], h->dequant4_mf[CQM_4IY], i_qscale );
+        h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] );
+        h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qscale );
     }
 
-    h->dctf.dct4x4dc( dct4x4[0] );
-    h->quantf.quant_4x4_dc( dct4x4[0], h->quant4_mf[CQM_4IY][i_qscale][0]>>1, h->quant4_bias[CQM_4IY][i_qscale][0]<<1 );
-    h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct4x4[0] );
+    h->dctf.dct4x4dc( dct_dc4x4 );
+    h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qscale][0]>>1, h->quant4_bias[CQM_4IY][i_qscale][0]<<1 );
+    h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
 
     /* output samples to fdec */
-    h->dctf.idct4x4dc( dct4x4[0] );
-    x264_mb_dequant_4x4_dc( dct4x4[0], h->dequant4_mf[CQM_4IY], i_qscale );  /* XXX not inversed */
+    h->dctf.idct4x4dc( dct_dc4x4 );
+    x264_mb_dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qscale );  /* XXX not inversed */
 
     /* calculate dct coeffs */
     for( i = 0; i < 16; i++ )
     {
         /* copy dc coeff */
-        dct4x4[1+i][0][0] = dct4x4[0][block_idx_y[i]][block_idx_x[i]];
+        dct4x4[i][0][0] = dct_dc4x4[block_idx_y[i]][block_idx_x[i]];
     }
     /* put pixels to fdec */
-    h->dctf.add16x16_idct( p_dst, &dct4x4[1] );
+    h->dctf.add16x16_idct( p_dst, dct4x4 );
 }
 
 void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
@@ -617,7 +623,7 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
 
     int i_qp = h->mb.i_qp;
     int mvp[2];
-    int ch;
+    int ch, thresh;
 
     int i8x8, i4x4;
     int i_decimate_mb;
@@ -656,6 +662,7 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
 
     /* encode chroma */
     i_qp = h->mb.i_chroma_qp;
+    thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
 
     for( ch = 0; ch < 2; ch++ )
     {
@@ -669,6 +676,11 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
                              mvp[0], mvp[1], 8, 8 );
         }
 
+        /* there is almost never a termination during chroma, but we can't avoid the check entirely */
+        /* so instead we check SSD and skip the actual check if the score is low enough. */
+        if( h->pixf.ssd[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) < thresh )
+            continue;
+
         h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
 
         /* calculate dct DC */
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index 5ac5834..ba7be69 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -26,6 +26,9 @@
 
 #include "common/macroblock.h"
 
+extern const int x264_lambda2_tab[52];
+extern const int x264_lambda_tab[52];
+
 void x264_rdo_init( );
 
 int x264_macroblock_probe_skip( x264_t *h, int b_bidir );
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index d72e40a..27257c6 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -32,7 +32,7 @@
 static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
 {
     a->i_qp = 12; // arbitrary, but low because SATD scores are 1/4 normal
-    a->i_lambda = i_qp0_cost_table[ a->i_qp ];
+    a->i_lambda = x264_lambda_tab[ a->i_qp ];
     x264_mb_analyse_load_costs( h, a );
     h->mb.i_me_method = X264_MIN( X264_ME_HEX, h->param.analyse.i_me_method ); // maybe dia?
     h->mb.i_subpel_refine = 4; // 3 should be enough, but not tweaking for speed now