[x264-devel] commit: Extend trellis to support luma/chroma DC and chroma AC ( Jason Garrett-Glaser )

Fri Oct 31 16:57:50 CET 2008

x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Thu Oct 16 03:17:53 2008 -0700| [79194caffdc216e338674d88e50adca2f4ea8fa2] | committer: Jason Garrett-Glaser 

Extend trellis to support luma/chroma DC and chroma AC
This causes a small speed loss in trellis 1 and a slightly larger one in trellis 2, but gives a significant quality improvement.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=79194caffdc216e338674d88e50adca2f4ea8fa2
---

 encoder/macroblock.c |   25 +++++++++++++++++++------
 encoder/macroblock.h |    2 ++
 encoder/rdo.c        |   40 +++++++++++++++++++++++++++-------------
 3 files changed, 48 insertions(+), 19 deletions(-)

diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index e1e215c..f51d5ff 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -187,7 +187,10 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
     }
 
     h->dctf.dct4x4dc( dct_dc4x4 );
-    h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 );
+    if( h->mb.b_trellis )
+        x264_quant_dc_trellis( h, (int16_t*)dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1);
+    else
+        h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 );
     h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
 
     /* output samples to fdec */
@@ -239,8 +242,10 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
             dct2x2[i>>1][i&1] = dct4x4[i][0][0];
             dct4x4[i][0][0] = 0;
 
-            /* no trellis; it doesn't seem to help chroma noticeably */
-            h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
+            if( h->mb.b_trellis )
+                x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 0 );
+            else
+                h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
             h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] );
 
             if( b_decimate )
@@ -248,7 +253,10 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
         }
 
         h->dctf.dct2x2dc( dct2x2 );
-        h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );
+        if( h->mb.b_trellis )
+            x264_quant_dc_trellis( h, (int16_t*)dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter );
+        else
+            h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );
         zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
 
         /* output samples to fdec */
@@ -937,9 +945,14 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
             p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
 
             h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
-            h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
+            dct4x4[0][0] = 0;
+
+            if( h->mb.b_trellis )
+                x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 0 );
+            else
+                h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
+
             h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*4], dct4x4 );
-            h->dct.luma4x4[16+i8+ch*4][0] = 0;
             if( array_non_zero( dct4x4 ) )
             {
                 h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index 9fda183..b25509a 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -54,6 +54,8 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp );
 
 void x264_cabac_mb_skip( x264_t *h, int b_skip );
 
+void x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
+                             int i_qp, int i_ctxBlockCat, int b_intra );
 void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
                              int i_qp, int i_ctxBlockCat, int b_intra, int idx );
 void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
diff --git a/encoder/rdo.c b/encoder/rdo.c
index c56b6e0..dbdcbe8 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -333,7 +333,6 @@ typedef struct {
 } trellis_node_t;
 
 // TODO:
-// support chroma and i16x16 DC
 // save cabac state between blocks?
 // use trellis' RD score instead of x264_mb_decimate_score?
 // code 8x8 sig/last flags forwards with deadzone and save the contexts at
@@ -353,10 +352,10 @@ typedef struct {
 // comparable to the input. so unquant is the direct inverse of quant,
 // and uses the dct scaling factors, not the idct ones.
 
-static inline void quant_trellis_cabac( x264_t *h, int16_t *dct,
+static ALWAYS_INLINE void quant_trellis_cabac( x264_t *h, int16_t *dct,
                                  const uint16_t *quant_mf, const int *unquant_mf,
                                  const int *coef_weight, const uint8_t *zigzag,
-                                 int i_ctxBlockCat, int i_lambda2, int b_ac, int i_coefs, int idx )
+                                 int i_ctxBlockCat, int i_lambda2, int b_ac, int dc, int i_coefs, int idx )
 {
     int abs_coefs[64], signs[64];
     trellis_node_t nodes[2][8];
@@ -381,7 +380,7 @@ static inline void quant_trellis_cabac( x264_t *h, int16_t *dct,
 
     /* init coefs */
     for( i = i_coefs-1; i >= b_ac; i-- )
-        if( (unsigned)(dct[zigzag[i]] * quant_mf[zigzag[i]] + f-1) >= 2*f )
+        if( (unsigned)(dct[zigzag[i]] * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) + f-1) >= 2*f )
             break;
 
     if( i < b_ac )
@@ -425,17 +424,22 @@ static inline void quant_trellis_cabac( x264_t *h, int16_t *dct,
             cabac_state_last[i] = ctx_last[ last_coeff_flag_offset_8x8[i] ];
         }
     }
-    else
+    else if( !dc || i_ctxBlockCat != DCT_CHROMA_DC )
     {
         memcpy( cabac_state_sig,  &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 15 );
         memcpy( cabac_state_last, &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 15 );
     }
+    else
+    {
+        memcpy( cabac_state_sig,  &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 3 );
+        memcpy( cabac_state_last, &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 3 );
+    }
     memcpy( nodes_cur[0].cabac_state, &h->cabac.state[ coeff_abs_level_m1_offset[i_ctxBlockCat] ], 10 );
 
     for( i = i_last_nnz; i >= b_ac; i-- )
     {
         int i_coef = abs_coefs[i];
-        int q = ( f + i_coef * quant_mf[zigzag[i]] ) >> 16;
+        int q = ( f + i_coef * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) ) >> 16;
         int abs_level;
         int cost_sig[2], cost_last[2];
         trellis_node_t n;
@@ -488,11 +492,11 @@ static inline void quant_trellis_cabac( x264_t *h, int16_t *dct,
         // that are better left coded, especially at QP > 40.
         for( abs_level = q; abs_level >= q-1; abs_level-- )
         {
-            int unquant_abs_level = ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
+            int unquant_abs_level = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[i]]) * abs_level + 128) >> 8);
             int d = i_coef - unquant_abs_level;
             int64_t ssd;
             /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
-            if( h->mb.i_psy_trellis && i )
+            if( h->mb.i_psy_trellis && i && !dc && i_ctxBlockCat != DCT_CHROMA_AC )
             {
                 int orig_coef = (i_coefs == 64) ? h->mb.pic.fenc_dct8[idx][i] : h->mb.pic.fenc_dct4[idx][i];
                 int predicted_coef = orig_coef - i_coef * signs[i];
@@ -501,7 +505,8 @@ static inline void quant_trellis_cabac( x264_t *h, int16_t *dct,
                 ssd = (int64_t)d*d * coef_weight[i] - psy_weight * psy_value;
             }
             else
-                ssd = (int64_t)d*d * coef_weight[i];
+            /* FIXME: for i16x16 dc is this weight optimal? */
+                ssd = (int64_t)d*d * (dc?256:coef_weight[i]);
 
             for( j = 0; j < 8; j++ )
             {
@@ -563,19 +568,28 @@ static inline void quant_trellis_cabac( x264_t *h, int16_t *dct,
     }
 }
 
+const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3};
+
+void x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
+                            int i_qp, int i_ctxBlockCat, int b_intra )
+{
+    quant_trellis_cabac( h, (int16_t*)dct,
+        h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
+        NULL, i_ctxBlockCat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[h->mb.b_interlaced],
+        i_ctxBlockCat, lambda2_tab[b_intra][i_qp], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0 );
+}
 
 void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
                              int i_qp, int i_ctxBlockCat, int b_intra, int idx )
 {
-    int b_ac = (i_ctxBlockCat == DCT_LUMA_AC);
+    int b_ac = (i_ctxBlockCat == DCT_LUMA_AC || i_ctxBlockCat == DCT_CHROMA_AC);
     quant_trellis_cabac( h, (int16_t*)dct,
         h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
         x264_dct4_weight2_zigzag[h->mb.b_interlaced],
         x264_zigzag_scan4[h->mb.b_interlaced],
-        i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 16, idx );
+        i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 0, 16, idx );
 }
 
-
 void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
                              int i_qp, int b_intra, int idx )
 {
@@ -583,6 +597,6 @@ void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
         h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
         x264_dct8_weight2_zigzag[h->mb.b_interlaced],
         x264_zigzag_scan8[h->mb.b_interlaced],
-        DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 64, idx );
+        DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 0, 64, idx );
 }