[x264-devel] commit: Faster CABAC state copying for small partitions ( Jason Garrett-Glaser )

Tue Apr 6 21:48:29 CEST 2010

x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Thu Apr  1 15:51:59 2010 -0700| [769adffece015d5082492fb87a4bc3cdd5cbd249] | committer: Jason Garrett-Glaser 

Faster CABAC state copying for small partitions
Save ~25 clocks per i4x4, i8x8, and sub8x8 RD call.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=769adffece015d5082492fb87a4bc3cdd5cbd249
---

 encoder/rdo.c |   31 ++++++++++++++++++++++++++++---
 1 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/encoder/rdo.c b/encoder/rdo.c
index 9537f32..2d0fad5 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -60,6 +60,8 @@ static uint16_t cabac_size_5ones[128];
 
 #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
         sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
+#define COPY_CABAC_PART( pos, size )\
+        memcpy( &cb->state[pos], &h->cabac.state[pos], size )
 
 static ALWAYS_INLINE uint64_t cached_hadamard( x264_t *h, int pixel, int x, int y )
 {
@@ -178,6 +180,29 @@ static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
     return i_ssd + i_bits;
 }
 
+/* For small partitions (i.e. those using at most one DCT category's worth of CABAC states),
+ * it's faster to copy the individual parts than to perform a whole COPY_CABAC. */
+static ALWAYS_INLINE void x264_copy_cabac_part( x264_t *h, x264_cabac_t *cb, int cat, int intra )
+{
+    if( intra )
+        COPY_CABAC_PART( 68, 2 );  //intra pred mode
+    else
+        COPY_CABAC_PART( 40, 16 ); //mvd, rounded up to 16 bytes
+
+    /* 8x8dct writes CBP, while non-8x8dct writes CBF */
+    if( cat != DCT_LUMA_8x8 )
+        COPY_CABAC_PART( 85 + cat * 4, 4 );
+    else
+        COPY_CABAC_PART( 73, 4 );
+
+    /* Really should be 15 bytes, but rounding up a byte saves some
+     * instructions and is faster, and copying extra data doesn't hurt. */
+    COPY_CABAC_PART( significant_coeff_flag_offset[h->mb.b_interlaced][cat], 16 );
+    COPY_CABAC_PART( last_coeff_flag_offset[h->mb.b_interlaced][cat], 16 );
+    COPY_CABAC_PART( coeff_abs_level_m1_offset[cat], 10 );
+    cb->f8_bits_encoded = 0;
+}
+
 /* partition RD functions use 8 bits more precision to avoid large rounding errors at low QPs */
 
 static uint64_t x264_rd_cost_subpart( x264_t *h, int i_lambda2, int i4, int i_pixel )
@@ -195,7 +220,7 @@ static uint64_t x264_rd_cost_subpart( x264_t *h, int i_lambda2, int i4, int i_pi
     if( h->param.b_cabac )
     {
         x264_cabac_t cabac_tmp;
-        COPY_CABAC;
+        x264_copy_cabac_part( h, &cabac_tmp, DCT_LUMA_4x4, 0 );
         x264_subpartition_size_cabac( h, &cabac_tmp, i4, i_pixel );
         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
     }
@@ -258,7 +283,7 @@ static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode
     if( h->param.b_cabac )
     {
         x264_cabac_t cabac_tmp;
-        COPY_CABAC;
+        x264_copy_cabac_part( h, &cabac_tmp, DCT_LUMA_8x8, 1 );
         x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
     }
@@ -278,7 +303,7 @@ static uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode
     if( h->param.b_cabac )
     {
         x264_cabac_t cabac_tmp;
-        COPY_CABAC;
+        x264_copy_cabac_part( h, &cabac_tmp, DCT_LUMA_4x4, 1 );
         x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
     }