[x264-devel] [PATCH] Added support for CABAC zero bytes insertion

Fri Oct 26 13:20:20 CEST 2018

>From 658dc3f6af9fb2dd4d776b4afb70dae1110fd5d9 Mon Sep 17 00:00:00 2001
From: "Jay N. Shingala" <100153 at ittiam.com>
Date: Tue, 11 Sep 2018 21:03:40 +0530
Subject: [PATCH] Added support for cabac zero bytes insertion


The following changes were made to insert CABAC zero bytes so that the stream meets the constraints of clause 7.4.2.10 of the specification.
    - Added "i_bin_cnt" to "x264_cabac_t" to count the total number of bins
       i_bin_cnt is unconditionally incremented in all CABAC decision, bypass and terminal encoding functions (both C and ASM)

    - Added "i_bin_cnt" to "x264_frame_stat_t" as part of the frame stats
       It is copied from the cabac structure to the frame stats structure at the end of each slice.
       It is recommended not to move i_bin_cnt within this structure: its current position ensures that the bin counter is accumulated across multiple slices at the end of the frame (in threaded_slices_write()).

    - Added macro "INSERT_CABAC_ZERO_WORD" to enable insertion of cabac zero bytes
       "encoder_insert_cabaczerowords()" is the new function which appends cabac zero bytes, as per clause 7.4.2.10, at the end of the frame
---
 common/aarch64/asm-offsets.c |  1 +
 common/aarch64/asm-offsets.h |  1 +
 common/aarch64/cabac-a.S     | 10 +++++++
 common/cabac.c               |  6 +++++
 common/cabac.h               |  3 +++
 common/common.h              |  2 ++
 common/x86/cabac-a.asm       | 20 ++++++++++++--------
 encoder/encoder.c            | 62 ++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 97 insertions(+), 8 deletions(-)

diff --git a/common/aarch64/asm-offsets.c b/common/aarch64/asm-offsets.c
index db88e9c4..81bd4f8f 100644
--- a/common/aarch64/asm-offsets.c
+++ b/common/aarch64/asm-offsets.c
@@ -38,5 +38,6 @@ X264_CHECK_OFFSET(x264_cabac_t, i_bytes_outstanding, CABAC_I_BYTES_OUTSTANDING);
 X264_CHECK_OFFSET(x264_cabac_t, p_start,             CABAC_P_START);
 X264_CHECK_OFFSET(x264_cabac_t, p,                   CABAC_P);
 X264_CHECK_OFFSET(x264_cabac_t, p_end,               CABAC_P_END);
+X264_CHECK_OFFSET(x264_cabac_t, i_bin_cnt,           CABAC_BIN_COUNT);
 X264_CHECK_OFFSET(x264_cabac_t, f8_bits_encoded,     CABAC_F8_BITS_ENCODED);
 X264_CHECK_OFFSET(x264_cabac_t, state,               CABAC_STATE);
diff --git a/common/aarch64/asm-offsets.h b/common/aarch64/asm-offsets.h
index 81184de1..c19655ff 100644
--- a/common/aarch64/asm-offsets.h
+++ b/common/aarch64/asm-offsets.h
@@ -33,6 +33,7 @@
 #define CABAC_P_START               0x10
 #define CABAC_P                     0x18
 #define CABAC_P_END                 0x20
+#define CABAC_BIN_COUNT             0x28
 #define CABAC_F8_BITS_ENCODED       0x30
 #define CABAC_STATE                 0x34

diff --git a/common/aarch64/cabac-a.S b/common/aarch64/cabac-a.S
index 9abb14b2..0346bed3 100644
--- a/common/aarch64/cabac-a.S
+++ b/common/aarch64/cabac-a.S
@@ -28,6 +28,7 @@

 // w11 holds x264_cabac_t.i_low
 // w12 holds x264_cabac_t.i_range
+// w7 holds x264_cabac_t.i_bin_cnt

 function cabac_encode_decision_asm, export=1
     movrel      x8,  X264(cabac_range_lps)
@@ -35,6 +36,7 @@ function cabac_encode_decision_asm, export=1
     add         w10, w1, #CABAC_STATE
     ldrb        w3,  [x0,  x10]         // i_state
     ldr         w12, [x0,  #CABAC_I_RANGE]
+    ldr         w7,  [x0,  #CABAC_BIN_COUNT]
     and         x4,  x3,  #~1
     asr         w5,  w12, #6
     add         x8,  x8,  x4, lsl #1
@@ -42,7 +44,9 @@ function cabac_encode_decision_asm, export=1
     eor         w6,  w2,  w3            // b ^ i_state
     ldrb        w4,  [x8,  x5]          // i_range_lps
     ldr         w11, [x0, #CABAC_I_LOW]
+    adds        w7,  w7,  #1            // i_bin_cnt += 1
     sub         w12, w12, w4
+    str         w7,  [x0, #CABAC_BIN_COUNT]
     tbz         w6,  #0,  1f            // (b ^ i_state) & 1
     add         w11, w11, w12
     mov         w12,  w4
@@ -103,11 +107,14 @@ endfunc

 function cabac_encode_bypass_asm, export=1
     ldr         w12, [x0, #CABAC_I_RANGE]
+    ldr         w7,  [x0, #CABAC_BIN_COUNT]
     ldr         w11, [x0, #CABAC_I_LOW]
     ldr         w2,  [x0, #CABAC_I_QUEUE]
     and         w1,  w1,  w12
     add         w11, w1,  w11, lsl #1
+    adds        w7,  w7,  #1            // i_bin_cnt += 1
     adds        w2,  w2,  #1
+    str         w7,  [x0, #CABAC_BIN_COUNT]
     b.ge        cabac_putbyte
     str         w11, [x0, #CABAC_I_LOW]
     str         w2,  [x0, #CABAC_I_QUEUE]
@@ -116,7 +123,10 @@ endfunc

 function cabac_encode_terminal_asm, export=1
     ldr         w12, [x0, #CABAC_I_RANGE]
+    ldr         w7,  [x0, #CABAC_BIN_COUNT]
     ldr         w11, [x0, #CABAC_I_LOW]
     sub         w12, w12, #2
+    adds        w7,  w7,  #1            // i_bin_cnt += 1
+    str         w7,  [x0, #CABAC_BIN_COUNT]
     b           cabac_encode_renorm
 endfunc
diff --git a/common/cabac.c b/common/cabac.c
index 8c3e72af..f2f2b01d 100644
--- a/common/cabac.c
+++ b/common/cabac.c
@@ -56,6 +56,7 @@ void x264_cabac_encode_init_core( x264_cabac_t *cb )
     cb->i_range = 0x01FE;
     cb->i_queue = -9; // the first bit will be shifted away and not written
     cb->i_bytes_outstanding = 0;
+    cb->i_bin_cnt = 0;
 }

 void x264_cabac_encode_init( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end )
@@ -122,6 +123,7 @@ void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b )
     }
     cb->state[i_ctx] = x264_cabac_transition[i_state][b];
     cabac_encode_renorm( cb );
+    cb->i_bin_cnt++;
 }

 /* Note: b is negated for this function */
@@ -131,6 +133,7 @@ void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b )
     cb->i_low += b & cb->i_range;
     cb->i_queue += 1;
     cabac_putbyte( cb );
+    cb->i_bin_cnt++;
 }

 static const int bypass_lut[16] =
@@ -146,6 +149,8 @@ void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val )
     uint32_t x = (bypass_lut[k-exp_bits]<<exp_bits) + v;
     k = 2*k+1-exp_bits;
     int i = ((k-1)&7)+1;
+
+    cb->i_bin_cnt += k;
     do {
         k -= i;
         cb->i_low <<= i;
@@ -160,6 +165,7 @@ void x264_cabac_encode_terminal_c( x264_cabac_t *cb )
 {
     cb->i_range -= 2;
     cabac_encode_renorm( cb );
+    cb->i_bin_cnt++;
 }

 void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb )
diff --git a/common/cabac.h b/common/cabac.h
index b573416e..a29a8763 100644
--- a/common/cabac.h
+++ b/common/cabac.h
@@ -37,10 +37,13 @@ typedef struct
     int i_queue; //stored with an offset of -8 for faster asm
     int i_bytes_outstanding;

+
     uint8_t *p_start;
     uint8_t *p;
     uint8_t *p_end;

+    int i_bin_cnt; // bin counter needed for cabac zero word padding of access unit
+
     /* aligned for memcpy_aligned starting here */
     ALIGNED_64( int f8_bits_encoded ); // only if using x264_cabac_size_decision()

diff --git a/common/common.h b/common/common.h
index 320206ef..febe11c3 100644
--- a/common/common.h
+++ b/common/common.h
@@ -259,6 +259,8 @@ typedef struct
     int i_mb_field[3];
     /* Adaptive direct mv pred */
     int i_direct_score[2];
+    /* bin counter needed for cabac zero word padding of access unit */
+    int i_bin_cnt;
     /* Metrics */
     int64_t i_ssd[3];
     double f_ssim;
diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
index fcafd9c4..ae2d905c 100644
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -115,6 +115,7 @@ struc cb
     .start: pointer 1
     .p: pointer 1
     .end: pointer 1
+    .i_bin_cnt: resd 1
     align 64, resb 1
     .bits_encoded: resd 1
     .state: resb 1024
@@ -143,12 +144,13 @@ endstruc
     DECLARE_REG_TMP 0,4,2,1,3,5,6,2
 %endif

-cglobal cabac_encode_decision_%1, 1,7
+cglobal cabac_encode_decision_%1, 1,7
     movifnidn t1d, r1m
     mov   t5d, [r0+cb.range]
     movzx t6d, byte [r0+cb.state+t1]
+    inc  dword [r0+cb.i_bin_cnt]
     movifnidn t0,  r0 ; WIN64
-    mov   t4d, ~1
+    mov   t4d, ~1
     mov   t3d, t5d
     and   t4d, t6d
     shr   t5d, 6
@@ -193,13 +195,14 @@ cglobal cabac_encode_decision_%1, 1,7
     mov   [t0+cb.queue], t3d
     RET

-cglobal cabac_encode_bypass_%1, 2,3
+cglobal cabac_encode_bypass_%1, 2,3
     mov       t7d, [r0+cb.low]
     and       r1d, [r0+cb.range]
-    lea       t7d, [t7*2+r1]
-    movifnidn  t0, r0 ; WIN64
-    mov       t3d, [r0+cb.queue]
-    inc       t3d
+    inc      dword [r0+cb.i_bin_cnt]
+    lea       t7d, [t7*2+r1]
+    movifnidn  t0, r0 ; WIN64
+    mov       t3d, [r0+cb.queue]
+    inc       t3d
 %if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp
     jge cabac_putbyte_%1
 %else
@@ -217,7 +220,8 @@ cglobal cabac_encode_bypass_%1, 2,3

 %ifnidn %1,bmi2
 cglobal cabac_encode_terminal_%1, 1,3
-    sub  dword [r0+cb.range], 2
+    inc  dword [r0+cb.i_bin_cnt]
+    sub  dword [r0+cb.range], 2
 ; shortcut: the renormalization shift in terminal
 ; can only be 0 or 1 and is zero over 99% of the time.
     test dword [r0+cb.range], 0x100
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 7316a586..bb3f8d67 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -37,6 +37,7 @@
 #endif

 //#define DEBUG_MB_TYPE
+#define INSERT_CABAC_ZERO_WORD  1

 #define bs_write_ue bs_write_ue_big

@@ -2005,6 +2006,55 @@ static int encoder_encapsulate_nals( x264_t *h, int start )
     return nal_buffer - (h0->nal_buffer + previous_nal_size);
 }

+#if INSERT_CABAC_ZERO_WORD
+static int encoder_insert_cabaczerowords( x264_t *h, int frame_size )
+{
+     int RawMbBits = 256 * h->param.i_bitdepth;
+     int min_num_bytes = 0;
+     int i_chroma_format_idc = h->sps->i_chroma_format_idc;
+
+     if(i_chroma_format_idc == CHROMA_420 )
+        RawMbBits = (RawMbBits * 3) / 2;
+     else if( i_chroma_format_idc == CHROMA_422 )
+        RawMbBits = (RawMbBits * 2);
+     else if( i_chroma_format_idc == CHROMA_444 )
+        RawMbBits = (RawMbBits * 3);
+
+    /* Check for cabac zero word stuffing based on output frame size, bin count and raw bits as per section 7.4.2.10
+       Also see section 9.3.4.6 Byte stuffing process */
+    min_num_bytes = ((96 * h->stat.frame.i_bin_cnt) - (RawMbBits * h->mb.i_mb_count * 3) + 1023) / 1024;
+    //printf("bin cnt = %d, min_num_bytes = %d, frame_size = %d, \n",h->stat.frame.i_bin_cnt, min_num_bytes, frame_size);
+    if(frame_size < min_num_bytes)
+    {
+        int stuffing_bytes = min_num_bytes - frame_size;
+        x264_nal_t *nal = &h->out.nal[h->out.i_nal-1];
+
+        /* If the required buffer size exceeds current allocated size, re-allocate the nal buffer.
+           Note that +2 guardband is due to insertion of cabac zero word with EPB (0x000003) in steps of 3 */
+        if( check_encapsulated_buffer( h, h->thread[0], h->out.i_nal, frame_size, (min_num_bytes+2)) < 0 )
+            return -1;
+
+        //printf ("Inserting %d/%d cabac_zero_word syntax elements/bytes (Clause 7.4.2.10)\n", ((stuffing_bytes + 2)/3), stuffing_bytes);
+        {
+            uint8_t *nal_buf = nal->p_payload + nal->i_payload;
+            int i;
+            for (i = 0; i < stuffing_bytes; i+=3 )
+            {
+              *nal_buf++ = 0x00; // CABAC zero word
+              *nal_buf++ = 0x00;
+              *nal_buf++ = 0x03;
+            }
+
+            nal->i_payload += i;
+            nal->i_padding += i;
+            frame_size += i;
+        }
+    }
+
+    return (frame_size);
+}
+#endif
+
 /****************************************************************************
  * x264_encoder_headers:
  ****************************************************************************/
@@ -2996,6 +3046,8 @@ cont:
     {
         x264_cabac_encode_flush( h, &h->cabac );
         h->out.bs.p = h->cabac.p;
+
+        h->stat.frame.i_bin_cnt = h->cabac.i_bin_cnt;
     }
     else
     {
@@ -3837,6 +3889,16 @@ static int encoder_frame_end( x264_t *h, x264_t *thread_current,
     if( frame_size < 0 )
         return -1;

+#if INSERT_CABAC_ZERO_WORD
+     if(h->param.b_cabac)
+     {
+         /* cabac zero word insertion; Clause 7.4.2.10 */
+         frame_size = encoder_insert_cabaczerowords(h, frame_size);
+         if( frame_size < 0 )
+            return -1;
+     }
+#endif
+
     /* Set output picture properties */
     pic_out->i_type = h->fenc->i_type;

--
2.13.0.windows.1

This is confidential Ittiam property.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-cabac-zerobytes.patch
Type: application/octet-stream
Size: 12281 bytes
Desc: 0001-cabac-zerobytes.patch
URL: <http://mailman.videolan.org/pipermail/x264-devel/attachments/20181026/6a8321f6/attachment-0001.obj>