[x264-devel] [PATCH] Added support for CABAC zero bytes insertion

Fri Apr 12 13:23:47 CEST 2019

Dear x264 developers,

This is a follow-up on the patch I submitted quite a while ago (quoted below), which adds CABAC zero-word insertion; this insertion is a requirement for bitstream conformance.

For reference, here is an excerpt from clause 7.4.2.10 of the AVC/H.264 specification. It describes why zero words must be inserted when the ratio of CABAC bins to coded bytes exceeds the allowed limit.

"cabac_zero_word is a byte-aligned sequence of two bytes equal to 0x0000.

Let NumBytesInVclNALunits be the sum of the values of NumBytesInNALunit for all VCL NAL units of a coded picture.

Let BinCountsInNALunits be the number of times that the parsing process function DecodeBin( ), specified in
clause 9.3.3.2, is invoked to decode the contents of all VCL NAL units of a coded picture. When
entropy_coding_mode_flag is equal to 1, it is a requirement of bitstream conformance that BinCountsInNALunits shall
not exceed ( 32 ÷ 3 ) * NumBytesInVclNALunits + ( RawMbBits * PicSizeInMbs ) ÷ 32.

NOTE – The constraint on the maximum number of bins resulting from decoding the contents of the slice layer NAL units can be
met by inserting a number of cabac_zero_word syntax elements to increase the value of NumBytesInVclNALunits. Each
cabac_zero_word is represented in a NAL unit by the three-byte sequence 0x000003 (as a result of the constraints on NAL unit
contents that result in requiring inclusion of an emulation_prevention_three_byte for each cabac_zero_word)."

This patch is useful for strict bitstream conformance in x264.
Note that the overall performance impact was negligible, because the latency of the added "i_bin_cnt" increment in cabac_encode_decision() and cabac_encode_bypass() is well hidden.

Could you please comment on the conformance requirement and on whether this patch is suitable for x264?

Thank you,
Jay


-----Original Message-----
From: Jay N. Shingala
Sent: Friday, October 26, 2018 4:50 PM
To: x264-devel at videolan.org
Subject: [x264-devel] [PATCH] Added support for CABAC zero bytes insertion

From 658dc3f6af9fb2dd4d776b4afb70dae1110fd5d9 Mon Sep 17 00:00:00 2001
From: "Jay N. Shingala" <100153 at ittiam.com>
Date: Tue, 11 Sep 2018 21:03:40 +0530
Subject: [PATCH] Added support for cabac zero bytes insertion


The following changes were made to insert CABAC zero bytes so that the constraints of clause 7.4.2.10 of the specification are met.
    - Added "i_bin_cnt" to "x264_cabac_t" to count the total number of bins
       i_bin_cnt is incremented unconditionally in the CABAC decision, bypass and terminal encoding functions (both C and ASM)

    - Added "i_bin_cnt" to "x264_frame_stat_t" as part of the frame stats
       It is copied from the cabac structure to the frame stats at the end of each slice
       It is recommended not to move i_bin_cnt within this structure: its current position ensures that the bin counter is accumulated across multiple slices at the end of the frame (in threaded_slices_write()).

    - Added macro "INSERT_CABAC_ZERO_WORD" as a compile-time switch for the insertion of cabac zero bytes
       "encoder_insert_cabaczerowords()" is the new function that appends cabac zero bytes at the end of the frame as per clause 7.4.2.10
---
 common/aarch64/asm-offsets.c |  1 +
 common/aarch64/asm-offsets.h |  1 +
 common/aarch64/cabac-a.S     | 10 +++++++
 common/cabac.c               |  6 +++++
 common/cabac.h               |  3 +++
 common/common.h              |  2 ++
 common/x86/cabac-a.asm       | 20 ++++++++------
 encoder/encoder.c            | 62 ++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 97 insertions(+), 8 deletions(-)

diff --git a/common/aarch64/asm-offsets.c b/common/aarch64/asm-offsets.c
index db88e9c4..81bd4f8f 100644
--- a/common/aarch64/asm-offsets.c
+++ b/common/aarch64/asm-offsets.c
@@ -38,5 +38,6 @@ X264_CHECK_OFFSET(x264_cabac_t, i_bytes_outstanding, CABAC_I_BYTES_OUTSTANDING);
 X264_CHECK_OFFSET(x264_cabac_t, p_start,             CABAC_P_START);
 X264_CHECK_OFFSET(x264_cabac_t, p,                   CABAC_P);
 X264_CHECK_OFFSET(x264_cabac_t, p_end,               CABAC_P_END);
+X264_CHECK_OFFSET(x264_cabac_t, i_bin_cnt,           CABAC_BIN_COUNT);
 X264_CHECK_OFFSET(x264_cabac_t, f8_bits_encoded,     CABAC_F8_BITS_ENCODED);
 X264_CHECK_OFFSET(x264_cabac_t, state,               CABAC_STATE);
diff --git a/common/aarch64/asm-offsets.h b/common/aarch64/asm-offsets.h
index 81184de1..c19655ff 100644
--- a/common/aarch64/asm-offsets.h
+++ b/common/aarch64/asm-offsets.h
@@ -33,6 +33,7 @@
 #define CABAC_P_START               0x10
 #define CABAC_P                     0x18
 #define CABAC_P_END                 0x20
+#define CABAC_BIN_COUNT             0x28
 #define CABAC_F8_BITS_ENCODED       0x30
 #define CABAC_STATE                 0x34

diff --git a/common/aarch64/cabac-a.S b/common/aarch64/cabac-a.S
index 9abb14b2..0346bed3 100644
--- a/common/aarch64/cabac-a.S
+++ b/common/aarch64/cabac-a.S
@@ -28,6 +28,7 @@

 // w11 holds x264_cabac_t.i_low
 // w12 holds x264_cabac_t.i_range
+// w7 holds x264_cabac_t.i_bin_cnt

 function cabac_encode_decision_asm, export=1
     movrel      x8,  X264(cabac_range_lps)
@@ -35,6 +36,7 @@ function cabac_encode_decision_asm, export=1
     add         w10, w1, #CABAC_STATE
     ldrb        w3,  [x0,  x10]         // i_state
     ldr         w12, [x0,  #CABAC_I_RANGE]
+    ldr         w7,  [x0,  #CABAC_BIN_COUNT]
     and         x4,  x3,  #~1
     asr         w5,  w12, #6
     add         x8,  x8,  x4, lsl #1
@@ -42,7 +44,9 @@ function cabac_encode_decision_asm, export=1
     eor         w6,  w2,  w3            // b ^ i_state
     ldrb        w4,  [x8,  x5]          // i_range_lps
     ldr         w11, [x0, #CABAC_I_LOW]
+adds        w7,  w7,  #1            // i_bin_cnt += 1
     sub         w12, w12, w4
+str         w7,  [x0, #CABAC_BIN_COUNT]
     tbz         w6,  #0,  1f            // (b ^ i_state) & 1
     add         w11, w11, w12
     mov         w12,  w4
@@ -103,11 +107,14 @@ endfunc

 function cabac_encode_bypass_asm, export=1
     ldr         w12, [x0, #CABAC_I_RANGE]
+    ldr         w7,  [x0, #CABAC_BIN_COUNT]
     ldr         w11, [x0, #CABAC_I_LOW]
     ldr         w2,  [x0, #CABAC_I_QUEUE]
     and         w1,  w1,  w12
     add         w11, w1,  w11, lsl #1
+    adds        w7,  w7,  #1            // i_bin_cnt += 1
     adds        w2,  w2,  #1
+    str         w7,  [x0, #CABAC_BIN_COUNT]
     b.ge        cabac_putbyte
     str         w11, [x0, #CABAC_I_LOW]
     str         w2,  [x0, #CABAC_I_QUEUE]
@@ -116,7 +123,10 @@ endfunc

 function cabac_encode_terminal_asm, export=1
     ldr         w12, [x0, #CABAC_I_RANGE]
+    ldr         w7,  [x0, #CABAC_BIN_COUNT]
     ldr         w11, [x0, #CABAC_I_LOW]
     sub         w12, w12, #2
+    adds        w7,  w7,  #1            // i_bin_cnt += 1
+    str         w7,  [x0, #CABAC_BIN_COUNT]
     b           cabac_encode_renorm
 endfunc
diff --git a/common/cabac.c b/common/cabac.c
index 8c3e72af..f2f2b01d 100644
--- a/common/cabac.c
+++ b/common/cabac.c
@@ -56,6 +56,7 @@ void x264_cabac_encode_init_core( x264_cabac_t *cb )
     cb->i_range = 0x01FE;
     cb->i_queue = -9; // the first bit will be shifted away and not written
     cb->i_bytes_outstanding = 0;
+    cb->i_bin_cnt = 0;
 }

 void x264_cabac_encode_init( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end )
@@ -122,6 +123,7 @@ void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b )
     }
     cb->state[i_ctx] = x264_cabac_transition[i_state][b];
     cabac_encode_renorm( cb );
+    cb->i_bin_cnt++;
 }

 /* Note: b is negated for this function */
@@ -131,6 +133,7 @@ void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b )
     cb->i_low += b & cb->i_range;
     cb->i_queue += 1;
     cabac_putbyte( cb );
+    cb->i_bin_cnt++;
 }

 static const int bypass_lut[16] =
@@ -146,6 +149,8 @@ void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val )
     uint32_t x = (bypass_lut[k-exp_bits]<<exp_bits) + v;
     k = 2*k+1-exp_bits;
     int i = ((k-1)&7)+1;
+
+    cb->i_bin_cnt += k;
     do {
         k -= i;
         cb->i_low <<= i;
@@ -160,6 +165,7 @@ void x264_cabac_encode_terminal_c( x264_cabac_t *cb )
 {
     cb->i_range -= 2;
     cabac_encode_renorm( cb );
+    cb->i_bin_cnt++;
 }

 void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb )
diff --git a/common/cabac.h b/common/cabac.h
index b573416e..a29a8763 100644
--- a/common/cabac.h
+++ b/common/cabac.h
@@ -37,10 +37,13 @@ typedef struct
     int i_queue; //stored with an offset of -8 for faster asm
     int i_bytes_outstanding;

+
     uint8_t *p_start;
     uint8_t *p;
     uint8_t *p_end;

+    int i_bin_cnt; // bin counter needed for cabac zero word padding of access unit
+
     /* aligned for memcpy_aligned starting here */
     ALIGNED_64( int f8_bits_encoded ); // only if using x264_cabac_size_decision()

diff --git a/common/common.h b/common/common.h
index 320206ef..febe11c3 100644
--- a/common/common.h
+++ b/common/common.h
@@ -259,6 +259,8 @@ typedef struct
     int i_mb_field[3];
     /* Adaptive direct mv pred */
     int i_direct_score[2];
+    /* bin counter needed for cabac zero word padding of access unit */
+    int i_bin_cnt;
     /* Metrics */
     int64_t i_ssd[3];
     double f_ssim;
diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
index fcafd9c4..ae2d905c 100644
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -115,6 +115,7 @@ struc cb
     .start: pointer 1
     .p: pointer 1
     .end: pointer 1
+    .i_bin_cnt: resd 1
     align 64, resb 1
     .bits_encoded: resd 1
     .state: resb 1024
@@ -143,12 +144,13 @@ endstruc
     DECLARE_REG_TMP 0,4,2,1,3,5,6,2
 %endif

-cglobal cabac_encode_decision_%1, 1,7
+cglobal cabac_encode_decision_%1, 1,7
     movifnidn t1d, r1m
     mov   t5d, [r0+cb.range]
     movzx t6d, byte [r0+cb.state+t1]
+    inc  dword [r0+cb.i_bin_cnt]
     movifnidn t0,  r0 ; WIN64
-    mov   t4d, ~1
+    mov   t4d, ~1
     mov   t3d, t5d
     and   t4d, t6d
     shr   t5d, 6
@@ -193,13 +195,14 @@ cglobal cabac_encode_decision_%1, 1,7
     mov   [t0+cb.queue], t3d
     RET

-cglobal cabac_encode_bypass_%1, 2,3
+cglobal cabac_encode_bypass_%1, 2,3
     mov       t7d, [r0+cb.low]
     and       r1d, [r0+cb.range]
-    lea       t7d, [t7*2+r1]
-    movifnidn  t0, r0 ; WIN64
-    mov       t3d, [r0+cb.queue]
-    inc       t3d
+    inc      dword [r0+cb.i_bin_cnt]
+    lea       t7d, [t7*2+r1]
+    movifnidn  t0, r0 ; WIN64
+    mov       t3d, [r0+cb.queue]
+    inc       t3d
 %if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp
     jge cabac_putbyte_%1
 %else
@@ -217,7 +220,8 @@ cglobal cabac_encode_bypass_%1, 2,3

 %ifnidn %1,bmi2
 cglobal cabac_encode_terminal_%1, 1,3
-    sub  dword [r0+cb.range], 2
+    inc  dword [r0+cb.i_bin_cnt]
+    sub  dword [r0+cb.range], 2
 ; shortcut: the renormalization shift in terminal
 ; can only be 0 or 1 and is zero over 99% of the time.
     test dword [r0+cb.range], 0x100
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 7316a586..bb3f8d67 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -37,6 +37,7 @@
 #endif

 //#define DEBUG_MB_TYPE
+#define INSERT_CABAC_ZERO_WORD  1

 #define bs_write_ue bs_write_ue_big

@@ -2005,6 +2006,55 @@ static int encoder_encapsulate_nals( x264_t *h, int start )
     return nal_buffer - (h0->nal_buffer + previous_nal_size);
 }

+#if INSERT_CABAC_ZERO_WORD
+/* Append cabac_zero_word stuffing (0x000003 triplets) to the last NAL of the
+ * frame until the bin-count limit of clause 7.4.2.10 is met.
+ * Returns the updated frame size in bytes, or -1 if the buffer check fails. */
+static int encoder_insert_cabaczerowords( x264_t *h, int frame_size )
+{
+    /* Luma contributes 256 samples per macroblock; chroma scales this by format. */
+    int RawMbBits = 256 * h->param.i_bitdepth;
+    int i_chroma_format_idc = h->sps->i_chroma_format_idc;
+
+    if( i_chroma_format_idc == CHROMA_420 )
+        RawMbBits = (RawMbBits * 3) / 2;
+    else if( i_chroma_format_idc == CHROMA_422 )
+        RawMbBits = RawMbBits * 2;
+    else if( i_chroma_format_idc == CHROMA_444 )
+        RawMbBits = RawMbBits * 3;
+
+    /* Clause 7.4.2.10: bins <= (32/3)*bytes + (RawMbBits*PicSizeInMbs)/32, i.e.
+     * bytes >= (96*bins - 3*RawMbBits*PicSizeInMbs) / 1024, rounded up.
+     * Evaluated in 64 bits: 96 * i_bin_cnt overflows int above ~22M bins. */
+    int64_t min_num_bytes = (96 * (int64_t)h->stat.frame.i_bin_cnt
+                             - 3 * (int64_t)RawMbBits * h->mb.i_mb_count + 1023) / 1024;
+    /* NOTE(review): frame_size is compared as-is; confirm it matches NumBytesInVclNALunits. */
+    if( frame_size >= min_num_bytes )
+        return frame_size;
+
+    int stuffing_bytes = (int)(min_num_bytes - frame_size);
+    x264_nal_t *nal = &h->out.nal[h->out.i_nal-1];
+
+    /* Make sure the buffer can hold the stuffing. The +2 guardband covers the
+     * round-up to a whole number of three-byte words. */
+    if( check_encapsulated_buffer( h, h->thread[0], h->out.i_nal, frame_size, (int)(min_num_bytes + 2) ) < 0 )
+        return -1;
+
+    uint8_t *nal_buf = nal->p_payload + nal->i_payload;
+    int i;
+    for( i = 0; i < stuffing_bytes; i += 3 )
+    {
+        *nal_buf++ = 0x00; // cabac_zero_word
+        *nal_buf++ = 0x00;
+        *nal_buf++ = 0x03; // emulation_prevention_three_byte
+    }
+
+    nal->i_payload += i;
+    nal->i_padding += i;
+    return frame_size + i;
+}
+#endif
+
 /****************************************************************************
  * x264_encoder_headers:
  ****************************************************************************/
@@ -2996,6 +3046,8 @@ cont:
     {
         x264_cabac_encode_flush( h, &h->cabac );
         h->out.bs.p = h->cabac.p;
+
+        h->stat.frame.i_bin_cnt = h->cabac.i_bin_cnt;
     }
     else
     {
@@ -3837,6 +3889,16 @@ static int encoder_frame_end( x264_t *h, x264_t *thread_current,
     if( frame_size < 0 )
         return -1;

+#if INSERT_CABAC_ZERO_WORD
+     if(h->param.b_cabac)
+     {
+         /* cabac zero word insertion; Clause 7.4.2.10 */
+         frame_size = encoder_insert_cabaczerowords(h, frame_size);
+         if( frame_size < 0 )
+            return -1;
+     }
+#endif
+
     /* Set output picture properties */
     pic_out->i_type = h->fenc->i_type;

--
2.13.0.windows.1

This is confidential Ittiam property.