[x264-devel] commit: rearrange cabac struct to reduce code size (Jason Garrett-Glaser)

Tue Mar 25 03:13:50 CET 2008

x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Mon Mar 24 19:12:07 2008 -0600| [6ae335530efb189b00fd6f3b1b7da5eefd856473]

rearrange cabac struct to reduce code size

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=6ae335530efb189b00fd6f3b1b7da5eefd856473
---

 common/cabac.h         |   10 ++++----
 common/x86/cabac-a.asm |   59 ++++++++++++++++++++++++++----------------------
 encoder/rdo.c          |   14 ++++++-----
 3 files changed, 45 insertions(+), 38 deletions(-)

diff --git a/common/cabac.h b/common/cabac.h
index 709c516..1c762b8 100644
--- a/common/cabac.h
+++ b/common/cabac.h
@@ -26,11 +26,6 @@
 
 typedef struct
 {
-    /* context */
-    DECLARE_ALIGNED_16( uint8_t state[460] );
-
-    int f8_bits_encoded; // only if using x264_cabac_size_decision()
-
     /* state */
     int i_low;
     int i_range;
@@ -43,6 +38,11 @@ typedef struct
     uint8_t *p;
     uint8_t *p_end;
 
+    /* aligned for aligned_memcpy starting here */
+    DECLARE_ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
+    
+    /* context */
+    uint8_t state[460];
 } x264_cabac_t;
 
 extern const uint8_t x264_cabac_transition[128][2];
diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
index 9a6fbcd..9c21096 100644
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -40,20 +40,25 @@ cextern x264_cabac_renorm_shift
 ; t3 must be ecx, since it's used for shift.
 %ifdef ARCH_X86_64
     DEF_TMP 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,10
-    %define pointer 8
+    %define pointer resq
 %else
     DEF_TMP 0,1,2,3,4,5,6,7, 0,3,2,1,4,5,6,3
-    %define pointer 4
+    %define pointer resd
 %endif
 
-%define cb.state r0+0
-%define cb.low   r0+464
-%define cb.range r0+468
-%define cb.queue r0+472
-%define cb.bytes_outstanding r0+476
-%define cb.p     r0+480+pointer
-%define cb.end   r0+480+pointer*2
-
+struc cb
+    .low: resd 1
+    .range: resd 1
+    .queue: resd 1
+    .bytes_outstanding: resd 1
+    .start: pointer 1
+    .p: pointer 1
+    .end: pointer 1
+    align 16
+    .bits_encoded: resd 1
+    .state: resb 460
+endstruc
+    
 %macro LOAD_GLOBAL 4
 %ifdef PIC64
     ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
@@ -78,8 +83,8 @@ cglobal x264_cabac_encode_decision, 0,7
     movifnidn t0d, r0m
     movifnidn t1d, r1m
     picgetgot t2
-    mov   t5d, [cb.range]
-    movzx t3d, byte [cb.state+t1]
+    mov   t5d, [r0+cb.range]
+    movzx t3d, byte [r0+cb.state+t1]
     mov   t4d, t5d
     shr   t5d, 6
     and   t5d, 3
@@ -93,7 +98,7 @@ cglobal x264_cabac_encode_decision, 0,7
     movifnidn t2d, r2m
     cmp   t6d, t2d
 %endif
-    mov   t6d, [cb.low]
+    mov   t6d, [r0+cb.low]
     lea   t7,  [t6+t4]
     cmovne t4d, t5d
     cmovne t6d, t7d
@@ -103,18 +108,18 @@ cglobal x264_cabac_encode_decision, 0,7
 %else
     LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
 %endif
-    if32 mov t1d, r1m
-    mov   [cb.state+t1], t3b
+    movifnidn t1d, r1m
+    mov   [r0+cb.state+t1], t3b
 .renorm:
     mov   t3d, t4d
     shr   t3d, 3
     LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
     shl   t4d, t3b
     shl   t6d, t3b
-    add   t3d, [cb.queue]
-    mov   [cb.range], t4d
-    mov   [cb.low], t6d
-    mov   [cb.queue], t3d
+    add   t3d, [r0+cb.queue]
+    mov   [r0+cb.range], t4d
+    mov   [r0+cb.low], t6d
+    mov   [r0+cb.queue], t3d
     cmp   t3d, 8
     jge .putbyte
 .ret:
@@ -130,15 +135,15 @@ cglobal x264_cabac_encode_decision, 0,7
     sub   t3d, 10
     and   t6d, t1d
     cmp   t2b, 0xff ; FIXME is a 32bit op faster?
-    mov   [cb.queue], t3d
-    mov   [cb.low], t6d
+    mov   [r0+cb.queue], t3d
+    mov   [r0+cb.low], t6d
     mov   t1d, t2d
-    mov   t4,  [cb.p]
+    mov   t4,  [r0+cb.p]
     je .postpone
-    mov   t5d, [cb.bytes_outstanding]
+    mov   t5d, [r0+cb.bytes_outstanding]
     shr   t1d, 8 ; carry
     lea   t6, [t4+t5+1]
-    cmp   t6, [cb.end]
+    cmp   t6, [r0+cb.end]
     jge .ret
     add   [t4-1], t1b
     test  t5d, t5d
@@ -152,10 +157,10 @@ cglobal x264_cabac_encode_decision, 0,7
 .no_outstanding:
     mov   [t4], t2b
     inc   t4
-    mov   [cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
-    mov   [cb.p], t4
+    mov   [r0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
+    mov   [r0+cb.p], t4
     RET
 .postpone:
-    inc   dword [cb.bytes_outstanding]
+    inc   dword [r0+cb.bytes_outstanding]
     RET
 
diff --git a/encoder/rdo.c b/encoder/rdo.c
index e122731..7967a92 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -49,7 +49,9 @@ static int cabac_prefix_size[15][128];
 #define x264_macroblock_write_cabac  x264_macroblock_size_cabac
 #include "cabac.c"
 
-
+#define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
+        sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
+    
 static int ssd_mb( x264_t *h )
 {
     return h->pixf.ssd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
@@ -83,7 +85,7 @@ static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
     else if( h->param.b_cabac )
     {
         x264_cabac_t cabac_tmp;
-        h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
+        COPY_CABAC;
         x264_macroblock_size_cabac( h, &cabac_tmp );
         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
     }
@@ -125,7 +127,7 @@ int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel )
     if( h->param.b_cabac )
     {
         x264_cabac_t cabac_tmp;
-        h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
+        COPY_CABAC;
         x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel );
         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
     }
@@ -147,7 +149,7 @@ int x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
     if( h->param.b_cabac )
     {
         x264_cabac_t cabac_tmp;
-        h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
+        COPY_CABAC;
         x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
     }
@@ -169,7 +171,7 @@ int x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
     if( h->param.b_cabac )
     {
         x264_cabac_t cabac_tmp;
-        h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
+        COPY_CABAC;
         x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
     }
@@ -195,7 +197,7 @@ int x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
     if( h->param.b_cabac )
     {
         x264_cabac_t cabac_tmp;
-        h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
+        COPY_CABAC;
         x264_i8x8_chroma_size_cabac( h, &cabac_tmp );
         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
     }