[x264-devel] 64-bit cabac optimizations

Wed Feb 27 00:18:03 CET 2013

x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sun Jan 13 18:27:08 2013 +0100| [0ce5b431b94f3934a7229ab264c12f1106e4330d] | committer: Jason Garrett-Glaser

64-bit cabac optimizations

~4% faster PIC

WIN64:
~3% faster and 16 bytes shorter cabac_encode_bypass
~8% faster cabac_encode_terminal
Benchmarked on Ivy Bridge

UNIX64:
One fewer instruction in cabac_encode_bypass

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=0ce5b431b94f3934a7229ab264c12f1106e4330d
---

 common/x86/cabac-a.asm |   64 +++++++++++++++++++++++++-----------------------
 1 file changed, 34 insertions(+), 30 deletions(-)

diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
index 19d2aa2..d95d125 100644
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -35,7 +35,7 @@ cextern cabac_renorm_shift
 
 ; t3 must be ecx, since it's used for shift.
 %if WIN64
-    DECLARE_REG_TMP 3,1,2,0,6,5,4,2
+    DECLARE_REG_TMP 3,1,2,0,5,6,4,4
     %define pointer resq
 %elif ARCH_X86_64
     DECLARE_REG_TMP 0,1,2,3,4,5,6,6
@@ -58,24 +58,24 @@ struc cb
     .state: resb 1024
 endstruc
 
-%macro LOAD_GLOBAL 4
+%macro LOAD_GLOBAL 3-5 0 ; dst, base, off1, off2, tmp
 %ifdef PIC
-    ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
-    lea   r7, [%2]
-    %ifnidn %3, 0
-    add   r7, %3
+    %ifidn %4, 0
+        movzx %1, byte [%2+%3+r7-$$]
+    %else
+        lea   %5, [r7+%4]
+        movzx %1, byte [%2+%3+%5-$$]
     %endif
-    movzx %1, byte [r7+%4]
 %else
     movzx %1, byte [%2+%3+%4]
 %endif
 %endmacro
 
-cglobal cabac_encode_decision_asm, 0,7
-    movifnidn t0,  r0mp
+cglobal cabac_encode_decision_asm, 1,7
     movifnidn t1d, r1m
-    mov   t5d, [t0+cb.range]
-    movzx t6d, byte [t0+cb.state+t1]
+    mov   t5d, [r0+cb.range]
+    movzx t6d, byte [r0+cb.state+t1]
+    movifnidn t0,  r0 ; WIN64
     mov   t4d, ~1
     mov   t3d, t5d
     and   t4d, t6d
@@ -84,8 +84,11 @@ cglobal cabac_encode_decision_asm, 0,7
 %if WIN64
     PUSH r7
 %endif
-    LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2
-    LOAD_GLOBAL t4d, cabac_transition, t2, t6*2
+%ifdef PIC
+    lea    r7, [$$]
+%endif
+    LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2, t4
+    LOAD_GLOBAL t4d, cabac_transition, t2, t6*2, t4
     and   t6d, 1
     sub   t3d, t5d
     cmp   t6d, t2d
@@ -97,7 +100,7 @@ cglobal cabac_encode_decision_asm, 0,7
 ;cabac_encode_renorm
     mov   t4d, t3d
     shr   t3d, 3
-    LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3
+    LOAD_GLOBAL t3d, cabac_renorm_shift, t3
 %if WIN64
     POP r7
 %endif
@@ -111,15 +114,14 @@ cglobal cabac_encode_decision_asm, 0,7
     mov   [t0+cb.queue], t3d
     RET
 
-cglobal cabac_encode_bypass_asm, 0,3
-    movifnidn  t0, r0mp
-    movifnidn t3d, r1m
-    mov       t7d, [t0+cb.low]
-    and       t3d, [t0+cb.range]
-    lea       t7d, [t7*2+t3]
-    mov       t3d, [t0+cb.queue]
+cglobal cabac_encode_bypass_asm, 2,3
+    mov       t7d, [r0+cb.low]
+    and       r1d, [r0+cb.range]
+    lea       t7d, [t7*2+r1]
+    movifnidn  t0, r0 ; WIN64
+    mov       t3d, [r0+cb.queue]
     inc       t3d
-%if UNIX64 ; .putbyte compiles to nothing but a jmp
+%if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp
     jge cabac_putbyte
 %else
     jge .putbyte
@@ -127,28 +129,30 @@ cglobal cabac_encode_bypass_asm, 0,3
     mov   [t0+cb.low], t7d
     mov   [t0+cb.queue], t3d
     RET
+%if ARCH_X86_64 == 0
 .putbyte:
     PROLOGUE 0,7
     movifnidn t6d, t7d
     jmp cabac_putbyte
+%endif
 
-cglobal cabac_encode_terminal_asm, 0,3
-    movifnidn  t0, r0mp
-    sub  dword [t0+cb.range], 2
+cglobal cabac_encode_terminal_asm, 1,3
+    sub  dword [r0+cb.range], 2
 ; shortcut: the renormalization shift in terminal
 ; can only be 0 or 1 and is zero over 99% of the time.
-    test dword [t0+cb.range], 0x100
+    test dword [r0+cb.range], 0x100
     je .renorm
     RET
 .renorm:
-    shl  dword [t0+cb.low], 1
-    shl  dword [t0+cb.range], 1
-    inc  dword [t0+cb.queue]
+    shl  dword [r0+cb.low], 1
+    shl  dword [r0+cb.range], 1
+    inc  dword [r0+cb.queue]
     jge .putbyte
     RET
 .putbyte:
     PROLOGUE 0,7
-    mov t3d, [t0+cb.queue]
+    movifnidn t0, r0 ; WIN64
+    mov t3d, [r0+cb.queue]
     mov t6d, [t0+cb.low]
 
 cabac_putbyte: