[x264-devel] commit: Faster cabac_encode_decision_asm (Holger Lubitz)
git at videolan.org
git at videolan.org
Sun Mar 28 04:44:18 CEST 2010
x264 | branch: master | Holger Lubitz <holger at lubitz.org> | Wed Mar 24 00:54:39 2010 +0100| [8ec5a5ee10face5a1e64bf15364d1fdf2cf29ae5] | committer: Jason Garrett-Glaser
Faster cabac_encode_decision_asm
Minimizes instruction count, which also means smaller code.
Various other slight changes to allow more instruction level parallelism.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=8ec5a5ee10face5a1e64bf15364d1fdf2cf29ae5
---
common/x86/cabac-a.asm | 77 ++++++++++++++++++++++--------------------------
1 files changed, 35 insertions(+), 42 deletions(-)
diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
index 62e281a..b18e0db 100644
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -5,6 +5,7 @@
;*
;* Author: Loren Merritt <lorenm at u.washington.edu>
;* Jason Garrett-Glaser <darkshikari at gmail.com>
+;* Holger Lubitz <holger at lubitz.org>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
@@ -39,7 +40,7 @@ cextern x264_cabac_renorm_shift
DECLARE_REG_TMP 0,1,2,3,4,5,6,10
%define pointer resq
%else
- DECLARE_REG_TMP 0,3,2,1,4,5,6,3
+ DECLARE_REG_TMP 0,4,2,1,3,5,6,2
%define pointer resd
%endif
@@ -73,69 +74,61 @@ cglobal x264_cabac_encode_decision_asm, 0,7
movifnidn t0, r0mp
movifnidn t1d, r1m
mov t5d, [t0+cb.range]
- movzx t3d, byte [t0+cb.state+t1]
- mov t4d, t5d
+ movzx t6d, byte [t0+cb.state+t1]
+ mov t3d, t5d
shr t5d, 6
- LOAD_GLOBAL t5d, x264_cabac_range_lps-4, t5, t3*4
- sub t4d, t5d
- mov t6d, t3d
- shr t6d, 6
movifnidn t2d, r2m
+ LOAD_GLOBAL t5d, x264_cabac_range_lps-4, t5, t6*4
+ LOAD_GLOBAL t4d, x264_cabac_transition, t2, t6*2
+ shr t6d, 6
+ sub t3d, t5d
cmp t6d, t2d
mov t6d, [t0+cb.low]
- lea t7, [t6+t4]
- cmovne t4d, t5d
+ lea t7, [t6+t3]
+ cmovne t3d, t5d
cmovne t6d, t7d
- LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
- movifnidn t1d, r1m
- mov [t0+cb.state+t1], t3b
-.renorm:
- mov t3d, t4d
+ mov [t0+cb.state+t1], t4b
+;x264_cabac_encode_renorm
+ mov t4d, t3d
shr t3d, 3
LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
shl t4d, t3b
shl t6d, t3b
add t3d, [t0+cb.queue]
mov [t0+cb.range], t4d
- mov [t0+cb.low], t6d
- mov [t0+cb.queue], t3d
cmp t3d, 8
- jge .putbyte
- REP_RET
-.putbyte:
+ jl .update_queue_low
+;x264_cabac_putbyte
; alive: t0=cb t3=queue t6=low
+%ifdef WIN64
+ DECLARE_REG_TMP 3,4,1,0,2,5,6,10
+%endif
+ mov t1d, -1
add t3d, 2
- mov t1d, 1
mov t2d, t6d
shl t1d, t3b
shr t2d, t3b ; out
- dec t1d
+ not t1d
sub t3d, 10
and t6d, t1d
- cmp t2b, 0xff ; FIXME is a 32bit op faster?
- mov [t0+cb.queue], t3d
- mov [t0+cb.low], t6d
- mov t1d, t2d
- mov t4, [t0+cb.p]
- je .postpone
mov t5d, [t0+cb.bytes_outstanding]
- shr t1d, 8 ; carry
- add [t4-1], t1b
- test t5d, t5d
- jz .no_outstanding
- dec t1d
+ cmp t2b, 0xff ; FIXME is a 32bit op faster?
+ jz .postpone
+ mov t1, [t0+cb.p]
+ add [t1-1], dh ; t2h
+ dec dh
.loop_outstanding:
- mov [t4], t1b
- inc t4
+ mov [t1], dh
+ inc t1
dec t5d
- jg .loop_outstanding
-.no_outstanding:
- mov [t4], t2b
- inc t4
- mov [t0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
- mov [t0+cb.p], t4
- RET
+ jge .loop_outstanding
+ mov [t1-1], t2b
+ mov [t0+cb.p], t1
.postpone:
- inc dword [t0+cb.bytes_outstanding]
+ inc t5d
+ mov [t0+cb.bytes_outstanding], t5d
+.update_queue_low:
+ mov [t0+cb.low], t6d
+ mov [t0+cb.queue], t3d
RET
More information about the x264-devel mailing list