[x264-devel] BMI1 decimate functions
Jason Garrett-Glaser
git at videolan.org
Wed Mar 7 03:20:16 CET 2012
x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Tue Feb 14 16:54:03 2012 -0800| [5a242c5862baaa4bd5829bd1b43dc11cf5c86344] | committer: Jason Garrett-Glaser
BMI1 decimate functions
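Intel was nice enough to encode tzcnt as "rep bsf", which is backwards-compatible: CPUs without BMI1 ignore the rep prefix and execute a plain bsf (the results match for any non-zero input).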
This means we don't actually have to add new functions to make it work.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=5a242c5862baaa4bd5829bd1b43dc11cf5c86344
---
common/x86/bitstream-a.asm | 2 +-
common/x86/quant-a.asm | 10 +++++-----
common/x86/x86inc.asm | 4 ++++
3 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm
index 8fc9197..c2aaf79 100644
--- a/common/x86/bitstream-a.asm
+++ b/common/x86/bitstream-a.asm
@@ -91,7 +91,7 @@ ALIGN 16
.escape:
; Skip bytes that are known to be valid
and r4d, r3d
- bsf r3d, r4d
+ tzcnt r3d, r4d
add r1, r3
.escape_loop:
inc r1
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index bbe2930..fefc435 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -912,7 +912,7 @@ cextern decimate_table8
%macro DECIMATE4x4 1
-;A LUT is faster than bsf on AMD processors.
+;A LUT is faster than bsf on older AMD processors.
;This is not true for score64.
cglobal decimate_score%1, 1,3
%ifdef PIC
@@ -947,7 +947,7 @@ cglobal decimate_score%1, 1,3
add al, byte [mask_table + rdx]
%else
.loop:
- bsf ecx, edx
+ tzcnt ecx, edx
shr edx, cl
add al, byte [table + rcx]
shr edx, 1
@@ -1011,7 +1011,7 @@ cglobal decimate_score64, 1,5
add eax, r3d
jne .ret9
.loop:
- bsf rcx, r1
+ tzcnt rcx, r1
shr r1, cl
add al, byte [table + rcx]
shr r1, 1
@@ -1047,7 +1047,7 @@ cglobal decimate_score64, 1,5
add r0, r2
jne .ret9 ;r0 is zero at this point, so we don't need to zero it
.loop:
- bsf ecx, r3
+ tzcnt ecx, r3
test r3, r3
je .largerun
shrd r3, r4, cl
@@ -1073,7 +1073,7 @@ cglobal decimate_score64, 1,5
.largerun:
mov r3, r4
xor r4, r4
- bsf ecx, r3
+ tzcnt ecx, r3
shr r3, cl
shr r3, 1
jne .loop
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index 6373b61..487e170 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -1098,3 +1098,7 @@ AVX_INSTR pfmul, 1, 0, 1
FMA_INSTR pmacsdd, pmulld, paddd
FMA_INSTR pmacsww, pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
+
+; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
+; This lets us use tzcnt without bumping the yasm version requirement yet.
+%define tzcnt rep bsf
More information about the x264-devel mailing list