[x264-devel] BMI1 decimate functions

Wed Mar 7 03:20:16 CET 2012

x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Tue Feb 14 16:54:03 2012 -0800| [5a242c5862baaa4bd5829bd1b43dc11cf5c86344] | committer: Jason Garrett-Glaser

BMI1 decimate functions
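Intel was nice enough to encode tzcnt as "rep bsf", which is backwards-compatible: CPUs without BMI1 ignore the rep prefix and execute it as a plain bsf.
The two give the same result for any nonzero source (they differ only in flags and in the zero-input case), so as long as the code doesn't depend on those, we don't actually have to add new functions to make it work.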

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=5a242c5862baaa4bd5829bd1b43dc11cf5c86344
---

 common/x86/bitstream-a.asm |    2 +-
 common/x86/quant-a.asm     |   10 +++++-----
 common/x86/x86inc.asm      |    4 ++++
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm
index 8fc9197..c2aaf79 100644
--- a/common/x86/bitstream-a.asm
+++ b/common/x86/bitstream-a.asm
@@ -91,7 +91,7 @@ ALIGN 16
 .escape:
     ; Skip bytes that are known to be valid
     and      r4d, r3d
-    bsf      r3d, r4d
+    tzcnt    r3d, r4d
     add       r1, r3
 .escape_loop:
     inc       r1
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index bbe2930..fefc435 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -912,7 +912,7 @@ cextern decimate_table8
 
 %macro DECIMATE4x4 1
 
-;A LUT is faster than bsf on AMD processors.
+;A LUT is faster than bsf on older AMD processors.
 ;This is not true for score64.
 cglobal decimate_score%1, 1,3
 %ifdef PIC
@@ -947,7 +947,7 @@ cglobal decimate_score%1, 1,3
     add    al, byte [mask_table + rdx]
 %else
 .loop:
-    bsf   ecx, edx
+    tzcnt ecx, edx
     shr   edx, cl
     add    al, byte [table + rcx]
     shr   edx, 1
@@ -1011,7 +1011,7 @@ cglobal decimate_score64, 1,5
     add   eax, r3d
     jne  .ret9
 .loop:
-    bsf   rcx, r1
+    tzcnt rcx, r1
     shr   r1, cl
     add   al, byte [table + rcx]
     shr   r1, 1
@@ -1047,7 +1047,7 @@ cglobal decimate_score64, 1,5
     add   r0, r2
     jne  .ret9      ;r0 is zero at this point, so we don't need to zero it
 .loop:
-    bsf   ecx, r3
+    tzcnt ecx, r3
     test  r3, r3
     je   .largerun
     shrd  r3, r4, cl
@@ -1073,7 +1073,7 @@ cglobal decimate_score64, 1,5
 .largerun:
     mov   r3, r4
     xor   r4, r4
-    bsf   ecx, r3
+    tzcnt ecx, r3
     shr   r3, cl
     shr   r3, 1
     jne  .loop
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index 6373b61..487e170 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -1098,3 +1098,7 @@ AVX_INSTR pfmul, 1, 0, 1
 FMA_INSTR  pmacsdd,  pmulld, paddd
 FMA_INSTR  pmacsww,  pmullw, paddw
 FMA_INSTR pmadcswd, pmaddwd, paddd
+
+; tzcnt is encoded as "rep bsf"; CPUs without BMI1 ignore the prefix and run it as bsf (same result for nonzero input).
+; Defining it here lets us use tzcnt without bumping the yasm version requirement yet.
+%define tzcnt rep bsf