[x264-devel] [PATCH] Add emulation for previously unsupported XOP fma instructions.

James Almer jamrial at gmail.com
Sun Feb 23 03:26:05 CET 2014


Allow non-destructive emulation when the fourth argument (the
addend) is the same as the first (the destination) by specifying
a fifth argument to be used as a temporary register.

Signed-off-by: James Almer <jamrial at gmail.com>
---
I renamed FMA_INSTR to PMA_INSTR in order to keep the former 
free for a future implementation of FMA3/4 emulation similar 
to this one.
---
 common/x86/quant-a.asm | 10 +++++-----
 common/x86/x86inc.asm  | 18 ------------------
 common/x86/x86util.asm | 29 +++++++++++++++++++++++++++--
 3 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 26224aa..900937e 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -554,8 +554,8 @@ cglobal quant_4x4x4, 3,3,6
 %if HIGH_BIT_DEPTH
     mova      m0, %1
     mova      m1, %4
-    pmadcswd  m0, m0, %2, m3
-    pmadcswd  m1, m1, %3, m3
+    PMADCSWD  m0, m0, %2, m3, 0
+    PMADCSWD  m1, m1, %3, m3, 0
     psrad     m0, xm2
     psrad     m1, xm2
     mova      %1, m0
@@ -569,8 +569,8 @@ cglobal quant_4x4x4, 3,3,6
     punpckhwd m1, m0, m4
     punpcklwd m0, m4
 %endif
-    pmadcswd  m0, m0, %2, m3
-    pmadcswd  m1, m1, %3, m3
+    PMADCSWD  m0, m0, %2, m3, 0
+    PMADCSWD  m1, m1, %3, m3, 0
     psrad     m0, xm2
     psrad     m1, xm2
     packssdw  m0, m1
@@ -790,7 +790,7 @@ cglobal dequant_4x4dc, 0,3,6
 %endif
 %assign %%x 0
 %rep SIZEOF_PIXEL*32/mmsize
-    pmadcswd  m0, m2, [r0+%%x], m4
+    PMADCSWD  m0, m2, [r0+%%x], m4, 0
     psrad     m0, xm3
     mova      [r0+%%x], m0
 %assign %%x %%x+mmsize
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index bd73bc4..f1d981c 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -1396,24 +1396,6 @@ AVX_INSTR pfmul, 1, 0, 1
 %undef i
 %undef j
 
-%macro FMA_INSTR 3
-    %macro %1 4-7 %1, %2, %3
-        %if cpuflag(xop)
-            v%5 %1, %2, %3, %4
-        %elifnidn %1, %4
-            %6 %1, %2, %3
-            %7 %1, %4
-        %else
-            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
-        %endif
-    %endmacro
-%endmacro
-
-FMA_INSTR  pmacsww,  pmullw, paddw
-FMA_INSTR  pmacsdd,  pmulld, paddd ; sse4 emulation
-FMA_INSTR pmacsdql,  pmuldq, paddq ; sse4 emulation
-FMA_INSTR pmadcswd, pmaddwd, paddd
-
 ; convert FMA4 to FMA3 if possible
 %macro FMA4_INSTR 4
     %macro %1 4-8 %1, %2, %3, %4
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index 79ff34f..a146583 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -643,8 +643,14 @@
 
 %macro SUMSUB2_AB 4
 %if cpuflag(xop)
-    pmacs%1%1 m%4, m%3, [p%1_m2], m%2
-    pmacs%1%1 m%2, m%2, [p%1_2], m%3
+    %ifidn   %1, d
+        %define pmacs PMACSDD
+    %elifidn %1, w
+        %define pmacs PMACSWW
+    %endif
+    pmacs m%4, m%3, [p%1_m2], m%2, 0
+    pmacs m%2, m%2, [p%1_2], m%3, 0
+    %undef pmacs
 %elifnum %3
     psub%1  m%4, m%2, m%3
     psub%1  m%4, m%3
@@ -868,3 +874,22 @@
     SWAP       %2, %3
 %endif
 %endmacro
+
+%macro PMA_INSTR 4
+    %macro %1 5-8 %2, %3, %4
+        %if cpuflag(xop)
+            v%6 %1, %2, %3, %4
+        %elifidn %1, %4
+            %7 %5, %2, %3
+            %8 %1, %4, %5
+        %else
+            %7 %1, %2, %3
+            %8 %1, %4
+        %endif
+    %endmacro
+%endmacro
+
+PMA_INSTR  PMACSWW,  pmacsww,  pmullw, paddw
+PMA_INSTR  PMACSDD,  pmacsdd,  pmulld, paddd ; sse4 emulation
+PMA_INSTR PMACSDQL, pmacsdql,  pmuldq, paddq ; sse4 emulation
+PMA_INSTR PMADCSWD, pmadcswd, pmaddwd, paddd
-- 
1.8.3.2



More information about the x264-devel mailing list