[x264-devel] [PATCH] Add emulation for previously unsupported XOP fma instructions.
James Almer
jamrial at gmail.com
Sun Feb 23 03:26:05 CET 2014
Allow non-destructive emulation when the fourth argument
is the same as the first by specifying a fifth argument
to be used as a temporary register.
Signed-off-by: James Almer <jamrial at gmail.com>
---
I renamed FMA_INSTR to PMA_INSTR in order to keep the former
free for a future implementation of FMA3/4 emulation similar
to this one.
---
common/x86/quant-a.asm | 10 +++++-----
common/x86/x86inc.asm | 18 ------------------
common/x86/x86util.asm | 29 +++++++++++++++++++++++++++--
3 files changed, 32 insertions(+), 25 deletions(-)
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 26224aa..900937e 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -554,8 +554,8 @@ cglobal quant_4x4x4, 3,3,6
%if HIGH_BIT_DEPTH
mova m0, %1
mova m1, %4
- pmadcswd m0, m0, %2, m3
- pmadcswd m1, m1, %3, m3
+ PMADCSWD m0, m0, %2, m3, 0
+ PMADCSWD m1, m1, %3, m3, 0
psrad m0, xm2
psrad m1, xm2
mova %1, m0
@@ -569,8 +569,8 @@ cglobal quant_4x4x4, 3,3,6
punpckhwd m1, m0, m4
punpcklwd m0, m4
%endif
- pmadcswd m0, m0, %2, m3
- pmadcswd m1, m1, %3, m3
+ PMADCSWD m0, m0, %2, m3, 0
+ PMADCSWD m1, m1, %3, m3, 0
psrad m0, xm2
psrad m1, xm2
packssdw m0, m1
@@ -790,7 +790,7 @@ cglobal dequant_4x4dc, 0,3,6
%endif
%assign %%x 0
%rep SIZEOF_PIXEL*32/mmsize
- pmadcswd m0, m2, [r0+%%x], m4
+ PMADCSWD m0, m2, [r0+%%x], m4, 0
psrad m0, xm3
mova [r0+%%x], m0
%assign %%x %%x+mmsize
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index bd73bc4..f1d981c 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -1396,24 +1396,6 @@ AVX_INSTR pfmul, 1, 0, 1
%undef i
%undef j
-%macro FMA_INSTR 3
- %macro %1 4-7 %1, %2, %3
- %if cpuflag(xop)
- v%5 %1, %2, %3, %4
- %elifnidn %1, %4
- %6 %1, %2, %3
- %7 %1, %4
- %else
- %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
- %endif
- %endmacro
-%endmacro
-
-FMA_INSTR pmacsww, pmullw, paddw
-FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
-FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
-FMA_INSTR pmadcswd, pmaddwd, paddd
-
; convert FMA4 to FMA3 if possible
%macro FMA4_INSTR 4
%macro %1 4-8 %1, %2, %3, %4
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index 79ff34f..a146583 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -643,8 +643,14 @@
%macro SUMSUB2_AB 4
%if cpuflag(xop)
- pmacs%1%1 m%4, m%3, [p%1_m2], m%2
- pmacs%1%1 m%2, m%2, [p%1_2], m%3
+ %ifidn %1, d
+ %define pmacs PMACSDD
+ %elifidn %1, w
+ %define pmacs PMACSWW
+ %endif
+ pmacs m%4, m%3, [p%1_m2], m%2, 0
+ pmacs m%2, m%2, [p%1_2], m%3, 0
+ %undef pmacs
%elifnum %3
psub%1 m%4, m%2, m%3
psub%1 m%4, m%3
@@ -868,3 +874,22 @@
SWAP %2, %3
%endif
%endmacro
+
+%macro PMA_INSTR 4 ; args: 1=name of the macro to define, 2=XOP mnemonic (without the v prefix), 3=multiply op, 4=add op
+ %macro %1 5-8 %2, %3, %4 ; args: 1=dst, 2/3=multiply sources, 4=addend, 5=tmp; 6-8 default to the XOP/multiply/add ops given above
+ %if cpuflag(xop)
+ v%6 %1, %2, %3, %4 ; XOP build: emit the real instruction, tmp is not referenced
+ %elifidn %1, %4 ; dst is textually the same operand as the addend
+ %7 %5, %2, %3 ; form the product in tmp so the addend is not overwritten (three-operand form; presumably relies on x86inc operand emulation, confirm)
+ %8 %1, %4, %5 ; dst = addend + product
+ %else
+ %7 %1, %2, %3 ; dst differs from the addend, so the product can be written straight into dst
+ %8 %1, %4 ; dst = product + addend
+ %endif
+ %endmacro
+%endmacro
+
+PMA_INSTR PMACSWW, pmacsww, pmullw, paddw
+PMA_INSTR PMACSDD, pmacsdd, pmulld, paddd ; sse4 emulation
+PMA_INSTR PMACSDQL, pmacsdql, pmuldq, paddq ; sse4 emulation
+PMA_INSTR PMADCSWD, pmadcswd, pmaddwd, paddd
--
1.8.3.2
More information about the x264-devel
mailing list