[x265] [PATCH 16 of 20] x86inc: Fix AVX emulation of some instructions
vignesh at multicorewareinc.com
vignesh at multicorewareinc.com
Mon Jun 12 07:37:58 CEST 2017
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1496813868 -19800
# Wed Jun 07 11:07:48 2017 +0530
# Node ID d64adca2176b860242da335e3c8d19f77eb0559b
# Parent 27cf65f6bd8a45def62366fc004b42ca270d81ec
x86inc: Fix AVX emulation of some instructions
diff -r 27cf65f6bd8a -r d64adca2176b source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Tue Jun 06 18:50:22 2017 +0530
+++ b/source/common/x86/loopfilter.asm Wed Jun 07 11:07:48 2017 +0530
@@ -1583,7 +1583,7 @@
pshufb m1, m4, m0
pcmpgtb m0, [pb_15] ; m0 = [mask]
- pblendvb m6, m6, m1, m0 ; NOTE: don't use 3 parameters style, x264 macro have some bug!
+ pblendvb m6, m1, m0
pmovsxbw m0, m6 ; offset
punpckhbw m6, m6
@@ -1630,7 +1630,7 @@
pshufb m6, m3, m1
pshufb m5, m4, m1
- pblendvb m6, m6, m5, m0 ; NOTE: don't use 3 parameters style, x264 macro have some bug!
+ pblendvb m6, m5, m0
pmovzxbw m1, m2 ; rec
punpckhbw m2, m7
@@ -1904,7 +1904,7 @@
sub r3, r4
movu xmm0, [r3]
movu m3, [r0]
- pblendvb m5, m5, m3, xmm0
+ pblendvb m5, m3, xmm0
movu [r0], m5
.end:
diff -r 27cf65f6bd8a -r d64adca2176b source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Jun 06 18:50:22 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Wed Jun 07 11:07:48 2017 +0530
@@ -1597,7 +1597,7 @@
.widthLess8:
movu m6, [r1]
- pblendvb m6, m6, m7, m0
+ pblendvb m6, m7, m0
movu [r1], m6
.nextH:
diff -r 27cf65f6bd8a -r d64adca2176b source/common/x86/x86inc.asm
--- a/source/common/x86/x86inc.asm Tue Jun 06 18:50:22 2017 +0530
+++ b/source/common/x86/x86inc.asm Wed Jun 07 11:07:48 2017 +0530
@@ -1052,7 +1052,7 @@
;%1 == instruction
;%2 == minimal instruction set
;%3 == 1 if float, 0 if int
-;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
;%6+: operands
%macro RUN_AVX_INSTR 6-9+
@@ -1127,9 +1127,9 @@
;%1 == instruction
;%2 == minimal instruction set
;%3 == 1 if float, 0 if int
-;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
-%macro AVX_INSTR 1-5 fnord, 0, 1, 0
+%macro AVX_INSTR 1-5 fnord, 0, 255, 0
%macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
%ifidn %2, fnord
RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
@@ -1149,8 +1149,8 @@
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, sse2, 1, 0, 1
AVX_INSTR addps, sse, 1, 0, 1
-AVX_INSTR addsd, sse2, 1, 0, 1
-AVX_INSTR addss, sse, 1, 0, 1
+AVX_INSTR addsd, sse2, 1, 0, 0
+AVX_INSTR addss, sse, 1, 0, 0
AVX_INSTR addsubpd, sse3, 1, 0, 0
AVX_INSTR addsubps, sse3, 1, 0, 0
AVX_INSTR aesdec, fnord, 0, 0, 0
@@ -1163,10 +1163,10 @@
AVX_INSTR andnps, sse, 1, 0, 0
AVX_INSTR andpd, sse2, 1, 0, 1
AVX_INSTR andps, sse, 1, 0, 1
-AVX_INSTR blendpd, sse4, 1, 0, 0
-AVX_INSTR blendps, sse4, 1, 0, 0
-AVX_INSTR blendvpd, sse4, 1, 0, 0
-AVX_INSTR blendvps, sse4, 1, 0, 0
+AVX_INSTR blendpd, sse4, 1, 1, 0
+AVX_INSTR blendps, sse4, 1, 1, 0
+AVX_INSTR blendvpd, sse4 ; can't be emulated
+AVX_INSTR blendvps, sse4 ; can't be emulated
AVX_INSTR cmppd, sse2, 1, 1, 0
AVX_INSTR cmpps, sse, 1, 1, 0
AVX_INSTR cmpsd, sse2, 1, 1, 0
@@ -1180,10 +1180,10 @@
AVX_INSTR cvtps2dq, sse2
AVX_INSTR cvtps2pd, sse2
AVX_INSTR cvtsd2si, sse2
-AVX_INSTR cvtsd2ss, sse2
-AVX_INSTR cvtsi2sd, sse2
-AVX_INSTR cvtsi2ss, sse
-AVX_INSTR cvtss2sd, sse2
+AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
+AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
+AVX_INSTR cvtsi2ss, sse, 1, 0, 0
+AVX_INSTR cvtss2sd, sse2, 1, 0, 0
AVX_INSTR cvtss2si, sse
AVX_INSTR cvttpd2dq, sse2
AVX_INSTR cvttps2dq, sse2
@@ -1206,12 +1206,12 @@
AVX_INSTR maskmovdqu, sse2
AVX_INSTR maxpd, sse2, 1, 0, 1
AVX_INSTR maxps, sse, 1, 0, 1
-AVX_INSTR maxsd, sse2, 1, 0, 1
-AVX_INSTR maxss, sse, 1, 0, 1
+AVX_INSTR maxsd, sse2, 1, 0, 0
+AVX_INSTR maxss, sse, 1, 0, 0
AVX_INSTR minpd, sse2, 1, 0, 1
AVX_INSTR minps, sse, 1, 0, 1
-AVX_INSTR minsd, sse2, 1, 0, 1
-AVX_INSTR minss, sse, 1, 0, 1
+AVX_INSTR minsd, sse2, 1, 0, 0
+AVX_INSTR minss, sse, 1, 0, 0
AVX_INSTR movapd, sse2
AVX_INSTR movaps, sse
AVX_INSTR movd, mmx
@@ -1237,11 +1237,11 @@
AVX_INSTR movss, sse, 1, 0, 0
AVX_INSTR movupd, sse2
AVX_INSTR movups, sse
-AVX_INSTR mpsadbw, sse4
+AVX_INSTR mpsadbw, sse4, 0, 1, 0
AVX_INSTR mulpd, sse2, 1, 0, 1
AVX_INSTR mulps, sse, 1, 0, 1
-AVX_INSTR mulsd, sse2, 1, 0, 1
-AVX_INSTR mulss, sse, 1, 0, 1
+AVX_INSTR mulsd, sse2, 1, 0, 0
+AVX_INSTR mulss, sse, 1, 0, 0
AVX_INSTR orpd, sse2, 1, 0, 1
AVX_INSTR orps, sse, 1, 0, 1
AVX_INSTR pabsb, ssse3
@@ -1259,14 +1259,18 @@
AVX_INSTR paddsw, mmx, 0, 0, 1
AVX_INSTR paddusb, mmx, 0, 0, 1
AVX_INSTR paddusw, mmx, 0, 0, 1
-AVX_INSTR palignr, ssse3
+AVX_INSTR palignr, ssse3, 0, 1, 0
AVX_INSTR pand, mmx, 0, 0, 1
AVX_INSTR pandn, mmx, 0, 0, 0
AVX_INSTR pavgb, mmx2, 0, 0, 1
AVX_INSTR pavgw, mmx2, 0, 0, 1
-AVX_INSTR pblendvb, sse4, 0, 0, 0
-AVX_INSTR pblendw, sse4
-AVX_INSTR pclmulqdq
+AVX_INSTR pblendvb, sse4 ; can't be emulated
+AVX_INSTR pblendw, sse4, 0, 1, 0
+AVX_INSTR pclmulqdq, fnord, 0, 1, 0
+AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
AVX_INSTR pcmpestri, sse42
AVX_INSTR pcmpestrm, sse42
AVX_INSTR pcmpistri, sse42
@@ -1290,10 +1294,10 @@
AVX_INSTR phsubw, ssse3, 0, 0, 0
AVX_INSTR phsubd, ssse3, 0, 0, 0
AVX_INSTR phsubsw, ssse3, 0, 0, 0
-AVX_INSTR pinsrb, sse4
-AVX_INSTR pinsrd, sse4
-AVX_INSTR pinsrq, sse4
-AVX_INSTR pinsrw, mmx2
+AVX_INSTR pinsrb, sse4, 0, 1, 0
+AVX_INSTR pinsrd, sse4, 0, 1, 0
+AVX_INSTR pinsrq, sse4, 0, 1, 0
+AVX_INSTR pinsrw, mmx2, 0, 1, 0
AVX_INSTR pmaddwd, mmx, 0, 0, 1
AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
AVX_INSTR pmaxsb, sse4, 0, 0, 1
@@ -1365,18 +1369,18 @@
AVX_INSTR punpckldq, mmx, 0, 0, 0
AVX_INSTR punpcklqdq, sse2, 0, 0, 0
AVX_INSTR pxor, mmx, 0, 0, 1
-AVX_INSTR rcpps, sse, 1, 0, 0
+AVX_INSTR rcpps, sse
AVX_INSTR rcpss, sse, 1, 0, 0
AVX_INSTR roundpd, sse4
AVX_INSTR roundps, sse4
-AVX_INSTR roundsd, sse4
-AVX_INSTR roundss, sse4
-AVX_INSTR rsqrtps, sse, 1, 0, 0
+AVX_INSTR roundsd, sse4, 1, 1, 0
+AVX_INSTR roundss, sse4, 1, 1, 0
+AVX_INSTR rsqrtps, sse
AVX_INSTR rsqrtss, sse, 1, 0, 0
AVX_INSTR shufpd, sse2, 1, 1, 0
AVX_INSTR shufps, sse, 1, 1, 0
-AVX_INSTR sqrtpd, sse2, 1, 0, 0
-AVX_INSTR sqrtps, sse, 1, 0, 0
+AVX_INSTR sqrtpd, sse2
+AVX_INSTR sqrtps, sse
AVX_INSTR sqrtsd, sse2, 1, 0, 0
AVX_INSTR sqrtss, sse, 1, 0, 0
AVX_INSTR stmxcsr, sse
More information about the x265-devel
mailing list