[x264-devel] x86: Correctly use v-prefix for instructions with opmasks
Henrik Gramner
git at videolan.org
Tue Aug 7 00:05:12 CEST 2018
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sat Mar 31 01:31:57 2018 +0200| [5f7f950c80e330728ecb07bc133e17456870121a] | committer: Henrik Gramner
x86: Correctly use v-prefix for instructions with opmasks
The v-prefix was always required for instructions that use opmasks, but
omitting it accidentally happened to work correctly in a few cases.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=5f7f950c80e330728ecb07bc133e17456870121a
---
common/x86/dct-a.asm | 10 +++++-----
common/x86/mc-a2.asm | 8 ++++----
common/x86/pixel-a.asm | 2 +-
3 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index ca842ee9..01311c49 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -621,8 +621,8 @@ cglobal sub16x16_dct, 3,3,6
SBUTTERFLY wd, 1, 0, 2
paddw m2, m1, m0
psubw m3, m1, m0
- paddw m2 {k1}, m1 ; 0+1+2+3 0<<1+1-2-3<<1
- psubw m3 {k1}, m0 ; 0-1-2+3 0-1<<1+2<<1-3
+ vpaddw m2 {k1}, m1 ; 0+1+2+3 0<<1+1-2-3<<1
+ vpsubw m3 {k1}, m0 ; 0-1-2+3 0-1<<1+2<<1-3
shufps m1, m2, m3, q2323 ; a3 b3 a2 b2 c3 d3 c2 d2
punpcklqdq m2, m3 ; a0 b0 a1 b1 c0 d0 c1 d1
SUMSUB_BA w, 1, 2, 3
@@ -630,8 +630,8 @@ cglobal sub16x16_dct, 3,3,6
shufps m1, m2, q2020 ; a0+a3 b0+b3 c0+c3 d0+d3 a0-a3 b0-b3 c0-c3 d0-d3
paddw m2, m1, m3
psubw m0, m1, m3
- paddw m2 {k2}, m1 ; 0'+1'+2'+3' 0'<<1+1'-2'-3'<<1
- psubw m0 {k2}, m3 ; 0'-1'-2'+3' 0'-1'<<1+2'<<1-3'
+ vpaddw m2 {k2}, m1 ; 0'+1'+2'+3' 0'<<1+1'-2'-3'<<1
+ vpsubw m0 {k2}, m3 ; 0'-1'-2'+3' 0'-1'<<1+2'<<1-3'
%endmacro
INIT_XMM avx512
@@ -743,7 +743,7 @@ cglobal sub8x8_dct_dc, 3,3
paddw xmm0, xmm2 ; 0+1 0+1 2+3 2+3
punpckldq xmm0, xmm1 ; 0+1 0+1 0-1 0-1 2+3 2+3 2-3 2-3
punpcklqdq xmm1, xmm0, xmm0
- psubw xmm0 {k1}, xm3, xmm0
+ vpsubw xmm0 {k1}, xm3, xmm0
paddw xmm0, xmm1 ; 0+1+2+3 0+1-2-3 0-1+2-3 0-1-2+3
movhps [r0], xmm0
RET
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index e3658f23..3c031313 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -2518,8 +2518,8 @@ cglobal mbtree_propagate_list_internal, 5,7,21
paddd m6, m7 ; i_mb_x += 8
pand m3, m8 ; {x, y}
vprold m1, m3, 20 ; {y, x} << 4
- psubw m3 {k4}, m9, m3 ; {32-x, 32-y}, {32-x, y}
- psubw m1 {k5}, m10, m1 ; ({32-y, x}, {y, x}) << 4
+ vpsubw m3 {k4}, m9, m3 ; {32-x, 32-y}, {32-x, y}
+ vpsubw m1 {k5}, m10, m1 ; ({32-y, x}, {y, x}) << 4
pmullw m3, m1
paddsw m3, m3 ; prevent signed overflow in idx0 (32*32<<5 == 0x8000)
pmulhrsw m2, m3, m4 ; idx01weight idx23weightp
@@ -2530,11 +2530,11 @@ cglobal mbtree_propagate_list_internal, 5,7,21
vpcmpuw k2, ym1, ym20, 1 ; {mbx, mbx+1} < width
kunpckwd k2, k2, k2
psrad m1, m0, 16
- paddd m1 {k6}, m11
+ vpaddd m1 {k6}, m11
vpcmpud k1 {k1}, m1, m13, 1 ; mby < height | mby+1 < height
pmaddwd m0, m15
- paddd m0 {k6}, m14 ; idx0 | idx2
+ vpaddd m0 {k6}, m14 ; idx0 | idx2
vmovdqu16 m2 {k2}{z}, m2 ; idx01weight | idx23weight
vptestmd k1 {k1}, m2, m2 ; mask out offsets with no changes
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 4271c921..04fe7099 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -4744,7 +4744,7 @@ cglobal intra_sad_x9_8x8, 5,7,8
%endmacro
%macro SATD_AVX512_END 0-1 0 ; sa8d
- paddw m0 {k1}{z}, m1 ; zero-extend to dwords
+ vpaddw m0 {k1}{z}, m1 ; zero-extend to dwords
%if ARCH_X86_64
%if mmsize == 64
vextracti32x8 ym1, m0, 1
More information about the x264-devel
mailing list