[x265] [PATCH 263 of 307] x86: AVX512 Clean up of luma_vps and chroma_vps
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:21 CEST 2018
# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1513228979 -19800
# Thu Dec 14 10:52:59 2017 +0530
# Node ID 1480076a7bdda5ca31776adf31c087268f232107
# Parent 458b708e6d17aafb49a5fd369b2e9540d0268726
x86: AVX512 Clean up of luma_vps and chroma_vps
diff -r 458b708e6d17 -r 1480076a7bdd source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Wed Dec 13 11:05:10 2017 +0530
+++ b/source/common/x86/ipfilter8.asm Thu Dec 14 10:52:59 2017 +0530
@@ -10947,7 +10947,7 @@
%ifidn %1, pp
vbroadcasti32x8 m7, [pw_512]
%else
- add r3d, r3d
+ shl r3d, 1
vbroadcasti32x8 m7, [pw_2000]
mova m10, [interp4_vps_store1_avx512]
mova m11, [interp4_vps_store2_avx512]
@@ -11073,7 +11073,7 @@
%ifidn %1,pp
vbroadcasti32x8 m7, [pw_512]
%else
- add r3d, r3d
+ shl r3d, 1
vbroadcasti32x8 m7, [pw_2000]
mova m10, [interp4_vps_store1_avx512]
mova m11, [interp4_vps_store2_avx512]
@@ -11248,7 +11248,7 @@
%ifidn %1, pp
vbroadcasti32x8 m7, [pw_512]
%else
- add r3d, r3d
+ shl r3d, 1
vbroadcasti32x8 m7, [pw_2000]
mova m10, [interp4_vps_store1_avx512]
mova m11, [interp4_vps_store2_avx512]
@@ -11400,7 +11400,7 @@
%ifidn %1,pp
vbroadcasti32x8 m12, [pw_512]
%else
- add r3d, r3d
+ shl r3d, 1
vbroadcasti32x8 m12, [pw_2000]
mova m13, [interp4_vps_store1_avx512]
mova m14, [interp4_vps_store2_avx512]
@@ -14256,24 +14256,23 @@
%else
psubw m0, m7
psubw m1, m7
+ mova m12, m16
+ mova m13, m17
+ vpermi2q m12, m0, m1
+ vpermi2q m13, m0, m1
+ movu [r2], ym12
+ vextracti32x8 [r2 + 2 * r3], m12, 1
+
psubw m2, m7
psubw m3, m7
-
- mova m12, m16
- mova m13, m17
mova m14, m16
mova m15, m17
-
- vpermi2q m12, m0, m1
- vpermi2q m13, m0, m1
vpermi2q m14, m2, m3
vpermi2q m15, m2, m3
-
- movu [r2], ym12
movu [r2 + r3], ym14
- vextracti32x8 [r2 + 2 * r3], m12, 1
vextracti32x8 [r2 + r7], m14, 1
lea r2, [r2 + 4 * r3]
+
movu [r2], ym13
movu [r2 + r3], ym15
vextracti32x8 [r2 + 2 * r3], m13, 1
@@ -14303,7 +14302,7 @@
%ifidn %1, pp
vbroadcasti32x8 m7, [pw_512]
%else
- add r3d, r3d
+ shl r3d, 1
vbroadcasti32x8 m7, [pw_2000]
mova m16, [interp4_vps_store1_avx512]
mova m17, [interp4_vps_store2_avx512]
@@ -14425,22 +14424,20 @@
%else
psubw m0, m7
psubw m1, m7
+ mova m12, m16
+ mova m13, m17
+ vpermi2q m12, m0, m1
+ vpermi2q m13, m0, m1
+ movu [r2], m12
+ movu [r2 + 2 * r3], m13
+
psubw m2, m7
psubw m3, m7
-
- mova m12, m16
- mova m13, m17
mova m14, m16
mova m15, m17
-
- vpermi2q m12, m0, m1
- vpermi2q m13, m0, m1
vpermi2q m14, m2, m3
vpermi2q m15, m2, m3
-
- movu [r2], m12
movu [r2 + r3], m14
- movu [r2 + 2 * r3], m13
movu [r2 + r7], m15
%endif
%endmacro
@@ -14467,7 +14464,7 @@
%ifidn %1, pp
vbroadcasti32x8 m7, [pw_512]
%else
- add r3d, r3d
+ shl r3d, 1
vbroadcasti32x8 m7, [pw_2000]
mova m16, [interp4_vps_store1_avx512]
mova m17, [interp4_vps_store2_avx512]
@@ -14596,22 +14593,20 @@
%else
psubw m0, m7
psubw m1, m7
+ mova m12, m16
+ mova m13, m17
+ vpermi2q m12, m0, m1
+ vpermi2q m13, m0, m1
+ movu [r9], m12
+ movu [r9 + 2 * r3], m13
+
psubw m2, m7
psubw m3, m7
-
- mova m12, m16
- mova m13, m17
mova m14, m16
mova m15, m17
-
- vpermi2q m12, m0, m1
- vpermi2q m13, m0, m1
vpermi2q m14, m2, m3
vpermi2q m15, m2, m3
-
- movu [r9], m12
movu [r9 + r3], m14
- movu [r9 + 2 * r3], m13
movu [r9 + r7], m15
%endif
movu xm1, [r0 + mmsize/2]
@@ -14726,24 +14721,23 @@
%else
psubw m0, m7
psubw m1, m7
+ mova m12, m16
+ mova m13, m17
+ vpermi2q m12, m0, m1
+ vpermi2q m13, m0, m1
+ movu [r2 + mmsize], ym12
+ vextracti32x8 [r2 + 2 * r3 + mmsize], m12, 1
+
psubw m2, m7
psubw m3, m7
-
- mova m12, m16
- mova m13, m17
mova m14, m16
mova m15, m17
-
- vpermi2q m12, m0, m1
- vpermi2q m13, m0, m1
vpermi2q m14, m2, m3
vpermi2q m15, m2, m3
-
- movu [r2 + mmsize], ym12
movu [r2 + r3 + mmsize], ym14
- vextracti32x8 [r2 + 2 * r3 + mmsize], m12, 1
vextracti32x8 [r2 + r7 + mmsize], m14, 1
lea r2, [r2 + 4 * r3]
+
movu [r2 + mmsize], ym13
movu [r2 + r3 + mmsize], ym15
vextracti32x8 [r2 + 2 * r3 + mmsize], m13, 1
@@ -14774,7 +14768,7 @@
%ifidn %1, pp
vbroadcasti32x8 m7, [pw_512]
%else
- add r3d, r3d
+ shl r3d, 1
vbroadcasti32x8 m7, [pw_2000]
mova m16, [interp4_vps_store1_avx512]
mova m17, [interp4_vps_store2_avx512]
@@ -14877,21 +14871,19 @@
%else
psubw m0, m7
psubw m1, m7
+ mova m12, m16
+ mova m13, m17
+ vpermi2q m12, m0, m1
+ vpermi2q m13, m0, m1
+ movu [r2], m12
+ movu [r2 + mmsize], m13
+
psubw m2, m7
psubw m3, m7
-
- mova m12, m16
- mova m13, m17
mova m14, m16
mova m15, m17
-
- vpermi2q m12, m0, m1
- vpermi2q m13, m0, m1
vpermi2q m14, m2, m3
vpermi2q m15, m2, m3
-
- movu [r2], m12
- movu [r2 + mmsize], m13
movu [r2 + r3], m14
movu [r2 + r3 + mmsize], m15
%endif
@@ -14919,7 +14911,7 @@
%ifidn %1, pp
vbroadcasti32x8 m7, [pw_512]
%else
- add r3d, r3d
+ shl r3d, 1
vbroadcasti32x8 m7, [pw_2000]
mova m16, [interp4_vps_store1_avx512]
mova m17, [interp4_vps_store2_avx512]
More information about the x265-devel mailing list