[x265] [PATCH 263 of 307] x86: AVX512 Clean up of luma_vps and chroma_vps

mythreyi at multicorewareinc.com
Sat Apr 7 04:34:21 CEST 2018


# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1513228979 -19800
#      Thu Dec 14 10:52:59 2017 +0530
# Node ID 1480076a7bdda5ca31776adf31c087268f232107
# Parent  458b708e6d17aafb49a5fd369b2e9540d0268726
x86: AVX512 Clean up of luma_vps and chroma_vps

diff -r 458b708e6d17 -r 1480076a7bdd source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Wed Dec 13 11:05:10 2017 +0530
+++ b/source/common/x86/ipfilter8.asm	Thu Dec 14 10:52:59 2017 +0530
@@ -10947,7 +10947,7 @@
 %ifidn %1, pp
     vbroadcasti32x8       m7,                 [pw_512]
 %else
-    add                   r3d,                r3d
+    shl                   r3d,                1
     vbroadcasti32x8       m7,                 [pw_2000]
     mova                  m10,                [interp4_vps_store1_avx512]
     mova                  m11,                [interp4_vps_store2_avx512]
@@ -11073,7 +11073,7 @@
 %ifidn %1,pp
     vbroadcasti32x8       m7,                [pw_512]
 %else
-    add                   r3d,                r3d
+    shl                   r3d,                1
     vbroadcasti32x8       m7,                [pw_2000]
     mova                  m10,                [interp4_vps_store1_avx512]
     mova                  m11,                [interp4_vps_store2_avx512]
@@ -11248,7 +11248,7 @@
 %ifidn %1, pp
     vbroadcasti32x8       m7,                 [pw_512]
 %else
-    add                   r3d,                r3d
+    shl                   r3d,                1
     vbroadcasti32x8       m7,                 [pw_2000]
     mova                  m10,                [interp4_vps_store1_avx512]
     mova                  m11,                [interp4_vps_store2_avx512]
@@ -11400,7 +11400,7 @@
 %ifidn %1,pp
     vbroadcasti32x8            m12, [pw_512]
 %else
-    add                        r3d, r3d
+    shl                        r3d, 1
     vbroadcasti32x8            m12, [pw_2000]
     mova                       m13, [interp4_vps_store1_avx512]
     mova                       m14, [interp4_vps_store2_avx512]
@@ -14256,24 +14256,23 @@
 %else
     psubw                 m0,                 m7
     psubw                 m1,                 m7
+    mova                  m12,                 m16
+    mova                  m13,                 m17
+    vpermi2q              m12,                 m0,                m1
+    vpermi2q              m13,                 m0,                m1
+    movu                  [r2],               ym12
+    vextracti32x8         [r2 + 2 * r3],      m12,                 1
+
     psubw                 m2,                 m7
     psubw                 m3,                 m7
-
-    mova                  m12,                 m16
-    mova                  m13,                 m17
     mova                  m14,                 m16
     mova                  m15,                 m17
-
-    vpermi2q              m12,                 m0,                m1
-    vpermi2q              m13,                 m0,                m1
     vpermi2q              m14,                 m2,                m3
     vpermi2q              m15,                 m2,                m3
-
-    movu                  [r2],               ym12
     movu                  [r2 + r3],          ym14
-    vextracti32x8         [r2 + 2 * r3],      m12,                 1
     vextracti32x8         [r2 + r7],          m14,                 1
     lea                   r2,                 [r2 + 4 * r3]
+
     movu                  [r2],               ym13
     movu                  [r2 + r3],          ym15
     vextracti32x8         [r2 + 2 * r3],      m13,                 1
@@ -14303,7 +14302,7 @@
 %ifidn %1, pp
     vbroadcasti32x8       m7,                 [pw_512]
 %else
-    add                   r3d,                r3d
+    shl                   r3d,                1
     vbroadcasti32x8       m7,                 [pw_2000]
     mova                  m16,                [interp4_vps_store1_avx512]
     mova                  m17,                [interp4_vps_store2_avx512]
@@ -14425,22 +14424,20 @@
 %else
     psubw                 m0,                 m7
     psubw                 m1,                 m7
+    mova                  m12,                 m16
+    mova                  m13,                 m17
+    vpermi2q              m12,                 m0,                m1
+    vpermi2q              m13,                 m0,                m1
+    movu                  [r2],               m12
+    movu                  [r2 + 2 * r3],      m13
+
     psubw                 m2,                 m7
     psubw                 m3,                 m7
-
-    mova                  m12,                 m16
-    mova                  m13,                 m17
     mova                  m14,                 m16
     mova                  m15,                 m17
-
-    vpermi2q              m12,                 m0,                m1
-    vpermi2q              m13,                 m0,                m1
     vpermi2q              m14,                 m2,                m3
     vpermi2q              m15,                 m2,                m3
-
-    movu                  [r2],               m12
     movu                  [r2 + r3],          m14
-    movu                  [r2 + 2 * r3],      m13
     movu                  [r2 + r7],          m15
 %endif
 %endmacro
@@ -14467,7 +14464,7 @@
 %ifidn %1, pp
     vbroadcasti32x8       m7,                 [pw_512]
 %else
-    add                   r3d,                r3d
+    shl                   r3d,                1
     vbroadcasti32x8       m7,                 [pw_2000]
     mova                  m16,                [interp4_vps_store1_avx512]
     mova                  m17,                [interp4_vps_store2_avx512]
@@ -14596,22 +14593,20 @@
 %else
     psubw                 m0,                 m7
     psubw                 m1,                 m7
+    mova                  m12,                 m16
+    mova                  m13,                 m17
+    vpermi2q              m12,                 m0,                m1
+    vpermi2q              m13,                 m0,                m1
+    movu                  [r9],               m12
+    movu                  [r9 + 2 * r3],      m13
+
     psubw                 m2,                 m7
     psubw                 m3,                 m7
-
-    mova                  m12,                 m16
-    mova                  m13,                 m17
     mova                  m14,                 m16
     mova                  m15,                 m17
-
-    vpermi2q              m12,                 m0,                m1
-    vpermi2q              m13,                 m0,                m1
     vpermi2q              m14,                 m2,                m3
     vpermi2q              m15,                 m2,                m3
-
-    movu                  [r9],               m12
     movu                  [r9 + r3],          m14
-    movu                  [r9 + 2 * r3],      m13
     movu                  [r9 + r7],          m15
 %endif
     movu                  xm1,                [r0 + mmsize/2]
@@ -14726,24 +14721,23 @@
 %else
     psubw                 m0,                 m7
     psubw                 m1,                 m7
+    mova                  m12,                 m16
+    mova                  m13,                 m17
+    vpermi2q              m12,                 m0,                m1
+    vpermi2q              m13,                 m0,                m1
+    movu                  [r2 + mmsize],               ym12
+    vextracti32x8         [r2 + 2 * r3 + mmsize],      m12,                 1
+
     psubw                 m2,                 m7
     psubw                 m3,                 m7
-
-    mova                  m12,                 m16
-    mova                  m13,                 m17
     mova                  m14,                 m16
     mova                  m15,                 m17
-
-    vpermi2q              m12,                 m0,                m1
-    vpermi2q              m13,                 m0,                m1
     vpermi2q              m14,                 m2,                m3
     vpermi2q              m15,                 m2,                m3
-
-    movu                  [r2 + mmsize],               ym12
     movu                  [r2 + r3 + mmsize],          ym14
-    vextracti32x8         [r2 + 2 * r3 + mmsize],      m12,                 1
     vextracti32x8         [r2 + r7 + mmsize],          m14,                 1
     lea                   r2,                          [r2 + 4 * r3]
+
     movu                  [r2 + mmsize],               ym13
     movu                  [r2 + r3 + mmsize],          ym15
     vextracti32x8         [r2 + 2 * r3 + mmsize],      m13,                 1
@@ -14774,7 +14768,7 @@
 %ifidn %1, pp
     vbroadcasti32x8       m7,                 [pw_512]
 %else
-    add                   r3d,                r3d
+    shl                   r3d,                1
     vbroadcasti32x8       m7,                 [pw_2000]
     mova                  m16,                [interp4_vps_store1_avx512]
     mova                  m17,                [interp4_vps_store2_avx512]
@@ -14877,21 +14871,19 @@
 %else
     psubw                 m0,                 m7
     psubw                 m1,                 m7
+    mova                  m12,                 m16
+    mova                  m13,                 m17
+    vpermi2q              m12,                 m0,                m1
+    vpermi2q              m13,                 m0,                m1
+    movu                  [r2],               m12
+    movu                  [r2 + mmsize],      m13
+
     psubw                 m2,                 m7
     psubw                 m3,                 m7
-
-    mova                  m12,                 m16
-    mova                  m13,                 m17
     mova                  m14,                 m16
     mova                  m15,                 m17
-
-    vpermi2q              m12,                 m0,                m1
-    vpermi2q              m13,                 m0,                m1
     vpermi2q              m14,                 m2,                m3
     vpermi2q              m15,                 m2,                m3
-
-    movu                  [r2],               m12
-    movu                  [r2 + mmsize],      m13
     movu                  [r2 + r3],          m14
     movu                  [r2 + r3 + mmsize], m15
 %endif
@@ -14919,7 +14911,7 @@
 %ifidn %1, pp
     vbroadcasti32x8       m7,                 [pw_512]
 %else
-    add                   r3d,                r3d
+    shl                   r3d,                1
     vbroadcasti32x8       m7,                 [pw_2000]
     mova                  m16,                [interp4_vps_store1_avx512]
     mova                  m17,                [interp4_vps_store2_avx512]


More information about the x265-devel mailing list