[x265] [PATCH 216 of 307] x86: AVX512 idct16 kernel - optimize to use aligned loads

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:33:34 CEST 2018


# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1512016234 28800
#      Wed Nov 29 20:30:34 2017 -0800
# Node ID 63bedd49719fe9094ffdcbb88ac8512dccc120d2
# Parent  2a79f5eb0a9897f8703dafadfa034ba68b5955a9
x86: AVX512 idct16 kernel - optimize to use align load

diff -r 2a79f5eb0a98 -r 63bedd49719f source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Wed Nov 29 19:43:31 2017 -0800
+++ b/source/common/x86/dct8.asm	Wed Nov 29 20:30:34 2017 -0800
@@ -4761,7 +4761,7 @@
     paddd            m10,  m17
     vmovdqu32        m9   {k2}, m10
 
-    movu            m5,  [tab_AVX512_idct16_1 + %1 * 64]
+    mova            m5,  [tab_AVX512_idct16_1 + %1 * 64]
     pmaddwd         m10, m1, m5
     pmaddwd         m11, m3, m5
 
@@ -4794,7 +4794,7 @@
     paddd           m9, m14
     psrad           m9, IDCT_SHIFT1
 
-    movu            m5,  [tab_AVX512_idct16_2 + %1 * 64 + 64]
+    mova            m5,  [tab_AVX512_idct16_2 + %1 * 64 + 64]
     pmaddwd         m10, m0, m5
     pmaddwd         m12, m7, m5
 
@@ -4824,7 +4824,7 @@
 
 
 
-    movu            m5,  [tab_AVX512_idct16_1 + %1 * 64 + 64] 
+    mova            m5,  [tab_AVX512_idct16_1 + %1 * 64 + 64] 
     pmaddwd         m12, m1, m5
     pmaddwd         m13, m3, m5
 
@@ -4864,8 +4864,8 @@
     packssdw        m11, m5
     packssdw        m9, m10
 
-    movu            m10, [idct16_AVX512_shuff]
-    movu            m5,  [idct16_AVX512_shuff1]
+    mova            m10, [idct16_AVX512_shuff]
+    mova            m5,  [idct16_AVX512_shuff1]
 
     vpermd          m%2, m10, m11
     vpermd          m%3, m5, m9
@@ -5232,20 +5232,20 @@
     IDCT16_AVX512_PASS1      0, 22, 23
     IDCT16_AVX512_PASS1      2, 24, 25
 
-    movu       m26,    [idct16_AVX512_shuff2]
-    movu       m27,    [idct16_AVX512_shuff3]
+    mova       m26,    [idct16_AVX512_shuff2]
+    mova       m27,    [idct16_AVX512_shuff3]
     vpermi2q   m26,    m18, m22
     vpermi2q   m27,    m18, m22
-    movu       m18,    [idct16_AVX512_shuff2]
-    movu       m22,    [idct16_AVX512_shuff3]
+    mova       m18,    [idct16_AVX512_shuff2]
+    mova       m22,    [idct16_AVX512_shuff3]
     vpermi2q   m18,    m20, m24
     vpermi2q   m22,    m20, m24
-    movu       m20,    [idct16_AVX512_shuff4]
-    movu       m24,    [idct16_AVX512_shuff5]
+    mova       m20,    [idct16_AVX512_shuff4]
+    mova       m24,    [idct16_AVX512_shuff5]
     vpermi2q   m20,    m21, m25
     vpermi2q   m24,    m21, m25
-    movu       m21,    [idct16_AVX512_shuff4]
-    movu       m25,    [idct16_AVX512_shuff5]
+    mova       m21,    [idct16_AVX512_shuff4]
+    mova       m25,    [idct16_AVX512_shuff5]
     vpermi2q   m21,    m19, m23
     vpermi2q   m25,    m19, m23
 


More information about the x265-devel mailing list