[x265] [PATCH 187 of 307] [x265-avx512]x86: optimize idct32 by optimizing shift operations

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:33:05 CEST 2018


# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1511501028 -19800
#      Fri Nov 24 10:53:48 2017 +0530
# Node ID 699c19611415b93c5227950409f68b40046efffa
# Parent  664d45353792c5014a714a5ddc8d618b01391deb
[x265-avx512]x86: optimize idct32 by optimizing shift operations.

AVX512 Performance              : 6.97x
AVX512 Performance(optimized)   : 7.36x

diff -r 664d45353792 -r 699c19611415 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Mon Nov 20 10:34:37 2017 +0530
+++ b/source/common/x86/dct8.asm	Fri Nov 24 10:53:48 2017 +0530
@@ -4719,26 +4719,22 @@
     pmaddwd         m9,  m8, m%4
     pmaddwd         m10, m7, m%5
 
+    paddd            m9,  m10
+    vpsrldq          m0,   m9, 8
+    paddd            m9,   m0
     vpsrldq          m0,   m9, 4
     paddd            m9,   m0
-    vpslldq          m5,  m10, 4
-    paddd           m10,   m5
-    vmovdqu32        m9   {k1}, m10
 
     pmaddwd         m10, m4, m%4
     pmaddwd         m11, m1, m%5
 
-    vpsrldq          m0,   m10, 4
+    paddd           m10,   m11
+    vpsrldq          m0,   m10, 8
+    paddd           m10,   m0
+    vpslldq          m0,   m10, 4
     paddd           m10,    m0
-    vpslldq          m5,   m11, 4
-    paddd           m11,    m5
-    vmovdqu32       m10    {k1}, m11
-
-    vpsrldq          m0,   m9, 8
-    paddd            m9,   m0
-    vpslldq          m5,   m10, 8
-    paddd           m10,   m5
-    vmovdqu32        m9   {k2}, m10
+
+    vmovdqu32       m9 {k3}, m10
 
     movu            m6,  [tab_idct32_AVX512_5 + %1 * 64]
     movu            m5,  [tab_idct32_AVX512_5 + %1 * 64 + 64]
@@ -4746,34 +4742,23 @@
     pmaddwd         m10, m8, m6
     pmaddwd         m11, m7, m5
 
+    paddd           m10,  m11
+    vpslldq         m0,   m10, 8
+    paddd           m10,   m0
     vpsrldq          m0,  m10, 4
     paddd           m10,   m0
-    vpslldq          m5,  m11, 4
-    paddd           m11,   m5
-    vmovdqu32       m10   {k1}, m11
 
     pmaddwd         m11, m4, m6
-    pmaddwd         m12, m1, [tab_idct32_AVX512_5 + %1 * 64 + 64]
-
-    vpsrldq          m0,   m11, 4
+    pmaddwd         m12, m1, m5
+
+    paddd           m11,   m12
+    vpslldq          m0,   m11, 8
     paddd           m11,    m0
-    vpslldq          m5,   m12, 4
-    paddd           m12,    m5
-    vmovdqu32       m11    {k1}, m12
-
-    vpsrldq          m0,   m10, 8
-    paddd           m10,    m0
-    vpslldq          m5,   m11, 8
-    paddd           m11,    m5
-    vmovdqu32       m10    {k2}, m11
-
-    pshufd           m0,    m9,  q2301
-    pshufd           m5,   m10,  q2301
-    paddd            m9,    m0
-    paddd           m10,    m5
-    punpckhdq        m0,    m9,  m10
-    punpckldq        m5,    m9,  m10
-    punpckhdq        m9,   m5,   m0
+    vpslldq          m0,   m11, 4
+    paddd           m11,    m0
+
+    vmovdqu32        m10  {k4},  m11
+    vmovdqu32        m9  {k2}, m10
 
     pmaddwd         m10, m3, m%2
     pmaddwd         m11, m14, m%2
@@ -5095,6 +5080,10 @@
     kmovd            k1, r7d
     mov             r7d, 0xCCCC
     kmovd            k2, r7d
+    mov             r7d, 0x2222
+    kmovd            k3, r7d
+    mov             r7d, 0x8888
+    kmovd            k4, r7d
 
 
     movu            m16, [tab_idct32_AVX512_2 + 0 * 64]


More information about the x265-devel mailing list