[x265] [PATCH 162 of 307] [x265-avx512]x86: optimize idct8x8 by eliminating a few shuffles

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:32:40 CEST 2018


# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1510726736 -19800
#      Wed Nov 15 11:48:56 2017 +0530
# Node ID b60cd251df9429611a8651748d7e266075a33016
# Parent  f4cd489d06cfbbf66c6f0f7dc684606c80615c5e
[x265-avx512]x86: optimize idct8x8 by eliminating a few shuffles.

Around a 12% IPC gain over the earlier implementation.

diff -r f4cd489d06cf -r b60cd251df94 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Thu Nov 02 12:18:41 2017 +0530
+++ b/source/common/x86/dct8.asm	Wed Nov 15 11:48:56 2017 +0530
@@ -193,7 +193,6 @@
 
 idct8_shuf3:    times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
 
-const idct8_avx512_shuf2,    times 4 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
 
 idct8_avx512_shuf3:    times 4 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
 
@@ -3210,6 +3209,8 @@
 
 
 %macro IDCT8_AVX512_PASS_2 0
+    mov             r7d, 0xAAAA
+    kmovd           k1, r7d 
     punpcklqdq      m2,                m3, m13
     punpckhqdq      m0,                m3, m13
 
@@ -3218,25 +3219,18 @@
     pmaddwd         m6,                m2, [r5 + 2 * mmsize]
     pmaddwd         m7,                m2, [r5 + 3 * mmsize]
 
-    pshufd           m14,     m3,     q2301
-    pshufd           m16,     m5,     q2301
-    paddd             m3,    m14
-    paddd             m5,    m16
-    punpckhdq        m14,     m3,      m5
-    punpckldq        m16,     m3,      m5
-    punpckhdq         m3,    m16,     m14
-
-    pshufd           m14,     m6,     q2301
-    pshufd           m16,     m7,     q2301
-    paddd             m6,    m14
-    paddd             m7,    m16
-    punpckhdq        m14,     m6,      m7
-    punpckldq        m16,     m6,      m7
-    punpckhdq         m6,    m16,     m14
-
-
-    pshufb          m3,                [idct8_avx512_shuf2]
-    pshufb          m6,                [idct8_avx512_shuf2]
+    vpsrldq         m14,   m3, 4
+    paddd            m3,  m14
+    vpslldq         m16,   m5, 4
+    paddd            m5,  m16
+    vmovdqu32        m3   {k1}, m5
+
+    vpsrldq         m14,   m6, 4
+    paddd            m6,  m14
+    vpslldq         m16,   m7, 4
+    paddd            m7,  m16
+    vmovdqu32        m6   {k1}, m7
+
     punpcklqdq      m7,                m3, m6
     punpckhqdq      m3,                m6
 
@@ -3245,24 +3239,18 @@
     pmaddwd         m8,                m0, [r6 + 2 * mmsize]
     pmaddwd         m9,                m0, [r6 + 3 * mmsize]
 
-    pshufd          m14,     m5,    q2301
-    pshufd          m16,     m6,    q2301
-    paddd            m5,    m14
-    paddd            m6,    m16
-    punpckhdq       m14,     m5,    m6
-    punpckldq       m16,     m5,    m6
-    punpckhdq        m5,    m16,   m14
-
-    pshufd          m14,     m8,    q2301
-    pshufd          m16,     m9,    q2301
-    paddd            m8,    m14
-    paddd            m9,    m16
-    punpckhdq       m14,     m8,    m9
-    punpckldq       m16,     m8,    m9
-    punpckhdq        m8,    m16,   m14
-
-    pshufb          m5,                [idct8_avx512_shuf2]
-    pshufb          m8,                [idct8_avx512_shuf2]
+    vpsrldq         m14,   m5, 4
+    paddd            m5,  m14
+    vpslldq         m16,   m6, 4
+    paddd            m6,  m16
+    vmovdqu32        m5   {k1}, m6
+
+    vpsrldq         m14,   m8, 4
+    paddd            m8,  m14
+    vpslldq         m16,   m9, 4
+    paddd            m9,  m16
+    vmovdqu32        m8   {k1}, m9
+
     punpcklqdq      m6,                m5, m8
     punpckhqdq      m5,                m8
 
@@ -3292,7 +3280,7 @@
 
 %if ARCH_X86_64
 INIT_ZMM avx512
-cglobal idct8, 3, 7, 25
+cglobal idct8, 3, 8, 25
 %if BIT_DEPTH == 12
     %define         IDCT_SHIFT2        8
     vpbroadcastd    m12,                [pd_128]


More information about the x265-devel mailing list