[x265] [PATCH 187 of 307] [x265-avx512]x86: optimize idct32 by optimizing shift operations
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:33:05 CEST 2018
# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1511501028 -19800
# Fri Nov 24 10:53:48 2017 +0530
# Node ID 699c19611415b93c5227950409f68b40046efffa
# Parent 664d45353792c5014a714a5ddc8d618b01391deb
[x265-avx512]x86: optimize idct32 by optimizing shift operations.
AVX512 Performance : 6.97x
AVX512 Performance (optimized) : 7.36x
diff -r 664d45353792 -r 699c19611415 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Mon Nov 20 10:34:37 2017 +0530
+++ b/source/common/x86/dct8.asm Fri Nov 24 10:53:48 2017 +0530
@@ -4719,26 +4719,22 @@
pmaddwd m9, m8, m%4
pmaddwd m10, m7, m%5
+ paddd m9, m10
+ vpsrldq m0, m9, 8
+ paddd m9, m0
vpsrldq m0, m9, 4
paddd m9, m0
- vpslldq m5, m10, 4
- paddd m10, m5
- vmovdqu32 m9 {k1}, m10
pmaddwd m10, m4, m%4
pmaddwd m11, m1, m%5
- vpsrldq m0, m10, 4
+ paddd m10, m11
+ vpsrldq m0, m10, 8
+ paddd m10, m0
+ vpslldq m0, m10, 4
paddd m10, m0
- vpslldq m5, m11, 4
- paddd m11, m5
- vmovdqu32 m10 {k1}, m11
-
- vpsrldq m0, m9, 8
- paddd m9, m0
- vpslldq m5, m10, 8
- paddd m10, m5
- vmovdqu32 m9 {k2}, m10
+
+ vmovdqu32 m9 {k3}, m10
movu m6, [tab_idct32_AVX512_5 + %1 * 64]
movu m5, [tab_idct32_AVX512_5 + %1 * 64 + 64]
@@ -4746,34 +4742,23 @@
pmaddwd m10, m8, m6
pmaddwd m11, m7, m5
+ paddd m10, m11
+ vpslldq m0, m10, 8
+ paddd m10, m0
vpsrldq m0, m10, 4
paddd m10, m0
- vpslldq m5, m11, 4
- paddd m11, m5
- vmovdqu32 m10 {k1}, m11
pmaddwd m11, m4, m6
- pmaddwd m12, m1, [tab_idct32_AVX512_5 + %1 * 64 + 64]
-
- vpsrldq m0, m11, 4
+ pmaddwd m12, m1, m5
+
+ paddd m11, m12
+ vpslldq m0, m11, 8
paddd m11, m0
- vpslldq m5, m12, 4
- paddd m12, m5
- vmovdqu32 m11 {k1}, m12
-
- vpsrldq m0, m10, 8
- paddd m10, m0
- vpslldq m5, m11, 8
- paddd m11, m5
- vmovdqu32 m10 {k2}, m11
-
- pshufd m0, m9, q2301
- pshufd m5, m10, q2301
- paddd m9, m0
- paddd m10, m5
- punpckhdq m0, m9, m10
- punpckldq m5, m9, m10
- punpckhdq m9, m5, m0
+ vpslldq m0, m11, 4
+ paddd m11, m0
+
+ vmovdqu32 m10 {k4}, m11
+ vmovdqu32 m9 {k2}, m10
pmaddwd m10, m3, m%2
pmaddwd m11, m14, m%2
@@ -5095,6 +5080,10 @@
kmovd k1, r7d
mov r7d, 0xCCCC
kmovd k2, r7d
+ mov r7d, 0x2222
+ kmovd k3, r7d
+ mov r7d, 0x8888
+ kmovd k4, r7d
movu m16, [tab_idct32_AVX512_2 + 0 * 64]
More information about the x265-devel
mailing list