[x265] [PATCH 162 of 307] [x265-avx512]x86: optimize idct8x8 by eliminating few shuffles
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:32:40 CEST 2018
# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1510726736 -19800
# Wed Nov 15 11:48:56 2017 +0530
# Node ID b60cd251df9429611a8651748d7e266075a33016
# Parent f4cd489d06cfbbf66c6f0f7dc684606c80615c5e
[x265-avx512]x86: optimize idct8x8 by eliminating a few shuffles.
Around 12% IPC gain over the earlier implementation.
diff -r f4cd489d06cf -r b60cd251df94 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Thu Nov 02 12:18:41 2017 +0530
+++ b/source/common/x86/dct8.asm Wed Nov 15 11:48:56 2017 +0530
@@ -193,7 +193,6 @@
idct8_shuf3: times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
-const idct8_avx512_shuf2, times 4 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
idct8_avx512_shuf3: times 4 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
@@ -3210,6 +3209,8 @@
%macro IDCT8_AVX512_PASS_2 0
+ mov r7d, 0xAAAA
+ kmovd k1, r7d
punpcklqdq m2, m3, m13
punpckhqdq m0, m3, m13
@@ -3218,25 +3219,18 @@
pmaddwd m6, m2, [r5 + 2 * mmsize]
pmaddwd m7, m2, [r5 + 3 * mmsize]
- pshufd m14, m3, q2301
- pshufd m16, m5, q2301
- paddd m3, m14
- paddd m5, m16
- punpckhdq m14, m3, m5
- punpckldq m16, m3, m5
- punpckhdq m3, m16, m14
-
- pshufd m14, m6, q2301
- pshufd m16, m7, q2301
- paddd m6, m14
- paddd m7, m16
- punpckhdq m14, m6, m7
- punpckldq m16, m6, m7
- punpckhdq m6, m16, m14
-
-
- pshufb m3, [idct8_avx512_shuf2]
- pshufb m6, [idct8_avx512_shuf2]
+ vpsrldq m14, m3, 4
+ paddd m3, m14
+ vpslldq m16, m5, 4
+ paddd m5, m16
+ vmovdqu32 m3 {k1}, m5
+
+ vpsrldq m14, m6, 4
+ paddd m6, m14
+ vpslldq m16, m7, 4
+ paddd m7, m16
+ vmovdqu32 m6 {k1}, m7
+
punpcklqdq m7, m3, m6
punpckhqdq m3, m6
@@ -3245,24 +3239,18 @@
pmaddwd m8, m0, [r6 + 2 * mmsize]
pmaddwd m9, m0, [r6 + 3 * mmsize]
- pshufd m14, m5, q2301
- pshufd m16, m6, q2301
- paddd m5, m14
- paddd m6, m16
- punpckhdq m14, m5, m6
- punpckldq m16, m5, m6
- punpckhdq m5, m16, m14
-
- pshufd m14, m8, q2301
- pshufd m16, m9, q2301
- paddd m8, m14
- paddd m9, m16
- punpckhdq m14, m8, m9
- punpckldq m16, m8, m9
- punpckhdq m8, m16, m14
-
- pshufb m5, [idct8_avx512_shuf2]
- pshufb m8, [idct8_avx512_shuf2]
+ vpsrldq m14, m5, 4
+ paddd m5, m14
+ vpslldq m16, m6, 4
+ paddd m6, m16
+ vmovdqu32 m5 {k1}, m6
+
+ vpsrldq m14, m8, 4
+ paddd m8, m14
+ vpslldq m16, m9, 4
+ paddd m9, m16
+ vmovdqu32 m8 {k1}, m9
+
punpcklqdq m6, m5, m8
punpckhqdq m5, m8
@@ -3292,7 +3280,7 @@
%if ARCH_X86_64
INIT_ZMM avx512
-cglobal idct8, 3, 7, 25
+cglobal idct8, 3, 8, 25
%if BIT_DEPTH == 12
%define IDCT_SHIFT2 8
vpbroadcastd m12, [pd_128]
More information about the x265-devel
mailing list