[x265] [PATCH 266 of 307] [x265-avx512]x86: AVX512 optimize idct8x8
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:24 CEST 2018
# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1514282724 -19800
# Tue Dec 26 15:35:24 2017 +0530
# Node ID e883724b1af9f60e9d91be3aa6fe7b949e782684
# Parent 3d780e0d48827cd1cc4e664c3bf96dce6f515810
[x265-avx512]x86: AVX512 optimize idct8x8
AVX2 Performance : 8.28x
AVX512 Performance (old) : 9.17x
AVX512 Performance (opt) : 9.46x
Overall gain of 12.47% over AVX2
diff -r 3d780e0d4882 -r e883724b1af9 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Tue Dec 12 18:45:12 2017 +0530
+++ b/source/common/x86/dct8.asm Tue Dec 26 15:35:24 2017 +0530
@@ -4292,12 +4292,12 @@
%macro IDCT8_AVX512_PASS_1 0
- pmaddwd m5, m4, m17
- pmaddwd m6, m0, m18
+ pmaddwd m5, m29, m17
+ pmaddwd m6, m25, m18
paddd m5, m6
- pmaddwd m6, m1, m21
- pmaddwd m3, m2, m22
+ pmaddwd m6, m30, m21
+ pmaddwd m3, m26, m22
paddd m6, m3
paddd m3, m5, m6
@@ -4308,12 +4308,12 @@
paddd m5, m11
psrad m5, IDCT_SHIFT1
- pmaddwd m6, m4, m19
- pmaddwd m8, m0, m20
+ pmaddwd m6, m29, m19
+ pmaddwd m8, m25, m20
paddd m6, m8
- pmaddwd m8, m1, m23
- pmaddwd m9, m2, m24
+ pmaddwd m8, m30, m23
+ pmaddwd m9, m26, m24
paddd m8, m9
paddd m9, m6, m8
@@ -4334,7 +4334,7 @@
%macro IDCT8_AVX512_PASS_2 0
mov r7d, 0xAAAA
- kmovd k1, r7d
+ kmovd k1, r7d
punpcklqdq m2, m3, m13
punpckhqdq m0, m3, m13
@@ -4404,7 +4404,7 @@
%if ARCH_X86_64
INIT_ZMM avx512
-cglobal idct8, 3, 8, 25
+cglobal idct8, 3, 8, 31
%if BIT_DEPTH == 12
%define IDCT_SHIFT2 8
vpbroadcastd m12, [pd_128]
@@ -4424,34 +4424,33 @@
lea r4, [avx512_idct8_3]
lea r5, [avx2_idct8_1]
lea r6, [avx2_idct8_2]
+ movu m16, [idct16_shuff2]
+ movu m17, [idct16_shuff3]
;pass1
- mova ym1, [r0 + 0 * 32] ; [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1]
- mova ym0, [r0 + 1 * 32] ; [2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3]
- vpunpcklwd ym5, ym1, ym0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
- vpunpckhwd ym1, ym0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
- vinserti128 ym4, ym5, xm1, 1 ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
- vextracti128 xm2, ym5, 1 ; [1 3 1 3 1 3 1 3]
- vinserti128 ym1, ym1, xm2, 0 ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]
-
- mova ym2, [r0 + 2 * 32] ; [4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5]
- mova ym0, [r0 + 3 * 32] ; [6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7]
- vpunpcklwd ym5, ym2, ym0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
- vpunpckhwd ym2, ym0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
- vinserti128 ym0, ym5, xm2, 1 ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
- vextracti128 xm5, ym5, 1 ; [5 7 5 7 5 7 5 7]
- vinserti128 ym2, ym2, xm5, 0 ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]
-
- mova ym5, [idct8_shuf1]
- vpermd ym4, ym5, ym4
- vpermd ym0, ym5, ym0
- vpermd ym1, ym5, ym1
- vpermd ym2, ym5, ym2
-
- vinserti64x4 m4, m4, ym4, 1
- vinserti64x4 m0, m0, ym0, 1
- vinserti64x4 m1, m1, ym1, 1
- vinserti64x4 m2, m2, ym2, 1
+ mova ym1, [r0 + 0 * 32]
+ mova ym0, [r0 + 1 * 32]
+ mova ym25, ym16
+ mova ym26, ym17
+ vpermi2w ym25, ym1, ym0
+ vpermi2w ym26, ym1, ym0
+
+ mova ym1, [r0 + 2 * 32]
+ mova ym0, [r0 + 3 * 32]
+ mova ym27, ym16
+ mova ym28, ym17
+ vpermi2w ym27, ym1, ym0
+ vpermi2w ym28, ym1, ym0
+
+ vperm2i128 ym29, ym25, ym26, 0x20
+ vperm2i128 ym30, ym25, ym26, 0x31
+ vperm2i128 ym25, ym27, ym28, 0x20
+ vperm2i128 ym26, ym27, ym28, 0x31
+
+ vinserti64x4 m29, m29, ym29, 1
+ vinserti64x4 m25, m25, ym25, 1
+ vinserti64x4 m30, m30, ym30, 1
+ vinserti64x4 m26, m26, ym26, 1
movu m17, [r4]
movu m18, [r4 + 1 * mmsize]
More information about the x265-devel mailing list