[x265] [PATCH 216 of 307] x86: AVX512 idct16 kernel - optimize to use align load
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:33:34 CEST 2018
# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1512016234 28800
# Wed Nov 29 20:30:34 2017 -0800
# Node ID 63bedd49719fe9094ffdcbb88ac8512dccc120d2
# Parent 2a79f5eb0a9897f8703dafadfa034ba68b5955a9
x86: AVX512 idct16 kernel - optimize to use align load
diff -r 2a79f5eb0a98 -r 63bedd49719f source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Wed Nov 29 19:43:31 2017 -0800
+++ b/source/common/x86/dct8.asm Wed Nov 29 20:30:34 2017 -0800
@@ -4761,7 +4761,7 @@
paddd m10, m17
vmovdqu32 m9 {k2}, m10
- movu m5, [tab_AVX512_idct16_1 + %1 * 64]
+ mova m5, [tab_AVX512_idct16_1 + %1 * 64]
pmaddwd m10, m1, m5
pmaddwd m11, m3, m5
@@ -4794,7 +4794,7 @@
paddd m9, m14
psrad m9, IDCT_SHIFT1
- movu m5, [tab_AVX512_idct16_2 + %1 * 64 + 64]
+ mova m5, [tab_AVX512_idct16_2 + %1 * 64 + 64]
pmaddwd m10, m0, m5
pmaddwd m12, m7, m5
@@ -4824,7 +4824,7 @@
- movu m5, [tab_AVX512_idct16_1 + %1 * 64 + 64]
+ mova m5, [tab_AVX512_idct16_1 + %1 * 64 + 64]
pmaddwd m12, m1, m5
pmaddwd m13, m3, m5
@@ -4864,8 +4864,8 @@
packssdw m11, m5
packssdw m9, m10
- movu m10, [idct16_AVX512_shuff]
- movu m5, [idct16_AVX512_shuff1]
+ mova m10, [idct16_AVX512_shuff]
+ mova m5, [idct16_AVX512_shuff1]
vpermd m%2, m10, m11
vpermd m%3, m5, m9
@@ -5232,20 +5232,20 @@
IDCT16_AVX512_PASS1 0, 22, 23
IDCT16_AVX512_PASS1 2, 24, 25
- movu m26, [idct16_AVX512_shuff2]
- movu m27, [idct16_AVX512_shuff3]
+ mova m26, [idct16_AVX512_shuff2]
+ mova m27, [idct16_AVX512_shuff3]
vpermi2q m26, m18, m22
vpermi2q m27, m18, m22
- movu m18, [idct16_AVX512_shuff2]
- movu m22, [idct16_AVX512_shuff3]
+ mova m18, [idct16_AVX512_shuff2]
+ mova m22, [idct16_AVX512_shuff3]
vpermi2q m18, m20, m24
vpermi2q m22, m20, m24
- movu m20, [idct16_AVX512_shuff4]
- movu m24, [idct16_AVX512_shuff5]
+ mova m20, [idct16_AVX512_shuff4]
+ mova m24, [idct16_AVX512_shuff5]
vpermi2q m20, m21, m25
vpermi2q m24, m21, m25
- movu m21, [idct16_AVX512_shuff4]
- movu m25, [idct16_AVX512_shuff5]
+ mova m21, [idct16_AVX512_shuff4]
+ mova m25, [idct16_AVX512_shuff5]
vpermi2q m21, m19, m23
vpermi2q m25, m19, m23
More information about the x265-devel mailing list