[x265] [PATCH 217 of 307] x86: AVX512 idct32 optimize load operations
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:33:35 CEST 2018
# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1512016927 28800
# Wed Nov 29 20:42:07 2017 -0800
# Node ID aff686238f2a30ab42b0e2ad296be54e77179531
# Parent 63bedd49719fe9094ffdcbb88ac8512dccc120d2
x86: AVX512 idct32 optimize load operations
diff -r 63bedd49719f -r aff686238f2a source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Wed Nov 29 20:30:34 2017 -0800
+++ b/source/common/x86/dct8.asm Wed Nov 29 20:42:07 2017 -0800
@@ -5713,8 +5713,8 @@
vmovdqu32 m9 {k3}, m10
- movu m6, [tab_idct32_AVX512_5 + %1 * 64]
- movu m5, [tab_idct32_AVX512_5 + %1 * 64 + 64]
+ mova m6, [tab_idct32_AVX512_5 + %1 * 64]
+ mova m5, [tab_idct32_AVX512_5 + %1 * 64 + 64]
pmaddwd m10, m8, m6
pmaddwd m11, m7, m5
@@ -5846,8 +5846,8 @@
paddd m4, m25
vmovdqu32 m3 {k2}, m4
- movu m24, [idct16_AVX512_shuff3]
- movu m25, [idct16_AVX512_shuff2]
+ mova m24, [idct16_AVX512_shuff3]
+ mova m25, [idct16_AVX512_shuff2]
vpermi2q m24, m2, m3
vpermi2q m25, m2, m3
paddd m2, m25, m24
@@ -5900,8 +5900,8 @@
paddd m5, m25
vmovdqu32 m4 {k2}, m5
- movu m24, [idct16_AVX512_shuff3]
- movu m25, [idct16_AVX512_shuff2]
+ mova m24, [idct16_AVX512_shuff3]
+ mova m25, [idct16_AVX512_shuff2]
vpermi2q m24, m3, m4
vpermi2q m25, m3, m4
paddd m3, m25, m24
@@ -5955,8 +5955,8 @@
paddd m6, m25
vmovdqu32 m5 {k2}, m6
- movu m24, [idct16_AVX512_shuff3]
- movu m25, [idct16_AVX512_shuff2]
+ mova m24, [idct16_AVX512_shuff3]
+ mova m25, [idct16_AVX512_shuff2]
vpermi2q m24, m4, m5
vpermi2q m25, m4, m5
paddd m4, m25, m24
@@ -6009,8 +6009,8 @@
paddd m0, m25
vmovdqu32 m6 {k2}, m0
- movu m24, [idct16_AVX512_shuff3]
- movu m25, [idct16_AVX512_shuff2]
+ mova m24, [idct16_AVX512_shuff3]
+ mova m25, [idct16_AVX512_shuff2]
vpermi2q m24, m5, m6
vpermi2q m25, m5, m6
paddd m5, m25, m24
@@ -6063,24 +6063,24 @@
kmovd k4, r7d
- movu m16, [tab_idct32_AVX512_2 + 0 * 64]
- movu m17, [tab_idct32_AVX512_2 + 1 * 64]
- movu m18, [tab_idct32_AVX512_2 + 2 * 64]
- movu m19, [tab_idct32_AVX512_2 + 3 * 64]
-
- movu m20, [tab_idct32_AVX512_3 + 0 * 64]
- movu m21, [tab_idct32_AVX512_3 + 1 * 64]
- movu m22, [tab_idct32_AVX512_3 + 2 * 64]
- movu m23, [tab_idct32_AVX512_3 + 3 * 64]
-
- movu m24, [tab_idct32_AVX512_1 + 0 * 64]
- movu m25, [tab_idct32_AVX512_1 + 1 * 64]
- movu m26, [tab_idct32_AVX512_1 + 2 * 64]
- movu m27, [tab_idct32_AVX512_1 + 3 * 64]
- movu m28, [tab_idct32_AVX512_1 + 4 * 64]
- movu m29, [tab_idct32_AVX512_1 + 5 * 64]
- movu m30, [tab_idct32_AVX512_1 + 6 * 64]
- movu m31, [tab_idct32_AVX512_1 + 7 * 64]
+ mova m16, [tab_idct32_AVX512_2 + 0 * 64]
+ mova m17, [tab_idct32_AVX512_2 + 1 * 64]
+ mova m18, [tab_idct32_AVX512_2 + 2 * 64]
+ mova m19, [tab_idct32_AVX512_2 + 3 * 64]
+
+ mova m20, [tab_idct32_AVX512_3 + 0 * 64]
+ mova m21, [tab_idct32_AVX512_3 + 1 * 64]
+ mova m22, [tab_idct32_AVX512_3 + 2 * 64]
+ mova m23, [tab_idct32_AVX512_3 + 3 * 64]
+
+ mova m24, [tab_idct32_AVX512_1 + 0 * 64]
+ mova m25, [tab_idct32_AVX512_1 + 1 * 64]
+ mova m26, [tab_idct32_AVX512_1 + 2 * 64]
+ mova m27, [tab_idct32_AVX512_1 + 3 * 64]
+ mova m28, [tab_idct32_AVX512_1 + 4 * 64]
+ mova m29, [tab_idct32_AVX512_1 + 5 * 64]
+ mova m30, [tab_idct32_AVX512_1 + 6 * 64]
+ mova m31, [tab_idct32_AVX512_1 + 7 * 64]
.pass1:
movq xm0, [r0 + 2 * 64]
@@ -6242,28 +6242,28 @@
mov r6d, 0xFFFF0000
kmovd k3, r6d
- movu m7, [tab_idct32_AVX512_6]
- movu m8, [tab_idct32_AVX512_6 + 1 * mmsize]
- movu m9, [tab_idct32_AVX512_6 + 2 * mmsize]
- movu m10, [tab_idct32_AVX512_6 + 3 * mmsize]
- movu m11, [tab_idct32_AVX512_6 + 4 * mmsize]
- movu m12, [tab_idct32_AVX512_6 + 5 * mmsize]
- movu m13, [tab_idct32_AVX512_6 + 6 * mmsize]
- movu m14, [tab_idct32_AVX512_6 + 7 * mmsize]
- movu m16, [tab_idct32_AVX512_6 + 8 * mmsize]
- movu m17, [tab_idct32_AVX512_6 + 9 * mmsize]
- movu m18, [tab_idct32_AVX512_6 + 10 * mmsize]
- movu m19, [tab_idct32_AVX512_6 + 11 * mmsize]
- movu m20, [tab_idct32_AVX512_6 + 12 * mmsize]
- movu m21, [tab_idct32_AVX512_6 + 13 * mmsize]
- movu m22, [tab_idct32_AVX512_6 + 14 * mmsize]
- movu m23, [tab_idct32_AVX512_6 + 15 * mmsize]
- movu m26, [tab_idct32_AVX512_4]
- movu m27, [tab_idct32_AVX512_4 + 1 * mmsize]
- movu m28, [tab_idct32_AVX512_4 + 2 * mmsize]
- movu m29, [tab_idct32_AVX512_4 + 3 * mmsize]
- movu m30, [tab_idct32_AVX512_4 + 4 * mmsize]
- movu m31, [tab_idct32_AVX512_4 + 5 * mmsize]
+ mova m7, [tab_idct32_AVX512_6]
+ mova m8, [tab_idct32_AVX512_6 + 1 * mmsize]
+ mova m9, [tab_idct32_AVX512_6 + 2 * mmsize]
+ mova m10, [tab_idct32_AVX512_6 + 3 * mmsize]
+ mova m11, [tab_idct32_AVX512_6 + 4 * mmsize]
+ mova m12, [tab_idct32_AVX512_6 + 5 * mmsize]
+ mova m13, [tab_idct32_AVX512_6 + 6 * mmsize]
+ mova m14, [tab_idct32_AVX512_6 + 7 * mmsize]
+ mova m16, [tab_idct32_AVX512_6 + 8 * mmsize]
+ mova m17, [tab_idct32_AVX512_6 + 9 * mmsize]
+ mova m18, [tab_idct32_AVX512_6 + 10 * mmsize]
+ mova m19, [tab_idct32_AVX512_6 + 11 * mmsize]
+ mova m20, [tab_idct32_AVX512_6 + 12 * mmsize]
+ mova m21, [tab_idct32_AVX512_6 + 13 * mmsize]
+ mova m22, [tab_idct32_AVX512_6 + 14 * mmsize]
+ mova m23, [tab_idct32_AVX512_6 + 15 * mmsize]
+ mova m26, [tab_idct32_AVX512_4]
+ mova m27, [tab_idct32_AVX512_4 + 1 * mmsize]
+ mova m28, [tab_idct32_AVX512_4 + 2 * mmsize]
+ mova m29, [tab_idct32_AVX512_4 + 3 * mmsize]
+ mova m30, [tab_idct32_AVX512_4 + 4 * mmsize]
+ mova m31, [tab_idct32_AVX512_4 + 5 * mmsize]
.pass2:
movu ym0, [r3]
More information about the x265-devel mailing list