[x265] [PATCH 040 of 307] x86: AVX512 sub_ps_64x64 for high bit depth
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:38 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500887152 -19800
# Mon Jul 24 14:35:52 2017 +0530
# Node ID 9a4caf163d0fbdbc51c9f681ed898a39a5602bcf
# Parent be860e68659a37dae543956a65a4eb167f8b5504
x86: AVX512 sub_ps_64x64 for high bit depth
AVX2 performance : 21.24x
AVX512 performance : 36.95x
diff -r be860e68659a -r 9a4caf163d0f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Jul 21 12:48:22 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Jul 24 14:35:52 2017 +0530
@@ -2191,6 +2191,7 @@
if (cpuMask & X265_CPU_AVX512)
{
p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512);
+ p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx512);
}
}
#else // if HIGH_BIT_DEPTH
diff -r be860e68659a -r 9a4caf163d0f source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Fri Jul 21 12:48:22 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Mon Jul 24 14:35:52 2017 +0530
@@ -6023,9 +6023,6 @@
RET
%endif
-;-----------------------------------------------------------------------------
-; void pixel_sub_ps_64x64(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
-;-----------------------------------------------------------------------------
%macro PROCESS_SUB_PS_64x8_AVX512 0
pmovzxbw m0, [r2]
pmovzxbw m1, [r2 + 32]
@@ -6112,7 +6109,129 @@
movu [r0 + 2 * r1 + 64], m5
%endmacro
-%if HIGH_BIT_DEPTH==0
+%macro PROCESS_SUB_PS_64x8_HBD_AVX512 0 ; 8 rows x 64 words: [r0] = [r2] - [r3]; expects r6/r7/r8 = 3x r1/r4/r5; uses m0-m7
+ movu m0, [r2] ; row 0, first 64 bytes read via r2
+ movu m1, [r2 + 64] ; row 0, second 64 bytes read via r2
+ movu m4, [r3] ; row 0, first 64 bytes read via r3
+ movu m5, [r3 + 64] ; row 0, second 64 bytes read via r3
+ psubw m0, m4 ; word-wise difference (wraps, no saturation)
+ psubw m1, m5
+ movu m2, [r2 + r4] ; row 1 (one r4 step from r2)
+ movu m3, [r2 + r4 + 64]
+ movu m6, [r3 + r5] ; row 1 (one r5 step from r3)
+ movu m7, [r3 + r5 + 64]
+ psubw m2, m6
+ psubw m3, m7
+
+ movu [r0], m0 ; store rows 0-1 of the result
+ movu [r0 + 64], m1
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 64], m3
+
+ movu m0, [r2 + r4 * 2] ; row 2
+ movu m1, [r2 + r4 * 2 + 64]
+ movu m4, [r3 + r5 * 2]
+ movu m5, [r3 + r5 * 2 + 64]
+ psubw m0, m4
+ psubw m1, m5
+ movu m2, [r2 + r7] ; row 3: r7 is used as the 3-row offset for r2
+ movu m3, [r2 + r7 + 64]
+ movu m6, [r3 + r8] ; row 3: r8 is used as the 3-row offset for r3
+ movu m7, [r3 + r8 + 64]
+ psubw m2, m6
+ psubw m3, m7
+
+ movu [r0 + r1 * 2], m0 ; store rows 2-3 of the result
+ movu [r0 + r1 * 2 + 64], m1
+ movu [r0 + r6], m2 ; r6 is used as the 3-row offset for r0
+ movu [r0 + r6 + 64], m3
+
+ lea r0, [r0 + r1 * 4] ; step all three pointers down 4 rows for the second half
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+
+ movu m0, [r2] ; row 4 (same pattern as rows 0-3 from here on)
+ movu m1, [r2 + 64]
+ movu m4, [r3]
+ movu m5, [r3 + 64]
+ psubw m0, m4
+ psubw m1, m5
+ movu m2, [r2 + r4] ; row 5
+ movu m3, [r2 + r4 + 64]
+ movu m6, [r3 + r5]
+ movu m7, [r3 + r5 + 64]
+ psubw m2, m6
+ psubw m3, m7
+
+ movu [r0], m0 ; store rows 4-5 of the result
+ movu [r0 + 64], m1
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 64], m3
+
+ movu m0, [r2 + r4 * 2] ; row 6
+ movu m1, [r2 + r4 * 2 + 64]
+ movu m4, [r3 + r5 * 2]
+ movu m5, [r3 + r5 * 2 + 64]
+ psubw m0, m4
+ psubw m1, m5
+ movu m2, [r2 + r7] ; row 7
+ movu m3, [r2 + r7 + 64]
+ movu m6, [r3 + r8]
+ movu m7, [r3 + r8 + 64]
+ psubw m2, m6
+ psubw m3, m7
+
+ movu [r0 + r1 * 2], m0 ; store rows 6-7 of the result
+ movu [r0 + r1 * 2 + 64], m1
+ movu [r0 + r6], m2
+ movu [r0 + r6 + 64], m3
+%endmacro ; on exit r0/r2/r3 point at row 4 of this band; the user must add 4 more rows before the next band
+;-----------------------------------------------------------------------------
+; void pixel_sub_ps_64x64(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
+;-----------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH ; high-bit-depth variant; the 8-bit variant is in the %else branch
+%if ARCH_X86_64 ; the function requests 9 GPRs (r0-r8)
+INIT_ZMM avx512 ; 64-byte vectors, AVX-512 function suffix
+cglobal pixel_sub_ps_64x64, 6, 9, 8 ; 6 args, 9 GPRs, 8 vector regs (m0-m7)
+ add r1d, r1d ; double r1 (presumably element stride -> byte stride for 2-byte samples); NOTE(review): 32-bit add zero-extends, so this assumes a non-negative stride below 2^31
+ add r4d, r4d ; same doubling for r4
+ add r5d, r5d ; same doubling for r5
+ lea r6, [r1 * 3] ; r6 = 3 * r1, row-3 offset for the r0 stores
+ lea r7, [r4 * 3] ; r7 = 3 * r4, row-3 offset for the r2 loads
+ lea r8, [r5 * 3] ; r8 = 3 * r5, row-3 offset for the r3 loads
+
+ PROCESS_SUB_PS_64x8_HBD_AVX512
+ lea r0, [r0 + r1 * 4] ; macro stops at row 4 of its 8-row band; step the last 4 rows to reach the next band
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ PROCESS_SUB_PS_64x8_HBD_AVX512
+ lea r0, [r0 + r1 * 4] ; advance to band 3 (rows 16-23)
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ PROCESS_SUB_PS_64x8_HBD_AVX512
+ lea r0, [r0 + r1 * 4] ; advance to band 4 (rows 24-31)
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ PROCESS_SUB_PS_64x8_HBD_AVX512
+ lea r0, [r0 + r1 * 4] ; advance to band 5 (rows 32-39)
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ PROCESS_SUB_PS_64x8_HBD_AVX512
+ lea r0, [r0 + r1 * 4] ; advance to band 6 (rows 40-47)
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ PROCESS_SUB_PS_64x8_HBD_AVX512
+ lea r0, [r0 + r1 * 4] ; advance to band 7 (rows 48-55)
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ PROCESS_SUB_PS_64x8_HBD_AVX512
+ lea r0, [r0 + r1 * 4] ; advance to band 8 (rows 56-63)
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ PROCESS_SUB_PS_64x8_HBD_AVX512
+ RET ; 8 bands x 8 rows = 64 rows done; no pointer advance needed after the last band
+%endif
+%else
%if ARCH_X86_64
INIT_ZMM avx512
cglobal pixel_sub_ps_64x64, 6, 7, 8
More information about the x265-devel
mailing list