[x265] [PATCH 038 of 307] x86: AVX512 getResidual32
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:36 CEST 2018
# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1500627732 -19800
# Fri Jul 21 14:32:12 2017 +0530
# Node ID 49123506b563fd44378e856e6833c77812d0349e
# Parent ef8989f43083cd5195ff3ba360959fe3900399e5
x86: AVX512 getResidual32
BIT_DEPTH = 8
AVX2 performance over C code : 2.99x
AVX512 performance over C code : 5.46x
HIGH_BIT_DEPTH
AVX2 performance over C code : 3.10x
AVX512 performance over C code : 5.60x
diff -r ef8989f43083 -r 49123506b563 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jul 04 18:02:59 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Jul 21 14:32:12 2017 +0530
@@ -3723,6 +3723,7 @@
p.integral_inith[INTEGRAL_24] = PFX(integral24h_avx2);
p.integral_inith[INTEGRAL_32] = PFX(integral32h_avx2);
+ p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512);
}
if (cpuMask & X265_CPU_AVX512)
{
@@ -3859,6 +3860,8 @@
p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_avx512);
p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
+ p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512);
+
}
#endif
}
diff -r ef8989f43083 -r 49123506b563 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Jul 04 18:02:59 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Fri Jul 21 14:32:12 2017 +0530
@@ -554,6 +554,135 @@
%endrep
RET
%endif
+
+%macro PROCESS_GETRESIDUAL32_W4_HBD_AVX512 0  ; 4 rows: [r2] = [r0] - [r1] on 16-bit lanes, then advance r0/r1/r2 by 4 rows
+ movu m0, [r0]  ; rows 0..3 read through r0 (minuend), one m-register per row
+ movu m1, [r0 + r3]  ; r3 = row pitch in bytes, shared by all three buffers in this macro
+ movu m2, [r0 + r3 * 2]
+ movu m3, [r0 + r4]  ; expects r4 = 3 * r3, set up by the getResidual32 entry point
+ lea r0, [r0 + r3 * 4]  ; r0 += 4 rows
+
+ movu m4, [r1]  ; rows 0..3 read through r1 (subtrahend)
+ movu m5, [r1 + r3]
+ movu m6, [r1 + r3 * 2]
+ movu m7, [r1 + r4]
+ lea r1, [r1 + r3 * 4]  ; r1 += 4 rows
+
+ psubw m0, m4  ; word-wise difference, row 0: m0 = m0 - m4
+ psubw m1, m5
+ psubw m2, m6
+ psubw m3, m7
+
+ movu [r2], m0  ; write the 4 difference rows through r2, same pitch as the inputs
+ movu [r2 + r3], m1
+ movu [r2 + r3 * 2], m2
+ movu [r2 + r4], m3
+ lea r2, [r2 + r3 * 4]  ; r2 += 4 rows, ready for the next invocation
+%endmacro  ; clobbers m0-m7; modifies r0, r1, r2
+
+%macro PROCESS_GETRESIDUAL32_W4_HBD_AVX512_END 0  ; final 4 rows: [r2] = [r0] - [r1] on 16-bit lanes; pointers are NOT advanced
+ movu m0, [r0]  ; rows 0..3 read through r0 (minuend)
+ movu m1, [r0 + r3]  ; r3 = row pitch in bytes
+ movu m2, [r0 + r3 * 2]
+ movu m3, [r0 + r4]  ; expects r4 = 3 * r3, set up by the getResidual32 entry point
+
+ movu m4, [r1]  ; rows 0..3 read through r1 (subtrahend)
+ movu m5, [r1 + r3]
+ movu m6, [r1 + r3 * 2]
+ movu m7, [r1 + r4]
+
+ psubw m0, m4  ; word-wise difference, row 0: m0 = m0 - m4
+ psubw m1, m5
+ psubw m2, m6
+ psubw m3, m7
+
+ movu [r2], m0  ; write the 4 difference rows through r2
+ movu [r2 + r3], m1
+ movu [r2 + r3 * 2], m2
+ movu [r2 + r4], m3
+%endmacro  ; clobbers m0-m7; leaves r0-r4 unchanged
+
+%macro PROCESS_GETRESIDUAL32_W4_AVX512 0  ; 4 rows: bytes from [r0] and [r1] widened to words, [r2] = [r0] - [r1], then advance r0/r1/r2 by 4 rows
+ pmovzxbw m0, [r0]  ; zero-extend each byte of the row to a 16-bit word (minuend rows 0..3)
+ pmovzxbw m1, [r0 + r3]  ; r3 = input row pitch in bytes
+ pmovzxbw m2, [r0 + r3 * 2]
+ pmovzxbw m3, [r0 + r4]  ; expects r4 = 3 * r3, set up by the getResidual32 entry point
+ lea r0, [r0 + r3 * 4]  ; r0 += 4 input rows
+
+ pmovzxbw m4, [r1]  ; subtrahend rows 0..3, widened the same way
+ pmovzxbw m5, [r1 + r3]
+ pmovzxbw m6, [r1 + r3 * 2]
+ pmovzxbw m7, [r1 + r4]
+ lea r1, [r1 + r3 * 4]  ; r1 += 4 input rows
+
+ psubw m0, m4  ; word-wise difference, row 0: m0 = m0 - m4
+ psubw m1, m5
+ psubw m2, m6
+ psubw m3, m7
+
+ movu [r2], m0  ; output row 0
+ movu [r2 + r3 * 2], m1  ; output rows are r3 * 2 bytes apart (16-bit results vs. 8-bit inputs)
+ lea r2, [r2 + r3 * 4]  ; r2 += 2 output rows
+ movu [r2], m2  ; output row 2
+ movu [r2 + r3 * 2], m3  ; output row 3
+ lea r2, [r2 + r3 * 4]  ; r2 += 2 more output rows, ready for the next invocation
+%endmacro  ; clobbers m0-m7; modifies r0, r1, r2
+
+%macro PROCESS_GETRESIDUAL32_W4_AVX512_END 0  ; final 4 rows: bytes widened to words, [r2] = [r0] - [r1]; r0/r1 are NOT advanced
+ pmovzxbw m0, [r0]  ; zero-extend each byte of the row to a 16-bit word (minuend rows 0..3)
+ pmovzxbw m1, [r0 + r3]  ; r3 = input row pitch in bytes
+ pmovzxbw m2, [r0 + r3 * 2]
+ pmovzxbw m3, [r0 + r4]  ; expects r4 = 3 * r3, set up by the getResidual32 entry point
+
+ pmovzxbw m4, [r1]  ; subtrahend rows 0..3, widened the same way
+ pmovzxbw m5, [r1 + r3]
+ pmovzxbw m6, [r1 + r3 * 2]
+ pmovzxbw m7, [r1 + r4]
+
+ psubw m0, m4  ; word-wise difference, row 0: m0 = m0 - m4
+ psubw m1, m5
+ psubw m2, m6
+ psubw m3, m7
+
+ movu [r2], m0  ; output row 0
+ movu [r2 + r3 * 2], m1  ; output rows are r3 * 2 bytes apart (16-bit results vs. 8-bit inputs)
+ lea r2, [r2 + r3 * 4]  ; r2 += 2 output rows (still needed to reach rows 2 and 3)
+ movu [r2], m2  ; output row 2
+ movu [r2 + r3 * 2], m3  ; output row 3; no trailing advance since nothing follows
+%endmacro  ; clobbers m0-m7; modifies r2 only
+
+
+%if HIGH_BIT_DEPTH  ; exactly one getResidual32 variant is assembled, selected by bit depth at build time
+INIT_ZMM avx512  ; m0..m7 in the macros below are ZMM registers (64 bytes = 32 words per row)
+cglobal getResidual32, 4,5,8  ; x86inc prologue: 4 arguments (r0-r3), 5 GPRs (r4 is scratch), 8 vector registers
+ add r3, r3  ; double r3 -- presumably converts the stride from 16-bit samples to bytes; confirm against the prototype
+ lea r4, [r3 * 3]  ; r4 = 3 * r3, offset of the 4th row in each 4-row group
+
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512  ; 7 advancing groups + 1 final group = 8 * 4 = 32 rows
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512_END  ; last group skips the pointer updates
+ RET
+%else  ; 8-bit build: inputs are bytes, outputs are 16-bit words
+INIT_ZMM avx512  ; m0..m7 in the macros below are ZMM registers (32 bytes loaded, widened to 32 words per row)
+cglobal getResidual32, 4,5,8  ; x86inc prologue: 4 arguments (r0-r3), 5 GPRs (r4 is scratch), 8 vector registers
+ lea r4, [r3 * 3]  ; r4 = 3 * r3; r3 is used as-is as the input row pitch in bytes
+
+ PROCESS_GETRESIDUAL32_W4_AVX512  ; 7 advancing groups + 1 final group = 8 * 4 = 32 rows
+ PROCESS_GETRESIDUAL32_W4_AVX512
+ PROCESS_GETRESIDUAL32_W4_AVX512
+ PROCESS_GETRESIDUAL32_W4_AVX512
+ PROCESS_GETRESIDUAL32_W4_AVX512
+ PROCESS_GETRESIDUAL32_W4_AVX512
+ PROCESS_GETRESIDUAL32_W4_AVX512
+ PROCESS_GETRESIDUAL32_W4_AVX512_END  ; last group skips the r0/r1 and final r2 updates
+ RET
+%endif
+
;-----------------------------------------------------------------------------
; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list