[x265] [PATCH 261 of 307] [x265-avx512] x86: AVX512 psyCost_pp for main10 and main12
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:19 CEST 2018
# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1512721859 -19800
# Fri Dec 08 14:00:59 2017 +0530
# Node ID ab5b1becd807647d5264381c1fb74750c20fdfae
# Parent 42fe321e5cdf9ad260e4e5c7a64137a8b7601915
[x265-avx512] x86: AVX512 psyCost_pp for main10 and main12
Size  | AVX2 speedup | AVX512 speedup
------+--------------+---------------
16x16 |    13.86x    |     18.45x
32x32 |    13.48x    |     19.86x
64x64 |    13.51x    |     18.33x
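For context, these kernels vectorize the per-block psy cost from x265's C
reference: a block's energy is its SA8D minus a quarter of its SAD (the DC
term), and the cost is the absolute difference between source and recon
energies. A minimal scalar sketch, with hypothetical helpers
sad8x8()/sa8d8x8() standing in for x265's primitives:

    #include <stdint.h>
    #include <stdlib.h>

    /* Hypothetical scalar helpers standing in for x265's primitives:
     * sad8x8() sums the samples of an 8x8 block (SAD against zero) and
     * sa8d8x8() computes its 8x8 Hadamard (SA8D) sum. */
    int sad8x8(const uint16_t* p, intptr_t stride);
    int sa8d8x8(const uint16_t* p, intptr_t stride);

    static int psyCost8x8(const uint16_t* src, intptr_t sstride,
                          const uint16_t* rec, intptr_t rstride)
    {
        /* block "energy": AC content = SA8D minus the DC term (SAD >> 2) */
        int srcEnergy = sa8d8x8(src, sstride) - (sad8x8(src, sstride) >> 2);
        int recEnergy = sa8d8x8(rec, rstride) - (sad8x8(rec, rstride) >> 2);
        return abs(srcEnergy - recEnergy);
    }

The entry points below sum this cost over 8x8 sub-blocks (main12) or 16x8
strips (main10).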
diff -r 42fe321e5cdf -r ab5b1becd807 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Dec 12 16:48:04 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Dec 08 14:00:59 2017 +0530
@@ -3080,6 +3080,9 @@
p.pu[LUMA_16x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x64>;
p.pu[LUMA_48x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_48x64>;
+ p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx512);
+ p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx512);
+ p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx512);
}
#endif
}
diff -r 42fe321e5cdf -r ab5b1becd807 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Dec 12 16:48:04 2017 +0530
+++ b/source/common/x86/pixel-a.asm Fri Dec 08 14:00:59 2017 +0530
@@ -45,6 +45,8 @@
times 2 dw 1, -1
times 4 dw 1
times 2 dw 1, -1
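+; qword indices for vpermi2q: interleave 64-bit lanes of two registers
+; (cross-lane stage of the AVX512 SA8D below)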
+psy_pp_shuff1: dq 0, 1, 8, 9, 4, 5, 12, 13
+psy_pp_shuff2: dq 2, 3, 10, 11, 6, 7, 14, 15
ALIGN 32
transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
@@ -10403,6 +10405,369 @@
pabsd m11, m11
%endmacro
+%macro PSY_COST_PP_8x8_AVX512_MAIN12 0
+ ; load source and recon pixels
+ lea r4, [r1 * 3]
+ pmovzxwd ym0, [r0]
+ pmovzxwd ym1, [r0 + r1]
+ pmovzxwd ym2, [r0 + r1 * 2]
+ pmovzxwd ym3, [r0 + r4]
+ lea r5, [r0 + r1 * 4]
+ pmovzxwd ym4, [r5]
+ pmovzxwd ym5, [r5 + r1]
+ pmovzxwd ym6, [r5 + r1 * 2]
+ pmovzxwd ym7, [r5 + r4]
+
+ lea r4, [r3 * 3]
+ pmovzxwd ym16, [r2]
+ pmovzxwd ym17, [r2 + r3]
+ pmovzxwd ym18, [r2 + r3 * 2]
+ pmovzxwd ym19, [r2 + r4]
+ lea r5, [r2 + r3 * 4]
+ pmovzxwd ym20, [r5]
+ pmovzxwd ym21, [r5 + r3]
+ pmovzxwd ym22, [r5 + r3 * 2]
+ pmovzxwd ym23, [r5 + r4]
+
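+ ; merge: source rows in the low 256 bits, recon rows in the high 256
+ ; bits, so one transform covers both blocks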
+ vinserti64x4 m0, m0, ym16, 1
+ vinserti64x4 m1, m1, ym17, 1
+ vinserti64x4 m2, m2, ym18, 1
+ vinserti64x4 m3, m3, ym19, 1
+ vinserti64x4 m4, m4, ym20, 1
+ vinserti64x4 m5, m5, ym21, 1
+ vinserti64x4 m6, m6, ym22, 1
+ vinserti64x4 m7, m7, ym23, 1
+
+ ; source + recon SAD
+ paddd m8, m0, m1
+ paddd m8, m2
+ paddd m8, m3
+ paddd m8, m4
+ paddd m8, m5
+ paddd m8, m6
+ paddd m8, m7
+
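+ ; reduce each 256-bit half separately (low = source, high = recon);
+ ; the final psrld by 2 is the ">> 2" on the SAD term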
+ vextracti64x4 ym15, m8, 1
+
+ vextracti128 xm9, ym8, 1
+ paddd ym8, ym9 ; sad_8x8
+ movhlps xm9, xm8
+ paddd xm8, xm9
+ pshuflw xm9, xm8, 0Eh
+ paddd xm8, xm9
+ psrld ym8, 2
+
+ vextracti128 xm9, ym15, 1
+ paddd ym15, ym9 ; sad_8x8
+ movhlps xm9, xm15
+ paddd xm15, xm9
+ pshuflw xm9, xm15, 0Eh
+ paddd xm15, xm9
+ psrld ym15, 2
+
+ ; source and recon SA8D
+ psubd m9, m1, m0
+ paddd m0, m1
+ psubd m1, m3, m2
+ paddd m2, m3
+ punpckhdq m3, m0, m9
+ punpckldq m0, m9
+ psubd m9, m3, m0
+ paddd m0, m3
+ punpckhdq m3, m2, m1
+ punpckldq m2, m1
+ psubd m10, m3, m2
+ paddd m2, m3
+ psubd m3, m5, m4
+ paddd m4, m5
+ psubd m5, m7, m6
+ paddd m6, m7
+ punpckhdq m1, m4, m3
+ punpckldq m4, m3
+ psubd m7, m1, m4
+ paddd m4, m1
+ punpckhdq m3, m6, m5
+ punpckldq m6, m5
+ psubd m1, m3, m6
+ paddd m6, m3
+ psubd m3, m2, m0
+ paddd m0, m2
+ psubd m2, m10, m9
+ paddd m9, m10
+ punpckhqdq m5, m0, m3
+ punpcklqdq m0, m3
+ psubd m10, m5, m0
+ paddd m0, m5
+ punpckhqdq m3, m9, m2
+ punpcklqdq m9, m2
+ psubd m5, m3, m9
+ paddd m9, m3
+ psubd m3, m6, m4
+ paddd m4, m6
+ psubd m6, m1, m7
+ paddd m7, m1
+ punpckhqdq m2, m4, m3
+ punpcklqdq m4, m3
+ psubd m1, m2, m4
+ paddd m4, m2
+ punpckhqdq m3, m7, m6
+ punpcklqdq m7, m6
+
+ psubd m2, m3, m7
+ paddd m7, m3
+ psubd m3, m4, m0
+ paddd m0, m4
+ psubd m4, m1, m10
+ paddd m10, m1
+
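+ ; last butterfly via vpermi2q: psy_pp_shuff1/2 split each sum/diff
+ ; pair across lanes, then max(|a|, |b|) accumulates the satd terms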
+ mova m16, m13
+ mova m17, m14
+ vpermi2q m16, m0, m3
+ vpermi2q m17, m0, m3
+
+ pabsd m17, m17
+ pabsd m16, m16
+ pmaxsd m17, m16
+
+ mova m18, m13
+ mova m19, m14
+ vpermi2q m18, m10, m4
+ vpermi2q m19, m10, m4
+
+ pabsd m19, m19
+ pabsd m18, m18
+ pmaxsd m19, m18
+ psubd m18, m7, m9
+ paddd m9, m7
+ psubd m7, m2, m5
+ paddd m5, m2
+
+ mova m20, m13
+ mova m21, m14
+ vpermi2q m20, m9, m18
+ vpermi2q m21, m9, m18
+
+ pabsd m21, m21
+ pabsd m20, m20
+ pmaxsd m21, m20
+
+ mova m22, m13
+ mova m23, m14
+ vpermi2q m22, m5, m7
+ vpermi2q m23, m5, m7
+
+ pabsd m23, m23
+ pabsd m22, m22
+ pmaxsd m23, m22
+ paddd m17, m21
+ paddd m17, m19
+ paddd m17, m23
+
+ vextracti64x4 ym26, m17, 1
+
+ vextracti128 xm9, ym17, 1
+ paddd ym17, ym9 ; horizontal sum of the satd terms
+ movhlps xm9, xm17
+ paddd xm17, xm9
+ pshuflw xm9, xm17, 0Eh
+ paddd xm17, xm9
+ paddd ym17, [pd_1]
+ psrld ym17, 1 ; sa8d_8x8
+
+ vextracti128 xm9, ym26, 1
+ paddd ym26, ym9 ; horizontal sum of the satd terms
+ movhlps xm9, xm26
+ paddd xm26, xm9
+ pshuflw xm9, xm26, 0Eh
+ paddd xm26, xm9
+ paddd ym26, [pd_1]
+ psrld ym26, 1 ; sa8d_8x8
+
+ psubd ym11, ym17, ym8 ; sa8d_8x8 - sad_8x8
+ psubd ym12, ym26, ym15 ; sa8d_8x8 - sad_8x8
+
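+ ; psy cost of this 8x8 block: |source energy - recon energy|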
+ psubd ym11, ym12
+ pabsd ym11, ym11
+%endmacro
+
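+; 10-bit samples fit in 16 bits through the Hadamard stages, so the
+; MAIN10 path below works in the word domain on a 16x8 region per call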
+%macro PSY_PP_INPUT_AVX512_MAIN10 0
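+ ; per row: lane 0 = source cols 0-7, lane 1 = recon cols 0-7,
+ ; lane 2 = source cols 8-15, lane 3 = recon cols 8-15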
+ lea r4, [r1 * 3]
+ movu xm0, [r0]
+ movu xm1, [r0 + r1]
+ movu xm2, [r0 + r1 * 2]
+ movu xm3, [r0 + r4]
+ lea r5, [r0 + r1 * 4]
+ movu xm4, [r5]
+ movu xm5, [r5 + r1]
+ movu xm6, [r5 + r1 * 2]
+ movu xm7, [r5 + r4]
+
+ lea r4, [r3 * 3]
+ vinserti128 ym0, ym0, [r2], 1
+ vinserti128 ym1, ym1, [r2 + r3], 1
+ vinserti128 ym2, ym2, [r2 + r3 * 2], 1
+ vinserti128 ym3, ym3, [r2 + r4], 1
+ lea r5, [r2 + r3 * 4]
+ vinserti128 ym4, ym4, [r5], 1
+ vinserti128 ym5, ym5, [r5 + r3], 1
+ vinserti128 ym6, ym6, [r5 + r3 * 2], 1
+ vinserti128 ym7, ym7, [r5 + r4], 1
+
+ add r0, 16
+ add r2, 16
+
+ lea r4, [r1 * 3]
+ vinserti32x4 m0, m0, [r0], 2
+ vinserti32x4 m1, m1, [r0 + r1], 2
+ vinserti32x4 m2, m2, [r0 + r1 * 2], 2
+ vinserti32x4 m3, m3, [r0 + r4], 2
+ lea r5, [r0 + r1 * 4]
+ vinserti32x4 m4, m4, [r5], 2
+ vinserti32x4 m5, m5, [r5 + r1], 2
+ vinserti32x4 m6, m6, [r5 + r1 * 2], 2
+ vinserti32x4 m7, m7, [r5 + r4], 2
+
+ lea r4, [r3 * 3]
+ vinserti32x4 m0, m0, [r2], 3
+ vinserti32x4 m1, m1, [r2 + r3], 3
+ vinserti32x4 m2, m2, [r2 + r3 * 2], 3
+ vinserti32x4 m3, m3, [r2 + r4], 3
+ lea r5, [r2 + r3 * 4]
+ vinserti32x4 m4, m4, [r5], 3
+ vinserti32x4 m5, m5, [r5 + r3], 3
+ vinserti32x4 m6, m6, [r5 + r3 * 2], 3
+ vinserti32x4 m7, m7, [r5 + r4], 3
+%endmacro
+
+
+%macro PSY_PP_16x8_AVX512_MAIN10 0
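+ ; SAD (sum of samples) per 128-bit lane: accumulate the eight rows in
+ ; words, widen via pmaddwd with pw_1, reduce, then divide by 4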
+ paddw m8, m0, m1
+ paddw m8, m2
+ paddw m8, m3
+ paddw m8, m4
+ paddw m8, m5
+ paddw m8, m6
+ paddw m8, m7
+ pmaddwd m8, m14
+
+ psrldq m9, m8, 8
+ paddd m8, m9
+ psrldq m9, m8, 4
+ paddd m8, m9
+ psrld m8, 2
+
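+ ; 8x8 Hadamard in the word domain: row butterflies, then column
+ ; stages via word/dword/qword unpacks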
+ psubw m9, m1, m0
+ paddw m0, m1
+ psubw m1, m3, m2
+ paddw m2, m3
+ punpckhwd m3, m0, m9
+ punpcklwd m0, m9
+ psubw m9, m3, m0
+ paddw m0, m3
+ punpckhwd m3, m2, m1
+ punpcklwd m2, m1
+ psubw m10, m3, m2
+ paddw m2, m3
+
+ psubw m3, m5, m4
+ paddw m4, m5
+ psubw m5, m7, m6
+ paddw m6, m7
+ punpckhwd m1, m4, m3
+ punpcklwd m4, m3
+ psubw m7, m1, m4
+ paddw m4, m1
+ punpckhwd m3, m6, m5
+ punpcklwd m6, m5
+ psubw m1, m3, m6
+ paddw m6, m3
+
+ psubw m3, m2, m0
+ paddw m0, m2
+ psubw m2, m10, m9
+ paddw m9, m10
+ punpckhdq m5, m0, m3
+ punpckldq m0, m3
+ psubw m10, m5, m0
+ paddw m0, m5
+ punpckhdq m3, m9, m2
+ punpckldq m9, m2
+ psubw m5, m3, m9
+ paddw m9, m3
+
+ psubw m3, m6, m4
+ paddw m4, m6
+ psubw m6, m1, m7
+ paddw m7, m1
+ punpckhdq m2, m4, m3
+ punpckldq m4, m3
+ psubw m1, m2, m4
+ paddw m4, m2
+ punpckhdq m3, m7, m6
+ punpckldq m7, m6
+ psubw m2, m3, m7
+ paddw m7, m3
+
+ psubw m3, m4, m0
+ paddw m0, m4
+ psubw m4, m1, m10
+ paddw m10, m1
+ punpckhqdq m6, m0, m3
+ punpcklqdq m0, m3
+ pabsw m0, m0
+ pabsw m6, m6
+ pmaxsw m0, m6
+ punpckhqdq m3, m10, m4
+ punpcklqdq m10, m4
+ pabsw m10, m10
+ pabsw m3, m3
+ pmaxsw m10, m3
+
+ psubw m3, m7, m9
+ paddw m9, m7
+ psubw m7, m2, m5
+ paddw m5, m2
+ punpckhqdq m4, m9, m3
+ punpcklqdq m9, m3
+ pabsw m9, m9
+ pabsw m4, m4
+ pmaxsw m9, m4
+ punpckhqdq m3, m5, m7
+ punpcklqdq m5, m7
+ pabsw m5, m5
+ pabsw m3, m3
+ pmaxsw m5, m3
+
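+ ; sum the four max terms (word pairs accumulate as dwords), fold each
+ ; dword's 16-bit halves, reduce, then (sum + 1) >> 1 minus the SAD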
+ paddd m0, m9
+ paddd m0, m10
+ paddd m0, m5
+ psrld m9, m0, 16
+ pslld m0, 16
+ psrld m0, 16
+ paddd m0, m9
+ psrldq m9, m0, 8
+ paddd m0, m9
+ psrldq m9, m0, 4
+ paddd m0, m9
+ paddd m0, m15
+ psrld m0, 1
+ psubd m0, m8
+
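+ ; each 128-bit lane holds one (sa8d - sad) energy: |source - recon|
+ ; for both 8-column halves is summed into xm1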
+ vextracti64x4 ym2, m0, 1
+
+ vextracti128 xm3, ym2, 1
+ psubd xm3, xm2
+ pabsd xm3, xm3
+
+ vextracti128 xm1, ym0, 1
+ psubd xm1, xm0
+ pabsd xm1, xm1
+ paddd xm1, xm3
+%endmacro
+
+
%if ARCH_X86_64
INIT_YMM avx2
%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
@@ -10672,6 +11037,173 @@
RET
%endif
%endif
+%if ARCH_X86_64
+INIT_ZMM avx512
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
+cglobal psyCost_pp_16x16, 4, 10, 27
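+ ; args: r0 = source, r1 = source stride, r2 = recon, r3 = recon stride
+ ; (strides in pixels; doubled below for 16-bit samples)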
+ add r1d, r1d
+ add r3d, r3d
+ pxor m24, m24
+ movu m13, [psy_pp_shuff1]
+ movu m14, [psy_pp_shuff2]
+
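+ ; 16x16: 2x2 grid of 8x8 psy-cost blocks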
+ mov r8d, 2
+.loopH:
+ mov r9d, 2
+.loopW:
+ PSY_COST_PP_8x8_AVX512_MAIN12
+
+ paddd xm24, xm11
+ add r0, 16
+ add r2, 16
+ dec r9d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 32]
+ lea r2, [r2 + r3 * 8 - 32]
+ dec r8d
+ jnz .loopH
+ movd eax, xm24
+ RET
+%endif
+
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
+cglobal psyCost_pp_16x16, 4, 10, 16
+ add r1d, r1d
+ add r3d, r3d
+ pxor m11, m11
+ vbroadcasti32x8 m14, [pw_1]
+ vbroadcasti32x8 m15, [pd_1]
+
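+ ; 16x16: two 16x8 strips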
+ mov r8d, 2
+.loopH:
+ PSY_PP_INPUT_AVX512_MAIN10
+ PSY_PP_16x8_AVX512_MAIN10
+
+ paddd xm11, xm1
+ lea r0, [r0 + r1 * 8 - 16]
+ lea r2, [r2 + r3 * 8 - 16]
+ dec r8d
+ jnz .loopH
+ movd eax, xm11
+ RET
+%endif
+%endif
+
+%if ARCH_X86_64
+INIT_ZMM avx512
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
+cglobal psyCost_pp_32x32, 4, 10, 27
+ add r1d, r1d
+ add r3d, r3d
+ pxor m24, m24
+ movu m13, [psy_pp_shuff1]
+ movu m14, [psy_pp_shuff2]
+
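+ ; 32x32: 4x4 grid of 8x8 blocks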
+ mov r8d, 4
+.loopH:
+ mov r9d, 4
+.loopW:
+ PSY_COST_PP_8x8_AVX512_MAIN12
+
+ paddd xm24, xm11
+ add r0, 16
+ add r2, 16
+ dec r9d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 64]
+ lea r2, [r2 + r3 * 8 - 64]
+ dec r8d
+ jnz .loopH
+ movd eax, xm24
+ RET
+%endif
+
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
+cglobal psyCost_pp_32x32, 4, 10, 16
+ add r1d, r1d
+ add r3d, r3d
+ pxor m11, m11
+ vbroadcasti32x8 m14, [pw_1]
+ vbroadcasti32x8 m15, [pd_1]
+
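+ ; 32x32: 4 strips of 8 rows, two 16x8 blocks per strip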
+ mov r8d, 4
+.loopH:
+ mov r9d, 2
+.loopW:
+ PSY_PP_INPUT_AVX512_MAIN10
+ PSY_PP_16x8_AVX512_MAIN10
+
+ paddd xm11, xm1
+ add r0, 16
+ add r2, 16
+ dec r9d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 64]
+ lea r2, [r2 + r3 * 8 - 64]
+ dec r8d
+ jnz .loopH
+ movd eax, xm11
+ RET
+%endif
+%endif
+
+%if ARCH_X86_64
+INIT_ZMM avx512
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
+cglobal psyCost_pp_64x64, 4, 10, 27
+ add r1d, r1d
+ add r3d, r3d
+ pxor m24, m24
+ movu m13, [psy_pp_shuff1]
+ movu m14, [psy_pp_shuff2]
+
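+ ; 64x64: 8x8 grid of 8x8 blocks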
+ mov r8d, 8
+.loopH:
+ mov r9d, 8
+.loopW:
+ PSY_COST_PP_8x8_AVX512_MAIN12
+
+ paddd xm24, xm11
+ add r0, 16
+ add r2, 16
+ dec r9d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 128]
+ lea r2, [r2 + r3 * 8 - 128]
+ dec r8d
+ jnz .loopH
+ movd eax, xm24
+ RET
+%endif
+
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
+cglobal psyCost_pp_64x64, 4, 10, 16
+ add r1d, r1d
+ add r3d, r3d
+ pxor m11, m11
+ vbroadcasti32x8 m14, [pw_1]
+ vbroadcasti32x8 m15, [pd_1]
+
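+ ; 64x64: 8 strips of 8 rows, four 16x8 blocks per strip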
+ mov r8d, 8
+.loopH:
+ mov r9d, 4
+.loopW:
+ PSY_PP_INPUT_AVX512_MAIN10
+ PSY_PP_16x8_AVX512_MAIN10
+
+ paddd xm11, xm1
+ add r0, 16
+ add r2, 16
+ dec r9d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 128]
+ lea r2, [r2 + r3 * 8 - 128]
+ dec r8d
+ jnz .loopH
+ movd eax, xm11
+ RET
+%endif
+%endif
;---------------------------------------------------------------------------------------------------------------------
;int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)