[x265] [PATCH 269 of 307] [x265-avx512] x86: AVX512 psyCost_pp for main
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:27 CEST 2018
# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1513073610 -19800
# Tue Dec 12 15:43:30 2017 +0530
# Node ID a9be28cde01fd379dff1aec4bfcf809c7c96f9d2
# Parent 9a2c5411769847c4283594b99c1b07a99e92ea4a
[x265-avx512] x86: AVX512 psyCost_pp for main
Size  | AVX2 performance | AVX512 performance
------|------------------|-------------------
16x16 |      10.51x      |      14.52x
32x32 |      10.85x      |      13.76x
64x64 |      10.23x      |      13.62x
diff -r 9a2c54117698 -r a9be28cde01f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Dec 20 12:15:46 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Dec 12 15:43:30 2017 +0530
@@ -5235,6 +5235,9 @@
//p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vps = PFX(interp_4tap_vert_ps_16x12_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vps = PFX(interp_4tap_vert_ps_16x4_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vps = PFX(interp_4tap_vert_ps_16x64_avx512);
+ p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx512);
+ p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx512);
+ p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vps = PFX(interp_4tap_vert_ps_48x64_avx512);
diff -r 9a2c54117698 -r a9be28cde01f source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Dec 20 12:15:46 2017 +0530
+++ b/source/common/x86/pixel-a.asm Tue Dec 12 15:43:30 2017 +0530
@@ -47,6 +47,7 @@
times 2 dw 1, -1
psy_pp_shuff1: dq 0, 1, 8, 9, 4, 5, 12, 13
psy_pp_shuff2: dq 2, 3, 10, 11, 6, 7, 14, 15
+psy_pp_shuff3: dq 0, 0, 8, 8, 1, 1, 9, 9
ALIGN 32
transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
@@ -10767,6 +10768,227 @@
paddd xm1, xm3
%endmacro
+%macro PSY_PP_INPUT_AVX512_MAIN 0  ; Load a 16x8 tile of source (r0, stride r1) and recon (r2, stride r3) pixels into m0-m7, one row pair per zmm. Caller must preset r4 = 3*r1, r7 = 3*r3, and m26 = psy_pp_shuff3. Clobbers r5, r6, m16-m23.
+ movu xm16, [r0 + r1 * 0]  ; source rows 0..3, 16 bytes (16 pixels) each
+ movu xm17, [r0 + r1 * 1]
+ movu xm18, [r0 + r1 * 2]
+ movu xm19, [r0 + r4 * 1]  ; row 3 via r4 = 3 * srcStride
+
+ movu xm20, [r2 + r3 * 0]  ; recon rows 0..3
+ movu xm21, [r2 + r3 * 1]
+ movu xm22, [r2 + r3 * 2]
+ movu xm23, [r2 + r7 * 1]  ; row 3 via r7 = 3 * reconStride
+
+ mova m0, m26  ; m26 = psy_pp_shuff3 (qword indices 0,0,8,8,1,1,9,9)
+ vpermi2q m0, m16, m20  ; merge src (idx 0-7) and recon (idx 8-15) qwords: each 128-bit lane holds one 8-pixel half duplicated, matching the hmul_8p pmaddubsw layout -- NOTE(review): inferred from the shuff3 indices; confirm against the AVX2 psy_pp path
+ mova m1, m26
+ vpermi2q m1, m17, m21
+ mova m2, m26
+ vpermi2q m2, m18, m22
+ mova m3, m26
+ vpermi2q m3, m19, m23
+
+
+ lea r5, [r0 + r1 * 4]  ; r5 -> source rows 4..7 (r0 itself left unchanged for the caller)
+ lea r6, [r2 + r3 * 4]  ; r6 -> recon rows 4..7
+
+ movu xm16, [r5 + r1 * 0]  ; source rows 4..7
+ movu xm17, [r5 + r1 * 1]
+ movu xm18, [r5 + r1 * 2]
+ movu xm19, [r5 + r4 * 1]
+
+ movu xm20, [r6 + r3 * 0]  ; recon rows 4..7
+ movu xm21, [r6 + r3 * 1]
+ movu xm22, [r6 + r3 * 2]
+ movu xm23, [r6 + r7 * 1]
+
+ mova m4, m26  ; same src/recon qword interleave for rows 4..7
+ vpermi2q m4, m16, m20
+ mova m5, m26
+ vpermi2q m5, m17, m21
+ mova m6, m26
+ vpermi2q m6, m18, m22
+ mova m7, m26
+ vpermi2q m7, m19, m23
+%endmacro
+
+%macro PSY_PP_16x8_AVX512_MAIN 0  ; Compute the psy cost of a 16x8 tile from the row data in m0-m7 (src/recon interleaved per 128-bit lane by PSY_PP_INPUT_AVX512_MAIN). Result left in xm16. Requires m8 = hmul_8p, m14 = pw_1, m15 = pd_1. Clobbers m0-m7, m9-m11, m16-m25.
+ pmaddubsw m0, m8  ; hmul_8p multipliers give horizontal a+b / a-b word pairs per row (first horizontal Hadamard stage)
+ pmaddubsw m1, m8
+ pmaddubsw m2, m8
+ pmaddubsw m3, m8
+ pmaddubsw m4, m8
+ pmaddubsw m5, m8
+ pmaddubsw m6, m8
+ pmaddubsw m7, m8
+
+ paddw m11, m0, m1  ; m11 = vertical sum of all 8 rows (energy/DC term)
+ paddw m11, m2
+ paddw m11, m3
+ paddw m11, m4
+ paddw m11, m5
+ paddw m11, m6
+ paddw m11, m7
+
+ pmaddwd m11, m14  ; horizontal word reduction with pw_1
+ psrldq m10, m11, 4
+ paddd m11, m10
+ psrld m11, 2  ; m11 = per-lane SAD-like term, scaled down by 4
+
+ mova m9, m0  ; vertical Hadamard butterflies across row pairs (0,1) and (2,3)
+ paddw m0, m1
+ psubw m1, m9
+ mova m9, m2
+ paddw m2, m3
+ psubw m3, m9
+ mova m9, m0  ; second vertical stage: combine (0,2) and (1,3)
+ paddw m0, m2
+ psubw m2, m9
+ mova m9, m1
+ paddw m1, m3
+ psubw m3, m9
+
+ movdqa m9, m4  ; same two butterfly stages for rows 4..7
+ paddw m4, m5
+ psubw m5, m9
+ movdqa m9, m6
+ paddw m6, m7
+ psubw m7, m9
+ movdqa m9, m4
+ paddw m4, m6
+ psubw m6, m9
+ movdqa m9, m5
+ paddw m5, m7
+ psubw m7, m9
+
+ movdqa m9, m0  ; final vertical stage: combine top (0..3) and bottom (4..7) halves
+ paddw m0, m4
+ psubw m4, m9
+ movdqa m9, m1
+ paddw m1, m5
+ psubw m5, m9
+
+ mova m9, m0  ; split even/odd dwords of (m0,m4) for the remaining horizontal stage
+ vshufps m9, m9, m4, 11011101b  ; odd dwords
+ vshufps m0, m0, m4, 10001000b  ; even dwords
+
+ movdqa m4, m0
+ paddw m16, m0, m9  ; m16/m17 = sum/diff of shuffled halves
+ psubw m17, m9, m4
+
+ movaps m4, m1  ; same shuffle/butterfly for (m1,m5)
+ vshufps m4, m4, m5, 11011101b
+ vshufps m1, m1, m5, 10001000b
+
+ movdqa m5, m1
+ paddw m18, m1, m4
+ psubw m19, m4, m5
+
+ movdqa m5, m2  ; final vertical stage for the remaining pair sets (2,6) and (3,7)
+ paddw m2, m6
+ psubw m6, m5
+ movdqa m5, m3
+ paddw m3, m7
+ psubw m7, m5
+
+ movaps m5, m2
+ vshufps m5, m5, m6, 11011101b
+ vshufps m2, m2, m6, 10001000b
+
+ movdqa m6, m2
+ paddw m20, m2, m5
+ psubw m21, m5, m6
+
+ movaps m6, m3
+
+ vshufps m6, m6, m7, 11011101b
+ vshufps m3, m3, m7, 10001000b
+
+ movdqa m7, m3
+ paddw m22, m3, m6
+ psubw m23, m6, m7
+
+ movdqa m7, m16  ; keep a copy of m16 before the word blend below
+
+ vextracti64x4 ym24, m16, 1  ; blend alternating words of (m16,m17), done per 256-bit half since pblendw has no zmm form
+ vextracti64x4 ym25, m17, 1
+ pblendw ym16, ym17, 10101010b
+ pblendw ym24, ym25, 10101010b
+ vinserti64x4 m16, m16, ym24, 1
+
+ pslld m17, 10h  ; pair the complementary words: low word of m17 with high word of the saved m16 copy
+ psrld m7, 10h
+ por m17, m7
+ pabsw m16, m16  ; max(|sum|, |diff|) per coefficient (SA8D-style final stage)
+ pabsw m17, m17
+ pmaxsw m16, m17
+ movdqa m7, m18  ; repeat the blend/shift/abs/max sequence for (m18,m19)
+
+ vextracti64x4 ym24, m18, 1
+ vextracti64x4 ym25, m19, 1
+ pblendw ym18, ym19, 10101010b
+ pblendw ym24, ym25, 10101010b
+ vinserti64x4 m18, m18, ym24, 1
+
+ pslld m19, 10h
+ psrld m7, 10h
+ por m19, m7
+ pabsw m18, m18
+ pabsw m19, m19
+ pmaxsw m18, m19
+ movdqa m7, m20  ; ... and for (m20,m21)
+
+ vextracti64x4 ym24, m20, 1
+ vextracti64x4 ym25, m21, 1
+ pblendw ym20, ym21, 10101010b
+ pblendw ym24, ym25, 10101010b
+ vinserti64x4 m20, m20, ym24, 1
+
+ pslld m21, 10h
+ psrld m7, 10h
+ por m21, m7
+ pabsw m20, m20
+ pabsw m21, m21
+ pmaxsw m20, m21
+ mova m7, m22  ; ... and for (m22,m23)
+
+ vextracti64x4 ym24, m22, 1
+ vextracti64x4 ym25, m23, 1
+ pblendw ym22, ym23, 10101010b
+ pblendw ym24, ym25, 10101010b
+ vinserti64x4 m22, m22, ym24, 1
+
+ pslld m23, 10h
+ psrld m7, 10h
+ por m23, m7
+ pabsw m22, m22
+ pabsw m23, m23
+ pmaxsw m22, m23
+ paddw m16, m18  ; sum all max'd coefficient groups
+ paddw m16, m20
+ paddw m16, m22
+ pmaddwd m16, m14  ; horizontal reduce words -> dwords with pw_1
+ psrldq m1, m16, 8
+ paddd m16, m1
+
+ pshuflw m1, m16, 00001110b
+ paddd m16, m1
+ paddd m16, m15  ; round (+pd_1) then halve the per-lane SA8D term
+ psrld m16, 1
+
+ psubd m16, m11  ; per-lane energy = sa8d - (sad >> 2); src and recon energies sit in alternating 128-bit lanes
+ vextracti64x4 ym2, m16, 1
+
+ vextracti128 xm1, ym16, 1  ; psy cost = |energy(src) - energy(recon)| for each 8x8 quadrant, summed into xm16
+ psubd xm16, xm1
+ pabsd xm16, xm16
+
+ vextracti128 xm3, ym2, 1
+ psubd xm3, xm2
+ pabsd xm3, xm3
+ paddd xm16, xm3
+%endmacro
+
%if ARCH_X86_64
INIT_YMM avx2
@@ -11087,6 +11309,30 @@
movd eax, xm11
RET
%endif
+
+%if BIT_DEPTH == 8  ; AVX512 psy_cost_pp path is implemented for 8-bit pixels only
+cglobal psyCost_pp_16x16, 4, 10, 27  ; psy_cost_pp(src, srcStride, recon, reconStride) -> eax; 4 args, 10 GPRs, 27 vector regs (m0-m26) -- arg roles assumed from the psy_cost_pp primitive table; confirm against pixel.h
+ lea r4, [3 * r1]  ; r4 = 3 * srcStride (consumed by PSY_PP_INPUT_AVX512_MAIN)
+ lea r7, [3 * r3]  ; r7 = 3 * reconStride
+ vbroadcasti32x8 m8, [hmul_8p]  ; +1/-1 byte multipliers for pmaddubsw
+ pxor m13, m13  ; running cost accumulator = 0
+ vbroadcasti32x8 m14, [pw_1]  ; word-reduction constant
+ vbroadcasti32x8 m15, [pd_1]  ; rounding constant
+ movu m26, [psy_pp_shuff3]  ; qword permute mask that interleaves src/recon lanes
+
+ mov r8d, 2  ; a 16x16 block = two 16x8 strips
+.loopH:
+ PSY_PP_INPUT_AVX512_MAIN  ; load next 16x8 src/recon strip into m0-m7
+ PSY_PP_16x8_AVX512_MAIN  ; strip cost -> xm16
+
+ paddd m13, m16  ; accumulate strip cost
+ lea r0, [r0 + r1 * 8]  ; advance both surfaces by 8 rows
+ lea r2, [r2 + r3 * 8]
+ dec r8d
+ jnz .loopH
+ movd eax, xm13  ; return total psy cost
+ RET
+%endif
%endif
%if ARCH_X86_64
@@ -11145,6 +11391,36 @@
movd eax, xm11
RET
%endif
+
+%if BIT_DEPTH == 8  ; 8-bit pixels only
+cglobal psyCost_pp_32x32, 4, 10, 27  ; psy_cost_pp(src, srcStride, recon, reconStride) -> eax; same setup as the 16x16 kernel, tiled over a 32x32 block
+ lea r4, [3 * r1]  ; r4 = 3 * srcStride
+ lea r7, [3 * r3]  ; r7 = 3 * reconStride
+ vbroadcasti32x8 m8, [hmul_8p]  ; pmaddubsw multipliers
+ pxor m13, m13  ; total cost accumulator
+ vbroadcasti32x8 m14, [pw_1]
+ vbroadcasti32x8 m15, [pd_1]
+ movu m26, [psy_pp_shuff3]  ; src/recon interleave permute
+
+ mov r8d, 4  ; 4 row strips of 8 rows each
+.loopH:
+ mov r9d, 2  ; 2 column tiles of 16 pixels each
+.loopW:
+ PSY_PP_INPUT_AVX512_MAIN
+ PSY_PP_16x8_AVX512_MAIN
+
+ paddd m13, m16  ; accumulate 16x8 tile cost
+ add r0, 16  ; step right one 16-pixel tile
+ add r2, 16
+ dec r9d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 32]  ; next strip: down 8 rows, rewind the 2*16 column advance
+ lea r2, [r2 + r3 * 8 - 32]
+ dec r8d
+ jnz .loopH
+ movd eax, xm13  ; return total psy cost
+ RET
+%endif
%endif
%if ARCH_X86_64
@@ -11203,6 +11479,36 @@
movd eax, xm11
RET
%endif
+
+%if BIT_DEPTH == 8  ; 8-bit pixels only
+cglobal psyCost_pp_64x64, 4, 10, 27  ; psy_cost_pp(src, srcStride, recon, reconStride) -> eax; same kernel tiled over a 64x64 block
+ lea r4, [3 * r1]  ; r4 = 3 * srcStride
+ lea r7, [3 * r3]  ; r7 = 3 * reconStride
+ vbroadcasti32x8 m8, [hmul_8p]  ; pmaddubsw multipliers
+ pxor m13, m13  ; total cost accumulator
+ vbroadcasti32x8 m14, [pw_1]
+ vbroadcasti32x8 m15, [pd_1]
+ movu m26, [psy_pp_shuff3]  ; src/recon interleave permute
+
+ mov r8d, 8  ; 8 row strips of 8 rows each
+.loopH:
+ mov r9d, 4  ; 4 column tiles of 16 pixels each
+.loopW:
+ PSY_PP_INPUT_AVX512_MAIN
+ PSY_PP_16x8_AVX512_MAIN
+
+ paddd m13, m16  ; accumulate 16x8 tile cost
+ add r0, 16  ; step right one 16-pixel tile
+ add r2, 16
+ dec r9d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 64]  ; next strip: down 8 rows, rewind the 4*16 column advance
+ lea r2, [r2 + r3 * 8 - 64]
+ dec r8d
+ jnz .loopH
+ movd eax, xm13  ; return total psy cost
+ RET
+%endif
%endif
;---------------------------------------------------------------------------------------------------------------------
More information about the x265-devel
mailing list