[x265] [PATCH 052 of 307] x86: AVX512 pixel_add_ps_64x64 for high bit depth
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:50 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1501565215 -19800
# Tue Aug 01 10:56:55 2017 +0530
# Node ID 05972a61eb1aeac474ecc0d0150671e879177112
# Parent 984cad60283b474ed756238cf904b08df290e103
x86: AVX512 pixel_add_ps_64x64 for high bit depth
AVX2 performance: 14.14x
AVX512 performance: 20.40x
diff -r 984cad60283b -r 05972a61eb1a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jul 25 16:37:38 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Aug 01 10:56:55 2017 +0530
@@ -2197,6 +2197,8 @@
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512);
+ p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512);
+
// 64 X N
p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx512);
diff -r 984cad60283b -r 05972a61eb1a source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm Tue Jul 25 16:37:38 2017 +0530
+++ b/source/common/x86/pixeladd8.asm Tue Aug 01 10:56:55 2017 +0530
@@ -1272,7 +1272,7 @@
%endif
;-----------------------------------------------------------------------------
-; void pixel_add_ps_64x64(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
+; pixel_add_ps_64x64 avx512 code start
;-----------------------------------------------------------------------------
%macro PROCESS_ADD_PS_64x8_AVX512 0
pmovzxbw m0, [r2]
@@ -1376,8 +1376,148 @@
movu [r0 + r1], m4
%endmacro
+%macro PROCESS_ADD_PS_64x8_HBD_AVX512 0
+ movu m0, [r2]
+ movu m1, [r2 + mmsize]
+ movu m2, [r3]
+ movu m3, [r3 + mmsize]
+ paddw m0, m2
+ paddw m1, m3
+
+ CLIPW2 m0, m1, m4, m5
+ movu [r0], m0
+ movu [r0 + mmsize], m1
+
+ movu m0, [r2 + r4]
+ movu m1, [r2 + r4 + mmsize]
+ movu m2, [r3 + r5]
+ movu m3, [r3 + r5 + mmsize]
+ paddw m0, m2
+ paddw m1, m3
+
+ CLIPW2 m0, m1, m4, m5
+ movu [r0 + r1], m0
+ movu [r0 + r1 + mmsize], m1
+
+ movu m0, [r2 + r4 * 2]
+ movu m1, [r2 + r4 * 2 + mmsize]
+ movu m2, [r3 + r5 * 2]
+ movu m3, [r3 + r5 * 2 + mmsize]
+ paddw m0, m2
+ paddw m1, m3
+
+ CLIPW2 m0, m1, m4, m5
+ movu [r0 + r1 * 2], m0
+ movu [r0 + r1 * 2 + mmsize], m1
+
+ movu m0, [r2 + r6]
+ movu m1, [r2 + r6 + mmsize]
+ movu m2, [r3 + r7]
+ movu m3, [r3 + r7 + mmsize]
+ paddw m0, m2
+ paddw m1, m3
+
+ CLIPW2 m0, m1, m4, m5
+ movu [r0 + r8], m0
+ movu [r0 + r8 + mmsize], m1
+
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+
+ movu m0, [r2]
+ movu m1, [r2 + mmsize]
+ movu m2, [r3]
+ movu m3, [r3 + mmsize]
+ paddw m0, m2
+ paddw m1, m3
+
+ CLIPW2 m0, m1, m4, m5
+ movu [r0], m0
+ movu [r0 + mmsize], m1
+
+ movu m0, [r2 + r4]
+ movu m1, [r2 + r4 + mmsize]
+ movu m2, [r3 + r5]
+ movu m3, [r3 + r5 + mmsize]
+ paddw m0, m2
+ paddw m1, m3
+
+ CLIPW2 m0, m1, m4, m5
+ movu [r0 + r1], m0
+ movu [r0 + r1 + mmsize], m1
+
+ movu m0, [r2 + r4 * 2]
+ movu m1, [r2 + r4 * 2 + mmsize]
+ movu m2, [r3 + r5 * 2]
+ movu m3, [r3 + r5 * 2 + mmsize]
+ paddw m0, m2
+ paddw m1, m3
+
+ CLIPW2 m0, m1, m4, m5
+ movu [r0 + r1 * 2], m0
+ movu [r0 + r1 * 2 + mmsize], m1
+
+ movu m0, [r2 + r6]
+ movu m1, [r2 + r6 + mmsize]
+ movu m2, [r3 + r7]
+ movu m3, [r3 + r7 + mmsize]
+ paddw m0, m2
+ paddw m1, m3
+
+ CLIPW2 m0, m1, m4, m5
+ movu [r0 + r8], m0
+ movu [r0 + r8 + mmsize], m1
+%endmacro
+;-----------------------------------------------------------------------------
+; void pixel_add_ps_64x64(pixel *dest, intptr_t dstStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
+;-----------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH
%if ARCH_X86_64
-%if HIGH_BIT_DEPTH==0
+INIT_ZMM avx512
+cglobal pixel_add_ps_64x64, 6, 9, 6
+ vbroadcasti32x8 m5, [pw_pixel_max]
+ pxor m4, m4
+ add r4d, r4d
+ add r5d, r5d
+ add r1d, r1d
+ lea r6, [r4 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+
+ PROCESS_ADD_PS_64x8_HBD_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_64x8_HBD_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_64x8_HBD_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_64x8_HBD_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_64x8_HBD_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_64x8_HBD_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_64x8_HBD_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_64x8_HBD_AVX512
+ RET
+%endif
+%else
+%if ARCH_X86_64
INIT_ZMM avx512
cglobal pixel_add_ps_64x64, 6, 7, 8
add r5, r5
@@ -1411,6 +1551,8 @@
lea r0, [r0 + r1 * 2]
PROCESS_ADD_PS_64x8_AVX512
RET
-
%endif
%endif
+;-----------------------------------------------------------------------------
+; pixel_add_ps_64x64 avx512 code end
+;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list