[x265] [PATCH 021 of 307] x86: AVX512 pixel_add_ps_64x64
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:19 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499923196 -19800
# Thu Jul 13 10:49:56 2017 +0530
# Node ID 238c5ee9ad24dc6b283bb399eb013d937bc9ac1e
# Parent a32718b2358bab3f19861d8402fe9adc8e312633
x86: AVX512 pixel_add_ps_64x64
AVX2 performance: 13.99x
AVX512 performance: 21.64x
diff -r a32718b2358b -r 238c5ee9ad24 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jul 11 12:24:29 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Jul 13 10:49:56 2017 +0530
@@ -3805,6 +3805,8 @@
p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512);
p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512);
+ p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512);
+
}
#endif
}
diff -r a32718b2358b -r 238c5ee9ad24 source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm Tue Jul 11 12:24:29 2017 +0530
+++ b/source/common/x86/pixeladd8.asm Thu Jul 13 10:49:56 2017 +0530
@@ -1145,3 +1145,147 @@
RET
%endif
+
+;-----------------------------------------------------------------------------
+; void pixel_add_ps_64x64(pixel *dest, intptr_t destStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
+;-----------------------------------------------------------------------------
+; PROCESS_ADD_PS_64x8_AVX512
+;   Processes eight rows of 64 pixels, two rows per step:
+;       dest[x] = unsigned-saturate-to-byte(src0[x] + src1[x])
+;   Register roles (argument order of pixel_add_ps_64x64):
+;       r0 = dest           r1 = dest row stride (bytes)
+;       r2 = src0 (bytes)   r4 = src0 row stride (bytes)
+;       r3 = src1 (words)   r5 = src1 row stride, used here as a byte offset
+;   Offsets +32 / +64 assume 64-byte vector registers (the caller uses INIT_ZMM).
+;   On exit r0/r2/r3 have been advanced by six rows, i.e. they address the
+;   last row pair that was written; the caller steps past it.
+;   Overwrites m0-m7.
+%macro PROCESS_ADD_PS_64x8_AVX512 0
+%assign %%pair 0
+%rep 4
+%if %%pair > 0
+    ; step all three pointers to the next pair of rows
+    lea         r2,             [r2 + r4 * 2]
+    lea         r3,             [r3 + r5 * 2]
+    lea         r0,             [r0 + r1 * 2]
+
+%endif
+    ; row A: widen 64 src0 bytes to words, fetch 64 src1 words
+    pmovzxbw    m0,             [r2]
+    pmovzxbw    m1,             [r2 + 32]
+    movu        m2,             [r3]
+    movu        m3,             [r3 + 64]
+    ; row B: same layout, one stride further down
+    pmovzxbw    m4,             [r2 + r4]
+    pmovzxbw    m5,             [r2 + r4 + 32]
+    movu        m6,             [r3 + r5]
+    movu        m7,             [r3 + r5 + 64]
+
+    ; 16-bit sums for both rows
+    paddw       m0,             m2
+    paddw       m1,             m3
+    paddw       m4,             m6
+    paddw       m5,             m7
+    ; narrow back to bytes with unsigned saturation; packuswb works per
+    ; 128-bit lane, so the two halves of each row come out interleaved
+    packuswb    m0,             m1
+    packuswb    m4,             m5
+    ; undo the interleave: first reorder qwords inside each 256-bit half,
+    ; then reorder the 128-bit lanes across the whole register
+    vpermq      m0,             m0, 11011000b
+    vpermq      m4,             m4, 11011000b
+    vshufi64x2  m0,             m0, 11011000b
+    vshufi64x2  m4,             m4, 11011000b
+    movu        [r0],           m0
+    movu        [r0 + r1],      m4
+%assign %%pair %%pair + 1
+%endrep
+%endmacro
+
+%if ARCH_X86_64
+%if HIGH_BIT_DEPTH==0
+; AVX-512 pixel_add_ps for a 64x64 block; assembled only when
+; ARCH_X86_64 is set and HIGH_BIT_DEPTH is 0.
+;   r0 = dest, r1 = dest stride, r2 = src0, r3 = src1,
+;   r4 = src0 stride, r5 = src1 stride
+; The block is handled as eight groups of eight rows. The row macro leaves
+; the pointers on the last row pair it wrote, so each group is followed by
+; one more two-row step, except the final group.
+; NOTE(review): 7 GPRs are requested but r6 is not referenced in the code
+; visible here - confirm whether 6 would suffice.
+INIT_ZMM avx512
+cglobal pixel_add_ps_64x64, 6, 7, 8
+    add         r5,             r5          ; src1 rows are int16_t: double the stride to get bytes
+%rep 7
+    PROCESS_ADD_PS_64x8_AVX512
+    lea         r2,             [r2 + r4 * 2]
+    lea         r3,             [r3 + r5 * 2]
+    lea         r0,             [r0 + r1 * 2]
+%endrep
+    PROCESS_ADD_PS_64x8_AVX512              ; rows 56-63, no pointer step needed afterwards
+    RET
+
+%endif
+%endif
More information about the x265-devel
mailing list