[x265] [PATCH 022 of 307] x86: AVX512 pixel_add_ps_32xN
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:20 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499924389 -19800
# Thu Jul 13 11:09:49 2017 +0530
# Node ID c1b7926fb590752578aa8cd17f4b86a7f743791b
# Parent 238c5ee9ad24dc6b283bb399eb013d937bc9ac1e
x86: AVX512 pixel_add_ps_32xN
AVX2 performance: 14.81x
AVX512 performance: 18.01x
diff -r 238c5ee9ad24 -r c1b7926fb590 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jul 13 10:49:56 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Jul 13 11:09:49 2017 +0530
@@ -3806,6 +3806,9 @@
p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512);
p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512);
+ p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
+ // NOTE(review): this patch adds both pixel_add_ps_32x32_avx512 and
+ // pixel_add_ps_32x64_avx512 to pixeladd8.asm, yet the two chroma entries
+ // below still bind the AVX2 kernels, leaving the new 32x64 AVX512 kernel
+ // unreferenced. Presumably these were meant to be _avx512 — confirm intent.
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx2);
}
#endif
diff -r 238c5ee9ad24 -r c1b7926fb590 source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm Thu Jul 13 10:49:56 2017 +0530
+++ b/source/common/x86/pixeladd8.asm Thu Jul 13 11:09:49 2017 +0530
@@ -768,6 +768,131 @@
PIXEL_ADD_PS_W32_H4_avx2 32
PIXEL_ADD_PS_W32_H4_avx2 64
+;-----------------------------------------------------------------------------
+; void pixel_add_ps_32x32(pixel *dest, intptr_t destride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
+;-----------------------------------------------------------------------------
+; Process 8 rows of a 32-wide block: dst = clip_u8(src0 + src1). 8bpp only
+; (guarded by HIGH_BIT_DEPTH==0 at the instantiation sites below).
+;
+; Register preconditions (established by the callers below):
+;   r0 = dst    r1 = dstStride (bytes)                r9 = dstStride * 3
+;   r2 = src0   r4 = srcStride0 (bytes, 8-bit pixels) r7 = srcStride0 * 3
+;   r3 = src1   r5 = srcStride1 (bytes; caller already doubled the
+;                    int16_t element stride)          r8 = srcStride1 * 3
+;
+; Advances r0/r2/r3 by 4 rows midway, so on exit they point at row 4 of the
+; 8 rows just processed; callers step a further 4 rows between invocations.
+%macro PROCESS_ADD_PS_32x8_AVX512 0
+ pmovzxbw m0, [r2] ; row 0 of src0, widened u8 -> u16
+ movu m1, [r3] ; row 0 of src1
+ pmovzxbw m2, [r2 + r4] ; row 1 of src0
+ movu m3, [r3 + r5] ; row 1 of src1
+ pmovzxbw m4, [r2 + r4 * 2] ; row 2 of src0
+ movu m5, [r3 + r5 * 2] ; row 2 of src1
+ pmovzxbw m6, [r2 + r7] ; row 3 of src0
+ movu m7, [r3 + r8] ; row 3 of src1
+
+ paddw m0, m1
+ paddw m2, m3
+ paddw m4, m5
+ paddw m6, m7
+ ; packuswb saturates word -> byte but interleaves the two source rows per
+ ; 128-bit lane. The vpermq (qwords within each 256-bit half) followed by
+ ; vshufi64x2 (128-bit lanes, pattern [0,2,1,3]) undo that interleave so
+ ; that the low 256 bits hold the even row and the high 256 bits the odd row.
+ packuswb m0, m2
+ packuswb m4, m6
+ vpermq m0, m0, 11011000b
+ vpermq m4, m4, 11011000b
+ vshufi64x2 m0, m0, 11011000b
+ vshufi64x2 m4, m4, 11011000b
+ movu [r0], ym0 ; row 0 of dst
+ movu [r0 + r1 * 2], ym4 ; row 2 of dst
+ ; swap the 256-bit halves ([2,3,0,1]) to expose the odd rows for storing
+ vshufi64x2 m0, m0, 01001110b
+ vshufi64x2 m4, m4, 01001110b
+ movu [r0 + r1], ym0 ; row 1 of dst
+ movu [r0 + r9], ym4 ; row 3 of dst
+
+ ; advance all three pointers past the 4 rows just written
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+
+ ; rows 4-7: identical sequence on the advanced pointers
+ pmovzxbw m0, [r2] ; row 4 of src0
+ movu m1, [r3] ; row 4 of src1
+ pmovzxbw m2, [r2 + r4] ; row 5 of src0
+ movu m3, [r3 + r5] ; row 5 of src1
+ pmovzxbw m4, [r2 + r4 * 2] ; row 6 of src0
+ movu m5, [r3 + r5 * 2] ; row 6 of src1
+ pmovzxbw m6, [r2 + r7] ; row 7 of src0
+ movu m7, [r3 + r8] ; row 7 of src1
+
+ paddw m0, m1
+ paddw m2, m3
+ paddw m4, m5
+ paddw m6, m7
+ packuswb m0, m2
+ packuswb m4, m6
+ vpermq m0, m0, 11011000b
+ vpermq m4, m4, 11011000b
+ vshufi64x2 m0, m0, 11011000b
+ vshufi64x2 m4, m4, 11011000b
+ movu [r0], ym0 ; row 4 of dst
+ movu [r0 + r1 * 2], ym4 ; row 6 of dst
+ vshufi64x2 m0, m0, 01001110b
+ vshufi64x2 m4, m4, 01001110b
+ movu [r0 + r1], ym0 ; row 5 of dst
+ movu [r0 + r9], ym4 ; row 7 of dst
+%endmacro
+
+
+%if HIGH_BIT_DEPTH==0
+%if ARCH_X86_64
+;-----------------------------------------------------------------------------
+; void pixel_add_ps_32x32(pixel *dst, intptr_t dstStride, pixel *src0,
+;                         int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
+; 8bpp, x86-64 only. 6 GPR args, 10 GPRs, 8 vector regs.
+; 32 rows = 4 macro calls of 8 rows each; the macro leaves the pointers at
+; its row 4, so each inter-call lea block advances the remaining 4 rows.
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal pixel_add_ps_32x32, 6, 10, 8
+ add r5, r5 ; src1 stride is in int16_t elements: convert to bytes
+ lea r7, [r4 * 3] ; srcStride0 * 3, used for the macro's row-3/7 loads
+ lea r8, [r5 * 3] ; srcStride1 * 3
+ lea r9, [r1 * 3] ; dstStride * 3
+
+ PROCESS_ADD_PS_32x8_AVX512 ; rows 0-7
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512 ; rows 8-15
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512 ; rows 16-23
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512 ; rows 24-31
+ RET
+
+;-----------------------------------------------------------------------------
+; void pixel_add_ps_32x64(pixel *dst, intptr_t dstStride, pixel *src0,
+;                         int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
+; 8bpp, x86-64 only. Same scheme as pixel_add_ps_32x32 above but with
+; 8 macro calls of 8 rows each to cover 64 rows.
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal pixel_add_ps_32x64, 6, 10, 8
+ add r5, r5 ; src1 stride is in int16_t elements: convert to bytes
+ lea r7, [r4 * 3] ; srcStride0 * 3
+ lea r8, [r5 * 3] ; srcStride1 * 3
+ lea r9, [r1 * 3] ; dstStride * 3
+
+ PROCESS_ADD_PS_32x8_AVX512 ; rows 0-7
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512 ; rows 8-15
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512 ; rows 16-23
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512 ; rows 24-31
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512 ; rows 32-39
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512 ; rows 40-47
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512 ; rows 48-55
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512 ; rows 56-63
+ RET
+%endif
+%endif
;-----------------------------------------------------------------------------
; void pixel_add_ps_64x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
More information about the x265-devel
mailing list