[x265] [PATCH 053 of 307] x86: AVX512 pixel_add_ps_32xN for high bit depth
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:51 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1501571354 -19800
# Tue Aug 01 12:39:14 2017 +0530
# Node ID f8687bef93f25b343606e42f4fd252d5f0897d1a
# Parent 05972a61eb1aeac474ecc0d0150671e879177112
x86: AVX512 pixel_add_ps_32xN for high bit depth
AVX2 performance   : 12.77x
AVX512 performance : 21.54x
This patch also cleans up the low bit depth code.
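
For reference, add_ps adds an int16_t residual block to the pixel
prediction and clips the sum back to the valid pixel range. Below is a
minimal C++ sketch of the operation; the helper name, the explicit
maxPixel parameter, and the fixed uint16_t pixel type are illustrative
assumptions, not x265's actual C reference:

    #include <cstdint>

    /* In x265, "pixel" is uint8_t in 8-bit builds and uint16_t in high
     * bit depth builds; uint16_t is assumed here since this patch
     * targets HBD. */
    typedef uint16_t pixel;

    /* Sketch of add_ps for a bx-by-by block: add the residual to the
     * prediction and clip each sum to [0, maxPixel]. */
    template<int bx, int by>
    static void pixel_add_ps_ref(pixel* dst, intptr_t dstStride,
                                 const pixel* src0, const int16_t* src1,
                                 intptr_t srcStride0, intptr_t srcStride1,
                                 int maxPixel)
    {
        for (int y = 0; y < by; y++)
        {
            for (int x = 0; x < bx; x++)
            {
                int sum = src0[x] + src1[x];
                dst[x] = (pixel)(sum < 0 ? 0 : sum > maxPixel ? maxPixel : sum);
            }
            dst  += dstStride;
            src0 += srcStride0;
            src1 += srcStride1;
        }
    }

For a 10-bit 32x32 block this would be called as
pixel_add_ps_ref<32, 32>(dst, 32, pred, resi, 32, 32, 1023); the AVX512
kernels below compute the same result 32 pixels per zmm register at a time.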
diff -r 05972a61eb1a -r f8687bef93f2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Aug 01 10:56:55 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Aug 01 12:39:14 2017 +0530
@@ -2198,6 +2198,9 @@
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512);
p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512);
+ p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx512);
// 64 X N
p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
@@ -3893,8 +3896,8 @@
p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512);
p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx2);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx512);
p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx512);
p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
diff -r 05972a61eb1a -r f8687bef93f2 source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm Tue Aug 01 10:56:55 2017 +0530
+++ b/source/common/x86/pixeladd8.asm Tue Aug 01 12:39:14 2017 +0530
@@ -769,132 +769,6 @@
PIXEL_ADD_PS_W32_H4_avx2 64
;-----------------------------------------------------------------------------
-; void pixel_add_ps_32x32(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
-;-----------------------------------------------------------------------------
-%macro PROCESS_ADD_PS_32x8_AVX512 0
- pmovzxbw m0, [r2] ; row 0 of src0
- movu m1, [r3] ; row 0 of src1
- pmovzxbw m2, [r2 + r4] ; row 1 of src0
- movu m3, [r3 + r5] ; row 1 of src1
- pmovzxbw m4, [r2 + r4 * 2] ; row 2 of src0
- movu m5, [r3 + r5 * 2] ; row 2 of src1
- pmovzxbw m6, [r2 + r7] ; row 3 of src0
- movu m7, [r3 + r8] ; row 3 of src1
-
- paddw m0, m1
- paddw m2, m3
- paddw m4, m5
- paddw m6, m7
- packuswb m0, m2
- packuswb m4, m6
- vpermq m0, m0, 11011000b
- vpermq m4, m4, 11011000b
- vshufi64x2 m0, m0, 11011000b
- vshufi64x2 m4, m4, 11011000b
- movu [r0], ym0 ; row 0 of dst
- movu [r0 + r1 * 2], ym4 ; row 2 of dst
- vshufi64x2 m0, m0, 01001110b
- vshufi64x2 m4, m4, 01001110b
- movu [r0 + r1], ym0 ; row 1 of dst
- movu [r0 + r9], ym4 ; row 3 of dst
-
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r5 * 4]
- lea r0, [r0 + r1 * 4]
-
- pmovzxbw m0, [r2] ; row 4 of src0
- movu m1, [r3] ; row 4 of src1
- pmovzxbw m2, [r2 + r4] ; row 5 of src0
- movu m3, [r3 + r5] ; row 5 of src1
- pmovzxbw m4, [r2 + r4 * 2] ; row 6 of src0
- movu m5, [r3 + r5 * 2] ; row 6 of src1
- pmovzxbw m6, [r2 + r7] ; row 7 of src0
- movu m7, [r3 + r8] ; row 7 of src1
-
- paddw m0, m1
- paddw m2, m3
- paddw m4, m5
- paddw m6, m7
- packuswb m0, m2
- packuswb m4, m6
- vpermq m0, m0, 11011000b
- vpermq m4, m4, 11011000b
- vshufi64x2 m0, m0, 11011000b
- vshufi64x2 m4, m4, 11011000b
- movu [r0], ym0 ; row 4 of dst
- movu [r0 + r1 * 2], ym4 ; row 6 of dst
- vshufi64x2 m0, m0, 01001110b
- vshufi64x2 m4, m4, 01001110b
- movu [r0 + r1], ym0 ; row 5 of dst
- movu [r0 + r9], ym4 ; row 7 of dst
-%endmacro
-
-
-%if HIGH_BIT_DEPTH==0
-%if ARCH_X86_64
-INIT_ZMM avx512
-cglobal pixel_add_ps_32x32, 6, 10, 8
- add r5, r5
- lea r7, [r4 * 3]
- lea r8, [r5 * 3]
- lea r9, [r1 * 3]
-
- PROCESS_ADD_PS_32x8_AVX512
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r5 * 4]
- lea r0, [r0 + r1 * 4]
- PROCESS_ADD_PS_32x8_AVX512
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r5 * 4]
- lea r0, [r0 + r1 * 4]
- PROCESS_ADD_PS_32x8_AVX512
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r5 * 4]
- lea r0, [r0 + r1 * 4]
- PROCESS_ADD_PS_32x8_AVX512
- RET
-
-INIT_ZMM avx512
-cglobal pixel_add_ps_32x64, 6, 10, 8
- add r5, r5
- lea r7, [r4 * 3]
- lea r8, [r5 * 3]
- lea r9, [r1 * 3]
-
- PROCESS_ADD_PS_32x8_AVX512
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r5 * 4]
- lea r0, [r0 + r1 * 4]
- PROCESS_ADD_PS_32x8_AVX512
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r5 * 4]
- lea r0, [r0 + r1 * 4]
- PROCESS_ADD_PS_32x8_AVX512
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r5 * 4]
- lea r0, [r0 + r1 * 4]
- PROCESS_ADD_PS_32x8_AVX512
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r5 * 4]
- lea r0, [r0 + r1 * 4]
- PROCESS_ADD_PS_32x8_AVX512
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r5 * 4]
- lea r0, [r0 + r1 * 4]
- PROCESS_ADD_PS_32x8_AVX512
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r5 * 4]
- lea r0, [r0 + r1 * 4]
- PROCESS_ADD_PS_32x8_AVX512
- lea r2, [r2 + r4 * 4]
- lea r3, [r3 + r5 * 4]
- lea r0, [r0 + r1 * 4]
- PROCESS_ADD_PS_32x8_AVX512
- RET
-%endif
-%endif
-
-;-----------------------------------------------------------------------------
; void pixel_add_ps_64x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W64_H2 2
@@ -1272,7 +1146,7 @@
%endif
;-----------------------------------------------------------------------------
-; pixel_add_ps_64x64 avx512 code start
+; pixel_add_ps avx512 code start
;-----------------------------------------------------------------------------
%macro PROCESS_ADD_PS_64x8_AVX512 0
pmovzxbw m0, [r2]
@@ -1553,6 +1427,250 @@
RET
%endif
%endif
+
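+; PROCESS_ADD_PS_32x8_AVX512 (low bit depth): pmovzxbw widens each 32-byte
+; pixel row to 32 words in a zmm, paddw adds the int16_t residual, and
+; packuswb saturates back to bytes. packuswb packs within 128-bit lanes, so
+; the vpermq/vshufi64x2 pair reorders the result to make each output row
+; contiguous before the ymm-sized stores.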
+%macro PROCESS_ADD_PS_32x8_AVX512 0
+ pmovzxbw m0, [r2]
+ movu m1, [r3]
+ pmovzxbw m2, [r2 + r4]
+ movu m3, [r3 + r5]
+ paddw m0, m1
+ paddw m2, m3
+ packuswb m0, m2
+ vpermq m0, m0, 11011000b
+ vshufi64x2 m0, m0, 11011000b
+ movu [r0], ym0
+ vshufi64x2 m0, m0, 01001110b
+ movu [r0 + r1], ym0
+
+ pmovzxbw m0, [r2 + r4 * 2]
+ movu m1, [r3 + r5 * 2]
+ pmovzxbw m2, [r2 + r6]
+ movu m3, [r3 + r7]
+ paddw m0, m1
+ paddw m2, m3
+ packuswb m0, m2
+ vpermq m0, m0, 11011000b
+ vshufi64x2 m0, m0, 11011000b
+ movu [r0 + r1 * 2], ym0
+ vshufi64x2 m0, m0, 01001110b
+ movu [r0 + r8], ym0
+
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+
+ pmovzxbw m0, [r2]
+ movu m1, [r3]
+ pmovzxbw m2, [r2 + r4]
+ movu m3, [r3 + r5]
+ paddw m0, m1
+ paddw m2, m3
+ packuswb m0, m2
+ vpermq m0, m0, 11011000b
+ vshufi64x2 m0, m0, 11011000b
+ movu [r0], ym0
+ vshufi64x2 m0, m0, 01001110b
+ movu [r0 + r1], ym0
+
+ pmovzxbw m0, [r2 + r4 * 2]
+ movu m1, [r3 + r5 * 2]
+ pmovzxbw m2, [r2 + r6]
+ movu m3, [r3 + r7]
+ paddw m0, m1
+ paddw m2, m3
+ packuswb m0, m2
+ vpermq m0, m0, 11011000b
+ vshufi64x2 m0, m0, 11011000b
+ movu [r0 + r1 * 2], ym0
+ vshufi64x2 m0, m0, 01001110b
+ movu [r0 + r8], ym0
+%endmacro
+
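+; PROCESS_ADD_PS_32x8_HBD_AVX512 (high bit depth): pixels are already 16-bit,
+; so each 32-pixel row fills a zmm directly; after paddw with the residual,
+; CLIPW2 clamps both rows to [m4, m5] = [0, pw_pixel_max] before the stores.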
+%macro PROCESS_ADD_PS_32x8_HBD_AVX512 0
+ movu m0, [r2]
+ movu m1, [r2 + r4]
+ movu m2, [r3]
+ movu m3, [r3 + r5]
+ paddw m0, m2
+ paddw m1, m3
+
+ CLIPW2 m0, m1, m4, m5
+ movu [r0], m0
+ movu [r0 + r1], m1
+
+ movu m0, [r2 + r4 * 2]
+ movu m1, [r2 + r6]
+ movu m2, [r3 + r5 * 2]
+ movu m3, [r3 + r7]
+ paddw m0, m2
+ paddw m1, m3
+
+ CLIPW2 m0, m1, m4, m5
+ movu [r0 + r1 * 2], m0
+ movu [r0 + r8], m1
+
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+
+ movu m0, [r2]
+ movu m1, [r2 + r4]
+ movu m2, [r3]
+ movu m3, [r3 + r5]
+ paddw m0, m2
+ paddw m1, m3
+
+ CLIPW2 m0, m1, m4, m5
+ movu [r0], m0
+ movu [r0 + r1], m1
+
+ movu m0, [r2 + r4 * 2]
+ movu m1, [r2 + r6]
+ movu m2, [r3 + r5 * 2]
+ movu m3, [r3 + r7]
+ paddw m0, m2
+ paddw m1, m3
+
+ CLIPW2 m0, m1, m4, m5
+ movu [r0 + r1 * 2], m0
+ movu [r0 + r8], m1
+%endmacro
;-----------------------------------------------------------------------------
-; pixel_add_ps_64x64 avx512 code end
+; void pixel_add_ps_32xN(pixel *dest, intptr_t dstStride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
;-----------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH
+%if ARCH_X86_64
+INIT_ZMM avx512
+cglobal pixel_add_ps_32x32, 6, 9, 6
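+    ; CLIPW2 bounds: m4 = 0, m5 = pw_pixel_max; all strides are doubled
+    ; below because HBD pixels and residuals are both 16-bit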
+ vbroadcasti32x8 m5, [pw_pixel_max]
+ pxor m4, m4
+ add r4d, r4d
+ add r5d, r5d
+ add r1d, r1d
+ lea r6, [r4 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+
+ PROCESS_ADD_PS_32x8_HBD_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_HBD_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_HBD_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_HBD_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal pixel_add_ps_32x64, 6, 9, 6
+ vbroadcasti32x8 m5, [pw_pixel_max]
+ pxor m4, m4
+ add r4d, r4d
+ add r5d, r5d
+ add r1d, r1d
+ lea r6, [r4 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+
+ PROCESS_ADD_PS_32x8_HBD_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_HBD_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_HBD_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_HBD_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_HBD_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_HBD_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_HBD_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_HBD_AVX512
+ RET
+%endif
+%else
+%if ARCH_X86_64
+INIT_ZMM avx512
+cglobal pixel_add_ps_32x32, 6, 9, 4
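+    ; only r5 (the int16_t src1 stride) is doubled; src0 and dst are 8-bit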
+ add r5, r5
+ lea r6, [r4 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+
+ PROCESS_ADD_PS_32x8_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal pixel_add_ps_32x64, 6, 9, 4
+ add r5, r5
+ lea r6, [r4 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+
+ PROCESS_ADD_PS_32x8_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
+ PROCESS_ADD_PS_32x8_AVX512
+ RET
+%endif
+%endif
+;-----------------------------------------------------------------------------
+; pixel_add_ps avx512 code end
+;-----------------------------------------------------------------------------