[x265] [PATCH 231 of 307] x86: AVX512 ssd_ss_16x16
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:33:49 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1512371012 -19800
# Mon Dec 04 12:33:32 2017 +0530
# Node ID 465682e66d91ecf207feae78c33e32f0eaaf45c4
# Parent 4f690222337dbc1757665729ea15f2380a11c329
x86: AVX512 ssd_ss_16x16
AVX2 performance : 43.55x
AVX512 performance : 48.11x
This patch also cleans up the already existing ssd_ss AVX512 code
diff -r 4f690222337d -r 465682e66d91 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Dec 01 10:30:38 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Dec 04 12:33:32 2017 +0530
@@ -4743,6 +4743,7 @@
p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
+ p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_avx512);
p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx512);
diff -r 4f690222337d -r 465682e66d91 source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm Fri Dec 01 10:30:38 2017 +0530
+++ b/source/common/x86/ssd-a.asm Mon Dec 04 12:33:32 2017 +0530
@@ -1390,183 +1390,120 @@
;-----------------------------------------------------------------------------
; ssd_ss avx512 code start
;-----------------------------------------------------------------------------
-%macro PROCESS_SSD_SS_64x8_AVX512 0
+%if ARCH_X86_64
+%macro PROCESS_SSD_SS_64x4_AVX512 0
movu m0, [r0]
movu m1, [r0 + mmsize]
movu m2, [r0 + r1]
movu m3, [r0 + r1 + mmsize]
-
- psubw m0, [r2]
- psubw m1, [r2 + mmsize]
- psubw m2, [r2 + r3]
- psubw m3, [r2 + r3 + mmsize]
+ movu m4, [r2]
+ movu m5, [r2 + mmsize]
+ movu m6, [r2 + r3]
+ movu m7, [r2 + r3 + mmsize]
+
+ psubw m0, m4
+ psubw m1, m5
+ psubw m2, m6
+ psubw m3, m7
pmaddwd m0, m0
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
- paddd m4, m0
- paddd m5, m1
- paddd m4, m2
- paddd m5, m3
+ paddd m8, m0
+ paddd m8, m1
+ paddd m8, m2
+ paddd m8, m3
movu m0, [r0 + 2 * r1]
movu m1, [r0 + 2 * r1 + mmsize]
movu m2, [r0 + r5]
movu m3, [r0 + r5 + mmsize]
-
- psubw m0, [r2 + 2 * r3]
- psubw m1, [r2 + 2 * r3 + mmsize]
- psubw m2, [r2 + r6]
- psubw m3, [r2 + r6 + mmsize]
+ movu m4, [r2 + 2 * r3]
+ movu m5, [r2 + 2 * r3 + mmsize]
+ movu m6, [r2 + r6]
+ movu m7, [r2 + r6 + mmsize]
+
+ psubw m0, m4
+ psubw m1, m5
+ psubw m2, m6
+ psubw m3, m7
pmaddwd m0, m0
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
- paddd m4, m0
- paddd m5, m1
- paddd m4, m2
- paddd m5, m3
-
- lea r0, [r0 + 4 * r1]
- lea r2, [r2 + 4 * r3]
-
+ paddd m8, m0
+ paddd m8, m1
+ paddd m8, m2
+ paddd m8, m3
+%endmacro
+
+%macro PROCESS_SSD_SS_32x4_AVX512 0
movu m0, [r0]
- movu m1, [r0 + mmsize]
- movu m2, [r0 + r1]
- movu m3, [r0 + r1 + mmsize]
-
- psubw m0, [r2]
- psubw m1, [r2 + mmsize]
- psubw m2, [r2 + r3]
- psubw m3, [r2 + r3 + mmsize]
+ movu m1, [r0 + r1]
+ movu m2, [r0 + 2 * r1]
+ movu m3, [r0 + r5]
+ movu m4, [r2]
+ movu m5, [r2 + r3]
+ movu m6, [r2 + 2 * r3]
+ movu m7, [r2 + r6]
+
+ psubw m0, m4
+ psubw m1, m5
+ psubw m2, m6
+ psubw m3, m7
pmaddwd m0, m0
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
- paddd m4, m0
- paddd m5, m1
- paddd m4, m2
- paddd m5, m3
-
- movu m0, [r0 + 2 * r1]
- movu m1, [r0 + 2 * r1 + mmsize]
- movu m2, [r0 + r5]
- movu m3, [r0 + r5 + mmsize]
-
- psubw m0, [r2 + 2 * r3]
- psubw m1, [r2 + 2 * r3 + mmsize]
- psubw m2, [r2 + r6]
- psubw m3, [r2 + r6 + mmsize]
+ paddd m8, m0
+ paddd m8, m1
+ paddd m8, m2
+ paddd m8, m3
+%endmacro
+
+%macro PROCESS_SSD_SS_16x4_AVX512 0
+ movu ym0, [r0]
+ vinserti32x8 m0, [r0 + r1], 1
+ movu ym1, [r0 + 2 * r1]
+ vinserti32x8 m1, [r0 + r5], 1
+ movu ym4, [r2]
+ vinserti32x8 m4, [r2 + r3], 1
+ movu ym5, [r2 + 2 * r3]
+ vinserti32x8 m5, [r2 + r6], 1
+
+ psubw m0, m4
+ psubw m1, m5
pmaddwd m0, m0
pmaddwd m1, m1
- pmaddwd m2, m2
- pmaddwd m3, m3
- paddd m4, m0
- paddd m5, m1
- paddd m4, m2
- paddd m5, m3
+ paddd m8, m0
+ paddd m8, m1
%endmacro
-%macro PROCESS_SSD_SS_32x8_AVX512 0
- movu m0, [r0]
- movu m1, [r0 + r1]
- movu m2, [r0 + 2 * r1]
- movu m3, [r0 + r5]
-
- psubw m0, [r2]
- psubw m1, [r2 + r3]
- psubw m2, [r2 + 2 * r3]
- psubw m3, [r2 + r6]
- pmaddwd m0, m0
- pmaddwd m1, m1
- pmaddwd m2, m2
- pmaddwd m3, m3
- paddd m4, m0
- paddd m5, m1
- paddd m4, m2
- paddd m5, m3
-
- lea r0, [r0 + 4 * r1]
- lea r2, [r2 + 4 * r3]
-
- movu m0, [r0]
- movu m1, [r0 + r1]
- movu m2, [r0 + 2 * r1]
- movu m3, [r0 + r5]
-
- psubw m0, [r2]
- psubw m1, [r2 + r3]
- psubw m2, [r2 + 2 * r3]
- psubw m3, [r2 + r6]
- pmaddwd m0, m0
- pmaddwd m1, m1
- pmaddwd m2, m2
- pmaddwd m3, m3
- paddd m4, m0
- paddd m5, m1
- paddd m4, m2
- paddd m5, m3
-%endmacro
-
+%macro SSD_SS_AVX512 2
INIT_ZMM avx512
-cglobal pixel_ssd_ss_64x64, 4,7,6
+cglobal pixel_ssd_ss_%1x%2, 4,7,9
add r1d, r1d
add r3d, r3d
lea r5, [r1 * 3]
lea r6, [r3 * 3]
- pxor m4, m4
- pxor m5, m5
-
- PROCESS_SSD_SS_64x8_AVX512
+ pxor m8, m8
+
+%rep %2/4 - 1
+ PROCESS_SSD_SS_%1x4_AVX512
lea r0, [r0 + 4 * r1]
lea r2, [r2 + 4 * r3]
- PROCESS_SSD_SS_64x8_AVX512
- lea r0, [r0 + 4 * r1]
- lea r2, [r2 + 4 * r3]
- PROCESS_SSD_SS_64x8_AVX512
- lea r0, [r0 + 4 * r1]
- lea r2, [r2 + 4 * r3]
- PROCESS_SSD_SS_64x8_AVX512
- lea r0, [r0 + 4 * r1]
- lea r2, [r2 + 4 * r3]
- PROCESS_SSD_SS_64x8_AVX512
- lea r0, [r0 + 4 * r1]
- lea r2, [r2 + 4 * r3]
- PROCESS_SSD_SS_64x8_AVX512
- lea r0, [r0 + 4 * r1]
- lea r2, [r2 + 4 * r3]
- PROCESS_SSD_SS_64x8_AVX512
- lea r0, [r0 + 4 * r1]
- lea r2, [r2 + 4 * r3]
- PROCESS_SSD_SS_64x8_AVX512
- paddd m4, m5
- HADDD m4, m0
- movd eax, xm4
+%endrep
+ PROCESS_SSD_SS_%1x4_AVX512
+ HADDD m8, m0
+ movd eax, xm8
RET
-
-INIT_ZMM avx512
-cglobal pixel_ssd_ss_32x32, 4,7,6
- add r1d, r1d
- add r3d, r3d
- lea r5, [r1 * 3]
- lea r6, [r3 * 3]
- pxor m4, m4
- pxor m5, m5
-
- PROCESS_SSD_SS_32x8_AVX512
- lea r0, [r0 + 4 * r1]
- lea r2, [r2 + 4 * r3]
- PROCESS_SSD_SS_32x8_AVX512
- lea r0, [r0 + 4 * r1]
- lea r2, [r2 + 4 * r3]
- PROCESS_SSD_SS_32x8_AVX512
- lea r0, [r0 + 4 * r1]
- lea r2, [r2 + 4 * r3]
- PROCESS_SSD_SS_32x8_AVX512
- paddd m4, m5
- HADDD m4, m0
- movd eax, xm4
- RET
+%endmacro
+
+
+SSD_SS_AVX512 64, 64
+SSD_SS_AVX512 32, 32
+SSD_SS_AVX512 16, 16
+%endif
;-----------------------------------------------------------------------------
; ssd_ss avx512 code end
;-----------------------------------------------------------------------------
More information about the x265-devel mailing list