[x265] [PATCH 035 of 307] x86: AVX512 ssd_ss_32x32
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:33 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1500550192 -19800
# Thu Jul 20 16:59:52 2017 +0530
# Node ID 2eda6628c75302a10d59918a58740d6e27434293
# Parent 0320e60b3323546eb6767508f1c39cd088e9f03e
x86: AVX512 ssd_ss_32x32
AVX2 performance : 12.73x
AVX512 performance : 19.72x
diff -r 0320e60b3323 -r 2eda6628c753 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jul 20 10:56:37 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Jul 20 16:59:52 2017 +0530
@@ -3852,6 +3852,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512);
p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
+ p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
}
#endif
diff -r 0320e60b3323 -r 2eda6628c753 source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm Thu Jul 20 10:56:37 2017 +0530
+++ b/source/common/x86/ssd-a.asm Thu Jul 20 16:59:52 2017 +0530
@@ -1457,6 +1457,47 @@
paddd m5, m3
%endmacro
+%macro PROCESS_SSD_SS_32x8_AVX512 0 ; add the squared word differences of 8 rows ([r0] minus [r2]) into dword accumulators m4/m5
+ movu m0, [r0] ; rows 0-3 of the first source, one full m register per row; r1 is the row stride
+ movu m1, [r0 + r1]
+ movu m2, [r0 + 2 * r1]
+ movu m3, [r0 + r5] ; r5 must be preset by the caller (pixel_ssd_ss_32x32 sets it to 3 * r1)
+ ; subtract the matching rows of the second source: stride r3, with r6 preset by the caller to 3 * r3
+ psubw m0, [r2]
+ psubw m1, [r2 + r3]
+ psubw m2, [r2 + 2 * r3]
+ psubw m3, [r2 + r6]
+ pmaddwd m0, m0 ; square every word difference and add adjacent pairs into dwords
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ paddd m4, m0 ; rows 0 and 2 accumulate in m4, rows 1 and 3 in m5 (two separate running sums)
+ paddd m5, m1
+ paddd m4, m2
+ paddd m5, m3
+ ; step both source pointers down four rows for the second half of the group
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ ; rows 4-7: same load / subtract / square / accumulate sequence as above
+ movu m0, [r0]
+ movu m1, [r0 + r1]
+ movu m2, [r0 + 2 * r1]
+ movu m3, [r0 + r5]
+
+ psubw m0, [r2]
+ psubw m1, [r2 + r3]
+ psubw m2, [r2 + 2 * r3]
+ psubw m3, [r2 + r6]
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ paddd m4, m0
+ paddd m5, m1
+ paddd m4, m2
+ paddd m5, m3 ; on exit r0/r2 still point at row 4 of this group; the caller must add 4 more rows before the next group
+%endmacro ; overwrites m0-m3, updates m4/m5 and r0/r2; r1, r3, r5, r6 are only read
+
INIT_ZMM avx512
cglobal pixel_ssd_ss_64x64, 4,7,6
add r1d, r1d
@@ -1492,6 +1533,30 @@
HADDD m4, m0
movd eax, xm4
RET
+
+INIT_ZMM avx512 ; NOTE(review): presumably makes m0..m5 below 512-bit registers (32 words each) — confirm against x86inc
+cglobal pixel_ssd_ss_32x32, 4,7,6 ; as used below: r0/r2 = the two source pointers, r1/r3 = their row strides
+ add r1d, r1d ; double the stride — presumably element units to bytes for 16-bit samples; TODO confirm
+ add r3d, r3d ; NOTE(review): both adds are 32-bit, so strides are assumed to fit in 32 bits — verify against callers
+ lea r5, [r1 * 3] ; r5/r6 = offset of the fourth row, read by PROCESS_SSD_SS_32x8_AVX512
+ lea r6, [r3 * 3]
+ pxor m4, m4 ; clear the two dword accumulators the macro adds into
+ pxor m5, m5
+ ; four 8-row passes cover all 32 rows; each pass leaves r0/r2 four rows short, hence the lea pair after it
+ PROCESS_SSD_SS_32x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_SSD_SS_32x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_SSD_SS_32x8_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ PROCESS_SSD_SS_32x8_AVX512
+ paddd m4, m5 ; merge the two partial accumulators lane by lane
+ HADDD m4, m0 ; HADDD is defined outside this view; presumably a horizontal dword sum of m4 using m0 as scratch
+ movd eax, xm4 ; the low dword of the reduced sum becomes the 32-bit return value
+ RET
;-----------------------------------------------------------------------------
; ssd_ss avx512 code end
;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list