[x265] [PATCH 290 of 307] x86: AVX512 pixel_satd_32xN
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:48 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1515745345 -19800
# Fri Jan 12 13:52:25 2018 +0530
# Node ID 1c2875198a213a5f8d84bff57fcec15727f94a4f
# Parent d43237051962eab3cd761cf24f3971de09c07aa5
x86: AVX512 pixel_satd_32xN
Size | AVX2 performance | AVX512 performance
-----------------------------------------------
32x8 | 10.34x | 12.26x
32x16 | 10.21x | 12.40x
32x24 | 10.47x | 13.23x
32x32 | 10.55x | 12.46x
32x48 | 10.60x | 12.59x
32x64 | 10.56x | 12.65x
diff -r d43237051962 -r 1c2875198a21 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jan 11 14:31:13 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Jan 12 13:52:25 2018 +0530
@@ -5342,6 +5342,20 @@
p.cu[BLOCK_8x8].psyRdoQuant = PFX(psyRdoQuant8_avx512);
p.cu[BLOCK_16x16].psyRdoQuant = PFX(psyRdoQuant16_avx512);
p.cu[BLOCK_32x32].psyRdoQuant = PFX(psyRdoQuant32_avx512);
+ p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx512);
+ p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx512);
+ p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx512);
+ p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx512);
+ p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx512);
+
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
}
#endif
diff -r d43237051962 -r 1c2875198a21 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Thu Jan 11 14:31:13 2018 +0530
+++ b/source/common/x86/pixel-a.asm Fri Jan 12 13:52:25 2018 +0530
@@ -14068,8 +14068,102 @@
paddd xm0, xm1
movd eax, xm0
RET
+
+ %macro PROCESS_SATD_32x4_AVX512 0 ; SATD partial sums for one 32-column x 4-row strip; adds into m6, reads r0-r5, overwrites m0-m5 and m7
+ ; rows 0-3: each row is 32 bytes widened to words, then ([r0] row) - ([r2] row); r4/r5 are used as the row-3 offsets
+ pmovzxbw m0, [r0] ; row 0 from r0, bytes zero-extended to words
+ pmovzxbw m4, [r2] ; row 0 from r2
+ psubw m0, m4 ; m0 = d0 (row-0 difference)
+ pmovzxbw m1, [r0 + r1] ; row 1 from r0 (r1 = row stride for r0)
+ pmovzxbw m5, [r2 + r3] ; row 1 from r2 (r3 = row stride for r2)
+ psubw m1, m5 ; m1 = d1
+ pmovzxbw m2, [r0 + r1 * 2] ; row 2 from r0
+ pmovzxbw m4, [r2 + r3 * 2] ; row 2 from r2
+ psubw m2, m4 ; m2 = d2
+ pmovzxbw m3, [r0 + r4] ; row 3 from r0; the generated function sets r4 = 3 * r1 before invoking this macro
+ pmovzxbw m5, [r2 + r5] ; row 3 from r2; the generated function sets r5 = 3 * r3 before invoking this macro
+ psubw m3, m5 ; m3 = d3
+ paddw m4, m0, m1 ; first butterfly stage across rows: m4 = d0 + d1
+ psubw m1, m0 ; m1 = d1 - d0
+ paddw m0, m2, m3 ; m0 = d2 + d3
+ psubw m3, m2 ; m3 = d3 - d2
+ punpckhwd m2, m4, m1 ; interleave words of (d0+d1) and (d1-d0), high halves
+ punpcklwd m4, m1 ; same pair, low halves
+ punpckhwd m1, m0, m3 ; interleave words of (d2+d3) and (d3-d2), high halves
+ punpcklwd m0, m3 ; same pair, low halves
+ paddw m3, m4, m0 ; second butterfly stage on the low-half interleaves: sum
+ psubw m0, m4 ; ... and difference
+ paddw m4, m2, m1 ; second butterfly stage on the high-half interleaves: sum
+ psubw m1, m2 ; ... and difference
+ punpckhdq m2, m3, m0 ; regroup the low-half results as dword pairs, high halves
+ punpckldq m3, m0 ; ... low halves
+ paddw m0, m3, m2 ; third butterfly stage: sum
+ psubw m2, m3 ; ... and difference
+ punpckhdq m3, m4, m1 ; regroup the high-half results as dword pairs, high halves
+ punpckldq m4, m1 ; ... low halves
+ paddw m1, m4, m3 ; third butterfly stage: sum
+ psubw m3, m4 ; ... and difference
+ punpckhqdq m4, m0, m1 ; pair up the two sum vectors by qword, high halves
+ punpcklqdq m0, m1 ; ... low halves
+ pabsw m0, m0 ; absolute values of both operands ...
+ pabsw m4, m4
+ pmaxsw m0, m0, m4 ; ... then per-word max; presumably replaces the last butterfly via |a+b| + |a-b| = 2*max(|a|,|b|) - TODO confirm
+ punpckhqdq m1, m2, m3 ; pair up the two difference vectors by qword, high halves
+ punpcklqdq m2, m3 ; ... low halves
+ pabsw m2, m2 ; absolute values of both operands ...
+ pabsw m1, m1
+ pmaxsw m2, m1 ; ... then per-word max, same scheme as for m0 above
+ pxor m7, m7 ; m7 = 0, used to zero-extend words to dwords
+ mova m1, m0 ; copy so m0 survives the in-place unpack
+ punpcklwd m1, m7 ; low words of m0 -> dwords
+ paddd m6, m1 ; accumulate as dwords into m6
+ mova m1, m0
+ punpckhwd m1, m7 ; high words of m0 -> dwords
+ paddd m6, m1
+ pxor m7, m7 ; NOTE(review): m7 is not written since the previous pxor, so this re-zeroing looks redundant
+ mova m1, m2 ; copy so m2 survives the in-place unpack
+ punpcklwd m1, m7 ; low words of m2 -> dwords
+ paddd m6, m1
+ mova m1, m2
+ punpckhwd m1, m7 ; high words of m2 -> dwords
+ paddd m6, m1 ; m6 now includes this strip; it is not cleared here, the generated function zeroes it once up front
+ %endmacro
+
+ %macro SATD_MAIN_AVX512_END 0 ; horizontal reduction: sums all dword lanes of m6 into eax; overwrites m6, m7, rax, rdx
+ vextracti32x8 ym7, m6, 1 ; ym7 = upper 256 bits of m6
+ paddd ym6, ym7 ; fold 16 dwords down to 8
+ vextracti128 xm7, ym6, 1 ; xm7 = upper 128 bits of ym6
+ paddd xm6, xm6, xm7 ; fold 8 dwords down to 4
+ punpckhqdq xm7, xm6, xm6 ; bring the high qword of xm6 into the low qword of xm7
+ paddd xm6, xm7 ; fold 4 dwords down to 2 (in the low qword)
+ movq rax, xm6 ; rax = the two remaining dword sums
+ rorx rdx, rax, 32 ; rotate by 32 so the upper dword lands in edx (flags untouched)
+ add eax, edx ; eax = total of all dword lanes
+ %endmacro
+
+ %macro SATD_32xN_AVX512 1 ; emits pixel_satd_32x%1; %1 = block height in rows, consumed 4 rows at a time
+ INIT_ZMM avx512 ; m# registers below are 512-bit
+ cglobal pixel_satd_32x%1, 4,6,8 ; r0/r2 = the two block base pointers, r1/r3 = their row strides (as used below); counts presumably follow x86inc: 4 args, 6 GPRs, 8 vector regs
+ lea r4, [3 * r1] ; r4 = 3 * stride of the r0 block (row-3 offset)
+ lea r5, [3 * r3] ; r5 = 3 * stride of the r2 block (row-3 offset)
+ pxor m6, m6 ; clear the dword accumulator that every strip adds into
+ %rep %1/4 - 1
+ PROCESS_SATD_32x4_AVX512 ; one 4-row strip; unrolled %1/4 - 1 times by the surrounding %rep
+ lea r0, [r0 + 4 * r1] ; advance the r0 block by 4 rows
+ lea r2, [r2 + 4 * r3] ; advance the r2 block by 4 rows
+ %endrep
+ PROCESS_SATD_32x4_AVX512 ; final strip, no pointer advance needed afterwards
+ SATD_MAIN_AVX512_END ; reduce m6 to a single sum in eax
+ RET
+ %endmacro
+
+ SATD_32xN_AVX512 8 ; pixel_satd_32x8
+ SATD_32xN_AVX512 16 ; pixel_satd_32x16
+ SATD_32xN_AVX512 24 ; pixel_satd_32x24
+ SATD_32xN_AVX512 32 ; pixel_satd_32x32
+ SATD_32xN_AVX512 48 ; pixel_satd_32x48
+ SATD_32xN_AVX512 64 ; pixel_satd_32x64
%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
-
%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
INIT_YMM avx2
cglobal calc_satd_16x8 ; function to compute satd cost for 16 columns, 8 rows
More information about the x265-devel mailing list