[x265] [PATCH 241 of 307] x86: AVX512 pixel_satd_64xN and 32xN for high bit depth
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:33:59 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar <vignesh at multicorewareinc.com>
# Date 1512622521 -19800
# Thu Dec 07 10:25:21 2017 +0530
# Node ID 9bd38bd06850914d1cbf617063ea0e1e60f66219
# Parent 2d298099a8d6b266a32b975de4b6a369988d3887
x86: AVX512 pixel_satd_64xN and 32xN for high bit depth
Size | AVX2 performance | AVX512 performance
----------------------------------------------
32x8 | 10.99x | 17.98x
32x16 | 12.18x | 17.05x
32x24 | 13.11x | 19.70x
32x32 | 13.21x | 18.36x
32x64 | 13.27x | 19.04x
64x16 | 12.36x | 17.15x
64x32 | 11.63x | 17.78x
64x48 | 12.00x | 19.23x
64x64 | 12.12x | 19.20x
diff -r 2d298099a8d6 -r 9bd38bd06850 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Dec 06 10:53:15 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Dec 07 10:25:21 2017 +0530
@@ -3015,6 +3015,24 @@
//Luma_hps_48x64
p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512);
+ p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx512);
+ p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx512);
+ p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx512);
+ p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx512);
+ p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx512);
+ p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_avx512);
+ p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx512);
+ p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx512);
+ p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx512);
+
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
+
}
#endif
}
diff -r 2d298099a8d6 -r 9bd38bd06850 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Dec 06 10:53:15 2017 +0530
+++ b/source/common/x86/pixel-a.asm Thu Dec 07 10:25:21 2017 +0530
@@ -13958,6 +13958,192 @@
paddd xm6, xm7
movd eax, xm6
RET
+
+%macro SATD_HBD_AVX512_END 0 ; horizontal reduction: sum all 16 dword lanes of m6 into eax
+ vextracti32x8 ym7, m6, 1 ; ym7 = upper 256 bits of zmm accumulator
+ paddd ym6, ym7 ; fold 512 -> 256 bits
+ vextracti128 xm7, ym6, 1 ; xm7 = upper 128 bits
+ paddd xm6, xm7 ; fold 256 -> 128 bits
+ pxor xm7, xm7
+ movhlps xm7, xm6 ; move upper qword down
+ paddd xm6, xm7 ; fold 128 -> 64 bits
+ pshufd xm7, xm6, 1 ; move lane 1 down to lane 0
+ paddd xm6, xm7 ; final fold: total in lane 0
+ movd eax, xm6 ; return SATD cost in eax
+%endmacro
+
+%macro PROCESS_SATD_32x8_HBD_AVX512 0 ; function to compute satd cost for 32 columns, 8 rows
+ ; rows 0-3
+ movu m0, [r0] ; 32 x 16-bit src pixels, row 0
+ movu m4, [r2] ; 32 x 16-bit ref pixels, row 0
+ psubw m0, m4 ; m0 = src - ref, row 0
+ movu m1, [r0 + r1]
+ movu m5, [r2 + r3]
+ psubw m1, m5 ; m1 = diff, row 1
+ movu m2, [r0 + r1 * 2]
+ movu m4, [r2 + r3 * 2]
+ psubw m2, m4 ; m2 = diff, row 2
+ movu m3, [r0 + r4] ; r4 = 3 * byte stride of src
+ movu m5, [r2 + r5] ; r5 = 3 * byte stride of ref
+ psubw m3, m5 ; m3 = diff, row 3
+ lea r0, [r0 + r1 * 4] ; advance src pointer by 4 rows
+ lea r2, [r2 + r3 * 4] ; advance ref pointer by 4 rows
+ paddw m4, m0, m1 ; Hadamard butterfly stage 1 (rows 0/1)
+ psubw m1, m0
+ paddw m0, m2, m3 ; butterfly stage 1 (rows 2/3)
+ psubw m3, m2
+ punpckhwd m2, m4, m1 ; interleave words for next butterfly stage
+ punpcklwd m4, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ paddw m3, m4, m0 ; butterfly stage 2
+ psubw m0, m4
+ paddw m4, m2, m1
+ psubw m1, m2
+ punpckhdq m2, m3, m0 ; interleave dwords
+ punpckldq m3, m0
+ paddw m0, m3, m2 ; butterfly stage 3
+ psubw m2, m3
+ punpckhdq m3, m4, m1
+ punpckldq m4, m1
+ paddw m1, m4, m3
+ psubw m3, m4
+ punpckhqdq m4, m0, m1 ; interleave qwords for final stage
+ punpcklqdq m0, m1
+ pabsw m0, m0 ; absolute transform coefficients
+ pabsw m4, m4
+ pmaxsw m0, m0, m4 ; keep per-lane max of the abs pair
+ punpckhqdq m1, m2, m3
+ punpcklqdq m2, m3
+ pabsw m2, m2
+ pabsw m1, m1
+ pmaxsw m2, m1 ; keep per-lane max of the abs pair
+ pxor m7, m7 ; zero reg for word -> dword widening
+ mova m1, m0
+ punpcklwd m1, m7 ; zero-extend low words
+ paddd m6, m1 ; accumulate dword partial sums into m6
+ mova m1, m0
+ punpckhwd m1, m7 ; zero-extend high words
+ paddd m6, m1
+ pxor m7, m7
+ mova m1, m2
+ punpcklwd m1, m7
+ paddd m6, m1
+ mova m1, m2
+ punpckhwd m1, m7
+ paddd m6, m1
+ ; rows 4-7
+ movu m0, [r0] ; same load/transform/accumulate sequence as rows 0-3
+ movu m4, [r2]
+ psubw m0, m4
+ movu m1, [r0 + r1]
+ movu m5, [r2 + r3]
+ psubw m1, m5
+ movu m2, [r0 + r1 * 2]
+ movu m4, [r2 + r3 * 2]
+ psubw m2, m4
+ movu m3, [r0 + r4]
+ movu m5, [r2 + r5]
+ psubw m3, m5
+ lea r0, [r0 + r1 * 4] ; advance past rows 4-7 (pointers end 8 rows ahead)
+ lea r2, [r2 + r3 * 4]
+ paddw m4, m0, m1 ; butterfly stage 1
+ psubw m1, m0
+ paddw m0, m2, m3
+ psubw m3, m2
+ punpckhwd m2, m4, m1
+ punpcklwd m4, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ paddw m3, m4, m0 ; butterfly stage 2
+ psubw m0, m4
+ paddw m4, m2, m1
+ psubw m1, m2
+ punpckhdq m2, m3, m0
+ punpckldq m3, m0
+ paddw m0, m3, m2 ; butterfly stage 3
+ psubw m2, m3
+ punpckhdq m3, m4, m1
+ punpckldq m4, m1
+ paddw m1, m4, m3
+ psubw m3, m4
+ punpckhqdq m4, m0, m1
+ punpcklqdq m0, m1
+ pabsw m0, m0
+ pabsw m4, m4
+ pmaxsw m0, m0, m4
+ punpckhqdq m1, m2, m3
+ punpcklqdq m2, m3
+ pabsw m2, m2
+ pabsw m1, m1
+ pmaxsw m2, m1
+ pxor m7, m7 ; widen and accumulate as above
+ mova m1, m0
+ punpcklwd m1, m7
+ paddd m6, m1
+ mova m1, m0
+ punpckhwd m1, m7
+ paddd m6, m1
+ pxor m7, m7
+ mova m1, m2
+ punpcklwd m1, m7
+ paddd m6, m1
+ mova m1, m2
+ punpckhwd m1, m7
+ paddd m6, m1
+%endmacro
+
+%macro SATD_32xN_HBD_AVX512 1 ; %1 = block height N; emits pixel_satd_32xN_avx512
+INIT_ZMM avx512
+cglobal pixel_satd_32x%1, 4,8,8 ; args: pix1, stride1, pix2, stride2 (strides in pixels)
+ add r1d, r1d ; pixel stride -> byte stride (HBD: 2 bytes/pixel)
+ add r3d, r3d
+ lea r4, [3 * r1] ; r4 = 3 * stride1, addresses row 3 of each 4-row group
+ lea r5, [3 * r3] ; r5 = 3 * stride2
+ pxor m6, m6 ; m6 = dword SATD accumulator
+ mov r6, r0 ; NOTE(review): r6/r7 look unused in this 32xN variant - confirm
+ mov r7, r2
+
+%rep %1/8 ; cover N rows in 8-row chunks
+ PROCESS_SATD_32x8_HBD_AVX512
+%endrep
+ SATD_HBD_AVX512_END ; reduce m6 lanes into eax (return value)
+ RET
+%endmacro
+
+SATD_32xN_HBD_AVX512 8
+SATD_32xN_HBD_AVX512 16
+SATD_32xN_HBD_AVX512 24
+SATD_32xN_HBD_AVX512 32
+SATD_32xN_HBD_AVX512 64
+
+%macro SATD_64xN_HBD_AVX512 1 ; %1 = block height N; emits pixel_satd_64xN_avx512
+INIT_ZMM avx512
+cglobal pixel_satd_64x%1, 4,8,8 ; args: pix1, stride1, pix2, stride2 (strides in pixels)
+ add r1d, r1d ; pixel stride -> byte stride (HBD: 2 bytes/pixel)
+ add r3d, r3d
+ lea r4, [3 * r1] ; r4 = 3 * stride1
+ lea r5, [3 * r3] ; r5 = 3 * stride2
+ pxor m6, m6 ; m6 = dword SATD accumulator (shared by both halves)
+ mov r6, r0 ; save base pointers for the second 32-column half
+ mov r7, r2
+
+%rep %1/8 ; left 32 columns, N rows
+ PROCESS_SATD_32x8_HBD_AVX512
+%endrep
+ lea r0, [r6 + mmsize] ; right half: +mmsize(64) bytes = +32 HBD pixels
+ lea r2, [r7 + mmsize]
+%rep %1/8 ; right 32 columns, N rows
+ PROCESS_SATD_32x8_HBD_AVX512
+%endrep
+ SATD_HBD_AVX512_END ; reduce m6 lanes into eax (return value)
+ RET
+%endmacro
+
+SATD_64xN_HBD_AVX512 16
+SATD_64xN_HBD_AVX512 32
+SATD_64xN_HBD_AVX512 48
+SATD_64xN_HBD_AVX512 64
%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
More information about the x265-devel
mailing list