[x265] [PATCH 250 of 307] x86: pixel_satd_16xN for high bit depth
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:08 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar <vignesh at multicorewareinc.com>
# Date 1512648175 -19800
# Thu Dec 07 17:32:55 2017 +0530
# Node ID 86d3d34de566d7696028b5e798a79b9de3a6e62b
# Parent 617aa7cf2c76368cb8a3b252175c1b3d6f716915
x86: pixel_satd_16xN for high bit depth
Size  | AVX2 performance | AVX512 performance
----------------------------------------------
16x8  |       9.62x      |       14.03x
16x16 |      12.07x      |       13.57x
16x32 |      12.82x      |       16.03x
16x64 |      12.92x      |       15.76x
This patch also cleans up the existing satd AVX512 code.
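
For context, satd is the sum of absolute transformed differences: the
residual between the two blocks is run through a 4x4 Hadamard transform
before the absolute values are summed. A minimal C++ sketch of the
quantity these kernels compute (illustrative only, not x265's exact C
reference; the final >>1 normalization is assumed from the usual
x264/x265 convention):

    #include <cstdint>
    #include <cstdlib>

    // Illustrative 4x4 SATD for a HIGH_BIT_DEPTH build (pixel == uint16_t).
    static int satd_4x4(const uint16_t *pix1, intptr_t stride1,
                        const uint16_t *pix2, intptr_t stride2)
    {
        int d[4][4], t[4][4], sum = 0;
        for (int i = 0; i < 4; i++)             // residual
            for (int j = 0; j < 4; j++)
                d[i][j] = pix1[i * stride1 + j] - pix2[i * stride2 + j];
        for (int i = 0; i < 4; i++)             // horizontal Hadamard
        {
            int s01 = d[i][0] + d[i][1], d01 = d[i][0] - d[i][1];
            int s23 = d[i][2] + d[i][3], d23 = d[i][2] - d[i][3];
            t[i][0] = s01 + s23; t[i][1] = s01 - s23;
            t[i][2] = d01 + d23; t[i][3] = d01 - d23;
        }
        for (int j = 0; j < 4; j++)             // vertical Hadamard + abs-sum
        {
            int s01 = t[0][j] + t[1][j], d01 = t[0][j] - t[1][j];
            int s23 = t[2][j] + t[3][j], d23 = t[2][j] - t[3][j];
            sum += std::abs(s01 + s23) + std::abs(s01 - s23)
                 + std::abs(d01 + d23) + std::abs(d01 - d23);
        }
        return sum >> 1;                        // assumed x265 convention
    }

    // A 16xN satd is then the sum of satd_4x4 over all 4x4 sub-blocks.
    static int satd_16xN(int N, const uint16_t *pix1, intptr_t stride1,
                         const uint16_t *pix2, intptr_t stride2)
    {
        int sum = 0;
        for (int y = 0; y < N; y += 4)
            for (int x = 0; x < 16; x += 4)
                sum += satd_4x4(pix1 + y * stride1 + x, stride1,
                                pix2 + y * stride2 + x, stride2);
        return sum;
    }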
diff -r 617aa7cf2c76 -r 86d3d34de566 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Dec 08 14:12:55 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Dec 07 17:32:55 2017 +0530
@@ -3026,7 +3026,10 @@
p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_avx512);
//Luma_hps_48x64
p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512);
-
+ p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx512);
+ p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx512);
+ p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_avx512);
+ p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_avx512);
p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx512);
p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx512);
p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx512);
@@ -3036,11 +3039,17 @@
p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx512);
p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx512);
p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx512);
-
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = PFX(pixel_satd_16x32_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
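
The assignments above follow x265's CPU-dispatch pattern: every primitive
starts as a C function pointer and is overwritten by the fastest
implementation the detected CPU supports. A rough sketch of the table
shape, with simplified names assumed for illustration (the real
definitions live in common/primitives.h and differ in detail):

    #include <cstdint>

    // Sketch only, not the real x265 definitions. In a HIGH_BIT_DEPTH
    // build pixel is 16-bit, and PFX() mangles the bit depth into the
    // symbol name, so these satd entries bind to the new HBD kernels.
    typedef uint16_t pixel;
    typedef int (*pixelcmp_t)(const pixel *fenc, intptr_t fencstride,
                              const pixel *fref, intptr_t frefstride);

    struct PUPrimitives { pixelcmp_t satd; /* sad, copy_pp, ... */ };

    struct EncoderPrimitives
    {
        PUPrimitives pu[25];   // one entry per luma PU size (count assumed)
        // chroma[csp].pu[...] tables elided
    };

    // Dispatch order (schematic): C first, then SSE/AVX2, then AVX-512,
    // so the hunk above only takes effect when the CPU reports AVX-512.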
diff -r 617aa7cf2c76 -r 86d3d34de566 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Fri Dec 08 14:12:55 2017 +0530
+++ b/source/common/x86/pixel-a.asm Thu Dec 07 17:32:55 2017 +0530
@@ -8227,7 +8227,7 @@
pmaxsw m%1, m%3
pmaxsw m%2, m%4
%endmacro
-
+%if HIGH_BIT_DEPTH==0
INIT_ZMM avx512
cglobal pixel_satd_16x8_internal
vbroadcasti64x4 m6, [hmul_16p]
@@ -8381,7 +8381,7 @@
SUMSUB_BA w, 0, 1, 2
HMAXABSW2 0, 1, 2, 3
SATD_AVX512_END 1
-
+%endif
; Input 10bit, Output 8bit
;------------------------------------------------------------------------------------------------------------------------
;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
@@ -13971,82 +13971,31 @@
paddd xm6, xm7
movd eax, xm6
%endmacro
-
-%macro PROCESS_SATD_32x8_HBD_AVX512 0 ; function to compute satd cost for 32 columns, 8 rows
+%macro PROCESS_SATD_16x8_HBD_AVX512 0 ; function to compute satd cost for 16 columns, 8 rows
; rows 0-3
- movu m0, [r0]
- movu m4, [r2]
+ lea r6, [r0 + r1 * 4]
+ lea r7, [r2 + r3 * 4]
+ movu ym0, [r0]
+ movu ym4, [r2]
+ vinserti32x8 m0, [r6], 1
+ vinserti32x8 m4, [r7], 1
psubw m0, m4
- movu m1, [r0 + r1]
- movu m5, [r2 + r3]
+ movu ym1, [r0 + r1]
+ movu ym5, [r2 + r3]
+ vinserti32x8 m1, [r6 + r1], 1
+ vinserti32x8 m5, [r7 + r3], 1
psubw m1, m5
- movu m2, [r0 + r1 * 2]
- movu m4, [r2 + r3 * 2]
+ movu ym2, [r0 + r1 * 2]
+ movu ym4, [r2 + r3 * 2]
+ vinserti32x8 m2, [r6 + r1 * 2], 1
+ vinserti32x8 m4, [r7 + r3 * 2], 1
psubw m2, m4
- movu m3, [r0 + r4]
- movu m5, [r2 + r5]
+ movu ym3, [r0 + r4]
+ movu ym5, [r2 + r5]
+ vinserti32x8 m3, [r6 + r4], 1
+ vinserti32x8 m5, [r7 + r5], 1
psubw m3, m5
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
- paddw m4, m0, m1
- psubw m1, m0
- paddw m0, m2, m3
- psubw m3, m2
- punpckhwd m2, m4, m1
- punpcklwd m4, m1
- punpckhwd m1, m0, m3
- punpcklwd m0, m3
- paddw m3, m4, m0
- psubw m0, m4
- paddw m4, m2, m1
- psubw m1, m2
- punpckhdq m2, m3, m0
- punpckldq m3, m0
- paddw m0, m3, m2
- psubw m2, m3
- punpckhdq m3, m4, m1
- punpckldq m4, m1
- paddw m1, m4, m3
- psubw m3, m4
- punpckhqdq m4, m0, m1
- punpcklqdq m0, m1
- pabsw m0, m0
- pabsw m4, m4
- pmaxsw m0, m0, m4
- punpckhqdq m1, m2, m3
- punpcklqdq m2, m3
- pabsw m2, m2
- pabsw m1, m1
- pmaxsw m2, m1
- pxor m7, m7
- mova m1, m0
- punpcklwd m1, m7
- paddd m6, m1
- mova m1, m0
- punpckhwd m1, m7
- paddd m6, m1
- pxor m7, m7
- mova m1, m2
- punpcklwd m1, m7
- paddd m6, m1
- mova m1, m2
- punpckhwd m1, m7
- paddd m6, m1
- ; rows 4-7
- movu m0, [r0]
- movu m4, [r2]
- psubw m0, m4
- movu m1, [r0 + r1]
- movu m5, [r2 + r3]
- psubw m1, m5
- movu m2, [r0 + r1 * 2]
- movu m4, [r2 + r3 * 2]
- psubw m2, m4
- movu m3, [r0 + r4]
- movu m5, [r2 + r5]
- psubw m3, m5
- lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r3 * 4]
+
paddw m4, m0, m1
psubw m1, m0
paddw m0, m2, m3
@@ -14092,6 +14041,89 @@
punpckhwd m1, m7
paddd m6, m1
%endmacro
+%macro PROCESS_SATD_32x4_HBD_AVX512 0 ; function to compute satd cost for 32 columns, 4 rows
+ ; rows 0-3
+ movu m0, [r0]
+ movu m4, [r2]
+ psubw m0, m4
+ movu m1, [r0 + r1]
+ movu m5, [r2 + r3]
+ psubw m1, m5
+ movu m2, [r0 + r1 * 2]
+ movu m4, [r2 + r3 * 2]
+ psubw m2, m4
+ movu m3, [r0 + r4]
+ movu m5, [r2 + r5]
+ psubw m3, m5
+ paddw m4, m0, m1
+ psubw m1, m0
+ paddw m0, m2, m3
+ psubw m3, m2
+ punpckhwd m2, m4, m1
+ punpcklwd m4, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ paddw m3, m4, m0
+ psubw m0, m4
+ paddw m4, m2, m1
+ psubw m1, m2
+ punpckhdq m2, m3, m0
+ punpckldq m3, m0
+ paddw m0, m3, m2
+ psubw m2, m3
+ punpckhdq m3, m4, m1
+ punpckldq m4, m1
+ paddw m1, m4, m3
+ psubw m3, m4
+ punpckhqdq m4, m0, m1
+ punpcklqdq m0, m1
+ pabsw m0, m0
+ pabsw m4, m4
+ pmaxsw m0, m0, m4
+ punpckhqdq m1, m2, m3
+ punpcklqdq m2, m3
+ pabsw m2, m2
+ pabsw m1, m1
+ pmaxsw m2, m1
+ pxor m7, m7
+ mova m1, m0
+ punpcklwd m1, m7
+ paddd m6, m1
+ mova m1, m0
+ punpckhwd m1, m7
+ paddd m6, m1
+ pxor m7, m7
+ mova m1, m2
+ punpcklwd m1, m7
+ paddd m6, m1
+ mova m1, m2
+ punpckhwd m1, m7
+ paddd m6, m1
+%endmacro
+
+%macro SATD_16xN_HBD_AVX512 1
+INIT_ZMM avx512
+cglobal pixel_satd_16x%1, 4,8,8
+ add r1d, r1d
+ add r3d, r3d
+ lea r4, [3 * r1]
+ lea r5, [3 * r3]
+ pxor m6, m6
+
+%rep %1/8 - 1
+ PROCESS_SATD_16x8_HBD_AVX512
+ lea r0, [r6 + 4 * r1]
+ lea r2, [r7 + 4 * r3]
+%endrep
+ PROCESS_SATD_16x8_HBD_AVX512
+ SATD_HBD_AVX512_END
+ RET
+%endmacro
+
+SATD_16xN_HBD_AVX512 8
+SATD_16xN_HBD_AVX512 16
+SATD_16xN_HBD_AVX512 32
+SATD_16xN_HBD_AVX512 64

%macro SATD_32xN_HBD_AVX512 1
INIT_ZMM avx512
@@ -14103,10 +14135,12 @@
pxor m6, m6
mov r6, r0
mov r7, r2
-
-%rep %1/8
- PROCESS_SATD_32x8_HBD_AVX512
+%rep %1/4 - 1
+ PROCESS_SATD_32x4_HBD_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
%endrep
+ PROCESS_SATD_32x4_HBD_AVX512
SATD_HBD_AVX512_END
RET
%endmacro
@@ -14127,15 +14161,20 @@
pxor m6, m6
mov r6, r0
mov r7, r2
-
-%rep %1/8
- PROCESS_SATD_32x8_HBD_AVX512
+%rep %1/4 - 1
+ PROCESS_SATD_32x4_HBD_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
%endrep
+ PROCESS_SATD_32x4_HBD_AVX512
lea r0, [r6 + mmsize]
lea r2, [r7 + mmsize]
-%rep %1/8
- PROCESS_SATD_32x8_HBD_AVX512
+%rep %1/4 - 1
+ PROCESS_SATD_32x4_HBD_AVX512
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
%endrep
+ PROCESS_SATD_32x4_HBD_AVX512
SATD_HBD_AVX512_END
RET
%endmacro
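
The cleanup replaces the 8-row PROCESS_SATD_32x8 macro with a 4-row
PROCESS_SATD_32x4 and hoists the r0/r2 advances out of the macro, so the
final iteration of each %rep no longer executes two dead lea
instructions. The new 16xN kernels, meanwhile, keep the full 512-bit
lane width busy by packing two 16-pixel rows into one zmm register with
vinserti32x8. A hedged C++ model of the new loop shape (process_32x4 is
a hypothetical stand-in for PROCESS_SATD_32x4_HBD_AVX512, reusing
satd_4x4 from the sketch above):

    // 32 columns x 4 rows: eight 4x4 Hadamard sub-blocks per strip.
    static int process_32x4(const uint16_t *src, intptr_t srcStride,
                            const uint16_t *ref, intptr_t refStride)
    {
        int s = 0;
        for (int x = 0; x < 32; x += 4)
            s += satd_4x4(src + x, srcStride, ref + x, refStride);
        return s;
    }

    // Mirrors the %rep %1/4 - 1 pattern: advance the pointers between
    // iterations so the last strip performs no address update.
    static int satd_32xN(int N, const uint16_t *src, intptr_t srcStride,
                         const uint16_t *ref, intptr_t refStride)
    {
        int acc = 0;                  // models the m6 accumulator
        for (int i = 0; i < N / 4 - 1; i++)
        {
            acc += process_32x4(src, srcStride, ref, refStride);
            src += 4 * srcStride;     // lea r0, [r0 + 4 * r1]
            ref += 4 * refStride;     // lea r2, [r2 + 4 * r3]
        }
        return acc + process_32x4(src, srcStride, ref, refStride);
    }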