[x265] [PATCH 243 of 307] [x265-avx512]x86: AVX512 sad_x3_16xN for high bit depth
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:01 CEST 2018
# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1512637265 -19800
# Thu Dec 07 14:31:05 2017 +0530
# Node ID 0ffc9c56a0a7361e98e6388e3067e4a78e8cd252
# Parent 931dd781dc0c6de76bb31d0215db7a7af885f9bf
[x265-avx512]x86: AVX512 sad_x3_16xN for high bit depth
Size  | AVX2 performance | AVX512 performance
----------------------------------------------
16x8  |      16.34x      |       17.91x
16x12 |      17.38x      |       18.82x
16x16 |      17.90x      |       20.07x
16x32 |      18.39x      |       21.77x
16x64 |      18.00x      |       22.43x
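
For context (not part of the patch): a minimal scalar sketch of what the sad_x3
primitive computes for high bit depth, under the usual x265 conventions that the
encoder block uses the fixed FENC_STRIDE (64 pixels) while the three reference
blocks share frefstride. The function name below is illustrative only.

    #include <cstdint>
    #include <cstdlib>

    #define FENC_STRIDE 64  /* x265's fixed encoder-buffer stride, in pixels */

    /* Scalar reference sketch: three SADs of one 16xN encoder block (pix1)
       against three candidate reference blocks (pix2, pix3, pix4). */
    static void sad_x3_16xN_ref(const uint16_t* pix1, const uint16_t* pix2,
                                const uint16_t* pix3, const uint16_t* pix4,
                                intptr_t frefstride, int32_t* res, int N)
    {
        res[0] = res[1] = res[2] = 0;
        for (int y = 0; y < N; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                res[0] += abs(pix1[x] - pix2[x]);
                res[1] += abs(pix1[x] - pix3[x]);
                res[2] += abs(pix1[x] - pix4[x]);
            }
            pix1 += FENC_STRIDE;   /* encoder block: fixed stride */
            pix2 += frefstride;    /* references: caller-supplied stride */
            pix3 += frefstride;
            pix4 += frefstride;
        }
    }

In the AVX512 kernel below, each 512-bit register holds two 16-pixel rows of
16-bit samples (a ymm load plus vinserti64x4), psubw/pabsw form the absolute
differences, and pmaddwd against pw_1 folds adjacent words into the dword
accumulators m0-m2, one per reference block.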
diff -r 931dd781dc0c -r 0ffc9c56a0a7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Dec 07 11:07:35 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Dec 07 14:31:05 2017 +0530
@@ -2495,6 +2495,11 @@
p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx512);
p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx512);
+ p.pu[LUMA_16x8].sad_x3 = PFX(pixel_sad_x3_16x8_avx512);
+ p.pu[LUMA_16x12].sad_x3 = PFX(pixel_sad_x3_16x12_avx512);
+ p.pu[LUMA_16x16].sad_x3 = PFX(pixel_sad_x3_16x16_avx512);
+ p.pu[LUMA_16x32].sad_x3 = PFX(pixel_sad_x3_16x32_avx512);
+ p.pu[LUMA_16x64].sad_x3 = PFX(pixel_sad_x3_16x64_avx512);
p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx512);
p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx512);
p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512);
diff -r 931dd781dc0c -r 0ffc9c56a0a7 source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Thu Dec 07 11:07:35 2017 +0530
+++ b/source/common/x86/sad16-a.asm Thu Dec 07 14:31:05 2017 +0530
@@ -2443,6 +2443,54 @@
%endmacro
+%macro PROCESS_SAD_X3_16x4_AVX512 0
+ movu ym6, [r0]
+ vinserti64x4 m6, [r0 + 2 * FENC_STRIDE], 1
+ movu ym3, [r1]
+ vinserti64x4 m3, [r1 + r4], 1
+ movu ym4, [r2]
+ vinserti64x4 m4, [r2 + r4], 1
+ movu ym5, [r3]
+ vinserti64x4 m5, [r3 + r4], 1
+
+ psubw m3, m6
+ psubw m4, m6
+ psubw m5, m6
+ pabsw m3, m3
+ pabsw m4, m4
+ pabsw m5, m5
+
+ pmaddwd m3, m7
+ paddd m0, m3
+ pmaddwd m4, m7
+ paddd m1, m4
+ pmaddwd m5, m7
+ paddd m2, m5
+
+ movu ym6, [r0 + 4 * FENC_STRIDE]
+ vinserti64x4 m6, [r0 + 6 * FENC_STRIDE], 1
+ movu ym3, [r1 + 2 * r4]
+ vinserti64x4 m3, [r1 + r6], 1
+ movu ym4, [r2 + 2 * r4]
+ vinserti64x4 m4, [r2 + r6], 1
+ movu ym5, [r3 + 2 * r4]
+ vinserti64x4 m5, [r3 + r6], 1
+
+ psubw m3, m6
+ psubw m4, m6
+ psubw m5, m6
+ pabsw m3, m3
+ pabsw m4, m4
+ pabsw m5, m5
+
+ pmaddwd m3, m7
+ paddd m0, m3
+ pmaddwd m4, m7
+ paddd m1, m4
+ pmaddwd m5, m7
+ paddd m2, m5
+%endmacro
+
%macro PROCESS_SAD_X3_32x4_AVX512 0
movu m6, [r0]
@@ -2700,6 +2748,118 @@
;------------------------------------------------------------------------------------------------------------------------------------------
+; void pixel_sad_x3_16x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res )
+;------------------------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_ZMM avx512
+cglobal pixel_sad_x3_16x8, 6,7,8
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+
+ vbroadcasti32x8 m7, [pw_1]
+
+ add r4d, r4d
+ lea r6d, [r4 * 3]
+
+ PROCESS_SAD_X3_16x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_16x4_AVX512
+ PROCESS_SAD_X3_END_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_x3_16x12, 6,7,8
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+
+ vbroadcasti32x8 m7, [pw_1]
+
+ add r4d, r4d
+ lea r6d, [r4 * 3]
+ %rep 2
+ PROCESS_SAD_X3_16x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ %endrep
+ PROCESS_SAD_X3_16x4_AVX512
+ PROCESS_SAD_X3_END_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_x3_16x16, 6,7,8
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+
+ vbroadcasti32x8 m7, [pw_1]
+
+ add r4d, r4d
+ lea r6d, [r4 * 3]
+
+ %rep 3
+ PROCESS_SAD_X3_16x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ %endrep
+ PROCESS_SAD_X3_16x4_AVX512
+ PROCESS_SAD_X3_END_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_x3_16x32, 6,7,8
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+
+ vbroadcasti32x8 m7, [pw_1]
+
+ add r4d, r4d
+ lea r6d, [r4 * 3]
+
+ %rep 7
+ PROCESS_SAD_X3_16x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ %endrep
+ PROCESS_SAD_X3_16x4_AVX512
+ PROCESS_SAD_X3_END_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_x3_16x64, 6,7,8
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+
+ vbroadcasti32x8 m7, [pw_1]
+
+ add r4d, r4d
+ lea r6d, [r4 * 3]
+
+ %rep 15
+ PROCESS_SAD_X3_16x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ %endrep
+ PROCESS_SAD_X3_16x4_AVX512
+ PROCESS_SAD_X3_END_AVX512
+ RET
+%endif
+
+;------------------------------------------------------------------------------------------------------------------------------------------
; void pixel_sad_x3_32x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res )
;------------------------------------------------------------------------------------------------------------------------------------------
%if ARCH_X86_64