[x265] [PATCH 065 of 307] [x265-avx512] x86: AVX512 pixel_sad_x3_32xN for high bit depth
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:03 CEST 2018
# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1501765251 -19800
# Thu Aug 03 18:30:51 2017 +0530
# Node ID df45017fca906d5f3370dcc78e43284622753a73
# Parent 200e6c43adc0c77e588a44d734e7d340e4753ccd
[x265-avx512] x86: AVX512 pixel_sad_x3_32xN for high bit depth
Size  | AVX2 performance | AVX512 performance
------|------------------|--------------------
32x8  |      20.72x      |       29.20x
32x16 |      19.31x      |       30.53x
32x24 |      19.78x      |       33.32x
32x32 |      20.02x      |       32.71x
32x64 |      20.40x      |       33.30x
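
For reference, sad_x3 measures one encoder block against three candidate
reference blocks in a single call and writes the three costs to res. A
minimal scalar sketch of the high-bit-depth semantics (assuming 16-bit
pixels and the prototype shown in the asm below):

#include <stdint.h>
#include <stdlib.h>

#define FENC_STRIDE 64  /* fixed encoder-block stride, in pixels */

/* Scalar reference for pixel_sad_x3_32xN at high bit depth (sketch). */
static void sad_x3_32xN_ref(int N, const uint16_t *pix1,
                            const uint16_t *pix2, const uint16_t *pix3,
                            const uint16_t *pix4, intptr_t frefstride,
                            int32_t *res)
{
    res[0] = res[1] = res[2] = 0;
    for (int y = 0; y < N; y++)
    {
        for (int x = 0; x < 32; x++)
        {
            res[0] += abs(pix1[x] - pix2[x]);
            res[1] += abs(pix1[x] - pix3[x]);
            res[2] += abs(pix1[x] - pix4[x]);
        }
        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
    }
}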
diff -r 200e6c43adc0 -r df45017fca90 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Aug 01 18:52:23 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Aug 03 18:30:51 2017 +0530
@@ -2307,6 +2307,12 @@
p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx512);
p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx512);
+ p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx512);
+ p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx512);
+ p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512);
+ p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512);
+ p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512);
+
}
}
#else // if HIGH_BIT_DEPTH
diff -r 200e6c43adc0 -r df45017fca90 source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Tue Aug 01 18:52:23 2017 +0530
+++ b/source/common/x86/sad16-a.asm Thu Aug 03 18:30:51 2017 +0530
@@ -2497,3 +2497,362 @@
SAD_X 4, 64, 48
SAD_X 4, 64, 64
+;============================
+; SAD x3/x4 avx512 code start
+;============================
+
+%macro PROCESS_SAD_X3_32x4_AVX512 0
+ ; row 0: load one 32-pixel row of fenc and of each of the three references
+ movu m6, [r0]
+ movu m3, [r1]
+ movu m4, [r2]
+ movu m5, [r3]
+
+ ; absolute word differences against fenc
+ psubw m3, m6
+ psubw m4, m6
+ psubw m5, m6
+ pabsw m3, m3
+ pabsw m4, m4
+ pabsw m5, m5
+
+ ; pmaddwd against pw_1 sums adjacent 16-bit |diffs| into dword lanes,
+ ; accumulated per reference in m0/m1/m2
+ pmaddwd m3, m7
+ paddd m0, m3
+ pmaddwd m4, m7
+ paddd m1, m4
+ pmaddwd m5, m7
+ paddd m2, m5
+
+ ; row 1 of the 32x4 group
+ movu m6, [r0 + 2 * FENC_STRIDE]
+ movu m3, [r1 + r4]
+ movu m4, [r2 + r4]
+ movu m5, [r3 + r4]
+
+ psubw m3, m6
+ psubw m4, m6
+ psubw m5, m6
+ pabsw m3, m3
+ pabsw m4, m4
+ pabsw m5, m5
+
+ pmaddwd m3, m7
+ paddd m0, m3
+ pmaddwd m4, m7
+ paddd m1, m4
+ pmaddwd m5, m7
+ paddd m2, m5
+
+ ; row 2
+ movu m6, [r0 + 4 * FENC_STRIDE]
+ movu m3, [r1 + 2 * r4]
+ movu m4, [r2 + 2 * r4]
+ movu m5, [r3 + 2 * r4]
+
+ psubw m3, m6
+ psubw m4, m6
+ psubw m5, m6
+ pabsw m3, m3
+ pabsw m4, m4
+ pabsw m5, m5
+
+ pmaddwd m3, m7
+ paddd m0, m3
+ pmaddwd m4, m7
+ paddd m1, m4
+ pmaddwd m5, m7
+ paddd m2, m5
+
+ ; row 3: references addressed via r6 = 3 * stride
+ movu m6, [r0 + 6 * FENC_STRIDE]
+ movu m3, [r1 + r6]
+ movu m4, [r2 + r6]
+ movu m5, [r3 + r6]
+
+ psubw m3, m6
+ psubw m4, m6
+ psubw m5, m6
+ pabsw m3, m3
+ pabsw m4, m4
+ pabsw m5, m5
+
+ pmaddwd m3, m7
+ paddd m0, m3
+ pmaddwd m4, m7
+ paddd m1, m4
+ pmaddwd m5, m7
+ paddd m2, m5
+%endmacro
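
Each of the four row groups above follows the same pattern; expressed with
AVX-512 intrinsics, one row against one reference is roughly the following
(illustrative helper, not part of the patch):

#include <immintrin.h>
#include <stdint.h>

/* One 32-pixel row of 16-bit samples against one reference (sketch). */
static inline __m512i sad_row_acc(__m512i acc, const uint16_t *fenc,
                                  const uint16_t *fref, __m512i pw_1)
{
    __m512i e = _mm512_loadu_si512((const void *)fenc);
    __m512i r = _mm512_loadu_si512((const void *)fref);
    __m512i d = _mm512_abs_epi16(_mm512_sub_epi16(r, e)); /* |ref - enc| */
    /* madd by all-ones sums adjacent word pairs into dword lanes, so the
       accumulator cannot overflow for these block sizes */
    return _mm512_add_epi32(acc, _mm512_madd_epi16(d, pw_1));
}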
+
+
+%macro PROCESS_SAD_X3_END_AVX512 0
+ ; horizontal reduction of the three dword accumulators: 512 -> 256 bits
+ vextracti32x8 ym3, m0, 1
+ vextracti32x8 ym4, m1, 1
+ vextracti32x8 ym5, m2, 1
+
+ paddd ym0, ym3
+ paddd ym1, ym4
+ paddd ym2, ym5
+
+ ; 256 -> 128 bits
+ vextracti64x2 xm3, m0, 1
+ vextracti64x2 xm4, m1, 1
+ vextracti64x2 xm5, m2, 1
+
+ paddd xm0, xm3
+ paddd xm1, xm4
+ paddd xm2, xm5
+
+ ; fold the upper dword pair onto the lower pair
+ pshufd xm3, xm0, 00001110b
+ pshufd xm4, xm1, 00001110b
+ pshufd xm5, xm2, 00001110b
+
+ paddd xm0, xm3
+ paddd xm1, xm4
+ paddd xm2, xm5
+
+ ; fold dword 1 onto dword 0
+ pshufd xm3, xm0, 00000001b
+ pshufd xm4, xm1, 00000001b
+ pshufd xm5, xm2, 00000001b
+
+ paddd xm0, xm3
+ paddd xm1, xm4
+ paddd xm2, xm5
+
+ ; store the three SAD results to res[0..2]
+ movd [r5 + 0], xm0
+ movd [r5 + 4], xm1
+ movd [r5 + 8], xm2
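
The END macro is a straight horizontal add of each accumulator's sixteen
dword lanes; the equivalent with reduction intrinsics (a sketch, assuming
compiler support for _mm512_reduce_add_epi32):

#include <immintrin.h>
#include <stdint.h>

/* Equivalent of PROCESS_SAD_X3_END_AVX512 (sketch). */
static inline void sad_x3_store(__m512i acc0, __m512i acc1, __m512i acc2,
                                int32_t *res)
{
    res[0] = _mm512_reduce_add_epi32(acc0);
    res[1] = _mm512_reduce_add_epi32(acc1);
    res[2] = _mm512_reduce_add_epi32(acc2);
}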
+
+
+;------------------------------------------------------------------------------------------------------------------------------------------
+; void pixel_sad_x3_32x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res )
+;------------------------------------------------------------------------------------------------------------------------------------------
+
+INIT_ZMM avx512
+cglobal pixel_sad_x3_32x8, 6,7,8
+ pxor m0, m0 ; dword SAD accumulators, one per reference
+ pxor m1, m1
+ pxor m2, m2
+
+ vbroadcasti32x8 m7, [pw_1] ; all-ones words for the pmaddwd reduction
+
+ add r4d, r4d ; frefstride: pixel units -> byte units (16-bit samples)
+ lea r6d, [r4 * 3] ; r6 = 3 * stride
+
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ PROCESS_SAD_X3_END_AVX512
+ RET
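
With the asm-primitives.cpp hookup above, motion estimation reaches these
kernels through the primitives table; a hypothetical call site (fenc,
fref0..2, frefstride and p assumed set up by the surrounding encoder code):

int32_t costs[3];
p.pu[LUMA_32x8].sad_x3(fenc, fref0, fref1, fref2, frefstride, costs);
/* costs[0..2] now hold the SADs of fenc against fref0..fref2 */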
+
+
+INIT_ZMM avx512
+cglobal pixel_sad_x3_32x16, 6,7,8
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+
+ vbroadcasti32x8 m7, [pw_1]
+
+ add r4d, r4d
+ lea r6d, [r4 * 3]
+
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ PROCESS_SAD_X3_END_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_x3_32x24, 6,7,8
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+
+ vbroadcasti32x8 m7, [pw_1]
+
+ add r4d, r4d
+ lea r6d, [r4 * 3]
+
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ PROCESS_SAD_X3_END_AVX512
+ RET
+
+
+INIT_ZMM avx512
+cglobal pixel_sad_x3_32x32, 6,7,8
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+
+ vbroadcasti32x8 m7, [pw_1]
+
+ add r4d, r4d
+ lea r6d, [r4 * 3]
+
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ PROCESS_SAD_X3_END_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_x3_32x64, 6,7,8
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+
+ vbroadcasti32x8 m7, [pw_1]
+
+ add r4d, r4d
+ lea r6d, [r4 * 3]
+
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ add r0, FENC_STRIDE * 8
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r4 * 4]
+ PROCESS_SAD_X3_32x4_AVX512
+ PROCESS_SAD_X3_END_AVX512
+ RET
+