[x265] [PATCH 020 of 307] x86: AVX512 pixel_sad_x4_W64
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:18 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499756069 -19800
# Tue Jul 11 12:24:29 2017 +0530
# Node ID a32718b2358bab3f19861d8402fe9adc8e312633
# Parent 40ab4480d070fca77c35c97c7c229b25d9a98a8a
x86: AVX512 pixel_sad_x4_W64
Size  | AVX2 performance | AVX512 performance
------|------------------|-------------------
64x16 |      67.53x      |       87.52x
64x32 |      73.27x      |      100.10x
64x48 |      76.21x      |      100.98x
64x64 |      79.72x      |      102.79x
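
For context outside x265: a sad_x4 primitive scores one encoder block against
four candidate reference blocks in a single pass, so each row of the encoder
block is loaded once and reused for all four comparisons. The figures above
follow the usual convention in these patch notes: speedup factors over the C
reference primitive. A minimal scalar sketch of the primitive's semantics,
assuming x265's fixed encoder-buffer stride (FENC_STRIDE = 64) and an 8-bit
pixel type; illustrative only, not part of the patch:

    #include <cstdint>
    #include <cstdlib>

    // Scalar reference for sad_x4 on a 64-wide block: one fenc block is
    // compared against four reference blocks sharing a stride, and the
    // four SADs are written to res[0..3] in one pass.
    static void sad_x4_64xN_ref(const uint8_t* fenc,
                                const uint8_t* ref0, const uint8_t* ref1,
                                const uint8_t* ref2, const uint8_t* ref3,
                                intptr_t refStride, int height, int32_t* res)
    {
        const intptr_t FENC_STRIDE = 64;   // x265 encoder-buffer stride
        res[0] = res[1] = res[2] = res[3] = 0;
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < 64; x++)
            {
                res[0] += std::abs(fenc[x] - ref0[x]);
                res[1] += std::abs(fenc[x] - ref1[x]);
                res[2] += std::abs(fenc[x] - ref2[x]);
                res[3] += std::abs(fenc[x] - ref3[x]);
            }
            fenc += FENC_STRIDE;
            ref0 += refStride;
            ref1 += refStride;
            ref2 += refStride;
            ref3 += refStride;
        }
    }

The AVX512 kernels in the patch below implement exactly this contract, eight
rows per macro invocation.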
diff -r 40ab4480d070 -r a32718b2358b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jul 11 10:05:30 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jul 11 12:24:29 2017 +0530
@@ -3736,6 +3736,11 @@
p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512);
p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512);
+ p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx512);
+ p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512);
+ p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512);
+ p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx512);
+
p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_avx512);
p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_avx512);
p.pu[LUMA_4x16].satd = PFX(pixel_satd_4x16_avx512);
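
The hunk above only registers the new kernels in the CPU-dispatch table;
callers reach them through a function pointer. The sketch below is a hedged
reconstruction of that calling convention (the typedef name pixelcmp_x4_t and
its exact signature follow x265's common/primitives.h, but treat them as
assumptions here):

    #include <cstdint>

    typedef uint8_t pixel;   // 8-bit build assumed

    // Assumed shape of x265's pixelcmp_x4_t: one encoder block, four
    // reference pointers sharing one stride, four output SADs.
    typedef void (*pixelcmp_x4_t)(const pixel* fenc,
                                  const pixel* fref0, const pixel* fref1,
                                  const pixel* fref2, const pixel* fref3,
                                  intptr_t frefstride, int32_t* res);

    // Motion estimation then scores four candidates per call, e.g.:
    //   int32_t costs[4];
    //   p.pu[LUMA_64x64].sad_x4(fenc, ref0, ref1, ref2, ref3,
    //                           refStride, costs);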
diff -r 40ab4480d070 -r a32718b2358b source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Tue Jul 11 10:05:30 2017 +0530
+++ b/source/common/x86/sad-a.asm Tue Jul 11 12:24:29 2017 +0530
@@ -4128,6 +4128,315 @@
SAD_X4_48x8_AVX2
PIXEL_SAD_X4_END_AVX2
RET
+
+;------------------------------------------------------------
+;sad_x4 avx512 code start
+;------------------------------------------------------------
+%macro SAD_X4_64x8_AVX512 0
+ movu m4, [r0]
+ movu m5, [r1]
+ movu m6, [r2]
+ movu m7, [r3]
+ movu m8, [r4]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + FENC_STRIDE]
+ movu m5, [r1 + r5]
+ movu m6, [r2 + r5]
+ movu m7, [r3 + r5]
+ movu m8, [r4 + r5]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + FENC_STRIDE * 2]
+ movu m5, [r1 + r5 * 2]
+ movu m6, [r2 + r5 * 2]
+ movu m7, [r3 + r5 * 2]
+ movu m8, [r4 + r5 * 2]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + FENC_STRIDE * 3]
+ movu m5, [r1 + r7]
+ movu m6, [r2 + r7]
+ movu m7, [r3 + r7]
+ movu m8, [r4 + r7]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+
+ movu m4, [r0]
+ movu m5, [r1]
+ movu m6, [r2]
+ movu m7, [r3]
+ movu m8, [r4]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + FENC_STRIDE]
+ movu m5, [r1 + r5]
+ movu m6, [r2 + r5]
+ movu m7, [r3 + r5]
+ movu m8, [r4 + r5]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + FENC_STRIDE * 2]
+ movu m5, [r1 + r5 * 2]
+ movu m6, [r2 + r5 * 2]
+ movu m7, [r3 + r5 * 2]
+ movu m8, [r4 + r5 * 2]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + FENC_STRIDE * 3]
+ movu m5, [r1 + r7]
+ movu m6, [r2 + r7]
+ movu m7, [r3 + r7]
+ movu m8, [r4 + r7]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+%endmacro
+
+%macro PIXEL_SAD_X4_END_AVX512 0
+ vextracti32x8 ym4, m0, 1
+ vextracti32x8 ym5, m1, 1
+ vextracti32x8 ym6, m2, 1
+ vextracti32x8 ym7, m3, 1
+ paddd ym0, ym4
+ paddd ym1, ym5
+ paddd ym2, ym6
+ paddd ym3, ym7
+ vextracti64x2 xm4, m0, 1
+ vextracti64x2 xm5, m1, 1
+ vextracti64x2 xm6, m2, 1
+ vextracti64x2 xm7, m3, 1
+ paddd xm0, xm4
+ paddd xm1, xm5
+ paddd xm2, xm6
+ paddd xm3, xm7
+ pshufd xm4, xm0, 2
+ pshufd xm5, xm1, 2
+ pshufd xm6, xm2, 2
+ pshufd xm7, xm3, 2
+ paddd xm0, xm4
+ paddd xm1, xm5
+ paddd xm2, xm6
+ paddd xm3, xm7
+ movd [r6 + 0], xm0
+ movd [r6 + 4], xm1
+ movd [r6 + 8], xm2
+ movd [r6 + 12], xm3
+%endmacro
+
+INIT_ZMM avx512
+cglobal pixel_sad_x4_64x16, 7,8,10
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+ lea r7, [r5 * 3]
+
+ SAD_X4_64x8_AVX512
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+ SAD_X4_64x8_AVX512
+ PIXEL_SAD_X4_END_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_x4_64x32, 7,8,10
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+ lea r7, [r5 * 3]
+
+ SAD_X4_64x8_AVX512
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+ SAD_X4_64x8_AVX512
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+ SAD_X4_64x8_AVX512
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+ SAD_X4_64x8_AVX512
+ PIXEL_SAD_X4_END_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_x4_64x48, 7,8,10
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+ lea r7, [r5 * 3]
+
+ SAD_X4_64x8_AVX512
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+ SAD_X4_64x8_AVX512
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+ SAD_X4_64x8_AVX512
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+ SAD_X4_64x8_AVX512
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+ SAD_X4_64x8_AVX512
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+ SAD_X4_64x8_AVX512
+ PIXEL_SAD_X4_END_AVX512
+ RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_x4_64x64, 7,8,10
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+ lea r7, [r5 * 3]
+
+ SAD_X4_64x8_AVX512
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+ SAD_X4_64x8_AVX512
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+ SAD_X4_64x8_AVX512
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+ SAD_X4_64x8_AVX512
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+ SAD_X4_64x8_AVX512
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+ SAD_X4_64x8_AVX512
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+ SAD_X4_64x8_AVX512
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+ SAD_X4_64x8_AVX512
+ PIXEL_SAD_X4_END_AVX512
+ RET
+;------------------------------------------------------------
+;sad_x4 avx512 code end
+;------------------------------------------------------------
%endif
INIT_XMM sse2
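
Two implementation notes on the assembly above. Inside SAD_X4_64x8_AVX512,
each 64-byte row is a single ZMM load; the encoder pointer advances by the
compile-time FENC_STRIDE while the four reference pointers share the runtime
stride in r5 (r7 is preloaded with 3 * r5 for the fourth row). vpsadbw
produces one 64-bit partial sum per 8 bytes, and accumulating those with
paddd is safe because the totals stay well inside 32 bits. The
PIXEL_SAD_X4_END_AVX512 macro then folds each 512-bit accumulator down to a
single dword. The sketch below mirrors that reduction with intrinsics for one
accumulator; it is an illustrative translation, not code from the patch (the
asm uses vextracti32x8/vextracti64x2, for which the cast-plus-extract pairs
below are bit-identical):

    #include <immintrin.h>
    #include <cstdint>

    // Intrinsics mirror of PIXEL_SAD_X4_END_AVX512 for one accumulator.
    // 'acc' holds vpsadbw partial sums: eight 64-bit lanes, each a small
    // 32-bit count.
    static inline uint32_t reduce_sad_zmm(__m512i acc)
    {
        // zmm -> ymm fold (asm: vextracti32x8 + paddd)
        __m256i lo256  = _mm512_castsi512_si256(acc);
        __m256i hi256  = _mm512_extracti64x4_epi64(acc, 1);
        __m256i sum256 = _mm256_add_epi32(lo256, hi256);

        // ymm -> xmm fold (asm: vextracti64x2 + paddd)
        __m128i lo128  = _mm256_castsi256_si128(sum256);
        __m128i hi128  = _mm256_extracti128_si256(sum256, 1);
        __m128i sum128 = _mm_add_epi32(lo128, hi128);

        // vpsadbw left the totals in dwords 0 and 2 of each 128-bit lane;
        // bring dword 2 down and add (asm: pshufd xm, xm, 2 + paddd)
        __m128i hi64 = _mm_shuffle_epi32(sum128, 2);
        sum128 = _mm_add_epi32(sum128, hi64);
        return (uint32_t)_mm_cvtsi128_si32(sum128);
    }

The asm applies this fold to all four accumulators (m0 through m3) and stores
the four resulting dwords to the result pointer in r6.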