[x265] [PATCH 054 of 307] [x265-avx512]x86: AVX512 pixel_sad_32xN for high bit depth
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:52 CEST 2018
# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1501239623 -19800
# Fri Jul 28 16:30:23 2017 +0530
# Node ID e65ac86010af8f7ab1e5b43591330eeb6c818473
# Parent f8687bef93f25b343606e42f4fd252d5f0897d1a
[x265-avx512]x86: AVX512 pixel_sad_32xN for high bit depth
Size | AVX2 performance | AVX512 performance
------------------------------------------------
32x8 | 27.57x | 35.17x
32x16 | 27.96x | 40.74x
32x24 | 31.21x | 45.19x
32x32 | 32.12x | 47.23x
32x64 | 28.79x | 53.35x
diff -r f8687bef93f2 -r e65ac86010af source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Aug 01 12:39:14 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Jul 28 16:30:23 2017 +0530
@@ -2256,6 +2256,12 @@
p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
+ p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512);
+ p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512);
+ p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512);
+ p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512);
+ p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512);
+
}
}
#else // if HIGH_BIT_DEPTH
diff -r f8687bef93f2 -r e65ac86010af source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Tue Aug 01 12:39:14 2017 +0530
+++ b/source/common/x86/sad16-a.asm Fri Jul 28 16:30:23 2017 +0530
@@ -1154,6 +1154,179 @@
INIT_XMM sse2
SAD_12 12, 16
+; Helper for PROCESS_SAD_32x8_AVX512.
+; Loads four consecutive 64-byte rows starting at r2 (row stride r3, with
+; r5 = 3 * r3), subtracts the matching rows at r0 (row stride r1, with
+; r4 = 3 * r1), takes the per-word absolute value and writes the per-lane
+; 16-bit sum of the four rows to %1.
+; Clobbers m1-m4. Does not move r0 / r2.
+%macro SAD_32x4_WORDSUM_AVX512 1
+    movu            m1, [r2]
+    movu            m2, [r2 + r3]
+    movu            m3, [r2 + 2 * r3]
+    movu            m4, [r2 + r5]
+    psubw           m1, [r0]
+    psubw           m2, [r0 + r1]
+    psubw           m3, [r0 + 2 * r1]
+    psubw           m4, [r0 + r4]
+    pabsw           m1, m1
+    pabsw           m2, m2
+    pabsw           m3, m3
+    pabsw           m4, m4
+    paddw           m1, m2
+    paddw           m3, m4
+    paddw           %1, m1, m3
+%endmacro
+
+; Accumulates the absolute differences of eight rows into the dword lanes of m0.
+;   in : r0 / r2 = row pointers, r1 / r3 = row strides in bytes,
+;        r4 = 3 * r1, r5 = 3 * r3, m6 = word multiplier for pmaddwd
+;        (callers load it from pw_1), m0 = running dword accumulator
+;   out: m0 updated; r0 and r2 advanced by FOUR rows only, so a caller that
+;        wants the next 8-row group must step both pointers 4 more rows
+;   clobbers: m1-m5
+; Each word lane holds the sum of 4 absolute differences before pmaddwd widens
+; it; pmaddwd treats words as signed, so that sum has to stay below 32768.
+%macro PROCESS_SAD_32x8_AVX512 0
+    SAD_32x4_WORDSUM_AVX512 m5              ; rows 0-3 -> m5
+
+    lea             r0, [r0 + 4 * r1]
+    lea             r2, [r2 + 4 * r3]
+
+    SAD_32x4_WORDSUM_AVX512 m1              ; rows 4-7 -> m1
+
+    pmaddwd         m5, m6                  ; widen word pairs to dwords
+    paddd           m0, m5
+    pmaddwd         m1, m6
+    paddd           m0, m1
+%endmacro
+
+; Horizontal reduction of the dword accumulator m0.
+; Folds 512 -> 256 -> 128 -> 64 -> 32 bits by repeatedly adding the upper half
+; onto the lower half, then moves the final dword to eax (the return value).
+; Clobbers m0 and m1.
+%macro PROCESS_SAD_AVX512_END 0
+    vextracti32x8   ym1, m0, 1              ; upper 256 bits of the accumulator
+    paddd           ym0, ym0, ym1
+    vextracti64x2   xm1, m0, 1              ; bits 128..255 of the folded sum
+    paddd           xm0, xm0, xm1
+    pshufd          xm1, xm0, 0x0E          ; dwords 2,3 -> positions 0,1
+    paddd           xm0, xm0, xm1
+    pshufd          xm1, xm0, 0x01          ; dword 1 -> position 0
+    paddd           xm0, xm0, xm1
+    movd            eax, xm0
+%endmacro
+
+
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_32xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
+;   N = 8, 16, 24, 32, 64 (one entry point per height)
+;   r0 / r1 : first pixel block and its row stride
+;   r2 / r3 : second pixel block and its row stride
+;   returns : sum of absolute differences in eax
+;   Strides are doubled on entry (pixels are 16-bit words, addressing is in
+;   bytes). They are intptr_t, so all stride arithmetic is done at full
+;   register width; 32-bit arithmetic would zero-extend and break negative
+;   or very large strides on x86-64.
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal pixel_sad_32x8, 4,6,7
+    pxor            m0, m0                  ; dword SAD accumulator
+
+    vbroadcasti32x8 m6, [pw_1]              ; pmaddwd multiplier used by PROCESS_SAD_32x8_AVX512
+
+    add             r3, r3                  ; stride: 16-bit pixels -> bytes
+    add             r1, r1
+    lea             r4, [r1 * 3]            ; r4 = 3 * stride of r0
+    lea             r5, [r3 * 3]            ; r5 = 3 * stride of r2
+
+    PROCESS_SAD_32x8_AVX512
+    PROCESS_SAD_AVX512_END
+    RET
+
+
+; pixel_sad_32x16: SAD over 16 rows of 64 bytes (two 8-row groups).
+;   r0 / r1 : first pixel block and its row stride (intptr_t)
+;   r2 / r3 : second pixel block and its row stride (intptr_t)
+;   returns : SAD in eax
+; Stride arithmetic uses full-width registers so that negative or very large
+; intptr_t strides are not truncated / zero-extended on x86-64.
+INIT_ZMM avx512
+cglobal pixel_sad_32x16, 4,6,7
+    pxor            m0, m0                  ; dword SAD accumulator
+
+    vbroadcasti32x8 m6, [pw_1]              ; pmaddwd multiplier used by PROCESS_SAD_32x8_AVX512
+
+    add             r3, r3                  ; stride: 16-bit pixels -> bytes
+    add             r1, r1
+    lea             r4, [r1 * 3]            ; r4 = 3 * stride of r0
+    lea             r5, [r3 * 3]            ; r5 = 3 * stride of r2
+
+    PROCESS_SAD_32x8_AVX512
+    ; the macro leaves r0 / r2 only 4 rows ahead; step 4 more to the next group
+    lea             r2, [r2 + 4 * r3]
+    lea             r0, [r0 + 4 * r1]
+    PROCESS_SAD_32x8_AVX512
+    PROCESS_SAD_AVX512_END
+    RET
+
+; pixel_sad_32x24: SAD over 24 rows of 64 bytes (three 8-row groups).
+;   r0 / r1 : first pixel block and its row stride (intptr_t)
+;   r2 / r3 : second pixel block and its row stride (intptr_t)
+;   returns : SAD in eax
+; Stride arithmetic uses full-width registers so that negative or very large
+; intptr_t strides are not truncated / zero-extended on x86-64.
+INIT_ZMM avx512
+cglobal pixel_sad_32x24, 4,6,7
+    pxor            m0, m0                  ; dword SAD accumulator
+
+    vbroadcasti32x8 m6, [pw_1]              ; pmaddwd multiplier used by PROCESS_SAD_32x8_AVX512
+
+    add             r3, r3                  ; stride: 16-bit pixels -> bytes
+    add             r1, r1
+    lea             r4, [r1 * 3]            ; r4 = 3 * stride of r0
+    lea             r5, [r3 * 3]            ; r5 = 3 * stride of r2
+
+%rep 2
+    PROCESS_SAD_32x8_AVX512
+    ; the macro leaves r0 / r2 only 4 rows ahead; step 4 more to the next group
+    lea             r2, [r2 + 4 * r3]
+    lea             r0, [r0 + 4 * r1]
+%endrep
+    PROCESS_SAD_32x8_AVX512                 ; last group: no pointer advance needed
+    PROCESS_SAD_AVX512_END
+    RET
+
+; pixel_sad_32x32: SAD over 32 rows of 64 bytes (four 8-row groups).
+;   r0 / r1 : first pixel block and its row stride (intptr_t)
+;   r2 / r3 : second pixel block and its row stride (intptr_t)
+;   returns : SAD in eax
+; Stride arithmetic uses full-width registers so that negative or very large
+; intptr_t strides are not truncated / zero-extended on x86-64.
+INIT_ZMM avx512
+cglobal pixel_sad_32x32, 4,6,7
+    pxor            m0, m0                  ; dword SAD accumulator
+
+    vbroadcasti32x8 m6, [pw_1]              ; pmaddwd multiplier used by PROCESS_SAD_32x8_AVX512
+
+    add             r3, r3                  ; stride: 16-bit pixels -> bytes
+    add             r1, r1
+    lea             r4, [r1 * 3]            ; r4 = 3 * stride of r0
+    lea             r5, [r3 * 3]            ; r5 = 3 * stride of r2
+
+%rep 3
+    PROCESS_SAD_32x8_AVX512
+    ; the macro leaves r0 / r2 only 4 rows ahead; step 4 more to the next group
+    lea             r2, [r2 + 4 * r3]
+    lea             r0, [r0 + 4 * r1]
+%endrep
+    PROCESS_SAD_32x8_AVX512                 ; last group: no pointer advance needed
+    PROCESS_SAD_AVX512_END
+    RET
+
+; pixel_sad_32x64: SAD over 64 rows of 64 bytes (eight 8-row groups).
+;   r0 / r1 : first pixel block and its row stride (intptr_t)
+;   r2 / r3 : second pixel block and its row stride (intptr_t)
+;   returns : SAD in eax
+; Stride arithmetic uses full-width registers so that negative or very large
+; intptr_t strides are not truncated / zero-extended on x86-64.
+INIT_ZMM avx512
+cglobal pixel_sad_32x64, 4,6,7
+    pxor            m0, m0                  ; dword SAD accumulator
+
+    vbroadcasti32x8 m6, [pw_1]              ; pmaddwd multiplier used by PROCESS_SAD_32x8_AVX512
+
+    add             r3, r3                  ; stride: 16-bit pixels -> bytes
+    add             r1, r1
+    lea             r4, [r1 * 3]            ; r4 = 3 * stride of r0
+    lea             r5, [r3 * 3]            ; r5 = 3 * stride of r2
+
+%rep 7
+    PROCESS_SAD_32x8_AVX512
+    ; the macro leaves r0 / r2 only 4 rows ahead; step 4 more to the next group
+    lea             r2, [r2 + 4 * r3]
+    lea             r0, [r0 + 4 * r1]
+%endrep
+    PROCESS_SAD_32x8_AVX512                 ; last group: no pointer advance needed
+    PROCESS_SAD_AVX512_END
+    RET
;=============================================================================
; SAD x3/x4
More information about the x265-devel
mailing list