[x265] [PATCH 058 of 307] [x265-avx512]x86: AVX512 pixel_sad_48x64 for high bit depth
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:56 CEST 2018
# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1501674408 -19800
# Wed Aug 02 17:16:48 2017 +0530
# Node ID 585b35cf6baad20d1cd5fb760d88ad2fbd99e63f
# Parent b355ac2912dd111b96dbb5893b34405863e7382f
[x265-avx512]x86: AVX512 pixel_sad_48x64 for high bit depth
AVX2 performance : 29.55x
AVX512 performance : 40.07x
diff -r b355ac2912dd -r 585b35cf6baa source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Aug 02 11:28:32 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Aug 02 17:16:48 2017 +0530
@@ -2261,6 +2261,7 @@
p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512);
p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512);
p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512);
+ p.pu[LUMA_48x64].sad = PFX(pixel_sad_48x64_avx512);
p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512);
p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512);
p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512);
diff -r b355ac2912dd -r 585b35cf6baa source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Wed Aug 02 11:28:32 2017 +0530
+++ b/source/common/x86/sad16-a.asm Wed Aug 02 17:16:48 2017 +0530
@@ -1881,6 +1881,111 @@
PROCESS_SAD_AVX512_END
RET
+;-----------------------------------------------------------------------------
+; int pixel_sad_48x64( uint16_t *, intptr_t, uint16_t *, intptr_t )  ; args arrive in r0, r1, r2, r3; sums |[r2 block] - [r0 block]| over 48 words x 64 rows
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal pixel_sad_48x64, 4, 7, 9  ; 4 args, 7 GPRs (r0-r6), 9 vector regs (m0-m8)
+ pxor m0, m0  ; m0 = running dword sums of absolute differences
+ mov r6d, 64/8  ; loop counter: 64 rows, 8 rows handled per iteration
+
+ vbroadcasti32x8 m8, [pw_1]  ; pmaddwd multiplier; pw_1 is defined elsewhere (presumably words of 1 -- confirm)
+ ; Strides are intptr_t and feed 64-bit addressing below, so scale them at full register width (a 32-bit write would zero-extend and drop the sign / upper bits).
+ add r3, r3  ; r3 = 2 * stride of the r2 block (presumably element stride -> byte stride for uint16_t samples)
+ add r1, r1  ; r1 = 2 * stride of the r0 block
+ lea r4, [r1 * 3]  ; r4 = 3 * r1: offset of the 4th row of a group in the r0 block
+ lea r5, [r3 * 3]  ; r5 = 3 * r3: offset of the 4th row of a group in the r2 block
+.loop:
+ movu m1, [r2]  ; row 0 of r2 block: first 64 bytes (32 words)
+ movu m2, [r2 + r3]  ; row 1 of r2 block: first 64 bytes
+ movu ym3, [r2 + mmsize]  ; row 0: remaining 32 bytes (16 words) -> low half of m3 (mmsize = 64 under INIT_ZMM)
+ vinserti32x8 m3, [r2 + r3 + mmsize], 1  ; row 1: remaining 32 bytes -> high half of m3
+ movu m4, [r0]  ; same layout for the r0 block: m4 = row 0, m5 = row 1, m6 = both row tails
+ movu m5, [r0 + r1]
+ movu ym6, [r0 + mmsize]
+ vinserti32x8 m6, [r0 + r1 + mmsize], 1
+
+ psubw m1, m4  ; word-wise differences, rows 0-1
+ psubw m2, m5
+ psubw m3, m6
+ pabsw m1, m1  ; absolute values of the word differences
+ pabsw m2, m2
+ pabsw m3, m3
+ paddw m1, m2  ; still 16-bit lanes: at most three values are added per lane before widening
+ paddw m7, m3, m1  ; m7 = word sums for rows 0-1; parked until rows 2-3 are done
+
+ movu m1, [r2 + 2 * r3]  ; rows 2-3, same register layout as rows 0-1
+ movu m2, [r2 + r5]
+ movu ym3, [r2 + 2 * r3 + mmsize]
+ vinserti32x8 m3, [r2 + r5 + mmsize], 1
+ movu m4, [r0 + 2 * r1]
+ movu m5, [r0 + r4]
+ movu ym6, [r0 + 2 * r1 + mmsize]
+ vinserti32x8 m6, [r0 + r4 + mmsize], 1
+ psubw m1, m4
+ psubw m2, m5
+ psubw m3, m6
+ pabsw m1, m1
+ pabsw m2, m2
+ pabsw m3, m3
+ paddw m1, m2
+ paddw m1, m3  ; m1 = word sums for rows 2-3
+
+ pmaddwd m7, m8  ; multiply words by m8 and add adjacent pairs -> dword lanes
+ paddd m0, m7  ; accumulate rows 0-1
+ pmaddwd m1, m8
+ paddd m0, m1  ; accumulate rows 2-3
+ lea r0, [r0 + 4 * r1]  ; advance both block pointers by 4 rows
+ lea r2, [r2 + 4 * r3]
+ ; Second group of 4 rows in this iteration: identical sequence to the one above.
+ movu m1, [r2]
+ movu m2, [r2 + r3]
+ movu ym3, [r2 + mmsize]
+ vinserti32x8 m3, [r2 + r3 + mmsize], 1
+ movu m4, [r0]
+ movu m5, [r0 + r1]
+ movu ym6, [r0 + mmsize]
+ vinserti32x8 m6, [r0 + r1 + mmsize], 1
+
+ psubw m1, m4
+ psubw m2, m5
+ psubw m3, m6
+ pabsw m1, m1
+ pabsw m2, m2
+ pabsw m3, m3
+ paddw m1, m2
+ paddw m7, m3, m1  ; m7 = word sums for rows 4-5
+
+ movu m1, [r2 + 2 * r3]
+ movu m2, [r2 + r5]
+ movu ym3, [r2 + 2 * r3 + mmsize]
+ vinserti32x8 m3, [r2 + r5 + mmsize], 1
+ movu m4, [r0 + 2 * r1]
+ movu m5, [r0 + r4]
+ movu ym6, [r0 + 2 * r1 + mmsize]
+ vinserti32x8 m6, [r0 + r4 + mmsize], 1
+ psubw m1, m4
+ psubw m2, m5
+ psubw m3, m6
+ pabsw m1, m1
+ pabsw m2, m2
+ pabsw m3, m3
+ paddw m1, m2
+ paddw m1, m3  ; m1 = word sums for rows 6-7
+
+ pmaddwd m7, m8
+ paddd m0, m7
+ pmaddwd m1, m8
+ paddd m0, m1
+ lea r0, [r0 + 4 * r1]  ; advance to the next 4-row group
+ lea r2, [r2 + 4 * r3]
+
+ dec r6d  ; one 8-row iteration done
+ jg .loop  ; repeat while the counter is still positive
+
+ PROCESS_SAD_AVX512_END  ; macro defined outside this hunk; presumably reduces m0 to the scalar return value -- confirm
+ RET
+
;=============================================================================
; SAD x3/x4
;=============================================================================
More information about the x265-devel
mailing list