[x265] [PATCH 019 of 307] x86: AVX512 pixel_sad_32xN
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:17 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499747730 -19800
# Tue Jul 11 10:05:30 2017 +0530
# Node ID 40ab4480d070fca77c35c97c7c229b25d9a98a8a
# Parent 7d0bff5d6f2e1d2fe8609a3e498b1ccc149a10e9
x86: AVX512 pixel_sad_32xN
Size | AVX2 performance | AVX512 performance
------------------------------------------------
32x8 | 40.52x | 53.46x
32x16 | 58.49x | 52.20x
32x24 | 60.62x | 70.37x
32x32 | 52.25x | 58.86x
32x64 | 68.28x | 64.03x
diff -r 7d0bff5d6f2e -r 40ab4480d070 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jul 10 17:51:20 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jul 11 10:05:30 2017 +0530
@@ -3726,6 +3726,11 @@
}
if (cpuMask & X265_CPU_AVX512)
{
+ p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512);
+ p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512);
+ p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512);
+ p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512);
+ p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512);
p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512);
p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512);
p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512);
diff -r 7d0bff5d6f2e -r 40ab4480d070 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Mon Jul 10 17:51:20 2017 +0530
+++ b/source/common/x86/sad-a.asm Tue Jul 11 10:05:30 2017 +0530
@@ -6253,4 +6253,45 @@
PIXEL_SAD_W64_AVX512 48
PIXEL_SAD_W64_AVX512 64
+%macro PIXEL_SAD_W32_AVX512 1 ; %1 = block height in rows; consumed 4 rows at a time, so it should be a multiple of 4
+INIT_ZMM avx512 ; x86inc setup; presumably selects 512-bit mN registers and the avx512 name suffix (confirm in x86inc.asm)
+cglobal pixel_sad_32x%1, 4,7,5 ; SAD of two 32-byte-wide, %1-row blocks: r0 = pix0, r1 = pix0 stride, r2 = pix1, r3 = pix1 stride
+ xorps m0, m0 ; m0 = 0: running SAD accumulator
+ lea r5, [r1 * 3] ; r5 = 3 * pix0 stride (offset of row 3)
+ lea r6, [r3 * 3] ; r6 = 3 * pix1 stride (offset of row 3)
+
+%rep %1/4 ; fully unrolled: every repetition handles 4 rows of both blocks
+ movu ym1, [r0] ; row 0 of pix0 -> low 256 bits of m1
+ movu ym2, [r2] ; row 0 of pix1 -> low 256 bits of m2
+ vinserti32x8 m1, [r0 + r1], 1 ; row 1 of pix0 -> high 256 bits of m1
+ vinserti32x8 m2, [r2 + r3], 1 ; row 1 of pix1 -> high 256 bits of m2
+ movu ym3, [r0 + 2 * r1] ; row 2 of pix0 -> low 256 bits of m3
+ movu ym4, [r2 + 2 * r3] ; row 2 of pix1 -> low 256 bits of m4
+ vinserti32x8 m3, [r0 + r5], 1 ; row 3 of pix0 -> high 256 bits of m3
+ vinserti32x8 m4, [r2 + r6], 1 ; row 3 of pix1 -> high 256 bits of m4
+
+ psadbw m1, m2 ; rows 0-1: one sum of absolute byte differences per 64-bit lane
+ psadbw m3, m4 ; rows 2-3: same
+ paddd m0, m1 ; accumulate rows 0-1 into m0
+ paddd m0, m3 ; accumulate rows 2-3 into m0
+
+ lea r2, [r2 + 4 * r3] ; advance pix1 by 4 rows
+ lea r0, [r0 + 4 * r1] ; advance pix0 by 4 rows
+%endrep
+
+ vextracti32x8 ym1, m0, 1 ; ym1 = upper 256 bits of the accumulator
+ paddd ym0, ym1 ; fold 512 -> 256 bits
+ vextracti64x2 xm1, m0, 1 ; xm1 = bits 128-255 of the folded sum
+ paddd xm0, xm1 ; fold 256 -> 128 bits
+ pshufd xm1, xm0, 2 ; dword 0 of xm1 = dword 2 of xm0 (partial sum of the upper qword)
+ paddd xm0, xm1 ; dword 0 of xm0 = total SAD
+ movd eax, xm0 ; return the total in eax
+ RET
+%endmacro
+
+PIXEL_SAD_W32_AVX512 8 ; pixel_sad_32x8: 4-row body expanded 2 times
+PIXEL_SAD_W32_AVX512 16 ; pixel_sad_32x16: 4-row body expanded 4 times
+PIXEL_SAD_W32_AVX512 24 ; pixel_sad_32x24: 4-row body expanded 6 times
+PIXEL_SAD_W32_AVX512 32 ; pixel_sad_32x32: 4-row body expanded 8 times
+PIXEL_SAD_W32_AVX512 64 ; pixel_sad_32x64: 4-row body expanded 16 times
%endif
More information about the x265-devel
mailing list