[x265] [PATCH 018 of 307] x86: AVX512 pixel_sad_64xN
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:16 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499689280 -19800
# Mon Jul 10 17:51:20 2017 +0530
# Node ID 7d0bff5d6f2e1d2fe8609a3e498b1ccc149a10e9
# Parent 6c409d2363c42f485748c5a9d3f4b209f58e6aa5
x86: AVX512 pixel_sad_64xN
Size | AVX2 performance | AVX512 performance
------------------------------------------------
64x16 | 53.37x | 87.20x
64x32 | 63.88x | 104.01x
64x48 | 71.80x | 111.25x
64x64 | 74.98x | 118.60x
diff -r 6c409d2363c4 -r 7d0bff5d6f2e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jul 10 12:10:44 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Jul 10 17:51:20 2017 +0530
@@ -3726,6 +3726,11 @@
}
if (cpuMask & X265_CPU_AVX512)
{
+ p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512);
+ p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512);
+ p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512);
+ p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512);
+
p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_avx512);
p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_avx512);
p.pu[LUMA_4x16].satd = PFX(pixel_satd_4x16_avx512);
diff -r 6c409d2363c4 -r 7d0bff5d6f2e source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Mon Jul 10 12:10:44 2017 +0530
+++ b/source/common/x86/sad-a.asm Mon Jul 10 17:51:20 2017 +0530
@@ -6215,4 +6215,42 @@
movd eax, xm0
RET
+;-----------------------------------------------------------------------------
+; int pixel_sad_64x%1( uint8_t *pix0, intptr_t stride0, uint8_t *pix1, intptr_t stride1 ); %1 = number of rows, consumed two at a time
+;-----------------------------------------------------------------------------
+%macro PIXEL_SAD_W64_AVX512 1 ; %1 = block height in rows (used in the symbol name and the unroll count)
+INIT_ZMM avx512
+cglobal pixel_sad_64x%1, 4,5,6 ; body references only r0-r3 (pix0, stride0, pix1, stride1) and m0-m5
+ xorps m0, m0 ; m0 = running sum for the first row of each pair
+ xorps m5, m5 ; m5 = running sum for the second row of each pair
+
+%rep %1/2 ; straight-line unroll: each repetition consumes two rows
+ movu m1, [r0] ; row 0 of pix0 (one full 64-byte row)
+ movu m2, [r2] ; row 0 of pix1 (one full 64-byte row)
+ movu m3, [r0 + r1] ; row 1 of pix0 (one full 64-byte row)
+ movu m4, [r2 + r3] ; row 1 of pix1 (one full 64-byte row)
+ psadbw m1, m2 ; m1 = one partial SAD per 8-byte group of row 0
+ psadbw m3, m4 ; m3 = one partial SAD per 8-byte group of row 1
+ paddd m0, m1 ; accumulate row 0 partial sums
+ paddd m5, m3 ; accumulate row 1 partial sums
+ lea r2, [r2 + 2 * r3] ; pix1 += 2 * stride1
+ lea r0, [r0 + 2 * r1] ; pix0 += 2 * stride0
+%endrep
+
+ paddd m0, m5 ; merge the two accumulators
+ vextracti32x8 ym1, m0, 1 ; ym1 = upper 256 bits of m0
+ paddd ym0, ym1 ; fold 512 -> 256 bits
+ vextracti64x2 xm1, m0, 1 ; xm1 = bits 128..255 of m0
+ paddd xm0, xm1 ; fold 256 -> 128 bits
+ pshufd xm1, xm0, 2 ; dword 0 of xm1 = dword 2 of xm0 (low half of the upper qword)
+ paddd xm0, xm1 ; dword 0 of xm0 = total SAD
+ movd eax, xm0 ; return value in eax
+ RET
+%endmacro
+
+PIXEL_SAD_W64_AVX512 16 ; defines pixel_sad_64x16
+PIXEL_SAD_W64_AVX512 32 ; defines pixel_sad_64x32
+PIXEL_SAD_W64_AVX512 48 ; defines pixel_sad_64x48
+PIXEL_SAD_W64_AVX512 64 ; defines pixel_sad_64x64
+
%endif
More information about the x265-devel
mailing list