[x265] [PATCH 019 of 307] x86: AVX512 pixel_sad_32xN

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:17 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499747730 -19800
#      Tue Jul 11 10:05:30 2017 +0530
# Node ID 40ab4480d070fca77c35c97c7c229b25d9a98a8a
# Parent  7d0bff5d6f2e1d2fe8609a3e498b1ccc149a10e9
x86: AVX512 pixel_sad_32xN

Size    | AVX2 performance | AVX512 performance
------------------------------------------------
32x8    |     40.52x       |      53.46x
32x16   |     58.49x       |      52.20x
32x24   |     60.62x       |      70.37x
32x32   |     52.25x       |      58.86x
32x64   |     68.28x       |      64.03x

diff -r 7d0bff5d6f2e -r 40ab4480d070 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jul 10 17:51:20 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jul 11 10:05:30 2017 +0530
@@ -3726,6 +3726,11 @@
     }
     if (cpuMask & X265_CPU_AVX512)
     {
+        p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512);
+        p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512);
+        p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512);
+        p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512);
+        p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512);
         p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512);
         p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512);
         p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512);
diff -r 7d0bff5d6f2e -r 40ab4480d070 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm	Mon Jul 10 17:51:20 2017 +0530
+++ b/source/common/x86/sad-a.asm	Tue Jul 11 10:05:30 2017 +0530
@@ -6253,4 +6253,45 @@
 PIXEL_SAD_W64_AVX512 48
 PIXEL_SAD_W64_AVX512 64
 
+%macro PIXEL_SAD_W32_AVX512 1             ; %1 = block height in rows; emits pixel_sad_32x%1 (4 rows consumed per unrolled step)
+INIT_ZMM avx512                           ; x86inc setup for the avx512 variant; presumably maps m# to zmm# with ym#/xm# as the low 256/128-bit views
+cglobal pixel_sad_32x%1, 4,7,5            ; as used below: r0 = pix0, r1 = pix0 stride, r2 = pix1, r3 = pix1 stride; vector regs m0-m4
+    xorps           m0, m0                 ; m0 = running SAD accumulator, starts at zero
+    lea             r5, [r1 * 3]           ; r5 = 3 * pix0 stride (offset of row 3)
+    lea             r6, [r3 * 3]           ; r6 = 3 * pix1 stride; NOTE(review): r4 is never referenced in this macro
+
+%rep %1/4                                 ; fully unrolled: one repetition per group of 4 rows
+    movu           ym1, [r0]               ; row 0 of pix0 (32 bytes) -> low half of m1
+    movu           ym2, [r2]               ; row 0 of pix1 (32 bytes) -> low half of m2
+    vinserti32x8    m1, [r0 + r1], 1       ; row 1 of pix0 -> upper 256 bits of m1
+    vinserti32x8    m2, [r2 + r3], 1       ; row 1 of pix1 -> upper 256 bits of m2
+    movu           ym3, [r0 + 2 * r1]      ; row 2 of pix0 -> low half of m3
+    movu           ym4, [r2 + 2 * r3]      ; row 2 of pix1 -> low half of m4
+    vinserti32x8    m3, [r0 + r5], 1       ; row 3 of pix0 -> upper 256 bits of m3
+    vinserti32x8    m4, [r2 + r6], 1       ; row 3 of pix1 -> upper 256 bits of m4
+
+    psadbw         m1, m2                  ; rows 0-1: sum of absolute byte differences per 8-byte group, one result per qword
+    psadbw         m3, m4                  ; rows 2-3: same
+    paddd          m0, m1                  ; accumulate rows 0-1 partial sums
+    paddd          m0, m3                  ; accumulate rows 2-3 partial sums
+
+    lea            r2,     [r2 + 4 * r3]   ; pix1 += 4 rows
+    lea            r0,     [r0 + 4 * r1]   ; pix0 += 4 rows
+%endrep
+
+    vextracti32x8  ym1, m0, 1              ; horizontal reduction: take upper 256 bits of the accumulator
+    paddd          ym0, ym1                ; fold 512 -> 256 bits
+    vextracti64x2  xm1, m0, 1              ; take bits 128-255 of the folded sum
+    paddd          xm0, xm1                ; fold 256 -> 128 bits (two qword partial sums remain)
+    pshufd         xm1, xm0, 2             ; dword 0 of xm1 = dword 2 of xm0 (low dword of the upper qword)
+    paddd          xm0, xm1                ; dword 0 of xm0 = total SAD
+    movd           eax, xm0                ; return the 32-bit SAD in eax
+    RET                                    ; x86inc return macro
+%endmacro                                 ; end of PIXEL_SAD_W32_AVX512
+
+PIXEL_SAD_W32_AVX512 8                    ; pixel_sad_32x8:  %rep body expanded 2 times
+PIXEL_SAD_W32_AVX512 16                   ; pixel_sad_32x16: %rep body expanded 4 times
+PIXEL_SAD_W32_AVX512 24                   ; pixel_sad_32x24: %rep body expanded 6 times
+PIXEL_SAD_W32_AVX512 32                   ; pixel_sad_32x32: %rep body expanded 8 times
+PIXEL_SAD_W32_AVX512 64                   ; pixel_sad_32x64: %rep body expanded 16 times
 %endif


More information about the x265-devel mailing list