[x265] [PATCH 018 of 307] x86: AVX512 pixel_sad_64xN

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:16 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499689280 -19800
#      Mon Jul 10 17:51:20 2017 +0530
# Node ID 7d0bff5d6f2e1d2fe8609a3e498b1ccc149a10e9
# Parent  6c409d2363c42f485748c5a9d3f4b209f58e6aa5
x86: AVX512 pixel_sad_64xN

Size    | AVX2 performance | AVX512 performance
------------------------------------------------
64x16   |     53.37x       |      87.20x
64x32   |     63.88x       |     104.01x
64x48   |     71.80x       |     111.25x
64x64   |     74.98x       |     118.60x

diff -r 6c409d2363c4 -r 7d0bff5d6f2e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jul 10 12:10:44 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Jul 10 17:51:20 2017 +0530
@@ -3726,6 +3726,11 @@
     }
     if (cpuMask & X265_CPU_AVX512)
     {
+        p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512);
+        p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512);
+        p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512);
+        p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512);
+
         p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_avx512);
         p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_avx512);
         p.pu[LUMA_4x16].satd = PFX(pixel_satd_4x16_avx512);
diff -r 6c409d2363c4 -r 7d0bff5d6f2e source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm	Mon Jul 10 12:10:44 2017 +0530
+++ b/source/common/x86/sad-a.asm	Mon Jul 10 17:51:20 2017 +0530
@@ -6215,4 +6215,42 @@
     movd            eax, xm0
     RET
 
+;-----------------------------------------------------------------------------
+; int pixel_sad_64x%1( uint8_t *pix0, intptr_t stride0, uint8_t *pix1, intptr_t stride1 ) -- sum of absolute byte differences over 64 x %1; rows are consumed in pairs, so %1 must be even
+;-----------------------------------------------------------------------------
+%macro PIXEL_SAD_W64_AVX512 1             ; %1 = block height in rows
+INIT_ZMM avx512
+cglobal pixel_sad_64x%1, 4,5,6            ; 4 args, 5 GPRs declared (only r0-r3 are referenced below), 6 vector regs (m0-m5)
+    xorps           m0, m0                ; m0 = accumulator for the first row of every pair
+    xorps           m5, m5                ; m5 = accumulator for the second row of every pair
+
+%rep %1/2                                 ; fully unrolled: one copy of the body per pair of rows
+    movu           m1, [r0]               ; row 0 of pix0 (64 bytes, unaligned load)
+    movu           m2, [r2]               ; row 0 of pix1 (64 bytes, unaligned load)
+    movu           m3, [r0 + r1]          ; row 1 of pix0 (r1 = pix0 stride in bytes)
+    movu           m4, [r2 + r3]          ; row 1 of pix1 (r3 = pix1 stride in bytes)
+    psadbw         m1, m2                 ; row 0: one partial SAD per 64-bit lane (8 bytes each)
+    psadbw         m3, m4                 ; row 1: one partial SAD per 64-bit lane (8 bytes each)
+    paddd          m0, m1                 ; dword add is enough: a lane grows by at most 8*255 per row, 32 rows max per accumulator
+    paddd          m5, m3                 ; separate accumulator, so the two adds do not depend on each other
+    lea            r2, [r2 + 2 * r3]      ; advance pix1 by two rows
+    lea            r0, [r0 + 2 * r1]      ; advance pix0 by two rows
+%endrep
+
+    paddd          m0, m5                 ; merge the two row accumulators
+    vextracti32x8  ym1, m0, 1             ; ym1 = upper 256 bits of m0
+    paddd          ym0, ym1               ; fold 512 -> 256 bits
+    vextracti64x2  xm1, m0, 1             ; xm1 = bits 128..255 of m0
+    paddd          xm0, xm1               ; fold 256 -> 128 bits (two qword partial sums remain)
+    pshufd         xm1, xm0, 2            ; dword 0 of xm1 = dword 2 of xm0 (low half of the upper qword)
+    paddd          xm0, xm1               ; dword 0 of xm0 = total SAD
+    movd           eax, xm0               ; return the total in eax
+    RET
+%endmacro
+
+PIXEL_SAD_W64_AVX512 16                   ; defines pixel_sad_64x16 (15 more rows than one ZMM pair: 8 unrolled row pairs)
+PIXEL_SAD_W64_AVX512 32                   ; defines pixel_sad_64x32 (16 unrolled row pairs)
+PIXEL_SAD_W64_AVX512 48                   ; defines pixel_sad_64x48 (24 unrolled row pairs)
+PIXEL_SAD_W64_AVX512 64                   ; defines pixel_sad_64x64 (32 unrolled row pairs)
+
 %endif


More information about the x265-devel mailing list