[x265] [PATCH 020 of 307] x86: AVX512 pixel_sad_x4_W64

mythreyi at multicorewareinc.com
Sat Apr 7 04:30:18 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499756069 -19800
#      Tue Jul 11 12:24:29 2017 +0530
# Node ID a32718b2358bab3f19861d8402fe9adc8e312633
# Parent  40ab4480d070fca77c35c97c7c229b25d9a98a8a
x86: AVX512 pixel_sad_x4_W64

Size    | AVX2 speed-up over C | AVX512 speed-up over C
--------------------------------------------------------
64x16   |        67.53x        |         87.52x
64x32   |        73.27x        |        100.10x
64x48   |        76.21x        |        100.98x
64x64   |        79.72x        |        102.79x
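
For reference, sad_x4 computes the SADs of one encoder block against four candidate reference blocks in a single call; below is a minimal scalar sketch of that behaviour for the 64-wide sizes this patch accelerates. The function name, the FENC_STRIDE value of 64 and the exact argument order are assumptions inferred from the register usage in the assembly, not copied from x265's headers.

    #include <cstdint>
    #include <cstdlib>

    // Hypothetical scalar reference: one 64xH encoder block (fenc) is compared
    // against four candidate reference blocks sharing one stride; the four sums
    // of absolute differences land in res[0..3].
    template<int H>
    static void sad_x4_64xH_c(const uint8_t* fenc,
                              const uint8_t* ref0, const uint8_t* ref1,
                              const uint8_t* ref2, const uint8_t* ref3,
                              intptr_t refStride, int32_t* res)
    {
        const intptr_t fencStride = 64;   // assumption: FENC_STRIDE == 64 in x265
        res[0] = res[1] = res[2] = res[3] = 0;
        for (int y = 0; y < H; y++)
        {
            for (int x = 0; x < 64; x++)
            {
                res[0] += std::abs(fenc[x] - ref0[x]);
                res[1] += std::abs(fenc[x] - ref1[x]);
                res[2] += std::abs(fenc[x] - ref2[x]);
                res[3] += std::abs(fenc[x] - ref3[x]);
            }
            fenc += fencStride;
            ref0 += refStride;
            ref1 += refStride;
            ref2 += refStride;
            ref3 += refStride;
        }
    }

The AVX512 path below does the same work one full row at a time: for 8-bit pixels a single 512-bit load covers a 64-pixel row, psadbw produces eight per-lane partial sums, and those accumulate in m0-m3 (one accumulator per reference) across the rows handled by SAD_X4_64x8_AVX512.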

diff -r 40ab4480d070 -r a32718b2358b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jul 11 10:05:30 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jul 11 12:24:29 2017 +0530
@@ -3736,6 +3736,11 @@
         p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512);
         p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512);
 
+        p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx512);
+        p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512);
+        p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512);
+        p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx512);
+
         p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_avx512);
         p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_avx512);
         p.pu[LUMA_4x16].satd = PFX(pixel_satd_4x16_avx512);
diff -r 40ab4480d070 -r a32718b2358b source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm	Tue Jul 11 10:05:30 2017 +0530
+++ b/source/common/x86/sad-a.asm	Tue Jul 11 12:24:29 2017 +0530
@@ -4128,6 +4128,315 @@
     SAD_X4_48x8_AVX2
     PIXEL_SAD_X4_END_AVX2
     RET
+
+;------------------------------------------------------------
+;sad_x4 avx512 code start
+;------------------------------------------------------------
+%macro SAD_X4_64x8_AVX512 0
+    movu            m4, [r0]
+    movu            m5, [r1]
+    movu            m6, [r2]
+    movu            m7, [r3]
+    movu            m8, [r4]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE]
+    movu            m5, [r1 + r5]
+    movu            m6, [r2 + r5]
+    movu            m7, [r3 + r5]
+    movu            m8, [r4 + r5]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 2]
+    movu            m5, [r1 + r5 * 2]
+    movu            m6, [r2 + r5 * 2]
+    movu            m7, [r3 + r5 * 2]
+    movu            m8, [r4 + r5 * 2]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 3]
+    movu            m5, [r1 + r7]
+    movu            m6, [r2 + r7]
+    movu            m7, [r3 + r7]
+    movu            m8, [r4 + r7]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    movu            m4, [r0]
+    movu            m5, [r1]
+    movu            m6, [r2]
+    movu            m7, [r3]
+    movu            m8, [r4]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE]
+    movu            m5, [r1 + r5]
+    movu            m6, [r2 + r5]
+    movu            m7, [r3 + r5]
+    movu            m8, [r4 + r5]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 2]
+    movu            m5, [r1 + r5 * 2]
+    movu            m6, [r2 + r5 * 2]
+    movu            m7, [r3 + r5 * 2]
+    movu            m8, [r4 + r5 * 2]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 3]
+    movu            m5, [r1 + r7]
+    movu            m6, [r2 + r7]
+    movu            m7, [r3 + r7]
+    movu            m8, [r4 + r7]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+%endmacro
+
+%macro PIXEL_SAD_X4_END_AVX512 0
+    vextracti32x8  ym4, m0, 1
+    vextracti32x8  ym5, m1, 1
+    vextracti32x8  ym6, m2, 1
+    vextracti32x8  ym7, m3, 1
+    paddd          ym0, ym4
+    paddd          ym1, ym5
+    paddd          ym2, ym6
+    paddd          ym3, ym7
+    vextracti64x2  xm4, m0, 1
+    vextracti64x2  xm5, m1, 1
+    vextracti64x2  xm6, m2, 1
+    vextracti64x2  xm7, m3, 1
+    paddd          xm0, xm4
+    paddd          xm1, xm5
+    paddd          xm2, xm6
+    paddd          xm3, xm7
+    pshufd         xm4, xm0, 2
+    pshufd         xm5, xm1, 2
+    pshufd         xm6, xm2, 2
+    pshufd         xm7, xm3, 2
+    paddd          xm0, xm4
+    paddd          xm1, xm5
+    paddd          xm2, xm6
+    paddd          xm3, xm7
+    movd           [r6 + 0], xm0
+    movd           [r6 + 4], xm1
+    movd           [r6 + 8], xm2
+    movd           [r6 + 12], xm3
+%endmacro
+
+INIT_ZMM avx512
+cglobal pixel_sad_x4_64x16, 7,8,10
+    pxor            m0, m0
+    pxor            m1, m1
+    pxor            m2, m2
+    pxor            m3, m3
+    lea             r7, [r5 * 3]
+
+    SAD_X4_64x8_AVX512
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+    SAD_X4_64x8_AVX512
+    PIXEL_SAD_X4_END_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_x4_64x32, 7,8,10
+    pxor            m0, m0
+    pxor            m1, m1
+    pxor            m2, m2
+    pxor            m3, m3
+    lea             r7, [r5 * 3]
+
+    SAD_X4_64x8_AVX512
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+    SAD_X4_64x8_AVX512
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+    SAD_X4_64x8_AVX512
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+    SAD_X4_64x8_AVX512
+    PIXEL_SAD_X4_END_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_x4_64x48, 7,8,10
+    pxor            m0, m0
+    pxor            m1, m1
+    pxor            m2, m2
+    pxor            m3, m3
+    lea             r7, [r5 * 3]
+
+    SAD_X4_64x8_AVX512
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+    SAD_X4_64x8_AVX512
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+    SAD_X4_64x8_AVX512
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+    SAD_X4_64x8_AVX512
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+    SAD_X4_64x8_AVX512
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+    SAD_X4_64x8_AVX512
+    PIXEL_SAD_X4_END_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_x4_64x64, 7,8,10
+    pxor            m0, m0
+    pxor            m1, m1
+    pxor            m2, m2
+    pxor            m3, m3
+    lea             r7, [r5 * 3]
+
+    SAD_X4_64x8_AVX512
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+    SAD_X4_64x8_AVX512
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+    SAD_X4_64x8_AVX512
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+    SAD_X4_64x8_AVX512
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+    SAD_X4_64x8_AVX512
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+    SAD_X4_64x8_AVX512
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+    SAD_X4_64x8_AVX512
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+    SAD_X4_64x8_AVX512
+    PIXEL_SAD_X4_END_AVX512
+    RET
+;------------------------------------------------------------
+;sad_x4 avx512 code end
+;------------------------------------------------------------
 %endif
 
 INIT_XMM sse2
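
A note on PIXEL_SAD_X4_END_AVX512: psadbw leaves eight 64-bit partial sums spread across each zmm accumulator, so the epilogue folds 512 bits down to 256, then to 128, then adds the two surviving quadword lanes before storing one dword per reference at r6. The intrinsics sketch below is illustrative only (the helper name is made up), covers a single accumulator, and uses vextracti64x4 where the assembly uses vextracti32x8, which is equivalent when no masking is involved.

    #include <immintrin.h>
    #include <cstdint>

    // Illustrative reduction of one psadbw accumulator, mirroring
    // PIXEL_SAD_X4_END_AVX512: fold 512 -> 256 -> 128 bits, then add the two
    // remaining quadword partial sums and return the low dword.
    static inline uint32_t reduce_sad_zmm(__m512i acc)
    {
        __m256i lo256 = _mm512_castsi512_si256(acc);
        __m256i hi256 = _mm512_extracti64x4_epi64(acc, 1);
        __m256i sum256 = _mm256_add_epi32(lo256, hi256);

        __m128i lo128 = _mm256_castsi256_si128(sum256);
        __m128i hi128 = _mm256_extracti128_si256(sum256, 1);
        __m128i sum128 = _mm_add_epi32(lo128, hi128);

        // pshufd xm, xm, 2 in the assembly: bring dword element 2 (the upper
        // quadword's partial sum) down to element 0, then add.
        __m128i hiq = _mm_shuffle_epi32(sum128, 2);
        sum128 = _mm_add_epi32(sum128, hiq);
        return (uint32_t)_mm_cvtsi128_si32(sum128);
    }

The real epilogue applies this reduction to m0 through m3 and writes the four results to consecutive dwords at r6, matching the int32 res[4] output of the sad_x4 primitive.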

