[x265] [PATCH 059 of 307] [x265-avx512]x86: AVX512 pixel_sad_48x64 for high bit depth

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:57 CEST 2018


# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1501674408 -19800
#      Wed Aug 02 17:16:48 2017 +0530
# Node ID 55ed1898de6bd2b8688aa8f1f7b29ae35f674ab4
# Parent  585b35cf6baad20d1cd5fb760d88ad2fbd99e63f
[x265-avx512]x86: AVX512 pixel_sad_48x64 for high bit depth

AVX2 performance   : 29.55x
AVX512 performance : 40.07x

diff -r 585b35cf6baa -r 55ed1898de6b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Aug 02 17:16:48 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Aug 02 17:16:48 2017 +0530
@@ -2272,6 +2272,7 @@
         p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512);
         p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512);
         p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512);
+        p.pu[LUMA_48x64].sad = PFX(pixel_sad_48x64_avx512);
         p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512);
         p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512);
         p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512);
diff -r 585b35cf6baa -r 55ed1898de6b source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm	Wed Aug 02 17:16:48 2017 +0530
+++ b/source/common/x86/sad16-a.asm	Wed Aug 02 17:16:48 2017 +0530
@@ -1986,6 +1986,111 @@
     PROCESS_SAD_AVX512_END
     RET
 
+;-----------------------------------------------------------------------------
+; int pixel_sad_48x64( uint16_t *, intptr_t, uint16_t *, intptr_t )
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512 ; 512-bit vector mode: mmsize is 64 bytes, i.e. 32 uint16_t samples per register
+cglobal pixel_sad_48x64, 4, 7, 9 ; x86inc: 4 args, 7 GPRs (r0-r6), 9 vector regs (m0-m8)
+    pxor    m0,  m0                     ; m0 = running sum, accumulated as dwords
+    mov     r6d, 64/8                   ; r6d = loop count; each iteration covers 8 rows
+    ; r0/r1 = first pointer and its stride, r2/r3 = second pointer and its stride
+    vbroadcasti32x8 m8, [pw_1]          ; m8 = pmaddwd multiplier; pw_1 is defined elsewhere (presumably words of 1)
+    ; NOTE(review): strides are scaled with 32-bit ops, which zero-extend; assumes non-negative strides small enough that 3 * byte stride fits in 32 bits - confirm against callers
+    add     r3d, r3d                    ; stride in uint16_t samples -> stride in bytes
+    add     r1d, r1d                    ; same conversion for the r0 stride
+    lea     r4d, [r1 * 3]               ; r4 = 3 * byte stride of r0
+    lea     r5d, [r3 * 3]               ; r5 = 3 * byte stride of r2
+.loop:
+    movu            m1,  [r2]                       ; row 0, samples 0-31
+    movu            m2,  [r2 + r3]                  ; row 1, samples 0-31
+    movu           ym3,  [r2 + mmsize]              ; low 256 bits: row 0, samples 32-47
+    vinserti32x8    m3,  [r2 + r3 + mmsize], 1      ; high 256 bits: row 1, samples 32-47
+    movu            m4,  [r0]                       ; same layout, loaded from r0
+    movu            m5,  [r0 + r1]
+    movu           ym6,  [r0 + mmsize]
+    vinserti32x8    m6,  [r0 + r1 + mmsize], 1
+    ; per-lane |r2 sample - r0 sample| for rows 0-1, kept in 16-bit lanes
+    psubw   m1, m4
+    psubw   m2, m5
+    psubw   m3, m6
+    pabsw   m1, m1
+    pabsw   m2, m2
+    pabsw   m3, m3
+    paddw   m1, m2
+    paddw   m7, m3, m1                  ; m7 = word sums for rows 0-1 (three differences per lane)
+    ; rows 2-3: same pattern at offsets 2 * stride and 3 * stride
+    movu            m1,  [r2 + 2 * r3]
+    movu            m2,  [r2 + r5]
+    movu           ym3,  [r2 + 2 * r3 + mmsize]
+    vinserti32x8    m3,  [r2 + r5 + mmsize], 1
+    movu            m4,  [r0 + 2 * r1]
+    movu            m5,  [r0 + r4]
+    movu           ym6,  [r0 + 2 * r1 + mmsize]
+    vinserti32x8    m6,  [r0 + r4 + mmsize], 1
+    psubw   m1, m4
+    psubw   m2, m5
+    psubw   m3, m6
+    pabsw   m1, m1
+    pabsw   m2, m2
+    pabsw   m3, m3
+    paddw   m1, m2
+    paddw   m1, m3                      ; m1 = word sums for rows 2-3
+    ; widen the word sums to dwords (pmaddwd against m8 adds adjacent word pairs) and accumulate into m0
+    pmaddwd m7, m8
+    paddd   m0, m7
+    pmaddwd m1, m8
+    paddd   m0, m1
+    lea     r0, [r0 + 4 * r1]           ; advance both pointers by 4 rows
+    lea     r2, [r2 + 4 * r3]
+    ; second half of the iteration: identical processing for the next 4 rows
+    movu            m1,  [r2]
+    movu            m2,  [r2 + r3]
+    movu           ym3,  [r2 + mmsize]
+    vinserti32x8    m3,  [r2 + r3 + mmsize], 1
+    movu            m4,  [r0]
+    movu            m5,  [r0 + r1]
+    movu           ym6,  [r0 + mmsize]
+    vinserti32x8    m6,  [r0 + r1 + mmsize], 1
+    ; absolute differences for the first two rows of this half
+    psubw   m1, m4
+    psubw   m2, m5
+    psubw   m3, m6
+    pabsw   m1, m1
+    pabsw   m2, m2
+    pabsw   m3, m3
+    paddw   m1, m2
+    paddw   m7, m3, m1                  ; m7 = word sums for these two rows
+    ; last two rows of this half, at 2 * stride and 3 * stride
+    movu            m1,  [r2 + 2 * r3]
+    movu            m2,  [r2 + r5]
+    movu           ym3,  [r2 + 2 * r3 + mmsize]
+    vinserti32x8    m3,  [r2 + r5 + mmsize], 1
+    movu            m4,  [r0 + 2 * r1]
+    movu            m5,  [r0 + r4]
+    movu           ym6,  [r0 + 2 * r1 + mmsize]
+    vinserti32x8    m6,  [r0 + r4 + mmsize], 1
+    psubw   m1, m4
+    psubw   m2, m5
+    psubw   m3, m6
+    pabsw   m1, m1
+    pabsw   m2, m2
+    pabsw   m3, m3
+    paddw   m1, m2
+    paddw   m1, m3                      ; m1 = word sums for these two rows
+    ; NOTE(review): each word lane holds a sum of three differences before pmaddwd treats it as signed; assumes sample values are small enough for that - TODO confirm bit depth
+    pmaddwd m7, m8
+    paddd   m0, m7
+    pmaddwd m1, m8
+    paddd   m0, m1
+    lea     r0, [r0 + 4 * r1]           ; advance both pointers by another 4 rows
+    lea     r2, [r2 + 4 * r3]
+    ; 8 rows consumed; loop until the counter set from 64/8 reaches zero
+    dec     r6d
+    jg      .loop
+    ; m0 now holds the per-lane dword partial sums for the whole block
+    PROCESS_SAD_AVX512_END              ; macro defined elsewhere; presumably reduces m0 to the scalar return value - TODO confirm
+    RET
+
 ;=============================================================================
 ; SAD x3/x4
 ;=============================================================================


More information about the x265-devel mailing list