[x265] [PATCH 058 of 307] [x265-avx512]x86: AVX512 pixel_sad_48x64 for high bit depth

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:56 CEST 2018


# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1501674408 -19800
#      Wed Aug 02 17:16:48 2017 +0530
# Node ID 585b35cf6baad20d1cd5fb760d88ad2fbd99e63f
# Parent  b355ac2912dd111b96dbb5893b34405863e7382f
[x265-avx512]x86: AVX512 pixel_sad_48x64 for high bit depth

AVX2 performance   : 29.55x
AVX512 performance : 40.07x

diff -r b355ac2912dd -r 585b35cf6baa source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Aug 02 11:28:32 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Aug 02 17:16:48 2017 +0530
@@ -2261,6 +2261,7 @@
         p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512);
         p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512);
         p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512);
+        p.pu[LUMA_48x64].sad = PFX(pixel_sad_48x64_avx512);
         p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512);
         p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512);
         p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512);
diff -r b355ac2912dd -r 585b35cf6baa source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm	Wed Aug 02 11:28:32 2017 +0530
+++ b/source/common/x86/sad16-a.asm	Wed Aug 02 17:16:48 2017 +0530
@@ -1881,6 +1881,111 @@
     PROCESS_SAD_AVX512_END
     RET
 
+;-----------------------------------------------------------------------------
+; int pixel_sad_48x64( uint16_t *, intptr_t, uint16_t *, intptr_t )
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512                       ; assemble the following with 512-bit m# registers
+cglobal pixel_sad_48x64, 4, 7, 9      ; 4 args (r0..r3), 7 GPRs (r0..r6), 9 vector regs (m0..m8)
+    pxor    m0,  m0                   ; m0 = running accumulator of dword partial sums
+    mov     r6d, 64/8                 ; loop counter: 64 rows, 8 rows handled per iteration
+
+    vbroadcasti32x8 m8, [pw_1]        ; pw_1 is defined elsewhere; presumably words of 1 (multiplier for pmaddwd below)
+
+    add     r3d, r3d                  ; r3 = stride of the r2 buffer, doubled (uint16_t elements -> bytes)
+    add     r1d, r1d                  ; r1 = stride of the r0 buffer, doubled; NOTE(review): 32-bit writes zero-extend, so this assumes non-negative strides - confirm with callers
+    lea     r4d, [r1 * 3]             ; r4 = 3 * byte stride of r0 buffer (offset of row 3)
+    lea     r5d, [r3 * 3]             ; r5 = 3 * byte stride of r2 buffer (offset of row 3)
+.loop:
+    movu            m1,  [r2]                       ; r2 row 0, first 64 bytes (pixels 0-31)
+    movu            m2,  [r2 + r3]                  ; r2 row 1, pixels 0-31
+    movu           ym3,  [r2 + mmsize]              ; r2 row 0, remaining 32 bytes (pixels 32-47) into low half
+    vinserti32x8    m3,  [r2 + r3 + mmsize], 1      ; r2 row 1, pixels 32-47 into high half
+    movu            m4,  [r0]                       ; same layout for the r0 buffer: row 0, pixels 0-31
+    movu            m5,  [r0 + r1]                  ; r0 row 1, pixels 0-31
+    movu           ym6,  [r0 + mmsize]              ; r0 row 0, pixels 32-47
+    vinserti32x8    m6,  [r0 + r1 + mmsize], 1      ; r0 row 1, pixels 32-47
+
+    psubw   m1, m4                    ; per-word differences (r2 - r0) ...
+    psubw   m2, m5
+    psubw   m3, m6
+    pabsw   m1, m1                    ; ... turned into absolute differences
+    pabsw   m2, m2
+    pabsw   m3, m3
+    paddw   m1, m2                    ; combine rows 0 and 1 (pixels 0-31) as words
+    paddw   m7, m3, m1                ; m7 = word sums for rows 0-1; kept in m7 because m1-m6 are reloaded next
+
+    movu            m1,  [r2 + 2 * r3]              ; rows 2-3: same load pattern as rows 0-1
+    movu            m2,  [r2 + r5]
+    movu           ym3,  [r2 + 2 * r3 + mmsize]
+    vinserti32x8    m3,  [r2 + r5 + mmsize], 1
+    movu            m4,  [r0 + 2 * r1]
+    movu            m5,  [r0 + r4]
+    movu           ym6,  [r0 + 2 * r1 + mmsize]
+    vinserti32x8    m6,  [r0 + r4 + mmsize], 1
+    psubw   m1, m4
+    psubw   m2, m5
+    psubw   m3, m6
+    pabsw   m1, m1
+    pabsw   m2, m2
+    pabsw   m3, m3
+    paddw   m1, m2
+    paddw   m1, m3                    ; m1 = word sums for rows 2-3
+
+    pmaddwd m7, m8                    ; multiply words by m8 and add adjacent pairs -> dwords (widening, if pw_1 is all ones)
+    paddd   m0, m7                    ; accumulate rows 0-1
+    pmaddwd m1, m8
+    paddd   m0, m1                    ; accumulate rows 2-3
+    lea     r0, [r0 + 4 * r1]         ; advance both pointers by 4 rows
+    lea     r2, [r2 + 4 * r3]
+
+    movu            m1,  [r2]                       ; second half of the iteration: next 4 rows, identical sequence
+    movu            m2,  [r2 + r3]
+    movu           ym3,  [r2 + mmsize]
+    vinserti32x8    m3,  [r2 + r3 + mmsize], 1
+    movu            m4,  [r0]
+    movu            m5,  [r0 + r1]
+    movu           ym6,  [r0 + mmsize]
+    vinserti32x8    m6,  [r0 + r1 + mmsize], 1
+
+    psubw   m1, m4
+    psubw   m2, m5
+    psubw   m3, m6
+    pabsw   m1, m1
+    pabsw   m2, m2
+    pabsw   m3, m3
+    paddw   m1, m2
+    paddw   m7, m3, m1                ; m7 = word sums for the first two rows of this group
+
+    movu            m1,  [r2 + 2 * r3]
+    movu            m2,  [r2 + r5]
+    movu           ym3,  [r2 + 2 * r3 + mmsize]
+    vinserti32x8    m3,  [r2 + r5 + mmsize], 1
+    movu            m4,  [r0 + 2 * r1]
+    movu            m5,  [r0 + r4]
+    movu           ym6,  [r0 + 2 * r1 + mmsize]
+    vinserti32x8    m6,  [r0 + r4 + mmsize], 1
+    psubw   m1, m4
+    psubw   m2, m5
+    psubw   m3, m6
+    pabsw   m1, m1
+    pabsw   m2, m2
+    pabsw   m3, m3
+    paddw   m1, m2
+    paddw   m1, m3                    ; m1 = word sums for the last two rows of this group
+
+    pmaddwd m7, m8
+    paddd   m0, m7
+    pmaddwd m1, m8
+    paddd   m0, m1
+    lea     r0, [r0 + 4 * r1]         ; advance both pointers by another 4 rows (8 per iteration in total)
+    lea     r2, [r2 + 4 * r3]
+
+    dec     r6d                       ; one 8-row group done
+    jg      .loop                     ; repeat while the counter is still positive
+
+    PROCESS_SAD_AVX512_END            ; macro defined elsewhere; presumably reduces m0 to the scalar return value - confirm
+    RET
+
 ;=============================================================================
 ; SAD x3/x4
 ;=============================================================================


More information about the x265-devel mailing list