[x265] [PATCH 242 of 307] [x265-avx512]x86: AVX512 sad_16x32 and sad_16x64 for high bit depth

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:34:00 CEST 2018


# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1512625055 -19800
#      Thu Dec 07 11:07:35 2017 +0530
# Node ID 931dd781dc0c6de76bb31d0215db7a7af885f9bf
# Parent  9bd38bd06850914d1cbf617063ea0e1e60f66219
[x265-avx512]x86: AVX512 sad_16x32 and sad_16x64 for high bit depth

Size  |  AVX2 performance | AVX512 performance
----------------------------------------------
16x32 |      15.49x       |      16.89x
16x64 |      16.46x       |      17.84x

diff -r 9bd38bd06850 -r 931dd781dc0c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Dec 07 10:25:21 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Dec 07 11:07:35 2017 +0530
@@ -2434,6 +2434,8 @@
 
         p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
 
+        p.pu[LUMA_16x32].sad = PFX(pixel_sad_16x32_avx512);
+        p.pu[LUMA_16x64].sad = PFX(pixel_sad_16x64_avx512);
         p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512);
         p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512);
         p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512);
diff -r 9bd38bd06850 -r 931dd781dc0c source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm	Thu Dec 07 10:25:21 2017 +0530
+++ b/source/common/x86/sad16-a.asm	Thu Dec 07 11:07:35 2017 +0530
@@ -1277,6 +1277,46 @@
     paddd   m0, m1
 %endmacro
 
+%macro PROCESS_SAD_16x8_AVX512 0 ; add the SAD of 8 rows (32 bytes each) at r0 vs r2 into the dword lanes of m0; uses m1-m5 as scratch
+    movu            ym1, [r2]                   ; r2 row 0 -> low 256 bits of m1
+    vinserti64x4     m1, [r2 + r3],  1          ; r2 row 1 -> high 256 bits of m1 (r3 = r2 row pitch in bytes)
+    movu            ym2, [r2 + 2 * r3]          ; r2 row 2 -> low half of m2
+    vinserti64x4     m2, [r2 + r5],  1          ; r2 row 3 -> high half; the callers in this patch set r5 = 3 * r3
+    movu            ym3, [r0]                   ; r0 row 0 -> low half of m3
+    vinserti64x4     m3, [r0 + r1],  1          ; r0 row 1 -> high half (r1 = r0 row pitch in bytes)
+    movu            ym4, [r0 + 2 * r1]          ; r0 row 2 -> low half of m4
+    vinserti64x4     m4, [r0 + r4],  1          ; r0 row 3 -> high half; the callers in this patch set r4 = 3 * r1
+
+    psubw   m1, m3                              ; per-word differences, rows 0-1
+    psubw   m2, m4                              ; per-word differences, rows 2-3
+    pabsw   m1, m1                              ; absolute value of each word difference
+    pabsw   m2, m2
+    paddw   m5, m1, m2                          ; m5 = word-wide partial sums for rows 0-3; NOTE(review): still 16-bit, assumes the samples are narrow enough not to overflow a word - confirm max bit depth
+
+    lea     r0, [r0 + 4 * r1]                   ; step to rows 4-7; this is the only pointer advance in the macro,
+    lea     r2, [r2 + 4 * r3]                   ; so r0/r2 leave 4 rows (not 8) past their entry values
+
+    movu            ym1, [r2]                   ; r2 row 4 -> low half of m1
+    vinserti64x4     m1, [r2 + r3],  1          ; r2 row 5 -> high half
+    movu            ym2, [r2 + 2 * r3]          ; r2 row 6
+    vinserti64x4     m2, [r2 + r5],  1          ; r2 row 7
+    movu            ym3, [r0]                   ; r0 row 4
+    vinserti64x4     m3, [r0 + r1],  1          ; r0 row 5
+    movu            ym4, [r0 + 2 * r1]          ; r0 row 6
+    vinserti64x4     m4, [r0 + r4],  1          ; r0 row 7
+
+    psubw   m1, m3                              ; per-word differences, rows 4-5
+    psubw   m2, m4                              ; per-word differences, rows 6-7
+    pabsw   m1, m1                              ; absolute value of each word difference
+    pabsw   m2, m2
+    paddw   m1, m2                              ; m1 = word-wide partial sums for rows 4-7
+
+    pmaddwd m5, m6                              ; multiply words by m6 and add adjacent pairs into dwords; callers load m6 from pw_1 (presumably all words = 1, so this only widens - verify)
+    paddd   m0, m5                              ; accumulate rows 0-3 into the running dword totals
+    pmaddwd m1, m6                              ; same widening for rows 4-7
+    paddd   m0, m1                              ; accumulate rows 4-7
+%endmacro
+
 %macro PROCESS_SAD_AVX512_END 0
     vextracti32x8  ym1, m0, 1
     paddd          ym0, ym1
@@ -1523,6 +1563,51 @@
 %endif
 
 ;-----------------------------------------------------------------------------
+; int pixel_sad_16x32 / pixel_sad_16x64( uint16_t *, intptr_t, uint16_t *, intptr_t )
+;-----------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_ZMM avx512
+cglobal pixel_sad_16x32, 4,6,7                  ; SAD over 32 rows; uses r0-r5 and m0-m6 (r0/r2 = the two pixel pointers, r1/r3 = their strides, per the signature comment)
+    pxor    m0, m0                              ; m0 = running dword accumulator, start at zero
+
+    vbroadcasti32x8 m6, [pw_1]                  ; replicate the 32 bytes at pw_1 into both 256-bit halves; m6 is the pmaddwd multiplier inside the row macro
+
+    add     r3d, r3d                            ; double the stride (presumably pixels -> bytes for 16-bit samples - confirm)
+    add     r1d, r1d                            ; NOTE(review): 32-bit ops zero the upper half of the register, so strides are assumed non-negative and < 2^31 after doubling
+    lea     r4d, [r1 * 3]                       ; r4 = 3 * r1, offset of row 3 for the r0 buffer
+    lea     r5d, [r3 * 3]                       ; r5 = 3 * r3, offset of row 3 for the r2 buffer
+
+    %rep 3                                      ; 3 unrolled groups here + 1 final group below = 4 * 8 = 32 rows
+        PROCESS_SAD_16x8_AVX512
+        lea            r2, [r2 + 4 * r3]        ; the macro itself advances only 4 rows;
+        lea            r0, [r0 + 4 * r1]        ; add the other 4 to reach the next 8-row group
+    %endrep
+    PROCESS_SAD_16x8_AVX512                     ; last 8 rows; no pointer advance needed afterwards
+    PROCESS_SAD_AVX512_END                      ; reduction of m0 (macro only partly visible here; presumably leaves the total in the return register - verify)
+    RET
+
+INIT_ZMM avx512
+cglobal pixel_sad_16x64, 4,6,7                  ; SAD over 64 rows; uses r0-r5 and m0-m6 (r0/r2 = the two pixel pointers, r1/r3 = their strides, per the signature comment)
+   pxor    m0, m0                               ; m0 = running dword accumulator, start at zero
+
+    vbroadcasti32x8 m6, [pw_1]                  ; replicate the 32 bytes at pw_1 into both 256-bit halves; m6 is the pmaddwd multiplier inside the row macro
+
+    add     r3d, r3d                            ; double the stride (presumably pixels -> bytes for 16-bit samples - confirm)
+    add     r1d, r1d                            ; NOTE(review): 32-bit ops zero the upper half of the register, so strides are assumed non-negative and < 2^31 after doubling
+    lea     r4d, [r1 * 3]                       ; r4 = 3 * r1, offset of row 3 for the r0 buffer
+    lea     r5d, [r3 * 3]                       ; r5 = 3 * r3, offset of row 3 for the r2 buffer
+
+    %rep 7                                      ; 7 unrolled groups here + 1 final group below = 8 * 8 = 64 rows
+        PROCESS_SAD_16x8_AVX512
+        lea            r2, [r2 + 4 * r3]        ; the macro itself advances only 4 rows;
+        lea            r0, [r0 + 4 * r1]        ; add the other 4 to reach the next 8-row group
+    %endrep
+    PROCESS_SAD_16x8_AVX512                     ; last 8 rows; no pointer advance needed afterwards
+    PROCESS_SAD_AVX512_END                      ; reduction of m0 (macro only partly visible here; presumably leaves the total in the return register - verify)
+    RET
+%endif
+
+;-----------------------------------------------------------------------------
 ; int pixel_sad_48x64( uint16_t *, intptr_t, uint16_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 %if ARCH_X86_64


More information about the x265-devel mailing list