[x265] [PATCH 197 of 307] x86: AVX512 interp_8tap_horiz_ps_48x64 for high bit depth

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:33:15 CEST 2018


# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1511432557 -19800
#      Thu Nov 23 15:52:37 2017 +0530
# Node ID 3de51f6e7ff7d4d8d67e52f25bde15efd6b2fd57
# Parent  360960bc701dcc51e491699fc3a4a9cffc4e627f
x86: AVX512 interp_8tap_horiz_ps_48x64 for high bit depth

AVX2 performance  : 9.35x
AVX512 performance: 18.32x

diff -r 360960bc701d -r 3de51f6e7ff7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Nov 23 14:20:02 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Nov 23 15:52:37 2017 +0530
@@ -2964,6 +2964,8 @@
         p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_avx512);
         p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_avx512);
         p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_avx512);
+        //Luma_hps_48x64
+        p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512);
 
     }
 #endif
diff -r 360960bc701d -r 3de51f6e7ff7 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Thu Nov 23 14:20:02 2017 +0530
+++ b/source/common/x86/ipfilter16.asm	Thu Nov 23 15:52:37 2017 +0530
@@ -11010,6 +11010,461 @@
 IPFILTER_LUMA_PS_AVX512_16xN 64
 %endif
 
+%macro PROCESS_IPFILTER_LUMA_PS_48x4_AVX512 0
+    ; register map
+    ; m0, m1, m2, m3 - interpolate coeff
+    ; m4, m5         - shuffle load order table
+    ; m6             - INTERP_OFFSET_PS
+    ; m7             - shuffle store order table
+    ; m8 - m14       - scratch
+    ;
+    ; Emits 4 rows x 48 columns of 16-bit PS output.  Columns 0-31 of
+    ; each row take one full-zmm pass; columns 32-47 are only half a
+    ; zmm wide, so two rows are packed per pass (row N in the low
+    ; 256-bit half, row N+1 in the high half via vinserti32x8).
+    ; Each 32-column pass: three overlapping loads (+0/+8/+16 bytes)
+    ; are reshuffled by m4/m5 into the tap windows, multiplied against
+    ; the four coeff pairs m0-m3 with pmaddwd, summed, biased by m6,
+    ; shifted right by INTERP_SHIFT_PS, packed back to words and
+    ; reordered through m7 before the store.
+    ; Assumes r6 = 3 * srcStride and r7 = 3 * dstStride (set by caller).
+
+    ; row 0, cols 0-31
+    movu            m8,       [r0]
+    movu            m9,       [r0 + 8]
+    movu            m10,      [r0 + 16]
+
+    pshufb          m11,      m8,        m5
+    pshufb          m8,       m4
+    pshufb          m12,      m9,        m5
+    pshufb          m9,       m4
+    pshufb          m13,      m10,       m5
+    pshufb          m10,      m4
+
+    pmaddwd         m8,       m0
+    pmaddwd         m11,      m1
+    paddd           m8,       m11
+    pmaddwd         m11,      m12,       m3
+    pmaddwd         m14,      m9,        m2
+    paddd           m11,      m14
+    paddd           m8,       m11
+    paddd           m8,       m6
+    psrad           m8,       INTERP_SHIFT_PS
+
+    pmaddwd         m9,       m0
+    pmaddwd         m12,      m1
+    paddd           m9,       m12
+    pmaddwd         m13,      m3
+    pmaddwd         m10,      m2
+    paddd           m10,      m13
+    paddd           m9,       m10
+    paddd           m9,       m6
+    psrad           m9,       INTERP_SHIFT_PS
+
+    packssdw        m8,       m9
+    pshufb          m8,       m7
+    movu            [r2],     m8
+
+    ; row 1, cols 0-31
+    movu            m8,       [r0 + r1]
+    movu            m9,       [r0 + r1 + 8]
+    movu            m10,      [r0 + r1 + 16]
+
+    pshufb          m11,      m8,        m5
+    pshufb          m8,       m4
+    pshufb          m12,      m9,        m5
+    pshufb          m9,       m4
+    pshufb          m13,      m10,       m5
+    pshufb          m10,      m4
+
+    pmaddwd         m8,       m0
+    pmaddwd         m11,      m1
+    paddd           m8,       m11
+    pmaddwd         m11,      m12,       m3
+    pmaddwd         m14,      m9,        m2
+    paddd           m11,      m14
+    paddd           m8,       m11
+    paddd           m8,       m6
+    psrad           m8,       INTERP_SHIFT_PS
+
+    pmaddwd         m9,       m0
+    pmaddwd         m12,      m1
+    paddd           m9,       m12
+    pmaddwd         m12,      m13,       m3
+    pmaddwd         m14,      m10,       m2
+    paddd           m12,      m14
+    paddd           m9,       m12
+    paddd           m9,       m6
+    psrad           m9,       INTERP_SHIFT_PS
+
+    packssdw        m8,       m9
+    pshufb          m8,       m7
+    movu            [r2 + r3],m8
+
+    ; row 2, cols 0-31
+    movu            m8,       [r0 + 2 * r1]
+    movu            m9,       [r0 + 2 * r1 + 8]
+    movu            m10,      [r0 + 2 * r1 + 16]
+
+    pshufb          m11,      m8,        m5
+    pshufb          m8,       m4
+    pshufb          m12,      m9,        m5
+    pshufb          m9,       m4
+    pshufb          m13,      m10,       m5
+    pshufb          m10,      m4
+
+    pmaddwd         m8,       m0
+    pmaddwd         m11,      m1
+    paddd           m8,       m11
+    pmaddwd         m11,      m12,       m3
+    pmaddwd         m14,      m9,        m2
+    paddd           m11,      m14
+    paddd           m8,       m11
+    paddd           m8,       m6
+    psrad           m8,       INTERP_SHIFT_PS
+
+    pmaddwd         m9,       m0
+    pmaddwd         m12,      m1
+    paddd           m9,       m12
+    pmaddwd         m13,      m3
+    pmaddwd         m10,      m2
+    paddd           m10,      m13
+    paddd           m9,       m10
+    paddd           m9,       m6
+    psrad           m9,       INTERP_SHIFT_PS
+
+    packssdw        m8,       m9
+    pshufb          m8,       m7
+    movu            [r2 + 2 * r3],       m8
+
+    ; row 3, cols 0-31 (r6 = 3 * srcStride, r7 = 3 * dstStride)
+    movu            m8,       [r0 + r6]
+    movu            m9,       [r0 + r6 + 8]
+    movu            m10,      [r0 + r6 + 16]
+
+    pshufb          m11,      m8,        m5
+    pshufb          m8,       m4
+    pshufb          m12,      m9,        m5
+    pshufb          m9,       m4
+    pshufb          m13,      m10,       m5
+    pshufb          m10,      m4
+
+    pmaddwd         m8,       m0
+    pmaddwd         m11,      m1
+    paddd           m8,       m11
+    pmaddwd         m11,      m12,       m3
+    pmaddwd         m14,      m9,        m2
+    paddd           m11,      m14
+    paddd           m8,       m11
+    paddd           m8,       m6
+    psrad           m8,       INTERP_SHIFT_PS
+
+    pmaddwd         m9,       m0
+    pmaddwd         m12,      m1
+    paddd           m9,       m12
+    pmaddwd         m12,      m13,       m3
+    pmaddwd         m14,      m10,       m2
+    paddd           m12,      m14
+    paddd           m9,       m12
+    paddd           m9,       m6
+    psrad           m9,       INTERP_SHIFT_PS
+
+    packssdw        m8,       m9
+    pshufb          m8,       m7
+    movu            [r2 + r7],m8
+
+    ; rows 0-1, cols 32-47: row 0 in the low 256-bit half, row 1 high
+    movu            ym8,      [r0 + mmsize]
+    vinserti32x8     m8,      [r0 + r1 + mmsize],      1
+    movu            ym9,      [r0 + mmsize + 8]
+    vinserti32x8     m9,      [r0 + r1 + mmsize + 8],  1
+    movu            ym10,     [r0 + mmsize + 16]
+    vinserti32x8     m10,     [r0 + r1 + mmsize + 16], 1
+
+    pshufb          m11,      m8,        m5
+    pshufb          m8,       m4
+    pshufb          m12,      m9,        m5
+    pshufb          m9,       m4
+    pshufb          m13,      m10,       m5
+    pshufb          m10,      m4
+
+    pmaddwd         m8,       m0
+    pmaddwd         m11,      m1
+    paddd           m8,       m11
+    pmaddwd         m11,      m12,       m3
+    pmaddwd         m14,      m9,        m2
+    paddd           m11,      m14
+    paddd           m8,       m11
+    paddd           m8,       m6
+    psrad           m8,       INTERP_SHIFT_PS
+
+    pmaddwd         m9,       m0
+    pmaddwd         m12,      m1
+    paddd           m9,       m12
+    pmaddwd         m13,      m3
+    pmaddwd         m10,      m2
+    paddd           m10,      m13
+    paddd           m9,       m10
+    paddd           m9,       m6
+    psrad           m9,       INTERP_SHIFT_PS
+
+    packssdw        m8,       m9
+    pshufb          m8,       m7
+    movu            [r2 + mmsize],      ym8
+    vextracti32x8   [r2 + r3 + mmsize], m8,        1
+
+    ; rows 2-3, cols 32-47: row 2 in the low 256-bit half, row 3 high
+    movu            ym8,      [r0 + 2 * r1 + mmsize]
+    vinserti32x8     m8,      [r0 + r6 + mmsize],            1
+    movu            ym9,      [r0 + 2 * r1 + mmsize + 8]
+    vinserti32x8     m9,      [r0 + r6 + mmsize + 8],        1
+    movu            ym10,     [r0 + 2 * r1 + mmsize + 16]
+    vinserti32x8     m10,     [r0 + r6 + mmsize + 16],       1
+
+    pshufb          m11,      m8,       m5
+    pshufb          m8,       m4
+    pshufb          m12,      m9,       m5
+    pshufb          m9,       m4
+    pshufb          m13,      m10,      m5
+    pshufb          m10,      m4
+
+    pmaddwd         m8,       m0
+    pmaddwd         m11,      m1
+    paddd           m8,       m11
+    pmaddwd         m11,      m12,      m3
+    pmaddwd         m14,      m9,       m2
+    paddd           m11,      m14
+    paddd           m8,       m11
+    paddd           m8,       m6
+    psrad           m8,       INTERP_SHIFT_PS
+
+    pmaddwd         m9,       m0
+    pmaddwd         m12,      m1
+    paddd           m9,       m12
+    pmaddwd         m12,      m13,      m3
+    pmaddwd         m14,      m10,      m2
+    paddd           m12,      m14
+    paddd           m9,       m12
+    paddd           m9,       m6
+    psrad           m9,       INTERP_SHIFT_PS
+
+    packssdw        m8,       m9
+    pshufb          m8,       m7
+    movu            [r2 + 2 * r3 + mmsize],        ym8
+    vextracti32x8   [r2 + r7 + mmsize],     m8,      1
+%endmacro
+
+%macro PROCESS_IPFILTER_LUMA_PS_48x3_AVX512 0
+    ; register map (same layout as PROCESS_IPFILTER_LUMA_PS_48x4_AVX512)
+    ; m0, m1, m2, m3 - interpolate coeff
+    ; m4, m5         - shuffle load order table
+    ; m6             - INTERP_OFFSET_PS
+    ; m7             - shuffle store order table
+    ; m8 - m14       - scratch
+    ;
+    ; 3-row remainder variant for the row-extended (isRowExt) path:
+    ; emits 3 rows x 48 columns.  Cols 0-31 take one full-zmm pass per
+    ; row; cols 32-47 of rows 0-1 share one zmm (low/high halves), and
+    ; the unpaired row 2 tail is done at ymm width only.
+
+    ; row 0, cols 0-31
+    movu            m8,       [r0]
+    movu            m9,       [r0 + 8]
+    movu            m10,      [r0 + 16]
+
+    pshufb          m11,      m8,        m5
+    pshufb          m8,       m4
+    pshufb          m12,      m9,        m5
+    pshufb          m9,       m4
+    pshufb          m13,      m10,       m5
+    pshufb          m10,      m4
+
+    pmaddwd         m8,       m0
+    pmaddwd         m11,      m1
+    paddd           m8,       m11
+    pmaddwd         m11,      m12,       m3
+    pmaddwd         m14,      m9,        m2
+    paddd           m11,      m14
+    paddd           m8,       m11
+    paddd           m8,       m6
+    psrad           m8,       INTERP_SHIFT_PS
+
+    pmaddwd         m9,       m0
+    pmaddwd         m12,      m1
+    paddd           m9,       m12
+    pmaddwd         m13,      m3
+    pmaddwd         m10,      m2
+    paddd           m10,      m13
+    paddd           m9,       m10
+    paddd           m9,       m6
+    psrad           m9,       INTERP_SHIFT_PS
+
+    packssdw        m8,       m9
+    pshufb          m8,       m7
+    movu            [r2],     m8
+
+    ; row 1, cols 0-31
+    movu            m8,       [r0 + r1]
+    movu            m9,       [r0 + r1 + 8]
+    movu            m10,      [r0 + r1 + 16]
+
+    pshufb          m11,      m8,        m5
+    pshufb          m8,       m4
+    pshufb          m12,      m9,        m5
+    pshufb          m9,       m4
+    pshufb          m13,      m10,       m5
+    pshufb          m10,      m4
+
+    pmaddwd         m8,       m0
+    pmaddwd         m11,      m1
+    paddd           m8,       m11
+    pmaddwd         m11,      m12,       m3
+    pmaddwd         m14,      m9,        m2
+    paddd           m11,      m14
+    paddd           m8,       m11
+    paddd           m8,       m6
+    psrad           m8,       INTERP_SHIFT_PS
+
+    pmaddwd         m9,       m0
+    pmaddwd         m12,      m1
+    paddd           m9,       m12
+    pmaddwd         m12,      m13,       m3
+    pmaddwd         m14,      m10,       m2
+    paddd           m12,      m14
+    paddd           m9,       m12
+    paddd           m9,       m6
+    psrad           m9,       INTERP_SHIFT_PS
+
+    packssdw        m8,       m9
+    pshufb          m8,       m7
+    movu            [r2 + r3],m8
+
+    ; row 2, cols 0-31
+    movu            m8,       [r0 + 2 * r1]
+    movu            m9,       [r0 + 2 * r1 + 8]
+    movu            m10,      [r0 + 2 * r1 + 16]
+
+    pshufb          m11,      m8,        m5
+    pshufb          m8,       m4
+    pshufb          m12,      m9,        m5
+    pshufb          m9,       m4
+    pshufb          m13,      m10,       m5
+    pshufb          m10,      m4
+
+    pmaddwd         m8,       m0
+    pmaddwd         m11,      m1
+    paddd           m8,       m11
+    pmaddwd         m11,      m12,       m3
+    pmaddwd         m14,      m9,        m2
+    paddd           m11,      m14
+    paddd           m8,       m11
+    paddd           m8,       m6
+    psrad           m8,       INTERP_SHIFT_PS
+
+    pmaddwd         m9,       m0
+    pmaddwd         m12,      m1
+    paddd           m9,       m12
+    pmaddwd         m13,      m3
+    pmaddwd         m10,      m2
+    paddd           m10,      m13
+    paddd           m9,       m10
+    paddd           m9,       m6
+    psrad           m9,       INTERP_SHIFT_PS
+
+    packssdw        m8,       m9
+    pshufb          m8,       m7
+    movu            [r2 + 2 * r3],       m8
+
+    ; rows 0-1, cols 32-47: row 0 in the low 256-bit half, row 1 high
+    movu            ym8,      [r0 + mmsize]
+    vinserti32x8     m8,      [r0 + r1 + mmsize],      1
+    movu            ym9,      [r0 + mmsize + 8]
+    vinserti32x8     m9,      [r0 + r1 + mmsize + 8],  1
+    movu            ym10,     [r0 + mmsize + 16]
+    vinserti32x8     m10,     [r0 + r1 + mmsize + 16], 1
+
+    pshufb          m11,      m8,        m5
+    pshufb          m8,       m4
+    pshufb          m12,      m9,        m5
+    pshufb          m9,       m4
+    pshufb          m13,      m10,       m5
+    pshufb          m10,      m4
+
+    pmaddwd         m8,       m0
+    pmaddwd         m11,      m1
+    paddd           m8,       m11
+    pmaddwd         m11,      m12,       m3
+    pmaddwd         m14,      m9,        m2
+    paddd           m11,      m14
+    paddd           m8,       m11
+    paddd           m8,       m6
+    psrad           m8,       INTERP_SHIFT_PS
+
+    pmaddwd         m9,       m0
+    pmaddwd         m12,      m1
+    paddd           m9,       m12
+    pmaddwd         m13,      m3
+    pmaddwd         m10,      m2
+    paddd           m10,      m13
+    paddd           m9,       m10
+    paddd           m9,       m6
+    psrad           m9,       INTERP_SHIFT_PS
+
+    packssdw        m8,       m9
+    pshufb          m8,       m7
+    movu            [r2 + mmsize],      ym8
+    vextracti32x8   [r2 + r3 + mmsize], m8,        1
+
+    ; row 2, cols 32-47: no partner row, so stay at ymm width
+    movu            ym8,      [r0 + 2 * r1 + mmsize]
+    movu            ym9,      [r0 + 2 * r1 + mmsize + 8]
+    movu            ym10,     [r0 + 2 * r1 + mmsize + 16]
+
+    pshufb          ym11,      ym8,       ym5
+    pshufb          ym8,       ym4
+    pshufb          ym12,      ym9,       ym5
+    pshufb          ym9,       ym4
+    pshufb          ym13,      ym10,      ym5
+    pshufb          ym10,      ym4
+
+    pmaddwd         ym8,       ym0
+    pmaddwd         ym11,      ym1
+    paddd           ym8,       ym11
+    pmaddwd         ym11,      ym12,      ym3
+    pmaddwd         ym14,      ym9,       ym2
+    paddd           ym11,      ym14
+    paddd           ym8,       ym11
+    paddd           ym8,       ym6
+    psrad           ym8,       INTERP_SHIFT_PS
+
+    pmaddwd         ym9,       ym0
+    pmaddwd         ym12,      ym1
+    paddd           ym9,       ym12
+    pmaddwd         ym12,      ym13,      ym3
+    pmaddwd         ym14,      ym10,      ym2
+    paddd           ym12,      ym14
+    paddd           ym9,       ym12
+    paddd           ym9,       ym6
+    psrad           ym9,       INTERP_SHIFT_PS
+
+    packssdw        ym8,       ym9
+    pshufb          ym8,       ym7
+    movu            [r2 + 2 * r3 + mmsize],        ym8
+%endmacro
+
+%if ARCH_X86_64
+;-----------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_48x64(const pixel* src, intptr_t srcStride,
+;                                 int16_t* dst, intptr_t dstStride,
+;                                 int coeffIdx, int isRowExt)
+; In:  r0 = src, r1 = srcStride (pixels), r2 = dst, r3 = dstStride (pixels),
+;      r4m = coeffIdx, r5m = isRowExt
+; When isRowExt != 0, emits height + 7 rows starting 3 rows above src
+; (intermediate data for a following 8-tap vertical pass).
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal interp_8tap_horiz_ps_48x64, 4,9,15
+    add              r1d,        r1d            ; strides: pixels -> bytes (16-bit)
+    add              r3d,        r3d
+    mov              r4d,        r4m
+    mov              r5d,        r5m
+    shl              r4d,        6              ; coeffIdx * 64 = table row offset
+
+    lea              r6,         [3 * r1]       ; r6/r7 = 3 * stride, used by the
+    lea              r7,         [3 * r3]       ; row macros and the ext prologue
+%ifdef PIC
+    lea              r8,         [tab_LumaCoeffH_avx512]
+    vpbroadcastd     m0,         [r8 + r4]
+    vpbroadcastd     m1,         [r8 + r4 + 4]
+    vpbroadcastd     m2,         [r8 + r4 + 8]
+    vpbroadcastd     m3,         [r8 + r4 + 12]
+%else
+    vpbroadcastd     m0,         [tab_LumaCoeffH_avx512 + r4]
+    vpbroadcastd     m1,         [tab_LumaCoeffH_avx512 + r4 + 4]
+    vpbroadcastd     m2,         [tab_LumaCoeffH_avx512 + r4 + 8]
+    vpbroadcastd     m3,         [tab_LumaCoeffH_avx512 + r4 + 12]
+%endif
+    vbroadcasti32x8  m4,         [interp8_hpp_shuf1_load_avx512]
+    vbroadcasti32x8  m5,         [interp8_hpp_shuf2_load_avx512]
+    vbroadcasti32x8  m6,         [INTERP_OFFSET_PS]
+    vbroadcasti32x8  m7,         [interp8_hpp_shuf1_store_avx512]
+
+    sub              r0,  6                     ; back up 3 pixels for the 8-tap window
+    mov              r4d, 64                    ; row counter = height
+    test             r5d, r5d
+    jz               .loop
+    ; isRowExt: start 3 rows above and emit height + 7 rows in total.
+    ; Handle the 3-row head with the dedicated 48x3 macro so the counter
+    ; stays a multiple of 4 for the main loop (r6 still holds 3 * r1).
+    sub              r0,  r6
+    add              r4d, 7
+    PROCESS_IPFILTER_LUMA_PS_48x3_AVX512
+    lea              r0,  [r0 + r6]
+    lea              r2,  [r2 + r7]
+    sub              r4d, 3
+
+.loop:
+    PROCESS_IPFILTER_LUMA_PS_48x4_AVX512
+    lea              r0,  [r0 + 4 * r1]
+    lea              r2,  [r2 + 4 * r3]
+    sub              r4d, 4
+    jnz              .loop
+    RET
+%endif
 ;-------------------------------------------------------------------------------------------------------------
 ;avx512 luma_hps code end
 ;-------------------------------------------------------------------------------------------------------------


More information about the x265-devel mailing list