[x265] [PATCH 100 of 307] x86: AVX512 interp_8tap_horiz_pp_48x64

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:31:38 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1503912578 -19800
#      Mon Aug 28 14:59:38 2017 +0530
# Node ID 562c00d2153193eec85ab907b60eeb5aca7cc609
# Parent  a7bf0a24cfc8eb8edc95d340b240b91d03dac5bd
x86: AVX512 interp_8tap_horiz_pp_48x64

AVX2 performance    : 19.57x
AVX512 performance  : 35.25x

diff -r a7bf0a24cfc8 -r 562c00d21531 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Aug 28 14:46:28 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Aug 28 14:59:38 2017 +0530
@@ -4159,6 +4159,7 @@
         p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_avx512);
         p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx512);
         p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_avx512);
+        p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx512);
 
         p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx512);
         p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx512);
diff -r a7bf0a24cfc8 -r 562c00d21531 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Mon Aug 28 14:46:28 2017 +0530
+++ b/source/common/x86/ipfilter8.asm	Mon Aug 28 14:59:38 2017 +0530
@@ -10489,6 +10489,151 @@
     vextracti32x4     [r2 + r7],     m7,    3
 %endmacro
 
+%macro PROCESS_IPFILTER_LUMA_PP_48x4_AVX512 0
+    ; register map
+    ; m0 , m1 interpolate coeff
+    ; m2 , m3, m4  shuffle order table
+    ; m5 - pw_1
+    ; m6 - pw_512
+
+    movu             ym7,        [r0]
+    vinserti32x8      m7,        [r0 + r1], 1
+    movu             ym9,        [r0 + 8]
+    vinserti32x8      m9,        [r0 + r1 + 8], 1
+
+    pshufb            m8,        m7,        m3
+    pshufb            m7,        m2
+    pshufb            m10,       m9,        m3
+    pshufb            m11,       m9,        m4
+    pshufb            m9,        m2
+
+    pmaddubsw         m7,        m0
+    pmaddubsw         m12,       m8,        m1
+    pmaddwd           m7,        m5
+    pmaddwd           m12,       m5
+    paddd             m7,        m12
+
+    pmaddubsw         m8,        m0
+    pmaddubsw         m12,       m9,        m1
+    pmaddwd           m8,        m5
+    pmaddwd           m12,       m5
+    paddd             m8,        m12
+
+    pmaddubsw         m9,        m0
+    pmaddubsw         m12,       m10,       m1
+    pmaddwd           m9,        m5
+    pmaddwd           m12,       m5
+    paddd             m9,        m12
+
+    pmaddubsw         m10,       m0
+    pmaddubsw         m12,      m11,        m1
+    pmaddwd           m10,      m5
+    pmaddwd           m12,      m5
+    paddd             m10,      m12
+
+    packssdw          m7,       m8
+    packssdw          m9,       m10
+    pmulhrsw          m7,       m6
+    pmulhrsw          m9,       m6
+    packuswb          m7,       m9
+    movu              [r2],     ym7
+    vextracti32x8     [r2 + r3], m7, 1
+
+    movu             ym7,        [r0 + 2 * r1]
+    vinserti32x8      m7,        [r0 + r6],          1
+    movu             ym9,        [r0 + 2 * r1 + 8]
+    vinserti32x8      m9,        [r0 + r6 + 8],      1
+
+    pshufb            m8,        m7,        m3
+    pshufb            m7,        m2
+    pshufb            m10,       m9,        m3
+    pshufb            m11,       m9,        m4
+    pshufb            m9,        m2
+
+    pmaddubsw         m7,        m0
+    pmaddubsw         m12,       m8,        m1
+    pmaddwd           m7,        m5
+    pmaddwd           m12,       m5
+    paddd             m7,        m12
+
+    pmaddubsw         m8,        m0
+    pmaddubsw         m12,       m9,        m1
+    pmaddwd           m8,        m5
+    pmaddwd           m12,       m5
+    paddd             m8,        m12
+
+    pmaddubsw         m9,        m0
+    pmaddubsw         m12,       m10,       m1
+    pmaddwd           m9,        m5
+    pmaddwd           m12,       m5
+    paddd             m9,        m12
+
+    pmaddubsw         m10,       m0
+    pmaddubsw         m12,      m11,        m1
+    pmaddwd           m10,      m5
+    pmaddwd           m12,      m5
+    paddd             m10,      m12
+
+    packssdw          m7,       m8
+    packssdw          m9,       m10
+    pmulhrsw          m7,       m6
+    pmulhrsw          m9,       m6
+    packuswb          m7,       m9
+    movu              [r2 + 2 * r3],     ym7
+    vextracti32x8     [r2 + r7],          m7,    1
+
+    movu             xm7,        [r0 + mmsize/2]
+    vinserti32x4      m7,        [r0 + r1 + mmsize/2],          1
+    vinserti32x4      m7,        [r0 + 2 * r1 + mmsize/2],      2
+    vinserti32x4      m7,        [r0 + r6 + mmsize/2],          3
+
+    pshufb            m8,        m7,        m3
+    pshufb            m7,        m2
+
+    movu             xm9,        [r0 + 40]
+    vinserti32x4      m9,        [r0 + r1 + 40],      1
+    vinserti32x4      m9,        [r0 + 2 * r1 + 40],  2
+    vinserti32x4      m9,        [r0 + r6 + 40],      3
+
+    pshufb            m10,       m9,        m3
+    pshufb            m11,       m9,        m4
+    pshufb            m9,        m2
+
+    pmaddubsw         m7,        m0
+    pmaddubsw         m12,       m8,        m1
+    pmaddwd           m7,        m5
+    pmaddwd           m12,       m5
+    paddd             m7,        m12
+
+    pmaddubsw         m8,        m0
+    pmaddubsw         m12,       m9,        m1
+    pmaddwd           m8,        m5
+    pmaddwd           m12,       m5
+    paddd             m8,        m12
+
+    pmaddubsw         m9,        m0
+    pmaddubsw         m12,       m10,       m1
+    pmaddwd           m9,        m5
+    pmaddwd           m12,       m5
+    paddd             m9,        m12
+
+    pmaddubsw         m10,       m0
+    pmaddubsw         m12,      m11,        m1
+    pmaddwd           m10,      m5
+    pmaddwd           m12,      m5
+    paddd             m10,      m12
+
+    packssdw          m7,       m8
+    packssdw          m9,       m10
+    pmulhrsw          m7,       m6
+    pmulhrsw          m9,       m6
+    packuswb          m7,       m9
+    movu              [r2 + mmsize/2],         xm7
+    vextracti32x4     [r2 + r3 + mmsize/2],     m7,    1
+    vextracti32x4     [r2 + 2 * r3 + mmsize/2], m7,    2
+    vextracti32x4     [r2 + r7 + mmsize/2],     m7,    3
+%endmacro
+
 %macro IPFILTER_LUMA_64xN_AVX512 1
 INIT_ZMM avx512
 cglobal interp_8tap_horiz_pp_64x%1, 4,6,13
@@ -10592,6 +10737,34 @@
 IPFILTER_LUMA_16xN_AVX512 16
 IPFILTER_LUMA_16xN_AVX512 32
 IPFILTER_LUMA_16xN_AVX512 64
+
+INIT_ZMM avx512
+cglobal interp_8tap_horiz_pp_48x64, 4,8,14
+    sub               r0,    3
+    mov               r4d,   r4m
+    lea               r6,    [3 * r1]
+    lea               r7,    [3 * r3]
+%ifdef PIC
+    lea               r5,        [tab_LumaCoeff]
+    vpbroadcastd      m0,        [r5 + r4 * 8]
+    vpbroadcastd      m1,        [r5 + r4 * 8 + 4]
+%else
+    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]
+    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
+%endif
+    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512]
+    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512]
+    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512]
+    vpbroadcastd      m5,        [pw_1]
+    vbroadcasti32x8   m6,        [pw_512]
+
+%rep 15
+    PROCESS_IPFILTER_LUMA_PP_48x4_AVX512
+    lea               r0,        [r0 + 4 * r1]
+    lea               r2,        [r2 + 4 * r3]
+%endrep
+    PROCESS_IPFILTER_LUMA_PP_48x4_AVX512
+    RET
 ;-------------------------------------------------------------------------------------------------------------
 ;ipfilter_luma_avx512 code end
 ;-------------------------------------------------------------------------------------------------------------
\ No newline at end of file
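
For context, here is a minimal scalar C sketch of what this primitive computes (not x265's actual C reference; the function name, coefficient table layout and clip helper below are written out only for illustration). The 48x64 pel-to-pel horizontal luma filter applies the 8-tap HEVC interpolation kernel selected by coeffIdx across each row, then rounds by 32 and shifts right by 6 before clipping to 8 bits. That rounding step is what the pmulhrsw against pw_512 does in the assembly, since (x * 512 + 2^14) >> 15 == (x + 32) >> 6.

    /* Scalar sketch of the 48x64 8-tap horizontal luma pp filter.
     * Names here are illustrative, not x265's reference code.
     * The coefficient rows mirror the HEVC luma filters (tab_LumaCoeff). */
    #include <stdint.h>

    static const int8_t luma_coeff[4][8] = {
        {  0, 0,   0, 64,  0,   0, 0,  0 },
        { -1, 4, -10, 58, 17,  -5, 1,  0 },
        { -1, 4, -11, 40, 40, -11, 4, -1 },
        {  0, 1,  -5, 17, 58, -10, 4, -1 },
    };

    static uint8_t clip_u8(int v)
    {
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* src must be readable 3 pixels left and 4 pixels right of each row,
     * matching the "sub r0, 3" adjustment in the assembly entry code. */
    static void interp_8tap_horiz_pp_48x64_ref(const uint8_t* src, intptr_t srcStride,
                                               uint8_t* dst, intptr_t dstStride,
                                               int coeffIdx)
    {
        const int8_t* c = luma_coeff[coeffIdx];
        for (int y = 0; y < 64; y++)
        {
            for (int x = 0; x < 48; x++)
            {
                int sum = 0;
                for (int k = 0; k < 8; k++)
                    sum += c[k] * src[x + k - 3];    /* taps centred 3 left of x */
                dst[x] = clip_u8((sum + 32) >> 6);   /* round/shift, as pmulhrsw with pw_512 */
            }
            src += srcStride;
            dst += dstStride;
        }
    }

The AVX512 macro above produces the same results four rows at a time: the first 32 columns of each row come from the zmm loads at offsets 0 and 8, and the remaining 16 columns from the xmm loads at mmsize/2 and 40.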

