[x265] [PATCH 130 of 307] x86: AVX512 interp_8tap_horiz_ps_48x64

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:32:08 CEST 2018


# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1504167838 -19800
#      Thu Aug 31 13:53:58 2017 +0530
# Node ID 857c53de924aea24ec7ba208ffc5981c347c1852
# Parent  be9e1235a7e3ddc6d4de0b38d0253edf1726fa6d
x86: AVX512 interp_8tap_horiz_ps_48x64

AVX2 Performance  : 14.25x
AVX512 Performance: 20.81x

diff -r be9e1235a7e3 -r 857c53de924a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Aug 30 17:56:16 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Aug 31 13:53:58 2017 +0530
@@ -4440,6 +4440,8 @@
         p.pu[LUMA_16x4].luma_hps = PFX(interp_8tap_horiz_ps_16x4_avx512);
         p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_avx512);
         p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_avx512);
+
+        p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512);
     }
 #endif
 }
diff -r be9e1235a7e3 -r 857c53de924a source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Wed Aug 30 17:56:16 2017 +0530
+++ b/source/common/x86/ipfilter8.asm	Thu Aug 31 13:53:58 2017 +0530
@@ -11394,6 +11394,104 @@
 IPFILTER_LUMA_PS_8TAP_16xN_AVX512 16
 IPFILTER_LUMA_PS_8TAP_16xN_AVX512 32
 IPFILTER_LUMA_PS_8TAP_16xN_AVX512 64
+
+%macro PROCESS_IPFILTER_LUMA_PS_48x1_AVX512 0
+    ; Filters one row: reads source bytes at r0 and stores 48 16-bit results at r2 (64 bytes, then 32 bytes).
+    ; m0 , m1     - interpolate coeff: first and second dword of the selected tab_LumaCoeff row (set by the caller)
+    ; m2 , m3, m4 - load shuffle order tables: load1, load3 and load2 respectively (note the m3/m4 order)
+    ; m5          - pw_1 (presumably words of 1, so pmaddwd just sums adjacent word pairs into dwords)
+    ; m6          - pw_2000 (subtracted from the packed 16-bit results)
+    ; m7          - store shuffle order table (qword indices for vpermq); m8-m11 are scratch
+    ; first 64-byte store: low 256 bits of m8 = 32 bytes at r0, high 256 bits = 32 bytes at r0 + 8
+    movu              ym8,           [r0]
+    vinserti32x8      m8,            [r0 + 8],            1
+    pshufb            m9,            m8,                  m3      ; m9  = source bytes reordered by m3
+    pshufb            m10,           m8,                  m4      ; m10 = source bytes reordered by m4
+    pshufb            m8,             m2                          ; m8  = source bytes reordered by m2
+    ; accumulator A (m8): dword sums of (m2-ordered bytes x m0) + (m3-ordered bytes x m1)
+    pmaddubsw         m8,            m0
+    pmaddubsw         m11,           m9,                  m1
+    pmaddwd           m8,            m5
+    pmaddwd           m11,           m5
+    paddd             m8,            m11
+    ; accumulator B (m9): dword sums of (m3-ordered bytes x m0) + (m4-ordered bytes x m1)
+    pmaddubsw         m9,            m0
+    pmaddubsw         m11,           m10,                 m1
+    pmaddwd           m9,            m5
+    pmaddwd           m11,           m5
+    paddd             m9,            m11
+    ; narrow A and B to signed words, subtract m6, then permute qwords before the store
+    packssdw          m8,            m9
+    psubw             m8,            m6
+    vpermq            m8,            m7,                  m8      ; m7 supplies the indices; presumably restores column order - TODO confirm against interp8_hps_store_avx512
+    movu              [r2],          m8
+    ; remaining 32-byte store: same steps at ymm width on 16 bytes from r0 + 32 and 16 bytes from r0 + 40
+    movu              ym8,           [r0 + 32]                    ; NOTE(review): touches bytes up to r0 + 63 although bits 255:128 are replaced next - confirm source padding
+    vinserti32x4      m8,            [r0 + 40],           1       ; bits 255:128 = 16 bytes at r0 + 40
+    pshufb            ym9,           ym8,                 ym3
+    pshufb            ym10,           ym8,                ym4
+    pshufb            ym8,            ym2
+    ; accumulator A (ym8): (m2-ordered bytes x m0) + (m3-ordered bytes x m1)
+    pmaddubsw         ym8,            ym0
+    pmaddubsw         ym11,           ym9,                ym1
+    pmaddwd           ym8,            ym5
+    pmaddwd           ym11,           ym5
+    paddd             ym8,            ym11
+    ; accumulator B (ym9): (m3-ordered bytes x m0) + (m4-ordered bytes x m1)
+    pmaddubsw         ym9,            ym0
+    pmaddubsw         ym11,           ym10,               ym1
+    pmaddwd           ym9,            ym5
+    pmaddwd           ym11,           ym5
+    paddd             ym9,            ym11
+    ; narrow, subtract m6 and store; no vpermq is applied on this path
+    packssdw          ym8,            ym9
+    psubw             ym8,            ym6
+    movu              [r2 + mmsize],  ym8                         ; byte offset mmsize (64 under INIT_ZMM in x86inc) = result 32 onwards
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_48xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;-------------------------------------------------------------------------------------------------------------
+%macro IPFILTER_LUMA_PS_48xN_AVX512 1
+INIT_ZMM avx512
+cglobal interp_8tap_horiz_ps_48x%1, 4,7,12
+    mov               r4d,   r4m                                  ; r4d = fifth argument (coeffIdx in the prototype above)
+    mov               r5d,   r5m                                  ; r5d = sixth argument (isRowExt in the prototype above)
+    ; m0 / m1 = first / second dword of the table row at tab_LumaCoeff + coeffIdx * 8, broadcast to every dword
+%ifdef PIC
+    lea               r6,        [tab_LumaCoeff]
+    vpbroadcastd      m0,        [r6 + r4 * 8]
+    vpbroadcastd      m1,        [r6 + r4 * 8 + 4]
+%else
+    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]
+    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
+%endif
+    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512]
+    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512] ; note: m3 takes load3 and m4 takes load2
+    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512]
+    vpbroadcastd      m5,        [pw_1]
+    vbroadcasti32x8   m6,        [pw_2000]
+    mova              m7,        [interp8_hps_store_avx512]
+    ; coeffIdx is no longer needed, so r4d is reused as the row counter, starting at the block height %1
+    mov               r4d,       %1
+    sub               r0,        3                                ; src -= 3 bytes (presumably the left reach of the 8-tap window)
+    test              r5d,       r5d
+    jz                .loop                                       ; isRowExt == 0: filter exactly %1 rows
+    lea               r6,        [r1 * 3]
+    sub               r0,        r6                           ; src -= 3 * srcStride (start three rows higher)
+    add               r4d,       7                            ; rows += N - 1, with N = 8 taps
+    ; each iteration filters one 48-wide row; r4d = rows still to process
+.loop:
+    PROCESS_IPFILTER_LUMA_PS_48x1_AVX512
+    lea               r0,        [r0 + r1]                        ; src += srcStride
+    lea               r2,        [r2 + 2 * r3]                    ; dst += dstStride int16_t elements (2 bytes each)
+    dec               r4d
+    jnz               .loop
+    RET
+%endmacro
+    ; emit the 48x64 variant: interp_8tap_horiz_ps_48x64
+IPFILTER_LUMA_PS_48xN_AVX512 64
+
 ;-------------------------------------------------------------------------------------------------------------
 ;ipfilter_luma_avx512 code end
 ;-------------------------------------------------------------------------------------------------------------
\ No newline at end of file


More information about the x265-devel mailing list