[x265] [PATCH 131 of 307] x86: AVX512 interp_4tap_horiz_ps_48x64

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:32:09 CEST 2018


# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1504168830 -19800
#      Thu Aug 31 14:10:30 2017 +0530
# Node ID 7f92fdd23823946026a4f55cb8c0f252cd658d07
# Parent  857c53de924aea24ec7ba208ffc5981c347c1852
x86: AVX512 interp_4tap_horiz_ps_48x64

AVX2 Performance  : 27.81x
AVX512 Performance: 29.93x

diff -r 857c53de924a -r 7f92fdd23823 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Aug 31 13:53:58 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Aug 31 14:10:30 2017 +0530
@@ -4442,6 +4442,7 @@
         p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_avx512);
 
         p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hps = PFX(interp_4tap_horiz_ps_48x64_avx512);
     }
 #endif
 }
diff -r 857c53de924a -r 7f92fdd23823 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Thu Aug 31 13:53:58 2017 +0530
+++ b/source/common/x86/ipfilter8.asm	Thu Aug 31 14:10:30 2017 +0530
@@ -10670,6 +10670,83 @@
     IPFILTER_CHROMA_PS_16xN_AVX512 8
     IPFILTER_CHROMA_PS_16xN_AVX512 4
 
+%macro PROCESS_IPFILTER_CHROMA_PS_48x1_AVX512 0 ; filter one row: reads bytes at r0, stores 48 words at r2; needs m0-m4, m8, m9 preset; clobbers m6, m7
+    movu               ym6,          [r0]                ; low 256 bits  = 32 source bytes at r0
+    vinserti32x8       m6,           [r0 + 4], 1         ; high 256 bits = 32 source bytes at r0 + 4
+    pshufb             m7,           m6,       m2        ; m7 = source bytes arranged by the load2 shuffle table
+    pshufb             m6,           m6,       m1        ; m6 = source bytes arranged by the load1 shuffle table
+    pmaddubsw          m6,           m0                  ; unsigned source bytes * signed coeff bytes, adjacent pairs summed to words
+    pmaddubsw          m7,           m0
+    pmaddwd            m6,           m3                  ; m3 holds pw_1, so this sums adjacent word pairs into dwords
+    pmaddwd            m7,           m3
+
+    packssdw           m6,           m7                  ; saturate dwords to words; per 128-bit lane: m6 results low, m7 results high
+    psubw              m6,           m4                  ; subtract the pw_2000 constant from every word
+    vpermq             m6,           m8,       m6        ; reorder qwords by index vector m8 (presumably restores linear pixel order)
+    movu               [r2],         m6                  ; store first 32 output words (64 bytes)
+
+    movu              xm6,          [r0 + 32]            ; remaining columns: 16 source bytes at r0 + 32
+    vinserti32x4      m6,           [r0 + 36], 1         ; 128-bit lane 1 = 16 source bytes at r0 + 36; lanes 2-3 are not read below
+    pshufb            ym7,          ym6,       ym2       ; same filter sequence as above, at 256-bit width
+    pshufb            ym6,          ym6,       ym1
+    pmaddubsw         ym6,          ym0
+    pmaddubsw         ym7,          ym0
+    pmaddwd           ym6,          ym3
+    pmaddwd           ym7,          ym3
+
+    packssdw          ym6,          ym7
+    psubw             ym6,          ym4
+    vpermq            ym6,          ym9,       ym6       ; reorder qwords by the low 256 bits of m9
+    movu              [r2 + mmsize],ym6                  ; store last 16 output words right after the first store (mmsize bytes in)
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_ps_48xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;-------------------------------------------------------------------------------------------------------------
+%macro IPFILTER_CHROMA_PS_48xN_AVX512 1 ; %1 = block height N in rows
+INIT_ZMM avx512
+cglobal interp_4tap_horiz_ps_48x%1, 4,7,10 ; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride; 7 GPRs, 10 vector regs (m0-m9)
+    mov             r4d, r4m                             ; r4d = coeffIdx (5th argument)
+    mov             r5d, r5m                             ; r5d = isRowExt (6th argument)
+
+%ifdef PIC
+    lea               r6,           [tab_ChromaCoeff]    ; PIC: fetch the table address first so it can be indexed by a register
+    vpbroadcastd      m0,           [r6 + r4 * 4]        ; broadcast the 4-byte table entry at index coeffIdx
+%else
+    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
+%endif
+
+    vbroadcasti32x8    m1,           [interp4_horiz_shuf_load1_avx512]
+    vbroadcasti32x8    m2,           [interp4_horiz_shuf_load2_avx512]
+    vbroadcasti32x8    m3,           [pw_1]
+    vbroadcasti32x8    m4,           [pw_2000]
+    mova               m8,           [interp4_hps_shuf_avx512]
+    mova               m9,           [interp4_hps_store_16xN_avx512] ; NOTE(review): full 64-byte load, only ym9 is used - confirm table size
+
+    ; register map (m6, m7 are scratch inside PROCESS_IPFILTER_CHROMA_PS_48x1_AVX512)
+    ; m0    - interpolate coeff
+    ; m1,m2 - load shuffle order table
+    ; m3    - constant word 1
+    ; m4    - constant word 2000
+    ; m8,m9 - store shuffle order tables (m8: 32-word store, m9: 16-word store)
+
+    mov               r6d,         %1                    ; r6d = rows left to process
+    dec               r0                                 ; start one byte to the left of src
+    test              r5d,         r5d
+    je                .loop                              ; isRowExt == 0: process exactly %1 rows
+    sub               r0,          r1                    ; row extension: start one row above src
+    add               r6d,         3                     ; ... and process 3 additional rows
+
+.loop:
+    PROCESS_IPFILTER_CHROMA_PS_48x1_AVX512
+    lea               r2,           [r2 + 2 * r3]        ; next dst row: dstStride counts int16_t elements (2 bytes each)
+    lea               r0,           [r0 + r1]            ; next src row: advance by srcStride bytes
+    dec               r6d
+    jnz               .loop
+    RET
+%endmacro
+
+    IPFILTER_CHROMA_PS_48xN_AVX512 64 ; emits interp_4tap_horiz_ps_48x64
 ;-------------------------------------------------------------------------------------------------------------
 ;ipfilter_chroma_avx512 code end
 ;-------------------------------------------------------------------------------------------------------------


More information about the x265-devel mailing list