[x265] [PATCH 099 of 307] x86: AVX512 interp_4tap_horiz_pp_48x64

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:31:37 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1503911788 -19800
#      Mon Aug 28 14:46:28 2017 +0530
# Node ID a7bf0a24cfc8eb8edc95d340b240b91d03dac5bd
# Parent  45e4dd746cfd9380dbe2344a5754a6ff6e9feed5
x86: AVX512 interp_4tap_horiz_pp_48x64

AVX2 performance    : 17.53x
AVX512 performance  : 33.60x

diff -r 45e4dd746cfd -r a7bf0a24cfc8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Aug 28 13:46:50 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Aug 28 14:46:28 2017 +0530
@@ -4094,6 +4094,7 @@
         p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = PFX(interp_4tap_horiz_pp_48x64_avx512);
 
         //i422 chroma_hpp
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512);
diff -r 45e4dd746cfd -r a7bf0a24cfc8 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Mon Aug 28 13:46:50 2017 +0530
+++ b/source/common/x86/ipfilter8.asm	Mon Aug 28 14:46:28 2017 +0530
@@ -9949,6 +9949,103 @@
     vextracti32x4     [r2 + r7],     m5,       3
 %endmacro
 
+%macro PROCESS_IPFILTER_CHROMA_PP_48x4_AVX512 0
+    ; register map
+    ; m0 - interpolate coeff
+    ; m1, m2 - shuffle order table
+    ; m3 - constant word 1
+    ; m4 - constant word 512
+    movu              ym5,           [r0]
+    vinserti32x8       m5,           [r0 + r1], 1
+    movu              ym7,           [r0 + 4]
+    vinserti32x8       m7,           [r0 + r1 + 4], 1
+
+    pshufb             m6,           m5,           m2
+    pshufb             m5,           m1
+    pshufb             m8,           m7,           m2
+    pshufb             m7,           m1
+
+    pmaddubsw          m5,           m0
+    pmaddubsw          m7,           m0
+    pmaddwd            m5,           m3
+    pmaddwd            m7,           m3
+
+    pmaddubsw          m6,           m0
+    pmaddubsw          m8,           m0
+    pmaddwd            m6,           m3
+    pmaddwd            m8,           m3
+
+    packssdw           m5,           m7
+    packssdw           m6,           m8
+    pmulhrsw           m5,           m4
+    pmulhrsw           m6,           m4
+    packuswb           m5,           m6
+    movu             [r2],          ym5
+    vextracti32x8    [r2 + r3],      m5,            1
+
+    movu              ym5,           [r0 + 2 * r1]
+    vinserti32x8       m5,           [r0 + r6], 1
+    movu              ym7,           [r0 + 2 * r1 + 4]
+    vinserti32x8       m7,           [r0 + r6 + 4], 1
+
+    pshufb             m6,           m5,           m2
+    pshufb             m5,           m1
+    pshufb             m8,           m7,           m2
+    pshufb             m7,           m1
+
+    pmaddubsw          m5,           m0
+    pmaddubsw          m7,           m0
+    pmaddwd            m5,           m3
+    pmaddwd            m7,           m3
+
+    pmaddubsw          m6,           m0
+    pmaddubsw          m8,           m0
+    pmaddwd            m6,           m3
+    pmaddwd            m8,           m3
+
+    packssdw           m5,           m7
+    packssdw           m6,           m8
+    pmulhrsw           m5,           m4
+    pmulhrsw           m6,           m4
+    packuswb           m5,           m6
+    movu             [r2 + 2 * r3], ym5
+    vextracti32x8    [r2 + r7],      m5,            1
+
+    movu              xm5,           [r0 + mmsize/2]
+    vinserti32x4       m5,           [r0 + r1 + mmsize/2],            1
+    vinserti32x4       m5,           [r0 + 2 * r1 + mmsize/2],        2
+    vinserti32x4       m5,           [r0 + r6 + mmsize/2],            3
+    pshufb             m6,           m5,       m2
+    pshufb             m5,           m1
+
+    movu              xm7,           [r0 + 36]
+    vinserti32x4       m7,           [r0 + r1 + 36],        1
+    vinserti32x4       m7,           [r0 + 2 * r1 + 36],    2
+    vinserti32x4       m7,           [r0 + r6 + 36],        3
+    pshufb             m8,           m7,       m2
+    pshufb             m7,           m1
+
+    pmaddubsw          m5,           m0
+    pmaddubsw          m7,           m0
+    pmaddwd            m5,           m3
+    pmaddwd            m7,           m3
+
+    pmaddubsw          m6,           m0
+    pmaddubsw          m8,           m0
+    pmaddwd            m6,           m3
+    pmaddwd            m8,           m3
+
+    packssdw           m5,           m7
+    packssdw           m6,           m8
+    pmulhrsw           m5,           m4
+    pmulhrsw           m6,           m4
+    packuswb           m5,           m6
+    movu              [r2 + mmsize/2],          xm5
+    vextracti32x4     [r2 + r3 + mmsize/2],     m5,       1
+    vextracti32x4     [r2 + 2 * r3 + mmsize/2], m5,       2
+    vextracti32x4     [r2 + r7 + mmsize/2],     m5,       3
+%endmacro
+
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_horiz_pp_64xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx
 ;-------------------------------------------------------------------------------------------------------------
@@ -10054,6 +10151,32 @@
     IPFILTER_CHROMA_PP_16xN_AVX512 32
     IPFILTER_CHROMA_PP_16xN_AVX512 64
 
+INIT_ZMM avx512
+cglobal interp_4tap_horiz_pp_48x64, 4,8,9
+    mov               r4d,          r4m
+    lea               r6,           [3 * r1]
+    lea               r7,           [3 * r3]
+%ifdef PIC
+    lea               r5,           [tab_ChromaCoeff]
+    vpbroadcastd      m0,           [r5 + r4 * 4]
+%else
+    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
+%endif
+
+    vbroadcasti32x8   m1,           [interp4_horiz_shuf_load1_avx512]
+    vbroadcasti32x8   m2,           [interp4_horiz_shuf_load2_avx512]
+    vbroadcasti32x8   m3,           [pw_1]
+    vbroadcasti32x8   m4,           [pw_512]
+    dec               r0
+
+%rep 15
+    PROCESS_IPFILTER_CHROMA_PP_48x4_AVX512
+    lea               r2,           [r2 + 4 * r3]
+    lea               r0,           [r0 + 4 * r1]
+%endrep
+    PROCESS_IPFILTER_CHROMA_PP_48x4_AVX512
+    RET
+
 %macro PROCESS_IPFILTER_CHROMA_PS_64x1_AVX512 0
     movu               ym6,          [r0]
     vinserti32x8       m6,           [r0 + 4], 1


More information about the x265-devel mailing list