[x265] [PATCH 161 of 307] x86: AVX512 interp_4tap_horiz_ps_8xN for high bit depth

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:32:39 CEST 2018


# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1509605321 -19800
#      Thu Nov 02 12:18:41 2017 +0530
# Node ID f4cd489d06cfbbf66c6f0f7dc684606c80615c5e
# Parent  b45e6581fdf689f4b17e4e0dae733a7df795de07
x86: AVX512 interp_4tap_horiz_ps_8xN for high bit depth

Color Space i420
Size  |  AVX2 performance  | AVX512 performance
----------------------------------------------
8x4   |      21.70x        |      35.85x
8x8   |      22.91x        |      33.85x
8x16  |      20.03x        |      39.94x
8x32  |      22.08x        |      47.20x


Color Space i422
Size  |  AVX2 performance  | AVX512 performance
----------------------------------------------
8x4   |      21.37x        |       35.57x
8x8   |      21.69x        |      33.72x
8x12  |      19.50x        |      37.37x
8x16  |      21.05x        |      39.71x
8x32  |      22.01x        |      47.10x
8x64  |      23.50x        |      47.07x

Color Space i444
Size  |  AVX2 performance  | AVX512 performance
----------------------------------------------
8x4   |      21.94x        |      35.60x
8x8   |      21.62x        |      35.04x
8x16  |      20.17x        |      39.90x
8x32  |      22.03x        |      46.81x

diff -r b45e6581fdf6 -r f4cd489d06cf source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Nov 13 22:43:09 2017 -0800
+++ b/source/common/x86/asm-primitives.cpp	Thu Nov 02 12:18:41 2017 +0530
@@ -2778,6 +2778,23 @@
 
         p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hps = PFX(interp_4tap_horiz_ps_48x64_avx512);
 
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hps = PFX(interp_4tap_horiz_ps_8x8_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_hps = PFX(interp_4tap_horiz_ps_8x4_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_hps = PFX(interp_4tap_horiz_ps_8x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_hps = PFX(interp_4tap_horiz_ps_8x32_avx512);
+
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_hps = PFX(interp_4tap_horiz_ps_8x8_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_hps = PFX(interp_4tap_horiz_ps_8x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_hps = PFX(interp_4tap_horiz_ps_8x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_hps = PFX(interp_4tap_horiz_ps_8x12_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_hps = PFX(interp_4tap_horiz_ps_8x64_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_hps = PFX(interp_4tap_horiz_ps_8x4_avx512);
+
+        p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_hps = PFX(interp_4tap_horiz_ps_8x8_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_hps = PFX(interp_4tap_horiz_ps_8x4_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_hps = PFX(interp_4tap_horiz_ps_8x16_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_hps = PFX(interp_4tap_horiz_ps_8x32_avx512);
+
     }
 #endif
 }
diff -r b45e6581fdf6 -r f4cd489d06cf source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Mon Nov 13 22:43:09 2017 -0800
+++ b/source/common/x86/ipfilter16.asm	Thu Nov 02 12:18:41 2017 +0530
@@ -8146,6 +8146,132 @@
 FILTER_VER_PS_CHROMA_64xN_AVX512 48
 FILTER_VER_PS_CHROMA_64xN_AVX512 64
 %endif
+
+%macro PROCESS_IPFILTER_CHROMA_PS_8x4_AVX512 0
+    ; register map
+    ; m0 , m1 - interpolate coeff
+    ; m2 , m3 - shuffle load order table
+    ; m4      - INTERP_OFFSET_PS
+    ; m5      - shuffle store order table
+
+    movu            xm6,       [r0]
+    vinserti32x4    m6,        [r0 + r1],      1
+    vinserti32x4    m6,        [r0 + 2 * r1],  2
+    vinserti32x4    m6,        [r0 + r6],      3
+
+    pshufb          m8,        m6,        m3
+    pshufb          m6,        m2
+    pmaddwd         m6,        m0
+    pmaddwd         m8,        m1
+    paddd           m6,        m8
+    paddd           m6,        m4
+    psrad           m6,        INTERP_SHIFT_PS
+
+    movu            xm7,       [r0 + 8]
+    vinserti32x4    m7,        [r0 + r1 + 8],      1
+    vinserti32x4    m7,        [r0 + 2 * r1 + 8],  2
+    vinserti32x4    m7,        [r0 + r6 + 8],      3
+
+    pshufb          m8,        m7,        m3
+    pshufb          m7,        m2
+    pmaddwd         m7,        m0
+    pmaddwd         m8,        m1
+    paddd           m7,        m8
+    paddd           m7,        m4
+    psrad           m7,        INTERP_SHIFT_PS
+
+    packssdw        m6,        m7
+    pshufb          m6,        m5
+    movu            [r2],      xm6
+    vextracti32x4   [r2 + r3],     m6,        1
+    vextracti32x4   [r2 + 2 * r3], m6,        2
+    vextracti32x4   [r2 + r7],     m6,        3
+%endmacro
+
+%macro PROCESS_IPFILTER_CHROMA_PS_8x3_AVX512 0
+    movu            xm6,       [r0]
+    vinserti32x4    m6,        [r0 + r1],      1
+    vinserti32x4    m6,        [r0 + 2 * r1],  2
+
+    pshufb          m8,        m6,        m3
+    pshufb          m6,        m2
+    pmaddwd         m6,        m0
+    pmaddwd         m8,        m1
+    paddd           m6,        m8
+    paddd           m6,        m4
+    psrad           m6,        INTERP_SHIFT_PS
+
+    movu            xm7,       [r0 + 8]
+    vinserti32x4    m7,        [r0 + r1 + 8],      1
+    vinserti32x4    m7,        [r0 + 2 * r1 + 8],  2
+
+    pshufb          m8,        m7,        m3
+    pshufb          m7,        m2
+    pmaddwd         m7,        m0
+    pmaddwd         m8,        m1
+    paddd           m7,        m8
+    paddd           m7,        m4
+    psrad           m7,        INTERP_SHIFT_PS
+
+    packssdw        m6,        m7
+    pshufb          m6,        m5
+    movu            [r2],      xm6
+    vextracti32x4   [r2 + r3],     m6,        1
+    vextracti32x4   [r2 + 2 * r3], m6,        2
+%endmacro
+
+%macro IPFILTER_CHROMA_PS_AVX512_8xN 1
+INIT_ZMM avx512
+cglobal interp_4tap_horiz_ps_8x%1, 4,9,9
+    add             r1d, r1d
+    add             r3d, r3d
+    mov             r4d, r4m
+    mov             r5d, r5m
+
+    lea             r6, [3 * r1]
+    lea             r7, [3 * r3]
+%ifdef PIC
+    lea             r8, [tab_ChromaCoeff]
+    vpbroadcastd    m0, [r8 + r4 * 8]
+    vpbroadcastd    m1, [r8 + r4 * 8 + 4]
+%else
+    vpbroadcastd    m0, [tab_ChromaCoeff + r4 * 8]
+    vpbroadcastd    m1, [tab_ChromaCoeff + r4 * 8 + 4]
+%endif
+    vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
+    vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
+    vbroadcasti32x4 m4, [INTERP_OFFSET_PS]
+    vbroadcasti32x8 m5, [interp8_hpp_shuf1_store_avx512]
+
+    mov               r8d,         %1
+    sub               r0,          2
+    test              r5d,         r5d
+    jz                .loop
+    sub               r0,          r1
+    add               r8d,         3
+    PROCESS_IPFILTER_CHROMA_PS_8x3_AVX512
+    lea               r0,  [r0 + r6]
+    lea               r2,  [r2 + r7]
+    sub               r8d, 3
+
+.loop:
+    PROCESS_IPFILTER_CHROMA_PS_8x4_AVX512
+    lea             r0,  [r0 + 4 * r1]
+    lea             r2,  [r2 + 4 * r3]
+    sub             r8d, 4
+    jnz             .loop
+    RET
+%endmacro
+
+%if ARCH_X86_64
+IPFILTER_CHROMA_PS_AVX512_8xN 4
+IPFILTER_CHROMA_PS_AVX512_8xN 8
+IPFILTER_CHROMA_PS_AVX512_8xN 12
+IPFILTER_CHROMA_PS_AVX512_8xN 16
+IPFILTER_CHROMA_PS_AVX512_8xN 32
+IPFILTER_CHROMA_PS_AVX512_8xN 64
+%endif
+
 ;-------------------------------------------------------------------------------------------------------------
 ;ipfilter_chroma_avx512 code end
 ;-------------------------------------------------------------------------------------------------------------


More information about the x265-devel mailing list