[x265] [PATCH 090 of 307] x86: AVX512 interp_8tap_horiz_pp_16xN

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:31:28 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1502617345 -19800
#      Sun Aug 13 15:12:25 2017 +0530
# Node ID d9200885420957bccd4edea62bf87bbe8831bc62
# Parent  4be3c35eb7510f269a548f248e4f5904b4107d74
x86: AVX512 interp_8tap_horiz_pp_16xN

Size  |  AVX2 performance | AVX512 performance
----------------------------------------------
16x4  |      19.10x       |      26.27x
16x8  |      19.37x       |      26.59x
16x12 |      19.99x       |      32.66x
16x16 |      19.13x       |      31.47x
16x32 |      18.94x       |      33.38x
16x64 |      18.07x       |      29.97x

diff -r 4be3c35eb751 -r d92008854209 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Sep 01 10:24:43 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Sun Aug 13 15:12:25 2017 +0530
@@ -4053,6 +4053,12 @@
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512);
 
+        p.pu[LUMA_16x4].luma_hpp = PFX(interp_8tap_horiz_pp_16x4_avx512);
+        p.pu[LUMA_16x8].luma_hpp = PFX(interp_8tap_horiz_pp_16x8_avx512);
+        p.pu[LUMA_16x12].luma_hpp = PFX(interp_8tap_horiz_pp_16x12_avx512);
+        p.pu[LUMA_16x16].luma_hpp = PFX(interp_8tap_horiz_pp_16x16_avx512);
+        p.pu[LUMA_16x32].luma_hpp = PFX(interp_8tap_horiz_pp_16x32_avx512);
+        p.pu[LUMA_16x64].luma_hpp = PFX(interp_8tap_horiz_pp_16x64_avx512);
         p.pu[LUMA_32x8].luma_hpp = PFX(interp_8tap_horiz_pp_32x8_avx512);
         p.pu[LUMA_32x16].luma_hpp = PFX(interp_8tap_horiz_pp_32x16_avx512);
         p.pu[LUMA_32x24].luma_hpp = PFX(interp_8tap_horiz_pp_32x24_avx512);
diff -r 4be3c35eb751 -r d92008854209 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Fri Sep 01 10:24:43 2017 +0530
+++ b/source/common/x86/ipfilter8.asm	Sun Aug 13 15:12:25 2017 +0530
@@ -10233,6 +10233,65 @@
     vextracti32x8     [r2 + r3], m7, 1
 %endmacro
 
+%macro PROCESS_IPFILTER_LUMA_PP_16x4_AVX512 0
+    ; filters 4 rows of 16 output bytes per expansion; register map (set up by the including function):
+    ; m0 , m1 interpolate coeff (first / second 4 bytes of the selected tab_LumaCoeff row)
+    ; m2 , m3, m4  shuffle order table (pshufb masks that arrange source bytes for pmaddubsw)
+    ; m5 - pw_1 ; r6 = 3 * r1 and r7 = 3 * r3 (row 3 offsets for source and destination)
+    ; m6 - pw_512 ; m7 - m12 are overwritten as temporaries
+
+    movu             xm7,        [r0]                    ; lane 0 = 16 source bytes of row 0
+    vinserti32x4      m7,        [r0 + r1],          1   ; lane 1 = row 1
+    vinserti32x4      m7,        [r0 + 2 * r1],      2   ; lane 2 = row 2
+    vinserti32x4      m7,        [r0 + r6],          3   ; lane 3 = row 3 (r6 = 3 * r1)
+
+    pshufb            m8,        m7,        m3           ; m8 = rows shuffled with m3
+    pshufb            m7,        m2                      ; m7 = rows shuffled with m2 (in place)
+
+    movu             xm9,        [r0 + 8]                ; same four rows again, starting 8 bytes further right
+    vinserti32x4      m9,        [r0 + r1 + 8],      1
+    vinserti32x4      m9,        [r0 + 2 * r1 + 8],  2
+    vinserti32x4      m9,        [r0 + r6 + 8],      3
+
+    pshufb            m10,       m9,        m3           ; three shuffled views of the +8 data
+    pshufb            m11,       m9,        m4
+    pshufb            m9,        m2                      ; done last because it overwrites m9
+
+    pmaddubsw         m7,        m0                      ; source bytes x first coefficient half -> word sums
+    pmaddubsw         m12,       m8,        m1           ; source bytes x second coefficient half
+    pmaddwd           m7,        m5                      ; m5 = pw_1: if every word is 1 this adds adjacent words into dwords
+    pmaddwd           m12,       m5
+    paddd             m7,        m12                     ; m7 = first 32-bit result group
+
+    pmaddubsw         m8,        m0                      ; same pattern: m8 x m0 + m9 x m1
+    pmaddubsw         m12,       m9,        m1
+    pmaddwd           m8,        m5
+    pmaddwd           m12,       m5
+    paddd             m8,        m12                     ; m8 = second 32-bit result group
+
+    pmaddubsw         m9,        m0                      ; same pattern: m9 x m0 + m10 x m1
+    pmaddubsw         m12,       m10,       m1
+    pmaddwd           m9,        m5
+    pmaddwd           m12,       m5
+    paddd             m9,        m12                     ; m9 = third 32-bit result group
+
+    pmaddubsw         m10,       m0                      ; same pattern: m10 x m0 + m11 x m1
+    pmaddubsw         m12,      m11,        m1
+    pmaddwd           m10,      m5
+    pmaddwd           m12,      m5
+    paddd             m10,      m12                      ; m10 = fourth 32-bit result group
+
+    packssdw          m7,       m8                       ; dwords -> words with signed saturation, per 128-bit lane
+    packssdw          m9,       m10
+    pmulhrsw          m7,       m6                       ; m6 = pw_512: with 512 in each word this is a rounded >> 6
+    pmulhrsw          m9,       m6
+    packuswb          m7,       m9                       ; words -> bytes with unsigned saturation, 16 bytes per lane
+    movu              [r2],         xm7                  ; lane 0 -> destination row 0
+    vextracti32x4     [r2 + r3],     m7,    1            ; lane 1 -> row 1
+    vextracti32x4     [r2 + 2 * r3], m7,    2            ; lane 2 -> row 2
+    vextracti32x4     [r2 + r7],     m7,    3            ; lane 3 -> row 3 (r7 = 3 * r3)
+%endmacro
+
 %macro IPFILTER_LUMA_64xN_AVX512 1
 INIT_ZMM avx512
 cglobal interp_8tap_horiz_pp_64x%1, 4,6,13
@@ -10299,6 +10358,43 @@
 IPFILTER_LUMA_32xN_AVX512 24
 IPFILTER_LUMA_32xN_AVX512 32
 IPFILTER_LUMA_32xN_AVX512 64
+
+%macro IPFILTER_LUMA_16xN_AVX512 1
+INIT_ZMM avx512
+cglobal interp_8tap_horiz_pp_16x%1, 4,8,14 ; %1 = block height; NOTE(review): body references only m0-m12, so 14 vector regs may be one more than needed
+    sub               r0,    3                           ; start reading 3 bytes before r0 (presumably the left part of the 8-tap window)
+    mov               r4d,   r4m                         ; r4 = 5th argument, used below as the tab_LumaCoeff row index
+    lea               r6,    [3 * r1]                    ; source offset of row 3, used by the 16x4 macro
+    lea               r7,    [3 * r3]                    ; destination offset of row 3, used by the 16x4 macro
+%ifdef PIC
+    lea               r5,        [tab_LumaCoeff]         ; PIC build: take the table address first, then index from r5
+    vpbroadcastd      m0,        [r5 + r4 * 8]           ; rows are 8 bytes apart; m0 = first 4 bytes of the row, broadcast
+    vpbroadcastd      m1,        [r5 + r4 * 8 + 4]       ; m1 = last 4 bytes of the row, broadcast
+%else
+    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]     ; non-PIC build: same loads with the table address in the operand
+    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
+%endif
+    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512] ; 32-byte shuffle mask repeated in both zmm halves
+    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512] ; note the order: m3 = load3, m4 = load2
+    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512] ; NOTE(review): tables carry interp4_ names but are used by this 8-tap kernel
+    vpbroadcastd      m5,        [pw_1]                  ; 4 bytes of pw_1 broadcast to every dword
+    vbroadcasti32x8   m6,        [pw_512]                ; reads 32 bytes of pw_512 (unlike the 4-byte pw_1 load above)
+
+%rep %1/4 -1
+    PROCESS_IPFILTER_LUMA_PP_16x4_AVX512                 ; unrolled %1/4 - 1 times; each expansion handles 4 rows
+    lea               r0,        [r0 + 4 * r1]           ; advance source by 4 rows
+    lea               r2,        [r2 + 4 * r3]           ; advance destination by 4 rows
+%endrep
+    PROCESS_IPFILTER_LUMA_PP_16x4_AVX512                 ; final 4 rows; no pointer advance is needed afterwards
+    RET
+%endmacro
+
+IPFILTER_LUMA_16xN_AVX512 4                              ; one function per height; %1/4 is an integer division, so heights must be multiples of 4
+IPFILTER_LUMA_16xN_AVX512 8
+IPFILTER_LUMA_16xN_AVX512 12
+IPFILTER_LUMA_16xN_AVX512 16
+IPFILTER_LUMA_16xN_AVX512 32
+IPFILTER_LUMA_16xN_AVX512 64
 ;-------------------------------------------------------------------------------------------------------------
 ;ipfilter_luma_avx512 code end
 ;-------------------------------------------------------------------------------------------------------------
\ No newline at end of file


More information about the x265-devel mailing list