[x265] [PATCH 127 of 307] x86: AVX512 interp_8tap_horiz_ps_32xN

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:32:05 CEST 2018


# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1503469803 -19800
#      Wed Aug 23 12:00:03 2017 +0530
# Node ID 7527c103cbe87811ec9e380a00d8a8605b761377
# Parent  f85a7562c867c13e7f99540ef72117d62f13a956
x86: AVX512 interp_8tap_horiz_ps_32xN

Size  |  AVX2 performance | AVX512 performance
----------------------------------------------
32x8  |      14.24x       |      26.69x
32x16 |      14.63x       |      25.46x
32x24 |      14.98x       |      25.57x
32x32 |      15.02x       |      25.92x
32x64 |      15.36x       |      26.75x

diff -r f85a7562c867 -r 7527c103cbe8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Aug 28 15:42:39 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Aug 23 12:00:03 2017 +0530
@@ -4408,6 +4408,12 @@
         p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_avx512);
         p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_avx512);
         p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_avx512);
+
+        p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_avx512);
+        p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_avx512);
+        p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_avx512);
+        p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_avx512);
+        p.pu[LUMA_32x8].luma_hps = PFX(interp_8tap_horiz_ps_32x8_avx512);
     }
 #endif
 }
diff -r f85a7562c867 -r 7527c103cbe8 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Mon Aug 28 15:42:39 2017 +0530
+++ b/source/common/x86/ipfilter8.asm	Wed Aug 23 12:00:03 2017 +0530
@@ -11114,6 +11114,82 @@
 IPFILTER_LUMA_PS_64xN_AVX512 32
 IPFILTER_LUMA_PS_64xN_AVX512 48
 IPFILTER_LUMA_PS_64xN_AVX512 64
+
+%macro PROCESS_IPFILTER_LUMA_PS_32x1_AVX512 0
+    ; one row: reads 40 bytes at [r0], writes 64 bytes (32 words) to [r2]; m0-m7 are read only, m8-m11 are scratch
+    ; m0 , m1     - interpolate coeff
+    ; m2 , m3, m4 - load shuffle order table
+    ; m5          - pw_1
+    ; m6          - pw_2000
+    ; m7          - store shuffle order table
+
+    movu              ym8,           [r0]                              ; low 256 bits of m8  = 32 bytes at [r0]
+    vinserti32x8      m8,            [r0 + 8],            1            ; high 256 bits of m8 = 32 bytes at [r0 + 8]
+    pshufb            m9,            m8,                  m3           ; m9  = bytes of m8 rearranged by table m3
+    pshufb            m10,           m8,                  m4           ; m10 = bytes of m8 rearranged by table m4
+    pshufb            m8,             m2                               ; m8  = bytes of m8 rearranged by table m2 (last, it overwrites the loaded row)
+
+    pmaddubsw         m8,            m0                                ; bytes of m8 times bytes of m0, adjacent products summed into words
+    pmaddubsw         m11,           m9,                  m1           ; m11 = same operation on m9 with m1
+    pmaddwd           m8,            m5                                ; times pw_1: adjacent words summed into dwords
+    pmaddwd           m11,           m5                                ; likewise for the m1 part
+    paddd             m8,            m11                               ; m8 = m0 part + m1 part (first set of dword sums)
+
+    pmaddubsw         m9,            m0                                ; m9 is reused, this time multiplied by m0
+    pmaddubsw         m11,           m10,                 m1           ; m11 = m10 multiplied by m1
+    pmaddwd           m9,            m5                                ; adjacent words summed into dwords
+    pmaddwd           m11,           m5                                ; likewise for the m1 part
+    paddd             m9,            m11                               ; m9 = m0 part + m1 part (second set of dword sums)
+
+    packssdw          m8,            m9                                ; narrow the dwords of m8 and m9 to words with signed saturation
+    psubw             m8,            m6                                ; subtract m6 (pw_2000) from every word
+    vpermq            m8,            m7,                  m8           ; reorder the 64-bit groups of m8 using the indices in m7
+    movu              [r2],          m8                                ; unaligned 64-byte store of the finished row
+%endmacro
+
+%macro IPFILTER_LUMA_PS_32xN_AVX512 1
+INIT_ZMM avx512
+cglobal interp_8tap_horiz_ps_32x%1, 4,7,12
+    mov               r4d,   r4m                                      ; r4d = 5th argument, used below as index into tab_LumaCoeff
+    mov               r5d,   r5m                                      ; r5d = 6th argument, tested below to choose the extended-row path
+
+%ifdef PIC
+    lea               r6,        [tab_LumaCoeff]                      ; table base goes in a register so it can be combined with the r4 index
+    vpbroadcastd      m0,        [r6 + r4 * 8]                        ; m0 = bytes 0..3 of the 8-byte entry r4, replicated to every dword
+    vpbroadcastd      m1,        [r6 + r4 * 8 + 4]                    ; m1 = bytes 4..7 of the same entry, replicated to every dword
+%else
+    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]             ; m0 = bytes 0..3 of the 8-byte entry r4, replicated to every dword
+    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]         ; m1 = bytes 4..7 of the same entry, replicated to every dword
+%endif
+    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512]    ; m2 = load1 table (32 bytes, copied to both 256-bit halves)
+    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512]    ; m3 = load3 table (note: m3/m4 take load3/load2, in that order)
+    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512]    ; m4 = load2 table
+    vpbroadcastd      m5,        [pw_1]                               ; m5 = first dword at pw_1, replicated to every dword
+    vbroadcasti32x8   m6,        [pw_2000]                            ; m6 = 32 bytes at pw_2000, copied to both 256-bit halves
+    mova              m7,        [interp8_hps_store_avx512]           ; m7 = 64-byte index table for the final vpermq (aligned load)
+
+    mov               r4d,       %1                                   ; r4d is now the row counter, starting at the block height
+    sub               r0,        3                                    ; move the source pointer 3 bytes back
+    test              r5d,       r5d                                  ; is the 6th argument zero?
+    jz                .loop                                           ; zero: process only the block's own rows
+    lea               r6,        [r1 * 3]                             ; r6 = 3 * r1 (r1 is the per-row source step used in the loop)
+    sub               r0,        r6                                   ; r0 -= 3 * r1: start three source rows earlier
+    add               r4d,       7                                    ; row count += 7 (N - 1; presumably N = 8 filter taps)
+
+.loop:
+    PROCESS_IPFILTER_LUMA_PS_32x1_AVX512                              ; filter one row from [r0] into [r2]
+    lea               r0,        [r0 + r1]                            ; next source row: advance by r1 bytes
+    lea               r2,        [r2 + 2 * r3]                        ; next output row: advance by 2 * r3 bytes (presumably r3 counts 16-bit elements)
+    dec               r4d                                             ; one row done
+    jnz               .loop                                           ; repeat until the row counter reaches zero
+    RET
+%endmacro
+
+IPFILTER_LUMA_PS_32xN_AVX512 8                                        ; defines interp_8tap_horiz_ps_32x8  (bound to LUMA_32x8 luma_hps)
+IPFILTER_LUMA_PS_32xN_AVX512 16                                       ; defines interp_8tap_horiz_ps_32x16 (bound to LUMA_32x16 luma_hps)
+IPFILTER_LUMA_PS_32xN_AVX512 24                                       ; defines interp_8tap_horiz_ps_32x24 (bound to LUMA_32x24 luma_hps)
+IPFILTER_LUMA_PS_32xN_AVX512 32                                       ; defines interp_8tap_horiz_ps_32x32 (bound to LUMA_32x32 luma_hps)
+IPFILTER_LUMA_PS_32xN_AVX512 64                                       ; defines interp_8tap_horiz_ps_32x64 (bound to LUMA_32x64 luma_hps)
 ;-------------------------------------------------------------------------------------------------------------
 ;ipfilter_luma_avx512 code end
 ;-------------------------------------------------------------------------------------------------------------
\ No newline at end of file


More information about the x265-devel mailing list