[x265] [PATCH 076 of 307] x86: AVX512 interp_4tap_horiz_pp_32xN

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:31:14 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1502347959 -19800
#      Thu Aug 10 12:22:39 2017 +0530
# Node ID f489bc0b864c48f557cc40b739e84fe1040e8728
# Parent  7bdf20f62d02f5714c1332695ffa8c7c6a9d8a5a
x86: AVX512 interp_4tap_horiz_pp_32xN

Color Space i444
Size    | AVX2 performance | AVX512 performance
------------------------------------------------
32x8    |     23.96x       |      31.57x
32x16   |     24.38x       |      33.22x
32x24   |     22.41x       |      36.92x
32x32   |     21.54x       |      34.09x
32x64   |     23.27x       |      29.14x
Color Space i422
Size    | AVX2 performance | AVX512 performance
------------------------------------------------
32x16   |     25.55x       |      33.16x
32x32   |     22.08x       |      35.13x
32x48   |     24.01x       |      34.53x
32x64   |     23.76x       |      35.21x

diff -r 7bdf20f62d02 -r f489bc0b864c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Apr 04 16:47:58 2018 -0700
+++ b/source/common/x86/asm-primitives.cpp	Thu Aug 10 12:22:39 2017 +0530
@@ -4001,6 +4001,18 @@
         p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = PFX(interp_4tap_horiz_pp_64x16_avx512);
 
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512);
+
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
+
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512);
+
     }
 #endif
 }
diff -r 7bdf20f62d02 -r f489bc0b864c source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Wed Apr 04 16:47:58 2018 -0700
+++ b/source/common/x86/ipfilter8.asm	Thu Aug 10 12:22:39 2017 +0530
@@ -150,6 +150,8 @@
 
 const interp4_horiz_shuf_load2_avx512,  times 2 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
 
+const interp4_horiz_shuf_store1_avx512, dd 0 ,8, 1, 9, 4, 12, 5, 13, 2, 10, 3, 11, 6, 14, 7, 15
+
 SECTION .text
 
 cextern pb_128
@@ -9867,6 +9869,44 @@
     movu              [r2],          m5
 %endmacro
 
+%macro PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512 0
+    ; register map
+    ; m0 - interpolate coeff
+    ; m1, m2 - shuffle order table
+    ; m3 - constant word 1
+    ; m4 - constant word 512
+    ; m9 - store shuffle order table
+
+    movu              ym5,           [r0]
+    vinserti32x8       m5,           [r0 + 4], 1
+
+    pshufb             m6,           m5,       m2
+    pshufb             m5,           m5,       m1
+    pmaddubsw          m5,           m0
+    pmaddubsw          m6,           m0
+    pmaddwd            m5,           m3
+    pmaddwd            m6,           m3
+
+    movu              ym7,           [r0 + r1]
+    vinserti32x8       m7,           [r0 + r1 + 4], 1
+
+    pshufb             m8,           m7,       m2
+    pshufb             m7,           m7,       m1
+    pmaddubsw          m7,           m0
+    pmaddubsw          m8,           m0
+    pmaddwd            m7,           m3
+    pmaddwd            m8,           m3
+
+    packssdw           m5,           m6
+    packssdw           m7,           m8
+    pmulhrsw           m5,           m4
+    pmulhrsw           m7,           m4
+    packuswb           m5,           m7
+    vpermd             m5,           m9,           m5
+    movu             [r2],          ym5
+    vextracti32x8    [r2 + r3],      m5,            1
+%endmacro
+
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_horiz_pp_64xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx
 ;-------------------------------------------------------------------------------------------------------------
@@ -9902,6 +9942,40 @@
     IPFILTER_CHROMA_PP_64xN_AVX512  48
     IPFILTER_CHROMA_PP_64xN_AVX512  16
 
+%macro IPFILTER_CHROMA_PP_32xN_AVX512 1
+INIT_ZMM avx512
+cglobal interp_4tap_horiz_pp_32x%1, 4,6,10
+    mov               r4d,               r4m
+
+%ifdef PIC
+    lea               r5,           [tab_ChromaCoeff]
+    vpbroadcastd      m0,           [r5 + r4 * 4]
+%else
+    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
+%endif
+
+    vbroadcasti32x8   m1,           [interp4_horiz_shuf_load1_avx512]
+    vbroadcasti32x8   m2,           [interp4_horiz_shuf_load2_avx512]
+    movu              m9,           [interp4_horiz_shuf_store1_avx512]
+    vbroadcasti32x8   m3,           [pw_1]
+    vbroadcasti32x8   m4,           [pw_512]
+    dec               r0
+
+%rep %1/2 - 1
+    PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512
+    lea               r2,           [r2 + 2 * r3]
+    lea               r0,           [r0 + 2 * r1]
+%endrep
+    PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512
+    RET
+%endmacro
+
+    IPFILTER_CHROMA_PP_32xN_AVX512 16
+    IPFILTER_CHROMA_PP_32xN_AVX512 24
+    IPFILTER_CHROMA_PP_32xN_AVX512 8
+    IPFILTER_CHROMA_PP_32xN_AVX512 32
+    IPFILTER_CHROMA_PP_32xN_AVX512 64
+    IPFILTER_CHROMA_PP_32xN_AVX512 48
 ;-------------------------------------------------------------------------------------------------------------
 ;ipfilter_chroma_pp_avx512 code end
 ;-------------------------------------------------------------------------------------------------------------


More information about the x265-devel mailing list