[x265] [PATCH 092 of 307] x86: AVX512 interp_4tap_horiz_pp_16xN

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:31:30 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1502628508 -19800
#      Sun Aug 13 18:18:28 2017 +0530
# Node ID ed1932a414bf5962bbeccfd5c9e208b7db90f77f
# Parent  dbfcd0ee40e9bd4ee351eb064d8aa0819bd9b3fd
x86: AVX512 interp_4tap_horiz_pp_16xN

Color Space i444
Size  |  AVX2 performance | AVX512 performance
----------------------------------------------
16x4  |      12.87x       |      20.91x
16x8  |      18.03x       |      27.40x
16x12 |      16.95x       |      24.97x
16x16 |      18.82x       |      27.13x
16x32 |      16.21x       |      25.76x
16x64 |      17.41x       |      26.04x

diff -r dbfcd0ee40e9 -r ed1932a414bf source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Sep 01 10:33:48 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Sun Aug 13 18:18:28 2017 +0530
@@ -4021,14 +4021,30 @@
         p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx512);
 
         //i422 chroma_hpp
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hpp = PFX(interp_4tap_horiz_pp_16x24_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512);
 
         //i420 chroma_hpp
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512);
diff -r dbfcd0ee40e9 -r ed1932a414bf source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Fri Sep 01 10:33:48 2017 +0530
+++ b/source/common/x86/ipfilter8.asm	Sun Aug 13 18:18:28 2017 +0530
@@ -9907,6 +9907,48 @@
     vextracti32x8    [r2 + r3],      m5,            1
 %endmacro
 
+%macro PROCESS_IPFILTER_CHROMA_PP_16x4_AVX512 0
+    ; register map (one expansion filters 4 rows x 16 bytes; caller must preset r6 = 3 * r1 and r7 = 3 * r3)
+    ; m0 - interpolate coeff
+    ; m1, m2 - shuffle order table
+    ; m3 - constant word 1
+    ; m4 - constant word 512
+
+    movu              xm5,           [r0]                            ; lane 0 = 16 bytes of row 0
+    vinserti32x4       m5,           [r0 + r1],            1         ; lane 1 = row 1
+    vinserti32x4       m5,           [r0 + 2 * r1],        2         ; lane 2 = row 2
+    vinserti32x4       m5,           [r0 + r6],            3         ; lane 3 = row 3 (r6 = 3 * r1)
+    pshufb             m6,           m5,       m2                    ; m6 = loaded bytes arranged by table m2
+    pshufb             m5,           m1                              ; m5 = same bytes arranged by table m1
+
+    movu              xm7,           [r0 + 4]                        ; same four rows again, 4 bytes further along
+    vinserti32x4       m7,           [r0 + r1 + 4],        1
+    vinserti32x4       m7,           [r0 + 2 * r1 + 4],    2
+    vinserti32x4       m7,           [r0 + r6 + 4],        3
+    pshufb             m8,           m7,       m2                    ; m8 = offset-4 bytes arranged by table m2
+    pshufb             m7,           m1                              ; m7 = offset-4 bytes arranged by table m1
+
+    pmaddubsw          m5,           m0                              ; unsigned bytes * signed coeff bytes, adjacent pairs summed -> words
+    pmaddubsw          m7,           m0
+    pmaddwd            m5,           m3                              ; * 1, adjacent words summed -> one dword per 4 products
+    pmaddwd            m7,           m3
+
+    pmaddubsw          m6,           m0                              ; same multiply-accumulate for the table-m2 arrangement
+    pmaddubsw          m8,           m0
+    pmaddwd            m6,           m3
+    pmaddwd            m8,           m3
+
+    packssdw           m5,           m7                              ; dwords -> words (signed saturation)
+    packssdw           m6,           m8
+    pmulhrsw           m5,           m4                              ; * 512, rounded high half: (x + 32) >> 6
+    pmulhrsw           m6,           m4
+    packuswb           m5,           m6                              ; words -> bytes (unsigned saturation)
+    movu              [r2],          xm5                             ; store 16 bytes of row 0
+    vextracti32x4     [r2 + r3],     m5,       1                     ; row 1
+    vextracti32x4     [r2 + 2 * r3], m5,       2                     ; row 2
+    vextracti32x4     [r2 + r7],     m5,       3                     ; row 3 (r7 = 3 * r3)
+%endmacro
+
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_horiz_pp_64xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx
 ;-------------------------------------------------------------------------------------------------------------
@@ -9976,6 +10018,42 @@
     IPFILTER_CHROMA_PP_32xN_AVX512 64
     IPFILTER_CHROMA_PP_32xN_AVX512 48
 
+%macro IPFILTER_CHROMA_PP_16xN_AVX512 1
+INIT_ZMM avx512
+cglobal interp_4tap_horiz_pp_16x%1, 4,8,9                            ; %1 = row count; uses r0-r7 and m0-m8
+    mov               r4d,          r4m                              ; r4 = 5th argument (presumably coeffIdx - confirm against prototype)
+    lea               r6,           [3 * r1]                         ; source offset of row 3 within a 4-row group
+    lea               r7,           [3 * r3]                         ; destination offset of row 3 within a 4-row group
+%ifdef PIC
+    lea               r5,           [tab_ChromaCoeff]                ; PIC: take table address first, then index it
+    vpbroadcastd      m0,           [r5 + r4 * 4]                    ; m0 = 4 bytes at table entry r4, copied to every dword
+%else
+    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]       ; m0 = 4 bytes at table entry r4, copied to every dword
+%endif
+
+    vbroadcasti32x8   m1,           [interp4_horiz_shuf_load1_avx512] ; 32-byte shuffle table copied to both halves
+    vbroadcasti32x8   m2,           [interp4_horiz_shuf_load2_avx512]
+    vbroadcasti32x8   m3,           [pw_1]                           ; constant word 1 (see register map in the 16x4 macro)
+    vbroadcasti32x8   m4,           [pw_512]                         ; constant word 512 (rounding multiplier)
+    dec               r0                                             ; begin reading one byte before the source pointer
+
+%rep %1/4 - 1                                                        ; every 4-row group except the last
+    PROCESS_IPFILTER_CHROMA_PP_16x4_AVX512
+    lea               r2,           [r2 + 4 * r3]                    ; advance destination by 4 rows
+    lea               r0,           [r0 + 4 * r1]                    ; advance source by 4 rows
+%endrep
+    PROCESS_IPFILTER_CHROMA_PP_16x4_AVX512                           ; last group: no pointer advance needed
+    RET
+%endmacro
+
+    IPFILTER_CHROMA_PP_16xN_AVX512 4                                 ; 16x4:  wired to i444 and i420 filter_hpp in this patch
+    IPFILTER_CHROMA_PP_16xN_AVX512 8                                 ; 16x8:  i444, i422, i420
+    IPFILTER_CHROMA_PP_16xN_AVX512 12                                ; 16x12: i444, i420
+    IPFILTER_CHROMA_PP_16xN_AVX512 16                                ; 16x16: i444, i422, i420
+    IPFILTER_CHROMA_PP_16xN_AVX512 24                                ; 16x24: i422
+    IPFILTER_CHROMA_PP_16xN_AVX512 32                                ; 16x32: i444, i422, i420
+    IPFILTER_CHROMA_PP_16xN_AVX512 64                                ; 16x64: i444, i422
+
 %macro PROCESS_IPFILTER_CHROMA_PS_64x1_AVX512 0
     movu               ym6,          [r0]
     vinserti32x8       m6,           [r0 + 4], 1


More information about the x265-devel mailing list