[x265] [PATCH 085 of 307] x86: AVX512 interp_4tap_horiz_ps_32xN

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:31:23 CEST 2018


# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1502434970 -19800
#      Fri Aug 11 12:32:50 2017 +0530
# Node ID 6f811dfd5690866f4c432911982a30665dc0e91c
# Parent  951e9a16296e5d1e528c0083630fde8122bd15c1
x86: AVX512 interp_4tap_horiz_ps_32xN

Color Space i444
Size    | AVX2 performance | AVX512 performance
------------------------------------------------
32x8    |     25.91x       |      38.35x
32x16   |     25.45x       |      32.02x
32x24   |     25.80x       |      32.73x
32x32   |     33.49x       |      38.02x
32x64   |     27.42x       |      36.20x
Color Space i422
Size    | AVX2 performance | AVX512 performance
------------------------------------------------
32x16   |     24.74x       |      33.95x
32x32   |     33.31x       |      34.28x
32x48   |     27.11x       |      35.98x
32x64   |     27.32x       |      35.02x
Color Space i420
Size    | AVX2 performance | AVX512 performance
------------------------------------------------
32x8    |     27.16x       |      36.68x
32x16   |     24.87x       |      31.40x
32x24   |     25.98x       |      34.08x
32x32   |     33.01x       |      34.71x

diff -r 951e9a16296e -r 6f811dfd5690 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Aug 10 22:47:55 2017 -0700
+++ b/source/common/x86/asm-primitives.cpp	Fri Aug 11 12:32:50 2017 +0530
@@ -4034,6 +4034,25 @@
         p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hps = PFX(interp_4tap_horiz_ps_64x32_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hps = PFX(interp_4tap_horiz_ps_64x48_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hps = PFX(interp_4tap_horiz_ps_64x16_avx512);
+
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512);
+
+        //i422 chroma_hps
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hps = PFX(interp_4tap_horiz_ps_32x48_avx512);
+
+        //i420 chroma_hps
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512);
+
     }
 #endif
 }
diff -r 951e9a16296e -r 6f811dfd5690 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Thu Aug 10 22:47:55 2017 -0700
+++ b/source/common/x86/ipfilter8.asm	Fri Aug 11 12:32:50 2017 +0530
@@ -10010,7 +10010,7 @@
 %endmacro
 
 ;-------------------------------------------------------------------------------------------------------------
-; void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+; void interp_4tap_horiz_ps_64xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 ;-------------------------------------------------------------------------------------------------------------
 %macro IPFILTER_CHROMA_PS_64xN_AVX512 1
 INIT_ZMM avx512
@@ -10059,6 +10059,74 @@
     IPFILTER_CHROMA_PS_64xN_AVX512 48
     IPFILTER_CHROMA_PS_64xN_AVX512 16
 
+%macro PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512 0 ; filter one row: reads bytes [r0]..[r0 + 35], stores 32 words at [r2]; expects m0-m4 and m8 preloaded, clobbers m6 and m7
+    movu               ym6,          [r0]                ; low 256 bits of m6  = 32 source bytes at r0
+    vinserti32x8       m6,           [r0 + 4], 1         ; high 256 bits of m6 = 32 source bytes at r0 + 4
+    pshufb             m7,           m6,       m2        ; m7 = bytes of m6 rearranged within each 128-bit lane by table m2
+    pshufb             m6,           m6,       m1        ; m6 = the same input rearranged by table m1
+    pmaddubsw          m6,           m0                  ; unsigned source bytes * signed coefficient bytes in m0, adjacent pairs summed -> words
+    pmaddubsw          m7,           m0                  ; same for the second shuffled copy
+    pmaddwd            m6,           m3                  ; m3 holds pw_1: multiplying by 1 sums adjacent word pairs -> dwords
+    pmaddwd            m7,           m3                  ; same for the second shuffled copy
+
+    packssdw           m6,           m7                  ; narrow dwords to words with signed saturation (m6 then m7 within each 128-bit lane)
+    psubw              m6,           m4                  ; subtract the pw_2000 constant held in m4 from every word
+    vpermq             m6,           m8,       m6        ; reorder qwords across lanes using the index table in m8
+    movu               [r2],         m6                  ; unaligned 64-byte store of the finished row to dst
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_ps_32xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;-------------------------------------------------------------------------------------------------------------
+%macro IPFILTER_CHROMA_PS_32xN_AVX512 1
+INIT_ZMM avx512
+cglobal interp_4tap_horiz_ps_32x%1, 4,7,9 ; %1 = block height; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride; uses r0-r6 and m0-m8
+    mov             r4d, r4m ; r4d = coeffIdx (5th argument)
+    mov             r5d, r5m ; r5d = isRowExt (6th argument)
+
+%ifdef PIC
+    lea               r6,           [tab_ChromaCoeff]    ; PIC build: take the table address into r6 before indexing
+    vpbroadcastd      m0,           [r6 + r4 * 4]        ; m0 = 4-byte table entry number coeffIdx, repeated in every dword
+%else
+    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4] ; m0 = 4-byte table entry number coeffIdx, repeated in every dword
+%endif
+
+    vbroadcasti32x8    m1,           [interp4_horiz_shuf_load1_avx512] ; 32-byte shuffle table copied to both halves of m1
+    vbroadcasti32x8    m2,           [interp4_horiz_shuf_load2_avx512] ; 32-byte shuffle table copied to both halves of m2
+    vbroadcasti32x8    m3,           [pw_1]              ; 32 bytes of pw_1 copied to both halves of m3
+    vbroadcasti32x8    m4,           [pw_2000]           ; 32 bytes of pw_2000 copied to both halves of m4
+    mova               m8,           [interp8_hps_shuf_avx512] ; full 64-byte aligned load of the qword index table used by vpermq
+
+    ; register map (m6 and m7 are per-row temporaries of PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512)
+    ; m0    - interpolate coeff
+    ; m1,m2 - load shuffle order table
+    ; m3    - constant word 1
+    ; m4    - pw_2000 constant, subtracted from every result word
+    ; m8    - store shuffle order table
+
+    mov               r6d,         %1                    ; r6d = rows to produce = block height
+    dec               r0                                 ; move src back one byte so each row load starts left of the block
+    test              r5d,         r5d                   ; isRowExt == 0 ?
+    je                .loop                              ; yes: filter exactly %1 rows starting at the current row
+    sub               r0,          r1                    ; no: start one row earlier (src -= srcStride)
+    add               r6d,         3                     ; and produce 3 additional rows
+
+.loop:
+    PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512 ; filter the row at r0 and store 32 words at r2
+    lea               r2,           [r2 + 2 * r3]        ; dst += dstStride elements (2 bytes per int16_t)
+    lea               r0,           [r0 + r1]            ; src += srcStride
+    dec               r6d                                ; one row done
+    jnz               .loop                              ; repeat until the row counter reaches zero
+    RET
+%endmacro
+
+    IPFILTER_CHROMA_PS_32xN_AVX512 64 ; defines interp_4tap_horiz_ps_32x64 (AVX512)
+    IPFILTER_CHROMA_PS_32xN_AVX512 48 ; defines interp_4tap_horiz_ps_32x48 (AVX512)
+    IPFILTER_CHROMA_PS_32xN_AVX512 32 ; defines interp_4tap_horiz_ps_32x32 (AVX512)
+    IPFILTER_CHROMA_PS_32xN_AVX512 24 ; defines interp_4tap_horiz_ps_32x24 (AVX512)
+    IPFILTER_CHROMA_PS_32xN_AVX512 16 ; defines interp_4tap_horiz_ps_32x16 (AVX512)
+    IPFILTER_CHROMA_PS_32xN_AVX512 8 ; defines interp_4tap_horiz_ps_32x8 (AVX512)
+
 ;-------------------------------------------------------------------------------------------------------------
 ;ipfilter_chroma_avx512 code end
 ;-------------------------------------------------------------------------------------------------------------


More information about the x265-devel mailing list