[x265] [PATCH 135 of 307] x86: AVX512 interp_4tap_vert_pp_32xN

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:32:13 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1509357597 -19800
#      Mon Oct 30 15:29:57 2017 +0530
# Node ID d9a598bc340402c0846f031c6a118a548e8ce083
# Parent  334c109f5cef24cca9c955d3ff3b381629e66afa
x86: AVX512 interp_4tap_vert_pp_32xN

i444
Size  |  AVX2 performance | AVX512 performance
----------------------------------------------
32x8  |      47.64x       |      62.01x
32x16 |      54.38x       |      65.53x
32x24 |      55.56x       |      73.30x
32x32 |      57.63x       |      73.34x
32x64 |      60.36x       |      78.26x

diff -r 334c109f5cef -r d9a598bc3404 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Oct 30 12:28:05 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Oct 30 15:29:57 2017 +0530
@@ -4548,6 +4548,20 @@
         p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hps = PFX(interp_4tap_horiz_ps_48x64_avx512);
 
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = PFX(interp_4tap_vert_pp_32x48_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx512);
+
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vpp = PFX(interp_4tap_vert_pp_64x64_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vpp = PFX(interp_4tap_vert_pp_64x48_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_avx512);
diff -r 334c109f5cef -r d9a598bc3404 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Mon Oct 30 12:28:05 2017 +0530
+++ b/source/common/x86/ipfilter8.asm	Mon Oct 30 15:29:57 2017 +0530
@@ -10792,6 +10792,152 @@
 
 IPFILTER_CHROMA_PS_48xN_AVX512 64
 
+%macro PROCESS_CHROMA_VERT_PP_32x8_AVX512 0                        ; 4-tap vertical filter, 32 bytes wide, 8 output rows; each zmm = [lo 256 bits: row n | hi 256 bits: row n+4]
+    movu             ym0,              [r0]                        ; m0 lo = row 0 (r0 = first tap row, r1 = source stride)
+    lea               r6,              [r0 + 4 * r1]               ; r6 = source pointer for the hi halves (4 rows below r0)
+    lea               r7,              [r2 + 4 * r3]               ; r7 = destination pointer for the hi halves (r2 = dst, r3 = dst stride)
+    vinserti32x8      m0,              [r6],              1        ; m0 hi = row 4
+    movu             ym1,              [r0 + r1]                   ; m1 lo = row 1
+    vinserti32x8      m1,              [r6 + r1],         1        ; m1 hi = row 5
+    punpcklbw         m2,              m0,                m1       ; interleave rows 0/1 (hi: 4/5), low 8 bytes of each 128-bit lane
+    punpckhbw         m3,              m0,                m1       ; same, high 8 bytes of each 128-bit lane
+    pmaddubsw         m2,              m10                         ; first two taps of output row 0 (hi: 4); m10 is loaded by the caller
+    pmaddubsw         m3,              m10
+
+    movu             ym0,              [r0 + r1 * 2]               ; m0 lo = row 2
+    vinserti32x8      m0,              [r6 + r1 * 2],     1        ; m0 hi = row 6
+    punpcklbw         m4,              m1,                m0       ; interleave rows 1/2 (hi: 5/6)
+    punpckhbw         m5,              m1,                m0
+    pmaddubsw         m4,              m10                         ; first two taps of output row 1 (hi: 5)
+    pmaddubsw         m5,              m10
+
+    movu             ym1,              [r0 + r4]                   ; m1 lo = row 3 (callers set r4 = 3 * r1)
+    vinserti32x8      m1,              [r6 + r4],         1        ; m1 hi = row 7
+    punpcklbw         m6,              m0,                m1       ; interleave rows 2/3 (hi: 6/7)
+    punpckhbw         m7,              m0,                m1
+    pmaddubsw         m8,              m6,                m11      ; last two taps of output row 0 (hi: 4); m11 is loaded by the caller
+    pmaddubsw         m9,              m7,                m11
+    pmaddubsw         m6,              m10                         ; first two taps of output row 2 (hi: 6), reusing the same interleave
+    pmaddubsw         m7,              m10
+
+    paddw             m2,              m8                          ; output row 0 (hi: 4): sum of all four taps, 16-bit
+    paddw             m3,              m9
+
+    pmulhrsw          m2,              m12                         ; rounding scale by m12; assuming pw_512 holds 512 per word this is (x + 32) >> 6
+    pmulhrsw          m3,              m12
+    packuswb          m2,              m3                          ; saturate to unsigned bytes; per-lane pack undoes the per-lane unpack split
+    movu              [r2],            ym2                         ; store output row 0
+    vextracti32x8     [r7],            m2,                1        ; store output row 4
+    lea               r0,              [r0 + r1 * 4]               ; r0 += 4 rows; r0 stays advanced when the macro ends
+    lea               r6,              [r6 + r1 * 4]               ; r6 += 4 rows
+
+    movu             ym0,              [r0]                        ; m0 lo = row 4
+    vinserti32x8      m0,              [r6],              1        ; m0 hi = row 8
+    punpcklbw         m2,              m1,                m0       ; interleave rows 3/4 (hi: 7/8)
+    punpckhbw         m3,              m1,                m0
+    pmaddubsw         m8,              m2,                m11      ; last two taps of output row 1 (hi: 5)
+    pmaddubsw         m9,              m3,                m11
+    pmaddubsw         m2,              m10                         ; first two taps of output row 3 (hi: 7)
+    pmaddubsw         m3,              m10
+
+    paddw             m4,              m8                          ; output row 1 (hi: 5): sum of all four taps
+    paddw             m5,              m9
+    pmulhrsw          m4,              m12
+    pmulhrsw          m5,              m12
+    packuswb          m4,              m5
+    movu              [r2 + r3],       ym4                         ; store output row 1
+    vextracti32x8     [r7 + r3],       m4,                1        ; store output row 5
+
+    movu             ym1,              [r0 + r1]                   ; m1 lo = row 5
+    vinserti32x8      m1,              [r6 + r1],         1        ; m1 hi = row 9
+    punpcklbw         m4,              m0,                m1       ; interleave rows 4/5 (hi: 8/9)
+    punpckhbw         m5,              m0,                m1
+    pmaddubsw         m4,              m11                         ; last two taps of output row 2 (hi: 6)
+    pmaddubsw         m5,              m11
+    paddw             m6,              m4                          ; output row 2 (hi: 6): sum of all four taps
+    paddw             m7,              m5
+
+    pmulhrsw          m6,              m12
+    pmulhrsw          m7,              m12
+    packuswb          m6,              m7
+    movu              [r2 + r3 * 2],   ym6                         ; store output row 2
+    vextracti32x8     [r7 + r3 * 2],   m6,                1        ; store output row 6
+
+    movu             ym0,              [r0 + r1 * 2]               ; m0 lo = row 6
+    vinserti32x8      m0,              [r6 + r1 * 2],     1        ; m0 hi = row 10
+    punpcklbw         m6,              m1,                m0       ; interleave rows 5/6 (hi: 9/10)
+    punpckhbw         m7,              m1,                m0
+    pmaddubsw         m6,              m11                         ; last two taps of output row 3 (hi: 7)
+    pmaddubsw         m7,              m11
+    paddw             m2,              m6                          ; output row 3 (hi: 7): sum of all four taps
+    paddw             m3,              m7
+    pmulhrsw          m2,              m12
+    pmulhrsw          m3,              m12
+    packuswb          m2,              m3
+    movu              [r2 + r5],       ym2                         ; store output row 3 (callers set r5 = 3 * r3)
+    vextracti32x8     [r7 + r5],       m2,                1        ; store output row 7
+%endmacro
+
+%if ARCH_X86_64 == 1
+INIT_ZMM avx512
+cglobal interp_4tap_vert_pp_32x8, 4, 8, 13                         ; 32x8 vertical 4-tap filter: a single 8-row group, no loop
+    mov               r4d,             r4m                         ; r4 = 5th argument, used below as the coefficient-table index
+    shl               r4d,             7                           ; index * 128: two 64-byte vectors are read per index
+
+%ifdef PIC
+    lea               r5,              [tab_ChromaCoeffVer_32_avx512] ; PIC: take the table base first, then add the offset
+    add               r5,              r4
+%else
+    lea               r5,              [tab_ChromaCoeffVer_32_avx512 + r4]
+%endif
+
+    mova              m10,             [r5]                        ; m10 = first 64 bytes of the entry (applied to the first two taps)
+    mova              m11,             [r5 + mmsize]               ; m11 = next 64 bytes of the entry (applied to the last two taps)
+    lea               r4,              [r1 * 3]                    ; r4 = 3 * source stride
+    sub               r0,              r1                          ; start reading one row above the source pointer
+    vbroadcasti32x8   m12,             [pw_512]                    ; m12 = 32 bytes at pw_512 copied into both zmm halves (pmulhrsw multiplier)
+    lea               r5,              [r3 * 3]                    ; r5 = 3 * destination stride
+    PROCESS_CHROMA_VERT_PP_32x8_AVX512
+    RET
+%endif
+
+%macro FILTER_VER_PP_CHROMA_AVX512_32xN 1                          ; %1 = block height; emits interp_4tap_vert_pp_32x%1 as %1/8 unrolled 8-row groups
+%if ARCH_X86_64 == 1
+INIT_ZMM avx512
+cglobal interp_4tap_vert_pp_32x%1, 4, 8, 13
+    mov               r4d,             r4m                         ; r4 = 5th argument, used below as the coefficient-table index
+    shl               r4d,             7                           ; index * 128: two 64-byte vectors are read per index
+
+%ifdef PIC
+    lea               r5,              [tab_ChromaCoeffVer_32_avx512] ; PIC: take the table base first, then add the offset
+    add               r5,              r4
+%else
+    lea               r5,              [tab_ChromaCoeffVer_32_avx512 + r4]
+%endif
+
+    mova              m10,             [r5]                        ; m10 = first 64 bytes of the entry (applied to the first two taps)
+    mova              m11,             [r5 + mmsize]               ; m11 = next 64 bytes of the entry (applied to the last two taps)
+    lea               r4,              [r1 * 3]                    ; r4 = 3 * source stride
+    sub               r0,              r1                          ; start reading one row above the source pointer
+    vbroadcasti32x8   m12,             [pw_512]                    ; m12 = 32 bytes at pw_512 copied into both zmm halves (pmulhrsw multiplier)
+    lea               r5,              [r3 * 3]                    ; r5 = 3 * destination stride
+
+%rep %1/8 - 1                                                      ; every 8-row group except the last, fully unrolled
+    PROCESS_CHROMA_VERT_PP_32x8_AVX512
+    lea               r0, [r0 + r1 * 4]                            ; the 8-row macro already moved r0 down 4 rows; 4 more = next group
+    lea               r2, [r7 + r3 * 4]                            ; r7 = r2 + 4 rows (set in the macro), so r2 moves down 8 rows
+%endrep
+    PROCESS_CHROMA_VERT_PP_32x8_AVX512
+    RET
+%endif
+%endmacro
+
+FILTER_VER_PP_CHROMA_AVX512_32xN 64
+FILTER_VER_PP_CHROMA_AVX512_32xN 48
+FILTER_VER_PP_CHROMA_AVX512_32xN 32
+FILTER_VER_PP_CHROMA_AVX512_32xN 24
+FILTER_VER_PP_CHROMA_AVX512_32xN 16
+
 %macro PROCESS_CHROMA_VERT_PP_64x4_AVX512 0
     movu              m0,              [r0]                        ; m0 = row 0
     movu              m1,              [r0 + r1]                   ; m1 = row 1


More information about the x265-devel mailing list