[x265] [PATCH 156 of 307] [x265-avx512]x86: AVX512 idct8x8

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:32:34 CEST 2018


# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1509359051 -19800
#      Mon Oct 30 15:54:11 2017 +0530
# Node ID 94523acd49e4b021384036a43a308cbc30cb4766
# Parent  42f980b52743c90920b50eb8a8d31a6d57568e09
[x265-avx512]x86: AVX512 idct8x8

AVX2 Performance    :    8.91x
AVX512 Performance  :    9.82x

diff -r 42f980b52743 -r 94523acd49e4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Nov 07 12:10:13 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Oct 30 15:54:11 2017 +0530
@@ -2734,6 +2734,7 @@
         p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vps = PFX(interp_4tap_vert_ps_24x32_avx512);
 
         p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
+        p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
 
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
@@ -4710,6 +4711,7 @@
         p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx512);
 
         p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
+        p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
 
     }
 #endif
diff -r 42f980b52743 -r 94523acd49e4 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Tue Nov 07 12:10:13 2017 +0530
+++ b/source/common/x86/dct8.asm	Mon Oct 30 15:54:11 2017 +0530
@@ -165,12 +165,35 @@
                 times 4 dw 50, -89, 18, 75
                 times 4 dw 18, -50, 75, -89
 
+avx512_idct8_1:   times 8 dw 64, 83, 64, 36       ; row 0; each row is 8 x 4 words = 64 bytes, read in pass 2 as [r5 + n * mmsize]
+                  times 8 dw 64, 36, -64, -83     ; row 1
+                  times 8 dw 64, -36, -64, 83     ; row 2
+                  times 8 dw 64, -83, 64, -36     ; row 3
+
+avx512_idct8_2:   times 8 dw 89, 75, 50, 18       ; row 0; each row is 8 x 4 words = 64 bytes, read in pass 2 as [r6 + n * mmsize]
+                  times 8 dw 75, -18, -89, -50    ; row 1
+                  times 8 dw 50, -89, 18, 75      ; row 2
+                  times 8 dw 18, -50, 75, -89     ; row 3
+
+avx512_idct8_3:   dw 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36 ; row 0 -> m17 in idct8 (multiplied with m4); 32 words per row, two 16-word halves
+                  dw 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83 ; row 1 -> m18 (multiplied with m0)
+                  dw 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83 ; row 2 -> m19 (multiplied with m4)
+                  dw -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36 ; row 3 -> m20 (multiplied with m0)
+                  dw 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89 ; row 4 -> m21 (multiplied with m1)
+                  dw 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75 ; row 5 -> m22 (multiplied with m2)
+                  dw 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50 ; row 6 -> m23 (multiplied with m1)
+                  dw -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89 ; row 7 -> m24 (multiplied with m2)
+
 idct8_shuf1:    dd 0, 2, 4, 6, 1, 3, 5, 7
 
 const idct8_shuf2,    times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
 
 idct8_shuf3:    times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
 
+const idct8_avx512_shuf2,    times 4 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 ; 64-byte pshufb mask: swaps the two middle dwords of every 16-byte lane
+
+idct8_avx512_shuf3:    times 4 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 ; 64-byte pshufb mask: reverses the four dwords of every 16-byte lane
+
 tab_idct16_1:   dw 90, 87, 80, 70, 57, 43, 25, 9
                 dw 87, 57, 9, -43, -80, -90, -70, -25
                 dw 80, 9, -70, -87, -25, 57, 90, 43
@@ -3135,6 +3158,225 @@
     mova            [r1 + r3],         xm3
     RET
 
+
+%macro IDCT8_AVX512_PASS_1 0                       ; first IDCT pass. In: m0/m1/m2/m4 data, m17-m24 multipliers, m11 rounding. Out: m3, m6. Clobbers m5, m8, m9
+    pmaddwd         m5,                m4, m17     ; A = m4*m17 + m0*m18 (word pairs multiplied and summed into dwords)
+    pmaddwd         m6,                m0, m18
+    paddd           m5,                m6
+    ; B = m1*m21 + m2*m22 (m3 is only a temporary here)
+    pmaddwd         m6,                m1, m21
+    pmaddwd         m3,                m2, m22
+    paddd           m6,                m3
+    ; m3 = (A + B + m11) >> IDCT_SHIFT1
+    paddd           m3,                m5, m6
+    paddd           m3,                m11
+    psrad           m3,                IDCT_SHIFT1
+    ; m5 = (A - B + m11) >> IDCT_SHIFT1
+    psubd           m5,                m6
+    paddd           m5,                m11
+    psrad           m5,                IDCT_SHIFT1
+    ; C = m4*m19 + m0*m20
+    pmaddwd         m6,                m4, m19
+    pmaddwd         m8,                m0, m20
+    paddd           m6,                m8
+    ; D = m1*m23 + m2*m24
+    pmaddwd         m8,                m1, m23
+    pmaddwd         m9,                m2, m24
+    paddd           m8,                m9
+    ; m9 = (C + D + m11) >> IDCT_SHIFT1
+    paddd           m9,                m6, m8
+    paddd           m9,                m11
+    psrad           m9,                IDCT_SHIFT1
+    ; m6 = (C - D + m11) >> IDCT_SHIFT1
+    psubd           m6,                m8
+    paddd           m6,                m11
+    psrad           m6,                IDCT_SHIFT1
+    ; narrow the sums to saturated int16 and put the qwords back in order (0xD8 = qword order 0,2,1,3)
+    packssdw        m3,                m9          ; m3 = pack(A + B, C + D)
+    vpermq          m3,                m3, 0xD8
+    ; same for the differences
+    packssdw        m6,                m5          ; m6 = pack(C - D, A - B)
+    vpermq          m6,                m6, 0xD8
+%endmacro
+
+
+%macro IDCT8_AVX512_PASS_2 0                       ; second IDCT pass. In: m3, m13 data, r5/r6 table pointers, m12 rounding. Out: m8, m9 (packed int16)
+    punpcklqdq      m2,                m3, m13     ; m2 = low qwords of each 16-byte lane of m3 and m13
+    punpckhqdq      m0,                m3, m13     ; m0 = high qwords of each 16-byte lane of m3 and m13
+    ; multiply m2 by the four 64-byte rows at r5
+    pmaddwd         m3,                m2, [r5]
+    pmaddwd         m5,                m2, [r5 + 1 * mmsize]
+    pmaddwd         m6,                m2, [r5 + 2 * mmsize]
+    pmaddwd         m7,                m2, [r5 + 3 * mmsize]
+    ; add adjacent dword pairs of m3 and m5, then merge: each lane of m3 = {m3 sums, m5 sums}
+    pshufd           m14,     m3,     q2301
+    pshufd           m16,     m5,     q2301
+    paddd             m3,    m14
+    paddd             m5,    m16
+    punpckhdq        m14,     m3,      m5
+    punpckldq        m16,     m3,      m5
+    punpckhdq         m3,    m16,     m14
+    ; same reduction for m6 and m7: each lane of m6 = {m6 sums, m7 sums}
+    pshufd           m14,     m6,     q2301
+    pshufd           m16,     m7,     q2301
+    paddd             m6,    m14
+    paddd             m7,    m16
+    punpckhdq        m14,     m6,      m7
+    punpckldq        m16,     m6,      m7
+    punpckhdq         m6,    m16,     m14
+
+    ; swap the middle dwords, then split into m7 (low qwords) and m3 (high qwords)
+    pshufb          m3,                [idct8_avx512_shuf2]
+    pshufb          m6,                [idct8_avx512_shuf2]
+    punpcklqdq      m7,                m3, m6
+    punpckhqdq      m3,                m6
+    ; multiply m0 by the four 64-byte rows at r6
+    pmaddwd         m5,                m0, [r6]
+    pmaddwd         m6,                m0, [r6 + 1 * mmsize]
+    pmaddwd         m8,                m0, [r6 + 2 * mmsize]
+    pmaddwd         m9,                m0, [r6 + 3 * mmsize]
+    ; pair-sum and merge m5/m6 into m5, exactly as done for the r5 products
+    pshufd          m14,     m5,    q2301
+    pshufd          m16,     m6,    q2301
+    paddd            m5,    m14
+    paddd            m6,    m16
+    punpckhdq       m14,     m5,    m6
+    punpckldq       m16,     m5,    m6
+    punpckhdq        m5,    m16,   m14
+    ; pair-sum and merge m8/m9 into m8
+    pshufd          m14,     m8,    q2301
+    pshufd          m16,     m9,    q2301
+    paddd            m8,    m14
+    paddd            m9,    m16
+    punpckhdq       m14,     m8,    m9
+    punpckldq       m16,     m8,    m9
+    punpckhdq        m8,    m16,   m14
+    ; swap the middle dwords, then split into m6 (low qwords) and m5 (high qwords)
+    pshufb          m5,                [idct8_avx512_shuf2]
+    pshufb          m8,                [idct8_avx512_shuf2]
+    punpcklqdq      m6,                m5, m8
+    punpckhqdq      m5,                m8
+    ; m8 = (m7 + m6 + m12) >> IDCT_SHIFT2
+    paddd           m8,                m7, m6
+    paddd           m8,                m12
+    psrad           m8,                IDCT_SHIFT2
+    ; m7 = (m7 - m6 + m12) >> IDCT_SHIFT2
+    psubd           m7,                m6
+    paddd           m7,                m12
+    psrad           m7,                IDCT_SHIFT2
+    ; reverse the dword order of the differences, then pack sums and differences to int16
+    pshufb          m7,                [idct8_avx512_shuf3]
+    packssdw        m8,                 m7
+    ; m9 = (m3 + m5 + m12) >> IDCT_SHIFT2
+    paddd           m9,                m3, m5
+    paddd           m9,                m12
+    psrad           m9,                IDCT_SHIFT2
+    ; m3 = (m3 - m5 + m12) >> IDCT_SHIFT2
+    psubd           m3,                m5
+    paddd           m3,                m12
+    psrad           m3,                IDCT_SHIFT2
+
+    pshufb          m3,                [idct8_avx512_shuf3]   ; reversed differences, packed behind the sums
+    packssdw        m9,                m3
+%endmacro
+
+
+%if ARCH_X86_64
+; void idct8_avx512(const int16_t* src, int16_t* dst, intptr_t dstStride): 8x8 inverse DCT in two passes.
+; r0 = src (64 coefficients, read with aligned loads), r1 = dst, r2 = dstStride (doubled below before use as a byte offset).
+INIT_ZMM avx512
+cglobal idct8, 3, 7, 25
+%if BIT_DEPTH == 12
+    %define         IDCT_SHIFT2        8
+    vpbroadcastd    m12,                [pd_128]           ; m12 = rounding term added before the IDCT_SHIFT2 shift
+%elif BIT_DEPTH == 10
+    %define         IDCT_SHIFT2        10
+    vpbroadcastd    m12,                [pd_512]
+%elif BIT_DEPTH == 8
+    %define         IDCT_SHIFT2        12
+    vpbroadcastd    m12,                [pd_2048]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+%define             IDCT_SHIFT1         7
+
+    vpbroadcastd     m11,               [pd_64]            ; m11 = rounding term added before the IDCT_SHIFT1 shift
+
+    lea             r4,                [avx512_idct8_3]    ; pass-1 multipliers; r5/r6 are not needed until pass 2
+
+    ; pass 1: interleave source rows pairwise (0/2, 1/3, 4/6, 5/7) into m4, m1, m0, m2
+    mova            ym1,                [r0 + 0 * 32]      ; [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1]
+    mova            ym0,                [r0 + 1 * 32]      ; [2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3]
+    vpunpcklwd      ym5,      ym1,      ym0                ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
+    vpunpckhwd      ym1,      ym0                          ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
+    vinserti128     ym4,      ym5,      xm1,       1       ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
+    vextracti128    xm2,      ym5,      1                  ; [1 3 1 3 1 3 1 3]
+    vinserti128     ym1,      ym1,      xm2,       0       ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]
+
+    mova            ym2,                [r0 + 2 * 32]      ; [4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5]
+    mova            ym0,                [r0 + 3 * 32]      ; [6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7]
+    vpunpcklwd      ym5,      ym2,      ym0                ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
+    vpunpckhwd      ym2,      ym0                          ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
+    vinserti128     ym0,      ym5,      xm2,       1       ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
+    vextracti128    xm5,      ym5,      1                  ; [5 7 5 7 5 7 5 7]
+    vinserti128     ym2,      ym2,      xm5,       0       ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]
+    ; reorder the dwords of each vector with the idct8_shuf1 index pattern
+    mova            ym5,                [idct8_shuf1]
+    vpermd          ym4,                ym5, ym4
+    vpermd          ym0,                ym5, ym0
+    vpermd          ym1,                ym5, ym1
+    vpermd          ym2,                ym5, ym2
+    ; copy the low 256 bits into the high 256 bits so both halves of each zmm hold the same data
+    vinserti64x4    m4,        m4,      ym4, 1
+    vinserti64x4    m0,        m0,      ym0, 1
+    vinserti64x4    m1,        m1,      ym1, 1
+    vinserti64x4    m2,        m2,      ym2, 1
+    ; m17-m24 = the eight 64-byte rows of avx512_idct8_3
+    movu            m17,                [r4]
+    movu            m18,                [r4 + 1 * mmsize]
+    movu            m19,                [r4 + 2 * mmsize]
+    movu            m20,                [r4 + 3 * mmsize]
+    movu            m21,                [r4 + 4 * mmsize]
+    movu            m22,                [r4 + 5 * mmsize]
+    movu            m23,                [r4 + 6 * mmsize]
+    movu            m24,                [r4 + 7 * mmsize]
+
+    IDCT8_AVX512_PASS_1
+    ; regroup the pass-1 results: m3 = {m3.lo, m6.hi}, m13 = {m3.hi, m6.lo}
+    vextracti64x4   ym13,       m3,      1
+    vextracti64x4   ym14,       m6,      1
+    vinserti64x4      m3,       m3,      ym14, 1
+    vinserti64x4     m13,      m13,       ym6, 1
+
+    ;pass2
+    add             r2d,               r2d                 ; r2 = 2 * dstStride
+    lea             r3,                [r2 * 3]            ; r3 = 3 * r2
+    lea             r5,                [avx512_idct8_1]
+    lea             r6,                [avx512_idct8_2]
+
+    IDCT8_AVX512_PASS_2
+    ; m8/m9 each hold four 16-byte output rows; the low 256 bits go to the first four dst rows
+    vextracti128    xm3,               ym8, 1
+    mova            [r1],              xm8
+    mova            [r1 + r2],         xm3
+    vextracti128    xm3,               ym9, 1
+    mova            [r1 + r2 * 2],     xm9
+    mova            [r1 + r3],         xm3
+
+    lea             r1,                [r1 + r2 * 4]       ; advance dst by four rows
+    ; high 256 bits of m8/m9 go to the next four rows (m10/m11 are scratch from here on)
+    vextracti64x4   ym10,   m8, 1
+    vextracti64x4   ym11,   m9, 1
+
+    vextracti128    xm3,               ym10, 1
+    mova            [r1],              xm10
+    mova            [r1 + r2],         xm3
+    vextracti128    xm3,               ym11, 1
+    mova            [r1 + r2 * 2],     xm11
+    mova            [r1 + r3],         xm3
+    RET
+%endif
+
 %macro IDCT_PASS1 2
     vbroadcasti128  m5, [tab_idct16_2 + %1 * 16]
 
diff -r 42f980b52743 -r 94523acd49e4 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Tue Nov 07 12:10:13 2017 +0530
+++ b/source/common/x86/dct8.h	Mon Oct 30 15:54:11 2017 +0530
@@ -44,5 +44,6 @@
 void PFX(denoise_dct_avx2)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
 
 void PFX(dct8_avx512)(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void PFX(idct8_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride); // AVX-512 8x8 inverse DCT; PFX() wraps only the name, as for dct8_avx512
 
 #endif // ifndef X265_DCT8_H


More information about the x265-devel mailing list