[x265] [PATCH 175 of 307] [x265-avx512]x86: AVX512 idct16x16

mythreyi at multicorewareinc.com
Sat Apr 7 04:32:53 CEST 2018


# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1509948596 -19800
#      Mon Nov 06 11:39:56 2017 +0530
# Node ID 8bbcc1bd3c1381e936695a6eff30a17cc2633b6f
# Parent  df3c576cd32c50b0412ad3d70eeebfe8fb511da1
[x265-avx512]x86: AVX512 idct16x16

AVX2 Performance    :    11.67x
AVX512 Performance  :    12.80x
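
(The figures above are presumably the usual speedup of the vectorized primitive
over the C reference.) For orientation, the kernel added below implements the
standard two-pass 16x16 inverse transform. A minimal scalar sketch follows; it
is illustrative only, not x265's C primitive. The coefficient matrix is passed
in as a parameter rather than naming an x265 table, and the shifts and rounding
constants mirror IDCT_SHIFT1/IDCT_SHIFT2 and pd_64/pd_128/pd_512/pd_2048 in the
assembly.

    // Illustrative scalar sketch only -- not x265's C primitive.  't' stands in
    // for the 16x16 transform matrix; 'src' is 16 rows of 16 coefficients
    // (stride 16, matching the [r0 + n * 32] loads in the kernel below).
    #include <algorithm>
    #include <cstdint>

    static void idct16_sketch(const int16_t t[16][16], const int16_t* src,
                              int16_t* dst, intptr_t dstStride, int bitDepth)
    {
        const int shift1 = 7;                    // IDCT_SHIFT1 in the asm
        const int shift2 = 12 - (bitDepth - 8);  // IDCT_SHIFT2: 12, 10 or 8
        int32_t tmp[16 * 16];

        auto clip16 = [](int64_t v)
        {
            return (int32_t)std::min<int64_t>(32767, std::max<int64_t>(-32768, v));
        };

        for (int i = 0; i < 16; i++)             // pass 1, rounded by pd_64
            for (int j = 0; j < 16; j++)
            {
                int64_t sum = 0;
                for (int k = 0; k < 16; k++)
                    sum += (int64_t)t[k][i] * src[k * 16 + j];
                tmp[i * 16 + j] = clip16((sum + (1 << (shift1 - 1))) >> shift1);
            }

        for (int i = 0; i < 16; i++)             // pass 2, rounded by pd_128/512/2048
            for (int j = 0; j < 16; j++)
            {
                int64_t sum = 0;
                for (int k = 0; k < 16; k++)
                    sum += (int64_t)t[k][j] * tmp[i * 16 + k];
                dst[i * dstStride + j] = (int16_t)clip16((sum + (1 << (shift2 - 1))) >> shift2);
            }
    }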

diff -r df3c576cd32c -r 8bbcc1bd3c13 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Nov 13 16:02:40 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Nov 06 11:39:56 2017 +0530
@@ -2837,6 +2837,8 @@
 
         p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
         p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
+        p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
+
 
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
@@ -4835,6 +4837,7 @@
 
         p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
         p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
+        p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
 
     }
 #endif
diff -r df3c576cd32c -r 8bbcc1bd3c13 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Mon Nov 13 16:02:40 2017 +0530
+++ b/source/common/x86/dct8.asm	Mon Nov 06 11:39:56 2017 +0530
@@ -218,6 +218,27 @@
 
 idct16_shuff1:  dd 2, 6, 0, 4, 3, 7, 1, 5
 
+
+tab_AVX512_idct16_1:   dw 90, 87, 80, 70, 57, 43, 25, 9, 90, 87, 80, 70, 57, 43, 25, 9, 80, 9, -70, -87, -25, 57, 90, 43, 80, 9, -70, -87, -25, 57, 90, 43
+                       dw 87, 57, 9, -43, -80, -90, -70, -25, 87, 57, 9, -43, -80, -90, -70, -25, 70, -43, -87, 9, 90, 25, -80, -57, 70, -43, -87, 9, 90, 25, -80, -57
+                       dw 57, -80, -25, 90, -9, -87, 43, 70, 57, -80, -25, 90, -9, -87, 43, 70, 25, -70, 90, -80, 43, 9, -57, 87, 25, -70, 90, -80, 43, 9, -57, 87
+                       dw 43, -90, 57, 25, -87, 70, 9, -80, 43, -90, 57, 25, -87, 70, 9, -80, 9, -25, 43, -57, 70, -80, 87, -90, 9, -25, 43, -57, 70, -80, 87, -90
+
+tab_AVX512_idct16_2:   dw 64, 89, 83, 75, 64, 50, 36, 18, 64, 89, 83, 75, 64, 50, 36, 18, 64, 50, -36, -89, -64, 18, 83, 75, 64, 50, -36, -89, -64, 18, 83, 75
+                       dw 64, 75, 36, -18, -64, -89, -83, -50, 64, 75, 36, -18, -64, -89, -83, -50, 64, 18, -83, -50, 64, 75, -36, -89, 64, 18, -83, -50, 64, 75, -36, -89
+                       dw 64, -18, -83, 50, 64, -75, -36, 89, 64, -18, -83, 50, 64, -75, -36, 89, 64, -75, 36, 18, -64, 89, -83, 50, 64, -75, 36, 18, -64, 89, -83, 50
+                       dw 64, -50, -36, 89, -64, -18, 83, -75, 64, -50, -36, 89, -64, -18, 83, -75, 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18
+
+idct16_AVX512_shuff:   dd 0, 4, 2, 6, 1, 5, 3, 7, 8, 12, 10, 14, 9, 13, 11, 15
+
+idct16_AVX512_shuff1:  dd 2, 6, 0, 4, 3, 7, 1, 5, 10, 14, 8, 12, 11, 15, 9, 13
+
+idct16_AVX512_shuff2:   dq 0, 1, 8, 9, 4, 5, 12, 13
+idct16_AVX512_shuff3:   dq 2, 3, 10, 11, 6, 7, 14, 15
+idct16_AVX512_shuff4:   dq 4, 5, 12, 13, 0, 1, 8, 9
+idct16_AVX512_shuff5:   dq 6, 7, 14, 15, 2, 3, 10, 11
+idct16_AVX512_shuff6:   times 4 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
+
 tab_idct32_1:   dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
                 dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
                 dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
@@ -3671,6 +3692,599 @@
     jnz             .pass2
     RET
 
+
+%macro IDCT16_AVX512_PASS1 3
+    movu            m5,  [tab_AVX512_idct16_2 + %1 * 64]
+    pmaddwd         m9, m0, m5
+    pmaddwd         m10, m7, m5
+
+    vpsrldq         m16,   m9, 4
+    paddd            m9,  m16
+    vpslldq         m17,   m10, 4
+    paddd            m10,  m17
+    vmovdqu32        m9   {k1}, m10
+
+    pmaddwd         m10, m6, m5
+    pmaddwd         m11, m8, m5
+
+    vpsrldq         m16,   m10, 4
+    paddd            m10,  m16
+    vpslldq         m17,   m11, 4
+    paddd            m11,  m17
+    vmovdqu32        m10   {k1}, m11
+
+    vpsrldq         m16,   m9, 8
+    paddd            m9,  m16
+    vpslldq         m17,   m10, 8
+    paddd            m10,  m17
+    vmovdqu32        m9   {k2}, m10
+
+    movu            m5,  [tab_AVX512_idct16_1 + %1 * 64]
+    pmaddwd         m10, m1, m5
+    pmaddwd         m11, m3, m5
+
+    vpsrldq         m16,   m10, 4
+    paddd            m10,  m16
+    vpslldq         m17,   m11, 4
+    paddd            m11,  m17
+    vmovdqu32        m10   {k1}, m11
+
+    pmaddwd         m11, m4, m5
+    pmaddwd         m12, m2, m5
+
+    vpsrldq         m16,   m11, 4
+    paddd            m11,  m16
+    vpslldq         m17,   m12, 4
+    paddd            m12,  m17
+    vmovdqu32        m11   {k1}, m12
+
+    vpsrldq         m16,   m10, 8
+    paddd            m10,  m16
+    vpslldq         m17,   m11, 8
+    paddd            m11,  m17
+    vmovdqu32        m10   {k2}, m11
+
+    paddd           m11, m9, m10
+    paddd           m11, m14
+    psrad           m11, IDCT_SHIFT1
+
+    psubd           m9, m10
+    paddd           m9, m14
+    psrad           m9, IDCT_SHIFT1
+
+    movu            m5,  [tab_AVX512_idct16_2 + %1 * 64 + 64]
+    pmaddwd         m10, m0, m5
+    pmaddwd         m12, m7, m5
+
+
+    vpsrldq         m16,   m10, 4
+    paddd            m10,  m16
+    vpslldq         m17,   m12, 4
+    paddd            m12,  m17
+    vmovdqu32        m10   {k1}, m12
+
+    pmaddwd         m12, m6, m5
+    pmaddwd         m13, m8, m5
+
+
+    vpsrldq         m16,   m12, 4
+    paddd            m12,  m16
+    vpslldq         m17,   m13, 4
+    paddd            m13,  m17
+    vmovdqu32        m12   {k1}, m13
+
+
+    vpsrldq         m16,   m10, 8
+    paddd            m10,  m16
+    vpslldq         m17,   m12, 8
+    paddd            m12,  m17
+    vmovdqu32        m10   {k2}, m12
+
+
+
+    movu            m5,  [tab_AVX512_idct16_1 + %1 * 64 + 64]
+    pmaddwd         m12, m1, m5
+    pmaddwd         m13, m3, m5
+
+
+    vpsrldq         m16,   m12, 4
+    paddd            m12,  m16
+    vpslldq         m17,   m13, 4
+    paddd            m13,  m17
+    vmovdqu32        m12   {k1}, m13
+
+    pmaddwd         m13, m4, m5
+    pmaddwd         m5, m2
+
+
+    vpsrldq         m16,   m13, 4
+    paddd            m13,  m16
+    vpslldq         m17,   m5, 4
+    paddd            m5,  m17
+    vmovdqu32        m13   {k1}, m5
+
+
+    vpsrldq         m16,   m12, 8
+    paddd            m12,  m16
+    vpslldq         m17,   m13, 8
+    paddd            m13,  m17
+    vmovdqu32        m12   {k2}, m13
+
+
+    paddd           m5, m10, m12
+    paddd           m5, m14
+    psrad           m5, IDCT_SHIFT1
+
+    psubd           m10, m12
+    paddd           m10, m14
+    psrad           m10, IDCT_SHIFT1
+
+    packssdw        m11, m5
+    packssdw        m9, m10
+
+    movu            m10, [idct16_AVX512_shuff]
+    movu            m5,  [idct16_AVX512_shuff1]
+
+    vpermd          m%2, m10, m11
+    vpermd          m%3, m5, m9
+%endmacro
+
+%macro IDCT16_AVX512_PASS2 2
+    vpermq          m0, m%1, 0xD8
+
+    pmaddwd         m1, m0, m7
+    pmaddwd         m2, m0, m8
+
+
+    vpsrldq         m14,   m1, 4
+    paddd            m1,  m14
+    vpslldq         m31,   m2, 4
+    paddd            m2,  m31
+    vmovdqu32        m1   {k1}, m2
+
+    pmaddwd         m2, m0, m9
+    pmaddwd         m3, m0, m10
+
+
+    vpsrldq         m14,   m2, 4
+    paddd            m2,  m14
+    vpslldq         m31,   m3, 4
+    paddd            m3,  m31
+    vmovdqu32        m2   {k1}, m3
+
+
+    vpsrldq         m14,   m1, 8
+    paddd            m1,  m14
+    vpslldq         m31,   m2, 8
+    paddd            m2,  m31
+    vmovdqu32        m1   {k2}, m2
+
+    pmaddwd         m2, m0, m11
+    pmaddwd         m3, m0, m12
+
+
+    vpsrldq         m14,   m2, 4
+    paddd            m2,  m14
+    vpslldq         m31,   m3, 4
+    paddd            m3,  m31
+    vmovdqu32        m2   {k1}, m3
+
+    vbroadcasti64x2  m14, [r5 + 112]
+    pmaddwd         m3, m0, m13
+    pmaddwd         m4, m0, m14
+
+
+    vpsrldq         m14,   m3, 4
+    paddd            m3,  m14
+    vpslldq         m31,   m4, 4
+    paddd            m4,  m31
+    vmovdqu32        m3   {k1}, m4
+
+
+    vpsrldq         m14,   m2, 8
+    paddd            m2,  m14
+    vpslldq         m31,   m3, 8
+    paddd            m3,  m31
+    vmovdqu32        m2   {k2}, m3
+
+    vpermq          m0, m%2, 0xD8
+    pmaddwd         m3, m0, m16
+    pmaddwd         m4, m0, m17
+
+
+    vpsrldq         m14,   m3, 4
+    paddd            m3,  m14
+    vpslldq         m31,   m4, 4
+    paddd            m4,  m31
+    vmovdqu32        m3   {k1}, m4
+
+    pmaddwd         m4, m0, m19
+    pmaddwd         m5, m0, m23
+
+
+    vpsrldq         m14,   m4, 4
+    paddd            m4,  m14
+    vpslldq         m31,   m5, 4
+    paddd            m5,  m31
+    vmovdqu32        m4   {k1}, m5
+
+
+    vpsrldq         m14,   m3, 8
+    paddd            m3,  m14
+    vpslldq         m31,   m4, 8
+    paddd            m4,  m31
+    vmovdqu32        m3   {k2}, m4
+
+
+    pmaddwd         m4, m0, m28
+    pmaddwd         m5, m0, m29
+
+    vpsrldq         m14,   m4, 4
+    paddd            m4,  m14
+    vpslldq         m31,   m5, 4
+    paddd            m5,  m31
+    vmovdqu32        m4   {k1}, m5
+
+    pmaddwd         m6, m0, m30
+    vbroadcasti64x2  m31, [r6 + 112]
+    pmaddwd         m0, m31
+
+
+    vpsrldq         m14,   m6, 4
+    paddd            m6,  m14
+    vpslldq         m31,   m0, 4
+    paddd            m0,  m31
+    vmovdqu32        m6   {k1}, m0
+
+
+    vpsrldq         m14,   m4, 8
+    paddd            m4,  m14
+    vpslldq         m31,   m6, 8
+    paddd            m6,  m31
+    vmovdqu32        m4   {k2}, m6
+
+    paddd           m5, m1, m3
+    paddd           m5, m15
+    psrad           m5, IDCT_SHIFT2
+
+    psubd           m1, m3
+    paddd           m1, m15
+    psrad           m1, IDCT_SHIFT2
+
+    paddd           m6, m2, m4
+    paddd           m6, m15
+    psrad           m6, IDCT_SHIFT2
+
+    psubd           m2, m4
+    paddd           m2, m15
+    psrad           m2, IDCT_SHIFT2
+
+    packssdw        m5, m6
+    packssdw        m1, m2
+    pshufb          m2, m1, [idct16_AVX512_shuff6]
+%endmacro
+
+
+;-------------------------------------------------------
+; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
+;-------------------------------------------------------
+INIT_ZMM avx512
+cglobal idct16, 3, 8, 32
+%if BIT_DEPTH == 12
+    %define         IDCT_SHIFT2        8
+    vpbroadcastd    m15,                [pd_128]
+%elif BIT_DEPTH == 10
+    %define         IDCT_SHIFT2        10
+    vpbroadcastd    m15,                [pd_512]
+%elif BIT_DEPTH == 8
+    %define         IDCT_SHIFT2        12
+    vpbroadcastd    m15,                [pd_2048]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+%define             IDCT_SHIFT1         7
+
+    vpbroadcastd    m14,               [pd_64]
+
+    add             r2d,               r2d
+
+    mov             r7d,    0xAAAA
+    kmovd            k1,    r7d
+    mov             r7d,    0xCCCC
+    kmovd            k2,    r7d
+
+.pass1:
+     movu            xm0, [r0 +  0 * 32]
+     movu            xm1, [r0 +  8 * 32]
+     punpckhqdq      xm2, xm0, xm1
+     punpcklqdq      xm0, xm1
+     vinserti128     ym0, ym0, xm2, 1
+
+     movu            xm1, [r0 +  1 * 32]
+     movu            xm2, [r0 +  9 * 32]
+     punpckhqdq      xm3, xm1, xm2
+     punpcklqdq      xm1, xm2
+     vinserti128     ym1, ym1, xm3, 1
+
+     movu            xm2, [r0 + 2  * 32]
+     movu            xm3, [r0 + 10 * 32]
+     punpckhqdq      xm4, xm2, xm3
+     punpcklqdq      xm2, xm3
+     vinserti128     ym2, ym2, xm4, 1
+
+     movu            xm3, [r0 + 3  * 32]
+     movu            xm4, [r0 + 11 * 32]
+     punpckhqdq      xm5, xm3, xm4
+     punpcklqdq      xm3, xm4
+     vinserti128     ym3, ym3, xm5, 1
+
+     movu            xm4, [r0 + 4  * 32]
+     movu            xm5, [r0 + 12 * 32]
+     punpckhqdq      xm6, xm4, xm5
+     punpcklqdq      xm4, xm5
+     vinserti128     ym4, ym4, xm6, 1
+
+     movu            xm5, [r0 + 5  * 32]
+     movu            xm6, [r0 + 13 * 32]
+     punpckhqdq      xm7, xm5, xm6
+     punpcklqdq      xm5, xm6
+     vinserti128     ym5, ym5, xm7, 1
+
+     movu            xm6, [r0 + 6  * 32]
+     movu            xm7, [r0 + 14 * 32]
+     punpckhqdq      xm8, xm6, xm7
+     punpcklqdq      xm6, xm7
+     vinserti128     ym6, ym6, xm8, 1
+
+     movu            xm7, [r0 + 7  * 32]
+     movu            xm8, [r0 + 15 * 32]
+     punpckhqdq      xm9, xm7, xm8
+     punpcklqdq      xm7, xm8
+     vinserti128     ym7, ym7, xm9, 1
+
+    punpckhwd       ym8, ym0, ym2                ;[8 10]
+    punpcklwd       ym0, ym2                    ;[0 2]
+
+    punpckhwd       ym2, ym1, ym3                ;[9 11]
+    punpcklwd       ym1, ym3                    ;[1 3]
+
+    punpckhwd       ym3, ym4, ym6                ;[12 14]
+    punpcklwd       ym4, ym6                    ;[4 6]
+
+    punpckhwd       ym6, ym5, ym7                ;[13 15]
+    punpcklwd       ym5, ym7                    ;[5 7]
+
+    punpckhdq       ym7, ym0, ym4                ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67]
+    punpckldq       ym0, ym4                    ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65]
+
+    punpckhdq       ym4, ym8, ym3                ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147]
+    punpckldq       ym8, ym3                    ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145]
+
+    punpckhdq       ym3, ym1, ym5                ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77]
+    punpckldq       ym1, ym5                    ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75]
+
+    punpckhdq       ym5, ym2, ym6                ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157]
+    punpckldq       ym2, ym6                    ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155]
+
+    punpckhqdq      ym6, ym0, ym8                ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145]
+    punpcklqdq      ym0, ym8                    ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144]
+
+    punpckhqdq      ym8, ym7, ym4                ;[03 23 43 63 83 103 123 143 07 27 47 67 87 107 127 147]
+    punpcklqdq      ym7, ym4                    ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146]
+
+    punpckhqdq      ym4, ym1, ym2                ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155]
+    punpcklqdq      ym1, ym2                    ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154]
+
+    punpckhqdq      ym2, ym3, ym5                ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157]
+    punpcklqdq      ym3, ym5                    ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156]
+
+    vinserti64x4    m6,        m6,      ym6, 1
+    vinserti64x4    m0,        m0,      ym0, 1
+    vinserti64x4    m8,        m8,      ym8, 1
+    vinserti64x4    m7,        m7,      ym7, 1
+    vinserti64x4    m4,        m4,      ym4, 1
+    vinserti64x4    m1,        m1,      ym1, 1
+    vinserti64x4    m2,        m2,      ym2, 1
+    vinserti64x4    m3,        m3,      ym3, 1
+
+
+    IDCT16_AVX512_PASS1      0, 18, 19
+    IDCT16_AVX512_PASS1      2, 20, 21
+
+    add             r0, 16
+
+     movu            xm0, [r0 +  0 * 32]
+     movu            xm1, [r0 +  8 * 32]
+     punpckhqdq      xm2, xm0, xm1
+     punpcklqdq      xm0, xm1
+     vinserti128     ym0, ym0, xm2, 1
+
+     movu            xm1, [r0 +  1 * 32]
+     movu            xm2, [r0 +  9 * 32]
+     punpckhqdq      xm3, xm1, xm2
+     punpcklqdq      xm1, xm2
+     vinserti128     ym1, ym1, xm3, 1
+
+     movu            xm2, [r0 + 2  * 32]
+     movu            xm3, [r0 + 10 * 32]
+     punpckhqdq      xm4, xm2, xm3
+     punpcklqdq      xm2, xm3
+     vinserti128     ym2, ym2, xm4, 1
+
+     movu            xm3, [r0 + 3  * 32]
+     movu            xm4, [r0 + 11 * 32]
+     punpckhqdq      xm5, xm3, xm4
+     punpcklqdq      xm3, xm4
+     vinserti128     ym3, ym3, xm5, 1
+
+     movu            xm4, [r0 + 4  * 32]
+     movu            xm5, [r0 + 12 * 32]
+     punpckhqdq      xm6, xm4, xm5
+     punpcklqdq      xm4, xm5
+     vinserti128     ym4, ym4, xm6, 1
+
+     movu            xm5, [r0 + 5  * 32]
+     movu            xm6, [r0 + 13 * 32]
+     punpckhqdq      xm7, xm5, xm6
+     punpcklqdq      xm5, xm6
+     vinserti128     ym5, ym5, xm7, 1
+
+     movu            xm6, [r0 + 6  * 32]
+     movu            xm7, [r0 + 14 * 32]
+     punpckhqdq      xm8, xm6, xm7
+     punpcklqdq      xm6, xm7
+     vinserti128     ym6, ym6, xm8, 1
+
+     movu            xm7, [r0 + 7  * 32]
+     movu            xm8, [r0 + 15 * 32]
+     punpckhqdq      xm9, xm7, xm8
+     punpcklqdq      xm7, xm8
+     vinserti128     ym7, ym7, xm9, 1
+
+    punpckhwd       ym8, ym0, ym2                ;[8 10]
+    punpcklwd       ym0, ym2                    ;[0 2]
+
+    punpckhwd       ym2, ym1, ym3                ;[9 11]
+    punpcklwd       ym1, ym3                    ;[1 3]
+
+    punpckhwd       ym3, ym4, ym6                ;[12 14]
+    punpcklwd       ym4, ym6                    ;[4 6]
+
+    punpckhwd       ym6, ym5, ym7                ;[13 15]
+    punpcklwd       ym5, ym7                    ;[5 7]
+
+    punpckhdq       ym7, ym0, ym4                ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67]
+    punpckldq       ym0, ym4                    ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65]
+
+    punpckhdq       ym4, ym8, ym3                ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147]
+    punpckldq       ym8, ym3                    ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145]
+
+    punpckhdq       ym3, ym1, ym5                ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77]
+    punpckldq       ym1, ym5                    ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75]
+
+    punpckhdq       ym5, ym2, ym6                ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157]
+    punpckldq       ym2, ym6                    ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155]
+
+    punpckhqdq      ym6, ym0, ym8                ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145]
+    punpcklqdq      ym0, ym8                    ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144]
+
+    punpckhqdq      ym8, ym7, ym4                ;[03 23 43 63 83 103 123 143 07 27 47 67 87 107 127 147]
+    punpcklqdq      ym7, ym4                    ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146]
+
+    punpckhqdq      ym4, ym1, ym2                ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155]
+    punpcklqdq      ym1, ym2                    ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154]
+
+    punpckhqdq      ym2, ym3, ym5                ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157]
+    punpcklqdq      ym3, ym5                    ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156]
+
+    vinserti64x4    m6,        m6,      ym6, 1
+    vinserti64x4    m0,        m0,      ym0, 1
+    vinserti64x4    m8,        m8,      ym8, 1
+    vinserti64x4    m7,        m7,      ym7, 1
+    vinserti64x4    m4,        m4,      ym4, 1
+    vinserti64x4    m1,        m1,      ym1, 1
+    vinserti64x4    m2,        m2,      ym2, 1
+    vinserti64x4    m3,        m3,      ym3, 1
+
+
+    IDCT16_AVX512_PASS1      0, 22, 23
+    IDCT16_AVX512_PASS1      2, 24, 25
+
+    movu       m26,    [idct16_AVX512_shuff2]
+    movu       m27,    [idct16_AVX512_shuff3]
+    vpermi2q   m26,    m18, m22
+    vpermi2q   m27,    m18, m22
+    movu       m18,    [idct16_AVX512_shuff2]
+    movu       m22,    [idct16_AVX512_shuff3]
+    vpermi2q   m18,    m20, m24
+    vpermi2q   m22,    m20, m24
+    movu       m20,    [idct16_AVX512_shuff4]
+    movu       m24,    [idct16_AVX512_shuff5]
+    vpermi2q   m20,    m21, m25
+    vpermi2q   m24,    m21, m25
+    movu       m21,    [idct16_AVX512_shuff4]
+    movu       m25,    [idct16_AVX512_shuff5]
+    vpermi2q   m21,    m19, m23
+    vpermi2q   m25,    m19, m23
+
+    lea             r5, [tab_idct16_2]
+    lea             r6, [tab_idct16_1]
+
+    vbroadcasti64x2  m7,  [r5]
+    vbroadcasti64x2  m8,  [r5 + 16]
+    vbroadcasti64x2  m9,  [r5 + 32]
+    vbroadcasti64x2  m10, [r5 + 48]
+    vbroadcasti64x2  m11, [r5 + 64]
+    vbroadcasti64x2  m12, [r5 + 80]
+    vbroadcasti64x2  m13, [r5 + 96]
+
+    vbroadcasti64x2  m16, [r6]
+    vbroadcasti64x2  m17, [r6 + 16]
+    vbroadcasti64x2  m19, [r6 + 32]
+    vbroadcasti64x2  m23, [r6 + 48]
+    vbroadcasti64x2  m28, [r6 + 64]
+    vbroadcasti64x2  m29, [r6 + 80]
+    vbroadcasti64x2  m30, [r6 + 96]
+
+
+    IDCT16_AVX512_PASS2 26, 27
+     mova            [r1], xm5
+     mova            [r1 + 16], xm2
+     vextracti128    [r1 + r2], ym5, 1
+     vextracti128    [r1 + r2 + 16], ym2, 1
+     vextracti64x4   ym14, m5, 1
+     vextracti64x4   ym31, m2, 1
+     lea             r1, [r1 + 2 * r2]
+     mova            [r1], xm14
+     mova            [r1 + 16], xm31
+     vextracti128    [r1 + r2], ym14, 1
+     vextracti128    [r1 + r2 + 16], ym31, 1
+
+    IDCT16_AVX512_PASS2 18, 22
+     lea             r1, [r1 + 2 * r2]
+     mova            [r1], xm5
+     mova            [r1 + 16], xm2
+     vextracti128    [r1 + r2], ym5, 1
+     vextracti128    [r1 + r2 + 16], ym2, 1
+     vextracti64x4   ym14, m5, 1
+     vextracti64x4   ym31, m2, 1
+     lea             r1, [r1 + 2 * r2]
+     mova            [r1], xm14
+     mova            [r1 + 16], xm31
+     vextracti128    [r1 + r2], ym14, 1
+     vextracti128    [r1 + r2 + 16], ym31, 1
+
+    IDCT16_AVX512_PASS2 20, 24
+     lea             r1, [r1 + 2 * r2]
+     mova            [r1], xm5
+     mova            [r1 + 16], xm2
+     vextracti128    [r1 + r2], ym5, 1
+     vextracti128    [r1 + r2 + 16], ym2, 1
+     vextracti64x4   ym14, m5, 1
+     vextracti64x4   ym31, m2, 1
+     lea             r1, [r1 + 2 * r2]
+     mova            [r1], xm14
+     mova            [r1 + 16], xm31
+     vextracti128    [r1 + r2], ym14, 1
+     vextracti128    [r1 + r2 + 16], ym31, 1
+
+    IDCT16_AVX512_PASS2 21, 25
+     lea             r1, [r1 + 2 * r2]
+     mova            [r1], xm5
+     mova            [r1 + 16], xm2
+     vextracti128    [r1 + r2], ym5, 1
+     vextracti128    [r1 + r2 + 16], ym2, 1
+     vextracti64x4   ym14, m5, 1
+     vextracti64x4   ym31, m2, 1
+     lea             r1, [r1 + 2 * r2]
+     mova            [r1], xm14
+     mova            [r1 + 16], xm31
+     vextracti128    [r1 + r2], ym14, 1
+     vextracti128    [r1 + r2 + 16], ym31, 1
+    RET
+
+
+
 %macro IDCT32_PASS1 1
     vbroadcasti128  m3, [tab_idct32_1 + %1 * 32]
     vbroadcasti128  m13, [tab_idct32_1 + %1 * 32 + 16]
diff -r df3c576cd32c -r 8bbcc1bd3c13 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Mon Nov 13 16:02:40 2017 +0530
+++ b/source/common/x86/dct8.h	Mon Nov 06 11:39:56 2017 +0530
@@ -45,5 +45,6 @@
 
 void PFX(dct8_avx512)(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void PFX(idct8_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void PFX(idct16_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride);
 
 #endif // ifndef X265_DCT8_H
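
Taken together, the hunks add the AVX-512 kernel to dct8.asm, declare its
prototype in dct8.h, and register it in both the 8-bit and high-bit-depth
AVX-512 setup blocks of asm-primitives.cpp, so callers reach it through
p.cu[BLOCK_16x16].idct. A hypothetical direct invocation, assuming the default
"x265_" prefix that PFX() expands to and an AVX-512 capable CPU, might look
like this:

    // Hypothetical call site, assuming the default "x265_" symbol prefix
    // produced by PFX() and an AVX-512 capable CPU.  dstStride is in int16_t
    // units; the asm converts it to bytes with 'add r2d, r2d'.
    #include <cstdint>

    extern "C" void x265_idct16_avx512(const int16_t* src, int16_t* dst,
                                       intptr_t dstStride);

    alignas(64) int16_t coeff[16 * 16];      // dequantized coefficients, stride 16
    alignas(64) int16_t residual[16 * 16];   // reconstructed residual, stride 16

    int main()
    {
        x265_idct16_avx512(coeff, residual, 16);
        return 0;
    }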

