[x265] [PATCH 267 of 307] [x265-avx512]x86: AVX512 optimize idct16x16

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:34:25 CEST 2018


# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1513930936 -19800
#      Fri Dec 22 13:52:16 2017 +0530
# Node ID a2224f4d257cf5f5cd391f455aae3117b7fe65ab
# Parent  e883724b1af9f60e9d91be3aa6fe7b949e782684
[x265-avx512]x86: AVX512 optimize idct16x16
AVX2 Performance          :    11.63x
AVX512 Performance (old)  :    13.07x
AVX512 Performance (opt)  :    13.72x

Overall 15.23% gain over AVX2 (1 - 11.63/13.72, from the AVX2 and optimized AVX512 figures above)

diff -r e883724b1af9 -r a2224f4d257c source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Tue Dec 26 15:35:24 2017 +0530
+++ b/source/common/x86/dct8.asm	Fri Dec 22 13:52:16 2017 +0530
@@ -288,6 +288,10 @@
 idct16_shuff:   dd 0, 4, 2, 6, 1, 5, 3, 7
 
 idct16_shuff1:  dd 2, 6, 0, 4, 3, 7, 1, 5
+idct16_shuff2:  dw 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
+idct16_shuff3:  dw 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31
+idct16_shuff4:  dd 0, 8, 2, 10, 4, 12, 6, 14
+idct16_shuff5:  dd 1, 9, 3, 11, 5, 13, 7, 15
 
 
 tab_AVX512_idct16_1:   dw 90, 87, 80, 70, 57, 43, 25, 9, 90, 87, 80, 70, 57, 43, 25, 9, 80, 9, -70, -87, -25, 57, 90, 43, 80, 9, -70, -87, -25, 57, 90, 43
@@ -4797,8 +4801,8 @@
 
 %macro IDCT16_AVX512_PASS1 3
     movu            m5,  [tab_AVX512_idct16_2 + %1 * 64]
-    pmaddwd         m9, m0, m5
-    pmaddwd         m10, m7, m5
+    pmaddwd         m9, m4, m5
+    pmaddwd         m10, m6, m5
 
     vpsrldq         m16,   m9, 4
     paddd            m9,  m16
@@ -4806,7 +4810,7 @@
     paddd            m10,  m17
     vmovdqu32        m9   {k1}, m10
 
-    pmaddwd         m10, m6, m5
+    pmaddwd         m10, m7, m5
     pmaddwd         m11, m8, m5
 
     vpsrldq         m16,   m10, 4
@@ -4822,8 +4826,8 @@
     vmovdqu32        m9   {k2}, m10
 
     mova            m5,  [tab_AVX512_idct16_1 + %1 * 64]
-    pmaddwd         m10, m1, m5
-    pmaddwd         m11, m3, m5
+    pmaddwd         m10, m28, m5
+    pmaddwd         m11, m29, m5
 
     vpsrldq         m16,   m10, 4
     paddd            m10,  m16
@@ -4831,8 +4835,8 @@
     paddd            m11,  m17
     vmovdqu32        m10   {k1}, m11
 
-    pmaddwd         m11, m4, m5
-    pmaddwd         m12, m2, m5
+    pmaddwd         m11, m30, m5
+    pmaddwd         m12, m31, m5
 
     vpsrldq         m16,   m11, 4
     paddd            m11,  m16
@@ -4855,8 +4859,8 @@
     psrad           m9, IDCT_SHIFT1
 
     mova            m5,  [tab_AVX512_idct16_2 + %1 * 64 + 64]
-    pmaddwd         m10, m0, m5
-    pmaddwd         m12, m7, m5
+    pmaddwd         m10, m4, m5
+    pmaddwd         m12, m6, m5
 
 
     vpsrldq         m16,   m10, 4
@@ -4865,7 +4869,7 @@
     paddd            m12,  m17
     vmovdqu32        m10   {k1}, m12
 
-    pmaddwd         m12, m6, m5
+    pmaddwd         m12, m7, m5
     pmaddwd         m13, m8, m5
 
 
@@ -4885,8 +4889,8 @@
 
 
     mova            m5,  [tab_AVX512_idct16_1 + %1 * 64 + 64] 
-    pmaddwd         m12, m1, m5
-    pmaddwd         m13, m3, m5
+    pmaddwd         m12, m28, m5
+    pmaddwd         m13, m29, m5
 
 
     vpsrldq         m16,   m12, 4
@@ -4895,8 +4899,8 @@
     paddd            m13,  m17
     vmovdqu32        m12   {k1}, m13
 
-    pmaddwd         m13, m4, m5
-    pmaddwd         m5, m2
+    pmaddwd         m13, m30, m5
+    pmaddwd         m5, m31
 
 
     vpsrldq         m16,   m13, 4
@@ -5094,199 +5098,174 @@
     kmovd            k1,    r7d
     mov             r7d,    0xCCCC
     kmovd            k2,    r7d
+    mova          ym2, [idct16_shuff2]
+    mova          ym3, [idct16_shuff3]
+    mova         ym26, [idct16_shuff4]
+    mova         ym27, [idct16_shuff5]
 
 .pass1:
-     movu            xm0, [r0 +  0 * 32]
-     movu            xm1, [r0 +  8 * 32]
-     punpckhqdq      xm2, xm0, xm1
-     punpcklqdq      xm0, xm1
-     vinserti128     ym0, ym0, xm2, 1
-
-     movu            xm1, [r0 +  1 * 32]
-     movu            xm2, [r0 +  9 * 32]
-     punpckhqdq      xm3, xm1, xm2
-     punpcklqdq      xm1, xm2
-     vinserti128     ym1, ym1, xm3, 1
-
-     movu            xm2, [r0 + 2  * 32]
-     movu            xm3, [r0 + 10 * 32]
-     punpckhqdq      xm4, xm2, xm3
-     punpcklqdq      xm2, xm3
-     vinserti128     ym2, ym2, xm4, 1
-
-     movu            xm3, [r0 + 3  * 32]
-     movu            xm4, [r0 + 11 * 32]
-     punpckhqdq      xm5, xm3, xm4
-     punpcklqdq      xm3, xm4
-     vinserti128     ym3, ym3, xm5, 1
-
-     movu            xm4, [r0 + 4  * 32]
-     movu            xm5, [r0 + 12 * 32]
-     punpckhqdq      xm6, xm4, xm5
-     punpcklqdq      xm4, xm5
-     vinserti128     ym4, ym4, xm6, 1
-
-     movu            xm5, [r0 + 5  * 32]
-     movu            xm6, [r0 + 13 * 32]
-     punpckhqdq      xm7, xm5, xm6
-     punpcklqdq      xm5, xm6
-     vinserti128     ym5, ym5, xm7, 1
-
-     movu            xm6, [r0 + 6  * 32]
-     movu            xm7, [r0 + 14 * 32]
-     punpckhqdq      xm8, xm6, xm7
-     punpcklqdq      xm6, xm7
-     vinserti128     ym6, ym6, xm8, 1
-
-     movu            xm7, [r0 + 7  * 32]
-     movu            xm8, [r0 + 15 * 32]
-     punpckhqdq      xm9, xm7, xm8
-     punpcklqdq      xm7, xm8
-     vinserti128     ym7, ym7, xm9, 1
-
-    punpckhwd       ym8, ym0, ym2                ;[8 10]
-    punpcklwd       ym0, ym2                    ;[0 2]
-
-    punpckhwd       ym2, ym1, ym3                ;[9 11]
-    punpcklwd       ym1, ym3                    ;[1 3]
-
-    punpckhwd       ym3, ym4, ym6                ;[12 14]
-    punpcklwd       ym4, ym6                    ;[4 6]
-
-    punpckhwd       ym6, ym5, ym7                ;[13 15]
-    punpcklwd       ym5, ym7                    ;[5 7]
-
-    punpckhdq       ym7, ym0, ym4                ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67]
-    punpckldq       ym0, ym4                    ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65]
-
-    punpckhdq       ym4, ym8, ym3                ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147]
-    punpckldq       ym8, ym3                    ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145]
-
-    punpckhdq       ym3, ym1, ym5                ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77]
-    punpckldq       ym1, ym5                    ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75]
-
-    punpckhdq       ym5, ym2, ym6                ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157]
-    punpckldq       ym2, ym6                    ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155]
-
-    punpckhqdq      ym6, ym0, ym8                ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145]
-    punpcklqdq      ym0, ym8                    ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144]
-
-    punpckhqdq      ym8, ym7, ym4                ;[03 23 43 63 43 103 123 143 07 27 47 67 87 107 127 147]
-    punpcklqdq      ym7, ym4                    ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146]
-
-    punpckhqdq      ym4, ym1, ym2                ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155]
-    punpcklqdq      ym1, ym2                    ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154]
-
-    punpckhqdq      ym2, ym3, ym5                ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157]
-    punpcklqdq      ym3, ym5                    ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156]
-
-    vinserti64x4    m6,        m6,      ym6, 1
-    vinserti64x4    m0,        m0,      ym0, 1
-    vinserti64x4    m8,        m8,      ym8, 1
-    vinserti64x4    m7,        m7,      ym7, 1
-    vinserti64x4    m4,        m4,      ym4, 1
-    vinserti64x4    m1,        m1,      ym1, 1
-    vinserti64x4    m2,        m2,      ym2, 1
-    vinserti64x4    m3,        m3,      ym3, 1
-
+    movu          xm0, [r0 + 0 * 32]
+    vinserti128   ym0, ym0, [r0 + 8 * 32], 1
+    movu          xm1, [r0 + 2 * 32]
+    vinserti128   ym1, ym1, [r0 + 10 * 32], 1
+
+    mova          ym9, ym2
+    mova         ym10, ym3
+    vpermi2w      ym9, ym0, ym1
+    vpermi2w     ym10, ym0, ym1
+
+    movu          xm0, [r0 + 4 * 32]
+    vinserti128   ym0, ym0, [r0 + 12 * 32], 1
+    movu          xm1, [r0 + 6 * 32]
+    vinserti128   ym1, ym1, [r0 + 14 * 32], 1
+
+    mova         ym11, ym2
+    mova         ym12, ym3
+    vpermi2w     ym11, ym0,  ym1
+    vpermi2w     ym12, ym0,  ym1
+
+    mova         ym4,  ym26
+    mova         ym6,  ym27
+    vpermi2d     ym4,   ym9, ym11
+    vpermi2d     ym6,   ym9, ym11
+
+    mova         ym7, ym26
+    mova         ym8, ym27
+    vpermi2d     ym7, ym10, ym12
+    vpermi2d     ym8, ym10, ym12
+
+    vpermq       ym4, ym4,  q3120
+    vpermq       ym6, ym6,  q3120
+    vpermq       ym7, ym7,  q3120
+    vpermq       ym8, ym8,  q3120
+
+    movu          xm0, [r0 + 1 * 32]
+    vinserti128   ym0, ym0, [r0 + 9 * 32], 1
+    movu          xm1, [r0 + 3 * 32]
+    vinserti128   ym1, ym1, [r0 + 11 * 32], 1
+
+    mova          ym9, ym2
+    mova         ym10, ym3
+    vpermi2w      ym9,  ym0, ym1
+    vpermi2w     ym10,  ym0, ym1
+
+    movu          xm0, [r0 + 5 * 32]
+    vinserti128   ym0, ym0, [r0 + 13 * 32], 1
+    movu          xm1, [r0 + 7 * 32]
+    vinserti128   ym1, ym1, [r0 + 15 * 32], 1
+
+    mova         ym11,  ym2
+    mova         ym12,  ym3
+    vpermi2w     ym11,  ym0,  ym1
+    vpermi2w     ym12,  ym0,  ym1
+
+    mova         ym28,  ym26
+    mova         ym29,  ym27
+    vpermi2d     ym28,  ym9, ym11
+    vpermi2d     ym29,  ym9, ym11
+
+    mova         ym30, ym26
+    mova         ym31, ym27
+    vpermi2d     ym30, ym10, ym12
+    vpermi2d     ym31, ym10, ym12
+
+    vpermq       ym28, ym28,  q3120
+    vpermq       ym29, ym29,  q3120
+    vpermq       ym30, ym30,  q3120
+    vpermq       ym31, ym31,  q3120
+
+    vinserti64x4    m4,          m4,      ym4, 1
+    vinserti64x4    m6,          m6,      ym6, 1
+    vinserti64x4    m7,          m7,      ym7, 1
+    vinserti64x4    m8,          m8,      ym8, 1
+    vinserti64x4    m28,        m28,      ym28, 1
+    vinserti64x4    m29,        m29,      ym29, 1
+    vinserti64x4    m30,        m30,      ym30, 1
+    vinserti64x4    m31,        m31,      ym31, 1
 
     IDCT16_AVX512_PASS1      0, 18, 19
     IDCT16_AVX512_PASS1      2, 20, 21
 
     add             r0, 16
 
-     movu            xm0, [r0 +  0 * 32]
-     movu            xm1, [r0 +  8 * 32]
-     punpckhqdq      xm2, xm0, xm1
-     punpcklqdq      xm0, xm1
-     vinserti128     ym0, ym0, xm2, 1
-
-     movu            xm1, [r0 +  1 * 32]
-     movu            xm2, [r0 +  9 * 32]
-     punpckhqdq      xm3, xm1, xm2
-     punpcklqdq      xm1, xm2
-     vinserti128     ym1, ym1, xm3, 1
-
-     movu            xm2, [r0 + 2  * 32]
-     movu            xm3, [r0 + 10 * 32]
-     punpckhqdq      xm4, xm2, xm3
-     punpcklqdq      xm2, xm3
-     vinserti128     ym2, ym2, xm4, 1
-
-     movu            xm3, [r0 + 3  * 32]
-     movu            xm4, [r0 + 11 * 32]
-     punpckhqdq      xm5, xm3, xm4
-     punpcklqdq      xm3, xm4
-     vinserti128     ym3, ym3, xm5, 1
-
-     movu            xm4, [r0 + 4  * 32]
-     movu            xm5, [r0 + 12 * 32]
-     punpckhqdq      xm6, xm4, xm5
-     punpcklqdq      xm4, xm5
-     vinserti128     ym4, ym4, xm6, 1
-
-     movu            xm5, [r0 + 5  * 32]
-     movu            xm6, [r0 + 13 * 32]
-     punpckhqdq      xm7, xm5, xm6
-     punpcklqdq      xm5, xm6
-     vinserti128     ym5, ym5, xm7, 1
-
-     movu            xm6, [r0 + 6  * 32]
-     movu            xm7, [r0 + 14 * 32]
-     punpckhqdq      xm8, xm6, xm7
-     punpcklqdq      xm6, xm7
-     vinserti128     ym6, ym6, xm8, 1
-
-     movu            xm7, [r0 + 7  * 32]
-     movu            xm8, [r0 + 15 * 32]
-     punpckhqdq      xm9, xm7, xm8
-     punpcklqdq      xm7, xm8
-     vinserti128     ym7, ym7, xm9, 1
-
-    punpckhwd       ym8, ym0, ym2                ;[8 10]
-    punpcklwd       ym0, ym2                    ;[0 2]
-
-    punpckhwd       ym2, ym1, ym3                ;[9 11]
-    punpcklwd       ym1, ym3                    ;[1 3]
-
-    punpckhwd       ym3, ym4, ym6                ;[12 14]
-    punpcklwd       ym4, ym6                    ;[4 6]
-
-    punpckhwd       ym6, ym5, ym7                ;[13 15]
-    punpcklwd       ym5, ym7                    ;[5 7]
-
-    punpckhdq       ym7, ym0, ym4                ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67]
-    punpckldq       ym0, ym4                    ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65]
-
-    punpckhdq       ym4, ym8, ym3                ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147]
-    punpckldq       ym8, ym3                    ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145]
-
-    punpckhdq       ym3, ym1, ym5                ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77]
-    punpckldq       ym1, ym5                    ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75]
-
-    punpckhdq       ym5, ym2, ym6                ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157]
-    punpckldq       ym2, ym6                    ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155]
-
-    punpckhqdq      ym6, ym0, ym8                ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145]
-    punpcklqdq      ym0, ym8                    ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144]
-
-    punpckhqdq      ym8, ym7, ym4                ;[03 23 43 63 43 103 123 143 07 27 47 67 87 107 127 147]
-    punpcklqdq      ym7, ym4                    ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146]
-
-    punpckhqdq      ym4, ym1, ym2                ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155]
-    punpcklqdq      ym1, ym2                    ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154]
-
-    punpckhqdq      ym2, ym3, ym5                ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157]
-    punpcklqdq      ym3, ym5                    ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156]
-
-    vinserti64x4    m6,        m6,      ym6, 1
-    vinserti64x4    m0,        m0,      ym0, 1
-    vinserti64x4    m8,        m8,      ym8, 1
-    vinserti64x4    m7,        m7,      ym7, 1
-    vinserti64x4    m4,        m4,      ym4, 1
-    vinserti64x4    m1,        m1,      ym1, 1
-    vinserti64x4    m2,        m2,      ym2, 1
-    vinserti64x4    m3,        m3,      ym3, 1
+    movu          xm0, [r0 + 0 * 32]
+    vinserti128   ym0, ym0, [r0 + 8 * 32], 1
+    movu          xm1, [r0 + 2 * 32]
+    vinserti128   ym1, ym1, [r0 + 10 * 32], 1
+
+    mova          ym9, ym2
+    mova         ym10, ym3
+    vpermi2w      ym9, ym0, ym1
+    vpermi2w     ym10, ym0, ym1
+
+    movu          xm0, [r0 + 4 * 32]
+    vinserti128   ym0, ym0, [r0 + 12 * 32], 1
+    movu          xm1, [r0 + 6 * 32]
+    vinserti128   ym1, ym1, [r0 + 14 * 32], 1
+
+    mova         ym11, ym2
+    mova         ym12, ym3
+    vpermi2w     ym11, ym0,  ym1
+    vpermi2w     ym12, ym0,  ym1
+
+    mova         ym4,  ym26
+    mova         ym6,  ym27
+    vpermi2d     ym4,   ym9, ym11
+    vpermi2d     ym6,   ym9, ym11
+
+    mova         ym7, ym26
+    mova         ym8, ym27
+    vpermi2d     ym7, ym10, ym12
+    vpermi2d     ym8, ym10, ym12
+
+    vpermq       ym4, ym4,  q3120
+    vpermq       ym6, ym6,  q3120
+    vpermq       ym7, ym7,  q3120
+    vpermq       ym8, ym8,  q3120
+
+    movu          xm0, [r0 + 1 * 32]
+    vinserti128   ym0, ym0, [r0 + 9 * 32], 1
+    movu          xm1, [r0 + 3 * 32]
+    vinserti128   ym1, ym1, [r0 + 11 * 32], 1
+
+    mova          ym9, ym2
+    mova         ym10, ym3
+    vpermi2w      ym9,  ym0, ym1
+    vpermi2w     ym10,  ym0, ym1
+
+    movu          xm0, [r0 + 5 * 32]
+    vinserti128   ym0, ym0, [r0 + 13 * 32], 1
+    movu          xm1, [r0 + 7 * 32]
+    vinserti128   ym1, ym1, [r0 + 15 * 32], 1
+
+    mova         ym11,  ym2
+    mova         ym12,  ym3
+    vpermi2w     ym11,  ym0,  ym1
+    vpermi2w     ym12,  ym0,  ym1
+
+    mova         ym28,  ym26
+    mova         ym29,  ym27
+    vpermi2d     ym28,  ym9, ym11
+    vpermi2d     ym29,  ym9, ym11
+
+    mova         ym30, ym26
+    mova         ym31, ym27
+    vpermi2d     ym30, ym10, ym12
+    vpermi2d     ym31, ym10, ym12
+
+    vpermq       ym28, ym28,  q3120
+    vpermq       ym29, ym29,  q3120
+    vpermq       ym30, ym30,  q3120
+    vpermq       ym31, ym31,  q3120
+
+    vinserti64x4    m4,          m4,      ym4, 1
+    vinserti64x4    m6,          m6,      ym6, 1
+    vinserti64x4    m7,          m7,      ym7, 1
+    vinserti64x4    m8,          m8,      ym8, 1
+    vinserti64x4    m28,        m28,      ym28, 1
+    vinserti64x4    m29,        m29,      ym29, 1
+    vinserti64x4    m30,        m30,      ym30, 1
+    vinserti64x4    m31,        m31,      ym31, 1
 
 
     IDCT16_AVX512_PASS1      0, 22, 23


More information about the x265-devel mailing list