[x265] [PATCH] asm: avx2 asm code for idct32x32

Murugan Vairavel murugan at multicorewareinc.com
Fri Sep 26 07:16:25 CEST 2014


Sorry, I will send a new patch with the proper username.
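
For reference, here is a rough, untested sketch of the kind of PADDD-style rewrite that
chen's port-5 numbers and the TODO in the patch point at (register numbers are only
illustrative). Each PHADDD over two PMADDWD results, e.g. the pairs in .pass2, can be
replaced by one shuffle plus one vertical add:

    ; m2/m3 hold two pmaddwd results; per 128-bit lane
    ;     phaddd  m2, m3
    ; yields [m2.0+m2.1, m2.2+m2.3, m3.0+m3.1, m3.2+m3.3], which can also be
    ; formed without phaddd:
    shufps  m4, m2, m3, 0x88        ; even dwords: [m2.0, m2.2, m3.0, m3.2]
    shufps  m2, m2, m3, 0xdd        ; odd  dwords: [m2.1, m2.3, m3.1, m3.3]
    paddd   m2, m4                  ; same sums as phaddd m2, m3

The bigger win would be to interleave the coefficient tables so that the two PMADDWD
results already line up and each PHADDD collapses to a single PADDD with no shuffle at
all, which is what I understand the TODO to mean.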

On Fri, Sep 26, 2014 at 2:09 AM, chen <chenm003 at 163.com> wrote:

> Right, this needs to be improved in the future.
>
> The bottleneck is on PHADDD and PEXTRD; see the port-binding summary below:
>
> Port Binding In Cycles Per Iteration:
>
> -----------------------------------------------------------------------------------------------
> |  Port  |   0   -  DV  |   1   |   2   -   D  |   3   -   D  |   4   |   5   |   6   |   7   |
> -----------------------------------------------------------------------------------------------
> | Cycles | 164.0   0.0  | 173.0 | 85.6   64.0  | 85.6   64.0  | 77.0  | 350.0 | 12.0  | 33.8  |
> -----------------------------------------------------------------------------------------------
>
>
> At 2014-09-25 21:38:57, murugan at multicorewareinc.com wrote:
> ># HG changeset patch
> ># User zibiah
> ># Date 1411652280 -19800
> >#      Thu Sep 25 19:08:00 2014 +0530
> ># Node ID 47cfab9cbd76bc4feb076e514afe4542dcca912e
> ># Parent  e47e127da779d23314a402fa74723b0e82a2c75d
> >asm: avx2 asm code for idct32x32
> >
> >diff -r e47e127da779 -r 47cfab9cbd76 source/common/x86/asm-primitives.cpp
> >--- a/source/common/x86/asm-primitives.cpp	Wed Sep 24 21:51:12 2014 -0500
> >+++ b/source/common/x86/asm-primitives.cpp	Thu Sep 25 19:08:00 2014 +0530
> >@@ -1449,6 +1449,7 @@
> >         p.dct[DCT_16x16] = x265_dct16_avx2;
> >         p.dct[DCT_32x32] = x265_dct32_avx2;
> >         p.idct[IDCT_16x16] = x265_idct16_avx2;
> >+        p.idct[IDCT_32x32] = x265_idct32_avx2;
> > #endif
> >     }
> >     /* at HIGH_BIT_DEPTH, pixel == short so we can reuse a number of primitives */
> >@@ -1784,6 +1785,7 @@
> >         p.dct[DCT_16x16] = x265_dct16_avx2;
> >         p.dct[DCT_32x32] = x265_dct32_avx2;
> >         p.idct[IDCT_16x16] = x265_idct16_avx2;
> >+        p.idct[IDCT_32x32] = x265_idct32_avx2;
> > #endif
> >     }
> > #endif // if HIGH_BIT_DEPTH
> >diff -r e47e127da779 -r 47cfab9cbd76 source/common/x86/dct8.asm
> >--- a/source/common/x86/dct8.asm	Wed Sep 24 21:51:12 2014 -0500
> >+++ b/source/common/x86/dct8.asm	Thu Sep 25 19:08:00 2014 +0530
> >@@ -167,6 +167,60 @@
> >
> > idct16_shuff1:  dd 2, 6, 0, 4, 3, 7, 1, 5
> >
> >+tab_idct32_1:   dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
> >+                dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
> >+                dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
> >+                dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
> >+                dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38
> >+                dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
> >+                dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
> >+                dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61
> >+                dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
> >+                dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
> >+                dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78
> >+                dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
> >+                dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
> >+                dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
> >+                dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90
> >+                dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
> >+
> >+
> >+tab_idct32_2:   dw 64, 89, 83, 75, 64, 50, 36, 18
> >+                dw 64, 75, 36, -18, -64, -89, -83, -50
> >+                dw 64, 50, -36, -89, -64, 18, 83, 75
> >+                dw 64, 18, -83, -50, 64, 75, -36, -89
> >+                dw 64, -18, -83, 50, 64, -75, -36, 89
> >+                dw 64, -50, -36, 89, -64, -18, 83, -75
> >+                dw 64, -75, 36, 18, -64, 89, -83, 50
> >+                dw 64, -89, 83, -75, 64, -50, 36, -18
> >+
> >+
> >+tab_idct32_3:   dw 90, 87, 80, 70, 57, 43, 25, 9
> >+                dw 87, 57, 9, -43, -80, -90, -70, -25
> >+                dw 80, 9, -70, -87, -25, 57, 90, 43
> >+                dw 70, -43, -87, 9, 90, 25, -80, -57
> >+                dw 57, -80, -25, 90, -9, -87, 43, 70
> >+                dw 43, -90, 57, 25, -87, 70, 9, -80
> >+                dw 25, -70, 90, -80, 43, 9, -57, 87
> >+                dw 9, -25, 43, -57, 70, -80, 87, -90
> >+
> >+tab_idct32_4:   dw 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9
> >+                dw 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25
> >+                dw 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43
> >+                dw 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57
> >+                dw 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70
> >+                dw 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80
> >+                dw 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87
> >+                dw 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90
> >+                dw 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90
> >+                dw 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87
> >+                dw 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80
> >+                dw 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70
> >+                dw 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57
> >+                dw 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43
> >+                dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25
> >+                dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9
> >+
> > avx2_dct4:      dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
> >                 dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83
> >
> >@@ -2007,4 +2061,350 @@
> >     dec             r4d
> >     jnz             .pass2
> >     RET
> >+
> >+%macro IDCT32_PASS1 1
> >+    vbroadcasti128  m3, [tab_idct32_1 + %1 * 32]
> >+    vbroadcasti128  m13, [tab_idct32_1 + %1 * 32 + 16]
> >+    pmaddwd         m9, m4, m3
> >+    pmaddwd         m10, m8, m13
> >+    phaddd          m9, m10
> >+
> >+    pmaddwd         m10, m2, m3
> >+    pmaddwd         m11, m1, m13
> >+    phaddd          m10, m11
> >+
> >+    phaddd          m9, m10
> >+
> >+    vbroadcasti128  m3, [tab_idct32_1 + (15 - %1) * 32]
> >+    vbroadcasti128  m13, [tab_idct32_1 + (15 - %1) * 32 + 16]
> >+    pmaddwd         m10, m4, m3
> >+    pmaddwd         m11, m8, m13
> >+    phaddd          m10, m11
> >+
> >+    pmaddwd         m11, m2, m3
> >+    pmaddwd         m12, m1, m13
> >+    phaddd          m11, m12
> >+
> >+    phaddd          m10, m11
> >+    phaddd          m9, m10                       ;[row0s0 row2s0 row0s15 row2s15 row1s0 row3s0 row1s15 row3s15]
> >+
> >+    vbroadcasti128  m3, [tab_idct32_2 + %1 * 16]
> >+    pmaddwd         m10, m0, m3
> >+    pmaddwd         m11, m7, m3
> >+    phaddd          m10, m11
> >+    phaddd          m10, m10
> >+
> >+    vbroadcasti128  m3, [tab_idct32_3 + %1 * 16]
> >+    pmaddwd         m11, m5, m3
> >+    pmaddwd         m12, m6, m3
> >+    phaddd          m11, m12
> >+    phaddd          m11, m11
> >+
> >+    paddd           m12, m10, m11                 ;[row0a0 row2a0 NIL NIL row1a0 row3a0 NIL NIL]
> >+    psubd           m10, m11                      ;[row0a15 row2a15 NIL NIL row1a15 row3a15 NIL NIL]
> >+
> >+    punpcklqdq      m12, m10                      ;[row0a0 row2a0 row0a15 row2a15 row1a0 row3a0 row1a15 row3a15]
> >+    paddd           m10, m9, m12
> >+    paddd           m10, m15
> >+    psrad           m10, IDCT_SHIFT1
> >+
> >+    psubd           m12, m9
> >+    paddd           m12, m15
> >+    psrad           m12, IDCT_SHIFT1
> >+
> >+    packssdw        m10, m12
> >+    vextracti128    xm12, m10, 1
> >+    movd            [r3 + %1 * 64], xm10
> >+    movd            [r3 + 32 + %1 * 64], xm12
> >+    pextrd          [r4 - %1 * 64], xm10, 1
> >+    pextrd          [r4 + 32 - %1 * 64], xm12, 1
> >+    pextrd          [r3 + 16 * 64 + %1 * 64], xm10, 3
> >+    pextrd          [r3 + 16 * 64 + 32 + %1 * 64], xm12, 3
> >+    pextrd          [r4 + 16 * 64 - %1 * 64], xm10, 2
> >+    pextrd          [r4 + 16 * 64 + 32 - %1 * 64], xm12, 2
> >+%endmacro
> >+
> >+;-------------------------------------------------------
> >+; void idct32(int32_t *src, int16_t *dst, intptr_t stride)
> >+;-------------------------------------------------------
> >+
> >+; TODO: Reduce the number of PHADDD instructions by using PADDD
> >+
> >+INIT_YMM avx2
> >+cglobal idct32, 3, 6, 16, 0-32*64
> >+
> >+%define             IDCT_SHIFT1         7
> >+
> >+    vbroadcasti128  m15, [pd_64]
> >+
> >+    mov             r3, rsp
> >+    lea             r4, [r3 + 15 * 64]
> >+    mov             r5d, 8
> >+
> >+.pass1:
> >+    movu            xm0,    [r0 +  2 * 128]
> >+    movu            xm1,    [r0 + 18 * 128]
> >+    vinserti128     m0, m0, [r0 +  0 * 128], 1
> >+    vinserti128     m1, m1, [r0 + 16 * 128], 1
> >+
> >+    packssdw        m0, m1                      ;[2 18 0 16]
> >+
> >+    movu            xm1,    [r0 +  1 * 128]
> >+    movu            xm2,    [r0 +  9 * 128]
> >+    vinserti128     m1, m1, [r0 + 17 * 128], 1
> >+    vinserti128     m2, m2, [r0 + 25 * 128], 1
> >+    packssdw        m1, m2                      ;[1 9 17 25]
> >+
> >+    movu            xm2,    [r0 +  6 * 128]
> >+    movu            xm3,    [r0 + 22 * 128]
> >+    vinserti128     m2, m2, [r0 +  4 * 128], 1
> >+    vinserti128     m3, m3, [r0 + 20 * 128], 1
> >+    packssdw        m2, m3                      ;[6 22 4 20]
> >+
> >+    movu            xm3,    [r0 +  3 * 128]
> >+    movu            xm4,    [r0 + 11 * 128]
> >+    vinserti128     m3, m3, [r0 + 19 * 128], 1
> >+    vinserti128     m4, m4, [r0 + 27 * 128], 1
> >+    packssdw        m3, m4                      ;[3 11 19 27]
> >+
> >+    movu            xm4,    [r0 + 10 * 128]
> >+    movu            xm5,    [r0 + 26 * 128]
> >+    vinserti128     m4, m4, [r0 +  8 * 128], 1
> >+    vinserti128     m5, m5, [r0 + 24 * 128], 1
> >+    packssdw        m4, m5                      ;[10 26 8 24]
> >+
> >+    movu            xm5,    [r0 +  5 * 128]
> >+    movu            xm6,    [r0 + 13 * 128]
> >+    vinserti128     m5, m5, [r0 + 21 * 128], 1
> >+    vinserti128     m6, m6, [r0 + 29 * 128], 1
> >+    packssdw        m5, m6                      ;[5 13 21 29]
> >+
> >+    movu            xm6,    [r0 + 14 * 128]
> >+    movu            xm7,    [r0 + 30 * 128]
> >+    vinserti128     m6, m6, [r0 + 12 * 128], 1
> >+    vinserti128     m7, m7, [r0 + 28 * 128], 1
> >+    packssdw        m6, m7                      ;[14 30 12 28]
> >+
> >+    movu            xm7,    [r0 +  7 * 128]
> >+    movu            xm8,    [r0 + 15 * 128]
> >+    vinserti128     m7, m7, [r0 + 23 * 128], 1
> >+    vinserti128     m8, m8, [r0 + 31 * 128], 1
> >+    packssdw        m7, m8                      ;[7 15 23 31]
> >+
> >+    punpckhwd       m8, m0, m2                  ;[18 22 16 20]
> >+    punpcklwd       m0, m2                      ;[2 6 0 4]
> >+
> >+    punpckhwd       m2, m1, m3                  ;[9 11 25 27]
> >+    punpcklwd       m1, m3                      ;[1 3 17 19]
> >+
> >+    punpckhwd       m3, m4, m6                  ;[26 30 24 28]
> >+    punpcklwd       m4, m6                      ;[10 14 8 12]
> >+
> >+    punpckhwd       m6, m5, m7                  ;[13 15 29 31]
> >+    punpcklwd       m5, m7                      ;[5 7 21 23]
> >+
> >+    punpckhdq       m7, m0, m4                  ;[22 62 102 142 23 63 103 143 02 42 82 122 03 43 83 123]
> >+    punpckldq       m0, m4                      ;[20 60 100 140 21 61 101 141 00 40 80 120 01 41 81 121]
> >+
> >+    punpckhdq       m4, m8, m3                  ;[182 222 262 302 183 223 263 303 162 202 242 282 163 203 243 283]
> >+    punpckldq       m8, m3                      ;[180 220 260 300 181 221 261 301 160 200 240 280 161 201 241 281]
> >+
> >+    punpckhdq       m3, m1, m5                  ;[12 32 52 72 13 33 53 73 172 192 212 232 173 193 213 233]
> >+    punpckldq       m1, m5                      ;[10 30 50 70 11 31 51 71 170 190 210 230 171 191 211 231]
> >+
> >+    punpckhdq       m5, m2, m6                  ;[92 112 132 152 93 113 133 153 252 272 292 312 253 273 293 313]
> >+    punpckldq       m2, m6                      ;[90 110 130 150 91 111 131 151 250 270 290 310 251 271 291 311]
> >+
> >+    punpckhqdq      m6, m0, m8                  ;[21 61 101 141 181 221 261 301 01 41 81 121 161 201 241 281]
> >+    punpcklqdq      m0, m8                      ;[20 60 100 140 180 220 260 300 00 40 80 120 160 200 240 280]
> >+
> >+    punpckhqdq      m8, m7, m4                  ;[23 63 103 143 183 223 263 303 03 43 83 123 163 203 243 283]
> >+    punpcklqdq      m7, m4                      ;[22 62 102 142 182 222 262 302 02 42 82 122 162 202 242 282]
> >+
> >+    punpckhqdq      m4, m1, m2                  ;[11 31 51 71 91 111 131 151 171 191 211 231 251 271 291 311]
> >+    punpcklqdq      m1, m2                      ;[10 30 50 70 90 110 130 150 170 190 210 230 250 270 290 310]
> >+
> >+    punpckhqdq      m2, m3, m5                  ;[13 33 53 73 93 113 133 153 173 193 213 233 253 273 293 313]
> >+    punpcklqdq      m3, m5                      ;[12 32 52 72 92 112 132 152 172 192 212 232 252 272 292 312]
> >+
> >+    vperm2i128      m5, m0, m6, 0x20            ;[20 60 100 140 180 220 260 300 21 61 101 141 181 221 261 301]
> >+    vperm2i128      m0, m0, m6, 0x31            ;[00 40 80 120 160 200 240 280 01 41 81 121 161 201 241 281]
> >+
> >+    vperm2i128      m6, m7, m8, 0x20            ;[22 62 102 142 182 222 262 302 23 63 103 143 183 223 263 303]
> >+    vperm2i128      m7, m7, m8, 0x31            ;[02 42 82 122 162 202 242 282 03 43 83 123 163 203 243 283]
> >+
> >+    vperm2i128      m8, m1, m4, 0x31            ;[170 190 210 230 250 270 290 310 171 191 211 231 251 271 291 311]
> >+    vperm2i128      m4, m1, m4, 0x20            ;[10 30 50 70 90 110 130 150 11 31 51 71 91 111 131 151]
> >+
> >+    vperm2i128      m1, m3, m2, 0x31            ;[172 192 212 232 252 272 292 312 173 193 213 233 253 273 293 313]
> >+    vperm2i128      m2, m3, m2, 0x20            ;[12 32 52 72 92 112 132 152 13 33 53 73 93 113 133 153]
> >+
> >+    IDCT32_PASS1 0
> >+    IDCT32_PASS1 1
> >+    IDCT32_PASS1 2
> >+    IDCT32_PASS1 3
> >+    IDCT32_PASS1 4
> >+    IDCT32_PASS1 5
> >+    IDCT32_PASS1 6
> >+    IDCT32_PASS1 7
> >+
> >+    add             r0, 16
> >+    add             r3, 4
> >+    add             r4, 4
> >+    dec             r5d
> >+    jnz             .pass1
> >+
> >+%if BIT_DEPTH == 10
> >+    %define         IDCT_SHIFT2        10
> >+    vpbroadcastd    m15,                [pd_512]
> >+%elif BIT_DEPTH == 8
> >+    %define         IDCT_SHIFT2        12
> >+    vpbroadcastd    m15,                [pd_2048]
> >+%else
> >+    %error Unsupported BIT_DEPTH!
> > %endif
> >+
> >+    mov             r3, rsp
> >+    add             r2d, r2d
> >+    mov             r4d, 32
> >+
> >+    mova            m7,  [tab_idct32_4]
> >+    mova            m8,  [tab_idct32_4 + 32]
> >+    mova            m9,  [tab_idct32_4 + 64]
> >+    mova            m10, [tab_idct32_4 + 96]
> >+    mova            m11, [tab_idct32_4 + 128]
> >+    mova            m12, [tab_idct32_4 + 160]
> >+    mova            m13, [tab_idct32_4 + 192]
> >+    mova            m14, [tab_idct32_4 + 224]
> >+.pass2:
> >+    movu            m0, [r3]
> >+    movu            m1, [r3 + 32]
> >+
> >+    pmaddwd         m2, m0, m7
> >+    pmaddwd         m3, m0, m8
> >+    phaddd          m2, m3
> >+
> >+    pmaddwd         m3, m0, m9
> >+    pmaddwd         m4, m0, m10
> >+    phaddd          m3, m4
> >+
> >+    phaddd          m2, m3
> >+
> >+    pmaddwd         m3, m0, m11
> >+    pmaddwd         m4, m0, m12
> >+    phaddd          m3, m4
> >+
> >+    pmaddwd         m4, m0, m13
> >+    pmaddwd         m5, m0, m14
> >+    phaddd          m4, m5
> >+
> >+    phaddd          m3, m4
> >+
> >+    vperm2i128      m4, m2, m3, 0x31
> >+    vperm2i128      m2, m2, m3, 0x20
> >+    paddd           m2, m4
> >+
> >+    pmaddwd         m3, m0, [tab_idct32_4 + 256]
> >+    pmaddwd         m4, m0, [tab_idct32_4 + 288]
> >+    phaddd          m3, m4
> >+
> >+    pmaddwd         m4, m0, [tab_idct32_4 + 320]
> >+    pmaddwd         m5, m0, [tab_idct32_4 + 352]
> >+    phaddd          m4, m5
> >+
> >+    phaddd          m3, m4
> >+
> >+    pmaddwd         m4, m0, [tab_idct32_4 + 384]
> >+    pmaddwd         m5, m0, [tab_idct32_4 + 416]
> >+    phaddd          m4, m5
> >+
> >+    pmaddwd         m5, m0, [tab_idct32_4 + 448]
> >+    pmaddwd         m0,     [tab_idct32_4 + 480]
> >+    phaddd          m5, m0
> >+
> >+    phaddd          m4, m5
> >+
> >+    vperm2i128      m0, m3, m4, 0x31
> >+    vperm2i128      m3, m3, m4, 0x20
> >+    paddd           m3, m0
> >+
> >+    pmaddwd         m4, m1, [tab_idct32_1]
> >+    pmaddwd         m0, m1, [tab_idct32_1 + 32]
> >+    phaddd          m4, m0
> >+
> >+    pmaddwd         m5, m1, [tab_idct32_1 + 64]
> >+    pmaddwd         m0, m1, [tab_idct32_1 + 96]
> >+    phaddd          m5, m0
> >+
> >+    phaddd          m4, m5
> >+
> >+    pmaddwd         m5, m1, [tab_idct32_1 + 128]
> >+    pmaddwd         m0, m1, [tab_idct32_1 + 160]
> >+    phaddd          m5, m0
> >+
> >+    pmaddwd         m6, m1, [tab_idct32_1 + 192]
> >+    pmaddwd         m0, m1, [tab_idct32_1 + 224]
> >+    phaddd          m6, m0
> >+
> >+    phaddd          m5, m6
> >+
> >+    vperm2i128      m0, m4, m5, 0x31
> >+    vperm2i128      m4, m4, m5, 0x20
> >+    paddd           m4, m0
> >+
> >+    pmaddwd         m5, m1, [tab_idct32_1 + 256]
> >+    pmaddwd         m0, m1, [tab_idct32_1 + 288]
> >+    phaddd          m5, m0
> >+
> >+    pmaddwd         m6, m1, [tab_idct32_1 + 320]
> >+    pmaddwd         m0, m1, [tab_idct32_1 + 352]
> >+    phaddd          m6, m0
> >+
> >+    phaddd          m5, m6
> >+
> >+    pmaddwd         m6, m1, [tab_idct32_1 + 384]
> >+    pmaddwd         m0, m1, [tab_idct32_1 + 416]
> >+    phaddd          m6, m0
> >+
> >+    pmaddwd         m0, m1, [tab_idct32_1 + 448]
> >+    pmaddwd         m1,     [tab_idct32_1 + 480]
> >+    phaddd          m0, m1
> >+
> >+    phaddd          m6, m0
> >+
> >+    vperm2i128      m0, m5, m6, 0x31
> >+    vperm2i128      m5, m5, m6, 0x20
> >+    paddd           m5, m0
> >+
> >+    paddd           m6, m2, m4
> >+    paddd           m6, m15
> >+    psrad           m6, IDCT_SHIFT2
> >+
> >+    psubd           m2, m4
> >+    paddd           m2, m15
> >+    psrad           m2, IDCT_SHIFT2
> >+
> >+    paddd           m4, m3, m5
> >+    paddd           m4, m15
> >+    psrad           m4, IDCT_SHIFT2
> >+
> >+    psubd           m3, m5
> >+    paddd           m3, m15
> >+    psrad           m3, IDCT_SHIFT2
> >+
> >+    packssdw        m6, m4
> >+    packssdw        m2, m3
> >+
> >+    vpermq          m6, m6, 0xD8
> >+    vpermq          m2, m2, 0x8D
> >+    pshufb          m2, [dct16_shuf1]
> >+
> >+    mova            [r1], m6
> >+    mova            [r1 + 32], m2
> >+
> >+    add             r1, r2
> >+    add             r3, 64
> >+    dec             r4d
> >+    jnz             .pass2
> >+    RET
> >+%endif
> >diff -r e47e127da779 -r 47cfab9cbd76 source/common/x86/dct8.h
> >--- a/source/common/x86/dct8.h	Wed Sep 24 21:51:12 2014 -0500
> >+++ b/source/common/x86/dct8.h	Thu Sep 25 19:08:00 2014 +0530
> >@@ -34,6 +34,7 @@
> > void x265_dct8_avx2(int16_t *src, int32_t *dst, intptr_t stride);
> > void x265_dct16_avx2(int16_t *src, int32_t *dst, intptr_t stride);
> > void x265_dct32_avx2(int16_t *src, int32_t *dst, intptr_t stride);
> >+void x265_idct32_avx2(int32_t *src, int16_t *dst, intptr_t stride);
> >
> > void x265_denoise_dct_sse4(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
> > void x265_denoise_dct_avx2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>


-- 
With Regards,

Murugan. V
+919659287478