[x265] [PATCH] asm: avx2 assembly code for idct16x16

chen chenm003 at 163.com
Thu Sep 18 23:12:31 CEST 2014


It's right.

At 2014-09-18 16:58:19, murugan at multicorewareinc.com wrote:
># HG changeset patch
># User Murugan Vairavel <murugan at multicorewareinc.com>
># Date 1411030664 -19800
>#      Thu Sep 18 14:27:44 2014 +0530
># Node ID 44692411ababc212746c99f9ea44c3536cac0119
># Parent  86686bd153db547c33cfe23407f32e5e050f9d62
>asm: avx2 assembly code for idct16x16
>
>diff -r 86686bd153db -r 44692411abab source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp	Wed Sep 17 12:52:38 2014 +0200
>+++ b/source/common/x86/asm-primitives.cpp	Thu Sep 18 14:27:44 2014 +0530
>@@ -1447,6 +1447,7 @@
> #if X86_64
>         p.dct[DCT_16x16] = x265_dct16_avx2;
>         p.dct[DCT_32x32] = x265_dct32_avx2;
>+        p.idct[IDCT_16x16] = x265_idct16_avx2;
> #endif
>     }
>     /* at HIGH_BIT_DEPTH, pixel == short so we can reuse a number of primitives */
>@@ -1749,6 +1750,7 @@
> #if X86_64
>         p.dct[DCT_16x16] = x265_dct16_avx2;
>         p.dct[DCT_32x32] = x265_dct32_avx2;
>+        p.idct[IDCT_16x16] = x265_idct16_avx2;
> #endif
>     }
> #endif // if HIGH_BIT_DEPTH
>diff -r 86686bd153db -r 44692411abab source/common/x86/dct8.asm
>--- a/source/common/x86/dct8.asm	Wed Sep 17 12:52:38 2014 +0200
>+++ b/source/common/x86/dct8.asm	Thu Sep 18 14:27:44 2014 +0530
>@@ -134,6 +134,28 @@
>                 dw -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25,  9
>                 dw 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4
> 
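>+; IDCT16 coefficient tables: row i of tab_idct16_1 holds the
>+; odd-frequency basis (applied to inputs 1, 3, .., 15) and row i of
>+; tab_idct16_2 the even-frequency basis (inputs 0, 2, .., 14) for
>+; output sample i; samples 8-15 follow from the even/odd butterfly.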
>+tab_idct16_1:   dw 90, 87, 80, 70, 57, 43, 25, 9
>+                dw 87, 57, 9, -43, -80, -90, -70, -25
>+                dw 80, 9, -70, -87, -25, 57, 90, 43
>+                dw 70, -43, -87, 9, 90, 25, -80, -57
>+                dw 57, -80, -25, 90, -9, -87, 43, 70
>+                dw 43, -90, 57, 25, -87, 70, 9, -80
>+                dw 25, -70, 90, -80, 43, 9, -57, 87
>+                dw 9, -25, 43, -57, 70, -80, 87, -90
>+
>+tab_idct16_2:   dw 64, 89, 83, 75, 64, 50, 36, 18
>+                dw 64, 75, 36, -18, -64, -89, -83, -50
>+                dw 64, 50, -36, -89, -64, 18, 83, 75
>+                dw 64, 18, -83, -50, 64, 75, -36, -89
>+                dw 64, -18, -83, 50, 64, -75, -36, 89
>+                dw 64, -50, -36, 89, -64, -18, 83, -75
>+                dw 64, -75, 36, 18, -64, 89, -83, 50
>+                dw 64, -89, 83, -75, 64, -50, 36, -18
>+
>+idct16_shuff:   dd 0, 4, 2, 6, 1, 5, 3, 7
>+
>+idct16_shuff1:  dd 2, 6, 0, 4, 3, 7, 1, 5
>+
> avx2_dct4:      dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
>                 dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83
> 
>@@ -1662,4 +1684,282 @@
>     dec             r4d
>     jnz             .pass2
>     RET
>+
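>+; IDCT_PASS1 %1, %2: first (column) pass for the current eight source
>+; columns. Even-basis sums (tab_idct16_2) and odd-basis sums
>+; (tab_idct16_1) are combined by butterfly: even + odd yields rows
>+; %1/%1+1, even - odd yields rows %2/%2+1. Results are rounded,
>+; shifted by IDCT_SHIFT1, packed to words, and stored to the stack
>+; buffer with each row pair split into even and odd columns, the
>+; layout pass 2 expects.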
>+%macro IDCT_PASS1 2
>+    vbroadcasti128  m5, [tab_idct16_2 + %1 * 16]
>+
>+    pmaddwd         m9, m0, m5
>+    pmaddwd         m10, m7, m5
>+    phaddd          m9, m10
>+
>+    pmaddwd         m10, m6, m5
>+    pmaddwd         m11, m8, m5
>+    phaddd          m10, m11
>+
>+    phaddd          m9, m10
>+    vbroadcasti128  m5, [tab_idct16_1 + %1 * 16]
>+
>+    pmaddwd         m10, m1, m5
>+    pmaddwd         m11, m3, m5
>+    phaddd          m10, m11
>+
>+    pmaddwd         m11, m4, m5
>+    pmaddwd         m12, m2, m5
>+    phaddd          m11, m12
>+
>+    phaddd          m10, m11
>+
>+    paddd           m11, m9, m10
>+    paddd           m11, m14
>+    psrad           m11, IDCT_SHIFT1
>+
>+    psubd           m9, m10
>+    paddd           m9, m14
>+    psrad           m9, IDCT_SHIFT1
>+
>+    vbroadcasti128  m5, [tab_idct16_2 + %1 * 16 + 16]
>+
>+    pmaddwd         m10, m0, m5
>+    pmaddwd         m12, m7, m5
>+    phaddd          m10, m12
>+
>+    pmaddwd         m12, m6, m5
>+    pmaddwd         m13, m8, m5
>+    phaddd          m12, m13
>+
>+    phaddd          m10, m12
>+    vbroadcasti128  m5, [tab_idct16_1 + %1 * 16 + 16]
>+
>+    pmaddwd         m12, m1, m5
>+    pmaddwd         m13, m3, m5
>+    phaddd          m12, m13
>+
>+    pmaddwd         m13, m4, m5
>+    pmaddwd         m5, m2
>+    phaddd          m13, m5
>+
>+    phaddd          m12, m13
>+
>+    paddd           m5, m10, m12
>+    paddd           m5, m14
>+    psrad           m5, IDCT_SHIFT1
>+
>+    psubd           m10, m12
>+    paddd           m10, m14
>+    psrad           m10, IDCT_SHIFT1
>+
>+    packssdw        m11, m5
>+    packssdw        m9, m10
>+
>+    mova            m10, [idct16_shuff]
>+    mova            m5,  [idct16_shuff1]
>+
>+    vpermd          m12, m10, m11
>+    vpermd          m13, m5, m9
>+    mova            [r3 + %1 * 16 * 2], xm12
>+    mova            [r3 + %2 * 16 * 2], xm13
>+    vextracti128    [r3 + %2 * 16 * 2 + 32], m13, 1
>+    vextracti128    [r3 + %1 * 16 * 2 + 32], m12, 1
>+%endmacro
>+
>+;-------------------------------------------------------
>+; void idct16(int32_t *src, int16_t *dst, intptr_t stride)
>+;-------------------------------------------------------
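>+; Two-pass implementation: pass 1 transforms the columns of the 16x16
>+; coefficient block into a 16-bit temporary buffer on the stack (two
>+; iterations of eight columns each); pass 2 transforms the rows of
>+; that buffer and writes the 16-bit residual to dst (eight iterations
>+; of two rows each).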
>+INIT_YMM avx2
>+cglobal idct16, 3, 7, 16, 0-16*mmsize
>+%if BIT_DEPTH == 10
>+    %define         IDCT_SHIFT2        10
>+    vpbroadcastd    m15,                [pd_512]
>+%elif BIT_DEPTH == 8
>+    %define         IDCT_SHIFT2        12
>+    vpbroadcastd    m15,                [pd_2048]
>+%else
>+    %error Unsupported BIT_DEPTH!
> %endif
>+%define             IDCT_SHIFT1         7
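>+; HEVC inverse-transform scaling: the first pass shifts by 7, the
>+; second by 20 - bitDepth (12 for 8-bit, 10 for 10-bit), each with a
>+; matching rounding offset (pd_64, and pd_2048/pd_512 respectively).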
>+
>+    vbroadcasti128  m14,               [pd_64]
>+
>+    add             r2d,               r2d
>+    mov             r3, rsp
>+    mov             r4d, 2
>+
>+.pass1:
>+    movu            m0, [r0 +  0 * 64]
>+    movu            m1, [r0 +  8 * 64]
>+    packssdw        m0, m1                    ;[0L 8L 0H 8H]
>+
>+    movu            m1, [r0 +  1 * 64]
>+    movu            m2, [r0 +  9 * 64]
>+    packssdw        m1, m2                    ;[1L 9L 1H 9H]
>+
>+    movu            m2, [r0 +  2 * 64]
>+    movu            m3, [r0 + 10 * 64]
>+    packssdw        m2, m3                    ;[2L 10L 2H 10H]
>+
>+    movu            m3, [r0 +  3 * 64]
>+    movu            m4, [r0 + 11 * 64]
>+    packssdw        m3, m4                    ;[3L 11L 3H 11H]
>+
>+    movu            m4, [r0 +  4 * 64]
>+    movu            m5, [r0 + 12 * 64]
>+    packssdw        m4, m5                    ;[4L 12L 4H 12H]
>+
>+    movu            m5, [r0 +  5 * 64]
>+    movu            m6, [r0 + 13 * 64]
>+    packssdw        m5, m6                    ;[5L 13L 5H 13H]
>+
>+    movu            m6, [r0 +  6 * 64]
>+    movu            m7, [r0 + 14 * 64]
>+    packssdw        m6, m7                    ;[6L 14L 6H 14H]
>+
>+    movu            m7, [r0 +  7 * 64]
>+    movu            m8, [r0 + 15 * 64]
>+    packssdw        m7, m8                    ;[7L 15L 7H 15H]
>+
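>+    ; transpose the 16x8 half-block: after the word/dword/qword
>+    ; unpacks below, each register lane holds one column's even-row or
>+    ; odd-row coefficients, ready for the pmaddwd dot products in
>+    ; IDCT_PASS1
>+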
>+    punpckhwd       m8, m0, m2                ;[8 10]
>+    punpcklwd       m0, m2                    ;[0 2]
>+
>+    punpckhwd       m2, m1, m3                ;[9 11]
>+    punpcklwd       m1, m3                    ;[1 3]
>+
>+    punpckhwd       m3, m4, m6                ;[12 14]
>+    punpcklwd       m4, m6                    ;[4 6]
>+
>+    punpckhwd       m6, m5, m7                ;[13 15]
>+    punpcklwd       m5, m7                    ;[5 7]
>+
>+    punpckhdq       m7, m0, m4                ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67]
>+    punpckldq       m0, m4                    ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65]
>+
>+    punpckhdq       m4, m8, m3                ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147]
>+    punpckldq       m8, m3                    ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145]
>+
>+    punpckhdq       m3, m1, m5                ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77]
>+    punpckldq       m1, m5                    ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75]
>+
>+    punpckhdq       m5, m2, m6                ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157]
>+    punpckldq       m2, m6                    ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155]
>+
>+    punpckhqdq      m6, m0, m8                ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145]
>+    punpcklqdq      m0, m8                    ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144]
>+
>+    punpckhqdq      m8, m7, m4                ;[03 23 43 63 83 103 123 143 07 27 47 67 87 107 127 147]
>+    punpcklqdq      m7, m4                    ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146]
>+
>+    punpckhqdq      m4, m1, m2                ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155]
>+    punpcklqdq      m1, m2                    ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154]
>+
>+    punpckhqdq      m2, m3, m5                ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157]
>+    punpcklqdq      m3, m5                    ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156]
>+
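>+    ; butterfly: the sums produce rows 0-7, the differences rows 8-15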
>+    IDCT_PASS1      0, 14
>+    IDCT_PASS1      2, 12
>+    IDCT_PASS1      4, 10
>+    IDCT_PASS1      6, 8
>+
>+    add             r0, 32
>+    add             r3, 16
>+    dec             r4d
>+    jnz             .pass1
>+
>+    mov             r3, rsp
>+    mov             r4d, 8
>+    lea             r5, [tab_idct16_2]
>+    lea             r6, [tab_idct16_1]
>+
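>+    ; preload the first seven even-basis rows into m7-m13; the eighth
>+    ; row and all odd-basis rows are (re)loaded through m14 inside the
>+    ; loop, since m15 is reserved for the rounding constant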
>+    vbroadcasti128  m7,  [r5]
>+    vbroadcasti128  m8,  [r5 + 16]
>+    vbroadcasti128  m9,  [r5 + 32]
>+    vbroadcasti128  m10, [r5 + 48]
>+    vbroadcasti128  m11, [r5 + 64]
>+    vbroadcasti128  m12, [r5 + 80]
>+    vbroadcasti128  m13, [r5 + 96]
>+
>+.pass2:
>+    movu            m1, [r3]
>+    vpermq          m0, m1, 0xD8              ; gather each row's even-column words into one lane (row 2n low, row 2n+1 high)
>+
>+    pmaddwd         m1, m0, m7
>+    pmaddwd         m2, m0, m8
>+    phaddd          m1, m2
>+
>+    pmaddwd         m2, m0, m9
>+    pmaddwd         m3, m0, m10
>+    phaddd          m2, m3
>+
>+    phaddd          m1, m2
>+
>+    pmaddwd         m2, m0, m11
>+    pmaddwd         m3, m0, m12
>+    phaddd          m2, m3
>+
>+    vbroadcasti128  m14, [r5 + 112]
>+    pmaddwd         m3, m0, m13
>+    pmaddwd         m4, m0, m14
>+    phaddd          m3, m4
>+
>+    phaddd          m2, m3
>+
>+    movu            m3, [r3 + 32]
>+    vpermq          m0, m3, 0xD8              ; same gather for the odd-column words
>+
>+    vbroadcasti128  m14, [r6]
>+    pmaddwd         m3, m0, m14
>+    vbroadcasti128  m14, [r6 + 16]
>+    pmaddwd         m4, m0, m14
>+    phaddd          m3, m4
>+
>+    vbroadcasti128  m14, [r6 + 32]
>+    pmaddwd         m4, m0, m14
>+    vbroadcasti128  m14, [r6 + 48]
>+    pmaddwd         m5, m0, m14
>+    phaddd          m4, m5
>+
>+    phaddd          m3, m4
>+
>+    vbroadcasti128  m14, [r6 + 64]
>+    pmaddwd         m4, m0, m14
>+    vbroadcasti128  m14, [r6 + 80]
>+    pmaddwd         m5, m0, m14
>+    phaddd          m4, m5
>+
>+    vbroadcasti128  m14, [r6 + 96]
>+    pmaddwd         m6, m0, m14
>+    vbroadcasti128  m14, [r6 + 112]
>+    pmaddwd         m0, m14
>+    phaddd          m6, m0
>+
>+    phaddd          m4, m6
>+
>+    paddd           m5, m1, m3
>+    paddd           m5, m15
>+    psrad           m5, IDCT_SHIFT2
>+
>+    psubd           m1, m3
>+    paddd           m1, m15
>+    psrad           m1, IDCT_SHIFT2
>+
>+    paddd           m6, m2, m4
>+    paddd           m6, m15
>+    psrad           m6, IDCT_SHIFT2
>+
>+    psubd           m2, m4
>+    paddd           m2, m15
>+    psrad           m2, IDCT_SHIFT2
>+
>+    packssdw        m5, m6
>+    packssdw        m1, m2
>+    pshufb          m2, m1, [dct16_shuf1]
>+
>+    mova            [r1], xm5
>+    mova            [r1 + 16], xm2
>+    vextracti128    [r1 + r2], m5, 1
>+    vextracti128    [r1 + r2 + 16], m2, 1
>+
>+    lea             r1, [r1 + 2 * r2]
>+    add             r3, 64
>+    dec             r4d
>+    jnz             .pass2
>+    RET
>+%endif
>diff -r 86686bd153db -r 44692411abab source/common/x86/dct8.h
>--- a/source/common/x86/dct8.h	Wed Sep 17 12:52:38 2014 +0200
>+++ b/source/common/x86/dct8.h	Thu Sep 18 14:27:44 2014 +0530
>@@ -27,6 +27,7 @@
> void x265_dct4_avx2(int16_t *src, int32_t *dst, intptr_t stride);
> void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
> void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
>+void x265_idct16_avx2(int32_t *src, int16_t *dst, intptr_t stride);
> void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
> void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
> void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride);
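
For anyone wiring this up locally, a minimal call sketch (the buffer
contents, stride value, and extern "C" linkage below are illustrative
assumptions; the function name and signature come from the dct8.h hunk
above, and the primitive is only registered for X86_64 builds):

    #include <cstdint>

    extern "C" void x265_idct16_avx2(int32_t *src, int16_t *dst, intptr_t stride);

    int main()
    {
        // 16x16 dequantized coefficients, row-major; DC-only test block
        alignas(32) int32_t coeff[16 * 16] = { 64 };
        // 16-bit residual; stride is counted in int16_t elements (the asm
        // doubles it to bytes) and rows must stay 16-byte aligned for the
        // mova stores
        alignas(32) int16_t residual[16 * 16];

        x265_idct16_avx2(coeff, residual, 16);
        return 0;
    }

With a DC-only input every residual sample should come out identical,
which makes for a quick sanity check.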