[x265] [PATCH] asm: avx2 assembly code for idct16x16

murugan at multicorewareinc.com
Thu Sep 18 10:58:19 CEST 2014


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1411030664 -19800
#      Thu Sep 18 14:27:44 2014 +0530
# Node ID 44692411ababc212746c99f9ea44c3536cac0119
# Parent  86686bd153db547c33cfe23407f32e5e050f9d62
asm: avx2 assembly code for idct16x16
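
For reviewers, here is a rough scalar sketch (not part of the patch) of the
computation the new routine vectorizes. The two coefficient tables are the
dw tables added below, viewed as 8x8 int16 matrices; the names idct16_ref,
inv16_pass and sat16 are made up for illustration. Rounding and shifts
follow the asm: add pd_64 and shift by IDCT_SHIFT1 = 7 after pass 1, add
pd_2048/pd_512 and shift by IDCT_SHIFT2 = 12/10 (8-/10-bit) after pass 2.

    #include <stdint.h>
    #include <string.h>

    #ifndef BIT_DEPTH
    #define BIT_DEPTH 8                  /* must match the x265 build */
    #endif

    static const int16_t tab1[8][8] = {  /* = tab_idct16_1 (odd rows) */
        { 90,  87,  80,  70,  57,  43,  25,   9 },
        { 87,  57,   9, -43, -80, -90, -70, -25 },
        { 80,   9, -70, -87, -25,  57,  90,  43 },
        { 70, -43, -87,   9,  90,  25, -80, -57 },
        { 57, -80, -25,  90,  -9, -87,  43,  70 },
        { 43, -90,  57,  25, -87,  70,   9, -80 },
        { 25, -70,  90, -80,  43,   9, -57,  87 },
        {  9, -25,  43, -57,  70, -80,  87, -90 },
    };

    static const int16_t tab2[8][8] = {  /* = tab_idct16_2 (even rows) */
        { 64,  89,  83,  75,  64,  50,  36,  18 },
        { 64,  75,  36, -18, -64, -89, -83, -50 },
        { 64,  50, -36, -89, -64,  18,  83,  75 },
        { 64,  18, -83, -50,  64,  75, -36, -89 },
        { 64, -18, -83,  50,  64, -75, -36,  89 },
        { 64, -50, -36,  89, -64, -18,  83, -75 },
        { 64, -75,  36,  18, -64,  89, -83,  50 },
        { 64, -89,  83, -75,  64, -50,  36, -18 },
    };

    static int16_t sat16(int32_t v)      /* packssdw-style saturation */
    {
        return (int16_t)(v < -32768 ? -32768 : v > 32767 ? 32767 : v);
    }

    /* One butterfly pass over 16 lines: reads down the columns of src,
     * writes across the rows of dst, so two passes give the 2-D inverse. */
    static void inv16_pass(const int16_t *src, int16_t *dst, int shift)
    {
        const int32_t add = 1 << (shift - 1);  /* pd_64 / pd_512 / pd_2048 */
        for (int j = 0; j < 16; j++, src++, dst += 16)
        {
            int32_t E[8], O[8];
            for (int k = 0; k < 8; k++)
            {
                E[k] = O[k] = 0;
                for (int i = 0; i < 8; i++)
                {
                    E[k] += tab2[k][i] * src[(2 * i) * 16];     /* even rows */
                    O[k] += tab1[k][i] * src[(2 * i + 1) * 16]; /* odd rows  */
                }
            }
            for (int k = 0; k < 8; k++)
            {
                dst[k]      = sat16((E[k] + O[k] + add) >> shift);
                dst[15 - k] = sat16((E[k] - O[k] + add) >> shift);
            }
        }
    }

    static void idct16_ref(const int32_t *src, int16_t *dst, intptr_t stride)
    {
        int16_t in[16 * 16], mid[16 * 16], out[16 * 16];
        for (int i = 0; i < 16 * 16; i++)
            in[i] = sat16(src[i]);               /* the asm packs the int32 input */
        inv16_pass(in, mid, 7);                  /* IDCT_SHIFT1 */
        inv16_pass(mid, out, 12 - (BIT_DEPTH - 8)); /* IDCT_SHIFT2 */
        for (int i = 0; i < 16; i++)             /* stride in int16 units */
            memcpy(dst + i * stride, out + i * 16, 16 * sizeof(int16_t));
    }

The asm fuses the 16x16 transpose into pass 1 and keeps the word-packed
intermediate on the stack, but the arithmetic above is what the two
vectorized passes should compute.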

diff -r 86686bd153db -r 44692411abab source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Sep 17 12:52:38 2014 +0200
+++ b/source/common/x86/asm-primitives.cpp	Thu Sep 18 14:27:44 2014 +0530
@@ -1447,6 +1447,7 @@
 #if X86_64
         p.dct[DCT_16x16] = x265_dct16_avx2;
         p.dct[DCT_32x32] = x265_dct32_avx2;
+        p.idct[IDCT_16x16] = x265_idct16_avx2;
 #endif
     }
     /* at HIGH_BIT_DEPTH, pixel == short so we can reuse a number of primitives */
@@ -1749,6 +1750,7 @@
 #if X86_64
         p.dct[DCT_16x16] = x265_dct16_avx2;
         p.dct[DCT_32x32] = x265_dct32_avx2;
+        p.idct[IDCT_16x16] = x265_idct16_avx2;
 #endif
     }
 #endif // if HIGH_BIT_DEPTH
diff -r 86686bd153db -r 44692411abab source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Wed Sep 17 12:52:38 2014 +0200
+++ b/source/common/x86/dct8.asm	Thu Sep 18 14:27:44 2014 +0530
@@ -134,6 +134,28 @@
                 dw -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25,  9
                 dw 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4
 
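+; Coefficient tables for the 16-point inverse transform, stored
+; transposed relative to the HEVC matrix g_t16:
+; tab_idct16_1[k][i] = g_t16[2*i+1][k] (odd rows) and
+; tab_idct16_2[k][i] = g_t16[2*i][k] (even rows).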
+tab_idct16_1:   dw 90, 87, 80, 70, 57, 43, 25, 9
+                dw 87, 57, 9, -43, -80, -90, -70, -25
+                dw 80, 9, -70, -87, -25, 57, 90, 43
+                dw 70, -43, -87, 9, 90, 25, -80, -57
+                dw 57, -80, -25, 90, -9, -87, 43, 70
+                dw 43, -90, 57, 25, -87, 70, 9, -80
+                dw 25, -70, 90, -80, 43, 9, -57, 87
+                dw 9, -25, 43, -57, 70, -80, 87, -90
+
+tab_idct16_2:   dw 64, 89, 83, 75, 64, 50, 36, 18
+                dw 64, 75, 36, -18, -64, -89, -83, -50
+                dw 64, 50, -36, -89, -64, 18, 83, 75
+                dw 64, 18, -83, -50, 64, 75, -36, -89
+                dw 64, -18, -83, 50, 64, -75, -36, 89
+                dw 64, -50, -36, 89, -64, -18, 83, -75
+                dw 64, -75, 36, 18, -64, 89, -83, 50
+                dw 64, -89, 83, -75, 64, -50, 36, -18
+
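+; dword permutes used after packssdw in IDCT_PASS1: they gather even
+; columns into the low lane and odd columns into the high lane so that
+; pass 2 can reload the data row by row; idct16_shuff1 also swaps the
+; two packed rows, since the E-O results come out in reversed order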
+idct16_shuff:   dd 0, 4, 2, 6, 1, 5, 3, 7
+
+idct16_shuff1:  dd 2, 6, 0, 4, 3, 7, 1, 5
+
 avx2_dct4:      dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
                 dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83
 
@@ -1662,4 +1684,282 @@
     dec             r4d
     jnz             .pass2
     RET
+
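+; IDCT_PASS1 %1, %2: first-pass butterfly for table rows %1 and %1+1.
+; m0-m8 hold the transposed input; even (tab_idct16_2) and odd
+; (tab_idct16_1) dot products are combined as E+O and E-O, rounded
+; with m14 (pd_64) and shifted by IDCT_SHIFT1.  The word-packed E+O
+; results go to [r3 + %1 * 32] (even columns) and +32 (odd columns),
+; the E-O results likewise at [r3 + %2 * 32], ready for pass 2.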
+%macro IDCT_PASS1 2
+    vbroadcasti128  m5, [tab_idct16_2 + %1 * 16]
+
+    pmaddwd         m9, m0, m5
+    pmaddwd         m10, m7, m5
+    phaddd          m9, m10
+
+    pmaddwd         m10, m6, m5
+    pmaddwd         m11, m8, m5
+    phaddd          m10, m11
+
+    phaddd          m9, m10
+    vbroadcasti128  m5, [tab_idct16_1 + %1 * 16]
+
+    pmaddwd         m10, m1, m5
+    pmaddwd         m11, m3, m5
+    phaddd          m10, m11
+
+    pmaddwd         m11, m4, m5
+    pmaddwd         m12, m2, m5
+    phaddd          m11, m12
+
+    phaddd          m10, m11
+
+    paddd           m11, m9, m10
+    paddd           m11, m14
+    psrad           m11, IDCT_SHIFT1
+
+    psubd           m9, m10
+    paddd           m9, m14
+    psrad           m9, IDCT_SHIFT1
+
+    vbroadcasti128  m5, [tab_idct16_2 + %1 * 16 + 16]
+
+    pmaddwd         m10, m0, m5
+    pmaddwd         m12, m7, m5
+    phaddd          m10, m12
+
+    pmaddwd         m12, m6, m5
+    pmaddwd         m13, m8, m5
+    phaddd          m12, m13
+
+    phaddd          m10, m12
+    vbroadcasti128  m5, [tab_idct16_1 + %1 * 16 + 16]
+
+    pmaddwd         m12, m1, m5
+    pmaddwd         m13, m3, m5
+    phaddd          m12, m13
+
+    pmaddwd         m13, m4, m5
+    pmaddwd         m5, m2
+    phaddd          m13, m5
+
+    phaddd          m12, m13
+
+    paddd           m5, m10, m12
+    paddd           m5, m14
+    psrad           m5, IDCT_SHIFT1
+
+    psubd           m10, m12
+    paddd           m10, m14
+    psrad           m10, IDCT_SHIFT1
+
+    packssdw        m11, m5
+    packssdw        m9, m10
+
+    mova            m10, [idct16_shuff]
+    mova            m5,  [idct16_shuff1]
+
+    vpermd          m12, m10, m11
+    vpermd          m13, m5, m9
+    mova            [r3 + %1 * 16 * 2], xm12
+    mova            [r3 + %2 * 16 * 2], xm13
+    vextracti128    [r3 + %2 * 16 * 2 + 32], m13, 1
+    vextracti128    [r3 + %1 * 16 * 2 + 32], m12, 1
+%endmacro
+
+;-------------------------------------------------------
+; void idct16(int32_t *src, int16_t *dst, intptr_t stride)
+;-------------------------------------------------------
+INIT_YMM avx2
+cglobal idct16, 3, 7, 16, 0-16*mmsize
+%if BIT_DEPTH == 10
+    %define         IDCT_SHIFT2        10
+    vpbroadcastd    m15, [pd_512]
+%elif BIT_DEPTH == 8
+    %define         IDCT_SHIFT2        12
+    vpbroadcastd    m15, [pd_2048]
+%else
+    %error Unsupported BIT_DEPTH!
 %endif
+%define             IDCT_SHIFT1         7
+
+    vbroadcasti128  m14, [pd_64]
+
+    add             r2d, r2d
+    mov             r3, rsp
+    mov             r4d, 2
+
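+; Pass 1 (vertical): two iterations of eight columns each (r4d = 2);
+; each iteration loads and transposes half of the int32 input, runs
+; the IDCT_PASS1 butterflies and stores int16 results to the stack.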
+.pass1:
+    movu            m0, [r0 +  0 * 64]
+    movu            m1, [r0 +  8 * 64]
+    packssdw        m0, m1                    ;[0L 8L 0H 8H]
+
+    movu            m1, [r0 +  1 * 64]
+    movu            m2, [r0 +  9 * 64]
+    packssdw        m1, m2                    ;[1L 9L 1H 9H]
+
+    movu            m2, [r0 +  2 * 64]
+    movu            m3, [r0 + 10 * 64]
+    packssdw        m2, m3                    ;[2L 10L 2H 10H]
+
+    movu            m3, [r0 +  3 * 64]
+    movu            m4, [r0 + 11 * 64]
+    packssdw        m3, m4                    ;[3L 11L 3H 11H]
+
+    movu            m4, [r0 +  4 * 64]
+    movu            m5, [r0 + 12 * 64]
+    packssdw        m4, m5                    ;[4L 12L 4H 12H]
+
+    movu            m5, [r0 +  5 * 64]
+    movu            m6, [r0 + 13 * 64]
+    packssdw        m5, m6                    ;[5L 13L 5H 13H]
+
+    movu            m6, [r0 +  6 * 64]
+    movu            m7, [r0 + 14 * 64]
+    packssdw        m6, m7                    ;[6L 14L 6H 14H]
+
+    movu            m7, [r0 +  7 * 64]
+    movu            m8, [r0 + 15 * 64]
+    packssdw        m7, m8                    ;[7L 15L 7H 15H]
+
+    punpckhwd       m8, m0, m2                ;[8 10]
+    punpcklwd       m0, m2                    ;[0 2]
+
+    punpckhwd       m2, m1, m3                ;[9 11]
+    punpcklwd       m1, m3                    ;[1 3]
+
+    punpckhwd       m3, m4, m6                ;[12 14]
+    punpcklwd       m4, m6                    ;[4 6]
+
+    punpckhwd       m6, m5, m7                ;[13 15]
+    punpcklwd       m5, m7                    ;[5 7]
+
+    punpckhdq       m7, m0, m4                ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67]
+    punpckldq       m0, m4                    ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65]
+
+    punpckhdq       m4, m8, m3                ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147]
+    punpckldq       m8, m3                    ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145]
+
+    punpckhdq       m3, m1, m5                ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77]
+    punpckldq       m1, m5                    ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75]
+
+    punpckhdq       m5, m2, m6                ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157]
+    punpckldq       m2, m6                    ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155]
+
+    punpckhqdq      m6, m0, m8                ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145]
+    punpcklqdq      m0, m8                    ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144]
+
+    punpckhqdq      m8, m7, m4                ;[03 23 43 63 83 103 123 143 07 27 47 67 87 107 127 147]
+    punpcklqdq      m7, m4                    ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146]
+
+    punpckhqdq      m4, m1, m2                ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155]
+    punpcklqdq      m1, m2                    ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154]
+
+    punpckhqdq      m2, m3, m5                ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157]
+    punpcklqdq      m3, m5                    ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156]
+
+    IDCT_PASS1      0, 14
+    IDCT_PASS1      2, 12
+    IDCT_PASS1      4, 10
+    IDCT_PASS1      6, 8
+
+    add             r0, 32
+    add             r3, 16
+    dec             r4d
+    jnz             .pass1
+
+    mov             r3, rsp
+    mov             r4d, 8
+    lea             r5, [tab_idct16_2]
+    lea             r6, [tab_idct16_1]
+
+    vbroadcasti128  m7,  [r5]
+    vbroadcasti128  m8,  [r5 + 16]
+    vbroadcasti128  m9,  [r5 + 32]
+    vbroadcasti128  m10, [r5 + 48]
+    vbroadcasti128  m11, [r5 + 64]
+    vbroadcasti128  m12, [r5 + 80]
+    vbroadcasti128  m13, [r5 + 96]
+
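+; Pass 2 (horizontal): eight iterations of two output rows each
+; (r4d = 8).  [r3] holds the even-frequency and [r3 + 32] the
+; odd-frequency halves of the two rows; they are multiplied by the
+; broadcast table rows in m7-m13 (the rest reloaded through m14),
+; rounded with m15, shifted by IDCT_SHIFT2 and stored to dst.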
+.pass2:
+    movu            m1, [r3]
+    vpermq          m0, m1, 0xD8
+
+    pmaddwd         m1, m0, m7
+    pmaddwd         m2, m0, m8
+    phaddd          m1, m2
+
+    pmaddwd         m2, m0, m9
+    pmaddwd         m3, m0, m10
+    phaddd          m2, m3
+
+    phaddd          m1, m2
+
+    pmaddwd         m2, m0, m11
+    pmaddwd         m3, m0, m12
+    phaddd          m2, m3
+
+    vbroadcasti128  m14, [r5 + 112]
+    pmaddwd         m3, m0, m13
+    pmaddwd         m4, m0, m14
+    phaddd          m3, m4
+
+    phaddd          m2, m3
+
+    movu            m3, [r3 + 32]
+    vpermq          m0, m3, 0xD8
+
+    vbroadcasti128  m14, [r6]
+    pmaddwd         m3, m0, m14
+    vbroadcasti128  m14, [r6 + 16]
+    pmaddwd         m4, m0, m14
+    phaddd          m3, m4
+
+    vbroadcasti128  m14, [r6 + 32]
+    pmaddwd         m4, m0, m14
+    vbroadcasti128  m14, [r6 + 48]
+    pmaddwd         m5, m0, m14
+    phaddd          m4, m5
+
+    phaddd          m3, m4
+
+    vbroadcasti128  m14, [r6 + 64]
+    pmaddwd         m4, m0, m14
+    vbroadcasti128  m14, [r6 + 80]
+    pmaddwd         m5, m0, m14
+    phaddd          m4, m5
+
+    vbroadcasti128  m14, [r6 + 96]
+    pmaddwd         m6, m0, m14
+    vbroadcasti128  m14, [r6 + 112]
+    pmaddwd         m0, m14
+    phaddd          m6, m0
+
+    phaddd          m4, m6
+
+    paddd           m5, m1, m3
+    paddd           m5, m15
+    psrad           m5, IDCT_SHIFT2
+
+    psubd           m1, m3
+    paddd           m1, m15
+    psrad           m1, IDCT_SHIFT2
+
+    paddd           m6, m2, m4
+    paddd           m6, m15
+    psrad           m6, IDCT_SHIFT2
+
+    psubd           m2, m4
+    paddd           m2, m15
+    psrad           m2, IDCT_SHIFT2
+
+    packssdw        m5, m6
+    packssdw        m1, m2
+    pshufb          m2, m1, [dct16_shuf1]
+
+    mova            [r1], xm5
+    mova            [r1 + 16], xm2
+    vextracti128    [r1 + r2], m5, 1
+    vextracti128    [r1 + r2 + 16], m2, 1
+
+    lea             r1, [r1 + 2 * r2]
+    add             r3, 64
+    dec             r4d
+    jnz             .pass2
+    RET
+%endif
diff -r 86686bd153db -r 44692411abab source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Wed Sep 17 12:52:38 2014 +0200
+++ b/source/common/x86/dct8.h	Thu Sep 18 14:27:44 2014 +0530
@@ -27,6 +27,7 @@
 void x265_dct4_avx2(int16_t *src, int32_t *dst, intptr_t stride);
 void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
 void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
+void x265_idct16_avx2(int32_t *src, int16_t *dst, intptr_t stride);
 void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
 void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
 void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride);
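
For anyone validating locally: the primitive is only wired up under X86_64
(first hunk), and BIT_DEPTH must match the build. A quick testbench-style
check against the scalar sketch near the top of this mail might look like
the following (check_idct16 is a hypothetical helper; ALIGN_VAR_32 is the
alignment macro from x265's common.h):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    void x265_idct16_avx2(int32_t *src, int16_t *dst, intptr_t stride);

    /* returns 0 when the AVX2 and reference outputs agree */
    static int check_idct16(void)
    {
        ALIGN_VAR_32(int32_t, src[16 * 16]);
        ALIGN_VAR_32(int16_t, ref[16 * 16]);
        ALIGN_VAR_32(int16_t, opt[16 * 16]);

        for (int n = 0; n < 1000; n++)
        {
            for (int i = 0; i < 16 * 16; i++)
                src[i] = (rand() & 0xFFFF) - 0x8000; /* int16-range coefficients */

            idct16_ref(src, ref, 16);        /* scalar sketch from above */
            x265_idct16_avx2(src, opt, 16);  /* stride in int16 units */

            if (memcmp(ref, opt, sizeof(ref)))
                return -1;
        }
        return 0;
    }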

