[x265] [PATCH] asm: avx2 assembly code for dct32x32

murugan at multicorewareinc.com murugan at multicorewareinc.com
Fri Sep 12 08:44:09 CEST 2014


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1410503566 -19800
#      Fri Sep 12 12:02:46 2014 +0530
# Node ID c57ba6597819aa9dd82e5dccee04eb38a05a5772
# Parent  7e29b10982d2eb7fd79f581d99996f04184522ba
asm: avx2 assembly code for dct32x32
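
The new x265_dct32_avx2 primitive computes the 32x32 forward DCT in two
passes over the 64*mmsize stack buffer reserved by cglobal: pass 1
transforms the input rows against tab_dct32_1 (r7), and pass 2 completes
the transform on the intermediate results using tab_dct32_1 and
tab_dct32_2 (r7/r8). As background (not part of the patch itself), the
shift and rounding constants follow the usual HEVC forward-transform
scaling: pass 1 uses shift1 = log2(32) + bitDepth - 9, i.e. DCT_SHIFT = 4
for 8-bit and 6 for 10-bit input, and pass 2 uses shift2 = log2(32) + 6 =
11 (DCT_SHIFT2), with rounding offsets of 1 << (shift - 1), i.e.
pd_8/pd_32 and pd_1024. A minimal C sketch of that round-and-shift step
(the helper name is illustrative, not taken from the patch):

    #include <stdint.h>

    /* Rounding-and-shift applied to each 32-bit accumulator:
     * pass 1: shift = 4 (8-bit) or 6 (10-bit), offset 8 or 32
     * pass 2: shift = 11, offset 1024
     * The offset is always 1 << (shift - 1); the right shift is
     * arithmetic, matching psrad. */
    static inline int32_t dct_round_shift(int32_t acc, int shift)
    {
        return (acc + (1 << (shift - 1))) >> shift;
    }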

diff -r 7e29b10982d2 -r c57ba6597819 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Sep 11 19:24:28 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Sep 12 12:02:46 2014 +0530
@@ -1446,6 +1446,7 @@
         p.dequant_normal = x265_dequant_normal_avx2;
 #if X86_64
         p.dct[DCT_16x16] = x265_dct16_avx2;
+        p.dct[DCT_32x32] = x265_dct32_avx2;
 #endif
     }
     /* at HIGH_BIT_DEPTH, pixel == short so we can reuse a number of primitives */
@@ -1747,6 +1748,7 @@
         p.dequant_normal = x265_dequant_normal_avx2;
 #if X86_64
         p.dct[DCT_16x16] = x265_dct16_avx2;
+        p.dct[DCT_32x32] = x265_dct32_avx2;
 #endif
     }
 #endif // if HIGH_BIT_DEPTH
diff -r 7e29b10982d2 -r c57ba6597819 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Thu Sep 11 19:24:28 2014 +0530
+++ b/source/common/x86/dct8.asm	Fri Sep 12 12:02:46 2014 +0530
@@ -68,6 +68,72 @@
 
 dct16_shuf2:    times 2 db 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9
 
+tab_dct32_1:    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+                dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13,  4
+                dw 90, 87, 80, 70, 57, 43, 25,  9, -9, -25, -43, -57, -70, -80, -87, -90
+                dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
+                dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
+                dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
+                dw 87, 57,  9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87
+                dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
+                dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
+                dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67,  4, 73, 88, 38
+                dw 80,  9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80
+                dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
+                dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
+                dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
+                dw 70, -43, -87,  9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70
+                dw 67, -54, -78, 38, 85, -22, -90,  4, 90, 13, -88, -31, 82, 46, -73, -61
+                dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
+                dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
+                dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87,  9, -90, 25, 80, -57
+                dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
+                dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
+                dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82,  4, 78
+                dw 43, -90, 57, 25, -87, 70,  9, -80, 80, -9, -70, 87, -25, -57, 90, -43
+                dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
+                dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
+                dw 31, -78, 90, -61,  4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
+                dw 25, -70, 90, -80, 43,  9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25
+                dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
+                dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
+                dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31,  4, 22, -46, 67, -82, 90
+                dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9
+                dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
+
+tab_dct32_2:    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+                dw -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90
+                dw -90, -87, -80, -70, -57, -43, -25, -9,  9, 25, 43, 57, 70, 80, 87, 90
+                dw 13, 38, 61, 78, 88, 90, 85, 73, 54, 31,  4, -22, -46, -67, -82, -90
+                dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
+                dw -22, -61, -85, -90, -73, -38,  4, 46, 78, 90, 82, 54, 13, -31, -67, -88
+                dw -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43,  9, 57, 87
+                dw 31, 78, 90, 61,  4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85
+                dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
+                dw -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82
+                dw -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70,  9, 80
+                dw 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82,  4, -78
+                dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
+                dw -54, -85,  4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73
+                dw -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90,  9, -87, -43, 70
+                dw 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67
+                dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
+                dw -67, -54, 78, 38, -85, -22, 90,  4, -90, 13, 88, -31, -82, 46, 73, -61
+                dw -57, 80, 25, -90,  9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57
+                dw 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88,  4, 85, -54
+                dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
+                dw -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46
+                dw -43, 90, -57, -25, 87, -70, -9, 80, -80,  9, 70, -87, 25, 57, -90, 43
+                dw 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67,  4, -73, 88, -38
+                dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
+                dw -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31
+                dw -25, 70, -90, 80, -43, -9, 57, -87, 87, -57,  9, 43, -80, 90, -70, 25
+                dw 88, -67, 31, 13, -54, 82, -90, 78, -46,  4, 38, -73, 90, -85, 61, -22
+                dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
+                dw -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13
+                dw -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25,  9
+                dw 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4
+
 avx2_dct4:      dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
                 dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83
 
@@ -135,6 +201,7 @@
 cextern pd_128
 cextern pd_256
 cextern pd_512
+cextern pd_1024
 cextern pd_2048
 cextern pw_ppppmmmm
 
@@ -1336,4 +1403,263 @@
     dec             r4d
     jnz             .pass2
     RET
+
+%macro DCT32_PASS_1 4
+    vbroadcasti128  m8,                [r7 + %1]
+
+    pmaddwd         m11,               m%3, m8
+    pmaddwd         m12,               m%4, m8
+    phaddd          m11,               m12
+
+    vbroadcasti128  m8,                [r7 + %1 + 32]
+    vbroadcasti128  m10,               [r7 + %1 + 48]
+    pmaddwd         m12,               m5, m8
+    pmaddwd         m13,               m6, m10
+    phaddd          m12,               m13
+
+    pmaddwd         m13,               m4, m8
+    pmaddwd         m14,               m7, m10
+    phaddd          m13,               m14
+
+    phaddd          m12,               m13
+
+    phaddd          m11,               m12
+    paddd           m11,               m9
+    psrad           m11,               DCT_SHIFT
+
+    vpermq          m11,               m11, 0xD8
+    packssdw        m11,               m11
+    movq            [r5 + %2],         xm11
+    vextracti128    xm10,              m11, 1
+    movq            [r5 + %2 + 64],    xm10
+%endmacro
+
+%macro DCT32_PASS_2 1
+    mova            m8,                [r7 + %1]
+    mova            m10,               [r8 + %1]
+    pmaddwd         m11,               m0, m8
+    pmaddwd         m12,               m1, m10
+    paddd           m11,               m12
+
+    pmaddwd         m12,               m2, m8
+    pmaddwd         m13,               m3, m10
+    paddd           m12,               m13
+
+    phaddd          m11,               m12
+
+    pmaddwd         m12,               m4, m8
+    pmaddwd         m13,               m5, m10
+    paddd           m12,               m13
+
+    pmaddwd         m13,               m6, m8
+    pmaddwd         m14,               m7, m10
+    paddd           m13,               m14
+
+    phaddd          m12,               m13
+
+    phaddd          m11,               m12
+    vextracti128    xm10,              m11, 1
+    paddd           xm11,              xm10
+
+    paddd           xm11,               xm9
+    psrad           xm11,               DCT_SHIFT2
+
+%endmacro
+
+INIT_YMM avx2
+cglobal dct32, 3, 9, 16, 0-64*mmsize
+%if BIT_DEPTH == 10
+    %define         DCT_SHIFT          6
+    vpbroadcastq    m9,                [pd_32]
+%elif BIT_DEPTH == 8
+    %define         DCT_SHIFT          4
+    vpbroadcastq    m9,                [pd_8]
+%else
+    %error Unsupported BIT_DEPTH!
 %endif
+%define             DCT_SHIFT2         11
+
+    add             r2d,               r2d
+
+    lea             r7,                [tab_dct32_1]
+    lea             r8,                [tab_dct32_2]
+    lea             r3,                [r2 * 3]
+    mov             r5,                rsp
+    mov             r4d,               8
+    mova            m15,               [dct16_shuf1]
+
+.pass1:
+    mova            m2,                [r0]
+    mova            m1,                [r0 + 32]
+    pshufb          m1,                m15
+    vpermq          m1,                m1, 0x4E
+    psubw           m7,                m2, m1
+    paddw           m2,                m1
+
+    mova            m1,                [r0 + r2 * 2]
+    mova            m0,                [r0 + r2 * 2 + 32]
+    pshufb          m0,                m15
+    vpermq          m0,                m0, 0x4E
+    psubw           m8,                m1, m0
+    paddw           m1,                m0
+    vperm2i128      m0,                m2, m1, 0x20        ; [row0lo  row2lo] for E
+    vperm2i128      m3,                m2, m1, 0x31        ; [row0hi  row2hi] for E
+    pshufb          m3,                m15
+    psubw           m1,                m0, m3
+    paddw           m0,                m3
+
+    vperm2i128      m5,                m7, m8, 0x20        ; [row0lo  row2lo] for O
+    vperm2i128      m6,                m7, m8, 0x31        ; [row0hi  row2hi] for O
+
+
+    mova            m4,                [r0 + r2]
+    mova            m2,                [r0 + r2 + 32]
+    pshufb          m2,                m15
+    vpermq          m2,                m2, 0x4E
+    psubw           m10,               m4, m2
+    paddw           m4,                m2
+
+    mova            m3,                [r0 + r3]
+    mova            m2,                [r0 + r3 + 32]
+    pshufb          m2,                m15
+    vpermq          m2,                m2, 0x4E
+    psubw           m11,               m3, m2
+    paddw           m3,                m2
+    vperm2i128      m2,                m4, m3, 0x20        ; [row1lo  row3lo] for E
+    vperm2i128      m8,                m4, m3, 0x31        ; [row1hi  row3hi] for E
+    pshufb          m8,                m15
+    psubw           m3,                m2, m8
+    paddw           m2,                m8
+
+    vperm2i128      m4,                m10, m11, 0x20      ; [row1lo  row3lo] for O
+    vperm2i128      m7,                m10, m11, 0x31      ; [row1hi  row3hi] for O
+
+
+    DCT32_PASS_1    0 * 32,            0 * 64, 0, 2
+    DCT32_PASS_1    2 * 32,            2 * 64, 1, 3
+    DCT32_PASS_1    4 * 32,            4 * 64, 0, 2
+    DCT32_PASS_1    6 * 32,            6 * 64, 1, 3
+    DCT32_PASS_1    8 * 32,            8 * 64, 0, 2
+    DCT32_PASS_1    10 * 32,           10 * 64, 1, 3
+    DCT32_PASS_1    12 * 32,           12 * 64, 0, 2
+    DCT32_PASS_1    14 * 32,           14 * 64, 1, 3
+    DCT32_PASS_1    16 * 32,           16 * 64, 0, 2
+    DCT32_PASS_1    18 * 32,           18 * 64, 1, 3
+    DCT32_PASS_1    20 * 32,           20 * 64, 0, 2
+    DCT32_PASS_1    22 * 32,           22 * 64, 1, 3
+    DCT32_PASS_1    24 * 32,           24 * 64, 0, 2
+    DCT32_PASS_1    26 * 32,           26 * 64, 1, 3
+    DCT32_PASS_1    28 * 32,           28 * 64, 0, 2
+    DCT32_PASS_1    30 * 32,           30 * 64, 1, 3
+
+    add             r5,                8
+    lea             r0,                [r0 + r2 * 4]
+
+    dec             r4d
+    jnz             .pass1
+
+    add             r2d,               r2d
+    lea             r3,                [r2 * 3]
+    mov             r5,                rsp
+    mov             r4d,               8
+    vpbroadcastq    m9,                [pd_1024]
+
+.pass2:
+    mova            m0,                [r5 + 0 * 64]
+    mova            m1,                [r5 + 0 * 64 + 32]
+
+    mova            m2,                [r5 + 1 * 64]
+    mova            m3,                [r5 + 1 * 64 + 32]
+
+    mova            m4,                [r5 + 2 * 64]
+    mova            m5,                [r5 + 2 * 64 + 32]
+
+    mova            m6,                [r5 + 3 * 64]
+    mova            m7,                [r5 + 3 * 64 + 32]
+
+    DCT32_PASS_2    0 * 32
+    mova            [r1],              xm11
+    DCT32_PASS_2    1 * 32
+    mova            [r1 + r2],         xm11
+    DCT32_PASS_2    2 * 32
+    mova            [r1 + r2 * 2],     xm11
+    DCT32_PASS_2    3 * 32
+    mova            [r1 + r3],         xm11
+
+    lea             r6,                [r1 + r2 * 4]
+    DCT32_PASS_2    4 * 32
+    mova            [r6],              xm11
+    DCT32_PASS_2    5 * 32
+    mova            [r6 + r2],         xm11
+    DCT32_PASS_2    6 * 32
+    mova            [r6 + r2 * 2],     xm11
+    DCT32_PASS_2    7 * 32
+    mova            [r6 + r3],         xm11
+
+    lea             r6,                [r6 + r2 * 4]
+    DCT32_PASS_2    8 * 32
+    mova            [r6],              xm11
+    DCT32_PASS_2    9 * 32
+    mova            [r6 + r2],         xm11
+    DCT32_PASS_2    10 * 32
+    mova            [r6 + r2 * 2],     xm11
+    DCT32_PASS_2    11 * 32
+    mova            [r6 + r3],         xm11
+
+    lea             r6,                [r6 + r2 * 4]
+    DCT32_PASS_2    12 * 32
+    mova            [r6],              xm11
+    DCT32_PASS_2    13 * 32
+    mova            [r6 + r2],         xm11
+    DCT32_PASS_2    14 * 32
+    mova            [r6 + r2 * 2],     xm11
+    DCT32_PASS_2    15 * 32
+    mova            [r6 + r3],         xm11
+
+    lea             r6,                [r6 + r2 * 4]
+    DCT32_PASS_2    16 * 32
+    mova            [r6],              xm11
+    DCT32_PASS_2    17 * 32
+    mova            [r6 + r2],         xm11
+    DCT32_PASS_2    18 * 32
+    mova            [r6 + r2 * 2],     xm11
+    DCT32_PASS_2    19 * 32
+    mova            [r6 + r3],         xm11
+
+    lea             r6,                [r6 + r2 * 4]
+    DCT32_PASS_2    20 * 32
+    mova            [r6],              xm11
+    DCT32_PASS_2    21 * 32
+    mova            [r6 + r2],         xm11
+    DCT32_PASS_2    22 * 32
+    mova            [r6 + r2 * 2],     xm11
+    DCT32_PASS_2    23 * 32
+    mova            [r6 + r3],         xm11
+
+    lea             r6,                [r6 + r2 * 4]
+    DCT32_PASS_2    24 * 32
+    mova            [r6],              xm11
+    DCT32_PASS_2    25 * 32
+    mova            [r6 + r2],         xm11
+    DCT32_PASS_2    26 * 32
+    mova            [r6 + r2 * 2],     xm11
+    DCT32_PASS_2    27 * 32
+    mova            [r6 + r3],         xm11
+
+    lea             r6,                [r6 + r2 * 4]
+    DCT32_PASS_2    28 * 32
+    mova            [r6],              xm11
+    DCT32_PASS_2    29 * 32
+    mova            [r6 + r2],         xm11
+    DCT32_PASS_2    30 * 32
+    mova            [r6 + r2 * 2],     xm11
+    DCT32_PASS_2    31 * 32
+    mova            [r6 + r3],         xm11
+
+    add             r5,                256
+    add             r1,                16
+
+    dec             r4d
+    jnz             .pass2
+    RET
+%endif
diff -r 7e29b10982d2 -r c57ba6597819 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Thu Sep 11 19:24:28 2014 +0530
+++ b/source/common/x86/dct8.h	Fri Sep 12 12:02:46 2014 +0530
@@ -31,6 +31,7 @@
 void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
 void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride);
 void x265_dct16_avx2(int16_t *src, int32_t *dst, intptr_t stride);
+void x265_dct32_avx2(int16_t *src, int32_t *dst, intptr_t stride);
 
 void x265_denoise_dct_mmx(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
 void x265_denoise_dct_sse2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
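
For reference, a minimal call sketch based on the declaration added to
dct8.h above; the wrapper and buffer names are illustrative, the stride is
assumed to be counted in int16_t elements of the source rows, and 32-byte
alignment is assumed because the kernel uses aligned loads/stores (mova):

    #include <stdint.h>

    void x265_dct32_avx2(int16_t *src, int32_t *dst, intptr_t stride);

    _Alignas(32) int16_t residual[32 * 32];   /* input residual, row stride 32 */
    _Alignas(32) int32_t coeff[32 * 32];      /* output transform coefficients */

    void run_dct32(void)
    {
        x265_dct32_avx2(residual, coeff, 32);
        /* equivalently, through the primitives table wired up in
         * asm-primitives.cpp: p.dct[DCT_32x32](residual, coeff, 32); */
    }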

