[x265] [PATCH] asm: avx2 assembly code for idct8x8

chen chenm003 at 163.com
Thu Sep 25 22:24:00 CEST 2014


right

At 2014-09-25 21:21:01,yuvaraj at multicorewareinc.com wrote:
># HG changeset patch
># User Yuvaraj Venkatesh<yuvaraj at multicorewareinc.com>
># Date 1411651243 -19800
>#      Thu Sep 25 18:50:43 2014 +0530
># Node ID 9f8f750f96e4465a52962df0831bbc908db493a2
># Parent  0d330611fa978d271a87361685a3908faa2d4d8f
>asm: avx2 assembly code for idct8x8
>
>diff -r 0d330611fa97 -r 9f8f750f96e4 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp	Thu Sep 25 13:12:43 2014 +0530
>+++ b/source/common/x86/asm-primitives.cpp	Thu Sep 25 18:50:43 2014 +0530
>@@ -1448,6 +1448,7 @@
>         p.dct[DCT_8x8] = x265_dct8_avx2;
>         p.dct[DCT_16x16] = x265_dct16_avx2;
>         p.dct[DCT_32x32] = x265_dct32_avx2;
>+        p.idct[IDCT_8x8] = x265_idct8_avx2;
>         p.idct[IDCT_16x16] = x265_idct16_avx2;
> #endif
>     }
>@@ -1783,6 +1784,7 @@
>         p.dct[DCT_8x8] = x265_dct8_avx2;
>         p.dct[DCT_16x16] = x265_dct16_avx2;
>         p.dct[DCT_32x32] = x265_dct32_avx2;
>+        p.idct[IDCT_8x8] = x265_idct8_avx2;
>         p.idct[IDCT_16x16] = x265_idct16_avx2;
> #endif
>     }
>diff -r 0d330611fa97 -r 9f8f750f96e4 source/common/x86/dct8.asm
>--- a/source/common/x86/dct8.asm	Thu Sep 25 13:12:43 2014 +0530
>+++ b/source/common/x86/dct8.asm	Thu Sep 25 18:50:43 2014 +0530
>@@ -145,6 +145,22 @@
>                 dw -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25,  9
>                 dw 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4
> 
>+avx2_idct8_1:   times 4 dw 64, 83, 64, 36              ; four 32-byte rows; addressed through r5 in idct8
>+                times 4 dw 64, 36, -64, -83
>+                times 4 dw 64, -36, -64, 83
>+                times 4 dw 64, -83, 64, -36
>+
>+avx2_idct8_2:   times 4 dw 89, 75, 50, 18              ; four 32-byte rows; addressed through r6 in idct8
>+                times 4 dw 75, -18, -89, -50
>+                times 4 dw 50, -89, 18, 75
>+                times 4 dw 18, -50, 75, -89
>+
>+idct8_shuf1:    dd 0, 2, 4, 6, 1, 3, 5, 7              ; vpermd indices: even-numbered dwords first, then odd-numbered
>+
>+idct8_shuf2:    times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15    ; pshufb mask: dword order 0, 2, 1, 3 in each 128-bit lane
>+
>+idct8_shuf3:    times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3    ; pshufb mask: dword order 3, 2, 1, 0 in each 128-bit lane
>+
> tab_idct16_1:   dw 90, 87, 80, 70, 57, 43, 25, 9
>                 dw 87, 57, 9, -43, -80, -90, -70, -25
>                 dw 80, 9, -70, -87, -25, 57, 90, 43
>@@ -1730,6 +1746,184 @@
>     jnz             .pass2
>     RET
> 
>+%macro IDCT8_PASS_1 1                                  ; %1 = byte offset added to the r5 / r6 table addresses
>+    vpbroadcastd    m7,                [r5 + %1]        ; one dword (two 16-bit coefficients) from the r5 table
>+    vpbroadcastd    m10,               [r5 + %1 + 4]    ; the following dword of the same table
>+    pmaddwd         m5,                m4, m7           ; multiply word pairs of m4, add each pair into a dword
>+    pmaddwd         m6,                m0, m10          ; same for m0 with the second coefficient pair
>+    paddd           m5,                m6               ; m5 = contribution computed from the r5 table
>+
>+    vpbroadcastd    m7,                [r6 + %1]        ; same pattern with the r6 table
>+    vpbroadcastd    m10,               [r6 + %1 + 4]
>+    pmaddwd         m6,                m1, m7
>+    pmaddwd         m3,                m2, m10          ; m3 is only scratch here; it is overwritten below
>+    paddd           m6,                m3               ; m6 = contribution computed from the r6 table
>+
>+    paddd           m3,                m5, m6           ; sum of the two contributions
>+    paddd           m3,                m11              ; add m11, which the caller loads before using this macro
>+    psrad           m3,                IDCT_SHIFT1      ; arithmetic right shift by IDCT_SHIFT1
>+
>+    psubd           m5,                m6               ; difference of the two contributions
>+    paddd           m5,                m11
>+    psrad           m5,                IDCT_SHIFT1
>+
>+    vpbroadcastd    m7,                [r5 + %1 + 32]   ; repeat the computation with the table rows 32 bytes further on
>+    vpbroadcastd    m10,               [r5 + %1 + 36]
>+    pmaddwd         m6,                m4, m7
>+    pmaddwd         m8,                m0, m10
>+    paddd           m6,                m8               ; m6 = r5-table contribution for the second row
>+
>+    vpbroadcastd    m7,                [r6 + %1 + 32]
>+    vpbroadcastd    m10,               [r6 + %1 + 36]
>+    pmaddwd         m8,                m1, m7
>+    pmaddwd         m9,                m2, m10
>+    paddd           m8,                m9               ; m8 = r6-table contribution for the second row
>+
>+    paddd           m9,                m6, m8           ; second-row sum
>+    paddd           m9,                m11
>+    psrad           m9,                IDCT_SHIFT1
>+
>+    psubd           m6,                m8               ; second-row difference
>+    paddd           m6,                m11
>+    psrad           m6,                IDCT_SHIFT1
>+
>+    packssdw        m3,                m9               ; narrow both sums to 16 bits with signed saturation
>+    vpermq          m3,                m3, 0xD8         ; reorder qwords as 0, 2, 1, 3
>+
>+    packssdw        m6,                m5               ; narrow both differences (second-row one first)
>+    vpermq          m6,                m6, 0xD8         ; reorder qwords as 0, 2, 1, 3; results are left in m3 and m6
>+%endmacro
>+
>+%macro IDCT8_PASS_2 0                                  ; inputs: m0, m1 (plus r5, r6, m12); results are left in m8 and m9
>+    punpcklqdq      m2,                m0, m1           ; low qwords of m0 and m1 in each 128-bit lane
>+    punpckhqdq      m0,                m1               ; high qwords of m0 and m1 in each 128-bit lane
>+
>+    pmaddwd         m3,                m2, [r5]         ; m2 against each of the four 32-byte rows of the r5 table
>+    pmaddwd         m5,                m2, [r5 + 32]
>+    pmaddwd         m6,                m2, [r5 + 64]
>+    pmaddwd         m7,                m2, [r5 + 96]
>+    phaddd          m3,                m5               ; add adjacent dwords horizontally
>+    phaddd          m6,                m7
>+    pshufb          m3,                [idct8_shuf2]    ; dword order 0, 2, 1, 3 in each lane
>+    pshufb          m6,                [idct8_shuf2]
>+    punpcklqdq      m7,                m3, m6           ; m7 = low-qword halves of the r5-table results
>+    punpckhqdq      m3,                m6               ; m3 = high-qword halves of the r5-table results
>+
>+    pmaddwd         m5,                m0, [r6]         ; same sequence for m0 against the r6 table
>+    pmaddwd         m6,                m0, [r6 + 32]
>+    pmaddwd         m8,                m0, [r6 + 64]
>+    pmaddwd         m9,                m0, [r6 + 96]
>+    phaddd          m5,                m6
>+    phaddd          m8,                m9
>+    pshufb          m5,                [idct8_shuf2]
>+    pshufb          m8,                [idct8_shuf2]
>+    punpcklqdq      m6,                m5, m8           ; m6 = low-qword halves of the r6-table results
>+    punpckhqdq      m5,                m8               ; m5 = high-qword halves of the r6-table results
>+
>+    paddd           m8,                m7, m6           ; sum of the r5-table and r6-table parts
>+    paddd           m8,                m12              ; add m12, which the caller loads before using this macro
>+    psrad           m8,                IDCT_SHIFT2      ; arithmetic right shift by IDCT_SHIFT2
>+
>+    psubd           m7,                m6               ; difference of the r5-table and r6-table parts
>+    paddd           m7,                m12
>+    psrad           m7,                IDCT_SHIFT2
>+
>+    pshufb          m7,                [idct8_shuf3]    ; reverse the dword order of the differences in each lane
>+    packssdw        m8,                 m7              ; narrow sums and reversed differences to 16 bits (signed saturation)
>+
>+    paddd           m9,                m3, m5           ; same combine step for the high-qword halves
>+    paddd           m9,                m12
>+    psrad           m9,                IDCT_SHIFT2
>+
>+    psubd           m3,                m5
>+    paddd           m3,                m12
>+    psrad           m3,                IDCT_SHIFT2
>+
>+    pshufb          m3,                [idct8_shuf3]    ; reverse the dword order of the differences in each lane
>+    packssdw        m9,                m3               ; narrow to 16 bits (signed saturation)
>+%endmacro
>+
>+INIT_YMM avx2
>+cglobal idct8, 3, 7, 13, 0-8*16                        ; NOTE(review): presumably x86inc's args / GPRs / vector regs / stack request - confirm against x86inc.asm
>+%if BIT_DEPTH == 10
>+    %define         IDCT_SHIFT2        10
>+    vpbroadcastd    m12,                [pd_512]        ; m12 = term added before the IDCT_SHIFT2 shift in IDCT8_PASS_2
>+%elif BIT_DEPTH == 8
>+    %define         IDCT_SHIFT2        12
>+    vpbroadcastd    m12,                [pd_2048]       ; m12 = term added before the IDCT_SHIFT2 shift in IDCT8_PASS_2
>+%else
>+    %error Unsupported BIT_DEPTH!
>+%endif
>+%define             IDCT_SHIFT1         7
>+
>+    vbroadcasti128  m11,               [pd_64]          ; m11 = term added before the IDCT_SHIFT1 shift in IDCT8_PASS_1
>+
>+    mov             r4,                rsp              ; r4 = base of the stack scratch area used between the two passes
>+    lea             r5,                [avx2_idct8_1]   ; r5 -> first coefficient table
>+    lea             r6,                [avx2_idct8_2]   ; r6 -> second coefficient table
>+
>+    ;pass1
>+    mova            m0,                [r0 + 0 * 32]    ; eight 32-byte loads from r0, packed pairwise to 16-bit words
>+    mova            m1,                [r0 + 4 * 32]
>+    packssdw        m0,                m1               ; [0 0 0 0 4 4 4 4 0 0 0 0 4 4 4 4]
>+    mova            m1,                [r0 + 2 * 32]
>+    mova            m2,                [r0 + 6 * 32]
>+    packssdw        m1,                m2               ; [2 2 2 2 6 6 6 6 2 2 2 2 6 6 6 6]
>+    mova            m2,                [r0 + 1 * 32]
>+    mova            m3,                [r0 + 5 * 32]
>+    packssdw        m2,                m3               ; [1 1 1 1 5 5 5 5 1 1 1 1 5 5 5 5]
>+    mova            m3,                [r0 + 3 * 32]
>+    mova            m4,                [r0 + 7 * 32]
>+    packssdw        m3,                m4               ; [3 3 3 3 7 7 7 7 3 3 3 3 7 7 7 7]
>+
>+    mova            m5,                [idct8_shuf1]    ; dword indices for the vpermd instructions below
>+
>+    punpcklwd       m4,                m0, m1           ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
>+    punpckhwd       m0,                m1               ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
>+    vpermd          m4,                m5, m4           ; regroup dwords: even-numbered first, then odd-numbered
>+    vpermd          m0,                m5, m0
>+
>+    punpcklwd       m1,                m2, m3           ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]
>+    punpckhwd       m2,                m3               ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]
>+    vpermd          m1,                m5, m1
>+    vpermd          m2,                m5, m2
>+
>+    IDCT8_PASS_1    0                                   ; table offset 0; leaves results in m3 and m6
>+    mova            [r4],              m3               ; scratch slot 0
>+    mova            [r4 + 96],         m6               ; scratch slot 3
>+
>+    IDCT8_PASS_1    64                                  ; table offset 64; leaves results in m3 and m6
>+    mova            [r4 + 32],         m3               ; scratch slot 1
>+    mova            [r4 + 64],         m6               ; scratch slot 2
>+
>+    ;pass2
>+    add             r2d,               r2d              ; double r2; NOTE(review): presumably converts the stride from 16-bit elements to bytes - confirm
>+    lea             r3,                [r2 * 3]         ; r3 = 3 * r2
>+
>+    mova            m0,                [r4]             ; scratch slots 0 and 1 feed the first IDCT8_PASS_2
>+    mova            m1,                [r4 + 32]
>+    IDCT8_PASS_2
>+
>+    vextracti128    xm3,               m8, 1            ; upper 128 bits of m8
>+    mova            [r1],              xm8              ; four 16-byte stores at r1, r1 + r2, r1 + 2 * r2, r1 + r3
>+    mova            [r1 + r2],         xm3
>+    vextracti128    xm3,               m9, 1            ; upper 128 bits of m9
>+    mova            [r1 + r2 * 2],     xm9
>+    mova            [r1 + r3],         xm3
>+
>+    lea             r1,                [r1 + r2 * 4]    ; advance r1 by four strides
>+    mova            m0,                [r4 + 64]        ; scratch slots 2 and 3 feed the second IDCT8_PASS_2
>+    mova            m1,                [r4 + 96]
>+    IDCT8_PASS_2
>+
>+    vextracti128    xm3,               m8, 1            ; upper 128 bits of m8
>+    mova            [r1],              xm8              ; four more 16-byte stores, same layout as above
>+    mova            [r1 + r2],         xm3
>+    vextracti128    xm3,               m9, 1            ; upper 128 bits of m9
>+    mova            [r1 + r2 * 2],     xm9
>+    mova            [r1 + r3],         xm3
>+    RET
>+
> %macro IDCT_PASS1 2
>     vbroadcasti128  m5, [tab_idct16_2 + %1 * 16]
> 
>diff -r 0d330611fa97 -r 9f8f750f96e4 source/common/x86/dct8.h
>--- a/source/common/x86/dct8.h	Thu Sep 25 13:12:43 2014 +0530
>+++ b/source/common/x86/dct8.h	Thu Sep 25 18:50:43 2014 +0530
>@@ -24,17 +24,19 @@
> #ifndef X265_DCT8_H
> #define X265_DCT8_H
> void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride);
>+void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
>+void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride);
> void x265_dct4_avx2(int16_t *src, int32_t *dst, intptr_t stride);
>-void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
>-void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
>-void x265_idct16_avx2(int32_t *src, int16_t *dst, intptr_t stride);
>-void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
>-void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
>-void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride);
> void x265_dct8_avx2(int16_t *src, int32_t *dst, intptr_t stride);
> void x265_dct16_avx2(int16_t *src, int32_t *dst, intptr_t stride);
> void x265_dct32_avx2(int16_t *src, int32_t *dst, intptr_t stride);
> 
>+void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
>+void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
>+void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
>+void x265_idct8_avx2(int32_t *src, int16_t *dst, intptr_t stride);
>+void x265_idct16_avx2(int32_t *src, int16_t *dst, intptr_t stride);
>+
> void x265_denoise_dct_sse4(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
> void x265_denoise_dct_avx2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
> 
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140926/74374b65/attachment-0001.html>


More information about the x265-devel mailing list