Right, just improve it in future.

At 2014-09-12 14:44:09, murugan@multicorewareinc.com wrote:
># HG changeset patch
># User Murugan Vairavel <murugan@multicorewareinc.com>
># Date 1410503566 -19800
># Fri Sep 12 12:02:46 2014 +0530
># Node ID c57ba6597819aa9dd82e5dccee04eb38a05a5772
># Parent 7e29b10982d2eb7fd79f581d99996f04184522ba
>asm: avx2 assembly code for dct32x32
>
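
For reviewers who want the dataflow: this is the usual separable two-pass
scheme. Pass 1 transforms the rows of the 32x32 residual into an int16
stack buffer (stored transposed), and pass 2 transforms the buffer rows and
writes the int32 coefficients to dst. Below is a minimal scalar sketch of
what the kernel computes -- illustrative only; dct32_ref and g_t32 are
made-up names, with g_t32 standing for the HEVC 32x32 basis matrix and
shift1/shift2 matching the DCT_SHIFT/DCT_SHIFT2 values defined in the asm:

    #include <cstdint>

    static void dct32_ref(const int16_t *src, int32_t *dst, intptr_t stride,
                          const int16_t g_t32[32][32], int shift1, int shift2)
    {
        int32_t tmp[32 * 32];

        // pass 1: transform each source row, store the result transposed
        for (int i = 0; i < 32; i++)
            for (int k = 0; k < 32; k++)
            {
                int32_t sum = 0;
                for (int n = 0; n < 32; n++)
                    sum += g_t32[k][n] * src[i * stride + n];
                tmp[k * 32 + i] = (sum + (1 << (shift1 - 1))) >> shift1;
            }

        // pass 2: transform the rows of tmp (i.e. source columns) into dst
        for (int i = 0; i < 32; i++)
            for (int k = 0; k < 32; k++)
            {
                int64_t sum = 0;
                for (int n = 0; n < 32; n++)
                    sum += (int64_t)g_t32[k][n] * tmp[i * 32 + n];
                dst[k * 32 + i] = (int32_t)((sum + (1 << (shift2 - 1))) >> shift2);
            }
    }

(The asm additionally narrows the pass-1 results to int16 with packssdw,
which is what lets pass 2 use pmaddwd throughout.)
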
>diff -r 7e29b10982d2 -r c57ba6597819 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Thu Sep 11 19:24:28 2014 +0530
>+++ b/source/common/x86/asm-primitives.cpp Fri Sep 12 12:02:46 2014 +0530
>@@ -1446,6 +1446,7 @@
> p.dequant_normal = x265_dequant_normal_avx2;
> #if X86_64
> p.dct[DCT_16x16] = x265_dct16_avx2;
>+ p.dct[DCT_32x32] = x265_dct32_avx2;
> #endif
> }
> /* at HIGH_BIT_DEPTH, pixel == short so we can reuse a number of primitives */
>@@ -1747,6 +1748,7 @@
> p.dequant_normal = x265_dequant_normal_avx2;
> #if X86_64
> p.dct[DCT_16x16] = x265_dct16_avx2;
>+ p.dct[DCT_32x32] = x265_dct32_avx2;
> #endif
> }
> #endif // if HIGH_BIT_DEPTH
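
(Both assignments sit under #if X86_64 alongside dct16 because the kernel
below asks for ymm8-ymm15 and the r7/r8 GPRs, which only exist in 64-bit
builds.)
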
>diff -r 7e29b10982d2 -r c57ba6597819 source/common/x86/dct8.asm
>--- a/source/common/x86/dct8.asm Thu Sep 11 19:24:28 2014 +0530
>+++ b/source/common/x86/dct8.asm Fri Sep 12 12:02:46 2014 +0530
>@@ -68,6 +68,72 @@
>
> dct16_shuf2: times 2 db 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9
>
>+tab_dct32_1: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
>+ dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
>+ dw 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90
>+ dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
>+ dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
>+ dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
>+ dw 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87
>+ dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
>+ dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
>+ dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38
>+ dw 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80
>+ dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
>+ dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
>+ dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
>+ dw 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70
>+ dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61
>+ dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
>+ dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
>+ dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57
>+ dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
>+ dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
>+ dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78
>+ dw 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43
>+ dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
>+ dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
>+ dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
>+ dw 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25
>+ dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
>+ dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
>+ dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90
>+ dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9
>+ dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
>+
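
The 32 rows of tab_dct32_1 are the HEVC 32-point basis vectors. Apart from
a handful of entries the standard tweaked for better orthogonality, each
value is round(64 * sqrt(2) * cos((2n + 1) * k * pi / 64)), with the k = 0
row fixed at 64. A throwaway generator like the following (hypothetical,
not part of the patch) reproduces the table to within +/-1:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const double pi = 3.14159265358979323846;
        for (int k = 0; k < 32; k++)
        {
            for (int n = 0; n < 32; n++)
            {
                // k == 0 carries the extra 1/sqrt(2) DCT-II normalization
                double c = k ? 64.0 * std::sqrt(2.0) * std::cos((2 * n + 1) * k * pi / 64.0)
                             : 64.0;
                std::printf("%4d,", (int)std::lround(c));
            }
            std::printf("\n");
        }
        return 0;
    }
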
>+tab_dct32_2: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
>+ dw -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90
>+ dw -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90
>+ dw 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90
>+ dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
>+ dw -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88
>+ dw -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87
>+ dw 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85
>+ dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
>+ dw -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82
>+ dw -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80
>+ dw 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78
>+ dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
>+ dw -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73
>+ dw -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70
>+ dw 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67
>+ dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
>+ dw -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61
>+ dw -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57
>+ dw 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54
>+ dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
>+ dw -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46
>+ dw -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43
>+ dw 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38
>+ dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
>+ dw -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31
>+ dw -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25
>+ dw 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22
>+ dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
>+ dw -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13
>+ dw -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9
>+ dw 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4
>+
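
tab_dct32_2 looks like the pass-2 companion table: each DCT32_PASS_2 below
reads one 16-word row from tab_dct32_1 (via r7) and the matching row from
tab_dct32_2 (via r8), and the pair supplies the 32 multipliers for one
output row in the order pass 1 leaves the intermediate data.
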
> avx2_dct4: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
> dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83
>
>@@ -135,6 +201,7 @@
> cextern pd_128
> cextern pd_256
> cextern pd_512
>+cextern pd_1024
> cextern pd_2048
> cextern pw_ppppmmmm
>
>@@ -1336,4 +1403,263 @@
> dec r4d
> jnz .pass2
> RET
>+
>+%macro DCT32_PASS_1 4
>+ vbroadcasti128 m8, [r7 + %1]
>+
>+ pmaddwd m11, m%3, m8
>+ pmaddwd m12, m%4, m8
>+ phaddd m11, m12
>+
>+ vbroadcasti128 m8, [r7 + %1 + 32]
>+ vbroadcasti128 m10, [r7 + %1 + 48]
>+ pmaddwd m12, m5, m8
>+ pmaddwd m13, m6, m10
>+ phaddd m12, m13
>+
>+ pmaddwd m13, m4, m8
>+ pmaddwd m14, m7, m10
>+ phaddd m13, m14
>+
>+ phaddd m12, m13
>+
>+ phaddd m11, m12
>+ paddd m11, m9
>+ psrad m11, DCT_SHIFT
>+
>+ vpermq m11, m11, 0xD8
>+ packssdw m11, m11
>+ movq [r5 + %2], xm11
>+ vextracti128 xm10, m11, 1
>+ movq [r5 + %2 + 64], xm10
>+%endmacro
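
Per invocation, DCT32_PASS_1 produces two output coefficients for each of
the four input rows currently held in the registers: pmaddwd forms the
16-bit products and sums them pairwise into dwords, the phaddd tree
finishes the reduction, m9 adds the rounding constant before the psrad by
DCT_SHIFT, and packssdw narrows to int16 so the two movq stores can drop
the halves 64 bytes apart in the transpose buffer.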
>+
>+%macro DCT32_PASS_2 1
>+ mova m8, [r7 + %1]
>+ mova m10, [r8 + %1]
>+ pmaddwd m11, m0, m8
>+ pmaddwd m12, m1, m10
>+ paddd m11, m12
>+
>+ pmaddwd m12, m2, m8
>+ pmaddwd m13, m3, m10
>+ paddd m12, m13
>+
>+ phaddd m11, m12
>+
>+ pmaddwd m12, m4, m8
>+ pmaddwd m13, m5, m10
>+ paddd m12, m13
>+
>+ pmaddwd m13, m6, m8
>+ pmaddwd m14, m7, m10
>+ paddd m13, m14
>+
>+ phaddd m12, m13
>+
>+ phaddd m11, m12
>+ vextracti128 xm10, m11, 1
>+ paddd xm11, xm10
>+
>+ paddd xm11, xm9
>+ psrad xm11, DCT_SHIFT2
>+
>+%endmacro
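
DCT32_PASS_2 handles the column transform: pmaddwd/paddd accumulate the
four int16 intermediate rows held in m0-m7 against one table row from each
of r7 and r8, vextracti128 + paddd fold the two 128-bit lanes together, and
the rounding add (m9 = 1024) plus psrad by DCT_SHIFT2 leave four finished
dword coefficients in xm11 for the caller to store.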
>+
>+INIT_YMM avx2
>+cglobal dct32, 3, 9, 16, 0-64*mmsize
>+%if BIT_DEPTH == 10
>+ %define DCT_SHIFT 6
>+ vpbroadcastq m9, [pd_32]
>+%elif BIT_DEPTH == 8
>+ %define DCT_SHIFT 4
>+ vpbroadcastq m9, [pd_8]
>+%else
>+ %error Unsupported BIT_DEPTH!
> %endif
>+%define DCT_SHIFT2 11
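
These match the HEVC forward-transform rule: the pass-1 shift is
log2(32) + BIT_DEPTH - 9 (4 at 8-bit, 6 at 10-bit, with pd_8/pd_32 being
the matching 1 << (shift - 1) rounding constants) and the pass-2 shift is
log2(32) + 6 = 11, rounded with pd_1024.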
>+
>+ add r2d, r2d
>+
>+ lea r7, [tab_dct32_1]
>+ lea r8, [tab_dct32_2]
>+ lea r3, [r2 * 3]
>+ mov r5, rsp
>+ mov r4d, 8
>+ mova m15, [dct16_shuf1]
>+
>+.pass1:
>+ mova m2, [r0]
>+ mova m1, [r0 + 32]
>+ pshufb m1, m15
>+ vpermq m1, m1, 0x4E
>+ psubw m7, m2, m1
>+ paddw m2, m1
>+
>+ mova m1, [r0 + r2 * 2]
>+ mova m0, [r0 + r2 * 2 + 32]
>+ pshufb m0, m15
>+ vpermq m0, m0, 0x4E
>+ psubw m8, m1, m0
>+ paddw m1, m0
>+ vperm2i128 m0, m2, m1, 0x20 ; [row0lo row2lo] for E
>+ vperm2i128 m3, m2, m1, 0x31 ; [row0hi row2hi] for E
>+ pshufb m3, m15
>+ psubw m1, m0, m3
>+ paddw m0, m3
>+
>+ vperm2i128 m5, m7, m8, 0x20 ; [row0lo row2lo] for O
>+ vperm2i128 m6, m7, m8, 0x31 ; [row0hi row2hi] for O
>+
>+ mova m4, [r0 + r2]
>+ mova m2, [r0 + r2 + 32]
>+ pshufb m2, m15
>+ vpermq m2, m2, 0x4E
>+ psubw m10, m4, m2
>+ paddw m4, m2
>+
>+ mova m3, [r0 + r3]
>+ mova m2, [r0 + r3 + 32]
>+ pshufb m2, m15
>+ vpermq m2, m2, 0x4E
>+ psubw m11, m3, m2
>+ paddw m3, m2
>+ vperm2i128 m2, m4, m3, 0x20 ; [row1lo row3lo] for E
>+ vperm2i128 m8, m4, m3, 0x31 ; [row1hi row3hi] for E
>+ pshufb m8, m15
>+ psubw m3, m2, m8
>+ paddw m2, m8
>+
>+ vperm2i128 m4, m10, m11, 0x20 ; [row1lo row3lo] for O
>+ vperm2i128 m7, m10, m11, 0x31 ; [row1hi row3hi] for O
>+
>+ DCT32_PASS_1 0 * 32, 0 * 64, 0, 2
>+ DCT32_PASS_1 2 * 32, 2 * 64, 1, 3
>+ DCT32_PASS_1 4 * 32, 4 * 64, 0, 2
>+ DCT32_PASS_1 6 * 32, 6 * 64, 1, 3
>+ DCT32_PASS_1 8 * 32, 8 * 64, 0, 2
>+ DCT32_PASS_1 10 * 32, 10 * 64, 1, 3
>+ DCT32_PASS_1 12 * 32, 12 * 64, 0, 2
>+ DCT32_PASS_1 14 * 32, 14 * 64, 1, 3
>+ DCT32_PASS_1 16 * 32, 16 * 64, 0, 2
>+ DCT32_PASS_1 18 * 32, 18 * 64, 1, 3
>+ DCT32_PASS_1 20 * 32, 20 * 64, 0, 2
>+ DCT32_PASS_1 22 * 32, 22 * 64, 1, 3
>+ DCT32_PASS_1 24 * 32, 24 * 64, 0, 2
>+ DCT32_PASS_1 26 * 32, 26 * 64, 1, 3
>+ DCT32_PASS_1 28 * 32, 28 * 64, 0, 2
>+ DCT32_PASS_1 30 * 32, 30 * 64, 1, 3
>+
>+ add r5, 8
>+ lea r0, [r0 + r2 * 4]
>+
>+ dec r4d
>+ jnz .pass1
>+
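
(Pass 1 runs eight iterations of four rows each to cover the 32-row block;
r5 advances 8 bytes per iteration because each iteration's int16 results
land transposed inside the 64-byte rows of the stack buffer.)
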
>+ add r2d, r2d
>+ lea r3, [r2 * 3]
>+ mov r5, rsp
>+ mov r4d, 8
>+ vpbroadcastq m9, [pd_1024]
>+
>+.pass2:
>+ mova m0, [r5 + 0 * 64]
>+ mova m1, [r5 + 0 * 64 + 32]
>+
>+ mova m2, [r5 + 1 * 64]
>+ mova m3, [r5 + 1 * 64 + 32]
>+
>+ mova m4, [r5 + 2 * 64]
>+ mova m5, [r5 + 2 * 64 + 32]
>+
>+ mova m6, [r5 + 3 * 64]
>+ mova m7, [r5 + 3 * 64 + 32]
>+
>+ DCT32_PASS_2 0 * 32
>+ mova [r1], xm11
>+ DCT32_PASS_2 1 * 32
>+ mova [r1 + r2], xm11
>+ DCT32_PASS_2 2 * 32
>+ mova [r1 + r2 * 2], xm11
>+ DCT32_PASS_2 3 * 32
>+ mova [r1 + r3], xm11
>+
>+ lea r6, [r1 + r2 * 4]
>+ DCT32_PASS_2 4 * 32
>+ mova [r6], xm11
>+ DCT32_PASS_2 5 * 32
>+ mova [r6 + r2], xm11
>+ DCT32_PASS_2 6 * 32
>+ mova [r6 + r2 * 2], xm11
>+ DCT32_PASS_2 7 * 32
>+ mova [r6 + r3], xm11
>+
>+ lea r6, [r6 + r2 * 4]
>+ DCT32_PASS_2 8 * 32
>+ mova [r6], xm11
>+ DCT32_PASS_2 9 * 32
>+ mova [r6 + r2], xm11
>+ DCT32_PASS_2 10 * 32
>+ mova [r6 + r2 * 2], xm11
>+ DCT32_PASS_2 11 * 32
>+ mova [r6 + r3], xm11
>+
>+ lea r6, [r6 + r2 * 4]
>+ DCT32_PASS_2 12 * 32
>+ mova [r6], xm11
>+ DCT32_PASS_2 13 * 32
>+ mova [r6 + r2], xm11
>+ DCT32_PASS_2 14 * 32
>+ mova [r6 + r2 * 2], xm11
>+ DCT32_PASS_2 15 * 32
>+ mova [r6 + r3], xm11
>+
>+ lea r6, [r6 + r2 * 4]
>+ DCT32_PASS_2 16 * 32
>+ mova [r6], xm11
>+ DCT32_PASS_2 17 * 32
>+ mova [r6 + r2], xm11
>+ DCT32_PASS_2 18 * 32
>+ mova [r6 + r2 * 2], xm11
>+ DCT32_PASS_2 19 * 32
>+ mova [r6 + r3], xm11
>+
>+ lea r6, [r6 + r2 * 4]
>+ DCT32_PASS_2 20 * 32
>+ mova [r6], xm11
>+ DCT32_PASS_2 21 * 32
>+ mova [r6 + r2], xm11
>+ DCT32_PASS_2 22 * 32
>+ mova [r6 + r2 * 2], xm11
>+ DCT32_PASS_2 23 * 32
>+ mova [r6 + r3], xm11
>+
>+ lea r6, [r6 + r2 * 4]
>+ DCT32_PASS_2 24 * 32
>+ mova [r6], xm11
>+ DCT32_PASS_2 25 * 32
>+ mova [r6 + r2], xm11
>+ DCT32_PASS_2 26 * 32
>+ mova [r6 + r2 * 2], xm11
>+ DCT32_PASS_2 27 * 32
>+ mova [r6 + r3], xm11
>+
>+ lea r6, [r6 + r2 * 4]
>+ DCT32_PASS_2 28 * 32
>+ mova [r6], xm11
>+ DCT32_PASS_2 29 * 32
>+ mova [r6 + r2], xm11
>+ DCT32_PASS_2 30 * 32
>+ mova [r6 + r2 * 2], xm11
>+ DCT32_PASS_2 31 * 32
>+ mova [r6 + r3], xm11
>+
>+ add r5, 256
>+ add r1, 16
>+
>+ dec r4d
>+ jnz .pass2
>+ RET
>+%endif
>diff -r 7e29b10982d2 -r c57ba6597819 source/common/x86/dct8.h
>--- a/source/common/x86/dct8.h Thu Sep 11 19:24:28 2014 +0530
>+++ b/source/common/x86/dct8.h Fri Sep 12 12:02:46 2014 +0530
>@@ -31,6 +31,7 @@
> void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
> void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride);
> void x265_dct16_avx2(int16_t *src, int32_t *dst, intptr_t stride);
>+void x265_dct32_avx2(int16_t *src, int32_t *dst, intptr_t stride);
>
> void x265_denoise_dct_mmx(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
> void x265_denoise_dct_sse2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
>_______________________________________________
>x265-devel mailing list
>x265-devel@videolan.org
>https://mailman.videolan.org/listinfo/x265-devel