<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><div> </div><pre><br>At 2014-09-08 19:22:04,yuvaraj@multicorewareinc.com wrote:
># HG changeset patch
># User Yuvaraj Venkatesh &lt;<a href="mailto:yuvaraj@multicorewareinc.com">yuvaraj@multicorewareinc.com</a>&gt;
># Date 1410175257 -19800
># Mon Sep 08 16:50:57 2014 +0530
># Node ID 74efa467c779ada694ecb9b765b60278aec36950
># Parent 8cbfec8d6b4d293a5e7f32e8fef46700b9f1cf6a
>asm: avx2 assembly code for dct16
>
>+%macro DCT16_PASS_1_E 2
>+ vpbroadcastq m7, [tab_dct16_1 + %1]
</pre><pre>It would be better to keep the address of tab_dct16_1 in a register; you have many free registers.</pre><pre>>+ pmaddwd m4, m0, m7
>+ pmaddwd m6, m2, m7
>+ phaddd m4, m6
>+
>+ paddd m4, m9
>+ psrad m4, DCT_SHIFT
>+
>+ packssdw m4, m4
>+ vpermq m4, m4, 0x08
>+
>+ mova [r5 + %2], xm4
>+%endmacro
>+
>+%if ARCH_X86_64
</pre><pre>Use 'ARCH_X86_64 == 1' here; the other parts of the code use this style.</pre><pre> </pre><pre>>+INIT_YMM avx2
>+cglobal dct16, 3, 9, 15, 0-16*mmsize
>+%if BIT_DEPTH == 10
>+ %define DCT_SHIFT 5
>+ vpbroadcastd m9, [pd_16]
>+%elif BIT_DEPTH == 8
>+ %define DCT_SHIFT 3
>+ vpbroadcastd m9, [pd_4]
>+%else
>+ %error Unsupported BIT_DEPTH!
>+%endif
>+%define DCT_SHIFT2 10
>+
>+ add r2d, r2d
>+
>+ lea r3, [r2 * 3]
>+ mov r5, rsp
>+ mov r4d, 2
</pre><pre>16/8 ?</pre><pre> </pre><pre>>+ mova m13, [dct16_shuf1]
>+ mova m14, [dct16_shuf2]
>+
>+.pass1:
>+ lea r6, [r0 + r2 * 4]
>+
>+ mova m2, [r0]
>+ mova m1, [r6]
>+ vperm2i128 m0, m2, m1, 0x20 ; [row0lo row4lo]
>+ vperm2i128 m1, m1, m2, 0x13 ; [row0hi row4hi]
>+
>+ mova m4, [r0 + r2]
>+ mova m3, [r6 + r2]
>+ vperm2i128 m2, m4, m3, 0x20 ; [row1lo row5lo]
>+ vperm2i128 m3, m3, m4, 0x13 ; [row1hi row5hi]
</pre><pre>Keeping the same order is more readable.</pre><pre> </pre><pre>>+
>+ mova m6, [r0 + r2 * 2]
>+ mova m5, [r6 + r2 * 2]
>+ vperm2i128 m4, m6, m5, 0x20 ; [row2lo row6lo]
>+ vperm2i128 m5, m5, m6, 0x13 ; [row2hi row6hi]
>+
>+ mova m8, [r0 + r3]
>+ mova m7, [r6 + r3]
>+ vperm2i128 m6, m8, m7, 0x20 ; [row3lo row7lo]
>+ vperm2i128 m7, m7, m8, 0x13 ; [row3hi row7hi]
</pre><pre>>+
>+ pshufb m1, m13
>+ pshufb m3, m13
>+ pshufb m5, m13
>+ pshufb m7, m13
>+
>+ paddw m8, m0, m1 ;E
>+ psubw m0, m1 ;O
>+
>+ paddw m1, m2, m3 ;E
>+ psubw m2, m3 ;O
>+
>+ paddw m3, m4, m5 ;E
>+ psubw m4, m5 ;O
>+
>+ paddw m5, m6, m7 ;E
>+ psubw m6, m7 ;O
>+
>+ DCT16_PASS_1_O 1 * 16, 1 * 32
>+ DCT16_PASS_1_O 3 * 16, 3 * 32
>+ DCT16_PASS_1_O 5 * 16, 1 * 32 + 16
>+ DCT16_PASS_1_O 7 * 16, 3 * 32 + 16
>+ DCT16_PASS_1_O 9 * 16, 5 * 32
>+ DCT16_PASS_1_O 11 * 16, 7 * 32
>+ DCT16_PASS_1_O 13 * 16, 5 * 32 + 16
>+ DCT16_PASS_1_O 15 * 16, 7 * 32 + 16
>+
>+ pshufb m8, m14
>+ pshufb m1, m14
>+ phaddw m0, m8, m1
>+
>+ pshufb m3, m14
>+ pshufb m5, m14
>+ phaddw m2, m3, m5
>+
>+ DCT16_PASS_1_E 0 * 16, 0 * 32
>+ DCT16_PASS_1_E 4 * 16, 0 * 32 + 16
>+ DCT16_PASS_1_E 8 * 16, 4 * 32
>+ DCT16_PASS_1_E 12 * 16, 4 * 32 + 16
>+
>+ phsubw m0, m8, m1
>+ phsubw m2, m3, m5
>+
>+ DCT16_PASS_1_E 2 * 16, 2 * 32
>+ DCT16_PASS_1_E 6 * 16, 2 * 32 + 16
>+ DCT16_PASS_1_E 10 * 16, 6 * 32
>+ DCT16_PASS_1_E 14 * 16, 6 * 32 + 16
>+
>+ lea r0, [r0 + 8 * r2]
>+ add r5, 256
>+
>+ dec r4d
>+ jnz .pass1
>+
>+ mov r5, rsp
>+ mov r4d, 2
>+ add r2d, r2d
>+ lea r3, [r2 * 3]
>+ vpbroadcastd m9, [pd_512]
>+
>+.pass2:
>+ mova m0, [r5 + 0 * 32] ; [row0lo row4lo]
>+ mova m1, [r5 + 8 * 32] ; [row0hi row4hi]
>+
>+ mova m2, [r5 + 1 * 32] ; [row1lo row5lo]
>+ mova m3, [r5 + 9 * 32] ; [row1hi row5hi]
>+
>+ mova m4, [r5 + 2 * 32] ; [row2lo row6lo]
>+ mova m5, [r5 + 10 * 32] ; [row2hi row6hi]
>+
>+ mova m6, [r5 + 3 * 32] ; [row3lo row7lo]
>+ mova m7, [r5 + 11 * 32] ; [row3hi row7hi]
>+
>+ DCT16_PASS_2 0 * 16
>+ mova [r1], m10
>+ DCT16_PASS_2 1 * 16
>+ mova [r1 + r2], m10
>+ DCT16_PASS_2 2 * 16
>+ mova [r1 + r2 * 2], m10
>+ DCT16_PASS_2 3 * 16
>+ mova [r1 + r3], m10
>+
>+ lea r6, [r1 + r2 * 4]
>+ DCT16_PASS_2 4 * 16
>+ mova [r6], m10
>+ DCT16_PASS_2 5 * 16
>+ mova [r6 + r2], m10
>+ DCT16_PASS_2 6 * 16
>+ mova [r6 + r2 * 2], m10
>+ DCT16_PASS_2 7 * 16
>+ mova [r6 + r3], m10
>+
>+ lea r6, [r6 + r2 * 4]
>+ DCT16_PASS_2 8 * 16
>+ mova [r6], m10
>+ DCT16_PASS_2 9 * 16
>+ mova [r6 + r2], m10
>+ DCT16_PASS_2 10 * 16
>+ mova [r6 + r2 * 2], m10
>+ DCT16_PASS_2 11 * 16
>+ mova [r6 + r3], m10
>+
>+ lea r6, [r6 + r2 * 4]
>+ DCT16_PASS_2 12 * 16
>+ mova [r6], m10
>+ DCT16_PASS_2 13 * 16
>+ mova [r6 + r2], m10
>+ DCT16_PASS_2 14 * 16
>+ mova [r6 + r2 * 2], m10
>+ DCT16_PASS_2 15 * 16
>+ mova [r6 + r3], m10
>+
>+ add r1, 32
>+ add r5, 128
>+
>+ dec r4d
>+ jnz .pass2
>+
>+ RET
>+%endif
</pre></div>