<div dir="ltr">Sorry, I removed it by mistake. I will resend the patch.</div><div class="gmail_extra"><br><div class="gmail_quote">On Mon, Sep 8, 2014 at 3:00 PM, Steve Borho <span dir="ltr"><<a href="mailto:steve@borho.org" target="_blank">steve@borho.org</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><span class="">On 09/08, <a href="mailto:yuvaraj@multicorewareinc.com">yuvaraj@multicorewareinc.com</a> wrote:<br>
> # HG changeset patch<br>
> # User Yuvaraj Venkatesh<<a href="mailto:yuvaraj@multicorewareinc.com">yuvaraj@multicorewareinc.com</a>><br>
> # Date 1410159827 -19800<br>
> # Mon Sep 08 12:33:47 2014 +0530<br>
> # Node ID f097f381abe941bca59584a56d861e53dabbd033<br>
> # Parent 8cbfec8d6b4d293a5e7f32e8fef46700b9f1cf6a<br>
> asm: avx2 assembly code for dct16<br>
><br>
> diff -r 8cbfec8d6b4d -r f097f381abe9 source/common/x86/asm-primitives.cpp<br>
> --- a/source/common/x86/asm-primitives.cpp Sun Sep 07 12:54:27 2014 +0900<br>
> +++ b/source/common/x86/asm-primitives.cpp Mon Sep 08 12:33:47 2014 +0530<br>
> @@ -1737,9 +1737,10 @@<br>
> p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;<br>
> p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;<br>
> p.denoiseDct = x265_denoise_dct_avx2;<br>
> -<br>
> p.dct[DCT_4x4] = x265_dct4_avx2;<br>
> - p.nquant = x265_nquant_avx2;<br>
<br>
</span>Did you intend to remove the nquant function assignment (p.nquant)? I'm guessing not.<br>
<div><div class="h5"><br>
> +#if X86_64<br>
> + p.dct[DCT_16x16] = x265_dct16_avx2;<br>
> +#endif<br>
> p.dequant_normal = x265_dequant_normal_avx2;<br>
> }<br>
> #endif // if HIGH_BIT_DEPTH<br>
> diff -r 8cbfec8d6b4d -r f097f381abe9 source/common/x86/dct8.asm<br>
> --- a/source/common/x86/dct8.asm Sun Sep 07 12:54:27 2014 +0900<br>
> +++ b/source/common/x86/dct8.asm Mon Sep 08 12:33:47 2014 +0530<br>
> @@ -29,13 +29,52 @@<br>
> %include "x86util.asm"<br>
><br>
> SECTION_RODATA 32<br>
> +tab_dct16_1: dw 64, 64, 64, 64, 64, 64, 64, 64<br>
> + dw 90, 87, 80, 70, 57, 43, 25, 9<br>
> + dw 89, 75, 50, 18, -18, -50, -75, -89<br>
> + dw 87, 57, 9, -43, -80, -90, -70, -25<br>
> + dw 83, 36, -36, -83, -83, -36, 36, 83<br>
> + dw 80, 9, -70, -87, -25, 57, 90, 43<br>
> + dw 75, -18, -89, -50, 50, 89, 18, -75<br>
> + dw 70, -43, -87, 9, 90, 25, -80, -57<br>
> + dw 64, -64, -64, 64, 64, -64, -64, 64<br>
> + dw 57, -80, -25, 90, -9, -87, 43, 70<br>
> + dw 50, -89, 18, 75, -75, -18, 89, -50<br>
> + dw 43, -90, 57, 25, -87, 70, 9, -80<br>
> + dw 36, -83, 83, -36, -36, 83, -83, 36<br>
> + dw 25, -70, 90, -80, 43, 9, -57, 87<br>
> + dw 18, -50, 75, -89, 89, -75, 50, -18<br>
> + dw 9, -25, 43, -57, 70, -80, 87, -90<br>
> +<br>
> +<br>
> +tab_dct16_2: dw 64, 64, 64, 64, 64, 64, 64, 64<br>
> + dw -9, -25, -43, -57, -70, -80, -87, -90<br>
> + dw -89, -75, -50, -18, 18, 50, 75, 89<br>
> + dw 25, 70, 90, 80, 43, -9, -57, -87<br>
> + dw 83, 36, -36, -83, -83, -36, 36, 83<br>
> + dw -43, -90, -57, 25, 87, 70, -9, -80<br>
> + dw -75, 18, 89, 50, -50, -89, -18, 75<br>
> + dw 57, 80, -25, -90, -9, 87, 43, -70<br>
> + dw 64, -64, -64, 64, 64, -64, -64, 64<br>
> + dw -70, -43, 87, 9, -90, 25, 80, -57<br>
> + dw -50, 89, -18, -75, 75, 18, -89, 50<br>
> + dw 80, -9, -70, 87, -25, -57, 90, -43<br>
> + dw 36, -83, 83, -36, -36, 83, -83, 36<br>
> + dw -87, 57, -9, -43, 80, -90, 70, -25<br>
> + dw -18, 50, -75, 89, -89, 75, -50, 18<br>
> + dw 90, -87, 80, -70, 57, -43, 25, -9<br>
> +<br>
> +dct16_shuf1: times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1<br>
> +<br>
> +dct16_shuf2: times 2 db 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9<br>
> +<br>
> +avx2_dct4: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64<br>
> + dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83<br>
><br>
> tab_dct4: times 4 dw 64, 64<br>
> times 4 dw 83, 36<br>
> times 4 dw 64, -64<br>
> times 4 dw 36, -83<br>
> -avx2_dct4: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64<br>
> - dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83<br>
><br>
> dct4_shuf: db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13<br>
><br>
> @@ -1067,3 +1106,233 @@<br>
> RET<br>
><br>
> %endif ; !HIGH_BIT_DEPTH<br>
> +<br>
> +%macro DCT16_PASS_1_E 2<br>
> + vpbroadcastq m7, [tab_dct16_1 + %1]<br>
> +<br>
> + pmaddwd m4, m0, m7<br>
> + pmaddwd m6, m2, m7<br>
> + phaddd m4, m6<br>
> +<br>
> + paddd m4, m9<br>
> + psrad m4, DCT_SHIFT<br>
> +<br>
> + packssdw m4, m4<br>
> + vpermq m4, m4, 0x08<br>
> +<br>
> + mova [r5 + %2], xm4<br>
> +%endmacro<br>
> +<br>
> +%macro DCT16_PASS_1_O 2<br>
> + vbroadcasti128 m7, [tab_dct16_1 + %1]<br>
> +<br>
> + pmaddwd m10, m0, m7<br>
> + pmaddwd m11, m2, m7<br>
> + phaddd m10, m11 ; [d0 d0 d1 d1 d4 d4 d5 d5]<br>
> +<br>
> + pmaddwd m11, m4, m7<br>
> + pmaddwd m12, m6, m7<br>
> + phaddd m11, m12 ; [d2 d2 d3 d3 d6 d6 d7 d7]<br>
> +<br>
> + phaddd m10, m11 ; [d0 d1 d2 d3 d4 d5 d6 d7]<br>
> +<br>
> + paddd m10, m9<br>
> + psrad m10, DCT_SHIFT<br>
> +<br>
> + packssdw m10, m10 ; [w0 w1 w2 w3 - - - - w4 w5 w6 w7 - - - -]<br>
> + vpermq m10, m10, 0x08<br>
> +<br>
> + mova [r5 + %2], xm10<br>
> +%endmacro<br>
> +<br>
> +%macro DCT16_PASS_2 1<br>
> + vbroadcasti128 m8, [tab_dct16_1 + %1]<br>
> + vbroadcasti128 m13, [tab_dct16_2 + %1]<br>
> +<br>
> + pmaddwd m10, m0, m8<br>
> + pmaddwd m11, m1, m13<br>
> + paddd m10, m11<br>
> +<br>
> + pmaddwd m11, m2, m8<br>
> + pmaddwd m12, m3, m13<br>
> + paddd m11, m12<br>
> + phaddd m10, m11<br>
> +<br>
> + pmaddwd m11, m4, m8<br>
> + pmaddwd m12, m5, m13<br>
> + paddd m11, m12<br>
> +<br>
> + pmaddwd m12, m6, m8<br>
> + pmaddwd m13, m7, m13<br>
> + paddd m12, m13<br>
> + phaddd m11, m12<br>
> +<br>
> + phaddd m10, m11<br>
> + paddd m10, m9<br>
> + psrad m10, DCT_SHIFT2<br>
> +%endmacro<br>
> +<br>
> +%if ARCH_X86_64<br>
> +INIT_YMM avx2<br>
> +cglobal dct16, 3, 9, 15, 0-16*mmsize<br>
> +%if BIT_DEPTH == 10<br>
> + %define DCT_SHIFT 5<br>
> + vpbroadcastd m9, [pd_16]<br>
> +%elif BIT_DEPTH == 8<br>
> + %define DCT_SHIFT 3<br>
> + vpbroadcastd m9, [pd_4]<br>
> +%else<br>
> + %error Unsupported BIT_DEPTH!<br>
> +%endif<br>
> +%define DCT_SHIFT2 10<br>
> +<br>
> + add r2d, r2d<br>
> +<br>
> + lea r3, [r2 * 3]<br>
> + mov r5, rsp<br>
> + mov r4d, 2<br>
> + mova m13, [dct16_shuf1]<br>
> + mova m14, [dct16_shuf2]<br>
> +<br>
> +.pass1:<br>
> + lea r6, [r0 + r2 * 4]<br>
> +<br>
> + mova m2, [r0]<br>
> + mova m1, [r6]<br>
> + vperm2i128 m0, m2, m1, 0x20 ; [row0lo row4lo]<br>
> + vperm2i128 m1, m1, m2, 0x13 ; [row0hi row4hi]<br>
> +<br>
> + mova m4, [r0 + r2]<br>
> + mova m3, [r6 + r2]<br>
> + vperm2i128 m2, m4, m3, 0x20 ; [row1lo row5lo]<br>
> + vperm2i128 m3, m3, m4, 0x13 ; [row1hi row5hi]<br>
> +<br>
> + mova m6, [r0 + r2 * 2]<br>
> + mova m5, [r6 + r2 * 2]<br>
> + vperm2i128 m4, m6, m5, 0x20 ; [row2lo row6lo]<br>
> + vperm2i128 m5, m5, m6, 0x13 ; [row2hi row6hi]<br>
> +<br>
> + mova m8, [r0 + r3]<br>
> + mova m7, [r6 + r3]<br>
> + vperm2i128 m6, m8, m7, 0x20 ; [row3lo row7lo]<br>
> + vperm2i128 m7, m7, m8, 0x13 ; [row3hi row7hi]<br>
> +<br>
> + pshufb m1, m13<br>
> + pshufb m3, m13<br>
> + pshufb m5, m13<br>
> + pshufb m7, m13<br>
> +<br>
> + paddw m8, m0, m1 ;E<br>
> + psubw m0, m1 ;O<br>
> +<br>
> + paddw m1, m2, m3 ;E<br>
> + psubw m2, m3 ;O<br>
> +<br>
> + paddw m3, m4, m5 ;E<br>
> + psubw m4, m5 ;O<br>
> +<br>
> + paddw m5, m6, m7 ;E<br>
> + psubw m6, m7 ;O<br>
> +<br>
> + DCT16_PASS_1_O 1 * 16, 1 * 32<br>
> + DCT16_PASS_1_O 3 * 16, 3 * 32<br>
> + DCT16_PASS_1_O 5 * 16, 1 * 32 + 16<br>
> + DCT16_PASS_1_O 7 * 16, 3 * 32 + 16<br>
> + DCT16_PASS_1_O 9 * 16, 5 * 32<br>
> + DCT16_PASS_1_O 11 * 16, 7 * 32<br>
> + DCT16_PASS_1_O 13 * 16, 5 * 32 + 16<br>
> + DCT16_PASS_1_O 15 * 16, 7 * 32 + 16<br>
> +<br>
> + pshufb m8, m14<br>
> + pshufb m1, m14<br>
> + phaddw m0, m8, m1<br>
> +<br>
> + pshufb m3, m14<br>
> + pshufb m5, m14<br>
> + phaddw m2, m3, m5<br>
> +<br>
> + DCT16_PASS_1_E 0 * 16, 0 * 32<br>
> + DCT16_PASS_1_E 4 * 16, 0 * 32 + 16<br>
> + DCT16_PASS_1_E 8 * 16, 4 * 32<br>
> + DCT16_PASS_1_E 12 * 16, 4 * 32 + 16<br>
> +<br>
> + phsubw m0, m8, m1<br>
> + phsubw m2, m3, m5<br>
> +<br>
> + DCT16_PASS_1_E 2 * 16, 2 * 32<br>
> + DCT16_PASS_1_E 6 * 16, 2 * 32 + 16<br>
> + DCT16_PASS_1_E 10 * 16, 6 * 32<br>
> + DCT16_PASS_1_E 14 * 16, 6 * 32 + 16<br>
> +<br>
> + lea r0, [r0 + 8 * r2]<br>
> + add r5, 256<br>
> +<br>
> + dec r4d<br>
> + jnz .pass1<br>
> +<br>
> + mov r5, rsp<br>
> + mov r4d, 2<br>
> + add r2d, r2d<br>
> + lea r3, [r2 * 3]<br>
> + vpbroadcastd m9, [pd_512]<br>
> +<br>
> +.pass2:<br>
> + mova m0, [r5 + 0 * 32] ; [row0lo row4lo]<br>
> + mova m1, [r5 + 8 * 32] ; [row0hi row4hi]<br>
> +<br>
> + mova m2, [r5 + 1 * 32] ; [row1lo row5lo]<br>
> + mova m3, [r5 + 9 * 32] ; [row1hi row5hi]<br>
> +<br>
> + mova m4, [r5 + 2 * 32] ; [row2lo row6lo]<br>
> + mova m5, [r5 + 10 * 32] ; [row2hi row6hi]<br>
> +<br>
> + mova m6, [r5 + 3 * 32] ; [row3lo row7lo]<br>
> + mova m7, [r5 + 11 * 32] ; [row3hi row7hi]<br>
> +<br>
> + DCT16_PASS_2 0 * 16<br>
> + mova [r1], m10<br>
> + DCT16_PASS_2 1 * 16<br>
> + mova [r1 + r2], m10<br>
> + DCT16_PASS_2 2 * 16<br>
> + mova [r1 + r2 * 2], m10<br>
> + DCT16_PASS_2 3 * 16<br>
> + mova [r1 + r3], m10<br>
> +<br>
> + lea r6, [r1 + r2 * 4]<br>
> + DCT16_PASS_2 4 * 16<br>
> + mova [r6], m10<br>
> + DCT16_PASS_2 5 * 16<br>
> + mova [r6 + r2], m10<br>
> + DCT16_PASS_2 6 * 16<br>
> + mova [r6 + r2 * 2], m10<br>
> + DCT16_PASS_2 7 * 16<br>
> + mova [r6 + r3], m10<br>
> +<br>
> + lea r6, [r6 + r2 * 4]<br>
> + DCT16_PASS_2 8 * 16<br>
> + mova [r6], m10<br>
> + DCT16_PASS_2 9 * 16<br>
> + mova [r6 + r2], m10<br>
> + DCT16_PASS_2 10 * 16<br>
> + mova [r6 + r2 * 2], m10<br>
> + DCT16_PASS_2 11 * 16<br>
> + mova [r6 + r3], m10<br>
> +<br>
> + lea r6, [r6 + r2 * 4]<br>
> + DCT16_PASS_2 12 * 16<br>
> + mova [r6], m10<br>
> + DCT16_PASS_2 13 * 16<br>
> + mova [r6 + r2], m10<br>
> + DCT16_PASS_2 14 * 16<br>
> + mova [r6 + r2 * 2], m10<br>
> + DCT16_PASS_2 15 * 16<br>
> + mova [r6 + r3], m10<br>
> +<br>
> + add r1, 32<br>
> + add r5, 128<br>
> +<br>
> + dec r4d<br>
> + jnz .pass2<br>
> +<br>
> + RET<br>
> +%endif<br>
> diff -r 8cbfec8d6b4d -r f097f381abe9 source/common/x86/dct8.h<br>
> --- a/source/common/x86/dct8.h Sun Sep 07 12:54:27 2014 +0900<br>
> +++ b/source/common/x86/dct8.h Mon Sep 08 12:33:47 2014 +0530<br>
> @@ -30,6 +30,7 @@<br>
> void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);<br>
> void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);<br>
> void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride);<br>
> +void x265_dct16_avx2(int16_t *src, int32_t *dst, intptr_t stride);<br>
><br>
> void x265_denoise_dct_mmx(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);<br>
> void x265_denoise_dct_sse2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);<br>
</div></div>> _______________________________________________<br>
> x265-devel mailing list<br>
> <a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
> <a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
<span class="HOEnZb"><font color="#888888"><br>
--<br>
Steve Borho<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</font></span></blockquote></div><br></div>