[x265] [PATCH] asm: idct8 sse2
chen
chenm003 at 163.com
Fri Nov 21 04:46:03 CET 2014
I remember we have IDCT asm code, how many cycles/performance on your version?
At 2014-11-21 11:24:08,dtyx265 at gmail.com wrote:
># HG changeset patch
># User David T Yuen <dtyx265 at gmail.com>
># Date 1416540113 28800
># Node ID f4a932dba6993d8bcff3ddea7dc8c83c55d52396
># Parent 1d17ec0cb9548194b90495c5d7c94552c71abbf5
>asm: idct8 sse2
>
>this version is based directly on the sse3 intrinsic
>basically, it's the intrinsic version with almost no optimizations
>but I thought it might be a better starting point for optimization
>than gcc's optimized output
>
>diff -r 1d17ec0cb954 -r f4a932dba699 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Thu Nov 20 20:04:02 2014 +0530
>+++ b/source/common/x86/asm-primitives.cpp Thu Nov 20 19:21:53 2014 -0800
>@@ -1376,6 +1376,7 @@
> p.dct[DCT_4x4] = x265_dct4_sse2;
> p.idct[IDCT_4x4] = x265_idct4_sse2;
> p.idct[IDST_4x4] = x265_idst4_sse2;
>+ p.idct[IDCT_8x8] = x265_idct8_sse2;
>
> LUMA_SS_FILTERS(_sse2);
> }
>@@ -1564,6 +1565,7 @@
>
> p.dct[DCT_4x4] = x265_dct4_sse2;
> p.idct[IDCT_4x4] = x265_idct4_sse2;
>+ p.idct[IDCT_8x8] = x265_idct8_sse2;
> p.idct[IDST_4x4] = x265_idst4_sse2;
>
> p.planecopy_sp = x265_downShift_16_sse2;
>diff -r 1d17ec0cb954 -r f4a932dba699 source/common/x86/dct8.asm
>--- a/source/common/x86/dct8.asm Thu Nov 20 20:04:02 2014 +0530
>+++ b/source/common/x86/dct8.asm Thu Nov 20 19:21:53 2014 -0800
>@@ -302,6 +302,19 @@
>
> pb_idct8odd: db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15
>
>+tab_idct8: times 4 dw 89, 75 ;8x8 IDCT coefficient pairs for pmaddwd; entry k read as [tab_idct8 + k * 16]
>+ times 4 dw 50, 18 ;entries 0-7: odd-part butterflies (O0..O3, built from rows 1,3,5,7)
>+ times 4 dw 75, -18
>+ times 4 dw -89, -50
>+ times 4 dw 50, -89
>+ times 4 dw 18, 75
>+ times 4 dw 18, -50
>+ times 4 dw 75, -89
>+ times 4 dw 64, 64 ;entries 8-11: even-part butterflies (EE0/EE1 from rows 0,4; E00/E01 from rows 2,6)
>+ times 4 dw 64, -64
>+ times 4 dw 83, 36
>+ times 4 dw 36, -83
>+
> SECTION .text
> cextern pd_1
> cextern pd_2
>@@ -974,6 +987,437 @@
> RET
>
> ;-------------------------------------------------------
>+; void idct8(int16_t *src, int16_t *dst, intptr_t stride)
>+;-------------------------------------------------------
>+INIT_XMM sse2
>+
>+%if BIT_DEPTH == 10
>+ %define IDCT_SHIFT 10
>+ %define IDCT_ADD pd_512
>+%elif BIT_DEPTH == 8
>+ %define IDCT_SHIFT 12
>+ %define IDCT_ADD pd_2048
>+%else
>+ %error Unsupported BIT_DEPTH!
>+%endif
>+
>+cglobal idct8, 3,7, 16 ;16 xmm regs => m8-m15 used below, so this body is x86-64 only
>+ lea r2, [r2 + r2] ;set r2 to index of 1
>+ lea r4, [r2 + r2] ;set r4 to index of 2
>+ lea r3, [r4 + r2] ;set r3 to index of 3
>+ lea r4, [r4 + r3] ;set r4 to index of 5
>+ mov r5, rsp ;use two aligned 16-byte slots below rsp as spill scratch
>+ and r5, ~(16-1) ;NOTE(review): stores hit [r5 - 32..r5 - 16] below rsp; fine for a SysV leaf (red zone), unsafe on Win64 - confirm or reserve stack
>+ mova m6, [r0 + 1 * 16] ;m6 = m128iS1 = _mm_load_si128((__m128i*)&src[8 + 0]);
>+ mova m15, [r0 + 3 * 16] ;m15 = m128iS3 = _mm_load_si128((__m128i*)&src[24 + 0]);
>+ mova m7, m6 ;m7 = m6, copy m128iS1 to m1
>+ punpcklwd m6, m15 ;m6 = m128Tmp0 = _mm_unpacklo_epi16(m6 = m128iS1, m15 = m128iS3);
>+ mova m0, [tab_idct8 + 0 * 16] ;m0 = tab_idct_8x8[0];
>+ mova m1, m0 ;m1 = m0, copy tab_idct_8x8[0] to m1
>+ pmaddwd m0, m6 ;m0 = E1l = _mm_madd_epi16(m6 = m128Tmp0, _mm_load_si128((__m128i*)(m0 = tab_idct_8x8[0])));
>+ punpckhwd m7, m15 ;m7 = m128Tmp1 = _mm_unpackhi_epi16(m7 = m128iS1 , m15 = m128iS3);
>+ mova m12, [r0 + 5 * 16] ;m12 = m128iS5 = _mm_load_si128((__m128i*)&src[40 + 0]);
>+ pmaddwd m1, m7 ;m1 = E1h = _mm_madd_epi16(m7 = m128Tmp1, _mm_load_si128((__m128i*)(m1 = tab_idct_8x8[0])));
>+ mova m15, [r0 + 7 * 16] ;m15 = m128iS7 = _mm_load_si128((__m128i*)&src[56 + 0]); (mova: src is 16-aligned, same as every other load)
>+ mova m13, m12 ;m13 = m12, copy m128iS5 to m13
>+ punpcklwd m12, m15 ;m12 = m128Tmp2 = _mm_unpacklo_epi16(m12 = m128iS5, m15 = m128iS7);
>+ mova m8, [tab_idct8 + 1 * 16] ;m8 = tab_idct_8x8[1]; (mova: table accessed with mova everywhere else)
>+ mova m9, m8 ;m9 = m8, copy tab_idct_8x8[1] to m9
>+ pmaddwd m8, m12 ;m8 = E2l = _mm_madd_epi16(m12 = m128Tmp2, _mm_load_si128((__m128i*)(m8 = tab_idct_8x8[1])));
>+ punpckhwd m13, m15 ;m13 = m128Tmp3 = _mm_unpackhi_epi16(m13 = m128iS5, m15 = m128iS7);
>+ pmaddwd m9, m13 ;m9 = E2h = _mm_madd_epi16(m13 = m128Tmp3, _mm_load_si128((__m128i*)(m9 = tab_idct_8x8[1])));
>+ paddd m0, m8 ;m0 = O0l = _mm_add_epi32(m0 = E1l, m8 = E2l);
>+ paddd m1, m9 ;m1 = O0h = _mm_add_epi32(m1 = E1h, m9 = E2h);
>+ mova m2, [tab_idct8 + 2 * 16] ;m2 = tab_idct_8x8[2];
>+ mova m3, m2 ;m3 = m2, copy tab_idct_8x8[2] to m3
>+ pmaddwd m2, m6 ;m2 = E1l = _mm_madd_epi16(m6 = m128Tmp0, _mm_load_si128((__m128i*)(m2 = tab_idct_8x8[2])));
>+ pmaddwd m3, m7 ;m3 = E1h = _mm_madd_epi16(m7 = m128Tmp1, _mm_load_si128((__m128i*)(m3 = tab_idct_8x8[2])));
>+ mova m8, [tab_idct8 + 3 * 16] ;m8 = tab_idct_8x8[3];
>+ mova m9, m8 ;m9 = m8, copy tab_idct_8x8[3] to m9
>+ pmaddwd m8, m12 ;m8 = E2l = _mm_madd_epi16(m12 = m128Tmp2, _mm_load_si128((__m128i*)(m8 = tab_idct_8x8[3])));
>+ pmaddwd m9, m13 ;m9 = E2h = _mm_madd_epi16(m13 = m128Tmp3, _mm_load_si128((__m128i*)(m9 = tab_idct_8x8[3])));
>+ paddd m2, m8 ;m2 = O1l = _mm_add_epi32(m2 = E1l, m8 = E2l);
>+ paddd m3, m9 ;m3 = O1h = _mm_add_epi32(m3 = E1h, m9 = E2h);
>+ mova m4, [tab_idct8 + 4 * 16] ;m4 = tab_idct_8x8[4];
>+ mova m5, m4 ;m5 = m4, copy tab_idct_8x8[4] to m5
>+ pmaddwd m4, m6 ;m4 = E1l = _mm_madd_epi16(m6 = m128Tmp0, _mm_load_si128((__m128i*)(m4 = tab_idct_8x8[4])));
>+ pmaddwd m5, m7 ;m5 = E1h = _mm_madd_epi16(m7 = m128Tmp1, _mm_load_si128((__m128i*)(m5 = tab_idct_8x8[4])));
>+ mova m8, [tab_idct8 + 5 * 16] ;m8 = tab_idct_8x8[5];
>+ mova m9, m8 ;m9 = m8, copy tab_idct_8x8[5] to m9
>+ pmaddwd m8, m12 ;m8 = E2l = _mm_madd_epi16(m12 = m128Tmp2, _mm_load_si128((__m128i*)(m8 = tab_idct_8x8[5])));
>+ pmaddwd m9, m13 ;m9 = E2h = _mm_madd_epi16(m13 = m128Tmp3, _mm_load_si128((__m128i*)(m9 = tab_idct_8x8[5])));
>+ paddd m4, m8 ;m4 = O2l = _mm_add_epi32(m4 = E1l, m8 = E2l);
>+ paddd m5, m9 ;m5 = O2h = _mm_add_epi32(m5 = E1h, m9 = E2h);
>+ pmaddwd m6, [tab_idct8 + 6 * 16] ;m6 = E1l = _mm_madd_epi16(m6 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
>+ pmaddwd m7, [tab_idct8 + 6 * 16] ;m7 = E1h = _mm_madd_epi16(m7 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
>+ pmaddwd m12, [tab_idct8 + 7 * 16] ;m12 = E2l = _mm_madd_epi16(m12 = m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
>+ pmaddwd m13, [tab_idct8 + 7 * 16] ;m13 = E2h = _mm_madd_epi16(m13 = m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
>+ paddd m6, m12 ;m6 = O3l = _mm_add_epi32(m6 = E1l, m12 = E2l);
>+ paddd m7, m13 ;m7 = O3h = _mm_add_epi32(m7 = E1h, m13 = E2h);
>+
>+ ;/* ------- */
>+
>+ mova m8, [r0 + 0 * 16] ;m8 = m128iS0 = _mm_load_si128((__m128i*)&src[0 + 0]);
>+ mova m15, [r0 + 4 * 16] ;m15 = m128iS4 = _mm_load_si128((__m128i*)&src[32 + 0]);
>+ mova m9, m8 ;m9 = m8, copy m128iS0 to m9
>+ punpcklwd m8, m15 ;m8 = m128Tmp0 = _mm_unpacklo_epi16(m8 = m128iS0 , m15 = m128iS4);
>+ mova m10, m8 ;m10 = m8, copy m128Tmp0 to m10
>+ pmaddwd m8, [tab_idct8 + 8 * 16] ;m8 = EE0l = _mm_madd_epi16(m8 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
>+ punpckhwd m9, m15 ;m9 = m128Tmp1 = _mm_unpackhi_epi16(m9 = m128iS0, m15 = m128iS4);
>+ mova m11, m9 ;m11 = m9, copy m128Tmp1 to m11
>+ pmaddwd m9, [tab_idct8 + 8 * 16] ;m9 = EE0h = _mm_madd_epi16(m9 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
>+ pmaddwd m10, [tab_idct8 + 9 * 16] ;m10 = EE1l = _mm_madd_epi16(m10 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
>+ pmaddwd m11, [tab_idct8 + 9 * 16] ;m11 = EE1h = _mm_madd_epi16(m11 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
>+
>+ ;/* ------- */
>+
>+ mova m12, [r0 + 2 * 16] ;m12 = m128iS2 = _mm_load_si128((__m128i*)&src[16 + 0]);
>+ mova m15, [r0 + 6 * 16] ;m15 = m128iS6 = _mm_load_si128((__m128i*)&src[48 + 0]);
>+ mova m13, m12 ;m13 = m12, copy m128iS2 to m13
>+ punpcklwd m12, m15 ;m12 = m128Tmp0 = _mm_unpacklo_epi16(m12 = m128iS2, m15 = m128iS6);
>+ mova m14, m12 ;m14 = m12, copy m128Tmp0 to m14
>+ pmaddwd m12, [tab_idct8 + 10 * 16] ;m12 = E00l = _mm_madd_epi16(m12 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
>+ punpckhwd m13, m15 ;m13 = m128Tmp1 = _mm_unpackhi_epi16(m13 = m128iS2, m15 = m128iS6);
>+ mova m15, m13 ;m15 = m13, copy m128Tmp1 to m15
>+ pmaddwd m13, [tab_idct8 + 10 * 16] ;m13 = E00h = _mm_madd_epi16(m13 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
>+ pmaddwd m14, [tab_idct8 + 11 * 16] ;m14 = E01l = _mm_madd_epi16(m14 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
>+ pmaddwd m15, [tab_idct8 + 11 * 16] ;m15 = E01h = _mm_madd_epi16(m15 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
>+ mova [r5 - 1 * 16], m12 ;s_1 = m12, copy E00l to stack[1]
>+ paddd m12, m8 ;m12 = E0l = _mm_add_epi32(m8 = EE0l, m12 = E00l);
>+ paddd m12, [pd_64] ;m12 = E0l = _mm_add_epi32(m12 = E0l, pd_64);
>+ mova [r5 - 2 * 16], m13 ;s_2 = m13, copy E00h to stack[2]
>+ paddd m13, m9 ;m13 = E0h = _mm_add_epi32(m9 = EE0h, m13 = E00h);
>+ paddd m13, [pd_64] ;m13 = E0h = _mm_add_epi32(m13 = E0h, pd_64);
>+ psubd m8, [r5 - 1 * 16] ;m8 = E3l = _mm_sub_epi32(m8 = EE0l, s_1 = E00l);
>+ psubd m9, [r5 - 2 * 16] ;m9 = E3h = _mm_sub_epi32(m9 = EE0h, s_2 = E00h);
>+ paddd m8, [pd_64] ;m8 = E3l = _mm_add_epi32(m8 = E3l, pd_64);
>+ mova [r5 - 1 * 16], m14 ;s_1 = m14, copy E01l to stack[1]
>+ paddd m9, [pd_64] ;m9 = E3h = _mm_add_epi32(m9 = E3h, pd_64);
>+ paddd m14, m10 ;m14 = E1l = _mm_add_epi32(m10 = EE1l, m14 = E01l);
>+ mova [r5 - 2 * 16], m15 ;s_2 = m15, copy E01h to stack[2]
>+ paddd m14, [pd_64] ;m14 = E1l = _mm_add_epi32(m14 = E1l, pd_64);
>+ paddd m15, m11 ;m15 = E1h = _mm_add_epi32(m11 = EE1h, m15 = E01h);
>+ paddd m15, [pd_64] ;m15 = E1h = _mm_add_epi32(m15 = E1h, pd_64);
>+ psubd m10, [r5 - 1 * 16] ;m10 = E2l = _mm_sub_epi32(m10 = EE1l, s_1 = E01l);
>+ mova [r5 - 1 * 16], m0 ;s_1 = m0, copy O0l to stack[1]
>+ paddd m10, [pd_64] ;m10 = E2l = _mm_add_epi32(m10 = E2l, pd_64);
>+ psubd m11, [r5 - 2 * 16] ;m11 = E2h = _mm_sub_epi32(m11 = EE1h, s_2 = E01h);
>+ paddd m11, [pd_64] ;m11 = E2h = _mm_add_epi32(m11 = E2h, pd_64);
>+ mova [r5 - 2 * 16], m1 ;s_2 = m1, copy O0h to stack[2]
>+ ;m0 = m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m12 = E0l, m0 = O0l), 7), _mm_srai_epi32(_mm_add_epi32(m13 = E0h, m1 = O0h), 7));
>+ paddd m0, m12 ;m0 = _mm_add_epi32(m12 = E0l, m0 = O0l)
>+ psrad m0, 7 ;m0 = _mm_srai_epi32(m0, 7)
>+ paddd m1, m13 ;m1 = _mm_add_epi32(m13 = E0h, m1 = O0h)
>+ psrad m1, 7 ;m1 = _mm_srai_epi32(m1, 7)
>+ packssdw m0, m1 ;m0 = m128iS0 = _mm_packs_epi32(m0 , m1)
>+ ;m12 = m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m12 = E0l, s_1 = O0l), 7), _mm_srai_epi32(_mm_sub_epi32(m13 = E0h, s_2 = O0h), 7));
>+ psubd m12, [r5 - 1 * 16] ;m12 = _mm_sub_epi32(m12 = E0l, s_1 = O0l)
>+ psrad m12, 7 ;m12 = _mm_srai_epi32(m12, 7)
>+ psubd m13, [r5 - 2 * 16] ;m13 = _mm_sub_epi32(m13 = E0h, s_2 = O0h)
>+ psrad m13, 7 ;m13 = _mm_srai_epi32(m13, 7)
>+ packssdw m12, m13 ;m12 = m128iS7 = _mm_packs_epi32(m12, m13)
>+ ;m2 = m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m14 = E1l, m2 = O1l), 7), _mm_srai_epi32(_mm_add_epi32(m15 = E1h, m3 = O1h), 7));
>+ mova m1, m2 ;m1 = m2, copy O1l to m1
>+ mova m13, m3 ;m13 = m3, copy O1h to m13
>+ paddd m2, m14 ;m2 = _mm_add_epi32(m14 = E1l, m2 = O1l)
>+ psrad m2, 7 ;m2 = _mm_srai_epi32(m2, 7)
>+ paddd m3, m15 ;m3 = _mm_add_epi32(m15 = E1h, m3 = O1h) (fixed: stray trailing comma was a syntax error)
>+ psrad m3, 7 ;m3 = _mm_srai_epi32(m3, 7)
>+ packssdw m2, m3 ;m2 = m128iS1 = _mm_packs_epi32(m2, m3)
>+ ;m14 = m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m14 = E1l, m1 = O1l), 7), _mm_srai_epi32(_mm_sub_epi32(m15 = E1h, m13 = O1h), 7));
>+ psubd m14, m1 ;m14 = _mm_sub_epi32(m14 = E1l, m1 = O1l)
>+ psrad m14, 7 ;m14 = _mm_srai_epi32(m14, 7)
>+ psubd m15, m13 ;m15 = _mm_sub_epi32(m15 = E1h, m13 = O1h)
>+ psrad m15, 7 ;m15 = _mm_srai_epi32(m15, 7)
>+ packssdw m14, m15 ;m14 = m128iS6 = _mm_packs_epi32(m14, m15)
>+ ;m4 = m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m10 = E2l, m4 = O2l), 7), _mm_srai_epi32(_mm_add_epi32(m11 = E2h, m5 = O2h), 7));
>+ mova m3, m4 ;m3 = m4, copy O2l to m3
>+ mova m1, m5 ;m1 = m5, copy O2h to m1
>+ paddd m4, m10 ;m4 = _mm_add_epi32(m10 = E2l, m4 = O2l)
>+ psrad m4, 7 ;m4 = _mm_srai_epi32(m4, 7)
>+ paddd m5, m11 ;m5 = _mm_add_epi32(m11 = E2h, m5 = O2h)
>+ psrad m5, 7 ;m5 = _mm_srai_epi32(m5, 7)
>+ packssdw m4, m5 ;m4 = m128iS2 = _mm_packs_epi32(m4, m5)
>+ ;m10 = m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m10 = E2l, m3 = O2l), 7), _mm_srai_epi32(_mm_sub_epi32(m11 = E2h, m1 = O2h), 7));
>+ psubd m10, m3 ;m10 = _mm_sub_epi32(m10 = E2l, m3 = O2l)
>+ psrad m10, 7 ;m10 = _mm_srai_epi32(m10, 7)
>+ psubd m11, m1 ;m11 = _mm_sub_epi32(m11 = E2h, m1 = O2h)
>+ psrad m11, 7 ;m11 = _mm_srai_epi32(m11, 7)
>+ packssdw m10, m11 ;m10 = m128iS5 = _mm_packs_epi32(m10, m11)
>+ ;m6 = m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m8 = E3l, m6 = O3l), 7), _mm_srai_epi32(_mm_add_epi32(m9 = E3h, m7 = O3h), 7));
>+ mova m13, m6 ;m13 = m6, copy O3l to m13
>+ paddd m6, m8 ;m6 = _mm_add_epi32(m8 = E3l, m6 = O3l)
>+ psrad m6, 7 ;m6 = _mm_srai_epi32(m6, 7)
>+ mova m15, m7 ;m15 = m7, copy O3h to m15
>+ paddd m7, m9 ;m7 = _mm_add_epi32(m9 = E3h, m7 = O3h)
>+ psrad m7, 7 ;m7 = _mm_srai_epi32(m7, 7)
>+ packssdw m6, m7 ;m6 = m128iS3 = _mm_packs_epi32(m6, m7)
>+ ;m8 = m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m8 = E3l, m13 = O3l), 7), _mm_srai_epi32(_mm_sub_epi32(m9 = E3h, m15 = O3h), 7));
>+ psubd m8, m13 ;m8 = _mm_sub_epi32(m8 = E3l, m13 = O3l)
>+ psrad m8, 7 ;m8 = _mm_srai_epi32(m8, 7)
>+ psubd m9, m15 ;m9 = _mm_sub_epi32(m9 = E3h, m15 = O3h)
>+ psrad m9, 7 ;m9 = _mm_srai_epi32(m9, 7)
>+ packssdw m8, m9 ;m8 = m128iS4 = _mm_packs_epi32(m8, m9)
>+ ; /* Invers matrix */
>+
>+ mova m1, m0 ;m1 = m0, copy m128iS0 to m1
>+ punpcklwd m0, m8 ;m0 = E0l = _mm_unpacklo_epi16(m0 = m128iS0, m8 = m128iS4);
>+ mova m3, m2 ;m3 = m2, copy m128iS1 to m3
>+ punpcklwd m2, m10 ;m2 = E1l = _mm_unpacklo_epi16(m2 = m128iS1, m10 = m128iS5);
>+ mova m5, m4 ;m5 = m4, copy m128iS2 to m5
>+ punpcklwd m4, m14 ;m4 = E2l = _mm_unpacklo_epi16(m4 = m128iS2, m14 = m128iS6);
>+ mova m7, m6 ;m7 = m6, copy m128iS3 to m7
>+ punpcklwd m6, m12 ;m6 = E3l = _mm_unpacklo_epi16(m6 = m128iS3, m12 = m128iS7);
>+ punpckhwd m1, m8 ;m1 = O0l = _mm_unpackhi_epi16(m1 = m128iS0, m8 = m128iS4);
>+ punpckhwd m3, m10 ;m3 = O1l = _mm_unpackhi_epi16(m3 = m128iS1, m10 = m128iS5);
>+ punpckhwd m5, m14 ;m5 = O2l = _mm_unpackhi_epi16(m5 = m128iS2, m14 = m128iS6);
>+ punpckhwd m7, m12 ;m7 = O3l = _mm_unpackhi_epi16(m7 = m128iS3, m12 = m128iS7);
>+ mova m12, m0 ;m12 = m0, copy E0l to m12
>+ punpcklwd m0, m4 ;m0 = m128Tmp0 = _mm_unpacklo_epi16(m0 = E0l, m4 = E2l);
>+ mova m14, m0 ;m14 = m0, copy m128Tmp0 to m14
>+ mova m13, m2 ;m13 = m2, copy E1l to m13
>+ punpcklwd m2, m6 ;m2 = m128Tmp1 = _mm_unpacklo_epi16(m2 = E1l, m6 = E3l);
>+ punpcklwd m0, m2 ;m0 = m128iS0 = _mm_unpacklo_epi16(m0 = m128Tmp0, m2 = m128Tmp1);
>+ punpckhwd m14, m2 ;m14 = m128iS1 = _mm_unpackhi_epi16(m14 = m128Tmp0, m2 = m128Tmp1);
>+ punpckhwd m12, m4 ;m12 = m128Tmp2 = _mm_unpackhi_epi16(m12 = E0l, m4 = E2l);
>+ mova m2, m12 ;m2 = m12, copy m128Tmp2 to m2
>+ punpckhwd m13, m6 ;m13 = m128Tmp3 = _mm_unpackhi_epi16(m13 = E1l, m6 = E3l);
>+ punpcklwd m2, m13 ;m2 = m128iS2 = _mm_unpacklo_epi16(m2 = m128Tmp2, m13 = m128Tmp3);
>+ punpckhwd m12, m13 ;m12 = m128iS3 = _mm_unpackhi_epi16(m12 = m128Tmp2, m13 = m128Tmp3);
>+ mova m11, m1 ;m11 = m1, copy O0l to m11
>+ punpcklwd m1, m5 ;m1 = m128Tmp0 = _mm_unpacklo_epi16(m1 = O0l, m5 = O2l);
>+ mova m4, m1 ;m4 = m1, copy m128Tmp0 to m4
>+ mova m10, m3 ;m10 = m3, copy O1l to m10
>+ punpcklwd m3, m7 ;m3 = m128Tmp1 = _mm_unpacklo_epi16(m3 = O1l, m7 = O3l);
>+ punpcklwd m1, m3 ;m1 = m128iS4 = _mm_unpacklo_epi16(m1 = m128Tmp0, m3 = m128Tmp1);
>+ punpckhwd m4, m3 ;m4 = m128iS5 = _mm_unpackhi_epi16(m4 = m128Tmp0, m3 = m128Tmp1);
>+ punpckhwd m11, m5 ;m11 = m128Tmp2 = _mm_unpackhi_epi16(m11 = O0l, m5 = O2l);
>+ mova m5, m11 ;m5 = m11, copy m128Tmp2 to m5
>+ punpckhwd m10, m7 ;m10 = m128Tmp3 = _mm_unpackhi_epi16(m10 = O1l, m7 = O3l);
>+ punpcklwd m5, m10 ;m5 = m128iS6 = _mm_unpacklo_epi16(m5 = m128Tmp2, m10 = m128Tmp3);
>+ punpckhwd m11, m10 ;m11 = m128iS7 = _mm_unpackhi_epi16(m11 = m128Tmp2, m10 = m128Tmp3);
>+
>+ ;m128iAdd = _mm_set1_epi32(2048);
>+
>+ mova m15, m14 ;m15 = m14, copy m128iS1 to m15
>+ punpcklwd m14, m12 ;m14 = m128Tmp0 = _mm_unpacklo_epi16(m14 = m128iS1, m12 = m128iS3);
>+ mova m13, [tab_idct8] ;m13 = tab_idct_8x8[0];
>+ mova m6, m13 ;m6 = m13, copy tab_idct_8x8[0] to m6
>+ pmaddwd m13, m14 ;m13 = E1l = _mm_madd_epi16(m14 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
>+ punpckhwd m15, m12 ;m15 = m128Tmp1 = _mm_unpackhi_epi16(m15 = m128iS1, m12 = m128iS3);
>+ pmaddwd m6, m15 ;m6 = E1h = _mm_madd_epi16(m15 = m128Tmp1, _mm_load_si128((__m128i*)(m6 = tab_idct_8x8[0])));
>+ mova m8, m4 ;m8 = m4, copy m128iS5 to m8
>+ punpcklwd m4, m11 ;m4 = _m128Tmp2 = mm_unpacklo_epi16(m4 = m128iS5, m11 = m128iS7);
>+ mova m7, [tab_idct8 + 1 * 16] ;m7 = tab_idct_8x8[1];
>+ mova m9, m7 ;m9 = m7, copy tab_idct_8x8[1] to m9
>+ pmaddwd m7, m4 ;m7 = E2l = _mm_madd_epi16(m4 = _m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
>+ punpckhwd m8, m11 ;m8 = m128Tmp3 = _mm_unpackhi_epi16(m8 = m128iS5, m11 = m128iS7);
>+ pmaddwd m9, m8 ;m9 = E2h = _mm_madd_epi16(m8 = m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
>+ paddd m13, m7 ;m13 = O0l = _mm_add_epi32(m13 = E1l, m7 = E2l);
>+ paddd m6, m9 ;m6 = O0h = _mm_add_epi32(m6 = E1h, m9 = E2h);
>+ mova m7, [tab_idct8 + 2 * 16] ;m7 = tab_idct_8x8[2];
>+ mova m9, m7 ;m9 = m7, copy tab_idct_8x8[2] to m9
>+ pmaddwd m7, m14 ;m7 = E1l = _mm_madd_epi16(m14 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
>+ pmaddwd m9, m15 ;m9 = E1h = _mm_madd_epi16(m15 = m128Tmp1, _mm_load_si128((__m128i*)(m9 = tab_idct_8x8[2])));
>+ mova m10, [tab_idct8 + 3 * 16] ;m10 = tab_idct_8x8[3];
>+ mova m12, m10 ;m12 = m10, copy tab_idct_8x8[3] to m12
>+ pmaddwd m10, m4 ;m10 = E2l = _mm_madd_epi16(m4 = _m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
>+ pmaddwd m12, m8 ;m12 = E2h = _mm_madd_epi16(m8 = m128Tmp3, _mm_load_si128((__m128i*)(m12 = tab_idct_8x8[3])));
>+ paddd m7, m10 ;m7 = O1l = _mm_add_epi32(m7 = E1l, m10 = E2l);
>+ paddd m9, m12 ;m9 = O1h = _mm_add_epi32(m9 = E1h, m12);
>+ mova m10, [tab_idct8 + 4 * 16] ;m10 = tab_idct_8x8[4];
>+ mova m12, m10 ;m12 = m10, copy tab_idct_8x8[4] to m12
>+ pmaddwd m10, m14 ;m10 = E1l = _mm_madd_epi16(m14 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
>+ pmaddwd m12, m15 ;m12 = E1h = _mm_madd_epi16(m15 = m128Tmp1, _mm_load_si128((__m128i*)(m12 = tab_idct_8x8[4])));
>+ mova m11, [tab_idct8 + 5 * 16] ;m11 = tab_idct_8x8[5];
>+ mova m3, m11 ;m3 = m11, copy tab_idct_8x8[5] to m3
>+ pmaddwd m11, m4 ;m11 = E2l = _mm_madd_epi16(m4 = _m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
>+ pmaddwd m3, m8 ;m3 = E2h = _mm_madd_epi16(m8 = m128Tmp3, _mm_load_si128((__m128i*)(m3 = tab_idct_8x8[5])));
>+ paddd m10, m11 ;m10 = O2l = _mm_add_epi32(m10 = E1l, m11 = E2l);
>+ paddd m12, m3 ;m12 = O2h = _mm_add_epi32(m12 = E1h, m3 = E2h);
>+ pmaddwd m14, [tab_idct8 + 6 * 16] ;m14 = E1l = _mm_madd_epi16(m14 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
>+ pmaddwd m15, [tab_idct8 + 6 * 16] ;m15 = E1h = _mm_madd_epi16(m15 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
>+ pmaddwd m4, [tab_idct8 + 7 * 16] ;m4 = E2l = _mm_madd_epi16(m4 = m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
>+ pmaddwd m8, [tab_idct8 + 7 * 16] ;m8 = E2h = _mm_madd_epi16(m8 = m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
>+ paddd m15, m8 ;m15 = O3h = _mm_add_epi32(m15 = E1h, m8 = E2h);
>+ paddd m14, m4 ;m14 = O3l = _mm_add_epi32(m14 = E1l, m4 = E2l);
>+
>+ mova m4, m0 ;m4 = m0, copy m128iS0 to m4
>+ punpcklwd m0, m1 ;m0 = m128Tmp0 = _mm_unpacklo_epi16(m0 = m128iS0, m1 = m128iS4);
>+ mova m11, m0 ;m11 = m0, copy m128Tmp0 to m11
>+ pmaddwd m0, [tab_idct8 + 8 * 16] ;m0 = EE0l = _mm_madd_epi16(m0 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
>+ punpckhwd m4, m1 ;m4 = m128Tmp1 = _mm_unpackhi_epi16(m4 = m128iS0, m1 = m128iS4);
>+ mova m3, m4 ;m3 = m4, copy m128Tmp1 to m3
>+ pmaddwd m4, [tab_idct8 + 8 * 16] ;m4 = EE0h = _mm_madd_epi16(m4 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
>+ pmaddwd m11, [tab_idct8 + 9 * 16] ;m11 = EE1l = _mm_madd_epi16(m11 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
>+ pmaddwd m3, [tab_idct8 + 9 * 16] ;m3 = EE1h = _mm_madd_epi16(m3 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
>+
>+ mova m8, m2 ;m8 = m2, copy m128iS2 to m8
>+ punpcklwd m2, m5 ;m2 = m128Tmp0 = _mm_unpacklo_epi16(m2 = m128iS2, m5 = m128iS6);
>+ mova m1, m2 ;m1 = m2, copy m128Tmp0 to m1
>+ pmaddwd m2, [tab_idct8 + 10 * 16] ;m2 = E00l = _mm_madd_epi16(m2 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
>+ punpckhwd m8, m5 ;m8 = m128Tmp1 = _mm_unpackhi_epi16(m8 = m128iS2, m5 = m128iS6);
>+ mova m5, m8 ;m5 = m8, copy m128Tmp1 to m5
>+ mova [r5 - 1 * 16], m2 ;s_1 = m2, copy E00l to stack[1]
>+ pmaddwd m5, [tab_idct8 + 10 * 16] ;m5 = E00h = _mm_madd_epi16(m5 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
>+ pmaddwd m1, [tab_idct8 + 11 * 16] ;m1 = E01l = _mm_madd_epi16(m1 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
>+ pmaddwd m8, [tab_idct8 + 11 * 16] ;m8 = E01h = _mm_madd_epi16(m8 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
>+ mova [r5 - 2 * 16], m5 ;s_2 = m5, copy E00h to stack[2]
>+ paddd m2, m0 ;m2 = E0l = _mm_add_epi32(m0 = EE0l, m2 = E00l);
>+ paddd m2, [IDCT_ADD] ;m2 = E0l = _mm_add_epi32(m2 = E0l, IDCT_ADD);
>+ paddd m5, m4 ;m5 = E0h = _mm_add_epi32(m4 = EE0h, m5 = E00h);
>+ paddd m5, [IDCT_ADD] ;m5 = E0h = _mm_add_epi32(m5 = E0h, IDCT_ADD);
>+ psubd m0, [r5 - 1 * 16] ;m0 = E3l = _mm_sub_epi32(m0 = EE0l, s_1 = E00l);
>+ mova [r5 - 1 * 16],m1 ;s_1 = m1, copy E01l to stack[1]
>+ paddd m0, [IDCT_ADD] ;m0 = E3l = _mm_add_epi32(m0 = E3l, IDCT_ADD);
>+ psubd m4, [r5 - 2 * 16] ;m4 = E3h = _mm_sub_epi32(m4 = EE0h, s_2 = E00h);
>+ paddd m4, [IDCT_ADD] ;m4 = E3h = _mm_add_epi32(m4, IDCT_ADD);
>+ paddd m1, m11 ;m1 = E1l = _mm_add_epi32(m11 = EE1l, m1 = E01l);
>+ mova [r5 - 2 * 16], m8 ;s_2 = m8, copy = E01h to stack[2]
>+ paddd m1, [IDCT_ADD] ;m1 = E1l = _mm_add_epi32(m1 = E1l, IDCT_ADD);
>+ paddd m8, m3 ;m8 = E1h = _mm_add_epi32(m3 = EE1h, m8 = E01h);
>+ paddd m8, [IDCT_ADD] ;m8 = E1h = _mm_add_epi32(m8 = E1h, IDCT_ADD);
>+ psubd m11, [r5 - 1 * 16] ;m11 = E2l = _mm_sub_epi32(m11 = EE1l, s_1 = E01l);
>+ paddd m11, [IDCT_ADD] ;m11 = E2l = _mm_add_epi32(m11 = E2l, IDCT_ADD);
>+ psubd m3, [r5 - 2 * 16] ;m3 = E2h = _mm_sub_epi32(m3 = EE1h, s_2 = E01h);
>+ paddd m3, [IDCT_ADD] ;m3 = E2h = _mm_add_epi32(m3 = E2h, IDCT_ADD);
>+
>+ ;m13 = m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m2 = E0l, m13 = O0l), IDCT_SHIFT), _mm_srai_epi32(_mm_add_epi32(m5 = E0h, m6 = O0h), IDCT_SHIFT));
>+ mova [r5 - 1 * 16], m13 ;s_1 = m13, copy O0l to stack[1]
>+ paddd m13, m2 ;m13 + m2, add E0l and O0l
>+ psrad m13, IDCT_SHIFT ;m13 = _mm_srai_epi32(m13, IDCT_SHIFT)
>+ mova [r5 - 2 * 16], m6 ;s_2 = m6, copy O0h to stack[2]
>+ paddd m6, m5 ;m6 + m5, add O0h and E0h
>+ psrad m6, IDCT_SHIFT ;m6 = _mm_srai_epi32(m6, IDCT_SHIFT)
>+ packssdw m13, m6 ;m13 = m128iS0
>+ ;m2 = m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m2 = E0l, s_1 = O0l), IDCT_SHIFT), _mm_srai_epi32(_mm_sub_epi32(m5 = E0h, s_2 = O0h), IDCT_SHIFT));
>+ psubd m2, [r5 - 1 * 16] ;m2 - s_1, E0l minus O0l
>+ psrad m2, IDCT_SHIFT ;m2 = _mm_srai_epi32(m2, IDCT_SHIFT)
>+ psubd m5, [r5 - 2 * 16] ;m5 - s_2, E0h minus O0h
>+ psrad m5, IDCT_SHIFT ;m5 = _mm_srai_epi32(m5, IDCT_SHIFT)
>+ packssdw m2, m5 ;m2 = m128iS7
>+ ;m1 = m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m1 = E1l, m7 = O1l), IDCT_SHIFT), _mm_srai_epi32(_mm_add_epi32(m8 = E1h, m9 = O1h), IDCT_SHIFT));
>+ mova m5, m1 ;m5 = m1, copy E1l to m5
>+ paddd m1, m7 ;m1 + m7, add E1l and O1l
>+ psrad m1, IDCT_SHIFT ;m1 = _mm_srai_epi32(m1, IDCT_SHIFT)
>+ mova m6, m8 ;m6 = m8, copy E1h to m6
>+ paddd m8, m9 ;m8 + m9, add E1h and O1h
>+ psrad m8, IDCT_SHIFT ;m8 = _mm_srai_epi32(m8, IDCT_SHIFT)
>+ packssdw m1, m8 ;m1 = m128iS1
>+ ;m5 = m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m5 = E1l, m7 = O1l), IDCT_SHIFT), _mm_srai_epi32(_mm_sub_epi32(m6 = E1h, m9 = O1h), IDCT_SHIFT));
>+ psubd m5, m7 ;m5 - m7, E1l minus O1l
>+ psrad m5, IDCT_SHIFT ;m5 = _mm_srai_epi32(m5, IDCT_SHIFT)
>+ psubd m6, m9 ;m6 - m9, E1h minus O1h
>+ psrad m6, IDCT_SHIFT ;m6 = _mm_srai_epi32(m6, IDCT_SHIFT)
>+ packssdw m5, m6 ;m5 = m128iS6
>+ ;m11 = m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m11 = E2l, m10 = O2l), IDCT_SHIFT), _mm_srai_epi32(_mm_add_epi32(m3 = E2h, m12 = O2h), IDCT_SHIFT));
>+ mova m6, m11 ;m6 = m11, copy E2l to m6
>+
>+ paddd m11, m10 ;m11 + m10, add E2l and O2l
>+ psrad m11, IDCT_SHIFT ;m11 = _mm_srai_epi32(m11, IDCT_SHIFT)
>+ mova m7, m3 ;m7 = m3, copy E2h to m7
>+ paddd m3, m12 ;m3 + m12, add E2h and O2h
>+ psrad m3, IDCT_SHIFT ;m3 = _mm_srai_epi32(m3, IDCT_SHIFT)
>+ packssdw m11, m3 ;m11 = m128iS2
>+ ;m6 = m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m6 = E2l, m10 = O2l), IDCT_SHIFT), _mm_srai_epi32(_mm_sub_epi32(m7 = E2h, m12 = O2h), IDCT_SHIFT));
>+ psubd m6, m10 ;m6 - m10, E2l minus O2l
>+ psrad m6, IDCT_SHIFT ;m6 = _mm_srai_epi32(m6, IDCT_SHIFT)
>+ psubd m7, m12 ;m7 - m12, E2h minus O2h
>+ psrad m7, IDCT_SHIFT ;m7 = _mm_srai_epi32(m7, IDCT_SHIFT)
>+ packssdw m6, m7 ;m6 = m128iS5
>+ ;m0 = m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m0 = E3l, m14 = O3l), IDCT_SHIFT), _mm_srai_epi32(_mm_add_epi32(m4 = E3h, m15 = O3h), IDCT_SHIFT));
>+ mova m8, m0 ;m8 = m0, copy E3l to m8
>+ paddd m0, m14 ;m0 + m14, add E3l and O3l
>+ psrad m0, IDCT_SHIFT ;m0 = _mm_srai_epi32(m0, IDCT_SHIFT)
>+ mova m7, m4 ;m7 = m4, copy E3h to m7
>+ paddd m4, m15 ;m4 + m15, add E3h and O3h
>+ psrad m4, IDCT_SHIFT ;m4 = _mm_srai_epi32(m4, IDCT_SHIFT)
>+ packssdw m0, m4 ;m0 = m128iS3
>+ ;m8 = m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m8 = E3l, m14 = O3l), IDCT_SHIFT), _mm_srai_epi32(_mm_sub_epi32(m7 = E3h, m15 = O3h), IDCT_SHIFT));
>+ psubd m8, m14 ;m8 - m14, E3l minus O3l
>+ psrad m8, IDCT_SHIFT ;m8 = _mm_srai_epi32(m8, IDCT_SHIFT)
>+ psubd m7, m15 ;m7 - m15, E3h minus O3h
>+ psrad m7, IDCT_SHIFT ;m7 = _mm_srai_epi32(m7, IDCT_SHIFT)
>+ packssdw m8, m7 ;m8 = m128iS4
>+
>+; // [07 06 05 04 03 02 01 00]
>+; // [17 16 15 14 13 12 11 10]
>+; // [27 26 25 24 23 22 21 20]
>+; // [37 36 35 34 33 32 31 30]
>+; // [47 46 45 44 43 42 41 40]
>+; // [57 56 55 54 53 52 51 50]
>+; // [67 66 65 64 63 62 61 60]
>+; // [77 76 75 74 73 72 71 70]
>+
>+ mova m3, m13 ;m3 = m13, copy m128iS0 to m3
>+ punpcklwd m3, m1 ;m3 = T00 = _mm_unpacklo_epi16(m3 = m128iS0, m1 = m128iS1); // [13 03 12 02 11 01 10 00]
>+ punpckhwd m13, m1 ;m13 = T01 = _mm_unpackhi_epi16(m13 = m128iS0,m1 = m128iS1); // [17 07 16 06 15 05 14 04]
>+ mova m4, m11 ;m4 = m11, copy m128iS2 to m4
>+ punpcklwd m11, m0 ;m11 = T02 = _mm_unpacklo_epi16(m11 = m128iS2, m0 = m128iS3); // [33 23 32 22 31 21 30 20]
>+ punpckhwd m4, m0 ;m4 = T03 = _mm_unpackhi_epi16(m4 = m128iS2, m0 = m128iS3); // [37 27 36 26 35 25 34 24]
>+ mova m1, m8 ;m1 = m8, copy m128iS4 to m1
>+ punpcklwd m8, m6 ;m8 = T04 = _mm_unpacklo_epi16(m8 = m128iS4, m6 = m128iS5); // [53 43 52 42 51 41 50 40]
>+ punpckhwd m1, m6 ;m1 = T05 = _mm_unpackhi_epi16(m1 = m128iS4, m6 = m128iS5); // [57 47 56 46 55 45 54 44]
>+ mova m7, m5 ;m7 = m5, copy m128iS6 to m7
>+ punpcklwd m5, m2 ;m5 = T06 = _mm_unpacklo_epi16(m5 = m128iS6, m2 = m128iS7); // [73 63 72 62 71 61 70 60]
>+ punpckhwd m7, m2 ;m7 = T07 = _mm_unpackhi_epi16(m7 = m128iS6, m2 = m128iS7); // [77 67 76 66 75 65 74 64]
>+
>+ ;__m128i T10, T11;
>+ mova m0, m3 ;m0 = m3, copy T00 to m0
>+ punpckldq m0, m11 ;m0 = T10 = _mm_unpacklo_epi32(m0 = T00, m11 = T02); // [31 21 11 01 30 20 10 00]
>+ punpckhdq m3, m11 ;m3 = T11 = _mm_unpackhi_epi32(m3 = T00, m11 = T02); // [33 23 13 03 32 22 12 02]
>+; lea r2, [r2 + r2] ;set r2 to index of 1
>+; lea r4, [r2 + r2] ;set r4 to index of 2
>+; lea r3, [r4 + r2] ;set r3 to index of 3
>+; lea r4, [r4 + r3] ;set r4 to index of 5
>+ lea r0, [r4 + r2 * 2] ;set r0 to index of 7 (src fully consumed; r0 reused as a row-7 offset)
>+ ;NOTE(review): rows 0-3 are re-stored in full via mova below; these partial movq/movhps stores are redundant and could be dropped
>+ movq [r1], m0 ;_mm_storel_epi64((__m128i*)&dst[0 * stride + 0], m0 = T10); // [30 20 10 00]
>+ movq [r1 + r2 * 2], m3 ;_mm_storel_epi64((__m128i*)&dst[2 * stride + 0], m3 = T11); // [32 22 12 02]
>+ movhps [r1 + r2 * 1], m0 ;_mm_storeh_pi((__m64*)&dst[1 * stride + 0], _mm_castsi128_ps(m0 = T10)); // [31 21 11 01]
>+ movhps [r1 + r3], m3 ;_mm_storeh_pi((__m64*)&dst[3 * stride + 0], _mm_castsi128_ps(m3 = T11)); // [33 23 13 03]
>+
>+ mova m2, m8 ;m2 = m8, copy T04 to m2
>+ punpckldq m2, m5 ;m2 = T10 = _mm_unpacklo_epi32(m2 = T04, m5 = T06); // [71 61 51 41 70 60 50 40]
>+ punpckhdq m8, m5 ;m8 = T11 = _mm_unpackhi_epi32(m8 = T04, m5 = T06); // [73 63 53 43 72 62 52 42]
>+ movq [r1 + 8], m2 ;_mm_storel_epi64((__m128i*)&dst[0 * stride + 4], m2 = T10);
>+ movq [r1 + r2 * 2 + 8], m8 ;_mm_storel_epi64((__m128i*)&dst[2 * stride + 4], m8 = T11);
>+ movhps [r1 + r2 * 1 + 8], m2 ;_mm_storeh_pi((__m64*)&dst[1 * stride + 4], _mm_castsi128_ps(m2 = T10));
>+ ;(removed: exact duplicate of the movq store to [r1 + r2 * 2 + 8] two lines above; m8 is unchanged in between)
>+
>+ movhps [r1 + r3 + 8], m8 ;_mm_storeh_pi((__m64*)&dst[3 * stride + 4], _mm_castsi128_ps(m8 = T11));
>+ mova m11, m0
>+ punpcklqdq m0, m2
>+ punpckhqdq m11, m2
>+ mova [r1], m0
>+ mova [r1 + r2 * 1], m11
>+
>+ mova m5, m3
>+ punpcklqdq m3, m8
>+ punpckhqdq m5, m8
>+ mova [r1 + r2 * 2], m3
>+ mova [r1 + r3], m5
>+
>+ mova m6, m13 ;m6 = m13, copy T01 to m6
>+ punpckldq m6, m4 ;m6 = T10 = _mm_unpacklo_epi32(m6 = T01, m4 = T03); // [35 25 15 05 34 24 14 04]
>+ punpckhdq m13, m4 ;m13 = T11 = _mm_unpackhi_epi32(m13 = T01, m4 = T03); // [37 27 17 07 36 26 16 06]
>+ movq [r1 + r2 * 4], m6 ;_mm_storel_epi64((__m128i*)&dst[4 * stride + 0], m6 = T10);
>+ movq [r1 + r3 * 2], m13 ;_mm_storel_epi64((__m128i*)&dst[6 * stride + 0], m13 = T11);
>+ mova m9, m1 ;m9 = m1, copy T05 to m9
>+ movhps [r1 + r4], m6 ;_mm_storeh_pi((__m64*)&dst[5 * stride + 0], _mm_castsi128_ps(m6 = T10));
>+ movhps [r1 + r0], m13 ;_mm_storeh_pi((__m64*)&dst[7 * stride + 0], _mm_castsi128_ps(m13 = T11));
>+
>+ punpckldq m1, m7 ;m1 = T10 = _mm_unpacklo_epi32(m1 = T05, m7 = T07); // [75 65 55 45 74 64 54 44]
>+ punpckhdq m9, m7 ;m9 = T11 = _mm_unpackhi_epi32(m9 = T05, m7 = T07); // [77 67 57 47 76 66 56 46]
>+ movq [r1 + r2 * 4 + 8], m1 ;_mm_storel_epi64((__m128i*)&dst[4 * stride + 4], m1 = T10);
>+ movq [r1 + r3 * 2 + 8], m9 ;_mm_storel_epi64((__m128i*)&dst[6 * stride + 4], m9 = T11);
>+ movhps [r1 + r4 + 8], m1 ;_mm_storeh_pi((__m64*)&dst[5 * stride + 4], _mm_castsi128_ps(m1 = T10));
>+ movhps [r1 + r0 + 8], m9 ;_mm_storeh_pi((__m64*)&dst[7 * stride + 4], _mm_castsi128_ps(m9 = T11));
>+
>+ RET
>+%undef IDCT_SHIFT
>+%undef IDCT_ADD
>+
>+;-------------------------------------------------------
> ; void idct8(int16_t *src, int16_t *dst, intptr_t stride)
> ;-------------------------------------------------------
> INIT_XMM ssse3
>diff -r 1d17ec0cb954 -r f4a932dba699 source/common/x86/dct8.h
>--- a/source/common/x86/dct8.h Thu Nov 20 20:04:02 2014 +0530
>+++ b/source/common/x86/dct8.h Thu Nov 20 19:21:53 2014 -0800
>@@ -35,6 +35,7 @@
> void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
> void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
> void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
>+void x265_idct8_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
> void x265_idct8_ssse3(const int16_t* src, int16_t* dst, intptr_t stride);
> void x265_idct8_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
> void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
>
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20141121/24f26fdb/attachment-0001.html>
More information about the x265-devel
mailing list