[x265] [PATCH] asm: idct8 sse2
Steve Borho
steve at borho.org
Fri Nov 21 07:21:21 CET 2014
On 11/20, dtyx265 at gmail.com wrote:
> # HG changeset patch
> # User David T Yuen <dtyx265 at gmail.com>
> # Date 1416540113 28800
> # Node ID f4a932dba6993d8bcff3ddea7dc8c83c55d52396
> # Parent 1d17ec0cb9548194b90495c5d7c94552c71abbf5
> asm: idct8 sse2
>
> this version is based directly on the sse3 intrinsic
> basically, it's the intrinsic version with almost no optimizations
> but I thought it might be a better starting point for optimization
> than gcc's optimized output
Not bad for a start.
$ ./test/TestBench --test trans --cpu SSE2 | grep idct8x8
idct8x8 6.52x 1273.51 8304.11
$ ./test/TestBench --test trans --cpu SSSE3 | grep idct8x8
idct8x8 5.77x 1273.45 7345.68
$ ./test/TestBench --test trans --cpu AVX2 | grep idct8x8
idct8x8 8.28x 887.04 7345.68
Only the middle column there is really meaningful, but you can see that your
SSE2 assembly routine is roughly the same speed as the intrinsic routine
using SSSE3, which tells me we can remove the intrinsic function as soon
as this routine is finished.
> diff -r 1d17ec0cb954 -r f4a932dba699 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Thu Nov 20 20:04:02 2014 +0530
> +++ b/source/common/x86/asm-primitives.cpp Thu Nov 20 19:21:53 2014 -0800
> @@ -1376,6 +1376,7 @@
> p.dct[DCT_4x4] = x265_dct4_sse2;
> p.idct[IDCT_4x4] = x265_idct4_sse2;
> p.idct[IDST_4x4] = x265_idst4_sse2;
> + p.idct[IDCT_8x8] = x265_idct8_sse2;
>
> LUMA_SS_FILTERS(_sse2);
> }
> @@ -1564,6 +1565,7 @@
>
> p.dct[DCT_4x4] = x265_dct4_sse2;
> p.idct[IDCT_4x4] = x265_idct4_sse2;
> + p.idct[IDCT_8x8] = x265_idct8_sse2;
> p.idct[IDST_4x4] = x265_idst4_sse2;
>
> p.planecopy_sp = x265_downShift_16_sse2;
> diff -r 1d17ec0cb954 -r f4a932dba699 source/common/x86/dct8.asm
> --- a/source/common/x86/dct8.asm Thu Nov 20 20:04:02 2014 +0530
> +++ b/source/common/x86/dct8.asm Thu Nov 20 19:21:53 2014 -0800
> @@ -302,6 +302,19 @@
>
> pb_idct8odd: db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15
>
> +tab_idct8: times 4 dw 89, 75
> + times 4 dw 50, 18
> + times 4 dw 75, -18
> + times 4 dw -89, -50
> + times 4 dw 50, -89
> + times 4 dw 18, 75
> + times 4 dw 18, -50
> + times 4 dw 75, -89
> + times 4 dw 64, 64
> + times 4 dw 64, -64
> + times 4 dw 83, 36
> + times 4 dw 36, -83
> +
> SECTION .text
> cextern pd_1
> cextern pd_2
> @@ -974,6 +987,437 @@
> RET
>
> ;-------------------------------------------------------
> +; void idct8(int16_t *src, int16_t *dst, intptr_t stride)
> +;-------------------------------------------------------
> +INIT_XMM sse2
> +
> +%if BIT_DEPTH == 10
> + %define IDCT_SHIFT 10
> + %define IDCT_ADD pd_512
> +%elif BIT_DEPTH == 8
> + %define IDCT_SHIFT 12
> + %define IDCT_ADD pd_2048
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> +
> +cglobal idct8, 3,7, 16
> + lea r2, [r2 + r2] ;set r2 to index of 1
> + lea r4, [r2 + r2] ;set r4 to index of 2
> + lea r3, [r4 + r2] ;set r3 to index of 3
> + lea r4, [r4 + r3] ;set r4 to index of 5
> + mov r5, rsp
> + and r5, ~(16-1)
I'm hoping Min or Praveen can comment on how to improve the assembly
code itself. The one thing I can point out is that we prefer not to have
the intrinsic code repeated in the comments.
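For instance, the first load below carries the whole _mm_load_si128()
call along in its comment; a short note of what the register holds is
all we need (a sketch of the style, not required wording):

    movaps  m6, [r0 + 1 * 16]    ; m6 = coefficient row 1 (src[8..15])

Comments like that stay correct when registers get reshuffled during
optimization, while copied intrinsic lines go stale almost immediately.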
> + movaps m6, [r0 + 1 * 16] ;m6 = m128iS1 = _mm_load_si128((__m128i*)&src[8 + 0]);
> + movaps m15, [r0 + 3 * 16] ;m15 = m128iS3 = _mm_load_si128((__m128i*)&src[24 + 0]);
> + mova m7, m6 ;m7 = m6, copy m128iS1 to m1
> + punpcklwd m6, m15 ;m6 = m128Tmp0 = _mm_unpacklo_epi16(m6 = m128iS1, m15 = m128iS3);
> + mova m0, [tab_idct8 + 0 * 16] ;m0 = tab_idct_8x8[0];
> + mova m1, m0 ;m1 = m0, copy tab_idct_8x8[0] to m1
> + pmaddwd m0, m6 ;m0 = E1l = _mm_madd_epi16(m6 = m128Tmp0, _mm_load_si128((__m128i*)(m0 = tab_idct_8x8[0])));
> + punpckhwd m7, m15 ;m7 = m128Tmp1 = _mm_unpackhi_epi16(m7 = m128iS1 , m15 = m128iS3);
> + mova m12, [r0 + 5 * 16] ;m12 = m128iS5 = _mm_load_si128((__m128i*)&src[40 + 0]);
> + pmaddwd m1, m7 ;m1 = E1h = _mm_madd_epi16(m7 = m128Tmp1, _mm_load_si128((__m128i*)(m1 = tab_idct_8x8[0])));
> + movu m15, [r0 + 7 * 16] ;m15 = m128iS7 = _mm_load_si128((__m128i*)&src[56 + 0]);
> + mova m13, m12 ;m13 = m12, copy m128iS5 to m13
> + punpcklwd m12, m15 ;m12 = m128Tmp2 = _mm_unpacklo_epi16(m12 = m128iS5, m15 = m128iS7);
> + movu m8, [tab_idct8 + 1 * 16] ;m8 = tab_idct_8x8[1];
> + movu m9, m8 ;m9 = m8, copy tab_idct_8x8[1] to m9
> + pmaddwd m8, m12 ;m8 = E2l = _mm_madd_epi16(m12 = m128Tmp2, _mm_load_si128((__m128i*)(m8 = tab_idct_8x8[1])));
> + punpckhwd m13, m15 ;m13 = m128Tmp3 = _mm_unpackhi_epi16(m13 = m128iS5, m15 = m128iS7);
> + pmaddwd m9, m13 ;m9 = E2h = _mm_madd_epi16(m13 = m128Tmp3, _mm_load_si128((__m128i*)(m9 = tab_idct_8x8[1])));
> + paddd m0, m8 ;m0 = O0l = _mm_add_epi32(m0 = E1l, m8 = E2l);
> + paddd m1, m9 ;m1 = O0h = _mm_add_epi32(m1 = E1h, m9 = E2h);
> + mova m2, [tab_idct8 + 2 * 16] ;m2 = tab_idct_8x8[2];
> + mova m3, m2 ;m3 = m2, copy tab_idct_8x8[2] to m3
> + pmaddwd m2, m6 ;m2 = E1l = _mm_madd_epi16(m6 = m128Tmp0, _mm_load_si128((__m128i*)(m2 = tab_idct_8x8[2])));
> + pmaddwd m3, m7 ;m3 = E1h = _mm_madd_epi16(m7 = m128Tmp1, _mm_load_si128((__m128i*)(m3 = tab_idct_8x8[2])));
> + mova m8, [tab_idct8 + 3 * 16] ;m8 = tab_idct_8x8[3];
> + mova m9, m8 ;m9 = m8, copy tab_idct_8x8[3] to m9
> + pmaddwd m8, m12 ;m8 = E2l = _mm_madd_epi16(m12 = m128Tmp2, _mm_load_si128((__m128i*)(m8 = tab_idct_8x8[3])));
> + pmaddwd m9, m13 ;m9 = E2h = _mm_madd_epi16(m13 = m128Tmp3, _mm_load_si128((__m128i*)(m9 = tab_idct_8x8[3])));
> + paddd m2, m8 ;m2 = O1l = _mm_add_epi32(m2 = E1l, m8 = E2l);
> + paddd m3, m9 ;m3 = O1h = _mm_add_epi32(m3 = E1h, m9 = E2h);
> + mova m4, [tab_idct8 + 4 * 16] ;m4 = tab_idct_8x8[4];
> + mova m5, m4 ;m5 = m4, copy tab_idct_8x8[4] to m5
> + pmaddwd m4, m6 ;m4 = E1l = _mm_madd_epi16(m6 = m128Tmp0, _mm_load_si128((__m128i*)(m4 = tab_idct_8x8[4])));
> + pmaddwd m5, m7 ;m5 = E1h = _mm_madd_epi16(m7 = m128Tmp1, _mm_load_si128((__m128i*)(m5 = tab_idct_8x8[4])));
> + mova m8, [tab_idct8 + 5 * 16] ;m8 = tab_idct_8x8[5];
> + mova m9, m8 ;m9 = m8, copy tab_idct_8x8[5] to m9
> + pmaddwd m8, m12 ;m8 = E2l = _mm_madd_epi16(m12 = m128Tmp2, _mm_load_si128((__m128i*)(m8 = tab_idct_8x8[5])));
> + pmaddwd m9, m13 ;m9 = E2h = _mm_madd_epi16(m13 = m128Tmp3, _mm_load_si128((__m128i*)(m9 = tab_idct_8x8[5])));
> + paddd m4, m8 ;m4 = O2l = _mm_add_epi32(m4 = E1l, m8 = E2l);
> + paddd m5, m9 ;m5 = O2h = _mm_add_epi32(m5 = E1h, m9 = E2h);
> + pmaddwd m6, [tab_idct8 + 6 * 16] ;m6 = E1l = _mm_madd_epi16(m6 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
> + pmaddwd m7, [tab_idct8 + 6 * 16] ;m7 = E1h = _mm_madd_epi16(m7 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
> + pmaddwd m12, [tab_idct8 + 7 * 16] ;m12 = E2l = _mm_madd_epi16(m12 = m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
> + pmaddwd m13, [tab_idct8 + 7 * 16] ;m13 = E2h = _mm_madd_epi16(m13 = m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
> + paddd m6, m12 ;m6 = O3l = _mm_add_epi32(m6 = E1l, m12 = E2l);
> + paddd m7, m13 ;m7 = O3h = _mm_add_epi32(m7 = E1h, m13 = E2h);
> +
> + ;/* ------- */
> +
> + mova m8, [r0 + 0 * 16] ;m8 = m128iS0 = _mm_load_si128((__m128i*)&src[0 + 0]);
> + mova m15, [r0 + 4 * 16] ;m15 = m128iS4 = _mm_load_si128((__m128i*)&src[32 + 0]);
> + mova m9, m8 ;m9 = m8, copy m128iS0 to m9
> + punpcklwd m8, m15 ;m8 = m128Tmp0 = _mm_unpacklo_epi16(m8 = m128iS0 , m15 = m128iS4);
> + mova m10, m8 ;m10 = m8, copy m128Tmp0 to m10
> + pmaddwd m8, [tab_idct8 + 8 * 16] ;m8 = EE0l = _mm_madd_epi16(m8 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
> + punpckhwd m9, m15 ;m9 = m128Tmp1 = _mm_unpackhi_epi16(m9 = m128iS0, m15 = m128iS4);
> + mova m11, m9 ;m11 = m9, copy m128Tmp1 to m11
> + pmaddwd m9, [tab_idct8 + 8 * 16] ;m9 = EE0h = _mm_madd_epi16(m9 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
> + pmaddwd m10, [tab_idct8 + 9 * 16] ;m10 = EE1l = _mm_madd_epi16(m10 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
> + pmaddwd m11, [tab_idct8 + 9 * 16] ;m11 = EE1h = _mm_madd_epi16(m11 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
> +
> + ;/* ------- */
> +
> + mova m12, [r0 + 2 * 16] ;m12 = m128iS2 = _mm_load_si128((__m128i*)&src[16 + 0]);
> + mova m15, [r0 + 6 * 16] ;m15 = m128iS6 = _mm_load_si128((__m128i*)&src[48 + 0]);
> + mova m13, m12 ;m13 = m12, copy m128iS2 to m13
> + punpcklwd m12, m15 ;m12 = m128Tmp0 = _mm_unpacklo_epi16(m12 = m128iS2, m15 = m128iS6);
> + mova m14, m12 ;m14 = m12, copy m128Tmp0 to m14
> + pmaddwd m12, [tab_idct8 + 10 * 16] ;m12 = E00l = _mm_madd_epi16(m12 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
> + punpckhwd m13, m15 ;m13 = m128Tmp1 = _mm_unpackhi_epi16(m13 = m128iS2, m15 = m128iS6);
> + mova m15, m13 ;m15 = m13, copy m128Tmp1 to m15
> + pmaddwd m13, [tab_idct8 + 10 * 16] ;m13 = E00h = _mm_madd_epi16(m13 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
> + pmaddwd m14, [tab_idct8 + 11 * 16] ;m14 = E01l = _mm_madd_epi16(m14 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
> + pmaddwd m15, [tab_idct8 + 11 * 16] ;m15 = E01h = _mm_madd_epi16(m15 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
> + mova [r5 - 1 * 16], m12 ;s_1 = m12, copy E00l to stack[1]
> + paddd m12, m8 ;m12 = E0l = _mm_add_epi32(m8 = EE0l, m12 = E00l);
> + paddd m12, [pd_64] ;m12 = E0l = _mm_add_epi32(m12 = E0l, pd_64);
> + mova [r5 - 2 * 16], m13 ;s_2 = m13, copy E00h to stack[2]
> + paddd m13, m9 ;m13 = E0h = _mm_add_epi32(m9 = EE0h, m13 = E00h);
> + paddd m13, [pd_64] ;m13 = E0h = _mm_add_epi32(m13 = E0h, pd_64);
> + psubd m8, [r5 - 1 * 16] ;m8 = E3l = _mm_sub_epi32(m8 = EE0l, s_1 = E00l);
> + psubd m9, [r5 - 2 * 16] ;m9 = E3h = _mm_sub_epi32(m9 = EE0h, s_2 = E00h);
> + paddd m8, [pd_64] ;m8 = E3l = _mm_add_epi32(m8 = E3l, pd_64);
> + mova [r5 - 1 * 16], m14 ;s_1 = m14, copy E01l to stack[1]
> + paddd m9, [pd_64] ;m9 = E3h = _mm_add_epi32(m9 = E3h, pd_64);
> + paddd m14, m10 ;m14 = E1l = _mm_add_epi32(m10 = EE1l, m14 = E01l);
> + mova [r5 - 2 * 16], m15 ;s_2 = m15, copy E01h to stack[2]
> + paddd m14, [pd_64] ;m14 = E1l = _mm_add_epi32(m14 = E1l, pd_64);
> + paddd m15, m11 ;m15 = E1h = _mm_add_epi32(m11 = EE1h, m15 = E01h);
> + paddd m15, [pd_64] ;m15 = E1h = _mm_add_epi32(m15 = E1h, pd_64);
> + psubd m10, [r5 - 1 * 16] ;m10 = E2l = _mm_sub_epi32(m10 = EE1l, s_1 = E01l);
> + mova [r5 - 1 * 16], m0 ;s_1 = m0, copy O0l to stack[1]
> + paddd m10, [pd_64] ;m10 = E2l = _mm_add_epi32(m10 = E2l, pd_64);
> + psubd m11, [r5 - 2 * 16] ;m11 = E2h = _mm_sub_epi32(m11 = EE1h, s_2 = E01h);
> + paddd m11, [pd_64] ;m11 = E2h = _mm_add_epi32(m11 = E2h, pd_64);
> + mova [r5 - 2 * 16], m1 ;s_2 = m1, copy O0h to stack[2]
> + ;m0 = m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m12 = E0l, m0 = O0l), 7), _mm_srai_epi32(_mm_add_epi32(m13 = E0h, m1 = O0h), 7));
> + paddd m0, m12 ;m0 = _mm_add_epi32(m12 = E0l, m0 = O0l)
> + psrad m0, 7 ;m0 = _mm_srai_epi32(m0, 7)
> + paddd m1, m13 ;m1 = _mm_add_epi32(m13 = E0h, m1 = O0h)
> + psrad m1, 7 ;m1 = _mm_srai_epi32(m1, 7)
> + packssdw m0, m1 ;m0 = m128iS0 = _mm_packs_epi32(m0 , m1)
> + ;m12 = m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m12 = E0l, s_1 = O0l), 7), _mm_srai_epi32(_mm_sub_epi32(m13 = E0h, s_2 = O0h), 7));
> + psubd m12, [r5 - 1 * 16] ;m12 = _mm_sub_epi32(m12 = E0l, s_1 = O0l)
> + psrad m12, 7 ;m12 = _mm_srai_epi32(m12, 7)
> + psubd m13, [r5 - 2 * 16] ;m13 = _mm_sub_epi32(m13 = E0h, s_2 = O0h)
> + psrad m13, 7 ;m13 = _mm_srai_epi32(m13, 7)
> + packssdw m12, m13 ;m12 = m128iS7 = _mm_packs_epi32(m12, m13)
> + ;m2 = m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m14 = E1l, m2 = O1l), 7), _mm_srai_epi32(_mm_add_epi32(m15 = E1h, m3 = O1h), 7));
> + mova m1, m2 ;m1 = m2, copy O1l to m1
> + mova m13, m3 ;m13 = m3, copy O1h to m13
> + paddd m2, m14 ;m2 = _mm_add_epi32(m14 = E1l, m2 = O1l)
> + psrad m2, 7 ;m2 = _mm_srai_epi32(m2, 7)
> + paddd m3, m15 ;m3 = _mm_add_epi32(m15 = E1h, m3 = O1h)
> + psrad m3, 7 ;m3 = _mm_srai_epi32(m3, 7)
> + packssdw m2, m3 ;m2 = m128iS1 = _mm_packs_epi32(m2, m3)
> + ;m14 = m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m14 = E1l, m1 = O1l), 7), _mm_srai_epi32(_mm_sub_epi32(m15 = E1h, m13 = O1h), 7));
> + psubd m14, m1 ;m14 = _mm_sub_epi32(m14 = E1l, m1 = O1l)
> + psrad m14, 7 ;m14 = _mm_srai_epi32(m14, 7)
> + psubd m15, m13 ;m15 = _mm_sub_epi32(m15 = E1h, m13 = O1h)
> + psrad m15, 7 ;m15 = _mm_srai_epi32(m15, 7)
> + packssdw m14, m15 ;m14 = m128iS6 = _mm_packs_epi32(m14, m15)
> + ;m4 = m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m10 = E2l, m4 = O2l), 7), _mm_srai_epi32(_mm_add_epi32(m11 = E2h, m5 = O2h), 7));
> + mova m3, m4 ;m3 = m4, copy O2l to m3
> + mova m1, m5 ;m1 = m5, copy O2h to m1
> + paddd m4, m10 ;m4 = _mm_add_epi32(m10 = E2l, m4 = O2l)
> + psrad m4, 7 ;m4 = _mm_srai_epi32(m4, 7)
> + paddd m5, m11 ;m5 = _mm_add_epi32(m11 = E2h, m5 = O2h)
> + psrad m5, 7 ;m5 = _mm_srai_epi32(m5, 7)
> + packssdw m4, m5 ;m4 = m128iS2 = _mm_packs_epi32(m4, m5)
> + ;m10 = m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m10 = E2l, m3 = O2l), 7), _mm_srai_epi32(_mm_sub_epi32(m11 = E2h, m1 = O2h), 7));
> + psubd m10, m3 ;m10 = _mm_sub_epi32(m10 = E2l, m3 = O2l)
> + psrad m10, 7 ;m10 = _mm_srai_epi32(m10, 7)
> + psubd m11, m1 ;m11 = _mm_sub_epi32(m11 = E2h, m1 = O2h)
> + psrad m11, 7 ;m11 = _mm_srai_epi32(m11, 7)
> + packssdw m10, m11 ;m10 = m128iS5 = _mm_packs_epi32(m10, m11)
> + ;m6 = m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m8 = E3l, m6 = O3l), 7), _mm_srai_epi32(_mm_add_epi32(m9 = E3h, m7 = O3h), 7));
> + mova m13, m6 ;m13 = m6, copy O3l to m13
> + paddd m6, m8 ;m6 = _mm_add_epi32(m8 = E3l, m6 = O3l)
> + psrad m6, 7 ;m6 = _mm_srai_epi32(m6, 7)
> + mova m15, m7 ;m15 = m7, copy O3h to m15
> + paddd m7, m9 ;m7 = _mm_add_epi32(m9 = E3h, m7 = O3h)
> + psrad m7, 7 ;m7 = _mm_srai_epi32(m7, 7)
> + packssdw m6, m7 ;m6 = m128iS3 = _mm_packs_epi32(m6, m7)
> + ;m8 = m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m8 = E3l, m13 = O3l), 7), _mm_srai_epi32(_mm_sub_epi32(m9 = E3h, m15 = O3h), 7));
> + psubd m8, m13 ;m8 = _mm_sub_epi32(m8 = E3l, m13 = O3l)
> + psrad m8, 7 ;m8 = _mm_srai_epi32(m8, 7)
> + psubd m9, m15 ;m9 = _mm_sub_epi32(m9 = E3h, m15 = O3h)
> + psrad m9, 7 ;m9 = _mm_srai_epi32(m9, 7)
> + packssdw m8, m9 ;m8 = m128iS4 = _mm_packs_epi32(m8, m9)
> + ; /* Invers matrix */
> +
> + mova m1, m0 ;m1 = m0, copy m128iS0 to m1
> + punpcklwd m0, m8 ;m0 = E0l = _mm_unpacklo_epi16(m0 = m128iS0, m8 = m128iS4);
> + mova m3, m2 ;m3 = m2, copy m128iS1 to m3
> + punpcklwd m2, m10 ;m2 = E1l = _mm_unpacklo_epi16(m2 = m128iS1, m10 = m128iS5);
> + mova m5, m4 ;m5 = m4, copy m128iS2 to m5
> + punpcklwd m4, m14 ;m4 = E2l = _mm_unpacklo_epi16(m4 = m128iS2, m14 = m128iS6);
> + mova m7, m6 ;m7 = m6, copy m128iS3 to m7
> + punpcklwd m6, m12 ;m6 = E3l = _mm_unpacklo_epi16(m6 = m128iS3, m12 = m128iS7);
> + punpckhwd m1, m8 ;m1 = O0l = _mm_unpackhi_epi16(m1 = m128iS0, m8 = m128iS4);
> + punpckhwd m3, m10 ;m3 = O1l = _mm_unpackhi_epi16(m3 = m128iS1, m10 = m128iS5);
> + punpckhwd m5, m14 ;m5 = O2l = _mm_unpackhi_epi16(m5 = m128iS2, m14 = m128iS6);
> + punpckhwd m7, m12 ;m7 = O3l = _mm_unpackhi_epi16(m7 = m128iS3, m12 = m128iS7);
> + mova m12, m0 ;m12 = m0, copy E0l to m12
> + punpcklwd m0, m4 ;m0 = m128Tmp0 = _mm_unpacklo_epi16(m0 = E0l, m4 = E2l);
> + mova m14, m0 ;m14 = m0, copy m128Tmp0 to m14
> + mova m13, m2 ;m13 = m2, copy E1l to m13
> + punpcklwd m2, m6 ;m2 = m128Tmp1 = _mm_unpacklo_epi16(m2 = E1l, m6 = E3l);
> + punpcklwd m0, m2 ;m0 = m128iS0 = _mm_unpacklo_epi16(m0 = m128Tmp0, m2 = m128Tmp1);
> + punpckhwd m14, m2 ;m14 = m128iS1 = _mm_unpackhi_epi16(m14 = m128Tmp0, m2 = m128Tmp1);
> + punpckhwd m12, m4 ;m12 = m128Tmp2 = _mm_unpackhi_epi16(m12 = E0l, m4 = E2l);
> + mova m2, m12 ;m2 = m12, copy m128Tmp2 to m2
> + punpckhwd m13, m6 ;m13 = m128Tmp3 = _mm_unpackhi_epi16(m13 = E1l, m6 = E3l);
> + punpcklwd m2, m13 ;m2 = m128iS2 = _mm_unpacklo_epi16(m2 = m128Tmp2, m13 = m128Tmp3);
> + punpckhwd m12, m13 ;m12 = m128iS3 = _mm_unpackhi_epi16(m12 = m128Tmp2, m13 = m128Tmp3);
> + mova m11, m1 ;m11 = m1, copy O0l to m11
> + punpcklwd m1, m5 ;m1 = m128Tmp0 = _mm_unpacklo_epi16(m1 = O0l, m5 = O2l);
> + mova m4, m1 ;m4 = m1, copy m128Tmp0 to m4
> + mova m10, m3 ;m10 = m3, copy O1l to m10
> + punpcklwd m3, m7 ;m3 = m128Tmp1 = _mm_unpacklo_epi16(m3 = O1l, m7 = O3l);
> + punpcklwd m1, m3 ;m1 = m128iS4 = _mm_unpacklo_epi16(m1 = m128Tmp0, m3 = m128Tmp1);
> + punpckhwd m4, m3 ;m4 = m128iS5 = _mm_unpackhi_epi16(m4 = m128Tmp0, m3 = m128Tmp1);
> + punpckhwd m11, m5 ;m11 = m128Tmp2 = _mm_unpackhi_epi16(m11 = O0l, m5 = O2l);
> + mova m5, m11 ;m5 = m11, copy m128Tmp2 to m5
> + punpckhwd m10, m7 ;m10 = m128Tmp3 = _mm_unpackhi_epi16(m10 = O1l, m7 = O3l);
> + punpcklwd m5, m10 ;m5 = m128iS6 = _mm_unpacklo_epi16(m5 = m128Tmp2, m10 = m128Tmp3);
> + punpckhwd m11, m10 ;m11 = m128iS7 = _mm_unpackhi_epi16(m11 = m128Tmp2, m10 = m128Tmp3);
> +
> + ;m128iAdd = _mm_set1_epi32(2048);
> +
> + mova m15, m14 ;m15 = m14, copy m128iS1 to m15
> + punpcklwd m14, m12 ;m14 = m128Tmp0 = _mm_unpacklo_epi16(m14 = m128iS1, m12 = m128iS3);
> + mova m13, [tab_idct8] ;m13 = tab_idct_8x8[0];
> + mova m6, m13 ;m6 = m13, copy tab_idct_8x8[0] to m6
> + pmaddwd m13, m14 ;m13 = E1l = _mm_madd_epi16(m14 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
> + punpckhwd m15, m12 ;m15 = m128Tmp1 = _mm_unpackhi_epi16(m15 = m128iS1, m12 = m128iS3);
> + pmaddwd m6, m15 ;m6 = E1h = _mm_madd_epi16(m15 = m128Tmp1, _mm_load_si128((__m128i*)(m6 = tab_idct_8x8[0])));
> + mova m8, m4 ;m8 = m4, copy m128iS5 to m8
> + punpcklwd m4, m11 ;m4 = _m128Tmp2 = mm_unpacklo_epi16(m4 = m128iS5, m11 = m128iS7);
> + mova m7, [tab_idct8 + 1 * 16] ;m7 = tab_idct_8x8[1];
> + mova m9, m7 ;m9 = m7, copy tab_idct_8x8[1] to m9
> + pmaddwd m7, m4 ;m7 = E2l = _mm_madd_epi16(m4 = _m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
> + punpckhwd m8, m11 ;m8 = m128Tmp3 = _mm_unpackhi_epi16(m8 = m128iS5, m11 = m128iS7);
> + pmaddwd m9, m8 ;m9 = E2h = _mm_madd_epi16(m8 = m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
> + paddd m13, m7 ;m13 = O0l = _mm_add_epi32(m13 = E1l, m7 = E2l);
> + paddd m6, m9 ;m6 = O0h = _mm_add_epi32(m6 = E1h, m9 = E2h);
> + mova m7, [tab_idct8 + 2 * 16] ;m7 = tab_idct_8x8[2];
> + mova m9, m7 ;m9 = m7, copy tab_idct_8x8[2] to m9
> + pmaddwd m7, m14 ;m7 = E1l = _mm_madd_epi16(m14 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
> + pmaddwd m9, m15 ;m9 = E1h = _mm_madd_epi16(m15 = m128Tmp1, _mm_load_si128((__m128i*)(m9 = tab_idct_8x8[2])));
> + mova m10, [tab_idct8 + 3 * 16] ;m10 = tab_idct_8x8[3];
> + mova m12, m10 ;m12 = m10, copy tab_idct_8x8[3] to m12
> + pmaddwd m10, m4 ;m10 = E2l = _mm_madd_epi16(m4 = _m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
> + pmaddwd m12, m8 ;m12 = E2h = _mm_madd_epi16(m8 = m128Tmp3, _mm_load_si128((__m128i*)(m12 = tab_idct_8x8[3])));
> + paddd m7, m10 ;m7 = O1l = _mm_add_epi32(m7 = E1l, m10 = E2l);
> + paddd m9, m12 ;m9 = O1h = _mm_add_epi32(m9 = E1h, m12);
> + mova m10, [tab_idct8 + 4 * 16] ;m10 = tab_idct_8x8[4];
> + mova m12, m10 ;m12 = m10, copy tab_idct_8x8[4] to m12
> + pmaddwd m10, m14 ;m10 = E1l = _mm_madd_epi16(m14 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
> + pmaddwd m12, m15 ;m12 = E1h = _mm_madd_epi16(m15 = m128Tmp1, _mm_load_si128((__m128i*)(m12 = tab_idct_8x8[4])));
> + mova m11, [tab_idct8 + 5 * 16] ;m11 = tab_idct_8x8[5];
> + mova m3, m11 ;m3 = m11, copy tab_idct_8x8[5] to m3
> + pmaddwd m11, m4 ;m11 = E2l = _mm_madd_epi16(m4 = _m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
> + pmaddwd m3, m8 ;m3 = E2h = _mm_madd_epi16(m8 = m128Tmp3, _mm_load_si128((__m128i*)(m3 = tab_idct_8x8[5])));
> + paddd m10, m11 ;m10 = O2l = _mm_add_epi32(m10 = E1l, m11 = E2l);
> + paddd m12, m3 ;m12 = O2h = _mm_add_epi32(m12 = E1h, m3 = E2h);
> + pmaddwd m14, [tab_idct8 + 6 * 16] ;m14 = E1l = _mm_madd_epi16(m14 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
> + pmaddwd m15, [tab_idct8 + 6 * 16] ;m15 = E1h = _mm_madd_epi16(m15 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
> + pmaddwd m4, [tab_idct8 + 7 * 16] ;m4 = E2l = _mm_madd_epi16(m4 = m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
> + pmaddwd m8, [tab_idct8 + 7 * 16] ;m8 = E2h = _mm_madd_epi16(m8 = m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
> + paddd m15, m8 ;m11 = O3h = _mm_add_epi32(m11 = E1h, m15 = E2h);
> + paddd m14, m4 ;m3 = O3l = _mm_add_epi32(m3 = E1l, m14 = E2l);
> +
> + mova m4, m0 ;m4 = m0, copy m128iS0 to m4
> + punpcklwd m0, m1 ;m0 = m128Tmp0 = _mm_unpacklo_epi16(m0 = m128iS0, m1 = m128iS4);
> + mova m11, m0 ;m11 = m0, copy m128Tmp0 to m11
> + pmaddwd m0, [tab_idct8 + 8 * 16] ;m0 = EE0l = _mm_madd_epi16(m0 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
> + punpckhwd m4, m1 ;m4 = m128Tmp1 = _mm_unpackhi_epi16(m4 = m128Tmp0, m1 = m128iS4);
> + mova m3, m4 ;m3 = m4, copy m128Tmp1 to m3
> + pmaddwd m4, [tab_idct8 + 8 * 16] ;m4 = EE0h = _mm_madd_epi16(m4 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
> + pmaddwd m11, [tab_idct8 + 9 * 16] ;m13 = EE1l = _mm_madd_epi16(m11 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
> + pmaddwd m3, [tab_idct8 + 9 * 16] ;m3 = EE1h = _mm_madd_epi16(m3 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
> +
> + mova m8, m2 ;m8 = m2, copy m128iS2 to m8
> + punpcklwd m2, m5 ;m2 = m128Tmp0 = _mm_unpacklo_epi16(m2 = m128iS2, m5 = m128iS6);
> + mova m1, m2 ;m1 = m2, copy m128Tmp0 to m1
> + pmaddwd m2, [tab_idct8 + 10 * 16] ;m2 = E00l = _mm_madd_epi16(m2 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
> + punpckhwd m8, m5 ;m8 = m128Tmp1 = _mm_unpackhi_epi16(m8 = m128iS2, m5 = m128iS6);
> + mova m5, m8 ;m5 = m8, copy m128Tmp1 to m5
> + mova [r5 - 1 * 16], m2 ;s_1 = m2, copy E00l to stack[1]
> + pmaddwd m5, [tab_idct8 + 10 * 16] ;m5 = E00h = _mm_madd_epi16(m5 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
> + pmaddwd m1, [tab_idct8 + 11 * 16] ;m1 = E01l = _mm_madd_epi16(m1 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
> + pmaddwd m8, [tab_idct8 + 11 * 16] ;m8 = E01h = _mm_madd_epi16(m8 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
> + mova [r5 - 2 * 16], m5 ;s_2 = m5, copy E00h to stack[2]
> + paddd m2, m0 ;m2 = E0l = _mm_add_epi32(m0 = EE0l, m2 = E00l);
> + paddd m2, [IDCT_ADD] ;m2 = E0l = _mm_add_epi32(m2 = E0l, IDCT_ADD);
> + paddd m5, m4 ;m5 = E0h = _mm_add_epi32(m4 = EE0h, m5 = E00h);
> + paddd m5, [IDCT_ADD] ;m5 = E0h = _mm_add_epi32(m5 = E0h, IDCT_ADD);
> + psubd m0, [r5 - 1 * 16] ;m0 = E3l = _mm_sub_epi32(m0 = EE0l, s_1 = E00l);
> + mova [r5 - 1 * 16],m1 ;s_1 = m1, copy E01l to stack[1]
> + paddd m0, [IDCT_ADD] ;m0 = E3l = _mm_add_epi32(m0 = E3l, IDCT_ADD);
> + psubd m4, [r5 - 2 * 16] ;m4 = E3h = _mm_sub_epi32(m4 = EE0h, s_2 = E00h);
> + paddd m4, [IDCT_ADD] ;m4 = E3h = _mm_add_epi32(m4, IDCT_ADD);
> + paddd m1, m11 ;m1 = E1l = _mm_add_epi32(m15 = EE1l, m1 = E01l);
> + mova [r5 - 2 * 16], m8 ;s_2 = m8, copy = E01h to stack[2]
> + paddd m1, [IDCT_ADD] ;m1 = E1l = _mm_add_epi32(m1 = E1l, IDCT_ADD);
> + paddd m8, m3 ;m8 = E1h = _mm_add_epi32(m14 = EE1h, m8 = E01h);
> + paddd m8, [IDCT_ADD] ;m8 = E1h = _mm_add_epi32(m8 = E1h, IDCT_ADD);
> + psubd m11, [r5 - 1 * 16] ;m15 = E2l = _mm_sub_epi32(m15 = EE1l, s_1 = E01l);
> + paddd m11, [IDCT_ADD] ;m15 = E2l = _mm_add_epi32(m15 = E2l, IDCT_ADD);
> + psubd m3, [r5 - 2 * 16] ;m14 = E2h = _mm_sub_epi32(m14 = EE1h, s_2 = E01h);
> + paddd m3, [IDCT_ADD] ;m14 = E2h = _mm_add_epi32(m14 = E2h, IDCT_ADD);
> +
> + ;m13 = m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m2 = E0l, m13 = O0l), IDCT_SHIFT), _mm_srai_epi32(_mm_add_epi32(m5 = E0h, m6 = O0h), IDCT_SHIFT));
> + mova [r5 - 1 * 16], m13 ;s_1 = m13, copy O0l to stack[1]
> + paddd m13, m2 ;m13 + m2, add E0l and O0l
> + psrad m13, IDCT_SHIFT ;m13 = _mm_srai_epi32(m13, IDCT_SHIFT)
> + mova [r5 - 2 * 16], m6 ;s_2 = m6, copy O0h to stack[2]
> + paddd m6, m5 ;m6 + m5, add O0h and E0h
> + psrad m6, IDCT_SHIFT ;m6 = _mm_srai_epi32(m6, IDCT_SHIFT)
> + packssdw m13, m6 ;m13 = m128iS0
> + ;m2 = m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m2 = E0l, s_1 = O0l), IDCT_SHIFT), _mm_srai_epi32(_mm_sub_epi32(m5 = E0h, s_2 = O0h), IDCT_SHIFT));
> + psubd m2, [r5 - 1 * 16] ;m2 - s_1, E0l minus O0l
> + psrad m2, IDCT_SHIFT ;m2 = _mm_srai_epi32(m2, IDCT_SHIFT)
> + psubd m5, [r5 - 2 * 16] ;m5 - s_2, E0h minus O0h
> + psrad m5, IDCT_SHIFT ;m5 = _mm_srai_epi32(m5, IDCT_SHIFT)
> + packssdw m2, m5 ;m2 = m128iS7
> + ;m1 = m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m1 = E1l, m7 = O1l), IDCT_SHIFT), _mm_srai_epi32(_mm_add_epi32(m8 = E1h, m9 = O1h), IDCT_SHIFT));
> + mova m5, m1 ;m5 = m1, copy E1l to m5
> + paddd m1, m7 ;m1 + m7, add E1l and O1l
> + psrad m1, IDCT_SHIFT ;m1 = _mm_srai_epi32(m1, IDCT_SHIFT)
> + mova m6, m8 ;m6 = m8, copy E1h to m6
> + paddd m8, m9 ;m8 + m9, add E1h and O1h
> + psrad m8, IDCT_SHIFT ;m8 = _mm_srai_epi32(m8, IDCT_SHIFT)
> + packssdw m1, m8 ;m1 = m128iS1
> + ;m5 = m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m5 = E1l, m7 = O1l), IDCT_SHIFT), _mm_srai_epi32(_mm_sub_epi32(m6 = E1h, m9 = O1h), IDCT_SHIFT));
> + psubd m5, m7 ;m5 - m7, E1l minus O1l
> + psrad m5, IDCT_SHIFT ;m5 = _mm_srai_epi32(m5, IDCT_SHIFT)
> + psubd m6, m9 ;m6 - m9, E1h minus O1h
> + psrad m6, IDCT_SHIFT ;m6 = _mm_srai_epi32(m6, IDCT_SHIFT)
> + packssdw m5, m6 ;m5 = m128iS6
> + ;m15 = m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m15 = E2l, m10 = O2l), IDCT_SHIFT), _mm_srai_epi32(_mm_add_epi32(m14 = E2h, m12 = O2h), IDCT_SHIFT));
> + mova m6, m11 ;m6 = m15, copy E2l to m6
> +
> + paddd m11, m10 ;m15 + m10, add E2l and O2l
> + psrad m11, IDCT_SHIFT ;m11 = _mm_srai_epi32(m11, IDCT_SHIFT)
> + mova m7, m3 ;m7 = m14, copy E2h to m7
> + paddd m3, m12 ;m14 + m12, add E2h and O2h
> + psrad m3, IDCT_SHIFT ;m3 = _mm_srai_epi32(m3, IDCT_SHIFT)
> + packssdw m11, m3 ;m15 = m128iS2
> + ;m6 = m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m6 = E2l, m10 = O2l), IDCT_SHIFT), _mm_srai_epi32(_mm_sub_epi32(m7 = E2h, m12 = O2h), IDCT_SHIFT));
> + psubd m6, m10 ;m6 - m10, E2l minus O2l
> + psrad m6, IDCT_SHIFT ;m6 = _mm_srai_epi32(m6, IDCT_SHIFT)
> + psubd m7, m12 ;m7 - m12, E2h minus O2h
> + psrad m7, IDCT_SHIFT ;m7 = _mm_srai_epi32(m7, IDCT_SHIFT)
> + packssdw m6, m7 ;m6 = m128iS5
> + ;m0 = m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m0 = E3l, m3 = O3l), IDCT_SHIFT), _mm_srai_epi32(_mm_add_epi32(m4 = E3h, m11 = O3h), IDCT_SHIFT));
> + mova m8, m0 ;m8 = m0, copy E3l to m8
> + paddd m0, m14 ;m0 + m3, add E3l and O3l
> + psrad m0, IDCT_SHIFT ;m0 = _mm_srai_epi32(m0, IDCT_SHIFT)
> + mova m7, m4 ;m7 = m4, copy E3h to m7
> + paddd m4, m15 ;m4 + m11, add E3h and O3h
> + psrad m4, IDCT_SHIFT ;m4 = _mm_srai_epi32(m4, IDCT_SHIFT)
> + packssdw m0, m4 ;m0 = m128iS3
> + ;m8 = m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m8 = E3l, m3 = O3l), IDCT_SHIFT), _mm_srai_epi32(_mm_sub_epi32(m7 = E3h, m11 = O3h), IDCT_SHIFT));
> + psubd m8, m14 ;m8 - m3, E3l minus O3l
> + psrad m8, IDCT_SHIFT ;m8 = _mm_srai_epi32(m8, IDCT_SHIFT)
> + psubd m7, m15 ;m7 - m11, E3h minus O3h
> + psrad m7, IDCT_SHIFT ;m7 = _mm_srai_epi32(m7, IDCT_SHIFT)
> + packssdw m8, m7 ;m8 = m128iS4
> +
> +; // [07 06 05 04 03 02 01 00]
> +; // [17 16 15 14 13 12 11 10]
> +; // [27 26 25 24 23 22 21 20]
> +; // [37 36 35 34 33 32 31 30]
> +; // [47 46 45 44 43 42 41 40]
> +; // [57 56 55 54 53 52 51 50]
> +; // [67 66 65 64 63 62 61 60]
> +; // [77 76 75 74 73 72 71 70]
> +
> + mova m3, m13 ;m3 = m13, copy m128iS0 to m3
> + punpcklwd m3, m1 ;m3 = T00 = _mm_unpacklo_epi16(m3 = m128iS0, m1 = m128iS1); // [13 03 12 02 11 01 10 00]
> + punpckhwd m13, m1 ;m13 = T01 = _mm_unpackhi_epi16(m13 = m128iS0,m1 = m128iS1); // [17 07 16 06 15 05 14 04]
> + mova m4, m11 ;m4 = m15
> + punpcklwd m11, m0 ;m15 = T02 = _mm_unpacklo_epi16(m15 = m128iS2, m0 = m128iS3); // [33 23 32 22 31 21 30 20]
> + punpckhwd m4, m0 ;m4 = T03 = _mm_unpackhi_epi16(m4 = m128iS2, m0 = m128iS3); // [37 27 36 26 35 25 34 24]
> + mova m1, m8 ;m1 = m8, copy m128iS4 to m1
> + punpcklwd m8, m6 ;m8 = T04 = _mm_unpacklo_epi16(m8 = m128iS4, m6 = m128iS5); // [53 43 52 42 51 41 50 40]
> + punpckhwd m1, m6 ;m1 = T05 = _mm_unpackhi_epi16(m1 = m128iS4, m6 = m128iS5); // [57 47 56 46 55 45 54 44]
> + mova m7, m5 ;m7 = m5, copy m128iS6 to m7
> + punpcklwd m5, m2 ;m5 = T06 = _mm_unpacklo_epi16(m5 = m128iS6, m2 = m128iS7); // [73 63 72 62 71 61 70 60]
> + punpckhwd m7, m2 ;m7 = T07 = _mm_unpackhi_epi16(m7 = m128iS6, m2 = m128iS7); // [77 67 76 66 75 65 74 64]
> +
> + ;__m128i T10, T11;
> + mova m0, m3 ;m0 = m3, copy T00 to m0
> + punpckldq m0, m11 ;m0 = T10 = _mm_unpacklo_epi32(m0 = T00, m15 = T02); // [31 21 11 01 30 20 10 00]
> + punpckhdq m3, m11 ;m3 = T11 = _mm_unpackhi_epi32(m3 = T00, m15 = T02); // [33 23 13 03 32 22 12 02]
> +; lea r2, [r2 + r2] ;set r2 to index of 1
> +; lea r4, [r2 + r2] ;set r4 to index of 2
> +; lea r3, [r4 + r2] ;set r3 to index of 3
> +; lea r4, [r4 + r3] ;set r4 to index of 5
> + lea r0, [r4 + r2 * 2] ;set r0 to index of 7
> + movq [r1], m0 ;_mm_storel_epi64((__m128i*)&dst[0 * stride + 0], m0 = T10); // [30 20 10 00]
> + movq [r1 + r2 * 2], m3 ;_mm_storel_epi64((__m128i*)&dst[2 * stride + 0], m3 = T11); // [32 22 12 02]
> + movhps [r1 + r2 * 1], m0 ;_mm_storeh_pi((__m64*)&dst[1 * stride + 0], _mm_castsi128_ps(m0 = T10)); // [31 21 11 01]
> + movhps [r1 + r3], m3 ;_mm_storeh_pi((__m64*)&dst[3 * stride + 0], _mm_castsi128_ps(m3 = T11)); // [33 23 13 03]
> +
> + mova m2, m8 ;m2 = m8, copy T04 to m2
> + punpckldq m2, m5 ;m2 = T10 = _mm_unpacklo_epi32(m2 = T04, m5 = T06); // [71 61 51 41 70 60 50 40]
> + punpckhdq m8, m5 ;m8 = T11 = _mm_unpackhi_epi32(m8 = T04, m5 = T06); // [73 63 53 43 72 62 52 42]
> + movq [r1 + 8], m2 ;_mm_storel_epi64((__m128i*)&dst[0 * stride + 4], m2 = T10);
> + movq [r1 + r2 * 2 + 8], m8 ;_mm_storel_epi64((__m128i*)&dst[2 * stride + 4], m8 = T11);
> + movhps [r1 + r2 * 1 + 8], m2 ;_mm_storeh_pi((__m64*)&dst[1 * stride + 4], _mm_castsi128_ps(m2 = T10));
> + movq [r1 + r2 * 2 + 8], m8 ;_mm_storel_epi64((__m128i*)&dst[2 * stride + 4], m8 = T11);
> +
> + movhps [r1 + r3 + 8], m8 ;_mm_storeh_pi((__m64*)&dst[3 * stride + 4], _mm_castsi128_ps(m8 = T11));
> + mova m11, m0
> + punpcklqdq m0, m2
> + punpckhqdq m11, m2
> + mova [r1], m0
> + mova [r1 + r2 * 1], m11
> +
> + mova m5, m3
> + punpcklqdq m3, m8
> + punpckhqdq m5, m8
> + mova [r1 + r2 * 2], m3
> + mova [r1 + r3], m5
> +
> + mova m6, m13 ;m6 = m13, copy T01 to m6
> + punpckldq m6, m4 ;m6 = T10 = _mm_unpacklo_epi32(m6 = T01, m4 = T03); // [35 25 15 05 34 24 14 04]
> + punpckhdq m13, m4 ;m13 = T11 = _mm_unpackhi_epi32(m13 = T01, m4 = T03); // [37 27 17 07 36 26 16 06]
> + movq [r1 + r2 * 4], m6 ;_mm_storel_epi64((__m128i*)&dst[4 * stride + 0], m6 = T10);
> + movq [r1 + r3 * 2], m13 ;_mm_storel_epi64((__m128i*)&dst[6 * stride + 0], m13 = T11);
> + mova m9, m1 ;m9 = m1, copy T05 to m9
> + movhps [r1 + r4], m6 ;_mm_storeh_pi((__m64*)&dst[5 * stride + 0], _mm_castsi128_ps(m6 = T10));
> + movhps [r1 + r0], m13 ;_mm_storeh_pi((__m64*)&dst[7 * stride + 0], _mm_castsi128_ps(m13 = T11));
> +
> + punpckldq m1, m7 ;m1 = T10 = _mm_unpacklo_epi32(m1 = T05, m7 = T07); // [75 65 55 45 74 64 54 44]
> + punpckhdq m9, m7 ;m9 = T11 = _mm_unpackhi_epi32(m9 = T05, m7 = T07); // [77 67 57 47 76 56 46 36]
> + movq [r1 + r2 * 4 + 8], m1 ;_mm_storel_epi64((__m128i*)&dst[4 * stride + 4], m1 = T10);
> + movq [r1 + r3 * 2 + 8], m9 ;_mm_storel_epi64((__m128i*)&dst[6 * stride + 4], m9 = T11);
> + movhps [r1 + r4 + 8], m1 ;_mm_storeh_pi((__m64*)&dst[5 * stride + 4], _mm_castsi128_ps(m1 = T10));
> + movhps [r1 + r0 + 8], m9 ;_mm_storeh_pi((__m64*)&dst[7 * stride + 4], _mm_castsi128_ps(m9 = T11));
> +
> + RET
> +%undef IDCT_SHIFT
> +%undef IDCT_ADD
> +
> +;-------------------------------------------------------
> ; void idct8(int16_t *src, int16_t *dst, intptr_t stride)
> ;-------------------------------------------------------
> INIT_XMM ssse3
> diff -r 1d17ec0cb954 -r f4a932dba699 source/common/x86/dct8.h
> --- a/source/common/x86/dct8.h Thu Nov 20 20:04:02 2014 +0530
> +++ b/source/common/x86/dct8.h Thu Nov 20 19:21:53 2014 -0800
> @@ -35,6 +35,7 @@
> void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
> void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
> void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
> +void x265_idct8_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
> void x265_idct8_ssse3(const int16_t* src, int16_t* dst, intptr_t stride);
> void x265_idct8_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
> void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
>
--
Steve Borho