[x265] [PATCH] asm: idct8 sse2
dtyx265 at gmail.com
Fri Nov 21 04:24:08 CET 2014
# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1416540113 28800
# Node ID f4a932dba6993d8bcff3ddea7dc8c83c55d52396
# Parent 1d17ec0cb9548194b90495c5d7c94552c71abbf5
asm: idct8 sse2
This version is based directly on the SSE3 intrinsic version.
Basically, it is the intrinsic code with almost no optimizations,
but I thought it might be a better starting point for optimization
than gcc's optimized output.
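
For anyone reviewing: below is a minimal C sketch (my own illustration, not
code from the tree) of the 8-point partial butterfly that each of the two
passes implements. The names clip16 and partial_butterfly_inverse8 are made
up for this sketch; the coefficients match tab_idct8 in the patch, the first
pass uses shift = 7 (round with 64), the second shift = 20 - BIT_DEPTH
(IDCT_SHIFT/IDCT_ADD below).

    #include <stdint.h>

    static int16_t clip16(int v)
    {
        if (v < -32768) return -32768;
        if (v > 32767)  return 32767;
        return (int16_t)v;
    }

    /* One 1-D inverse pass over an 8x8 block of int16_t coefficients.
     * The pass transforms columns; running it twice with a transpose in
     * between (done in registers in the asm) gives the 2-D inverse. */
    static void partial_butterfly_inverse8(const int16_t *src, int16_t *dst, int shift)
    {
        const int add = 1 << (shift - 1);
        for (int j = 0; j < 8; j++)
        {
            /* odd part: source rows 1,3,5,7 -- tab_idct8 rows 0-7 */
            const int O[4] = {
                89 * src[8 + j] + 75 * src[24 + j] + 50 * src[40 + j] + 18 * src[56 + j],
                75 * src[8 + j] - 18 * src[24 + j] - 89 * src[40 + j] - 50 * src[56 + j],
                50 * src[8 + j] - 89 * src[24 + j] + 18 * src[40 + j] + 75 * src[56 + j],
                18 * src[8 + j] - 50 * src[24 + j] + 75 * src[40 + j] - 89 * src[56 + j],
            };
            /* even part: rows 0,4 and 2,6 -- tab_idct8 rows 8-11 */
            const int EE0 = 64 * src[0 + j] + 64 * src[32 + j];
            const int EE1 = 64 * src[0 + j] - 64 * src[32 + j];
            const int E00 = 83 * src[16 + j] + 36 * src[48 + j];
            const int E01 = 36 * src[16 + j] - 83 * src[48 + j];
            const int E[4] = { EE0 + E00, EE1 + E01, EE1 - E01, EE0 - E00 };

            for (int k = 0; k < 4; k++)
            {
                dst[k * 8 + j]       = clip16((E[k]     + O[k]     + add) >> shift);
                dst[(k + 4) * 8 + j] = clip16((E[3 - k] - O[3 - k] + add) >> shift);
            }
        }
    }

Each pmaddwd in the asm computes four of these coefficient-pair products at
once: punpcklwd/punpckhwd interleave words from two source rows, which are
then multiplied against a broadcast coefficient pair from tab_idct8.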
diff -r 1d17ec0cb954 -r f4a932dba699 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Nov 20 20:04:02 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Nov 20 19:21:53 2014 -0800
@@ -1376,6 +1376,7 @@
p.dct[DCT_4x4] = x265_dct4_sse2;
p.idct[IDCT_4x4] = x265_idct4_sse2;
p.idct[IDST_4x4] = x265_idst4_sse2;
+ p.idct[IDCT_8x8] = x265_idct8_sse2;
LUMA_SS_FILTERS(_sse2);
}
@@ -1564,6 +1565,7 @@
p.dct[DCT_4x4] = x265_dct4_sse2;
p.idct[IDCT_4x4] = x265_idct4_sse2;
+ p.idct[IDCT_8x8] = x265_idct8_sse2;
p.idct[IDST_4x4] = x265_idst4_sse2;
p.planecopy_sp = x265_downShift_16_sse2;
diff -r 1d17ec0cb954 -r f4a932dba699 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Thu Nov 20 20:04:02 2014 +0530
+++ b/source/common/x86/dct8.asm Thu Nov 20 19:21:53 2014 -0800
@@ -302,6 +302,19 @@
pb_idct8odd: db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15
+tab_idct8: times 4 dw 89, 75
+ times 4 dw 50, 18
+ times 4 dw 75, -18
+ times 4 dw -89, -50
+ times 4 dw 50, -89
+ times 4 dw 18, 75
+ times 4 dw 18, -50
+ times 4 dw 75, -89
+ times 4 dw 64, 64
+ times 4 dw 64, -64
+ times 4 dw 83, 36
+ times 4 dw 36, -83
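+; Each 16-byte row above repeats one coefficient pair four times so a single
+; pmaddwd against words interleaved from two source rows yields four 32-bit
+; dot products: rows 0-7 form the odd part (O0..O3), rows 8-9 the even-even
+; part (EE0/EE1), and rows 10-11 the E00/E01 pair.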
+
SECTION .text
cextern pd_1
cextern pd_2
@@ -974,6 +987,437 @@
RET
;-------------------------------------------------------
+; void idct8(int16_t *src, int16_t *dst, intptr_t stride)
+;-------------------------------------------------------
+INIT_XMM sse2
+
+%if BIT_DEPTH == 10
+ %define IDCT_SHIFT 10
+ %define IDCT_ADD pd_512
+%elif BIT_DEPTH == 8
+ %define IDCT_SHIFT 12
+ %define IDCT_ADD pd_2048
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
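+; IDCT_SHIFT is the second-pass shift, 20 - BIT_DEPTH, and IDCT_ADD the
+; matching rounding constant 1 << (IDCT_SHIFT - 1); the first pass always
+; shifts by 7 and rounds with pd_64.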
+
+cglobal idct8, 3,7, 16
+ lea r2, [r2 + r2] ;r2 = 2 * stride = byte offset of dst row 1
+ lea r4, [r2 + r2] ;r4 = byte offset of dst row 2
+ lea r3, [r4 + r2] ;r3 = byte offset of dst row 3
+ lea r4, [r4 + r3] ;r4 = byte offset of dst row 5
+ mov r5, rsp
+ and r5, ~(16-1)
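+ ;r5 = rsp rounded down to a 16-byte boundary: [r5 - 1 * 16] and
+ ;[r5 - 2 * 16] serve as aligned scratch slots to spill values while
+ ;all 16 xmm registers are live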
+ mova m6, [r0 + 1 * 16] ;m6 = m128iS1 = _mm_load_si128((__m128i*)&src[8 + 0]);
+ mova m15, [r0 + 3 * 16] ;m15 = m128iS3 = _mm_load_si128((__m128i*)&src[24 + 0]);
+ mova m7, m6 ;m7 = m6, copy m128iS1 to m1
+ punpcklwd m6, m15 ;m6 = m128Tmp0 = _mm_unpacklo_epi16(m6 = m128iS1, m15 = m128iS3);
+ mova m0, [tab_idct8 + 0 * 16] ;m0 = tab_idct_8x8[0];
+ mova m1, m0 ;m1 = m0, copy tab_idct_8x8[0] to m1
+ pmaddwd m0, m6 ;m0 = E1l = _mm_madd_epi16(m6 = m128Tmp0, _mm_load_si128((__m128i*)(m0 = tab_idct_8x8[0])));
+ punpckhwd m7, m15 ;m7 = m128Tmp1 = _mm_unpackhi_epi16(m7 = m128iS1 , m15 = m128iS3);
+ mova m12, [r0 + 5 * 16] ;m12 = m128iS5 = _mm_load_si128((__m128i*)&src[40 + 0]);
+ pmaddwd m1, m7 ;m1 = E1h = _mm_madd_epi16(m7 = m128Tmp1, _mm_load_si128((__m128i*)(m1 = tab_idct_8x8[0])));
+ mova m15, [r0 + 7 * 16] ;m15 = m128iS7 = _mm_load_si128((__m128i*)&src[56 + 0]);
+ mova m13, m12 ;m13 = m12, copy m128iS5 to m13
+ punpcklwd m12, m15 ;m12 = m128Tmp2 = _mm_unpacklo_epi16(m12 = m128iS5, m15 = m128iS7);
+ mova m8, [tab_idct8 + 1 * 16] ;m8 = tab_idct_8x8[1];
+ mova m9, m8 ;m9 = m8, copy tab_idct_8x8[1] to m9
+ pmaddwd m8, m12 ;m8 = E2l = _mm_madd_epi16(m12 = m128Tmp2, _mm_load_si128((__m128i*)(m8 = tab_idct_8x8[1])));
+ punpckhwd m13, m15 ;m13 = m128Tmp3 = _mm_unpackhi_epi16(m13 = m128iS5, m15 = m128iS7);
+ pmaddwd m9, m13 ;m9 = E2h = _mm_madd_epi16(m13 = m128Tmp3, _mm_load_si128((__m128i*)(m9 = tab_idct_8x8[1])));
+ paddd m0, m8 ;m0 = O0l = _mm_add_epi32(m0 = E1l, m8 = E2l);
+ paddd m1, m9 ;m1 = O0h = _mm_add_epi32(m1 = E1h, m9 = E2h);
+ mova m2, [tab_idct8 + 2 * 16] ;m2 = tab_idct_8x8[2];
+ mova m3, m2 ;m3 = m2, copy tab_idct_8x8[2] to m3
+ pmaddwd m2, m6 ;m2 = E1l = _mm_madd_epi16(m6 = m128Tmp0, _mm_load_si128((__m128i*)(m2 = tab_idct_8x8[2])));
+ pmaddwd m3, m7 ;m3 = E1h = _mm_madd_epi16(m7 = m128Tmp1, _mm_load_si128((__m128i*)(m3 = tab_idct_8x8[2])));
+ mova m8, [tab_idct8 + 3 * 16] ;m8 = tab_idct_8x8[3];
+ mova m9, m8 ;m9 = m8, copy tab_idct_8x8[3] to m9
+ pmaddwd m8, m12 ;m8 = E2l = _mm_madd_epi16(m12 = m128Tmp2, _mm_load_si128((__m128i*)(m8 = tab_idct_8x8[3])));
+ pmaddwd m9, m13 ;m9 = E2h = _mm_madd_epi16(m13 = m128Tmp3, _mm_load_si128((__m128i*)(m9 = tab_idct_8x8[3])));
+ paddd m2, m8 ;m2 = O1l = _mm_add_epi32(m2 = E1l, m8 = E2l);
+ paddd m3, m9 ;m3 = O1h = _mm_add_epi32(m3 = E1h, m9 = E2h);
+ mova m4, [tab_idct8 + 4 * 16] ;m4 = tab_idct_8x8[4];
+ mova m5, m4 ;m5 = m4, copy tab_idct_8x8[4] to m5
+ pmaddwd m4, m6 ;m4 = E1l = _mm_madd_epi16(m6 = m128Tmp0, _mm_load_si128((__m128i*)(m4 = tab_idct_8x8[4])));
+ pmaddwd m5, m7 ;m5 = E1h = _mm_madd_epi16(m7 = m128Tmp1, _mm_load_si128((__m128i*)(m5 = tab_idct_8x8[4])));
+ mova m8, [tab_idct8 + 5 * 16] ;m8 = tab_idct_8x8[5];
+ mova m9, m8 ;m9 = m8, copy tab_idct_8x8[5] to m9
+ pmaddwd m8, m12 ;m8 = E2l = _mm_madd_epi16(m12 = m128Tmp2, _mm_load_si128((__m128i*)(m8 = tab_idct_8x8[5])));
+ pmaddwd m9, m13 ;m9 = E2h = _mm_madd_epi16(m13 = m128Tmp3, _mm_load_si128((__m128i*)(m9 = tab_idct_8x8[5])));
+ paddd m4, m8 ;m4 = O2l = _mm_add_epi32(m4 = E1l, m8 = E2l);
+ paddd m5, m9 ;m5 = O2h = _mm_add_epi32(m5 = E1h, m9 = E2h);
+ pmaddwd m6, [tab_idct8 + 6 * 16] ;m6 = E1l = _mm_madd_epi16(m6 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
+ pmaddwd m7, [tab_idct8 + 6 * 16] ;m7 = E1h = _mm_madd_epi16(m7 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
+ pmaddwd m12, [tab_idct8 + 7 * 16] ;m12 = E2l = _mm_madd_epi16(m12 = m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
+ pmaddwd m13, [tab_idct8 + 7 * 16] ;m13 = E2h = _mm_madd_epi16(m13 = m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
+ paddd m6, m12 ;m6 = O3l = _mm_add_epi32(m6 = E1l, m12 = E2l);
+ paddd m7, m13 ;m7 = O3h = _mm_add_epi32(m7 = E1h, m13 = E2h);
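+ ;first-stage odd part done: m0/m1 = O0, m2/m3 = O1, m4/m5 = O2, m6/m7 = O3
+ ;(low/high dword halves)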
+
+ ;/* ------- */
+
+ mova m8, [r0 + 0 * 16] ;m8 = m128iS0 = _mm_load_si128((__m128i*)&src[0 + 0]);
+ mova m15, [r0 + 4 * 16] ;m15 = m128iS4 = _mm_load_si128((__m128i*)&src[32 + 0]);
+ mova m9, m8 ;m9 = m8, copy m128iS0 to m9
+ punpcklwd m8, m15 ;m8 = m128Tmp0 = _mm_unpacklo_epi16(m8 = m128iS0 , m15 = m128iS4);
+ mova m10, m8 ;m10 = m8, copy m128Tmp0 to m10
+ pmaddwd m8, [tab_idct8 + 8 * 16] ;m8 = EE0l = _mm_madd_epi16(m8 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
+ punpckhwd m9, m15 ;m9 = m128Tmp1 = _mm_unpackhi_epi16(m9 = m128iS0, m15 = m128iS4);
+ mova m11, m9 ;m11 = m9, copy m128Tmp1 to m11
+ pmaddwd m9, [tab_idct8 + 8 * 16] ;m9 = EE0h = _mm_madd_epi16(m9 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
+ pmaddwd m10, [tab_idct8 + 9 * 16] ;m10 = EE1l = _mm_madd_epi16(m10 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
+ pmaddwd m11, [tab_idct8 + 9 * 16] ;m11 = EE1h = _mm_madd_epi16(m11 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
+
+ ;/* ------- */
+
+ mova m12, [r0 + 2 * 16] ;m12 = m128iS2 = _mm_load_si128((__m128i*)&src[16 + 0]);
+ mova m15, [r0 + 6 * 16] ;m15 = m128iS6 = _mm_load_si128((__m128i*)&src[48 + 0]);
+ mova m13, m12 ;m13 = m12, copy m128iS2 to m13
+ punpcklwd m12, m15 ;m12 = m128Tmp0 = _mm_unpacklo_epi16(m12 = m128iS2, m15 = m128iS6);
+ mova m14, m12 ;m14 = m12, copy m128Tmp0 to m14
+ pmaddwd m12, [tab_idct8 + 10 * 16] ;m12 = E00l = _mm_madd_epi16(m12 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
+ punpckhwd m13, m15 ;m13 = m128Tmp1 = _mm_unpackhi_epi16(m13 = m128iS2, m15 = m128iS6);
+ mova m15, m13 ;m15 = m13, copy m128Tmp1 to m15
+ pmaddwd m13, [tab_idct8 + 10 * 16] ;m13 = E00h = _mm_madd_epi16(m13 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
+ pmaddwd m14, [tab_idct8 + 11 * 16] ;m14 = E01l = _mm_madd_epi16(m14 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
+ pmaddwd m15, [tab_idct8 + 11 * 16] ;m15 = E01h = _mm_madd_epi16(m15 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
+ mova [r5 - 1 * 16], m12 ;s_1 = m12, copy E00l to stack[1]
+ paddd m12, m8 ;m12 = E0l = _mm_add_epi32(m8 = EE0l, m12 = E00l);
+ paddd m12, [pd_64] ;m12 = E0l = _mm_add_epi32(m12 = E0l, pd_64);
+ mova [r5 - 2 * 16], m13 ;s_2 = m13, copy E00h to stack[2]
+ paddd m13, m9 ;m13 = E0h = _mm_add_epi32(m9 = EE0h, m13 = E00h);
+ paddd m13, [pd_64] ;m13 = E0h = _mm_add_epi32(m13 = E0h, pd_64);
+ psubd m8, [r5 - 1 * 16] ;m8 = E3l = _mm_sub_epi32(m8 = EE0l, s_1 = E00l);
+ psubd m9, [r5 - 2 * 16] ;m9 = E3h = _mm_sub_epi32(m9 = EE0h, s_2 = E00h);
+ paddd m8, [pd_64] ;m8 = E3l = _mm_add_epi32(m8 = E3l, pd_64);
+ mova [r5 - 1 * 16], m14 ;s_1 = m14, copy E01l to stack[1]
+ paddd m9, [pd_64] ;m9 = E3h = _mm_add_epi32(m9 = E3h, pd_64);
+ paddd m14, m10 ;m14 = E1l = _mm_add_epi32(m10 = EE1l, m14 = E01l);
+ mova [r5 - 2 * 16], m15 ;s_2 = m15, copy E01h to stack[2]
+ paddd m14, [pd_64] ;m14 = E1l = _mm_add_epi32(m14 = E1l, pd_64);
+ paddd m15, m11 ;m15 = E1h = _mm_add_epi32(m11 = EE1h, m15 = E01h);
+ paddd m15, [pd_64] ;m15 = E1h = _mm_add_epi32(m15 = E1h, pd_64);
+ psubd m10, [r5 - 1 * 16] ;m10 = E2l = _mm_sub_epi32(m10 = EE1l, s_1 = E01l);
+ mova [r5 - 1 * 16], m0 ;s_1 = m0, copy O0l to stack[1]
+ paddd m10, [pd_64] ;m10 = E2l = _mm_add_epi32(m10 = E2l, pd_64);
+ psubd m11, [r5 - 2 * 16] ;m11 = E2h = _mm_sub_epi32(m11 = EE1h, s_2 = E01h);
+ paddd m11, [pd_64] ;m11 = E2h = _mm_add_epi32(m11 = E2h, pd_64);
+ mova [r5 - 2 * 16], m1 ;s_2 = m1, copy O0h to stack[2]
+ ;m0 = m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m12 = E0l, m0 = O0l), 7), _mm_srai_epi32(_mm_add_epi32(m13 = E0h, m1 = O0h), 7));
+ paddd m0, m12 ;m0 = _mm_add_epi32(m12 = E0l, m0 = O0l)
+ psrad m0, 7 ;m0 = _mm_srai_epi32(m0, 7)
+ paddd m1, m13 ;m1 = _mm_add_epi32(m13 = E0h, m1 = O0h)
+ psrad m1, 7 ;m1 = _mm_srai_epi32(m1, 7)
+ packssdw m0, m1 ;m0 = m128iS0 = _mm_packs_epi32(m0 , m1)
+ ;m12 = m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m12 = E0l, s_1 = O0l), 7), _mm_srai_epi32(_mm_sub_epi32(m13 = E0h, s_2 = O0h), 7));
+ psubd m12, [r5 - 1 * 16] ;m12 = _mm_sub_epi32(m12 = E0l, s_1 = O0l)
+ psrad m12, 7 ;m12 = _mm_srai_epi32(m12, 7)
+ psubd m13, [r5 - 2 * 16] ;m13 = _mm_sub_epi32(m13 = E0h, s_2 = O0h)
+ psrad m13, 7 ;m13 = _mm_srai_epi32(m13, 7)
+ packssdw m12, m13 ;m12 = m128iS7 = _mm_packs_epi32(m12, m13)
+ ;m2 = m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m14 = E1l, m2 = O1l), 7), _mm_srai_epi32(_mm_add_epi32(m15 = E1h, m3 = O1h), 7));
+ mova m1, m2 ;m1 = m2, copy O1l to m1
+ mova m13, m3 ;m13 = m3, copy O1h to m13
+ paddd m2, m14 ;m2 = _mm_add_epi32(m14 = E1l, m2 = O1l)
+ psrad m2, 7 ;m2 = _mm_srai_epi32(m2, 7)
+ paddd m3, m15 ;m3 = _mm_add_epi32(m15 = E1h, m3 = O1h)
+ psrad m3, 7 ;m3 = _mm_srai_epi32(m3, 7)
+ packssdw m2, m3 ;m2 = m128iS1 = _mm_packs_epi32(m2, m3)
+ ;m14 = m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m14 = E1l, m1 = O1l), 7), _mm_srai_epi32(_mm_sub_epi32(m15 = E1h, m13 = O1h), 7));
+ psubd m14, m1 ;m14 = _mm_sub_epi32(m14 = E1l, m1 = O1l)
+ psrad m14, 7 ;m14 = _mm_srai_epi32(m14, 7)
+ psubd m15, m13 ;m15 = _mm_sub_epi32(m15 = E1h, m13 = O1h)
+ psrad m15, 7 ;m15 = _mm_srai_epi32(m15, 7)
+ packssdw m14, m15 ;m14 = m128iS6 = _mm_packs_epi32(m14, m15)
+ ;m4 = m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m10 = E2l, m4 = O2l), 7), _mm_srai_epi32(_mm_add_epi32(m11 = E2h, m5 = O2h), 7));
+ mova m3, m4 ;m3 = m4, copy O2l to m3
+ mova m1, m5 ;m1 = m5, copy O2h to m1
+ paddd m4, m10 ;m4 = _mm_add_epi32(m10 = E2l, m4 = O2l)
+ psrad m4, 7 ;m4 = _mm_srai_epi32(m4, 7)
+ paddd m5, m11 ;m5 = _mm_add_epi32(m11 = E2h, m5 = O2h)
+ psrad m5, 7 ;m5 = _mm_srai_epi32(m5, 7)
+ packssdw m4, m5 ;m4 = m128iS2 = _mm_packs_epi32(m4, m5)
+ ;m10 = m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m10 = E2l, m3 = O2l), 7), _mm_srai_epi32(_mm_sub_epi32(m11 = E2h, m1 = O2h), 7));
+ psubd m10, m3 ;m10 = _mm_sub_epi32(m10 = E2l, m3 = O2l)
+ psrad m10, 7 ;m10 = _mm_srai_epi32(m10, 7)
+ psubd m11, m1 ;m11 = _mm_sub_epi32(m11 = E2h, m1 = O2h)
+ psrad m11, 7 ;m11 = _mm_srai_epi32(m11, 7)
+ packssdw m10, m11 ;m10 = m128iS5 = _mm_packs_epi32(m10, m11)
+ ;m6 = m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m8 = E3l, m6 = O3l), 7), _mm_srai_epi32(_mm_add_epi32(m9 = E3h, m7 = O3h), 7));
+ mova m13, m6 ;m13 = m6, copy O3l to m13
+ paddd m6, m8 ;m6 = _mm_add_epi32(m8 = E3l, m6 = O3l)
+ psrad m6, 7 ;m6 = _mm_srai_epi32(m6, 7)
+ mova m15, m7 ;m15 = m7, copy O3h to m15
+ paddd m7, m9 ;m7 = _mm_add_epi32(m9 = E3h, m7 = O3h)
+ psrad m7, 7 ;m7 = _mm_srai_epi32(m7, 7)
+ packssdw m6, m7 ;m6 = m128iS3 = _mm_packs_epi32(m6, m7)
+ ;m8 = m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m8 = E3l, m13 = O3l), 7), _mm_srai_epi32(_mm_sub_epi32(m9 = E3h, m15 = O3h), 7));
+ psubd m8, m13 ;m8 = _mm_sub_epi32(m8 = E3l, m13 = O3l)
+ psrad m8, 7 ;m8 = _mm_srai_epi32(m8, 7)
+ psubd m9, m15 ;m9 = _mm_sub_epi32(m9 = E3h, m15 = O3h)
+ psrad m9, 7 ;m9 = _mm_srai_epi32(m9, 7)
+ packssdw m8, m9 ;m8 = m128iS4 = _mm_packs_epi32(m8, m9)
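+ ;first (vertical) pass complete: S0..S7 = m0, m2, m4, m6, m8, m10, m14, m12,
+ ;each packed back to 16-bit words; next the block is transposed in registers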
+ ; /* Inverse matrix */
+
+ mova m1, m0 ;m1 = m0, copy m128iS0 to m1
+ punpcklwd m0, m8 ;m0 = E0l = _mm_unpacklo_epi16(m0 = m128iS0, m8 = m128iS4);
+ mova m3, m2 ;m3 = m2, copy m128iS1 to m3
+ punpcklwd m2, m10 ;m2 = E1l = _mm_unpacklo_epi16(m2 = m128iS1, m10 = m128iS5);
+ mova m5, m4 ;m5 = m4, copy m128iS2 to m5
+ punpcklwd m4, m14 ;m4 = E2l = _mm_unpacklo_epi16(m4 = m128iS2, m14 = m128iS6);
+ mova m7, m6 ;m7 = m6, copy m128iS3 to m7
+ punpcklwd m6, m12 ;m6 = E3l = _mm_unpacklo_epi16(m6 = m128iS3, m12 = m128iS7);
+ punpckhwd m1, m8 ;m1 = O0l = _mm_unpackhi_epi16(m1 = m128iS0, m8 = m128iS4);
+ punpckhwd m3, m10 ;m3 = O1l = _mm_unpackhi_epi16(m3 = m128iS1, m10 = m128iS5);
+ punpckhwd m5, m14 ;m5 = O2l = _mm_unpackhi_epi16(m5 = m128iS2, m14 = m128iS6);
+ punpckhwd m7, m12 ;m7 = O3l = _mm_unpackhi_epi16(m7 = m128iS3, m12 = m128iS7);
+ mova m12, m0 ;m12 = m0, copy E0l to m12
+ punpcklwd m0, m4 ;m0 = m128Tmp0 = _mm_unpacklo_epi16(m0 = E0l, m4 = E2l);
+ mova m14, m0 ;m14 = m0, copy m128Tmp0 to m14
+ mova m13, m2 ;m13 = m2, copy E1l to m13
+ punpcklwd m2, m6 ;m2 = m128Tmp1 = _mm_unpacklo_epi16(m2 = E1l, m6 = E3l);
+ punpcklwd m0, m2 ;m0 = m128iS0 = _mm_unpacklo_epi16(m0 = m128Tmp0, m2 = m128Tmp1);
+ punpckhwd m14, m2 ;m14 = m128iS1 = _mm_unpackhi_epi16(m14 = m128Tmp0, m2 = m128Tmp1);
+ punpckhwd m12, m4 ;m12 = m128Tmp2 = _mm_unpackhi_epi16(m12 = E0l, m4 = E2l);
+ mova m2, m12 ;m2 = m12, copy m128Tmp2 to m2
+ punpckhwd m13, m6 ;m13 = m128Tmp3 = _mm_unpackhi_epi16(m13 = E1l, m6 = E3l);
+ punpcklwd m2, m13 ;m2 = m128iS2 = _mm_unpacklo_epi16(m2 = m128Tmp2, m13 = m128Tmp3);
+ punpckhwd m12, m13 ;m12 = m128iS3 = _mm_unpackhi_epi16(m12 = m128Tmp2, m13 = m128Tmp3);
+ mova m11, m1 ;m11 = m1, copy O0l to m11
+ punpcklwd m1, m5 ;m1 = m128Tmp0 = _mm_unpacklo_epi16(m1 = O0l, m5 = O2l);
+ mova m4, m1 ;m4 = m1, copy m128Tmp0 to m4
+ mova m10, m3 ;m10 = m3, copy O1l to m10
+ punpcklwd m3, m7 ;m3 = m128Tmp1 = _mm_unpacklo_epi16(m3 = O1l, m7 = O3l);
+ punpcklwd m1, m3 ;m1 = m128iS4 = _mm_unpacklo_epi16(m1 = m128Tmp0, m3 = m128Tmp1);
+ punpckhwd m4, m3 ;m4 = m128iS5 = _mm_unpackhi_epi16(m4 = m128Tmp0, m3 = m128Tmp1);
+ punpckhwd m11, m5 ;m11 = m128Tmp2 = _mm_unpackhi_epi16(m11 = O0l, m5 = O2l);
+ mova m5, m11 ;m5 = m11, copy m128Tmp2 to m5
+ punpckhwd m10, m7 ;m10 = m128Tmp3 = _mm_unpackhi_epi16(m10 = O1l, m7 = O3l);
+ punpcklwd m5, m10 ;m5 = m128iS6 = _mm_unpacklo_epi16(m5 = m128Tmp2, m10 = m128Tmp3);
+ punpckhwd m11, m10 ;m11 = m128iS7 = _mm_unpackhi_epi16(m11 = m128Tmp2, m10 = m128Tmp3);
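+ ;transpose complete: S0..S7 = m0, m14, m2, m12, m1, m4, m5, m11; the second
+ ;(horizontal) pass below repeats the butterfly with IDCT_ADD / IDCT_SHIFT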
+
+ ;m128iAdd = IDCT_ADD (512 for BIT_DEPTH 10, 2048 for BIT_DEPTH 8)
+
+ mova m15, m14 ;m15 = m14, copy m128iS1 to m15
+ punpcklwd m14, m12 ;m14 = m128Tmp0 = _mm_unpacklo_epi16(m14 = m128iS1, m12 = m128iS3);
+ mova m13, [tab_idct8] ;m13 = tab_idct_8x8[0];
+ mova m6, m13 ;m6 = m13, copy tab_idct_8x8[0] to m6
+ pmaddwd m13, m14 ;m13 = E1l = _mm_madd_epi16(m14 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
+ punpckhwd m15, m12 ;m15 = m128Tmp1 = _mm_unpackhi_epi16(m15 = m128iS1, m12 = m128iS3);
+ pmaddwd m6, m15 ;m6 = E1h = _mm_madd_epi16(m15 = m128Tmp1, _mm_load_si128((__m128i*)(m6 = tab_idct_8x8[0])));
+ mova m8, m4 ;m8 = m4, copy m128iS5 to m8
+ punpcklwd m4, m11 ;m4 = m128Tmp2 = _mm_unpacklo_epi16(m4 = m128iS5, m11 = m128iS7);
+ mova m7, [tab_idct8 + 1 * 16] ;m7 = tab_idct_8x8[1];
+ mova m9, m7 ;m9 = m7, copy tab_idct_8x8[1] to m9
+ pmaddwd m7, m4 ;m7 = E2l = _mm_madd_epi16(m4 = _m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
+ punpckhwd m8, m11 ;m8 = m128Tmp3 = _mm_unpackhi_epi16(m8 = m128iS5, m11 = m128iS7);
+ pmaddwd m9, m8 ;m9 = E2h = _mm_madd_epi16(m8 = m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
+ paddd m13, m7 ;m13 = O0l = _mm_add_epi32(m13 = E1l, m7 = E2l);
+ paddd m6, m9 ;m6 = O0h = _mm_add_epi32(m6 = E1h, m9 = E2h);
+ mova m7, [tab_idct8 + 2 * 16] ;m7 = tab_idct_8x8[2];
+ mova m9, m7 ;m9 = m7, copy tab_idct_8x8[2] to m9
+ pmaddwd m7, m14 ;m7 = E1l = _mm_madd_epi16(m14 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
+ pmaddwd m9, m15 ;m9 = E1h = _mm_madd_epi16(m15 = m128Tmp1, _mm_load_si128((__m128i*)(m9 = tab_idct_8x8[2])));
+ mova m10, [tab_idct8 + 3 * 16] ;m10 = tab_idct_8x8[3];
+ mova m12, m10 ;m12 = m10, copy tab_idct_8x8[3] to m12
+ pmaddwd m10, m4 ;m10 = E2l = _mm_madd_epi16(m4 = _m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
+ pmaddwd m12, m8 ;m12 = E2h = _mm_madd_epi16(m8 = m128Tmp3, _mm_load_si128((__m128i*)(m12 = tab_idct_8x8[3])));
+ paddd m7, m10 ;m7 = O1l = _mm_add_epi32(m7 = E1l, m10 = E2l);
+ paddd m9, m12 ;m9 = O1h = _mm_add_epi32(m9 = E1h, m12 = E2h);
+ mova m10, [tab_idct8 + 4 * 16] ;m10 = tab_idct_8x8[4];
+ mova m12, m10 ;m12 = m10, copy tab_idct_8x8[4] to m12
+ pmaddwd m10, m14 ;m10 = E1l = _mm_madd_epi16(m14 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
+ pmaddwd m12, m15 ;m12 = E1h = _mm_madd_epi16(m15 = m128Tmp1, _mm_load_si128((__m128i*)(m12 = tab_idct_8x8[4])));
+ mova m11, [tab_idct8 + 5 * 16] ;m11 = tab_idct_8x8[5];
+ mova m3, m11 ;m3 = m11, copy tab_idct_8x8[5] to m3
+ pmaddwd m11, m4 ;m11 = E2l = _mm_madd_epi16(m4 = _m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
+ pmaddwd m3, m8 ;m3 = E2h = _mm_madd_epi16(m8 = m128Tmp3, _mm_load_si128((__m128i*)(m3 = tab_idct_8x8[5])));
+ paddd m10, m11 ;m10 = O2l = _mm_add_epi32(m10 = E1l, m11 = E2l);
+ paddd m12, m3 ;m12 = O2h = _mm_add_epi32(m12 = E1h, m3 = E2h);
+ pmaddwd m14, [tab_idct8 + 6 * 16] ;m14 = E1l = _mm_madd_epi16(m14 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
+ pmaddwd m15, [tab_idct8 + 6 * 16] ;m15 = E1h = _mm_madd_epi16(m15 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
+ pmaddwd m4, [tab_idct8 + 7 * 16] ;m4 = E2l = _mm_madd_epi16(m4 = m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
+ pmaddwd m8, [tab_idct8 + 7 * 16] ;m8 = E2h = _mm_madd_epi16(m8 = m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
+ paddd m15, m8 ;m15 = O3h = _mm_add_epi32(m15 = E1h, m8 = E2h);
+ paddd m14, m4 ;m14 = O3l = _mm_add_epi32(m14 = E1l, m4 = E2l);
+
+ mova m4, m0 ;m4 = m0, copy m128iS0 to m4
+ punpcklwd m0, m1 ;m0 = m128Tmp0 = _mm_unpacklo_epi16(m0 = m128iS0, m1 = m128iS4);
+ mova m11, m0 ;m11 = m0, copy m128Tmp0 to m11
+ pmaddwd m0, [tab_idct8 + 8 * 16] ;m0 = EE0l = _mm_madd_epi16(m0 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
+ punpckhwd m4, m1 ;m4 = m128Tmp1 = _mm_unpackhi_epi16(m4 = m128iS0, m1 = m128iS4);
+ mova m3, m4 ;m3 = m4, copy m128Tmp1 to m3
+ pmaddwd m4, [tab_idct8 + 8 * 16] ;m4 = EE0h = _mm_madd_epi16(m4 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
+ pmaddwd m11, [tab_idct8 + 9 * 16] ;m11 = EE1l = _mm_madd_epi16(m11 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
+ pmaddwd m3, [tab_idct8 + 9 * 16] ;m3 = EE1h = _mm_madd_epi16(m3 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
+
+ mova m8, m2 ;m8 = m2, copy m128iS2 to m8
+ punpcklwd m2, m5 ;m2 = m128Tmp0 = _mm_unpacklo_epi16(m2 = m128iS2, m5 = m128iS6);
+ mova m1, m2 ;m1 = m2, copy m128Tmp0 to m1
+ pmaddwd m2, [tab_idct8 + 10 * 16] ;m2 = E00l = _mm_madd_epi16(m2 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
+ punpckhwd m8, m5 ;m8 = m128Tmp1 = _mm_unpackhi_epi16(m8 = m128iS2, m5 = m128iS6);
+ mova m5, m8 ;m5 = m8, copy m128Tmp1 to m5
+ mova [r5 - 1 * 16], m2 ;s_1 = m2, copy E00l to stack[1]
+ pmaddwd m5, [tab_idct8 + 10 * 16] ;m5 = E00h = _mm_madd_epi16(m5 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
+ pmaddwd m1, [tab_idct8 + 11 * 16] ;m1 = E01l = _mm_madd_epi16(m1 = m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
+ pmaddwd m8, [tab_idct8 + 11 * 16] ;m8 = E01h = _mm_madd_epi16(m8 = m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
+ mova [r5 - 2 * 16], m5 ;s_2 = m5, copy E00h to stack[2]
+ paddd m2, m0 ;m2 = E0l = _mm_add_epi32(m0 = EE0l, m2 = E00l);
+ paddd m2, [IDCT_ADD] ;m2 = E0l = _mm_add_epi32(m2 = E0l, IDCT_ADD);
+ paddd m5, m4 ;m5 = E0h = _mm_add_epi32(m4 = EE0h, m5 = E00h);
+ paddd m5, [IDCT_ADD] ;m5 = E0h = _mm_add_epi32(m5 = E0h, IDCT_ADD);
+ psubd m0, [r5 - 1 * 16] ;m0 = E3l = _mm_sub_epi32(m0 = EE0l, s_1 = E00l);
+ mova [r5 - 1 * 16], m1 ;s_1 = m1, copy E01l to stack[1]
+ paddd m0, [IDCT_ADD] ;m0 = E3l = _mm_add_epi32(m0 = E3l, IDCT_ADD);
+ psubd m4, [r5 - 2 * 16] ;m4 = E3h = _mm_sub_epi32(m4 = EE0h, s_2 = E00h);
+ paddd m4, [IDCT_ADD] ;m4 = E3h = _mm_add_epi32(m4, IDCT_ADD);
+ paddd m1, m11 ;m1 = E1l = _mm_add_epi32(m11 = EE1l, m1 = E01l);
+ mova [r5 - 2 * 16], m8 ;s_2 = m8, copy E01h to stack[2]
+ paddd m1, [IDCT_ADD] ;m1 = E1l = _mm_add_epi32(m1 = E1l, IDCT_ADD);
+ paddd m8, m3 ;m8 = E1h = _mm_add_epi32(m3 = EE1h, m8 = E01h);
+ paddd m8, [IDCT_ADD] ;m8 = E1h = _mm_add_epi32(m8 = E1h, IDCT_ADD);
+ psubd m11, [r5 - 1 * 16] ;m11 = E2l = _mm_sub_epi32(m11 = EE1l, s_1 = E01l);
+ paddd m11, [IDCT_ADD] ;m11 = E2l = _mm_add_epi32(m11 = E2l, IDCT_ADD);
+ psubd m3, [r5 - 2 * 16] ;m3 = E2h = _mm_sub_epi32(m3 = EE1h, s_2 = E01h);
+ paddd m3, [IDCT_ADD] ;m3 = E2h = _mm_add_epi32(m3 = E2h, IDCT_ADD);
+
+ ;m13 = m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m2 = E0l, m13 = O0l), IDCT_SHIFT), _mm_srai_epi32(_mm_add_epi32(m5 = E0h, m6 = O0h), IDCT_SHIFT));
+ mova [r5 - 1 * 16], m13 ;s_1 = m13, copy O0l to stack[1]
+ paddd m13, m2 ;m13 + m2, add E0l and O0l
+ psrad m13, IDCT_SHIFT ;m13 = _mm_srai_epi32(m13, IDCT_SHIFT)
+ mova [r5 - 2 * 16], m6 ;s_2 = m6, copy O0h to stack[2]
+ paddd m6, m5 ;m6 + m5, add O0h and E0h
+ psrad m6, IDCT_SHIFT ;m6 = _mm_srai_epi32(m6, IDCT_SHIFT)
+ packssdw m13, m6 ;m13 = m128iS0
+ ;m2 = m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m2 = E0l, s_1 = O0l), IDCT_SHIFT), _mm_srai_epi32(_mm_sub_epi32(m5 = E0h, s_2 = O0h), IDCT_SHIFT));
+ psubd m2, [r5 - 1 * 16] ;m2 - s_1, E0l minus O0l
+ psrad m2, IDCT_SHIFT ;m2 = _mm_srai_epi32(m2, IDCT_SHIFT)
+ psubd m5, [r5 - 2 * 16] ;m5 - s_2, E0h minus O0h
+ psrad m5, IDCT_SHIFT ;m5 = _mm_srai_epi32(m5, IDCT_SHIFT)
+ packssdw m2, m5 ;m2 = m128iS7
+ ;m1 = m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m1 = E1l, m7 = O1l), IDCT_SHIFT), _mm_srai_epi32(_mm_add_epi32(m8 = E1h, m9 = O1h), IDCT_SHIFT));
+ mova m5, m1 ;m5 = m1, copy E1l to m5
+ paddd m1, m7 ;m1 + m7, add E1l and O1l
+ psrad m1, IDCT_SHIFT ;m1 = _mm_srai_epi32(m1, IDCT_SHIFT)
+ mova m6, m8 ;m6 = m8, copy E1h to m6
+ paddd m8, m9 ;m8 + m9, add E1h and O1h
+ psrad m8, IDCT_SHIFT ;m8 = _mm_srai_epi32(m8, IDCT_SHIFT)
+ packssdw m1, m8 ;m1 = m128iS1
+ ;m5 = m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m5 = E1l, m7 = O1l), IDCT_SHIFT), _mm_srai_epi32(_mm_sub_epi32(m6 = E1h, m9 = O1h), IDCT_SHIFT));
+ psubd m5, m7 ;m5 - m7, E1l minus O1l
+ psrad m5, IDCT_SHIFT ;m5 = _mm_srai_epi32(m5, IDCT_SHIFT)
+ psubd m6, m9 ;m6 - m9, E1h minus O1h
+ psrad m6, IDCT_SHIFT ;m6 = _mm_srai_epi32(m6, IDCT_SHIFT)
+ packssdw m5, m6 ;m5 = m128iS6
+ ;m11 = m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m11 = E2l, m10 = O2l), IDCT_SHIFT), _mm_srai_epi32(_mm_add_epi32(m3 = E2h, m12 = O2h), IDCT_SHIFT));
+ mova m6, m11 ;m6 = m11, copy E2l to m6
+ paddd m11, m10 ;m11 = _mm_add_epi32(m11 = E2l, m10 = O2l)
+ psrad m11, IDCT_SHIFT ;m11 = _mm_srai_epi32(m11, IDCT_SHIFT)
+ mova m7, m3 ;m7 = m3, copy E2h to m7
+ paddd m3, m12 ;m3 = _mm_add_epi32(m3 = E2h, m12 = O2h)
+ psrad m3, IDCT_SHIFT ;m3 = _mm_srai_epi32(m3, IDCT_SHIFT)
+ packssdw m11, m3 ;m11 = m128iS2 = _mm_packs_epi32(m11, m3)
+ ;m6 = m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m6 = E2l, m10 = O2l), IDCT_SHIFT), _mm_srai_epi32(_mm_sub_epi32(m7 = E2h, m12 = O2h), IDCT_SHIFT));
+ psubd m6, m10 ;m6 - m10, E2l minus O2l
+ psrad m6, IDCT_SHIFT ;m6 = _mm_srai_epi32(m6, IDCT_SHIFT)
+ psubd m7, m12 ;m7 - m12, E2h minus O2h
+ psrad m7, IDCT_SHIFT ;m7 = _mm_srai_epi32(m7, IDCT_SHIFT)
+ packssdw m6, m7 ;m6 = m128iS5
+ ;m0 = m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(m0 = E3l, m14 = O3l), IDCT_SHIFT), _mm_srai_epi32(_mm_add_epi32(m4 = E3h, m15 = O3h), IDCT_SHIFT));
+ mova m8, m0 ;m8 = m0, copy E3l to m8
+ paddd m0, m14 ;m0 + m14, add E3l and O3l
+ psrad m0, IDCT_SHIFT ;m0 = _mm_srai_epi32(m0, IDCT_SHIFT)
+ mova m7, m4 ;m7 = m4, copy E3h to m7
+ paddd m4, m15 ;m4 + m15, add E3h and O3h
+ psrad m4, IDCT_SHIFT ;m4 = _mm_srai_epi32(m4, IDCT_SHIFT)
+ packssdw m0, m4 ;m0 = m128iS3
+ ;m8 = m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(m8 = E3l, m14 = O3l), IDCT_SHIFT), _mm_srai_epi32(_mm_sub_epi32(m7 = E3h, m15 = O3h), IDCT_SHIFT));
+ psubd m8, m14 ;m8 - m14, E3l minus O3l
+ psrad m8, IDCT_SHIFT ;m8 = _mm_srai_epi32(m8, IDCT_SHIFT)
+ psubd m7, m15 ;m7 - m15, E3h minus O3h
+ psrad m7, IDCT_SHIFT ;m7 = _mm_srai_epi32(m7, IDCT_SHIFT)
+ packssdw m8, m7 ;m8 = m128iS4
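+ ;second pass complete: S0..S7 = m13, m1, m11, m0, m8, m6, m5, m2; all that
+ ;remains is the final transpose and the stores to dst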
+
+; // [07 06 05 04 03 02 01 00]
+; // [17 16 15 14 13 12 11 10]
+; // [27 26 25 24 23 22 21 20]
+; // [37 36 35 34 33 32 31 30]
+; // [47 46 45 44 43 42 41 40]
+; // [57 56 55 54 53 52 51 50]
+; // [67 66 65 64 63 62 61 60]
+; // [77 76 75 74 73 72 71 70]
+
+ mova m3, m13 ;m3 = m13, copy m128iS0 to m3
+ punpcklwd m3, m1 ;m3 = T00 = _mm_unpacklo_epi16(m3 = m128iS0, m1 = m128iS1); // [13 03 12 02 11 01 10 00]
+ punpckhwd m13, m1 ;m13 = T01 = _mm_unpackhi_epi16(m13 = m128iS0,m1 = m128iS1); // [17 07 16 06 15 05 14 04]
+ mova m4, m11 ;m4 = m11, copy m128iS2 to m4
+ punpcklwd m11, m0 ;m11 = T02 = _mm_unpacklo_epi16(m11 = m128iS2, m0 = m128iS3); // [33 23 32 22 31 21 30 20]
+ punpckhwd m4, m0 ;m4 = T03 = _mm_unpackhi_epi16(m4 = m128iS2, m0 = m128iS3); // [37 27 36 26 35 25 34 24]
+ mova m1, m8 ;m1 = m8, copy m128iS4 to m1
+ punpcklwd m8, m6 ;m8 = T04 = _mm_unpacklo_epi16(m8 = m128iS4, m6 = m128iS5); // [53 43 52 42 51 41 50 40]
+ punpckhwd m1, m6 ;m1 = T05 = _mm_unpackhi_epi16(m1 = m128iS4, m6 = m128iS5); // [57 47 56 46 55 45 54 44]
+ mova m7, m5 ;m7 = m5, copy m128iS6 to m7
+ punpcklwd m5, m2 ;m5 = T06 = _mm_unpacklo_epi16(m5 = m128iS6, m2 = m128iS7); // [73 63 72 62 71 61 70 60]
+ punpckhwd m7, m2 ;m7 = T07 = _mm_unpackhi_epi16(m7 = m128iS6, m2 = m128iS7); // [77 67 76 66 75 65 74 64]
+
+ ;__m128i T10, T11;
+ mova m0, m3 ;m0 = m3, copy T00 to m0
+ punpckldq m0, m11 ;m0 = T10 = _mm_unpacklo_epi32(m0 = T00, m11 = T02); // [31 21 11 01 30 20 10 00]
+ punpckhdq m3, m11 ;m3 = T11 = _mm_unpackhi_epi32(m3 = T00, m11 = T02); // [33 23 13 03 32 22 12 02]
+ lea r0, [r4 + r2 * 2] ;r0 = byte offset of dst row 7
+ movq [r1], m0 ;_mm_storel_epi64((__m128i*)&dst[0 * stride + 0], m0 = T10); // [30 20 10 00]
+ movq [r1 + r2 * 2], m3 ;_mm_storel_epi64((__m128i*)&dst[2 * stride + 0], m3 = T11); // [32 22 12 02]
+ movhps [r1 + r2 * 1], m0 ;_mm_storeh_pi((__m64*)&dst[1 * stride + 0], _mm_castsi128_ps(m0 = T10)); // [31 21 11 01]
+ movhps [r1 + r3], m3 ;_mm_storeh_pi((__m64*)&dst[3 * stride + 0], _mm_castsi128_ps(m3 = T11)); // [33 23 13 03]
+
+ mova m2, m8 ;m2 = m8, copy T04 to m2
+ punpckldq m2, m5 ;m2 = T10 = _mm_unpacklo_epi32(m2 = T04, m5 = T06); // [71 61 51 41 70 60 50 40]
+ punpckhdq m8, m5 ;m8 = T11 = _mm_unpackhi_epi32(m8 = T04, m5 = T06); // [73 63 53 43 72 62 52 42]
+ movq [r1 + 8], m2 ;_mm_storel_epi64((__m128i*)&dst[0 * stride + 4], m2 = T10);
+ movq [r1 + r2 * 2 + 8], m8 ;_mm_storel_epi64((__m128i*)&dst[2 * stride + 4], m8 = T11);
+ movhps [r1 + r2 * 1 + 8], m2 ;_mm_storeh_pi((__m64*)&dst[1 * stride + 4], _mm_castsi128_ps(m2 = T10));
+ movhps [r1 + r3 + 8], m8 ;_mm_storeh_pi((__m64*)&dst[3 * stride + 4], _mm_castsi128_ps(m8 = T11));
+ mova m11, m0
+ punpcklqdq m0, m2
+ punpckhqdq m11, m2
+ mova [r1], m0
+ mova [r1 + r2 * 1], m11
+
+ mova m5, m3
+ punpcklqdq m3, m8
+ punpckhqdq m5, m8
+ mova [r1 + r2 * 2], m3
+ mova [r1 + r3], m5
+
+ mova m6, m13 ;m6 = m13, copy T01 to m6
+ punpckldq m6, m4 ;m6 = T10 = _mm_unpacklo_epi32(m6 = T01, m4 = T03); // [35 25 15 05 34 24 14 04]
+ punpckhdq m13, m4 ;m13 = T11 = _mm_unpackhi_epi32(m13 = T01, m4 = T03); // [37 27 17 07 36 26 16 06]
+ movq [r1 + r2 * 4], m6 ;_mm_storel_epi64((__m128i*)&dst[4 * stride + 0], m6 = T10);
+ movq [r1 + r3 * 2], m13 ;_mm_storel_epi64((__m128i*)&dst[6 * stride + 0], m13 = T11);
+ mova m9, m1 ;m9 = m1, copy T05 to m9
+ movhps [r1 + r4], m6 ;_mm_storeh_pi((__m64*)&dst[5 * stride + 0], _mm_castsi128_ps(m6 = T10));
+ movhps [r1 + r0], m13 ;_mm_storeh_pi((__m64*)&dst[7 * stride + 0], _mm_castsi128_ps(m13 = T11));
+
+ punpckldq m1, m7 ;m1 = T10 = _mm_unpacklo_epi32(m1 = T05, m7 = T07); // [75 65 55 45 74 64 54 44]
+ punpckhdq m9, m7 ;m9 = T11 = _mm_unpackhi_epi32(m9 = T05, m7 = T07); // [77 67 57 47 76 66 56 46]
+ movq [r1 + r2 * 4 + 8], m1 ;_mm_storel_epi64((__m128i*)&dst[4 * stride + 4], m1 = T10);
+ movq [r1 + r3 * 2 + 8], m9 ;_mm_storel_epi64((__m128i*)&dst[6 * stride + 4], m9 = T11);
+ movhps [r1 + r4 + 8], m1 ;_mm_storeh_pi((__m64*)&dst[5 * stride + 4], _mm_castsi128_ps(m1 = T10));
+ movhps [r1 + r0 + 8], m9 ;_mm_storeh_pi((__m64*)&dst[7 * stride + 4], _mm_castsi128_ps(m9 = T11));
+
+ RET
+%undef IDCT_SHIFT
+%undef IDCT_ADD
+
+;-------------------------------------------------------
; void idct8(int16_t *src, int16_t *dst, intptr_t stride)
;-------------------------------------------------------
INIT_XMM ssse3
diff -r 1d17ec0cb954 -r f4a932dba699 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Thu Nov 20 20:04:02 2014 +0530
+++ b/source/common/x86/dct8.h Thu Nov 20 19:21:53 2014 -0800
@@ -35,6 +35,7 @@
void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
+void x265_idct8_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
void x265_idct8_ssse3(const int16_t* src, int16_t* dst, intptr_t stride);
void x265_idct8_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t stride);