<div dir="ltr">Thanks, pushed. We appreciate all help in accelerating 16/32 DCT/IDCT primitives. <br></div><div class="gmail_extra"><br><div class="gmail_quote">On Mon, Jan 19, 2015 at 11:14 PM,  <span dir="ltr">&lt;<a href="mailto:dtyx265@gmail.com" target="_blank">dtyx265@gmail.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User David T Yuen &lt;<a href="mailto:dtyx265@gmail.com">dtyx265@gmail.com</a>&gt;<br>
# Date 1421689416 28800<br>
# Node ID fd4481542b452a01b790ab677e6a7209675b965b<br>
# Parent  4f8b7cc9d51e1102b2d2b27d5a19f97576ddde63<br>
asm: idct16 intrinsic 28900->25000 improvement over previous intrinsic<br>
<br>
diff -r 4f8b7cc9d51e -r fd4481542b45 source/common/vec/dct-sse3.cpp<br>
--- a/source/common/vec/dct-sse3.cpp    Mon Jan 19 18:21:50 2015 +0800<br>
+++ b/source/common/vec/dct-sse3.cpp    Mon Jan 19 09:43:36 2015 -0800<br>
@@ -291,6 +291,254 @@<br>
<br>
 void idct16(const int16_t *src, int16_t *dst, intptr_t stride)<br>
 {<br>
+#define READ_UNPACKHILO(offset)\<br>
+    const __m128i T_00_00A = _mm_unpacklo_epi16(*(__m128i*)&src[1 * 16 + offset], *(__m128i*)&src[3 * 16 + offset]);\<br>
+    const __m128i T_00_00B = _mm_unpackhi_epi16(*(__m128i*)&src[1 * 16 + offset], *(__m128i*)&src[3 * 16 + offset]);\<br>
+    const __m128i T_00_01A = _mm_unpacklo_epi16(*(__m128i*)&src[5 * 16 + offset], *(__m128i*)&src[7 * 16 + offset]);\<br>
+    const __m128i T_00_01B = _mm_unpackhi_epi16(*(__m128i*)&src[5 * 16 + offset], *(__m128i*)&src[7 * 16 + offset]);\<br>
+    const __m128i T_00_02A = _mm_unpacklo_epi16(*(__m128i*)&src[9 * 16 + offset], *(__m128i*)&src[11 * 16 + offset]);\<br>
+    const __m128i T_00_02B = _mm_unpackhi_epi16(*(__m128i*)&src[9 * 16 + offset], *(__m128i*)&src[11 * 16 + offset]);\<br>
+    const __m128i T_00_03A = _mm_unpacklo_epi16(*(__m128i*)&src[13 * 16 + offset], *(__m128i*)&src[15 * 16 + offset]);\<br>
+    const __m128i T_00_03B = _mm_unpackhi_epi16(*(__m128i*)&src[13 * 16 + offset], *(__m128i*)&src[15 * 16 + offset]);\<br>
+    const __m128i T_00_04A = _mm_unpacklo_epi16(*(__m128i*)&src[2 * 16 + offset], *(__m128i*)&src[6 * 16 + offset]);\<br>
+    const __m128i T_00_04B = _mm_unpackhi_epi16(*(__m128i*)&src[2 * 16 + offset], *(__m128i*)&src[6 * 16 + offset]);\<br>
+    const __m128i T_00_05A = _mm_unpacklo_epi16(*(__m128i*)&src[10 * 16 + offset], *(__m128i*)&src[14 * 16 + offset]);\<br>
+    const __m128i T_00_05B = _mm_unpackhi_epi16(*(__m128i*)&src[10 * 16 + offset], *(__m128i*)&src[14 * 16 + offset]);\<br>
+    const __m128i T_00_06A = _mm_unpacklo_epi16(*(__m128i*)&src[4 * 16 + offset], *(__m128i*)&src[12 * 16 + offset]);\<br>
+    const __m128i T_00_06B = _mm_unpackhi_epi16(*(__m128i*)&src[4 * 16 + offset], *(__m128i*)&src[12 * 16 + offset]);\<br>
+    const __m128i T_00_07A = _mm_unpacklo_epi16(*(__m128i*)&src[0 * 16 + offset], *(__m128i*)&src[8 * 16 + offset]);\<br>
+    const __m128i T_00_07B = _mm_unpackhi_epi16(*(__m128i*)&src[0 * 16 + offset], *(__m128i*)&src[8 * 16 + offset]);<br>
+<br>
+#define UNPACKHILO(part) \<br>
+    const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]);\<br>
+    const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]);\<br>
+    const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]);\<br>
+    const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]);\<br>
+    const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part], in11[part]);\<br>
+    const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part], in11[part]);\<br>
+    const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part], in15[part]);\<br>
+    const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part], in15[part]);\<br>
+    const __m128i T_00_04A = _mm_unpacklo_epi16(in02[part], in06[part]);\<br>
+    const __m128i T_00_04B = _mm_unpackhi_epi16(in02[part], in06[part]);\<br>
+    const __m128i T_00_05A = _mm_unpacklo_epi16(in10[part], in14[part]);\<br>
+    const __m128i T_00_05B = _mm_unpackhi_epi16(in10[part], in14[part]);\<br>
+    const __m128i T_00_06A = _mm_unpacklo_epi16(in04[part], in12[part]);\<br>
+    const __m128i T_00_06B = _mm_unpackhi_epi16(in04[part], in12[part]);\<br>
+    const __m128i T_00_07A = _mm_unpacklo_epi16(in00[part], in08[part]);\<br>
+    const __m128i T_00_07B = _mm_unpackhi_epi16(in00[part], in08[part]);<br>
+<br>
+#define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507, c0911, c1315, row) \<br>
+    T00 = _mm_add_epi32(_mm_madd_epi16(row0103, c0103), _mm_madd_epi16(row0507, c0507)); \<br>
+    T01 = _mm_add_epi32(_mm_madd_epi16(row0911, c0911), _mm_madd_epi16(row1315, c1315)); \<br>
+    row = _mm_add_epi32(T00, T01);<br>
+<br>
+#define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \<br>
+    tr0_0 = _mm_unpacklo_epi16(I0, I1); \<br>
+    tr0_1 = _mm_unpacklo_epi16(I2, I3); \<br>
+    tr0_2 = _mm_unpackhi_epi16(I0, I1); \<br>
+    tr0_3 = _mm_unpackhi_epi16(I2, I3); \<br>
+    tr0_4 = _mm_unpacklo_epi16(I4, I5); \<br>
+    tr0_5 = _mm_unpacklo_epi16(I6, I7); \<br>
+    tr0_6 = _mm_unpackhi_epi16(I4, I5); \<br>
+    tr0_7 = _mm_unpackhi_epi16(I6, I7); \<br>
+    tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \<br>
+    tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \<br>
+    tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \<br>
+    tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \<br>
+    tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \<br>
+    tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \<br>
+    tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \<br>
+    tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \<br>
+    O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \<br>
+    O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \<br>
+    O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \<br>
+    O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \<br>
+    O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \<br>
+    O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \<br>
+    O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \<br>
+    O7 = _mm_unpackhi_epi64(tr1_3, tr1_7);<br>
+<br>
+#define PROCESS(part, rnd, shift) \<br>
+    __m128i c32_rnd = _mm_set1_epi32(rnd);\<br>
+    int nShift = shift;\<br>
+\<br>
+    __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A;\<br>
+    __m128i O0B, O1B, O2B, O3B, O4B, O5B, O6B, O7B;\<br>
+    {\<br>
+        __m128i T00, T01;\<br>
+\<br>
+        COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0A)\<br>
+        COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1A)\<br>
+        COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2A)\<br>
+        COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3A)\<br>
+        COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4A)\<br>
+        COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, O5A)\<br>
+        COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6A)\<br>
+        COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7A)\<br>
+\<br>
+        COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0B)\<br>
+        COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1B)\<br>
+        COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2B)\<br>
+        COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3B)\<br>
+        COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4B)\<br>
+        COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, O5B)\<br>
+        COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6B)\<br>
+        COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7B)\<br>
+    }\<br>
+\<br>
+    __m128i EO0A, EO1A, EO2A, EO3A;\<br>
+    __m128i EO0B, EO1B, EO2B, EO3B;\<br>
+    EO0A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_p75_p89), _mm_madd_epi16(T_00_05A, c16_p18_p50));\<br>
+    EO0B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_p75_p89), _mm_madd_epi16(T_00_05B, c16_p18_p50));\<br>
+    EO1A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n18_p75), _mm_madd_epi16(T_00_05A, c16_n50_n89));\<br>
+    EO1B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n18_p75), _mm_madd_epi16(T_00_05B, c16_n50_n89));\<br>
+    EO2A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n89_p50), _mm_madd_epi16(T_00_05A, c16_p75_p18));\<br>
+    EO2B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n89_p50), _mm_madd_epi16(T_00_05B, c16_p75_p18));\<br>
+    EO3A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n50_p18), _mm_madd_epi16(T_00_05A, c16_n89_p75));\<br>
+    EO3B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n50_p18), _mm_madd_epi16(T_00_05B, c16_n89_p75));\<br>
+\<br>
+    __m128i EEO0A, EEO1A;\<br>
+    __m128i EEO0B, EEO1B;\<br>
+    EEO0A = _mm_madd_epi16(T_00_06A, c16_p36_p83);\<br>
+    EEO0B = _mm_madd_epi16(T_00_06B, c16_p36_p83);\<br>
+    EEO1A = _mm_madd_epi16(T_00_06A, c16_n83_p36);\<br>
+    EEO1B = _mm_madd_epi16(T_00_06B, c16_n83_p36);\<br>
+\<br>
+    __m128i EEE0A, EEE1A;\<br>
+    __m128i EEE0B, EEE1B;\<br>
+    EEE0A = _mm_madd_epi16(T_00_07A, c16_p64_p64);\<br>
+    EEE0B = _mm_madd_epi16(T_00_07B, c16_p64_p64);\<br>
+    EEE1A = _mm_madd_epi16(T_00_07A, c16_n64_p64);\<br>
+    EEE1B = _mm_madd_epi16(T_00_07B, c16_n64_p64);\<br>
+\<br>
+    const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A);\<br>
+    const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B);\<br>
+    const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A);\<br>
+    const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B);\<br>
+    const __m128i EE3A = _mm_sub_epi32(EEE0A, EEO0A);\<br>
+    const __m128i EE3B = _mm_sub_epi32(EEE0B, EEO0B);\<br>
+    const __m128i EE2A = _mm_sub_epi32(EEE1A, EEO1A);\<br>
+    const __m128i EE2B = _mm_sub_epi32(EEE1B, EEO1B);\<br>
+\<br>
+    const __m128i E0A = _mm_add_epi32(EE0A, EO0A);\<br>
+    const __m128i E0B = _mm_add_epi32(EE0B, EO0B);\<br>
+    const __m128i E1A = _mm_add_epi32(EE1A, EO1A);\<br>
+    const __m128i E1B = _mm_add_epi32(EE1B, EO1B);\<br>
+    const __m128i E2A = _mm_add_epi32(EE2A, EO2A);\<br>
+    const __m128i E2B = _mm_add_epi32(EE2B, EO2B);\<br>
+    const __m128i E3A = _mm_add_epi32(EE3A, EO3A);\<br>
+    const __m128i E3B = _mm_add_epi32(EE3B, EO3B);\<br>
+    const __m128i E7A = _mm_sub_epi32(EE0A, EO0A);\<br>
+    const __m128i E7B = _mm_sub_epi32(EE0B, EO0B);\<br>
+    const __m128i E6A = _mm_sub_epi32(EE1A, EO1A);\<br>
+    const __m128i E6B = _mm_sub_epi32(EE1B, EO1B);\<br>
+    const __m128i E5A = _mm_sub_epi32(EE2A, EO2A);\<br>
+    const __m128i E5B = _mm_sub_epi32(EE2B, EO2B);\<br>
+    const __m128i E4A = _mm_sub_epi32(EE3A, EO3A);\<br>
+    const __m128i E4B = _mm_sub_epi32(EE3B, EO3B);\<br>
+\<br>
+    const __m128i T10A = _mm_add_epi32(E0A, c32_rnd);\<br>
+    const __m128i T10B = _mm_add_epi32(E0B, c32_rnd);\<br>
+    const __m128i T11A = _mm_add_epi32(E1A, c32_rnd);\<br>
+    const __m128i T11B = _mm_add_epi32(E1B, c32_rnd);\<br>
+    const __m128i T12A = _mm_add_epi32(E2A, c32_rnd);\<br>
+    const __m128i T12B = _mm_add_epi32(E2B, c32_rnd);\<br>
+    const __m128i T13A = _mm_add_epi32(E3A, c32_rnd);\<br>
+    const __m128i T13B = _mm_add_epi32(E3B, c32_rnd);\<br>
+    const __m128i T14A = _mm_add_epi32(E4A, c32_rnd);\<br>
+    const __m128i T14B = _mm_add_epi32(E4B, c32_rnd);\<br>
+    const __m128i T15A = _mm_add_epi32(E5A, c32_rnd);\<br>
+    const __m128i T15B = _mm_add_epi32(E5B, c32_rnd);\<br>
+    const __m128i T16A = _mm_add_epi32(E6A, c32_rnd);\<br>
+    const __m128i T16B = _mm_add_epi32(E6B, c32_rnd);\<br>
+    const __m128i T17A = _mm_add_epi32(E7A, c32_rnd);\<br>
+    const __m128i T17B = _mm_add_epi32(E7B, c32_rnd);\<br>
+\<br>
+    const __m128i T20A = _mm_add_epi32(T10A, O0A);\<br>
+    const __m128i T20B = _mm_add_epi32(T10B, O0B);\<br>
+    const __m128i T21A = _mm_add_epi32(T11A, O1A);\<br>
+    const __m128i T21B = _mm_add_epi32(T11B, O1B);\<br>
+    const __m128i T22A = _mm_add_epi32(T12A, O2A);\<br>
+    const __m128i T22B = _mm_add_epi32(T12B, O2B);\<br>
+    const __m128i T23A = _mm_add_epi32(T13A, O3A);\<br>
+    const __m128i T23B = _mm_add_epi32(T13B, O3B);\<br>
+    const __m128i T24A = _mm_add_epi32(T14A, O4A);\<br>
+    const __m128i T24B = _mm_add_epi32(T14B, O4B);\<br>
+    const __m128i T25A = _mm_add_epi32(T15A, O5A);\<br>
+    const __m128i T25B = _mm_add_epi32(T15B, O5B);\<br>
+    const __m128i T26A = _mm_add_epi32(T16A, O6A);\<br>
+    const __m128i T26B = _mm_add_epi32(T16B, O6B);\<br>
+    const __m128i T27A = _mm_add_epi32(T17A, O7A);\<br>
+    const __m128i T27B = _mm_add_epi32(T17B, O7B);\<br>
+    const __m128i T2FA = _mm_sub_epi32(T10A, O0A);\<br>
+    const __m128i T2FB = _mm_sub_epi32(T10B, O0B);\<br>
+    const __m128i T2EA = _mm_sub_epi32(T11A, O1A);\<br>
+    const __m128i T2EB = _mm_sub_epi32(T11B, O1B);\<br>
+    const __m128i T2DA = _mm_sub_epi32(T12A, O2A);\<br>
+    const __m128i T2DB = _mm_sub_epi32(T12B, O2B);\<br>
+    const __m128i T2CA = _mm_sub_epi32(T13A, O3A);\<br>
+    const __m128i T2CB = _mm_sub_epi32(T13B, O3B);\<br>
+    const __m128i T2BA = _mm_sub_epi32(T14A, O4A);\<br>
+    const __m128i T2BB = _mm_sub_epi32(T14B, O4B);\<br>
+    const __m128i T2AA = _mm_sub_epi32(T15A, O5A);\<br>
+    const __m128i T2AB = _mm_sub_epi32(T15B, O5B);\<br>
+    const __m128i T29A = _mm_sub_epi32(T16A, O6A);\<br>
+    const __m128i T29B = _mm_sub_epi32(T16B, O6B);\<br>
+    const __m128i T28A = _mm_sub_epi32(T17A, O7A);\<br>
+    const __m128i T28B = _mm_sub_epi32(T17B, O7B);\<br>
+\<br>
+    const __m128i T30A = _mm_srai_epi32(T20A, nShift);\<br>
+    const __m128i T30B = _mm_srai_epi32(T20B, nShift);\<br>
+    const __m128i T31A = _mm_srai_epi32(T21A, nShift);\<br>
+    const __m128i T31B = _mm_srai_epi32(T21B, nShift);\<br>
+    const __m128i T32A = _mm_srai_epi32(T22A, nShift);\<br>
+    const __m128i T32B = _mm_srai_epi32(T22B, nShift);\<br>
+    const __m128i T33A = _mm_srai_epi32(T23A, nShift);\<br>
+    const __m128i T33B = _mm_srai_epi32(T23B, nShift);\<br>
+    const __m128i T34A = _mm_srai_epi32(T24A, nShift);\<br>
+    const __m128i T34B = _mm_srai_epi32(T24B, nShift);\<br>
+    const __m128i T35A = _mm_srai_epi32(T25A, nShift);\<br>
+    const __m128i T35B = _mm_srai_epi32(T25B, nShift);\<br>
+    const __m128i T36A = _mm_srai_epi32(T26A, nShift);\<br>
+    const __m128i T36B = _mm_srai_epi32(T26B, nShift);\<br>
+    const __m128i T37A = _mm_srai_epi32(T27A, nShift);\<br>
+    const __m128i T37B = _mm_srai_epi32(T27B, nShift);\<br>
+\<br>
+    const __m128i T38A = _mm_srai_epi32(T28A, nShift);\<br>
+    const __m128i T38B = _mm_srai_epi32(T28B, nShift);\<br>
+    const __m128i T39A = _mm_srai_epi32(T29A, nShift);\<br>
+    const __m128i T39B = _mm_srai_epi32(T29B, nShift);\<br>
+    const __m128i T3AA = _mm_srai_epi32(T2AA, nShift);\<br>
+    const __m128i T3AB = _mm_srai_epi32(T2AB, nShift);\<br>
+    const __m128i T3BA = _mm_srai_epi32(T2BA, nShift);\<br>
+    const __m128i T3BB = _mm_srai_epi32(T2BB, nShift);\<br>
+    const __m128i T3CA = _mm_srai_epi32(T2CA, nShift);\<br>
+    const __m128i T3CB = _mm_srai_epi32(T2CB, nShift);\<br>
+    const __m128i T3DA = _mm_srai_epi32(T2DA, nShift);\<br>
+    const __m128i T3DB = _mm_srai_epi32(T2DB, nShift);\<br>
+    const __m128i T3EA = _mm_srai_epi32(T2EA, nShift);\<br>
+    const __m128i T3EB = _mm_srai_epi32(T2EB, nShift);\<br>
+    const __m128i T3FA = _mm_srai_epi32(T2FA, nShift);\<br>
+    const __m128i T3FB = _mm_srai_epi32(T2FB, nShift);\<br>
+\<br>
+    res00[part]  = _mm_packs_epi32(T30A, T30B);\<br>
+    res01[part]  = _mm_packs_epi32(T31A, T31B);\<br>
+    res02[part]  = _mm_packs_epi32(T32A, T32B);\<br>
+    res03[part]  = _mm_packs_epi32(T33A, T33B);\<br>
+    res04[part]  = _mm_packs_epi32(T34A, T34B);\<br>
+    res05[part]  = _mm_packs_epi32(T35A, T35B);\<br>
+    res06[part]  = _mm_packs_epi32(T36A, T36B);\<br>
+    res07[part]  = _mm_packs_epi32(T37A, T37B);\<br>
+\<br>
+    res08[part]  = _mm_packs_epi32(T38A, T38B);\<br>
+    res09[part]  = _mm_packs_epi32(T39A, T39B);\<br>
+    res10[part]  = _mm_packs_epi32(T3AA, T3AB);\<br>
+    res11[part]  = _mm_packs_epi32(T3BA, T3BB);\<br>
+    res12[part]  = _mm_packs_epi32(T3CA, T3CB);\<br>
+    res13[part]  = _mm_packs_epi32(T3DA, T3DB);\<br>
+    res14[part]  = _mm_packs_epi32(T3EA, T3EB);\<br>
+    res15[part]  = _mm_packs_epi32(T3FA, T3FB);<br>
+<br>
     const __m128i c16_p87_p90   = _mm_set1_epi32(0x0057005A); //row0 87high - 90low address<br>
     const __m128i c16_p70_p80   = _mm_set1_epi32(0x00460050);<br>
     const __m128i c16_p43_p57   = _mm_set1_epi32(0x002B0039);<br>
@@ -338,9 +586,6 @@<br>
<br>
     const __m128i c16_n64_p64   = _mm_set1_epi32(0xFFC00040);<br>
     const __m128i c16_p64_p64   = _mm_set1_epi32(0x00400040);<br>
-    __m128i c32_rnd             = _mm_set1_epi32(64);<br>
-<br>
-    int nShift = 7;<br>
<br>
     // DCT1<br>
     __m128i in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2];<br>
@@ -348,308 +593,79 @@<br>
     __m128i res00[2], res01[2], res02[2], res03[2], res04[2], res05[2], res06[2], res07[2];<br>
     __m128i res08[2], res09[2], res10[2], res11[2], res12[2], res13[2], res14[2], res15[2];<br>
<br>
-    for (int i = 0; i < 2; i++)<br>
     {<br>
-        const int offset = (i << 3);<br>
-        in00[i]  = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset]); // [07 06 05 04 03 02 01 00]<br>
-        in01[i]  = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset]); // [17 16 15 14 13 12 11 10]<br>
-        in02[i]  = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset]); // [27 26 25 24 23 22 21 20]<br>
-        in03[i]  = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset]); // [37 36 35 34 33 32 31 30]<br>
-        in04[i]  = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset]); // [47 46 45 44 43 42 41 40]<br>
-        in05[i]  = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset]); // [57 56 55 54 53 52 51 50]<br>
-        in06[i]  = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset]); // [67 66 65 64 63 62 61 60]<br>
-        in07[i]  = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset]); // [77 76 75 74 73 72 71 70]<br>
-        in08[i]  = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset]);<br>
-        in09[i]  = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset]);<br>
-        in10[i]  = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset]);<br>
-        in11[i]  = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset]);<br>
-        in12[i]  = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset]);<br>
-        in13[i]  = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset]);<br>
-        in14[i]  = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset]);<br>
-        in15[i]  = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset]);<br>
+        READ_UNPACKHILO(0)<br>
+        PROCESS(0, 64, 7)<br>
     }<br>
<br>
-    for (int pass = 0; pass < 2; pass++)<br>
     {<br>
-        if (pass == 1)<br>
-        {<br>
-            c32_rnd = _mm_set1_epi32(2048);<br>
-            nShift  = 12;<br>
-        }<br>
-<br>
-        for (int part = 0; part < 2; part++)<br>
-        {<br>
-            const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]);       // [33 13 32 12 31 11 30 10]<br>
-            const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]);       // [37 17 36 16 35 15 34 14]<br>
-            const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]);       // [ ]<br>
-            const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]);       // [ ]<br>
-            const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part], in11[part]);       // [ ]<br>
-            const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part], in11[part]);       // [ ]<br>
-            const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part], in15[part]);       // [ ]<br>
-            const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part], in15[part]);       // [ ]<br>
-            const __m128i T_00_04A = _mm_unpacklo_epi16(in02[part], in06[part]);       // [ ]<br>
-            const __m128i T_00_04B = _mm_unpackhi_epi16(in02[part], in06[part]);       // [ ]<br>
-            const __m128i T_00_05A = _mm_unpacklo_epi16(in10[part], in14[part]);       // [ ]<br>
-            const __m128i T_00_05B = _mm_unpackhi_epi16(in10[part], in14[part]);       // [ ]<br>
-            const __m128i T_00_06A = _mm_unpacklo_epi16(in04[part], in12[part]);       // [ ]row<br>
-            const __m128i T_00_06B = _mm_unpackhi_epi16(in04[part], in12[part]);       // [ ]<br>
-            const __m128i T_00_07A = _mm_unpacklo_epi16(in00[part], in08[part]);       // [83 03 82 02 81 01 81 00] row08 row00<br>
-            const __m128i T_00_07B = _mm_unpackhi_epi16(in00[part], in08[part]);       // [87 07 86 06 85 05 84 04]<br>
-<br>
-            __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A;<br>
-            __m128i O0B, O1B, O2B, O3B, O4B, O5B, O6B, O7B;<br>
-            {<br>
-                __m128i T00, T01;<br>
-#define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507, c0911, c1315, row) \<br>
-    T00 = _mm_add_epi32(_mm_madd_epi16(row0103, c0103), _mm_madd_epi16(row0507, c0507)); \<br>
-    T01 = _mm_add_epi32(_mm_madd_epi16(row0911, c0911), _mm_madd_epi16(row1315, c1315)); \<br>
-    row = _mm_add_epi32(T00, T01);<br>
-<br>
-                COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0A)<br>
-                COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1A)<br>
-                COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2A)<br>
-                COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3A)<br>
-                COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4A)<br>
-                COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, O5A)<br>
-                COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6A)<br>
-                COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7A)<br>
-<br>
-                COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0B)<br>
-                COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1B)<br>
-                COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2B)<br>
-                COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3B)<br>
-                COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4B)<br>
-                COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, O5B)<br>
-                COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6B)<br>
-                COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7B)<br>
-#undef COMPUTE_ROW<br>
-            }<br>
-<br>
-            __m128i EO0A, EO1A, EO2A, EO3A;<br>
-            __m128i EO0B, EO1B, EO2B, EO3B;<br>
-            EO0A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_p75_p89), _mm_madd_epi16(T_00_05A, c16_p18_p50)); // EO0<br>
-            EO0B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_p75_p89), _mm_madd_epi16(T_00_05B, c16_p18_p50));<br>
-            EO1A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n18_p75), _mm_madd_epi16(T_00_05A, c16_n50_n89)); // EO1<br>
-            EO1B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n18_p75), _mm_madd_epi16(T_00_05B, c16_n50_n89));<br>
-            EO2A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n89_p50), _mm_madd_epi16(T_00_05A, c16_p75_p18)); // EO2<br>
-            EO2B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n89_p50), _mm_madd_epi16(T_00_05B, c16_p75_p18));<br>
-            EO3A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n50_p18), _mm_madd_epi16(T_00_05A, c16_n89_p75)); // EO3<br>
-            EO3B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n50_p18), _mm_madd_epi16(T_00_05B, c16_n89_p75));<br>
-<br>
-            __m128i EEO0A, EEO1A;<br>
-            __m128i EEO0B, EEO1B;<br>
-            EEO0A = _mm_madd_epi16(T_00_06A, c16_p36_p83);<br>
-            EEO0B = _mm_madd_epi16(T_00_06B, c16_p36_p83);<br>
-            EEO1A = _mm_madd_epi16(T_00_06A, c16_n83_p36);<br>
-            EEO1B = _mm_madd_epi16(T_00_06B, c16_n83_p36);<br>
-<br>
-            __m128i EEE0A, EEE1A;<br>
-            __m128i EEE0B, EEE1B;<br>
-            EEE0A = _mm_madd_epi16(T_00_07A, c16_p64_p64);<br>
-            EEE0B = _mm_madd_epi16(T_00_07B, c16_p64_p64);<br>
-            EEE1A = _mm_madd_epi16(T_00_07A, c16_n64_p64);<br>
-            EEE1B = _mm_madd_epi16(T_00_07B, c16_n64_p64);<br>
-<br>
-            const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A);          // EE0 = EEE0 + EEO0<br>
-            const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B);<br>
-            const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A);          // EE1 = EEE1 + EEO1<br>
-            const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B);<br>
-            const __m128i EE3A = _mm_sub_epi32(EEE0A, EEO0A);          // EE2 = EEE0 - EEO0<br>
-            const __m128i EE3B = _mm_sub_epi32(EEE0B, EEO0B);<br>
-            const __m128i EE2A = _mm_sub_epi32(EEE1A, EEO1A);          // EE3 = EEE1 - EEO1<br>
-            const __m128i EE2B = _mm_sub_epi32(EEE1B, EEO1B);<br>
-<br>
-            const __m128i E0A = _mm_add_epi32(EE0A, EO0A);          // E0 = EE0 + EO0<br>
-            const __m128i E0B = _mm_add_epi32(EE0B, EO0B);<br>
-            const __m128i E1A = _mm_add_epi32(EE1A, EO1A);          // E1 = EE1 + EO1<br>
-            const __m128i E1B = _mm_add_epi32(EE1B, EO1B);<br>
-            const __m128i E2A = _mm_add_epi32(EE2A, EO2A);          // E2 = EE2 + EO2<br>
-            const __m128i E2B = _mm_add_epi32(EE2B, EO2B);<br>
-            const __m128i E3A = _mm_add_epi32(EE3A, EO3A);          // E3 = EE3 + EO3<br>
-            const __m128i E3B = _mm_add_epi32(EE3B, EO3B);<br>
-            const __m128i E7A = _mm_sub_epi32(EE0A, EO0A);          // E0 = EE0 - EO0<br>
-            const __m128i E7B = _mm_sub_epi32(EE0B, EO0B);<br>
-            const __m128i E6A = _mm_sub_epi32(EE1A, EO1A);          // E1 = EE1 - EO1<br>
-            const __m128i E6B = _mm_sub_epi32(EE1B, EO1B);<br>
-            const __m128i E5A = _mm_sub_epi32(EE2A, EO2A);          // E2 = EE2 - EO2<br>
-            const __m128i E5B = _mm_sub_epi32(EE2B, EO2B);<br>
-            const __m128i E4A = _mm_sub_epi32(EE3A, EO3A);          // E3 = EE3 - EO3<br>
-            const __m128i E4B = _mm_sub_epi32(EE3B, EO3B);<br>
-<br>
-            const __m128i T10A = _mm_add_epi32(E0A, c32_rnd);         // E0 + rnd<br>
-            const __m128i T10B = _mm_add_epi32(E0B, c32_rnd);<br>
-            const __m128i T11A = _mm_add_epi32(E1A, c32_rnd);         // E1 + rnd<br>
-            const __m128i T11B = _mm_add_epi32(E1B, c32_rnd);<br>
-            const __m128i T12A = _mm_add_epi32(E2A, c32_rnd);         // E2 + rnd<br>
-            const __m128i T12B = _mm_add_epi32(E2B, c32_rnd);<br>
-            const __m128i T13A = _mm_add_epi32(E3A, c32_rnd);         // E3 + rnd<br>
-            const __m128i T13B = _mm_add_epi32(E3B, c32_rnd);<br>
-            const __m128i T14A = _mm_add_epi32(E4A, c32_rnd);         // E4 + rnd<br>
-            const __m128i T14B = _mm_add_epi32(E4B, c32_rnd);<br>
-            const __m128i T15A = _mm_add_epi32(E5A, c32_rnd);         // E5 + rnd<br>
-            const __m128i T15B = _mm_add_epi32(E5B, c32_rnd);<br>
-            const __m128i T16A = _mm_add_epi32(E6A, c32_rnd);         // E6 + rnd<br>
-            const __m128i T16B = _mm_add_epi32(E6B, c32_rnd);<br>
-            const __m128i T17A = _mm_add_epi32(E7A, c32_rnd);         // E7 + rnd<br>
-            const __m128i T17B = _mm_add_epi32(E7B, c32_rnd);<br>
-<br>
-            const __m128i T20A = _mm_add_epi32(T10A, O0A);          // E0 + O0 + rnd<br>
-            const __m128i T20B = _mm_add_epi32(T10B, O0B);<br>
-            const __m128i T21A = _mm_add_epi32(T11A, O1A);          // E1 + O1 + rnd<br>
-            const __m128i T21B = _mm_add_epi32(T11B, O1B);<br>
-            const __m128i T22A = _mm_add_epi32(T12A, O2A);          // E2 + O2 + rnd<br>
-            const __m128i T22B = _mm_add_epi32(T12B, O2B);<br>
-            const __m128i T23A = _mm_add_epi32(T13A, O3A);          // E3 + O3 + rnd<br>
-            const __m128i T23B = _mm_add_epi32(T13B, O3B);<br>
-            const __m128i T24A = _mm_add_epi32(T14A, O4A);          // E4<br>
-            const __m128i T24B = _mm_add_epi32(T14B, O4B);<br>
-            const __m128i T25A = _mm_add_epi32(T15A, O5A);          // E5<br>
-            const __m128i T25B = _mm_add_epi32(T15B, O5B);<br>
-            const __m128i T26A = _mm_add_epi32(T16A, O6A);          // E6<br>
-            const __m128i T26B = _mm_add_epi32(T16B, O6B);<br>
-            const __m128i T27A = _mm_add_epi32(T17A, O7A);          // E7<br>
-            const __m128i T27B = _mm_add_epi32(T17B, O7B);<br>
-            const __m128i T2FA = _mm_sub_epi32(T10A, O0A);          // E0 - O0 + rnd<br>
-            const __m128i T2FB = _mm_sub_epi32(T10B, O0B);<br>
-            const __m128i T2EA = _mm_sub_epi32(T11A, O1A);          // E1 - O1 + rnd<br>
-            const __m128i T2EB = _mm_sub_epi32(T11B, O1B);<br>
-            const __m128i T2DA = _mm_sub_epi32(T12A, O2A);          // E2 - O2 + rnd<br>
-            const __m128i T2DB = _mm_sub_epi32(T12B, O2B);<br>
-            const __m128i T2CA = _mm_sub_epi32(T13A, O3A);          // E3 - O3 + rnd<br>
-            const __m128i T2CB = _mm_sub_epi32(T13B, O3B);<br>
-            const __m128i T2BA = _mm_sub_epi32(T14A, O4A);          // E4<br>
-            const __m128i T2BB = _mm_sub_epi32(T14B, O4B);<br>
-            const __m128i T2AA = _mm_sub_epi32(T15A, O5A);          // E5<br>
-            const __m128i T2AB = _mm_sub_epi32(T15B, O5B);<br>
-            const __m128i T29A = _mm_sub_epi32(T16A, O6A);          // E6<br>
-            const __m128i T29B = _mm_sub_epi32(T16B, O6B);<br>
-            const __m128i T28A = _mm_sub_epi32(T17A, O7A);          // E7<br>
-            const __m128i T28B = _mm_sub_epi32(T17B, O7B);<br>
-<br>
-            const __m128i T30A = _mm_srai_epi32(T20A, nShift);             // [30 20 10 00]<br>
-            const __m128i T30B = _mm_srai_epi32(T20B, nShift);             // [70 60 50 40]<br>
-            const __m128i T31A = _mm_srai_epi32(T21A, nShift);             // [31 21 11 01]<br>
-            const __m128i T31B = _mm_srai_epi32(T21B, nShift);             // [71 61 51 41]<br>
-            const __m128i T32A = _mm_srai_epi32(T22A, nShift);             // [32 22 12 02]<br>
-            const __m128i T32B = _mm_srai_epi32(T22B, nShift);             // [72 62 52 42]<br>
-            const __m128i T33A = _mm_srai_epi32(T23A, nShift);             // [33 23 13 03]<br>
-            const __m128i T33B = _mm_srai_epi32(T23B, nShift);             // [73 63 53 43]<br>
-            const __m128i T34A = _mm_srai_epi32(T24A, nShift);             // [33 24 14 04]<br>
-            const __m128i T34B = _mm_srai_epi32(T24B, nShift);             // [74 64 54 44]<br>
-            const __m128i T35A = _mm_srai_epi32(T25A, nShift);             // [35 25 15 05]<br>
-            const __m128i T35B = _mm_srai_epi32(T25B, nShift);             // [75 65 55 45]<br>
-            const __m128i T36A = _mm_srai_epi32(T26A, nShift);             // [36 26 16 06]<br>
-            const __m128i T36B = _mm_srai_epi32(T26B, nShift);             // [76 66 56 46]<br>
-            const __m128i T37A = _mm_srai_epi32(T27A, nShift);             // [37 27 17 07]<br>
-            const __m128i T37B = _mm_srai_epi32(T27B, nShift);             // [77 67 57 47]<br>
-<br>
-            const __m128i T38A = _mm_srai_epi32(T28A, nShift);             // [30 20 10 00] x8<br>
-            const __m128i T38B = _mm_srai_epi32(T28B, nShift);             // [70 60 50 40]<br>
-            const __m128i T39A = _mm_srai_epi32(T29A, nShift);             // [31 21 11 01] x9<br>
-            const __m128i T39B = _mm_srai_epi32(T29B, nShift);             // [71 61 51 41]<br>
-            const __m128i T3AA = _mm_srai_epi32(T2AA, nShift);             // [32 22 12 02] xA<br>
-            const __m128i T3AB = _mm_srai_epi32(T2AB, nShift);             // [72 62 52 42]<br>
-            const __m128i T3BA = _mm_srai_epi32(T2BA, nShift);             // [33 23 13 03] xB<br>
-            const __m128i T3BB = _mm_srai_epi32(T2BB, nShift);             // [73 63 53 43]<br>
-            const __m128i T3CA = _mm_srai_epi32(T2CA, nShift);             // [33 24 14 04] xC<br>
-            const __m128i T3CB = _mm_srai_epi32(T2CB, nShift);             // [74 64 54 44]<br>
-            const __m128i T3DA = _mm_srai_epi32(T2DA, nShift);             // [35 25 15 05] xD<br>
-            const __m128i T3DB = _mm_srai_epi32(T2DB, nShift);             // [75 65 55 45]<br>
-            const __m128i T3EA = _mm_srai_epi32(T2EA, nShift);             // [36 26 16 06] xE<br>
-            const __m128i T3EB = _mm_srai_epi32(T2EB, nShift);             // [76 66 56 46]<br>
-            const __m128i T3FA = _mm_srai_epi32(T2FA, nShift);             // [37 27 17 07] xF<br>
-            const __m128i T3FB = _mm_srai_epi32(T2FB, nShift);             // [77 67 57 47]<br>
-<br>
-            res00[part]  = _mm_packs_epi32(T30A, T30B);        // [70 60 50 40 30 20 10 00]<br>
-            res01[part]  = _mm_packs_epi32(T31A, T31B);        // [71 61 51 41 31 21 11 01]<br>
-            res02[part]  = _mm_packs_epi32(T32A, T32B);        // [72 62 52 42 32 22 12 02]<br>
-            res03[part]  = _mm_packs_epi32(T33A, T33B);        // [73 63 53 43 33 23 13 03]<br>
-            res04[part]  = _mm_packs_epi32(T34A, T34B);        // [74 64 54 44 34 24 14 04]<br>
-            res05[part]  = _mm_packs_epi32(T35A, T35B);        // [75 65 55 45 35 25 15 05]<br>
-            res06[part]  = _mm_packs_epi32(T36A, T36B);        // [76 66 56 46 36 26 16 06]<br>
-            res07[part]  = _mm_packs_epi32(T37A, T37B);        // [77 67 57 47 37 27 17 07]<br>
-<br>
-            res08[part]  = _mm_packs_epi32(T38A, T38B);        // [A0 ... 80]<br>
-            res09[part]  = _mm_packs_epi32(T39A, T39B);        // [A1 ... 81]<br>
-            res10[part]  = _mm_packs_epi32(T3AA, T3AB);        // [A2 ... 82]<br>
-            res11[part]  = _mm_packs_epi32(T3BA, T3BB);        // [A3 ... 83]<br>
-            res12[part]  = _mm_packs_epi32(T3CA, T3CB);        // [A4 ... 84]<br>
-            res13[part]  = _mm_packs_epi32(T3DA, T3DB);        // [A5 ... 85]<br>
-            res14[part]  = _mm_packs_epi32(T3EA, T3EB);        // [A6 ... 86]<br>
-            res15[part]  = _mm_packs_epi32(T3FA, T3FB);        // [A7 ... 87]<br>
-        }<br>
-        //transpose matrix 8x8 16bit.<br>
-        {<br>
-            __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;<br>
-            __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7;<br>
-#define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \<br>
-    tr0_0 = _mm_unpacklo_epi16(I0, I1); \<br>
-    tr0_1 = _mm_unpacklo_epi16(I2, I3); \<br>
-    tr0_2 = _mm_unpackhi_epi16(I0, I1); \<br>
-    tr0_3 = _mm_unpackhi_epi16(I2, I3); \<br>
-    tr0_4 = _mm_unpacklo_epi16(I4, I5); \<br>
-    tr0_5 = _mm_unpacklo_epi16(I6, I7); \<br>
-    tr0_6 = _mm_unpackhi_epi16(I4, I5); \<br>
-    tr0_7 = _mm_unpackhi_epi16(I6, I7); \<br>
-    tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \<br>
-    tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \<br>
-    tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \<br>
-    tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \<br>
-    tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \<br>
-    tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \<br>
-    tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \<br>
-    tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \<br>
-    O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \<br>
-    O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \<br>
-    O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \<br>
-    O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \<br>
-    O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \<br>
-    O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \<br>
-    O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \<br>
-    O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \<br>
-<br>
-            TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0])<br>
-            TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1])<br>
-            TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0])<br>
-            TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1])<br>
-<br>
-#undef TRANSPOSE_8x8_16BIT<br>
-        }<br>
+        READ_UNPACKHILO(8)<br>
+        PROCESS(1, 64, 7)<br>
+    }<br>
+    {<br>
+        __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;<br>
+        __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7;<br>
+        TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0])<br>
+        TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1])<br>
+        TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0])<br>
+        TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1])<br>
     }<br>
<br>
-    _mm_store_si128((__m128i*)&dst[0 * stride + 0], in00[0]);<br>
-    _mm_store_si128((__m128i*)&dst[0 * stride + 8], in00[1]);<br>
-    _mm_store_si128((__m128i*)&dst[1 * stride + 0], in01[0]);<br>
-    _mm_store_si128((__m128i*)&dst[1 * stride + 8], in01[1]);<br>
-    _mm_store_si128((__m128i*)&dst[2 * stride + 0], in02[0]);<br>
-    _mm_store_si128((__m128i*)&dst[2 * stride + 8], in02[1]);<br>
-    _mm_store_si128((__m128i*)&dst[3 * stride + 0], in03[0]);<br>
-    _mm_store_si128((__m128i*)&dst[3 * stride + 8], in03[1]);<br>
-    _mm_store_si128((__m128i*)&dst[4 * stride + 0], in04[0]);<br>
-    _mm_store_si128((__m128i*)&dst[4 * stride + 8], in04[1]);<br>
-    _mm_store_si128((__m128i*)&dst[5 * stride + 0], in05[0]);<br>
-    _mm_store_si128((__m128i*)&dst[5 * stride + 8], in05[1]);<br>
-    _mm_store_si128((__m128i*)&dst[6 * stride + 0], in06[0]);<br>
-    _mm_store_si128((__m128i*)&dst[6 * stride + 8], in06[1]);<br>
-    _mm_store_si128((__m128i*)&dst[7 * stride + 0], in07[0]);<br>
-    _mm_store_si128((__m128i*)&dst[7 * stride + 8], in07[1]);<br>
-    _mm_store_si128((__m128i*)&dst[8 * stride + 0], in08[0]);<br>
-    _mm_store_si128((__m128i*)&dst[8 * stride + 8], in08[1]);<br>
-    _mm_store_si128((__m128i*)&dst[9 * stride + 0], in09[0]);<br>
-    _mm_store_si128((__m128i*)&dst[9 * stride + 8], in09[1]);<br>
-    _mm_store_si128((__m128i*)&dst[10 * stride + 0], in10[0]);<br>
-    _mm_store_si128((__m128i*)&dst[10 * stride + 8], in10[1]);<br>
-    _mm_store_si128((__m128i*)&dst[11 * stride + 0], in11[0]);<br>
-    _mm_store_si128((__m128i*)&dst[11 * stride + 8], in11[1]);<br>
-    _mm_store_si128((__m128i*)&dst[12 * stride + 0], in12[0]);<br>
-    _mm_store_si128((__m128i*)&dst[12 * stride + 8], in12[1]);<br>
-    _mm_store_si128((__m128i*)&dst[13 * stride + 0], in13[0]);<br>
-    _mm_store_si128((__m128i*)&dst[13 * stride + 8], in13[1]);<br>
-    _mm_store_si128((__m128i*)&dst[14 * stride + 0], in14[0]);<br>
-    _mm_store_si128((__m128i*)&dst[14 * stride + 8], in14[1]);<br>
-    _mm_store_si128((__m128i*)&dst[15 * stride + 0], in15[0]);<br>
-    _mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]);<br>
+    {<br>
+        UNPACKHILO(0)<br>
+        PROCESS(0, 2048, 12)<br>
+    }<br>
+    {<br>
+        UNPACKHILO(1)<br>
+        PROCESS(1, 2048, 12)<br>
+    }<br>
+<br>
+    {<br>
+        __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;<br>
+        __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7;<br>
+        TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0])<br>
+        _mm_store_si128((__m128i*)&dst[0 * stride + 0], in00[0]);<br>
+        _mm_store_si128((__m128i*)&dst[1 * stride + 0], in01[0]);<br>
+        _mm_store_si128((__m128i*)&dst[2 * stride + 0], in02[0]);<br>
+        _mm_store_si128((__m128i*)&dst[3 * stride + 0], in03[0]);<br>
+        _mm_store_si128((__m128i*)&dst[4 * stride + 0], in04[0]);<br>
+        _mm_store_si128((__m128i*)&dst[5 * stride + 0], in05[0]);<br>
+        _mm_store_si128((__m128i*)&dst[6 * stride + 0], in06[0]);<br>
+        _mm_store_si128((__m128i*)&dst[7 * stride + 0], in07[0]);<br>
+        TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1])<br>
+        _mm_store_si128((__m128i*)&dst[0 * stride + 8], in00[1]);<br>
+        _mm_store_si128((__m128i*)&dst[1 * stride + 8], in01[1]);<br>
+        _mm_store_si128((__m128i*)&dst[2 * stride + 8], in02[1]);<br>
+        _mm_store_si128((__m128i*)&dst[3 * stride + 8], in03[1]);<br>
+        _mm_store_si128((__m128i*)&dst[4 * stride + 8], in04[1]);<br>
+        _mm_store_si128((__m128i*)&dst[5 * stride + 8], in05[1]);<br>
+        _mm_store_si128((__m128i*)&dst[6 * stride + 8], in06[1]);<br>
+        _mm_store_si128((__m128i*)&dst[7 * stride + 8], in07[1]);<br>
+        TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0])<br>
+        _mm_store_si128((__m128i*)&dst[8 * stride + 0], in08[0]);<br>
+        _mm_store_si128((__m128i*)&dst[9 * stride + 0], in09[0]);<br>
+        _mm_store_si128((__m128i*)&dst[10 * stride + 0], in10[0]);<br>
+        _mm_store_si128((__m128i*)&dst[11 * stride + 0], in11[0]);<br>
+        _mm_store_si128((__m128i*)&dst[12 * stride + 0], in12[0]);<br>
+        _mm_store_si128((__m128i*)&dst[13 * stride + 0], in13[0]);<br>
+        _mm_store_si128((__m128i*)&dst[14 * stride + 0], in14[0]);<br>
+        _mm_store_si128((__m128i*)&dst[15 * stride + 0], in15[0]);<br>
+        TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1])<br>
+        _mm_store_si128((__m128i*)&dst[8 * stride + 8], in08[1]);<br>
+        _mm_store_si128((__m128i*)&dst[9 * stride + 8], in09[1]);<br>
+        _mm_store_si128((__m128i*)&dst[10 * stride + 8], in10[1]);<br>
+        _mm_store_si128((__m128i*)&dst[11 * stride + 8], in11[1]);<br>
+        _mm_store_si128((__m128i*)&dst[12 * stride + 8], in12[1]);<br>
+        _mm_store_si128((__m128i*)&dst[13 * stride + 8], in13[1]);<br>
+        _mm_store_si128((__m128i*)&dst[14 * stride + 8], in14[1]);<br>
+        _mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]);<br>
+    }<br>
 }<br>
+#undef PROCESS<br>
+#undef TRANSPOSE_8x8_16BIT<br>
+#undef COMPUTE_ROW<br>
+#undef UNPACKHILO<br>
+#undef READ_UNPACKHILO<br>
<br>
 void idct32(const int16_t *src, int16_t *dst, intptr_t stride)<br>
 {<br>
<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
<br>
</blockquote></div><br></div>