[x265] [PATCH] asm: idct16 intrinsic 28900->25000 improvement over previous intrinsic

dtyx265 at gmail.com dtyx265 at gmail.com
Mon Jan 19 18:44:55 CET 2015


# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1421689416 28800
# Node ID fd4481542b452a01b790ab677e6a7209675b965b
# Parent  4f8b7cc9d51e1102b2d2b27d5a19f97576ddde63
asm: idct16 intrinsic 28900->25000 improvement over previous intrinsic

diff -r 4f8b7cc9d51e -r fd4481542b45 source/common/vec/dct-sse3.cpp
--- a/source/common/vec/dct-sse3.cpp	Mon Jan 19 18:21:50 2015 +0800
+++ b/source/common/vec/dct-sse3.cpp	Mon Jan 19 09:43:36 2015 -0800
@@ -291,6 +291,254 @@
 
 void idct16(const int16_t *src, int16_t *dst, intptr_t stride)
 {
+#define READ_UNPACKHILO(offset)/* Declares T_00_00A..T_00_07B by interleaving the 16-bit elements of two src rows (row stride 16) read at column `offset`. NOTE(review): src is dereferenced as __m128i* where the removed code used _mm_loadu_si128, so this presumably requires 16-byte-aligned src -- confirm. */\
+    const __m128i T_00_00A = _mm_unpacklo_epi16(*(__m128i*)&src[1 * 16 + offset], *(__m128i*)&src[3 * 16 + offset]);/* rows 1 and 3, low four elements */\
+    const __m128i T_00_00B = _mm_unpackhi_epi16(*(__m128i*)&src[1 * 16 + offset], *(__m128i*)&src[3 * 16 + offset]);/* rows 1 and 3, high four elements */\
+    const __m128i T_00_01A = _mm_unpacklo_epi16(*(__m128i*)&src[5 * 16 + offset], *(__m128i*)&src[7 * 16 + offset]);\
+    const __m128i T_00_01B = _mm_unpackhi_epi16(*(__m128i*)&src[5 * 16 + offset], *(__m128i*)&src[7 * 16 + offset]);\
+    const __m128i T_00_02A = _mm_unpacklo_epi16(*(__m128i*)&src[9 * 16 + offset], *(__m128i*)&src[11 * 16 + offset]);\
+    const __m128i T_00_02B = _mm_unpackhi_epi16(*(__m128i*)&src[9 * 16 + offset], *(__m128i*)&src[11 * 16 + offset]);\
+    const __m128i T_00_03A = _mm_unpacklo_epi16(*(__m128i*)&src[13 * 16 + offset], *(__m128i*)&src[15 * 16 + offset]);\
+    const __m128i T_00_03B = _mm_unpackhi_epi16(*(__m128i*)&src[13 * 16 + offset], *(__m128i*)&src[15 * 16 + offset]);\
+    const __m128i T_00_04A = _mm_unpacklo_epi16(*(__m128i*)&src[2 * 16 + offset], *(__m128i*)&src[6 * 16 + offset]);\
+    const __m128i T_00_04B = _mm_unpackhi_epi16(*(__m128i*)&src[2 * 16 + offset], *(__m128i*)&src[6 * 16 + offset]);\
+    const __m128i T_00_05A = _mm_unpacklo_epi16(*(__m128i*)&src[10 * 16 + offset], *(__m128i*)&src[14 * 16 + offset]);\
+    const __m128i T_00_05B = _mm_unpackhi_epi16(*(__m128i*)&src[10 * 16 + offset], *(__m128i*)&src[14 * 16 + offset]);\
+    const __m128i T_00_06A = _mm_unpacklo_epi16(*(__m128i*)&src[4 * 16 + offset], *(__m128i*)&src[12 * 16 + offset]);\
+    const __m128i T_00_06B = _mm_unpackhi_epi16(*(__m128i*)&src[4 * 16 + offset], *(__m128i*)&src[12 * 16 + offset]);\
+    const __m128i T_00_07A = _mm_unpacklo_epi16(*(__m128i*)&src[0 * 16 + offset], *(__m128i*)&src[8 * 16 + offset]);\
+    const __m128i T_00_07B = _mm_unpackhi_epi16(*(__m128i*)&src[0 * 16 + offset], *(__m128i*)&src[8 * 16 + offset]);
+
+#define UNPACKHILO(part) /* Declares T_00_00A..T_00_07B by interleaving the 16-bit elements of the register pairs in01/in03, in05/in07, in09/in11, in13/in15, in02/in06, in10/in14, in04/in12 and in00/in08 at index `part`; the in00..in15 arrays must be in scope. */\
+    const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]);/* low four elements of each operand */\
+    const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]);/* high four elements of each operand */\
+    const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]);\
+    const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]);\
+    const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part], in11[part]);\
+    const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part], in11[part]);\
+    const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part], in15[part]);\
+    const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part], in15[part]);\
+    const __m128i T_00_04A = _mm_unpacklo_epi16(in02[part], in06[part]);\
+    const __m128i T_00_04B = _mm_unpackhi_epi16(in02[part], in06[part]);\
+    const __m128i T_00_05A = _mm_unpacklo_epi16(in10[part], in14[part]);\
+    const __m128i T_00_05B = _mm_unpackhi_epi16(in10[part], in14[part]);\
+    const __m128i T_00_06A = _mm_unpacklo_epi16(in04[part], in12[part]);\
+    const __m128i T_00_06B = _mm_unpackhi_epi16(in04[part], in12[part]);\
+    const __m128i T_00_07A = _mm_unpacklo_epi16(in00[part], in08[part]);\
+    const __m128i T_00_07B = _mm_unpackhi_epi16(in00[part], in08[part]);
+
+#define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507, c0911, c1315, row) /* row = sum of the four _mm_madd_epi16 products of each rowXXYY vector with its cXXYY coefficient vector (32-bit lanes); overwrites T00 and T01, which the expansion site must declare. */\
+    T00 = _mm_add_epi32(_mm_madd_epi16(row0103, c0103), _mm_madd_epi16(row0507, c0507)); \
+    T01 = _mm_add_epi32(_mm_madd_epi16(row0911, c0911), _mm_madd_epi16(row1315, c1315)); \
+    row = _mm_add_epi32(T00, T01);
+
+#define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) /* Transposes the 8x8 matrix of 16-bit elements held in I0..I7 into O0..O7; overwrites tr0_0..tr0_7 and tr1_0..tr1_7, which the expansion site must declare. */\
+    tr0_0 = _mm_unpacklo_epi16(I0, I1); /* stage 1: interleave 16-bit elements of adjacent input rows */\
+    tr0_1 = _mm_unpacklo_epi16(I2, I3); \
+    tr0_2 = _mm_unpackhi_epi16(I0, I1); \
+    tr0_3 = _mm_unpackhi_epi16(I2, I3); \
+    tr0_4 = _mm_unpacklo_epi16(I4, I5); \
+    tr0_5 = _mm_unpacklo_epi16(I6, I7); \
+    tr0_6 = _mm_unpackhi_epi16(I4, I5); \
+    tr0_7 = _mm_unpackhi_epi16(I6, I7); \
+    tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* stage 2: interleave 32-bit pairs, giving four elements of one output row per half register */\
+    tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
+    tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+    tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
+    tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+    tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
+    tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+    tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
+    O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); /* stage 3: join the 64-bit halves from I0..I3 and I4..I7 into full output rows */\
+    O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+    O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+    O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+    O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
+    O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
+    O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
+    O7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+
+#define PROCESS(part, rnd, shift) /* One butterfly pass over the T_00_00A..T_00_07B vectors in scope: stores (E +/- O + rnd) >> shift, packed to 16 bits, into res00..res15[part]. Needs the c16_ coefficient constants and res00..res15 in scope; declares many locals, so expand it at most once per { } scope. */\
+    __m128i c32_rnd = _mm_set1_epi32(rnd);\
+    int nShift = shift;\
+/* O0..O7: sums built from T_00_00..T_00_03 (the 1/3, 5/7, 9/11 and 13/15 pairs) */\
+    __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A;\
+    __m128i O0B, O1B, O2B, O3B, O4B, O5B, O6B, O7B;\
+    {\
+        __m128i T00, T01;/* scratch registers used by COMPUTE_ROW */\
+\
+        COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0A)\
+        COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1A)\
+        COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2A)\
+        COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3A)\
+        COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4A)\
+        COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, O5A)\
+        COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6A)\
+        COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7A)\
+/* same coefficient sets applied to the B (unpackhi) vectors */\
+        COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0B)\
+        COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1B)\
+        COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2B)\
+        COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3B)\
+        COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4B)\
+        COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, O5B)\
+        COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6B)\
+        COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7B)\
+    }\
+/* EO0..EO3: sums built from T_00_04 and T_00_05 (the 2/6 and 10/14 pairs) */\
+    __m128i EO0A, EO1A, EO2A, EO3A;\
+    __m128i EO0B, EO1B, EO2B, EO3B;\
+    EO0A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_p75_p89), _mm_madd_epi16(T_00_05A, c16_p18_p50));\
+    EO0B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_p75_p89), _mm_madd_epi16(T_00_05B, c16_p18_p50));\
+    EO1A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n18_p75), _mm_madd_epi16(T_00_05A, c16_n50_n89));\
+    EO1B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n18_p75), _mm_madd_epi16(T_00_05B, c16_n50_n89));\
+    EO2A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n89_p50), _mm_madd_epi16(T_00_05A, c16_p75_p18));\
+    EO2B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n89_p50), _mm_madd_epi16(T_00_05B, c16_p75_p18));\
+    EO3A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n50_p18), _mm_madd_epi16(T_00_05A, c16_n89_p75));\
+    EO3B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n50_p18), _mm_madd_epi16(T_00_05B, c16_n89_p75));\
+/* EEO0..EEO1: products of T_00_06 (the 4/12 pair) */\
+    __m128i EEO0A, EEO1A;\
+    __m128i EEO0B, EEO1B;\
+    EEO0A = _mm_madd_epi16(T_00_06A, c16_p36_p83);\
+    EEO0B = _mm_madd_epi16(T_00_06B, c16_p36_p83);\
+    EEO1A = _mm_madd_epi16(T_00_06A, c16_n83_p36);\
+    EEO1B = _mm_madd_epi16(T_00_06B, c16_n83_p36);\
+/* EEE0..EEE1: products of T_00_07 (the 0/8 pair) */\
+    __m128i EEE0A, EEE1A;\
+    __m128i EEE0B, EEE1B;\
+    EEE0A = _mm_madd_epi16(T_00_07A, c16_p64_p64);\
+    EEE0B = _mm_madd_epi16(T_00_07B, c16_p64_p64);\
+    EEE1A = _mm_madd_epi16(T_00_07A, c16_n64_p64);\
+    EEE1B = _mm_madd_epi16(T_00_07B, c16_n64_p64);\
+/* EE0, EE1 = EEE + EEO; EE3, EE2 = EEE - EEO */\
+    const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A);\
+    const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B);\
+    const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A);\
+    const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B);\
+    const __m128i EE3A = _mm_sub_epi32(EEE0A, EEO0A);\
+    const __m128i EE3B = _mm_sub_epi32(EEE0B, EEO0B);\
+    const __m128i EE2A = _mm_sub_epi32(EEE1A, EEO1A);\
+    const __m128i EE2B = _mm_sub_epi32(EEE1B, EEO1B);\
+/* E0..E3 = EE + EO; E7..E4 = EE - EO */\
+    const __m128i E0A = _mm_add_epi32(EE0A, EO0A);\
+    const __m128i E0B = _mm_add_epi32(EE0B, EO0B);\
+    const __m128i E1A = _mm_add_epi32(EE1A, EO1A);\
+    const __m128i E1B = _mm_add_epi32(EE1B, EO1B);\
+    const __m128i E2A = _mm_add_epi32(EE2A, EO2A);\
+    const __m128i E2B = _mm_add_epi32(EE2B, EO2B);\
+    const __m128i E3A = _mm_add_epi32(EE3A, EO3A);\
+    const __m128i E3B = _mm_add_epi32(EE3B, EO3B);\
+    const __m128i E7A = _mm_sub_epi32(EE0A, EO0A);\
+    const __m128i E7B = _mm_sub_epi32(EE0B, EO0B);\
+    const __m128i E6A = _mm_sub_epi32(EE1A, EO1A);\
+    const __m128i E6B = _mm_sub_epi32(EE1B, EO1B);\
+    const __m128i E5A = _mm_sub_epi32(EE2A, EO2A);\
+    const __m128i E5B = _mm_sub_epi32(EE2B, EO2B);\
+    const __m128i E4A = _mm_sub_epi32(EE3A, EO3A);\
+    const __m128i E4B = _mm_sub_epi32(EE3B, EO3B);\
+/* T10..T17 = E0..E7 + rnd */\
+    const __m128i T10A = _mm_add_epi32(E0A, c32_rnd);\
+    const __m128i T10B = _mm_add_epi32(E0B, c32_rnd);\
+    const __m128i T11A = _mm_add_epi32(E1A, c32_rnd);\
+    const __m128i T11B = _mm_add_epi32(E1B, c32_rnd);\
+    const __m128i T12A = _mm_add_epi32(E2A, c32_rnd);\
+    const __m128i T12B = _mm_add_epi32(E2B, c32_rnd);\
+    const __m128i T13A = _mm_add_epi32(E3A, c32_rnd);\
+    const __m128i T13B = _mm_add_epi32(E3B, c32_rnd);\
+    const __m128i T14A = _mm_add_epi32(E4A, c32_rnd);\
+    const __m128i T14B = _mm_add_epi32(E4B, c32_rnd);\
+    const __m128i T15A = _mm_add_epi32(E5A, c32_rnd);\
+    const __m128i T15B = _mm_add_epi32(E5B, c32_rnd);\
+    const __m128i T16A = _mm_add_epi32(E6A, c32_rnd);\
+    const __m128i T16B = _mm_add_epi32(E6B, c32_rnd);\
+    const __m128i T17A = _mm_add_epi32(E7A, c32_rnd);\
+    const __m128i T17B = _mm_add_epi32(E7B, c32_rnd);\
+/* T20..T27 = (E + rnd) + O; T2F..T28 = (E + rnd) - O for O0..O7 respectively */\
+    const __m128i T20A = _mm_add_epi32(T10A, O0A);\
+    const __m128i T20B = _mm_add_epi32(T10B, O0B);\
+    const __m128i T21A = _mm_add_epi32(T11A, O1A);\
+    const __m128i T21B = _mm_add_epi32(T11B, O1B);\
+    const __m128i T22A = _mm_add_epi32(T12A, O2A);\
+    const __m128i T22B = _mm_add_epi32(T12B, O2B);\
+    const __m128i T23A = _mm_add_epi32(T13A, O3A);\
+    const __m128i T23B = _mm_add_epi32(T13B, O3B);\
+    const __m128i T24A = _mm_add_epi32(T14A, O4A);\
+    const __m128i T24B = _mm_add_epi32(T14B, O4B);\
+    const __m128i T25A = _mm_add_epi32(T15A, O5A);\
+    const __m128i T25B = _mm_add_epi32(T15B, O5B);\
+    const __m128i T26A = _mm_add_epi32(T16A, O6A);\
+    const __m128i T26B = _mm_add_epi32(T16B, O6B);\
+    const __m128i T27A = _mm_add_epi32(T17A, O7A);\
+    const __m128i T27B = _mm_add_epi32(T17B, O7B);\
+    const __m128i T2FA = _mm_sub_epi32(T10A, O0A);\
+    const __m128i T2FB = _mm_sub_epi32(T10B, O0B);\
+    const __m128i T2EA = _mm_sub_epi32(T11A, O1A);\
+    const __m128i T2EB = _mm_sub_epi32(T11B, O1B);\
+    const __m128i T2DA = _mm_sub_epi32(T12A, O2A);\
+    const __m128i T2DB = _mm_sub_epi32(T12B, O2B);\
+    const __m128i T2CA = _mm_sub_epi32(T13A, O3A);\
+    const __m128i T2CB = _mm_sub_epi32(T13B, O3B);\
+    const __m128i T2BA = _mm_sub_epi32(T14A, O4A);\
+    const __m128i T2BB = _mm_sub_epi32(T14B, O4B);\
+    const __m128i T2AA = _mm_sub_epi32(T15A, O5A);\
+    const __m128i T2AB = _mm_sub_epi32(T15B, O5B);\
+    const __m128i T29A = _mm_sub_epi32(T16A, O6A);\
+    const __m128i T29B = _mm_sub_epi32(T16B, O6B);\
+    const __m128i T28A = _mm_sub_epi32(T17A, O7A);\
+    const __m128i T28B = _mm_sub_epi32(T17B, O7B);\
+/* T30..T3F = T20..T2F arithmetically shifted right by nShift */\
+    const __m128i T30A = _mm_srai_epi32(T20A, nShift);\
+    const __m128i T30B = _mm_srai_epi32(T20B, nShift);\
+    const __m128i T31A = _mm_srai_epi32(T21A, nShift);\
+    const __m128i T31B = _mm_srai_epi32(T21B, nShift);\
+    const __m128i T32A = _mm_srai_epi32(T22A, nShift);\
+    const __m128i T32B = _mm_srai_epi32(T22B, nShift);\
+    const __m128i T33A = _mm_srai_epi32(T23A, nShift);\
+    const __m128i T33B = _mm_srai_epi32(T23B, nShift);\
+    const __m128i T34A = _mm_srai_epi32(T24A, nShift);\
+    const __m128i T34B = _mm_srai_epi32(T24B, nShift);\
+    const __m128i T35A = _mm_srai_epi32(T25A, nShift);\
+    const __m128i T35B = _mm_srai_epi32(T25B, nShift);\
+    const __m128i T36A = _mm_srai_epi32(T26A, nShift);\
+    const __m128i T36B = _mm_srai_epi32(T26B, nShift);\
+    const __m128i T37A = _mm_srai_epi32(T27A, nShift);\
+    const __m128i T37B = _mm_srai_epi32(T27B, nShift);\
+\
+    const __m128i T38A = _mm_srai_epi32(T28A, nShift);\
+    const __m128i T38B = _mm_srai_epi32(T28B, nShift);\
+    const __m128i T39A = _mm_srai_epi32(T29A, nShift);\
+    const __m128i T39B = _mm_srai_epi32(T29B, nShift);\
+    const __m128i T3AA = _mm_srai_epi32(T2AA, nShift);\
+    const __m128i T3AB = _mm_srai_epi32(T2AB, nShift);\
+    const __m128i T3BA = _mm_srai_epi32(T2BA, nShift);\
+    const __m128i T3BB = _mm_srai_epi32(T2BB, nShift);\
+    const __m128i T3CA = _mm_srai_epi32(T2CA, nShift);\
+    const __m128i T3CB = _mm_srai_epi32(T2CB, nShift);\
+    const __m128i T3DA = _mm_srai_epi32(T2DA, nShift);\
+    const __m128i T3DB = _mm_srai_epi32(T2DB, nShift);\
+    const __m128i T3EA = _mm_srai_epi32(T2EA, nShift);\
+    const __m128i T3EB = _mm_srai_epi32(T2EB, nShift);\
+    const __m128i T3FA = _mm_srai_epi32(T2FA, nShift);\
+    const __m128i T3FB = _mm_srai_epi32(T2FB, nShift);\
+/* pack each A/B pair of 32-bit results into eight 16-bit values with signed saturation */\
+    res00[part]  = _mm_packs_epi32(T30A, T30B);\
+    res01[part]  = _mm_packs_epi32(T31A, T31B);\
+    res02[part]  = _mm_packs_epi32(T32A, T32B);\
+    res03[part]  = _mm_packs_epi32(T33A, T33B);\
+    res04[part]  = _mm_packs_epi32(T34A, T34B);\
+    res05[part]  = _mm_packs_epi32(T35A, T35B);\
+    res06[part]  = _mm_packs_epi32(T36A, T36B);\
+    res07[part]  = _mm_packs_epi32(T37A, T37B);\
+\
+    res08[part]  = _mm_packs_epi32(T38A, T38B);\
+    res09[part]  = _mm_packs_epi32(T39A, T39B);\
+    res10[part]  = _mm_packs_epi32(T3AA, T3AB);\
+    res11[part]  = _mm_packs_epi32(T3BA, T3BB);\
+    res12[part]  = _mm_packs_epi32(T3CA, T3CB);\
+    res13[part]  = _mm_packs_epi32(T3DA, T3DB);\
+    res14[part]  = _mm_packs_epi32(T3EA, T3EB);\
+    res15[part]  = _mm_packs_epi32(T3FA, T3FB);
+
     const __m128i c16_p87_p90   = _mm_set1_epi32(0x0057005A); //row0 87high - 90low address
     const __m128i c16_p70_p80   = _mm_set1_epi32(0x00460050);
     const __m128i c16_p43_p57   = _mm_set1_epi32(0x002B0039);
@@ -338,9 +586,6 @@
 
     const __m128i c16_n64_p64   = _mm_set1_epi32(0xFFC00040);
     const __m128i c16_p64_p64   = _mm_set1_epi32(0x00400040);
-    __m128i c32_rnd             = _mm_set1_epi32(64);
-
-    int nShift = 7;
 
     // DCT1
     __m128i in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2];
@@ -348,308 +593,79 @@
     __m128i res00[2], res01[2], res02[2], res03[2], res04[2], res05[2], res06[2], res07[2];
     __m128i res08[2], res09[2], res10[2], res11[2], res12[2], res13[2], res14[2], res15[2];
 
-    for (int i = 0; i < 2; i++)
     {
-        const int offset = (i << 3);
-        in00[i]  = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset]); // [07 06 05 04 03 02 01 00]
-        in01[i]  = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset]); // [17 16 15 14 13 12 11 10]
-        in02[i]  = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset]); // [27 26 25 24 23 22 21 20]
-        in03[i]  = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset]); // [37 36 35 34 33 32 31 30]
-        in04[i]  = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset]); // [47 46 45 44 43 42 41 40]
-        in05[i]  = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset]); // [57 56 55 54 53 52 51 50]
-        in06[i]  = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset]); // [67 66 65 64 63 62 61 60]
-        in07[i]  = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset]); // [77 76 75 74 73 72 71 70]
-        in08[i]  = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset]);
-        in09[i]  = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset]);
-        in10[i]  = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset]);
-        in11[i]  = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset]);
-        in12[i]  = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset]);
-        in13[i]  = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset]);
-        in14[i]  = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset]);
-        in15[i]  = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset]);
+        READ_UNPACKHILO(0)
+        PROCESS(0, 64, 7)
     }
 
-    for (int pass = 0; pass < 2; pass++)
     {
-        if (pass == 1)
-        {
-            c32_rnd = _mm_set1_epi32(2048);
-            nShift  = 12;
-        }
-
-        for (int part = 0; part < 2; part++)
-        {
-            const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]);       // [33 13 32 12 31 11 30 10]
-            const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]);       // [37 17 36 16 35 15 34 14]
-            const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]);       // [ ]
-            const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]);       // [ ]
-            const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part], in11[part]);       // [ ]
-            const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part], in11[part]);       // [ ]
-            const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part], in15[part]);       // [ ]
-            const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part], in15[part]);       // [ ]
-            const __m128i T_00_04A = _mm_unpacklo_epi16(in02[part], in06[part]);       // [ ]
-            const __m128i T_00_04B = _mm_unpackhi_epi16(in02[part], in06[part]);       // [ ]
-            const __m128i T_00_05A = _mm_unpacklo_epi16(in10[part], in14[part]);       // [ ]
-            const __m128i T_00_05B = _mm_unpackhi_epi16(in10[part], in14[part]);       // [ ]
-            const __m128i T_00_06A = _mm_unpacklo_epi16(in04[part], in12[part]);       // [ ]row
-            const __m128i T_00_06B = _mm_unpackhi_epi16(in04[part], in12[part]);       // [ ]
-            const __m128i T_00_07A = _mm_unpacklo_epi16(in00[part], in08[part]);       // [83 03 82 02 81 01 81 00] row08 row00
-            const __m128i T_00_07B = _mm_unpackhi_epi16(in00[part], in08[part]);       // [87 07 86 06 85 05 84 04]
-
-            __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A;
-            __m128i O0B, O1B, O2B, O3B, O4B, O5B, O6B, O7B;
-            {
-                __m128i T00, T01;
-#define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507, c0911, c1315, row) \
-    T00 = _mm_add_epi32(_mm_madd_epi16(row0103, c0103), _mm_madd_epi16(row0507, c0507)); \
-    T01 = _mm_add_epi32(_mm_madd_epi16(row0911, c0911), _mm_madd_epi16(row1315, c1315)); \
-    row = _mm_add_epi32(T00, T01);
-
-                COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0A)
-                COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1A)
-                COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2A)
-                COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3A)
-                COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4A)
-                COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, O5A)
-                COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6A)
-                COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7A)
-
-                COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0B)
-                COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1B)
-                COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2B)
-                COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3B)
-                COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4B)
-                COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, O5B)
-                COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6B)
-                COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7B)
-#undef COMPUTE_ROW
-            }
-
-            __m128i EO0A, EO1A, EO2A, EO3A;
-            __m128i EO0B, EO1B, EO2B, EO3B;
-            EO0A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_p75_p89), _mm_madd_epi16(T_00_05A, c16_p18_p50)); // EO0
-            EO0B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_p75_p89), _mm_madd_epi16(T_00_05B, c16_p18_p50));
-            EO1A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n18_p75), _mm_madd_epi16(T_00_05A, c16_n50_n89)); // EO1
-            EO1B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n18_p75), _mm_madd_epi16(T_00_05B, c16_n50_n89));
-            EO2A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n89_p50), _mm_madd_epi16(T_00_05A, c16_p75_p18)); // EO2
-            EO2B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n89_p50), _mm_madd_epi16(T_00_05B, c16_p75_p18));
-            EO3A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n50_p18), _mm_madd_epi16(T_00_05A, c16_n89_p75)); // EO3
-            EO3B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n50_p18), _mm_madd_epi16(T_00_05B, c16_n89_p75));
-
-            __m128i EEO0A, EEO1A;
-            __m128i EEO0B, EEO1B;
-            EEO0A = _mm_madd_epi16(T_00_06A, c16_p36_p83);
-            EEO0B = _mm_madd_epi16(T_00_06B, c16_p36_p83);
-            EEO1A = _mm_madd_epi16(T_00_06A, c16_n83_p36);
-            EEO1B = _mm_madd_epi16(T_00_06B, c16_n83_p36);
-
-            __m128i EEE0A, EEE1A;
-            __m128i EEE0B, EEE1B;
-            EEE0A = _mm_madd_epi16(T_00_07A, c16_p64_p64);
-            EEE0B = _mm_madd_epi16(T_00_07B, c16_p64_p64);
-            EEE1A = _mm_madd_epi16(T_00_07A, c16_n64_p64);
-            EEE1B = _mm_madd_epi16(T_00_07B, c16_n64_p64);
-
-            const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A);          // EE0 = EEE0 + EEO0
-            const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B);
-            const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A);          // EE1 = EEE1 + EEO1
-            const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B);
-            const __m128i EE3A = _mm_sub_epi32(EEE0A, EEO0A);          // EE2 = EEE0 - EEO0
-            const __m128i EE3B = _mm_sub_epi32(EEE0B, EEO0B);
-            const __m128i EE2A = _mm_sub_epi32(EEE1A, EEO1A);          // EE3 = EEE1 - EEO1
-            const __m128i EE2B = _mm_sub_epi32(EEE1B, EEO1B);
-
-            const __m128i E0A = _mm_add_epi32(EE0A, EO0A);          // E0 = EE0 + EO0
-            const __m128i E0B = _mm_add_epi32(EE0B, EO0B);
-            const __m128i E1A = _mm_add_epi32(EE1A, EO1A);          // E1 = EE1 + EO1
-            const __m128i E1B = _mm_add_epi32(EE1B, EO1B);
-            const __m128i E2A = _mm_add_epi32(EE2A, EO2A);          // E2 = EE2 + EO2
-            const __m128i E2B = _mm_add_epi32(EE2B, EO2B);
-            const __m128i E3A = _mm_add_epi32(EE3A, EO3A);          // E3 = EE3 + EO3
-            const __m128i E3B = _mm_add_epi32(EE3B, EO3B);
-            const __m128i E7A = _mm_sub_epi32(EE0A, EO0A);          // E0 = EE0 - EO0
-            const __m128i E7B = _mm_sub_epi32(EE0B, EO0B);
-            const __m128i E6A = _mm_sub_epi32(EE1A, EO1A);          // E1 = EE1 - EO1
-            const __m128i E6B = _mm_sub_epi32(EE1B, EO1B);
-            const __m128i E5A = _mm_sub_epi32(EE2A, EO2A);          // E2 = EE2 - EO2
-            const __m128i E5B = _mm_sub_epi32(EE2B, EO2B);
-            const __m128i E4A = _mm_sub_epi32(EE3A, EO3A);          // E3 = EE3 - EO3
-            const __m128i E4B = _mm_sub_epi32(EE3B, EO3B);
-
-            const __m128i T10A = _mm_add_epi32(E0A, c32_rnd);         // E0 + rnd
-            const __m128i T10B = _mm_add_epi32(E0B, c32_rnd);
-            const __m128i T11A = _mm_add_epi32(E1A, c32_rnd);         // E1 + rnd
-            const __m128i T11B = _mm_add_epi32(E1B, c32_rnd);
-            const __m128i T12A = _mm_add_epi32(E2A, c32_rnd);         // E2 + rnd
-            const __m128i T12B = _mm_add_epi32(E2B, c32_rnd);
-            const __m128i T13A = _mm_add_epi32(E3A, c32_rnd);         // E3 + rnd
-            const __m128i T13B = _mm_add_epi32(E3B, c32_rnd);
-            const __m128i T14A = _mm_add_epi32(E4A, c32_rnd);         // E4 + rnd
-            const __m128i T14B = _mm_add_epi32(E4B, c32_rnd);
-            const __m128i T15A = _mm_add_epi32(E5A, c32_rnd);         // E5 + rnd
-            const __m128i T15B = _mm_add_epi32(E5B, c32_rnd);
-            const __m128i T16A = _mm_add_epi32(E6A, c32_rnd);         // E6 + rnd
-            const __m128i T16B = _mm_add_epi32(E6B, c32_rnd);
-            const __m128i T17A = _mm_add_epi32(E7A, c32_rnd);         // E7 + rnd
-            const __m128i T17B = _mm_add_epi32(E7B, c32_rnd);
-
-            const __m128i T20A = _mm_add_epi32(T10A, O0A);          // E0 + O0 + rnd
-            const __m128i T20B = _mm_add_epi32(T10B, O0B);
-            const __m128i T21A = _mm_add_epi32(T11A, O1A);          // E1 + O1 + rnd
-            const __m128i T21B = _mm_add_epi32(T11B, O1B);
-            const __m128i T22A = _mm_add_epi32(T12A, O2A);          // E2 + O2 + rnd
-            const __m128i T22B = _mm_add_epi32(T12B, O2B);
-            const __m128i T23A = _mm_add_epi32(T13A, O3A);          // E3 + O3 + rnd
-            const __m128i T23B = _mm_add_epi32(T13B, O3B);
-            const __m128i T24A = _mm_add_epi32(T14A, O4A);          // E4
-            const __m128i T24B = _mm_add_epi32(T14B, O4B);
-            const __m128i T25A = _mm_add_epi32(T15A, O5A);          // E5
-            const __m128i T25B = _mm_add_epi32(T15B, O5B);
-            const __m128i T26A = _mm_add_epi32(T16A, O6A);          // E6
-            const __m128i T26B = _mm_add_epi32(T16B, O6B);
-            const __m128i T27A = _mm_add_epi32(T17A, O7A);          // E7
-            const __m128i T27B = _mm_add_epi32(T17B, O7B);
-            const __m128i T2FA = _mm_sub_epi32(T10A, O0A);          // E0 - O0 + rnd
-            const __m128i T2FB = _mm_sub_epi32(T10B, O0B);
-            const __m128i T2EA = _mm_sub_epi32(T11A, O1A);          // E1 - O1 + rnd
-            const __m128i T2EB = _mm_sub_epi32(T11B, O1B);
-            const __m128i T2DA = _mm_sub_epi32(T12A, O2A);          // E2 - O2 + rnd
-            const __m128i T2DB = _mm_sub_epi32(T12B, O2B);
-            const __m128i T2CA = _mm_sub_epi32(T13A, O3A);          // E3 - O3 + rnd
-            const __m128i T2CB = _mm_sub_epi32(T13B, O3B);
-            const __m128i T2BA = _mm_sub_epi32(T14A, O4A);          // E4
-            const __m128i T2BB = _mm_sub_epi32(T14B, O4B);
-            const __m128i T2AA = _mm_sub_epi32(T15A, O5A);          // E5
-            const __m128i T2AB = _mm_sub_epi32(T15B, O5B);
-            const __m128i T29A = _mm_sub_epi32(T16A, O6A);          // E6
-            const __m128i T29B = _mm_sub_epi32(T16B, O6B);
-            const __m128i T28A = _mm_sub_epi32(T17A, O7A);          // E7
-            const __m128i T28B = _mm_sub_epi32(T17B, O7B);
-
-            const __m128i T30A = _mm_srai_epi32(T20A, nShift);             // [30 20 10 00]
-            const __m128i T30B = _mm_srai_epi32(T20B, nShift);             // [70 60 50 40]
-            const __m128i T31A = _mm_srai_epi32(T21A, nShift);             // [31 21 11 01]
-            const __m128i T31B = _mm_srai_epi32(T21B, nShift);             // [71 61 51 41]
-            const __m128i T32A = _mm_srai_epi32(T22A, nShift);             // [32 22 12 02]
-            const __m128i T32B = _mm_srai_epi32(T22B, nShift);             // [72 62 52 42]
-            const __m128i T33A = _mm_srai_epi32(T23A, nShift);             // [33 23 13 03]
-            const __m128i T33B = _mm_srai_epi32(T23B, nShift);             // [73 63 53 43]
-            const __m128i T34A = _mm_srai_epi32(T24A, nShift);             // [33 24 14 04]
-            const __m128i T34B = _mm_srai_epi32(T24B, nShift);             // [74 64 54 44]
-            const __m128i T35A = _mm_srai_epi32(T25A, nShift);             // [35 25 15 05]
-            const __m128i T35B = _mm_srai_epi32(T25B, nShift);             // [75 65 55 45]
-            const __m128i T36A = _mm_srai_epi32(T26A, nShift);             // [36 26 16 06]
-            const __m128i T36B = _mm_srai_epi32(T26B, nShift);             // [76 66 56 46]
-            const __m128i T37A = _mm_srai_epi32(T27A, nShift);             // [37 27 17 07]
-            const __m128i T37B = _mm_srai_epi32(T27B, nShift);             // [77 67 57 47]
-
-            const __m128i T38A = _mm_srai_epi32(T28A, nShift);             // [30 20 10 00] x8
-            const __m128i T38B = _mm_srai_epi32(T28B, nShift);             // [70 60 50 40]
-            const __m128i T39A = _mm_srai_epi32(T29A, nShift);             // [31 21 11 01] x9
-            const __m128i T39B = _mm_srai_epi32(T29B, nShift);             // [71 61 51 41]
-            const __m128i T3AA = _mm_srai_epi32(T2AA, nShift);             // [32 22 12 02] xA
-            const __m128i T3AB = _mm_srai_epi32(T2AB, nShift);             // [72 62 52 42]
-            const __m128i T3BA = _mm_srai_epi32(T2BA, nShift);             // [33 23 13 03] xB
-            const __m128i T3BB = _mm_srai_epi32(T2BB, nShift);             // [73 63 53 43]
-            const __m128i T3CA = _mm_srai_epi32(T2CA, nShift);             // [33 24 14 04] xC
-            const __m128i T3CB = _mm_srai_epi32(T2CB, nShift);             // [74 64 54 44]
-            const __m128i T3DA = _mm_srai_epi32(T2DA, nShift);             // [35 25 15 05] xD
-            const __m128i T3DB = _mm_srai_epi32(T2DB, nShift);             // [75 65 55 45]
-            const __m128i T3EA = _mm_srai_epi32(T2EA, nShift);             // [36 26 16 06] xE
-            const __m128i T3EB = _mm_srai_epi32(T2EB, nShift);             // [76 66 56 46]
-            const __m128i T3FA = _mm_srai_epi32(T2FA, nShift);             // [37 27 17 07] xF
-            const __m128i T3FB = _mm_srai_epi32(T2FB, nShift);             // [77 67 57 47]
-
-            res00[part]  = _mm_packs_epi32(T30A, T30B);        // [70 60 50 40 30 20 10 00]
-            res01[part]  = _mm_packs_epi32(T31A, T31B);        // [71 61 51 41 31 21 11 01]
-            res02[part]  = _mm_packs_epi32(T32A, T32B);        // [72 62 52 42 32 22 12 02]
-            res03[part]  = _mm_packs_epi32(T33A, T33B);        // [73 63 53 43 33 23 13 03]
-            res04[part]  = _mm_packs_epi32(T34A, T34B);        // [74 64 54 44 34 24 14 04]
-            res05[part]  = _mm_packs_epi32(T35A, T35B);        // [75 65 55 45 35 25 15 05]
-            res06[part]  = _mm_packs_epi32(T36A, T36B);        // [76 66 56 46 36 26 16 06]
-            res07[part]  = _mm_packs_epi32(T37A, T37B);        // [77 67 57 47 37 27 17 07]
-
-            res08[part]  = _mm_packs_epi32(T38A, T38B);        // [A0 ... 80]
-            res09[part]  = _mm_packs_epi32(T39A, T39B);        // [A1 ... 81]
-            res10[part]  = _mm_packs_epi32(T3AA, T3AB);        // [A2 ... 82]
-            res11[part]  = _mm_packs_epi32(T3BA, T3BB);        // [A3 ... 83]
-            res12[part]  = _mm_packs_epi32(T3CA, T3CB);        // [A4 ... 84]
-            res13[part]  = _mm_packs_epi32(T3DA, T3DB);        // [A5 ... 85]
-            res14[part]  = _mm_packs_epi32(T3EA, T3EB);        // [A6 ... 86]
-            res15[part]  = _mm_packs_epi32(T3FA, T3FB);        // [A7 ... 87]
-        }
-        //transpose matrix 8x8 16bit.
-        {
-            __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
-            __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7;
-#define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \
-    tr0_0 = _mm_unpacklo_epi16(I0, I1); \
-    tr0_1 = _mm_unpacklo_epi16(I2, I3); \
-    tr0_2 = _mm_unpackhi_epi16(I0, I1); \
-    tr0_3 = _mm_unpackhi_epi16(I2, I3); \
-    tr0_4 = _mm_unpacklo_epi16(I4, I5); \
-    tr0_5 = _mm_unpacklo_epi16(I6, I7); \
-    tr0_6 = _mm_unpackhi_epi16(I4, I5); \
-    tr0_7 = _mm_unpackhi_epi16(I6, I7); \
-    tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
-    tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
-    tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
-    tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
-    tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
-    tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
-    tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
-    tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
-    O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
-    O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
-    O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
-    O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
-    O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
-    O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
-    O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
-    O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
-
-            TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0])
-            TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1])
-            TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0])
-            TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1])
-
-#undef TRANSPOSE_8x8_16BIT
-        }
+        READ_UNPACKHILO(8)
+        PROCESS(1, 64, 7)
+    }
+    {
+        __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
+        __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7;
+        TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0])
+        TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1])
+        TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0])
+        TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1])
     }
 
-    _mm_store_si128((__m128i*)&dst[0 * stride + 0], in00[0]);
-    _mm_store_si128((__m128i*)&dst[0 * stride + 8], in00[1]);
-    _mm_store_si128((__m128i*)&dst[1 * stride + 0], in01[0]);
-    _mm_store_si128((__m128i*)&dst[1 * stride + 8], in01[1]);
-    _mm_store_si128((__m128i*)&dst[2 * stride + 0], in02[0]);
-    _mm_store_si128((__m128i*)&dst[2 * stride + 8], in02[1]);
-    _mm_store_si128((__m128i*)&dst[3 * stride + 0], in03[0]);
-    _mm_store_si128((__m128i*)&dst[3 * stride + 8], in03[1]);
-    _mm_store_si128((__m128i*)&dst[4 * stride + 0], in04[0]);
-    _mm_store_si128((__m128i*)&dst[4 * stride + 8], in04[1]);
-    _mm_store_si128((__m128i*)&dst[5 * stride + 0], in05[0]);
-    _mm_store_si128((__m128i*)&dst[5 * stride + 8], in05[1]);
-    _mm_store_si128((__m128i*)&dst[6 * stride + 0], in06[0]);
-    _mm_store_si128((__m128i*)&dst[6 * stride + 8], in06[1]);
-    _mm_store_si128((__m128i*)&dst[7 * stride + 0], in07[0]);
-    _mm_store_si128((__m128i*)&dst[7 * stride + 8], in07[1]);
-    _mm_store_si128((__m128i*)&dst[8 * stride + 0], in08[0]);
-    _mm_store_si128((__m128i*)&dst[8 * stride + 8], in08[1]);
-    _mm_store_si128((__m128i*)&dst[9 * stride + 0], in09[0]);
-    _mm_store_si128((__m128i*)&dst[9 * stride + 8], in09[1]);
-    _mm_store_si128((__m128i*)&dst[10 * stride + 0], in10[0]);
-    _mm_store_si128((__m128i*)&dst[10 * stride + 8], in10[1]);
-    _mm_store_si128((__m128i*)&dst[11 * stride + 0], in11[0]);
-    _mm_store_si128((__m128i*)&dst[11 * stride + 8], in11[1]);
-    _mm_store_si128((__m128i*)&dst[12 * stride + 0], in12[0]);
-    _mm_store_si128((__m128i*)&dst[12 * stride + 8], in12[1]);
-    _mm_store_si128((__m128i*)&dst[13 * stride + 0], in13[0]);
-    _mm_store_si128((__m128i*)&dst[13 * stride + 8], in13[1]);
-    _mm_store_si128((__m128i*)&dst[14 * stride + 0], in14[0]);
-    _mm_store_si128((__m128i*)&dst[14 * stride + 8], in14[1]);
-    _mm_store_si128((__m128i*)&dst[15 * stride + 0], in15[0]);
-    _mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]);
+    {
+        UNPACKHILO(0)
+        PROCESS(0, 2048, 12)
+    }
+    {
+        UNPACKHILO(1)
+        PROCESS(1, 2048, 12)
+    }
+
+    {
+        __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
+        __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7;
+        TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0])
+        _mm_store_si128((__m128i*)&dst[0 * stride + 0], in00[0]);
+        _mm_store_si128((__m128i*)&dst[1 * stride + 0], in01[0]);
+        _mm_store_si128((__m128i*)&dst[2 * stride + 0], in02[0]);
+        _mm_store_si128((__m128i*)&dst[3 * stride + 0], in03[0]);
+        _mm_store_si128((__m128i*)&dst[4 * stride + 0], in04[0]);
+        _mm_store_si128((__m128i*)&dst[5 * stride + 0], in05[0]);
+        _mm_store_si128((__m128i*)&dst[6 * stride + 0], in06[0]);
+        _mm_store_si128((__m128i*)&dst[7 * stride + 0], in07[0]);
+        TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1])
+        _mm_store_si128((__m128i*)&dst[0 * stride + 8], in00[1]);
+        _mm_store_si128((__m128i*)&dst[1 * stride + 8], in01[1]);
+        _mm_store_si128((__m128i*)&dst[2 * stride + 8], in02[1]);
+        _mm_store_si128((__m128i*)&dst[3 * stride + 8], in03[1]);
+        _mm_store_si128((__m128i*)&dst[4 * stride + 8], in04[1]);
+        _mm_store_si128((__m128i*)&dst[5 * stride + 8], in05[1]);
+        _mm_store_si128((__m128i*)&dst[6 * stride + 8], in06[1]);
+        _mm_store_si128((__m128i*)&dst[7 * stride + 8], in07[1]);
+        TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0])
+        _mm_store_si128((__m128i*)&dst[8 * stride + 0], in08[0]);
+        _mm_store_si128((__m128i*)&dst[9 * stride + 0], in09[0]);
+        _mm_store_si128((__m128i*)&dst[10 * stride + 0], in10[0]);
+        _mm_store_si128((__m128i*)&dst[11 * stride + 0], in11[0]);
+        _mm_store_si128((__m128i*)&dst[12 * stride + 0], in12[0]);
+        _mm_store_si128((__m128i*)&dst[13 * stride + 0], in13[0]);
+        _mm_store_si128((__m128i*)&dst[14 * stride + 0], in14[0]);
+        _mm_store_si128((__m128i*)&dst[15 * stride + 0], in15[0]);
+        TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1])
+        _mm_store_si128((__m128i*)&dst[8 * stride + 8], in08[1]);
+        _mm_store_si128((__m128i*)&dst[9 * stride + 8], in09[1]);
+        _mm_store_si128((__m128i*)&dst[10 * stride + 8], in10[1]);
+        _mm_store_si128((__m128i*)&dst[11 * stride + 8], in11[1]);
+        _mm_store_si128((__m128i*)&dst[12 * stride + 8], in12[1]);
+        _mm_store_si128((__m128i*)&dst[13 * stride + 8], in13[1]);
+        _mm_store_si128((__m128i*)&dst[14 * stride + 8], in14[1]);
+        _mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]);
+    }
 }
+#undef PROCESS
+#undef TRANSPOSE_8x8_16BIT
+#undef COMPUTE_ROW
+#undef UNPACKHILO
+#undef READ_UNPACKHILO
 
 void idct32(const int16_t *src, int16_t *dst, intptr_t stride)
 {



More information about the x265-devel mailing list