[x265] [PATCH] asm: idct16 intrinsic 28900->25000 improvement over previous intrinsic
Deepthi Nandakumar
deepthi at multicorewareinc.com
Tue Jan 20 05:56:45 CET 2015
Thanks, pushed. We appreciate all the help in accelerating the 16x16 and 32x32
DCT/IDCT primitives.
On Mon, Jan 19, 2015 at 11:14 PM, <dtyx265 at gmail.com> wrote:
> # HG changeset patch
> # User David T Yuen <dtyx265 at gmail.com>
> # Date 1421689416 28800
> # Node ID fd4481542b452a01b790ab677e6a7209675b965b
> # Parent 4f8b7cc9d51e1102b2d2b27d5a19f97576ddde63
> asm: idct16 intrinsic 28900->25000 improvement over previous intrinsic
>
> diff -r 4f8b7cc9d51e -r fd4481542b45 source/common/vec/dct-sse3.cpp
> --- a/source/common/vec/dct-sse3.cpp Mon Jan 19 18:21:50 2015 +0800
> +++ b/source/common/vec/dct-sse3.cpp Mon Jan 19 09:43:36 2015 -0800
> @@ -291,6 +291,254 @@
>
> void idct16(const int16_t *src, int16_t *dst, intptr_t stride)
> {
> +#define READ_UNPACKHILO(offset)\
> + const __m128i T_00_00A = _mm_unpacklo_epi16(*(__m128i*)&src[1 * 16 +
> offset], *(__m128i*)&src[3 * 16 + offset]);\
> + const __m128i T_00_00B = _mm_unpackhi_epi16(*(__m128i*)&src[1 * 16 +
> offset], *(__m128i*)&src[3 * 16 + offset]);\
> + const __m128i T_00_01A = _mm_unpacklo_epi16(*(__m128i*)&src[5 * 16 +
> offset], *(__m128i*)&src[7 * 16 + offset]);\
> + const __m128i T_00_01B = _mm_unpackhi_epi16(*(__m128i*)&src[5 * 16 +
> offset], *(__m128i*)&src[7 * 16 + offset]);\
> + const __m128i T_00_02A = _mm_unpacklo_epi16(*(__m128i*)&src[9 * 16 +
> offset], *(__m128i*)&src[11 * 16 + offset]);\
> + const __m128i T_00_02B = _mm_unpackhi_epi16(*(__m128i*)&src[9 * 16 +
> offset], *(__m128i*)&src[11 * 16 + offset]);\
> + const __m128i T_00_03A = _mm_unpacklo_epi16(*(__m128i*)&src[13 * 16 +
> offset], *(__m128i*)&src[15 * 16 + offset]);\
> + const __m128i T_00_03B = _mm_unpackhi_epi16(*(__m128i*)&src[13 * 16 +
> offset], *(__m128i*)&src[15 * 16 + offset]);\
> + const __m128i T_00_04A = _mm_unpacklo_epi16(*(__m128i*)&src[2 * 16 +
> offset], *(__m128i*)&src[6 * 16 + offset]);\
> + const __m128i T_00_04B = _mm_unpackhi_epi16(*(__m128i*)&src[2 * 16 +
> offset], *(__m128i*)&src[6 * 16 + offset]);\
> + const __m128i T_00_05A = _mm_unpacklo_epi16(*(__m128i*)&src[10 * 16 +
> offset], *(__m128i*)&src[14 * 16 + offset]);\
> + const __m128i T_00_05B = _mm_unpackhi_epi16(*(__m128i*)&src[10 * 16 +
> offset], *(__m128i*)&src[14 * 16 + offset]);\
> + const __m128i T_00_06A = _mm_unpacklo_epi16(*(__m128i*)&src[4 * 16 +
> offset], *(__m128i*)&src[12 * 16 + offset]);\
> + const __m128i T_00_06B = _mm_unpackhi_epi16(*(__m128i*)&src[4 * 16 +
> offset], *(__m128i*)&src[12 * 16 + offset]);\
> + const __m128i T_00_07A = _mm_unpacklo_epi16(*(__m128i*)&src[0 * 16 +
> offset], *(__m128i*)&src[8 * 16 + offset]);\
> + const __m128i T_00_07B = _mm_unpackhi_epi16(*(__m128i*)&src[0 * 16 +
> offset], *(__m128i*)&src[8 * 16 + offset]);
> +
> +#define UNPACKHILO(part) \
> + const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]);\
> + const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]);\
> + const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]);\
> + const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]);\
> + const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part], in11[part]);\
> + const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part], in11[part]);\
> + const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part], in15[part]);\
> + const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part], in15[part]);\
> + const __m128i T_00_04A = _mm_unpacklo_epi16(in02[part], in06[part]);\
> + const __m128i T_00_04B = _mm_unpackhi_epi16(in02[part], in06[part]);\
> + const __m128i T_00_05A = _mm_unpacklo_epi16(in10[part], in14[part]);\
> + const __m128i T_00_05B = _mm_unpackhi_epi16(in10[part], in14[part]);\
> + const __m128i T_00_06A = _mm_unpacklo_epi16(in04[part], in12[part]);\
> + const __m128i T_00_06B = _mm_unpackhi_epi16(in04[part], in12[part]);\
> + const __m128i T_00_07A = _mm_unpacklo_epi16(in00[part], in08[part]);\
> + const __m128i T_00_07B = _mm_unpackhi_epi16(in00[part], in08[part]);
> +
> +#define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507,
> c0911, c1315, row) \
> + T00 = _mm_add_epi32(_mm_madd_epi16(row0103, c0103),
> _mm_madd_epi16(row0507, c0507)); \
> + T01 = _mm_add_epi32(_mm_madd_epi16(row0911, c0911),
> _mm_madd_epi16(row1315, c1315)); \
> + row = _mm_add_epi32(T00, T01);
> +
> +#define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2,
> O3, O4, O5, O6, O7) \
> + tr0_0 = _mm_unpacklo_epi16(I0, I1); \
> + tr0_1 = _mm_unpacklo_epi16(I2, I3); \
> + tr0_2 = _mm_unpackhi_epi16(I0, I1); \
> + tr0_3 = _mm_unpackhi_epi16(I2, I3); \
> + tr0_4 = _mm_unpacklo_epi16(I4, I5); \
> + tr0_5 = _mm_unpacklo_epi16(I6, I7); \
> + tr0_6 = _mm_unpackhi_epi16(I4, I5); \
> + tr0_7 = _mm_unpackhi_epi16(I6, I7); \
> + tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
> + tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
> + tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
> + tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
> + tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
> + tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
> + tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
> + tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
> + O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
> + O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
> + O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
> + O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
> + O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
> + O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
> + O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
> + O7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
> +
> +#define PROCESS(part, rnd, shift) \
> + __m128i c32_rnd = _mm_set1_epi32(rnd);\
> + int nShift = shift;\
> +\
> + __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A;\
> + __m128i O0B, O1B, O2B, O3B, O4B, O5B, O6B, O7B;\
> + {\
> + __m128i T00, T01;\
> +\
> + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p87_p90,
> c16_p70_p80, c16_p43_p57, c16_p09_p25, O0A)\
> + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p57_p87,
> c16_n43_p09, c16_n90_n80, c16_n25_n70, O1A)\
> + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p09_p80,
> c16_n87_n70, c16_p57_n25, c16_p43_p90, O2A)\
> + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n43_p70,
> c16_p09_n87, c16_p25_p90, c16_n57_n80, O3A)\
> + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n80_p57,
> c16_p90_n25, c16_n87_n09, c16_p70_p43, O4A)\
> + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n90_p43,
> c16_p25_p57, c16_p70_n87, c16_n80_p09, O5A)\
> + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n70_p25,
> c16_n80_p90, c16_p09_p43, c16_p87_n57, O6A)\
> + COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n25_p09,
> c16_n57_p43, c16_n80_p70, c16_n90_p87, O7A)\
> +\
> + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p87_p90,
> c16_p70_p80, c16_p43_p57, c16_p09_p25, O0B)\
> + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p57_p87,
> c16_n43_p09, c16_n90_n80, c16_n25_n70, O1B)\
> + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p09_p80,
> c16_n87_n70, c16_p57_n25, c16_p43_p90, O2B)\
> + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n43_p70,
> c16_p09_n87, c16_p25_p90, c16_n57_n80, O3B)\
> + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n80_p57,
> c16_p90_n25, c16_n87_n09, c16_p70_p43, O4B)\
> + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n90_p43,
> c16_p25_p57, c16_p70_n87, c16_n80_p09, O5B)\
> + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n70_p25,
> c16_n80_p90, c16_p09_p43, c16_p87_n57, O6B)\
> + COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n25_p09,
> c16_n57_p43, c16_n80_p70, c16_n90_p87, O7B)\
> + }\
> +\
> + __m128i EO0A, EO1A, EO2A, EO3A;\
> + __m128i EO0B, EO1B, EO2B, EO3B;\
> + EO0A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_p75_p89),
> _mm_madd_epi16(T_00_05A, c16_p18_p50));\
> + EO0B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_p75_p89),
> _mm_madd_epi16(T_00_05B, c16_p18_p50));\
> + EO1A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n18_p75),
> _mm_madd_epi16(T_00_05A, c16_n50_n89));\
> + EO1B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n18_p75),
> _mm_madd_epi16(T_00_05B, c16_n50_n89));\
> + EO2A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n89_p50),
> _mm_madd_epi16(T_00_05A, c16_p75_p18));\
> + EO2B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n89_p50),
> _mm_madd_epi16(T_00_05B, c16_p75_p18));\
> + EO3A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n50_p18),
> _mm_madd_epi16(T_00_05A, c16_n89_p75));\
> + EO3B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n50_p18),
> _mm_madd_epi16(T_00_05B, c16_n89_p75));\
> +\
> + __m128i EEO0A, EEO1A;\
> + __m128i EEO0B, EEO1B;\
> + EEO0A = _mm_madd_epi16(T_00_06A, c16_p36_p83);\
> + EEO0B = _mm_madd_epi16(T_00_06B, c16_p36_p83);\
> + EEO1A = _mm_madd_epi16(T_00_06A, c16_n83_p36);\
> + EEO1B = _mm_madd_epi16(T_00_06B, c16_n83_p36);\
> +\
> + __m128i EEE0A, EEE1A;\
> + __m128i EEE0B, EEE1B;\
> + EEE0A = _mm_madd_epi16(T_00_07A, c16_p64_p64);\
> + EEE0B = _mm_madd_epi16(T_00_07B, c16_p64_p64);\
> + EEE1A = _mm_madd_epi16(T_00_07A, c16_n64_p64);\
> + EEE1B = _mm_madd_epi16(T_00_07B, c16_n64_p64);\
> +\
> + const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A);\
> + const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B);\
> + const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A);\
> + const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B);\
> + const __m128i EE3A = _mm_sub_epi32(EEE0A, EEO0A);\
> + const __m128i EE3B = _mm_sub_epi32(EEE0B, EEO0B);\
> + const __m128i EE2A = _mm_sub_epi32(EEE1A, EEO1A);\
> + const __m128i EE2B = _mm_sub_epi32(EEE1B, EEO1B);\
> +\
> + const __m128i E0A = _mm_add_epi32(EE0A, EO0A);\
> + const __m128i E0B = _mm_add_epi32(EE0B, EO0B);\
> + const __m128i E1A = _mm_add_epi32(EE1A, EO1A);\
> + const __m128i E1B = _mm_add_epi32(EE1B, EO1B);\
> + const __m128i E2A = _mm_add_epi32(EE2A, EO2A);\
> + const __m128i E2B = _mm_add_epi32(EE2B, EO2B);\
> + const __m128i E3A = _mm_add_epi32(EE3A, EO3A);\
> + const __m128i E3B = _mm_add_epi32(EE3B, EO3B);\
> + const __m128i E7A = _mm_sub_epi32(EE0A, EO0A);\
> + const __m128i E7B = _mm_sub_epi32(EE0B, EO0B);\
> + const __m128i E6A = _mm_sub_epi32(EE1A, EO1A);\
> + const __m128i E6B = _mm_sub_epi32(EE1B, EO1B);\
> + const __m128i E5A = _mm_sub_epi32(EE2A, EO2A);\
> + const __m128i E5B = _mm_sub_epi32(EE2B, EO2B);\
> + const __m128i E4A = _mm_sub_epi32(EE3A, EO3A);\
> + const __m128i E4B = _mm_sub_epi32(EE3B, EO3B);\
> +\
> + const __m128i T10A = _mm_add_epi32(E0A, c32_rnd);\
> + const __m128i T10B = _mm_add_epi32(E0B, c32_rnd);\
> + const __m128i T11A = _mm_add_epi32(E1A, c32_rnd);\
> + const __m128i T11B = _mm_add_epi32(E1B, c32_rnd);\
> + const __m128i T12A = _mm_add_epi32(E2A, c32_rnd);\
> + const __m128i T12B = _mm_add_epi32(E2B, c32_rnd);\
> + const __m128i T13A = _mm_add_epi32(E3A, c32_rnd);\
> + const __m128i T13B = _mm_add_epi32(E3B, c32_rnd);\
> + const __m128i T14A = _mm_add_epi32(E4A, c32_rnd);\
> + const __m128i T14B = _mm_add_epi32(E4B, c32_rnd);\
> + const __m128i T15A = _mm_add_epi32(E5A, c32_rnd);\
> + const __m128i T15B = _mm_add_epi32(E5B, c32_rnd);\
> + const __m128i T16A = _mm_add_epi32(E6A, c32_rnd);\
> + const __m128i T16B = _mm_add_epi32(E6B, c32_rnd);\
> + const __m128i T17A = _mm_add_epi32(E7A, c32_rnd);\
> + const __m128i T17B = _mm_add_epi32(E7B, c32_rnd);\
> +\
> + const __m128i T20A = _mm_add_epi32(T10A, O0A);\
> + const __m128i T20B = _mm_add_epi32(T10B, O0B);\
> + const __m128i T21A = _mm_add_epi32(T11A, O1A);\
> + const __m128i T21B = _mm_add_epi32(T11B, O1B);\
> + const __m128i T22A = _mm_add_epi32(T12A, O2A);\
> + const __m128i T22B = _mm_add_epi32(T12B, O2B);\
> + const __m128i T23A = _mm_add_epi32(T13A, O3A);\
> + const __m128i T23B = _mm_add_epi32(T13B, O3B);\
> + const __m128i T24A = _mm_add_epi32(T14A, O4A);\
> + const __m128i T24B = _mm_add_epi32(T14B, O4B);\
> + const __m128i T25A = _mm_add_epi32(T15A, O5A);\
> + const __m128i T25B = _mm_add_epi32(T15B, O5B);\
> + const __m128i T26A = _mm_add_epi32(T16A, O6A);\
> + const __m128i T26B = _mm_add_epi32(T16B, O6B);\
> + const __m128i T27A = _mm_add_epi32(T17A, O7A);\
> + const __m128i T27B = _mm_add_epi32(T17B, O7B);\
> + const __m128i T2FA = _mm_sub_epi32(T10A, O0A);\
> + const __m128i T2FB = _mm_sub_epi32(T10B, O0B);\
> + const __m128i T2EA = _mm_sub_epi32(T11A, O1A);\
> + const __m128i T2EB = _mm_sub_epi32(T11B, O1B);\
> + const __m128i T2DA = _mm_sub_epi32(T12A, O2A);\
> + const __m128i T2DB = _mm_sub_epi32(T12B, O2B);\
> + const __m128i T2CA = _mm_sub_epi32(T13A, O3A);\
> + const __m128i T2CB = _mm_sub_epi32(T13B, O3B);\
> + const __m128i T2BA = _mm_sub_epi32(T14A, O4A);\
> + const __m128i T2BB = _mm_sub_epi32(T14B, O4B);\
> + const __m128i T2AA = _mm_sub_epi32(T15A, O5A);\
> + const __m128i T2AB = _mm_sub_epi32(T15B, O5B);\
> + const __m128i T29A = _mm_sub_epi32(T16A, O6A);\
> + const __m128i T29B = _mm_sub_epi32(T16B, O6B);\
> + const __m128i T28A = _mm_sub_epi32(T17A, O7A);\
> + const __m128i T28B = _mm_sub_epi32(T17B, O7B);\
> +\
> + const __m128i T30A = _mm_srai_epi32(T20A, nShift);\
> + const __m128i T30B = _mm_srai_epi32(T20B, nShift);\
> + const __m128i T31A = _mm_srai_epi32(T21A, nShift);\
> + const __m128i T31B = _mm_srai_epi32(T21B, nShift);\
> + const __m128i T32A = _mm_srai_epi32(T22A, nShift);\
> + const __m128i T32B = _mm_srai_epi32(T22B, nShift);\
> + const __m128i T33A = _mm_srai_epi32(T23A, nShift);\
> + const __m128i T33B = _mm_srai_epi32(T23B, nShift);\
> + const __m128i T34A = _mm_srai_epi32(T24A, nShift);\
> + const __m128i T34B = _mm_srai_epi32(T24B, nShift);\
> + const __m128i T35A = _mm_srai_epi32(T25A, nShift);\
> + const __m128i T35B = _mm_srai_epi32(T25B, nShift);\
> + const __m128i T36A = _mm_srai_epi32(T26A, nShift);\
> + const __m128i T36B = _mm_srai_epi32(T26B, nShift);\
> + const __m128i T37A = _mm_srai_epi32(T27A, nShift);\
> + const __m128i T37B = _mm_srai_epi32(T27B, nShift);\
> +\
> + const __m128i T38A = _mm_srai_epi32(T28A, nShift);\
> + const __m128i T38B = _mm_srai_epi32(T28B, nShift);\
> + const __m128i T39A = _mm_srai_epi32(T29A, nShift);\
> + const __m128i T39B = _mm_srai_epi32(T29B, nShift);\
> + const __m128i T3AA = _mm_srai_epi32(T2AA, nShift);\
> + const __m128i T3AB = _mm_srai_epi32(T2AB, nShift);\
> + const __m128i T3BA = _mm_srai_epi32(T2BA, nShift);\
> + const __m128i T3BB = _mm_srai_epi32(T2BB, nShift);\
> + const __m128i T3CA = _mm_srai_epi32(T2CA, nShift);\
> + const __m128i T3CB = _mm_srai_epi32(T2CB, nShift);\
> + const __m128i T3DA = _mm_srai_epi32(T2DA, nShift);\
> + const __m128i T3DB = _mm_srai_epi32(T2DB, nShift);\
> + const __m128i T3EA = _mm_srai_epi32(T2EA, nShift);\
> + const __m128i T3EB = _mm_srai_epi32(T2EB, nShift);\
> + const __m128i T3FA = _mm_srai_epi32(T2FA, nShift);\
> + const __m128i T3FB = _mm_srai_epi32(T2FB, nShift);\
> +\
> + res00[part] = _mm_packs_epi32(T30A, T30B);\
> + res01[part] = _mm_packs_epi32(T31A, T31B);\
> + res02[part] = _mm_packs_epi32(T32A, T32B);\
> + res03[part] = _mm_packs_epi32(T33A, T33B);\
> + res04[part] = _mm_packs_epi32(T34A, T34B);\
> + res05[part] = _mm_packs_epi32(T35A, T35B);\
> + res06[part] = _mm_packs_epi32(T36A, T36B);\
> + res07[part] = _mm_packs_epi32(T37A, T37B);\
> +\
> + res08[part] = _mm_packs_epi32(T38A, T38B);\
> + res09[part] = _mm_packs_epi32(T39A, T39B);\
> + res10[part] = _mm_packs_epi32(T3AA, T3AB);\
> + res11[part] = _mm_packs_epi32(T3BA, T3BB);\
> + res12[part] = _mm_packs_epi32(T3CA, T3CB);\
> + res13[part] = _mm_packs_epi32(T3DA, T3DB);\
> + res14[part] = _mm_packs_epi32(T3EA, T3EB);\
> + res15[part] = _mm_packs_epi32(T3FA, T3FB);
> +
> const __m128i c16_p87_p90 = _mm_set1_epi32(0x0057005A); //row0
> 87high - 90low address
> const __m128i c16_p70_p80 = _mm_set1_epi32(0x00460050);
> const __m128i c16_p43_p57 = _mm_set1_epi32(0x002B0039);
> @@ -338,9 +586,6 @@
>
> const __m128i c16_n64_p64 = _mm_set1_epi32(0xFFC00040);
> const __m128i c16_p64_p64 = _mm_set1_epi32(0x00400040);
> - __m128i c32_rnd = _mm_set1_epi32(64);
> -
> - int nShift = 7;
>
> // DCT1
> __m128i in00[2], in01[2], in02[2], in03[2], in04[2], in05[2],
> in06[2], in07[2];
> @@ -348,308 +593,79 @@
> __m128i res00[2], res01[2], res02[2], res03[2], res04[2], res05[2],
> res06[2], res07[2];
> __m128i res08[2], res09[2], res10[2], res11[2], res12[2], res13[2],
> res14[2], res15[2];
>
> - for (int i = 0; i < 2; i++)
> {
> - const int offset = (i << 3);
> - in00[i] = _mm_loadu_si128((const __m128i*)&src[0 * 16 +
> offset]); // [07 06 05 04 03 02 01 00]
> - in01[i] = _mm_loadu_si128((const __m128i*)&src[1 * 16 +
> offset]); // [17 16 15 14 13 12 11 10]
> - in02[i] = _mm_loadu_si128((const __m128i*)&src[2 * 16 +
> offset]); // [27 26 25 24 23 22 21 20]
> - in03[i] = _mm_loadu_si128((const __m128i*)&src[3 * 16 +
> offset]); // [37 36 35 34 33 32 31 30]
> - in04[i] = _mm_loadu_si128((const __m128i*)&src[4 * 16 +
> offset]); // [47 46 45 44 43 42 41 40]
> - in05[i] = _mm_loadu_si128((const __m128i*)&src[5 * 16 +
> offset]); // [57 56 55 54 53 52 51 50]
> - in06[i] = _mm_loadu_si128((const __m128i*)&src[6 * 16 +
> offset]); // [67 66 65 64 63 62 61 60]
> - in07[i] = _mm_loadu_si128((const __m128i*)&src[7 * 16 +
> offset]); // [77 76 75 74 73 72 71 70]
> - in08[i] = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset]);
> - in09[i] = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset]);
> - in10[i] = _mm_loadu_si128((const __m128i*)&src[10 * 16 +
> offset]);
> - in11[i] = _mm_loadu_si128((const __m128i*)&src[11 * 16 +
> offset]);
> - in12[i] = _mm_loadu_si128((const __m128i*)&src[12 * 16 +
> offset]);
> - in13[i] = _mm_loadu_si128((const __m128i*)&src[13 * 16 +
> offset]);
> - in14[i] = _mm_loadu_si128((const __m128i*)&src[14 * 16 +
> offset]);
> - in15[i] = _mm_loadu_si128((const __m128i*)&src[15 * 16 +
> offset]);
> + READ_UNPACKHILO(0)
> + PROCESS(0, 64, 7)
> }
>
> - for (int pass = 0; pass < 2; pass++)
> {
> - if (pass == 1)
> - {
> - c32_rnd = _mm_set1_epi32(2048);
> - nShift = 12;
> - }
> -
> - for (int part = 0; part < 2; part++)
> - {
> - const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part],
> in03[part]); // [33 13 32 12 31 11 30 10]
> - const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part],
> in03[part]); // [37 17 36 16 35 15 34 14]
> - const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part],
> in07[part]); // [ ]
> - const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part],
> in07[part]); // [ ]
> - const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part],
> in11[part]); // [ ]
> - const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part],
> in11[part]); // [ ]
> - const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part],
> in15[part]); // [ ]
> - const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part],
> in15[part]); // [ ]
> - const __m128i T_00_04A = _mm_unpacklo_epi16(in02[part],
> in06[part]); // [ ]
> - const __m128i T_00_04B = _mm_unpackhi_epi16(in02[part],
> in06[part]); // [ ]
> - const __m128i T_00_05A = _mm_unpacklo_epi16(in10[part],
> in14[part]); // [ ]
> - const __m128i T_00_05B = _mm_unpackhi_epi16(in10[part],
> in14[part]); // [ ]
> - const __m128i T_00_06A = _mm_unpacklo_epi16(in04[part],
> in12[part]); // [ ]row
> - const __m128i T_00_06B = _mm_unpackhi_epi16(in04[part],
> in12[part]); // [ ]
> - const __m128i T_00_07A = _mm_unpacklo_epi16(in00[part],
> in08[part]); // [83 03 82 02 81 01 81 00] row08 row00
> - const __m128i T_00_07B = _mm_unpackhi_epi16(in00[part],
> in08[part]); // [87 07 86 06 85 05 84 04]
> -
> - __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A;
> - __m128i O0B, O1B, O2B, O3B, O4B, O5B, O6B, O7B;
> - {
> - __m128i T00, T01;
> -#define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507,
> c0911, c1315, row) \
> - T00 = _mm_add_epi32(_mm_madd_epi16(row0103, c0103),
> _mm_madd_epi16(row0507, c0507)); \
> - T01 = _mm_add_epi32(_mm_madd_epi16(row0911, c0911),
> _mm_madd_epi16(row1315, c1315)); \
> - row = _mm_add_epi32(T00, T01);
> -
> - COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A,
> c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0A)
> - COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A,
> c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1A)
> - COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A,
> c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2A)
> - COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A,
> c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3A)
> - COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A,
> c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4A)
> - COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A,
> c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, O5A)
> - COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A,
> c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6A)
> - COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A,
> c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7A)
> -
> - COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B,
> c16_p87_p90, c16_p70_p80, c16_p43_p57, c16_p09_p25, O0B)
> - COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B,
> c16_p57_p87, c16_n43_p09, c16_n90_n80, c16_n25_n70, O1B)
> - COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B,
> c16_p09_p80, c16_n87_n70, c16_p57_n25, c16_p43_p90, O2B)
> - COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B,
> c16_n43_p70, c16_p09_n87, c16_p25_p90, c16_n57_n80, O3B)
> - COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B,
> c16_n80_p57, c16_p90_n25, c16_n87_n09, c16_p70_p43, O4B)
> - COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B,
> c16_n90_p43, c16_p25_p57, c16_p70_n87, c16_n80_p09, O5B)
> - COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B,
> c16_n70_p25, c16_n80_p90, c16_p09_p43, c16_p87_n57, O6B)
> - COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B,
> c16_n25_p09, c16_n57_p43, c16_n80_p70, c16_n90_p87, O7B)
> -#undef COMPUTE_ROW
> - }
> -
> - __m128i EO0A, EO1A, EO2A, EO3A;
> - __m128i EO0B, EO1B, EO2B, EO3B;
> - EO0A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_p75_p89),
> _mm_madd_epi16(T_00_05A, c16_p18_p50)); // EO0
> - EO0B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_p75_p89),
> _mm_madd_epi16(T_00_05B, c16_p18_p50));
> - EO1A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n18_p75),
> _mm_madd_epi16(T_00_05A, c16_n50_n89)); // EO1
> - EO1B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n18_p75),
> _mm_madd_epi16(T_00_05B, c16_n50_n89));
> - EO2A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n89_p50),
> _mm_madd_epi16(T_00_05A, c16_p75_p18)); // EO2
> - EO2B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n89_p50),
> _mm_madd_epi16(T_00_05B, c16_p75_p18));
> - EO3A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n50_p18),
> _mm_madd_epi16(T_00_05A, c16_n89_p75)); // EO3
> - EO3B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n50_p18),
> _mm_madd_epi16(T_00_05B, c16_n89_p75));
> -
> - __m128i EEO0A, EEO1A;
> - __m128i EEO0B, EEO1B;
> - EEO0A = _mm_madd_epi16(T_00_06A, c16_p36_p83);
> - EEO0B = _mm_madd_epi16(T_00_06B, c16_p36_p83);
> - EEO1A = _mm_madd_epi16(T_00_06A, c16_n83_p36);
> - EEO1B = _mm_madd_epi16(T_00_06B, c16_n83_p36);
> -
> - __m128i EEE0A, EEE1A;
> - __m128i EEE0B, EEE1B;
> - EEE0A = _mm_madd_epi16(T_00_07A, c16_p64_p64);
> - EEE0B = _mm_madd_epi16(T_00_07B, c16_p64_p64);
> - EEE1A = _mm_madd_epi16(T_00_07A, c16_n64_p64);
> - EEE1B = _mm_madd_epi16(T_00_07B, c16_n64_p64);
> -
> - const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); //
> EE0 = EEE0 + EEO0
> - const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B);
> - const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); //
> EE1 = EEE1 + EEO1
> - const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B);
> - const __m128i EE3A = _mm_sub_epi32(EEE0A, EEO0A); //
> EE2 = EEE0 - EEO0
> - const __m128i EE3B = _mm_sub_epi32(EEE0B, EEO0B);
> - const __m128i EE2A = _mm_sub_epi32(EEE1A, EEO1A); //
> EE3 = EEE1 - EEO1
> - const __m128i EE2B = _mm_sub_epi32(EEE1B, EEO1B);
> -
> - const __m128i E0A = _mm_add_epi32(EE0A, EO0A); // E0
> = EE0 + EO0
> - const __m128i E0B = _mm_add_epi32(EE0B, EO0B);
> - const __m128i E1A = _mm_add_epi32(EE1A, EO1A); // E1
> = EE1 + EO1
> - const __m128i E1B = _mm_add_epi32(EE1B, EO1B);
> - const __m128i E2A = _mm_add_epi32(EE2A, EO2A); // E2
> = EE2 + EO2
> - const __m128i E2B = _mm_add_epi32(EE2B, EO2B);
> - const __m128i E3A = _mm_add_epi32(EE3A, EO3A); // E3
> = EE3 + EO3
> - const __m128i E3B = _mm_add_epi32(EE3B, EO3B);
> - const __m128i E7A = _mm_sub_epi32(EE0A, EO0A); // E0
> = EE0 - EO0
> - const __m128i E7B = _mm_sub_epi32(EE0B, EO0B);
> - const __m128i E6A = _mm_sub_epi32(EE1A, EO1A); // E1
> = EE1 - EO1
> - const __m128i E6B = _mm_sub_epi32(EE1B, EO1B);
> - const __m128i E5A = _mm_sub_epi32(EE2A, EO2A); // E2
> = EE2 - EO2
> - const __m128i E5B = _mm_sub_epi32(EE2B, EO2B);
> - const __m128i E4A = _mm_sub_epi32(EE3A, EO3A); // E3
> = EE3 - EO3
> - const __m128i E4B = _mm_sub_epi32(EE3B, EO3B);
> -
> - const __m128i T10A = _mm_add_epi32(E0A, c32_rnd); //
> E0 + rnd
> - const __m128i T10B = _mm_add_epi32(E0B, c32_rnd);
> - const __m128i T11A = _mm_add_epi32(E1A, c32_rnd); //
> E1 + rnd
> - const __m128i T11B = _mm_add_epi32(E1B, c32_rnd);
> - const __m128i T12A = _mm_add_epi32(E2A, c32_rnd); //
> E2 + rnd
> - const __m128i T12B = _mm_add_epi32(E2B, c32_rnd);
> - const __m128i T13A = _mm_add_epi32(E3A, c32_rnd); //
> E3 + rnd
> - const __m128i T13B = _mm_add_epi32(E3B, c32_rnd);
> - const __m128i T14A = _mm_add_epi32(E4A, c32_rnd); //
> E4 + rnd
> - const __m128i T14B = _mm_add_epi32(E4B, c32_rnd);
> - const __m128i T15A = _mm_add_epi32(E5A, c32_rnd); //
> E5 + rnd
> - const __m128i T15B = _mm_add_epi32(E5B, c32_rnd);
> - const __m128i T16A = _mm_add_epi32(E6A, c32_rnd); //
> E6 + rnd
> - const __m128i T16B = _mm_add_epi32(E6B, c32_rnd);
> - const __m128i T17A = _mm_add_epi32(E7A, c32_rnd); //
> E7 + rnd
> - const __m128i T17B = _mm_add_epi32(E7B, c32_rnd);
> -
> - const __m128i T20A = _mm_add_epi32(T10A, O0A); // E0
> + O0 + rnd
> - const __m128i T20B = _mm_add_epi32(T10B, O0B);
> - const __m128i T21A = _mm_add_epi32(T11A, O1A); // E1
> + O1 + rnd
> - const __m128i T21B = _mm_add_epi32(T11B, O1B);
> - const __m128i T22A = _mm_add_epi32(T12A, O2A); // E2
> + O2 + rnd
> - const __m128i T22B = _mm_add_epi32(T12B, O2B);
> - const __m128i T23A = _mm_add_epi32(T13A, O3A); // E3
> + O3 + rnd
> - const __m128i T23B = _mm_add_epi32(T13B, O3B);
> - const __m128i T24A = _mm_add_epi32(T14A, O4A); // E4
> - const __m128i T24B = _mm_add_epi32(T14B, O4B);
> - const __m128i T25A = _mm_add_epi32(T15A, O5A); // E5
> - const __m128i T25B = _mm_add_epi32(T15B, O5B);
> - const __m128i T26A = _mm_add_epi32(T16A, O6A); // E6
> - const __m128i T26B = _mm_add_epi32(T16B, O6B);
> - const __m128i T27A = _mm_add_epi32(T17A, O7A); // E7
> - const __m128i T27B = _mm_add_epi32(T17B, O7B);
> - const __m128i T2FA = _mm_sub_epi32(T10A, O0A); // E0
> - O0 + rnd
> - const __m128i T2FB = _mm_sub_epi32(T10B, O0B);
> - const __m128i T2EA = _mm_sub_epi32(T11A, O1A); // E1
> - O1 + rnd
> - const __m128i T2EB = _mm_sub_epi32(T11B, O1B);
> - const __m128i T2DA = _mm_sub_epi32(T12A, O2A); // E2
> - O2 + rnd
> - const __m128i T2DB = _mm_sub_epi32(T12B, O2B);
> - const __m128i T2CA = _mm_sub_epi32(T13A, O3A); // E3
> - O3 + rnd
> - const __m128i T2CB = _mm_sub_epi32(T13B, O3B);
> - const __m128i T2BA = _mm_sub_epi32(T14A, O4A); // E4
> - const __m128i T2BB = _mm_sub_epi32(T14B, O4B);
> - const __m128i T2AA = _mm_sub_epi32(T15A, O5A); // E5
> - const __m128i T2AB = _mm_sub_epi32(T15B, O5B);
> - const __m128i T29A = _mm_sub_epi32(T16A, O6A); // E6
> - const __m128i T29B = _mm_sub_epi32(T16B, O6B);
> - const __m128i T28A = _mm_sub_epi32(T17A, O7A); // E7
> - const __m128i T28B = _mm_sub_epi32(T17B, O7B);
> -
> - const __m128i T30A = _mm_srai_epi32(T20A, nShift);
> // [30 20 10 00]
> - const __m128i T30B = _mm_srai_epi32(T20B, nShift);
> // [70 60 50 40]
> - const __m128i T31A = _mm_srai_epi32(T21A, nShift);
> // [31 21 11 01]
> - const __m128i T31B = _mm_srai_epi32(T21B, nShift);
> // [71 61 51 41]
> - const __m128i T32A = _mm_srai_epi32(T22A, nShift);
> // [32 22 12 02]
> - const __m128i T32B = _mm_srai_epi32(T22B, nShift);
> // [72 62 52 42]
> - const __m128i T33A = _mm_srai_epi32(T23A, nShift);
> // [33 23 13 03]
> - const __m128i T33B = _mm_srai_epi32(T23B, nShift);
> // [73 63 53 43]
> - const __m128i T34A = _mm_srai_epi32(T24A, nShift);
> // [33 24 14 04]
> - const __m128i T34B = _mm_srai_epi32(T24B, nShift);
> // [74 64 54 44]
> - const __m128i T35A = _mm_srai_epi32(T25A, nShift);
> // [35 25 15 05]
> - const __m128i T35B = _mm_srai_epi32(T25B, nShift);
> // [75 65 55 45]
> - const __m128i T36A = _mm_srai_epi32(T26A, nShift);
> // [36 26 16 06]
> - const __m128i T36B = _mm_srai_epi32(T26B, nShift);
> // [76 66 56 46]
> - const __m128i T37A = _mm_srai_epi32(T27A, nShift);
> // [37 27 17 07]
> - const __m128i T37B = _mm_srai_epi32(T27B, nShift);
> // [77 67 57 47]
> -
> - const __m128i T38A = _mm_srai_epi32(T28A, nShift);
> // [30 20 10 00] x8
> - const __m128i T38B = _mm_srai_epi32(T28B, nShift);
> // [70 60 50 40]
> - const __m128i T39A = _mm_srai_epi32(T29A, nShift);
> // [31 21 11 01] x9
> - const __m128i T39B = _mm_srai_epi32(T29B, nShift);
> // [71 61 51 41]
> - const __m128i T3AA = _mm_srai_epi32(T2AA, nShift);
> // [32 22 12 02] xA
> - const __m128i T3AB = _mm_srai_epi32(T2AB, nShift);
> // [72 62 52 42]
> - const __m128i T3BA = _mm_srai_epi32(T2BA, nShift);
> // [33 23 13 03] xB
> - const __m128i T3BB = _mm_srai_epi32(T2BB, nShift);
> // [73 63 53 43]
> - const __m128i T3CA = _mm_srai_epi32(T2CA, nShift);
> // [33 24 14 04] xC
> - const __m128i T3CB = _mm_srai_epi32(T2CB, nShift);
> // [74 64 54 44]
> - const __m128i T3DA = _mm_srai_epi32(T2DA, nShift);
> // [35 25 15 05] xD
> - const __m128i T3DB = _mm_srai_epi32(T2DB, nShift);
> // [75 65 55 45]
> - const __m128i T3EA = _mm_srai_epi32(T2EA, nShift);
> // [36 26 16 06] xE
> - const __m128i T3EB = _mm_srai_epi32(T2EB, nShift);
> // [76 66 56 46]
> - const __m128i T3FA = _mm_srai_epi32(T2FA, nShift);
> // [37 27 17 07] xF
> - const __m128i T3FB = _mm_srai_epi32(T2FB, nShift);
> // [77 67 57 47]
> -
> - res00[part] = _mm_packs_epi32(T30A, T30B); // [70 60
> 50 40 30 20 10 00]
> - res01[part] = _mm_packs_epi32(T31A, T31B); // [71 61
> 51 41 31 21 11 01]
> - res02[part] = _mm_packs_epi32(T32A, T32B); // [72 62
> 52 42 32 22 12 02]
> - res03[part] = _mm_packs_epi32(T33A, T33B); // [73 63
> 53 43 33 23 13 03]
> - res04[part] = _mm_packs_epi32(T34A, T34B); // [74 64
> 54 44 34 24 14 04]
> - res05[part] = _mm_packs_epi32(T35A, T35B); // [75 65
> 55 45 35 25 15 05]
> - res06[part] = _mm_packs_epi32(T36A, T36B); // [76 66
> 56 46 36 26 16 06]
> - res07[part] = _mm_packs_epi32(T37A, T37B); // [77 67
> 57 47 37 27 17 07]
> -
> - res08[part] = _mm_packs_epi32(T38A, T38B); // [A0 ...
> 80]
> - res09[part] = _mm_packs_epi32(T39A, T39B); // [A1 ...
> 81]
> - res10[part] = _mm_packs_epi32(T3AA, T3AB); // [A2 ...
> 82]
> - res11[part] = _mm_packs_epi32(T3BA, T3BB); // [A3 ...
> 83]
> - res12[part] = _mm_packs_epi32(T3CA, T3CB); // [A4 ...
> 84]
> - res13[part] = _mm_packs_epi32(T3DA, T3DB); // [A5 ...
> 85]
> - res14[part] = _mm_packs_epi32(T3EA, T3EB); // [A6 ...
> 86]
> - res15[part] = _mm_packs_epi32(T3FA, T3FB); // [A7 ...
> 87]
> - }
> - //transpose matrix 8x8 16bit.
> - {
> - __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6,
> tr0_7;
> - __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6,
> tr1_7;
> -#define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2,
> O3, O4, O5, O6, O7) \
> - tr0_0 = _mm_unpacklo_epi16(I0, I1); \
> - tr0_1 = _mm_unpacklo_epi16(I2, I3); \
> - tr0_2 = _mm_unpackhi_epi16(I0, I1); \
> - tr0_3 = _mm_unpackhi_epi16(I2, I3); \
> - tr0_4 = _mm_unpacklo_epi16(I4, I5); \
> - tr0_5 = _mm_unpacklo_epi16(I6, I7); \
> - tr0_6 = _mm_unpackhi_epi16(I4, I5); \
> - tr0_7 = _mm_unpackhi_epi16(I6, I7); \
> - tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
> - tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
> - tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
> - tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
> - tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
> - tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
> - tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
> - tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
> - O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
> - O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
> - O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
> - O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
> - O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
> - O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
> - O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
> - O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
> -
> - TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0],
> res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0],
> in04[0], in05[0], in06[0], in07[0])
> - TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0],
> res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1],
> in04[1], in05[1], in06[1], in07[1])
> - TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1],
> res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0],
> in12[0], in13[0], in14[0], in15[0])
> - TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1],
> res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1],
> in12[1], in13[1], in14[1], in15[1])
> -
> -#undef TRANSPOSE_8x8_16BIT
> - }
> + READ_UNPACKHILO(8)
> + PROCESS(1, 64, 7)
> + }
> + {
> + __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
> + __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7;
> + TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0],
> res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0],
> in04[0], in05[0], in06[0], in07[0])
> + TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0],
> res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1],
> in04[1], in05[1], in06[1], in07[1])
> + TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1],
> res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0],
> in12[0], in13[0], in14[0], in15[0])
> + TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1],
> res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1],
> in12[1], in13[1], in14[1], in15[1])
> }
>
> - _mm_store_si128((__m128i*)&dst[0 * stride + 0], in00[0]);
> - _mm_store_si128((__m128i*)&dst[0 * stride + 8], in00[1]);
> - _mm_store_si128((__m128i*)&dst[1 * stride + 0], in01[0]);
> - _mm_store_si128((__m128i*)&dst[1 * stride + 8], in01[1]);
> - _mm_store_si128((__m128i*)&dst[2 * stride + 0], in02[0]);
> - _mm_store_si128((__m128i*)&dst[2 * stride + 8], in02[1]);
> - _mm_store_si128((__m128i*)&dst[3 * stride + 0], in03[0]);
> - _mm_store_si128((__m128i*)&dst[3 * stride + 8], in03[1]);
> - _mm_store_si128((__m128i*)&dst[4 * stride + 0], in04[0]);
> - _mm_store_si128((__m128i*)&dst[4 * stride + 8], in04[1]);
> - _mm_store_si128((__m128i*)&dst[5 * stride + 0], in05[0]);
> - _mm_store_si128((__m128i*)&dst[5 * stride + 8], in05[1]);
> - _mm_store_si128((__m128i*)&dst[6 * stride + 0], in06[0]);
> - _mm_store_si128((__m128i*)&dst[6 * stride + 8], in06[1]);
> - _mm_store_si128((__m128i*)&dst[7 * stride + 0], in07[0]);
> - _mm_store_si128((__m128i*)&dst[7 * stride + 8], in07[1]);
> - _mm_store_si128((__m128i*)&dst[8 * stride + 0], in08[0]);
> - _mm_store_si128((__m128i*)&dst[8 * stride + 8], in08[1]);
> - _mm_store_si128((__m128i*)&dst[9 * stride + 0], in09[0]);
> - _mm_store_si128((__m128i*)&dst[9 * stride + 8], in09[1]);
> - _mm_store_si128((__m128i*)&dst[10 * stride + 0], in10[0]);
> - _mm_store_si128((__m128i*)&dst[10 * stride + 8], in10[1]);
> - _mm_store_si128((__m128i*)&dst[11 * stride + 0], in11[0]);
> - _mm_store_si128((__m128i*)&dst[11 * stride + 8], in11[1]);
> - _mm_store_si128((__m128i*)&dst[12 * stride + 0], in12[0]);
> - _mm_store_si128((__m128i*)&dst[12 * stride + 8], in12[1]);
> - _mm_store_si128((__m128i*)&dst[13 * stride + 0], in13[0]);
> - _mm_store_si128((__m128i*)&dst[13 * stride + 8], in13[1]);
> - _mm_store_si128((__m128i*)&dst[14 * stride + 0], in14[0]);
> - _mm_store_si128((__m128i*)&dst[14 * stride + 8], in14[1]);
> - _mm_store_si128((__m128i*)&dst[15 * stride + 0], in15[0]);
> - _mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]);
> + {
> + UNPACKHILO(0)
> + PROCESS(0, 2048, 12)
> + }
> + {
> + UNPACKHILO(1)
> + PROCESS(1, 2048, 12)
> + }
> +
> + {
> + __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
> + __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7;
> + TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0],
> res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0],
> in04[0], in05[0], in06[0], in07[0])
> + _mm_store_si128((__m128i*)&dst[0 * stride + 0], in00[0]);
> + _mm_store_si128((__m128i*)&dst[1 * stride + 0], in01[0]);
> + _mm_store_si128((__m128i*)&dst[2 * stride + 0], in02[0]);
> + _mm_store_si128((__m128i*)&dst[3 * stride + 0], in03[0]);
> + _mm_store_si128((__m128i*)&dst[4 * stride + 0], in04[0]);
> + _mm_store_si128((__m128i*)&dst[5 * stride + 0], in05[0]);
> + _mm_store_si128((__m128i*)&dst[6 * stride + 0], in06[0]);
> + _mm_store_si128((__m128i*)&dst[7 * stride + 0], in07[0]);
> + TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0],
> res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1],
> in04[1], in05[1], in06[1], in07[1])
> + _mm_store_si128((__m128i*)&dst[0 * stride + 8], in00[1]);
> + _mm_store_si128((__m128i*)&dst[1 * stride + 8], in01[1]);
> + _mm_store_si128((__m128i*)&dst[2 * stride + 8], in02[1]);
> + _mm_store_si128((__m128i*)&dst[3 * stride + 8], in03[1]);
> + _mm_store_si128((__m128i*)&dst[4 * stride + 8], in04[1]);
> + _mm_store_si128((__m128i*)&dst[5 * stride + 8], in05[1]);
> + _mm_store_si128((__m128i*)&dst[6 * stride + 8], in06[1]);
> + _mm_store_si128((__m128i*)&dst[7 * stride + 8], in07[1]);
> + TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1],
> res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0],
> in12[0], in13[0], in14[0], in15[0])
> + _mm_store_si128((__m128i*)&dst[8 * stride + 0], in08[0]);
> + _mm_store_si128((__m128i*)&dst[9 * stride + 0], in09[0]);
> + _mm_store_si128((__m128i*)&dst[10 * stride + 0], in10[0]);
> + _mm_store_si128((__m128i*)&dst[11 * stride + 0], in11[0]);
> + _mm_store_si128((__m128i*)&dst[12 * stride + 0], in12[0]);
> + _mm_store_si128((__m128i*)&dst[13 * stride + 0], in13[0]);
> + _mm_store_si128((__m128i*)&dst[14 * stride + 0], in14[0]);
> + _mm_store_si128((__m128i*)&dst[15 * stride + 0], in15[0]);
> + TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1],
> res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1],
> in12[1], in13[1], in14[1], in15[1])
> + _mm_store_si128((__m128i*)&dst[8 * stride + 8], in08[1]);
> + _mm_store_si128((__m128i*)&dst[9 * stride + 8], in09[1]);
> + _mm_store_si128((__m128i*)&dst[10 * stride + 8], in10[1]);
> + _mm_store_si128((__m128i*)&dst[11 * stride + 8], in11[1]);
> + _mm_store_si128((__m128i*)&dst[12 * stride + 8], in12[1]);
> + _mm_store_si128((__m128i*)&dst[13 * stride + 8], in13[1]);
> + _mm_store_si128((__m128i*)&dst[14 * stride + 8], in14[1]);
> + _mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]);
> + }
> }
> +#undef PROCESS
> +#undef TRANSPOSE_8x8_16BIT
> +#undef COMPUTE_ROW
> +#undef UNPACKHILO
> +#undef READ_UNPACKHILO
>
> void idct32(const int16_t *src, int16_t *dst, intptr_t stride)
> {
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150120/68592497/attachment-0001.html>
More information about the x265-devel mailing list