[x265-commits] [x265] Changed FrameEncoder::m_tld to a pointer and set it to on...
David T Yuen
dtyx265 at gmail.com
Thu Sep 25 04:52:27 CEST 2014
details: http://hg.videolan.org/x265/rev/3e1bfb2e4592
branches:
changeset: 8135:3e1bfb2e4592
user: David T Yuen <dtyx265 at gmail.com>
date: Wed Sep 24 13:17:23 2014 -0700
description:
Changed FrameEncoder::m_tld to a pointer and set it to one of Encoder's ThreadLocalData instances.
This uses less memory since m_tld isn't used in --wpp and Encoder's ThreadLocalData instances are not used in --no-wpp
Also there was a small performance increase on my system
Subject: [x265] Backed out changeset: eb011fa1d2d8
details: http://hg.videolan.org/x265/rev/e47e127da779
branches:
changeset: 8136:e47e127da779
user: Steve Borho <steve at borho.org>
date: Wed Sep 24 21:51:12 2014 -0500
description:
Backed out changeset: eb011fa1d2d8
diffstat:
source/common/vec/dct-sse3.cpp | 269 ++++++++++++++++++++++++++++++++++++++++
source/encoder/encoder.cpp | 13 +-
source/encoder/frameencoder.cpp | 9 +-
source/encoder/frameencoder.h | 2 +-
4 files changed, 282 insertions(+), 11 deletions(-)
diffs (truncated from 359 to 300 lines):
diff -r f6a0b0a97a5b -r e47e127da779 source/common/vec/dct-sse3.cpp
--- a/source/common/vec/dct-sse3.cpp Wed Sep 24 15:30:16 2014 -0500
+++ b/source/common/vec/dct-sse3.cpp Wed Sep 24 21:51:12 2014 -0500
@@ -37,6 +37,274 @@ using namespace x265;
namespace {
#if !HIGH_BIT_DEPTH
+ALIGN_VAR_32(static const int16_t, tab_idct_8x8[12][8]) =
+{
+ { 89, 75, 89, 75, 89, 75, 89, 75 },
+ { 50, 18, 50, 18, 50, 18, 50, 18 },
+ { 75, -18, 75, -18, 75, -18, 75, -18 },
+ { -89, -50, -89, -50, -89, -50, -89, -50 },
+ { 50, -89, 50, -89, 50, -89, 50, -89 },
+ { 18, 75, 18, 75, 18, 75, 18, 75 },
+ { 18, -50, 18, -50, 18, -50, 18, -50 },
+ { 75, -89, 75, -89, 75, -89, 75, -89 },
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, -64, 64, -64, 64, -64, 64, -64 },
+ { 83, 36, 83, 36, 83, 36, 83, 36 },
+ { 36, -83, 36, -83, 36, -83, 36, -83 }
+};
+void idct8(int32_t *src, int16_t *dst, intptr_t stride)
+{
+ __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
+ __m128i T00, T01, T02, T03, T04, T05, T06, T07;
+
+ m128iAdd = _mm_set1_epi32(64);
+
+ T00 = _mm_load_si128((__m128i*)&src[8 + 0]);
+ T01 = _mm_load_si128((__m128i*)&src[8 + 4]);
+ m128iS1 = _mm_packs_epi32(T00, T01);
+ T00 = _mm_load_si128((__m128i*)&src[24 + 0]);
+ T01 = _mm_load_si128((__m128i*)&src[24 + 4]);
+ m128iS3 = _mm_packs_epi32(T00, T01);
+ m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
+ E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
+ m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
+ E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
+
+ T00 = _mm_load_si128((__m128i*)&src[40 + 0]);
+ T01 = _mm_load_si128((__m128i*)&src[40 + 4]);
+ m128iS5 = _mm_packs_epi32(T00, T01);
+ T00 = _mm_load_si128((__m128i*)&src[56 + 0]);
+ T01 = _mm_load_si128((__m128i*)&src[56 + 4]);
+ m128iS7 = _mm_packs_epi32(T00, T01);
+ m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
+ E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
+ m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
+ E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
+ O0l = _mm_add_epi32(E1l, E2l);
+ O0h = _mm_add_epi32(E1h, E2h);
+
+ E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
+ E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
+ E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
+ E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
+
+ O1l = _mm_add_epi32(E1l, E2l);
+ O1h = _mm_add_epi32(E1h, E2h);
+
+ E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
+ E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
+ E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
+ E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
+ O2l = _mm_add_epi32(E1l, E2l);
+ O2h = _mm_add_epi32(E1h, E2h);
+
+ E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
+ E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
+ E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
+ E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
+ O3h = _mm_add_epi32(E1h, E2h);
+ O3l = _mm_add_epi32(E1l, E2l);
+
+ /* ------- */
+
+ T00 = _mm_load_si128((__m128i*)&src[0 + 0]);
+ T01 = _mm_load_si128((__m128i*)&src[0 + 4]);
+ m128iS0 = _mm_packs_epi32(T00, T01);
+ T00 = _mm_load_si128((__m128i*)&src[32 + 0]);
+ T01 = _mm_load_si128((__m128i*)&src[32 + 4]);
+ m128iS4 = _mm_packs_epi32(T00, T01);
+ m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
+ EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
+ m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
+ EE0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
+
+ EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
+ EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
+
+ /* ------- */
+
+ T00 = _mm_load_si128((__m128i*)&src[16 + 0]);
+ T01 = _mm_load_si128((__m128i*)&src[16 + 4]);
+ m128iS2 = _mm_packs_epi32(T00, T01);
+ T00 = _mm_load_si128((__m128i*)&src[48 + 0]);
+ T01 = _mm_load_si128((__m128i*)&src[48 + 4]);
+ m128iS6 = _mm_packs_epi32(T00, T01);
+ m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
+ E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
+ m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
+ E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
+ E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
+ E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
+ E0l = _mm_add_epi32(EE0l, E00l);
+ E0l = _mm_add_epi32(E0l, m128iAdd);
+ E0h = _mm_add_epi32(EE0h, E00h);
+ E0h = _mm_add_epi32(E0h, m128iAdd);
+ E3l = _mm_sub_epi32(EE0l, E00l);
+ E3l = _mm_add_epi32(E3l, m128iAdd);
+ E3h = _mm_sub_epi32(EE0h, E00h);
+ E3h = _mm_add_epi32(E3h, m128iAdd);
+
+ E1l = _mm_add_epi32(EE1l, E01l);
+ E1l = _mm_add_epi32(E1l, m128iAdd);
+ E1h = _mm_add_epi32(EE1h, E01h);
+ E1h = _mm_add_epi32(E1h, m128iAdd);
+ E2l = _mm_sub_epi32(EE1l, E01l);
+ E2l = _mm_add_epi32(E2l, m128iAdd);
+ E2h = _mm_sub_epi32(EE1h, E01h);
+ E2h = _mm_add_epi32(E2h, m128iAdd);
+ m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 7), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 7));
+ m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 7), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 7));
+ m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), 7), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), 7));
+ m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), 7), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), 7));
+ m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), 7), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), 7));
+ m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), 7), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), 7));
+ m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 7), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 7));
+ m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 7), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 7));
+ /* Transpose matrix */
+
+ E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
+ E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
+ E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
+ E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
+ O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
+ O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
+ O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
+ O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
+ m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
+ m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
+ m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
+ m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
+ m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
+ m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
+ m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
+ m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
+ m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
+ m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
+ m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
+ m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
+ m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
+ m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
+ m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
+ m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
+
+ m128iAdd = _mm_set1_epi32(2048);
+
+ m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
+ E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
+ m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
+ E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
+ m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
+ E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
+ m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
+ E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
+ O0l = _mm_add_epi32(E1l, E2l);
+ O0h = _mm_add_epi32(E1h, E2h);
+ E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
+ E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
+ E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
+ E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
+ O1l = _mm_add_epi32(E1l, E2l);
+ O1h = _mm_add_epi32(E1h, E2h);
+ E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
+ E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
+ E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
+ E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
+ O2l = _mm_add_epi32(E1l, E2l);
+ O2h = _mm_add_epi32(E1h, E2h);
+ E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
+ E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
+ E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
+ E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
+ O3h = _mm_add_epi32(E1h, E2h);
+ O3l = _mm_add_epi32(E1l, E2l);
+
+ m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
+ EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
+ m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
+ EE0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
+ EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
+ EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
+
+ m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
+ E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
+ m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
+ E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
+ E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
+ E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
+ E0l = _mm_add_epi32(EE0l, E00l);
+ E0l = _mm_add_epi32(E0l, m128iAdd);
+ E0h = _mm_add_epi32(EE0h, E00h);
+ E0h = _mm_add_epi32(E0h, m128iAdd);
+ E3l = _mm_sub_epi32(EE0l, E00l);
+ E3l = _mm_add_epi32(E3l, m128iAdd);
+ E3h = _mm_sub_epi32(EE0h, E00h);
+ E3h = _mm_add_epi32(E3h, m128iAdd);
+ E1l = _mm_add_epi32(EE1l, E01l);
+ E1l = _mm_add_epi32(E1l, m128iAdd);
+ E1h = _mm_add_epi32(EE1h, E01h);
+ E1h = _mm_add_epi32(E1h, m128iAdd);
+ E2l = _mm_sub_epi32(EE1l, E01l);
+ E2l = _mm_add_epi32(E2l, m128iAdd);
+ E2h = _mm_sub_epi32(EE1h, E01h);
+ E2h = _mm_add_epi32(E2h, m128iAdd);
+
+ m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 12), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 12));
+ m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 12), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 12));
+ m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), 12), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), 12));
+ m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), 12), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), 12));
+ m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), 12), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), 12));
+ m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), 12), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), 12));
+ m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 12), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 12));
+ m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 12), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 12));
+
+ // [07 06 05 04 03 02 01 00]
+ // [17 16 15 14 13 12 11 10]
+ // [27 26 25 24 23 22 21 20]
+ // [37 36 35 34 33 32 31 30]
+ // [47 46 45 44 43 42 41 40]
+ // [57 56 55 54 53 52 51 50]
+ // [67 66 65 64 63 62 61 60]
+ // [77 76 75 74 73 72 71 70]
+
+ T00 = _mm_unpacklo_epi16(m128iS0, m128iS1); // [13 03 12 02 11 01 10 00]
+ T01 = _mm_unpackhi_epi16(m128iS0, m128iS1); // [17 07 16 06 15 05 14 04]
+ T02 = _mm_unpacklo_epi16(m128iS2, m128iS3); // [33 23 32 22 31 21 30 20]
+ T03 = _mm_unpackhi_epi16(m128iS2, m128iS3); // [37 27 36 26 35 25 34 24]
+ T04 = _mm_unpacklo_epi16(m128iS4, m128iS5); // [53 43 52 42 51 41 50 40]
+ T05 = _mm_unpackhi_epi16(m128iS4, m128iS5); // [57 47 56 46 55 45 54 44]
+ T06 = _mm_unpacklo_epi16(m128iS6, m128iS7); // [73 63 72 62 71 61 70 60]
+ T07 = _mm_unpackhi_epi16(m128iS6, m128iS7); // [77 67 76 66 75 65 74 64]
+
+ __m128i T10, T11;
+ T10 = _mm_unpacklo_epi32(T00, T02); // [31 21 11 01 30 20 10 00]
+ T11 = _mm_unpackhi_epi32(T00, T02); // [33 23 13 03 32 22 12 02]
+ _mm_storel_epi64((__m128i*)&dst[0 * stride + 0], T10); // [30 20 10 00]
+ _mm_storeh_pi((__m64*)&dst[1 * stride + 0], _mm_castsi128_ps(T10)); // [31 21 11 01]
+ _mm_storel_epi64((__m128i*)&dst[2 * stride + 0], T11); // [32 22 12 02]
+ _mm_storeh_pi((__m64*)&dst[3 * stride + 0], _mm_castsi128_ps(T11)); // [33 23 13 03]
+
+ T10 = _mm_unpacklo_epi32(T04, T06); // [71 61 51 41 70 60 50 40]
+ T11 = _mm_unpackhi_epi32(T04, T06); // [73 63 53 43 72 62 52 42]
+ _mm_storel_epi64((__m128i*)&dst[0 * stride + 4], T10);
+ _mm_storeh_pi((__m64*)&dst[1 * stride + 4], _mm_castsi128_ps(T10));
+ _mm_storel_epi64((__m128i*)&dst[2 * stride + 4], T11);
+ _mm_storeh_pi((__m64*)&dst[3 * stride + 4], _mm_castsi128_ps(T11));
+
+ T10 = _mm_unpacklo_epi32(T01, T03); // [35 25 15 05 34 24 14 04]
+ T11 = _mm_unpackhi_epi32(T01, T03); // [37 27 17 07 36 26 16 06]
+ _mm_storel_epi64((__m128i*)&dst[4 * stride + 0], T10);
+ _mm_storeh_pi((__m64*)&dst[5 * stride + 0], _mm_castsi128_ps(T10));
+ _mm_storel_epi64((__m128i*)&dst[6 * stride + 0], T11);
+ _mm_storeh_pi((__m64*)&dst[7 * stride + 0], _mm_castsi128_ps(T11));
+
+ T10 = _mm_unpacklo_epi32(T05, T07); // [75 65 55 45 74 64 54 44]
+ T11 = _mm_unpackhi_epi32(T05, T07); // [77 67 57 47 76 66 56 46]
+ _mm_storel_epi64((__m128i*)&dst[4 * stride + 4], T10);
+ _mm_storeh_pi((__m64*)&dst[5 * stride + 4], _mm_castsi128_ps(T10));
+ _mm_storel_epi64((__m128i*)&dst[6 * stride + 4], T11);
+ _mm_storeh_pi((__m64*)&dst[7 * stride + 4], _mm_castsi128_ps(T11));
+}
+
void idct16(int32_t *src, int16_t *dst, intptr_t stride)
{
const __m128i c16_p87_p90 = _mm_set1_epi32(0x0057005A); //row0 87high - 90low address
@@ -1296,6 +1564,7 @@ void Setup_Vec_DCTPrimitives_sse3(Encode
* still somewhat rare on end-user PCs we still compile and link these SSE3
* intrinsic SIMD functions */
#if !HIGH_BIT_DEPTH
+ p.idct[IDCT_8x8] = idct8;
p.idct[IDCT_16x16] = idct16;
p.idct[IDCT_32x32] = idct32;
#endif
diff -r f6a0b0a97a5b -r e47e127da779 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp Wed Sep 24 15:30:16 2014 -0500
+++ b/source/encoder/encoder.cpp Wed Sep 24 21:51:12 2014 -0500
@@ -102,15 +102,20 @@ void Encoder::create()
m_scalingList.setupQuantMatrices();
/* Allocate thread local data shared by all frame encoders */
- ThreadPool *pool = ThreadPool::getThreadPool();
- const int poolThreadCount = pool ? pool->getThreadCount() : 1;
- m_threadLocalData = new ThreadLocalData[poolThreadCount];
- for (int i = 0; i < poolThreadCount; i++)
+ const int poolThreadCount = ThreadPool::getThreadPool()->getThreadCount();
+ int numLocalData = m_param->frameNumThreads;
+ if (m_param->bEnableWavefront)
More information about the x265-commits
mailing list