[x265-commits] [x265] Changed FrameEncoder::m_tld to a pointer and set it to on...

David T Yuen dtyx265 at gmail.com
Thu Sep 25 04:52:27 CEST 2014


details:   http://hg.videolan.org/x265/rev/3e1bfb2e4592
branches:  
changeset: 8135:3e1bfb2e4592
user:      David T Yuen <dtyx265 at gmail.com>
date:      Wed Sep 24 13:17:23 2014 -0700
description:
Changed FrameEncoder::m_tld to a pointer and set it to one of Encoder's ThreadLocalData instances.

This uses less memory since m_tld isn't used with --wpp, and Encoder's ThreadLocalData instances are not used with --no-wpp.
There was also a small performance increase on my system.
Subject: [x265] Backed out changeset: eb011fa1d2d8

details:   http://hg.videolan.org/x265/rev/e47e127da779
branches:  
changeset: 8136:e47e127da779
user:      Steve Borho <steve at borho.org>
date:      Wed Sep 24 21:51:12 2014 -0500
description:
Backed out changeset: eb011fa1d2d8

diffstat:

 source/common/vec/dct-sse3.cpp  |  269 ++++++++++++++++++++++++++++++++++++++++
 source/encoder/encoder.cpp      |   13 +-
 source/encoder/frameencoder.cpp |    9 +-
 source/encoder/frameencoder.h   |    2 +-
 4 files changed, 282 insertions(+), 11 deletions(-)

diffs (truncated from 359 to 300 lines):

diff -r f6a0b0a97a5b -r e47e127da779 source/common/vec/dct-sse3.cpp
--- a/source/common/vec/dct-sse3.cpp	Wed Sep 24 15:30:16 2014 -0500
+++ b/source/common/vec/dct-sse3.cpp	Wed Sep 24 21:51:12 2014 -0500
@@ -37,6 +37,274 @@ using namespace x265;
 
 namespace {
 #if !HIGH_BIT_DEPTH
+/* 16-bit IDCT cosine coefficients for the 8x8 inverse transform, stored as
+ * interleaved pairs replicated four times across each 128-bit row so a row
+ * can be fed directly to _mm_madd_epi16.  Rows 0-7 pair up the odd inputs
+ * (src rows 1,3,5,7); rows 8-11 pair up the even inputs (src rows 0,2,4,6). */
+ALIGN_VAR_32(static const int16_t, tab_idct_8x8[12][8]) =
+{
+    {  89,  75,  89,  75, 89,  75, 89,  75 },
+    {  50,  18,  50,  18, 50,  18, 50,  18 },
+    {  75, -18,  75, -18, 75, -18, 75, -18 },
+    { -89, -50, -89, -50, -89, -50, -89, -50 },
+    {  50, -89,  50, -89, 50, -89, 50, -89 },
+    {  18,  75,  18,  75, 18,  75, 18,  75 },
+    {  18, -50,  18, -50, 18, -50, 18, -50 },
+    {  75, -89,  75, -89, 75, -89, 75, -89 },
+    {  64,  64,  64,  64, 64,  64, 64,  64 },
+    {  64, -64,  64, -64, 64, -64, 64, -64 },
+    {  83,  36,  83,  36, 83,  36, 83,  36 },
+    {  36, -83,  36, -83, 36, -83, 36, -83 }
+};
+/* 8x8 inverse DCT, SSE3 intrinsics (!HIGH_BIT_DEPTH path).
+ * src: 64 int32 coefficients laid out 8x8 row-major (width 8)
+ * dst: int16 output, written as 8 rows of 8 samples, rows 'stride' apart
+ * Structure: first 1-D butterfly pass (rounding add 64, shift >>7), a
+ * full 8x8 transpose via unpack chains, then the second 1-D pass
+ * (rounding add 2048, shift >>12) and a transposing store to dst. */
+void idct8(int32_t *src, int16_t *dst, intptr_t stride)
+{
+    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
+    __m128i T00, T01, T02, T03, T04, T05, T06, T07;
+
+    // pass-1 rounding constant: 64 = 1 << (7 - 1)
+    m128iAdd = _mm_set1_epi32(64);
+
+    // odd part: load rows 1,3,5,7 (int32) and pack to int16
+    T00 = _mm_load_si128((__m128i*)&src[8 + 0]);
+    T01 = _mm_load_si128((__m128i*)&src[8 + 4]);
+    m128iS1 = _mm_packs_epi32(T00, T01);
+    T00 = _mm_load_si128((__m128i*)&src[24 + 0]);
+    T01 = _mm_load_si128((__m128i*)&src[24 + 4]);
+    m128iS3 = _mm_packs_epi32(T00, T01);
+    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
+    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
+    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
+    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
+
+    T00 = _mm_load_si128((__m128i*)&src[40 + 0]);
+    T01 = _mm_load_si128((__m128i*)&src[40 + 4]);
+    m128iS5 = _mm_packs_epi32(T00, T01);
+    T00 = _mm_load_si128((__m128i*)&src[56 + 0]);
+    T01 = _mm_load_si128((__m128i*)&src[56 + 4]);
+    m128iS7 = _mm_packs_epi32(T00, T01);
+    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
+    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
+    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
+    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
+    O0l = _mm_add_epi32(E1l, E2l);
+    O0h = _mm_add_epi32(E1h, E2h);
+
+    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
+    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
+    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
+    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
+
+    O1l = _mm_add_epi32(E1l, E2l);
+    O1h = _mm_add_epi32(E1h, E2h);
+
+    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
+    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
+    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
+    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
+    O2l = _mm_add_epi32(E1l, E2l);
+    O2h = _mm_add_epi32(E1h, E2h);
+
+    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
+    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
+    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
+    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
+    O3h = _mm_add_epi32(E1h, E2h);
+    O3l = _mm_add_epi32(E1l, E2l);
+
+    /* even part, EE terms: rows 0 and 4 */
+
+    T00 = _mm_load_si128((__m128i*)&src[0 + 0]);
+    T01 = _mm_load_si128((__m128i*)&src[0 + 4]);
+    m128iS0 = _mm_packs_epi32(T00, T01);
+    T00 = _mm_load_si128((__m128i*)&src[32 + 0]);
+    T01 = _mm_load_si128((__m128i*)&src[32 + 4]);
+    m128iS4 = _mm_packs_epi32(T00, T01);
+    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
+    EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
+    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
+    EE0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
+
+    EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
+    EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
+
+    /* even part, E0 terms: rows 2 and 6 */
+
+    T00 = _mm_load_si128((__m128i*)&src[16 + 0]);
+    T01 = _mm_load_si128((__m128i*)&src[16 + 4]);
+    m128iS2 = _mm_packs_epi32(T00, T01);
+    T00 = _mm_load_si128((__m128i*)&src[48 + 0]);
+    T01 = _mm_load_si128((__m128i*)&src[48 + 4]);
+    m128iS6 = _mm_packs_epi32(T00, T01);
+    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
+    E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
+    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
+    E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
+    E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
+    E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
+    E0l = _mm_add_epi32(EE0l, E00l);
+    E0l = _mm_add_epi32(E0l, m128iAdd);
+    E0h = _mm_add_epi32(EE0h, E00h);
+    E0h = _mm_add_epi32(E0h, m128iAdd);
+    E3l = _mm_sub_epi32(EE0l, E00l);
+    E3l = _mm_add_epi32(E3l, m128iAdd);
+    E3h = _mm_sub_epi32(EE0h, E00h);
+    E3h = _mm_add_epi32(E3h, m128iAdd);
+
+    E1l = _mm_add_epi32(EE1l, E01l);
+    E1l = _mm_add_epi32(E1l, m128iAdd);
+    E1h = _mm_add_epi32(EE1h, E01h);
+    E1h = _mm_add_epi32(E1h, m128iAdd);
+    E2l = _mm_sub_epi32(EE1l, E01l);
+    E2l = _mm_add_epi32(E2l, m128iAdd);
+    E2h = _mm_sub_epi32(EE1h, E01h);
+    E2h = _mm_add_epi32(E2h, m128iAdd);
+    // combine even/odd parts, round and shift by 7 (pass 1)
+    m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 7), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 7));
+    m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 7), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 7));
+    m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), 7), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), 7));
+    m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), 7), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), 7));
+    m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), 7), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), 7));
+    m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), 7), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), 7));
+    m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 7), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 7));
+    m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 7), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 7));
+    /* Transpose the 8x8 block between the two 1-D passes */
+
+    E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
+    E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
+    E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
+    E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
+    O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
+    O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
+    O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
+    O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
+    m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
+    m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
+    m128iS0  = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
+    m128iS1  = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
+    m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
+    m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
+    m128iS2  = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
+    m128iS3  = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
+    m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
+    m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
+    m128iS4  = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
+    m128iS5  = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
+    m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
+    m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
+    m128iS6  = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
+    m128iS7  = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
+
+    // pass-2 rounding constant: 2048 = 1 << (12 - 1)
+    m128iAdd = _mm_set1_epi32(2048);
+
+    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
+    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
+    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
+    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
+    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
+    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
+    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
+    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
+    O0l = _mm_add_epi32(E1l, E2l);
+    O0h = _mm_add_epi32(E1h, E2h);
+    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
+    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
+    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
+    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
+    O1l = _mm_add_epi32(E1l, E2l);
+    O1h = _mm_add_epi32(E1h, E2h);
+    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
+    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
+    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
+    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
+    O2l = _mm_add_epi32(E1l, E2l);
+    O2h = _mm_add_epi32(E1h, E2h);
+    E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
+    E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
+    E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
+    E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
+    O3h = _mm_add_epi32(E1h, E2h);
+    O3l = _mm_add_epi32(E1l, E2l);
+
+    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
+    EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
+    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
+    EE0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
+    EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
+    EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
+
+    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
+    E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
+    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
+    E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
+    E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
+    E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
+    E0l = _mm_add_epi32(EE0l, E00l);
+    E0l = _mm_add_epi32(E0l, m128iAdd);
+    E0h = _mm_add_epi32(EE0h, E00h);
+    E0h = _mm_add_epi32(E0h, m128iAdd);
+    E3l = _mm_sub_epi32(EE0l, E00l);
+    E3l = _mm_add_epi32(E3l, m128iAdd);
+    E3h = _mm_sub_epi32(EE0h, E00h);
+    E3h = _mm_add_epi32(E3h, m128iAdd);
+    E1l = _mm_add_epi32(EE1l, E01l);
+    E1l = _mm_add_epi32(E1l, m128iAdd);
+    E1h = _mm_add_epi32(EE1h, E01h);
+    E1h = _mm_add_epi32(E1h, m128iAdd);
+    E2l = _mm_sub_epi32(EE1l, E01l);
+    E2l = _mm_add_epi32(E2l, m128iAdd);
+    E2h = _mm_sub_epi32(EE1h, E01h);
+    E2h = _mm_add_epi32(E2h, m128iAdd);
+
+    // combine even/odd parts, round and shift by 12 (pass 2)
+    m128iS0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 12), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 12));
+    m128iS1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 12), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 12));
+    m128iS2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), 12), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), 12));
+    m128iS3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), 12), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), 12));
+    m128iS4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), 12), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), 12));
+    m128iS5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), 12), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), 12));
+    m128iS6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 12), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 12));
+    m128iS7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 12), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 12));
+
+    // transposing store: lane traces below use hex digits [row col]
+    // [07 06 05 04 03 02 01 00]
+    // [17 16 15 14 13 12 11 10]
+    // [27 26 25 24 23 22 21 20]
+    // [37 36 35 34 33 32 31 30]
+    // [47 46 45 44 43 42 41 40]
+    // [57 56 55 54 53 52 51 50]
+    // [67 66 65 64 63 62 61 60]
+    // [77 76 75 74 73 72 71 70]
+
+    T00 = _mm_unpacklo_epi16(m128iS0, m128iS1);     // [13 03 12 02 11 01 10 00]
+    T01 = _mm_unpackhi_epi16(m128iS0, m128iS1);     // [17 07 16 06 15 05 14 04]
+    T02 = _mm_unpacklo_epi16(m128iS2, m128iS3);     // [33 23 32 22 31 21 30 20]
+    T03 = _mm_unpackhi_epi16(m128iS2, m128iS3);     // [37 27 36 26 35 25 34 24]
+    T04 = _mm_unpacklo_epi16(m128iS4, m128iS5);     // [53 43 52 42 51 41 50 40]
+    T05 = _mm_unpackhi_epi16(m128iS4, m128iS5);     // [57 47 56 46 55 45 54 44]
+    T06 = _mm_unpacklo_epi16(m128iS6, m128iS7);     // [73 63 72 62 71 61 70 60]
+    T07 = _mm_unpackhi_epi16(m128iS6, m128iS7);     // [77 67 76 66 75 65 74 64]
+
+    __m128i T10, T11;
+    T10 = _mm_unpacklo_epi32(T00, T02);                                     // [31 21 11 01 30 20 10 00]
+    T11 = _mm_unpackhi_epi32(T00, T02);                                     // [33 23 13 03 32 22 12 02]
+    _mm_storel_epi64((__m128i*)&dst[0 * stride +  0], T10);                   // [30 20 10 00]
+    _mm_storeh_pi((__m64*)&dst[1 * stride +  0], _mm_castsi128_ps(T10));  // [31 21 11 01]
+    _mm_storel_epi64((__m128i*)&dst[2 * stride +  0], T11);                   // [32 22 12 02]
+    _mm_storeh_pi((__m64*)&dst[3 * stride +  0], _mm_castsi128_ps(T11));  // [33 23 13 03]
+
+    T10 = _mm_unpacklo_epi32(T04, T06);                                     // [71 61 51 41 70 60 50 40]
+    T11 = _mm_unpackhi_epi32(T04, T06);                                     // [73 63 53 43 72 62 52 42]
+    _mm_storel_epi64((__m128i*)&dst[0 * stride +  4], T10);
+    _mm_storeh_pi((__m64*)&dst[1 * stride +  4], _mm_castsi128_ps(T10));
+    _mm_storel_epi64((__m128i*)&dst[2 * stride +  4], T11);
+    _mm_storeh_pi((__m64*)&dst[3 * stride +  4], _mm_castsi128_ps(T11));
+
+    T10 = _mm_unpacklo_epi32(T01, T03);                                     // [35 25 15 05 34 24 14 04]
+    T11 = _mm_unpackhi_epi32(T01, T03);                                     // [37 27 17 07 36 26 16 06]
+    _mm_storel_epi64((__m128i*)&dst[4 * stride +  0], T10);
+    _mm_storeh_pi((__m64*)&dst[5 * stride +  0], _mm_castsi128_ps(T10));
+    _mm_storel_epi64((__m128i*)&dst[6 * stride +  0], T11);
+    _mm_storeh_pi((__m64*)&dst[7 * stride +  0], _mm_castsi128_ps(T11));
+
+    T10 = _mm_unpacklo_epi32(T05, T07);                                     // [75 65 55 45 74 64 54 44]
+    T11 = _mm_unpackhi_epi32(T05, T07);                                     // [77 67 57 47 76 66 56 46]
+    _mm_storel_epi64((__m128i*)&dst[4 * stride +  4], T10);
+    _mm_storeh_pi((__m64*)&dst[5 * stride +  4], _mm_castsi128_ps(T10));
+    _mm_storel_epi64((__m128i*)&dst[6 * stride +  4], T11);
+    _mm_storeh_pi((__m64*)&dst[7 * stride +  4], _mm_castsi128_ps(T11));
+}
+
 void idct16(int32_t *src, int16_t *dst, intptr_t stride)
 {
     const __m128i c16_p87_p90   = _mm_set1_epi32(0x0057005A); //row0 87high - 90low address
@@ -1296,6 +1564,7 @@ void Setup_Vec_DCTPrimitives_sse3(Encode
      * still somewhat rare on end-user PCs we still compile and link these SSE3
      * intrinsic SIMD functions */
 #if !HIGH_BIT_DEPTH
+    p.idct[IDCT_8x8] = idct8;
     p.idct[IDCT_16x16] = idct16;
     p.idct[IDCT_32x32] = idct32;
 #endif
diff -r f6a0b0a97a5b -r e47e127da779 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Wed Sep 24 15:30:16 2014 -0500
+++ b/source/encoder/encoder.cpp	Wed Sep 24 21:51:12 2014 -0500
@@ -102,15 +102,20 @@ void Encoder::create()
     m_scalingList.setupQuantMatrices();
 
     /* Allocate thread local data shared by all frame encoders */
-    ThreadPool *pool = ThreadPool::getThreadPool();
-    const int poolThreadCount = pool ? pool->getThreadCount() : 1;
-    m_threadLocalData = new ThreadLocalData[poolThreadCount];
-    for (int i = 0; i < poolThreadCount; i++)
+    const int poolThreadCount = ThreadPool::getThreadPool()->getThreadCount();
+    int numLocalData = m_param->frameNumThreads;
+    if (m_param->bEnableWavefront)


More information about the x265-commits mailing list