[x265-commits] [x265] remove unnecessary copyToPicLuma() call

Satoshi Nakagawa nakagawa424 at oki.com
Thu Feb 6 19:10:19 CET 2014


details:   http://hg.videolan.org/x265/rev/db0c1dfc3a11
branches:  
changeset: 6031:db0c1dfc3a11
user:      Satoshi Nakagawa <nakagawa424 at oki.com>
date:      Thu Feb 06 11:02:31 2014 +0900
description:
remove unnecessary copyToPicLuma() call
Subject: [x265] asm: assembly code for IntraAng32x32 all modes

details:   http://hg.videolan.org/x265/rev/76fa0811c4e7
branches:  
changeset: 6032:76fa0811c4e7
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Thu Feb 06 11:11:37 2014 +0530
description:
asm: assembly code for IntraAng32x32 all modes
Subject: [x265] testbench: stress test support for all pixelharness functions

details:   http://hg.videolan.org/x265/rev/b86a25eb7968
branches:  
changeset: 6033:b86a25eb7968
user:      Murugan Vairavel <murugan at multicorewareinc.com>
date:      Thu Feb 06 12:15:43 2014 +0530
description:
testbench: stress test support for all pixelharness functions
Subject: [x265] asm: modified satd and sad asm functions in 16bpp to avoid overflow

details:   http://hg.videolan.org/x265/rev/ffe13a5eccb9
branches:  
changeset: 6034:ffe13a5eccb9
user:      Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
date:      Thu Feb 06 12:28:32 2014 +0530
description:
asm: modified satd and sad asm functions in 16bpp to avoid overflow
Subject: [x265] slicetype: bug fix for cuTree, use int32_t for listamount and propagate_amount to calculate valid propagate_cost

details:   http://hg.videolan.org/x265/rev/6d5207b8b2ef
branches:  
changeset: 6035:6d5207b8b2ef
user:      Gopu Govindaswamy
date:      Thu Feb 06 16:53:17 2014 -0800
description:
slicetype: bug fix for cuTree, use int32_t for listamount and propagate_amount to calculate valid propagate_cost
Subject: [x265] asm: fix Intrapred_ang[32x32] mode 10 and 26 failure on Mac

details:   http://hg.videolan.org/x265/rev/a079afc4e6c7
branches:  
changeset: 6036:a079afc4e6c7
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Thu Feb 06 16:42:42 2014 +0530
description:
asm: fix Intrapred_ang[32x32] mode 10 and 26 failure on Mac
Subject: [x265] vec: remove intraPredAng32x32, full asm coverage

details:   http://hg.videolan.org/x265/rev/40bec5582eca
branches:  
changeset: 6037:40bec5582eca
user:      Steve Borho <steve at borho.org>
date:      Thu Feb 06 12:09:28 2014 -0600
description:
vec: remove intraPredAng32x32, full asm coverage

diffstat:

 source/Lib/TLibEncoder/TEncCu.cpp    |     4 -
 source/common/vec/intra-ssse3.cpp    |  1245 -------------
 source/common/x86/asm-primitives.cpp |    61 +-
 source/common/x86/intrapred.h        |    14 +
 source/common/x86/intrapred8.asm     |  3130 ++++++++++++++++++++++++++++++++-
 source/common/x86/pixel-a.asm        |    70 +-
 source/common/x86/sad16-a.asm        |    14 +-
 source/encoder/compress.cpp          |     2 -
 source/encoder/slicetype.cpp         |    42 +-
 source/test/pixelharness.cpp         |   252 +-
 source/test/pixelharness.h           |     6 +-
 11 files changed, 3353 insertions(+), 1487 deletions(-)

diffs (truncated from 5311 to 300 lines):

diff -r 634bc0b1c246 -r 40bec5582eca source/Lib/TLibEncoder/TEncCu.cpp
--- a/source/Lib/TLibEncoder/TEncCu.cpp	Wed Feb 05 23:10:22 2014 -0600
+++ b/source/Lib/TLibEncoder/TEncCu.cpp	Thu Feb 06 12:09:28 2014 -0600
@@ -1395,8 +1395,6 @@ void TEncCu::xCheckRDCostIntra(TComDataC
 
     m_search->estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], preCalcDistC, true);
 
-    m_tmpRecoYuv[depth]->copyToPicLuma(outTempCU->getPic()->getPicYuvRec(), outTempCU->getAddr(), outTempCU->getZorderIdxInCU());
-
     m_search->estIntraPredChromaQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], preCalcDistC);
 
     m_entropyCoder->resetBits();
@@ -1444,8 +1442,6 @@ void TEncCu::xCheckRDCostIntraInInter(TC
     m_search->estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth],
                              preCalcDistC, bSeparateLumaChroma);
 
-    m_tmpRecoYuv[depth]->copyToPicLuma(outTempCU->getPic()->getPicYuvRec(), outTempCU->getAddr(), outTempCU->getZorderIdxInCU());
-
     m_search->estIntraPredChromaQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], preCalcDistC);
 
     m_entropyCoder->resetBits();
diff -r 634bc0b1c246 -r 40bec5582eca source/common/vec/intra-ssse3.cpp
--- a/source/common/vec/intra-ssse3.cpp	Wed Feb 05 23:10:22 2014 -0600
+++ b/source/common/vec/intra-ssse3.cpp	Thu Feb 06 12:09:28 2014 -0600
@@ -557,1249 +557,6 @@ void intraPredAng16x16(pixel* dst, intpt
 #undef MB4
 #undef CALC_BLND_8ROWS
 
-//32x32
-#define PREDANG_CALCROW_VER(X) \
-    v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
-    v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
-    itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)]))); \
-    row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
-    row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-\
-    itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 1))); \
-    row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
-    row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-\
-    it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
-    it2 = _mm_mullo_epi16(it1, row11L); \
-    it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
-    it2 = _mm_add_epi16(it2, it3); \
-    i16 = _mm_set1_epi16(16); \
-    it2 = _mm_add_epi16(it2, i16); \
-    row11L = _mm_srai_epi16(it2, 5); \
-    it2 = _mm_mullo_epi16(it1, row11H); \
-    it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
-    it2 = _mm_add_epi16(it2, it3); \
-    it2 = _mm_add_epi16(it2, i16); \
-    row11H = _mm_srai_epi16(it2, 5); \
-\
-    itmp = _mm_packus_epi16(row11L, row11H); \
-    _mm_storeu_si128((__m128i*)(dst + ((X)*dstStride)), itmp); \
-    itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 16))); \
-    row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
-    row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-\
-    itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 17))); \
-    row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
-    row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-\
-    it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
-    it2 = _mm_mullo_epi16(it1, row11L); \
-    it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
-    it2 = _mm_add_epi16(it2, it3); \
-    i16 = _mm_set1_epi16(16); \
-    it2 = _mm_add_epi16(it2, i16); \
-    row11L = _mm_srai_epi16(it2, 5); \
-    it2 = _mm_mullo_epi16(it1, row11H); \
-    it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
-    it2 = _mm_add_epi16(it2, it3); \
-    it2 = _mm_add_epi16(it2, i16); \
-    row11H = _mm_srai_epi16(it2, 5); \
-\
-    itmp = _mm_packus_epi16(row11L, row11H); \
-    _mm_storeu_si128((__m128i*)(dst + ((X)*dstStride) + 16), itmp);
-
-#define PREDANG_CALCROW_VER_MODE2(X) \
-    v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
-    v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
-    it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
-    it2 = _mm_mullo_epi16(it1, row11); \
-    it3 = _mm_mullo_epi16(v_deltaFract, row21); \
-    it2 = _mm_add_epi16(it2, it3); \
-    i16 = _mm_set1_epi16(16); \
-    it2 = _mm_add_epi16(it2, i16); \
-    res1 = _mm_srai_epi16(it2, 5); \
-    it2 = _mm_mullo_epi16(it1, row12); \
-    it3 = _mm_mullo_epi16(v_deltaFract, row22); \
-    it2 = _mm_add_epi16(it2, it3); \
-    it2 = _mm_add_epi16(it2, i16); \
-    res2 = _mm_srai_epi16(it2, 5); \
-\
-    itmp = _mm_packus_epi16(res1, res2); \
-    _mm_storeu_si128((__m128i*)(dst + ((X)*dstStride)), itmp); \
-    it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
-    it2 = _mm_mullo_epi16(it1, row13); \
-    it3 = _mm_mullo_epi16(v_deltaFract, row23); \
-    it2 = _mm_add_epi16(it2, it3); \
-    i16 = _mm_set1_epi16(16); \
-    it2 = _mm_add_epi16(it2, i16); \
-    res1 = _mm_srai_epi16(it2, 5); \
-    it2 = _mm_mullo_epi16(it1, row14); \
-    it3 = _mm_mullo_epi16(v_deltaFract, row24); \
-    it2 = _mm_add_epi16(it2, it3); \
-    it2 = _mm_add_epi16(it2, i16); \
-    res2 = _mm_srai_epi16(it2, 5); \
-\
-    itmp = _mm_packus_epi16(res1, res2); \
-    _mm_storeu_si128((__m128i*)(dst + ((X)*dstStride) + 16), itmp);
-
-#define PREDANG_CALCROW_HOR(X, rowx) \
-    itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][((X))]))); \
-    row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
-    row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-\
-    itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][((X))] + 1))); \
-    row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
-    row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-\
-    v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
-    v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
-    it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
-    it2 = _mm_mullo_epi16(it1, row11L); \
-    it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
-    it2 = _mm_add_epi16(it2, it3); \
-    i16 = _mm_set1_epi16(16); \
-    it2 = _mm_add_epi16(it2, i16); \
-    row11L = _mm_srai_epi16(it2, 5); \
-    it2 = _mm_mullo_epi16(it1, row11H); \
-    it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
-    it2 = _mm_add_epi16(it2, it3); \
-    it2 = _mm_add_epi16(it2, i16); \
-    row11H = _mm_srai_epi16(it2, 5); \
-    rowx = _mm_packus_epi16(row11L, row11H);
-
-#define PREDANG_CALCROW_HOR_MODE2(rowx) \
-    v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
-    v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
-    it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
-    it2 = _mm_mullo_epi16(it1, row11L); \
-    it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
-    it2 = _mm_add_epi16(it2, it3); \
-    i16 = _mm_set1_epi16(16); \
-    it2 = _mm_add_epi16(it2, i16); \
-    res1 = _mm_srai_epi16(it2, 5); \
-    it2 = _mm_mullo_epi16(it1, row11H); \
-    it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
-    it2 = _mm_add_epi16(it2, it3); \
-    it2 = _mm_add_epi16(it2, i16); \
-    res2 = _mm_srai_epi16(it2, 5); \
-    rowx = _mm_packus_epi16(res1, res2);
-
-#define LOADROW(ROWL, ROWH, X) \
-    itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (X))); \
-    ROWL = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
-    ROWH = _mm_unpackhi_epi8(itmp, _mm_setzero_si128());
-
-#define BLND2_2(R1, R2) \
-    itmp1 = _mm_unpacklo_epi64(R1, R2); \
-    itmp2 = _mm_unpackhi_epi64(R1, R2); \
-    _mm_storeu_si128((__m128i*)dst, itmp1); \
-    dst += dstStride; \
-    _mm_storeu_si128((__m128i*)dst, itmp2); \
-    dst += dstStride;
-
-#define MB8(R1, R2, R3, R4, R5, R6, R7, R8) \
-    itmp1 = _mm_unpacklo_epi8(R1, R2); \
-    itmp2 = _mm_unpackhi_epi8(R1, R2); \
-    R1 = itmp1; \
-    R2 = itmp2; \
-    itmp1 = _mm_unpacklo_epi8(R3, R4); \
-    itmp2 = _mm_unpackhi_epi8(R3, R4); \
-    R3 = itmp1; \
-    R4 = itmp2; \
-    itmp1 = _mm_unpacklo_epi16(R1, R3); \
-    itmp2 = _mm_unpackhi_epi16(R1, R3); \
-    R1 = itmp1; \
-    R3 = itmp2; \
-    itmp1 = _mm_unpacklo_epi16(R2, R4); \
-    itmp2 = _mm_unpackhi_epi16(R2, R4); \
-    R2 = itmp1; \
-    R4 = itmp2; \
-    itmp1 = _mm_unpacklo_epi8(R5, R6); \
-    itmp2 = _mm_unpackhi_epi8(R5, R6); \
-    R5 = itmp1; \
-    R6 = itmp2; \
-    itmp1 = _mm_unpacklo_epi8(R7, R8); \
-    itmp2 = _mm_unpackhi_epi8(R7, R8); \
-    R7 = itmp1; \
-    R8 = itmp2; \
-    itmp1 = _mm_unpacklo_epi16(R5, R7); \
-    itmp2 = _mm_unpackhi_epi16(R5, R7); \
-    R5 = itmp1; \
-    R7 = itmp2; \
-    itmp1 = _mm_unpacklo_epi16(R6, R8); \
-    itmp2 = _mm_unpackhi_epi16(R6, R8); \
-    R6 = itmp1; \
-    R8 = itmp2; \
-    itmp1 = _mm_unpacklo_epi32(R1, R5); \
-    itmp2 = _mm_unpackhi_epi32(R1, R5); \
-    R1 = itmp1; \
-    R5 = itmp2; \
-\
-    itmp1 = _mm_unpacklo_epi32(R2, R6); \
-    itmp2 = _mm_unpackhi_epi32(R2, R6); \
-    R2 = itmp1; \
-    R6 = itmp2; \
-\
-    itmp1 = _mm_unpacklo_epi32(R3, R7); \
-    itmp2 = _mm_unpackhi_epi32(R3, R7); \
-    R3 = itmp1; \
-    R7 = itmp2; \
-\
-    itmp1 = _mm_unpacklo_epi32(R4, R8); \
-    itmp2 = _mm_unpackhi_epi32(R4, R8); \
-    R4 = itmp1; \
-    R8 = itmp2;
-
-#define CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, X) \
-    PREDANG_CALCROW_HOR(0 + X, R1) \
-    PREDANG_CALCROW_HOR(1 + X, R2) \
-    PREDANG_CALCROW_HOR(2 + X, R3) \
-    PREDANG_CALCROW_HOR(3 + X, R4) \
-    PREDANG_CALCROW_HOR(4 + X, R5) \
-    PREDANG_CALCROW_HOR(5 + X, R6) \
-    PREDANG_CALCROW_HOR(6 + X, R7)
-
-#define CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8) \
-    PREDANG_CALCROW_HOR_MODE2(R1) \
-    PREDANG_CALCROW_HOR_MODE2(R2) \
-    PREDANG_CALCROW_HOR_MODE2(R3) \
-    PREDANG_CALCROW_HOR_MODE2(R4) \
-    PREDANG_CALCROW_HOR_MODE2(R5) \
-    PREDANG_CALCROW_HOR_MODE2(R6) \
-    PREDANG_CALCROW_HOR_MODE2(R7) \
-
-void intraPredAng32x32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int)
-{
-    int k;
-    int blkSize = 32;
-
-    // Map the mode index to main prediction direction and angle
-    assert(dirMode > 1); //no planar and dc
-    bool modeHor       = (dirMode < 18);
-    bool modeVer       = !modeHor;
-    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
-    int lookIdx = intraPredAngle;
-    int absAng         = abs(intraPredAngle);
-    int signAng        = intraPredAngle < 0 ? -1 : 1;
-
-    // Set bitshifts and scale the angle parameter to block size
-    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
-    int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
-    int invAngle       = invAngTable[absAng];
-    absAng             = angTable[absAng];
-    intraPredAngle     = signAng * absAng;
-
-    // Do angular predictions
-
-    pixel* refMain;
-    pixel* refSide;
-
-    // Initialize the Main and Left reference array.
-    if (intraPredAngle < 0)
-    {
-        refMain = (modeVer ? refAbove : refLeft);     // + (blkSize - 1);
-        refSide = (modeVer ? refLeft : refAbove);     // + (blkSize - 1);
-
-        // Extend the Main reference to the left.
-        int invAngleSum    = 128;     // rounding for (shift by 8)
-        if (intraPredAngle != -32)
-            for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
-            {
-                invAngleSum += invAngle;
-                refMain[k] = refSide[invAngleSum >> 8];
-            }
-    }
-    else
-    {
-        refMain = modeVer ? refAbove : refLeft;
-        refSide = modeVer ? refLeft  : refAbove;
-    }
-
-    // bfilter will always be true for blocksize 8
-    if (intraPredAngle == 0)  // Exactly horizontal/vertical angles
-    {
-        if (modeHor)
-        {
-            __m128i temp, temp1;
-
-#define BROADCAST_STORE(X) \
-    temp1 = _mm_shuffle_epi8(temp, _mm_set1_epi8(X)); \
-    _mm_storeu_si128((__m128i*)(dst + ((X)*dstStride)), temp1); \
-    _mm_storeu_si128((__m128i*)(dst + ((X)*dstStride) + 16), temp1); \
-
-            temp = _mm_loadu_si128((__m128i const*)(refMain + 1));
-


More information about the x265-commits mailing list