[x265-commits] [x265] remove unnecessary copyToPicLuma() call
Satoshi Nakagawa
nakagawa424 at oki.com
Thu Feb 6 19:10:19 CET 2014
details: http://hg.videolan.org/x265/rev/db0c1dfc3a11
branches:
changeset: 6031:db0c1dfc3a11
user: Satoshi Nakagawa <nakagawa424 at oki.com>
date: Thu Feb 06 11:02:31 2014 +0900
description:
remove unnecessary copyToPicLuma() call
Subject: [x265] asm: assembly code for IntraAng32x32 all modes
details: http://hg.videolan.org/x265/rev/76fa0811c4e7
branches:
changeset: 6032:76fa0811c4e7
user: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date: Thu Feb 06 11:11:37 2014 +0530
description:
asm: assembly code for IntraAng32x32 all modes
Subject: [x265] testbench: stress test support for all pixelharness functions
details: http://hg.videolan.org/x265/rev/b86a25eb7968
branches:
changeset: 6033:b86a25eb7968
user: Murugan Vairavel <murugan at multicorewareinc.com>
date: Thu Feb 06 12:15:43 2014 +0530
description:
testbench: stress test support for all pixelharness functions
Subject: [x265] asm: modified satd and sad asm functions in 16bpp to avoid overflow
details: http://hg.videolan.org/x265/rev/ffe13a5eccb9
branches:
changeset: 6034:ffe13a5eccb9
user: Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
date: Thu Feb 06 12:28:32 2014 +0530
description:
asm: modified satd and sad asm functions in 16bpp to avoid overflow
Subject: [x265] slicetype: bug fix for cuTree, use int32_t for listamount and propagate_amount to calculate valid propagate_cost
details: http://hg.videolan.org/x265/rev/6d5207b8b2ef
branches:
changeset: 6035:6d5207b8b2ef
user: Gopu Govindaswamy
date: Thu Feb 06 16:53:17 2014 -0800
description:
slicetype: bug fix for cuTree, use int32_t for listamount and propagate_amount to calculate valid propagate_cost
Subject: [x265] asm: fix Intrapred_ang[32x32] mode 10 and 26 failure on Mac
details: http://hg.videolan.org/x265/rev/a079afc4e6c7
branches:
changeset: 6036:a079afc4e6c7
user: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date: Thu Feb 06 16:42:42 2014 +0530
description:
asm: fix Intrapred_ang[32x32] mode 10 and 26 failure on Mac
Subject: [x265] vec: remove intraPredAng32x32, full asm coverage
details: http://hg.videolan.org/x265/rev/40bec5582eca
branches:
changeset: 6037:40bec5582eca
user: Steve Borho <steve at borho.org>
date: Thu Feb 06 12:09:28 2014 -0600
description:
vec: remove intraPredAng32x32, full asm coverage
diffstat:
source/Lib/TLibEncoder/TEncCu.cpp | 4 -
source/common/vec/intra-ssse3.cpp | 1245 -------------
source/common/x86/asm-primitives.cpp | 61 +-
source/common/x86/intrapred.h | 14 +
source/common/x86/intrapred8.asm | 3130 ++++++++++++++++++++++++++++++++-
source/common/x86/pixel-a.asm | 70 +-
source/common/x86/sad16-a.asm | 14 +-
source/encoder/compress.cpp | 2 -
source/encoder/slicetype.cpp | 42 +-
source/test/pixelharness.cpp | 252 +-
source/test/pixelharness.h | 6 +-
11 files changed, 3353 insertions(+), 1487 deletions(-)
diffs (truncated from 5311 to 300 lines):
diff -r 634bc0b1c246 -r 40bec5582eca source/Lib/TLibEncoder/TEncCu.cpp
--- a/source/Lib/TLibEncoder/TEncCu.cpp Wed Feb 05 23:10:22 2014 -0600
+++ b/source/Lib/TLibEncoder/TEncCu.cpp Thu Feb 06 12:09:28 2014 -0600
@@ -1395,8 +1395,6 @@ void TEncCu::xCheckRDCostIntra(TComDataC
m_search->estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], preCalcDistC, true);
- m_tmpRecoYuv[depth]->copyToPicLuma(outTempCU->getPic()->getPicYuvRec(), outTempCU->getAddr(), outTempCU->getZorderIdxInCU());
-
m_search->estIntraPredChromaQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], preCalcDistC);
m_entropyCoder->resetBits();
@@ -1444,8 +1442,6 @@ void TEncCu::xCheckRDCostIntraInInter(TC
m_search->estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth],
preCalcDistC, bSeparateLumaChroma);
- m_tmpRecoYuv[depth]->copyToPicLuma(outTempCU->getPic()->getPicYuvRec(), outTempCU->getAddr(), outTempCU->getZorderIdxInCU());
-
m_search->estIntraPredChromaQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth], preCalcDistC);
m_entropyCoder->resetBits();
diff -r 634bc0b1c246 -r 40bec5582eca source/common/vec/intra-ssse3.cpp
--- a/source/common/vec/intra-ssse3.cpp Wed Feb 05 23:10:22 2014 -0600
+++ b/source/common/vec/intra-ssse3.cpp Thu Feb 06 12:09:28 2014 -0600
@@ -557,1249 +557,6 @@ void intraPredAng16x16(pixel* dst, intpt
#undef MB4
#undef CALC_BLND_8ROWS
-//32x32
-#define PREDANG_CALCROW_VER(X) \
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)]))); \
- row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
- row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-\
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 1))); \
- row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
- row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-\
- it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
- it2 = _mm_mullo_epi16(it1, row11L); \
- it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
- it2 = _mm_add_epi16(it2, it3); \
- i16 = _mm_set1_epi16(16); \
- it2 = _mm_add_epi16(it2, i16); \
- row11L = _mm_srai_epi16(it2, 5); \
- it2 = _mm_mullo_epi16(it1, row11H); \
- it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
- it2 = _mm_add_epi16(it2, it3); \
- it2 = _mm_add_epi16(it2, i16); \
- row11H = _mm_srai_epi16(it2, 5); \
-\
- itmp = _mm_packus_epi16(row11L, row11H); \
- _mm_storeu_si128((__m128i*)(dst + ((X)*dstStride)), itmp); \
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 16))); \
- row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
- row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-\
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 17))); \
- row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
- row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-\
- it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
- it2 = _mm_mullo_epi16(it1, row11L); \
- it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
- it2 = _mm_add_epi16(it2, it3); \
- i16 = _mm_set1_epi16(16); \
- it2 = _mm_add_epi16(it2, i16); \
- row11L = _mm_srai_epi16(it2, 5); \
- it2 = _mm_mullo_epi16(it1, row11H); \
- it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
- it2 = _mm_add_epi16(it2, it3); \
- it2 = _mm_add_epi16(it2, i16); \
- row11H = _mm_srai_epi16(it2, 5); \
-\
- itmp = _mm_packus_epi16(row11L, row11H); \
- _mm_storeu_si128((__m128i*)(dst + ((X)*dstStride) + 16), itmp);
-
-#define PREDANG_CALCROW_VER_MODE2(X) \
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
- it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
- it2 = _mm_mullo_epi16(it1, row11); \
- it3 = _mm_mullo_epi16(v_deltaFract, row21); \
- it2 = _mm_add_epi16(it2, it3); \
- i16 = _mm_set1_epi16(16); \
- it2 = _mm_add_epi16(it2, i16); \
- res1 = _mm_srai_epi16(it2, 5); \
- it2 = _mm_mullo_epi16(it1, row12); \
- it3 = _mm_mullo_epi16(v_deltaFract, row22); \
- it2 = _mm_add_epi16(it2, it3); \
- it2 = _mm_add_epi16(it2, i16); \
- res2 = _mm_srai_epi16(it2, 5); \
-\
- itmp = _mm_packus_epi16(res1, res2); \
- _mm_storeu_si128((__m128i*)(dst + ((X)*dstStride)), itmp); \
- it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
- it2 = _mm_mullo_epi16(it1, row13); \
- it3 = _mm_mullo_epi16(v_deltaFract, row23); \
- it2 = _mm_add_epi16(it2, it3); \
- i16 = _mm_set1_epi16(16); \
- it2 = _mm_add_epi16(it2, i16); \
- res1 = _mm_srai_epi16(it2, 5); \
- it2 = _mm_mullo_epi16(it1, row14); \
- it3 = _mm_mullo_epi16(v_deltaFract, row24); \
- it2 = _mm_add_epi16(it2, it3); \
- it2 = _mm_add_epi16(it2, i16); \
- res2 = _mm_srai_epi16(it2, 5); \
-\
- itmp = _mm_packus_epi16(res1, res2); \
- _mm_storeu_si128((__m128i*)(dst + ((X)*dstStride) + 16), itmp);
-
-#define PREDANG_CALCROW_HOR(X, rowx) \
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][((X))]))); \
- row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
- row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-\
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][((X))] + 1))); \
- row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
- row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-\
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
- it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
- it2 = _mm_mullo_epi16(it1, row11L); \
- it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
- it2 = _mm_add_epi16(it2, it3); \
- i16 = _mm_set1_epi16(16); \
- it2 = _mm_add_epi16(it2, i16); \
- row11L = _mm_srai_epi16(it2, 5); \
- it2 = _mm_mullo_epi16(it1, row11H); \
- it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
- it2 = _mm_add_epi16(it2, it3); \
- it2 = _mm_add_epi16(it2, i16); \
- row11H = _mm_srai_epi16(it2, 5); \
- rowx = _mm_packus_epi16(row11L, row11H);
-
-#define PREDANG_CALCROW_HOR_MODE2(rowx) \
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
- it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
- it2 = _mm_mullo_epi16(it1, row11L); \
- it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
- it2 = _mm_add_epi16(it2, it3); \
- i16 = _mm_set1_epi16(16); \
- it2 = _mm_add_epi16(it2, i16); \
- res1 = _mm_srai_epi16(it2, 5); \
- it2 = _mm_mullo_epi16(it1, row11H); \
- it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
- it2 = _mm_add_epi16(it2, it3); \
- it2 = _mm_add_epi16(it2, i16); \
- res2 = _mm_srai_epi16(it2, 5); \
- rowx = _mm_packus_epi16(res1, res2);
-
-#define LOADROW(ROWL, ROWH, X) \
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (X))); \
- ROWL = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
- ROWH = _mm_unpackhi_epi8(itmp, _mm_setzero_si128());
-
-#define BLND2_2(R1, R2) \
- itmp1 = _mm_unpacklo_epi64(R1, R2); \
- itmp2 = _mm_unpackhi_epi64(R1, R2); \
- _mm_storeu_si128((__m128i*)dst, itmp1); \
- dst += dstStride; \
- _mm_storeu_si128((__m128i*)dst, itmp2); \
- dst += dstStride;
-
-#define MB8(R1, R2, R3, R4, R5, R6, R7, R8) \
- itmp1 = _mm_unpacklo_epi8(R1, R2); \
- itmp2 = _mm_unpackhi_epi8(R1, R2); \
- R1 = itmp1; \
- R2 = itmp2; \
- itmp1 = _mm_unpacklo_epi8(R3, R4); \
- itmp2 = _mm_unpackhi_epi8(R3, R4); \
- R3 = itmp1; \
- R4 = itmp2; \
- itmp1 = _mm_unpacklo_epi16(R1, R3); \
- itmp2 = _mm_unpackhi_epi16(R1, R3); \
- R1 = itmp1; \
- R3 = itmp2; \
- itmp1 = _mm_unpacklo_epi16(R2, R4); \
- itmp2 = _mm_unpackhi_epi16(R2, R4); \
- R2 = itmp1; \
- R4 = itmp2; \
- itmp1 = _mm_unpacklo_epi8(R5, R6); \
- itmp2 = _mm_unpackhi_epi8(R5, R6); \
- R5 = itmp1; \
- R6 = itmp2; \
- itmp1 = _mm_unpacklo_epi8(R7, R8); \
- itmp2 = _mm_unpackhi_epi8(R7, R8); \
- R7 = itmp1; \
- R8 = itmp2; \
- itmp1 = _mm_unpacklo_epi16(R5, R7); \
- itmp2 = _mm_unpackhi_epi16(R5, R7); \
- R5 = itmp1; \
- R7 = itmp2; \
- itmp1 = _mm_unpacklo_epi16(R6, R8); \
- itmp2 = _mm_unpackhi_epi16(R6, R8); \
- R6 = itmp1; \
- R8 = itmp2; \
- itmp1 = _mm_unpacklo_epi32(R1, R5); \
- itmp2 = _mm_unpackhi_epi32(R1, R5); \
- R1 = itmp1; \
- R5 = itmp2; \
-\
- itmp1 = _mm_unpacklo_epi32(R2, R6); \
- itmp2 = _mm_unpackhi_epi32(R2, R6); \
- R2 = itmp1; \
- R6 = itmp2; \
-\
- itmp1 = _mm_unpacklo_epi32(R3, R7); \
- itmp2 = _mm_unpackhi_epi32(R3, R7); \
- R3 = itmp1; \
- R7 = itmp2; \
-\
- itmp1 = _mm_unpacklo_epi32(R4, R8); \
- itmp2 = _mm_unpackhi_epi32(R4, R8); \
- R4 = itmp1; \
- R8 = itmp2;
-
-#define CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, X) \
- PREDANG_CALCROW_HOR(0 + X, R1) \
- PREDANG_CALCROW_HOR(1 + X, R2) \
- PREDANG_CALCROW_HOR(2 + X, R3) \
- PREDANG_CALCROW_HOR(3 + X, R4) \
- PREDANG_CALCROW_HOR(4 + X, R5) \
- PREDANG_CALCROW_HOR(5 + X, R6) \
- PREDANG_CALCROW_HOR(6 + X, R7)
-
-#define CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8) \
- PREDANG_CALCROW_HOR_MODE2(R1) \
- PREDANG_CALCROW_HOR_MODE2(R2) \
- PREDANG_CALCROW_HOR_MODE2(R3) \
- PREDANG_CALCROW_HOR_MODE2(R4) \
- PREDANG_CALCROW_HOR_MODE2(R5) \
- PREDANG_CALCROW_HOR_MODE2(R6) \
- PREDANG_CALCROW_HOR_MODE2(R7) \
-
-void intraPredAng32x32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int)
-{
- int k;
- int blkSize = 32;
-
- // Map the mode index to main prediction direction and angle
- assert(dirMode > 1); //no planar and dc
- bool modeHor = (dirMode < 18);
- bool modeVer = !modeHor;
- int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
- int lookIdx = intraPredAngle;
- int absAng = abs(intraPredAngle);
- int signAng = intraPredAngle < 0 ? -1 : 1;
-
- // Set bitshifts and scale the angle parameter to block size
- int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
- int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
- int invAngle = invAngTable[absAng];
- absAng = angTable[absAng];
- intraPredAngle = signAng * absAng;
-
- // Do angular predictions
-
- pixel* refMain;
- pixel* refSide;
-
- // Initialize the Main and Left reference array.
- if (intraPredAngle < 0)
- {
- refMain = (modeVer ? refAbove : refLeft); // + (blkSize - 1);
- refSide = (modeVer ? refLeft : refAbove); // + (blkSize - 1);
-
- // Extend the Main reference to the left.
- int invAngleSum = 128; // rounding for (shift by 8)
- if (intraPredAngle != -32)
- for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
- {
- invAngleSum += invAngle;
- refMain[k] = refSide[invAngleSum >> 8];
- }
- }
- else
- {
- refMain = modeVer ? refAbove : refLeft;
- refSide = modeVer ? refLeft : refAbove;
- }
-
- // bfilter will always be true for blocksize 8
- if (intraPredAngle == 0) // Exactly horizontal/vertical angles
- {
- if (modeHor)
- {
- __m128i temp, temp1;
-
-#define BROADCAST_STORE(X) \
- temp1 = _mm_shuffle_epi8(temp, _mm_set1_epi8(X)); \
- _mm_storeu_si128((__m128i*)(dst + ((X)*dstStride)), temp1); \
- _mm_storeu_si128((__m128i*)(dst + ((X)*dstStride) + 16), temp1); \
-
- temp = _mm_loadu_si128((__m128i const*)(refMain + 1));
-
More information about the x265-commits mailing list