[x265] [PATCH 6/6] intrapred: framework to generate all 33 Angle modes at once
Min Chen
chenm003 at 163.com
Tue Jun 18 18:43:38 CEST 2013
---
source/Lib/TLibCommon/TComPrediction.cpp | 5 +-
source/Lib/TLibCommon/TComPrediction.h | 1 +
source/Lib/TLibEncoder/TEncSearch.cpp | 65 +-
source/common/IntraPred.cpp | 569 +-
source/common/primitives.h | 2 +
source/common/vec/intrapred.inc | 9409 +++++++++++++++---------------
source/test/intrapredharness.cpp | 72 +
source/test/intrapredharness.h | 4 +
8 files changed, 5179 insertions(+), 4948 deletions(-)
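The core idea of the patch: rather than calling predIntraLumaAng() 33 times per 4x4 block, one primitive call fills a contiguous buffer with all 33 angular predictions, and the horizontal modes (2-17) are left transposed so the per-mode flip can be skipped. A minimal sketch of the resulting buffer layout, assuming the 8-bit build (predForMode, modeBuf and N are illustrative names, not identifiers from the patch):

    #include <cstdint>

    typedef uint8_t pixel; // HIGH_BIT_DEPTH builds use a 16-bit Pel instead

    // The 33 angular predictions for an NxN block are packed back-to-back,
    // mode 2 first; each block is stored with stride N (no row padding).
    inline pixel* predForMode(pixel* modeBuf, int N, int mode)
    {
        // matches &tmp[(uiMode - 2) * (uiWidth * uiWidth)] in TEncSearch.cpp
        return modeBuf + (mode - 2) * N * N;
    }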
diff --git a/source/Lib/TLibCommon/TComPrediction.cpp b/source/Lib/TLibCommon/TComPrediction.cpp
index 6e6baf3..6a58140 100644
--- a/source/Lib/TLibCommon/TComPrediction.cpp
+++ b/source/Lib/TLibCommon/TComPrediction.cpp
@@ -60,13 +60,15 @@ const UChar m_aucIntraFilter[5] =
TComPrediction::TComPrediction()
: m_pLumaRecBuffer(0)
, m_iLumaRecStride(0)
+ , m_piPredBuf(NULL)
+ , m_piPredAngBufs(NULL)
{
- m_piPredBuf = NULL;
}
TComPrediction::~TComPrediction()
{
delete[] m_piPredBuf;
+ xFree(m_piPredAngBufs);
xFree(refAbove);
xFree(refAboveFlt);
@@ -114,6 +116,7 @@ Void TComPrediction::initTempBuff()
m_iPredBufHeight = ((MAX_CU_SIZE + 2) << 4);
m_iPredBufStride = ((MAX_CU_SIZE + 8) << 4);
m_piPredBuf = new Pel[m_iPredBufStride * m_iPredBufHeight];
+ m_piPredAngBufs = (Pel*)xMalloc(Pel, 33 * MAX_CU_SIZE * MAX_CU_SIZE);
refAbove = (Pel*)xMalloc(Pel, 3 * MAX_CU_SIZE);
refAboveFlt = (Pel*)xMalloc(Pel, 3 * MAX_CU_SIZE);
diff --git a/source/Lib/TLibCommon/TComPrediction.h b/source/Lib/TLibCommon/TComPrediction.h
index 33d3882..8e18517 100644
--- a/source/Lib/TLibCommon/TComPrediction.h
+++ b/source/Lib/TLibCommon/TComPrediction.h
@@ -60,6 +60,7 @@ class TComPrediction : public TComWeightPrediction
protected:
Pel* m_piPredBuf;
+ Pel* m_piPredAngBufs;
Int m_iPredBufStride;
Int m_iPredBufHeight;
diff --git a/source/Lib/TLibEncoder/TEncSearch.cpp b/source/Lib/TLibEncoder/TEncSearch.cpp
index 63e2d66..7e4c50e 100644
--- a/source/Lib/TLibEncoder/TEncSearch.cpp
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp
@@ -2279,16 +2279,67 @@ Void TEncSearch::estIntraPredQT(TComDataCU* pcCU,
primitives.getIPredPlanar((pixel*)ptrSrc + ADI_BUF_STRIDE + 1, ADI_BUF_STRIDE, (pixel*)piPred, uiStride, uiWidth);
uiSads[PLANAR_IDX] = sa8d((pixel*)piOrg, uiStride, (pixel*)piPred, uiStride);
- // 33 Angle modes
- for (UInt uiMode = 2; uiMode < numModesAvailable; uiMode++)
+ // Generate all 33 Angle modes in one call
+ if (uiWidth <= 4)
{
- predIntraLumaAng(pcCU->getPattern(), uiMode, piPred, uiStride, uiWidth);
+ ALIGN_VAR_32(Pel, buf1[MAX_CU_SIZE * MAX_CU_SIZE]);
+ ALIGN_VAR_32(Pel, tmp[33 * MAX_CU_SIZE * MAX_CU_SIZE]);
- // use hadamard transform here
- UInt uiSad = sa8d((pixel*)piOrg, uiStride, (pixel*)piPred, uiStride);
- uiSads[uiMode] = uiSad;
+ // Transpose NxN
+ // TODO: optimize with SSE2 (a transpose sketch follows this diff)
+ for (int k = 0; k < uiWidth; k++)
+ {
+ for (int l = 0; l < uiWidth; l++)
+ {
+ buf1[k * uiWidth + l] = piOrg[l * uiStride + k];
+ }
+ }
+
+ Pel *pAbove0 = refAbove + uiWidth - 1;
+ Pel *pAbove1 = refAboveFlt + uiWidth - 1;
+ Pel *pLeft0 = refLeft + uiWidth - 1;
+ Pel *pLeft1 = refLeftFlt + uiWidth - 1;
+
+ x265::primitives.getIPredAngs4(tmp, pAbove0, pLeft0, pAbove1, pLeft1, (uiWidth<16));
+
+ // TODO: We need SATD_x4 here
+ for (UInt uiMode = 2; uiMode < 18; uiMode++)
+ {
+ //predIntraLumaAng(pcCU->getPattern(), uiMode, piPred, uiStride, uiWidth);
+ //for (int k = 0; k < uiWidth; k++)
+ //{
+ // for (int l = 0; l < uiWidth; l++)
+ // {
+ // if (tmp[(uiMode - 2) * (uiWidth * uiWidth) + k * uiWidth + l] != piPred[l * uiStride + k])
+ // printf("X");
+ // }
+ //}
+ //UInt uiSad = sa8d((pixel*)piOrg, uiStride, (pixel*)piPred, uiStride);
+
+ // use hadamard transform here
+ UInt uiSad = sa8d((pixel*)buf1, uiWidth, (pixel*)&tmp[(uiMode - 2) * (uiWidth * uiWidth)], uiWidth);
+ uiSads[uiMode] = uiSad;
+ }
+ for (UInt uiMode = 18; uiMode < numModesAvailable; uiMode++)
+ {
+ // use hadamard transform here
+ UInt uiSad = sa8d((pixel*)piOrg, uiStride, (pixel*)&tmp[(uiMode - 2) * (uiWidth * uiWidth)], uiWidth);
+ uiSads[uiMode] = uiSad;
+ }
+ x265_emms();
+ }
+ else
+ {
+ for (UInt uiMode = 2; uiMode < numModesAvailable; uiMode++)
+ {
+ predIntraLumaAng(pcCU->getPattern(), uiMode, piPred, uiStride, uiWidth);
+
+ // use hadamard transform here
+ UInt uiSad = sa8d((pixel*)piOrg, uiStride, (pixel*)piPred, uiStride);
+ uiSads[uiMode] = uiSad;
+ }
+ x265_emms();
}
- x265_emms();
for (UInt uiMode = 0; uiMode < numModesAvailable; uiMode++)
{
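Two notes on the hunk above. Comparing the horizontal modes against a transposed copy of the source is valid because the Hadamard-based SA8D cost is unchanged when both of its arguments are transposed (the 4x4/8x8 Hadamard matrix is symmetric), so sa8d(transpose(org), pred) equals sa8d(org, transpose(pred)); transposing the source once therefore replaces sixteen per-mode flips. And for the SSE2 TODO above, one possible 8-bit transpose looks like this (a sketch, not part of the patch; transpose4x4 is an illustrative name and dst must have 16 writable bytes):

    #include <emmintrin.h>
    #include <stdint.h>

    typedef uint8_t pixel;

    // Transpose a 4x4 block of 8-bit pixels into a contiguous 16-byte buffer.
    static void transpose4x4(pixel* dst, const pixel* src, intptr_t srcStride)
    {
        __m128i r0 = _mm_cvtsi32_si128(*(const int*)(src + 0 * srcStride));
        __m128i r1 = _mm_cvtsi32_si128(*(const int*)(src + 1 * srcStride));
        __m128i r2 = _mm_cvtsi32_si128(*(const int*)(src + 2 * srcStride));
        __m128i r3 = _mm_cvtsi32_si128(*(const int*)(src + 3 * srcStride));
        __m128i t0 = _mm_unpacklo_epi8(r0, r1);  // a0 b0 a1 b1 a2 b2 a3 b3
        __m128i t1 = _mm_unpacklo_epi8(r2, r3);  // c0 d0 c1 d1 c2 d2 c3 d3
        // interleave 16-bit pairs: a0 b0 c0 d0 | a1 b1 c1 d1 | ...
        _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi16(t0, t1));
    }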
diff --git a/source/common/IntraPred.cpp b/source/common/IntraPred.cpp
index 75c7812..dc1cd0d 100644
--- a/source/common/IntraPred.cpp
+++ b/source/common/IntraPred.cpp
@@ -1,262 +1,307 @@
-/*****************************************************************************
- * Copyright (C) 2013 x265 project
- *
- * Authors: Min Chen <chenm003 at 163.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing at multicorewareinc.com.
- *****************************************************************************/
-
-#include "primitives.h"
-#include "Lib/TLibCommon/TComPrediction.h"
-#include <cstring>
-#include <assert.h>
-
-//#define MAX_CU_SIZE 64
-extern char g_aucConvertToBit[];
-
-namespace {
-pixel CDECL predIntraGetPredValDC(pixel* pSrc, intptr_t iSrcStride, intptr_t iWidth)
-{
- int iInd, iSum = 0;
- pixel pDcVal;
-
- for (iInd = 0; iInd < iWidth; iInd++)
- {
- iSum += pSrc[iInd - iSrcStride];
- }
- for (iInd = 0; iInd < iWidth; iInd++)
- {
- iSum += pSrc[iInd * iSrcStride - 1];
- }
-
- pDcVal = (pixel)((iSum + iWidth) / (iWidth + iWidth));
-
- return pDcVal;
-}
-
-void xDCPredFiltering(pixel* pSrc, intptr_t iSrcStride, pixel* pDst, intptr_t iDstStride, int iWidth, int iHeight)
-{
- intptr_t x, y, iDstStride2, iSrcStride2;
-
- // boundary pixels processing
- pDst[0] = (pixel)((pSrc[-iSrcStride] + pSrc[-1] + 2 * pDst[0] + 2) >> 2);
-
- for (x = 1; x < iWidth; x++)
- {
- pDst[x] = (pixel)((pSrc[x - iSrcStride] + 3 * pDst[x] + 2) >> 2);
- }
-
- for (y = 1, iDstStride2 = iDstStride, iSrcStride2 = iSrcStride - 1; y < iHeight; y++, iDstStride2 += iDstStride, iSrcStride2 += iSrcStride)
- {
- pDst[iDstStride2] = (pixel)((pSrc[iSrcStride2] + 3 * pDst[iDstStride2] + 2) >> 2);
- }
-}
-
-void xPredIntraDC(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride, int width, int bFilter)
-{
- int k, l;
- int blkSize = width;
-
- // Do the DC prediction
- pixel dcval = (pixel)predIntraGetPredValDC(pSrc, srcStride, width);
-
- for (k = 0; k < blkSize; k++)
- {
- for (l = 0; l < blkSize; l++)
- {
- pDst[k * dstStride + l] = dcval;
- }
- }
-
- if (bFilter)
- {
- xDCPredFiltering(pSrc, srcStride, pDst, dstStride, width, width);
- }
-}
-
-void xPredIntraPlanar(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride, int width)
-{
- //assert(width == height);
-
- int k, l;
- pixel bottomLeft, topRight;
- int horPred;
- // OPT_ME: when width is 64, the shift1D is 8, then the dynamic range is 17 bits or [-65280, 65280], so we have to use 32 bits here
- int32_t leftColumn[MAX_CU_SIZE + 1], topRow[MAX_CU_SIZE + 1];
- // CHECK_ME: dynamic range is 9 bits or 15 bits(I assume max input bit_depth is 14 bits)
- int16_t bottomRow[MAX_CU_SIZE], rightColumn[MAX_CU_SIZE];
- int blkSize = width;
- int offset2D = width;
- int shift1D = g_aucConvertToBit[width] + 2;
- int shift2D = shift1D + 1;
-
- // Get left and above reference column and row
- for (k = 0; k < blkSize + 1; k++)
- {
- topRow[k] = pSrc[k - srcStride];
- leftColumn[k] = pSrc[k * srcStride - 1];
- }
-
- // Prepare intermediate variables used in interpolation
- bottomLeft = (pixel)leftColumn[blkSize];
- topRight = (pixel)topRow[blkSize];
- for (k = 0; k < blkSize; k++)
- {
- bottomRow[k] = (int16_t)(bottomLeft - topRow[k]);
- rightColumn[k] = (int16_t)(topRight - leftColumn[k]);
- topRow[k] <<= shift1D;
- leftColumn[k] <<= shift1D;
- }
-
- // Generate prediction signal
- for (k = 0; k < blkSize; k++)
- {
- horPred = leftColumn[k] + offset2D;
- for (l = 0; l < blkSize; l++)
- {
- horPred += rightColumn[k];
- topRow[l] += bottomRow[l];
- pDst[k * dstStride + l] = (pixel)((horPred + topRow[l]) >> shift2D);
- }
- }
-}
-
-void xPredIntraAngBufRef(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, bool bFilter, pixel *refLeft, pixel *refAbove)
-{
- int k, l;
- int blkSize = width;
-
- // Map the mode index to main prediction direction and angle
- assert(dirMode > 1); //no planar and dc
- bool modeHor = (dirMode < 18);
- bool modeVer = !modeHor;
- int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
- int absAng = abs(intraPredAngle);
- int signAng = intraPredAngle < 0 ? -1 : 1;
-
- // Set bitshifts and scale the angle parameter to block size
- int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
- int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
- int invAngle = invAngTable[absAng];
- absAng = angTable[absAng];
- intraPredAngle = signAng * absAng;
-
- // Do angular predictions
- {
- pixel* refMain;
- pixel* refSide;
-
- // Initialise the Main and Left reference array.
- if (intraPredAngle < 0)
- {
- refMain = (modeVer ? refAbove : refLeft); // + (blkSize - 1);
- refSide = (modeVer ? refLeft : refAbove); // + (blkSize - 1);
-
- // Extend the Main reference to the left.
- int invAngleSum = 128; // rounding for (shift by 8)
- for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
- {
- invAngleSum += invAngle;
- refMain[k] = refSide[invAngleSum >> 8];
- }
- }
- else
- {
- refMain = modeVer ? refAbove : refLeft;
- refSide = modeVer ? refLeft : refAbove;
- }
-
- if (intraPredAngle == 0)
- {
- for (k = 0; k < blkSize; k++)
- {
- for (l = 0; l < blkSize; l++)
- {
- pDst[k * dstStride + l] = refMain[l + 1];
- }
- }
-
- if (bFilter)
- {
- for (k = 0; k < blkSize; k++)
- {
- pDst[k * dstStride] = (pixel)Clip3((short)0, (short)((1 << bitDepth) - 1), static_cast<short>((pDst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1)));
- }
- }
- }
- else
- {
- int deltaPos = 0;
- int deltaInt;
- int deltaFract;
- int refMainIndex;
-
- for (k = 0; k < blkSize; k++)
- {
- deltaPos += intraPredAngle;
- deltaInt = deltaPos >> 5;
- deltaFract = deltaPos & (32 - 1);
-
- if (deltaFract)
- {
- // Do linear filtering
- for (l = 0; l < blkSize; l++)
- {
- refMainIndex = l + deltaInt + 1;
- pDst[k * dstStride + l] = (pixel)(((32 - deltaFract) * refMain[refMainIndex] + deltaFract * refMain[refMainIndex + 1] + 16) >> 5);
- }
- }
- else
- {
- // Just copy the integer samples
- for (l = 0; l < blkSize; l++)
- {
- pDst[k * dstStride + l] = refMain[l + deltaInt + 1];
- }
- }
- }
- }
-
- // Flip the block if this is the horizontal mode
- if (modeHor)
- {
- pixel tmp;
- for (k = 0; k < blkSize - 1; k++)
- {
- for (l = k + 1; l < blkSize; l++)
- {
- tmp = pDst[k * dstStride + l];
- pDst[k * dstStride + l] = pDst[l * dstStride + k];
- pDst[l * dstStride + k] = tmp;
- }
- }
- }
- }
-}
-}
-
-namespace x265 {
-// x265 private namespace
-
-void Setup_C_IPredPrimitives(EncoderPrimitives& p)
-{
- p.getIPredDC = xPredIntraDC;
- p.getIPredPlanar = xPredIntraPlanar;
- p.getIPredAng = xPredIntraAngBufRef;
-}
-}
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Min Chen <chenm003 at 163.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at multicorewareinc.com.
+ *****************************************************************************/
+
+#include "primitives.h"
+#include "Lib/TLibCommon/TComPrediction.h"
+#include <cstring>
+#include <assert.h>
+
+//#define MAX_CU_SIZE 64
+extern char g_aucConvertToBit[];
+
+namespace {
+pixel CDECL predIntraGetPredValDC(pixel* pSrc, intptr_t iSrcStride, intptr_t iWidth)
+{
+ int iInd, iSum = 0;
+ pixel pDcVal;
+
+ for (iInd = 0; iInd < iWidth; iInd++)
+ {
+ iSum += pSrc[iInd - iSrcStride];
+ }
+ for (iInd = 0; iInd < iWidth; iInd++)
+ {
+ iSum += pSrc[iInd * iSrcStride - 1];
+ }
+
+ pDcVal = (pixel)((iSum + iWidth) / (iWidth + iWidth));
+
+ return pDcVal;
+}
+
+void xDCPredFiltering(pixel* pSrc, intptr_t iSrcStride, pixel* pDst, intptr_t iDstStride, int iWidth, int iHeight)
+{
+ intptr_t x, y, iDstStride2, iSrcStride2;
+
+ // boundary pixels processing
+ pDst[0] = (pixel)((pSrc[-iSrcStride] + pSrc[-1] + 2 * pDst[0] + 2) >> 2);
+
+ for (x = 1; x < iWidth; x++)
+ {
+ pDst[x] = (pixel)((pSrc[x - iSrcStride] + 3 * pDst[x] + 2) >> 2);
+ }
+
+ for (y = 1, iDstStride2 = iDstStride, iSrcStride2 = iSrcStride - 1; y < iHeight; y++, iDstStride2 += iDstStride, iSrcStride2 += iSrcStride)
+ {
+ pDst[iDstStride2] = (pixel)((pSrc[iSrcStride2] + 3 * pDst[iDstStride2] + 2) >> 2);
+ }
+}
+
+void xPredIntraDC(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride, int width, int bFilter)
+{
+ int k, l;
+ int blkSize = width;
+
+ // Do the DC prediction
+ pixel dcval = (pixel)predIntraGetPredValDC(pSrc, srcStride, width);
+
+ for (k = 0; k < blkSize; k++)
+ {
+ for (l = 0; l < blkSize; l++)
+ {
+ pDst[k * dstStride + l] = dcval;
+ }
+ }
+
+ if (bFilter)
+ {
+ xDCPredFiltering(pSrc, srcStride, pDst, dstStride, width, width);
+ }
+}
+
+void xPredIntraPlanar(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride, int width)
+{
+ //assert(width == height);
+
+ int k, l;
+ pixel bottomLeft, topRight;
+ int horPred;
+ // OPT_ME: when width is 64, shift1D is 8, so the dynamic range is 17 bits ([-65280, 65280]) and we have to use 32 bits here
+ int32_t leftColumn[MAX_CU_SIZE + 1], topRow[MAX_CU_SIZE + 1];
+ // CHECK_ME: dynamic range is 9 bits or 15 bits (I assume the max input bit_depth is 14 bits)
+ int16_t bottomRow[MAX_CU_SIZE], rightColumn[MAX_CU_SIZE];
+ int blkSize = width;
+ int offset2D = width;
+ int shift1D = g_aucConvertToBit[width] + 2;
+ int shift2D = shift1D + 1;
+
+ // Get left and above reference column and row
+ for (k = 0; k < blkSize + 1; k++)
+ {
+ topRow[k] = pSrc[k - srcStride];
+ leftColumn[k] = pSrc[k * srcStride - 1];
+ }
+
+ // Prepare intermediate variables used in interpolation
+ bottomLeft = (pixel)leftColumn[blkSize];
+ topRight = (pixel)topRow[blkSize];
+ for (k = 0; k < blkSize; k++)
+ {
+ bottomRow[k] = (int16_t)(bottomLeft - topRow[k]);
+ rightColumn[k] = (int16_t)(topRight - leftColumn[k]);
+ topRow[k] <<= shift1D;
+ leftColumn[k] <<= shift1D;
+ }
+
+ // Generate prediction signal
+ for (k = 0; k < blkSize; k++)
+ {
+ horPred = leftColumn[k] + offset2D;
+ for (l = 0; l < blkSize; l++)
+ {
+ horPred += rightColumn[k];
+ topRow[l] += bottomRow[l];
+ pDst[k * dstStride + l] = (pixel)((horPred + topRow[l]) >> shift2D);
+ }
+ }
+}
+
+void xPredIntraAngBufRef(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, bool bFilter, pixel *refLeft, pixel *refAbove)
+{
+ int k, l;
+ int blkSize = width;
+
+ // Map the mode index to main prediction direction and angle
+ assert(dirMode > 1); //no planar and dc
+ bool modeHor = (dirMode < 18);
+ bool modeVer = !modeHor;
+ int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+ int absAng = abs(intraPredAngle);
+ int signAng = intraPredAngle < 0 ? -1 : 1;
+
+ // Set bitshifts and scale the angle parameter to block size
+ int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+ int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
+ int invAngle = invAngTable[absAng];
+ absAng = angTable[absAng];
+ intraPredAngle = signAng * absAng;
+
+ // Do angular predictions
+ {
+ pixel* refMain;
+ pixel* refSide;
+
+ // Initialise the Main and Left reference array.
+ if (intraPredAngle < 0)
+ {
+ refMain = (modeVer ? refAbove : refLeft); // + (blkSize - 1);
+ refSide = (modeVer ? refLeft : refAbove); // + (blkSize - 1);
+
+ // Extend the Main reference to the left.
+ int invAngleSum = 128; // rounding for (shift by 8)
+ for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
+ {
+ invAngleSum += invAngle;
+ refMain[k] = refSide[invAngleSum >> 8];
+ }
+ }
+ else
+ {
+ refMain = modeVer ? refAbove : refLeft;
+ refSide = modeVer ? refLeft : refAbove;
+ }
+
+ if (intraPredAngle == 0)
+ {
+ for (k = 0; k < blkSize; k++)
+ {
+ for (l = 0; l < blkSize; l++)
+ {
+ pDst[k * dstStride + l] = refMain[l + 1];
+ }
+ }
+
+ if (bFilter)
+ {
+ for (k = 0; k < blkSize; k++)
+ {
+ pDst[k * dstStride] = (pixel)Clip3((short)0, (short)((1 << bitDepth) - 1), static_cast<short>((pDst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1)));
+ }
+ }
+ }
+ else
+ {
+ int deltaPos = 0;
+ int deltaInt;
+ int deltaFract;
+ int refMainIndex;
+
+ for (k = 0; k < blkSize; k++)
+ {
+ deltaPos += intraPredAngle;
+ deltaInt = deltaPos >> 5;
+ deltaFract = deltaPos & (32 - 1);
+
+ if (deltaFract)
+ {
+ // Do linear filtering
+ for (l = 0; l < blkSize; l++)
+ {
+ refMainIndex = l + deltaInt + 1;
+ pDst[k * dstStride + l] = (pixel)(((32 - deltaFract) * refMain[refMainIndex] + deltaFract * refMain[refMainIndex + 1] + 16) >> 5);
+ }
+ }
+ else
+ {
+ // Just copy the integer samples
+ for (l = 0; l < blkSize; l++)
+ {
+ pDst[k * dstStride + l] = refMain[l + deltaInt + 1];
+ }
+ }
+ }
+ }
+
+ // Flip the block if this is the horizontal mode
+ if (modeHor)
+ {
+ pixel tmp;
+ for (k = 0; k < blkSize - 1; k++)
+ {
+ for (l = k + 1; l < blkSize; l++)
+ {
+ tmp = pDst[k * dstStride + l];
+ pDst[k * dstStride + l] = pDst[l * dstStride + k];
+ pDst[l * dstStride + k] = tmp;
+ }
+ }
+ }
+ }
+}
+
+unsigned char g_aucIntraFilterType[][35] = {
+ // Index: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
+ /* 8x8 */ { 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+ /* 16x16 */ { 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },
+ /* 32x32 */ { 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1 },
+};
+
+void xPredIntraAngs4(pixel *pDst0, pixel *pAbove0, pixel *pLeft0, pixel *pAbove1, pixel *pLeft1, bool bLuma)
+{
+ int iMode;
+
+ // suppress unused-parameter warnings; the filtered references are not used yet
+ (void)pLeft1;
+ (void)pAbove1;
+
+ for (iMode = 2; iMode <= 34; iMode++)
+ {
+ pixel *pLeft = pLeft0;
+ pixel *pAbove = pAbove0;
+ pixel *pDst = pDst0 + (iMode-2) * (4 * 4);
+
+ xPredIntraAngBufRef(8, pDst, 4, 4, iMode, bLuma, pLeft, pAbove);
+
+ bool modeHor = (iMode < 18);
+ // Flip horizontal modes back: this undoes the flip done inside
+ // xPredIntraAngBufRef, so modes 2-17 end up stored transposed.
+ // TODO: optimize to skip both flips instead of flipping twice.
+ if (modeHor)
+ {
+ pixel tmp;
+ const int width = 4;
+ for (int k = 0; k < width - 1; k++)
+ {
+ for (int l = k + 1; l < width; l++)
+ {
+ tmp = pDst[k * width + l];
+ pDst[k * width + l] = pDst[l * width + k];
+ pDst[l * width + k] = tmp;
+ }
+ }
+ }
+ }
+}
+
+}
+
+namespace x265 {
+// x265 private namespace
+
+void Setup_C_IPredPrimitives(EncoderPrimitives& p)
+{
+ p.getIPredDC = xPredIntraDC;
+ p.getIPredPlanar = xPredIntraPlanar;
+ p.getIPredAng = xPredIntraAngBufRef;
+ p.getIPredAngs4 = xPredIntraAngs4;
+}
+}
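For reference, a hedged sketch of how the new primitive is consumed through the function table (scoreAngular4x4 and refSad are illustrative names, not code from the patch; a plain SAD stands in for the sa8d cost used in estIntraPredQT, and the reference pointers must be offset into the padded reference arrays exactly as in TEncSearch):

    #include "primitives.h"
    #include <cstdlib>

    // 4x4 SAD reference; stands in for the Hadamard cost in this sketch
    static int refSad(const pixel* a, intptr_t strideA, const pixel* b, intptr_t strideB)
    {
        int sum = 0;
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                sum += abs(a[y * strideA + x] - b[y * strideB + x]);
        return sum;
    }

    // org: source block (stride srcStride); orgT: its 4x4 transpose (stride 4)
    static void scoreAngular4x4(const pixel* org, intptr_t srcStride, const pixel* orgT,
                                pixel* above0, pixel* left0, pixel* above1, pixel* left1,
                                int sads[35])
    {
        ALIGN_VAR_32(pixel, preds[33 * 4 * 4]);
        x265::primitives.getIPredAngs4(preds, above0, left0, above1, left1, true);
        for (int mode = 2; mode < 35; mode++)
        {
            const pixel* p = preds + (mode - 2) * 16;
            // horizontal modes come back transposed, so score them against orgT
            sads[mode] = (mode < 18) ? refSad(orgT, 4, p, 4)
                                     : refSad(org, srcStride, p, 4);
        }
    }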
diff --git a/source/common/primitives.h b/source/common/primitives.h
index f43f8a2..451927f 100644
--- a/source/common/primitives.h
+++ b/source/common/primitives.h
@@ -195,6 +195,7 @@ typedef void (CDECL * blockcpy_s_c)(int bx, int by, short *dst, intptr_t dstride
typedef void (CDECL * getIPredDC_t)(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride, int width, int bFilter);
typedef void (CDECL * getIPredPlanar_t)(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride, int width);
typedef void (CDECL * getIPredAng_p)(int bitDepth, pixel* rpDst, int dstStride, int width, int dirMode, bool bFilter, pixel *refLeft, pixel *refAbove);
+typedef void (CDECL * getIPredAngs_t)(pixel *pDst, pixel *pAbove0, pixel *pLeft0, pixel *pAbove1, pixel *pLeft1, bool bLuma);
typedef void (CDECL * quant)(int bitDepth, const int* pSrc, int* pDes, int iWidth, int iHeight, int mcqp_miper, int mcqp_mirem, bool useScalingList, unsigned int uiLog2TrSize, int *piDequantCoef);
typedef void (CDECL * cvt16to32_t)(short *psOrg, int *piDst, int);
typedef void (CDECL * cvt16to32_shl_t)(int *piDst, short *psOrg, intptr_t, int, int);
@@ -238,6 +239,7 @@ struct EncoderPrimitives
getIPredDC_t getIPredDC;
getIPredPlanar_t getIPredPlanar;
getIPredAng_p getIPredAng;
+ getIPredAngs_t getIPredAngs4;
quant deQuant;
dct_t dct[NUM_DCTS];
idct_t idct[NUM_IDCTS];
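The intrapredharness changes listed in the diffstat are not shown in this mail; in that spirit, here is a sketch of how the batched primitive can be checked against the per-mode getIPredAng reference (checkAngs4 is an illustrative name; left0/above0 must point inside padded arrays, since getIPredAng writes the extended main reference at negative indices for negative angles):

    #include "primitives.h"

    static bool checkAngs4(const x265::EncoderPrimitives& p,
                           pixel* above0, pixel* left0,
                           pixel* above1, pixel* left1)
    {
        ALIGN_VAR_32(pixel, batch[33 * 4 * 4]);
        pixel ref[4 * 4];

        p.getIPredAngs4(batch, above0, left0, above1, left1, true);

        for (int mode = 2; mode <= 34; mode++)
        {
            p.getIPredAng(8, ref, 4, 4, mode, true, left0, above0);

            const pixel* got = batch + (mode - 2) * 16;
            for (int y = 0; y < 4; y++)
                for (int x = 0; x < 4; x++)
                {
                    // horizontal modes (2-17) are stored transposed in the batch
                    pixel expect = (mode < 18) ? ref[x * 4 + y] : ref[y * 4 + x];
                    if (got[y * 4 + x] != expect)
                        return false;
                }
        }
        return true;
    }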
diff --git a/source/common/vec/intrapred.inc b/source/common/vec/intrapred.inc
index 43e3f1d..3a49935 100644
--- a/source/common/vec/intrapred.inc
+++ b/source/common/vec/intrapred.inc
@@ -1,4678 +1,4731 @@
-/*****************************************************************************
- * Copyright (C) 2013 x265 project
- *
- * Authors: Min Chen <chenm003 at 163.com>
- * Deepthi Devaki <deepthidevaki at multicorewareinc.com>
- * Steve Borho <steve at borho.org>
- * ShinYee Chung <shinyee at multicorewareinc.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing at multicorewareinc.com.
- *****************************************************************************/
-
-#include "primitives.h"
-#include "TLibCommon/TComRom.h"
-#include <assert.h>
-#include <smmintrin.h>
-
-extern char g_aucConvertToBit[];
-
-using namespace x265;
-
-namespace {
-const int angAP[17][64] =
-{
- {
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64
- },
- {
- 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52
- },
- {
- 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 19, 20, 21, 21, 22, 22, 23, 24, 24, 25, 26, 26, 27, 28, 28, 29, 30, 30, 31, 32, 32, 33, 34, 34, 35, 36, 36, 37, 38, 38, 39, 40, 40, 41, 42
- },
- {
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, 32, 33, 34
- },
- {
- 0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 13, 14, 14, 15, 15, 15, 16, 16, 17, 17, 17, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 25, 25, 26
- },
- {
- 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 18
- },
- {
- 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10
- },
- {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4
- },
- { // 0th virtual index; never used; just to help indexing
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4
- },
- {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4
- },
- {
- -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -8, -8, -9, -9, -9, -9, -9, -9, -10, -10, -10, -10, -10, -10, -10
- },
- {
- -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -8, -8, -8, -8, -9, -9, -9, -9, -10, -10, -10, -11, -11, -11, -11, -12, -12, -12, -13, -13, -13, -13, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -17, -17, -17, -17, -18, -18, -18, -18
- },
- {
- -1, -1, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -13, -14, -14, -15, -15, -16, -16, -16, -17, -17, -18, -18, -18, -19, -19, -20, -20, -20, -21, -21, -22, -22, -22, -23, -23, -24, -24, -24, -25, -25, -26, -26, -26
- },
- {
- -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -10, -10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, -17, -17, -18, -19, -19, -20, -20, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -26, -27, -27, -28, -28, -29, -29, -30, -30, -31, -31, -32, -32, -33, -33, -34, -34
- },
- {
- -1, -2, -2, -3, -4, -4, -5, -6, -6, -7, -8, -8, -9, -10, -10, -11, -12, -12, -13, -14, -14, -15, -16, -16, -17, -18, -18, -19, -20, -20, -21, -21, -22, -23, -23, -24, -25, -25, -26, -27, -27, -28, -29, -29, -30, -31, -31, -32, -33, -33, -34, -35, -35, -36, -37, -37, -38, -39, -39, -40, -41, -41, -42, -42
- },
- {
- -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, -27, -28, -29, -30, -31, -31, -32, -33, -34, -35, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -44, -45, -46, -47, -48, -48, -49, -50, -51, -52, -52
- },
- {
- -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -59, -60, -61, -62, -63, -64
- }
-};
-
-#define GETAP(X, Y) angAP[8 - (X)][(Y)]
-
-__m128i v_multiL, v_multiH, v_multiH2, v_multiH3, v_multiH4, v_multiH5, v_multiH6, v_multiH7;
-__m128i v_multi_2Row;
-
-/* When compiled with /arch:AVX, this code is not safe to run on non-AVX CPUs and
- * thus we cannot use static initialization. This routine is only called if the
- * detected CPU can support this SIMD architecture. */
-static void initFileStaticVars()
-{
- v_multiL = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
- v_multiH = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
- v_multiH2 = _mm_setr_epi16(17, 18, 19, 20, 21, 22, 23, 24);
- v_multiH3 = _mm_setr_epi16(25, 26, 27, 28, 29, 30, 31, 32);
- v_multiH4 = _mm_setr_epi16(33, 34, 35, 36, 37, 38, 39, 40);
- v_multiH5 = _mm_setr_epi16(41, 42, 43, 44, 45, 46, 47, 48);
- v_multiH6 = _mm_setr_epi16(49, 50, 51, 52, 53, 54, 55, 56);
- v_multiH7 = _mm_setr_epi16(57, 58, 59, 60, 61, 62, 63, 64);
- v_multi_2Row = _mm_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4);
-}
-
-static inline
-void xDCPredFiltering(pixel* pSrc, intptr_t iSrcStride, pixel* rpDst, intptr_t iDstStride, int iWidth, int /*iHeight*/)
-{
- pixel* pDst = rpDst;
- int y;
- pixel pixDC = *pDst;
- int pixDCx3 = pixDC * 3 + 2;
-
- // boundary pixels processing
- pDst[0] = (pixel)((pSrc[-iSrcStride] + pSrc[-1] + 2 * pixDC + 2) >> 2);
-
- Vec8us im1(pixDCx3);
- Vec8us im2, im3;
-#if HIGH_BIT_DEPTH
- switch (iWidth)
- {
- case 4:
- im2 = load_partial(const_int(8), &pSrc[1 - iSrcStride]);
- im2 = (im1 + im2) >> const_int(2);
- store_partial(const_int(8), &pDst[1], im2);
- break;
-
- case 8:
- im2.load(&pSrc[1 - iSrcStride]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&pDst[1]);
- break;
-
- case 16:
- im2.load(&pSrc[1 - iSrcStride]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&pDst[1]);
-
- im2.load(&pSrc[1 - iSrcStride + 8]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&pDst[1 + 8]);
- break;
-
- case 32:
- im2.load(&pSrc[1 - iSrcStride]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&pDst[1]);
-
- im2.load(&pSrc[1 - iSrcStride + 8]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&pDst[1 + 8]);
-
- im2.load(&pSrc[1 - iSrcStride + 16]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&pDst[1 + 16]);
-
- im2.load(&pSrc[1 - iSrcStride + 24]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&pDst[1 + 24]);
- break;
-
- //case 64:
- default:
- im2.load(&pSrc[1 - iSrcStride]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&pDst[1]);
-
- im2.load(&pSrc[1 - iSrcStride + 8]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&pDst[1 + 8]);
-
- im2.load(&pSrc[1 - iSrcStride + 16]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&pDst[1 + 16]);
-
- im2.load(&pSrc[1 - iSrcStride + 24]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&pDst[1 + 24]);
-
- im2.load(&pSrc[1 - iSrcStride + 32]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&pDst[1 + 32]);
-
- im2.load(&pSrc[1 - iSrcStride + 40]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&pDst[1 + 40]);
-
- im2.load(&pSrc[1 - iSrcStride + 48]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&pDst[1 + 48]);
-
- im2.load(&pSrc[1 - iSrcStride + 56]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&pDst[1 + 56]);
- break;
- }
-
-#else /* if HIGH_BIT_DEPTH */
- Vec16uc pix;
- switch (iWidth)
- {
- case 4:
- pix = load_partial(const_int(4), &pSrc[1 - iSrcStride]);
- im2 = extend_low(pix);
- im2 = (im1 + im2) >> const_int(2);
- pix = compress(im2, im2);
- store_partial(const_int(4), &pDst[1], pix);
- break;
-
- case 8:
- pix = load_partial(const_int(8), &pSrc[1 - iSrcStride]);
- im2 = extend_low(pix);
- im2 = (im1 + im2) >> const_int(2);
- pix = compress(im2, im2);
- store_partial(const_int(8), &pDst[1], pix);
- break;
-
- case 16:
- pix.load(&pSrc[1 - iSrcStride]);
- im2 = extend_low(pix);
- im3 = extend_high(pix);
- im2 = (im1 + im2) >> const_int(2);
- im3 = (im1 + im3) >> const_int(2);
- pix = compress(im2, im3);
- pix.store(&pDst[1]);
- break;
-
- case 32:
- pix.load(&pSrc[1 - iSrcStride]);
- im2 = extend_low(pix);
- im3 = extend_high(pix);
- im2 = (im1 + im2) >> const_int(2);
- im3 = (im1 + im3) >> const_int(2);
- pix = compress(im2, im3);
- pix.store(&pDst[1]);
-
- pix.load(&pSrc[1 - iSrcStride + 16]);
- im2 = extend_low(pix);
- im3 = extend_high(pix);
- im2 = (im1 + im2) >> const_int(2);
- im3 = (im1 + im3) >> const_int(2);
- pix = compress(im2, im3);
- pix.store(&pDst[1 + 16]);
- break;
-
- //case 64:
- default:
- pix.load(&pSrc[1 - iSrcStride]);
- im2 = extend_low(pix);
- im3 = extend_high(pix);
- im2 = (im1 + im2) >> const_int(2);
- im3 = (im1 + im3) >> const_int(2);
- pix = compress(im2, im3);
- pix.store(&pDst[1]);
-
- pix.load(&pSrc[1 - iSrcStride + 16]);
- im2 = extend_low(pix);
- im3 = extend_high(pix);
- im2 = (im1 + im2) >> const_int(2);
- im3 = (im1 + im3) >> const_int(2);
- pix = compress(im2, im3);
- pix.store(&pDst[1 + 16]);
-
- pix.load(&pSrc[1 - iSrcStride + 32]);
- im2 = extend_low(pix);
- im3 = extend_high(pix);
- im2 = (im1 + im2) >> const_int(2);
- im3 = (im1 + im3) >> const_int(2);
- pix = compress(im2, im3);
- pix.store(&pDst[1 + 32]);
-
- pix.load(&pSrc[1 - iSrcStride + 48]);
- im2 = extend_low(pix);
- im3 = extend_high(pix);
- im2 = (im1 + im2) >> const_int(2);
- im3 = (im1 + im3) >> const_int(2);
- pix = compress(im2, im3);
- pix.store(&pDst[1 + 48]);
- break;
- }
-
-#endif /* if HIGH_BIT_DEPTH */
-
- for (y = 1; y < iWidth; y++)
- {
- pDst[iDstStride] = (pixel)((pSrc[iSrcStride - 1] + pixDCx3) >> 2);
- pSrc += iSrcStride;
- pDst += iDstStride;
- }
-}
-
-void predIntraDC(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride, int width, int bFilter)
-{
- //assert(iWidth == iHeight); // all of Intra is NxN
- //assert(blkAboveAvailable || blkLeftAvailable); // I think left and above always true since HEVC have a pixel fill process
- int iSum = 0;
- int logSize = g_aucConvertToBit[width] + 2;
- pixel *pSrcAbove = &pSrc[-srcStride];
- pixel *pSrcLeft = &pSrc[-1];
-
- for (int iInd = 0; iInd < width; iInd++)
- {
- iSum += *pSrcLeft;
- pSrcLeft += srcStride;
- }
-
-#if HIGH_BIT_DEPTH
- Vec8s sumAbove(0);
- Vec8s m0;
-
- switch (width)
- {
- case 4:
- sumAbove = load_partial(const_int(8), pSrcAbove);
- break;
- case 8:
- m0.load(pSrcAbove);
- sumAbove = m0;
- break;
- case 16:
- m0.load(pSrcAbove);
- sumAbove = m0;
- m0.load(pSrcAbove + 8);
- sumAbove += m0;
- break;
- case 32:
- m0.load(pSrcAbove);
- sumAbove = m0;
- m0.load(pSrcAbove + 8);
- sumAbove += m0;
- m0.load(pSrcAbove + 16);
- sumAbove += m0;
- m0.load(pSrcAbove + 24);
- sumAbove += m0;
- break;
- //case 64:
- default:
- // CHECK_ME: the max support bit_depth is 13-bits
- m0.load(pSrcAbove);
- sumAbove = m0;
- m0.load(pSrcAbove + 8);
- sumAbove += m0;
- m0.load(pSrcAbove + 16);
- sumAbove += m0;
- m0.load(pSrcAbove + 24);
- sumAbove += m0;
- m0.load(pSrcAbove + 32);
- sumAbove += m0;
- m0.load(pSrcAbove + 40);
- sumAbove += m0;
- m0.load(pSrcAbove + 48);
- sumAbove += m0;
- m0.load(pSrcAbove + 56);
- sumAbove += m0;
- break;
- }
-
- iSum += horizontal_add_x(sumAbove);
-
- logSize += 1;
- pixel dcVal = (iSum + (1 << (logSize - 1))) >> logSize;
- Vec8us dcValN(dcVal);
- int k;
-
- pixel *pDst1 = pDst;
- switch (width)
- {
- case 4:
- store_partial(const_int(8), pDst1, dcValN);
- pDst1 += dstStride;
- store_partial(const_int(8), pDst1, dcValN);
- pDst1 += dstStride;
- store_partial(const_int(8), pDst1, dcValN);
- pDst1 += dstStride;
- store_partial(const_int(8), pDst1, dcValN);
- pDst1 += dstStride;
- break;
-
- case 8:
- dcValN.store(pDst1);
- pDst1 += dstStride;
- dcValN.store(pDst1);
- pDst1 += dstStride;
- dcValN.store(pDst1);
- pDst1 += dstStride;
- dcValN.store(pDst1);
- pDst1 += dstStride;
- dcValN.store(pDst1);
- pDst1 += dstStride;
- dcValN.store(pDst1);
- pDst1 += dstStride;
- dcValN.store(pDst1);
- pDst1 += dstStride;
- dcValN.store(pDst1);
- pDst1 += dstStride;
- break;
-
- case 16:
- for (k = 0; k < 16; k += 2)
- {
- dcValN.store(pDst1);
- dcValN.store(pDst1 + 8);
- pDst1 += dstStride;
- dcValN.store(pDst1);
- dcValN.store(pDst1 + 8);
- pDst1 += dstStride;
- }
-
- break;
-
- case 32:
- for (k = 0; k < 32; k++)
- {
- dcValN.store(pDst1);
- dcValN.store(pDst1 + 8);
- dcValN.store(pDst1 + 16);
- dcValN.store(pDst1 + 24);
- pDst1 += dstStride;
- }
-
- break;
-
- //case 64:
- default:
- for (k = 0; k < 64; k++)
- {
- dcValN.store(pDst1);
- dcValN.store(pDst1 + 8);
- dcValN.store(pDst1 + 16);
- dcValN.store(pDst1 + 24);
- dcValN.store(pDst1 + 32);
- dcValN.store(pDst1 + 40);
- dcValN.store(pDst1 + 48);
- dcValN.store(pDst1 + 56);
- pDst1 += dstStride;
- }
-
- break;
- }
-
- if (bFilter)
- {
- xDCPredFiltering(pSrc, srcStride, pDst, dstStride, width, width);
- }
-#else // if !HIGH_BIT_DEPTH
-
- {
- Vec16uc pix;
- Vec8us im;
- Vec4ui im1, im2;
-
- switch (width)
- {
- case 4:
- pix.fromUint32(*(uint32_t*)pSrcAbove);
- iSum += horizontal_add(extend_low(pix));
- break;
- case 8:
-#if X86_64
- pix.fromUint64(*(uint64_t*)pSrcAbove);
-#else
- pix.load_partial(8, pSrcAbove);
-#endif
- iSum += horizontal_add(extend_low(pix));
- break;
- case 16:
- pix.load(pSrcAbove);
- iSum += horizontal_add_x(pix);
- break;
- case 32:
- pix.load(pSrcAbove);
- im1 = (Vec4ui)(pix.sad(_mm_setzero_si128()));
- pix.load(pSrcAbove + 16);
- im1 += (Vec4ui)(pix.sad(_mm_setzero_si128()));
- im1 += (Vec4ui)((Vec128b)im1 >> const_int(64));
- iSum += toInt32(im1);
- break;
- //case 64:
- default:
- pix.load(pSrcAbove);
- im1 = (Vec4ui)(pix.sad(_mm_setzero_si128()));
- pix.load(pSrcAbove + 16);
- im1 += (Vec4ui)(pix.sad(_mm_setzero_si128()));
- pix.load(pSrcAbove + 32);
- im1 += (Vec4ui)(pix.sad(_mm_setzero_si128()));
- pix.load(pSrcAbove + 48);
- im1 += (Vec4ui)(pix.sad(_mm_setzero_si128()));
- im1 += (Vec4ui)((Vec128b)im1 >> const_int(64));
- //im1 += extract_hi64(im1);
- iSum += toInt32(im1);
- break;
- }
- }
-
- logSize += 1;
- pixel dcVal = (iSum + (1 << (logSize - 1))) >> logSize;
- Vec16uc dcValN(dcVal);
- int k;
-
- pixel *pDst1 = pDst;
- switch (width)
- {
- case 4:
- store_partial(const_int(4), pDst1, dcValN);
- pDst1 += dstStride;
- store_partial(const_int(4), pDst1, dcValN);
- pDst1 += dstStride;
- store_partial(const_int(4), pDst1, dcValN);
- pDst1 += dstStride;
- store_partial(const_int(4), pDst1, dcValN);
- break;
-
- case 8:
- store_partial(const_int(8), pDst1, dcValN);
- pDst1 += dstStride;
- store_partial(const_int(8), pDst1, dcValN);
- pDst1 += dstStride;
- store_partial(const_int(8), pDst1, dcValN);
- pDst1 += dstStride;
- store_partial(const_int(8), pDst1, dcValN);
- pDst1 += dstStride;
- store_partial(const_int(8), pDst1, dcValN);
- pDst1 += dstStride;
- store_partial(const_int(8), pDst1, dcValN);
- pDst1 += dstStride;
- store_partial(const_int(8), pDst1, dcValN);
- pDst1 += dstStride;
- store_partial(const_int(8), pDst1, dcValN);
- break;
-
- case 16:
- for (k = 0; k < 16; k += 4)
- {
- store_partial(const_int(16), pDst1, dcValN);
- pDst1 += dstStride;
- store_partial(const_int(16), pDst1, dcValN);
- pDst1 += dstStride;
- store_partial(const_int(16), pDst1, dcValN);
- pDst1 += dstStride;
- store_partial(const_int(16), pDst1, dcValN);
- pDst1 += dstStride;
- }
-
- break;
-
- case 32:
- for (k = 0; k < 32; k += 2)
- {
- store_partial(const_int(16), pDst1, dcValN);
- store_partial(const_int(16), pDst1 + 16, dcValN);
- pDst1 += dstStride;
- store_partial(const_int(16), pDst1, dcValN);
- store_partial(const_int(16), pDst1 + 16, dcValN);
- pDst1 += dstStride;
- }
-
- break;
-
- case 64:
- for (k = 0; k < 64; k++)
- {
- store_partial(const_int(16), pDst1, dcValN);
- store_partial(const_int(16), pDst1 + 16, dcValN);
- store_partial(const_int(16), pDst1 + 32, dcValN);
- store_partial(const_int(16), pDst1 + 48, dcValN);
- pDst1 += dstStride;
- }
-
- break;
- }
-
- if (bFilter)
- {
- xDCPredFiltering(pSrc, srcStride, pDst, dstStride, width, width);
- }
-#endif // if HIGH_BIT_DEPTH
-}
-
-#if HIGH_BIT_DEPTH
-// CHECK_ME: I am not sure the v_rightColumnN will be overflow when input as 12bpp
-void predIntraPlanar4(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
-{
- int k, bottomLeft, topRight;
- // NOTE: I use 16-bits is enough here, because we have least than 13-bits as input, and shift left by 2, it is 15-bits
- int16_t leftColumn[4];
-
- // Get left and above reference column and row
- Vec8s v_topRow = (Vec8s)load_partial(const_int(8), &pSrc[-srcStride]); // topRow
-
- for (k = 0; k < 4; k++)
- {
- leftColumn[k] = pSrc[k * srcStride - 1];
- }
-
- Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), leftColumn); // leftColumn
-
- // Prepare intermediate variables used in interpolation
- bottomLeft = pSrc[4 * srcStride - 1];
- topRight = pSrc[4 - srcStride];
-
- Vec8s v_bottomLeft(bottomLeft);
- Vec8s v_topRight(topRight);
-
- Vec8s v_bottomRow = v_bottomLeft - v_topRow;
- Vec8s v_rightColumn = v_topRight - v_leftColumn;
-
- v_topRow = v_topRow << const_int(2);
- v_leftColumn = v_leftColumn << const_int(2);
-
- // Generate prediction signal
- Vec8s v_horPred4 = v_leftColumn + Vec8s(4);
- const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
- Vec8s v_horPred, v_rightColumnN;
- Vec8s v_im4;
- Vec16uc v_im5;
-
- // line0
- v_horPred = broadcast(const_int(0), v_horPred4);
- v_rightColumnN = broadcast(const_int(0), v_rightColumn) * v_multi;
- v_horPred = v_horPred + v_rightColumnN;
- v_topRow = v_topRow + v_bottomRow;
- // CHECK_ME: the HM don't clip the pixel, so I assume there is biggest 12+3=15(bits)
- v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3);
- store_partial(const_int(8), &rpDst[0 * dstStride], v_im4);
-
- // line1
- v_horPred = broadcast(const_int(1), v_horPred4);
- v_rightColumnN = broadcast(const_int(1), v_rightColumn) * v_multi;
- v_horPred = v_horPred + v_rightColumnN;
- v_topRow = v_topRow + v_bottomRow;
- v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3);
- store_partial(const_int(8), &rpDst[1 * dstStride], v_im4);
-
- // line2
- v_horPred = broadcast(const_int(2), v_horPred4);
- v_rightColumnN = broadcast(const_int(2), v_rightColumn) * v_multi;
- v_horPred = v_horPred + v_rightColumnN;
- v_topRow = v_topRow + v_bottomRow;
- v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3);
- store_partial(const_int(8), &rpDst[2 * dstStride], v_im4);
-
- // line3
- v_horPred = broadcast(const_int(3), v_horPred4);
- v_rightColumnN = broadcast(const_int(3), v_rightColumn) * v_multi;
- v_horPred = v_horPred + v_rightColumnN;
- v_topRow = v_topRow + v_bottomRow;
- v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3);
- store_partial(const_int(8), &rpDst[3 * dstStride], v_im4);
-}
-
-#else /* if HIGH_BIT_DEPTH */
-void predIntraPlanar4(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
-{
- int k;
- pixel bottomLeft, topRight;
-
- // Get left and above reference column and row
- Vec16uc im0 = (Vec16uc)load_partial(const_int(4), &pSrc[-srcStride]); // topRow
- Vec8s v_topRow = extend_low(im0);
-
- int16_t leftColumn[4];
-
- for (k = 0; k < 4; k++)
- {
- leftColumn[k] = pSrc[k * srcStride - 1];
- }
-
- Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), (void*)leftColumn); // leftColumn
-
- // Prepare intermediate variables used in interpolation
- bottomLeft = pSrc[4 * srcStride - 1];
- topRight = pSrc[4 - srcStride];
-
- Vec8s v_bottomLeft(bottomLeft);
- Vec8s v_topRight(topRight);
-
- Vec8s v_bottomRow = v_bottomLeft - v_topRow;
- Vec8s v_rightColumn = v_topRight - v_leftColumn;
-
- v_topRow = v_topRow << const_int(2);
- v_leftColumn = v_leftColumn << const_int(2);
-
- Vec8s v_horPred4 = v_leftColumn + Vec8s(4);
- const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
- Vec8s v_horPred, v_rightColumnN;
- Vec8s v_im4;
- Vec16uc v_im5;
-
-#define COMP_PRED_PLANAR4_ROW(X) { \
- v_horPred = broadcast(const_int((X)), v_horPred4); \
- v_rightColumnN = broadcast(const_int((X)), v_rightColumn) * v_multi; \
- v_horPred = v_horPred + v_rightColumnN; \
- v_topRow = v_topRow + v_bottomRow; \
- v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3); \
- v_im5 = compress_unsafe(v_im4, v_im4); \
- store_partial(const_int(4), &rpDst[(X)*dstStride], v_im5); \
-}
-
- COMP_PRED_PLANAR4_ROW(0)
- COMP_PRED_PLANAR4_ROW(1)
- COMP_PRED_PLANAR4_ROW(2)
- COMP_PRED_PLANAR4_ROW(3)
-
-#undef COMP_PRED_PLANAR4_ROW
-}
-
-#if INSTRSET >= 5
-void predIntraPlanar4_sse4(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
-{
- pixel bottomLeft, topRight;
-
- // Get left and above reference column and row
- __m128i im0 = _mm_cvtsi32_si128(*(uint32_t*)&pSrc[-srcStride]); // topRow
- __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
-
- v_topRow = _mm_shuffle_epi32(v_topRow, 0x44);
-
- // Prepare intermediate variables used in interpolation
- bottomLeft = pSrc[4 * srcStride - 1];
- topRight = pSrc[4 - srcStride];
-
- __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
- __m128i v_bottomRow = _mm_sub_epi16(v_bottomLeft, v_topRow);
-
- v_topRow = _mm_slli_epi16(v_topRow, 2);
-
- __m128i v_horPred, v_rightColumnN;
- __m128i v_im4;
- __m128i v_im5;
- __m128i _tmp0, _tmp1;
-
- __m128i v_bottomRowL = _mm_unpacklo_epi64(v_bottomRow, _mm_setzero_si128());
- v_topRow = _mm_sub_epi16(v_topRow, v_bottomRowL);
- v_bottomRow = _mm_slli_epi16(v_bottomRow, 1);
-
-#define COMP_PRED_PLANAR_2ROW(Y) { \
- _tmp0 = _mm_cvtsi32_si128((pSrc[((Y)) * srcStride - 1] << 2) + 4); \
- _tmp0 = _mm_shufflelo_epi16(_tmp0, 0); \
- _tmp1 = _mm_cvtsi32_si128((pSrc[((Y)+1) * srcStride - 1] << 2) + 4); \
- _tmp1 = _mm_shufflelo_epi16(_tmp1, 0); \
- v_horPred = _mm_unpacklo_epi64(_tmp0, _tmp1); \
- _tmp0 = _mm_cvtsi32_si128(topRight - pSrc[((Y)) * srcStride - 1]); \
- _tmp0 = _mm_shufflelo_epi16(_tmp0, 0); \
- _tmp1 = _mm_cvtsi32_si128(topRight - pSrc[((Y)+1) * srcStride - 1]); \
- _tmp1 = _mm_shufflelo_epi16(_tmp1, 0); \
- v_rightColumnN = _mm_unpacklo_epi64(_tmp0, _tmp1); \
- v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multi_2Row); \
- v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \
- v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \
- v_im4 = _mm_srai_epi16(_mm_add_epi16(v_horPred, v_topRow), 3); \
- v_im5 = _mm_packus_epi16(v_im4, v_im4); \
- *(uint32_t*)&rpDst[(Y)*dstStride] = _mm_cvtsi128_si32(v_im5); \
- *(uint32_t*)&rpDst[((Y)+1) * dstStride] = _mm_cvtsi128_si32(_mm_shuffle_epi32(v_im5, 0x55));; \
-}
-
- COMP_PRED_PLANAR_2ROW(0)
- COMP_PRED_PLANAR_2ROW(2)
-
-#undef COMP_PRED_PLANAR4_ROW
-}
-
-#endif // INSTRSET >= 5
-
-#endif /* if HIGH_BIT_DEPTH */
-
-#if HIGH_BIT_DEPTH
-
-#define COMP_PRED_PLANAR_ROW(X) { \
- v_horPred = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \
- v_rightColumnN = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn) * v_multi; \
- v_horPred = v_horPred + v_rightColumnN; \
- v_topRow = v_topRow + v_bottomRow; \
- v_im4 = (Vec8s)(v_horPred + v_topRow) >> (3 + shift); \
- store_partial(const_int(16), &rpDst[X * dstStride], v_im4); \
-}
-
-void predIntraPlanar8(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
-{
- int k, bottomLeft, topRight;
-
- int16_t leftColumn[8];
-
- // Get left and above reference column and row
- Vec8s v_topRow = (Vec8s)load_partial(const_int(16), &pSrc[-srcStride]); // topRow
-
- for (k = 0; k < 8; k++)
- {
- leftColumn[k] = pSrc[k * srcStride - 1];
- }
-
- Vec8s v_leftColumn = (Vec8s)load_partial(const_int(16), leftColumn); // leftColumn
-
- // Prepare intermediate variables used in interpolation
- bottomLeft = pSrc[8 * srcStride - 1];
- topRight = pSrc[8 - srcStride];
-
- Vec8s v_bottomLeft(bottomLeft);
- Vec8s v_topRight(topRight);
-
- Vec8s v_bottomRow = v_bottomLeft - v_topRow;
- Vec8s v_rightColumn = v_topRight - v_leftColumn;
-
- int shift = g_aucConvertToBit[8]; // Using value corresponding to width = 8
- v_topRow = v_topRow << (2 + shift);
- v_leftColumn = v_leftColumn << (2 + shift);
-
- // Generate prediction signal
- Vec8s v_horPred4 = v_leftColumn + Vec8s(8);
- const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
- Vec8s v_horPred, v_rightColumnN;
- Vec8s v_im4;
- Vec16uc v_im5;
-
- COMP_PRED_PLANAR_ROW(0); // row 0
- COMP_PRED_PLANAR_ROW(1);
- COMP_PRED_PLANAR_ROW(2);
- COMP_PRED_PLANAR_ROW(3);
- COMP_PRED_PLANAR_ROW(4);
- COMP_PRED_PLANAR_ROW(5);
- COMP_PRED_PLANAR_ROW(6);
- COMP_PRED_PLANAR_ROW(7); // row 7
-}
-
-#undef COMP_PRED_PLANAR_ROW
-#else /* if HIGH_BIT_DEPTH */
-
-#define COMP_PRED_PLANAR_ROW(X) { \
- v_horPred = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \
- v_rightColumnN = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn) * v_multi; \
- v_horPred = v_horPred + v_rightColumnN; \
- v_topRow = v_topRow + v_bottomRow; \
- v_im4 = (Vec8s)(v_horPred + v_topRow) >> (3 + shift); \
- v_im5 = compress(v_im4, v_im4); \
- store_partial(const_int(8), &rpDst[X * dstStride], v_im5); \
-}
-
-void predIntraPlanar8(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
-{
- int k;
- pixel bottomLeft, topRight;
- int16_t leftColumn[8];
-
- // Get left and above reference column and row
- Vec16uc im0 = (Vec16uc)load_partial(const_int(8), &pSrc[-srcStride]); // topRow
- Vec8s v_topRow = extend_low(im0);
-
- for (k = 0; k < 8; k++)
- {
- leftColumn[k] = pSrc[k * srcStride - 1];
- }
-
- Vec8s v_leftColumn;
- v_leftColumn.load(leftColumn); // leftColumn
-
- // Prepare intermediate variables used in interpolation
- bottomLeft = pSrc[8 * srcStride - 1];
- topRight = pSrc[8 - srcStride];
-
- Vec8s v_bottomLeft(bottomLeft);
- Vec8s v_topRight(topRight);
-
- Vec8s v_bottomRow = v_bottomLeft - v_topRow;
- Vec8s v_rightColumn = v_topRight - v_leftColumn;
-
- int shift = g_aucConvertToBit[8]; // Using value corresponding to width = 8
- v_topRow = v_topRow << (2 + shift);
- v_leftColumn = v_leftColumn << (2 + shift);
-
- Vec8s v_horPred4 = v_leftColumn + Vec8s(8);
- const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
- Vec8s v_horPred, v_rightColumnN;
- Vec8s v_im4;
- Vec16uc v_im5;
-
- COMP_PRED_PLANAR_ROW(0); // row 0
- COMP_PRED_PLANAR_ROW(1);
- COMP_PRED_PLANAR_ROW(2);
- COMP_PRED_PLANAR_ROW(3);
- COMP_PRED_PLANAR_ROW(4);
- COMP_PRED_PLANAR_ROW(5);
- COMP_PRED_PLANAR_ROW(6);
- COMP_PRED_PLANAR_ROW(7); // row 7
-}
-
-#undef COMP_PRED_PLANAR_ROW
-
-#if INSTRSET >= 5
-void predIntraPlanar8_sse4(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride)
-{
- pixel bottomLeft, topRight;
-
- // Get left and above reference column and row
- __m128i im0 = _mm_loadl_epi64((__m128i*)&pSrc[0 - srcStride]); // topRow
- __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
-
- __m128i v_leftColumn = _mm_setzero_si128();
-
- v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[0 * srcStride - 1], 0);
- v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[1 * srcStride - 1], 1);
- v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[2 * srcStride - 1], 2);
- v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[3 * srcStride - 1], 3);
- v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[4 * srcStride - 1], 4);
- v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[5 * srcStride - 1], 5);
- v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[6 * srcStride - 1], 6);
- v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[7 * srcStride - 1], 7);
- v_leftColumn = _mm_unpacklo_epi8(v_leftColumn, _mm_setzero_si128());
-
- // Prepare intermediate variables used in interpolation
- bottomLeft = pSrc[8 * srcStride - 1];
- topRight = pSrc[8 - srcStride];
-
- __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
- __m128i v_topRight = _mm_set1_epi16(topRight);
-
- __m128i v_bottomRow = _mm_sub_epi16(v_bottomLeft, v_topRow);
- __m128i v_rightColumn = _mm_sub_epi16(v_topRight, v_leftColumn);
-
- v_topRow = _mm_slli_epi16(v_topRow, 3);
- v_leftColumn = _mm_slli_epi16(v_leftColumn, 3);
-
- __m128i v_horPred4 = _mm_add_epi16(v_leftColumn, _mm_set1_epi16(8));
- __m128i v_horPred, v_rightColumnN;
- __m128i v_im4;
- __m128i v_im5;
-
-#define COMP_PRED_PLANAR_ROW(Y) { \
- if ((Y) < 4) { \
- v_horPred = _mm_shufflelo_epi16(v_horPred4, ((Y) & 3) * 0x55); \
- v_horPred = _mm_unpacklo_epi64(v_horPred, v_horPred); \
- v_rightColumnN = _mm_shufflelo_epi16(v_rightColumn, ((Y) & 3) * 0x55); \
- v_rightColumnN = _mm_unpacklo_epi64(v_rightColumnN, v_rightColumnN); \
- } \
- else { \
- v_horPred = _mm_shufflehi_epi16(v_horPred4, ((Y) & 3) * 0x55); \
- v_horPred = _mm_unpackhi_epi64(v_horPred, v_horPred); \
- v_rightColumnN = _mm_shufflehi_epi16(v_rightColumn, ((Y) & 3) * 0x55); \
- v_rightColumnN = _mm_unpackhi_epi64(v_rightColumnN, v_rightColumnN); \
- } \
- v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multiL); \
- v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \
- v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \
- v_im4 = _mm_srai_epi16(_mm_add_epi16(v_horPred, v_topRow), 4); \
- v_im5 = _mm_packus_epi16(v_im4, v_im4); \
- _mm_storel_epi64((__m128i*)&pDst[(Y)*dstStride], v_im5); \
-}
-
- COMP_PRED_PLANAR_ROW(0)
- COMP_PRED_PLANAR_ROW(1)
- COMP_PRED_PLANAR_ROW(2)
- COMP_PRED_PLANAR_ROW(3)
- COMP_PRED_PLANAR_ROW(4)
- COMP_PRED_PLANAR_ROW(5)
- COMP_PRED_PLANAR_ROW(6)
- COMP_PRED_PLANAR_ROW(7)
-
-#undef COMP_PRED_PLANAR_ROW
-}
-
-#endif // INSTRSET >= 5
-
-#endif /* if HIGH_BIT_DEPTH */
-
-#if HIGH_BIT_DEPTH
-#define COMP_PRED_PLANAR_ROW(X) { \
- v_horPred_lo = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \
- v_horPred_hi = v_horPred_lo; \
- v_rightColumnN_lo = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn); \
- v_rightColumnN_hi = v_rightColumnN_lo; \
- v_rightColumnN_lo *= v_multi_lo; \
- v_rightColumnN_hi *= v_multi_hi; \
- v_horPred_lo = v_horPred_lo + v_rightColumnN_lo; \
- v_horPred_hi = v_horPred_hi + v_rightColumnN_hi; \
- v_topRow_lo = v_topRow_lo + v_bottomRow_lo; \
- v_topRow_hi = v_topRow_hi + v_bottomRow_hi; \
- v_im4_lo = (Vec8s)(v_horPred_lo + v_topRow_lo) >> (3 + shift); \
- v_im4_hi = (Vec8s)(v_horPred_hi + v_topRow_hi) >> (3 + shift); \
- v_im4_lo.store(&rpDst[X * dstStride]); \
- v_im4_hi.store(&rpDst[X * dstStride + 8]); \
-}
-
-void predIntraPlanar16(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
-{
- int k;
- pixel bottomLeft, topRight;
- int16_t leftColumn[16];
-
- // Get left and above reference column and row
- Vec8s v_topRow_lo, v_topRow_hi;
-
- v_topRow_lo.load(&pSrc[-srcStride]);
- v_topRow_hi.load(&pSrc[-srcStride + 8]);
-
- for (k = 0; k < 16; k++)
- {
- leftColumn[k] = pSrc[k * srcStride - 1];
- }
-
- Vec8s v_leftColumn;
- v_leftColumn.load(leftColumn); // leftColumn
-
- // Prepare intermediate variables used in interpolation
- bottomLeft = pSrc[16 * srcStride - 1];
- topRight = pSrc[16 - srcStride];
-
- Vec8s v_bottomLeft(bottomLeft);
- Vec8s v_topRight(topRight);
-
- Vec8s v_bottomRow_lo = v_bottomLeft - v_topRow_lo;
- Vec8s v_bottomRow_hi = v_bottomLeft - v_topRow_hi;
- Vec8s v_rightColumn = v_topRight - v_leftColumn;
-
- int shift = g_aucConvertToBit[16]; // Using value corresponding to width = 8
- v_topRow_lo = v_topRow_lo << (2 + shift);
- v_topRow_hi = v_topRow_hi << (2 + shift);
- v_leftColumn = v_leftColumn << (2 + shift);
-
- Vec8s v_horPred4 = v_leftColumn + Vec8s(16);
- const Vec8s v_multi_lo(1, 2, 3, 4, 5, 6, 7, 8);
- const Vec8s v_multi_hi(9, 10, 11, 12, 13, 14, 15, 16);
- Vec8s v_horPred_lo, v_horPred_hi, v_rightColumnN_lo, v_rightColumnN_hi;
- Vec8s v_im4_lo, v_im4_hi;
- Vec16uc v_im5;
-
- COMP_PRED_PLANAR_ROW(0); // row 0
- COMP_PRED_PLANAR_ROW(1);
- COMP_PRED_PLANAR_ROW(2);
- COMP_PRED_PLANAR_ROW(3);
- COMP_PRED_PLANAR_ROW(4);
- COMP_PRED_PLANAR_ROW(5);
- COMP_PRED_PLANAR_ROW(6);
- COMP_PRED_PLANAR_ROW(7); // row 7
-
- v_leftColumn.load(leftColumn + 8); // leftColumn lower 8 rows
- v_rightColumn = v_topRight - v_leftColumn;
- v_leftColumn = v_leftColumn << (2 + shift);
- v_horPred4 = v_leftColumn + Vec8s(16);
-
- COMP_PRED_PLANAR_ROW(8); // row 0
- COMP_PRED_PLANAR_ROW(9);
- COMP_PRED_PLANAR_ROW(10);
- COMP_PRED_PLANAR_ROW(11);
- COMP_PRED_PLANAR_ROW(12);
- COMP_PRED_PLANAR_ROW(13);
- COMP_PRED_PLANAR_ROW(14);
- COMP_PRED_PLANAR_ROW(15);
-}
-
-#undef COMP_PRED_PLANAR_ROW
-
-#else /* if HIGH_BIT_DEPTH */
-#define COMP_PRED_PLANAR_ROW(X) { \
- v_horPred_lo = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \
- v_horPred_hi = v_horPred_lo; \
- v_rightColumnN_lo = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn); \
- v_rightColumnN_hi = v_rightColumnN_lo; \
- v_rightColumnN_lo *= v_multi_lo; \
- v_rightColumnN_hi *= v_multi_hi; \
- v_horPred_lo = v_horPred_lo + v_rightColumnN_lo; \
- v_horPred_hi = v_horPred_hi + v_rightColumnN_hi; \
- v_topRow_lo = v_topRow_lo + v_bottomRow_lo; \
- v_topRow_hi = v_topRow_hi + v_bottomRow_hi; \
- v_im4_lo = (Vec8s)(v_horPred_lo + v_topRow_lo) >> (3 + shift); \
- v_im4_hi = (Vec8s)(v_horPred_hi + v_topRow_hi) >> (3 + shift); \
- v_im5 = compress(v_im4_lo, v_im4_hi); \
- store_partial(const_int(16), &rpDst[X * dstStride], v_im5); \
-}
-
-void predIntraPlanar16(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
-{
- int k;
- pixel bottomLeft, topRight;
- int16_t leftColumn[16];
-
- // Get left and above reference column and row
- Vec16uc im0 = (Vec16uc)load_partial(const_int(16), &pSrc[-srcStride]); // topRow
- Vec8s v_topRow_lo = extend_low(im0);
- Vec8s v_topRow_hi = extend_high(im0);
-
- for (k = 0; k < 16; k++)
- {
- leftColumn[k] = pSrc[k * srcStride - 1];
- }
-
- Vec8s v_leftColumn;
- v_leftColumn.load(leftColumn); // leftColumn
-
- // Prepare intermediate variables used in interpolation
- bottomLeft = pSrc[16 * srcStride - 1];
- topRight = pSrc[16 - srcStride];
-
- Vec8s v_bottomLeft(bottomLeft);
- Vec8s v_topRight(topRight);
-
- Vec8s v_bottomRow_lo = v_bottomLeft - v_topRow_lo;
- Vec8s v_bottomRow_hi = v_bottomLeft - v_topRow_hi;
- Vec8s v_rightColumn = v_topRight - v_leftColumn;
-
- int shift = g_aucConvertToBit[16]; // == 2, so (2 + shift) == log2(16)
- v_topRow_lo = v_topRow_lo << (2 + shift);
- v_topRow_hi = v_topRow_hi << (2 + shift);
- v_leftColumn = v_leftColumn << (2 + shift);
-
- Vec8s v_horPred4 = v_leftColumn + Vec8s(16);
- const Vec8s v_multi_lo(1, 2, 3, 4, 5, 6, 7, 8);
- const Vec8s v_multi_hi(9, 10, 11, 12, 13, 14, 15, 16);
- Vec8s v_horPred_lo, v_horPred_hi, v_rightColumnN_lo, v_rightColumnN_hi;
- Vec8s v_im4_lo, v_im4_hi;
- Vec16uc v_im5;
-
- COMP_PRED_PLANAR_ROW(0); // row 0
- COMP_PRED_PLANAR_ROW(1);
- COMP_PRED_PLANAR_ROW(2);
- COMP_PRED_PLANAR_ROW(3);
- COMP_PRED_PLANAR_ROW(4);
- COMP_PRED_PLANAR_ROW(5);
- COMP_PRED_PLANAR_ROW(6);
- COMP_PRED_PLANAR_ROW(7); // row 7
-
- v_leftColumn.load(leftColumn + 8); // leftColumn lower 8 rows
- v_rightColumn = v_topRight - v_leftColumn;
- v_leftColumn = v_leftColumn << (2 + shift);
- v_horPred4 = v_leftColumn + Vec8s(16);
-
- COMP_PRED_PLANAR_ROW(8); // row 8
- COMP_PRED_PLANAR_ROW(9);
- COMP_PRED_PLANAR_ROW(10);
- COMP_PRED_PLANAR_ROW(11);
- COMP_PRED_PLANAR_ROW(12);
- COMP_PRED_PLANAR_ROW(13);
- COMP_PRED_PLANAR_ROW(14);
- COMP_PRED_PLANAR_ROW(15);
-}
-
-#undef COMP_PRED_PLANAR_ROW
-
-#if INSTRSET >= 5
-void predIntraPlanar16_sse4(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride)
-{
- pixel bottomLeft, topRight;
- __m128i v_topRow[2];
- __m128i v_bottomRow[2];
-
- // Get left and above reference column and row
- __m128i im0 = _mm_loadu_si128((__m128i*)&pSrc[0 - srcStride]); // topRow
-
- v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
- v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());
-
- // Prepare intermediate variables used in interpolation
- bottomLeft = pSrc[16 * srcStride - 1];
- topRight = pSrc[16 - srcStride];
-
- __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
-
- v_bottomRow[0] = _mm_sub_epi16(v_bottomLeft, v_topRow[0]);
- v_bottomRow[1] = _mm_sub_epi16(v_bottomLeft, v_topRow[1]);
-
- v_topRow[0] = _mm_slli_epi16(v_topRow[0], 4);
- v_topRow[1] = _mm_slli_epi16(v_topRow[1], 4);
-
- __m128i v_horPred, v_horPredN[2], v_rightColumnN[2];
- __m128i v_im4L, v_im4H;
- __m128i v_im5;
-
-#define COMP_PRED_PLANAR_ROW(Y) { \
- v_horPred = _mm_cvtsi32_si128((pSrc[(Y)*srcStride - 1] << 4) + 16); \
- v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \
- v_horPred = _mm_shuffle_epi32(v_horPred, 0); \
- __m128i _tmp = _mm_cvtsi32_si128(topRight - pSrc[(Y)*srcStride - 1]); \
- _tmp = _mm_shufflelo_epi16(_tmp, 0); \
- _tmp = _mm_shuffle_epi32(_tmp, 0); \
- v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \
- v_rightColumnN[1] = _mm_mullo_epi16(_tmp, v_multiH); \
- v_horPredN[0] = _mm_add_epi16(v_horPred, v_rightColumnN[0]); \
- v_horPredN[1] = _mm_add_epi16(v_horPred, v_rightColumnN[1]); \
- v_topRow[0] = _mm_add_epi16(v_topRow[0], v_bottomRow[0]); \
- v_topRow[1] = _mm_add_epi16(v_topRow[1], v_bottomRow[1]); \
- v_im4L = _mm_srai_epi16(_mm_add_epi16(v_horPredN[0], v_topRow[0]), 5); \
- v_im4H = _mm_srai_epi16(_mm_add_epi16(v_horPredN[1], v_topRow[1]), 5); \
- v_im5 = _mm_packus_epi16(v_im4L, v_im4H); \
- _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride], v_im5); \
-}
-
- COMP_PRED_PLANAR_ROW(0)
- COMP_PRED_PLANAR_ROW(1)
- COMP_PRED_PLANAR_ROW(2)
- COMP_PRED_PLANAR_ROW(3)
- COMP_PRED_PLANAR_ROW(4)
- COMP_PRED_PLANAR_ROW(5)
- COMP_PRED_PLANAR_ROW(6)
- COMP_PRED_PLANAR_ROW(7)
- COMP_PRED_PLANAR_ROW(8)
- COMP_PRED_PLANAR_ROW(9)
- COMP_PRED_PLANAR_ROW(10)
- COMP_PRED_PLANAR_ROW(11)
- COMP_PRED_PLANAR_ROW(12)
- COMP_PRED_PLANAR_ROW(13)
- COMP_PRED_PLANAR_ROW(14)
- COMP_PRED_PLANAR_ROW(15)
-
-#undef COMP_PRED_PLANAR_ROW
-}
-
-#endif // INSTRSET >= 5
-
-#if INSTRSET >= 5
-void predIntraPlanar32_sse4(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride)
-{
- pixel bottomLeft, topRight;
- __m128i v_topRow[4];
- __m128i v_bottomRow[4];
-
- // Get left and above reference column and row
- __m128i im0 = _mm_loadu_si128((__m128i*)&pSrc[0 - srcStride]); // topRow
- __m128i im1 = _mm_loadu_si128((__m128i*)&pSrc[16 - srcStride]); // topRow
-
- v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
- v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());
- v_topRow[2] = _mm_unpacklo_epi8(im1, _mm_setzero_si128());
- v_topRow[3] = _mm_unpackhi_epi8(im1, _mm_setzero_si128());
-
- // Prepare intermediate variables used in interpolation
- bottomLeft = pSrc[32 * srcStride - 1];
- topRight = pSrc[32 - srcStride];
-
- __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
-
- v_bottomRow[0] = _mm_sub_epi16(v_bottomLeft, v_topRow[0]);
- v_bottomRow[1] = _mm_sub_epi16(v_bottomLeft, v_topRow[1]);
- v_bottomRow[2] = _mm_sub_epi16(v_bottomLeft, v_topRow[2]);
- v_bottomRow[3] = _mm_sub_epi16(v_bottomLeft, v_topRow[3]);
-
- v_topRow[0] = _mm_slli_epi16(v_topRow[0], 5);
- v_topRow[1] = _mm_slli_epi16(v_topRow[1], 5);
- v_topRow[2] = _mm_slli_epi16(v_topRow[2], 5);
- v_topRow[3] = _mm_slli_epi16(v_topRow[3], 5);
-
- __m128i v_horPred, v_horPredN[4], v_rightColumnN[4];
- __m128i v_im4[4];
- __m128i v_im5[2];
-
-#define COMP_PRED_PLANAR_ROW(Y) { \
- v_horPred = _mm_cvtsi32_si128((pSrc[(Y)*srcStride - 1] << 5) + 32); \
- v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \
- v_horPred = _mm_shuffle_epi32(v_horPred, 0); \
- __m128i _tmp = _mm_cvtsi32_si128(topRight - pSrc[(Y)*srcStride - 1]); \
- _tmp = _mm_shufflelo_epi16(_tmp, 0); \
- _tmp = _mm_shuffle_epi32(_tmp, 0); \
- v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \
- v_rightColumnN[1] = _mm_mullo_epi16(_tmp, v_multiH); \
- v_rightColumnN[2] = _mm_mullo_epi16(_tmp, v_multiH2); \
- v_rightColumnN[3] = _mm_mullo_epi16(_tmp, v_multiH3); \
- v_horPredN[0] = _mm_add_epi16(v_horPred, v_rightColumnN[0]); \
- v_horPredN[1] = _mm_add_epi16(v_horPred, v_rightColumnN[1]); \
- v_horPredN[2] = _mm_add_epi16(v_horPred, v_rightColumnN[2]); \
- v_horPredN[3] = _mm_add_epi16(v_horPred, v_rightColumnN[3]); \
- v_topRow[0] = _mm_add_epi16(v_topRow[0], v_bottomRow[0]); \
- v_topRow[1] = _mm_add_epi16(v_topRow[1], v_bottomRow[1]); \
- v_topRow[2] = _mm_add_epi16(v_topRow[2], v_bottomRow[2]); \
- v_topRow[3] = _mm_add_epi16(v_topRow[3], v_bottomRow[3]); \
- v_im4[0] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[0], v_topRow[0]), 6); \
- v_im4[1] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[1], v_topRow[1]), 6); \
- v_im4[2] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[2], v_topRow[2]), 6); \
- v_im4[3] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[3], v_topRow[3]), 6); \
- v_im5[0] = _mm_packus_epi16(v_im4[0], v_im4[1]); \
- v_im5[1] = _mm_packus_epi16(v_im4[2], v_im4[3]); \
- _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride], v_im5[0]); \
- _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride + 16], v_im5[1]); \
-}
-
- int i;
- for (i = 0; i < 32; i += 2)
- {
- COMP_PRED_PLANAR_ROW(i + 0);
- COMP_PRED_PLANAR_ROW(i + 1);
- }
-
-#undef COMP_PRED_PLANAR_ROW
-}
-
-#endif // INSTRSET >= 5
-
-#if INSTRSET >= 5
-void predIntraPlanar64_sse4(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride)
-{
- pixel bottomLeft, topRight;
- __m128i v_topRow[8];
- __m128i v_bottomRow[8];
-
- // Get left and above reference column and row
- __m128i im0 = _mm_loadu_si128((__m128i*)&pSrc[0 - srcStride]); // topRow
- __m128i im1 = _mm_loadu_si128((__m128i*)&pSrc[16 - srcStride]); // topRow
- __m128i im2 = _mm_loadu_si128((__m128i*)&pSrc[32 - srcStride]); // topRow
- __m128i im3 = _mm_loadu_si128((__m128i*)&pSrc[48 - srcStride]); // topRow
-
- v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
- v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());
- v_topRow[2] = _mm_unpacklo_epi8(im1, _mm_setzero_si128());
- v_topRow[3] = _mm_unpackhi_epi8(im1, _mm_setzero_si128());
- v_topRow[4] = _mm_unpacklo_epi8(im2, _mm_setzero_si128());
- v_topRow[5] = _mm_unpackhi_epi8(im2, _mm_setzero_si128());
- v_topRow[6] = _mm_unpacklo_epi8(im3, _mm_setzero_si128());
- v_topRow[7] = _mm_unpackhi_epi8(im3, _mm_setzero_si128());
-
- // Prepare intermediate variables used in interpolation
- bottomLeft = pSrc[64 * srcStride - 1];
- topRight = pSrc[64 - srcStride];
-
- __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
-
- v_bottomRow[0] = _mm_sub_epi16(v_bottomLeft, v_topRow[0]);
- v_bottomRow[1] = _mm_sub_epi16(v_bottomLeft, v_topRow[1]);
- v_bottomRow[2] = _mm_sub_epi16(v_bottomLeft, v_topRow[2]);
- v_bottomRow[3] = _mm_sub_epi16(v_bottomLeft, v_topRow[3]);
- v_bottomRow[4] = _mm_sub_epi16(v_bottomLeft, v_topRow[4]);
- v_bottomRow[5] = _mm_sub_epi16(v_bottomLeft, v_topRow[5]);
- v_bottomRow[6] = _mm_sub_epi16(v_bottomLeft, v_topRow[6]);
- v_bottomRow[7] = _mm_sub_epi16(v_bottomLeft, v_topRow[7]);
-
- v_topRow[0] = _mm_slli_epi16(v_topRow[0], 6);
- v_topRow[1] = _mm_slli_epi16(v_topRow[1], 6);
- v_topRow[2] = _mm_slli_epi16(v_topRow[2], 6);
- v_topRow[3] = _mm_slli_epi16(v_topRow[3], 6);
- v_topRow[4] = _mm_slli_epi16(v_topRow[4], 6);
- v_topRow[5] = _mm_slli_epi16(v_topRow[5], 6);
- v_topRow[6] = _mm_slli_epi16(v_topRow[6], 6);
- v_topRow[7] = _mm_slli_epi16(v_topRow[7], 6);
-
- __m128i v_horPred, v_horPredN[8], v_rightColumnN[8];
- __m128i v_im4[8];
- __m128i v_im5[4];
-
-#define COMP_PRED_PLANAR_ROW(Y) { \
- v_horPred = _mm_cvtsi32_si128((pSrc[(Y)*srcStride - 1] << 6) + 64); \
- v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \
- v_horPred = _mm_shuffle_epi32(v_horPred, 0); \
- __m128i _tmp = _mm_cvtsi32_si128(topRight - pSrc[(Y)*srcStride - 1]); \
- _tmp = _mm_shufflelo_epi16(_tmp, 0); \
- _tmp = _mm_shuffle_epi32(_tmp, 0); \
- v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \
- v_rightColumnN[1] = _mm_mullo_epi16(_tmp, v_multiH); \
- v_rightColumnN[2] = _mm_mullo_epi16(_tmp, v_multiH2); \
- v_rightColumnN[3] = _mm_mullo_epi16(_tmp, v_multiH3); \
- v_rightColumnN[4] = _mm_mullo_epi16(_tmp, v_multiH4); \
- v_rightColumnN[5] = _mm_mullo_epi16(_tmp, v_multiH5); \
- v_rightColumnN[6] = _mm_mullo_epi16(_tmp, v_multiH6); \
- v_rightColumnN[7] = _mm_mullo_epi16(_tmp, v_multiH7); \
- v_horPredN[0] = _mm_add_epi16(v_horPred, v_rightColumnN[0]); \
- v_horPredN[1] = _mm_add_epi16(v_horPred, v_rightColumnN[1]); \
- v_horPredN[2] = _mm_add_epi16(v_horPred, v_rightColumnN[2]); \
- v_horPredN[3] = _mm_add_epi16(v_horPred, v_rightColumnN[3]); \
- v_horPredN[4] = _mm_add_epi16(v_horPred, v_rightColumnN[4]); \
- v_horPredN[5] = _mm_add_epi16(v_horPred, v_rightColumnN[5]); \
- v_horPredN[6] = _mm_add_epi16(v_horPred, v_rightColumnN[6]); \
- v_horPredN[7] = _mm_add_epi16(v_horPred, v_rightColumnN[7]); \
- v_topRow[0] = _mm_add_epi16(v_topRow[0], v_bottomRow[0]); \
- v_topRow[1] = _mm_add_epi16(v_topRow[1], v_bottomRow[1]); \
- v_topRow[2] = _mm_add_epi16(v_topRow[2], v_bottomRow[2]); \
- v_topRow[3] = _mm_add_epi16(v_topRow[3], v_bottomRow[3]); \
- v_topRow[4] = _mm_add_epi16(v_topRow[4], v_bottomRow[4]); \
- v_topRow[5] = _mm_add_epi16(v_topRow[5], v_bottomRow[5]); \
- v_topRow[6] = _mm_add_epi16(v_topRow[6], v_bottomRow[6]); \
- v_topRow[7] = _mm_add_epi16(v_topRow[7], v_bottomRow[7]); \
- v_im4[0] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[0], v_topRow[0]), 7); \
- v_im4[1] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[1], v_topRow[1]), 7); \
- v_im4[2] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[2], v_topRow[2]), 7); \
- v_im4[3] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[3], v_topRow[3]), 7); \
- v_im4[4] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[4], v_topRow[4]), 7); \
- v_im4[5] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[5], v_topRow[5]), 7); \
- v_im4[6] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[6], v_topRow[6]), 7); \
- v_im4[7] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[7], v_topRow[7]), 7); \
- v_im5[0] = _mm_packus_epi16(v_im4[0], v_im4[1]); \
- v_im5[1] = _mm_packus_epi16(v_im4[2], v_im4[3]); \
- v_im5[2] = _mm_packus_epi16(v_im4[4], v_im4[5]); \
- v_im5[3] = _mm_packus_epi16(v_im4[6], v_im4[7]); \
- _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride], v_im5[0]); \
- _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride + 16], v_im5[1]); \
- _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride + 32], v_im5[2]); \
- _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride + 48], v_im5[3]); \
-}
-
- int i;
- for (i = 0; i < 64; i++)
- {
- COMP_PRED_PLANAR_ROW(i);
- }
-
-#undef COMP_PRED_PLANAR_ROW
-}
-
-#endif // INSTRSET >= 5
-
-#endif /* if HIGH_BIT_DEPTH */
-
-typedef void predIntraPlanar_t (pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride);
-predIntraPlanar_t *intraPlanarN[] =
-{
-#if !HIGH_BIT_DEPTH && INSTRSET >= 5
- predIntraPlanar4_sse4,
- predIntraPlanar8_sse4,
- predIntraPlanar16_sse4,
- predIntraPlanar32_sse4,
- predIntraPlanar64_sse4,
-#else
- predIntraPlanar4,
- predIntraPlanar8,
- predIntraPlanar16,
-#endif
-};
-
-void predIntraPlanar(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride, int width)
-{
- //assert(width == height);
-
- int nLog2Size = g_aucConvertToBit[width] + 2;
-
-#if (!HIGH_BIT_DEPTH) && (INSTRSET >= 5)
- intraPlanarN[nLog2Size - 2](pSrc, srcStride, rpDst, dstStride);
- return;
-#else
- int k, l, bottomLeft, topRight;
- int horPred;
- // OPT_ME: when width is 64 the intermediate sums grow by roughly 8 bits (to about +/-65280), so we have to use 32 bits here
- int32_t leftColumn[MAX_CU_SIZE + 1], topRow[MAX_CU_SIZE + 1];
- // CHECK_ME: dynamic range is 9 bits or 15 bits (assuming a maximum input bit depth of 14 bits)
- int16_t bottomRow[MAX_CU_SIZE], rightColumn[MAX_CU_SIZE];
- int blkSize = width;
- int offset2D = width;
- int shift1D = nLog2Size;
- int shift2D = shift1D + 1;
-
- if (width < 32)
- {
- intraPlanarN[nLog2Size - 2](pSrc, srcStride, rpDst, dstStride);
- return;
- }
-
- // Get left and above reference column and row
- for (k = 0; k < blkSize + 1; k++)
- {
- topRow[k] = pSrc[k - srcStride];
- leftColumn[k] = pSrc[k * srcStride - 1];
- }
-
- // Prepare intermediate variables used in interpolation
- bottomLeft = leftColumn[blkSize];
- topRight = topRow[blkSize];
- for (k = 0; k < blkSize; k++)
- {
- bottomRow[k] = bottomLeft - topRow[k];
- rightColumn[k] = topRight - leftColumn[k];
- topRow[k] <<= shift1D;
- leftColumn[k] <<= shift1D;
- }
-
- // Generate prediction signal
- for (k = 0; k < blkSize; k++)
- {
- horPred = leftColumn[k] + offset2D;
- for (l = 0; l < blkSize; l++)
- {
- horPred += rightColumn[k];
- topRow[l] += bottomRow[l];
- rpDst[k * dstStride + l] = ((horPred + topRow[l]) >> shift2D);
- }
- }
-
-#endif /* if (!HIGH_BIT_DEPTH) && (INSTRSET >= 5) */
-}
-
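For reference, both the SIMD rows and the C fallback above implement the standard HEVC planar rule. A minimal scalar sketch of the per-sample formula (the names top, left, topRight, bottomLeft and log2Size are assumptions, gathered the same way as in the C fallback):

    // Scalar sketch of HEVC planar prediction (assumed reference names).
    void planarRef(const int16_t* top, const int16_t* left,
                   int16_t topRight, int16_t bottomLeft,
                   pixel* dst, intptr_t dstStride, int log2Size)
    {
        int size = 1 << log2Size;
        for (int y = 0; y < size; y++)
            for (int x = 0; x < size; x++)
                dst[y * dstStride + x] = (pixel)(((size - 1 - x) * left[y] +
                                                  (x + 1) * topRight +
                                                  (size - 1 - y) * top[x] +
                                                  (y + 1) * bottomLeft +
                                                  size) >> (log2Size + 1));
    }

The COMP_PRED_PLANAR_ROW macros avoid the per-sample multiplies by exploiting that the vertical term changes by a constant (bottomRow) per row and the horizontal term by a constant (rightColumn) per column.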
-#if HIGH_BIT_DEPTH
-void xPredIntraAng4x4(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove)
-{
- int blkSize = width;
-
- // Map the mode index to main prediction direction and angle
- assert(dirMode > 1); //no planar and dc
- bool modeHor = (dirMode < 18);
- bool modeVer = !modeHor;
- int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
- int lookIdx = intraPredAngle;
- int absAng = abs(intraPredAngle);
- int signAng = intraPredAngle < 0 ? -1 : 1;
-
- // Set bitshifts and scale the angle parameter to block size
- int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
- int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
- int invAngle = invAngTable[absAng];
- absAng = angTable[absAng];
- intraPredAngle = signAng * absAng;
-
- // Do angular predictions
-
- pixel* refMain;
- pixel* refSide;
-
- // Initialise the Main and Left reference array.
- if (intraPredAngle < 0)
- {
- refMain = (modeVer ? refAbove : refLeft); // + (blkSize - 1);
- refSide = (modeVer ? refLeft : refAbove); // + (blkSize - 1);
-
- // Extend the Main reference to the left.
- int invAngleSum = 128; // rounding for (shift by 8)
- for (int k = -1; k > blkSize * intraPredAngle >> 5; k--)
- {
- invAngleSum += invAngle;
- refMain[k] = refSide[invAngleSum >> 8];
- }
- }
- else
- {
- refMain = modeVer ? refAbove : refLeft;
- refSide = modeVer ? refLeft : refAbove;
- }
-
- // bfilter will always be true for blocksize 4
- if (intraPredAngle == 0) // Exactly horizontal/vertical angles
- {
- if (modeHor)
- {
- Vec8s v_temp;
- Vec8s v_side_0; // refSide[0] value in a vector
- v_temp.load((void*)refSide);
- v_side_0 = broadcast(const_int(0), (Vec8s)v_temp);
-
- Vec8s v_side;
- v_side.load(refSide + 1);
-
- Vec8s v_main;
- v_main = load_partial(const_int(8), (void*)(refMain + 1));
-
- Vec8s tmp1, tmp2;
- tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(v_main, v_main);
- tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp1);
- tmp1 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp1);
-
- Vec8s row0;
- v_side -= v_side_0;
- v_side = v_side >> 1;
- row0 = tmp2 + v_side;
- row0 = min(max(0, row0), (1 << bitDepth) - 1);
-
- store_partial(const_int(8), pDst, row0); //row0
- store_partial(const_int(8), pDst + (2 * dstStride), tmp1); //row2
-
- tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp2, tmp2);
- tmp1 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp1);
-
- store_partial(const_int(8), pDst + (3 * dstStride), tmp1); //row3
- store_partial(const_int(8), pDst + (dstStride), tmp2); //row1
- }
- else
- {
- Vec16uc v_main;
- v_main = load_partial(const_int(8), refMain + 1);
- store_partial(const_int(8), pDst, v_main);
- store_partial(const_int(8), pDst + dstStride, v_main);
- store_partial(const_int(8), pDst + (2 * dstStride), v_main);
- store_partial(const_int(8), pDst + (3 * dstStride), v_main);
-
- for (int k = 0; k < 4; k++)
- {
- pDst[k * dstStride] = (pixel)Clip3((short)0, (short)((1 << bitDepth) - 1), static_cast<short>((pDst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1)));
- }
- }
- }
- else if (intraPredAngle == -32)
- {
- Vec8s tmp;
- tmp = load_partial(const_int(8), refMain); //-1,0,1,2
- store_partial(const_int(8), pDst, tmp);
- tmp = load_partial(const_int(8), refMain - 1); //-2,-1,0,1
- store_partial(const_int(8), pDst + dstStride, tmp);
- tmp = load_partial(const_int(8), refMain - 2);
- store_partial(const_int(8), pDst + 2 * dstStride, tmp);
- tmp = load_partial(const_int(8), refMain - 3);
- store_partial(const_int(8), pDst + 3 * dstStride, tmp);
- return;
- }
- else if (intraPredAngle == 32)
- {
- Vec8s tmp;
- tmp = load_partial(const_int(8), refMain + 2); //1,2,3,4
- store_partial(const_int(8), pDst, tmp);
- tmp = load_partial(const_int(8), refMain + 3); //2,3,4,5
- store_partial(const_int(8), pDst + dstStride, tmp);
- tmp = load_partial(const_int(8), refMain + 4);
- store_partial(const_int(8), pDst + 2 * dstStride, tmp);
- tmp = load_partial(const_int(8), refMain + 5);
- store_partial(const_int(8), pDst + 3 * dstStride, tmp);
- return;
- }
- else
- {
- Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
- Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
-
- row11 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 0));
- row12 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 0) + 1);
-
- row21 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 1));
- row22 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 1) + 1);
-
- row31 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 2));
- row32 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 2) + 1);
-
- row41 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 3));
- row42 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 3) + 1);
-
- v_deltaPos = v_ipAngle = intraPredAngle;
-
- //row1
- v_deltaFract = v_deltaPos & thirty1;
- row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) + 16) >> 5;
-
- //row2
- v_deltaPos += v_ipAngle;
- v_deltaFract = v_deltaPos & thirty1;
- row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) + 16) >> 5;
-
- //row3
- v_deltaPos += v_ipAngle;
- v_deltaFract = v_deltaPos & thirty1;
- row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) + 16) >> 5;
-
- //row4
- v_deltaPos += v_ipAngle;
- v_deltaFract = v_deltaPos & thirty1;
- row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) + 16) >> 5;
-
- // Flip the block
-
- if (modeHor)
- {
- Vec8s tmp1, tmp2, tmp3, tmp4;
-
- tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
- tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
-
- tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
- tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
-
- //tmp16_1 = compress(tmp3, tmp3);
- store_partial(const_int(8), pDst, tmp3);
-
- store_partial(const_int(8), pDst + (2 * dstStride), tmp4); //row2
-
- tmp3 = blend2q<1, 3>((Vec2q)tmp3, (Vec2q)tmp3);
- tmp4 = blend2q<1, 3>((Vec2q)tmp4, (Vec2q)tmp4);
-
- store_partial(const_int(8), pDst + (3 * dstStride), tmp4); //row3
- store_partial(const_int(8), pDst + (dstStride), tmp3); //row1
- }
- else
- {
- store_partial(const_int(8), pDst, row11);
- store_partial(const_int(8), pDst + (dstStride), row21);
- store_partial(const_int(8), pDst + (2 * dstStride), row31);
- store_partial(const_int(8), pDst + (3 * dstStride), row41);
- }
- }
-}
-
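The mode-to-angle mapping at the top of each of these functions is shared by every block size: modes 2-17 are "horizontal" (predicted along the left reference, then flipped), modes 18-34 are "vertical", and the distance from the pure horizontal/vertical mode indexes angTable. A standalone sketch of that mapping (the helper name is hypothetical; HOR_IDX == 10 and VER_IDX == 26 as in HM):

    #include <cstdlib>

    // Hypothetical helper mirroring the mapping inlined in each function above.
    static int mapModeToAngle(int dirMode) // dirMode in [2, 34]
    {
        static const int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
        bool modeHor = (dirMode < 18);
        int raw = modeHor ? -(dirMode - 10) : (dirMode - 26); // HOR_IDX / VER_IDX
        int sign = (raw < 0) ? -1 : 1;
        return sign * angTable[abs(raw)]; // e.g. mode 5 -> +17, mode 19 -> -26
    }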
-#else /* if HIGH_BIT_DEPTH */
-void xPredIntraAng4x4(int /*bitDepth*/, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove, bool bFilter = true)
-{
- int blkSize = width;
-
- // Map the mode index to main prediction direction and angle
- assert(dirMode > 1); //no planar and dc
- bool modeHor = (dirMode < 18);
- bool modeVer = !modeHor;
- int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
- int absAng = abs(intraPredAngle);
- int signAng = intraPredAngle < 0 ? -1 : 1;
-
- // Set bitshifts and scale the angle parameter to block size
- int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
- int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
- int invAngle = invAngTable[absAng];
- absAng = angTable[absAng];
- intraPredAngle = signAng * absAng;
-
- // Do angular predictions
-
- pixel* refMain;
- pixel* refSide;
-
- // Initialise the Main and Left reference array.
- if (intraPredAngle < 0)
- {
- refMain = (modeVer ? refAbove : refLeft); // + (blkSize - 1);
- refSide = (modeVer ? refLeft : refAbove); // + (blkSize - 1);
-
- // Extend the Main reference to the left.
- int invAngleSum = 128; // rounding for (shift by 8)
- for (int k = -1; k > blkSize * intraPredAngle >> 5; k--)
- {
- invAngleSum += invAngle;
- refMain[k] = refSide[invAngleSum >> 8];
- }
- }
- else
- {
- refMain = modeVer ? refAbove : refLeft;
- refSide = modeVer ? refLeft : refAbove;
- }
-
- // bfilter will always be true for exactly vertical/horizontal modes
- if (intraPredAngle == 0) // Exactly horizontal/vertical angles
- {
- if (modeHor)
- {
- Vec16uc v_main;
- v_main = load_partial(const_int(4), (void*)(refMain + 1));
-
- Vec16uc tmp16;
- tmp16 = blend16c<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(v_main, v_main);
- tmp16 = blend16c<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(tmp16, tmp16);
- Vec2uq tmp;
-
- if (bFilter)
- {
- Vec16uc v_temp;
- Vec8s v_side_0; // refSide[0] value in a vector
- v_temp = load_partial(const_int(8), (void*)refSide);
- v_side_0 = broadcast(const_int(0), (Vec8s)v_temp);
- v_side_0 = v_side_0 & 0x00FF;
-
- //shift v_side by 1 element (1 byte)
- tmp = reinterpret_i(v_temp);
- tmp = tmp >> 8;
- v_temp = reinterpret_i(tmp);
- Vec8s v_side = extend_low(v_temp);
-
- Vec8s row0 = extend_low(tmp16);
- v_side -= v_side_0;
- v_side = v_side >> 1;
- row0 += v_side;
- row0 = min(max(0, row0), 255);
- Vec16uc v_res(compress_unsafe(row0, 0));
- store_partial(const_int(4), pDst, v_res);
- }
- else
- {
- store_partial(const_int(4), pDst, tmp16);
- }
-
- tmp = (Vec2uq)tmp16;
- tmp >>= 32;
- store_partial(const_int(4), pDst + dstStride, tmp);
-
- tmp = blend2q<1, 3>(reinterpret_i(tmp16), reinterpret_i(tmp16));
- store_partial(const_int(4), pDst + (2 * dstStride), tmp);
-
- tmp >>= 32;
- store_partial(const_int(4), pDst + (3 * dstStride), tmp);
- }
- else
- {
- Vec16uc v_main;
- v_main = load_partial(const_int(4), refMain + 1);
- store_partial(const_int(4), pDst, v_main);
- store_partial(const_int(4), pDst + dstStride, v_main);
- store_partial(const_int(4), pDst + (2 * dstStride), v_main);
- store_partial(const_int(4), pDst + (3 * dstStride), v_main);
- if (bFilter)
- {
- for (int k = 0; k < 4; k++)
- {
- pDst[k * dstStride] = (pixel)Clip3((short)0, (short)((1 << 8) - 1), static_cast<short>((pDst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1)));
- }
- }
- }
- }
- else
- {
- Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
- Vec16uc tmp16_1, tmp16_2;
- Vec2uq tmp2uq;
- Vec8s v_deltaFract, v_deltaPos(0), thirty2(32), thirty1(31), v_ipAngle(0);
- switch (intraPredAngle)
- {
- case -32:
- tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain); //-1,0,1,2
- store_partial(const_int(4), pDst, tmp16_1);
- tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain - 1); //-2,-1,0,1
- store_partial(const_int(4), pDst + dstStride, tmp16_2);
- tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain - 2);
- store_partial(const_int(4), pDst + 2 * dstStride, tmp16_2);
- tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain - 3);
- store_partial(const_int(4), pDst + 3 * dstStride, tmp16_2);
- return;
-
- case -26:
- tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 3);
- row41 = extend_low(tmp16_1); //offsets(-4,-3,-2,-1)
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 8;
- tmp16_2 = reinterpret_i(tmp2uq);
- row42 = extend_low(tmp16_2); //offsets(-3,-2,-1,0)
-
- row31 = row42; //offsets(-3,-2,-1,0)
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 16;
- tmp16_2 = reinterpret_i(tmp2uq);
- row32 = extend_low(tmp16_2); //offsets(-2,-1,0,1)
-
- row21 = row32; //offsets(-2,-1,0,1)
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 24;
- tmp16_2 = reinterpret_i(tmp2uq);
- row22 = extend_low(tmp16_2); //offsets(-1,0,1,2)
-
- row11 = row22; //offsets(-1,0,1,2)
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 32;
- tmp16_2 = reinterpret_i(tmp2uq);
- row12 = extend_low(tmp16_2); //offsets(0,1,2,3)
-
- v_deltaPos = v_ipAngle = -26;
- break;
-
- case -21:
- tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 2);
- row41 = extend_low(tmp16_1); //offsets(-3,-2,-1,0)
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 8;
- tmp16_2 = reinterpret_i(tmp2uq);
- row42 = extend_low(tmp16_2); //offsets(-2,-1,0,1)
-
- row31 = row42; //offsets(-2,-1,0,1)
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 16;
- tmp16_2 = reinterpret_i(tmp2uq);
- row32 = extend_low(tmp16_2); //offsets(-1,0,1,2)
-
- row21 = row31; //offsets(-2,-1,0,1)
- row22 = row32;
-
- row11 = row32;
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 24;
- tmp16_2 = reinterpret_i(tmp2uq);
- row12 = extend_low(tmp16_2); //offsets(0,1,2,3)
-
- v_deltaPos = v_ipAngle = -21;
- break;
-
- case -17:
- tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 2);
- row41 = extend_low(tmp16_1); //offsets(-3,-2,-1,0)
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 8;
- tmp16_2 = reinterpret_i(tmp2uq);
- row42 = extend_low(tmp16_2); //offsets(-2,-1,0,1)
-
- row31 = row42; //offsets(-2,-1,0,1)
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 16;
- tmp16_2 = reinterpret_i(tmp2uq);
- row32 = extend_low(tmp16_2); //offsets(-1,0,1,2)
-
- row21 = row31; //offsets(-2,-1,0,1)
- row22 = row32;
-
- row11 = row32;
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 24;
- tmp16_2 = reinterpret_i(tmp2uq);
- row12 = extend_low(tmp16_2); //offsets(0,1,2,3)
-
- v_deltaPos = v_ipAngle = -17;
- break;
-
- case -13:
- tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 1);
- row41 = extend_low(tmp16_1); //offsets(-2,-1,0,1)
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 8;
- tmp16_2 = reinterpret_i(tmp2uq);
- row42 = extend_low(tmp16_2); //offsets(-1,0,1,2)
-
- row11 = row42;
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 16;
- tmp16_2 = reinterpret_i(tmp2uq);
- row12 = extend_low(tmp16_2); //offsets(0,1,2,3)
-
- row21 = row42; //offsets(-1,0,1,2)
- row22 = row12;
- row31 = row41;
- row32 = row42;
-
- v_deltaPos = v_ipAngle = -13;
- break;
-
- case -9:
- tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 1);
- row41 = extend_low(tmp16_1); //offsets(-2,-1,0,1)
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 8;
- tmp16_2 = reinterpret_i(tmp2uq);
- row42 = extend_low(tmp16_2); //offsets(-1,0,1,2)
-
- row11 = row42;
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 16;
- tmp16_2 = reinterpret_i(tmp2uq);
- row12 = extend_low(tmp16_2); //offsets(0,1,2,3)
-
- row21 = row42; //offsets(-1,0,1,2)
- row22 = row12;
- row31 = row42;
- row32 = row12;
-
- v_deltaPos = v_ipAngle = -9;
- break;
-
- case -5:
- tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain);
- row11 = extend_low(tmp16_1); //offsets(-1,0,1,2)
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 8;
- tmp16_2 = reinterpret_i(tmp2uq);
- row12 = extend_low(tmp16_2); //offsets(0,1,2,3)
- row21 = row11; //offsets(-1,0,1,2)
- row22 = row12;
- row31 = row11;
- row32 = row12;
- row41 = row11;
- row42 = row12;
-
- v_deltaPos = v_ipAngle = -5;
- break;
-
- case -2:
- tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain);
- row11 = extend_low(tmp16_1); //offsets(-1,0,1,2)
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 8;
- tmp16_2 = reinterpret_i(tmp2uq);
- row12 = extend_low(tmp16_2); //offsets(0,1,2,3)
- row21 = row11; //offsets(-1,0,1,2)
- row22 = row12;
- row31 = row11;
- row32 = row12;
- row41 = row11;
- row42 = row12;
-
- v_deltaPos = v_ipAngle = -2;
- break;
-
- case 2:
- tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
- row11 = extend_low(tmp16_1); //offsets(0,1,2,3)
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 8;
- tmp16_2 = reinterpret_i(tmp2uq);
- row12 = extend_low(tmp16_2); //offsets(1,2,3,4)
- row21 = row11; //offsets(0,1,2,3)
- row22 = row12;
- row31 = row11;
- row32 = row12;
- row41 = row11;
- row42 = row12;
-
- v_deltaPos = v_ipAngle = 2;
- break;
-
- case 5:
- tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
- row11 = extend_low(tmp16_1); //offsets(0,1,2,3)
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 8;
- tmp16_2 = reinterpret_i(tmp2uq);
- row12 = extend_low(tmp16_2); //offsets(1,2,3,4)
- row21 = row11; //offsets(0,1,2,3)
- row22 = row12;
- row31 = row11;
- row32 = row12;
- row41 = row11;
- row42 = row12;
-
- v_deltaPos = v_ipAngle = 5;
- break;
-
- case 9:
- tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
- row11 = extend_low(tmp16_1); //offsets(0,1,2,3)
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 8;
- tmp16_2 = reinterpret_i(tmp2uq);
- row12 = extend_low(tmp16_2); //offsets(1,2,3,4)
- row21 = row11; //offsets(0,1,2,3)
- row22 = row12;
- row31 = row11;
- row32 = row12;
- row41 = row12;
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 16;
- tmp16_2 = reinterpret_i(tmp2uq);
- row42 = extend_low(tmp16_2);
-
- v_deltaPos = v_ipAngle = 9;
- break;
-
- case 13:
- tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
-
- row11 = extend_low(tmp16_1); //offsets(0,1,2,3)
-
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 8;
- tmp16_2 = reinterpret_i(tmp2uq);
- row12 = extend_low(tmp16_2); //offsets(1,2,3,4)
-
- row21 = row11; //offsets(0,1,2,3)
- row22 = row12;
- row31 = row12; //offsets(1,2,3,4)
-
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 16;
- tmp16_2 = reinterpret_i(tmp2uq);
- row32 = extend_low(tmp16_2); //offsets(2,3,4,5)
-
- row41 = row31; //offsets(1,2,3,4)
- row42 = row32;
-
- v_deltaPos = v_ipAngle = 13;
- break;
-
- case 17:
- tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
-
- row11 = extend_low(tmp16_1); //offsets(0,1,2,3)
-
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 8;
- tmp16_2 = reinterpret_i(tmp2uq);
- row12 = extend_low(tmp16_2); //offsets(1,2,3,4)
-
- row21 = row12;
-
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 16;
- tmp16_2 = reinterpret_i(tmp2uq);
- row22 = extend_low(tmp16_2); //offsets(2,3,4,5)
-
- row31 = row21;
- row32 = row22;
-
- row41 = row22;
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 24;
- tmp16_2 = reinterpret_i(tmp2uq);
- row42 = extend_low(tmp16_2); //offsets(3,4,5,6)
-
- v_deltaPos = v_ipAngle = 17;
- break;
-
- case 21:
- tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
-
- row11 = extend_low(tmp16_1); //offsets(0,1,2,3)
-
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 8;
- tmp16_2 = reinterpret_i(tmp2uq);
- row12 = extend_low(tmp16_2); //offsets(1,2,3,4)
-
- row21 = row12;
-
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 16;
- tmp16_2 = reinterpret_i(tmp2uq);
- row22 = extend_low(tmp16_2); //offsets(2,3,4,5)
-
- row31 = row21;
- row32 = row22;
-
- row41 = row22;
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 24;
- tmp16_2 = reinterpret_i(tmp2uq);
- row42 = extend_low(tmp16_2); //offsets(3,4,5,6)
-
- v_deltaPos = v_ipAngle = 21;
- break;
-
- case 26:
- tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
-
- row11 = extend_low(tmp16_1); //offsets(0,1,2,3)
-
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 8;
- tmp16_2 = reinterpret_i(tmp2uq);
- row12 = extend_low(tmp16_2); //offsets(1,2,3,4)
-
- row21 = row12;
-
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 16;
- tmp16_2 = reinterpret_i(tmp2uq);
- row22 = extend_low(tmp16_2); //offsets(2,3,4,5)
-
- row31 = row22;
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 24;
- tmp16_2 = reinterpret_i(tmp2uq);
- row32 = extend_low(tmp16_2); //offsets(3,4,5,6)
-
- row41 = row32;
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq = tmp2uq >> 32;
- tmp16_2 = reinterpret_i(tmp2uq);
- row42 = extend_low(tmp16_2); //offsets(4,5,6,7)
-
- v_deltaPos = v_ipAngle = 26;
- break;
-
- case 32:
- tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 2);
- store_partial(const_int(4), pDst, tmp16_1);
- tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain + 3);
- store_partial(const_int(4), pDst + dstStride, tmp16_2);
- tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain + 4);
- store_partial(const_int(4), pDst + 2 * dstStride, tmp16_2);
- tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain + 5);
- store_partial(const_int(4), pDst + 3 * dstStride, tmp16_2);
- return;
- }
-
- //row1
- v_deltaFract = v_deltaPos & thirty1;
- row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) + 16) >> 5;
-
- //row2
- v_deltaPos += v_ipAngle;
- v_deltaFract = v_deltaPos & thirty1;
- row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) + 16) >> 5;
-
- //row3
- v_deltaPos += v_ipAngle;
- v_deltaFract = v_deltaPos & thirty1;
- row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) + 16) >> 5;
-
- //row4
- v_deltaPos += v_ipAngle;
- v_deltaFract = v_deltaPos & thirty1;
- row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) + 16) >> 5;
-
- // Flip the block
-
- if (modeHor)
- {
- Vec8s tmp1, tmp2, tmp3, tmp4;
-
- tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
- tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
-
- tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
- tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
-
- tmp16_1 = compress_unsafe(tmp3, tmp3);
- store_partial(const_int(4), pDst, tmp16_1);
-
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq >>= 32;
- store_partial(const_int(4), pDst + dstStride, tmp2uq);
-
- tmp16_1 = compress_unsafe(tmp4, tmp4);
- store_partial(const_int(4), pDst + (2 * dstStride), tmp16_1);
-
- tmp2uq = reinterpret_i(tmp16_1);
- tmp2uq >>= 32;
- store_partial(const_int(4), pDst + (3 * dstStride), tmp2uq);
- }
- else
- {
- store_partial(const_int(4), pDst, compress_unsafe(row11, row11));
- store_partial(const_int(4), pDst + (dstStride), compress_unsafe(row21, row21));
- store_partial(const_int(4), pDst + (2 * dstStride), compress_unsafe(row31, row31));
- store_partial(const_int(4), pDst + (3 * dstStride), compress_unsafe(row41, row41));
- }
- }
-}
-
-#endif /* if HIGH_BIT_DEPTH */
-
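When intraPredAngle is negative, each variant above first extends refMain to the left by projecting side-reference samples through the inverse angle; invAngTable stores (256 * 32) / angle in 8.8 fixed point and invAngleSum starts at 128 for rounding. A worked example for angle -26 (modes 17 and 19) on a 4x4 block, assuming refMain/refSide are set up as in the functions above:

    // Sketch of the projection the extension loop performs for angle -26.
    // blkSize * intraPredAngle >> 5 == (4 * -26) >> 5 == -4, so k runs -1..-3.
    int invAngle = 315, invAngleSum = 128;
    invAngleSum += invAngle; // 443 >> 8 == 1 -> refMain[-1] = refSide[1]
    invAngleSum += invAngle; // 758 >> 8 == 2 -> refMain[-2] = refSide[2]
    invAngleSum += invAngle; // 1073 >> 8 == 4 -> refMain[-3] = refSide[4]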
-#if HIGH_BIT_DEPTH
-#else
-#define PREDANG_CALCROW_VER(X) { \
- LOADROW(row11, GETAP(lookIdx, X)); \
- LOADROW(row12, GETAP(lookIdx, X) + 1); \
- CALCROW(row11, row11, row12); \
- store_partial(const_int(8), pDst + (X * dstStride), compress(row11, row11)); \
-}
-
-#define PREDANG_CALCROW_HOR(X, rowx) { \
- LOADROW(row11, GETAP(lookIdx, X)); \
- LOADROW(row12, GETAP(lookIdx, X) + 1); \
- CALCROW(rowx, row11, row12); \
-}
-
- // ROW is a Vec8s variable, X is the index of the data to be loaded
-#define LOADROW(ROW, X) { \
- tmp = load_partial(const_int(8), refMain + 1 + X); \
- ROW = extend_low(tmp); \
-}
-
-#define CALCROW(RES, ROW1, ROW2) { \
- v_deltaPos += v_ipAngle; \
- v_deltaFract = v_deltaPos & thirty1; \
- RES = ((thirty2 - v_deltaFract) * ROW1 + (v_deltaFract * ROW2) + 16) >> 5; \
-}
-
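LOADROW/CALCROW split the reference angular filter in two: the GETAP lookup supplies the whole-sample offset for each row, and CALCROW performs the 1/32-sample fractional blend. A scalar sketch of what one row computes, matching the HM reference code (deltaPos, blkSize, refMain and dst are the surrounding context):

    // Scalar equivalent of one row of the angular filter (sketch).
    deltaPos += intraPredAngle;
    int deltaInt   = deltaPos >> 5;  // whole-sample offset (the GETAP value)
    int deltaFract = deltaPos & 31;  // fractional position in 1/32 samples
    for (int x = 0; x < blkSize; x++)
        dst[x] = (pixel)(((32 - deltaFract) * refMain[x + deltaInt + 1] +
                          deltaFract * refMain[x + deltaInt + 2] + 16) >> 5);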
-void xPredIntraAng8x8(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove, bool bFilter = true)
-{
- int k;
- int blkSize = width;
-
- // Map the mode index to main prediction direction and angle
- assert(dirMode > 1); //no planar and dc
- bool modeHor = (dirMode < 18);
- bool modeVer = !modeHor;
- int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
- int lookIdx = intraPredAngle;
- int absAng = abs(intraPredAngle);
- int signAng = intraPredAngle < 0 ? -1 : 1;
-
- // Set bitshifts and scale the angle parameter to block size
- int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
- int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
- int invAngle = invAngTable[absAng];
- absAng = angTable[absAng];
- intraPredAngle = signAng * absAng;
-
- // Do angular predictions
-
- pixel* refMain;
- pixel* refSide;
-
- // Initialise the Main and Left reference array.
- if (intraPredAngle < 0)
- {
- refMain = (modeVer ? refAbove : refLeft); // + (blkSize - 1);
- refSide = (modeVer ? refLeft : refAbove); // + (blkSize - 1);
-
- // Extend the Main reference to the left.
- int invAngleSum = 128; // rounding for (shift by 8)
- for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
- {
- invAngleSum += invAngle;
- refMain[k] = refSide[invAngleSum >> 8];
- }
- }
- else
- {
- refMain = modeVer ? refAbove : refLeft;
- refSide = modeVer ? refLeft : refAbove;
- }
-
- // bfilter will always be true for blocksize 8
- if (intraPredAngle == 0) // Exactly horizontal/vertical angles
- {
- if (modeHor)
- {
- Vec16uc v_temp;
- Vec16uc tmp1;
-
- v_temp.load(refMain + 1);
- Vec8s v_main;
- v_main = extend_low(v_temp);
-
- if (bFilter)
- {
- Vec8s v_side_0(refSide[0]); // refSide[0] value in a vector
- Vec16uc v_temp16;
- v_temp16.load(refSide + 1);
- Vec8s v_side;
- v_side = extend_low(v_temp16);
-
- Vec8s row0;
- row0 = permute8s<0, 0, 0, 0, 0, 0, 0, 0>(v_main);
- v_side -= v_side_0;
- v_side = v_side >> 1;
- row0 = row0 + v_side;
- row0 = min(max(0, row0), (1 << bitDepth) - 1);
-
- tmp1 = compress(row0, row0);
- store_partial(const_int(8), pDst, tmp1); //row0
- }
- else
- {
- tmp1 = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
- store_partial(const_int(8), pDst, tmp1); //row0
- }
- tmp1 = permute16uc<1, 1, 1, 1, 1, 1, 1, 1, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
- store_partial(const_int(8), pDst + (1 * dstStride), tmp1); //row1
-
- tmp1 = permute16uc<2, 2, 2, 2, 2, 2, 2, 2, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
- store_partial(const_int(8), pDst + (2 * dstStride), tmp1); //row2
-
- tmp1 = permute16uc<3, 3, 3, 3, 3, 3, 3, 3, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
- store_partial(const_int(8), pDst + (3 * dstStride), tmp1); //row3
-
- tmp1 = permute16uc<4, 4, 4, 4, 4, 4, 4, 4, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
- store_partial(const_int(8), pDst + (4 * dstStride), tmp1); //row4
-
- tmp1 = permute16uc<5, 5, 5, 5, 5, 5, 5, 5, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
- store_partial(const_int(8), pDst + (5 * dstStride), tmp1); //row5
-
- tmp1 = permute16uc<6, 6, 6, 6, 6, 6, 6, 6, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
- store_partial(const_int(8), pDst + (6 * dstStride), tmp1); //row6
-
- tmp1 = permute16uc<7, 7, 7, 7, 7, 7, 7, 7, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
- store_partial(const_int(8), pDst + (7 * dstStride), tmp1); //row7
- }
- else
- {
- Vec16uc v_main;
- v_main = load_partial(const_int(8), refMain + 1);
- store_partial(const_int(8), pDst, v_main);
- store_partial(const_int(8), pDst + dstStride, v_main);
- store_partial(const_int(8), pDst + (2 * dstStride), v_main);
- store_partial(const_int(8), pDst + (3 * dstStride), v_main);
- store_partial(const_int(8), pDst + (4 * dstStride), v_main);
- store_partial(const_int(8), pDst + (5 * dstStride), v_main);
- store_partial(const_int(8), pDst + (6 * dstStride), v_main);
- store_partial(const_int(8), pDst + (7 * dstStride), v_main);
-
- if (bFilter)
- {
- Vec16uc v_temp;
- Vec8s v_side_0(refSide[0]); // refSide[0] value in a vector
-
- v_temp.load(refSide + 1);
- Vec8s v_side;
- v_side = extend_low(v_temp);
-
- v_temp.load(refMain + 1);
- Vec8s row0;
- row0 = permute16uc<0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1>(v_temp);
- v_side -= v_side_0;
- v_side = v_side >> 1;
- row0 = row0 + v_side;
- row0 = min(max(0, row0), (1 << bitDepth) - 1);
-
- pDst[0 * dstStride] = row0[0];
- pDst[1 * dstStride] = row0[1];
- pDst[2 * dstStride] = row0[2];
- pDst[3 * dstStride] = row0[3];
- pDst[4 * dstStride] = row0[4];
- pDst[5 * dstStride] = row0[5];
- pDst[6 * dstStride] = row0[6];
- pDst[7 * dstStride] = row0[7];
- }
- }
- }
- else if (intraPredAngle == -32)
- {
- Vec16uc tmp;
- tmp = load_partial(const_int(8), refMain); //offsets -1..6
- store_partial(const_int(8), pDst, tmp);
- tmp = load_partial(const_int(8), refMain - 1); //offsets -2..5
- store_partial(const_int(8), pDst + dstStride, tmp);
- tmp = load_partial(const_int(8), refMain - 2);
- store_partial(const_int(8), pDst + 2 * dstStride, tmp);
- tmp = load_partial(const_int(8), refMain - 3);
- store_partial(const_int(8), pDst + 3 * dstStride, tmp);
- tmp = load_partial(const_int(8), refMain - 4);
- store_partial(const_int(8), pDst + 4 * dstStride, tmp);
- tmp = load_partial(const_int(8), refMain - 5);
- store_partial(const_int(8), pDst + 5 * dstStride, tmp);
- tmp = load_partial(const_int(8), refMain - 6);
- store_partial(const_int(8), pDst + 6 * dstStride, tmp);
- tmp = load_partial(const_int(8), refMain - 7);
- store_partial(const_int(8), pDst + 7 * dstStride, tmp);
- return;
- }
- else if (intraPredAngle == 32)
- {
- Vec16uc tmp;
- tmp = load_partial(const_int(8), refMain + 2); //offsets 1..8
- store_partial(const_int(8), pDst, tmp);
- tmp = load_partial(const_int(8), refMain + 3); //offsets 2..9
- store_partial(const_int(8), pDst + dstStride, tmp);
- tmp = load_partial(const_int(8), refMain + 4);
- store_partial(const_int(8), pDst + 2 * dstStride, tmp);
- tmp = load_partial(const_int(8), refMain + 5);
- store_partial(const_int(8), pDst + 3 * dstStride, tmp);
- tmp = load_partial(const_int(8), refMain + 6);
- store_partial(const_int(8), pDst + 4 * dstStride, tmp);
- tmp = load_partial(const_int(8), refMain + 7);
- store_partial(const_int(8), pDst + 5 * dstStride, tmp);
- tmp = load_partial(const_int(8), refMain + 8);
- store_partial(const_int(8), pDst + 6 * dstStride, tmp);
- tmp = load_partial(const_int(8), refMain + 9);
- store_partial(const_int(8), pDst + 7 * dstStride, tmp);
- return;
- }
- else
- {
- if (modeHor) // Near horizontal modes
- {
- Vec16uc tmp;
- Vec8s row11, row12;
- Vec16uc row1, row2, row3, row4, tmp16_1, tmp16_2;
- Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
- Vec8s tmp1, tmp2;
- v_deltaPos = 0;
- v_ipAngle = intraPredAngle;
- switch (intraPredAngle)
- {
- case -5:
- LOADROW(row11, -1);
- LOADROW(row12, 0);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- row1 = compress(tmp1, tmp2);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- row2 = compress(tmp1, tmp2);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- row3 = compress(tmp1, tmp2);
- row12 = row11;
- LOADROW(row11, -2);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- row4 = compress(tmp1, tmp2);
- break;
-
- case -2:
- LOADROW(row11, -1);
- LOADROW(row12, 0);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- row1 = compress(tmp1, tmp2);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- row2 = compress(tmp1, tmp2);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- row3 = compress(tmp1, tmp2);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- row4 = compress(tmp1, tmp2);
- break;
-
- case 2:
- LOADROW(row11, 0);
- LOADROW(row12, 1);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- row1 = compress(tmp1, tmp2);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- row2 = compress(tmp1, tmp2);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- row3 = compress(tmp1, tmp2);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- row4 = compress(tmp1, tmp2);
- break;
-
- case 5:
- LOADROW(row11, 0);
- LOADROW(row12, 1);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- row1 = compress(tmp1, tmp2);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- row2 = compress(tmp1, tmp2);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- row3 = compress(tmp1, tmp2);
- row11 = row12;
- LOADROW(row12, 2);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- row4 = compress(tmp1, tmp2);
- break;
-
- default: // these cases use the lookup table to identify access patterns
-
- PREDANG_CALCROW_HOR(0, tmp1);
- PREDANG_CALCROW_HOR(1, tmp2);
- row1 = compress(tmp1, tmp2);
- PREDANG_CALCROW_HOR(2, tmp1);
- PREDANG_CALCROW_HOR(3, tmp2);
- row2 = compress(tmp1, tmp2);
- PREDANG_CALCROW_HOR(4, tmp1);
- PREDANG_CALCROW_HOR(5, tmp2);
- row3 = compress(tmp1, tmp2);
- PREDANG_CALCROW_HOR(6, tmp1);
- PREDANG_CALCROW_HOR(7, tmp2);
- row4 = compress(tmp1, tmp2);
- }
-
- // Flip the block
- tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row1, row2);
- tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row1, row2);
- row1 = tmp16_1;
- row2 = tmp16_2;
-
- tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row3, row4);
- tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row3, row4);
- row3 = tmp16_1;
- row4 = tmp16_2;
-
- tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row1, row2);
- tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row1, row2);
- row1 = tmp16_1;
- row2 = tmp16_2;
-
- tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row3, row4);
- tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row3, row4);
- row3 = tmp16_1;
- row4 = tmp16_2;
-
- tmp16_1 = blend4i<0, 4, 1, 5>((Vec4i)row1, (Vec4i)row3);
- tmp16_2 = blend4i<2, 6, 3, 7>((Vec4i)row1, (Vec4i)row3);
- row1 = tmp16_1;
- row3 = tmp16_2;
-
- tmp16_1 = blend4i<0, 4, 1, 5>((Vec4i)row2, (Vec4i)row4);
- tmp16_2 = blend4i<2, 6, 3, 7>((Vec4i)row2, (Vec4i)row4);
- row2 = tmp16_1;
- row4 = tmp16_2;
-
- store_partial(const_int(8), pDst, row1); //row1
- store_partial(const_int(8), pDst + (2 * dstStride), row3); //row3
- store_partial(const_int(8), pDst + (4 * dstStride), row2); //row5
- store_partial(const_int(8), pDst + (6 * dstStride), row4); //row7
-
- row1 = blend2q<1, 3>((Vec2q)row1, (Vec2q)row1);
- store_partial(const_int(8), pDst + (1 * dstStride), row1); //row2
-
- row1 = blend2q<1, 3>((Vec2q)row3, (Vec2q)row3);
- store_partial(const_int(8), pDst + (3 * dstStride), row1); //row4
-
- row1 = blend2q<1, 3>((Vec2q)row2, (Vec2q)row2);
- store_partial(const_int(8), pDst + (5 * dstStride), row1); //row6
-
- row1 = blend2q<1, 3>((Vec2q)row4, (Vec2q)row4);
- store_partial(const_int(8), pDst + (7 * dstStride), row1); //row8
- }
- else // Vertical modes
- {
- Vec8s row11, row12;
- Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
- Vec16uc tmp;
- Vec8s tmp1, tmp2;
- v_deltaPos = 0;
- v_ipAngle = intraPredAngle;
- switch (intraPredAngle)
- {
- case -5:
- LOADROW(row11, -1);
- LOADROW(row12, 0);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- store_partial(const_int(8), pDst, compress(tmp1, tmp1));
- store_partial(const_int(8), pDst + (dstStride), compress(tmp2, tmp2));
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- store_partial(const_int(8), pDst + (2 * dstStride), compress(tmp1, tmp1));
- store_partial(const_int(8), pDst + (3 * dstStride), compress(tmp2, tmp2));
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- store_partial(const_int(8), pDst + (4 * dstStride), compress(tmp1, tmp1));
- store_partial(const_int(8), pDst + (5 * dstStride), compress(tmp2, tmp2));
- row12 = row11;
- LOADROW(row11, -2);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- store_partial(const_int(8), pDst + (6 * dstStride), compress(tmp1, tmp1));
- store_partial(const_int(8), pDst + (7 * dstStride), compress(tmp2, tmp2));
- break;
-
- case -2:
- LOADROW(row11, -1);
- LOADROW(row12, 0);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- store_partial(const_int(8), pDst, compress(tmp1, tmp1));
- store_partial(const_int(8), pDst + (dstStride), compress(tmp2, tmp2));
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- store_partial(const_int(8), pDst + (2 * dstStride), compress(tmp1, tmp1));
- store_partial(const_int(8), pDst + (3 * dstStride), compress(tmp2, tmp2));
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- store_partial(const_int(8), pDst + (4 * dstStride), compress(tmp1, tmp1));
- store_partial(const_int(8), pDst + (5 * dstStride), compress(tmp2, tmp2));
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- store_partial(const_int(8), pDst + (6 * dstStride), compress(tmp1, tmp1));
- store_partial(const_int(8), pDst + (7 * dstStride), compress(tmp2, tmp2));
- break;
-
- case 2:
- LOADROW(row11, 0);
- LOADROW(row12, 1);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- store_partial(const_int(8), pDst, compress(tmp1, tmp1));
- store_partial(const_int(8), pDst + dstStride, compress(tmp2, tmp2));
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- store_partial(const_int(8), pDst + (2 * dstStride), compress(tmp1, tmp1));
- store_partial(const_int(8), pDst + (3 * dstStride), compress(tmp2, tmp2));
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- store_partial(const_int(8), pDst + (4 * dstStride), compress(tmp1, tmp1));
- store_partial(const_int(8), pDst + (5 * dstStride), compress(tmp2, tmp2));
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- store_partial(const_int(8), pDst + (6 * dstStride), compress(tmp1, tmp1));
- store_partial(const_int(8), pDst + (7 * dstStride), compress(tmp2, tmp2));
- break;
-
- case 5:
- LOADROW(row11, 0);
- LOADROW(row12, 1);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- store_partial(const_int(8), pDst, compress(tmp1, tmp1));
- store_partial(const_int(8), pDst + dstStride, compress(tmp2, tmp2));
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- store_partial(const_int(8), pDst + (2 * dstStride), compress(tmp1, tmp1));
- store_partial(const_int(8), pDst + (3 * dstStride), compress(tmp2, tmp2));
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- store_partial(const_int(8), pDst + (4 * dstStride), compress(tmp1, tmp1));
- store_partial(const_int(8), pDst + (5 * dstStride), compress(tmp2, tmp2));
- row11 = row12;
- LOADROW(row12, 2);
- CALCROW(tmp1, row11, row12);
- CALCROW(tmp2, row11, row12);
- store_partial(const_int(8), pDst + (6 * dstStride), compress(tmp1, tmp1));
- store_partial(const_int(8), pDst + (7 * dstStride), compress(tmp2, tmp2));
- break;
-
- default: // these cases use the lookup table to identify access patterns
- PREDANG_CALCROW_VER(0);
- PREDANG_CALCROW_VER(1);
- PREDANG_CALCROW_VER(2);
- PREDANG_CALCROW_VER(3);
- PREDANG_CALCROW_VER(4);
- PREDANG_CALCROW_VER(5);
- PREDANG_CALCROW_VER(6);
- PREDANG_CALCROW_VER(7);
- }
- }
- }
-}
-
-#undef PREDANG_CALCROW_VER
-#undef PREDANG_CALCROW_HOR
-#undef LOADROW
-#undef CALCROW
-#endif /* if HIGH_BIT_DEPTH */
-
-//16x16
-#if HIGH_BIT_DEPTH
-#else
-#define PREDANG_CALCROW_VER(X) { \
- LOADROW(row11L, row11H, GETAP(lookIdx, X)); \
- LOADROW(row12L, row12H, GETAP(lookIdx, X) + 1); \
- CALCROW(row11L, row11H, row11L, row11H, row12L, row12H); \
- /*compress(row11L, row11H).store(pDst + ((X)*dstStride));*/ \
- itmp = _mm_packus_epi16(row11L, row11H); \
- _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride)), itmp); \
-}
-
-#define PREDANG_CALCROW_HOR(X, rowx) { \
- LOADROW(row11L, row11H, GETAP(lookIdx, (X))); \
- LOADROW(row12L, row12H, GETAP(lookIdx, (X)) + 1); \
- CALCROW(row11L, row11H, row11L, row11H, row12L, row12H); \
- /*rowx = compress(row11L, row11H);*/ \
- rowx = _mm_packus_epi16(row11L, row11H); \
-}
-
- // ROWL/ROWH are the __m128i halves of a row, X is the index of the data to be loaded
-#define LOADROW(ROWL, ROWH, X) { \
- /*tmp.load(refMain + 1 + (X)); */ \
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (X))); \
- /* ROWL = extend_low(tmp);*/ \
- ROWL = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
- /*ROWH = extend_high(tmp);*/ \
- ROWH = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-}
-
-#define CALCROW(RESL, RESH, ROW1L, ROW1H, ROW2L, ROW2H) { \
- /*v_deltaPos += v_ipAngle; \
- v_deltaFract = v_deltaPos & thirty1;*/ \
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
- /*RESL = ((thirty2 - v_deltaFract) * ROW1L + (v_deltaFract * ROW2L) + 16) >> 5; \
- RESH = ((thirty2 - v_deltaFract) * ROW1H + (v_deltaFract * ROW2H) + 16) >> 5;*/ \
- it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
- it2 = _mm_mullo_epi16(it1, ROW1L); \
- it3 = _mm_mullo_epi16(v_deltaFract, ROW2L); \
- it2 = _mm_add_epi16(it2, it3); \
- i16 = _mm_set1_epi16(16); \
- it2 = _mm_add_epi16(it2, i16); \
- RESL = _mm_srai_epi16(it2, 5); \
- \
- it2 = _mm_mullo_epi16(it1, ROW1H); \
- it3 = _mm_mullo_epi16(v_deltaFract, ROW2H); \
- it2 = _mm_add_epi16(it2, it3); \
- it2 = _mm_add_epi16(it2, i16); \
- RESH = _mm_srai_epi16(it2, 5); \
-}
-
-#define BLND2_16(R1, R2) { \
- /*tmp1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(R1, R2); */ \
- itmp1 = _mm_unpacklo_epi8(R1, R2); \
- /*tmp2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(R1, R2);*/ \
- itmp2 = _mm_unpackhi_epi8(R1, R2); \
- R1 = itmp1; \
- R2 = itmp2; \
-}
-
-#define MB4(R1, R2, R3, R4) { \
- BLND2_16(R1, R2) \
- BLND2_16(R3, R4) \
- /*tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>((Vec8s)R1, (Vec8s)R3);*/ \
- itmp1 = _mm_unpacklo_epi16(R1, R3); \
- /* tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>((Vec8s)R1, (Vec8s)R3);*/ \
- itmp2 = _mm_unpackhi_epi16(R1, R3); \
- R1 = itmp1; \
- R3 = itmp2; \
- /*R1 = tmp1; \
- R3 = tmp2;*/ \
- /*tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>((Vec8s)R2, (Vec8s)R4); \
- tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>((Vec8s)R2, (Vec8s)R4);*/ \
- itmp1 = _mm_unpacklo_epi16(R2, R4); \
- itmp2 = _mm_unpackhi_epi16(R2, R4); \
- R2 = itmp1; \
- R4 = itmp2; \
- /*R2 = tmp1; \
- R4 = tmp2;*/ \
-}
-
-#define BLND2_4(R1, R2) { \
- /* tmp1 = blend4i<0, 4, 1, 5>((Vec4i)R1, (Vec4i)R2); \
- tmp2 = blend4i<2, 6, 3, 7>((Vec4i)R1, (Vec4i)R2); */ \
- itmp1 = _mm_unpacklo_epi32(R1, R2); \
- itmp2 = _mm_unpackhi_epi32(R1, R2); \
- R1 = itmp1; \
- R2 = itmp2; \
- /*R1 = tmp1; \
- R2 = tmp2; */\
-}
-
-#define BLND2_2(R1, R2) { \
- /*tmp1 = blend2q<0, 2>((Vec2q)R1, (Vec2q)R2); \
- tmp2 = blend2q<1, 3>((Vec2q)R1, (Vec2q)R2);*/ \
- itmp1 = _mm_unpacklo_epi64(R1, R2); \
- itmp2 = _mm_unpackhi_epi64(R1, R2); \
- /*tmp1.store(pDst); */ \
- _mm_storeu_si128((__m128i*)pDst, itmp1); \
- pDst += dstStride; \
- /*tmp2.store(pDst);*/ \
- _mm_storeu_si128((__m128i*)pDst, itmp2); \
- pDst += dstStride; \
-}
-
-#define CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, X) { \
- PREDANG_CALCROW_HOR(0 + X, R1) \
- PREDANG_CALCROW_HOR(1 + X, R2) \
- PREDANG_CALCROW_HOR(2 + X, R3) \
- PREDANG_CALCROW_HOR(3 + X, R4) \
- PREDANG_CALCROW_HOR(4 + X, R5) \
- PREDANG_CALCROW_HOR(5 + X, R6) \
- PREDANG_CALCROW_HOR(6 + X, R7) \
- PREDANG_CALCROW_HOR(7 + X, R8) \
- MB4(R1, R2, R3, R4) \
- MB4(R5, R6, R7, R8) \
- BLND2_4(R1, R5); \
- BLND2_4(R2, R6); \
- BLND2_4(R3, R7); \
- BLND2_4(R4, R8); \
-}
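-
-// CALC_BLND_8ROWS computes eight horizontal-mode rows, and the MB4/BLND2_4
-// unpack steps (with the final BLND2_2 qword unpack) in effect transpose
-// the 16x16 byte tile so it can be stored in raster order, avoiding the
-// separate flip pass that the reference C path performs for modes 2..17.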
-
-void xPredIntraAng16x16(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove, bool bFilter = true)
-{
- int k;
- int blkSize = width;
-
- // Map the mode index to main prediction direction and angle
-    assert(dirMode > 1); // no planar or DC modes
- bool modeHor = (dirMode < 18);
- bool modeVer = !modeHor;
- int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
- int lookIdx = intraPredAngle;
- int absAng = abs(intraPredAngle);
- int signAng = intraPredAngle < 0 ? -1 : 1;
-
- // Set bitshifts and scale the angle parameter to block size
- int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
- int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
- int invAngle = invAngTable[absAng];
- absAng = angTable[absAng];
- intraPredAngle = signAng * absAng;
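-
-    // Worked example: for mode-relative angle index 1 the angle is 2 and
-    // the inverse is (256 * 32) / 2 = 4096, matching the tables above.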
-
- // Do angular predictions
-
- pixel* refMain;
- pixel* refSide;
-
-    // Initialise the Main and Side reference arrays.
- if (intraPredAngle < 0)
- {
- refMain = (modeVer ? refAbove : refLeft); // + (blkSize - 1);
- refSide = (modeVer ? refLeft : refAbove); // + (blkSize - 1);
-
- // Extend the Main reference to the left.
- int invAngleSum = 128; // rounding for (shift by 8)
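-        // The loop below projects each negative-index sample from the side
-        // reference: invAngleSum accumulates invAngle, a Q8 fixed-point
-        // reciprocal of the angle, so refSide[invAngleSum >> 8] walks the
-        // side array at the inverse slope. E.g. with intraPredAngle == -26,
-        // invAngle == 315 and k == -1 reads refSide[(128 + 315) >> 8],
-        // i.e. refSide[1].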
- if (intraPredAngle != -32)
- for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
- {
- invAngleSum += invAngle;
- refMain[k] = refSide[invAngleSum >> 8];
- }
- }
- else
- {
- refMain = modeVer ? refAbove : refLeft;
- refSide = modeVer ? refLeft : refAbove;
- }
-
-    // bFilter is only consulted on the exactly horizontal/vertical path below
-    if (intraPredAngle == 0) // Exactly horizontal/vertical angles
- {
- if (modeHor)
- {
- Vec16uc v_temp;
- Vec16uc tmp1;
- v_temp.load(refMain + 1);
-
- if (bFilter)
- {
- Vec8s v_side_0(refSide[0]); // refSide[0] value in a vector
- Vec16uc v_temp16;
- v_temp16.load(refSide + 1);
- Vec8s v_side;
- v_side = extend_low(v_temp16);
-
- Vec8s row01, row02, ref(refMain[1]);
- v_side -= v_side_0;
- v_side = v_side >> 1;
- row01 = ref + v_side;
- row01 = min(max(0, row01), (1 << bitDepth) - 1);
-
- v_side = extend_high(v_temp16);
- v_side -= v_side_0;
- v_side = v_side >> 1;
- row02 = ref + v_side;
- row02 = min(max(0, row02), (1 << bitDepth) - 1);
-
- tmp1 = compress_unsafe(row01, row02);
- tmp1.store(pDst); //row0
- }
- else
- {
- tmp1 = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(v_temp);
- tmp1.store(pDst); //row0
- }
-
- tmp1 = permute16uc<1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>(v_temp);
- tmp1.store(pDst + (1 * dstStride)); //row1
-
- tmp1 = permute16uc<2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2>(v_temp);
- tmp1.store(pDst + (2 * dstStride)); //row2
-
- tmp1 = permute16uc<3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3>(v_temp);
- tmp1.store(pDst + (3 * dstStride)); //row3
-
- tmp1 = permute16uc<4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4>(v_temp);
- tmp1.store(pDst + (4 * dstStride)); //row4
-
- tmp1 = permute16uc<5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5>(v_temp);
- tmp1.store(pDst + (5 * dstStride)); //row5
-
- tmp1 = permute16uc<6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6>(v_temp);
- tmp1.store(pDst + (6 * dstStride)); //row6
-
- tmp1 = permute16uc<7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7>(v_temp);
- tmp1.store(pDst + (7 * dstStride)); //row7
-
- tmp1 = permute16uc<8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8>(v_temp);
- tmp1.store(pDst + (8 * dstStride)); //row8
-
- tmp1 = permute16uc<9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9>(v_temp);
- tmp1.store(pDst + (9 * dstStride)); //row9
-
- tmp1 = permute16uc<10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10>(v_temp);
- tmp1.store(pDst + (10 * dstStride)); //row10
-
- tmp1 = permute16uc<11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11>(v_temp);
- tmp1.store(pDst + (11 * dstStride)); //row11
-
- tmp1 = permute16uc<12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12>(v_temp);
- tmp1.store(pDst + (12 * dstStride)); //row12
-
- tmp1 = permute16uc<13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13>(v_temp);
- tmp1.store(pDst + (13 * dstStride)); //row13
-
- tmp1 = permute16uc<14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14>(v_temp);
- tmp1.store(pDst + (14 * dstStride)); //row14
-
- tmp1 = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>(v_temp);
- tmp1.store(pDst + (15 * dstStride)); //row15
- }
- else
- {
- Vec16uc v_main;
-// v_main.load(refMain + 1);
- v_main = _mm_loadu_si128((__m128i const*)(refMain + 1));
-
- _mm_storeu_si128((__m128i*)pDst, v_main);
- _mm_storeu_si128((__m128i*)(pDst + dstStride), v_main);
- _mm_storeu_si128((__m128i*)(pDst + (2 * dstStride)), v_main);
- _mm_storeu_si128((__m128i*)(pDst + (3 * dstStride)), v_main);
- _mm_storeu_si128((__m128i*)(pDst + (4 * dstStride)), v_main);
- _mm_storeu_si128((__m128i*)(pDst + (5 * dstStride)), v_main);
- _mm_storeu_si128((__m128i*)(pDst + (6 * dstStride)), v_main);
- _mm_storeu_si128((__m128i*)(pDst + (7 * dstStride)), v_main);
- _mm_storeu_si128((__m128i*)(pDst + (8 * dstStride)), v_main);
- _mm_storeu_si128((__m128i*)(pDst + (9 * dstStride)), v_main);
- _mm_storeu_si128((__m128i*)(pDst + (10 * dstStride)), v_main);
- _mm_storeu_si128((__m128i*)(pDst + (11 * dstStride)), v_main);
- _mm_storeu_si128((__m128i*)(pDst + (12 * dstStride)), v_main);
- _mm_storeu_si128((__m128i*)(pDst + (13 * dstStride)), v_main);
- _mm_storeu_si128((__m128i*)(pDst + (14 * dstStride)), v_main);
- _mm_storeu_si128((__m128i*)(pDst + (15 * dstStride)), v_main);
-
- if (bFilter)
- {
- Vec16uc v_temp;
- Vec8s v_side_0(refSide[0]); // refSide[0] value in a vector
-
- v_temp.load(refSide + 1);
- Vec8s v_side;
- v_side = extend_low(v_temp);
-
- Vec8s row0, ref(refMain[1]);
- v_side -= v_side_0;
- v_side = v_side >> 1;
- row0 = ref + v_side;
- row0 = min(max(0, row0), (1 << bitDepth) - 1);
-
- pDst[0 * dstStride] = row0[0];
- pDst[1 * dstStride] = row0[1];
- pDst[2 * dstStride] = row0[2];
- pDst[3 * dstStride] = row0[3];
- pDst[4 * dstStride] = row0[4];
- pDst[5 * dstStride] = row0[5];
- pDst[6 * dstStride] = row0[6];
- pDst[7 * dstStride] = row0[7];
-
- v_side = extend_high(v_temp);
- v_side -= v_side_0;
- v_side = v_side >> 1;
- row0 = ref + v_side;
- row0 = min(max(0, row0), (1 << bitDepth) - 1);
- pDst[8 * dstStride] = row0[0];
- pDst[9 * dstStride] = row0[1];
- pDst[10 * dstStride] = row0[2];
- pDst[11 * dstStride] = row0[3];
- pDst[12 * dstStride] = row0[4];
- pDst[13 * dstStride] = row0[5];
- pDst[14 * dstStride] = row0[6];
- pDst[15 * dstStride] = row0[7];
- }
- }
- }
- else if (intraPredAngle == -32)
- {
- Vec16uc v_refSide;
- v_refSide.load(refSide);
- v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
- pixel refMain0 = refMain[0];
-
- v_refSide.store(refMain - 15);
- refMain[0] = refMain0;
-
- Vec16uc tmp;
- __m128i itmp;
-// tmp.load(refMain); //-1,0,1,2
-// tmp.store(pDst);
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
-
- itmp = _mm_loadu_si128((__m128i const*)--refMain);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)--refMain);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)--refMain);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)--refMain);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)--refMain);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)--refMain);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)--refMain);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)--refMain);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)--refMain);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)--refMain);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)--refMain);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)--refMain);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)--refMain);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)--refMain);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)--refMain);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
-
-/*
- tmp.load(--refMain);
- pDst += dstStride;
- tmp.store(pDst);
- ... 14 times more
-*/
- return;
- }
- else if (intraPredAngle == 32)
- {
- Vec8s tmp;
- __m128i itmp;
- refMain += 2;
-
-// tmp.load(refMain++);
-// tmp.store(pDst);
-
- itmp = _mm_loadu_si128((__m128i const*)refMain++);
- _mm_storeu_si128((__m128i*)pDst, itmp);
-
-/*
- tmp.load(refMain++);
- pDst += dstStride;
- tmp.store(pDst);
- ... 14 times more
-*/
- itmp = _mm_loadu_si128((__m128i const*)refMain++);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)refMain++);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)refMain++);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)refMain++);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)refMain++);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)refMain++);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)refMain++);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)refMain++);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)refMain++);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)refMain++);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)refMain++);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)refMain++);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)refMain++);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)refMain++);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- itmp = _mm_loadu_si128((__m128i const*)refMain++);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)pDst, itmp);
-
- return;
- }
- else
- {
- if (modeHor)
- {
- Vec8s row11L, row12L, row11H, row12H;
- Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
- Vec16uc tmp;
- Vec16uc R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16;
- Vec16uc tmp1, tmp2;
- v_deltaPos = 0;
- v_ipAngle = intraPredAngle;
- __m128i itmp, itmp1, itmp2, it1, it2, it3, i16;
-// MB16;
- CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
- CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
- }
- else
- {
- Vec8s row11L, row12L, row11H, row12H;
- Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
- Vec16uc tmp;
- Vec8s tmp1, tmp2;
- v_deltaPos = 0;
- v_ipAngle = intraPredAngle;
- __m128i itmp, it1, it2, it3, i16;
-
- PREDANG_CALCROW_VER(0);
- PREDANG_CALCROW_VER(1);
- PREDANG_CALCROW_VER(2);
- PREDANG_CALCROW_VER(3);
- PREDANG_CALCROW_VER(4);
- PREDANG_CALCROW_VER(5);
- PREDANG_CALCROW_VER(6);
- PREDANG_CALCROW_VER(7);
- PREDANG_CALCROW_VER(8);
- PREDANG_CALCROW_VER(9);
- PREDANG_CALCROW_VER(10);
- PREDANG_CALCROW_VER(11);
- PREDANG_CALCROW_VER(12);
- PREDANG_CALCROW_VER(13);
- PREDANG_CALCROW_VER(14);
- PREDANG_CALCROW_VER(15);
- }
- }
-}
-
-#undef PREDANG_CALCROW_VER
-#undef PREDANG_CALCROW_HOR
-#undef LOADROW
-#undef CALCROW
-#undef BLND2_16
-#undef BLND2_2
-#undef BLND2_4
-#undef MB4
-#undef CALC_BLND_8ROWS
-#endif /* if HIGH_BIT_DEPTH */
-
-// 32x32
-#if HIGH_BIT_DEPTH
-#else
-#define PREDANG_CALCROW_VER(X) { \
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)]))); \
- row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
- row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
- \
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 1))); \
- row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
- row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
- \
- it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
- it2 = _mm_mullo_epi16(it1, row11L); \
- it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
- it2 = _mm_add_epi16(it2, it3); \
- i16 = _mm_set1_epi16(16); \
- it2 = _mm_add_epi16(it2, i16); \
- row11L = _mm_srai_epi16(it2, 5); \
- it2 = _mm_mullo_epi16(it1, row11H); \
- it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
- it2 = _mm_add_epi16(it2, it3); \
- it2 = _mm_add_epi16(it2, i16); \
- row11H = _mm_srai_epi16(it2, 5); \
- \
- itmp = _mm_packus_epi16(row11L, row11H); \
- _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride)), itmp); \
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 16))); \
- row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
- row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
- \
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 17))); \
- row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
- row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
- \
- it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
- it2 = _mm_mullo_epi16(it1, row11L); \
- it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
- it2 = _mm_add_epi16(it2, it3); \
- i16 = _mm_set1_epi16(16); \
- it2 = _mm_add_epi16(it2, i16); \
- row11L = _mm_srai_epi16(it2, 5); \
- it2 = _mm_mullo_epi16(it1, row11H); \
- it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
- it2 = _mm_add_epi16(it2, it3); \
- it2 = _mm_add_epi16(it2, i16); \
- row11H = _mm_srai_epi16(it2, 5); \
- \
- itmp = _mm_packus_epi16(row11L, row11H); \
- _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride) + 16), itmp); \
-}
-
-#define PREDANG_CALCROW_VER_MODE2(X) { \
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
- it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
- it2 = _mm_mullo_epi16(it1, row11); \
- it3 = _mm_mullo_epi16(v_deltaFract, row21); \
- it2 = _mm_add_epi16(it2, it3); \
- i16 = _mm_set1_epi16(16); \
- it2 = _mm_add_epi16(it2, i16); \
- res1 = _mm_srai_epi16(it2, 5); \
- it2 = _mm_mullo_epi16(it1, row12); \
- it3 = _mm_mullo_epi16(v_deltaFract, row22); \
- it2 = _mm_add_epi16(it2, it3); \
- it2 = _mm_add_epi16(it2, i16); \
- res2 = _mm_srai_epi16(it2, 5); \
- \
- itmp = _mm_packus_epi16(res1, res2); \
- _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride)), itmp); \
- it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
- it2 = _mm_mullo_epi16(it1, row13); \
- it3 = _mm_mullo_epi16(v_deltaFract, row23); \
- it2 = _mm_add_epi16(it2, it3); \
- i16 = _mm_set1_epi16(16); \
- it2 = _mm_add_epi16(it2, i16); \
- res1 = _mm_srai_epi16(it2, 5); \
- it2 = _mm_mullo_epi16(it1, row14); \
- it3 = _mm_mullo_epi16(v_deltaFract, row24); \
- it2 = _mm_add_epi16(it2, it3); \
- it2 = _mm_add_epi16(it2, i16); \
- res2 = _mm_srai_epi16(it2, 5); \
- \
- itmp = _mm_packus_epi16(res1, res2); \
- _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride) + 16), itmp); \
-}
-
-#define PREDANG_CALCROW_HOR(X, rowx) { \
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][((X))]))); \
- row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
- row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
- \
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][((X))] + 1))); \
- row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
- row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
- \
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
- it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
- it2 = _mm_mullo_epi16(it1, row11L); \
- it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
- it2 = _mm_add_epi16(it2, it3); \
- i16 = _mm_set1_epi16(16); \
- it2 = _mm_add_epi16(it2, i16); \
- row11L = _mm_srai_epi16(it2, 5); \
- it2 = _mm_mullo_epi16(it1, row11H); \
- it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
- it2 = _mm_add_epi16(it2, it3); \
- it2 = _mm_add_epi16(it2, i16); \
- row11H = _mm_srai_epi16(it2, 5); \
- \
- rowx = _mm_packus_epi16(row11L, row11H); \
-}
-
-#define PREDANG_CALCROW_HOR_MODE2(rowx) { \
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
- it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
- it2 = _mm_mullo_epi16(it1, row11L); \
- it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
- it2 = _mm_add_epi16(it2, it3); \
- i16 = _mm_set1_epi16(16); \
- it2 = _mm_add_epi16(it2, i16); \
- res1 = _mm_srai_epi16(it2, 5); \
- it2 = _mm_mullo_epi16(it1, row11H); \
- it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
- it2 = _mm_add_epi16(it2, it3); \
- it2 = _mm_add_epi16(it2, i16); \
- res2 = _mm_srai_epi16(it2, 5); \
- \
- rowx = _mm_packus_epi16(res1, res2); \
-}
-
-// ROWL/H receive the zero-extended low/high halves of the load; X is the index of the data to be loaded
-#define LOADROW(ROWL, ROWH, X) { \
-/* tmp.load(refMain + 1 + (X)); \
- ROWL = extend_low(tmp); \
- ROWH = extend_high(tmp); */\
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (X))); \
- ROWL = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
- ROWH = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-}
-
-#define BLND2_2(R1, R2) { \
-/* tmp1 = blend2q<0, 2>((Vec2q)R1, (Vec2q)R2); \
- tmp2 = blend2q<1, 3>((Vec2q)R1, (Vec2q)R2); \
- tmp1.store(pDst); pDst += dstStride; \
- tmp2.store(pDst); pDst += dstStride; */\
- itmp1 = _mm_unpacklo_epi64(R1, R2); \
- itmp2 = _mm_unpackhi_epi64(R1, R2); \
- _mm_storeu_si128((__m128i*)pDst, itmp1); \
- pDst += dstStride; \
- _mm_storeu_si128((__m128i*)pDst, itmp2); \
- pDst += dstStride; \
-}
-
-#define MB8(R1, R2, R3, R4, R5, R6, R7, R8) { \
- itmp1 = _mm_unpacklo_epi8(R1, R2); \
- itmp2 = _mm_unpackhi_epi8(R1, R2); \
- R1 = itmp1; \
- R2 = itmp2; \
- itmp1 = _mm_unpacklo_epi8(R3, R4); \
- itmp2 = _mm_unpackhi_epi8(R3, R4); \
- R3 = itmp1; \
- R4 = itmp2; \
- itmp1 = _mm_unpacklo_epi16(R1, R3); \
- itmp2 = _mm_unpackhi_epi16(R1, R3); \
- R1 = itmp1; \
- R3 = itmp2; \
- itmp1 = _mm_unpacklo_epi16(R2, R4); \
- itmp2 = _mm_unpackhi_epi16(R2, R4); \
- R2 = itmp1; \
- R4 = itmp2; \
- itmp1 = _mm_unpacklo_epi8(R5, R6); \
- itmp2 = _mm_unpackhi_epi8(R5, R6); \
- R5 = itmp1; \
- R6 = itmp2; \
- itmp1 = _mm_unpacklo_epi8(R7, R8); \
- itmp2 = _mm_unpackhi_epi8(R7, R8); \
- R7 = itmp1; \
- R8 = itmp2; \
- itmp1 = _mm_unpacklo_epi16(R5, R7); \
- itmp2 = _mm_unpackhi_epi16(R5, R7); \
- R5 = itmp1; \
- R7 = itmp2; \
- itmp1 = _mm_unpacklo_epi16(R6, R8); \
- itmp2 = _mm_unpackhi_epi16(R6, R8); \
- R6 = itmp1; \
- R8 = itmp2; \
- itmp1 = _mm_unpacklo_epi32(R1, R5); \
- itmp2 = _mm_unpackhi_epi32(R1, R5); \
- R1 = itmp1; \
- R5 = itmp2; \
- \
- itmp1 = _mm_unpacklo_epi32(R2, R6); \
- itmp2 = _mm_unpackhi_epi32(R2, R6); \
- R2 = itmp1; \
- R6 = itmp2; \
- \
- itmp1 = _mm_unpacklo_epi32(R3, R7); \
- itmp2 = _mm_unpackhi_epi32(R3, R7); \
- R3 = itmp1; \
- R7 = itmp2; \
- \
- itmp1 = _mm_unpacklo_epi32(R4, R8); \
- itmp2 = _mm_unpackhi_epi32(R4, R8); \
- R4 = itmp1; \
- R8 = itmp2; \
-}
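-
-// MB8 is an interleave network (unpack bytes, then words, then dwords over
-// eight registers); combined with the final qword unpack in BLND2_2 it in
-// effect transposes a 16x16 byte tile so the horizontal-mode rows land in
-// raster order.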
-
-#define CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, X) { \
- PREDANG_CALCROW_HOR(0 + X, R1) \
- PREDANG_CALCROW_HOR(1 + X, R2) \
- PREDANG_CALCROW_HOR(2 + X, R3) \
- PREDANG_CALCROW_HOR(3 + X, R4) \
- PREDANG_CALCROW_HOR(4 + X, R5) \
- PREDANG_CALCROW_HOR(5 + X, R6) \
- PREDANG_CALCROW_HOR(6 + X, R7) \
-}
-
-#define CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8) { \
- PREDANG_CALCROW_HOR_MODE2(R1) \
- PREDANG_CALCROW_HOR_MODE2(R2) \
- PREDANG_CALCROW_HOR_MODE2(R3) \
- PREDANG_CALCROW_HOR_MODE2(R4) \
- PREDANG_CALCROW_HOR_MODE2(R5) \
- PREDANG_CALCROW_HOR_MODE2(R6) \
- PREDANG_CALCROW_HOR_MODE2(R7) \
-}
-
-void xPredIntraAng32x32(int /*bitDepth*/, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove)
-{
- int k;
- int blkSize = width;
-
- // Map the mode index to main prediction direction and angle
-    assert(dirMode > 1); // no planar or DC modes
- bool modeHor = (dirMode < 18);
- bool modeVer = !modeHor;
- int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
- int lookIdx = intraPredAngle;
- int absAng = abs(intraPredAngle);
- int signAng = intraPredAngle < 0 ? -1 : 1;
-
- // Set bitshifts and scale the angle parameter to block size
- int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
- int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
- int invAngle = invAngTable[absAng];
- absAng = angTable[absAng];
- intraPredAngle = signAng * absAng;
-
- // Do angular predictions
-
- pixel* refMain;
- pixel* refSide;
-
-    // Initialise the Main and Side reference arrays.
- if (intraPredAngle < 0)
- {
- refMain = (modeVer ? refAbove : refLeft); // + (blkSize - 1);
- refSide = (modeVer ? refLeft : refAbove); // + (blkSize - 1);
-
- // Extend the Main reference to the left.
- int invAngleSum = 128; // rounding for (shift by 8)
- if (intraPredAngle != -32)
- for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
- {
- invAngleSum += invAngle;
- refMain[k] = refSide[invAngleSum >> 8];
- }
- }
- else
- {
- refMain = modeVer ? refAbove : refLeft;
- refSide = modeVer ? refLeft : refAbove;
- }
-
-    if (intraPredAngle == 0) // Exactly horizontal/vertical angles
- {
- if (modeHor)
- {
- Vec16uc v_temp, tmp1;
-
- v_temp.load(refMain + 1);
- /*BROADSTORE16ROWS;*/
- tmp1 = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(v_temp);
- tmp1.store(pDst + (0 * dstStride));
- tmp1.store(pDst + (0 * dstStride) + 16);
- tmp1 = permute16uc<1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>(v_temp);
- tmp1.store(pDst + (1 * dstStride));
- tmp1.store(pDst + (1 * dstStride) + 16);
- tmp1 = permute16uc<2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2>(v_temp);
- tmp1.store(pDst + (2 * dstStride));
- tmp1.store(pDst + (2 * dstStride) + 16);
- tmp1 = permute16uc<3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3>(v_temp);
- tmp1.store(pDst + (3 * dstStride));
- tmp1.store(pDst + (3 * dstStride) + 16);
- tmp1 = permute16uc<4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4>(v_temp);
- tmp1.store(pDst + (4 * dstStride));
- tmp1.store(pDst + (4 * dstStride) + 16);
- tmp1 = permute16uc<5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5>(v_temp);
- tmp1.store(pDst + (5 * dstStride));
- tmp1.store(pDst + (5 * dstStride) + 16);
- tmp1 = permute16uc<6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6>(v_temp);
- tmp1.store(pDst + (6 * dstStride));
- tmp1.store(pDst + (6 * dstStride) + 16);
- tmp1 = permute16uc<7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7>(v_temp);
- tmp1.store(pDst + (7 * dstStride));
- tmp1.store(pDst + (7 * dstStride) + 16);
- tmp1 = permute16uc<8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8>(v_temp);
- tmp1.store(pDst + (8 * dstStride));
- tmp1.store(pDst + (8 * dstStride) + 16);
- tmp1 = permute16uc<9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9>(v_temp);
- tmp1.store(pDst + (9 * dstStride));
- tmp1.store(pDst + (9 * dstStride) + 16);
- tmp1 = permute16uc<10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10>(v_temp);
- tmp1.store(pDst + (10 * dstStride));
- tmp1.store(pDst + (10 * dstStride) + 16);
- tmp1 = permute16uc<11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11>(v_temp);
- tmp1.store(pDst + (11 * dstStride));
- tmp1.store(pDst + (11 * dstStride) + 16);
- tmp1 = permute16uc<12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12>(v_temp);
- tmp1.store(pDst + (12 * dstStride));
- tmp1.store(pDst + (12 * dstStride) + 16);
- tmp1 = permute16uc<13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13>(v_temp);
- tmp1.store(pDst + (13 * dstStride));
- tmp1.store(pDst + (13 * dstStride) + 16);
- tmp1 = permute16uc<14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14>(v_temp);
- tmp1.store(pDst + (14 * dstStride));
- tmp1.store(pDst + (14 * dstStride) + 16);
- tmp1 = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>(v_temp);
- tmp1.store(pDst + (15 * dstStride));
- tmp1.store(pDst + (15 * dstStride) + 16);
-
- pDst += 16 * dstStride;
- v_temp.load(refMain + 1 + 16);
- /*BROADSTORE16ROWS;*/
- tmp1 = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(v_temp);
- tmp1.store(pDst + (0 * dstStride));
- tmp1.store(pDst + (0 * dstStride) + 16);
- tmp1 = permute16uc<1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>(v_temp);
- tmp1.store(pDst + (1 * dstStride));
- tmp1.store(pDst + (1 * dstStride) + 16);
- tmp1 = permute16uc<2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2>(v_temp);
- tmp1.store(pDst + (2 * dstStride));
- tmp1.store(pDst + (2 * dstStride) + 16);
- tmp1 = permute16uc<3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3>(v_temp);
- tmp1.store(pDst + (3 * dstStride));
- tmp1.store(pDst + (3 * dstStride) + 16);
- tmp1 = permute16uc<4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4>(v_temp);
- tmp1.store(pDst + (4 * dstStride));
- tmp1.store(pDst + (4 * dstStride) + 16);
- tmp1 = permute16uc<5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5>(v_temp);
- tmp1.store(pDst + (5 * dstStride));
- tmp1.store(pDst + (5 * dstStride) + 16);
- tmp1 = permute16uc<6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6>(v_temp);
- tmp1.store(pDst + (6 * dstStride));
- tmp1.store(pDst + (6 * dstStride) + 16);
- tmp1 = permute16uc<7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7>(v_temp);
- tmp1.store(pDst + (7 * dstStride));
- tmp1.store(pDst + (7 * dstStride) + 16);
- tmp1 = permute16uc<8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8>(v_temp);
- tmp1.store(pDst + (8 * dstStride));
- tmp1.store(pDst + (8 * dstStride) + 16);
- tmp1 = permute16uc<9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9>(v_temp);
- tmp1.store(pDst + (9 * dstStride));
- tmp1.store(pDst + (9 * dstStride) + 16);
- tmp1 = permute16uc<10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10>(v_temp);
- tmp1.store(pDst + (10 * dstStride));
- tmp1.store(pDst + (10 * dstStride) + 16);
- tmp1 = permute16uc<11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11>(v_temp);
- tmp1.store(pDst + (11 * dstStride));
- tmp1.store(pDst + (11 * dstStride) + 16);
- tmp1 = permute16uc<12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12>(v_temp);
- tmp1.store(pDst + (12 * dstStride));
- tmp1.store(pDst + (12 * dstStride) + 16);
- tmp1 = permute16uc<13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13>(v_temp);
- tmp1.store(pDst + (13 * dstStride));
- tmp1.store(pDst + (13 * dstStride) + 16);
- tmp1 = permute16uc<14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14>(v_temp);
- tmp1.store(pDst + (14 * dstStride));
- tmp1.store(pDst + (14 * dstStride) + 16);
- tmp1 = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>(v_temp);
- tmp1.store(pDst + (15 * dstStride));
- tmp1.store(pDst + (15 * dstStride) + 16);
- }
- else
- {
- __m128i v_main;
- Pel *dstOriginal = pDst;
-// v_main.load(refMain + 1);
- v_main = _mm_loadu_si128((__m128i const*)(refMain + 1));
-// v_main.store(pDst);
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
-
- pDst = dstOriginal + 16;
- v_main = _mm_loadu_si128((__m128i const*)(refMain + 17));
-// v_main.store(pDst);
-
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- pDst += dstStride;
- _mm_storeu_si128((__m128i*)(pDst), v_main);
- }
- }
- else if (intraPredAngle == -32)
- {
- Vec16uc v_refSide;
- pixel refMain0 = refMain[0];
-
- v_refSide.load(refSide);
- v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
- v_refSide.store(refMain - 15);
-
- v_refSide.load(refSide + 16);
- v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
- v_refSide.store(refMain - 31);
-
- refMain[0] = refMain0;
-
- __m128i itmp;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
-
- return;
- }
- else if (intraPredAngle == 32)
- {
- __m128i itmp;
- refMain += 2;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain++);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- return;
- }
- else
- {
- if (modeHor)
- {
- __m128i row11L, row12L, row11H, row12H, res1, res2;
- __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
- __m128i R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16;
-
- Pel * original_pDst = pDst;
- v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
- v_ipAngle = _mm_set1_epi16(intraPredAngle);
- thirty2 = _mm_set1_epi16(32);
- thirty1 = _mm_set1_epi16(31);
- __m128i itmp, itmp1, itmp2, it1, it2, it3, i16;
-
- switch (intraPredAngle)
- {
- case -2:
- LOADROW(row11L, row11H, -1)
- LOADROW(row12L, row12H, 0)
- R16 = _mm_packus_epi16(row11L, row11H); //R16 = compress(row11L, row11H);
-
- CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
- PREDANG_CALCROW_HOR_MODE2(R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
-
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
- row12L = row11L;
- row12H = row11H;
- LOADROW(row11L, row11H, -2)
- R16 = _mm_packus_epi16(row11L, row11H);
- pDst = original_pDst + 16;
-
- CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
- PREDANG_CALCROW_HOR_MODE2(R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
-
- pDst = original_pDst + (16 * dstStride);
- refMain += 16;
-
- v_deltaPos = _mm_setzero_si128();
- v_ipAngle = _mm_set1_epi16(intraPredAngle);
- LOADROW(row11L, row11H, -1)
- LOADROW(row12L, row12H, 0)
- R16 = _mm_packus_epi16(row11L, row11H);
-
- CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
- PREDANG_CALCROW_HOR_MODE2(R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
-
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
- row12L = row11L;
- row12H = row11H;
- LOADROW(row11L, row11H, -2)
- R16 = _mm_packus_epi16(row11L, row11H);
- pDst = original_pDst + (16 * dstStride) + 16;
-
- CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
- PREDANG_CALCROW_HOR_MODE2(R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
- return;
-
- case 2:
- LOADROW(row11L, row11H, 0)
- LOADROW(row12L, row12H, 1)
- R16 = _mm_packus_epi16(row12L, row12H);
-
- CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
- PREDANG_CALCROW_HOR_MODE2(R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
-
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
- row11L = row12L;
- row11H = row12H;
- LOADROW(row12L, row12H, 2)
- R16 = _mm_packus_epi16(row12L, row12H);
- pDst = original_pDst + 16;
-
- CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
- PREDANG_CALCROW_HOR_MODE2(R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
-
- pDst = original_pDst + (16 * dstStride);
- refMain += 16;
- v_deltaPos = _mm_setzero_si128();
-
- v_ipAngle = _mm_set1_epi16(intraPredAngle);
- LOADROW(row11L, row11H, 0)
- LOADROW(row12L, row12H, 1)
- R16 = _mm_packus_epi16(row12L, row12H);
-
- CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
- PREDANG_CALCROW_HOR_MODE2(R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
-
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
- row11L = row12L;
- row11H = row12H;
- LOADROW(row12L, row12H, 2)
- R16 = _mm_packus_epi16(row12L, row12H);
- pDst = original_pDst + (16 * dstStride) + 16;
-
- CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
- PREDANG_CALCROW_HOR_MODE2(R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
- return;
- }
-
- CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
- PREDANG_CALCROW_HOR(7 + 0, R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
- PREDANG_CALCROW_HOR(7 + 8, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
-
- pDst = original_pDst + 16;
-
- CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 16)
- PREDANG_CALCROW_HOR(7 + 16, R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 24)
- R16 = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
-
- pDst = original_pDst + (16 * dstStride);
- refMain += 16;
- v_deltaPos = _mm_setzero_si128();
- v_ipAngle = _mm_set1_epi16(intraPredAngle);
-
- CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
- PREDANG_CALCROW_HOR(7 + 0, R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
- PREDANG_CALCROW_HOR(7 + 8, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
- pDst = original_pDst + (16 * dstStride) + 16;
-
- CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 16)
- PREDANG_CALCROW_HOR(7 + 16, R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 24)
- R16 = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
- }
- else
- {
- __m128i row11L, row12L, row11H, row12H;
- __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
- __m128i row11, row12, row13, row14, row21, row22, row23, row24;
- __m128i res1, res2;
-
- v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
- v_ipAngle = _mm_set1_epi16(intraPredAngle);
- thirty2 = _mm_set1_epi16(32);
- thirty1 = _mm_set1_epi16(31);
- __m128i itmp, it1, it2, it3, i16;
-
- switch (intraPredAngle)
- {
- case -2:
- LOADROW(row11, row12, -1)
- LOADROW(row21, row22, 0)
- LOADROW(row13, row14, 15)
- LOADROW(row23, row24, 16)
- for (int i = 0; i <= 14; i++)
- {
- PREDANG_CALCROW_VER_MODE2(i);
- }
-
- //deltaFract == 0 for 16th row
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
- itmp = _mm_packus_epi16(row11, row12);
- _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride)), itmp);
- itmp = _mm_packus_epi16(row13, row14);
- _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride) + 16), itmp);
-
- row21 = row11;
- row22 = row12;
- row23 = row13;
- row24 = row14;
-
- LOADROW(row11, row12, -2)
- LOADROW(row13, row14, 14)
- for (int i = 16; i <= 30; i++)
- {
- PREDANG_CALCROW_VER_MODE2(i);
- }
-
- itmp = _mm_packus_epi16(row11, row12);
- _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
- itmp = _mm_packus_epi16(row13, row14);
- _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
-
- return;
-
- case 2:
-
- LOADROW(row11, row12, 0)
- LOADROW(row21, row22, 1)
- LOADROW(row13, row14, 16)
- LOADROW(row23, row24, 17)
- for (int i = 0; i <= 14; i++)
- {
- PREDANG_CALCROW_VER_MODE2(i);
- }
-
- //deltaFract == 0 for 16th row
-
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
- itmp = _mm_packus_epi16(row21, row22);
- _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride)), itmp);
- itmp = _mm_packus_epi16(row23, row24);
- _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride) + 16), itmp);
-
- row11 = row21;
- row12 = row22;
- row13 = row23;
- row14 = row24;
-
- LOADROW(row21, row22, 2)
- LOADROW(row23, row24, 18)
- for (int i = 16; i <= 30; i++)
- {
- PREDANG_CALCROW_VER_MODE2(i);
- }
-
- itmp = _mm_packus_epi16(row21, row22);
- _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
- itmp = _mm_packus_epi16(row23, row24);
- _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
-
- return;
- }
-
- for (int i = 0; i <= 30; i++)
- {
- PREDANG_CALCROW_VER(i);
- }
-
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
- _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 17 + GETAP(lookIdx, 31)));
- _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
- }
- }
-}
-
-#endif /* if HIGH_BIT_DEPTH */
-
-void xPredIntraAngBufRef(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, bool bFilter, pixel *refLeft, pixel *refAbove)
-{
-#if HIGH_BIT_DEPTH
-#else
- switch (width)
- {
- case 4:
- xPredIntraAng4x4(bitDepth, pDst, dstStride, width, dirMode, refLeft, refAbove, bFilter);
- return;
- case 8:
- xPredIntraAng8x8(bitDepth, pDst, dstStride, width, dirMode, refLeft, refAbove, bFilter);
- return;
- case 16:
- xPredIntraAng16x16(bitDepth, pDst, dstStride, width, dirMode, refLeft, refAbove, bFilter);
- return;
- case 32:
- xPredIntraAng32x32(bitDepth, pDst, dstStride, width, dirMode, refLeft, refAbove);
- return;
- }
-
-#endif /* if HIGH_BIT_DEPTH */
-
- int k, l;
- int blkSize = width;
-
- // Map the mode index to main prediction direction and angle
- assert(dirMode > 1); //no planar and dc
- bool modeHor = (dirMode < 18);
- bool modeVer = !modeHor;
- int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
- int absAng = abs(intraPredAngle);
- int signAng = intraPredAngle < 0 ? -1 : 1;
-
- // Set bitshifts and scale the angle parameter to block size
- int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
- int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
- int invAngle = invAngTable[absAng];
- absAng = angTable[absAng];
- intraPredAngle = signAng * absAng;
-
- // Do angular predictions
- {
- pixel* refMain;
- pixel* refSide;
-
- // Initialise the Main and Left reference array.
- if (intraPredAngle < 0)
- {
- refMain = (modeVer ? refAbove : refLeft); // + (blkSize - 1);
- refSide = (modeVer ? refLeft : refAbove); // + (blkSize - 1);
-
- // Extend the Main reference to the left.
- int invAngleSum = 128; // rounding for (shift by 8)
- for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
- {
- invAngleSum += invAngle;
- refMain[k] = refSide[invAngleSum >> 8];
- }
- }
- else
- {
- refMain = modeVer ? refAbove : refLeft;
- refSide = modeVer ? refLeft : refAbove;
- }
-
- if (intraPredAngle == 0)
- {
- for (k = 0; k < blkSize; k++)
- {
- for (l = 0; l < blkSize; l++)
- {
- pDst[k * dstStride + l] = refMain[l + 1];
- }
- }
-
- if (bFilter)
- {
- for (k = 0; k < blkSize; k++)
- {
- pDst[k * dstStride] = (pixel)Clip3(0, (1 << bitDepth) - 1, static_cast<short>(pDst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1));
- }
- }
- }
- else
- {
- int deltaPos = 0;
- int deltaInt;
- int deltaFract;
- int refMainIndex;
-
- for (k = 0; k < blkSize; k++)
- {
- deltaPos += intraPredAngle;
- deltaInt = deltaPos >> 5;
- deltaFract = deltaPos & (32 - 1);
-
- if (deltaFract)
- {
- // Do linear filtering
- for (l = 0; l < blkSize; l++)
- {
- refMainIndex = l + deltaInt + 1;
- pDst[k * dstStride + l] = (pixel)(((32 - deltaFract) * refMain[refMainIndex] + deltaFract * refMain[refMainIndex + 1] + 16) >> 5);
- }
- }
- else
- {
- // Just copy the integer samples
- for (l = 0; l < blkSize; l++)
- {
- pDst[k * dstStride + l] = refMain[l + deltaInt + 1];
- }
- }
- }
- }
-
- // Flip the block if this is the horizontal mode
- if (modeHor)
- {
- pixel tmp;
- for (k = 0; k < blkSize - 1; k++)
- {
- for (l = k + 1; l < blkSize; l++)
- {
- tmp = pDst[k * dstStride + l];
- pDst[k * dstStride + l] = pDst[l * dstStride + k];
- pDst[l * dstStride + k] = tmp;
- }
- }
- }
- }
-}
-}
-
-#include "utils.h"
-
-namespace x265 {
-void NAME(Setup_Vec_IPredPrimitives)(EncoderPrimitives& p)
-{
- initFileStaticVars();
- p.getIPredDC = predIntraDC;
- p.getIPredPlanar = predIntraPlanar;
- p.getIPredAng = xPredIntraAngBufRef;
-}
-}
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Min Chen <chenm003 at 163.com>
+ * Deepthi Devaki <deepthidevaki at multicorewareinc.com>
+ * Steve Borho <steve at borho.org>
+ * ShinYee Chung <shinyee at multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at multicorewareinc.com.
+ *****************************************************************************/
+
+#include "primitives.h"
+#include "TLibCommon/TComRom.h"
+#include <assert.h>
+#include <smmintrin.h>
+
+extern char g_aucConvertToBit[];
+extern unsigned char g_aucIntraFilterType[][35];
+
+using namespace x265;
+
+namespace {
+const int angAP[17][64] =
+{
+ {
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64
+ },
+ {
+ 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52
+ },
+ {
+ 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 19, 20, 21, 21, 22, 22, 23, 24, 24, 25, 26, 26, 27, 28, 28, 29, 30, 30, 31, 32, 32, 33, 34, 34, 35, 36, 36, 37, 38, 38, 39, 40, 40, 41, 42
+ },
+ {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, 32, 33, 34
+ },
+ {
+ 0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 13, 14, 14, 15, 15, 15, 16, 16, 17, 17, 17, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 25, 25, 26
+ },
+ {
+ 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 18
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4
+ },
+    { // angle index 0 (row 8) is never referenced; it exists only so rows can be indexed as 8 - k
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4
+ },
+ {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4
+ },
+ {
+ -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -8, -8, -9, -9, -9, -9, -9, -9, -10, -10, -10, -10, -10, -10, -10
+ },
+ {
+ -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -8, -8, -8, -8, -9, -9, -9, -9, -10, -10, -10, -11, -11, -11, -11, -12, -12, -12, -13, -13, -13, -13, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -17, -17, -17, -17, -18, -18, -18, -18
+ },
+ {
+ -1, -1, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -13, -14, -14, -15, -15, -16, -16, -16, -17, -17, -18, -18, -18, -19, -19, -20, -20, -20, -21, -21, -22, -22, -22, -23, -23, -24, -24, -24, -25, -25, -26, -26, -26
+ },
+ {
+ -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -10, -10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, -17, -17, -18, -19, -19, -20, -20, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -26, -27, -27, -28, -28, -29, -29, -30, -30, -31, -31, -32, -32, -33, -33, -34, -34
+ },
+ {
+ -1, -2, -2, -3, -4, -4, -5, -6, -6, -7, -8, -8, -9, -10, -10, -11, -12, -12, -13, -14, -14, -15, -16, -16, -17, -18, -18, -19, -20, -20, -21, -21, -22, -23, -23, -24, -25, -25, -26, -27, -27, -28, -29, -29, -30, -31, -31, -32, -33, -33, -34, -35, -35, -36, -37, -37, -38, -39, -39, -40, -41, -41, -42, -42
+ },
+ {
+ -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, -27, -28, -29, -30, -31, -31, -32, -33, -34, -35, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -44, -45, -46, -47, -48, -48, -49, -50, -51, -52, -52
+ },
+ {
+ -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -59, -60, -61, -62, -63, -64
+ }
+};
+
+#define GETAP(X, Y) angAP[8 - (X)][(Y)]
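+
+// angAP[8 - k][y] pre-computes ((y + 1) * angle(k)) >> 5, where k in [-8, 8] is the
+// raw angle index and angle(k) = sign(k) * angTable[abs(k)]. In other words,
+// GETAP(k, y) is the integer sample offset into the main reference for destination
+// row y, letting the angular kernels below skip the per-row multiply and shift.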
+
+__m128i v_multiL, v_multiH, v_multiH2, v_multiH3, v_multiH4, v_multiH5, v_multiH6, v_multiH7;
+__m128i v_multi_2Row;
+
+/* When compiled with /arch:AVX, this code is not safe to run on non-AVX CPUs and
+ * thus we cannot use static initialization. This routine is only called if the
+ * detected CPU can support this SIMD architecture. */
+static void initFileStaticVars()
+{
+ v_multiL = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ v_multiH = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
+ v_multiH2 = _mm_setr_epi16(17, 18, 19, 20, 21, 22, 23, 24);
+ v_multiH3 = _mm_setr_epi16(25, 26, 27, 28, 29, 30, 31, 32);
+ v_multiH4 = _mm_setr_epi16(33, 34, 35, 36, 37, 38, 39, 40);
+ v_multiH5 = _mm_setr_epi16(41, 42, 43, 44, 45, 46, 47, 48);
+ v_multiH6 = _mm_setr_epi16(49, 50, 51, 52, 53, 54, 55, 56);
+ v_multiH7 = _mm_setr_epi16(57, 58, 59, 60, 61, 62, 63, 64);
+ v_multi_2Row = _mm_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4);
+}
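+
+// The v_multi* constants hold the column weights 1..64 consumed by the planar
+// kernels below: each output row adds (topRight - left[y]) * (x + 1) across the
+// row, and keeping the (x + 1) factors in registers avoids rebuilding them on
+// every call.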
+
+static inline
+void xDCPredFiltering(pixel* pSrc, intptr_t iSrcStride, pixel* rpDst, intptr_t iDstStride, int iWidth, int /*iHeight*/)
+{
+ pixel* pDst = rpDst;
+ int y;
+ pixel pixDC = *pDst;
+ int pixDCx3 = pixDC * 3 + 2;
+
+ // boundary pixels processing
+ pDst[0] = (pixel)((pSrc[-iSrcStride] + pSrc[-1] + 2 * pixDC + 2) >> 2);
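+
+    // The rest of the boundary uses the standard HEVC DC smoothing:
+    //   first row:    (above[x] + 3 * dcVal + 2) >> 2
+    //   first column: (left[y]  + 3 * dcVal + 2) >> 2
+    // pixDCx3 already folds in the 3 * dcVal + 2 term.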
+
+ Vec8us im1(pixDCx3);
+ Vec8us im2, im3;
+#if HIGH_BIT_DEPTH
+ switch (iWidth)
+ {
+ case 4:
+ im2 = load_partial(const_int(8), &pSrc[1 - iSrcStride]);
+ im2 = (im1 + im2) >> const_int(2);
+ store_partial(const_int(8), &pDst[1], im2);
+ break;
+
+ case 8:
+ im2.load(&pSrc[1 - iSrcStride]);
+ im2 = (im1 + im2) >> const_int(2);
+ im2.store(&pDst[1]);
+ break;
+
+ case 16:
+ im2.load(&pSrc[1 - iSrcStride]);
+ im2 = (im1 + im2) >> const_int(2);
+ im2.store(&pDst[1]);
+
+ im2.load(&pSrc[1 - iSrcStride + 8]);
+ im2 = (im1 + im2) >> const_int(2);
+ im2.store(&pDst[1 + 8]);
+ break;
+
+ case 32:
+ im2.load(&pSrc[1 - iSrcStride]);
+ im2 = (im1 + im2) >> const_int(2);
+ im2.store(&pDst[1]);
+
+ im2.load(&pSrc[1 - iSrcStride + 8]);
+ im2 = (im1 + im2) >> const_int(2);
+ im2.store(&pDst[1 + 8]);
+
+ im2.load(&pSrc[1 - iSrcStride + 16]);
+ im2 = (im1 + im2) >> const_int(2);
+ im2.store(&pDst[1 + 16]);
+
+ im2.load(&pSrc[1 - iSrcStride + 24]);
+ im2 = (im1 + im2) >> const_int(2);
+ im2.store(&pDst[1 + 24]);
+ break;
+
+ //case 64:
+ default:
+ im2.load(&pSrc[1 - iSrcStride]);
+ im2 = (im1 + im2) >> const_int(2);
+ im2.store(&pDst[1]);
+
+ im2.load(&pSrc[1 - iSrcStride + 8]);
+ im2 = (im1 + im2) >> const_int(2);
+ im2.store(&pDst[1 + 8]);
+
+ im2.load(&pSrc[1 - iSrcStride + 16]);
+ im2 = (im1 + im2) >> const_int(2);
+ im2.store(&pDst[1 + 16]);
+
+ im2.load(&pSrc[1 - iSrcStride + 24]);
+ im2 = (im1 + im2) >> const_int(2);
+ im2.store(&pDst[1 + 24]);
+
+ im2.load(&pSrc[1 - iSrcStride + 32]);
+ im2 = (im1 + im2) >> const_int(2);
+ im2.store(&pDst[1 + 32]);
+
+ im2.load(&pSrc[1 - iSrcStride + 40]);
+ im2 = (im1 + im2) >> const_int(2);
+ im2.store(&pDst[1 + 40]);
+
+ im2.load(&pSrc[1 - iSrcStride + 48]);
+ im2 = (im1 + im2) >> const_int(2);
+ im2.store(&pDst[1 + 48]);
+
+ im2.load(&pSrc[1 - iSrcStride + 56]);
+ im2 = (im1 + im2) >> const_int(2);
+ im2.store(&pDst[1 + 56]);
+ break;
+ }
+
+#else /* if HIGH_BIT_DEPTH */
+ Vec16uc pix;
+ switch (iWidth)
+ {
+ case 4:
+ pix = load_partial(const_int(4), &pSrc[1 - iSrcStride]);
+ im2 = extend_low(pix);
+ im2 = (im1 + im2) >> const_int(2);
+ pix = compress(im2, im2);
+ store_partial(const_int(4), &pDst[1], pix);
+ break;
+
+ case 8:
+ pix = load_partial(const_int(8), &pSrc[1 - iSrcStride]);
+ im2 = extend_low(pix);
+ im2 = (im1 + im2) >> const_int(2);
+ pix = compress(im2, im2);
+ store_partial(const_int(8), &pDst[1], pix);
+ break;
+
+ case 16:
+ pix.load(&pSrc[1 - iSrcStride]);
+ im2 = extend_low(pix);
+ im3 = extend_high(pix);
+ im2 = (im1 + im2) >> const_int(2);
+ im3 = (im1 + im3) >> const_int(2);
+ pix = compress(im2, im3);
+ pix.store(&pDst[1]);
+ break;
+
+ case 32:
+ pix.load(&pSrc[1 - iSrcStride]);
+ im2 = extend_low(pix);
+ im3 = extend_high(pix);
+ im2 = (im1 + im2) >> const_int(2);
+ im3 = (im1 + im3) >> const_int(2);
+ pix = compress(im2, im3);
+ pix.store(&pDst[1]);
+
+ pix.load(&pSrc[1 - iSrcStride + 16]);
+ im2 = extend_low(pix);
+ im3 = extend_high(pix);
+ im2 = (im1 + im2) >> const_int(2);
+ im3 = (im1 + im3) >> const_int(2);
+ pix = compress(im2, im3);
+ pix.store(&pDst[1 + 16]);
+ break;
+
+ //case 64:
+ default:
+ pix.load(&pSrc[1 - iSrcStride]);
+ im2 = extend_low(pix);
+ im3 = extend_high(pix);
+ im2 = (im1 + im2) >> const_int(2);
+ im3 = (im1 + im3) >> const_int(2);
+ pix = compress(im2, im3);
+ pix.store(&pDst[1]);
+
+ pix.load(&pSrc[1 - iSrcStride + 16]);
+ im2 = extend_low(pix);
+ im3 = extend_high(pix);
+ im2 = (im1 + im2) >> const_int(2);
+ im3 = (im1 + im3) >> const_int(2);
+ pix = compress(im2, im3);
+ pix.store(&pDst[1 + 16]);
+
+ pix.load(&pSrc[1 - iSrcStride + 32]);
+ im2 = extend_low(pix);
+ im3 = extend_high(pix);
+ im2 = (im1 + im2) >> const_int(2);
+ im3 = (im1 + im3) >> const_int(2);
+ pix = compress(im2, im3);
+ pix.store(&pDst[1 + 32]);
+
+ pix.load(&pSrc[1 - iSrcStride + 48]);
+ im2 = extend_low(pix);
+ im3 = extend_high(pix);
+ im2 = (im1 + im2) >> const_int(2);
+ im3 = (im1 + im3) >> const_int(2);
+ pix = compress(im2, im3);
+ pix.store(&pDst[1 + 48]);
+ break;
+ }
+
+#endif /* if HIGH_BIT_DEPTH */
+
+ for (y = 1; y < iWidth; y++)
+ {
+ pDst[iDstStride] = (pixel)((pSrc[iSrcStride - 1] + pixDCx3) >> 2);
+ pSrc += iSrcStride;
+ pDst += iDstStride;
+ }
+}
+
+void predIntraDC(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride, int width, int bFilter)
+{
+ //assert(iWidth == iHeight); // all of Intra is NxN
+    //assert(blkAboveAvailable || blkLeftAvailable); // left and above are always available: HEVC fills unavailable reference samples first
+ int iSum = 0;
+ int logSize = g_aucConvertToBit[width] + 2;
+ pixel *pSrcAbove = &pSrc[-srcStride];
+ pixel *pSrcLeft = &pSrc[-1];
+
+ for (int iInd = 0; iInd < width; iInd++)
+ {
+ iSum += *pSrcLeft;
+ pSrcLeft += srcStride;
+ }
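+
+    // Together with the above-row sum added below, dcVal works out to
+    // (sum(above[0..w-1]) + sum(left[0..w-1]) + w) >> (log2(w) + 1)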
+
+#if HIGH_BIT_DEPTH
+ Vec8s sumAbove(0);
+ Vec8s m0;
+
+ switch (width)
+ {
+ case 4:
+ sumAbove = load_partial(const_int(8), pSrcAbove);
+ break;
+ case 8:
+ m0.load(pSrcAbove);
+ sumAbove = m0;
+ break;
+ case 16:
+ m0.load(pSrcAbove);
+ sumAbove = m0;
+ m0.load(pSrcAbove + 8);
+ sumAbove += m0;
+ break;
+ case 32:
+ m0.load(pSrcAbove);
+ sumAbove = m0;
+ m0.load(pSrcAbove + 8);
+ sumAbove += m0;
+ m0.load(pSrcAbove + 16);
+ sumAbove += m0;
+ m0.load(pSrcAbove + 24);
+ sumAbove += m0;
+ break;
+ //case 64:
+ default:
+        // CHECK_ME: the maximum bit depth supported here is 13 bits
+ m0.load(pSrcAbove);
+ sumAbove = m0;
+ m0.load(pSrcAbove + 8);
+ sumAbove += m0;
+ m0.load(pSrcAbove + 16);
+ sumAbove += m0;
+ m0.load(pSrcAbove + 24);
+ sumAbove += m0;
+ m0.load(pSrcAbove + 32);
+ sumAbove += m0;
+ m0.load(pSrcAbove + 40);
+ sumAbove += m0;
+ m0.load(pSrcAbove + 48);
+ sumAbove += m0;
+ m0.load(pSrcAbove + 56);
+ sumAbove += m0;
+ break;
+ }
+
+ iSum += horizontal_add_x(sumAbove);
+
+ logSize += 1;
+ pixel dcVal = (iSum + (1 << (logSize - 1))) >> logSize;
+ Vec8us dcValN(dcVal);
+ int k;
+
+ pixel *pDst1 = pDst;
+ switch (width)
+ {
+ case 4:
+ store_partial(const_int(8), pDst1, dcValN);
+ pDst1 += dstStride;
+ store_partial(const_int(8), pDst1, dcValN);
+ pDst1 += dstStride;
+ store_partial(const_int(8), pDst1, dcValN);
+ pDst1 += dstStride;
+ store_partial(const_int(8), pDst1, dcValN);
+ pDst1 += dstStride;
+ break;
+
+ case 8:
+ dcValN.store(pDst1);
+ pDst1 += dstStride;
+ dcValN.store(pDst1);
+ pDst1 += dstStride;
+ dcValN.store(pDst1);
+ pDst1 += dstStride;
+ dcValN.store(pDst1);
+ pDst1 += dstStride;
+ dcValN.store(pDst1);
+ pDst1 += dstStride;
+ dcValN.store(pDst1);
+ pDst1 += dstStride;
+ dcValN.store(pDst1);
+ pDst1 += dstStride;
+ dcValN.store(pDst1);
+ pDst1 += dstStride;
+ break;
+
+ case 16:
+ for (k = 0; k < 16; k += 2)
+ {
+ dcValN.store(pDst1);
+ dcValN.store(pDst1 + 8);
+ pDst1 += dstStride;
+ dcValN.store(pDst1);
+ dcValN.store(pDst1 + 8);
+ pDst1 += dstStride;
+ }
+
+ break;
+
+ case 32:
+ for (k = 0; k < 32; k++)
+ {
+ dcValN.store(pDst1);
+ dcValN.store(pDst1 + 8);
+ dcValN.store(pDst1 + 16);
+ dcValN.store(pDst1 + 24);
+ pDst1 += dstStride;
+ }
+
+ break;
+
+ //case 64:
+ default:
+ for (k = 0; k < 64; k++)
+ {
+ dcValN.store(pDst1);
+ dcValN.store(pDst1 + 8);
+ dcValN.store(pDst1 + 16);
+ dcValN.store(pDst1 + 24);
+ dcValN.store(pDst1 + 32);
+ dcValN.store(pDst1 + 40);
+ dcValN.store(pDst1 + 48);
+ dcValN.store(pDst1 + 56);
+ pDst1 += dstStride;
+ }
+
+ break;
+ }
+
+ if (bFilter)
+ {
+ xDCPredFiltering(pSrc, srcStride, pDst, dstStride, width, width);
+ }
+#else // if !HIGH_BIT_DEPTH
+
+ {
+ Vec16uc pix;
+ Vec8us im;
+ Vec4ui im1, im2;
+
+ switch (width)
+ {
+ case 4:
+ pix.fromUint32(*(uint32_t*)pSrcAbove);
+ iSum += horizontal_add(extend_low(pix));
+ break;
+ case 8:
+#if X86_64
+ pix.fromUint64(*(uint64_t*)pSrcAbove);
+#else
+ pix.load_partial(8, pSrcAbove);
+#endif
+ iSum += horizontal_add(extend_low(pix));
+ break;
+ case 16:
+ pix.load(pSrcAbove);
+ iSum += horizontal_add_x(pix);
+ break;
+ case 32:
+ pix.load(pSrcAbove);
+ im1 = (Vec4ui)(pix.sad(_mm_setzero_si128()));
+ pix.load(pSrcAbove + 16);
+ im1 += (Vec4ui)(pix.sad(_mm_setzero_si128()));
+ im1 += (Vec4ui)((Vec128b)im1 >> const_int(64));
+ iSum += toInt32(im1);
+ break;
+ //case 64:
+ default:
+ pix.load(pSrcAbove);
+ im1 = (Vec4ui)(pix.sad(_mm_setzero_si128()));
+ pix.load(pSrcAbove + 16);
+ im1 += (Vec4ui)(pix.sad(_mm_setzero_si128()));
+ pix.load(pSrcAbove + 32);
+ im1 += (Vec4ui)(pix.sad(_mm_setzero_si128()));
+ pix.load(pSrcAbove + 48);
+ im1 += (Vec4ui)(pix.sad(_mm_setzero_si128()));
+ im1 += (Vec4ui)((Vec128b)im1 >> const_int(64));
+ //im1 += extract_hi64(im1);
+ iSum += toInt32(im1);
+ break;
+ }
+ }
+
+ logSize += 1;
+ pixel dcVal = (iSum + (1 << (logSize - 1))) >> logSize;
+ Vec16uc dcValN(dcVal);
+ int k;
+
+ pixel *pDst1 = pDst;
+ switch (width)
+ {
+ case 4:
+ store_partial(const_int(4), pDst1, dcValN);
+ pDst1 += dstStride;
+ store_partial(const_int(4), pDst1, dcValN);
+ pDst1 += dstStride;
+ store_partial(const_int(4), pDst1, dcValN);
+ pDst1 += dstStride;
+ store_partial(const_int(4), pDst1, dcValN);
+ break;
+
+ case 8:
+ store_partial(const_int(8), pDst1, dcValN);
+ pDst1 += dstStride;
+ store_partial(const_int(8), pDst1, dcValN);
+ pDst1 += dstStride;
+ store_partial(const_int(8), pDst1, dcValN);
+ pDst1 += dstStride;
+ store_partial(const_int(8), pDst1, dcValN);
+ pDst1 += dstStride;
+ store_partial(const_int(8), pDst1, dcValN);
+ pDst1 += dstStride;
+ store_partial(const_int(8), pDst1, dcValN);
+ pDst1 += dstStride;
+ store_partial(const_int(8), pDst1, dcValN);
+ pDst1 += dstStride;
+ store_partial(const_int(8), pDst1, dcValN);
+ break;
+
+ case 16:
+ for (k = 0; k < 16; k += 4)
+ {
+ store_partial(const_int(16), pDst1, dcValN);
+ pDst1 += dstStride;
+ store_partial(const_int(16), pDst1, dcValN);
+ pDst1 += dstStride;
+ store_partial(const_int(16), pDst1, dcValN);
+ pDst1 += dstStride;
+ store_partial(const_int(16), pDst1, dcValN);
+ pDst1 += dstStride;
+ }
+
+ break;
+
+ case 32:
+ for (k = 0; k < 32; k += 2)
+ {
+ store_partial(const_int(16), pDst1, dcValN);
+ store_partial(const_int(16), pDst1 + 16, dcValN);
+ pDst1 += dstStride;
+ store_partial(const_int(16), pDst1, dcValN);
+ store_partial(const_int(16), pDst1 + 16, dcValN);
+ pDst1 += dstStride;
+ }
+
+ break;
+
+ case 64:
+ for (k = 0; k < 64; k++)
+ {
+ store_partial(const_int(16), pDst1, dcValN);
+ store_partial(const_int(16), pDst1 + 16, dcValN);
+ store_partial(const_int(16), pDst1 + 32, dcValN);
+ store_partial(const_int(16), pDst1 + 48, dcValN);
+ pDst1 += dstStride;
+ }
+
+ break;
+ }
+
+ if (bFilter)
+ {
+ xDCPredFiltering(pSrc, srcStride, pDst, dstStride, width, width);
+ }
+#endif // if HIGH_BIT_DEPTH
+}
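+
+// The planar kernels below all vectorize the HEVC planar equation for a w x w block:
+//   pred[y][x] = ((w-1-x) * left[y] + (x+1) * topRight
+//               + (w-1-y) * top[x] + (y+1) * bottomLeft + w) >> (log2(w) + 1)
+// computed incrementally: a horizontal term seeded from left[y] that steps by
+// (topRight - left[y]) per column, plus a vertical term that accumulates
+// (bottomLeft - top[x]) per row.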
+
+#if HIGH_BIT_DEPTH
+// CHECK_ME: v_rightColumnN may overflow for 12bpp input
+void predIntraPlanar4(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
+{
+ int k, bottomLeft, topRight;
+    // NOTE: 16 bits are enough here: the input is under 13 bits and, shifted left by 2, at most 15 bits
+ int16_t leftColumn[4];
+
+ // Get left and above reference column and row
+ Vec8s v_topRow = (Vec8s)load_partial(const_int(8), &pSrc[-srcStride]); // topRow
+
+ for (k = 0; k < 4; k++)
+ {
+ leftColumn[k] = pSrc[k * srcStride - 1];
+ }
+
+ Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), leftColumn); // leftColumn
+
+ // Prepare intermediate variables used in interpolation
+ bottomLeft = pSrc[4 * srcStride - 1];
+ topRight = pSrc[4 - srcStride];
+
+ Vec8s v_bottomLeft(bottomLeft);
+ Vec8s v_topRight(topRight);
+
+ Vec8s v_bottomRow = v_bottomLeft - v_topRow;
+ Vec8s v_rightColumn = v_topRight - v_leftColumn;
+
+ v_topRow = v_topRow << const_int(2);
+ v_leftColumn = v_leftColumn << const_int(2);
+
+ // Generate prediction signal
+ Vec8s v_horPred4 = v_leftColumn + Vec8s(4);
+ const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
+ Vec8s v_horPred, v_rightColumnN;
+ Vec8s v_im4;
+ Vec16uc v_im5;
+
+ // line0
+ v_horPred = broadcast(const_int(0), v_horPred4);
+ v_rightColumnN = broadcast(const_int(0), v_rightColumn) * v_multi;
+ v_horPred = v_horPred + v_rightColumnN;
+ v_topRow = v_topRow + v_bottomRow;
+    // CHECK_ME: HM does not clip the pixel here, so the intermediate is assumed to be at most 12 + 3 = 15 bits
+ v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3);
+ store_partial(const_int(8), &rpDst[0 * dstStride], v_im4);
+
+ // line1
+ v_horPred = broadcast(const_int(1), v_horPred4);
+ v_rightColumnN = broadcast(const_int(1), v_rightColumn) * v_multi;
+ v_horPred = v_horPred + v_rightColumnN;
+ v_topRow = v_topRow + v_bottomRow;
+ v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3);
+ store_partial(const_int(8), &rpDst[1 * dstStride], v_im4);
+
+ // line2
+ v_horPred = broadcast(const_int(2), v_horPred4);
+ v_rightColumnN = broadcast(const_int(2), v_rightColumn) * v_multi;
+ v_horPred = v_horPred + v_rightColumnN;
+ v_topRow = v_topRow + v_bottomRow;
+ v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3);
+ store_partial(const_int(8), &rpDst[2 * dstStride], v_im4);
+
+ // line3
+ v_horPred = broadcast(const_int(3), v_horPred4);
+ v_rightColumnN = broadcast(const_int(3), v_rightColumn) * v_multi;
+ v_horPred = v_horPred + v_rightColumnN;
+ v_topRow = v_topRow + v_bottomRow;
+ v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3);
+ store_partial(const_int(8), &rpDst[3 * dstStride], v_im4);
+}
+
+#else /* if HIGH_BIT_DEPTH */
+void predIntraPlanar4(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
+{
+ int k;
+ pixel bottomLeft, topRight;
+
+ // Get left and above reference column and row
+ Vec16uc im0 = (Vec16uc)load_partial(const_int(4), &pSrc[-srcStride]); // topRow
+ Vec8s v_topRow = extend_low(im0);
+
+ int16_t leftColumn[4];
+
+ for (k = 0; k < 4; k++)
+ {
+ leftColumn[k] = pSrc[k * srcStride - 1];
+ }
+
+ Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), (void*)leftColumn); // leftColumn
+
+ // Prepare intermediate variables used in interpolation
+ bottomLeft = pSrc[4 * srcStride - 1];
+ topRight = pSrc[4 - srcStride];
+
+ Vec8s v_bottomLeft(bottomLeft);
+ Vec8s v_topRight(topRight);
+
+ Vec8s v_bottomRow = v_bottomLeft - v_topRow;
+ Vec8s v_rightColumn = v_topRight - v_leftColumn;
+
+ v_topRow = v_topRow << const_int(2);
+ v_leftColumn = v_leftColumn << const_int(2);
+
+ Vec8s v_horPred4 = v_leftColumn + Vec8s(4);
+ const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
+ Vec8s v_horPred, v_rightColumnN;
+ Vec8s v_im4;
+ Vec16uc v_im5;
+
+#define COMP_PRED_PLANAR4_ROW(X) { \
+ v_horPred = broadcast(const_int((X)), v_horPred4); \
+ v_rightColumnN = broadcast(const_int((X)), v_rightColumn) * v_multi; \
+ v_horPred = v_horPred + v_rightColumnN; \
+ v_topRow = v_topRow + v_bottomRow; \
+ v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3); \
+ v_im5 = compress_unsafe(v_im4, v_im4); \
+ store_partial(const_int(4), &rpDst[(X)*dstStride], v_im5); \
+}
+
+ COMP_PRED_PLANAR4_ROW(0)
+ COMP_PRED_PLANAR4_ROW(1)
+ COMP_PRED_PLANAR4_ROW(2)
+ COMP_PRED_PLANAR4_ROW(3)
+
+#undef COMP_PRED_PLANAR4_ROW
+}
+
+#if INSTRSET >= 5
+void predIntraPlanar4_sse4(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
+{
+ pixel bottomLeft, topRight;
+
+ // Get left and above reference column and row
+ __m128i im0 = _mm_cvtsi32_si128(*(uint32_t*)&pSrc[-srcStride]); // topRow
+ __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
+
+ v_topRow = _mm_shuffle_epi32(v_topRow, 0x44);
+
+ // Prepare intermediate variables used in interpolation
+ bottomLeft = pSrc[4 * srcStride - 1];
+ topRight = pSrc[4 - srcStride];
+
+ __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
+ __m128i v_bottomRow = _mm_sub_epi16(v_bottomLeft, v_topRow);
+
+ v_topRow = _mm_slli_epi16(v_topRow, 2);
+
+ __m128i v_horPred, v_rightColumnN;
+ __m128i v_im4;
+ __m128i v_im5;
+ __m128i _tmp0, _tmp1;
+
+ __m128i v_bottomRowL = _mm_unpacklo_epi64(v_bottomRow, _mm_setzero_si128());
+ v_topRow = _mm_sub_epi16(v_topRow, v_bottomRowL);
+ v_bottomRow = _mm_slli_epi16(v_bottomRow, 1);
+
+#define COMP_PRED_PLANAR_2ROW(Y) { \
+ _tmp0 = _mm_cvtsi32_si128((pSrc[((Y)) * srcStride - 1] << 2) + 4); \
+ _tmp0 = _mm_shufflelo_epi16(_tmp0, 0); \
+ _tmp1 = _mm_cvtsi32_si128((pSrc[((Y)+1) * srcStride - 1] << 2) + 4); \
+ _tmp1 = _mm_shufflelo_epi16(_tmp1, 0); \
+ v_horPred = _mm_unpacklo_epi64(_tmp0, _tmp1); \
+ _tmp0 = _mm_cvtsi32_si128(topRight - pSrc[((Y)) * srcStride - 1]); \
+ _tmp0 = _mm_shufflelo_epi16(_tmp0, 0); \
+ _tmp1 = _mm_cvtsi32_si128(topRight - pSrc[((Y)+1) * srcStride - 1]); \
+ _tmp1 = _mm_shufflelo_epi16(_tmp1, 0); \
+ v_rightColumnN = _mm_unpacklo_epi64(_tmp0, _tmp1); \
+ v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multi_2Row); \
+ v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \
+ v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \
+ v_im4 = _mm_srai_epi16(_mm_add_epi16(v_horPred, v_topRow), 3); \
+ v_im5 = _mm_packus_epi16(v_im4, v_im4); \
+ *(uint32_t*)&rpDst[(Y)*dstStride] = _mm_cvtsi128_si32(v_im5); \
+    *(uint32_t*)&rpDst[((Y)+1) * dstStride] = _mm_cvtsi128_si32(_mm_shuffle_epi32(v_im5, 0x55)); \
+}
+
+ COMP_PRED_PLANAR_2ROW(0)
+ COMP_PRED_PLANAR_2ROW(2)
+
+#undef COMP_PRED_PLANAR_2ROW
+}
+
+#endif // INSTRSET >= 5
+
+#endif /* if HIGH_BIT_DEPTH */
+
+#if HIGH_BIT_DEPTH
+
+#define COMP_PRED_PLANAR_ROW(X) { \
+ v_horPred = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \
+ v_rightColumnN = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn) * v_multi; \
+ v_horPred = v_horPred + v_rightColumnN; \
+ v_topRow = v_topRow + v_bottomRow; \
+ v_im4 = (Vec8s)(v_horPred + v_topRow) >> (3 + shift); \
+ store_partial(const_int(16), &rpDst[X * dstStride], v_im4); \
+}
+
+void predIntraPlanar8(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
+{
+ int k, bottomLeft, topRight;
+
+ int16_t leftColumn[8];
+
+ // Get left and above reference column and row
+ Vec8s v_topRow = (Vec8s)load_partial(const_int(16), &pSrc[-srcStride]); // topRow
+
+ for (k = 0; k < 8; k++)
+ {
+ leftColumn[k] = pSrc[k * srcStride - 1];
+ }
+
+ Vec8s v_leftColumn = (Vec8s)load_partial(const_int(16), leftColumn); // leftColumn
+
+ // Prepare intermediate variables used in interpolation
+ bottomLeft = pSrc[8 * srcStride - 1];
+ topRight = pSrc[8 - srcStride];
+
+ Vec8s v_bottomLeft(bottomLeft);
+ Vec8s v_topRight(topRight);
+
+ Vec8s v_bottomRow = v_bottomLeft - v_topRow;
+ Vec8s v_rightColumn = v_topRight - v_leftColumn;
+
+ int shift = g_aucConvertToBit[8]; // Using value corresponding to width = 8
+ v_topRow = v_topRow << (2 + shift);
+ v_leftColumn = v_leftColumn << (2 + shift);
+
+ // Generate prediction signal
+ Vec8s v_horPred4 = v_leftColumn + Vec8s(8);
+ const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
+ Vec8s v_horPred, v_rightColumnN;
+ Vec8s v_im4;
+ Vec16uc v_im5;
+
+ COMP_PRED_PLANAR_ROW(0); // row 0
+ COMP_PRED_PLANAR_ROW(1);
+ COMP_PRED_PLANAR_ROW(2);
+ COMP_PRED_PLANAR_ROW(3);
+ COMP_PRED_PLANAR_ROW(4);
+ COMP_PRED_PLANAR_ROW(5);
+ COMP_PRED_PLANAR_ROW(6);
+ COMP_PRED_PLANAR_ROW(7); // row 7
+}
+
+#undef COMP_PRED_PLANAR_ROW
+#else /* if HIGH_BIT_DEPTH */
+
+#define COMP_PRED_PLANAR_ROW(X) { \
+ v_horPred = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \
+ v_rightColumnN = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn) * v_multi; \
+ v_horPred = v_horPred + v_rightColumnN; \
+ v_topRow = v_topRow + v_bottomRow; \
+ v_im4 = (Vec8s)(v_horPred + v_topRow) >> (3 + shift); \
+ v_im5 = compress(v_im4, v_im4); \
+ store_partial(const_int(8), &rpDst[X * dstStride], v_im5); \
+}
+
+void predIntraPlanar8(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
+{
+ int k;
+ pixel bottomLeft, topRight;
+ int16_t leftColumn[8];
+
+ // Get left and above reference column and row
+ Vec16uc im0 = (Vec16uc)load_partial(const_int(8), &pSrc[-srcStride]); // topRow
+ Vec8s v_topRow = extend_low(im0);
+
+ for (k = 0; k < 8; k++)
+ {
+ leftColumn[k] = pSrc[k * srcStride - 1];
+ }
+
+ Vec8s v_leftColumn;
+ v_leftColumn.load(leftColumn); // leftColumn
+
+ // Prepare intermediate variables used in interpolation
+ bottomLeft = pSrc[8 * srcStride - 1];
+ topRight = pSrc[8 - srcStride];
+
+ Vec8s v_bottomLeft(bottomLeft);
+ Vec8s v_topRight(topRight);
+
+ Vec8s v_bottomRow = v_bottomLeft - v_topRow;
+ Vec8s v_rightColumn = v_topRight - v_leftColumn;
+
+ int shift = g_aucConvertToBit[8]; // Using value corresponding to width = 8
+ v_topRow = v_topRow << (2 + shift);
+ v_leftColumn = v_leftColumn << (2 + shift);
+
+ Vec8s v_horPred4 = v_leftColumn + Vec8s(8);
+ const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
+ Vec8s v_horPred, v_rightColumnN;
+ Vec8s v_im4;
+ Vec16uc v_im5;
+
+ COMP_PRED_PLANAR_ROW(0); // row 0
+ COMP_PRED_PLANAR_ROW(1);
+ COMP_PRED_PLANAR_ROW(2);
+ COMP_PRED_PLANAR_ROW(3);
+ COMP_PRED_PLANAR_ROW(4);
+ COMP_PRED_PLANAR_ROW(5);
+ COMP_PRED_PLANAR_ROW(6);
+ COMP_PRED_PLANAR_ROW(7); // row 7
+}
+
+#undef COMP_PRED_PLANAR_ROW
+
+#if INSTRSET >= 5
+void predIntraPlanar8_sse4(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride)
+{
+ pixel bottomLeft, topRight;
+
+ // Get left and above reference column and row
+ __m128i im0 = _mm_loadl_epi64((__m128i*)&pSrc[0 - srcStride]); // topRow
+ __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
+
+ __m128i v_leftColumn = _mm_setzero_si128();
+
+ v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[0 * srcStride - 1], 0);
+ v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[1 * srcStride - 1], 1);
+ v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[2 * srcStride - 1], 2);
+ v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[3 * srcStride - 1], 3);
+ v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[4 * srcStride - 1], 4);
+ v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[5 * srcStride - 1], 5);
+ v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[6 * srcStride - 1], 6);
+ v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[7 * srcStride - 1], 7);
+ v_leftColumn = _mm_unpacklo_epi8(v_leftColumn, _mm_setzero_si128());
+
+ // Prepare intermediate variables used in interpolation
+ bottomLeft = pSrc[8 * srcStride - 1];
+ topRight = pSrc[8 - srcStride];
+
+ __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
+ __m128i v_topRight = _mm_set1_epi16(topRight);
+
+ __m128i v_bottomRow = _mm_sub_epi16(v_bottomLeft, v_topRow);
+ __m128i v_rightColumn = _mm_sub_epi16(v_topRight, v_leftColumn);
+
+ v_topRow = _mm_slli_epi16(v_topRow, 3);
+ v_leftColumn = _mm_slli_epi16(v_leftColumn, 3);
+
+ __m128i v_horPred4 = _mm_add_epi16(v_leftColumn, _mm_set1_epi16(8));
+ __m128i v_horPred, v_rightColumnN;
+ __m128i v_im4;
+ __m128i v_im5;
+
+#define COMP_PRED_PLANAR_ROW(Y) { \
+ if ((Y) < 4) { \
+ v_horPred = _mm_shufflelo_epi16(v_horPred4, ((Y) & 3) * 0x55); \
+ v_horPred = _mm_unpacklo_epi64(v_horPred, v_horPred); \
+ v_rightColumnN = _mm_shufflelo_epi16(v_rightColumn, ((Y) & 3) * 0x55); \
+ v_rightColumnN = _mm_unpacklo_epi64(v_rightColumnN, v_rightColumnN); \
+ } \
+ else { \
+ v_horPred = _mm_shufflehi_epi16(v_horPred4, ((Y) & 3) * 0x55); \
+ v_horPred = _mm_unpackhi_epi64(v_horPred, v_horPred); \
+ v_rightColumnN = _mm_shufflehi_epi16(v_rightColumn, ((Y) & 3) * 0x55); \
+ v_rightColumnN = _mm_unpackhi_epi64(v_rightColumnN, v_rightColumnN); \
+ } \
+ v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multiL); \
+ v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \
+ v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \
+ v_im4 = _mm_srai_epi16(_mm_add_epi16(v_horPred, v_topRow), 4); \
+ v_im5 = _mm_packus_epi16(v_im4, v_im4); \
+ _mm_storel_epi64((__m128i*)&pDst[(Y)*dstStride], v_im5); \
+}
+
+ COMP_PRED_PLANAR_ROW(0)
+ COMP_PRED_PLANAR_ROW(1)
+ COMP_PRED_PLANAR_ROW(2)
+ COMP_PRED_PLANAR_ROW(3)
+ COMP_PRED_PLANAR_ROW(4)
+ COMP_PRED_PLANAR_ROW(5)
+ COMP_PRED_PLANAR_ROW(6)
+ COMP_PRED_PLANAR_ROW(7)
+
+#undef COMP_PRED_PLANAR_ROW
+}
+
+#endif // INSTRSET >= 5
+
+#endif /* if HIGH_BIT_DEPTH */
+
+#if HIGH_BIT_DEPTH
+#define COMP_PRED_PLANAR_ROW(X) { \
+ v_horPred_lo = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \
+ v_horPred_hi = v_horPred_lo; \
+ v_rightColumnN_lo = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn); \
+ v_rightColumnN_hi = v_rightColumnN_lo; \
+ v_rightColumnN_lo *= v_multi_lo; \
+ v_rightColumnN_hi *= v_multi_hi; \
+ v_horPred_lo = v_horPred_lo + v_rightColumnN_lo; \
+ v_horPred_hi = v_horPred_hi + v_rightColumnN_hi; \
+ v_topRow_lo = v_topRow_lo + v_bottomRow_lo; \
+ v_topRow_hi = v_topRow_hi + v_bottomRow_hi; \
+ v_im4_lo = (Vec8s)(v_horPred_lo + v_topRow_lo) >> (3 + shift); \
+ v_im4_hi = (Vec8s)(v_horPred_hi + v_topRow_hi) >> (3 + shift); \
+ v_im4_lo.store(&rpDst[X * dstStride]); \
+ v_im4_hi.store(&rpDst[X * dstStride + 8]); \
+}
+
+void predIntraPlanar16(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
+{
+ int k;
+ pixel bottomLeft, topRight;
+ int16_t leftColumn[16];
+
+ // Get left and above reference column and row
+ Vec8s v_topRow_lo, v_topRow_hi;
+
+ v_topRow_lo.load(&pSrc[-srcStride]);
+ v_topRow_hi.load(&pSrc[-srcStride + 8]);
+
+ for (k = 0; k < 16; k++)
+ {
+ leftColumn[k] = pSrc[k * srcStride - 1];
+ }
+
+ Vec8s v_leftColumn;
+ v_leftColumn.load(leftColumn); // leftColumn
+
+ // Prepare intermediate variables used in interpolation
+ bottomLeft = pSrc[16 * srcStride - 1];
+ topRight = pSrc[16 - srcStride];
+
+ Vec8s v_bottomLeft(bottomLeft);
+ Vec8s v_topRight(topRight);
+
+ Vec8s v_bottomRow_lo = v_bottomLeft - v_topRow_lo;
+ Vec8s v_bottomRow_hi = v_bottomLeft - v_topRow_hi;
+ Vec8s v_rightColumn = v_topRight - v_leftColumn;
+
+    int shift = g_aucConvertToBit[16]; // Using value corresponding to width = 16
+ v_topRow_lo = v_topRow_lo << (2 + shift);
+ v_topRow_hi = v_topRow_hi << (2 + shift);
+ v_leftColumn = v_leftColumn << (2 + shift);
+
+ Vec8s v_horPred4 = v_leftColumn + Vec8s(16);
+ const Vec8s v_multi_lo(1, 2, 3, 4, 5, 6, 7, 8);
+ const Vec8s v_multi_hi(9, 10, 11, 12, 13, 14, 15, 16);
+ Vec8s v_horPred_lo, v_horPred_hi, v_rightColumnN_lo, v_rightColumnN_hi;
+ Vec8s v_im4_lo, v_im4_hi;
+ Vec16uc v_im5;
+
+ COMP_PRED_PLANAR_ROW(0); // row 0
+ COMP_PRED_PLANAR_ROW(1);
+ COMP_PRED_PLANAR_ROW(2);
+ COMP_PRED_PLANAR_ROW(3);
+ COMP_PRED_PLANAR_ROW(4);
+ COMP_PRED_PLANAR_ROW(5);
+ COMP_PRED_PLANAR_ROW(6);
+ COMP_PRED_PLANAR_ROW(7); // row 7
+
+ v_leftColumn.load(leftColumn + 8); // leftColumn lower 8 rows
+ v_rightColumn = v_topRight - v_leftColumn;
+ v_leftColumn = v_leftColumn << (2 + shift);
+ v_horPred4 = v_leftColumn + Vec8s(16);
+
+    COMP_PRED_PLANAR_ROW(8); // row 8
+ COMP_PRED_PLANAR_ROW(9);
+ COMP_PRED_PLANAR_ROW(10);
+ COMP_PRED_PLANAR_ROW(11);
+ COMP_PRED_PLANAR_ROW(12);
+ COMP_PRED_PLANAR_ROW(13);
+ COMP_PRED_PLANAR_ROW(14);
+ COMP_PRED_PLANAR_ROW(15);
+}
+
+#undef COMP_PRED_PLANAR_ROW
+
+#else /* if HIGH_BIT_DEPTH */
+#define COMP_PRED_PLANAR_ROW(X) { \
+ v_horPred_lo = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \
+ v_horPred_hi = v_horPred_lo; \
+ v_rightColumnN_lo = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn); \
+ v_rightColumnN_hi = v_rightColumnN_lo; \
+ v_rightColumnN_lo *= v_multi_lo; \
+ v_rightColumnN_hi *= v_multi_hi; \
+ v_horPred_lo = v_horPred_lo + v_rightColumnN_lo; \
+ v_horPred_hi = v_horPred_hi + v_rightColumnN_hi; \
+ v_topRow_lo = v_topRow_lo + v_bottomRow_lo; \
+ v_topRow_hi = v_topRow_hi + v_bottomRow_hi; \
+ v_im4_lo = (Vec8s)(v_horPred_lo + v_topRow_lo) >> (3 + shift); \
+ v_im4_hi = (Vec8s)(v_horPred_hi + v_topRow_hi) >> (3 + shift); \
+ v_im5 = compress(v_im4_lo, v_im4_hi); \
+ store_partial(const_int(16), &rpDst[X * dstStride], v_im5); \
+}
+
+void predIntraPlanar16(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
+{
+ int k;
+ pixel bottomLeft, topRight;
+ int16_t leftColumn[16];
+
+ // Get left and above reference column and row
+ Vec16uc im0 = (Vec16uc)load_partial(const_int(16), &pSrc[-srcStride]); // topRow
+ Vec8s v_topRow_lo = extend_low(im0);
+ Vec8s v_topRow_hi = extend_high(im0);
+
+ for (k = 0; k < 16; k++)
+ {
+ leftColumn[k] = pSrc[k * srcStride - 1];
+ }
+
+ Vec8s v_leftColumn;
+ v_leftColumn.load(leftColumn); // leftColumn
+
+ // Prepare intermediate variables used in interpolation
+ bottomLeft = pSrc[16 * srcStride - 1];
+ topRight = pSrc[16 - srcStride];
+
+ Vec8s v_bottomLeft(bottomLeft);
+ Vec8s v_topRight(topRight);
+
+ Vec8s v_bottomRow_lo = v_bottomLeft - v_topRow_lo;
+ Vec8s v_bottomRow_hi = v_bottomLeft - v_topRow_hi;
+ Vec8s v_rightColumn = v_topRight - v_leftColumn;
+
+    int shift = g_aucConvertToBit[16]; // Using value corresponding to width = 16
+ v_topRow_lo = v_topRow_lo << (2 + shift);
+ v_topRow_hi = v_topRow_hi << (2 + shift);
+ v_leftColumn = v_leftColumn << (2 + shift);
+
+ Vec8s v_horPred4 = v_leftColumn + Vec8s(16);
+ const Vec8s v_multi_lo(1, 2, 3, 4, 5, 6, 7, 8);
+ const Vec8s v_multi_hi(9, 10, 11, 12, 13, 14, 15, 16);
+ Vec8s v_horPred_lo, v_horPred_hi, v_rightColumnN_lo, v_rightColumnN_hi;
+ Vec8s v_im4_lo, v_im4_hi;
+ Vec16uc v_im5;
+
+ COMP_PRED_PLANAR_ROW(0); // row 0
+ COMP_PRED_PLANAR_ROW(1);
+ COMP_PRED_PLANAR_ROW(2);
+ COMP_PRED_PLANAR_ROW(3);
+ COMP_PRED_PLANAR_ROW(4);
+ COMP_PRED_PLANAR_ROW(5);
+ COMP_PRED_PLANAR_ROW(6);
+ COMP_PRED_PLANAR_ROW(7); // row 7
+
+ v_leftColumn.load(leftColumn + 8); // leftColumn lower 8 rows
+ v_rightColumn = v_topRight - v_leftColumn;
+ v_leftColumn = v_leftColumn << (2 + shift);
+ v_horPred4 = v_leftColumn + Vec8s(16);
+
+    COMP_PRED_PLANAR_ROW(8); // row 8
+ COMP_PRED_PLANAR_ROW(9);
+ COMP_PRED_PLANAR_ROW(10);
+ COMP_PRED_PLANAR_ROW(11);
+ COMP_PRED_PLANAR_ROW(12);
+ COMP_PRED_PLANAR_ROW(13);
+ COMP_PRED_PLANAR_ROW(14);
+ COMP_PRED_PLANAR_ROW(15);
+}
+
+#undef COMP_PRED_PLANAR_ROW
+
+#if INSTRSET >= 5
+void predIntraPlanar16_sse4(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride)
+{
+ pixel bottomLeft, topRight;
+ __m128i v_topRow[2];
+ __m128i v_bottomRow[2];
+
+ // Get left and above reference column and row
+ __m128i im0 = _mm_loadu_si128((__m128i*)&pSrc[0 - srcStride]); // topRow
+
+ v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
+ v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());
+
+ // Prepare intermediate variables used in interpolation
+ bottomLeft = pSrc[16 * srcStride - 1];
+ topRight = pSrc[16 - srcStride];
+
+ __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
+
+ v_bottomRow[0] = _mm_sub_epi16(v_bottomLeft, v_topRow[0]);
+ v_bottomRow[1] = _mm_sub_epi16(v_bottomLeft, v_topRow[1]);
+
+ v_topRow[0] = _mm_slli_epi16(v_topRow[0], 4);
+ v_topRow[1] = _mm_slli_epi16(v_topRow[1], 4);
+
+ __m128i v_horPred, v_horPredN[2], v_rightColumnN[2];
+ __m128i v_im4L, v_im4H;
+ __m128i v_im5;
+
+#define COMP_PRED_PLANAR_ROW(Y) { \
+ v_horPred = _mm_cvtsi32_si128((pSrc[(Y)*srcStride - 1] << 4) + 16); \
+ v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \
+ v_horPred = _mm_shuffle_epi32(v_horPred, 0); \
+ __m128i _tmp = _mm_cvtsi32_si128(topRight - pSrc[(Y)*srcStride - 1]); \
+ _tmp = _mm_shufflelo_epi16(_tmp, 0); \
+ _tmp = _mm_shuffle_epi32(_tmp, 0); \
+ v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \
+ v_rightColumnN[1] = _mm_mullo_epi16(_tmp, v_multiH); \
+ v_horPredN[0] = _mm_add_epi16(v_horPred, v_rightColumnN[0]); \
+ v_horPredN[1] = _mm_add_epi16(v_horPred, v_rightColumnN[1]); \
+ v_topRow[0] = _mm_add_epi16(v_topRow[0], v_bottomRow[0]); \
+ v_topRow[1] = _mm_add_epi16(v_topRow[1], v_bottomRow[1]); \
+ v_im4L = _mm_srai_epi16(_mm_add_epi16(v_horPredN[0], v_topRow[0]), 5); \
+ v_im4H = _mm_srai_epi16(_mm_add_epi16(v_horPredN[1], v_topRow[1]), 5); \
+ v_im5 = _mm_packus_epi16(v_im4L, v_im4H); \
+ _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride], v_im5); \
+}
+
+ COMP_PRED_PLANAR_ROW(0)
+ COMP_PRED_PLANAR_ROW(1)
+ COMP_PRED_PLANAR_ROW(2)
+ COMP_PRED_PLANAR_ROW(3)
+ COMP_PRED_PLANAR_ROW(4)
+ COMP_PRED_PLANAR_ROW(5)
+ COMP_PRED_PLANAR_ROW(6)
+ COMP_PRED_PLANAR_ROW(7)
+ COMP_PRED_PLANAR_ROW(8)
+ COMP_PRED_PLANAR_ROW(9)
+ COMP_PRED_PLANAR_ROW(10)
+ COMP_PRED_PLANAR_ROW(11)
+ COMP_PRED_PLANAR_ROW(12)
+ COMP_PRED_PLANAR_ROW(13)
+ COMP_PRED_PLANAR_ROW(14)
+ COMP_PRED_PLANAR_ROW(15)
+
+#undef COMP_PRED_PLANAR_ROW
+}
+
+#endif // INSTRSET >= 5
+
+#if INSTRSET >= 5
+void predIntraPlanar32_sse4(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride)
+{
+ pixel bottomLeft, topRight;
+ __m128i v_topRow[4];
+ __m128i v_bottomRow[4];
+
+ // Get left and above reference column and row
+ __m128i im0 = _mm_loadu_si128((__m128i*)&pSrc[0 - srcStride]); // topRow
+ __m128i im1 = _mm_loadu_si128((__m128i*)&pSrc[16 - srcStride]); // topRow
+
+ v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
+ v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());
+ v_topRow[2] = _mm_unpacklo_epi8(im1, _mm_setzero_si128());
+ v_topRow[3] = _mm_unpackhi_epi8(im1, _mm_setzero_si128());
+
+ // Prepare intermediate variables used in interpolation
+ bottomLeft = pSrc[32 * srcStride - 1];
+ topRight = pSrc[32 - srcStride];
+
+ __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
+
+ v_bottomRow[0] = _mm_sub_epi16(v_bottomLeft, v_topRow[0]);
+ v_bottomRow[1] = _mm_sub_epi16(v_bottomLeft, v_topRow[1]);
+ v_bottomRow[2] = _mm_sub_epi16(v_bottomLeft, v_topRow[2]);
+ v_bottomRow[3] = _mm_sub_epi16(v_bottomLeft, v_topRow[3]);
+
+ v_topRow[0] = _mm_slli_epi16(v_topRow[0], 5);
+ v_topRow[1] = _mm_slli_epi16(v_topRow[1], 5);
+ v_topRow[2] = _mm_slli_epi16(v_topRow[2], 5);
+ v_topRow[3] = _mm_slli_epi16(v_topRow[3], 5);
+
+ __m128i v_horPred, v_horPredN[4], v_rightColumnN[4];
+ __m128i v_im4[4];
+ __m128i v_im5[2];
+
+#define COMP_PRED_PLANAR_ROW(Y) { \
+ v_horPred = _mm_cvtsi32_si128((pSrc[(Y)*srcStride - 1] << 5) + 32); \
+ v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \
+ v_horPred = _mm_shuffle_epi32(v_horPred, 0); \
+ __m128i _tmp = _mm_cvtsi32_si128(topRight - pSrc[(Y)*srcStride - 1]); \
+ _tmp = _mm_shufflelo_epi16(_tmp, 0); \
+ _tmp = _mm_shuffle_epi32(_tmp, 0); \
+ v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \
+ v_rightColumnN[1] = _mm_mullo_epi16(_tmp, v_multiH); \
+ v_rightColumnN[2] = _mm_mullo_epi16(_tmp, v_multiH2); \
+ v_rightColumnN[3] = _mm_mullo_epi16(_tmp, v_multiH3); \
+ v_horPredN[0] = _mm_add_epi16(v_horPred, v_rightColumnN[0]); \
+ v_horPredN[1] = _mm_add_epi16(v_horPred, v_rightColumnN[1]); \
+ v_horPredN[2] = _mm_add_epi16(v_horPred, v_rightColumnN[2]); \
+ v_horPredN[3] = _mm_add_epi16(v_horPred, v_rightColumnN[3]); \
+ v_topRow[0] = _mm_add_epi16(v_topRow[0], v_bottomRow[0]); \
+ v_topRow[1] = _mm_add_epi16(v_topRow[1], v_bottomRow[1]); \
+ v_topRow[2] = _mm_add_epi16(v_topRow[2], v_bottomRow[2]); \
+ v_topRow[3] = _mm_add_epi16(v_topRow[3], v_bottomRow[3]); \
+ v_im4[0] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[0], v_topRow[0]), 6); \
+ v_im4[1] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[1], v_topRow[1]), 6); \
+ v_im4[2] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[2], v_topRow[2]), 6); \
+ v_im4[3] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[3], v_topRow[3]), 6); \
+ v_im5[0] = _mm_packus_epi16(v_im4[0], v_im4[1]); \
+ v_im5[1] = _mm_packus_epi16(v_im4[2], v_im4[3]); \
+ _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride], v_im5[0]); \
+ _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride + 16], v_im5[1]); \
+}
+
+ int i;
+ for (i = 0; i < 32; i += 2)
+ {
+ COMP_PRED_PLANAR_ROW(i + 0);
+ COMP_PRED_PLANAR_ROW(i + 1);
+ }
+
+#undef COMP_PRED_PLANAR_ROW
+}
+
+#endif // INSTRSET >= 5
+
+#if INSTRSET >= 5
+void predIntraPlanar64_sse4(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride)
+{
+ pixel bottomLeft, topRight;
+ __m128i v_topRow[8];
+ __m128i v_bottomRow[8];
+
+ // Get left and above reference column and row
+ __m128i im0 = _mm_loadu_si128((__m128i*)&pSrc[0 - srcStride]); // topRow
+ __m128i im1 = _mm_loadu_si128((__m128i*)&pSrc[16 - srcStride]); // topRow
+ __m128i im2 = _mm_loadu_si128((__m128i*)&pSrc[32 - srcStride]); // topRow
+ __m128i im3 = _mm_loadu_si128((__m128i*)&pSrc[48 - srcStride]); // topRow
+
+ v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
+ v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());
+ v_topRow[2] = _mm_unpacklo_epi8(im1, _mm_setzero_si128());
+ v_topRow[3] = _mm_unpackhi_epi8(im1, _mm_setzero_si128());
+ v_topRow[4] = _mm_unpacklo_epi8(im2, _mm_setzero_si128());
+ v_topRow[5] = _mm_unpackhi_epi8(im2, _mm_setzero_si128());
+ v_topRow[6] = _mm_unpacklo_epi8(im3, _mm_setzero_si128());
+ v_topRow[7] = _mm_unpackhi_epi8(im3, _mm_setzero_si128());
+
+ // Prepare intermediate variables used in interpolation
+ bottomLeft = pSrc[64 * srcStride - 1];
+ topRight = pSrc[64 - srcStride];
+
+ __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
+
+ v_bottomRow[0] = _mm_sub_epi16(v_bottomLeft, v_topRow[0]);
+ v_bottomRow[1] = _mm_sub_epi16(v_bottomLeft, v_topRow[1]);
+ v_bottomRow[2] = _mm_sub_epi16(v_bottomLeft, v_topRow[2]);
+ v_bottomRow[3] = _mm_sub_epi16(v_bottomLeft, v_topRow[3]);
+ v_bottomRow[4] = _mm_sub_epi16(v_bottomLeft, v_topRow[4]);
+ v_bottomRow[5] = _mm_sub_epi16(v_bottomLeft, v_topRow[5]);
+ v_bottomRow[6] = _mm_sub_epi16(v_bottomLeft, v_topRow[6]);
+ v_bottomRow[7] = _mm_sub_epi16(v_bottomLeft, v_topRow[7]);
+
+ v_topRow[0] = _mm_slli_epi16(v_topRow[0], 6);
+ v_topRow[1] = _mm_slli_epi16(v_topRow[1], 6);
+ v_topRow[2] = _mm_slli_epi16(v_topRow[2], 6);
+ v_topRow[3] = _mm_slli_epi16(v_topRow[3], 6);
+ v_topRow[4] = _mm_slli_epi16(v_topRow[4], 6);
+ v_topRow[5] = _mm_slli_epi16(v_topRow[5], 6);
+ v_topRow[6] = _mm_slli_epi16(v_topRow[6], 6);
+ v_topRow[7] = _mm_slli_epi16(v_topRow[7], 6);
+
+ __m128i v_horPred, v_horPredN[8], v_rightColumnN[8];
+ __m128i v_im4[8];
+ __m128i v_im5[4];
+
+#define COMP_PRED_PLANAR_ROW(Y) { \
+ v_horPred = _mm_cvtsi32_si128((pSrc[(Y)*srcStride - 1] << 6) + 64); \
+ v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \
+ v_horPred = _mm_shuffle_epi32(v_horPred, 0); \
+ __m128i _tmp = _mm_cvtsi32_si128(topRight - pSrc[(Y)*srcStride - 1]); \
+ _tmp = _mm_shufflelo_epi16(_tmp, 0); \
+ _tmp = _mm_shuffle_epi32(_tmp, 0); \
+ v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \
+ v_rightColumnN[1] = _mm_mullo_epi16(_tmp, v_multiH); \
+ v_rightColumnN[2] = _mm_mullo_epi16(_tmp, v_multiH2); \
+ v_rightColumnN[3] = _mm_mullo_epi16(_tmp, v_multiH3); \
+ v_rightColumnN[4] = _mm_mullo_epi16(_tmp, v_multiH4); \
+ v_rightColumnN[5] = _mm_mullo_epi16(_tmp, v_multiH5); \
+ v_rightColumnN[6] = _mm_mullo_epi16(_tmp, v_multiH6); \
+ v_rightColumnN[7] = _mm_mullo_epi16(_tmp, v_multiH7); \
+ v_horPredN[0] = _mm_add_epi16(v_horPred, v_rightColumnN[0]); \
+ v_horPredN[1] = _mm_add_epi16(v_horPred, v_rightColumnN[1]); \
+ v_horPredN[2] = _mm_add_epi16(v_horPred, v_rightColumnN[2]); \
+ v_horPredN[3] = _mm_add_epi16(v_horPred, v_rightColumnN[3]); \
+ v_horPredN[4] = _mm_add_epi16(v_horPred, v_rightColumnN[4]); \
+ v_horPredN[5] = _mm_add_epi16(v_horPred, v_rightColumnN[5]); \
+ v_horPredN[6] = _mm_add_epi16(v_horPred, v_rightColumnN[6]); \
+ v_horPredN[7] = _mm_add_epi16(v_horPred, v_rightColumnN[7]); \
+ v_topRow[0] = _mm_add_epi16(v_topRow[0], v_bottomRow[0]); \
+ v_topRow[1] = _mm_add_epi16(v_topRow[1], v_bottomRow[1]); \
+ v_topRow[2] = _mm_add_epi16(v_topRow[2], v_bottomRow[2]); \
+ v_topRow[3] = _mm_add_epi16(v_topRow[3], v_bottomRow[3]); \
+ v_topRow[4] = _mm_add_epi16(v_topRow[4], v_bottomRow[4]); \
+ v_topRow[5] = _mm_add_epi16(v_topRow[5], v_bottomRow[5]); \
+ v_topRow[6] = _mm_add_epi16(v_topRow[6], v_bottomRow[6]); \
+ v_topRow[7] = _mm_add_epi16(v_topRow[7], v_bottomRow[7]); \
+ v_im4[0] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[0], v_topRow[0]), 7); \
+ v_im4[1] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[1], v_topRow[1]), 7); \
+ v_im4[2] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[2], v_topRow[2]), 7); \
+ v_im4[3] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[3], v_topRow[3]), 7); \
+ v_im4[4] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[4], v_topRow[4]), 7); \
+ v_im4[5] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[5], v_topRow[5]), 7); \
+ v_im4[6] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[6], v_topRow[6]), 7); \
+ v_im4[7] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[7], v_topRow[7]), 7); \
+ v_im5[0] = _mm_packus_epi16(v_im4[0], v_im4[1]); \
+ v_im5[1] = _mm_packus_epi16(v_im4[2], v_im4[3]); \
+ v_im5[2] = _mm_packus_epi16(v_im4[4], v_im4[5]); \
+ v_im5[3] = _mm_packus_epi16(v_im4[6], v_im4[7]); \
+ _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride], v_im5[0]); \
+ _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride + 16], v_im5[1]); \
+ _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride + 32], v_im5[2]); \
+ _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride + 48], v_im5[3]); \
+}
+
+ int i;
+ for (i = 0; i < 64; i++)
+ {
+        COMP_PRED_PLANAR_ROW(i);
+ }
+
+#undef COMP_PRED_PLANAR_ROW
+}
+
+#endif // INSTRSET >= 5
+
+#endif /* if HIGH_BIT_DEPTH */
+
+typedef void predIntraPlanar_t (pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride);
+predIntraPlanar_t *intraPlanarN[] =
+{
+#if !HIGH_BIT_DEPTH && INSTRSET >= 5
+ predIntraPlanar4_sse4,
+ predIntraPlanar8_sse4,
+ predIntraPlanar16_sse4,
+ predIntraPlanar32_sse4,
+ predIntraPlanar64_sse4,
+#else
+ predIntraPlanar4,
+ predIntraPlanar8,
+ predIntraPlanar16,
+#endif
+};
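+
+// intraPlanarN[] is indexed by log2(width) - 2: entry 0 serves 4x4 and, in the
+// 8bpp SSE4 build, entry 4 serves 64x64. The C fallback table stops at 16x16,
+// so predIntraPlanar() drops to the scalar path for 32x32 and 64x64.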
+
+void predIntraPlanar(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride, int width)
+{
+ //assert(width == height);
+
+ int nLog2Size = g_aucConvertToBit[width] + 2;
+
+#if (!HIGH_BIT_DEPTH) && (INSTRSET >= 5)
+ intraPlanarN[nLog2Size - 2](pSrc, srcStride, rpDst, dstStride);
+ return;
+#else
+ int k, l, bottomLeft, topRight;
+ int horPred;
+    // OPT_ME: when width is 64, shift1D is 8, so the dynamic range is [-65280, 65280] and we have to use 32 bits here
+ int32_t leftColumn[MAX_CU_SIZE + 1], topRow[MAX_CU_SIZE + 1];
+    // CHECK_ME: dynamic range is 9 or 15 bits (assuming a maximum input bit depth of 14 bits)
+ int16_t bottomRow[MAX_CU_SIZE], rightColumn[MAX_CU_SIZE];
+ int blkSize = width;
+ int offset2D = width;
+ int shift1D = nLog2Size;
+ int shift2D = shift1D + 1;
+
+ if (width < 32)
+ {
+ intraPlanarN[nLog2Size - 2](pSrc, srcStride, rpDst, dstStride);
+ return;
+ }
+
+ // Get left and above reference column and row
+ for (k = 0; k < blkSize + 1; k++)
+ {
+ topRow[k] = pSrc[k - srcStride];
+ leftColumn[k] = pSrc[k * srcStride - 1];
+ }
+
+ // Prepare intermediate variables used in interpolation
+ bottomLeft = leftColumn[blkSize];
+ topRight = topRow[blkSize];
+ for (k = 0; k < blkSize; k++)
+ {
+ bottomRow[k] = bottomLeft - topRow[k];
+ rightColumn[k] = topRight - leftColumn[k];
+ topRow[k] <<= shift1D;
+ leftColumn[k] <<= shift1D;
+ }
+
+ // Generate prediction signal
+ for (k = 0; k < blkSize; k++)
+ {
+ horPred = leftColumn[k] + offset2D;
+ for (l = 0; l < blkSize; l++)
+ {
+ horPred += rightColumn[k];
+ topRow[l] += bottomRow[l];
+ rpDst[k * dstStride + l] = ((horPred + topRow[l]) >> shift2D);
+ }
+ }
+
+#endif /* if (!HIGH_BIT_DEPTH) && (INSTRSET >= 5) */
+}
+
+#if HIGH_BIT_DEPTH
+void xPredIntraAng4x4(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove)
+{
+ int blkSize = width;
+
+ // Map the mode index to main prediction direction and angle
+ assert(dirMode > 1); //no planar and dc
+ bool modeHor = (dirMode < 18);
+ bool modeVer = !modeHor;
+ int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+ int lookIdx = intraPredAngle;
+ int absAng = abs(intraPredAngle);
+ int signAng = intraPredAngle < 0 ? -1 : 1;
+
+ // Set bitshifts and scale the angle parameter to block size
+ int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+ int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
+ int invAngle = invAngTable[absAng];
+ absAng = angTable[absAng];
+ intraPredAngle = signAng * absAng;
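+
+    // Example: dirMode 30 falls in the vertical family, so the raw index is
+    // 30 - VER_IDX = 4 and the working angle is angTable[4] = 13; each successive
+    // row then advances 13/32 of a sample along the main reference.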
+
+ // Do angular predictions
+
+ pixel* refMain;
+ pixel* refSide;
+
+ // Initialise the Main and Left reference array.
+ if (intraPredAngle < 0)
+ {
+ refMain = (modeVer ? refAbove : refLeft); // + (blkSize - 1);
+ refSide = (modeVer ? refLeft : refAbove); // + (blkSize - 1);
+
+ // Extend the Main reference to the left.
+ int invAngleSum = 128; // rounding for (shift by 8)
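+
+        // refMain[-1], refMain[-2], ... are projected from the side reference as
+        // refSide[(128 + n * invAngle) >> 8], using the inverse-angle table in
+        // place of a division.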
+ for (int k = -1; k > blkSize * intraPredAngle >> 5; k--)
+ {
+ invAngleSum += invAngle;
+ refMain[k] = refSide[invAngleSum >> 8];
+ }
+ }
+ else
+ {
+ refMain = modeVer ? refAbove : refLeft;
+ refSide = modeVer ? refLeft : refAbove;
+ }
+
+ // bfilter will always be true for blocksize 4
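+    // For angle 0 the block is a straight copy of refMain[1..4]; bFilter then
+    // nudges the first row/column by (refSide[k + 1] - refSide[0]) >> 1, clipped
+    // to the valid pixel range.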
+    if (intraPredAngle == 0) // Exactly horizontal/vertical angles
+ {
+ if (modeHor)
+ {
+ Vec8s v_temp;
+ Vec8s v_side_0; // refSide[0] value in a vector
+ v_temp.load((void*)refSide);
+ v_side_0 = broadcast(const_int(0), (Vec8s)v_temp);
+
+ Vec8s v_side;
+ v_side.load(refSide + 1);
+
+ Vec8s v_main;
+ v_main = load_partial(const_int(8), (void*)(refMain + 1));
+
+ Vec8s tmp1, tmp2;
+ tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(v_main, v_main);
+ tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp1);
+ tmp1 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp1);
+
+ Vec8s row0;
+ v_side -= v_side_0;
+ v_side = v_side >> 1;
+ row0 = tmp2 + v_side;
+ row0 = min(max(0, row0), (1 << bitDepth) - 1);
+
+ store_partial(const_int(8), pDst, row0); //row0
+ store_partial(const_int(8), pDst + (2 * dstStride), tmp1); //row2
+
+ tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp2, tmp2);
+ tmp1 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp1);
+
+ store_partial(const_int(8), pDst + (3 * dstStride), tmp1); //row3
+ store_partial(const_int(8), pDst + (dstStride), tmp2); //row1
+ }
+ else
+ {
+ Vec16uc v_main;
+ v_main = load_partial(const_int(8), refMain + 1);
+ store_partial(const_int(8), pDst, v_main);
+ store_partial(const_int(8), pDst + dstStride, v_main);
+ store_partial(const_int(8), pDst + (2 * dstStride), v_main);
+ store_partial(const_int(8), pDst + (3 * dstStride), v_main);
+
+ for (int k = 0; k < 4; k++)
+ {
+ pDst[k * dstStride] = (pixel)Clip3((short)0, (short)((1 << bitDepth) - 1), static_cast<short>((pDst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1)));
+ }
+ }
+ }
+ else if (intraPredAngle == -32)
+ {
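+        // angle -32 is the pure 45-degree diagonal: every row is the previous row
+        // shifted one sample along refMain, so no interpolation is needed (the
+        // +32 case below is the mirror image).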
+ Vec8s tmp;
+ tmp = load_partial(const_int(8), refMain); //-1,0,1,2
+ store_partial(const_int(8), pDst, tmp);
+ tmp = load_partial(const_int(8), refMain - 1); //-2,-1,0,1
+ store_partial(const_int(8), pDst + dstStride, tmp);
+ tmp = load_partial(const_int(8), refMain - 2);
+ store_partial(const_int(8), pDst + 2 * dstStride, tmp);
+ tmp = load_partial(const_int(8), refMain - 3);
+ store_partial(const_int(8), pDst + 3 * dstStride, tmp);
+ return;
+ }
+ else if (intraPredAngle == 32)
+ {
+ Vec8s tmp;
+ tmp = load_partial(const_int(8), refMain + 2); //1,2,3,4
+ store_partial(const_int(8), pDst, tmp);
+ tmp = load_partial(const_int(8), refMain + 3); //2,3,4,5
+ store_partial(const_int(8), pDst + dstStride, tmp);
+ tmp = load_partial(const_int(8), refMain + 4);
+ store_partial(const_int(8), pDst + 2 * dstStride, tmp);
+ tmp = load_partial(const_int(8), refMain + 5);
+ store_partial(const_int(8), pDst + 3 * dstStride, tmp);
+ return;
+ }
+ else
+ {
+ Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
+ Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
+
+ row11 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 0));
+ row12 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 0) + 1);
+
+ row21 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 1));
+ row22 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 1) + 1);
+
+ row31 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 2));
+ row32 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 2) + 1);
+
+ row41 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 3));
+ row42 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 3) + 1);
+
+ v_deltaPos = v_ipAngle = intraPredAngle;
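+ // Scalar form of the four rows below: for row y (0..3) and column x,
+ // deltaPos = (y + 1) * intraPredAngle, fract = deltaPos & 31, and
+ // pred[y][x] = ((32 - fract) * refMain[x + (deltaPos >> 5) + 1]
+ //            +  fract       * refMain[x + (deltaPos >> 5) + 2] + 16) >> 5;
+ // the integer part (deltaPos >> 5) is already folded into the row loads
+ // above through GETAP().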
+
+ //row1
+ v_deltaFract = v_deltaPos & thirty1;
+ row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) + 16) >> 5;
+
+ //row2
+ v_deltaPos += v_ipAngle;
+ v_deltaFract = v_deltaPos & thirty1;
+ row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) + 16) >> 5;
+
+ //row3
+ v_deltaPos += v_ipAngle;
+ v_deltaFract = v_deltaPos & thirty1;
+ row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) + 16) >> 5;
+
+ //row4
+ v_deltaPos += v_ipAngle;
+ v_deltaFract = v_deltaPos & thirty1;
+ row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) + 16) >> 5;
+
+ // Flip the block
+
+ if (modeHor)
+ {
+ Vec8s tmp1, tmp2, tmp3, tmp4;
+
+ tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
+ tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
+
+ tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
+ tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
+
+ //tmp16_1 = compress(tmp3, tmp3);
+ store_partial(const_int(8), pDst, tmp3);
+
+ store_partial(const_int(8), pDst + (2 * dstStride), tmp4); //row2
+
+ tmp3 = blend2q<1, 3>((Vec2q)tmp3, (Vec2q)tmp3);
+ tmp4 = blend2q<1, 3>((Vec2q)tmp4, (Vec2q)tmp4);
+
+ store_partial(const_int(8), pDst + (3 * dstStride), tmp4); //row3
+ store_partial(const_int(8), pDst + (dstStride), tmp3); //row1
+ }
+ else
+ {
+ store_partial(const_int(8), pDst, row11);
+ store_partial(const_int(8), pDst + (dstStride), row21);
+ store_partial(const_int(8), pDst + (2 * dstStride), row31);
+ store_partial(const_int(8), pDst + (3 * dstStride), row41);
+ }
+ }
+}
+
+#else /* if HIGH_BIT_DEPTH */
+void xPredIntraAng4x4(int /*bitDepth*/, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove, bool bFilter = true)
+{
+ int blkSize = width;
+
+ // Map the mode index to main prediction direction and angle
+ assert(dirMode > 1); //no planar and dc
+ bool modeHor = (dirMode < 18);
+ bool modeVer = !modeHor;
+ int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+ int absAng = abs(intraPredAngle);
+ int signAng = intraPredAngle < 0 ? -1 : 1;
+
+ // Set bitshifts and scale the angle parameter to block size
+ int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+ int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
+ int invAngle = invAngTable[absAng];
+ absAng = angTable[absAng];
+ intraPredAngle = signAng * absAng;
+
+ // Do angular predictions
+
+ pixel* refMain;
+ pixel* refSide;
+
+ // Initialise the Main and Left reference array.
+ if (intraPredAngle < 0)
+ {
+ refMain = (modeVer ? refAbove : refLeft); // + (blkSize - 1);
+ refSide = (modeVer ? refLeft : refAbove); // + (blkSize - 1);
+
+ // Extend the Main reference to the left.
+ int invAngleSum = 128; // rounding offset for the final >> 8
+ for (int k = -1; k > blkSize * intraPredAngle >> 5; k--)
+ {
+ invAngleSum += invAngle;
+ refMain[k] = refSide[invAngleSum >> 8];
+ }
+ }
+ else
+ {
+ refMain = modeVer ? refAbove : refLeft;
+ refSide = modeVer ? refLeft : refAbove;
+ }
+
+ // bfilter will always be true for exactly vertical/horizontal modes
+ if (intraPredAngle == 0) // Exactly horizontal/vertical angles
+ {
+ if (modeHor)
+ {
+ Vec16uc v_main;
+ v_main = load_partial(const_int(4), (void*)(refMain + 1));
+
+ Vec16uc tmp16;
+ tmp16 = blend16c<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(v_main, v_main);
+ tmp16 = blend16c<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(tmp16, tmp16);
+ Vec2uq tmp;
+
+ if (bFilter)
+ {
+ Vec16uc v_temp;
+ Vec8s v_side_0; // refSide[0] value in a vector
+ v_temp = load_partial(const_int(8), (void*)refSide);
+ v_side_0 = broadcast(const_int(0), (Vec8s)v_temp);
+ v_side_0 = v_side_0 & 0x00FF;
+
+ //shift v_side by 1 element (1 byte)
+ tmp = reinterpret_i(v_temp);
+ tmp = tmp >> 8;
+ v_temp = reinterpret_i(tmp);
+ Vec8s v_side = extend_low(v_temp);
+
+ Vec8s row0 = extend_low(tmp16);
+ v_side -= v_side_0;
+ v_side = v_side >> 1;
+ row0 += v_side;
+ row0 = min(max(0, row0), 255);
+ Vec16uc v_res(compress_unsafe(row0, 0));
+ store_partial(const_int(4), pDst, v_res);
+ }
+ else
+ {
+ store_partial(const_int(4), pDst, tmp16);
+ }
+
+ tmp = (Vec2uq)tmp16;
+ tmp >>= 32;
+ store_partial(const_int(4), pDst + dstStride, tmp);
+
+ tmp = blend2q<1, 3>(reinterpret_i(tmp16), reinterpret_i(tmp16));
+ store_partial(const_int(4), pDst + (2 * dstStride), tmp);
+
+ tmp >>= 32;
+ store_partial(const_int(4), pDst + (3 * dstStride), tmp);
+ }
+ else
+ {
+ Vec16uc v_main;
+ v_main = load_partial(const_int(4), refMain + 1);
+ store_partial(const_int(4), pDst, v_main);
+ store_partial(const_int(4), pDst + dstStride, v_main);
+ store_partial(const_int(4), pDst + (2 * dstStride), v_main);
+ store_partial(const_int(4), pDst + (3 * dstStride), v_main);
+ if (bFilter)
+ {
+ for (int k = 0; k < 4; k++)
+ {
+ pDst[k * dstStride] = (pixel)Clip3((short)0, (short)((1 << 8) - 1), static_cast<short>((pDst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1)));
+ }
+ }
+ }
+ }
+ else
+ {
+ Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
+ Vec16uc tmp16_1, tmp16_2;
+ Vec2uq tmp2uq;
+ Vec8s v_deltaFract, v_deltaPos(0), thirty2(32), thirty1(31), v_ipAngle(0);
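+ // Each case below hard-codes the per-row integer reference offsets for one
+ // angle (the role GETAP plays elsewhere in this patch), loading the
+ // reference once and deriving the shifted copies with byte shifts.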
+ switch (intraPredAngle)
+ {
+ case -32:
+ tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain); //-1,0,1,2
+ store_partial(const_int(4), pDst, tmp16_1);
+ tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain - 1); //-2,-1,0,1
+ store_partial(const_int(4), pDst + dstStride, tmp16_2);
+ tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain - 2);
+ store_partial(const_int(4), pDst + 2 * dstStride, tmp16_2);
+ tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain - 3);
+ store_partial(const_int(4), pDst + 3 * dstStride, tmp16_2);
+ return;
+
+ case -26:
+ tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 3);
+ row41 = extend_low(tmp16_1); //offsets(-4,-3,-2,-1)
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 8;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row42 = extend_low(tmp16_2); //offsets(-3,-2,-1,0)
+
+ row31 = row42; //offsets(-3,-2,-1,0)
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 16;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row32 = extend_low(tmp16_2); //offsets(-2,-1,0,1)
+
+ row21 = row32; //offsets(-2,-1,0,1)
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 24;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row22 = extend_low(tmp16_2); //offsets(-1,0,1,2)
+
+ row11 = row22; //offsets(-1,0,1,2)
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 32;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row12 = extend_low(tmp16_2); //offsets(0,1,2,3)
+
+ v_deltaPos = v_ipAngle = -26;
+ break;
+
+ case -21:
+ tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 2);
+ row41 = extend_low(tmp16_1); //offsets(-3,-2,-1,0)
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 8;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row42 = extend_low(tmp16_2); //offsets(-2,-1,0,1)
+
+ row31 = row42; //offsets(-2,-1,0,1)
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 16;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row32 = extend_low(tmp16_2); //offsets(-1,0,1,2)
+
+ row21 = row31; //offsets(-2,-1,0,1)
+ row22 = row32;
+
+ row11 = row32;
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 24;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row12 = extend_low(tmp16_2); //offsets(0,1,2,3)
+
+ v_deltaPos = v_ipAngle = -21;
+ break;
+
+ case -17:
+ tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 2);
+ row41 = extend_low(tmp16_1); //offsets(-3,-2,-1,0)
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 8;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row42 = extend_low(tmp16_2); //offsets(-2,-1,0,1)
+
+ row31 = row42; //offsets(-2,-1,0,1)
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 16;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row32 = extend_low(tmp16_2); //offsets(-1,0,1,2)
+
+ row21 = row31; //offsets(-2,-1,0,1)
+ row22 = row32;
+
+ row11 = row32;
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 24;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row12 = extend_low(tmp16_2); //offsets(0,1,2,3)
+
+ v_deltaPos = v_ipAngle = -17;
+ break;
+
+ case -13:
+ tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 1);
+ row41 = extend_low(tmp16_1); //offsets(-2,-1,0,1)
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 8;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row42 = extend_low(tmp16_2); //offsets(-1,0,1,2)
+
+ row11 = row42;
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 16;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row12 = extend_low(tmp16_2); //offsets(0,1,2,3)
+
+ row21 = row42; //offsets(-1,0,1,2)
+ row22 = row12;
+ row31 = row41;
+ row32 = row42;
+
+ v_deltaPos = v_ipAngle = -13;
+ break;
+
+ case -9:
+ tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 1);
+ row41 = extend_low(tmp16_1); //offsets(-2,-1,0,1)
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 8;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row42 = extend_low(tmp16_2); //offsets(-1,0,1,2)
+
+ row11 = row42;
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 16;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row12 = extend_low(tmp16_2); //offsets(0,1,2,3)
+
+ row21 = row42; //offsets(-1,0,1,2)
+ row22 = row12;
+ row31 = row42;
+ row32 = row12;
+
+ v_deltaPos = v_ipAngle = -9;
+ break;
+
+ case -5:
+ tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain);
+ row11 = extend_low(tmp16_1); //offsets(-1,0,1,2)
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 8;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row12 = extend_low(tmp16_2); //offsets(0,1,2,3)
+ row21 = row11; //offsets(-1,0,1,2)
+ row22 = row12;
+ row31 = row11;
+ row32 = row12;
+ row41 = row11;
+ row42 = row12;
+
+ v_deltaPos = v_ipAngle = -5;
+ break;
+
+ case -2:
+ tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain);
+ row11 = extend_low(tmp16_1); //offsets(-1,0,1,2)
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 8;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row12 = extend_low(tmp16_2); //offsets(0,1,2,3)
+ row21 = row11; //offsets(-1,0,1,2)
+ row22 = row12;
+ row31 = row11;
+ row32 = row12;
+ row41 = row11;
+ row42 = row12;
+
+ v_deltaPos = v_ipAngle = -2;
+ break;
+
+ case 2:
+ tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
+ row11 = extend_low(tmp16_1); //offsets(0,1,2,3)
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 8;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row12 = extend_low(tmp16_2); //offsets(1,2,3,4)
+ row21 = row11; //offsets(0,1,2,3)
+ row22 = row12;
+ row31 = row11;
+ row32 = row12;
+ row41 = row11;
+ row42 = row12;
+
+ v_deltaPos = v_ipAngle = 2;
+ break;
+
+ case 5:
+ tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
+ row11 = extend_low(tmp16_1); //offsets(0,1,2,3)
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 8;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row12 = extend_low(tmp16_2); //offsets(1,2,3,4)
+ row21 = row11; //offsets(0,1,2,3)
+ row22 = row12;
+ row31 = row11;
+ row32 = row12;
+ row41 = row11;
+ row42 = row12;
+
+ v_deltaPos = v_ipAngle = 5;
+ break;
+
+ case 9:
+ tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
+ row11 = extend_low(tmp16_1); //offsets(0,1,2,3)
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 8;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row12 = extend_low(tmp16_2); //offsets(1,2,3,4)
+ row21 = row11; //offsets(0,1,2,3)
+ row22 = row12;
+ row31 = row11;
+ row32 = row12;
+ row41 = row12;
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 16;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row42 = extend_low(tmp16_2);
+
+ v_deltaPos = v_ipAngle = 9;
+ break;
+
+ case 13:
+ tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
+
+ row11 = extend_low(tmp16_1); //offsets(0,1,2,3)
+
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 8;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row12 = extend_low(tmp16_2); //offsets(1,2,3,4)
+
+ row21 = row11; //offsets(0,1,2,3)
+ row22 = row12;
+ row31 = row12; //offsets(1,2,3,4)
+
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 16;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row32 = extend_low(tmp16_2); //offsets(2,3,4,5)
+
+ row41 = row31; //offsets(1,2,3,4)
+ row42 = row32;
+
+ v_deltaPos = v_ipAngle = 13;
+ break;
+
+ case 17:
+ tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
+
+ row11 = extend_low(tmp16_1); //offsets(0,1,2,3)
+
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 8;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row12 = extend_low(tmp16_2); //offsets(1,2,3,4)
+
+ row21 = row12;
+
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 16;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row22 = extend_low(tmp16_2); //offsets(2,3,4,5)
+
+ row31 = row21;
+ row32 = row22;
+
+ row41 = row22;
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 24;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row42 = extend_low(tmp16_2); //offsets(3,4,5,6)
+
+ v_deltaPos = v_ipAngle = 17;
+ break;
+
+ case 21:
+ tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
+
+ row11 = extend_low(tmp16_1); //offsets(0,1,2,3)
+
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 8;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row12 = extend_low(tmp16_2); //offsets(1,2,3,4)
+
+ row21 = row12;
+
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 16;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row22 = extend_low(tmp16_2); //offsets(2,3,4,5)
+
+ row31 = row21;
+ row32 = row22;
+
+ row41 = row22;
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 24;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row42 = extend_low(tmp16_2); //offsets(3,4,5,6)
+
+ v_deltaPos = v_ipAngle = 21;
+ break;
+
+ case 26:
+ tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
+
+ row11 = extend_low(tmp16_1); //offsets(0,1,2,3)
+
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 8;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row12 = extend_low(tmp16_2); //offsets(1,2,3,4)
+
+ row21 = row12;
+
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 16;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row22 = extend_low(tmp16_2); //offsets(2,3,4,5)
+
+ row31 = row22;
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 24;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row32 = extend_low(tmp16_2); //offsets(3,4,5,6)
+
+ row41 = row32;
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq = tmp2uq >> 32;
+ tmp16_2 = reinterpret_i(tmp2uq);
+ row42 = extend_low(tmp16_2); //offsets(4,5,6,7)
+
+ v_deltaPos = v_ipAngle = 26;
+ break;
+
+ case 32:
+ tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 2);
+ store_partial(const_int(4), pDst, tmp16_1);
+ tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain + 3);
+ store_partial(const_int(4), pDst + dstStride, tmp16_2);
+ tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain + 4);
+ store_partial(const_int(4), pDst + 2 * dstStride, tmp16_2);
+ tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain + 5);
+ store_partial(const_int(4), pDst + 3 * dstStride, tmp16_2);
+ return;
+ }
+
+ //row1
+ v_deltaFract = v_deltaPos & thirty1;
+ row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) + 16) >> 5;
+
+ //row2
+ v_deltaPos += v_ipAngle;
+ v_deltaFract = v_deltaPos & thirty1;
+ row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) + 16) >> 5;
+
+ //row3
+ v_deltaPos += v_ipAngle;
+ v_deltaFract = v_deltaPos & thirty1;
+ row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) + 16) >> 5;
+
+ //row4
+ v_deltaPos += v_ipAngle;
+ v_deltaFract = v_deltaPos & thirty1;
+ row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) + 16) >> 5;
+
+ // Flip the block
+
+ if (modeHor)
+ {
+ Vec8s tmp1, tmp2, tmp3, tmp4;
+
+ tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
+ tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
+
+ tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
+ tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
+
+ tmp16_1 = compress_unsafe(tmp3, tmp3);
+ store_partial(const_int(4), pDst, tmp16_1);
+
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq >>= 32;
+ store_partial(const_int(4), pDst + dstStride, tmp2uq);
+
+ tmp16_1 = compress_unsafe(tmp4, tmp4);
+ store_partial(const_int(4), pDst + (2 * dstStride), tmp16_1);
+
+ tmp2uq = reinterpret_i(tmp16_1);
+ tmp2uq >>= 32;
+ store_partial(const_int(4), pDst + (3 * dstStride), tmp2uq);
+ }
+ else
+ {
+ store_partial(const_int(4), pDst, compress_unsafe(row11, row11));
+ store_partial(const_int(4), pDst + (dstStride), compress_unsafe(row21, row21));
+ store_partial(const_int(4), pDst + (2 * dstStride), compress_unsafe(row31, row31));
+ store_partial(const_int(4), pDst + (3 * dstStride), compress_unsafe(row41, row41));
+ }
+ }
+}
+
+#endif /* if HIGH_BIT_DEPTH */
+
+#if HIGH_BIT_DEPTH
+#else
+#define PREDANG_CALCROW_VER(X) { \
+ LOADROW(row11, GETAP(lookIdx, X)); \
+ LOADROW(row12, GETAP(lookIdx, X) + 1); \
+ CALCROW(row11, row11, row12); \
+ store_partial(const_int(8), pDst + (X * dstStride), compress(row11, row11)); \
+}
+
+#define PREDANG_CALCROW_HOR(X, rowx) { \
+ LOADROW(row11, GETAP(lookIdx, X)); \
+ LOADROW(row12, GETAP(lookIdx, X) + 1); \
+ CALCROW(rowx, row11, row12); \
+}
+
+ // ROW is a Vec8s variable, X is the offset of the data to be loaded
+#define LOADROW(ROW, X) { \
+ tmp = load_partial(const_int(8), refMain + 1 + X); \
+ ROW = extend_low(tmp); \
+}
+
+#define CALCROW(RES, ROW1, ROW2) { \
+ v_deltaPos += v_ipAngle; \
+ v_deltaFract = v_deltaPos & thirty1; \
+ RES = ((thirty2 - v_deltaFract) * ROW1 + (v_deltaFract * ROW2) + 16) >> 5; \
+}
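+
+ // These macros expand to the same interpolation as the 4x4 path:
+ // PREDANG_CALCROW_VER(X) loads the two reference rows at integer offsets
+ // GETAP(lookIdx, X) and GETAP(lookIdx, X) + 1, weights them by the running
+ // fractional position maintained in CALCROW, and stores row X of the block.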
+
+void xPredIntraAng8x8(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove, bool bFilter = true)
+{
+ int k;
+ int blkSize = width;
+
+ // Map the mode index to main prediction direction and angle
+ assert(dirMode > 1); //no planar and dc
+ bool modeHor = (dirMode < 18);
+ bool modeVer = !modeHor;
+ int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+ int lookIdx = intraPredAngle;
+ int absAng = abs(intraPredAngle);
+ int signAng = intraPredAngle < 0 ? -1 : 1;
+
+ // Set bitshifts and scale the angle parameter to block size
+ int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+ int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
+ int invAngle = invAngTable[absAng];
+ absAng = angTable[absAng];
+ intraPredAngle = signAng * absAng;
+
+ // Do angular predictions
+
+ pixel* refMain;
+ pixel* refSide;
+
+ // Initialise the Main and Left reference array.
+ if (intraPredAngle < 0)
+ {
+ refMain = (modeVer ? refAbove : refLeft); // + (blkSize - 1);
+ refSide = (modeVer ? refLeft : refAbove); // + (blkSize - 1);
+
+ // Extend the Main reference to the left.
+ int invAngleSum = 128; // rounding offset for the final >> 8
+ for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
+ {
+ invAngleSum += invAngle;
+ refMain[k] = refSide[invAngleSum >> 8];
+ }
+ }
+ else
+ {
+ refMain = modeVer ? refAbove : refLeft;
+ refSide = modeVer ? refLeft : refAbove;
+ }
+
+ // bfilter will always be true for blocksize 8
+ if (intraPredAngle == 0) // Exactly horizontal/vertical angles
+ {
+ if (modeHor)
+ {
+ Vec16uc v_temp;
+ Vec16uc tmp1;
+
+ v_temp.load(refMain + 1);
+ Vec8s v_main;
+ v_main = extend_low(v_temp);
+
+ if (bFilter)
+ {
+ Vec8s v_side_0(refSide[0]); // refSide[0] value in a vector
+ Vec16uc v_temp16;
+ v_temp16.load(refSide + 1);
+ Vec8s v_side;
+ v_side = extend_low(v_temp16);
+
+ Vec8s row0;
+ row0 = permute8s<0, 0, 0, 0, 0, 0, 0, 0>(v_main);
+ v_side -= v_side_0;
+ v_side = v_side >> 1;
+ row0 = row0 + v_side;
+ row0 = min(max(0, row0), (1 << bitDepth) - 1);
+
+ tmp1 = compress(row0, row0);
+ store_partial(const_int(8), pDst, tmp1); //row0
+ }
+ else
+ {
+ tmp1 = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
+ store_partial(const_int(8), pDst, tmp1); //row0
+ }
+ tmp1 = permute16uc<1, 1, 1, 1, 1, 1, 1, 1, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
+ store_partial(const_int(8), pDst + (1 * dstStride), tmp1); //row1
+
+ tmp1 = permute16uc<2, 2, 2, 2, 2, 2, 2, 2, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
+ store_partial(const_int(8), pDst + (2 * dstStride), tmp1); //row2
+
+ tmp1 = permute16uc<3, 3, 3, 3, 3, 3, 3, 3, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
+ store_partial(const_int(8), pDst + (3 * dstStride), tmp1); //row3
+
+ tmp1 = permute16uc<4, 4, 4, 4, 4, 4, 4, 4, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
+ store_partial(const_int(8), pDst + (4 * dstStride), tmp1); //row4
+
+ tmp1 = permute16uc<5, 5, 5, 5, 5, 5, 5, 5, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
+ store_partial(const_int(8), pDst + (5 * dstStride), tmp1); //row5
+
+ tmp1 = permute16uc<6, 6, 6, 6, 6, 6, 6, 6, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
+ store_partial(const_int(8), pDst + (6 * dstStride), tmp1); //row6
+
+ tmp1 = permute16uc<7, 7, 7, 7, 7, 7, 7, 7, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
+ store_partial(const_int(8), pDst + (7 * dstStride), tmp1); //row7
+ }
+ else
+ {
+ Vec16uc v_main;
+ v_main = load_partial(const_int(8), refMain + 1);
+ store_partial(const_int(8), pDst, v_main);
+ store_partial(const_int(8), pDst + dstStride, v_main);
+ store_partial(const_int(8), pDst + (2 * dstStride), v_main);
+ store_partial(const_int(8), pDst + (3 * dstStride), v_main);
+ store_partial(const_int(8), pDst + (4 * dstStride), v_main);
+ store_partial(const_int(8), pDst + (5 * dstStride), v_main);
+ store_partial(const_int(8), pDst + (6 * dstStride), v_main);
+ store_partial(const_int(8), pDst + (7 * dstStride), v_main);
+
+ if (bFilter)
+ {
+ Vec16uc v_temp;
+ Vec8s v_side_0(refSide[0]); // refSide[0] value in a vector
+
+ v_temp.load(refSide + 1);
+ Vec8s v_side;
+ v_side = extend_low(v_temp);
+
+ v_temp.load(refMain + 1);
+ Vec8s row0;
+ row0 = permute16uc<0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1>(v_temp);
+ v_side -= v_side_0;
+ v_side = v_side >> 1;
+ row0 = row0 + v_side;
+ row0 = min(max(0, row0), (1 << bitDepth) - 1);
+
+ pDst[0 * dstStride] = row0[0];
+ pDst[1 * dstStride] = row0[1];
+ pDst[2 * dstStride] = row0[2];
+ pDst[3 * dstStride] = row0[3];
+ pDst[4 * dstStride] = row0[4];
+ pDst[5 * dstStride] = row0[5];
+ pDst[6 * dstStride] = row0[6];
+ pDst[7 * dstStride] = row0[7];
+ }
+ }
+ }
+ else if (intraPredAngle == -32)
+ {
+ Vec16uc tmp;
+ tmp = load_partial(const_int(8), refMain); //-1,0,1,2
+ store_partial(const_int(8), pDst, tmp);
+ tmp = load_partial(const_int(8), refMain - 1); //-2,-1,0,1
+ store_partial(const_int(8), pDst + dstStride, tmp);
+ tmp = load_partial(const_int(8), refMain - 2);
+ store_partial(const_int(8), pDst + 2 * dstStride, tmp);
+ tmp = load_partial(const_int(8), refMain - 3);
+ store_partial(const_int(8), pDst + 3 * dstStride, tmp);
+ tmp = load_partial(const_int(8), refMain - 4);
+ store_partial(const_int(8), pDst + 4 * dstStride, tmp);
+ tmp = load_partial(const_int(8), refMain - 5);
+ store_partial(const_int(8), pDst + 5 * dstStride, tmp);
+ tmp = load_partial(const_int(8), refMain - 6);
+ store_partial(const_int(8), pDst + 6 * dstStride, tmp);
+ tmp = load_partial(const_int(8), refMain - 7);
+ store_partial(const_int(8), pDst + 7 * dstStride, tmp);
+ return;
+ }
+ else if (intraPredAngle == 32)
+ {
+ Vec8s tmp;
+ tmp = load_partial(const_int(8), refMain + 2); //1,2,3,4
+ store_partial(const_int(8), pDst, tmp);
+ tmp = load_partial(const_int(8), refMain + 3); //2,3,4,5
+ store_partial(const_int(8), pDst + dstStride, tmp);
+ tmp = load_partial(const_int(8), refMain + 4);
+ store_partial(const_int(8), pDst + 2 * dstStride, tmp);
+ tmp = load_partial(const_int(8), refMain + 5);
+ store_partial(const_int(8), pDst + 3 * dstStride, tmp);
+ tmp = load_partial(const_int(8), refMain + 6);
+ store_partial(const_int(8), pDst + 4 * dstStride, tmp);
+ tmp = load_partial(const_int(8), refMain + 7);
+ store_partial(const_int(8), pDst + 5 * dstStride, tmp);
+ tmp = load_partial(const_int(8), refMain + 8);
+ store_partial(const_int(8), pDst + 6 * dstStride, tmp);
+ tmp = load_partial(const_int(8), refMain + 9);
+ store_partial(const_int(8), pDst + 7 * dstStride, tmp);
+ return;
+ }
+ else
+ {
+ if (modeHor) // Near horizontal modes
+ {
+ Vec16uc tmp;
+ Vec8s row11, row12;
+ Vec16uc row1, row2, row3, row4, tmp16_1, tmp16_2;
+ Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
+ Vec8s tmp1, tmp2;
+ v_deltaPos = 0;
+ v_ipAngle = intraPredAngle;
+ switch (intraPredAngle)
+ {
+ case -5:
+ LOADROW(row11, -1);
+ LOADROW(row12, 0);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ row1 = compress(tmp1, tmp2);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ row2 = compress(tmp1, tmp2);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ row3 = compress(tmp1, tmp2);
+ row12 = row11;
+ LOADROW(row11, -2);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ row4 = compress(tmp1, tmp2);
+ break;
+
+ case -2:
+ LOADROW(row11, -1);
+ LOADROW(row12, 0);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ row1 = compress(tmp1, tmp2);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ row2 = compress(tmp1, tmp2);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ row3 = compress(tmp1, tmp2);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ row4 = compress(tmp1, tmp2);
+ break;
+
+ case 2:
+ LOADROW(row11, 0);
+ LOADROW(row12, 1);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ row1 = compress(tmp1, tmp2);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ row2 = compress(tmp1, tmp2);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ row3 = compress(tmp1, tmp2);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ row4 = compress(tmp1, tmp2);
+ break;
+
+ case 5:
+ LOADROW(row11, 0);
+ LOADROW(row12, 1);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ row1 = compress(tmp1, tmp2);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ row2 = compress(tmp1, tmp2);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ row3 = compress(tmp1, tmp2);
+ row11 = row12;
+ LOADROW(row12, 2);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ row4 = compress(tmp1, tmp2);
+ break;
+
+ default: // these cases use the lookup table to identify access patterns
+
+ PREDANG_CALCROW_HOR(0, tmp1);
+ PREDANG_CALCROW_HOR(1, tmp2);
+ row1 = compress(tmp1, tmp2);
+ PREDANG_CALCROW_HOR(2, tmp1);
+ PREDANG_CALCROW_HOR(3, tmp2);
+ row2 = compress(tmp1, tmp2);
+ PREDANG_CALCROW_HOR(4, tmp1);
+ PREDANG_CALCROW_HOR(5, tmp2);
+ row3 = compress(tmp1, tmp2);
+ PREDANG_CALCROW_HOR(6, tmp1);
+ PREDANG_CALCROW_HOR(7, tmp2);
+ row4 = compress(tmp1, tmp2);
+ }
+
+ // Flip the block
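+ // The interleave sequence below is an 8x8 byte transpose: horizontal
+ // modes are computed with rows and columns swapped, so the block must be
+ // flipped before it is stored.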
+ tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row1, row2);
+ tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row1, row2);
+ row1 = tmp16_1;
+ row2 = tmp16_2;
+
+ tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row3, row4);
+ tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row3, row4);
+ row3 = tmp16_1;
+ row4 = tmp16_2;
+
+ tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row1, row2);
+ tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row1, row2);
+ row1 = tmp16_1;
+ row2 = tmp16_2;
+
+ tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row3, row4);
+ tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row3, row4);
+ row3 = tmp16_1;
+ row4 = tmp16_2;
+
+ tmp16_1 = blend4i<0, 4, 1, 5>((Vec4i)row1, (Vec4i)row3);
+ tmp16_2 = blend4i<2, 6, 3, 7>((Vec4i)row1, (Vec4i)row3);
+ row1 = tmp16_1;
+ row3 = tmp16_2;
+
+ tmp16_1 = blend4i<0, 4, 1, 5>((Vec4i)row2, (Vec4i)row4);
+ tmp16_2 = blend4i<2, 6, 3, 7>((Vec4i)row2, (Vec4i)row4);
+ row2 = tmp16_1;
+ row4 = tmp16_2;
+
+ store_partial(const_int(8), pDst, row1); //row1
+ store_partial(const_int(8), pDst + (2 * dstStride), row3); //row3
+ store_partial(const_int(8), pDst + (4 * dstStride), row2); //row5
+ store_partial(const_int(8), pDst + (6 * dstStride), row4); //row7
+
+ row1 = blend2q<1, 3>((Vec2q)row1, (Vec2q)row1);
+ store_partial(const_int(8), pDst + (1 * dstStride), row1); //row2
+
+ row1 = blend2q<1, 3>((Vec2q)row3, (Vec2q)row3);
+ store_partial(const_int(8), pDst + (3 * dstStride), row1); //row4
+
+ row1 = blend2q<1, 3>((Vec2q)row2, (Vec2q)row2);
+ store_partial(const_int(8), pDst + (5 * dstStride), row1); //row6
+
+ row1 = blend2q<1, 3>((Vec2q)row4, (Vec2q)row4);
+ store_partial(const_int(8), pDst + (7 * dstStride), row1); //row8
+ }
+ else // Vertical modes
+ {
+ Vec8s row11, row12;
+ Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
+ Vec16uc tmp;
+ Vec8s tmp1, tmp2;
+ v_deltaPos = 0;
+ v_ipAngle = intraPredAngle;
+ switch (intraPredAngle)
+ {
+ case -5:
+ LOADROW(row11, -1);
+ LOADROW(row12, 0);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ store_partial(const_int(8), pDst, compress(tmp1, tmp1));
+ store_partial(const_int(8), pDst + (dstStride), compress(tmp2, tmp2));
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ store_partial(const_int(8), pDst + (2 * dstStride), compress(tmp1, tmp1));
+ store_partial(const_int(8), pDst + (3 * dstStride), compress(tmp2, tmp2));
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ store_partial(const_int(8), pDst + (4 * dstStride), compress(tmp1, tmp1));
+ store_partial(const_int(8), pDst + (5 * dstStride), compress(tmp2, tmp2));
+ row12 = row11;
+ LOADROW(row11, -2);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ store_partial(const_int(8), pDst + (6 * dstStride), compress(tmp1, tmp1));
+ store_partial(const_int(8), pDst + (7 * dstStride), compress(tmp2, tmp2));
+ break;
+
+ case -2:
+ LOADROW(row11, -1);
+ LOADROW(row12, 0);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ store_partial(const_int(8), pDst, compress(tmp1, tmp1));
+ store_partial(const_int(8), pDst + (dstStride), compress(tmp2, tmp2));
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ store_partial(const_int(8), pDst + (2 * dstStride), compress(tmp1, tmp1));
+ store_partial(const_int(8), pDst + (3 * dstStride), compress(tmp2, tmp2));
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ store_partial(const_int(8), pDst + (4 * dstStride), compress(tmp1, tmp1));
+ store_partial(const_int(8), pDst + (5 * dstStride), compress(tmp2, tmp2));
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ store_partial(const_int(8), pDst + (6 * dstStride), compress(tmp1, tmp1));
+ store_partial(const_int(8), pDst + (7 * dstStride), compress(tmp2, tmp2));
+ break;
+
+ case 2:
+ LOADROW(row11, 0);
+ LOADROW(row12, 1);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ store_partial(const_int(8), pDst, compress(tmp1, tmp1));
+ store_partial(const_int(8), pDst + dstStride, compress(tmp2, tmp2));
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ store_partial(const_int(8), pDst + (2 * dstStride), compress(tmp1, tmp1));
+ store_partial(const_int(8), pDst + (3 * dstStride), compress(tmp2, tmp2));
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ store_partial(const_int(8), pDst + (4 * dstStride), compress(tmp1, tmp1));
+ store_partial(const_int(8), pDst + (5 * dstStride), compress(tmp2, tmp2));
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ store_partial(const_int(8), pDst + (6 * dstStride), compress(tmp1, tmp1));
+ store_partial(const_int(8), pDst + (7 * dstStride), compress(tmp2, tmp2));
+ break;
+
+ case 5:
+ LOADROW(row11, 0);
+ LOADROW(row12, 1);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ store_partial(const_int(8), pDst, compress(tmp1, tmp1));
+ store_partial(const_int(8), pDst + dstStride, compress(tmp2, tmp2));
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ store_partial(const_int(8), pDst + (2 * dstStride), compress(tmp1, tmp1));
+ store_partial(const_int(8), pDst + (3 * dstStride), compress(tmp2, tmp2));
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ store_partial(const_int(8), pDst + (4 * dstStride), compress(tmp1, tmp1));
+ store_partial(const_int(8), pDst + (5 * dstStride), compress(tmp2, tmp2));
+ row11 = row12;
+ LOADROW(row12, 2);
+ CALCROW(tmp1, row11, row12);
+ CALCROW(tmp2, row11, row12);
+ store_partial(const_int(8), pDst + (6 * dstStride), compress(tmp1, tmp1));
+ store_partial(const_int(8), pDst + (7 * dstStride), compress(tmp2, tmp2));
+ break;
+
+ default: // these cases use the lookup table to identify access patterns
+ PREDANG_CALCROW_VER(0);
+ PREDANG_CALCROW_VER(1);
+ PREDANG_CALCROW_VER(2);
+ PREDANG_CALCROW_VER(3);
+ PREDANG_CALCROW_VER(4);
+ PREDANG_CALCROW_VER(5);
+ PREDANG_CALCROW_VER(6);
+ PREDANG_CALCROW_VER(7);
+ }
+ }
+ }
+}
+
+#undef PREDANG_CALCROW_VER
+#undef PREDANG_CALCROW_HOR
+#undef LOADROW
+#undef CALCROW
+#endif /* if HIGH_BIT_DEPTH */
+
+//16x16
+#if HIGH_BIT_DEPTH
+#else
+#define PREDANG_CALCROW_VER(X) { \
+ LOADROW(row11L, row11H, GETAP(lookIdx, X)); \
+ LOADROW(row12L, row12H, GETAP(lookIdx, X) + 1); \
+ CALCROW(row11L, row11H, row11L, row11H, row12L, row12H); \
+ /*compress(row11L, row11H).store(pDst + ((X)*dstStride));*/ \
+ itmp = _mm_packus_epi16(row11L, row11H); \
+ _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride)), itmp); \
+}
+
+#define PREDANG_CALCROW_HOR(X, rowx) { \
+ LOADROW(row11L, row11H, GETAP(lookIdx, (X))); \
+ LOADROW(row12L, row12H, GETAP(lookIdx, (X)) + 1); \
+ CALCROW(row11L, row11H, row11L, row11H, row12L, row12H); \
+ /*rowx = compress(row11L, row11H);*/ \
+ rowx = _mm_packus_epi16(row11L, row11H); \
+}
+
+ // ROWL/H are Vec8s variables, X is the offset of the data to be loaded
+#define LOADROW(ROWL, ROWH, X) { \
+ /*tmp.load(refMain + 1 + (X)); */ \
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (X))); \
+ /* ROWL = extend_low(tmp);*/ \
+ ROWL = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+ /*ROWH = extend_high(tmp);*/ \
+ ROWH = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+}
+
+#define CALCROW(RESL, RESH, ROW1L, ROW1H, ROW2L, ROW2H) { \
+ /*v_deltaPos += v_ipAngle; \
+ v_deltaFract = v_deltaPos & thirty1;*/ \
+ v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
+ v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
+ /*RESL = ((thirty2 - v_deltaFract) * ROW1L + (v_deltaFract * ROW2L) + 16) >> 5; \
+ RESH = ((thirty2 - v_deltaFract) * ROW1H + (v_deltaFract * ROW2H) + 16) >> 5;*/ \
+ it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+ it2 = _mm_mullo_epi16(it1, ROW1L); \
+ it3 = _mm_mullo_epi16(v_deltaFract, ROW2L); \
+ it2 = _mm_add_epi16(it2, it3); \
+ i16 = _mm_set1_epi16(16); \
+ it2 = _mm_add_epi16(it2, i16); \
+ RESL = _mm_srai_epi16(it2, 5); \
+ \
+ it2 = _mm_mullo_epi16(it1, ROW1H); \
+ it3 = _mm_mullo_epi16(v_deltaFract, ROW2H); \
+ it2 = _mm_add_epi16(it2, it3); \
+ it2 = _mm_add_epi16(it2, i16); \
+ RESH = _mm_srai_epi16(it2, 5); \
+}
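+
+ // The 16-bit math in CALCROW cannot overflow: (32 - fract) * a + fract * b + 16
+ // is at most 32 * 255 + 16 = 8176, well inside the signed 16-bit range, so
+ // _mm_mullo_epi16/_mm_add_epi16 need no widening.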
+
+#define BLND2_16(R1, R2) { \
+ /*tmp1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(R1, R2); */ \
+ itmp1 = _mm_unpacklo_epi8(R1, R2); \
+ /*tmp2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(R1, R2);*/ \
+ itmp2 = _mm_unpackhi_epi8(R1, R2); \
+ R1 = itmp1; \
+ R2 = itmp2; \
+}
+
+#define MB4(R1, R2, R3, R4) { \
+ BLND2_16(R1, R2) \
+ BLND2_16(R3, R4) \
+ /*tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>((Vec8s)R1, (Vec8s)R3);*/ \
+ itmp1 = _mm_unpacklo_epi16(R1, R3); \
+ /* tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>((Vec8s)R1, (Vec8s)R3);*/ \
+ itmp2 = _mm_unpackhi_epi16(R1, R3); \
+ R1 = itmp1; \
+ R3 = itmp2; \
+ /*R1 = tmp1; \
+ R3 = tmp2;*/ \
+ /*tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>((Vec8s)R2, (Vec8s)R4); \
+ tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>((Vec8s)R2, (Vec8s)R4);*/ \
+ itmp1 = _mm_unpacklo_epi16(R2, R4); \
+ itmp2 = _mm_unpackhi_epi16(R2, R4); \
+ R2 = itmp1; \
+ R4 = itmp2; \
+ /*R2 = tmp1; \
+ R4 = tmp2;*/ \
+}
+
+#define BLND2_4(R1, R2) { \
+ /* tmp1 = blend4i<0, 4, 1, 5>((Vec4i)R1, (Vec4i)R2); \
+ tmp2 = blend4i<2, 6, 3, 7>((Vec4i)R1, (Vec4i)R2); */ \
+ itmp1 = _mm_unpacklo_epi32(R1, R2); \
+ itmp2 = _mm_unpackhi_epi32(R1, R2); \
+ R1 = itmp1; \
+ R2 = itmp2; \
+ /*R1 = tmp1; \
+ R2 = tmp2; */\
+}
+
+#define BLND2_2(R1, R2) { \
+ /*tmp1 = blend2q<0, 2>((Vec2q)R1, (Vec2q)R2); \
+ tmp2 = blend2q<1, 3>((Vec2q)R1, (Vec2q)R2);*/ \
+ itmp1 = _mm_unpacklo_epi64(R1, R2); \
+ itmp2 = _mm_unpackhi_epi64(R1, R2); \
+ /*tmp1.store(pDst); */ \
+ _mm_storeu_si128((__m128i*)pDst, itmp1); \
+ pDst += dstStride; \
+ /*tmp2.store(pDst);*/ \
+ _mm_storeu_si128((__m128i*)pDst, itmp2); \
+ pDst += dstStride; \
+}
+
+#define CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, X) { \
+ PREDANG_CALCROW_HOR(0 + X, R1) \
+ PREDANG_CALCROW_HOR(1 + X, R2) \
+ PREDANG_CALCROW_HOR(2 + X, R3) \
+ PREDANG_CALCROW_HOR(3 + X, R4) \
+ PREDANG_CALCROW_HOR(4 + X, R5) \
+ PREDANG_CALCROW_HOR(5 + X, R6) \
+ PREDANG_CALCROW_HOR(6 + X, R7) \
+ PREDANG_CALCROW_HOR(7 + X, R8) \
+ MB4(R1, R2, R3, R4) \
+ MB4(R5, R6, R7, R8) \
+ BLND2_4(R1, R5); \
+ BLND2_4(R2, R6); \
+ BLND2_4(R3, R7); \
+ BLND2_4(R4, R8); \
+}
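+
+ // CALC_BLND_8ROWS predicts eight rows and, through MB4/BLND2_4, interleaves
+ // them down to dword granularity; the BLND2_2 calls at the use site complete
+ // the 16x16 byte transpose required by the horizontal modes.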
+
+void xPredIntraAng16x16(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove, bool bFilter = true)
+{
+ int k;
+ int blkSize = width;
+
+ // Map the mode index to main prediction direction and angle
+ assert(dirMode > 1); //no planar and dc
+ bool modeHor = (dirMode < 18);
+ bool modeVer = !modeHor;
+ int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+ int lookIdx = intraPredAngle;
+ int absAng = abs(intraPredAngle);
+ int signAng = intraPredAngle < 0 ? -1 : 1;
+
+ // Set bitshifts and scale the angle parameter to block size
+ int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+ int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
+ int invAngle = invAngTable[absAng];
+ absAng = angTable[absAng];
+ intraPredAngle = signAng * absAng;
+
+ // Do angular predictions
+
+ pixel* refMain;
+ pixel* refSide;
+
+ // Initialise the Main and Left reference array.
+ if (intraPredAngle < 0)
+ {
+ refMain = (modeVer ? refAbove : refLeft); // + (blkSize - 1);
+ refSide = (modeVer ? refLeft : refAbove); // + (blkSize - 1);
+
+ // Extend the Main reference to the left.
+ int invAngleSum = 128; // rounding offset for the final >> 8
+ if (intraPredAngle != -32)
+ for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
+ {
+ invAngleSum += invAngle;
+ refMain[k] = refSide[invAngleSum >> 8];
+ }
+ }
+ else
+ {
+ refMain = modeVer ? refAbove : refLeft;
+ refSide = modeVer ? refLeft : refAbove;
+ }
+
+ // bfilter will always be true for blocksize 16
+ if (intraPredAngle == 0) // Exactly horizontal/vertical angles
+ {
+ if (modeHor)
+ {
+ Vec16uc v_temp;
+ Vec16uc tmp1;
+ v_temp.load(refMain + 1);
+
+ if (bFilter)
+ {
+ Vec8s v_side_0(refSide[0]); // refSide[0] value in a vector
+ Vec16uc v_temp16;
+ v_temp16.load(refSide + 1);
+ Vec8s v_side;
+ v_side = extend_low(v_temp16);
+
+ Vec8s row01, row02, ref(refMain[1]);
+ v_side -= v_side_0;
+ v_side = v_side >> 1;
+ row01 = ref + v_side;
+ row01 = min(max(0, row01), (1 << bitDepth) - 1);
+
+ v_side = extend_high(v_temp16);
+ v_side -= v_side_0;
+ v_side = v_side >> 1;
+ row02 = ref + v_side;
+ row02 = min(max(0, row02), (1 << bitDepth) - 1);
+
+ tmp1 = compress_unsafe(row01, row02);
+ tmp1.store(pDst); //row0
+ }
+ else
+ {
+ tmp1 = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(v_temp);
+ tmp1.store(pDst); //row0
+ }
+
+ tmp1 = permute16uc<1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>(v_temp);
+ tmp1.store(pDst + (1 * dstStride)); //row1
+
+ tmp1 = permute16uc<2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2>(v_temp);
+ tmp1.store(pDst + (2 * dstStride)); //row2
+
+ tmp1 = permute16uc<3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3>(v_temp);
+ tmp1.store(pDst + (3 * dstStride)); //row3
+
+ tmp1 = permute16uc<4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4>(v_temp);
+ tmp1.store(pDst + (4 * dstStride)); //row4
+
+ tmp1 = permute16uc<5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5>(v_temp);
+ tmp1.store(pDst + (5 * dstStride)); //row5
+
+ tmp1 = permute16uc<6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6>(v_temp);
+ tmp1.store(pDst + (6 * dstStride)); //row6
+
+ tmp1 = permute16uc<7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7>(v_temp);
+ tmp1.store(pDst + (7 * dstStride)); //row7
+
+ tmp1 = permute16uc<8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8>(v_temp);
+ tmp1.store(pDst + (8 * dstStride)); //row8
+
+ tmp1 = permute16uc<9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9>(v_temp);
+ tmp1.store(pDst + (9 * dstStride)); //row9
+
+ tmp1 = permute16uc<10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10>(v_temp);
+ tmp1.store(pDst + (10 * dstStride)); //row10
+
+ tmp1 = permute16uc<11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11>(v_temp);
+ tmp1.store(pDst + (11 * dstStride)); //row11
+
+ tmp1 = permute16uc<12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12>(v_temp);
+ tmp1.store(pDst + (12 * dstStride)); //row12
+
+ tmp1 = permute16uc<13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13>(v_temp);
+ tmp1.store(pDst + (13 * dstStride)); //row13
+
+ tmp1 = permute16uc<14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14>(v_temp);
+ tmp1.store(pDst + (14 * dstStride)); //row14
+
+ tmp1 = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>(v_temp);
+ tmp1.store(pDst + (15 * dstStride)); //row15
+ }
+ else
+ {
+ Vec16uc v_main;
+// v_main.load(refMain + 1);
+ v_main = _mm_loadu_si128((__m128i const*)(refMain + 1));
+
+ _mm_storeu_si128((__m128i*)pDst, v_main);
+ _mm_storeu_si128((__m128i*)(pDst + dstStride), v_main);
+ _mm_storeu_si128((__m128i*)(pDst + (2 * dstStride)), v_main);
+ _mm_storeu_si128((__m128i*)(pDst + (3 * dstStride)), v_main);
+ _mm_storeu_si128((__m128i*)(pDst + (4 * dstStride)), v_main);
+ _mm_storeu_si128((__m128i*)(pDst + (5 * dstStride)), v_main);
+ _mm_storeu_si128((__m128i*)(pDst + (6 * dstStride)), v_main);
+ _mm_storeu_si128((__m128i*)(pDst + (7 * dstStride)), v_main);
+ _mm_storeu_si128((__m128i*)(pDst + (8 * dstStride)), v_main);
+ _mm_storeu_si128((__m128i*)(pDst + (9 * dstStride)), v_main);
+ _mm_storeu_si128((__m128i*)(pDst + (10 * dstStride)), v_main);
+ _mm_storeu_si128((__m128i*)(pDst + (11 * dstStride)), v_main);
+ _mm_storeu_si128((__m128i*)(pDst + (12 * dstStride)), v_main);
+ _mm_storeu_si128((__m128i*)(pDst + (13 * dstStride)), v_main);
+ _mm_storeu_si128((__m128i*)(pDst + (14 * dstStride)), v_main);
+ _mm_storeu_si128((__m128i*)(pDst + (15 * dstStride)), v_main);
+
+ if (bFilter)
+ {
+ Vec16uc v_temp;
+ Vec8s v_side_0(refSide[0]); // refSide[0] value in a vector
+
+ v_temp.load(refSide + 1);
+ Vec8s v_side;
+ v_side = extend_low(v_temp);
+
+ Vec8s row0, ref(refMain[1]);
+ v_side -= v_side_0;
+ v_side = v_side >> 1;
+ row0 = ref + v_side;
+ row0 = min(max(0, row0), (1 << bitDepth) - 1);
+
+ pDst[0 * dstStride] = row0[0];
+ pDst[1 * dstStride] = row0[1];
+ pDst[2 * dstStride] = row0[2];
+ pDst[3 * dstStride] = row0[3];
+ pDst[4 * dstStride] = row0[4];
+ pDst[5 * dstStride] = row0[5];
+ pDst[6 * dstStride] = row0[6];
+ pDst[7 * dstStride] = row0[7];
+
+ v_side = extend_high(v_temp);
+ v_side -= v_side_0;
+ v_side = v_side >> 1;
+ row0 = ref + v_side;
+ row0 = min(max(0, row0), (1 << bitDepth) - 1);
+ pDst[8 * dstStride] = row0[0];
+ pDst[9 * dstStride] = row0[1];
+ pDst[10 * dstStride] = row0[2];
+ pDst[11 * dstStride] = row0[3];
+ pDst[12 * dstStride] = row0[4];
+ pDst[13 * dstStride] = row0[5];
+ pDst[14 * dstStride] = row0[6];
+ pDst[15 * dstStride] = row0[7];
+ }
+ }
+ }
+ else if (intraPredAngle == -32)
+ {
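+ // For intraPredAngle == -32 the extended main reference is exactly the
+ // side array reversed (invAngle == 256), so build it with a single
+ // permute; refMain[0] is saved because the 16-byte store overlaps it.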
+ Vec16uc v_refSide;
+ v_refSide.load(refSide);
+ v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
+ pixel refMain0 = refMain[0];
+
+ v_refSide.store(refMain - 15);
+ refMain[0] = refMain0;
+
+ Vec16uc tmp;
+ __m128i itmp;
+// tmp.load(refMain); //-1,0,1,2
+// tmp.store(pDst);
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+
+ itmp = _mm_loadu_si128((__m128i const*)--refMain);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)--refMain);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)--refMain);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)--refMain);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)--refMain);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)--refMain);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)--refMain);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)--refMain);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)--refMain);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)--refMain);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)--refMain);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)--refMain);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)--refMain);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)--refMain);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)--refMain);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+
+/*
+ tmp.load(--refMain);
+ pDst += dstStride;
+ tmp.store(pDst);
+ ... 14 times more
+*/
+ return;
+ }
+ else if (intraPredAngle == 32)
+ {
+ Vec8s tmp;
+ __m128i itmp;
+ refMain += 2;
+
+// tmp.load(refMain++);
+// tmp.store(pDst);
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain++);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+
+/*
+ tmp.load(refMain++);
+ pDst += dstStride;
+ tmp.store(pDst);
+ ... 14 times more
+*/
+ itmp = _mm_loadu_si128((__m128i const*)refMain++);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)refMain++);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)refMain++);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)refMain++);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)refMain++);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)refMain++);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)refMain++);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)refMain++);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)refMain++);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)refMain++);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)refMain++);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)refMain++);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)refMain++);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)refMain++);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ itmp = _mm_loadu_si128((__m128i const*)refMain++);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+
+ return;
+ }
+ else
+ {
+ if (modeHor)
+ {
+ Vec8s row11L, row12L, row11H, row12H;
+ Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
+ Vec16uc tmp;
+ Vec16uc R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16;
+ Vec16uc tmp1, tmp2;
+ v_deltaPos = 0;
+ v_ipAngle = intraPredAngle;
+ __m128i itmp, itmp1, itmp2, it1, it2, it3, i16;
+// MB16;
+ CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
+ CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+ }
+ else
+ {
+ Vec8s row11L, row12L, row11H, row12H;
+ Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
+ Vec16uc tmp;
+ Vec8s tmp1, tmp2;
+ v_deltaPos = 0;
+ v_ipAngle = intraPredAngle;
+ __m128i itmp, it1, it2, it3, i16;
+
+ PREDANG_CALCROW_VER(0);
+ PREDANG_CALCROW_VER(1);
+ PREDANG_CALCROW_VER(2);
+ PREDANG_CALCROW_VER(3);
+ PREDANG_CALCROW_VER(4);
+ PREDANG_CALCROW_VER(5);
+ PREDANG_CALCROW_VER(6);
+ PREDANG_CALCROW_VER(7);
+ PREDANG_CALCROW_VER(8);
+ PREDANG_CALCROW_VER(9);
+ PREDANG_CALCROW_VER(10);
+ PREDANG_CALCROW_VER(11);
+ PREDANG_CALCROW_VER(12);
+ PREDANG_CALCROW_VER(13);
+ PREDANG_CALCROW_VER(14);
+ PREDANG_CALCROW_VER(15);
+ }
+ }
+}
+
+#undef PREDANG_CALCROW_VER
+#undef PREDANG_CALCROW_HOR
+#undef LOADROW
+#undef CALCROW
+#undef BLND2_16
+#undef BLND2_2
+#undef BLND2_4
+#undef MB4
+#undef CALC_BLND_8ROWS
+#endif /* if HIGH_BIT_DEPTH */
+
+//32x32
+#if HIGH_BIT_DEPTH
+#else
+#define PREDANG_CALCROW_VER(X) { \
+ v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
+ v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)]))); \
+ row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+ row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+ \
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 1))); \
+ row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+ row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+ \
+ it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+ it2 = _mm_mullo_epi16(it1, row11L); \
+ it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
+ it2 = _mm_add_epi16(it2, it3); \
+ i16 = _mm_set1_epi16(16); \
+ it2 = _mm_add_epi16(it2, i16); \
+ row11L = _mm_srai_epi16(it2, 5); \
+ it2 = _mm_mullo_epi16(it1, row11H); \
+ it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
+ it2 = _mm_add_epi16(it2, it3); \
+ it2 = _mm_add_epi16(it2, i16); \
+ row11H = _mm_srai_epi16(it2, 5); \
+ \
+ itmp = _mm_packus_epi16(row11L, row11H); \
+ _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride)), itmp); \
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 16))); \
+ row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+ row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+ \
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 17))); \
+ row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+ row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+ \
+ it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+ it2 = _mm_mullo_epi16(it1, row11L); \
+ it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
+ it2 = _mm_add_epi16(it2, it3); \
+ i16 = _mm_set1_epi16(16); \
+ it2 = _mm_add_epi16(it2, i16); \
+ row11L = _mm_srai_epi16(it2, 5); \
+ it2 = _mm_mullo_epi16(it1, row11H); \
+ it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
+ it2 = _mm_add_epi16(it2, it3); \
+ it2 = _mm_add_epi16(it2, i16); \
+ row11H = _mm_srai_epi16(it2, 5); \
+ \
+ itmp = _mm_packus_epi16(row11L, row11H); \
+ _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride) + 16), itmp); \
+}
+
+#define PREDANG_CALCROW_VER_MODE2(X) { \
+ v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
+ v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
+ it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+ it2 = _mm_mullo_epi16(it1, row11); \
+ it3 = _mm_mullo_epi16(v_deltaFract, row21); \
+ it2 = _mm_add_epi16(it2, it3); \
+ i16 = _mm_set1_epi16(16); \
+ it2 = _mm_add_epi16(it2, i16); \
+ res1 = _mm_srai_epi16(it2, 5); \
+ it2 = _mm_mullo_epi16(it1, row12); \
+ it3 = _mm_mullo_epi16(v_deltaFract, row22); \
+ it2 = _mm_add_epi16(it2, it3); \
+ it2 = _mm_add_epi16(it2, i16); \
+ res2 = _mm_srai_epi16(it2, 5); \
+ \
+ itmp = _mm_packus_epi16(res1, res2); \
+ _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride)), itmp); \
+ it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+ it2 = _mm_mullo_epi16(it1, row13); \
+ it3 = _mm_mullo_epi16(v_deltaFract, row23); \
+ it2 = _mm_add_epi16(it2, it3); \
+ i16 = _mm_set1_epi16(16); \
+ it2 = _mm_add_epi16(it2, i16); \
+ res1 = _mm_srai_epi16(it2, 5); \
+ it2 = _mm_mullo_epi16(it1, row14); \
+ it3 = _mm_mullo_epi16(v_deltaFract, row24); \
+ it2 = _mm_add_epi16(it2, it3); \
+ it2 = _mm_add_epi16(it2, i16); \
+ res2 = _mm_srai_epi16(it2, 5); \
+ \
+ itmp = _mm_packus_epi16(res1, res2); \
+ _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride) + 16), itmp); \
+}
+
+#define PREDANG_CALCROW_HOR(X, rowx) { \
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][((X))]))); \
+ row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+ row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+ \
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][((X))] + 1))); \
+ row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+ row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+ \
+ v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
+ v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
+ it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+ it2 = _mm_mullo_epi16(it1, row11L); \
+ it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
+ it2 = _mm_add_epi16(it2, it3); \
+ i16 = _mm_set1_epi16(16); \
+ it2 = _mm_add_epi16(it2, i16); \
+ row11L = _mm_srai_epi16(it2, 5); \
+ it2 = _mm_mullo_epi16(it1, row11H); \
+ it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
+ it2 = _mm_add_epi16(it2, it3); \
+ it2 = _mm_add_epi16(it2, i16); \
+ row11H = _mm_srai_epi16(it2, 5); \
+ \
+ rowx = _mm_packus_epi16(row11L, row11H); \
+}
+
+#define PREDANG_CALCROW_HOR_MODE2(rowx) { \
+ v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
+ v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
+ it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+ it2 = _mm_mullo_epi16(it1, row11L); \
+ it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
+ it2 = _mm_add_epi16(it2, it3); \
+ i16 = _mm_set1_epi16(16); \
+ it2 = _mm_add_epi16(it2, i16); \
+ res1 = _mm_srai_epi16(it2, 5); \
+ it2 = _mm_mullo_epi16(it1, row11H); \
+ it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
+ it2 = _mm_add_epi16(it2, it3); \
+ it2 = _mm_add_epi16(it2, i16); \
+ res2 = _mm_srai_epi16(it2, 5); \
+ \
+ rowx = _mm_packus_epi16(res1, res2); \
+}
+
+ // ROWL/H are Vec8s variables, X is the offset of the data to be loaded
+#define LOADROW(ROWL, ROWH, X) { \
+/* tmp.load(refMain + 1 + (X)); \
+ ROWL = extend_low(tmp); \
+ ROWH = extend_high(tmp); */\
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (X))); \
+ ROWL = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+ ROWH = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+}
+
+#define BLND2_2(R1, R2) { \
+/* tmp1 = blend2q<0, 2>((Vec2q)R1, (Vec2q)R2); \
+ tmp2 = blend2q<1, 3>((Vec2q)R1, (Vec2q)R2); \
+ tmp1.store(pDst); pDst += dstStride; \
+ tmp2.store(pDst); pDst += dstStride; */\
+ itmp1 = _mm_unpacklo_epi64(R1, R2); \
+ itmp2 = _mm_unpackhi_epi64(R1, R2); \
+ _mm_storeu_si128((__m128i*)pDst, itmp1); \
+ pDst += dstStride; \
+ _mm_storeu_si128((__m128i*)pDst, itmp2); \
+ pDst += dstStride; \
+}
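+
+// BLND2_2 interleaves the low and high 64-bit halves of two transposed
+// registers and stores them as two consecutive output rows; together with MB8
+// it completes a 16x16 byte transpose.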
+
+#define MB8(R1, R2, R3, R4, R5, R6, R7, R8) { \
+ itmp1 = _mm_unpacklo_epi8(R1, R2); \
+ itmp2 = _mm_unpackhi_epi8(R1, R2); \
+ R1 = itmp1; \
+ R2 = itmp2; \
+ itmp1 = _mm_unpacklo_epi8(R3, R4); \
+ itmp2 = _mm_unpackhi_epi8(R3, R4); \
+ R3 = itmp1; \
+ R4 = itmp2; \
+ itmp1 = _mm_unpacklo_epi16(R1, R3); \
+ itmp2 = _mm_unpackhi_epi16(R1, R3); \
+ R1 = itmp1; \
+ R3 = itmp2; \
+ itmp1 = _mm_unpacklo_epi16(R2, R4); \
+ itmp2 = _mm_unpackhi_epi16(R2, R4); \
+ R2 = itmp1; \
+ R4 = itmp2; \
+ itmp1 = _mm_unpacklo_epi8(R5, R6); \
+ itmp2 = _mm_unpackhi_epi8(R5, R6); \
+ R5 = itmp1; \
+ R6 = itmp2; \
+ itmp1 = _mm_unpacklo_epi8(R7, R8); \
+ itmp2 = _mm_unpackhi_epi8(R7, R8); \
+ R7 = itmp1; \
+ R8 = itmp2; \
+ itmp1 = _mm_unpacklo_epi16(R5, R7); \
+ itmp2 = _mm_unpackhi_epi16(R5, R7); \
+ R5 = itmp1; \
+ R7 = itmp2; \
+ itmp1 = _mm_unpacklo_epi16(R6, R8); \
+ itmp2 = _mm_unpackhi_epi16(R6, R8); \
+ R6 = itmp1; \
+ R8 = itmp2; \
+ itmp1 = _mm_unpacklo_epi32(R1, R5); \
+ itmp2 = _mm_unpackhi_epi32(R1, R5); \
+ R1 = itmp1; \
+ R5 = itmp2; \
+ \
+ itmp1 = _mm_unpacklo_epi32(R2, R6); \
+ itmp2 = _mm_unpackhi_epi32(R2, R6); \
+ R2 = itmp1; \
+ R6 = itmp2; \
+ \
+ itmp1 = _mm_unpacklo_epi32(R3, R7); \
+ itmp2 = _mm_unpackhi_epi32(R3, R7); \
+ R3 = itmp1; \
+ R7 = itmp2; \
+ \
+ itmp1 = _mm_unpacklo_epi32(R4, R8); \
+ itmp2 = _mm_unpackhi_epi32(R4, R8); \
+ R4 = itmp1; \
+ R8 = itmp2; \
+}
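+
+// MB8 is the byte-transpose network: three rounds of unpacks (8-, 16-, then
+// 32-bit) interleave eight 16-byte rows so BLND2_2 can store them column-wise.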
+
+#define CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, X) { \
+ PREDANG_CALCROW_HOR(0 + X, R1) \
+ PREDANG_CALCROW_HOR(1 + X, R2) \
+ PREDANG_CALCROW_HOR(2 + X, R3) \
+ PREDANG_CALCROW_HOR(3 + X, R4) \
+ PREDANG_CALCROW_HOR(4 + X, R5) \
+ PREDANG_CALCROW_HOR(5 + X, R6) \
+ PREDANG_CALCROW_HOR(6 + X, R7) \
+}
+
+#define CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8) { \
+ PREDANG_CALCROW_HOR_MODE2(R1) \
+ PREDANG_CALCROW_HOR_MODE2(R2) \
+ PREDANG_CALCROW_HOR_MODE2(R3) \
+ PREDANG_CALCROW_HOR_MODE2(R4) \
+ PREDANG_CALCROW_HOR_MODE2(R5) \
+ PREDANG_CALCROW_HOR_MODE2(R6) \
+ PREDANG_CALCROW_HOR_MODE2(R7) \
+}
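+
+// Note: both CALC_BLND_8ROWS* variants fill only R1..R7; callers compute the
+// eighth row themselves (see PREDANG_CALCROW_HOR(7 + X, R8) below), which lets
+// the final row of a block be loaded directly from the reference instead.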
+
+void xPredIntraAng32x32(int /*bitDepth*/, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove)
+{
+ int k;
+ int blkSize = width;
+
+ // Map the mode index to main prediction direction and angle
+ assert(dirMode > 1); // no Planar or DC modes here
+ bool modeHor = (dirMode < 18);
+ bool modeVer = !modeHor;
+ int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+ int lookIdx = intraPredAngle;
+ int absAng = abs(intraPredAngle);
+ int signAng = intraPredAngle < 0 ? -1 : 1;
+
+ // Set bitshifts and scale the angle parameter to block size
+ int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+ int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
+ int invAngle = invAngTable[absAng];
+ absAng = angTable[absAng];
+ intraPredAngle = signAng * absAng;
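+ // Example (HM constants HOR_IDX = 10, VER_IDX = 26): dirMode 2 and 34, the
+ // two extreme diagonals, both scale to intraPredAngle = +32; dirMode 18
+ // scales to -32.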
+
+ // Do angular predictions
+
+ pixel* refMain;
+ pixel* refSide;
+
+ // Initialise the Main and Left reference array.
+ if (intraPredAngle < 0)
+ {
+ refMain = (modeVer ? refAbove : refLeft); // + (blkSize - 1);
+ refSide = (modeVer ? refLeft : refAbove); // + (blkSize - 1);
+
+ // Extend the Main reference to the left.
+ int invAngleSum = 128; // rounding for (shift by 8)
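+ // invAngle is (256 * 32) / |angle| in 8.8 fixed point: accumulating it and
+ // shifting right by 8 projects each extended main-reference sample onto the
+ // side reference; the initial 128 provides rounding.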
+ if (intraPredAngle != -32)
+ for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
+ {
+ invAngleSum += invAngle;
+ refMain[k] = refSide[invAngleSum >> 8];
+ }
+ }
+ else
+ {
+ refMain = modeVer ? refAbove : refLeft;
+ refSide = modeVer ? refLeft : refAbove;
+ }
+
+ // no bFilter edge smoothing here: 32x32 blocks never use it
+ if (intraPredAngle == 0) // Exactly horizontal/vertical angles
+ {
+ if (modeHor)
+ {
+ Vec16uc v_temp, tmp1;
+
+ v_temp.load(refMain + 1);
+ /*BROADSTORE16ROWS;*/
+ tmp1 = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(v_temp);
+ tmp1.store(pDst + (0 * dstStride));
+ tmp1.store(pDst + (0 * dstStride) + 16);
+ tmp1 = permute16uc<1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>(v_temp);
+ tmp1.store(pDst + (1 * dstStride));
+ tmp1.store(pDst + (1 * dstStride) + 16);
+ tmp1 = permute16uc<2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2>(v_temp);
+ tmp1.store(pDst + (2 * dstStride));
+ tmp1.store(pDst + (2 * dstStride) + 16);
+ tmp1 = permute16uc<3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3>(v_temp);
+ tmp1.store(pDst + (3 * dstStride));
+ tmp1.store(pDst + (3 * dstStride) + 16);
+ tmp1 = permute16uc<4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4>(v_temp);
+ tmp1.store(pDst + (4 * dstStride));
+ tmp1.store(pDst + (4 * dstStride) + 16);
+ tmp1 = permute16uc<5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5>(v_temp);
+ tmp1.store(pDst + (5 * dstStride));
+ tmp1.store(pDst + (5 * dstStride) + 16);
+ tmp1 = permute16uc<6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6>(v_temp);
+ tmp1.store(pDst + (6 * dstStride));
+ tmp1.store(pDst + (6 * dstStride) + 16);
+ tmp1 = permute16uc<7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7>(v_temp);
+ tmp1.store(pDst + (7 * dstStride));
+ tmp1.store(pDst + (7 * dstStride) + 16);
+ tmp1 = permute16uc<8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8>(v_temp);
+ tmp1.store(pDst + (8 * dstStride));
+ tmp1.store(pDst + (8 * dstStride) + 16);
+ tmp1 = permute16uc<9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9>(v_temp);
+ tmp1.store(pDst + (9 * dstStride));
+ tmp1.store(pDst + (9 * dstStride) + 16);
+ tmp1 = permute16uc<10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10>(v_temp);
+ tmp1.store(pDst + (10 * dstStride));
+ tmp1.store(pDst + (10 * dstStride) + 16);
+ tmp1 = permute16uc<11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11>(v_temp);
+ tmp1.store(pDst + (11 * dstStride));
+ tmp1.store(pDst + (11 * dstStride) + 16);
+ tmp1 = permute16uc<12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12>(v_temp);
+ tmp1.store(pDst + (12 * dstStride));
+ tmp1.store(pDst + (12 * dstStride) + 16);
+ tmp1 = permute16uc<13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13>(v_temp);
+ tmp1.store(pDst + (13 * dstStride));
+ tmp1.store(pDst + (13 * dstStride) + 16);
+ tmp1 = permute16uc<14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14>(v_temp);
+ tmp1.store(pDst + (14 * dstStride));
+ tmp1.store(pDst + (14 * dstStride) + 16);
+ tmp1 = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>(v_temp);
+ tmp1.store(pDst + (15 * dstStride));
+ tmp1.store(pDst + (15 * dstStride) + 16);
+
+ pDst += 16 * dstStride;
+ v_temp.load(refMain + 1 + 16);
+ /*BROADSTORE16ROWS;*/
+ tmp1 = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(v_temp);
+ tmp1.store(pDst + (0 * dstStride));
+ tmp1.store(pDst + (0 * dstStride) + 16);
+ tmp1 = permute16uc<1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>(v_temp);
+ tmp1.store(pDst + (1 * dstStride));
+ tmp1.store(pDst + (1 * dstStride) + 16);
+ tmp1 = permute16uc<2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2>(v_temp);
+ tmp1.store(pDst + (2 * dstStride));
+ tmp1.store(pDst + (2 * dstStride) + 16);
+ tmp1 = permute16uc<3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3>(v_temp);
+ tmp1.store(pDst + (3 * dstStride));
+ tmp1.store(pDst + (3 * dstStride) + 16);
+ tmp1 = permute16uc<4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4>(v_temp);
+ tmp1.store(pDst + (4 * dstStride));
+ tmp1.store(pDst + (4 * dstStride) + 16);
+ tmp1 = permute16uc<5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5>(v_temp);
+ tmp1.store(pDst + (5 * dstStride));
+ tmp1.store(pDst + (5 * dstStride) + 16);
+ tmp1 = permute16uc<6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6>(v_temp);
+ tmp1.store(pDst + (6 * dstStride));
+ tmp1.store(pDst + (6 * dstStride) + 16);
+ tmp1 = permute16uc<7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7>(v_temp);
+ tmp1.store(pDst + (7 * dstStride));
+ tmp1.store(pDst + (7 * dstStride) + 16);
+ tmp1 = permute16uc<8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8>(v_temp);
+ tmp1.store(pDst + (8 * dstStride));
+ tmp1.store(pDst + (8 * dstStride) + 16);
+ tmp1 = permute16uc<9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9>(v_temp);
+ tmp1.store(pDst + (9 * dstStride));
+ tmp1.store(pDst + (9 * dstStride) + 16);
+ tmp1 = permute16uc<10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10>(v_temp);
+ tmp1.store(pDst + (10 * dstStride));
+ tmp1.store(pDst + (10 * dstStride) + 16);
+ tmp1 = permute16uc<11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11>(v_temp);
+ tmp1.store(pDst + (11 * dstStride));
+ tmp1.store(pDst + (11 * dstStride) + 16);
+ tmp1 = permute16uc<12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12>(v_temp);
+ tmp1.store(pDst + (12 * dstStride));
+ tmp1.store(pDst + (12 * dstStride) + 16);
+ tmp1 = permute16uc<13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13>(v_temp);
+ tmp1.store(pDst + (13 * dstStride));
+ tmp1.store(pDst + (13 * dstStride) + 16);
+ tmp1 = permute16uc<14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14>(v_temp);
+ tmp1.store(pDst + (14 * dstStride));
+ tmp1.store(pDst + (14 * dstStride) + 16);
+ tmp1 = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>(v_temp);
+ tmp1.store(pDst + (15 * dstStride));
+ tmp1.store(pDst + (15 * dstStride) + 16);
+ }
+ else
+ {
+ __m128i v_main;
+ Pel *dstOriginal = pDst;
+// v_main.load(refMain + 1);
+ v_main = _mm_loadu_si128((__m128i const*)(refMain + 1));
+// v_main.store(pDst);
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+
+ pDst = dstOriginal + 16;
+ v_main = _mm_loadu_si128((__m128i const*)(refMain + 17));
+// v_main.store(pDst);
+
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ pDst += dstStride;
+ _mm_storeu_si128((__m128i*)(pDst), v_main);
+ }
+ }
+ else if (intraPredAngle == -32)
+ {
+ Vec16uc v_refSide;
+ pixel refMain0 = refMain[0];
+
+ v_refSide.load(refSide);
+ v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
+ v_refSide.store(refMain - 15);
+
+ v_refSide.load(refSide + 16);
+ v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
+ v_refSide.store(refMain - 31);
+
+ refMain[0] = refMain0;
+
+ __m128i itmp;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+
+ return;
+ }
+ else if (intraPredAngle == 32)
+ {
+ __m128i itmp;
+ refMain += 2;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain++);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ return;
+ }
+ else
+ {
+ if (modeHor)
+ {
+ __m128i row11L, row12L, row11H, row12H, res1, res2;
+ __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
+ __m128i R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16;
+
+ Pel * original_pDst = pDst;
+ v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
+ v_ipAngle = _mm_set1_epi16(intraPredAngle);
+ thirty2 = _mm_set1_epi16(32);
+ thirty1 = _mm_set1_epi16(31);
+ __m128i itmp, itmp1, itmp2, it1, it2, it3, i16;
+
+ switch (intraPredAngle)
+ {
+ case -2:
+ LOADROW(row11L, row11H, -1)
+ LOADROW(row12L, row12H, 0)
+ R16 = _mm_packus_epi16(row11L, row11H); //R16 = compress(row11L, row11H);
+
+ CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+ PREDANG_CALCROW_HOR_MODE2(R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+
+ v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+ v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+ row12L = row11L;
+ row12H = row11H;
+ LOADROW(row11L, row11H, -2)
+ R16 = _mm_packus_epi16(row11L, row11H);
+ pDst = original_pDst + 16;
+
+ CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+ PREDANG_CALCROW_HOR_MODE2(R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+
+ pDst = original_pDst + (16 * dstStride);
+ refMain += 16;
+
+ v_deltaPos = _mm_setzero_si128();
+ v_ipAngle = _mm_set1_epi16(intraPredAngle);
+ LOADROW(row11L, row11H, -1)
+ LOADROW(row12L, row12H, 0)
+ R16 = _mm_packus_epi16(row11L, row11H);
+
+ CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+ PREDANG_CALCROW_HOR_MODE2(R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+
+ v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+ v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+ row12L = row11L;
+ row12H = row11H;
+ LOADROW(row11L, row11H, -2)
+ R16 = _mm_packus_epi16(row11L, row11H);
+ pDst = original_pDst + (16 * dstStride) + 16;
+
+ CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+ PREDANG_CALCROW_HOR_MODE2(R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+ return;
+
+ case 2:
+ LOADROW(row11L, row11H, 0)
+ LOADROW(row12L, row12H, 1)
+ R16 = _mm_packus_epi16(row12L, row12H);
+
+ CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+ PREDANG_CALCROW_HOR_MODE2(R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+
+ v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+ v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+ row11L = row12L;
+ row11H = row12H;
+ LOADROW(row12L, row12H, 2)
+ R16 = _mm_packus_epi16(row12L, row12H);
+ pDst = original_pDst + 16;
+
+ CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+ PREDANG_CALCROW_HOR_MODE2(R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+
+ pDst = original_pDst + (16 * dstStride);
+ refMain += 16;
+ v_deltaPos = _mm_setzero_si128();
+
+ v_ipAngle = _mm_set1_epi16(intraPredAngle);
+ LOADROW(row11L, row11H, 0)
+ LOADROW(row12L, row12H, 1)
+ R16 = _mm_packus_epi16(row12L, row12H);
+
+ CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+ PREDANG_CALCROW_HOR_MODE2(R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+
+ v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+ v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+ row11L = row12L;
+ row11H = row12H;
+ LOADROW(row12L, row12H, 2)
+ R16 = _mm_packus_epi16(row12L, row12H);
+ pDst = original_pDst + (16 * dstStride) + 16;
+
+ CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+ PREDANG_CALCROW_HOR_MODE2(R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+ return;
+ }
+
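+ // General horizontal path: compute eight rows at a time into registers
+ // (CALC_BLND_8ROWS plus one PREDANG_CALCROW_HOR), transpose with MB8 and
+ // BLND2_2, and walk the 32x32 block as four 16x16 quadrants.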
+ CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
+ PREDANG_CALCROW_HOR(7 + 0, R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
+ PREDANG_CALCROW_HOR(7 + 8, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+
+ pDst = original_pDst + 16;
+
+ CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 16)
+ PREDANG_CALCROW_HOR(7 + 16, R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 24)
+ R16 = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+
+ pDst = original_pDst + (16 * dstStride);
+ refMain += 16;
+ v_deltaPos = _mm_setzero_si128();
+ v_ipAngle = _mm_set1_epi16(intraPredAngle);
+
+ CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
+ PREDANG_CALCROW_HOR(7 + 0, R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
+ PREDANG_CALCROW_HOR(7 + 8, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+ pDst = original_pDst + (16 * dstStride) + 16;
+
+ CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 16)
+ PREDANG_CALCROW_HOR(7 + 16, R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 24)
+ R16 = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+ }
+ else
+ {
+ __m128i row11L, row12L, row11H, row12H;
+ __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
+ __m128i row11, row12, row13, row14, row21, row22, row23, row24;
+ __m128i res1, res2;
+
+ v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
+ v_ipAngle = _mm_set1_epi16(intraPredAngle);
+ thirty2 = _mm_set1_epi16(32);
+ thirty1 = _mm_set1_epi16(31);
+ __m128i itmp, it1, it2, it3, i16;
+
+ switch (intraPredAngle)
+ {
+ case -2:
+ LOADROW(row11, row12, -1)
+ LOADROW(row21, row22, 0)
+ LOADROW(row13, row14, 15)
+ LOADROW(row23, row24, 16)
+ for (int i = 0; i <= 14; i++)
+ {
+ PREDANG_CALCROW_VER_MODE2(i);
+ }
+
+ //deltaFract == 0 for 16th row
+ v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+ v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+ itmp = _mm_packus_epi16(row11, row12);
+ _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride)), itmp);
+ itmp = _mm_packus_epi16(row13, row14);
+ _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride) + 16), itmp);
+
+ row21 = row11;
+ row22 = row12;
+ row23 = row13;
+ row24 = row14;
+
+ LOADROW(row11, row12, -2)
+ LOADROW(row13, row14, 14)
+ for (int i = 16; i <= 30; i++)
+ {
+ PREDANG_CALCROW_VER_MODE2(i);
+ }
+
+ itmp = _mm_packus_epi16(row11, row12);
+ _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
+ itmp = _mm_packus_epi16(row13, row14);
+ _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
+
+ return;
+
+ case 2:
+
+ LOADROW(row11, row12, 0)
+ LOADROW(row21, row22, 1)
+ LOADROW(row13, row14, 16)
+ LOADROW(row23, row24, 17)
+ for (int i = 0; i <= 14; i++)
+ {
+ PREDANG_CALCROW_VER_MODE2(i);
+ }
+
+ //deltaFract == 0 for 16th row
+
+ v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+ v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+ itmp = _mm_packus_epi16(row21, row22);
+ _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride)), itmp);
+ itmp = _mm_packus_epi16(row23, row24);
+ _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride) + 16), itmp);
+
+ row11 = row21;
+ row12 = row22;
+ row13 = row23;
+ row14 = row24;
+
+ LOADROW(row21, row22, 2)
+ LOADROW(row23, row24, 18)
+ for (int i = 16; i <= 30; i++)
+ {
+ PREDANG_CALCROW_VER_MODE2(i);
+ }
+
+ itmp = _mm_packus_epi16(row21, row22);
+ _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
+ itmp = _mm_packus_epi16(row23, row24);
+ _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
+
+ return;
+ }
+
+ for (int i = 0; i <= 30; i++)
+ {
+ PREDANG_CALCROW_VER(i);
+ }
+
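+ // Row 31: deltaPos has advanced by 32 * intraPredAngle, so deltaFract == 0
+ // and the final row is a straight copy from the main reference.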
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
+ _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 17 + GETAP(lookIdx, 31)));
+ _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
+ }
+ }
+}
+
+#endif /* if HIGH_BIT_DEPTH */
+
+void xPredIntraAngBufRef(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, bool bFilter, pixel *refLeft, pixel *refAbove)
+{
+#if HIGH_BIT_DEPTH
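+ // 16bpp builds have no SIMD kernels here yet; fall through to the C code below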
+#else
+ switch (width)
+ {
+ case 4:
+ xPredIntraAng4x4(bitDepth, pDst, dstStride, width, dirMode, refLeft, refAbove, bFilter);
+ return;
+ case 8:
+ xPredIntraAng8x8(bitDepth, pDst, dstStride, width, dirMode, refLeft, refAbove, bFilter);
+ return;
+ case 16:
+ xPredIntraAng16x16(bitDepth, pDst, dstStride, width, dirMode, refLeft, refAbove, bFilter);
+ return;
+ case 32:
+ xPredIntraAng32x32(bitDepth, pDst, dstStride, width, dirMode, refLeft, refAbove);
+ return;
+ }
+
+#endif /* if HIGH_BIT_DEPTH */
+
+ int k, l;
+ int blkSize = width;
+
+ // Map the mode index to main prediction direction and angle
+ assert(dirMode > 1); // no Planar or DC modes here
+ bool modeHor = (dirMode < 18);
+ bool modeVer = !modeHor;
+ int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+ int absAng = abs(intraPredAngle);
+ int signAng = intraPredAngle < 0 ? -1 : 1;
+
+ // Set bitshifts and scale the angle parameter to block size
+ int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+ int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
+ int invAngle = invAngTable[absAng];
+ absAng = angTable[absAng];
+ intraPredAngle = signAng * absAng;
+
+ // Do angular predictions
+ {
+ pixel* refMain;
+ pixel* refSide;
+
+ // Initialise the Main and Left reference array.
+ if (intraPredAngle < 0)
+ {
+ refMain = (modeVer ? refAbove : refLeft); // + (blkSize - 1);
+ refSide = (modeVer ? refLeft : refAbove); // + (blkSize - 1);
+
+ // Extend the Main reference to the left.
+ int invAngleSum = 128; // rounding for (shift by 8)
+ for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
+ {
+ invAngleSum += invAngle;
+ refMain[k] = refSide[invAngleSum >> 8];
+ }
+ }
+ else
+ {
+ refMain = modeVer ? refAbove : refLeft;
+ refSide = modeVer ? refLeft : refAbove;
+ }
+
+ if (intraPredAngle == 0)
+ {
+ for (k = 0; k < blkSize; k++)
+ {
+ for (l = 0; l < blkSize; l++)
+ {
+ pDst[k * dstStride + l] = refMain[l + 1];
+ }
+ }
+
+ if (bFilter)
+ {
+ for (k = 0; k < blkSize; k++)
+ {
+ pDst[k * dstStride] = (pixel)Clip3(0, (1 << bitDepth) - 1, static_cast<short>(pDst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1));
+ }
+ }
+ }
+ else
+ {
+ int deltaPos = 0;
+ int deltaInt;
+ int deltaFract;
+ int refMainIndex;
+
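+ // deltaPos accumulates the angle in 1/32-pel units: deltaInt is the integer
+ // offset into refMain and deltaFract the 5-bit interpolation weight.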
+ for (k = 0; k < blkSize; k++)
+ {
+ deltaPos += intraPredAngle;
+ deltaInt = deltaPos >> 5;
+ deltaFract = deltaPos & (32 - 1);
+
+ if (deltaFract)
+ {
+ // Do linear filtering
+ for (l = 0; l < blkSize; l++)
+ {
+ refMainIndex = l + deltaInt + 1;
+ pDst[k * dstStride + l] = (pixel)(((32 - deltaFract) * refMain[refMainIndex] + deltaFract * refMain[refMainIndex + 1] + 16) >> 5);
+ }
+ }
+ else
+ {
+ // Just copy the integer samples
+ for (l = 0; l < blkSize; l++)
+ {
+ pDst[k * dstStride + l] = refMain[l + deltaInt + 1];
+ }
+ }
+ }
+ }
+
+ // Flip the block if this is the horizontal mode
+ if (modeHor)
+ {
+ pixel tmp;
+ for (k = 0; k < blkSize - 1; k++)
+ {
+ for (l = k + 1; l < blkSize; l++)
+ {
+ tmp = pDst[k * dstStride + l];
+ pDst[k * dstStride + l] = pDst[l * dstStride + k];
+ pDst[l * dstStride + k] = tmp;
+ }
+ }
+ }
+ }
+}
+
+#if HIGH_BIT_DEPTH
+#else // HIGH_BIT_DEPTH
+
+#if INSTRSET < 40 // always true for now: keeps the C reference path until the SIMD version below is filled in
+void xPredIntraAngs4(pixel *pDst0, pixel *pAbove0, pixel *pLeft0, pixel *pAbove1, pixel *pLeft1, bool bLuma)
+{
+ int iMode;
+
+ // avoid unreferenced-parameter warnings
+ (void)pLeft1;
+ (void)pAbove1;
+
+ for (iMode = 2; iMode <= 34; iMode++)
+ {
+ pixel *pLeft = pLeft0;
+ pixel *pAbove = pAbove0;
+ pixel *pDst = pDst0 + (iMode-2) * (4 * 4);
+ xPredIntraAngBufRef(8, pDst, 4, 4, iMode, bLuma, pLeft, pAbove);
+
+ // The caller compares against a transposed original block, so horizontal
+ // modes are stored transposed: re-apply the flip that xPredIntraAngBufRef
+ // just undid.
+ bool modeHor = (iMode < 18);
+ if (modeHor)
+ {
+ pixel tmp;
+ const int width = 4;
+ for (int k = 0; k < width - 1; k++)
+ {
+ for (int l = k + 1; l < width; l++)
+ {
+ tmp = pDst[k * width + l];
+ pDst[k * width + l] = pDst[l * width + k];
+ pDst[l * width + k] = tmp;
+ }
+ }
+ }
+ }
+}
+
+#else // INSTRSET >= 40
+
+void xPredIntraAngs4(pixel *pDst, pixel *pAbove0, pixel *pLeft0, pixel *pAbove1, pixel *pLeft1, bool bLuma)
+{
+ // TODO: SIMD implementation pending; the INSTRSET guard keeps this stub unreachable
+}
+#endif // INSTRSET < 40
+
+#endif // HIGH_BIT_DEPTH
+
+}
+
+#include "utils.h"
+
+namespace x265 {
+void NAME(Setup_Vec_IPredPrimitives)(EncoderPrimitives& p)
+{
+ initFileStaticVars();
+ p.getIPredDC = predIntraDC;
+ p.getIPredPlanar = predIntraPlanar;
+ p.getIPredAng = xPredIntraAngBufRef;
+ p.getIPredAngs4 = xPredIntraAngs4;
+}
+
+}
diff --git a/source/test/intrapredharness.cpp b/source/test/intrapredharness.cpp
index 1f6bcf6..42b6dbe 100644
--- a/source/test/intrapredharness.cpp
+++ b/source/test/intrapredharness.cpp
@@ -47,6 +47,8 @@ IntraPredHarness::IntraPredHarness()
pixel_out_C = (pixel*)malloc(out_size * sizeof(pixel));
pixel_out_Vec = (pixel*)malloc(out_size * sizeof(pixel));
+ pixel_out_33_C = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), out_size_33, 32);
+ pixel_out_33_Vec = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), out_size_33, 32);
if (!pixel_out_C || !pixel_out_Vec)
{
@@ -62,6 +64,8 @@ IntraPredHarness::~IntraPredHarness()
free(pixel_buff);
free(pixel_out_C);
free(pixel_out_Vec);
+ TestHarness::alignedFree(pixel_out_33_C);
+ TestHarness::alignedFree(pixel_out_33_Vec);
}
bool IntraPredHarness::check_getIPredDC_primitive(x265::getIPredDC_t ref, x265::getIPredDC_t opt)
@@ -167,6 +171,52 @@ bool IntraPredHarness::check_getIPredAng_primitive(x265::getIPredAng_p ref, x265
return true;
}
+bool IntraPredHarness::check_getIPredAngs4_primitive(x265::getIPredAngs_t ref, x265::getIPredAngs_t opt)
+{
+ int j = ADI_BUF_STRIDE;
+
+ bool isLuma;
+
+ for (int width = 4; width <= 4; width <<= 1)
+ {
+ for (int i = 0; i <= 100; i++)
+ {
+ isLuma = (width <= 16) && (rand() % 2);
+
+ pixel * refAbove0 = pixel_buff + j;
+ pixel * refLeft0 = refAbove0 + 3 * width;
+ refLeft0[0] = refAbove0[0];
+
+ pixel * refAbove1 = pixel_buff + j + FENC_STRIDE;
+ pixel * refLeft1 = refAbove1 + 3 * width + FENC_STRIDE;
+ refLeft1[0] = refAbove1[0];
+
+#if _DEBUG
+ memset(pixel_out_33_Vec, 0xCD, out_size_33);
+ memset(pixel_out_33_C, 0xCD, out_size_33);
+#endif
+
+ ref(pixel_out_33_C, refAbove0, refLeft0, refAbove1, refLeft1, isLuma);
+ opt(pixel_out_33_Vec, refAbove0, refLeft0, refAbove1, refLeft1, isLuma);
+ for (int p = 0; p <= 32; p++) // 33 angular modes, indexed by (mode - 2)
+ {
+ for (int k = 0; k < width; k++)
+ {
+ if (memcmp(pixel_out_33_C + p * (width * width) + k * width, pixel_out_33_Vec + p * (width * width) + k * width, width))
+ {
+ printf("\nFailed: [%2d]: width=%d, mode=%d, bfilter=%d\n", k, width, p + 2, isLuma);
+ return false;
+ }
+ }
+ }
+
+ j += FENC_STRIDE;
+ }
+ }
+
+ return true;
+}
+
bool IntraPredHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
if (opt.getIPredDC)
@@ -193,6 +243,14 @@ bool IntraPredHarness::testCorrectness(const EncoderPrimitives& ref, const Encod
return false;
}
}
+ if (opt.getIPredAngs4)
+ {
+ if (!check_getIPredAngs4_primitive(ref.getIPredAngs4, opt.getIPredAngs4))
+ {
+ printf("intrapred_angular_4x4_33_modes failed\n");
+ return false;
+ }
+ }
return true;
}
@@ -239,4 +297,18 @@ void IntraPredHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderP
}
}
}
+ if (opt.getIPredAngs4)
+ {
+ for (int ii = 4; ii <= 4; ii <<= 1)
+ {
+ width = ii;
+ bool bFilter = (width <= 16);
+ pixel * refAbove = pixel_buff + srcStride;
+ pixel * refLeft = refAbove + 3 * width;
+ refLeft[0] = refAbove[0];
+ printf("IPred_getIPredAngs4\t\t");
+ REPORT_SPEEDUP(opt.getIPredAngs4, ref.getIPredAngs4,
+ pixel_out_33_Vec, refAbove, refLeft, refAbove, refLeft, bFilter);
+ }
+ }
}
diff --git a/source/test/intrapredharness.h b/source/test/intrapredharness.h
index 5f138d3..9c144f1 100644
--- a/source/test/intrapredharness.h
+++ b/source/test/intrapredharness.h
@@ -34,15 +34,19 @@ protected:
pixel *pixel_buff;
pixel *pixel_out_C;
pixel *pixel_out_Vec;
+ pixel *pixel_out_33_C;
+ pixel *pixel_out_33_Vec;
pixel *IP_vec_output_p, *IP_C_output_p;
static const int ip_t_size = 4 * 65 * 65 * 100;
static const int out_size = 64 * FENC_STRIDE;
+ static const int out_size_33 = 33 * 64 * FENC_STRIDE;
bool check_getIPredDC_primitive(x265::getIPredDC_t ref, x265::getIPredDC_t opt);
bool check_getIPredPlanar_primitive(x265::getIPredPlanar_t ref, x265::getIPredPlanar_t opt);
bool check_getIPredAng_primitive(x265::getIPredAng_p ref, x265::getIPredAng_p opt);
+ bool check_getIPredAngs4_primitive(x265::getIPredAngs_t ref, x265::getIPredAngs_t opt);
public:
--
1.8.3.msysgit.0