[x265] [PATCH 6/6] intrapred: framework to generate all 33 angle modes at once

Min Chen chenm003 at 163.com
Tue Jun 18 18:43:38 CEST 2013


---
 source/Lib/TLibCommon/TComPrediction.cpp |    5 +-
 source/Lib/TLibCommon/TComPrediction.h   |    1 +
 source/Lib/TLibEncoder/TEncSearch.cpp    |   65 +-
 source/common/IntraPred.cpp              |  569 +-
 source/common/primitives.h               |    2 +
 source/common/vec/intrapred.inc          | 9409 +++++++++++++++---------------
 source/test/intrapredharness.cpp         |   72 +
 source/test/intrapredharness.h           |    4 +
 8 files changed, 5179 insertions(+), 4948 deletions(-)
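
The idea: generate all 33 angular predictions for a 4x4 block with one call
to the new getIPredAngs4 primitive instead of 33 calls through
predIntraLumaAng().  The predictions land in a single mode-major buffer;
horizontal modes (2..17) are stored transposed, and the encoder costs them
against a transposed copy of the original block.

A rough sketch of the intended calling pattern (names are taken from this
patch; the buffer name 'preds' is illustrative, and the primitive currently
assumes 4x4 blocks at 8-bit depth):

    ALIGN_VAR_32(Pel, preds[33 * 4 * 4]);      // modes 2..34, mode-major
    primitives.getIPredAngs4(preds, pAbove0, pLeft0, pAbove1, pLeft1, true);
    for (int mode = 2; mode <= 34; mode++)
    {
        Pel *p = preds + (mode - 2) * 4 * 4;   // 4x4 prediction for this mode
        // modes 2..17 are transposed; cost them against a transposed original
    }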

diff --git a/source/Lib/TLibCommon/TComPrediction.cpp b/source/Lib/TLibCommon/TComPrediction.cpp
index 6e6baf3..6a58140 100644
--- a/source/Lib/TLibCommon/TComPrediction.cpp
+++ b/source/Lib/TLibCommon/TComPrediction.cpp
@@ -60,13 +60,15 @@ const UChar m_aucIntraFilter[5] =
 TComPrediction::TComPrediction()
     : m_pLumaRecBuffer(0)
     , m_iLumaRecStride(0)
+    , m_piPredBuf(NULL)
+    , m_piPredAngBufs(NULL)
 {
-    m_piPredBuf = NULL;
 }
 
 TComPrediction::~TComPrediction()
 {
     delete[] m_piPredBuf;
+    xFree(m_piPredAngBufs);
 
     xFree(refAbove);
     xFree(refAboveFlt);
@@ -114,6 +116,7 @@ Void TComPrediction::initTempBuff()
         m_iPredBufHeight  = ((MAX_CU_SIZE + 2) << 4);
         m_iPredBufStride = ((MAX_CU_SIZE  + 8) << 4);
         m_piPredBuf = new Pel[m_iPredBufStride * m_iPredBufHeight];
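+        // one NxN prediction plane per angular mode (modes 2 through 34)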
+        m_piPredAngBufs = (Pel*)xMalloc(Pel, 33 * MAX_CU_SIZE * MAX_CU_SIZE);
 
         refAbove = (Pel*)xMalloc(Pel, 3 * MAX_CU_SIZE);
         refAboveFlt = (Pel*)xMalloc(Pel, 3 * MAX_CU_SIZE);
diff --git a/source/Lib/TLibCommon/TComPrediction.h b/source/Lib/TLibCommon/TComPrediction.h
index 33d3882..8e18517 100644
--- a/source/Lib/TLibCommon/TComPrediction.h
+++ b/source/Lib/TLibCommon/TComPrediction.h
@@ -60,6 +60,7 @@ class TComPrediction : public TComWeightPrediction
 protected:
 
     Pel*      m_piPredBuf;
+    Pel*      m_piPredAngBufs;
     Int       m_iPredBufStride;
     Int       m_iPredBufHeight;
 
diff --git a/source/Lib/TLibEncoder/TEncSearch.cpp b/source/Lib/TLibEncoder/TEncSearch.cpp
index 63e2d66..7e4c50e 100644
--- a/source/Lib/TLibEncoder/TEncSearch.cpp
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp
@@ -2279,16 +2279,67 @@ Void TEncSearch::estIntraPredQT(TComDataCU* pcCU,
             primitives.getIPredPlanar((pixel*)ptrSrc + ADI_BUF_STRIDE + 1, ADI_BUF_STRIDE, (pixel*)piPred, uiStride, uiWidth);
             uiSads[PLANAR_IDX] = sa8d((pixel*)piOrg, uiStride, (pixel*)piPred, uiStride);
 
-            // 33 Angle modes
-            for (UInt uiMode = 2; uiMode < numModesAvailable; uiMode++)
+            // Generate all 33 angle modes at once
+            if (uiWidth <= 4)
             {
-                predIntraLumaAng(pcCU->getPattern(), uiMode, piPred, uiStride, uiWidth);
+                ALIGN_VAR_32(Pel, buf1[MAX_CU_SIZE * MAX_CU_SIZE]);
+                ALIGN_VAR_32(Pel, tmp[33 * MAX_CU_SIZE * MAX_CU_SIZE]);
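+                // NOTE: tmp is a large stack buffer; m_piPredAngBufs, which
+                // initTempBuff() allocates at exactly this size, could be
+                // used instead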
 
-                // use hadamard transform here
-                UInt uiSad = sa8d((pixel*)piOrg, uiStride, (pixel*)piPred, uiStride);
-                uiSads[uiMode] = uiSad;
+                // Transpose the NxN original block; horizontal modes will be
+                // costed in transposed orientation (see the note after this hunk)
+                // TODO: optimize with SSE2
+                for (int k = 0; k < uiWidth; k++)
+                {
+                    for (int l = 0; l < uiWidth; l++)
+                    {
+                        buf1[k * uiWidth + l] = piOrg[l * uiStride + k];
+                    }
+                }
+
+                Pel *pAbove0 = refAbove    + uiWidth - 1;
+                Pel *pAbove1 = refAboveFlt + uiWidth - 1;
+                Pel *pLeft0  = refLeft     + uiWidth - 1;
+                Pel *pLeft1  = refLeftFlt  + uiWidth - 1;
+
+                x265::primitives.getIPredAngs4(tmp, pAbove0, pLeft0, pAbove1, pLeft1, (uiWidth < 16));
+
+                // TODO: a satd_x4 primitive is needed here to cost four modes per call
+                for (UInt uiMode = 2; uiMode < 18; uiMode++)
+                {
+                    // modes 2..17 are stored transposed in tmp, element for
+                    // element the transpose of predIntraLumaAng()'s output
+
+                    // use hadamard transform here
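+                    // the Hadamard-based cost is unchanged when both operands
+                    // are transposed (transposing the residual only transposes
+                    // the transformed block), so this equals the reference cost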
+                    UInt uiSad = sa8d((pixel*)buf1, uiWidth, (pixel*)&tmp[(uiMode - 2) * (uiWidth * uiWidth)], uiWidth);
+                    uiSads[uiMode] = uiSad;
+                }
+                for (UInt uiMode = 18; uiMode < numModesAvailable; uiMode++)
+                {
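+                    // modes 18..34 (vertical) are stored in raster order in
+                    // tmp, so compare directly against the original block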
+                    // use hadamard transform here
+                    UInt uiSad = sa8d((pixel*)piOrg, uiStride, (pixel*)&tmp[(uiMode - 2) * (uiWidth * uiWidth)], uiWidth);
+                    uiSads[uiMode] = uiSad;
+                }
+                x265_emms();
+            }
+            else
+            {
+                for (UInt uiMode = 2; uiMode < numModesAvailable; uiMode++)
+                {
+                    predIntraLumaAng(pcCU->getPattern(), uiMode, piPred, uiStride, uiWidth);
+
+                    // use hadamard transform here
+                    UInt uiSad = sa8d((pixel*)piOrg, uiStride, (pixel*)piPred, uiStride);
+                    uiSads[uiMode] = uiSad;
+                }
+                x265_emms();
             }
-            x265_emms();
 
             for (UInt uiMode = 0; uiMode < numModesAvailable; uiMode++)
             {
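
Note on the transposed comparison above: for horizontal modes (2..17),
getIPredAngs4 stores each prediction unflipped, i.e. as the transpose of
what predIntraLumaAng() returns.  The Hadamard cost satisfies
sa8d(O, P) == sa8d(transpose(O), transpose(P)), because transposing the
residual R only transposes H*R*H' and leaves the sum of absolute values
unchanged.  Costing the transposed original buf1 against the stored
prediction therefore gives exactly the reference result while saving one
4x4 flip per horizontal mode.
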
diff --git a/source/common/IntraPred.cpp b/source/common/IntraPred.cpp
index 75c7812..dc1cd0d 100644
--- a/source/common/IntraPred.cpp
+++ b/source/common/IntraPred.cpp
@@ -1,262 +1,307 @@
-/*****************************************************************************
- * Copyright (C) 2013 x265 project
- *
- * Authors: Min Chen <chenm003 at 163.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing at multicorewareinc.com.
- *****************************************************************************/
-
-#include "primitives.h"
-#include "Lib/TLibCommon/TComPrediction.h"
-#include <cstring>
-#include <assert.h>
-
-//#define MAX_CU_SIZE 64
-extern char g_aucConvertToBit[];
-
-namespace {
-pixel CDECL predIntraGetPredValDC(pixel* pSrc, intptr_t iSrcStride, intptr_t iWidth)
-{
-    int iInd, iSum = 0;
-    pixel pDcVal;
-
-    for (iInd = 0; iInd < iWidth; iInd++)
-    {
-        iSum += pSrc[iInd - iSrcStride];
-    }
-    for (iInd = 0; iInd < iWidth; iInd++)
-    {
-        iSum += pSrc[iInd * iSrcStride - 1];
-    }
-
-    pDcVal = (pixel)((iSum + iWidth) / (iWidth + iWidth));
-
-    return pDcVal;
-}
-
-void xDCPredFiltering(pixel* pSrc, intptr_t iSrcStride, pixel* pDst, intptr_t iDstStride, int iWidth, int iHeight)
-{
-    intptr_t x, y, iDstStride2, iSrcStride2;
-
-    // boundary pixels processing
-    pDst[0] = (pixel)((pSrc[-iSrcStride] + pSrc[-1] + 2 * pDst[0] + 2) >> 2);
-
-    for (x = 1; x < iWidth; x++)
-    {
-        pDst[x] = (pixel)((pSrc[x - iSrcStride] +  3 * pDst[x] + 2) >> 2);
-    }
-
-    for (y = 1, iDstStride2 = iDstStride, iSrcStride2 = iSrcStride - 1; y < iHeight; y++, iDstStride2 += iDstStride, iSrcStride2 += iSrcStride)
-    {
-        pDst[iDstStride2] = (pixel)((pSrc[iSrcStride2] + 3 * pDst[iDstStride2] + 2) >> 2);
-    }
-}
-
-void xPredIntraDC(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride, int width, int bFilter)
-{
-    int k, l;
-    int blkSize = width;
-
-    // Do the DC prediction
-    pixel dcval = (pixel)predIntraGetPredValDC(pSrc, srcStride, width);
-
-    for (k = 0; k < blkSize; k++)
-    {
-        for (l = 0; l < blkSize; l++)
-        {
-            pDst[k * dstStride + l] = dcval;
-        }
-    }
-
-    if (bFilter)
-    {
-        xDCPredFiltering(pSrc, srcStride, pDst, dstStride, width, width);
-    }
-}
-
-void xPredIntraPlanar(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride, int width)
-{
-    //assert(width == height);
-
-    int k, l;
-    pixel bottomLeft, topRight;
-    int horPred;
-    // OPT_ME: when width is 64, the shift1D is 8, then the dynamic range is 17 bits or [-65280, 65280], so we have to use 32 bits here
-    int32_t leftColumn[MAX_CU_SIZE + 1], topRow[MAX_CU_SIZE + 1];
-    // CHECK_ME: dynamic range is 9 bits or 15 bits(I assume max input bit_depth is 14 bits)
-    int16_t bottomRow[MAX_CU_SIZE], rightColumn[MAX_CU_SIZE];
-    int blkSize = width;
-    int offset2D = width;
-    int shift1D = g_aucConvertToBit[width] + 2;
-    int shift2D = shift1D + 1;
-
-    // Get left and above reference column and row
-    for (k = 0; k < blkSize + 1; k++)
-    {
-        topRow[k] = pSrc[k - srcStride];
-        leftColumn[k] = pSrc[k * srcStride - 1];
-    }
-
-    // Prepare intermediate variables used in interpolation
-    bottomLeft = (pixel)leftColumn[blkSize];
-    topRight   = (pixel)topRow[blkSize];
-    for (k = 0; k < blkSize; k++)
-    {
-        bottomRow[k]   = (int16_t)(bottomLeft - topRow[k]);
-        rightColumn[k] = (int16_t)(topRight   - leftColumn[k]);
-        topRow[k]      <<= shift1D;
-        leftColumn[k]  <<= shift1D;
-    }
-
-    // Generate prediction signal
-    for (k = 0; k < blkSize; k++)
-    {
-        horPred = leftColumn[k] + offset2D;
-        for (l = 0; l < blkSize; l++)
-        {
-            horPred += rightColumn[k];
-            topRow[l] += bottomRow[l];
-            pDst[k * dstStride + l] = (pixel)((horPred + topRow[l]) >> shift2D);
-        }
-    }
-}
-
-void xPredIntraAngBufRef(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, bool bFilter, pixel *refLeft, pixel *refAbove)
-{
-    int k, l;
-    int blkSize  = width;
-
-    // Map the mode index to main prediction direction and angle
-    assert(dirMode > 1); //no planar and dc
-    bool modeHor       = (dirMode < 18);
-    bool modeVer       = !modeHor;
-    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
-    int absAng         = abs(intraPredAngle);
-    int signAng        = intraPredAngle < 0 ? -1 : 1;
-
-    // Set bitshifts and scale the angle parameter to block size
-    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
-    int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
-    int invAngle       = invAngTable[absAng];
-    absAng             = angTable[absAng];
-    intraPredAngle     = signAng * absAng;
-
-    // Do angular predictions
-    {
-        pixel* refMain;
-        pixel* refSide;
-
-        // Initialise the Main and Left reference array.
-        if (intraPredAngle < 0)
-        {
-            refMain = (modeVer ? refAbove : refLeft); // + (blkSize - 1);
-            refSide = (modeVer ? refLeft : refAbove); // + (blkSize - 1);
-
-            // Extend the Main reference to the left.
-            int invAngleSum    = 128; // rounding for (shift by 8)
-            for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
-            {
-                invAngleSum += invAngle;
-                refMain[k] = refSide[invAngleSum >> 8];
-            }
-        }
-        else
-        {
-            refMain = modeVer ? refAbove : refLeft;
-            refSide = modeVer ? refLeft  : refAbove;
-        }
-
-        if (intraPredAngle == 0)
-        {
-            for (k = 0; k < blkSize; k++)
-            {
-                for (l = 0; l < blkSize; l++)
-                {
-                    pDst[k * dstStride + l] = refMain[l + 1];
-                }
-            }
-
-            if (bFilter)
-            {
-                for (k = 0; k < blkSize; k++)
-                {
-                    pDst[k * dstStride] = (pixel)Clip3((short)0, (short)((1 << bitDepth) - 1), static_cast<short>((pDst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1)));
-                }
-            }
-        }
-        else
-        {
-            int deltaPos = 0;
-            int deltaInt;
-            int deltaFract;
-            int refMainIndex;
-
-            for (k = 0; k < blkSize; k++)
-            {
-                deltaPos += intraPredAngle;
-                deltaInt   = deltaPos >> 5;
-                deltaFract = deltaPos & (32 - 1);
-
-                if (deltaFract)
-                {
-                    // Do linear filtering
-                    for (l = 0; l < blkSize; l++)
-                    {
-                        refMainIndex        = l + deltaInt + 1;
-                        pDst[k * dstStride + l] = (pixel)(((32 - deltaFract) * refMain[refMainIndex] + deltaFract * refMain[refMainIndex + 1] + 16) >> 5);
-                    }
-                }
-                else
-                {
-                    // Just copy the integer samples
-                    for (l = 0; l < blkSize; l++)
-                    {
-                        pDst[k * dstStride + l] = refMain[l + deltaInt + 1];
-                    }
-                }
-            }
-        }
-
-        // Flip the block if this is the horizontal mode
-        if (modeHor)
-        {
-            pixel  tmp;
-            for (k = 0; k < blkSize - 1; k++)
-            {
-                for (l = k + 1; l < blkSize; l++)
-                {
-                    tmp                 = pDst[k * dstStride + l];
-                    pDst[k * dstStride + l] = pDst[l * dstStride + k];
-                    pDst[l * dstStride + k] = tmp;
-                }
-            }
-        }
-    }
-}
-}
-
-namespace x265 {
-// x265 private namespace
-
-void Setup_C_IPredPrimitives(EncoderPrimitives& p)
-{
-    p.getIPredDC = xPredIntraDC;
-    p.getIPredPlanar = xPredIntraPlanar;
-    p.getIPredAng = xPredIntraAngBufRef;
-}
-}
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Min Chen <chenm003 at 163.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at multicorewareinc.com.
+ *****************************************************************************/
+
+#include "primitives.h"
+#include "Lib/TLibCommon/TComPrediction.h"
+#include <cstring>
+#include <assert.h>
+
+//#define MAX_CU_SIZE 64
+extern char g_aucConvertToBit[];
+
+namespace {
+pixel CDECL predIntraGetPredValDC(pixel* pSrc, intptr_t iSrcStride, intptr_t iWidth)
+{
+    int iInd, iSum = 0;
+    pixel pDcVal;
+
+    for (iInd = 0; iInd < iWidth; iInd++)
+    {
+        iSum += pSrc[iInd - iSrcStride];
+    }
+    for (iInd = 0; iInd < iWidth; iInd++)
+    {
+        iSum += pSrc[iInd * iSrcStride - 1];
+    }
+
+    pDcVal = (pixel)((iSum + iWidth) / (iWidth + iWidth));
+
+    return pDcVal;
+}
+
+void xDCPredFiltering(pixel* pSrc, intptr_t iSrcStride, pixel* pDst, intptr_t iDstStride, int iWidth, int iHeight)
+{
+    intptr_t x, y, iDstStride2, iSrcStride2;
+
+    // boundary pixels processing
+    pDst[0] = (pixel)((pSrc[-iSrcStride] + pSrc[-1] + 2 * pDst[0] + 2) >> 2);
+
+    for (x = 1; x < iWidth; x++)
+    {
+        pDst[x] = (pixel)((pSrc[x - iSrcStride] +  3 * pDst[x] + 2) >> 2);
+    }
+
+    for (y = 1, iDstStride2 = iDstStride, iSrcStride2 = iSrcStride - 1; y < iHeight; y++, iDstStride2 += iDstStride, iSrcStride2 += iSrcStride)
+    {
+        pDst[iDstStride2] = (pixel)((pSrc[iSrcStride2] + 3 * pDst[iDstStride2] + 2) >> 2);
+    }
+}
+
+void xPredIntraDC(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride, int width, int bFilter)
+{
+    int k, l;
+    int blkSize = width;
+
+    // Do the DC prediction
+    pixel dcval = (pixel)predIntraGetPredValDC(pSrc, srcStride, width);
+
+    for (k = 0; k < blkSize; k++)
+    {
+        for (l = 0; l < blkSize; l++)
+        {
+            pDst[k * dstStride + l] = dcval;
+        }
+    }
+
+    if (bFilter)
+    {
+        xDCPredFiltering(pSrc, srcStride, pDst, dstStride, width, width);
+    }
+}
+
+void xPredIntraPlanar(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride, int width)
+{
+    //assert(width == height);
+
+    int k, l;
+    pixel bottomLeft, topRight;
+    int horPred;
+    // OPT_ME: when width is 64, the shift1D is 8, then the dynamic range is 17 bits or [-65280, 65280], so we have to use 32 bits here
+    int32_t leftColumn[MAX_CU_SIZE + 1], topRow[MAX_CU_SIZE + 1];
+    // CHECK_ME: dynamic range is 9 bits or 15 bits (I assume the max input bit depth is 14 bits)
+    int16_t bottomRow[MAX_CU_SIZE], rightColumn[MAX_CU_SIZE];
+    int blkSize = width;
+    int offset2D = width;
+    int shift1D = g_aucConvertToBit[width] + 2;
+    int shift2D = shift1D + 1;
+
+    // Get left and above reference column and row
+    for (k = 0; k < blkSize + 1; k++)
+    {
+        topRow[k] = pSrc[k - srcStride];
+        leftColumn[k] = pSrc[k * srcStride - 1];
+    }
+
+    // Prepare intermediate variables used in interpolation
+    bottomLeft = (pixel)leftColumn[blkSize];
+    topRight   = (pixel)topRow[blkSize];
+    for (k = 0; k < blkSize; k++)
+    {
+        bottomRow[k]   = (int16_t)(bottomLeft - topRow[k]);
+        rightColumn[k] = (int16_t)(topRight   - leftColumn[k]);
+        topRow[k]      <<= shift1D;
+        leftColumn[k]  <<= shift1D;
+    }
+
+    // Generate prediction signal
+    for (k = 0; k < blkSize; k++)
+    {
+        horPred = leftColumn[k] + offset2D;
+        for (l = 0; l < blkSize; l++)
+        {
+            horPred += rightColumn[k];
+            topRow[l] += bottomRow[l];
+            pDst[k * dstStride + l] = (pixel)((horPred + topRow[l]) >> shift2D);
+        }
+    }
+}
+
+void xPredIntraAngBufRef(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, bool bFilter, pixel *refLeft, pixel *refAbove)
+{
+    int k, l;
+    int blkSize  = width;
+
+    // Map the mode index to main prediction direction and angle
+    assert(dirMode > 1); // neither planar nor DC
+    bool modeHor       = (dirMode < 18);
+    bool modeVer       = !modeHor;
+    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+    int absAng         = abs(intraPredAngle);
+    int signAng        = intraPredAngle < 0 ? -1 : 1;
+
+    // Set bitshifts and scale the angle parameter to block size
+    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
+    int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
+    int invAngle       = invAngTable[absAng];
+    absAng             = angTable[absAng];
+    intraPredAngle     = signAng * absAng;
+
+    // Do angular predictions
+    {
+        pixel* refMain;
+        pixel* refSide;
+
+        // Initialise the Main and Left reference array.
+        if (intraPredAngle < 0)
+        {
+            refMain = (modeVer ? refAbove : refLeft); // + (blkSize - 1);
+            refSide = (modeVer ? refLeft : refAbove); // + (blkSize - 1);
+
+            // Extend the Main reference to the left.
+            int invAngleSum    = 128; // rounding for (shift by 8)
+            for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
+            {
+                invAngleSum += invAngle;
+                refMain[k] = refSide[invAngleSum >> 8];
+            }
+        }
+        else
+        {
+            refMain = modeVer ? refAbove : refLeft;
+            refSide = modeVer ? refLeft  : refAbove;
+        }
+
+        if (intraPredAngle == 0)
+        {
+            for (k = 0; k < blkSize; k++)
+            {
+                for (l = 0; l < blkSize; l++)
+                {
+                    pDst[k * dstStride + l] = refMain[l + 1];
+                }
+            }
+
+            if (bFilter)
+            {
+                for (k = 0; k < blkSize; k++)
+                {
+                    pDst[k * dstStride] = (pixel)Clip3((short)0, (short)((1 << bitDepth) - 1), static_cast<short>((pDst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1)));
+                }
+            }
+        }
+        else
+        {
+            int deltaPos = 0;
+            int deltaInt;
+            int deltaFract;
+            int refMainIndex;
+
+            for (k = 0; k < blkSize; k++)
+            {
+                deltaPos += intraPredAngle;
+                deltaInt   = deltaPos >> 5;
+                deltaFract = deltaPos & (32 - 1);
+
+                if (deltaFract)
+                {
+                    // Do linear filtering
+                    for (l = 0; l < blkSize; l++)
+                    {
+                        refMainIndex        = l + deltaInt + 1;
+                        pDst[k * dstStride + l] = (pixel)(((32 - deltaFract) * refMain[refMainIndex] + deltaFract * refMain[refMainIndex + 1] + 16) >> 5);
+                    }
+                }
+                else
+                {
+                    // Just copy the integer samples
+                    for (l = 0; l < blkSize; l++)
+                    {
+                        pDst[k * dstStride + l] = refMain[l + deltaInt + 1];
+                    }
+                }
+            }
+        }
+
+        // Flip the block if this is the horizontal mode
+        if (modeHor)
+        {
+            pixel  tmp;
+            for (k = 0; k < blkSize - 1; k++)
+            {
+                for (l = k + 1; l < blkSize; l++)
+                {
+                    tmp                 = pDst[k * dstStride + l];
+                    pDst[k * dstStride + l] = pDst[l * dstStride + k];
+                    pDst[l * dstStride + k] = tmp;
+                }
+            }
+        }
+    }
+}
+
+unsigned char g_aucIntraFilterType[][35] = {
+    //  Index:    0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
+    /*  8x8  */ { 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+    /* 16x16 */ { 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },
+    /* 32x32 */ { 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1 },
+};
+
+void xPredIntraAngs4(pixel *pDst0, pixel *pAbove0, pixel *pLeft0, pixel *pAbove1, pixel *pLeft1, bool bLuma)
+{
+    int iMode;
+
+    // the filtered references are unused here (4x4 blocks never use
+    // reference smoothing); cast to void to suppress unused warnings
+    (void)pLeft1;
+    (void)pAbove1;
+
+    for (iMode = 2; iMode <= 34; iMode++)
+    {
+        pixel *pLeft = pLeft0;
+        pixel *pAbove = pAbove0;
+        pixel *pDst = pDst0 + (iMode - 2) * (4 * 4);
+
+        xPredIntraAngBufRef(8, pDst, 4, 4, iMode, bLuma, pLeft, pAbove);
+
+        // Optimization: store horizontal modes unflipped (i.e. transposed) so
+        // the caller can cost them against a transposed original block
+        bool modeHor = (iMode < 18);
+        // xPredIntraAngBufRef() has already flipped horizontal modes; undo that
+        if (modeHor)
+        {
+            pixel  tmp;
+            const int width = 4;
+            for (int k = 0; k < width - 1; k++)
+            {
+                for (int l = k + 1; l < width; l++)
+                {
+                    tmp                 = pDst[k * width + l];
+                    pDst[k * width + l] = pDst[l * width + k];
+                    pDst[l * width + k] = tmp;
+                }
+            }
+        }
+    }
+}
+
+}
+
+namespace x265 {
+// x265 private namespace
+
+void Setup_C_IPredPrimitives(EncoderPrimitives& p)
+{
+    p.getIPredDC = xPredIntraDC;
+    p.getIPredPlanar = xPredIntraPlanar;
+    p.getIPredAng = xPredIntraAngBufRef;
+    p.getIPredAngs4 = xPredIntraAngs4;
+}
+}
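
The g_aucIntraFilterType table added above is not referenced by the 4x4
path; presumably it will select between the filtered and unfiltered
reference arrays once the batch primitive is extended to larger blocks.
A hypothetical sketch of that selection (sizeIdx is an assumed variable;
the row mapping 0=8x8, 1=16x16, 2=32x32 follows the table's comments):

    bool bUseFiltered = bLuma && g_aucIntraFilterType[sizeIdx][iMode];
    pixel *pLeft  = bUseFiltered ? pLeft1  : pLeft0;
    pixel *pAbove = bUseFiltered ? pAbove1 : pAbove0;
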
diff --git a/source/common/primitives.h b/source/common/primitives.h
index f43f8a2..451927f 100644
--- a/source/common/primitives.h
+++ b/source/common/primitives.h
@@ -195,6 +195,7 @@ typedef void (CDECL * blockcpy_s_c)(int bx, int by, short *dst, intptr_t dstride
 typedef void (CDECL * getIPredDC_t)(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride, int width, int bFilter);
 typedef void (CDECL * getIPredPlanar_t)(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride, int width);
 typedef void (CDECL * getIPredAng_p)(int bitDepth, pixel* rpDst, int dstStride, int width, int dirMode, bool bFilter, pixel *refLeft, pixel *refAbove);
+typedef void (CDECL * getIPredAngs_t)(pixel *pDst, pixel *pAbove0, pixel *pLeft0, pixel *pAbove1, pixel *pLeft1, bool bLuma);
 typedef void (CDECL * quant)(int bitDepth, const int* pSrc, int* pDes, int iWidth, int iHeight, int mcqp_miper, int mcqp_mirem, bool useScalingList, unsigned int uiLog2TrSize, int *piDequantCoef);
 typedef void (CDECL * cvt16to32_t)(short *psOrg, int *piDst, int);
 typedef void (CDECL * cvt16to32_shl_t)(int *piDst, short *psOrg, intptr_t, int, int);
@@ -238,6 +239,7 @@ struct EncoderPrimitives
     getIPredDC_t getIPredDC;
     getIPredPlanar_t getIPredPlanar;
     getIPredAng_p getIPredAng;
+    getIPredAngs_t getIPredAngs4;
     quant deQuant;
     dct_t dct[NUM_DCTS];
     idct_t idct[NUM_IDCTS];
diff --git a/source/common/vec/intrapred.inc b/source/common/vec/intrapred.inc
index 43e3f1d..3a49935 100644
--- a/source/common/vec/intrapred.inc
+++ b/source/common/vec/intrapred.inc
@@ -1,4678 +1,4731 @@
-/*****************************************************************************
- * Copyright (C) 2013 x265 project
- *
- * Authors: Min Chen <chenm003 at 163.com>
- *          Deepthi Devaki <deepthidevaki at multicorewareinc.com>
- *          Steve Borho <steve at borho.org>
- *          ShinYee Chung <shinyee at multicorewareinc.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing at multicorewareinc.com.
- *****************************************************************************/
-
-#include "primitives.h"
-#include "TLibCommon/TComRom.h"
-#include <assert.h>
-#include <smmintrin.h>
-
-extern char g_aucConvertToBit[];
-
-using namespace x265;
-
-namespace {
-const int angAP[17][64] =
-{
-    {
-        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64
-    },
-    {
-        0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52
-    },
-    {
-        0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 19, 20, 21, 21, 22, 22, 23, 24, 24, 25, 26, 26, 27, 28, 28, 29, 30, 30, 31, 32, 32, 33, 34, 34, 35, 36, 36, 37, 38, 38, 39, 40, 40, 41, 42
-    },
-    {
-        0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, 32, 33, 34
-    },
-    {
-        0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 13, 14, 14, 15, 15, 15, 16, 16, 17, 17, 17, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 25, 25, 26
-    },
-    {
-        0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 18
-    },
-    {
-        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10
-    },
-    {
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4
-    },
-    { // 0th virtual index; never used; just to help indexing
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4
-    },
-    {
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4
-    },
-    {
-        -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -8, -8, -9, -9, -9, -9, -9, -9, -10, -10, -10, -10, -10, -10, -10
-    },
-    {
-        -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -8, -8, -8, -8, -9, -9, -9, -9, -10, -10, -10, -11, -11, -11, -11, -12, -12, -12, -13, -13, -13, -13, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -17, -17, -17, -17, -18, -18, -18, -18
-    },
-    {
-        -1, -1, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -13, -14, -14, -15, -15, -16, -16, -16, -17, -17, -18, -18, -18, -19, -19, -20, -20, -20, -21, -21, -22, -22, -22, -23, -23, -24, -24, -24, -25, -25, -26, -26, -26
-    },
-    {
-        -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -10, -10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, -17, -17, -18, -19, -19, -20, -20, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -26, -27, -27, -28, -28, -29, -29, -30, -30, -31, -31, -32, -32, -33, -33, -34, -34
-    },
-    {
-        -1, -2, -2, -3, -4, -4, -5, -6, -6, -7, -8, -8, -9, -10, -10, -11, -12, -12, -13, -14, -14, -15, -16, -16, -17, -18, -18, -19, -20, -20, -21, -21, -22, -23, -23, -24, -25, -25, -26, -27, -27, -28, -29, -29, -30, -31, -31, -32, -33, -33, -34, -35, -35, -36, -37, -37, -38, -39, -39, -40, -41, -41, -42, -42
-    },
-    {
-        -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, -27, -28, -29, -30, -31, -31, -32, -33, -34, -35, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -44, -45, -46, -47, -48, -48, -49, -50, -51, -52, -52
-    },
-    {
-        -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -59, -60, -61, -62, -63, -64
-    }
-};
-
-#define GETAP(X, Y) angAP[8 - (X)][(Y)]
-
-__m128i v_multiL, v_multiH, v_multiH2, v_multiH3, v_multiH4, v_multiH5, v_multiH6, v_multiH7;
-__m128i v_multi_2Row;
-
-/* When compiled with /arch:AVX, this code is not safe to run on non-AVX CPUs and
- * thus we cannot use static initialization.  This routine is only called if the
- * detected CPU can support this SIMD architecture. */
-static void initFileStaticVars()
-{
-    v_multiL = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
-    v_multiH = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
-    v_multiH2 = _mm_setr_epi16(17, 18, 19, 20, 21, 22, 23, 24);
-    v_multiH3 = _mm_setr_epi16(25, 26, 27, 28, 29, 30, 31, 32);
-    v_multiH4 = _mm_setr_epi16(33, 34, 35, 36, 37, 38, 39, 40);
-    v_multiH5 = _mm_setr_epi16(41, 42, 43, 44, 45, 46, 47, 48);
-    v_multiH6 = _mm_setr_epi16(49, 50, 51, 52, 53, 54, 55, 56);
-    v_multiH7 = _mm_setr_epi16(57, 58, 59, 60, 61, 62, 63, 64);
-    v_multi_2Row = _mm_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4);
-}
-
-static inline
-void xDCPredFiltering(pixel* pSrc, intptr_t iSrcStride, pixel* rpDst, intptr_t iDstStride, int iWidth, int /*iHeight*/)
-{
-    pixel* pDst = rpDst;
-    int y;
-    pixel pixDC = *pDst;
-    int pixDCx3 = pixDC * 3 + 2;
-
-    // boundary pixels processing
-    pDst[0] = (pixel)((pSrc[-iSrcStride] + pSrc[-1] + 2 * pixDC + 2) >> 2);
-
-    Vec8us im1(pixDCx3);
-    Vec8us im2, im3;
-#if HIGH_BIT_DEPTH
-    switch (iWidth)
-    {
-    case 4:
-        im2 = load_partial(const_int(8), &pSrc[1 - iSrcStride]);
-        im2 = (im1 + im2) >> const_int(2);
-        store_partial(const_int(8), &pDst[1], im2);
-        break;
-
-    case 8:
-        im2.load(&pSrc[1 - iSrcStride]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&pDst[1]);
-        break;
-
-    case 16:
-        im2.load(&pSrc[1 - iSrcStride]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&pDst[1]);
-
-        im2.load(&pSrc[1 - iSrcStride + 8]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&pDst[1 + 8]);
-        break;
-
-    case 32:
-        im2.load(&pSrc[1 - iSrcStride]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&pDst[1]);
-
-        im2.load(&pSrc[1 - iSrcStride + 8]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&pDst[1 + 8]);
-
-        im2.load(&pSrc[1 - iSrcStride + 16]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&pDst[1 + 16]);
-
-        im2.load(&pSrc[1 - iSrcStride + 24]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&pDst[1 + 24]);
-        break;
-
-    //case 64:
-    default:
-        im2.load(&pSrc[1 - iSrcStride]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&pDst[1]);
-
-        im2.load(&pSrc[1 - iSrcStride + 8]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&pDst[1 + 8]);
-
-        im2.load(&pSrc[1 - iSrcStride + 16]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&pDst[1 + 16]);
-
-        im2.load(&pSrc[1 - iSrcStride + 24]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&pDst[1 + 24]);
-
-        im2.load(&pSrc[1 - iSrcStride + 32]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&pDst[1 + 32]);
-
-        im2.load(&pSrc[1 - iSrcStride + 40]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&pDst[1 + 40]);
-
-        im2.load(&pSrc[1 - iSrcStride + 48]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&pDst[1 + 48]);
-
-        im2.load(&pSrc[1 - iSrcStride + 56]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&pDst[1 + 56]);
-        break;
-    }
-
-#else /* if HIGH_BIT_DEPTH */
-    Vec16uc pix;
-    switch (iWidth)
-    {
-    case 4:
-        pix = load_partial(const_int(4), &pSrc[1 - iSrcStride]);
-        im2 = extend_low(pix);
-        im2 = (im1 + im2) >> const_int(2);
-        pix = compress(im2, im2);
-        store_partial(const_int(4), &pDst[1], pix);
-        break;
-
-    case 8:
-        pix = load_partial(const_int(8), &pSrc[1 - iSrcStride]);
-        im2 = extend_low(pix);
-        im2 = (im1 + im2) >> const_int(2);
-        pix = compress(im2, im2);
-        store_partial(const_int(8), &pDst[1], pix);
-        break;
-
-    case 16:
-        pix.load(&pSrc[1 - iSrcStride]);
-        im2 = extend_low(pix);
-        im3 = extend_high(pix);
-        im2 = (im1 + im2) >> const_int(2);
-        im3 = (im1 + im3) >> const_int(2);
-        pix = compress(im2, im3);
-        pix.store(&pDst[1]);
-        break;
-
-    case 32:
-        pix.load(&pSrc[1 - iSrcStride]);
-        im2 = extend_low(pix);
-        im3 = extend_high(pix);
-        im2 = (im1 + im2) >> const_int(2);
-        im3 = (im1 + im3) >> const_int(2);
-        pix = compress(im2, im3);
-        pix.store(&pDst[1]);
-
-        pix.load(&pSrc[1 - iSrcStride + 16]);
-        im2 = extend_low(pix);
-        im3 = extend_high(pix);
-        im2 = (im1 + im2) >> const_int(2);
-        im3 = (im1 + im3) >> const_int(2);
-        pix = compress(im2, im3);
-        pix.store(&pDst[1 + 16]);
-        break;
-
-    //case 64:
-    default:
-        pix.load(&pSrc[1 - iSrcStride]);
-        im2 = extend_low(pix);
-        im3 = extend_high(pix);
-        im2 = (im1 + im2) >> const_int(2);
-        im3 = (im1 + im3) >> const_int(2);
-        pix = compress(im2, im3);
-        pix.store(&pDst[1]);
-
-        pix.load(&pSrc[1 - iSrcStride + 16]);
-        im2 = extend_low(pix);
-        im3 = extend_high(pix);
-        im2 = (im1 + im2) >> const_int(2);
-        im3 = (im1 + im3) >> const_int(2);
-        pix = compress(im2, im3);
-        pix.store(&pDst[1 + 16]);
-
-        pix.load(&pSrc[1 - iSrcStride + 32]);
-        im2 = extend_low(pix);
-        im3 = extend_high(pix);
-        im2 = (im1 + im2) >> const_int(2);
-        im3 = (im1 + im3) >> const_int(2);
-        pix = compress(im2, im3);
-        pix.store(&pDst[1 + 32]);
-
-        pix.load(&pSrc[1 - iSrcStride + 48]);
-        im2 = extend_low(pix);
-        im3 = extend_high(pix);
-        im2 = (im1 + im2) >> const_int(2);
-        im3 = (im1 + im3) >> const_int(2);
-        pix = compress(im2, im3);
-        pix.store(&pDst[1 + 48]);
-        break;
-    }
-
-#endif /* if HIGH_BIT_DEPTH */
-
-    for (y = 1; y < iWidth; y++)
-    {
-        pDst[iDstStride] = (pixel)((pSrc[iSrcStride - 1] + pixDCx3) >> 2);
-        pSrc += iSrcStride;
-        pDst += iDstStride;
-    }
-}
-
-void predIntraDC(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride, int width, int bFilter)
-{
-    //assert(iWidth == iHeight); // all of Intra is NxN
-    //assert(blkAboveAvailable || blkLeftAvailable); // I think left and above always true since HEVC have a pixel fill process
-    int iSum = 0;
-    int logSize = g_aucConvertToBit[width] + 2;
-    pixel *pSrcAbove = &pSrc[-srcStride];
-    pixel *pSrcLeft = &pSrc[-1];
-
-    for (int iInd = 0; iInd < width; iInd++)
-    {
-        iSum += *pSrcLeft;
-        pSrcLeft += srcStride;
-    }
-
-#if HIGH_BIT_DEPTH
-    Vec8s sumAbove(0);
-    Vec8s m0;
-
-    switch (width)
-    {
-    case 4:
-        sumAbove = load_partial(const_int(8), pSrcAbove);
-        break;
-    case 8:
-        m0.load(pSrcAbove);
-        sumAbove = m0;
-        break;
-    case 16:
-        m0.load(pSrcAbove);
-        sumAbove  = m0;
-        m0.load(pSrcAbove + 8);
-        sumAbove += m0;
-        break;
-    case 32:
-        m0.load(pSrcAbove);
-        sumAbove  = m0;
-        m0.load(pSrcAbove + 8);
-        sumAbove += m0;
-        m0.load(pSrcAbove + 16);
-        sumAbove += m0;
-        m0.load(pSrcAbove + 24);
-        sumAbove += m0;
-        break;
-        //case 64:
-    default:
-        // CHECK_ME: the max support bit_depth is 13-bits
-        m0.load(pSrcAbove);
-        sumAbove  = m0;
-        m0.load(pSrcAbove + 8);
-        sumAbove += m0;
-        m0.load(pSrcAbove + 16);
-        sumAbove += m0;
-        m0.load(pSrcAbove + 24);
-        sumAbove += m0;
-        m0.load(pSrcAbove + 32);
-        sumAbove += m0;
-        m0.load(pSrcAbove + 40);
-        sumAbove += m0;
-        m0.load(pSrcAbove + 48);
-        sumAbove += m0;
-        m0.load(pSrcAbove + 56);
-        sumAbove += m0;
-        break;
-    }
-
-    iSum += horizontal_add_x(sumAbove);
-
-    logSize += 1;
-    pixel dcVal = (iSum + (1 << (logSize - 1))) >> logSize;
-    Vec8us dcValN(dcVal);
-    int k;
-
-    pixel *pDst1 = pDst;
-    switch (width)
-    {
-    case 4:
-        store_partial(const_int(8), pDst1, dcValN);
-        pDst1 += dstStride;
-        store_partial(const_int(8), pDst1, dcValN);
-        pDst1 += dstStride;
-        store_partial(const_int(8), pDst1, dcValN);
-        pDst1 += dstStride;
-        store_partial(const_int(8), pDst1, dcValN);
-        pDst1 += dstStride;
-        break;
-
-    case 8:
-        dcValN.store(pDst1);
-        pDst1 += dstStride;
-        dcValN.store(pDst1);
-        pDst1 += dstStride;
-        dcValN.store(pDst1);
-        pDst1 += dstStride;
-        dcValN.store(pDst1);
-        pDst1 += dstStride;
-        dcValN.store(pDst1);
-        pDst1 += dstStride;
-        dcValN.store(pDst1);
-        pDst1 += dstStride;
-        dcValN.store(pDst1);
-        pDst1 += dstStride;
-        dcValN.store(pDst1);
-        pDst1 += dstStride;
-        break;
-
-    case 16:
-        for (k = 0; k < 16; k += 2)
-        {
-            dcValN.store(pDst1);
-            dcValN.store(pDst1 + 8);
-            pDst1 += dstStride;
-            dcValN.store(pDst1);
-            dcValN.store(pDst1 + 8);
-            pDst1 += dstStride;
-        }
-
-        break;
-
-    case 32:
-        for (k = 0; k < 32; k++)
-        {
-            dcValN.store(pDst1);
-            dcValN.store(pDst1 +  8);
-            dcValN.store(pDst1 + 16);
-            dcValN.store(pDst1 + 24);
-            pDst1 += dstStride;
-        }
-
-        break;
-
-    //case 64:
-    default:
-        for (k = 0; k < 64; k++)
-        {
-            dcValN.store(pDst1);
-            dcValN.store(pDst1 +  8);
-            dcValN.store(pDst1 + 16);
-            dcValN.store(pDst1 + 24);
-            dcValN.store(pDst1 + 32);
-            dcValN.store(pDst1 + 40);
-            dcValN.store(pDst1 + 48);
-            dcValN.store(pDst1 + 56);
-            pDst1 += dstStride;
-        }
-
-        break;
-    }
-
-    if (bFilter)
-    {
-        xDCPredFiltering(pSrc, srcStride, pDst, dstStride, width, width);
-    }
-#else // if !HIGH_BIT_DEPTH
-
-    {
-        Vec16uc pix;
-        Vec8us  im;
-        Vec4ui  im1, im2;
-
-        switch (width)
-        {
-        case 4:
-            pix.fromUint32(*(uint32_t*)pSrcAbove);
-            iSum += horizontal_add(extend_low(pix));
-            break;
-        case 8:
-#if X86_64
-            pix.fromUint64(*(uint64_t*)pSrcAbove);
-#else
-            pix.load_partial(8, pSrcAbove);
-#endif
-            iSum += horizontal_add(extend_low(pix));
-            break;
-        case 16:
-            pix.load(pSrcAbove);
-            iSum += horizontal_add_x(pix);
-            break;
-        case 32:
-            pix.load(pSrcAbove);
-            im1 = (Vec4ui)(pix.sad(_mm_setzero_si128()));
-            pix.load(pSrcAbove + 16);
-            im1 += (Vec4ui)(pix.sad(_mm_setzero_si128()));
-            im1 += (Vec4ui)((Vec128b)im1 >> const_int(64));
-            iSum += toInt32(im1);
-            break;
-        //case 64:
-        default:
-            pix.load(pSrcAbove);
-            im1 = (Vec4ui)(pix.sad(_mm_setzero_si128()));
-            pix.load(pSrcAbove + 16);
-            im1 += (Vec4ui)(pix.sad(_mm_setzero_si128()));
-            pix.load(pSrcAbove + 32);
-            im1 += (Vec4ui)(pix.sad(_mm_setzero_si128()));
-            pix.load(pSrcAbove + 48);
-            im1 += (Vec4ui)(pix.sad(_mm_setzero_si128()));
-            im1 += (Vec4ui)((Vec128b)im1 >> const_int(64));
-            //im1 += extract_hi64(im1);
-            iSum += toInt32(im1);
-            break;
-        }
-    }
-
-    logSize += 1;
-    pixel dcVal = (iSum + (1 << (logSize - 1))) >> logSize;
-    Vec16uc dcValN(dcVal);
-    int k;
-
-    pixel *pDst1 = pDst;
-    switch (width)
-    {
-    case 4:
-        store_partial(const_int(4), pDst1, dcValN);
-        pDst1 += dstStride;
-        store_partial(const_int(4), pDst1, dcValN);
-        pDst1 += dstStride;
-        store_partial(const_int(4), pDst1, dcValN);
-        pDst1 += dstStride;
-        store_partial(const_int(4), pDst1, dcValN);
-        break;
-
-    case 8:
-        store_partial(const_int(8), pDst1, dcValN);
-        pDst1 += dstStride;
-        store_partial(const_int(8), pDst1, dcValN);
-        pDst1 += dstStride;
-        store_partial(const_int(8), pDst1, dcValN);
-        pDst1 += dstStride;
-        store_partial(const_int(8), pDst1, dcValN);
-        pDst1 += dstStride;
-        store_partial(const_int(8), pDst1, dcValN);
-        pDst1 += dstStride;
-        store_partial(const_int(8), pDst1, dcValN);
-        pDst1 += dstStride;
-        store_partial(const_int(8), pDst1, dcValN);
-        pDst1 += dstStride;
-        store_partial(const_int(8), pDst1, dcValN);
-        break;
-
-    case 16:
-        for (k = 0; k < 16; k += 4)
-        {
-            store_partial(const_int(16), pDst1, dcValN);
-            pDst1 += dstStride;
-            store_partial(const_int(16), pDst1, dcValN);
-            pDst1 += dstStride;
-            store_partial(const_int(16), pDst1, dcValN);
-            pDst1 += dstStride;
-            store_partial(const_int(16), pDst1, dcValN);
-            pDst1 += dstStride;
-        }
-
-        break;
-
-    case 32:
-        for (k = 0; k < 32; k += 2)
-        {
-            store_partial(const_int(16), pDst1,    dcValN);
-            store_partial(const_int(16), pDst1 + 16, dcValN);
-            pDst1 += dstStride;
-            store_partial(const_int(16), pDst1,    dcValN);
-            store_partial(const_int(16), pDst1 + 16, dcValN);
-            pDst1 += dstStride;
-        }
-
-        break;
-
-    case 64:
-        for (k = 0; k < 64; k++)
-        {
-            store_partial(const_int(16), pDst1,    dcValN);
-            store_partial(const_int(16), pDst1 + 16, dcValN);
-            store_partial(const_int(16), pDst1 + 32, dcValN);
-            store_partial(const_int(16), pDst1 + 48, dcValN);
-            pDst1 += dstStride;
-        }
-
-        break;
-    }
-
-    if (bFilter)
-    {
-        xDCPredFiltering(pSrc, srcStride, pDst, dstStride, width, width);
-    }
-#endif // if HIGH_BIT_DEPTH
-}
-
-#if HIGH_BIT_DEPTH
-// CHECK_ME: I am not sure the v_rightColumnN will be overflow when input as 12bpp
-void predIntraPlanar4(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
-{
-    int k, bottomLeft, topRight;
-    // NOTE: I use 16-bits is enough here, because we have least than 13-bits as input, and shift left by 2, it is 15-bits
-    int16_t leftColumn[4];
-
-    // Get left and above reference column and row
-    Vec8s v_topRow = (Vec8s)load_partial(const_int(8), &pSrc[-srcStride]); // topRow
-
-    for (k = 0; k < 4; k++)
-    {
-        leftColumn[k] = pSrc[k * srcStride - 1];
-    }
-
-    Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), leftColumn);   // leftColumn
-
-    // Prepare intermediate variables used in interpolation
-    bottomLeft = pSrc[4 * srcStride - 1];
-    topRight   = pSrc[4 - srcStride];
-
-    Vec8s v_bottomLeft(bottomLeft);
-    Vec8s v_topRight(topRight);
-
-    Vec8s v_bottomRow = v_bottomLeft - v_topRow;
-    Vec8s v_rightColumn = v_topRight - v_leftColumn;
-
-    v_topRow = v_topRow << const_int(2);
-    v_leftColumn = v_leftColumn << const_int(2);
-
-    // Generate prediction signal
-    Vec8s v_horPred4 = v_leftColumn + Vec8s(4);
-    const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
-    Vec8s v_horPred, v_rightColumnN;
-    Vec8s v_im4;
-    Vec16uc v_im5;
-
-    // line0
-    v_horPred = broadcast(const_int(0), v_horPred4);
-    v_rightColumnN = broadcast(const_int(0), v_rightColumn) * v_multi;
-    v_horPred = v_horPred + v_rightColumnN;
-    v_topRow = v_topRow + v_bottomRow;
-    // CHECK_ME: the HM don't clip the pixel, so I assume there is biggest 12+3=15(bits)
-    v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3);
-    store_partial(const_int(8), &rpDst[0 * dstStride], v_im4);
-
-    // line1
-    v_horPred = broadcast(const_int(1), v_horPred4);
-    v_rightColumnN = broadcast(const_int(1), v_rightColumn) * v_multi;
-    v_horPred = v_horPred + v_rightColumnN;
-    v_topRow = v_topRow + v_bottomRow;
-    v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3);
-    store_partial(const_int(8), &rpDst[1 * dstStride], v_im4);
-
-    // line2
-    v_horPred = broadcast(const_int(2), v_horPred4);
-    v_rightColumnN = broadcast(const_int(2), v_rightColumn) * v_multi;
-    v_horPred = v_horPred + v_rightColumnN;
-    v_topRow = v_topRow + v_bottomRow;
-    v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3);
-    store_partial(const_int(8), &rpDst[2 * dstStride], v_im4);
-
-    // line3
-    v_horPred = broadcast(const_int(3), v_horPred4);
-    v_rightColumnN = broadcast(const_int(3), v_rightColumn) * v_multi;
-    v_horPred = v_horPred + v_rightColumnN;
-    v_topRow = v_topRow + v_bottomRow;
-    v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3);
-    store_partial(const_int(8), &rpDst[3 * dstStride], v_im4);
-}
-
-#else /* if HIGH_BIT_DEPTH */
-void predIntraPlanar4(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
-{
-    int k;
-    pixel bottomLeft, topRight;
-
-    // Get left and above reference column and row
-    Vec16uc im0 = (Vec16uc)load_partial(const_int(4), &pSrc[-srcStride]); // topRow
-    Vec8s v_topRow = extend_low(im0);
-
-    int16_t leftColumn[4];
-
-    for (k = 0; k < 4; k++)
-    {
-        leftColumn[k] = pSrc[k * srcStride - 1];
-    }
-
-    Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), (void*)leftColumn);   // leftColumn
-
-    // Prepare intermediate variables used in interpolation
-    bottomLeft = pSrc[4 * srcStride - 1];
-    topRight   = pSrc[4 - srcStride];
-
-    Vec8s v_bottomLeft(bottomLeft);
-    Vec8s v_topRight(topRight);
-
-    Vec8s v_bottomRow = v_bottomLeft - v_topRow;
-    Vec8s v_rightColumn = v_topRight - v_leftColumn;
-
-    v_topRow = v_topRow << const_int(2);
-    v_leftColumn = v_leftColumn << const_int(2);
-
-    Vec8s v_horPred4 = v_leftColumn + Vec8s(4);
-    const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
-    Vec8s v_horPred, v_rightColumnN;
-    Vec8s v_im4;
-    Vec16uc v_im5;
-
-#define COMP_PRED_PLANAR4_ROW(X) { \
-        v_horPred = broadcast(const_int((X)), v_horPred4); \
-        v_rightColumnN = broadcast(const_int((X)), v_rightColumn) * v_multi; \
-        v_horPred = v_horPred + v_rightColumnN; \
-        v_topRow = v_topRow + v_bottomRow; \
-        v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3); \
-        v_im5 = compress_unsafe(v_im4, v_im4); \
-        store_partial(const_int(4), &rpDst[(X)*dstStride], v_im5); \
-}
-
-    COMP_PRED_PLANAR4_ROW(0)
-    COMP_PRED_PLANAR4_ROW(1)
-    COMP_PRED_PLANAR4_ROW(2)
-    COMP_PRED_PLANAR4_ROW(3)
-
-#undef COMP_PRED_PLANAR4_ROW
-}
-
-#if INSTRSET >= 5
-void predIntraPlanar4_sse4(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
-{
-    pixel bottomLeft, topRight;
-
-    // Get left and above reference column and row
-    __m128i im0 = _mm_cvtsi32_si128(*(uint32_t*)&pSrc[-srcStride]); // topRow
-    __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
-
-    v_topRow = _mm_shuffle_epi32(v_topRow, 0x44);
-
-    // Prepare intermediate variables used in interpolation
-    bottomLeft = pSrc[4 * srcStride - 1];
-    topRight   = pSrc[4 - srcStride];
-
-    __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
-    __m128i v_bottomRow   = _mm_sub_epi16(v_bottomLeft, v_topRow);
-
-    v_topRow = _mm_slli_epi16(v_topRow, 2);
-
-    __m128i v_horPred, v_rightColumnN;
-    __m128i v_im4;
-    __m128i v_im5;
-    __m128i _tmp0, _tmp1;
-
-    __m128i v_bottomRowL = _mm_unpacklo_epi64(v_bottomRow, _mm_setzero_si128());
-    v_topRow = _mm_sub_epi16(v_topRow, v_bottomRowL);
-    v_bottomRow = _mm_slli_epi16(v_bottomRow, 1);
-
-#define COMP_PRED_PLANAR_2ROW(Y) { \
-        _tmp0 = _mm_cvtsi32_si128((pSrc[((Y)) * srcStride - 1] << 2) + 4); \
-        _tmp0 = _mm_shufflelo_epi16(_tmp0, 0); \
-        _tmp1 = _mm_cvtsi32_si128((pSrc[((Y)+1) * srcStride - 1] << 2) + 4); \
-        _tmp1 = _mm_shufflelo_epi16(_tmp1, 0); \
-        v_horPred = _mm_unpacklo_epi64(_tmp0, _tmp1); \
-        _tmp0 = _mm_cvtsi32_si128(topRight - pSrc[((Y)) * srcStride - 1]); \
-        _tmp0 = _mm_shufflelo_epi16(_tmp0, 0); \
-        _tmp1 = _mm_cvtsi32_si128(topRight - pSrc[((Y)+1) * srcStride - 1]); \
-        _tmp1 = _mm_shufflelo_epi16(_tmp1, 0); \
-        v_rightColumnN = _mm_unpacklo_epi64(_tmp0, _tmp1); \
-        v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multi_2Row); \
-        v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \
-        v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \
-        v_im4 = _mm_srai_epi16(_mm_add_epi16(v_horPred, v_topRow), 3); \
-        v_im5 = _mm_packus_epi16(v_im4, v_im4); \
-        *(uint32_t*)&rpDst[(Y)*dstStride] = _mm_cvtsi128_si32(v_im5); \
-        *(uint32_t*)&rpDst[((Y)+1) * dstStride] = _mm_cvtsi128_si32(_mm_shuffle_epi32(v_im5, 0x55));; \
-}
-
-    COMP_PRED_PLANAR_2ROW(0)
-    COMP_PRED_PLANAR_2ROW(2)
-
-#undef COMP_PRED_PLANAR4_ROW
-}
-
-#endif // INSTRSET >= 5
-
-#endif /* if HIGH_BIT_DEPTH */
-
-#if HIGH_BIT_DEPTH
-
-#define COMP_PRED_PLANAR_ROW(X) { \
-        v_horPred = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \
-        v_rightColumnN = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn) * v_multi; \
-        v_horPred = v_horPred + v_rightColumnN; \
-        v_topRow = v_topRow + v_bottomRow; \
-        v_im4 = (Vec8s)(v_horPred + v_topRow) >> (3 + shift); \
-        store_partial(const_int(16), &rpDst[X * dstStride], v_im4); \
-}
-
-void predIntraPlanar8(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
-{
-    int k, bottomLeft, topRight;
-
-    int16_t leftColumn[8];
-
-    // Get left and above reference column and row
-    Vec8s v_topRow = (Vec8s)load_partial(const_int(16), &pSrc[-srcStride]); // topRow
-
-    for (k = 0; k < 8; k++)
-    {
-        leftColumn[k] = pSrc[k * srcStride - 1];
-    }
-
-    Vec8s v_leftColumn = (Vec8s)load_partial(const_int(16), leftColumn);   // leftColumn
-
-    // Prepare intermediate variables used in interpolation
-    bottomLeft = pSrc[8 * srcStride - 1];
-    topRight   = pSrc[8 - srcStride];
-
-    Vec8s v_bottomLeft(bottomLeft);
-    Vec8s v_topRight(topRight);
-
-    Vec8s v_bottomRow = v_bottomLeft - v_topRow;
-    Vec8s v_rightColumn = v_topRight - v_leftColumn;
-
-    int shift = g_aucConvertToBit[8];          // Using value corresponding to width = 8
-    v_topRow = v_topRow << (2 + shift);
-    v_leftColumn = v_leftColumn << (2 + shift);
-
-    // Generate prediction signal
-    Vec8s v_horPred4 = v_leftColumn + Vec8s(8);
-    const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
-    Vec8s v_horPred, v_rightColumnN;
-    Vec8s v_im4;
-    Vec16uc v_im5;
-
-    COMP_PRED_PLANAR_ROW(0);     // row 0
-    COMP_PRED_PLANAR_ROW(1);
-    COMP_PRED_PLANAR_ROW(2);
-    COMP_PRED_PLANAR_ROW(3);
-    COMP_PRED_PLANAR_ROW(4);
-    COMP_PRED_PLANAR_ROW(5);
-    COMP_PRED_PLANAR_ROW(6);
-    COMP_PRED_PLANAR_ROW(7);     // row 7
-}
-
-#undef COMP_PRED_PLANAR_ROW
-#else /* if HIGH_BIT_DEPTH */
-
-#define COMP_PRED_PLANAR_ROW(X) { \
-        v_horPred = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \
-        v_rightColumnN = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn) * v_multi; \
-        v_horPred = v_horPred + v_rightColumnN; \
-        v_topRow = v_topRow + v_bottomRow; \
-        v_im4 = (Vec8s)(v_horPred + v_topRow) >> (3 + shift); \
-        v_im5 = compress(v_im4, v_im4); \
-        store_partial(const_int(8), &rpDst[X * dstStride], v_im5); \
-}
-
-void predIntraPlanar8(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
-{
-    int k;
-    pixel bottomLeft, topRight;
-    int16_t leftColumn[8];
-
-    // Get left and above reference column and row
-    Vec16uc im0 = (Vec16uc)load_partial(const_int(8), &pSrc[-srcStride]); // topRow
-    Vec8s v_topRow = extend_low(im0);
-
-    for (k = 0; k < 8; k++)
-    {
-        leftColumn[k] = pSrc[k * srcStride - 1];
-    }
-
-    Vec8s v_leftColumn;
-    v_leftColumn.load(leftColumn);   // leftColumn
-
-    // Prepare intermediate variables used in interpolation
-    bottomLeft = pSrc[8 * srcStride - 1];
-    topRight   = pSrc[8 - srcStride];
-
-    Vec8s v_bottomLeft(bottomLeft);
-    Vec8s v_topRight(topRight);
-
-    Vec8s v_bottomRow = v_bottomLeft - v_topRow;
-    Vec8s v_rightColumn = v_topRight - v_leftColumn;
-
-    int shift = g_aucConvertToBit[8];         // Using value corresponding to width = 8
-    v_topRow = v_topRow << (2 + shift);
-    v_leftColumn = v_leftColumn << (2 + shift);
-
-    Vec8s v_horPred4 = v_leftColumn + Vec8s(8);
-    const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
-    Vec8s v_horPred, v_rightColumnN;
-    Vec8s v_im4;
-    Vec16uc v_im5;
-
-    COMP_PRED_PLANAR_ROW(0);     // row 0
-    COMP_PRED_PLANAR_ROW(1);
-    COMP_PRED_PLANAR_ROW(2);
-    COMP_PRED_PLANAR_ROW(3);
-    COMP_PRED_PLANAR_ROW(4);
-    COMP_PRED_PLANAR_ROW(5);
-    COMP_PRED_PLANAR_ROW(6);
-    COMP_PRED_PLANAR_ROW(7);     // row 7
-}
-
-#undef COMP_PRED_PLANAR_ROW
-
-#if INSTRSET >= 5
-void predIntraPlanar8_sse4(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride)
-{
-    pixel bottomLeft, topRight;
-
-    // Get left and above reference column and row
-    __m128i im0 = _mm_loadl_epi64((__m128i*)&pSrc[0 - srcStride]); // topRow
-    __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
-
-    __m128i v_leftColumn = _mm_setzero_si128();
-
-    v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[0 * srcStride - 1], 0);
-    v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[1 * srcStride - 1], 1);
-    v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[2 * srcStride - 1], 2);
-    v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[3 * srcStride - 1], 3);
-    v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[4 * srcStride - 1], 4);
-    v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[5 * srcStride - 1], 5);
-    v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[6 * srcStride - 1], 6);
-    v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[7 * srcStride - 1], 7);
-    v_leftColumn = _mm_unpacklo_epi8(v_leftColumn, _mm_setzero_si128());
-
-    // Prepare intermediate variables used in interpolation
-    bottomLeft = pSrc[8 * srcStride - 1];
-    topRight   = pSrc[8 - srcStride];
-
-    __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
-    __m128i v_topRight   = _mm_set1_epi16(topRight);
-
-    __m128i v_bottomRow   = _mm_sub_epi16(v_bottomLeft, v_topRow);
-    __m128i v_rightColumn = _mm_sub_epi16(v_topRight, v_leftColumn);
-
-    v_topRow = _mm_slli_epi16(v_topRow, 3);
-    v_leftColumn = _mm_slli_epi16(v_leftColumn, 3);
-
-    __m128i v_horPred4 = _mm_add_epi16(v_leftColumn, _mm_set1_epi16(8));
-    __m128i v_horPred, v_rightColumnN;
-    __m128i v_im4;
-    __m128i v_im5;
-
-#define COMP_PRED_PLANAR_ROW(Y) { \
-        if ((Y) < 4) { \
-            v_horPred = _mm_shufflelo_epi16(v_horPred4, ((Y) & 3) * 0x55); \
-            v_horPred = _mm_unpacklo_epi64(v_horPred, v_horPred); \
-            v_rightColumnN = _mm_shufflelo_epi16(v_rightColumn, ((Y) & 3) * 0x55); \
-            v_rightColumnN = _mm_unpacklo_epi64(v_rightColumnN, v_rightColumnN); \
-        } \
-        else { \
-            v_horPred = _mm_shufflehi_epi16(v_horPred4, ((Y) & 3) * 0x55); \
-            v_horPred = _mm_unpackhi_epi64(v_horPred, v_horPred); \
-            v_rightColumnN = _mm_shufflehi_epi16(v_rightColumn, ((Y) & 3) * 0x55); \
-            v_rightColumnN = _mm_unpackhi_epi64(v_rightColumnN, v_rightColumnN); \
-        } \
-        v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multiL); \
-        v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \
-        v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \
-        v_im4 = _mm_srai_epi16(_mm_add_epi16(v_horPred, v_topRow), 4); \
-        v_im5 = _mm_packus_epi16(v_im4, v_im4); \
-        _mm_storel_epi64((__m128i*)&pDst[(Y)*dstStride], v_im5); \
-}
-
-    COMP_PRED_PLANAR_ROW(0)
-    COMP_PRED_PLANAR_ROW(1)
-    COMP_PRED_PLANAR_ROW(2)
-    COMP_PRED_PLANAR_ROW(3)
-    COMP_PRED_PLANAR_ROW(4)
-    COMP_PRED_PLANAR_ROW(5)
-    COMP_PRED_PLANAR_ROW(6)
-    COMP_PRED_PLANAR_ROW(7)
-
-#undef COMP_PRED_PLANAR_ROW
-}
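-
-// A note on the idiom used above (illustrative only): without SSSE3/AVX
-// broadcasts, a 16-bit scalar v is splatted across all eight lanes as
-//   __m128i t = _mm_cvtsi32_si128(v);     // v 0 0 0 | 0 0 0 0
-//   t = _mm_shufflelo_epi16(t, 0);        // v v v v | 0 0 0 0
-//   t = _mm_unpacklo_epi64(t, t);         // v v v v | v v v v
-// and _mm_shufflehi_epi16 plays the same role for lanes 4..7.  The shift
-// amounts also fold the generic planar scaling into this kernel: inputs are
-// pre-scaled by << 3 and the rounded result taken with >> 4, matching
-// (<< log2(N)) ... >> (log2(N)+1) for N == 8.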
-
-#endif // INSTRSET >= 5
-
-#endif /* if HIGH_BIT_DEPTH */
-
-#if HIGH_BIT_DEPTH
-#define COMP_PRED_PLANAR_ROW(X) { \
-        v_horPred_lo = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \
-        v_horPred_hi = v_horPred_lo; \
-        v_rightColumnN_lo = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn); \
-        v_rightColumnN_hi = v_rightColumnN_lo; \
-        v_rightColumnN_lo *= v_multi_lo; \
-        v_rightColumnN_hi *= v_multi_hi; \
-        v_horPred_lo = v_horPred_lo + v_rightColumnN_lo; \
-        v_horPred_hi = v_horPred_hi + v_rightColumnN_hi; \
-        v_topRow_lo = v_topRow_lo + v_bottomRow_lo; \
-        v_topRow_hi = v_topRow_hi + v_bottomRow_hi; \
-        v_im4_lo = (Vec8s)(v_horPred_lo + v_topRow_lo) >> (3 + shift); \
-        v_im4_hi = (Vec8s)(v_horPred_hi + v_topRow_hi) >> (3 + shift); \
-        v_im4_lo.store(&rpDst[X * dstStride]); \
-        v_im4_hi.store(&rpDst[X * dstStride + 8]); \
-}
-
-void predIntraPlanar16(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
-{
-    int k;
-    pixel bottomLeft, topRight;
-    int16_t leftColumn[16];
-
-    // Get left and above reference column and row
-    Vec8s v_topRow_lo, v_topRow_hi;
-
-    v_topRow_lo.load(&pSrc[-srcStride]);
-    v_topRow_hi.load(&pSrc[-srcStride + 8]);
-
-    for (k = 0; k < 16; k++)
-    {
-        leftColumn[k] = pSrc[k * srcStride - 1];
-    }
-
-    Vec8s v_leftColumn;
-    v_leftColumn.load(leftColumn);   // leftColumn
-
-    // Prepare intermediate variables used in interpolation
-    bottomLeft = pSrc[16 * srcStride - 1];
-    topRight   = pSrc[16 - srcStride];
-
-    Vec8s v_bottomLeft(bottomLeft);
-    Vec8s v_topRight(topRight);
-
-    Vec8s v_bottomRow_lo = v_bottomLeft - v_topRow_lo;
-    Vec8s v_bottomRow_hi = v_bottomLeft - v_topRow_hi;
-    Vec8s v_rightColumn = v_topRight - v_leftColumn;
-
-    int shift = g_aucConvertToBit[16];         // Using value corresponding to width = 16
-    v_topRow_lo = v_topRow_lo << (2 + shift);
-    v_topRow_hi = v_topRow_hi << (2 + shift);
-    v_leftColumn = v_leftColumn << (2 + shift);
-
-    Vec8s v_horPred4 = v_leftColumn + Vec8s(16);
-    const Vec8s v_multi_lo(1, 2, 3, 4, 5, 6, 7, 8);
-    const Vec8s v_multi_hi(9, 10, 11, 12, 13, 14, 15, 16);
-    Vec8s v_horPred_lo, v_horPred_hi, v_rightColumnN_lo, v_rightColumnN_hi;
-    Vec8s v_im4_lo, v_im4_hi;
-
-    COMP_PRED_PLANAR_ROW(0);     // row 0
-    COMP_PRED_PLANAR_ROW(1);
-    COMP_PRED_PLANAR_ROW(2);
-    COMP_PRED_PLANAR_ROW(3);
-    COMP_PRED_PLANAR_ROW(4);
-    COMP_PRED_PLANAR_ROW(5);
-    COMP_PRED_PLANAR_ROW(6);
-    COMP_PRED_PLANAR_ROW(7);     // row 7
-
-    v_leftColumn.load(leftColumn + 8);   // leftColumn lower 8 rows
-    v_rightColumn = v_topRight - v_leftColumn;
-    v_leftColumn = v_leftColumn << (2 + shift);
-    v_horPred4 = v_leftColumn + Vec8s(16);
-
-    COMP_PRED_PLANAR_ROW(8);     // row 8
-    COMP_PRED_PLANAR_ROW(9);
-    COMP_PRED_PLANAR_ROW(10);
-    COMP_PRED_PLANAR_ROW(11);
-    COMP_PRED_PLANAR_ROW(12);
-    COMP_PRED_PLANAR_ROW(13);
-    COMP_PRED_PLANAR_ROW(14);
-    COMP_PRED_PLANAR_ROW(15);
-}
-
-#undef COMP_PRED_PLANAR_ROW
-
-#else /* if HIGH_BIT_DEPTH */
-#define COMP_PRED_PLANAR_ROW(X) { \
-        v_horPred_lo = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \
-        v_horPred_hi = v_horPred_lo; \
-        v_rightColumnN_lo = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn); \
-        v_rightColumnN_hi = v_rightColumnN_lo; \
-        v_rightColumnN_lo *= v_multi_lo; \
-        v_rightColumnN_hi *= v_multi_hi; \
-        v_horPred_lo = v_horPred_lo + v_rightColumnN_lo; \
-        v_horPred_hi = v_horPred_hi + v_rightColumnN_hi; \
-        v_topRow_lo = v_topRow_lo + v_bottomRow_lo; \
-        v_topRow_hi = v_topRow_hi + v_bottomRow_hi; \
-        v_im4_lo = (Vec8s)(v_horPred_lo + v_topRow_lo) >> (3 + shift); \
-        v_im4_hi = (Vec8s)(v_horPred_hi + v_topRow_hi) >> (3 + shift); \
-        v_im5 = compress(v_im4_lo, v_im4_hi); \
-        store_partial(const_int(16), &rpDst[X * dstStride], v_im5); \
-}
-
-void predIntraPlanar16(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
-{
-    int k;
-    pixel bottomLeft, topRight;
-    int16_t leftColumn[16];
-
-    // Get left and above reference column and row
-    Vec16uc im0 = (Vec16uc)load_partial(const_int(16), &pSrc[-srcStride]); // topRow
-    Vec8s v_topRow_lo = extend_low(im0);
-    Vec8s v_topRow_hi = extend_high(im0);
-
-    for (k = 0; k < 16; k++)
-    {
-        leftColumn[k] = pSrc[k * srcStride - 1];
-    }
-
-    Vec8s v_leftColumn;
-    v_leftColumn.load(leftColumn);   // leftColumn
-
-    // Prepare intermediate variables used in interpolation
-    bottomLeft = pSrc[16 * srcStride - 1];
-    topRight   = pSrc[16 - srcStride];
-
-    Vec8s v_bottomLeft(bottomLeft);
-    Vec8s v_topRight(topRight);
-
-    Vec8s v_bottomRow_lo = v_bottomLeft - v_topRow_lo;
-    Vec8s v_bottomRow_hi = v_bottomLeft - v_topRow_hi;
-    Vec8s v_rightColumn = v_topRight - v_leftColumn;
-
-    int shift = g_aucConvertToBit[16];         // Using value corresponding to width = 16
-    v_topRow_lo = v_topRow_lo << (2 + shift);
-    v_topRow_hi = v_topRow_hi << (2 + shift);
-    v_leftColumn = v_leftColumn << (2 + shift);
-
-    Vec8s v_horPred4 = v_leftColumn + Vec8s(16);
-    const Vec8s v_multi_lo(1, 2, 3, 4, 5, 6, 7, 8);
-    const Vec8s v_multi_hi(9, 10, 11, 12, 13, 14, 15, 16);
-    Vec8s v_horPred_lo, v_horPred_hi, v_rightColumnN_lo, v_rightColumnN_hi;
-    Vec8s v_im4_lo, v_im4_hi;
-    Vec16uc v_im5;
-
-    COMP_PRED_PLANAR_ROW(0);     // row 0
-    COMP_PRED_PLANAR_ROW(1);
-    COMP_PRED_PLANAR_ROW(2);
-    COMP_PRED_PLANAR_ROW(3);
-    COMP_PRED_PLANAR_ROW(4);
-    COMP_PRED_PLANAR_ROW(5);
-    COMP_PRED_PLANAR_ROW(6);
-    COMP_PRED_PLANAR_ROW(7);     // row 7
-
-    v_leftColumn.load(leftColumn + 8);   // leftColumn lower 8 rows
-    v_rightColumn = v_topRight - v_leftColumn;
-    v_leftColumn = v_leftColumn << (2 + shift);
-    v_horPred4 = v_leftColumn + Vec8s(16);
-
-    COMP_PRED_PLANAR_ROW(8);     // row 8
-    COMP_PRED_PLANAR_ROW(9);
-    COMP_PRED_PLANAR_ROW(10);
-    COMP_PRED_PLANAR_ROW(11);
-    COMP_PRED_PLANAR_ROW(12);
-    COMP_PRED_PLANAR_ROW(13);
-    COMP_PRED_PLANAR_ROW(14);
-    COMP_PRED_PLANAR_ROW(15);
-}
-
-#undef COMP_PRED_PLANAR_ROW
-
-#if INSTRSET >= 5
-void predIntraPlanar16_sse4(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride)
-{
-    pixel bottomLeft, topRight;
-    __m128i v_topRow[2];
-    __m128i v_bottomRow[2];
-
-    // Get left and above reference column and row
-    __m128i im0 = _mm_loadu_si128((__m128i*)&pSrc[0 - srcStride]); // topRow
-
-    v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
-    v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());
-
-    // Prepare intermediate variables used in interpolation
-    bottomLeft = pSrc[16 * srcStride - 1];
-    topRight   = pSrc[16 - srcStride];
-
-    __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
-
-    v_bottomRow[0] = _mm_sub_epi16(v_bottomLeft, v_topRow[0]);
-    v_bottomRow[1] = _mm_sub_epi16(v_bottomLeft, v_topRow[1]);
-
-    v_topRow[0] = _mm_slli_epi16(v_topRow[0], 4);
-    v_topRow[1] = _mm_slli_epi16(v_topRow[1], 4);
-
-    __m128i v_horPred, v_horPredN[2], v_rightColumnN[2];
-    __m128i v_im4L, v_im4H;
-    __m128i v_im5;
-
-#define COMP_PRED_PLANAR_ROW(Y) { \
-        v_horPred = _mm_cvtsi32_si128((pSrc[(Y)*srcStride - 1] << 4) + 16); \
-        v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \
-        v_horPred = _mm_shuffle_epi32(v_horPred, 0); \
-        __m128i _tmp = _mm_cvtsi32_si128(topRight - pSrc[(Y)*srcStride - 1]); \
-        _tmp = _mm_shufflelo_epi16(_tmp, 0); \
-        _tmp = _mm_shuffle_epi32(_tmp, 0); \
-        v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \
-        v_rightColumnN[1] = _mm_mullo_epi16(_tmp, v_multiH); \
-        v_horPredN[0] = _mm_add_epi16(v_horPred, v_rightColumnN[0]); \
-        v_horPredN[1] = _mm_add_epi16(v_horPred, v_rightColumnN[1]); \
-        v_topRow[0] = _mm_add_epi16(v_topRow[0], v_bottomRow[0]); \
-        v_topRow[1] = _mm_add_epi16(v_topRow[1], v_bottomRow[1]); \
-        v_im4L = _mm_srai_epi16(_mm_add_epi16(v_horPredN[0], v_topRow[0]), 5); \
-        v_im4H = _mm_srai_epi16(_mm_add_epi16(v_horPredN[1], v_topRow[1]), 5); \
-        v_im5 = _mm_packus_epi16(v_im4L, v_im4H); \
-        _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride], v_im5); \
-}
-
-    COMP_PRED_PLANAR_ROW(0)
-    COMP_PRED_PLANAR_ROW(1)
-    COMP_PRED_PLANAR_ROW(2)
-    COMP_PRED_PLANAR_ROW(3)
-    COMP_PRED_PLANAR_ROW(4)
-    COMP_PRED_PLANAR_ROW(5)
-    COMP_PRED_PLANAR_ROW(6)
-    COMP_PRED_PLANAR_ROW(7)
-    COMP_PRED_PLANAR_ROW(8)
-    COMP_PRED_PLANAR_ROW(9)
-    COMP_PRED_PLANAR_ROW(10)
-    COMP_PRED_PLANAR_ROW(11)
-    COMP_PRED_PLANAR_ROW(12)
-    COMP_PRED_PLANAR_ROW(13)
-    COMP_PRED_PLANAR_ROW(14)
-    COMP_PRED_PLANAR_ROW(15)
-
-#undef COMP_PRED_PLANAR_ROW
-}
-
-#endif // INSTRSET >= 5
-
-#if INSTRSET >= 5
-void predIntraPlanar32_sse4(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride)
-{
-    pixel bottomLeft, topRight;
-    __m128i v_topRow[4];
-    __m128i v_bottomRow[4];
-
-    // Get left and above reference column and row
-    __m128i im0 = _mm_loadu_si128((__m128i*)&pSrc[0 - srcStride]); // topRow
-    __m128i im1 = _mm_loadu_si128((__m128i*)&pSrc[16 - srcStride]); // topRow
-
-    v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
-    v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());
-    v_topRow[2] = _mm_unpacklo_epi8(im1, _mm_setzero_si128());
-    v_topRow[3] = _mm_unpackhi_epi8(im1, _mm_setzero_si128());
-
-    // Prepare intermediate variables used in interpolation
-    bottomLeft = pSrc[32 * srcStride - 1];
-    topRight   = pSrc[32 - srcStride];
-
-    __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
-
-    v_bottomRow[0] = _mm_sub_epi16(v_bottomLeft, v_topRow[0]);
-    v_bottomRow[1] = _mm_sub_epi16(v_bottomLeft, v_topRow[1]);
-    v_bottomRow[2] = _mm_sub_epi16(v_bottomLeft, v_topRow[2]);
-    v_bottomRow[3] = _mm_sub_epi16(v_bottomLeft, v_topRow[3]);
-
-    v_topRow[0] = _mm_slli_epi16(v_topRow[0], 5);
-    v_topRow[1] = _mm_slli_epi16(v_topRow[1], 5);
-    v_topRow[2] = _mm_slli_epi16(v_topRow[2], 5);
-    v_topRow[3] = _mm_slli_epi16(v_topRow[3], 5);
-
-    __m128i v_horPred, v_horPredN[4], v_rightColumnN[4];
-    __m128i v_im4[4];
-    __m128i v_im5[2];
-
-#define COMP_PRED_PLANAR_ROW(Y) { \
-        v_horPred = _mm_cvtsi32_si128((pSrc[(Y)*srcStride - 1] << 5) + 32); \
-        v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \
-        v_horPred = _mm_shuffle_epi32(v_horPred, 0); \
-        __m128i _tmp = _mm_cvtsi32_si128(topRight - pSrc[(Y)*srcStride - 1]); \
-        _tmp = _mm_shufflelo_epi16(_tmp, 0); \
-        _tmp = _mm_shuffle_epi32(_tmp, 0); \
-        v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \
-        v_rightColumnN[1] = _mm_mullo_epi16(_tmp, v_multiH); \
-        v_rightColumnN[2] = _mm_mullo_epi16(_tmp, v_multiH2); \
-        v_rightColumnN[3] = _mm_mullo_epi16(_tmp, v_multiH3); \
-        v_horPredN[0] = _mm_add_epi16(v_horPred, v_rightColumnN[0]); \
-        v_horPredN[1] = _mm_add_epi16(v_horPred, v_rightColumnN[1]); \
-        v_horPredN[2] = _mm_add_epi16(v_horPred, v_rightColumnN[2]); \
-        v_horPredN[3] = _mm_add_epi16(v_horPred, v_rightColumnN[3]); \
-        v_topRow[0] = _mm_add_epi16(v_topRow[0], v_bottomRow[0]); \
-        v_topRow[1] = _mm_add_epi16(v_topRow[1], v_bottomRow[1]); \
-        v_topRow[2] = _mm_add_epi16(v_topRow[2], v_bottomRow[2]); \
-        v_topRow[3] = _mm_add_epi16(v_topRow[3], v_bottomRow[3]); \
-        v_im4[0] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[0], v_topRow[0]), 6); \
-        v_im4[1] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[1], v_topRow[1]), 6); \
-        v_im4[2] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[2], v_topRow[2]), 6); \
-        v_im4[3] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[3], v_topRow[3]), 6); \
-        v_im5[0] = _mm_packus_epi16(v_im4[0], v_im4[1]); \
-        v_im5[1] = _mm_packus_epi16(v_im4[2], v_im4[3]); \
-        _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride], v_im5[0]); \
-        _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride + 16], v_im5[1]); \
-}
-
-    int i;
-    for (i = 0; i < 32; i += 2)
-    {
-        COMP_PRED_PLANAR_ROW(i + 0);
-        COMP_PRED_PLANAR_ROW(i + 1);
-    }
-
-#undef COMP_PRED_PLANAR_ROW
-}
-
-#endif // INSTRSET >= 5
-
-#if INSTRSET >= 5
-void predIntraPlanar64_sse4(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride)
-{
-    pixel bottomLeft, topRight;
-    __m128i v_topRow[8];
-    __m128i v_bottomRow[8];
-
-    // Get left and above reference column and row
-    __m128i im0 = _mm_loadu_si128((__m128i*)&pSrc[0 - srcStride]); // topRow
-    __m128i im1 = _mm_loadu_si128((__m128i*)&pSrc[16 - srcStride]); // topRow
-    __m128i im2 = _mm_loadu_si128((__m128i*)&pSrc[32 - srcStride]); // topRow
-    __m128i im3 = _mm_loadu_si128((__m128i*)&pSrc[48 - srcStride]); // topRow
-
-    v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
-    v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());
-    v_topRow[2] = _mm_unpacklo_epi8(im1, _mm_setzero_si128());
-    v_topRow[3] = _mm_unpackhi_epi8(im1, _mm_setzero_si128());
-    v_topRow[4] = _mm_unpacklo_epi8(im2, _mm_setzero_si128());
-    v_topRow[5] = _mm_unpackhi_epi8(im2, _mm_setzero_si128());
-    v_topRow[6] = _mm_unpacklo_epi8(im3, _mm_setzero_si128());
-    v_topRow[7] = _mm_unpackhi_epi8(im3, _mm_setzero_si128());
-
-    // Prepare intermediate variables used in interpolation
-    bottomLeft = pSrc[64 * srcStride - 1];
-    topRight   = pSrc[64 - srcStride];
-
-    __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
-
-    v_bottomRow[0] = _mm_sub_epi16(v_bottomLeft, v_topRow[0]);
-    v_bottomRow[1] = _mm_sub_epi16(v_bottomLeft, v_topRow[1]);
-    v_bottomRow[2] = _mm_sub_epi16(v_bottomLeft, v_topRow[2]);
-    v_bottomRow[3] = _mm_sub_epi16(v_bottomLeft, v_topRow[3]);
-    v_bottomRow[4] = _mm_sub_epi16(v_bottomLeft, v_topRow[4]);
-    v_bottomRow[5] = _mm_sub_epi16(v_bottomLeft, v_topRow[5]);
-    v_bottomRow[6] = _mm_sub_epi16(v_bottomLeft, v_topRow[6]);
-    v_bottomRow[7] = _mm_sub_epi16(v_bottomLeft, v_topRow[7]);
-
-    v_topRow[0] = _mm_slli_epi16(v_topRow[0], 6);
-    v_topRow[1] = _mm_slli_epi16(v_topRow[1], 6);
-    v_topRow[2] = _mm_slli_epi16(v_topRow[2], 6);
-    v_topRow[3] = _mm_slli_epi16(v_topRow[3], 6);
-    v_topRow[4] = _mm_slli_epi16(v_topRow[4], 6);
-    v_topRow[5] = _mm_slli_epi16(v_topRow[5], 6);
-    v_topRow[6] = _mm_slli_epi16(v_topRow[6], 6);
-    v_topRow[7] = _mm_slli_epi16(v_topRow[7], 6);
-
-    __m128i v_horPred, v_horPredN[8], v_rightColumnN[8];
-    __m128i v_im4[8];
-    __m128i v_im5[4];
-
-#define COMP_PRED_PLANAR_ROW(Y) { \
-        v_horPred = _mm_cvtsi32_si128((pSrc[(Y)*srcStride - 1] << 6) + 64); \
-        v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \
-        v_horPred = _mm_shuffle_epi32(v_horPred, 0); \
-        __m128i _tmp = _mm_cvtsi32_si128(topRight - pSrc[(Y)*srcStride - 1]); \
-        _tmp = _mm_shufflelo_epi16(_tmp, 0); \
-        _tmp = _mm_shuffle_epi32(_tmp, 0); \
-        v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \
-        v_rightColumnN[1] = _mm_mullo_epi16(_tmp, v_multiH); \
-        v_rightColumnN[2] = _mm_mullo_epi16(_tmp, v_multiH2); \
-        v_rightColumnN[3] = _mm_mullo_epi16(_tmp, v_multiH3); \
-        v_rightColumnN[4] = _mm_mullo_epi16(_tmp, v_multiH4); \
-        v_rightColumnN[5] = _mm_mullo_epi16(_tmp, v_multiH5); \
-        v_rightColumnN[6] = _mm_mullo_epi16(_tmp, v_multiH6); \
-        v_rightColumnN[7] = _mm_mullo_epi16(_tmp, v_multiH7); \
-        v_horPredN[0] = _mm_add_epi16(v_horPred, v_rightColumnN[0]); \
-        v_horPredN[1] = _mm_add_epi16(v_horPred, v_rightColumnN[1]); \
-        v_horPredN[2] = _mm_add_epi16(v_horPred, v_rightColumnN[2]); \
-        v_horPredN[3] = _mm_add_epi16(v_horPred, v_rightColumnN[3]); \
-        v_horPredN[4] = _mm_add_epi16(v_horPred, v_rightColumnN[4]); \
-        v_horPredN[5] = _mm_add_epi16(v_horPred, v_rightColumnN[5]); \
-        v_horPredN[6] = _mm_add_epi16(v_horPred, v_rightColumnN[6]); \
-        v_horPredN[7] = _mm_add_epi16(v_horPred, v_rightColumnN[7]); \
-        v_topRow[0] = _mm_add_epi16(v_topRow[0], v_bottomRow[0]); \
-        v_topRow[1] = _mm_add_epi16(v_topRow[1], v_bottomRow[1]); \
-        v_topRow[2] = _mm_add_epi16(v_topRow[2], v_bottomRow[2]); \
-        v_topRow[3] = _mm_add_epi16(v_topRow[3], v_bottomRow[3]); \
-        v_topRow[4] = _mm_add_epi16(v_topRow[4], v_bottomRow[4]); \
-        v_topRow[5] = _mm_add_epi16(v_topRow[5], v_bottomRow[5]); \
-        v_topRow[6] = _mm_add_epi16(v_topRow[6], v_bottomRow[6]); \
-        v_topRow[7] = _mm_add_epi16(v_topRow[7], v_bottomRow[7]); \
-        v_im4[0] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[0], v_topRow[0]), 7); \
-        v_im4[1] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[1], v_topRow[1]), 7); \
-        v_im4[2] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[2], v_topRow[2]), 7); \
-        v_im4[3] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[3], v_topRow[3]), 7); \
-        v_im4[4] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[4], v_topRow[4]), 7); \
-        v_im4[5] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[5], v_topRow[5]), 7); \
-        v_im4[6] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[6], v_topRow[6]), 7); \
-        v_im4[7] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[7], v_topRow[7]), 7); \
-        v_im5[0] = _mm_packus_epi16(v_im4[0], v_im4[1]); \
-        v_im5[1] = _mm_packus_epi16(v_im4[2], v_im4[3]); \
-        v_im5[2] = _mm_packus_epi16(v_im4[4], v_im4[5]); \
-        v_im5[3] = _mm_packus_epi16(v_im4[6], v_im4[7]); \
-        _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride], v_im5[0]); \
-        _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride + 16], v_im5[1]); \
-        _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride + 32], v_im5[2]); \
-        _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride + 48], v_im5[3]); \
-}
-
-    int i;
-    for (i = 0; i < 64; i++)
-    {
-        COMP_PRED_PLANAR_ROW(i);
-    }
-
-#undef COMP_PRED_PLANAR_ROW
-}
-
-#endif // INSTRSET >= 5
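-
-// The v_multiL/v_multiH/v_multiH2..v_multiH7 vectors used by the *_sse4
-// planar kernels are column-index constants defined elsewhere in this file.
-// A minimal sketch of what those definitions are assumed to look like (names
-// taken from their use above; the actual definitions are not in this hunk):
-//   static const __m128i v_multiL  = _mm_setr_epi16( 1,  2,  3,  4,  5,  6,  7,  8);
-//   static const __m128i v_multiH  = _mm_setr_epi16( 9, 10, 11, 12, 13, 14, 15, 16);
-//   static const __m128i v_multiH2 = _mm_setr_epi16(17, 18, 19, 20, 21, 22, 23, 24);
-//   ... up to v_multiH7 covering columns 57..64 for the 64-wide kernel.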
-
-#endif /* if HIGH_BIT_DEPTH */
-
-typedef void predIntraPlanar_t(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride);
-predIntraPlanar_t *intraPlanarN[] =
-{
-#if !HIGH_BIT_DEPTH && INSTRSET >= 5
-    predIntraPlanar4_sse4,
-    predIntraPlanar8_sse4,
-    predIntraPlanar16_sse4,
-    predIntraPlanar32_sse4,
-    predIntraPlanar64_sse4,
-#else
-    predIntraPlanar4,
-    predIntraPlanar8,
-    predIntraPlanar16,
-#endif
-};
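-
-// Dispatch is by log2(width) - 2, so g_aucConvertToBit[width] indexes the
-// table directly: width 4 -> intraPlanarN[0], 8 -> [1], 16 -> [2], and so on.
-// Usage sketch (assuming an 8x8 block):
-//   int idx = g_aucConvertToBit[8];          // == 1
-//   intraPlanarN[idx](pSrc, srcStride, pDst, dstStride);
-// Note the fallback build only populates the 4/8/16 entries, which is why
-// predIntraPlanar() below routes width >= 32 to the generic C loop.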
-
-void predIntraPlanar(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride, int width)
-{
-    //assert(width == height);
-
-    int nLog2Size = g_aucConvertToBit[width] + 2;
-
-#if (!HIGH_BIT_DEPTH) && (INSTRSET >= 5)
-    intraPlanarN[nLog2Size - 2](pSrc, srcStride, rpDst, dstStride);
-    return;
-#else
-    int k, l, bottomLeft, topRight;
-    int horPred;
-    // OPT_ME: when width is 64, the shifted row/column terms can overflow int16_t at higher bit depths, so we have to use 32 bits here
-    int32_t leftColumn[MAX_CU_SIZE + 1], topRow[MAX_CU_SIZE + 1];
-    // CHECK_ME: dynamic range is 9 bits or 15 bits (assuming the max input bit depth is 14 bits)
-    int16_t bottomRow[MAX_CU_SIZE], rightColumn[MAX_CU_SIZE];
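-    // Rough bound supporting the notes above: topRow[l] starts at
-    // top[l] << shift1D (<= (2^B - 1) * N for bit depth B) and has
-    // bottomRow[l] (|.| <= 2^B - 1) added once per row, so after N rows
-    // |topRow[l]| < 2 * N * (2^B - 1).  At B == 14, N == 64 that is about
-    // 2^21, well beyond int16_t, hence the 32-bit arrays; the per-step
-    // deltas bottomRow/rightColumn themselves stay within 16 bits.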
-    int blkSize = width;
-    int offset2D = width;
-    int shift1D = nLog2Size;
-    int shift2D = shift1D + 1;
-
-    if (width < 32)
-    {
-        intraPlanarN[nLog2Size - 2](pSrc, srcStride, rpDst, dstStride);
-        return;
-    }
-
-    // Get left and above reference column and row
-    for (k = 0; k < blkSize + 1; k++)
-    {
-        topRow[k] = pSrc[k - srcStride];
-        leftColumn[k] = pSrc[k * srcStride - 1];
-    }
-
-    // Prepare intermediate variables used in interpolation
-    bottomLeft = leftColumn[blkSize];
-    topRight   = topRow[blkSize];
-    for (k = 0; k < blkSize; k++)
-    {
-        bottomRow[k]   = bottomLeft - topRow[k];
-        rightColumn[k] = topRight   - leftColumn[k];
-        topRow[k]      <<= shift1D;
-        leftColumn[k]  <<= shift1D;
-    }
-
-    // Generate prediction signal
-    for (k = 0; k < blkSize; k++)
-    {
-        horPred = leftColumn[k] + offset2D;
-        for (l = 0; l < blkSize; l++)
-        {
-            horPred += rightColumn[k];
-            topRow[l] += bottomRow[l];
-            rpDst[k * dstStride + l] = ((horPred + topRow[l]) >> shift2D);
-        }
-    }
-
-#endif /* if (!HIGH_BIT_DEPTH) && (INSTRSET >= 5) */
-}
-
-#if HIGH_BIT_DEPTH
-void xPredIntraAng4x4(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove)
-{
-    int blkSize        = width;
-
-    // Map the mode index to main prediction direction and angle
-    assert(dirMode > 1); //no planar and dc
-    bool modeHor       = (dirMode < 18);
-    bool modeVer       = !modeHor;
-    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
-    int lookIdx = intraPredAngle;
-    int absAng         = abs(intraPredAngle);
-    int signAng        = intraPredAngle < 0 ? -1 : 1;
-
-    // Set bitshifts and scale the angle parameter to block size
-    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
-    int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
-    int invAngle       = invAngTable[absAng];
-    absAng             = angTable[absAng];
-    intraPredAngle     = signAng * absAng;
-
-    // Do angular predictions
-
-    pixel* refMain;
-    pixel* refSide;
-
-    // Initialise the Main and Left reference array.
-    if (intraPredAngle < 0)
-    {
-        refMain = (modeVer ? refAbove : refLeft);     // + (blkSize - 1);
-        refSide = (modeVer ? refLeft : refAbove);     // + (blkSize - 1);
-
-        // Extend the Main reference to the left.
-        int invAngleSum    = 128;     // rounding for (shift by 8)
-        for (int k = -1; k > blkSize * intraPredAngle >> 5; k--)
-        {
-            invAngleSum += invAngle;
-            refMain[k] = refSide[invAngleSum >> 8];
-        }
-    }
-    else
-    {
-        refMain = modeVer ? refAbove : refLeft;
-        refSide = modeVer ? refLeft  : refAbove;
-    }
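-
-    // When the angle is negative the main reference runs out to the left, so
-    // it is extended by projecting side samples along the inverse angle:
-    // invAngle approximates (256 * 32) / |angle|, and refSide[invAngleSum >> 8]
-    // walks the side array in 1/angle steps.  Worked example (illustrative):
-    // for |angle| == 26, invAngle == 315, so k == -1 reads
-    // refSide[(128 + 315) >> 8] == refSide[1], k == -2 reads
-    // refSide[(128 + 630) >> 8] == refSide[2], and so on.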
-
-    // bfilter will always be true for blocksize 4
-    if (intraPredAngle == 0)  // Exactly horizontal/vertical angles
-    {
-        if (modeHor)
-        {
-            Vec8s v_temp;
-            Vec8s v_side_0; // refSide[0] value in a vector
-            v_temp.load((void*)refSide);
-            v_side_0 = broadcast(const_int(0), (Vec8s)v_temp);
-
-            Vec8s v_side;
-            v_side.load(refSide + 1);
-
-            Vec8s v_main;
-            v_main = load_partial(const_int(8), (void*)(refMain + 1));
-
-            Vec8s tmp1, tmp2;
-            tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(v_main, v_main);
-            tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp1);
-            tmp1 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp1);
-
-            Vec8s row0;
-            v_side -= v_side_0;
-            v_side = v_side >> 1;
-            row0 = tmp2 + v_side;
-            row0 = min(max(0, row0), (1 << bitDepth) - 1);
-
-            store_partial(const_int(8), pDst, row0);                //row0
-            store_partial(const_int(8), pDst + (2 * dstStride), tmp1); //row2
-
-            tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp2, tmp2);
-            tmp1 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp1);
-
-            store_partial(const_int(8), pDst + (3 * dstStride), tmp1); //row3
-            store_partial(const_int(8), pDst + (dstStride), tmp2);    //row1
-        }
-        else
-        {
-            Vec16uc v_main;
-            v_main = load_partial(const_int(8), refMain + 1);
-            store_partial(const_int(8), pDst, v_main);
-            store_partial(const_int(8), pDst + dstStride, v_main);
-            store_partial(const_int(8), pDst + (2 * dstStride), v_main);
-            store_partial(const_int(8), pDst + (3 * dstStride), v_main);
-
-            for (int k = 0; k < 4; k++)
-            {
-                pDst[k * dstStride] = (pixel)Clip3((short)0, (short)((1 << bitDepth) - 1), static_cast<short>((pDst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1)));
-            }
-        }
-    }
-    else if (intraPredAngle == -32)
-    {
-        Vec8s tmp;
-        tmp = load_partial(const_int(8), refMain);        //-1,0,1,2
-        store_partial(const_int(8), pDst, tmp);
-        tmp = load_partial(const_int(8), refMain - 1);     //-2,-1,0,1
-        store_partial(const_int(8), pDst + dstStride, tmp);
-        tmp = load_partial(const_int(8), refMain - 2);
-        store_partial(const_int(8), pDst + 2 * dstStride, tmp);
-        tmp = load_partial(const_int(8), refMain - 3);
-        store_partial(const_int(8), pDst + 3 * dstStride, tmp);
-        return;
-    }
-    else if (intraPredAngle == 32)
-    {
-        Vec8s tmp;
-        tmp = load_partial(const_int(8), refMain + 2);     //1,2,3,4
-        store_partial(const_int(8), pDst, tmp);
-        tmp = load_partial(const_int(8), refMain + 3);     //2,3,4,5
-        store_partial(const_int(8), pDst + dstStride, tmp);
-        tmp = load_partial(const_int(8), refMain + 4);
-        store_partial(const_int(8), pDst + 2 * dstStride, tmp);
-        tmp = load_partial(const_int(8), refMain + 5);
-        store_partial(const_int(8), pDst + 3 * dstStride, tmp);
-        return;
-    }
-    else
-    {
-        Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
-        Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
-
-        row11 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 0));
-        row12 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 0) + 1);
-
-        row21 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 1));
-        row22 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 1) + 1);
-
-        row31 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 2));
-        row32 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 2) + 1);
-
-        row41 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 3));
-        row42 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 3) + 1);
-
-        v_deltaPos = v_ipAngle = intraPredAngle;
-
-        //row1
-        v_deltaFract = v_deltaPos & thirty1;
-        row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) + 16) >> 5;
-
-        //row2
-        v_deltaPos += v_ipAngle;
-        v_deltaFract = v_deltaPos & thirty1;
-        row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) + 16) >> 5;
-
-        //row3
-        v_deltaPos += v_ipAngle;
-        v_deltaFract = v_deltaPos & thirty1;
-        row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) + 16) >> 5;
-
-        //row4
-        v_deltaPos += v_ipAngle;
-        v_deltaFract = v_deltaPos & thirty1;
-        row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) + 16) >> 5;
-
-        // Flip the block
-
-        if (modeHor)
-        {
-            Vec8s tmp1, tmp2, tmp3, tmp4;
-
-            tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
-            tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
-
-            tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
-            tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
-
-            //tmp16_1 = compress(tmp3, tmp3);
-            store_partial(const_int(8), pDst, tmp3);
-
-            store_partial(const_int(8), pDst + (2 * dstStride), tmp4);  //row2
-
-            tmp3 = blend2q<1, 3>((Vec2q)tmp3, (Vec2q)tmp3);
-            tmp4 = blend2q<1, 3>((Vec2q)tmp4, (Vec2q)tmp4);
-
-            store_partial(const_int(8), pDst + (3 * dstStride), tmp4);   //row3
-            store_partial(const_int(8), pDst + (dstStride), tmp3);       //row1
-        }
-        else
-        {
-            store_partial(const_int(8), pDst, row11);
-            store_partial(const_int(8), pDst + (dstStride), row21);
-            store_partial(const_int(8), pDst + (2 * dstStride), row31);
-            store_partial(const_int(8), pDst + (3 * dstStride), row41);
-        }
-    }
-}
-
-#else /* if HIGH_BIT_DEPTH */
-void xPredIntraAng4x4(int /*bitDepth*/, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove, bool bFilter = true)
-{
-    int blkSize        = width;
-
-    // Map the mode index to main prediction direction and angle
-    assert(dirMode > 1); //no planar and dc
-    bool modeHor       = (dirMode < 18);
-    bool modeVer       = !modeHor;
-    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
-    int absAng         = abs(intraPredAngle);
-    int signAng        = intraPredAngle < 0 ? -1 : 1;
-
-    // Set bitshifts and scale the angle parameter to block size
-    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
-    int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
-    int invAngle       = invAngTable[absAng];
-    absAng             = angTable[absAng];
-    intraPredAngle     = signAng * absAng;
-
-    // Do angular predictions
-
-    pixel* refMain;
-    pixel* refSide;
-
-    // Initialise the Main and Left reference array.
-    if (intraPredAngle < 0)
-    {
-        refMain = (modeVer ? refAbove : refLeft);     // + (blkSize - 1);
-        refSide = (modeVer ? refLeft : refAbove);     // + (blkSize - 1);
-
-        // Extend the Main reference to the left.
-        int invAngleSum    = 128;     // rounding for (shift by 8)
-        for (int k = -1; k > blkSize * intraPredAngle >> 5; k--)
-        {
-            invAngleSum += invAngle;
-            refMain[k] = refSide[invAngleSum >> 8];
-        }
-    }
-    else
-    {
-        refMain = modeVer ? refAbove : refLeft;
-        refSide = modeVer ? refLeft  : refAbove;
-    }
-
-    // bfilter will always be true for exactly vertical/horizontal modes
-    if (intraPredAngle == 0)  // Exactly horizontal/vertical angles
-    {
-        if (modeHor)
-        {
-            Vec16uc v_main;
-            v_main = load_partial(const_int(4), (void*)(refMain + 1));
-
-            Vec16uc tmp16;
-            tmp16 = blend16c<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(v_main, v_main);
-            tmp16 = blend16c<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(tmp16, tmp16);
-            Vec2uq tmp;
-
-            if (bFilter)
-            {
-                Vec16uc v_temp;
-                Vec8s v_side_0; // refSide[0] value in a vector
-                v_temp = load_partial(const_int(8), (void*)refSide);
-                v_side_0 = broadcast(const_int(0), (Vec8s)v_temp);
-                v_side_0 = v_side_0 & 0x00FF;
-
-                //shift v_side by 1 element (1 byte)
-                tmp = reinterpret_i(v_temp);
-                tmp = tmp >> 8;
-                v_temp = reinterpret_i(tmp);
-                Vec8s v_side = extend_low(v_temp);
-
-                Vec8s row0 = extend_low(tmp16);
-                v_side -= v_side_0;
-                v_side = v_side >> 1;
-                row0 += v_side;
-                row0 = min(max(0, row0), 255);
-                Vec16uc v_res(compress_unsafe(row0, 0));
-                store_partial(const_int(4), pDst, v_res);
-            }
-            else
-            {
-                store_partial(const_int(4), pDst, tmp16);
-            }
-
-            tmp = (Vec2uq)tmp16;
-            tmp >>= 32;
-            store_partial(const_int(4), pDst + dstStride, tmp);
-
-            tmp = blend2q<1, 3>(reinterpret_i(tmp16), reinterpret_i(tmp16));
-            store_partial(const_int(4), pDst + (2 * dstStride), tmp);
-
-            tmp >>= 32;
-            store_partial(const_int(4), pDst + (3 * dstStride), tmp);
-        }
-        else
-        {
-            Vec16uc v_main;
-            v_main = load_partial(const_int(4), refMain + 1);
-            store_partial(const_int(4), pDst, v_main);
-            store_partial(const_int(4), pDst + dstStride, v_main);
-            store_partial(const_int(4), pDst + (2 * dstStride), v_main);
-            store_partial(const_int(4), pDst + (3 * dstStride), v_main);
-            if (bFilter)
-            {
-                for (int k = 0; k < 4; k++)
-                {
-                    pDst[k * dstStride] = (pixel)Clip3((short)0, (short)((1 << 8) - 1), static_cast<short>((pDst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1)));
-                }
-            }
-        }
-    }
-    else
-    {
-        Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
-        Vec16uc tmp16_1, tmp16_2;
-        Vec2uq tmp2uq;
-        Vec8s v_deltaFract, v_deltaPos(0), thirty2(32), thirty1(31), v_ipAngle(0);
-        switch (intraPredAngle)
-        {
-        case -32:
-            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain);    //-1,0,1,2
-            store_partial(const_int(4), pDst, tmp16_1);
-            tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain - 1); //-2,-1,0,1
-            store_partial(const_int(4), pDst + dstStride, tmp16_2);
-            tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain - 2);
-            store_partial(const_int(4), pDst + 2 * dstStride, tmp16_2);
-            tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain - 3);
-            store_partial(const_int(4), pDst + 3 * dstStride, tmp16_2);
-            return;
-
-        case -26:
-            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 3);
-            row41 = extend_low(tmp16_1);    //offsets(-4,-3,-2,-1)
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 8;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row42 = extend_low(tmp16_2);    //offsets(-3,-2,-1,0)
-
-            row31 = row42;                  //offsets(-3,-2,-1,0)
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 16;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row32 = extend_low(tmp16_2);    //offsets(-2,-1,0,1)
-
-            row21 = row32;                  //offsets(-2,-1,0,1)
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 24;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row22 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
-
-            row11 = row22;                  //offsets(-1,0,1,2)
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 32;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row12 = extend_low(tmp16_2);    //offsets(0,1,2,3)
-
-            v_deltaPos = v_ipAngle = -26;
-            break;
-
-        case -21:
-            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 2);
-            row41 = extend_low(tmp16_1);    //offsets(-3,-2,-1,0)
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 8;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row42 = extend_low(tmp16_2);    //offsets(-2,-1,0,1)
-
-            row31 = row42;                  //offsets(-2,-1,0,1)
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 16;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row32 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
-
-            row21 = row31;                  //offsets(-2,-1,0,1)
-            row22 = row32;
-
-            row11 = row32;
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 24;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row12 = extend_low(tmp16_2);    //offsets(0,1,2,3)
-
-            v_deltaPos = v_ipAngle = -21;
-            break;
-
-        case -17:
-            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 2);
-            row41 = extend_low(tmp16_1);    //offsets(-3,-2,-1,0)
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 8;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row42 = extend_low(tmp16_2);    //offsets(-2,-1,0,1)
-
-            row31 = row42;                  //offsets(-2,-1,0,1)
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 16;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row32 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
-
-            row21 = row31;                  //offsets(-2,-1,0,1)
-            row22 = row32;
-
-            row11 = row32;
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 24;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row12 = extend_low(tmp16_2);    //offsets(0,1,2,3)
-
-            v_deltaPos = v_ipAngle = -17;
-            break;
-
-        case -13:
-            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 1);
-            row41 = extend_low(tmp16_1);    //offsets(-2,-1,0,1)
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 8;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row42 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
-
-            row11 = row42;
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 16;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row12 = extend_low(tmp16_2);    //offsets(0,1,2,3)
-
-            row21 = row42;                  //offsets(-1,0,1,2)
-            row22 = row12;
-            row31 = row41;
-            row32 = row42;
-
-            v_deltaPos = v_ipAngle = -13;
-            break;
-
-        case -9:
-            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 1);
-            row41 = extend_low(tmp16_1);    //offsets(-2,-1,0,1)
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 8;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row42 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
-
-            row11 = row42;
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 16;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row12 = extend_low(tmp16_2);    //offsets(0,1,2,3)
-
-            row21 = row42;                  //offsets(-1,0,1,2)
-            row22 = row12;
-            row31 = row42;
-            row32 = row12;
-
-            v_deltaPos = v_ipAngle = -9;
-            break;
-
-        case -5:
-            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain);
-            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 8;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
-            row21 = row11;                  //offsets(0,1,2,3)
-            row22 = row12;
-            row31 = row11;
-            row32 = row12;
-            row41 = row11;
-            row42 = row12;
-
-            v_deltaPos = v_ipAngle = -5;
-            break;
-
-        case -2:
-            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain);
-            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 8;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
-            row21 = row11;                  //offsets(0,1,2,3)
-            row22 = row12;
-            row31 = row11;
-            row32 = row12;
-            row41 = row11;
-            row42 = row12;
-
-            v_deltaPos = v_ipAngle = -2;
-            break;
-
-        case 2:
-            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
-            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 8;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
-            row21 = row11;                  //offsets(0,1,2,3)
-            row22 = row12;
-            row31 = row11;
-            row32 = row12;
-            row41 = row11;
-            row42 = row12;
-
-            v_deltaPos = v_ipAngle = 2;
-            break;
-
-        case 5:
-            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
-            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 8;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
-            row21 = row11;                  //offsets(0,1,2,3)
-            row22 = row12;
-            row31 = row11;
-            row32 = row12;
-            row41 = row11;
-            row42 = row12;
-
-            v_deltaPos = v_ipAngle = 5;
-            break;
-
-        case 9:
-            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
-            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 8;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
-            row21 = row11;                  //offsets(0,1,2,3)
-            row22 = row12;
-            row31 = row11;
-            row32 = row12;
-            row41 = row12;
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 16;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row42 = extend_low(tmp16_2);
-
-            v_deltaPos = v_ipAngle = 9;
-            break;
-
-        case 13:
-            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
-
-            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
-
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 8;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
-
-            row21 = row11;                  //offsets(0,1,2,3)
-            row22 = row12;
-            row31 = row12;                  //offsets(1,2,3,4)
-
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 16;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row32 = extend_low(tmp16_2);    //offsets(2,3,4,5)
-
-            row41 = row31;                  //offsets(1,2,3,4)
-            row42 = row32;
-
-            v_deltaPos = v_ipAngle = 13;
-            break;
-
-        case 17:
-            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
-
-            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
-
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 8;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
-
-            row21 = row12;
-
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 16;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row22 = extend_low(tmp16_2);    //offsets(2,3,4,5)
-
-            row31 = row21;
-            row32 = row22;
-
-            row41 = row22;
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 24;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row42 = extend_low(tmp16_2);    //offsets(3,4,5,6)
-
-            v_deltaPos = v_ipAngle = 17;
-            break;
-
-        case 21:
-            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
-
-            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
-
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 8;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
-
-            row21 = row12;
-
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 16;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row22 = extend_low(tmp16_2);    //offsets(2,3,4,5)
-
-            row31 = row21;
-            row32 = row22;
-
-            row41 = row22;
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 24;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row42 = extend_low(tmp16_2);    //offsets(3,4,5,6)
-
-            v_deltaPos = v_ipAngle = 21;
-            break;
-
-        case 26:
-            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
-
-            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
-
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 8;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
-
-            row21 = row12;
-
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 16;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row22 = extend_low(tmp16_2);    //offsets(2,3,4,5)
-
-            row31 = row22;
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 24;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row32 = extend_low(tmp16_2);    //offsets(3,4,5,6)
-
-            row41 = row32;
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq = tmp2uq >> 32;
-            tmp16_2 = reinterpret_i(tmp2uq);
-            row42 = extend_low(tmp16_2);    //offsets(4,5,6,7)
-
-            v_deltaPos = v_ipAngle = 26;
-            break;
-
-        case 32:
-            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 2);
-            store_partial(const_int(4), pDst, tmp16_1);
-            tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain + 3);
-            store_partial(const_int(4), pDst + dstStride, tmp16_2);
-            tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain + 4);
-            store_partial(const_int(4), pDst + 2 * dstStride, tmp16_2);
-            tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain + 5);
-            store_partial(const_int(4), pDst + 3 * dstStride, tmp16_2);
-            return;
-        }
-
-        //row1
-        v_deltaFract = v_deltaPos & thirty1;
-        row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) + 16) >> 5;
-
-        //row2
-        v_deltaPos += v_ipAngle;
-        v_deltaFract = v_deltaPos & thirty1;
-        row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) + 16) >> 5;
-
-        //row3
-        v_deltaPos += v_ipAngle;
-        v_deltaFract = v_deltaPos & thirty1;
-        row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) + 16) >> 5;
-
-        //row4
-        v_deltaPos += v_ipAngle;
-        v_deltaFract = v_deltaPos & thirty1;
-        row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) + 16) >> 5;
-
-        // Flip the block
-
-        if (modeHor)
-        {
-            Vec8s tmp1, tmp2, tmp3, tmp4;
-
-            tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
-            tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
-
-            tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
-            tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
-
-            tmp16_1 = compress_unsafe(tmp3, tmp3);
-            store_partial(const_int(4), pDst, tmp16_1);
-
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq >>= 32;
-            store_partial(const_int(4), pDst + dstStride, tmp2uq);
-
-            tmp16_1 = compress_unsafe(tmp4, tmp4);
-            store_partial(const_int(4), pDst + (2 * dstStride), tmp16_1);
-
-            tmp2uq = reinterpret_i(tmp16_1);
-            tmp2uq >>= 32;
-            store_partial(const_int(4), pDst + (3 * dstStride), tmp2uq);
-        }
-        else
-        {
-            store_partial(const_int(4), pDst, compress_unsafe(row11, row11));
-            store_partial(const_int(4), pDst + (dstStride), compress_unsafe(row21, row21));
-            store_partial(const_int(4), pDst + (2 * dstStride), compress_unsafe(row31, row31));
-            store_partial(const_int(4), pDst + (3 * dstStride), compress_unsafe(row41, row41));
-        }
-    }
-}
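-
-// Why the per-angle switch above is exact (illustrative): for a vertical
-// mode, row k (k = 1..4) reads refMain + 1 + deltaInt with
-//   deltaInt   = (k * intraPredAngle) >> 5
-//   deltaFract = (k * intraPredAngle) & 31
-// so the integer source offsets are known per angle at compile time.  For
-// example, intraPredAngle == -13 gives deltaInt = -1, -1, -2, -2 for rows
-// 1..4, which is exactly the row11/21/31/41 source pattern in 'case -13'.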
-
-#endif /* if HIGH_BIT_DEPTH */
-
-#if HIGH_BIT_DEPTH
-#else
-#define PREDANG_CALCROW_VER(X) { \
-        LOADROW(row11, GETAP(lookIdx, X)); \
-        LOADROW(row12, GETAP(lookIdx, X) + 1); \
-        CALCROW(row11, row11, row12); \
-        store_partial(const_int(8), pDst + (X * dstStride), compress(row11, row11)); \
-}
-
-#define PREDANG_CALCROW_HOR(X, rowx) { \
-        LOADROW(row11, GETAP(lookIdx, X)); \
-        LOADROW(row12, GETAP(lookIdx, X) + 1); \
-        CALCROW(rowx, row11, row12); \
-}
-
-// ROW is a Vec8s variable, X is the index of the data to be loaded
-#define LOADROW(ROW, X) { \
-        tmp = load_partial(const_int(8), refMain + 1 + X); \
-        ROW = extend_low(tmp); \
-}
-
-#define CALCROW(RES, ROW1, ROW2) { \
-        v_deltaPos += v_ipAngle; \
-        v_deltaFract = v_deltaPos & thirty1; \
-        RES = ((thirty2 - v_deltaFract) * ROW1 + (v_deltaFract * ROW2) + 16) >> 5; \
-}
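-
-// The macros above form a fixed-point two-tap filter: v_deltaPos accumulates
-// the angle in 1/32-pel units, v_deltaFract keeps its fractional 5 bits, and
-// each output row blends two reference rows with rounding.  Scalar sketch of
-// one sample (GETAP is assumed to look up the per-row integer offset from a
-// precomputed angle table; its definition is not part of this hunk):
-//   deltaPos += intraPredAngle;
-//   f         = deltaPos & 31;
-//   dst[x]    = (pixel)(((32 - f) * refMain[base + x]
-//                      + f * refMain[base + x + 1] + 16) >> 5);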
-
-void xPredIntraAng8x8(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove, bool bFilter = true)
-{
-    int k;
-    int blkSize        = width;
-
-    // Map the mode index to main prediction direction and angle
-    assert(dirMode > 1); //no planar and dc
-    bool modeHor       = (dirMode < 18);
-    bool modeVer       = !modeHor;
-    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
-    int lookIdx = intraPredAngle;
-    int absAng         = abs(intraPredAngle);
-    int signAng        = intraPredAngle < 0 ? -1 : 1;
-
-    // Set bitshifts and scale the angle parameter to block size
-    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
-    int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
-    int invAngle       = invAngTable[absAng];
-    absAng             = angTable[absAng];
-    intraPredAngle     = signAng * absAng;
-
-    // Do angular predictions
-
-    pixel* refMain;
-    pixel* refSide;
-
-    // Initialise the Main and Left reference array.
-    if (intraPredAngle < 0)
-    {
-        refMain = (modeVer ? refAbove : refLeft);     // + (blkSize - 1);
-        refSide = (modeVer ? refLeft : refAbove);     // + (blkSize - 1);
-
-        // Extend the Main reference to the left.
-        int invAngleSum    = 128;     // rounding for (shift by 8)
-        for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
-        {
-            invAngleSum += invAngle;
-            refMain[k] = refSide[invAngleSum >> 8];
-        }
-    }
-    else
-    {
-        refMain = modeVer ? refAbove : refLeft;
-        refSide = modeVer ? refLeft  : refAbove;
-    }
-
-    // bfilter will always be true for blocksize 8
-    if (intraPredAngle == 0)  // Exactly horizontal/vertical angles
-    {
-        if (modeHor)
-        {
-            Vec16uc v_temp;
-            Vec16uc tmp1;
-
-            v_temp.load(refMain + 1);
-            Vec8s v_main;
-            v_main = extend_low(v_temp);
-
-            if (bFilter)
-            {
-                Vec8s v_side_0(refSide[0]); // refSide[0] value in a vector
-                Vec16uc v_temp16;
-                v_temp16.load(refSide + 1);
-                Vec8s v_side;
-                v_side = extend_low(v_temp16);
-
-                Vec8s row0;
-                row0 = permute8s<0, 0, 0, 0, 0, 0, 0, 0>(v_main);
-                v_side -= v_side_0;
-                v_side = v_side >> 1;
-                row0 = row0 + v_side;
-                row0 = min(max(0, row0), (1 << bitDepth) - 1);
-
-                tmp1 = compress(row0, row0);
-                store_partial(const_int(8), pDst, tmp1);            //row0
-            }
-            else
-            {
-                tmp1 = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
-                store_partial(const_int(8), pDst, tmp1); //row0
-            }
-            tmp1 = permute16uc<1, 1, 1, 1, 1, 1, 1, 1, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
-            store_partial(const_int(8), pDst + (1 * dstStride), tmp1); //row1
-
-            tmp1 = permute16uc<2, 2, 2, 2, 2, 2, 2, 2, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
-            store_partial(const_int(8), pDst + (2 * dstStride), tmp1); //row2
-
-            tmp1 = permute16uc<3, 3, 3, 3, 3, 3, 3, 3, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
-            store_partial(const_int(8), pDst + (3 * dstStride), tmp1); //row3
-
-            tmp1 = permute16uc<4, 4, 4, 4, 4, 4, 4, 4, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
-            store_partial(const_int(8), pDst + (4 * dstStride), tmp1); //row4
-
-            tmp1 = permute16uc<5, 5, 5, 5, 5, 5, 5, 5, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
-            store_partial(const_int(8), pDst + (5 * dstStride), tmp1); //row5
-
-            tmp1 = permute16uc<6, 6, 6, 6, 6, 6, 6, 6, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
-            store_partial(const_int(8), pDst + (6 * dstStride), tmp1); //row6
-
-            tmp1 = permute16uc<7, 7, 7, 7, 7, 7, 7, 7, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
-            store_partial(const_int(8), pDst + (7 * dstStride), tmp1); //row7
-        }
-        else
-        {
-            Vec16uc v_main;
-            v_main = load_partial(const_int(8), refMain + 1);
-            store_partial(const_int(8), pDst, v_main);
-            store_partial(const_int(8), pDst + dstStride, v_main);
-            store_partial(const_int(8), pDst + (2 * dstStride), v_main);
-            store_partial(const_int(8), pDst + (3 * dstStride), v_main);
-            store_partial(const_int(8), pDst + (4 * dstStride), v_main);
-            store_partial(const_int(8), pDst + (5 * dstStride), v_main);
-            store_partial(const_int(8), pDst + (6 * dstStride), v_main);
-            store_partial(const_int(8), pDst + (7 * dstStride), v_main);
-
-            if (bFilter)
-            {
-                Vec16uc v_temp;
-                Vec8s v_side_0(refSide[0]); // refSide[0] value in a vector
-
-                v_temp.load(refSide + 1);
-                Vec8s v_side;
-                v_side = extend_low(v_temp);
-
-                v_temp.load(refMain + 1);
-                Vec8s row0;
-                row0 = permute16uc<0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1>(v_temp);
-                v_side -= v_side_0;
-                v_side = v_side >> 1;
-                row0 = row0 + v_side;
-                row0 = min(max(0, row0), (1 << bitDepth) - 1);
-
-                pDst[0 * dstStride] = row0[0];
-                pDst[1 * dstStride] = row0[1];
-                pDst[2 * dstStride] = row0[2];
-                pDst[3 * dstStride] = row0[3];
-                pDst[4 * dstStride] = row0[4];
-                pDst[5 * dstStride] = row0[5];
-                pDst[6 * dstStride] = row0[6];
-                pDst[7 * dstStride] = row0[7];
-            }
-        }
-    }
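In the exactly horizontal/vertical case above, the bFilter path applies the HEVC boundary smoothing: the first predicted row (modeHor) or column (modeVer) is nudged by half the side-reference gradient, then clamped. What the modeVer branch stores down column 0, in scalar form:

    pDst[y * dstStride] = Clip3(0, (1 << bitDepth) - 1,
                                refMain[1] + ((refSide[y + 1] - refSide[0]) >> 1));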
-    else if (intraPredAngle == -32)
-    {
-        Vec16uc tmp;
-        tmp = load_partial(const_int(8), refMain);        // refMain[0..7]
-        store_partial(const_int(8), pDst, tmp);
-        tmp = load_partial(const_int(8), refMain - 1);     // refMain[-1..6]
-        store_partial(const_int(8), pDst + dstStride, tmp);
-        tmp = load_partial(const_int(8), refMain - 2);
-        store_partial(const_int(8), pDst + 2 * dstStride, tmp);
-        tmp = load_partial(const_int(8), refMain - 3);
-        store_partial(const_int(8), pDst + 3 * dstStride, tmp);
-        tmp = load_partial(const_int(8), refMain - 4);
-        store_partial(const_int(8), pDst + 4 * dstStride, tmp);
-        tmp = load_partial(const_int(8), refMain - 5);
-        store_partial(const_int(8), pDst + 5 * dstStride, tmp);
-        tmp = load_partial(const_int(8), refMain - 6);
-        store_partial(const_int(8), pDst + 6 * dstStride, tmp);
-        tmp = load_partial(const_int(8), refMain - 7);
-        store_partial(const_int(8), pDst + 7 * dstStride, tmp);
-        return;
-    }
-    else if (intraPredAngle == 32)
-    {
-        Vec16uc tmp;
-        tmp = load_partial(const_int(8), refMain + 2);        // refMain[2..9]
-        store_partial(const_int(8), pDst, tmp);
-        tmp = load_partial(const_int(8), refMain + 3);     // refMain[3..10]
-        store_partial(const_int(8), pDst + dstStride, tmp);
-        tmp = load_partial(const_int(8), refMain + 4);
-        store_partial(const_int(8), pDst + 2 * dstStride, tmp);
-        tmp = load_partial(const_int(8), refMain + 5);
-        store_partial(const_int(8), pDst + 3 * dstStride, tmp);
-        tmp = load_partial(const_int(8), refMain + 6);
-        store_partial(const_int(8), pDst + 4 * dstStride, tmp);
-        tmp = load_partial(const_int(8), refMain + 7);
-        store_partial(const_int(8), pDst + 5 * dstStride, tmp);
-        tmp = load_partial(const_int(8), refMain + 8);
-        store_partial(const_int(8), pDst + 6 * dstStride, tmp);
-        tmp = load_partial(const_int(8), refMain + 9);
-        store_partial(const_int(8), pDst + 7 * dstStride, tmp);
-        return;
-    }
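Both diagonal fast paths are pure copies: at an angle of +/-32 every row advances exactly one whole sample, so the interpolation weights are always 32 and 0. A scalar sketch of the +32 branch (the -32 branch walks the other way after the reference extension):

    for (int y = 0; y < blkSize; y++)
        for (int x = 0; x < blkSize; x++)
            pDst[y * dstStride + x] = refMain[x + y + 2];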
-    else
-    {
-        if (modeHor)         // Near horizontal modes
-        {
-            Vec16uc tmp;
-            Vec8s row11, row12;
-            Vec16uc row1, row2, row3, row4, tmp16_1, tmp16_2;
-            Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
-            Vec8s tmp1, tmp2;
-            v_deltaPos = 0;
-            v_ipAngle = intraPredAngle;
-            switch (intraPredAngle)
-            {
-            case -5:
-                LOADROW(row11, -1);
-                LOADROW(row12, 0);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                row1 = compress(tmp1, tmp2);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                row2 = compress(tmp1, tmp2);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                row3 = compress(tmp1, tmp2);
-                row12 = row11;
-                LOADROW(row11, -2);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                row4 = compress(tmp1, tmp2);
-                break;
-
-            case -2:
-                LOADROW(row11, -1);
-                LOADROW(row12, 0);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                row1 = compress(tmp1, tmp2);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                row2 = compress(tmp1, tmp2);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                row3 = compress(tmp1, tmp2);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                row4 = compress(tmp1, tmp2);
-                break;
-
-            case 2:
-                LOADROW(row11, 0);
-                LOADROW(row12, 1);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                row1 = compress(tmp1, tmp2);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                row2 = compress(tmp1, tmp2);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                row3 = compress(tmp1, tmp2);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                row4 = compress(tmp1, tmp2);
-                break;
-
-            case 5:
-                LOADROW(row11, 0);
-                LOADROW(row12, 1);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                row1 = compress(tmp1, tmp2);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                row2 = compress(tmp1, tmp2);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                row3 = compress(tmp1, tmp2);
-                row11 = row12;
-                LOADROW(row12, 2);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                row4 = compress(tmp1, tmp2);
-                break;
-
-            default:               // these cases use the lookup table to identify access patterns
-
-                PREDANG_CALCROW_HOR(0, tmp1);
-                PREDANG_CALCROW_HOR(1, tmp2);
-                row1 = compress(tmp1, tmp2);
-                PREDANG_CALCROW_HOR(2, tmp1);
-                PREDANG_CALCROW_HOR(3, tmp2);
-                row2 = compress(tmp1, tmp2);
-                PREDANG_CALCROW_HOR(4, tmp1);
-                PREDANG_CALCROW_HOR(5, tmp2);
-                row3 = compress(tmp1, tmp2);
-                PREDANG_CALCROW_HOR(6, tmp1);
-                PREDANG_CALCROW_HOR(7, tmp2);
-                row4 = compress(tmp1, tmp2);
-            }
-
-            // Flip the block
-            tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row1, row2);
-            tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row1, row2);
-            row1 = tmp16_1;
-            row2 = tmp16_2;
-
-            tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row3, row4);
-            tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row3, row4);
-            row3 = tmp16_1;
-            row4 = tmp16_2;
-
-            tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row1, row2);
-            tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row1, row2);
-            row1 = tmp16_1;
-            row2 = tmp16_2;
-
-            tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row3, row4);
-            tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row3, row4);
-            row3 = tmp16_1;
-            row4 = tmp16_2;
-
-            tmp16_1 = blend4i<0, 4, 1, 5>((Vec4i)row1, (Vec4i)row3);
-            tmp16_2 = blend4i<2, 6, 3, 7>((Vec4i)row1, (Vec4i)row3);
-            row1 = tmp16_1;
-            row3 = tmp16_2;
-
-            tmp16_1 = blend4i<0, 4, 1, 5>((Vec4i)row2, (Vec4i)row4);
-            tmp16_2 = blend4i<2, 6, 3, 7>((Vec4i)row2, (Vec4i)row4);
-            row2 = tmp16_1;
-            row4 = tmp16_2;
-
-            store_partial(const_int(8), pDst, row1);   //row1
-            store_partial(const_int(8), pDst + (2 * dstStride), row3);   //row3
-            store_partial(const_int(8), pDst + (4 * dstStride), row2);   //row5
-            store_partial(const_int(8), pDst + (6 * dstStride), row4);   //row7
-
-            row1 = blend2q<1, 3>((Vec2q)row1, (Vec2q)row1);
-            store_partial(const_int(8), pDst + (1 * dstStride), row1);   //row2
-
-            row1 = blend2q<1, 3>((Vec2q)row3, (Vec2q)row3);
-            store_partial(const_int(8), pDst + (3 * dstStride), row1);   //row4
-
-            row1 = blend2q<1, 3>((Vec2q)row2, (Vec2q)row2);
-            store_partial(const_int(8), pDst + (5 * dstStride), row1);   //row6
-
-            row1 = blend2q<1, 3>((Vec2q)row4, (Vec2q)row4);
-            store_partial(const_int(8), pDst + (7 * dstStride), row1);   //row8
-        }
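Horizontal modes are computed along the main (left) reference, so the eight rows come out transposed; the blend16uc/blend4i/blend2q ladder above is an in-register 8x8 byte transpose ahead of the stores. Its scalar effect, with predT a hypothetical buffer holding the un-flipped result:

    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            pDst[y * dstStride + x] = predT[x * 8 + y];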
-        else                         // Vertical modes
-        {
-            Vec8s row11, row12;
-            Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
-            Vec16uc tmp;
-            Vec8s tmp1, tmp2;
-            v_deltaPos = 0;
-            v_ipAngle = intraPredAngle;
-            switch (intraPredAngle)
-            {
-            case -5:
-                LOADROW(row11, -1);
-                LOADROW(row12, 0);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                store_partial(const_int(8), pDst, compress(tmp1, tmp1));
-                store_partial(const_int(8), pDst + (dstStride), compress(tmp2, tmp2));
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                store_partial(const_int(8), pDst + (2 * dstStride), compress(tmp1, tmp1));
-                store_partial(const_int(8), pDst + (3 * dstStride), compress(tmp2, tmp2));
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                store_partial(const_int(8), pDst + (4 * dstStride), compress(tmp1, tmp1));
-                store_partial(const_int(8), pDst + (5 * dstStride), compress(tmp2, tmp2));
-                row12 = row11;
-                LOADROW(row11, -2);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                store_partial(const_int(8), pDst + (6 * dstStride), compress(tmp1, tmp1));
-                store_partial(const_int(8), pDst + (7 * dstStride), compress(tmp2, tmp2));
-                break;
-
-            case -2:
-                LOADROW(row11, -1);
-                LOADROW(row12, 0);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                store_partial(const_int(8), pDst, compress(tmp1, tmp1));
-                store_partial(const_int(8), pDst + (dstStride), compress(tmp2, tmp2));
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                store_partial(const_int(8), pDst + (2 * dstStride), compress(tmp1, tmp1));
-                store_partial(const_int(8), pDst + (3 * dstStride), compress(tmp2, tmp2));
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                store_partial(const_int(8), pDst + (4 * dstStride), compress(tmp1, tmp1));
-                store_partial(const_int(8), pDst + (5 * dstStride), compress(tmp2, tmp2));
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                store_partial(const_int(8), pDst + (6 * dstStride), compress(tmp1, tmp1));
-                store_partial(const_int(8), pDst + (7 * dstStride), compress(tmp2, tmp2));
-                break;
-
-            case 2:
-                LOADROW(row11, 0);
-                LOADROW(row12, 1);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                store_partial(const_int(8), pDst, compress(tmp1, tmp1));
-                store_partial(const_int(8), pDst + dstStride, compress(tmp2, tmp2));
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                store_partial(const_int(8), pDst + (2 * dstStride), compress(tmp1, tmp1));
-                store_partial(const_int(8), pDst + (3 * dstStride), compress(tmp2, tmp2));
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                store_partial(const_int(8), pDst + (4 * dstStride), compress(tmp1, tmp1));
-                store_partial(const_int(8), pDst + (5 * dstStride), compress(tmp2, tmp2));
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                store_partial(const_int(8), pDst + (6 * dstStride), compress(tmp1, tmp1));
-                store_partial(const_int(8), pDst + (7 * dstStride), compress(tmp2, tmp2));
-                break;
-
-            case 5:
-                LOADROW(row11, 0);
-                LOADROW(row12, 1);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                store_partial(const_int(8), pDst, compress(tmp1, tmp1));
-                store_partial(const_int(8), pDst + dstStride, compress(tmp2, tmp2));
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                store_partial(const_int(8), pDst + (2 * dstStride), compress(tmp1, tmp1));
-                store_partial(const_int(8), pDst + (3 * dstStride), compress(tmp2, tmp2));
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                store_partial(const_int(8), pDst + (4 * dstStride), compress(tmp1, tmp1));
-                store_partial(const_int(8), pDst + (5 * dstStride), compress(tmp2, tmp2));
-                row11 = row12;
-                LOADROW(row12, 2);
-                CALCROW(tmp1, row11, row12);
-                CALCROW(tmp2, row11, row12);
-                store_partial(const_int(8), pDst + (6 * dstStride), compress(tmp1, tmp1));
-                store_partial(const_int(8), pDst + (7 * dstStride), compress(tmp2, tmp2));
-                break;
-
-            default:                   // these cases use the lookup table to identify access patterns
-                PREDANG_CALCROW_VER(0);
-                PREDANG_CALCROW_VER(1);
-                PREDANG_CALCROW_VER(2);
-                PREDANG_CALCROW_VER(3);
-                PREDANG_CALCROW_VER(4);
-                PREDANG_CALCROW_VER(5);
-                PREDANG_CALCROW_VER(6);
-                PREDANG_CALCROW_VER(7);
-            }
-        }
-    }
-}
-
-#undef PREDANG_CALCROW_VER
-#undef PREDANG_CALCROW_HOR
-#undef LOADROW
-#undef CALCROW
-#endif /* if HIGH_BIT_DEPTH */
-
-//16x16
-#if HIGH_BIT_DEPTH
-#else
-#define PREDANG_CALCROW_VER(X) { \
-        LOADROW(row11L, row11H, GETAP(lookIdx, X)); \
-        LOADROW(row12L, row12H, GETAP(lookIdx, X) + 1); \
-        CALCROW(row11L, row11H, row11L, row11H, row12L, row12H); \
-        /*compress(row11L, row11H).store(pDst + ((X)*dstStride));*/ \
-        itmp = _mm_packus_epi16(row11L, row11H); \
-        _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride)), itmp); \
-}
-
-#define PREDANG_CALCROW_HOR(X, rowx) { \
-        LOADROW(row11L, row11H, GETAP(lookIdx, (X))); \
-        LOADROW(row12L, row12H, GETAP(lookIdx, (X)) + 1); \
-        CALCROW(row11L, row11H, row11L, row11H, row12L, row12H); \
-        /*rowx = compress(row11L, row11H);*/  \
-        rowx = _mm_packus_epi16(row11L, row11H); \
-}
-
-// ROWL/H is a Vec8s variable, X is the index of the data to be loaded
-#define LOADROW(ROWL, ROWH, X) { \
-        /*tmp.load(refMain + 1 + (X)); */ \
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (X))); \
-        /* ROWL = extend_low(tmp);*/  \
-        ROWL = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
-        /*ROWH = extend_high(tmp);*/  \
-        ROWH = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-}
-
-#define CALCROW(RESL, RESH, ROW1L, ROW1H, ROW2L, ROW2H) { \
-        /*v_deltaPos += v_ipAngle; \
-        v_deltaFract = v_deltaPos & thirty1;*/ \
-        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
-        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
-        /*RESL = ((thirty2 - v_deltaFract) * ROW1L + (v_deltaFract * ROW2L) + 16) >> 5; \
-        RESH = ((thirty2 - v_deltaFract) * ROW1H + (v_deltaFract * ROW2H) + 16) >> 5;*/ \
-        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
-        it2 = _mm_mullo_epi16(it1, ROW1L); \
-        it3 = _mm_mullo_epi16(v_deltaFract, ROW2L); \
-        it2 = _mm_add_epi16(it2, it3); \
-        i16 = _mm_set1_epi16(16); \
-        it2 = _mm_add_epi16(it2, i16); \
-        RESL = _mm_srai_epi16(it2, 5); \
-        \
-        it2 = _mm_mullo_epi16(it1, ROW1H); \
-        it3 = _mm_mullo_epi16(v_deltaFract, ROW2H); \
-        it2 = _mm_add_epi16(it2, it3); \
-        it2 = _mm_add_epi16(it2, i16); \
-        RESH = _mm_srai_epi16(it2, 5); \
-}
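A note on the translation above: _mm_srai_epi16 (an arithmetic shift) can safely stand in for the scalar >> 5 because each 16-bit lane holds at most (32 - f) * 255 + f * 255 + 16 = 8176 < 2^15, so the sign bit is never set. Keeping the original vectorclass expressions as comments beside the raw SSE2 makes the mapping auditable line by line.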
-
-#define  BLND2_16(R1, R2) { \
-        /*tmp1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(R1, R2); */ \
-        itmp1 = _mm_unpacklo_epi8(R1, R2); \
-        /*tmp2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(R1, R2);*/ \
-        itmp2 = _mm_unpackhi_epi8(R1, R2); \
-        R1 = itmp1; \
-        R2 = itmp2; \
-}
-
-#define MB4(R1, R2, R3, R4) { \
-        BLND2_16(R1, R2) \
-        BLND2_16(R3, R4) \
-        /*tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>((Vec8s)R1, (Vec8s)R3);*/  \
-        itmp1 = _mm_unpacklo_epi16(R1, R3); \
-        /* tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>((Vec8s)R1, (Vec8s)R3);*/ \
-        itmp2 = _mm_unpackhi_epi16(R1, R3); \
-        R1 = itmp1; \
-        R3 = itmp2; \
-        /*R1 = tmp1; \
-        R3 = tmp2;*/ \
-        /*tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>((Vec8s)R2, (Vec8s)R4); \
-        tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>((Vec8s)R2, (Vec8s)R4);*/ \
-        itmp1 = _mm_unpacklo_epi16(R2, R4); \
-        itmp2 = _mm_unpackhi_epi16(R2, R4); \
-        R2 = itmp1; \
-        R4 = itmp2; \
-        /*R2 = tmp1; \
-        R4 = tmp2;*/ \
-}
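MB4 together with BLND2_16 is one stage of the classic unpack-based transpose. The same idea in minimal, self-contained form, for a 4x4 matrix of 32-bit lanes:

    #include <emmintrin.h>

    // Transpose four rows of four 32-bit lanes using only unpack ops.
    static inline void transpose4x4(__m128i r[4])
    {
        __m128i t0 = _mm_unpacklo_epi32(r[0], r[1]); // a0 b0 a1 b1
        __m128i t1 = _mm_unpacklo_epi32(r[2], r[3]); // c0 d0 c1 d1
        __m128i t2 = _mm_unpackhi_epi32(r[0], r[1]); // a2 b2 a3 b3
        __m128i t3 = _mm_unpackhi_epi32(r[2], r[3]); // c2 d2 c3 d3
        r[0] = _mm_unpacklo_epi64(t0, t1);           // a0 b0 c0 d0
        r[1] = _mm_unpackhi_epi64(t0, t1);           // a1 b1 c1 d1
        r[2] = _mm_unpacklo_epi64(t2, t3);           // a2 b2 c2 d2
        r[3] = _mm_unpackhi_epi64(t2, t3);           // a3 b3 c3 d3
    }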
-
-#define BLND2_4(R1, R2) { \
-        /* tmp1 = blend4i<0, 4, 1, 5>((Vec4i)R1, (Vec4i)R2); \
-        tmp2 = blend4i<2, 6, 3, 7>((Vec4i)R1, (Vec4i)R2); */ \
-        itmp1 = _mm_unpacklo_epi32(R1, R2); \
-        itmp2 = _mm_unpackhi_epi32(R1, R2); \
-        R1 = itmp1; \
-        R2 = itmp2; \
-        /*R1 = tmp1; \
-        R2 = tmp2; */\
-}
-
-#define BLND2_2(R1, R2) { \
-        /*tmp1 = blend2q<0, 2>((Vec2q)R1, (Vec2q)R2); \
-        tmp2 = blend2q<1, 3>((Vec2q)R1, (Vec2q)R2);*/ \
-        itmp1 = _mm_unpacklo_epi64(R1, R2); \
-        itmp2 = _mm_unpackhi_epi64(R1, R2); \
-        /*tmp1.store(pDst); */ \
-        _mm_storeu_si128((__m128i*)pDst, itmp1); \
-        pDst += dstStride; \
-        /*tmp2.store(pDst);*/ \
-        _mm_storeu_si128((__m128i*)pDst, itmp2); \
-        pDst += dstStride; \
-}
-
-#define CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, X) { \
-        PREDANG_CALCROW_HOR(0 + X, R1) \
-        PREDANG_CALCROW_HOR(1 + X, R2) \
-        PREDANG_CALCROW_HOR(2 + X, R3) \
-        PREDANG_CALCROW_HOR(3 + X, R4) \
-        PREDANG_CALCROW_HOR(4 + X, R5) \
-        PREDANG_CALCROW_HOR(5 + X, R6) \
-        PREDANG_CALCROW_HOR(6 + X, R7) \
-        PREDANG_CALCROW_HOR(7 + X, R8) \
-        MB4(R1, R2, R3, R4) \
-        MB4(R5, R6, R7, R8) \
-        BLND2_4(R1, R5); \
-        BLND2_4(R2, R6); \
-        BLND2_4(R3, R7); \
-        BLND2_4(R4, R8); \
-}
-
-void xPredIntraAng16x16(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove, bool bFilter = true)
-{
-    int k;
-    int blkSize        = width;
-
-    // Map the mode index to main prediction direction and angle
-    assert(dirMode > 1); // neither planar nor DC
-    bool modeHor       = (dirMode < 18);
-    bool modeVer       = !modeHor;
-    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
-    int lookIdx = intraPredAngle;
-    int absAng         = abs(intraPredAngle);
-    int signAng        = intraPredAngle < 0 ? -1 : 1;
-
-    // Look up the scaled angle parameter and its inverse
-    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
-    int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
-    int invAngle       = invAngTable[absAng];
-    absAng             = angTable[absAng];
-    intraPredAngle     = signAng * absAng;
-
-    // Do angular predictions
-
-    pixel* refMain;
-    pixel* refSide;
-
-    // Initialise the main and side reference arrays.
-    if (intraPredAngle < 0)
-    {
-        refMain = (modeVer ? refAbove : refLeft);     // + (blkSize - 1);
-        refSide = (modeVer ? refLeft : refAbove);     // + (blkSize - 1);
-
-        // Extend the Main reference to the left.
-        int invAngleSum    = 128;     // rounding offset for the >> 8 below
-        if (intraPredAngle != -32)
-            for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
-            {
-                invAngleSum += invAngle;
-                refMain[k] = refSide[invAngleSum >> 8];
-            }
-    }
-    else
-    {
-        refMain = modeVer ? refAbove : refLeft;
-        refSide = modeVer ? refLeft  : refAbove;
-    }
-
-    // bFilter will always be true for blocksize 16
-    if (intraPredAngle == 0)  // Exactly horizontal/vertical angles
-    {
-        if (modeHor)
-        {
-            Vec16uc v_temp;
-            Vec16uc tmp1;
-            v_temp.load(refMain + 1);
-
-            if (bFilter)
-            {
-                Vec8s v_side_0(refSide[0]); // refSide[0] value in a vector
-                Vec16uc v_temp16;
-                v_temp16.load(refSide + 1);
-                Vec8s v_side;
-                v_side = extend_low(v_temp16);
-
-                Vec8s row01, row02, ref(refMain[1]);
-                v_side -= v_side_0;
-                v_side = v_side >> 1;
-                row01 = ref + v_side;
-                row01 = min(max(0, row01), (1 << bitDepth) - 1);
-
-                v_side = extend_high(v_temp16);
-                v_side -= v_side_0;
-                v_side = v_side >> 1;
-                row02 = ref + v_side;
-                row02 = min(max(0, row02), (1 << bitDepth) - 1);
-
-                tmp1 = compress_unsafe(row01, row02);
-                tmp1.store(pDst);            //row0
-            }
-            else
-            {
-                tmp1 = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(v_temp);
-                tmp1.store(pDst); //row0
-            }
-
-            tmp1 = permute16uc<1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>(v_temp);
-            tmp1.store(pDst + (1 * dstStride)); //row1
-
-            tmp1 = permute16uc<2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2>(v_temp);
-            tmp1.store(pDst + (2 * dstStride)); //row2
-
-            tmp1 = permute16uc<3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3>(v_temp);
-            tmp1.store(pDst + (3 * dstStride)); //row3
-
-            tmp1 = permute16uc<4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4>(v_temp);
-            tmp1.store(pDst + (4 * dstStride)); //row4
-
-            tmp1 = permute16uc<5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5>(v_temp);
-            tmp1.store(pDst + (5 * dstStride)); //row5
-
-            tmp1 = permute16uc<6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6>(v_temp);
-            tmp1.store(pDst + (6 * dstStride)); //row6
-
-            tmp1 = permute16uc<7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7>(v_temp);
-            tmp1.store(pDst + (7 * dstStride)); //row7
-
-            tmp1 = permute16uc<8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8>(v_temp);
-            tmp1.store(pDst + (8 * dstStride)); //row8
-
-            tmp1 = permute16uc<9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9>(v_temp);
-            tmp1.store(pDst + (9 * dstStride)); //row9
-
-            tmp1 = permute16uc<10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10>(v_temp);
-            tmp1.store(pDst + (10 * dstStride)); //row10
-
-            tmp1 = permute16uc<11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11>(v_temp);
-            tmp1.store(pDst + (11 * dstStride)); //row11
-
-            tmp1 = permute16uc<12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12>(v_temp);
-            tmp1.store(pDst + (12 * dstStride)); //row12
-
-            tmp1 = permute16uc<13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13>(v_temp);
-            tmp1.store(pDst + (13 * dstStride)); //row13
-
-            tmp1 = permute16uc<14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14>(v_temp);
-            tmp1.store(pDst + (14 * dstStride)); //row14
-
-            tmp1 = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>(v_temp);
-            tmp1.store(pDst + (15 * dstStride)); //row15
-        }
-        else
-        {
-            Vec16uc v_main;
-//            v_main.load(refMain + 1);
-            v_main = _mm_loadu_si128((__m128i const*)(refMain + 1));
-
-            _mm_storeu_si128((__m128i*)pDst, v_main);
-            _mm_storeu_si128((__m128i*)(pDst + dstStride), v_main);
-            _mm_storeu_si128((__m128i*)(pDst + (2 * dstStride)), v_main);
-            _mm_storeu_si128((__m128i*)(pDst + (3 * dstStride)), v_main);
-            _mm_storeu_si128((__m128i*)(pDst + (4 * dstStride)), v_main);
-            _mm_storeu_si128((__m128i*)(pDst + (5 * dstStride)), v_main);
-            _mm_storeu_si128((__m128i*)(pDst + (6 * dstStride)), v_main);
-            _mm_storeu_si128((__m128i*)(pDst + (7 * dstStride)), v_main);
-            _mm_storeu_si128((__m128i*)(pDst + (8 * dstStride)), v_main);
-            _mm_storeu_si128((__m128i*)(pDst + (9 * dstStride)), v_main);
-            _mm_storeu_si128((__m128i*)(pDst + (10 * dstStride)), v_main);
-            _mm_storeu_si128((__m128i*)(pDst + (11 * dstStride)), v_main);
-            _mm_storeu_si128((__m128i*)(pDst + (12 * dstStride)), v_main);
-            _mm_storeu_si128((__m128i*)(pDst + (13 * dstStride)), v_main);
-            _mm_storeu_si128((__m128i*)(pDst + (14 * dstStride)), v_main);
-            _mm_storeu_si128((__m128i*)(pDst + (15 * dstStride)), v_main);
-
-            if (bFilter)
-            {
-                Vec16uc v_temp;
-                Vec8s v_side_0(refSide[0]); // refSide[0] value in a vector
-
-                v_temp.load(refSide + 1);
-                Vec8s v_side;
-                v_side = extend_low(v_temp);
-
-                Vec8s row0, ref(refMain[1]);
-                v_side -= v_side_0;
-                v_side = v_side >> 1;
-                row0 = ref + v_side;
-                row0 = min(max(0, row0), (1 << bitDepth) - 1);
-
-                pDst[0 * dstStride] = row0[0];
-                pDst[1 * dstStride] = row0[1];
-                pDst[2 * dstStride] = row0[2];
-                pDst[3 * dstStride] = row0[3];
-                pDst[4 * dstStride] = row0[4];
-                pDst[5 * dstStride] = row0[5];
-                pDst[6 * dstStride] = row0[6];
-                pDst[7 * dstStride] = row0[7];
-
-                v_side = extend_high(v_temp);
-                v_side -= v_side_0;
-                v_side = v_side >> 1;
-                row0 = ref + v_side;
-                row0 = min(max(0, row0), (1 << bitDepth) - 1);
-                pDst[8 * dstStride] = row0[0];
-                pDst[9 * dstStride] = row0[1];
-                pDst[10 * dstStride] = row0[2];
-                pDst[11 * dstStride] = row0[3];
-                pDst[12 * dstStride] = row0[4];
-                pDst[13 * dstStride] = row0[5];
-                pDst[14 * dstStride] = row0[6];
-                pDst[15 * dstStride] = row0[7];
-            }
-        }
-    }
-    else if (intraPredAngle == -32)
-    {
-        Vec16uc v_refSide;
-        v_refSide.load(refSide);
-        v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
-        pixel refMain0 = refMain[0];
-
-        v_refSide.store(refMain - 15);
-        refMain[0] = refMain0;
-
-        Vec16uc tmp;
-        __m128i itmp;
-//        tmp.load(refMain);        //-1,0,1,2
-//        tmp.store(pDst);
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-
-        itmp = _mm_loadu_si128((__m128i const*)--refMain);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)--refMain);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)--refMain);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)--refMain);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)--refMain);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)--refMain);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)--refMain);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)--refMain);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)--refMain);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)--refMain);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)--refMain);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)--refMain);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)--refMain);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)--refMain);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)--refMain);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-
-/*
-        tmp.load(--refMain);
-        pDst += dstStride;
-        tmp.store(pDst);
-        ... 14 times more
-*/
-        return;
-    }
-    else if (intraPredAngle == 32)
-    {
-        Vec8s tmp;
-        __m128i itmp;
-        refMain += 2;
-
-//        tmp.load(refMain++);
-//        tmp.store(pDst);
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain++);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-
-/*
-        tmp.load(refMain++);
-        pDst += dstStride;
-        tmp.store(pDst);
-        ... 14 times more
-*/
-        itmp = _mm_loadu_si128((__m128i const*)refMain++);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)refMain++);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)refMain++);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)refMain++);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)refMain++);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)refMain++);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)refMain++);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)refMain++);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)refMain++);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)refMain++);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)refMain++);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)refMain++);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)refMain++);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)refMain++);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        itmp = _mm_loadu_si128((__m128i const*)refMain++);
-        pDst += dstStride;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-
-        return;
-    }
-    else
-    {
-        if (modeHor)
-        {
-            Vec8s row11L, row12L, row11H, row12H;
-            Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
-            Vec16uc tmp;
-            Vec16uc R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16;
-            Vec16uc tmp1, tmp2;
-            v_deltaPos = 0;
-            v_ipAngle = intraPredAngle;
-            __m128i itmp, itmp1, itmp2, it1, it2, it3, i16;
-//            MB16;
-            CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
-            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
-            BLND2_2(R1, R9)
-            BLND2_2(R5, R13)
-            BLND2_2(R3, R11)
-            BLND2_2(R7, R15)
-            BLND2_2(R2, R10)
-            BLND2_2(R6, R14)
-            BLND2_2(R4, R12)
-            BLND2_2(R8, R16)
-        }
-        else
-        {
-            Vec8s row11L, row12L, row11H, row12H;
-            Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
-            Vec16uc tmp;
-            Vec8s tmp1, tmp2;
-            v_deltaPos = 0;
-            v_ipAngle = intraPredAngle;
-            __m128i itmp, it1, it2, it3, i16;
-
-            PREDANG_CALCROW_VER(0);
-            PREDANG_CALCROW_VER(1);
-            PREDANG_CALCROW_VER(2);
-            PREDANG_CALCROW_VER(3);
-            PREDANG_CALCROW_VER(4);
-            PREDANG_CALCROW_VER(5);
-            PREDANG_CALCROW_VER(6);
-            PREDANG_CALCROW_VER(7);
-            PREDANG_CALCROW_VER(8);
-            PREDANG_CALCROW_VER(9);
-            PREDANG_CALCROW_VER(10);
-            PREDANG_CALCROW_VER(11);
-            PREDANG_CALCROW_VER(12);
-            PREDANG_CALCROW_VER(13);
-            PREDANG_CALCROW_VER(14);
-            PREDANG_CALCROW_VER(15);
-        }
-    }
-}
-
-#undef PREDANG_CALCROW_VER
-#undef PREDANG_CALCROW_HOR
-#undef LOADROW
-#undef CALCROW
-#undef BLND2_16
-#undef BLND2_2
-#undef BLND2_4
-#undef MB4
-#undef CALC_BLND_8ROWS
-#endif /* if HIGH_BIT_DEPTH */
-
-//32x32
-#if HIGH_BIT_DEPTH
-#else
-#define PREDANG_CALCROW_VER(X) { \
-        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
-        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)]))); \
-        row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
-        row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-  \
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 1))); \
-        row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
-        row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-  \
-        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
-        it2 = _mm_mullo_epi16(it1, row11L); \
-        it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
-        it2 = _mm_add_epi16(it2, it3); \
-        i16 = _mm_set1_epi16(16); \
-        it2 = _mm_add_epi16(it2, i16); \
-        row11L = _mm_srai_epi16(it2, 5); \
-        it2 = _mm_mullo_epi16(it1, row11H); \
-        it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
-        it2 = _mm_add_epi16(it2, it3); \
-        it2 = _mm_add_epi16(it2, i16); \
-        row11H = _mm_srai_epi16(it2, 5); \
-  \
-        itmp = _mm_packus_epi16(row11L, row11H); \
-        _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride)), itmp); \
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 16))); \
-        row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
-        row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-  \
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 17))); \
-        row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
-        row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-  \
-        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
-        it2 = _mm_mullo_epi16(it1, row11L); \
-        it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
-        it2 = _mm_add_epi16(it2, it3); \
-        i16 = _mm_set1_epi16(16); \
-        it2 = _mm_add_epi16(it2, i16); \
-        row11L = _mm_srai_epi16(it2, 5); \
-        it2 = _mm_mullo_epi16(it1, row11H); \
-        it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
-        it2 = _mm_add_epi16(it2, it3); \
-        it2 = _mm_add_epi16(it2, i16); \
-        row11H = _mm_srai_epi16(it2, 5); \
-  \
-        itmp = _mm_packus_epi16(row11L, row11H); \
-        _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride) + 16), itmp); \
-}
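The 32x32 vertical macro above runs the same 16-lane interpolation twice per row, at column offsets 0 and +16, finishing each row with two unaligned 16-byte stores. Condensed into a hypothetical helper (assuming 8-bit pixels; w is deltaFract broadcast to all lanes, w32 is 32 - deltaFract):

    #include <emmintrin.h>
    #include <stdint.h>

    // One 16-pixel step of the angular interpolation in the macro body.
    static inline __m128i interpRow16(const uint8_t* src, __m128i w, __m128i w32)
    {
        __m128i zero = _mm_setzero_si128();
        __m128i a = _mm_loadu_si128((const __m128i*)src);       // samples i
        __m128i b = _mm_loadu_si128((const __m128i*)(src + 1)); // samples i+1
        __m128i aL = _mm_unpacklo_epi8(a, zero), aH = _mm_unpackhi_epi8(a, zero);
        __m128i bL = _mm_unpacklo_epi8(b, zero), bH = _mm_unpackhi_epi8(b, zero);
        __m128i rnd = _mm_set1_epi16(16);
        __m128i lo = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(
            _mm_mullo_epi16(w32, aL), _mm_mullo_epi16(w, bL)), rnd), 5);
        __m128i hi = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(
            _mm_mullo_epi16(w32, aH), _mm_mullo_epi16(w, bH)), rnd), 5);
        return _mm_packus_epi16(lo, hi);                        // clamp and pack
    }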
-
-#define PREDANG_CALCROW_VER_MODE2(X) { \
-        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
-        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
-        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
-        it2 = _mm_mullo_epi16(it1, row11); \
-        it3 = _mm_mullo_epi16(v_deltaFract, row21); \
-        it2 = _mm_add_epi16(it2, it3); \
-        i16 = _mm_set1_epi16(16); \
-        it2 = _mm_add_epi16(it2, i16); \
-        res1 = _mm_srai_epi16(it2, 5); \
-        it2 = _mm_mullo_epi16(it1, row12); \
-        it3 = _mm_mullo_epi16(v_deltaFract, row22); \
-        it2 = _mm_add_epi16(it2, it3); \
-        it2 = _mm_add_epi16(it2, i16); \
-        res2 = _mm_srai_epi16(it2, 5); \
-  \
-        itmp = _mm_packus_epi16(res1, res2); \
-        _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride)), itmp); \
-        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
-        it2 = _mm_mullo_epi16(it1, row13); \
-        it3 = _mm_mullo_epi16(v_deltaFract, row23); \
-        it2 = _mm_add_epi16(it2, it3); \
-        i16 = _mm_set1_epi16(16); \
-        it2 = _mm_add_epi16(it2, i16); \
-        res1 = _mm_srai_epi16(it2, 5); \
-        it2 = _mm_mullo_epi16(it1, row14); \
-        it3 = _mm_mullo_epi16(v_deltaFract, row24); \
-        it2 = _mm_add_epi16(it2, it3); \
-        it2 = _mm_add_epi16(it2, i16); \
-        res2 = _mm_srai_epi16(it2, 5); \
-  \
-        itmp = _mm_packus_epi16(res1, res2); \
-        _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride) + 16), itmp); \
-}
-
-#define PREDANG_CALCROW_HOR(X, rowx) { \
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][((X))]))); \
-        row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
-        row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-  \
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][((X))] + 1))); \
-        row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
-        row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-  \
-        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
-        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
-        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
-        it2 = _mm_mullo_epi16(it1, row11L); \
-        it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
-        it2 = _mm_add_epi16(it2, it3); \
-        i16 = _mm_set1_epi16(16); \
-        it2 = _mm_add_epi16(it2, i16); \
-        row11L = _mm_srai_epi16(it2, 5); \
-        it2 = _mm_mullo_epi16(it1, row11H); \
-        it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
-        it2 = _mm_add_epi16(it2, it3); \
-        it2 = _mm_add_epi16(it2, i16); \
-        row11H = _mm_srai_epi16(it2, 5); \
-  \
-        rowx = _mm_packus_epi16(row11L, row11H); \
-}
-
-#define PREDANG_CALCROW_HOR_MODE2(rowx) { \
-        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
-        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
-        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
-        it2 = _mm_mullo_epi16(it1, row11L); \
-        it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
-        it2 = _mm_add_epi16(it2, it3); \
-        i16 = _mm_set1_epi16(16); \
-        it2 = _mm_add_epi16(it2, i16); \
-        res1 = _mm_srai_epi16(it2, 5); \
-        it2 = _mm_mullo_epi16(it1, row11H); \
-        it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
-        it2 = _mm_add_epi16(it2, it3); \
-        it2 = _mm_add_epi16(it2, i16); \
-        res2 = _mm_srai_epi16(it2, 5); \
-  \
-        rowx = _mm_packus_epi16(res1, res2); \
-}
-
-// ROWL/H is a Vec8s variable, X is the index of the data to be loaded
-#define LOADROW(ROWL, ROWH, X) { \
-/*        tmp.load(refMain + 1 + (X)); \
-        ROWL = extend_low(tmp); \
-        ROWH = extend_high(tmp); */\
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (X))); \
-        ROWL = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
-        ROWH = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
-}
-
-#define BLND2_2(R1, R2) { \
-/*        tmp1 = blend2q<0, 2>((Vec2q)R1, (Vec2q)R2); \
-        tmp2 = blend2q<1, 3>((Vec2q)R1, (Vec2q)R2); \
-        tmp1.store(pDst);   pDst += dstStride; \
-        tmp2.store(pDst);   pDst += dstStride; */\
-        itmp1 = _mm_unpacklo_epi64(R1, R2); \
-        itmp2 = _mm_unpackhi_epi64(R1, R2); \
-        _mm_storeu_si128((__m128i*)pDst, itmp1); \
-        pDst += dstStride; \
-        _mm_storeu_si128((__m128i*)pDst, itmp2); \
-        pDst += dstStride; \
-}
-
-#define MB8(R1, R2, R3, R4, R5, R6, R7, R8) { \
-        itmp1 = _mm_unpacklo_epi8(R1, R2); \
-        itmp2 = _mm_unpackhi_epi8(R1, R2); \
-        R1 = itmp1; \
-        R2 = itmp2; \
-        itmp1 = _mm_unpacklo_epi8(R3, R4); \
-        itmp2 = _mm_unpackhi_epi8(R3, R4); \
-        R3 = itmp1; \
-        R4 = itmp2; \
-        itmp1 = _mm_unpacklo_epi16(R1, R3); \
-        itmp2 = _mm_unpackhi_epi16(R1, R3); \
-        R1 = itmp1; \
-        R3 = itmp2; \
-        itmp1 = _mm_unpacklo_epi16(R2, R4); \
-        itmp2 = _mm_unpackhi_epi16(R2, R4); \
-        R2 = itmp1; \
-        R4 = itmp2; \
-        itmp1 = _mm_unpacklo_epi8(R5, R6); \
-        itmp2 = _mm_unpackhi_epi8(R5, R6); \
-        R5 = itmp1; \
-        R6 = itmp2; \
-        itmp1 = _mm_unpacklo_epi8(R7, R8); \
-        itmp2 = _mm_unpackhi_epi8(R7, R8); \
-        R7 = itmp1; \
-        R8 = itmp2; \
-        itmp1 = _mm_unpacklo_epi16(R5, R7); \
-        itmp2 = _mm_unpackhi_epi16(R5, R7); \
-        R5 = itmp1; \
-        R7 = itmp2; \
-        itmp1 = _mm_unpacklo_epi16(R6, R8); \
-        itmp2 = _mm_unpackhi_epi16(R6, R8); \
-        R6 = itmp1; \
-        R8 = itmp2; \
-        itmp1 = _mm_unpacklo_epi32(R1, R5); \
-        itmp2 = _mm_unpackhi_epi32(R1, R5); \
-        R1 = itmp1; \
-        R5 = itmp2; \
-  \
-        itmp1 = _mm_unpacklo_epi32(R2, R6); \
-        itmp2 = _mm_unpackhi_epi32(R2, R6); \
-        R2 = itmp1; \
-        R6 = itmp2; \
-  \
-        itmp1 = _mm_unpacklo_epi32(R3, R7); \
-        itmp2 = _mm_unpackhi_epi32(R3, R7); \
-        R3 = itmp1; \
-        R7 = itmp2; \
-  \
-        itmp1 = _mm_unpacklo_epi32(R4, R8); \
-        itmp2 = _mm_unpackhi_epi32(R4, R8); \
-        R4 = itmp1; \
-        R8 = itmp2; \
-}
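MB8 extends the same unpack ladder (epi8, then epi16, then epi32, as in the 4x4 sketch earlier) across eight registers; the final epi64 stage lives in BLND2_2, which also stores the two finished rows, completing the byte transpose the horizontal 32x32 path needs.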
-
-#define CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, X) { \
-        PREDANG_CALCROW_HOR(0 + X, R1) \
-        PREDANG_CALCROW_HOR(1 + X, R2) \
-        PREDANG_CALCROW_HOR(2 + X, R3) \
-        PREDANG_CALCROW_HOR(3 + X, R4) \
-        PREDANG_CALCROW_HOR(4 + X, R5) \
-        PREDANG_CALCROW_HOR(5 + X, R6) \
-        PREDANG_CALCROW_HOR(6 + X, R7) \
-}
-
-#define CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8) { \
-        PREDANG_CALCROW_HOR_MODE2(R1) \
-        PREDANG_CALCROW_HOR_MODE2(R2) \
-        PREDANG_CALCROW_HOR_MODE2(R3) \
-        PREDANG_CALCROW_HOR_MODE2(R4) \
-        PREDANG_CALCROW_HOR_MODE2(R5) \
-        PREDANG_CALCROW_HOR_MODE2(R6) \
-        PREDANG_CALCROW_HOR_MODE2(R7) \
-}
-
-void xPredIntraAng32x32(int /*bitDepth*/, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove)
-{
-    int k;
-    int blkSize        = width;
-
-    // Map the mode index to main prediction direction and angle
-    assert(dirMode > 1); // neither planar nor DC
-    bool modeHor       = (dirMode < 18);
-    bool modeVer       = !modeHor;
-    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
-    int lookIdx = intraPredAngle;
-    int absAng         = abs(intraPredAngle);
-    int signAng        = intraPredAngle < 0 ? -1 : 1;
-
-    // Look up the scaled angle parameter and its inverse
-    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
-    int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
-    int invAngle       = invAngTable[absAng];
-    absAng             = angTable[absAng];
-    intraPredAngle     = signAng * absAng;
-
-    // Do angular predictions
-
-    pixel* refMain;
-    pixel* refSide;
-
-    // Initialise the main and side reference arrays.
-    if (intraPredAngle < 0)
-    {
-        refMain = (modeVer ? refAbove : refLeft);     // + (blkSize - 1);
-        refSide = (modeVer ? refLeft : refAbove);     // + (blkSize - 1);
-
-        // Extend the Main reference to the left.
-        int invAngleSum    = 128;     // rounding offset for the >> 8 below
-        if (intraPredAngle != -32)
-            for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
-            {
-                invAngleSum += invAngle;
-                refMain[k] = refSide[invAngleSum >> 8];
-            }
-    }
-    else
-    {
-        refMain = modeVer ? refAbove : refLeft;
-        refSide = modeVer ? refLeft  : refAbove;
-    }
-
-    // note: no edge filtering is applied for 32x32 blocks
-    if (intraPredAngle == 0)  // Exactly horizontal/vertical angles
-    {
-        if (modeHor)
-        {
-            Vec16uc v_temp, tmp1;
-
-            v_temp.load(refMain + 1);
-            /*BROADSTORE16ROWS;*/
-            tmp1 = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(v_temp);
-            tmp1.store(pDst + (0 * dstStride));
-            tmp1.store(pDst + (0 * dstStride) + 16);
-            tmp1 = permute16uc<1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>(v_temp);
-            tmp1.store(pDst + (1 * dstStride));
-            tmp1.store(pDst + (1 * dstStride) + 16);
-            tmp1 = permute16uc<2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2>(v_temp);
-            tmp1.store(pDst + (2 * dstStride));
-            tmp1.store(pDst + (2 * dstStride) + 16);
-            tmp1 = permute16uc<3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3>(v_temp);
-            tmp1.store(pDst + (3 * dstStride));
-            tmp1.store(pDst + (3 * dstStride) + 16);
-            tmp1 = permute16uc<4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4>(v_temp);
-            tmp1.store(pDst + (4 * dstStride));
-            tmp1.store(pDst + (4 * dstStride) + 16);
-            tmp1 = permute16uc<5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5>(v_temp);
-            tmp1.store(pDst + (5 * dstStride));
-            tmp1.store(pDst + (5 * dstStride) + 16);
-            tmp1 = permute16uc<6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6>(v_temp);
-            tmp1.store(pDst + (6 * dstStride));
-            tmp1.store(pDst + (6 * dstStride) + 16);
-            tmp1 = permute16uc<7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7>(v_temp);
-            tmp1.store(pDst + (7 * dstStride));
-            tmp1.store(pDst + (7 * dstStride) + 16);
-            tmp1 = permute16uc<8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8>(v_temp);
-            tmp1.store(pDst + (8 * dstStride));
-            tmp1.store(pDst + (8 * dstStride) + 16);
-            tmp1 = permute16uc<9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9>(v_temp);
-            tmp1.store(pDst + (9 * dstStride));
-            tmp1.store(pDst + (9 * dstStride) + 16);
-            tmp1 = permute16uc<10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10>(v_temp);
-            tmp1.store(pDst + (10 * dstStride));
-            tmp1.store(pDst + (10 * dstStride) + 16);
-            tmp1 = permute16uc<11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11>(v_temp);
-            tmp1.store(pDst + (11 * dstStride));
-            tmp1.store(pDst + (11 * dstStride) + 16);
-            tmp1 = permute16uc<12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12>(v_temp);
-            tmp1.store(pDst + (12 * dstStride));
-            tmp1.store(pDst + (12 * dstStride) + 16);
-            tmp1 = permute16uc<13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13>(v_temp);
-            tmp1.store(pDst + (13 * dstStride));
-            tmp1.store(pDst + (13 * dstStride) + 16);
-            tmp1 = permute16uc<14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14>(v_temp);
-            tmp1.store(pDst + (14 * dstStride));
-            tmp1.store(pDst + (14 * dstStride) + 16);
-            tmp1 = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>(v_temp);
-            tmp1.store(pDst + (15 * dstStride));
-            tmp1.store(pDst + (15 * dstStride) + 16);
-
-            pDst += 16 * dstStride;
-            v_temp.load(refMain + 1 + 16);
-            /*BROADSTORE16ROWS;*/
-            tmp1 = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(v_temp);
-            tmp1.store(pDst + (0 * dstStride));
-            tmp1.store(pDst + (0 * dstStride) + 16);
-            tmp1 = permute16uc<1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>(v_temp);
-            tmp1.store(pDst + (1 * dstStride));
-            tmp1.store(pDst + (1 * dstStride) + 16);
-            tmp1 = permute16uc<2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2>(v_temp);
-            tmp1.store(pDst + (2 * dstStride));
-            tmp1.store(pDst + (2 * dstStride) + 16);
-            tmp1 = permute16uc<3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3>(v_temp);
-            tmp1.store(pDst + (3 * dstStride));
-            tmp1.store(pDst + (3 * dstStride) + 16);
-            tmp1 = permute16uc<4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4>(v_temp);
-            tmp1.store(pDst + (4 * dstStride));
-            tmp1.store(pDst + (4 * dstStride) + 16);
-            tmp1 = permute16uc<5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5>(v_temp);
-            tmp1.store(pDst + (5 * dstStride));
-            tmp1.store(pDst + (5 * dstStride) + 16);
-            tmp1 = permute16uc<6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6>(v_temp);
-            tmp1.store(pDst + (6 * dstStride));
-            tmp1.store(pDst + (6 * dstStride) + 16);
-            tmp1 = permute16uc<7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7>(v_temp);
-            tmp1.store(pDst + (7 * dstStride));
-            tmp1.store(pDst + (7 * dstStride) + 16);
-            tmp1 = permute16uc<8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8>(v_temp);
-            tmp1.store(pDst + (8 * dstStride));
-            tmp1.store(pDst + (8 * dstStride) + 16);
-            tmp1 = permute16uc<9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9>(v_temp);
-            tmp1.store(pDst + (9 * dstStride));
-            tmp1.store(pDst + (9 * dstStride) + 16);
-            tmp1 = permute16uc<10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10>(v_temp);
-            tmp1.store(pDst + (10 * dstStride));
-            tmp1.store(pDst + (10 * dstStride) + 16);
-            tmp1 = permute16uc<11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11>(v_temp);
-            tmp1.store(pDst + (11 * dstStride));
-            tmp1.store(pDst + (11 * dstStride) + 16);
-            tmp1 = permute16uc<12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12>(v_temp);
-            tmp1.store(pDst + (12 * dstStride));
-            tmp1.store(pDst + (12 * dstStride) + 16);
-            tmp1 = permute16uc<13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13>(v_temp);
-            tmp1.store(pDst + (13 * dstStride));
-            tmp1.store(pDst + (13 * dstStride) + 16);
-            tmp1 = permute16uc<14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14>(v_temp);
-            tmp1.store(pDst + (14 * dstStride));
-            tmp1.store(pDst + (14 * dstStride) + 16);
-            tmp1 = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>(v_temp);
-            tmp1.store(pDst + (15 * dstStride));
-            tmp1.store(pDst + (15 * dstStride) + 16);
-        }
-        else
-        {
-            __m128i v_main;
-            Pel *dstOriginal = pDst;
-//            v_main.load(refMain + 1);
-            v_main = _mm_loadu_si128((__m128i const*)(refMain + 1));
-//            v_main.store(pDst);
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-
-            pDst = dstOriginal + 16;
-            v_main = _mm_loadu_si128((__m128i const*)(refMain + 17));
-//            v_main.store(pDst);
-
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-            pDst += dstStride;
-            _mm_storeu_si128((__m128i*)(pDst), v_main);
-        }
-    }
-    else if (intraPredAngle == -32)
-    {
-        Vec16uc v_refSide;
-        pixel refMain0 = refMain[0];
-
-        v_refSide.load(refSide);
-        v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
-        v_refSide.store(refMain - 15);
-
-        v_refSide.load(refSide + 16);
-        v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
-        v_refSide.store(refMain - 31);
-
-        refMain[0] = refMain0;
-
-        __m128i itmp;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-
-        return;
-    }
-    else if (intraPredAngle == 32)
-    {
-        __m128i itmp;
-        refMain += 2;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain++);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        return;
-    }
-    else
-    {
-        if (modeHor)
-        {
-            __m128i row11L, row12L, row11H, row12H, res1, res2;
-            __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
-            __m128i R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16;
-
-            Pel * original_pDst = pDst;
-            v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
-            v_ipAngle = _mm_set1_epi16(intraPredAngle);
-            thirty2 = _mm_set1_epi16(32);
-            thirty1 = _mm_set1_epi16(31);
-            __m128i itmp, itmp1, itmp2, it1, it2, it3, i16;
-
-            switch (intraPredAngle)
-            {
-            case -2:
-                LOADROW(row11L, row11H, -1)
-                LOADROW(row12L, row12H,  0)
-                R16 = _mm_packus_epi16(row11L, row11H); //R16 = compress(row11L, row11H);
-
-                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
-                PREDANG_CALCROW_HOR_MODE2(R8)
-                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
-                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-                BLND2_2(R1, R9)
-                BLND2_2(R5, R13)
-                BLND2_2(R3, R11)
-                BLND2_2(R7, R15)
-                BLND2_2(R2, R10)
-                BLND2_2(R6, R14)
-                BLND2_2(R4, R12)
-                BLND2_2(R8, R16)
-
-                v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
-                v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
-                row12L = row11L;
-                row12H = row11H;
-                LOADROW(row11L, row11H, -2)
-                R16 = _mm_packus_epi16(row11L, row11H);
-                pDst = original_pDst + 16;
-
-                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
-                PREDANG_CALCROW_HOR_MODE2(R8)
-                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
-                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-                BLND2_2(R1, R9)
-                BLND2_2(R5, R13)
-                BLND2_2(R3, R11)
-                BLND2_2(R7, R15)
-                BLND2_2(R2, R10)
-                BLND2_2(R6, R14)
-                BLND2_2(R4, R12)
-                BLND2_2(R8, R16)
-
-                pDst = original_pDst + (16 * dstStride);
-                refMain += 16;
-
-                v_deltaPos = _mm_setzero_si128();
-                v_ipAngle = _mm_set1_epi16(intraPredAngle);
-                LOADROW(row11L, row11H, -1)
-                LOADROW(row12L, row12H,  0)
-                R16 = _mm_packus_epi16(row11L, row11H);
-
-                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
-                PREDANG_CALCROW_HOR_MODE2(R8)
-                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
-                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-                BLND2_2(R1, R9)
-                BLND2_2(R5, R13)
-                BLND2_2(R3, R11)
-                BLND2_2(R7, R15)
-                BLND2_2(R2, R10)
-                BLND2_2(R6, R14)
-                BLND2_2(R4, R12)
-                BLND2_2(R8, R16)
-
-                v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
-                v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
-                row12L = row11L;
-                row12H = row11H;
-                LOADROW(row11L, row11H, -2)
-                R16 = _mm_packus_epi16(row11L, row11H);
-                pDst = original_pDst + (16 * dstStride) + 16;
-
-                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
-                PREDANG_CALCROW_HOR_MODE2(R8)
-                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
-                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-                BLND2_2(R1, R9)
-                BLND2_2(R5, R13)
-                BLND2_2(R3, R11)
-                BLND2_2(R7, R15)
-                BLND2_2(R2, R10)
-                BLND2_2(R6, R14)
-                BLND2_2(R4, R12)
-                BLND2_2(R8, R16)
-                return;
-
-            case  2:
-                LOADROW(row11L, row11H, 0)
-                LOADROW(row12L, row12H, 1)
-                R16 = _mm_packus_epi16(row12L, row12H);
-
-                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
-                PREDANG_CALCROW_HOR_MODE2(R8)
-                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
-                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-                BLND2_2(R1, R9)
-                BLND2_2(R5, R13)
-                BLND2_2(R3, R11)
-                BLND2_2(R7, R15)
-                BLND2_2(R2, R10)
-                BLND2_2(R6, R14)
-                BLND2_2(R4, R12)
-                BLND2_2(R8, R16)
-
-                v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
-                v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
-                row11L = row12L;
-                row11H = row12H;
-                LOADROW(row12L, row12H, 2)
-                R16 = _mm_packus_epi16(row12L, row12H);
-                pDst = original_pDst + 16;
-
-                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
-                PREDANG_CALCROW_HOR_MODE2(R8)
-                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
-                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-                BLND2_2(R1, R9)
-                BLND2_2(R5, R13)
-                BLND2_2(R3, R11)
-                BLND2_2(R7, R15)
-                BLND2_2(R2, R10)
-                BLND2_2(R6, R14)
-                BLND2_2(R4, R12)
-                BLND2_2(R8, R16)
-
-                pDst = original_pDst + (16 * dstStride);
-                refMain += 16;
-                v_deltaPos = _mm_setzero_si128();
-
-                v_ipAngle = _mm_set1_epi16(intraPredAngle);
-                LOADROW(row11L, row11H, 0)
-                LOADROW(row12L, row12H, 1)
-                R16 = _mm_packus_epi16(row12L, row12H);
-
-                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
-                PREDANG_CALCROW_HOR_MODE2(R8)
-                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
-                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-                BLND2_2(R1, R9)
-                BLND2_2(R5, R13)
-                BLND2_2(R3, R11)
-                BLND2_2(R7, R15)
-                BLND2_2(R2, R10)
-                BLND2_2(R6, R14)
-                BLND2_2(R4, R12)
-                BLND2_2(R8, R16)
-
-                v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
-                v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
-                row11L = row12L;
-                row11H = row12H;
-                LOADROW(row12L, row12H, 2)
-                R16 = _mm_packus_epi16(row12L, row12H);
-                pDst = original_pDst + (16 * dstStride) + 16;
-
-                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
-                PREDANG_CALCROW_HOR_MODE2(R8)
-                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
-                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-                BLND2_2(R1, R9)
-                BLND2_2(R5, R13)
-                BLND2_2(R3, R11)
-                BLND2_2(R7, R15)
-                BLND2_2(R2, R10)
-                BLND2_2(R6, R14)
-                BLND2_2(R4, R12)
-                BLND2_2(R8, R16)
-                return;
-            }
-
-            CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
-            PREDANG_CALCROW_HOR(7 + 0, R8)
-            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
-            PREDANG_CALCROW_HOR(7 + 8, R16)
-            MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-            BLND2_2(R1, R9)
-            BLND2_2(R5, R13)
-            BLND2_2(R3, R11)
-            BLND2_2(R7, R15)
-            BLND2_2(R2, R10)
-            BLND2_2(R6, R14)
-            BLND2_2(R4, R12)
-            BLND2_2(R8, R16)
-
-            pDst = original_pDst + 16;
-
-            CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 16)
-            PREDANG_CALCROW_HOR(7 + 16, R8)
-            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 24)
-            R16 = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
-            MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-            BLND2_2(R1, R9)
-            BLND2_2(R5, R13)
-            BLND2_2(R3, R11)
-            BLND2_2(R7, R15)
-            BLND2_2(R2, R10)
-            BLND2_2(R6, R14)
-            BLND2_2(R4, R12)
-            BLND2_2(R8, R16)
-
-            pDst = original_pDst + (16 * dstStride);
-            refMain += 16;
-            v_deltaPos = _mm_setzero_si128();
-            v_ipAngle = _mm_set1_epi16(intraPredAngle);
-
-            CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
-            PREDANG_CALCROW_HOR(7 + 0, R8)
-            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
-            PREDANG_CALCROW_HOR(7 + 8, R16)
-            MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-            BLND2_2(R1, R9)
-            BLND2_2(R5, R13)
-            BLND2_2(R3, R11)
-            BLND2_2(R7, R15)
-            BLND2_2(R2, R10)
-            BLND2_2(R6, R14)
-            BLND2_2(R4, R12)
-            BLND2_2(R8, R16)
-            pDst = original_pDst + (16 * dstStride) + 16;
-
-            CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 16)
-            PREDANG_CALCROW_HOR(7 + 16, R8)
-            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 24)
-            R16 = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
-            MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-            BLND2_2(R1, R9)
-            BLND2_2(R5, R13)
-            BLND2_2(R3, R11)
-            BLND2_2(R7, R15)
-            BLND2_2(R2, R10)
-            BLND2_2(R6, R14)
-            BLND2_2(R4, R12)
-            BLND2_2(R8, R16)
-        }
-        else
-        {
-            __m128i row11L, row12L, row11H, row12H;
-            __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
-            __m128i row11, row12, row13, row14, row21, row22, row23, row24;
-            __m128i res1, res2;
-
-            v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
-            v_ipAngle = _mm_set1_epi16(intraPredAngle);
-            thirty2 = _mm_set1_epi16(32);
-            thirty1 = _mm_set1_epi16(31);
-            __m128i itmp, it1, it2, it3, i16;
-
-            switch (intraPredAngle)
-            {
-            case -2:
-                LOADROW(row11, row12, -1)
-                LOADROW(row21, row22,  0)
-                LOADROW(row13, row14, 15)
-                LOADROW(row23, row24, 16)
-                for (int i = 0; i <= 14; i++)
-                {
-                    PREDANG_CALCROW_VER_MODE2(i);
-                }
-
-                //deltaFract == 0 for 16th row
-                v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
-                v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
-                itmp = _mm_packus_epi16(row11, row12);
-                _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride)), itmp);
-                itmp = _mm_packus_epi16(row13, row14);
-                _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride) + 16), itmp);
-
-                row21 = row11;
-                row22 = row12;
-                row23 = row13;
-                row24 = row14;
-
-                LOADROW(row11, row12, -2)
-                LOADROW(row13, row14, 14)
-                for (int i = 16; i <= 30; i++)
-                {
-                    PREDANG_CALCROW_VER_MODE2(i);
-                }
-
-                itmp = _mm_packus_epi16(row11, row12);
-                _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
-                itmp = _mm_packus_epi16(row13, row14);
-                _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
-
-                return;
-
-            case  2:
-
-                LOADROW(row11, row12, 0)
-                LOADROW(row21, row22, 1)
-                LOADROW(row13, row14, 16)
-                LOADROW(row23, row24, 17)
-                for (int i = 0; i <= 14; i++)
-                {
-                    PREDANG_CALCROW_VER_MODE2(i);
-                }
-
-                //deltaFract == 0 for 16th row
-
-                v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
-                v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
-                itmp = _mm_packus_epi16(row21, row22);
-                _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride)), itmp);
-                itmp = _mm_packus_epi16(row23, row24);
-                _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride) + 16), itmp);
-
-                row11 = row21;
-                row12 = row22;
-                row13 = row23;
-                row14 = row24;
-
-                LOADROW(row21, row22, 2)
-                LOADROW(row23, row24, 18)
-                for (int i = 16; i <= 30; i++)
-                {
-                    PREDANG_CALCROW_VER_MODE2(i);
-                }
-
-                itmp = _mm_packus_epi16(row21, row22);
-                _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
-                itmp = _mm_packus_epi16(row23, row24);
-                _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
-
-                return;
-            }
-
-            for (int i = 0; i <= 30; i++)
-            {
-                PREDANG_CALCROW_VER(i);
-            }
-
-            itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
-            _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
-            itmp = _mm_loadu_si128((__m128i const*)(refMain + 17 + GETAP(lookIdx, 31)));
-            _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
-        }
-    }
-}
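A note on the two fast paths in the function removed above: intraPredAngle == ±32 are the exact 45-degree diagonals, so the 5-bit fractional offset is always zero and every output row is a plain shifted copy of the main reference (for -32 the reference is first extended to the left with the reversed side array, as the permute16uc reversals show). A minimal scalar sketch of the +32 case, assuming this file's pixel type:

    #include <cstring>

    // Row k starts at refMain + k + 2 because deltaPos = (k + 1) * 32 gives
    // deltaInt = k + 1, and samples are read from refMain[x + deltaInt + 1].
    static void predDiag45Down(pixel* dst, intptr_t dstStride, const pixel* refMain, int blkSize)
    {
        for (int k = 0; k < blkSize; k++)
            memcpy(dst + k * dstStride, refMain + k + 2, blkSize * sizeof(pixel));
    }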
-
-#endif /* if HIGH_BIT_DEPTH */
-
-void xPredIntraAngBufRef(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, bool bFilter, pixel *refLeft, pixel *refAbove)
-{
-#if HIGH_BIT_DEPTH
-#else
-    switch (width)
-    {
-    case 4:
-        xPredIntraAng4x4(bitDepth, pDst, dstStride, width, dirMode, refLeft, refAbove, bFilter);
-        return;
-    case 8:
-        xPredIntraAng8x8(bitDepth, pDst, dstStride, width, dirMode, refLeft, refAbove, bFilter);
-        return;
-    case 16:
-        xPredIntraAng16x16(bitDepth, pDst, dstStride, width, dirMode, refLeft, refAbove, bFilter);
-        return;
-    case 32:
-        xPredIntraAng32x32(bitDepth, pDst, dstStride, width, dirMode, refLeft, refAbove);
-        return;
-    }
-
-#endif /* if HIGH_BIT_DEPTH */
-
-    int k, l;
-    int blkSize        = width;
-
-    // Map the mode index to main prediction direction and angle
-    assert(dirMode > 1); //no planar and dc
-    bool modeHor       = (dirMode < 18);
-    bool modeVer       = !modeHor;
-    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
-    int absAng         = abs(intraPredAngle);
-    int signAng        = intraPredAngle < 0 ? -1 : 1;
-
-    // Set bitshifts and scale the angle parameter to block size
-    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
-    int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
-    int invAngle       = invAngTable[absAng];
-    absAng             = angTable[absAng];
-    intraPredAngle     = signAng * absAng;
-
-    // Do angular predictions
-    {
-        pixel* refMain;
-        pixel* refSide;
-
-        // Initialise the Main and Left reference array.
-        if (intraPredAngle < 0)
-        {
-            refMain = (modeVer ? refAbove : refLeft); // + (blkSize - 1);
-            refSide = (modeVer ? refLeft : refAbove); // + (blkSize - 1);
-
-            // Extend the Main reference to the left.
-            int invAngleSum    = 128; // rounding for (shift by 8)
-            for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
-            {
-                invAngleSum += invAngle;
-                refMain[k] = refSide[invAngleSum >> 8];
-            }
-        }
-        else
-        {
-            refMain = modeVer ? refAbove : refLeft;
-            refSide = modeVer ? refLeft  : refAbove;
-        }
-
-        if (intraPredAngle == 0)
-        {
-            for (k = 0; k < blkSize; k++)
-            {
-                for (l = 0; l < blkSize; l++)
-                {
-                    pDst[k * dstStride + l] = refMain[l + 1];
-                }
-            }
-
-            if (bFilter)
-            {
-                for (k = 0; k < blkSize; k++)
-                {
-                    pDst[k * dstStride] = (pixel)Clip3(0, (1 << bitDepth) - 1, static_cast<short>(pDst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1));
-                }
-            }
-        }
-        else
-        {
-            int deltaPos = 0;
-            int deltaInt;
-            int deltaFract;
-            int refMainIndex;
-
-            for (k = 0; k < blkSize; k++)
-            {
-                deltaPos += intraPredAngle;
-                deltaInt   = deltaPos >> 5;
-                deltaFract = deltaPos & (32 - 1);
-
-                if (deltaFract)
-                {
-                    // Do linear filtering
-                    for (l = 0; l < blkSize; l++)
-                    {
-                        refMainIndex        = l + deltaInt + 1;
-                        pDst[k * dstStride + l] = (pixel)(((32 - deltaFract) * refMain[refMainIndex] + deltaFract * refMain[refMainIndex + 1] + 16) >> 5);
-                    }
-                }
-                else
-                {
-                    // Just copy the integer samples
-                    for (l = 0; l < blkSize; l++)
-                    {
-                        pDst[k * dstStride + l] = refMain[l + deltaInt + 1];
-                    }
-                }
-            }
-        }
-
-        // Flip the block if this is the horizontal mode
-        if (modeHor)
-        {
-            pixel  tmp;
-            for (k = 0; k < blkSize - 1; k++)
-            {
-                for (l = k + 1; l < blkSize; l++)
-                {
-                    tmp                 = pDst[k * dstStride + l];
-                    pDst[k * dstStride + l] = pDst[l * dstStride + k];
-                    pDst[l * dstStride + k] = tmp;
-                }
-            }
-        }
-    }
-}
-}
-
-#include "utils.h"
-
-namespace x265 {
-void NAME(Setup_Vec_IPredPrimitives)(EncoderPrimitives& p)
-{
-    initFileStaticVars();
-    p.getIPredDC = predIntraDC;
-    p.getIPredPlanar = predIntraPlanar;
-    p.getIPredAng = xPredIntraAngBufRef;
-}
-}
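The scalar loop removed above is the contract every SIMD path in this file must reproduce: deltaPos accumulates intraPredAngle once per row, its upper bits pick the integer reference sample, its low five bits drive a linear blend, and horizontal modes are produced by predicting the transposed block and flipping it back. The per-sample formula, condensed for reference:

    // One sample at column x of row k, with deltaPos = (k + 1) * intraPredAngle.
    static inline pixel angularSample(const pixel* refMain, int deltaPos, int x)
    {
        int i = (deltaPos >> 5) + x + 1; // integer offset into the main reference
        int f = deltaPos & 31;           // fractional position, 0..31
        return (pixel)(((32 - f) * refMain[i] + f * refMain[i + 1] + 16) >> 5);
    }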
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Min Chen <chenm003 at 163.com>
+ *          Deepthi Devaki <deepthidevaki at multicorewareinc.com>
+ *          Steve Borho <steve at borho.org>
+ *          ShinYee Chung <shinyee at multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at multicorewareinc.com.
+ *****************************************************************************/
+
+#include "primitives.h"
+#include "TLibCommon/TComRom.h"
+#include <assert.h>
+#include <smmintrin.h>
+
+extern char g_aucConvertToBit[];
+extern unsigned char g_aucIntraFilterType[][35];
+
+using namespace x265;
+
+namespace {
+const int angAP[17][64] =
+{
+    {
+        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64
+    },
+    {
+        0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52
+    },
+    {
+        0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 19, 20, 21, 21, 22, 22, 23, 24, 24, 25, 26, 26, 27, 28, 28, 29, 30, 30, 31, 32, 32, 33, 34, 34, 35, 36, 36, 37, 38, 38, 39, 40, 40, 41, 42
+    },
+    {
+        0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, 32, 33, 34
+    },
+    {
+        0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13, 13, 13, 14, 14, 15, 15, 15, 16, 16, 17, 17, 17, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 25, 25, 26
+    },
+    {
+        0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 18
+    },
+    {
+        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10
+    },
+    {
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4
+    },
+    { // virtual row for angle index 0 (table row 8); never used, kept only to simplify indexing
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4
+    },
+    {
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4
+    },
+    {
+        -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7, -7, -8, -8, -8, -8, -8, -8, -8, -9, -9, -9, -9, -9, -9, -10, -10, -10, -10, -10, -10, -10
+    },
+    {
+        -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -6, -6, -6, -6, -7, -7, -7, -8, -8, -8, -8, -9, -9, -9, -9, -10, -10, -10, -11, -11, -11, -11, -12, -12, -12, -13, -13, -13, -13, -14, -14, -14, -15, -15, -15, -15, -16, -16, -16, -17, -17, -17, -17, -18, -18, -18, -18
+    },
+    {
+        -1, -1, -2, -2, -3, -3, -3, -4, -4, -5, -5, -5, -6, -6, -7, -7, -7, -8, -8, -9, -9, -9, -10, -10, -11, -11, -11, -12, -12, -13, -13, -13, -14, -14, -15, -15, -16, -16, -16, -17, -17, -18, -18, -18, -19, -19, -20, -20, -20, -21, -21, -22, -22, -22, -23, -23, -24, -24, -24, -25, -25, -26, -26, -26
+    },
+    {
+        -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8, -9, -10, -10, -11, -11, -12, -12, -13, -13, -14, -14, -15, -15, -16, -16, -17, -17, -18, -19, -19, -20, -20, -21, -21, -22, -22, -23, -23, -24, -24, -25, -25, -26, -27, -27, -28, -28, -29, -29, -30, -30, -31, -31, -32, -32, -33, -33, -34, -34
+    },
+    {
+        -1, -2, -2, -3, -4, -4, -5, -6, -6, -7, -8, -8, -9, -10, -10, -11, -12, -12, -13, -14, -14, -15, -16, -16, -17, -18, -18, -19, -20, -20, -21, -21, -22, -23, -23, -24, -25, -25, -26, -27, -27, -28, -29, -29, -30, -31, -31, -32, -33, -33, -34, -35, -35, -36, -37, -37, -38, -39, -39, -40, -41, -41, -42, -42
+    },
+    {
+        -1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26, -27, -28, -29, -30, -31, -31, -32, -33, -34, -35, -35, -36, -37, -38, -39, -39, -40, -41, -42, -43, -44, -44, -45, -46, -47, -48, -48, -49, -50, -51, -52, -52
+    },
+    {
+        -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -59, -60, -61, -62, -63, -64
+    }
+};
+
+#define GETAP(X, Y) angAP[8 - (X)][(Y)]
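angAP caches the per-row integer reference offsets so the kernels below never recompute deltaInt: row i holds ((y + 1) * angle) >> 5 for the i-th entry of {32, 26, 21, 17, 13, 9, 5, 2, 0, -2, ..., -32}, and GETAP(lookIdx, y) looks it up. A small self-check of that closed form (an assumption about how the table was generated, relying on arithmetic right shift as the rest of this file does):

    static const int angList[17] = { 32, 26, 21, 17, 13, 9, 5, 2,
                                     0 /* unused placeholder row */,
                                     -2, -5, -9, -13, -17, -21, -26, -32 };

    // Row 8 is the never-used placeholder and is excluded from the check.
    static bool checkAngAP(int i, int y)
    {
        return (i == 8) || angAP[i][y] == (((y + 1) * angList[i]) >> 5);
    }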
+
+__m128i v_multiL, v_multiH, v_multiH2, v_multiH3, v_multiH4, v_multiH5, v_multiH6, v_multiH7;
+__m128i v_multi_2Row;
+
+/* When compiled with /arch:AVX, this code is not safe to run on non-AVX CPUs and
+ * thus we cannot use static initialization.  This routine is only called if the
+ * detected CPU can support this SIMD architecture. */
+static void initFileStaticVars()
+{
+    v_multiL = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+    v_multiH = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
+    v_multiH2 = _mm_setr_epi16(17, 18, 19, 20, 21, 22, 23, 24);
+    v_multiH3 = _mm_setr_epi16(25, 26, 27, 28, 29, 30, 31, 32);
+    v_multiH4 = _mm_setr_epi16(33, 34, 35, 36, 37, 38, 39, 40);
+    v_multiH5 = _mm_setr_epi16(41, 42, 43, 44, 45, 46, 47, 48);
+    v_multiH6 = _mm_setr_epi16(49, 50, 51, 52, 53, 54, 55, 56);
+    v_multiH7 = _mm_setr_epi16(57, 58, 59, 60, 61, 62, 63, 64);
+    v_multi_2Row = _mm_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4);
+}
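A hypothetical sketch of the intended call pattern, mirroring the Setup_Vec_IPredPrimitives function the old file ended with: the constants are built only after runtime CPU detection has already selected this translation unit, so the AVX-encoded stores in initFileStaticVars never execute on unsupported hardware.

    // Hypothetical wiring, following the removed setup code earlier in this diff.
    void setupExample(EncoderPrimitives& p)
    {
        initFileStaticVars();       // safe: never reached on non-AVX CPUs
        p.getIPredDC = predIntraDC;
    }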
+
+static inline
+void xDCPredFiltering(pixel* pSrc, intptr_t iSrcStride, pixel* rpDst, intptr_t iDstStride, int iWidth, int /*iHeight*/)
+{
+    pixel* pDst = rpDst;
+    int y;
+    pixel pixDC = *pDst;
+    int pixDCx3 = pixDC * 3 + 2;
+
+    // boundary pixel processing: corner first, then the top row and left column
+    pDst[0] = (pixel)((pSrc[-iSrcStride] + pSrc[-1] + 2 * pixDC + 2) >> 2);
+
+    Vec8us im1(pixDCx3);
+    Vec8us im2, im3;
+#if HIGH_BIT_DEPTH
+    switch (iWidth)
+    {
+    case 4:
+        im2 = load_partial(const_int(8), &pSrc[1 - iSrcStride]);
+        im2 = (im1 + im2) >> const_int(2);
+        store_partial(const_int(8), &pDst[1], im2);
+        break;
+
+    case 8:
+        im2.load(&pSrc[1 - iSrcStride]);
+        im2 = (im1 + im2) >> const_int(2);
+        im2.store(&pDst[1]);
+        break;
+
+    case 16:
+        im2.load(&pSrc[1 - iSrcStride]);
+        im2 = (im1 + im2) >> const_int(2);
+        im2.store(&pDst[1]);
+
+        im2.load(&pSrc[1 - iSrcStride + 8]);
+        im2 = (im1 + im2) >> const_int(2);
+        im2.store(&pDst[1 + 8]);
+        break;
+
+    case 32:
+        im2.load(&pSrc[1 - iSrcStride]);
+        im2 = (im1 + im2) >> const_int(2);
+        im2.store(&pDst[1]);
+
+        im2.load(&pSrc[1 - iSrcStride + 8]);
+        im2 = (im1 + im2) >> const_int(2);
+        im2.store(&pDst[1 + 8]);
+
+        im2.load(&pSrc[1 - iSrcStride + 16]);
+        im2 = (im1 + im2) >> const_int(2);
+        im2.store(&pDst[1 + 16]);
+
+        im2.load(&pSrc[1 - iSrcStride + 24]);
+        im2 = (im1 + im2) >> const_int(2);
+        im2.store(&pDst[1 + 24]);
+        break;
+
+    //case 64:
+    default:
+        im2.load(&pSrc[1 - iSrcStride]);
+        im2 = (im1 + im2) >> const_int(2);
+        im2.store(&pDst[1]);
+
+        im2.load(&pSrc[1 - iSrcStride + 8]);
+        im2 = (im1 + im2) >> const_int(2);
+        im2.store(&pDst[1 + 8]);
+
+        im2.load(&pSrc[1 - iSrcStride + 16]);
+        im2 = (im1 + im2) >> const_int(2);
+        im2.store(&pDst[1 + 16]);
+
+        im2.load(&pSrc[1 - iSrcStride + 24]);
+        im2 = (im1 + im2) >> const_int(2);
+        im2.store(&pDst[1 + 24]);
+
+        im2.load(&pSrc[1 - iSrcStride + 32]);
+        im2 = (im1 + im2) >> const_int(2);
+        im2.store(&pDst[1 + 32]);
+
+        im2.load(&pSrc[1 - iSrcStride + 40]);
+        im2 = (im1 + im2) >> const_int(2);
+        im2.store(&pDst[1 + 40]);
+
+        im2.load(&pSrc[1 - iSrcStride + 48]);
+        im2 = (im1 + im2) >> const_int(2);
+        im2.store(&pDst[1 + 48]);
+
+        im2.load(&pSrc[1 - iSrcStride + 56]);
+        im2 = (im1 + im2) >> const_int(2);
+        im2.store(&pDst[1 + 56]);
+        break;
+    }
+
+#else /* if HIGH_BIT_DEPTH */
+    Vec16uc pix;
+    switch (iWidth)
+    {
+    case 4:
+        pix = load_partial(const_int(4), &pSrc[1 - iSrcStride]);
+        im2 = extend_low(pix);
+        im2 = (im1 + im2) >> const_int(2);
+        pix = compress(im2, im2);
+        store_partial(const_int(4), &pDst[1], pix);
+        break;
+
+    case 8:
+        pix = load_partial(const_int(8), &pSrc[1 - iSrcStride]);
+        im2 = extend_low(pix);
+        im2 = (im1 + im2) >> const_int(2);
+        pix = compress(im2, im2);
+        store_partial(const_int(8), &pDst[1], pix);
+        break;
+
+    case 16:
+        pix.load(&pSrc[1 - iSrcStride]);
+        im2 = extend_low(pix);
+        im3 = extend_high(pix);
+        im2 = (im1 + im2) >> const_int(2);
+        im3 = (im1 + im3) >> const_int(2);
+        pix = compress(im2, im3);
+        pix.store(&pDst[1]);
+        break;
+
+    case 32:
+        pix.load(&pSrc[1 - iSrcStride]);
+        im2 = extend_low(pix);
+        im3 = extend_high(pix);
+        im2 = (im1 + im2) >> const_int(2);
+        im3 = (im1 + im3) >> const_int(2);
+        pix = compress(im2, im3);
+        pix.store(&pDst[1]);
+
+        pix.load(&pSrc[1 - iSrcStride + 16]);
+        im2 = extend_low(pix);
+        im3 = extend_high(pix);
+        im2 = (im1 + im2) >> const_int(2);
+        im3 = (im1 + im3) >> const_int(2);
+        pix = compress(im2, im3);
+        pix.store(&pDst[1 + 16]);
+        break;
+
+    //case 64:
+    default:
+        pix.load(&pSrc[1 - iSrcStride]);
+        im2 = extend_low(pix);
+        im3 = extend_high(pix);
+        im2 = (im1 + im2) >> const_int(2);
+        im3 = (im1 + im3) >> const_int(2);
+        pix = compress(im2, im3);
+        pix.store(&pDst[1]);
+
+        pix.load(&pSrc[1 - iSrcStride + 16]);
+        im2 = extend_low(pix);
+        im3 = extend_high(pix);
+        im2 = (im1 + im2) >> const_int(2);
+        im3 = (im1 + im3) >> const_int(2);
+        pix = compress(im2, im3);
+        pix.store(&pDst[1 + 16]);
+
+        pix.load(&pSrc[1 - iSrcStride + 32]);
+        im2 = extend_low(pix);
+        im3 = extend_high(pix);
+        im2 = (im1 + im2) >> const_int(2);
+        im3 = (im1 + im3) >> const_int(2);
+        pix = compress(im2, im3);
+        pix.store(&pDst[1 + 32]);
+
+        pix.load(&pSrc[1 - iSrcStride + 48]);
+        im2 = extend_low(pix);
+        im3 = extend_high(pix);
+        im2 = (im1 + im2) >> const_int(2);
+        im3 = (im1 + im3) >> const_int(2);
+        pix = compress(im2, im3);
+        pix.store(&pDst[1 + 48]);
+        break;
+    }
+
+#endif /* if HIGH_BIT_DEPTH */
+
+    for (y = 1; y < iWidth; y++)
+    {
+        pDst[iDstStride] = (pixel)((pSrc[iSrcStride - 1] + pixDCx3) >> 2);
+        pSrc += iSrcStride;
+        pDst += iDstStride;
+    }
+}
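For reference, the boundary filter above in scalar form (a sketch under the same pixel/intptr_t conventions; the SIMD switch merely vectorizes the top-row pass, and the left column stays scalar in both versions):

    static void dcFilterScalar(const pixel* src, intptr_t srcStride,
                               pixel* dst, intptr_t dstStride, int width)
    {
        int dc = dst[0]; // the block is already filled with the DC value
        dst[0] = (pixel)((src[-srcStride] + src[-1] + 2 * dc + 2) >> 2); // corner
        for (int x = 1; x < width; x++)                                  // top row
            dst[x] = (pixel)((src[x - srcStride] + 3 * dc + 2) >> 2);
        for (int y = 1; y < width; y++)                                  // left column
            dst[y * dstStride] = (pixel)((src[y * srcStride - 1] + 3 * dc + 2) >> 2);
    }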
+
+void predIntraDC(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride, int width, int bFilter)
+{
+    //assert(width == height); // all intra blocks are NxN, so only width is passed
+    //assert(blkAboveAvailable || blkLeftAvailable); // left and above should always be available, since HEVC defines a reference-sample fill process
+    int iSum = 0;
+    int logSize = g_aucConvertToBit[width] + 2;
+    pixel *pSrcAbove = &pSrc[-srcStride];
+    pixel *pSrcLeft = &pSrc[-1];
+
+    for (int iInd = 0; iInd < width; iInd++)
+    {
+        iSum += *pSrcLeft;
+        pSrcLeft += srcStride;
+    }
+
+#if HIGH_BIT_DEPTH
+    Vec8s sumAbove(0);
+    Vec8s m0;
+
+    switch (width)
+    {
+    case 4:
+        sumAbove = load_partial(const_int(8), pSrcAbove);
+        break;
+    case 8:
+        m0.load(pSrcAbove);
+        sumAbove = m0;
+        break;
+    case 16:
+        m0.load(pSrcAbove);
+        sumAbove  = m0;
+        m0.load(pSrcAbove + 8);
+        sumAbove += m0;
+        break;
+    case 32:
+        m0.load(pSrcAbove);
+        sumAbove  = m0;
+        m0.load(pSrcAbove + 8);
+        sumAbove += m0;
+        m0.load(pSrcAbove + 16);
+        sumAbove += m0;
+        m0.load(pSrcAbove + 24);
+        sumAbove += m0;
+        break;
+    //case 64:
+    default:
+        // CHECK_ME: the maximum supported bit depth is 13 bits
+        m0.load(pSrcAbove);
+        sumAbove  = m0;
+        m0.load(pSrcAbove + 8);
+        sumAbove += m0;
+        m0.load(pSrcAbove + 16);
+        sumAbove += m0;
+        m0.load(pSrcAbove + 24);
+        sumAbove += m0;
+        m0.load(pSrcAbove + 32);
+        sumAbove += m0;
+        m0.load(pSrcAbove + 40);
+        sumAbove += m0;
+        m0.load(pSrcAbove + 48);
+        sumAbove += m0;
+        m0.load(pSrcAbove + 56);
+        sumAbove += m0;
+        break;
+    }
+
+    iSum += horizontal_add_x(sumAbove);
+
+    logSize += 1;
+    pixel dcVal = (iSum + (1 << (logSize - 1))) >> logSize;
+    Vec8us dcValN(dcVal);
+    int k;
+
+    pixel *pDst1 = pDst;
+    switch (width)
+    {
+    case 4:
+        store_partial(const_int(8), pDst1, dcValN);
+        pDst1 += dstStride;
+        store_partial(const_int(8), pDst1, dcValN);
+        pDst1 += dstStride;
+        store_partial(const_int(8), pDst1, dcValN);
+        pDst1 += dstStride;
+        store_partial(const_int(8), pDst1, dcValN);
+        pDst1 += dstStride;
+        break;
+
+    case 8:
+        dcValN.store(pDst1);
+        pDst1 += dstStride;
+        dcValN.store(pDst1);
+        pDst1 += dstStride;
+        dcValN.store(pDst1);
+        pDst1 += dstStride;
+        dcValN.store(pDst1);
+        pDst1 += dstStride;
+        dcValN.store(pDst1);
+        pDst1 += dstStride;
+        dcValN.store(pDst1);
+        pDst1 += dstStride;
+        dcValN.store(pDst1);
+        pDst1 += dstStride;
+        dcValN.store(pDst1);
+        pDst1 += dstStride;
+        break;
+
+    case 16:
+        for (k = 0; k < 16; k += 2)
+        {
+            dcValN.store(pDst1);
+            dcValN.store(pDst1 + 8);
+            pDst1 += dstStride;
+            dcValN.store(pDst1);
+            dcValN.store(pDst1 + 8);
+            pDst1 += dstStride;
+        }
+
+        break;
+
+    case 32:
+        for (k = 0; k < 32; k++)
+        {
+            dcValN.store(pDst1);
+            dcValN.store(pDst1 +  8);
+            dcValN.store(pDst1 + 16);
+            dcValN.store(pDst1 + 24);
+            pDst1 += dstStride;
+        }
+
+        break;
+
+    //case 64:
+    default:
+        for (k = 0; k < 64; k++)
+        {
+            dcValN.store(pDst1);
+            dcValN.store(pDst1 +  8);
+            dcValN.store(pDst1 + 16);
+            dcValN.store(pDst1 + 24);
+            dcValN.store(pDst1 + 32);
+            dcValN.store(pDst1 + 40);
+            dcValN.store(pDst1 + 48);
+            dcValN.store(pDst1 + 56);
+            pDst1 += dstStride;
+        }
+
+        break;
+    }
+
+    if (bFilter)
+    {
+        xDCPredFiltering(pSrc, srcStride, pDst, dstStride, width, width);
+    }
+#else // if !HIGH_BIT_DEPTH
+
+    {
+        Vec16uc pix;
+        Vec8us  im;
+        Vec4ui  im1, im2;
+
+        switch (width)
+        {
+        case 4:
+            pix.fromUint32(*(uint32_t*)pSrcAbove);
+            iSum += horizontal_add(extend_low(pix));
+            break;
+        case 8:
+#if X86_64
+            pix.fromUint64(*(uint64_t*)pSrcAbove);
+#else
+            pix.load_partial(8, pSrcAbove);
+#endif
+            iSum += horizontal_add(extend_low(pix));
+            break;
+        case 16:
+            pix.load(pSrcAbove);
+            iSum += horizontal_add_x(pix);
+            break;
+        case 32:
+            pix.load(pSrcAbove);
+            im1 = (Vec4ui)(pix.sad(_mm_setzero_si128()));
+            pix.load(pSrcAbove + 16);
+            im1 += (Vec4ui)(pix.sad(_mm_setzero_si128()));
+            im1 += (Vec4ui)((Vec128b)im1 >> const_int(64));
+            iSum += toInt32(im1);
+            break;
+        //case 64:
+        default:
+            pix.load(pSrcAbove);
+            im1 = (Vec4ui)(pix.sad(_mm_setzero_si128()));
+            pix.load(pSrcAbove + 16);
+            im1 += (Vec4ui)(pix.sad(_mm_setzero_si128()));
+            pix.load(pSrcAbove + 32);
+            im1 += (Vec4ui)(pix.sad(_mm_setzero_si128()));
+            pix.load(pSrcAbove + 48);
+            im1 += (Vec4ui)(pix.sad(_mm_setzero_si128()));
+            im1 += (Vec4ui)((Vec128b)im1 >> const_int(64));
+            //im1 += extract_hi64(im1);
+            iSum += toInt32(im1);
+            break;
+        }
+    }
+
+    logSize += 1;
+    pixel dcVal = (iSum + (1 << (logSize - 1))) >> logSize;
+    Vec16uc dcValN(dcVal);
+    int k;
+
+    pixel *pDst1 = pDst;
+    switch (width)
+    {
+    case 4:
+        store_partial(const_int(4), pDst1, dcValN);
+        pDst1 += dstStride;
+        store_partial(const_int(4), pDst1, dcValN);
+        pDst1 += dstStride;
+        store_partial(const_int(4), pDst1, dcValN);
+        pDst1 += dstStride;
+        store_partial(const_int(4), pDst1, dcValN);
+        break;
+
+    case 8:
+        store_partial(const_int(8), pDst1, dcValN);
+        pDst1 += dstStride;
+        store_partial(const_int(8), pDst1, dcValN);
+        pDst1 += dstStride;
+        store_partial(const_int(8), pDst1, dcValN);
+        pDst1 += dstStride;
+        store_partial(const_int(8), pDst1, dcValN);
+        pDst1 += dstStride;
+        store_partial(const_int(8), pDst1, dcValN);
+        pDst1 += dstStride;
+        store_partial(const_int(8), pDst1, dcValN);
+        pDst1 += dstStride;
+        store_partial(const_int(8), pDst1, dcValN);
+        pDst1 += dstStride;
+        store_partial(const_int(8), pDst1, dcValN);
+        break;
+
+    case 16:
+        for (k = 0; k < 16; k += 4)
+        {
+            store_partial(const_int(16), pDst1, dcValN);
+            pDst1 += dstStride;
+            store_partial(const_int(16), pDst1, dcValN);
+            pDst1 += dstStride;
+            store_partial(const_int(16), pDst1, dcValN);
+            pDst1 += dstStride;
+            store_partial(const_int(16), pDst1, dcValN);
+            pDst1 += dstStride;
+        }
+
+        break;
+
+    case 32:
+        for (k = 0; k < 32; k += 2)
+        {
+            store_partial(const_int(16), pDst1,    dcValN);
+            store_partial(const_int(16), pDst1 + 16, dcValN);
+            pDst1 += dstStride;
+            store_partial(const_int(16), pDst1,    dcValN);
+            store_partial(const_int(16), pDst1 + 16, dcValN);
+            pDst1 += dstStride;
+        }
+
+        break;
+
+    case 64:
+        for (k = 0; k < 64; k++)
+        {
+            store_partial(const_int(16), pDst1,    dcValN);
+            store_partial(const_int(16), pDst1 + 16, dcValN);
+            store_partial(const_int(16), pDst1 + 32, dcValN);
+            store_partial(const_int(16), pDst1 + 48, dcValN);
+            pDst1 += dstStride;
+        }
+
+        break;
+    }
+
+    if (bFilter)
+    {
+        xDCPredFiltering(pSrc, srcStride, pDst, dstStride, width, width);
+    }
+#endif // if HIGH_BIT_DEPTH
+}
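+
+// Scalar sketch of the DC rule vectorized above (for reference; assumes
+// width == height and that both borders are available, matching the notes
+// at the top of predIntraDC):
+//
+//   int sum = 0;
+//   for (int i = 0; i < width; i++)
+//       sum += pSrc[i - srcStride] + pSrc[i * srcStride - 1];
+//   pixel dcVal = (pixel)((sum + width) >> (g_aucConvertToBit[width] + 3));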
+
+#if HIGH_BIT_DEPTH
+// CHECK_ME: I am not sure whether v_rightColumnN will overflow with 12bpp input
+void predIntraPlanar4(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
+{
+    int k, bottomLeft, topRight;
+    // NOTE: 16 bits are enough here, because the input is at most 13 bits; after a left shift by 2 it is 15 bits
+    int16_t leftColumn[4];
+
+    // Get left and above reference column and row
+    Vec8s v_topRow = (Vec8s)load_partial(const_int(8), &pSrc[-srcStride]); // topRow
+
+    for (k = 0; k < 4; k++)
+    {
+        leftColumn[k] = pSrc[k * srcStride - 1];
+    }
+
+    Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), leftColumn);   // leftColumn
+
+    // Prepare intermediate variables used in interpolation
+    bottomLeft = pSrc[4 * srcStride - 1];
+    topRight   = pSrc[4 - srcStride];
+
+    Vec8s v_bottomLeft(bottomLeft);
+    Vec8s v_topRight(topRight);
+
+    Vec8s v_bottomRow = v_bottomLeft - v_topRow;
+    Vec8s v_rightColumn = v_topRight - v_leftColumn;
+
+    v_topRow = v_topRow << const_int(2);
+    v_leftColumn = v_leftColumn << const_int(2);
+
+    // Generate prediction signal
+    Vec8s v_horPred4 = v_leftColumn + Vec8s(4);
+    const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
+    Vec8s v_horPred, v_rightColumnN;
+    Vec8s v_im4;
+    Vec16uc v_im5;
+
+    // line0
+    v_horPred = broadcast(const_int(0), v_horPred4);
+    v_rightColumnN = broadcast(const_int(0), v_rightColumn) * v_multi;
+    v_horPred = v_horPred + v_rightColumnN;
+    v_topRow = v_topRow + v_bottomRow;
+    // CHECK_ME: HM does not clip the pixel here, so I assume the result is at most 12+3=15 bits
+    v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3);
+    store_partial(const_int(8), &rpDst[0 * dstStride], v_im4);
+
+    // line1
+    v_horPred = broadcast(const_int(1), v_horPred4);
+    v_rightColumnN = broadcast(const_int(1), v_rightColumn) * v_multi;
+    v_horPred = v_horPred + v_rightColumnN;
+    v_topRow = v_topRow + v_bottomRow;
+    v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3);
+    store_partial(const_int(8), &rpDst[1 * dstStride], v_im4);
+
+    // line2
+    v_horPred = broadcast(const_int(2), v_horPred4);
+    v_rightColumnN = broadcast(const_int(2), v_rightColumn) * v_multi;
+    v_horPred = v_horPred + v_rightColumnN;
+    v_topRow = v_topRow + v_bottomRow;
+    v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3);
+    store_partial(const_int(8), &rpDst[2 * dstStride], v_im4);
+
+    // line3
+    v_horPred = broadcast(const_int(3), v_horPred4);
+    v_rightColumnN = broadcast(const_int(3), v_rightColumn) * v_multi;
+    v_horPred = v_horPred + v_rightColumnN;
+    v_topRow = v_topRow + v_bottomRow;
+    v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3);
+    store_partial(const_int(8), &rpDst[3 * dstStride], v_im4);
+}
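+
+// NOTE: each row above folds v_bottomRow into v_topRow, so after y steps
+// v_topRow holds (topRow << 2) + (y + 1) * bottomRow -- the same quantity the
+// planar closed form writes as (4 - 1 - y) * top[x] + (y + 1) * bottomLeft,
+// accumulated incrementally to avoid per-row multiplies.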
+
+#else /* if HIGH_BIT_DEPTH */
+void predIntraPlanar4(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
+{
+    int k;
+    pixel bottomLeft, topRight;
+
+    // Get left and above reference column and row
+    Vec16uc im0 = (Vec16uc)load_partial(const_int(4), &pSrc[-srcStride]); // topRow
+    Vec8s v_topRow = extend_low(im0);
+
+    int16_t leftColumn[4];
+
+    for (k = 0; k < 4; k++)
+    {
+        leftColumn[k] = pSrc[k * srcStride - 1];
+    }
+
+    Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), (void*)leftColumn);   // leftColumn
+
+    // Prepare intermediate variables used in interpolation
+    bottomLeft = pSrc[4 * srcStride - 1];
+    topRight   = pSrc[4 - srcStride];
+
+    Vec8s v_bottomLeft(bottomLeft);
+    Vec8s v_topRight(topRight);
+
+    Vec8s v_bottomRow = v_bottomLeft - v_topRow;
+    Vec8s v_rightColumn = v_topRight - v_leftColumn;
+
+    v_topRow = v_topRow << const_int(2);
+    v_leftColumn = v_leftColumn << const_int(2);
+
+    Vec8s v_horPred4 = v_leftColumn + Vec8s(4);
+    const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
+    Vec8s v_horPred, v_rightColumnN;
+    Vec8s v_im4;
+    Vec16uc v_im5;
+
+#define COMP_PRED_PLANAR4_ROW(X) { \
+        v_horPred = broadcast(const_int((X)), v_horPred4); \
+        v_rightColumnN = broadcast(const_int((X)), v_rightColumn) * v_multi; \
+        v_horPred = v_horPred + v_rightColumnN; \
+        v_topRow = v_topRow + v_bottomRow; \
+        v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3); \
+        v_im5 = compress_unsafe(v_im4, v_im4); \
+        store_partial(const_int(4), &rpDst[(X)*dstStride], v_im5); \
+}
+
+    COMP_PRED_PLANAR4_ROW(0)
+    COMP_PRED_PLANAR4_ROW(1)
+    COMP_PRED_PLANAR4_ROW(2)
+    COMP_PRED_PLANAR4_ROW(3)
+
+#undef COMP_PRED_PLANAR4_ROW
+}
+
+#if INSTRSET >= 5
+void predIntraPlanar4_sse4(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
+{
+    pixel bottomLeft, topRight;
+
+    // Get left and above reference column and row
+    __m128i im0 = _mm_cvtsi32_si128(*(uint32_t*)&pSrc[-srcStride]); // topRow
+    __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
+
+    v_topRow = _mm_shuffle_epi32(v_topRow, 0x44);
+
+    // Prepare intermediate variables used in interpolation
+    bottomLeft = pSrc[4 * srcStride - 1];
+    topRight   = pSrc[4 - srcStride];
+
+    __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
+    __m128i v_bottomRow   = _mm_sub_epi16(v_bottomLeft, v_topRow);
+
+    v_topRow = _mm_slli_epi16(v_topRow, 2);
+
+    __m128i v_horPred, v_rightColumnN;
+    __m128i v_im4;
+    __m128i v_im5;
+    __m128i _tmp0, _tmp1;
+
+    __m128i v_bottomRowL = _mm_unpacklo_epi64(v_bottomRow, _mm_setzero_si128());
+    v_topRow = _mm_sub_epi16(v_topRow, v_bottomRowL);
+    v_bottomRow = _mm_slli_epi16(v_bottomRow, 1);
+
+#define COMP_PRED_PLANAR_2ROW(Y) { \
+        _tmp0 = _mm_cvtsi32_si128((pSrc[((Y)) * srcStride - 1] << 2) + 4); \
+        _tmp0 = _mm_shufflelo_epi16(_tmp0, 0); \
+        _tmp1 = _mm_cvtsi32_si128((pSrc[((Y)+1) * srcStride - 1] << 2) + 4); \
+        _tmp1 = _mm_shufflelo_epi16(_tmp1, 0); \
+        v_horPred = _mm_unpacklo_epi64(_tmp0, _tmp1); \
+        _tmp0 = _mm_cvtsi32_si128(topRight - pSrc[((Y)) * srcStride - 1]); \
+        _tmp0 = _mm_shufflelo_epi16(_tmp0, 0); \
+        _tmp1 = _mm_cvtsi32_si128(topRight - pSrc[((Y)+1) * srcStride - 1]); \
+        _tmp1 = _mm_shufflelo_epi16(_tmp1, 0); \
+        v_rightColumnN = _mm_unpacklo_epi64(_tmp0, _tmp1); \
+        v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multi_2Row); \
+        v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \
+        v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \
+        v_im4 = _mm_srai_epi16(_mm_add_epi16(v_horPred, v_topRow), 3); \
+        v_im5 = _mm_packus_epi16(v_im4, v_im4); \
+        *(uint32_t*)&rpDst[(Y)*dstStride] = _mm_cvtsi128_si32(v_im5); \
+        *(uint32_t*)&rpDst[((Y)+1) * dstStride] = _mm_cvtsi128_si32(_mm_shuffle_epi32(v_im5, 0x55)); \
+}
+
+    COMP_PRED_PLANAR_2ROW(0)
+    COMP_PRED_PLANAR_2ROW(2)
+
+#undef COMP_PRED_PLANAR_2ROW
+}
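+
+// The SSE4 variant above packs rows Y and Y+1 into the two 64-bit halves of
+// one register (via _mm_unpacklo_epi64), so each COMP_PRED_PLANAR_2ROW step
+// interpolates two 4-pixel rows at once; v_bottomRow is doubled beforehand so
+// a single add per step advances the vertical term by two rows.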
+
+#endif // INSTRSET >= 5
+
+#endif /* if HIGH_BIT_DEPTH */
+
+#if HIGH_BIT_DEPTH
+
+#define COMP_PRED_PLANAR_ROW(X) { \
+        v_horPred = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \
+        v_rightColumnN = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn) * v_multi; \
+        v_horPred = v_horPred + v_rightColumnN; \
+        v_topRow = v_topRow + v_bottomRow; \
+        v_im4 = (Vec8s)(v_horPred + v_topRow) >> (3 + shift); \
+        store_partial(const_int(16), &rpDst[X * dstStride], v_im4); \
+}
+
+void predIntraPlanar8(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
+{
+    int k, bottomLeft, topRight;
+
+    int16_t leftColumn[8];
+
+    // Get left and above reference column and row
+    Vec8s v_topRow = (Vec8s)load_partial(const_int(16), &pSrc[-srcStride]); // topRow
+
+    for (k = 0; k < 8; k++)
+    {
+        leftColumn[k] = pSrc[k * srcStride - 1];
+    }
+
+    Vec8s v_leftColumn = (Vec8s)load_partial(const_int(16), leftColumn);   // leftColumn
+
+    // Prepare intermediate variables used in interpolation
+    bottomLeft = pSrc[8 * srcStride - 1];
+    topRight   = pSrc[8 - srcStride];
+
+    Vec8s v_bottomLeft(bottomLeft);
+    Vec8s v_topRight(topRight);
+
+    Vec8s v_bottomRow = v_bottomLeft - v_topRow;
+    Vec8s v_rightColumn = v_topRight - v_leftColumn;
+
+    int shift = g_aucConvertToBit[8];          // Using value corresponding to width = 8
+    v_topRow = v_topRow << (2 + shift);
+    v_leftColumn = v_leftColumn << (2 + shift);
+
+    // Generate prediction signal
+    Vec8s v_horPred4 = v_leftColumn + Vec8s(8);
+    const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
+    Vec8s v_horPred, v_rightColumnN;
+    Vec8s v_im4;
+    Vec16uc v_im5;
+
+    COMP_PRED_PLANAR_ROW(0);     // row 0
+    COMP_PRED_PLANAR_ROW(1);
+    COMP_PRED_PLANAR_ROW(2);
+    COMP_PRED_PLANAR_ROW(3);
+    COMP_PRED_PLANAR_ROW(4);
+    COMP_PRED_PLANAR_ROW(5);
+    COMP_PRED_PLANAR_ROW(6);
+    COMP_PRED_PLANAR_ROW(7);     // row 7
+}
+
+#undef COMP_PRED_PLANAR_ROW
+#else /* if HIGH_BIT_DEPTH */
+
+#define COMP_PRED_PLANAR_ROW(X) { \
+        v_horPred = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \
+        v_rightColumnN = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn) * v_multi; \
+        v_horPred = v_horPred + v_rightColumnN; \
+        v_topRow = v_topRow + v_bottomRow; \
+        v_im4 = (Vec8s)(v_horPred + v_topRow) >> (3 + shift); \
+        v_im5 = compress(v_im4, v_im4); \
+        store_partial(const_int(8), &rpDst[X * dstStride], v_im5); \
+}
+
+void predIntraPlanar8(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
+{
+    int k;
+    pixel bottomLeft, topRight;
+    int16_t leftColumn[8];
+
+    // Get left and above reference column and row
+    Vec16uc im0 = (Vec16uc)load_partial(const_int(8), &pSrc[-srcStride]); // topRow
+    Vec8s v_topRow = extend_low(im0);
+
+    for (k = 0; k < 8; k++)
+    {
+        leftColumn[k] = pSrc[k * srcStride - 1];
+    }
+
+    Vec8s v_leftColumn;
+    v_leftColumn.load(leftColumn);   // leftColumn
+
+    // Prepare intermediate variables used in interpolation
+    bottomLeft = pSrc[8 * srcStride - 1];
+    topRight   = pSrc[8 - srcStride];
+
+    Vec8s v_bottomLeft(bottomLeft);
+    Vec8s v_topRight(topRight);
+
+    Vec8s v_bottomRow = v_bottomLeft - v_topRow;
+    Vec8s v_rightColumn = v_topRight - v_leftColumn;
+
+    int shift = g_aucConvertToBit[8];         // Using value corresponding to width = 8
+    v_topRow = v_topRow << (2 + shift);
+    v_leftColumn = v_leftColumn << (2 + shift);
+
+    Vec8s v_horPred4 = v_leftColumn + Vec8s(8);
+    const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
+    Vec8s v_horPred, v_rightColumnN;
+    Vec8s v_im4;
+    Vec16uc v_im5;
+
+    COMP_PRED_PLANAR_ROW(0);     // row 0
+    COMP_PRED_PLANAR_ROW(1);
+    COMP_PRED_PLANAR_ROW(2);
+    COMP_PRED_PLANAR_ROW(3);
+    COMP_PRED_PLANAR_ROW(4);
+    COMP_PRED_PLANAR_ROW(5);
+    COMP_PRED_PLANAR_ROW(6);
+    COMP_PRED_PLANAR_ROW(7);     // row 7
+}
+
+#undef COMP_PRED_PLANAR_ROW
+
+#if INSTRSET >= 5
+void predIntraPlanar8_sse4(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride)
+{
+    pixel bottomLeft, topRight;
+
+    // Get left and above reference column and row
+    __m128i im0 = _mm_loadl_epi64((__m128i*)&pSrc[0 - srcStride]); // topRow
+    __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
+
+    __m128i v_leftColumn = _mm_setzero_si128();
+
+    v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[0 * srcStride - 1], 0);
+    v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[1 * srcStride - 1], 1);
+    v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[2 * srcStride - 1], 2);
+    v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[3 * srcStride - 1], 3);
+    v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[4 * srcStride - 1], 4);
+    v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[5 * srcStride - 1], 5);
+    v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[6 * srcStride - 1], 6);
+    v_leftColumn = _mm_insert_epi8(v_leftColumn, pSrc[7 * srcStride - 1], 7);
+    v_leftColumn = _mm_unpacklo_epi8(v_leftColumn, _mm_setzero_si128());
+
+    // Prepare intermediate variables used in interpolation
+    bottomLeft = pSrc[8 * srcStride - 1];
+    topRight   = pSrc[8 - srcStride];
+
+    __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
+    __m128i v_topRight   = _mm_set1_epi16(topRight);
+
+    __m128i v_bottomRow   = _mm_sub_epi16(v_bottomLeft, v_topRow);
+    __m128i v_rightColumn = _mm_sub_epi16(v_topRight, v_leftColumn);
+
+    v_topRow = _mm_slli_epi16(v_topRow, 3);
+    v_leftColumn = _mm_slli_epi16(v_leftColumn, 3);
+
+    __m128i v_horPred4 = _mm_add_epi16(v_leftColumn, _mm_set1_epi16(8));
+    __m128i v_horPred, v_rightColumnN;
+    __m128i v_im4;
+    __m128i v_im5;
+
+#define COMP_PRED_PLANAR_ROW(Y) { \
+        if ((Y) < 4) { \
+            v_horPred = _mm_shufflelo_epi16(v_horPred4, ((Y) & 3) * 0x55); \
+            v_horPred = _mm_unpacklo_epi64(v_horPred, v_horPred); \
+            v_rightColumnN = _mm_shufflelo_epi16(v_rightColumn, ((Y) & 3) * 0x55); \
+            v_rightColumnN = _mm_unpacklo_epi64(v_rightColumnN, v_rightColumnN); \
+        } \
+        else { \
+            v_horPred = _mm_shufflehi_epi16(v_horPred4, ((Y) & 3) * 0x55); \
+            v_horPred = _mm_unpackhi_epi64(v_horPred, v_horPred); \
+            v_rightColumnN = _mm_shufflehi_epi16(v_rightColumn, ((Y) & 3) * 0x55); \
+            v_rightColumnN = _mm_unpackhi_epi64(v_rightColumnN, v_rightColumnN); \
+        } \
+        v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multiL); \
+        v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \
+        v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \
+        v_im4 = _mm_srai_epi16(_mm_add_epi16(v_horPred, v_topRow), 4); \
+        v_im5 = _mm_packus_epi16(v_im4, v_im4); \
+        _mm_storel_epi64((__m128i*)&pDst[(Y)*dstStride], v_im5); \
+}
+
+    COMP_PRED_PLANAR_ROW(0)
+    COMP_PRED_PLANAR_ROW(1)
+    COMP_PRED_PLANAR_ROW(2)
+    COMP_PRED_PLANAR_ROW(3)
+    COMP_PRED_PLANAR_ROW(4)
+    COMP_PRED_PLANAR_ROW(5)
+    COMP_PRED_PLANAR_ROW(6)
+    COMP_PRED_PLANAR_ROW(7)
+
+#undef COMP_PRED_PLANAR_ROW
+}
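+
+// Row broadcast above: ((Y) & 3) * 0x55 builds a shuffle control that
+// replicates word Y across a 64-bit half, and the following unpack{lo,hi}
+// copies that half through the whole register -- an SSE2-level substitute
+// for a per-lane word broadcast.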
+
+#endif // INSTRSET >= 5
+
+#endif /* if HIGH_BIT_DEPTH */
+
+#if HIGH_BIT_DEPTH
+#define COMP_PRED_PLANAR_ROW(X) { \
+        v_horPred_lo = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \
+        v_horPred_hi = v_horPred_lo; \
+        v_rightColumnN_lo = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn); \
+        v_rightColumnN_hi = v_rightColumnN_lo; \
+        v_rightColumnN_lo *= v_multi_lo; \
+        v_rightColumnN_hi *= v_multi_hi; \
+        v_horPred_lo = v_horPred_lo + v_rightColumnN_lo; \
+        v_horPred_hi = v_horPred_hi + v_rightColumnN_hi; \
+        v_topRow_lo = v_topRow_lo + v_bottomRow_lo; \
+        v_topRow_hi = v_topRow_hi + v_bottomRow_hi; \
+        v_im4_lo = (Vec8s)(v_horPred_lo + v_topRow_lo) >> (3 + shift); \
+        v_im4_hi = (Vec8s)(v_horPred_hi + v_topRow_hi) >> (3 + shift); \
+        v_im4_lo.store(&rpDst[X * dstStride]); \
+        v_im4_hi.store(&rpDst[X * dstStride + 8]); \
+}
+
+void predIntraPlanar16(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
+{
+    int k;
+    pixel bottomLeft, topRight;
+    int16_t leftColumn[16];
+
+    // Get left and above reference column and row
+    Vec8s v_topRow_lo, v_topRow_hi;
+
+    v_topRow_lo.load(&pSrc[-srcStride]);
+    v_topRow_hi.load(&pSrc[-srcStride + 8]);
+
+    for (k = 0; k < 16; k++)
+    {
+        leftColumn[k] = pSrc[k * srcStride - 1];
+    }
+
+    Vec8s v_leftColumn;
+    v_leftColumn.load(leftColumn);   // leftColumn
+
+    // Prepare intermediate variables used in interpolation
+    bottomLeft = pSrc[16 * srcStride - 1];
+    topRight   = pSrc[16 - srcStride];
+
+    Vec8s v_bottomLeft(bottomLeft);
+    Vec8s v_topRight(topRight);
+
+    Vec8s v_bottomRow_lo = v_bottomLeft - v_topRow_lo;
+    Vec8s v_bottomRow_hi = v_bottomLeft - v_topRow_hi;
+    Vec8s v_rightColumn = v_topRight - v_leftColumn;
+
+    int shift = g_aucConvertToBit[16];         // Using value corresponding to width = 16
+    v_topRow_lo = v_topRow_lo << (2 + shift);
+    v_topRow_hi = v_topRow_hi << (2 + shift);
+    v_leftColumn = v_leftColumn << (2 + shift);
+
+    Vec8s v_horPred4 = v_leftColumn + Vec8s(16);
+    const Vec8s v_multi_lo(1, 2, 3, 4, 5, 6, 7, 8);
+    const Vec8s v_multi_hi(9, 10, 11, 12, 13, 14, 15, 16);
+    Vec8s v_horPred_lo, v_horPred_hi, v_rightColumnN_lo, v_rightColumnN_hi;
+    Vec8s v_im4_lo, v_im4_hi;
+    Vec16uc v_im5;
+
+    COMP_PRED_PLANAR_ROW(0);     // row 0
+    COMP_PRED_PLANAR_ROW(1);
+    COMP_PRED_PLANAR_ROW(2);
+    COMP_PRED_PLANAR_ROW(3);
+    COMP_PRED_PLANAR_ROW(4);
+    COMP_PRED_PLANAR_ROW(5);
+    COMP_PRED_PLANAR_ROW(6);
+    COMP_PRED_PLANAR_ROW(7);     // row 7
+
+    v_leftColumn.load(leftColumn + 8);   // leftColumn lower 8 rows
+    v_rightColumn = v_topRight - v_leftColumn;
+    v_leftColumn = v_leftColumn << (2 + shift);
+    v_horPred4 = v_leftColumn + Vec8s(16);
+
+    COMP_PRED_PLANAR_ROW(8);     // row 8
+    COMP_PRED_PLANAR_ROW(9);
+    COMP_PRED_PLANAR_ROW(10);
+    COMP_PRED_PLANAR_ROW(11);
+    COMP_PRED_PLANAR_ROW(12);
+    COMP_PRED_PLANAR_ROW(13);
+    COMP_PRED_PLANAR_ROW(14);
+    COMP_PRED_PLANAR_ROW(15);
+}
+
+#undef COMP_PRED_PLANAR_ROW
+
+#else /* if HIGH_BIT_DEPTH */
+#define COMP_PRED_PLANAR_ROW(X) { \
+        v_horPred_lo = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \
+        v_horPred_hi = v_horPred_lo; \
+        v_rightColumnN_lo = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn); \
+        v_rightColumnN_hi = v_rightColumnN_lo; \
+        v_rightColumnN_lo *= v_multi_lo; \
+        v_rightColumnN_hi *= v_multi_hi; \
+        v_horPred_lo = v_horPred_lo + v_rightColumnN_lo; \
+        v_horPred_hi = v_horPred_hi + v_rightColumnN_hi; \
+        v_topRow_lo = v_topRow_lo + v_bottomRow_lo; \
+        v_topRow_hi = v_topRow_hi + v_bottomRow_hi; \
+        v_im4_lo = (Vec8s)(v_horPred_lo + v_topRow_lo) >> (3 + shift); \
+        v_im4_hi = (Vec8s)(v_horPred_hi + v_topRow_hi) >> (3 + shift); \
+        v_im5 = compress(v_im4_lo, v_im4_hi); \
+        store_partial(const_int(16), &rpDst[X * dstStride], v_im5); \
+}
+
+void predIntraPlanar16(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride)
+{
+    int k;
+    pixel bottomLeft, topRight;
+    int16_t leftColumn[16];
+
+    // Get left and above reference column and row
+    Vec16uc im0 = (Vec16uc)load_partial(const_int(16), &pSrc[-srcStride]); // topRow
+    Vec8s v_topRow_lo = extend_low(im0);
+    Vec8s v_topRow_hi = extend_high(im0);
+
+    for (k = 0; k < 16; k++)
+    {
+        leftColumn[k] = pSrc[k * srcStride - 1];
+    }
+
+    Vec8s v_leftColumn;
+    v_leftColumn.load(leftColumn);   // leftColumn
+
+    // Prepare intermediate variables used in interpolation
+    bottomLeft = pSrc[16 * srcStride - 1];
+    topRight   = pSrc[16 - srcStride];
+
+    Vec8s v_bottomLeft(bottomLeft);
+    Vec8s v_topRight(topRight);
+
+    Vec8s v_bottomRow_lo = v_bottomLeft - v_topRow_lo;
+    Vec8s v_bottomRow_hi = v_bottomLeft - v_topRow_hi;
+    Vec8s v_rightColumn = v_topRight - v_leftColumn;
+
+    int shift = g_aucConvertToBit[16];         // Using value corresponding to width = 16
+    v_topRow_lo = v_topRow_lo << (2 + shift);
+    v_topRow_hi = v_topRow_hi << (2 + shift);
+    v_leftColumn = v_leftColumn << (2 + shift);
+
+    Vec8s v_horPred4 = v_leftColumn + Vec8s(16);
+    const Vec8s v_multi_lo(1, 2, 3, 4, 5, 6, 7, 8);
+    const Vec8s v_multi_hi(9, 10, 11, 12, 13, 14, 15, 16);
+    Vec8s v_horPred_lo, v_horPred_hi, v_rightColumnN_lo, v_rightColumnN_hi;
+    Vec8s v_im4_lo, v_im4_hi;
+    Vec16uc v_im5;
+
+    COMP_PRED_PLANAR_ROW(0);     // row 0
+    COMP_PRED_PLANAR_ROW(1);
+    COMP_PRED_PLANAR_ROW(2);
+    COMP_PRED_PLANAR_ROW(3);
+    COMP_PRED_PLANAR_ROW(4);
+    COMP_PRED_PLANAR_ROW(5);
+    COMP_PRED_PLANAR_ROW(6);
+    COMP_PRED_PLANAR_ROW(7);     // row 7
+
+    v_leftColumn.load(leftColumn + 8);   // leftColumn lower 8 rows
+    v_rightColumn = v_topRight - v_leftColumn;
+    v_leftColumn = v_leftColumn << (2 + shift);
+    v_horPred4 = v_leftColumn + Vec8s(16);
+
+    COMP_PRED_PLANAR_ROW(8);     // row 8
+    COMP_PRED_PLANAR_ROW(9);
+    COMP_PRED_PLANAR_ROW(10);
+    COMP_PRED_PLANAR_ROW(11);
+    COMP_PRED_PLANAR_ROW(12);
+    COMP_PRED_PLANAR_ROW(13);
+    COMP_PRED_PLANAR_ROW(14);
+    COMP_PRED_PLANAR_ROW(15);
+}
+
+#undef COMP_PRED_PLANAR_ROW
+
+#if INSTRSET >= 5
+void predIntraPlanar16_sse4(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride)
+{
+    pixel bottomLeft, topRight;
+    __m128i v_topRow[2];
+    __m128i v_bottomRow[2];
+
+    // Get left and above reference column and row
+    __m128i im0 = _mm_loadu_si128((__m128i*)&pSrc[0 - srcStride]); // topRow
+
+    v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
+    v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());
+
+    // Prepare intermediate variables used in interpolation
+    bottomLeft = pSrc[16 * srcStride - 1];
+    topRight   = pSrc[16 - srcStride];
+
+    __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
+
+    v_bottomRow[0] = _mm_sub_epi16(v_bottomLeft, v_topRow[0]);
+    v_bottomRow[1] = _mm_sub_epi16(v_bottomLeft, v_topRow[1]);
+
+    v_topRow[0] = _mm_slli_epi16(v_topRow[0], 4);
+    v_topRow[1] = _mm_slli_epi16(v_topRow[1], 4);
+
+    __m128i v_horPred, v_horPredN[2], v_rightColumnN[2];
+    __m128i v_im4L, v_im4H;
+    __m128i v_im5;
+
+#define COMP_PRED_PLANAR_ROW(Y) { \
+        v_horPred = _mm_cvtsi32_si128((pSrc[(Y)*srcStride - 1] << 4) + 16); \
+        v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \
+        v_horPred = _mm_shuffle_epi32(v_horPred, 0); \
+        __m128i _tmp = _mm_cvtsi32_si128(topRight - pSrc[(Y)*srcStride - 1]); \
+        _tmp = _mm_shufflelo_epi16(_tmp, 0); \
+        _tmp = _mm_shuffle_epi32(_tmp, 0); \
+        v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \
+        v_rightColumnN[1] = _mm_mullo_epi16(_tmp, v_multiH); \
+        v_horPredN[0] = _mm_add_epi16(v_horPred, v_rightColumnN[0]); \
+        v_horPredN[1] = _mm_add_epi16(v_horPred, v_rightColumnN[1]); \
+        v_topRow[0] = _mm_add_epi16(v_topRow[0], v_bottomRow[0]); \
+        v_topRow[1] = _mm_add_epi16(v_topRow[1], v_bottomRow[1]); \
+        v_im4L = _mm_srai_epi16(_mm_add_epi16(v_horPredN[0], v_topRow[0]), 5); \
+        v_im4H = _mm_srai_epi16(_mm_add_epi16(v_horPredN[1], v_topRow[1]), 5); \
+        v_im5 = _mm_packus_epi16(v_im4L, v_im4H); \
+        _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride], v_im5); \
+}
+
+    COMP_PRED_PLANAR_ROW(0)
+    COMP_PRED_PLANAR_ROW(1)
+    COMP_PRED_PLANAR_ROW(2)
+    COMP_PRED_PLANAR_ROW(3)
+    COMP_PRED_PLANAR_ROW(4)
+    COMP_PRED_PLANAR_ROW(5)
+    COMP_PRED_PLANAR_ROW(6)
+    COMP_PRED_PLANAR_ROW(7)
+    COMP_PRED_PLANAR_ROW(8)
+    COMP_PRED_PLANAR_ROW(9)
+    COMP_PRED_PLANAR_ROW(10)
+    COMP_PRED_PLANAR_ROW(11)
+    COMP_PRED_PLANAR_ROW(12)
+    COMP_PRED_PLANAR_ROW(13)
+    COMP_PRED_PLANAR_ROW(14)
+    COMP_PRED_PLANAR_ROW(15)
+
+#undef COMP_PRED_PLANAR_ROW
+}
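+
+// Unlike the Vec8s version, this variant rebuilds the per-row horizontal term
+// directly from the left-neighbour pixel: (pSrc[(Y)*srcStride - 1] << 4) + 16
+// is leftColumn << (2 + shift) plus the rounding offset, so no left-column
+// gather is needed.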
+
+#endif // INSTRSET >= 5
+
+#if INSTRSET >= 5
+void predIntraPlanar32_sse4(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride)
+{
+    pixel bottomLeft, topRight;
+    __m128i v_topRow[4];
+    __m128i v_bottomRow[4];
+
+    // Get left and above reference column and row
+    __m128i im0 = _mm_loadu_si128((__m128i*)&pSrc[0 - srcStride]); // topRow
+    __m128i im1 = _mm_loadu_si128((__m128i*)&pSrc[16 - srcStride]); // topRow
+
+    v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
+    v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());
+    v_topRow[2] = _mm_unpacklo_epi8(im1, _mm_setzero_si128());
+    v_topRow[3] = _mm_unpackhi_epi8(im1, _mm_setzero_si128());
+
+    // Prepare intermediate variables used in interpolation
+    bottomLeft = pSrc[32 * srcStride - 1];
+    topRight   = pSrc[32 - srcStride];
+
+    __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
+
+    v_bottomRow[0] = _mm_sub_epi16(v_bottomLeft, v_topRow[0]);
+    v_bottomRow[1] = _mm_sub_epi16(v_bottomLeft, v_topRow[1]);
+    v_bottomRow[2] = _mm_sub_epi16(v_bottomLeft, v_topRow[2]);
+    v_bottomRow[3] = _mm_sub_epi16(v_bottomLeft, v_topRow[3]);
+
+    v_topRow[0] = _mm_slli_epi16(v_topRow[0], 5);
+    v_topRow[1] = _mm_slli_epi16(v_topRow[1], 5);
+    v_topRow[2] = _mm_slli_epi16(v_topRow[2], 5);
+    v_topRow[3] = _mm_slli_epi16(v_topRow[3], 5);
+
+    __m128i v_horPred, v_horPredN[4], v_rightColumnN[4];
+    __m128i v_im4[4];
+    __m128i v_im5[2];
+
+#define COMP_PRED_PLANAR_ROW(Y) { \
+        v_horPred = _mm_cvtsi32_si128((pSrc[(Y)*srcStride - 1] << 5) + 32); \
+        v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \
+        v_horPred = _mm_shuffle_epi32(v_horPred, 0); \
+        __m128i _tmp = _mm_cvtsi32_si128(topRight - pSrc[(Y)*srcStride - 1]); \
+        _tmp = _mm_shufflelo_epi16(_tmp, 0); \
+        _tmp = _mm_shuffle_epi32(_tmp, 0); \
+        v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \
+        v_rightColumnN[1] = _mm_mullo_epi16(_tmp, v_multiH); \
+        v_rightColumnN[2] = _mm_mullo_epi16(_tmp, v_multiH2); \
+        v_rightColumnN[3] = _mm_mullo_epi16(_tmp, v_multiH3); \
+        v_horPredN[0] = _mm_add_epi16(v_horPred, v_rightColumnN[0]); \
+        v_horPredN[1] = _mm_add_epi16(v_horPred, v_rightColumnN[1]); \
+        v_horPredN[2] = _mm_add_epi16(v_horPred, v_rightColumnN[2]); \
+        v_horPredN[3] = _mm_add_epi16(v_horPred, v_rightColumnN[3]); \
+        v_topRow[0] = _mm_add_epi16(v_topRow[0], v_bottomRow[0]); \
+        v_topRow[1] = _mm_add_epi16(v_topRow[1], v_bottomRow[1]); \
+        v_topRow[2] = _mm_add_epi16(v_topRow[2], v_bottomRow[2]); \
+        v_topRow[3] = _mm_add_epi16(v_topRow[3], v_bottomRow[3]); \
+        v_im4[0] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[0], v_topRow[0]), 6); \
+        v_im4[1] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[1], v_topRow[1]), 6); \
+        v_im4[2] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[2], v_topRow[2]), 6); \
+        v_im4[3] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[3], v_topRow[3]), 6); \
+        v_im5[0] = _mm_packus_epi16(v_im4[0], v_im4[1]); \
+        v_im5[1] = _mm_packus_epi16(v_im4[2], v_im4[3]); \
+        _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride], v_im5[0]); \
+        _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride + 16], v_im5[1]); \
+}
+
+    int i;
+    for (i = 0; i < 32; i += 2)
+    {
+        COMP_PRED_PLANAR_ROW(i + 0);
+        COMP_PRED_PLANAR_ROW(i + 1);
+    }
+
+#undef COMP_PRED_PLANAR_ROW
+}
+
+#endif // INSTRSET >= 5
+
+#if INSTRSET >= 5
+void predIntraPlanar64_sse4(pixel* pSrc, intptr_t srcStride, pixel* pDst, intptr_t dstStride)
+{
+    pixel bottomLeft, topRight;
+    __m128i v_topRow[8];
+    __m128i v_bottomRow[8];
+
+    // Get left and above reference column and row
+    __m128i im0 = _mm_loadu_si128((__m128i*)&pSrc[0 - srcStride]); // topRow
+    __m128i im1 = _mm_loadu_si128((__m128i*)&pSrc[16 - srcStride]); // topRow
+    __m128i im2 = _mm_loadu_si128((__m128i*)&pSrc[32 - srcStride]); // topRow
+    __m128i im3 = _mm_loadu_si128((__m128i*)&pSrc[48 - srcStride]); // topRow
+
+    v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
+    v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());
+    v_topRow[2] = _mm_unpacklo_epi8(im1, _mm_setzero_si128());
+    v_topRow[3] = _mm_unpackhi_epi8(im1, _mm_setzero_si128());
+    v_topRow[4] = _mm_unpacklo_epi8(im2, _mm_setzero_si128());
+    v_topRow[5] = _mm_unpackhi_epi8(im2, _mm_setzero_si128());
+    v_topRow[6] = _mm_unpacklo_epi8(im3, _mm_setzero_si128());
+    v_topRow[7] = _mm_unpackhi_epi8(im3, _mm_setzero_si128());
+
+    // Prepare intermediate variables used in interpolation
+    bottomLeft = pSrc[64 * srcStride - 1];
+    topRight   = pSrc[64 - srcStride];
+
+    __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
+
+    v_bottomRow[0] = _mm_sub_epi16(v_bottomLeft, v_topRow[0]);
+    v_bottomRow[1] = _mm_sub_epi16(v_bottomLeft, v_topRow[1]);
+    v_bottomRow[2] = _mm_sub_epi16(v_bottomLeft, v_topRow[2]);
+    v_bottomRow[3] = _mm_sub_epi16(v_bottomLeft, v_topRow[3]);
+    v_bottomRow[4] = _mm_sub_epi16(v_bottomLeft, v_topRow[4]);
+    v_bottomRow[5] = _mm_sub_epi16(v_bottomLeft, v_topRow[5]);
+    v_bottomRow[6] = _mm_sub_epi16(v_bottomLeft, v_topRow[6]);
+    v_bottomRow[7] = _mm_sub_epi16(v_bottomLeft, v_topRow[7]);
+
+    v_topRow[0] = _mm_slli_epi16(v_topRow[0], 6);
+    v_topRow[1] = _mm_slli_epi16(v_topRow[1], 6);
+    v_topRow[2] = _mm_slli_epi16(v_topRow[2], 6);
+    v_topRow[3] = _mm_slli_epi16(v_topRow[3], 6);
+    v_topRow[4] = _mm_slli_epi16(v_topRow[4], 6);
+    v_topRow[5] = _mm_slli_epi16(v_topRow[5], 6);
+    v_topRow[6] = _mm_slli_epi16(v_topRow[6], 6);
+    v_topRow[7] = _mm_slli_epi16(v_topRow[7], 6);
+
+    __m128i v_horPred, v_horPredN[8], v_rightColumnN[8];
+    __m128i v_im4[8];
+    __m128i v_im5[4];
+
+#define COMP_PRED_PLANAR_ROW(Y) { \
+        v_horPred = _mm_cvtsi32_si128((pSrc[(Y)*srcStride - 1] << 6) + 64); \
+        v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \
+        v_horPred = _mm_shuffle_epi32(v_horPred, 0); \
+        __m128i _tmp = _mm_cvtsi32_si128(topRight - pSrc[(Y)*srcStride - 1]); \
+        _tmp = _mm_shufflelo_epi16(_tmp, 0); \
+        _tmp = _mm_shuffle_epi32(_tmp, 0); \
+        v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \
+        v_rightColumnN[1] = _mm_mullo_epi16(_tmp, v_multiH); \
+        v_rightColumnN[2] = _mm_mullo_epi16(_tmp, v_multiH2); \
+        v_rightColumnN[3] = _mm_mullo_epi16(_tmp, v_multiH3); \
+        v_rightColumnN[4] = _mm_mullo_epi16(_tmp, v_multiH4); \
+        v_rightColumnN[5] = _mm_mullo_epi16(_tmp, v_multiH5); \
+        v_rightColumnN[6] = _mm_mullo_epi16(_tmp, v_multiH6); \
+        v_rightColumnN[7] = _mm_mullo_epi16(_tmp, v_multiH7); \
+        v_horPredN[0] = _mm_add_epi16(v_horPred, v_rightColumnN[0]); \
+        v_horPredN[1] = _mm_add_epi16(v_horPred, v_rightColumnN[1]); \
+        v_horPredN[2] = _mm_add_epi16(v_horPred, v_rightColumnN[2]); \
+        v_horPredN[3] = _mm_add_epi16(v_horPred, v_rightColumnN[3]); \
+        v_horPredN[4] = _mm_add_epi16(v_horPred, v_rightColumnN[4]); \
+        v_horPredN[5] = _mm_add_epi16(v_horPred, v_rightColumnN[5]); \
+        v_horPredN[6] = _mm_add_epi16(v_horPred, v_rightColumnN[6]); \
+        v_horPredN[7] = _mm_add_epi16(v_horPred, v_rightColumnN[7]); \
+        v_topRow[0] = _mm_add_epi16(v_topRow[0], v_bottomRow[0]); \
+        v_topRow[1] = _mm_add_epi16(v_topRow[1], v_bottomRow[1]); \
+        v_topRow[2] = _mm_add_epi16(v_topRow[2], v_bottomRow[2]); \
+        v_topRow[3] = _mm_add_epi16(v_topRow[3], v_bottomRow[3]); \
+        v_topRow[4] = _mm_add_epi16(v_topRow[4], v_bottomRow[4]); \
+        v_topRow[5] = _mm_add_epi16(v_topRow[5], v_bottomRow[5]); \
+        v_topRow[6] = _mm_add_epi16(v_topRow[6], v_bottomRow[6]); \
+        v_topRow[7] = _mm_add_epi16(v_topRow[7], v_bottomRow[7]); \
+        v_im4[0] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[0], v_topRow[0]), 7); \
+        v_im4[1] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[1], v_topRow[1]), 7); \
+        v_im4[2] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[2], v_topRow[2]), 7); \
+        v_im4[3] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[3], v_topRow[3]), 7); \
+        v_im4[4] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[4], v_topRow[4]), 7); \
+        v_im4[5] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[5], v_topRow[5]), 7); \
+        v_im4[6] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[6], v_topRow[6]), 7); \
+        v_im4[7] = _mm_srai_epi16(_mm_add_epi16(v_horPredN[7], v_topRow[7]), 7); \
+        v_im5[0] = _mm_packus_epi16(v_im4[0], v_im4[1]); \
+        v_im5[1] = _mm_packus_epi16(v_im4[2], v_im4[3]); \
+        v_im5[2] = _mm_packus_epi16(v_im4[4], v_im4[5]); \
+        v_im5[3] = _mm_packus_epi16(v_im4[6], v_im4[7]); \
+        _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride], v_im5[0]); \
+        _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride + 16], v_im5[1]); \
+        _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride + 32], v_im5[2]); \
+        _mm_storeu_si128((__m128i*)&pDst[(Y)*dstStride + 48], v_im5[3]); \
+}
+
+    int i;
+    for (i = 0; i < 64; i++)
+    {
+        COMP_PRED_PLANAR_ROW(i);
+    }
+
+#undef COMP_PRED_PLANAR_ROW
+}
+
+#endif // INSTRSET >= 5
+
+#endif /* if HIGH_BIT_DEPTH */
+
+typedef void predIntraPlanar_t (pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride);
+predIntraPlanar_t *intraPlanarN[] =
+{
+#if !HIGH_BIT_DEPTH && INSTRSET >= 5
+    predIntraPlanar4_sse4,
+    predIntraPlanar8_sse4,
+    predIntraPlanar16_sse4,
+    predIntraPlanar32_sse4,
+    predIntraPlanar64_sse4,
+#else
+    predIntraPlanar4,
+    predIntraPlanar8,
+    predIntraPlanar16,
+#endif
+};
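+
+// The table is indexed by log2(width) - 2 (4 -> 0, 8 -> 1, ...); the short
+// form only needs entries up to 16 because predIntraPlanar() below falls back
+// to the scalar loop for widths 32 and 64 in that configuration.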
+
+void predIntraPlanar(pixel* pSrc, intptr_t srcStride, pixel* rpDst, intptr_t dstStride, int width)
+{
+    //assert(width == height);
+
+    int nLog2Size = g_aucConvertToBit[width] + 2;
+
+#if (!HIGH_BIT_DEPTH) && (INSTRSET >= 5)
+    intraPlanarN[nLog2Size - 2](pSrc, srcStride, rpDst, dstStride);
+    return;
+#else
+    int k, l, bottomLeft, topRight;
+    int horPred;
+    // OPT_ME: when width is 64, shift1D is 8, so the dynamic range is [-65280, 65280] and we have to use 32 bits here
+    int32_t leftColumn[MAX_CU_SIZE + 1], topRow[MAX_CU_SIZE + 1];
+    // CHECK_ME: dynamic range is 9 bits or 15 bits (I assume the max input bit depth is 14 bits)
+    int16_t bottomRow[MAX_CU_SIZE], rightColumn[MAX_CU_SIZE];
+    int blkSize = width;
+    int offset2D = width;
+    int shift1D = nLog2Size;
+    int shift2D = shift1D + 1;
+
+    if (width < 32)
+    {
+        intraPlanarN[nLog2Size - 2](pSrc, srcStride, rpDst, dstStride);
+        return;
+    }
+
+    // Get left and above reference column and row
+    for (k = 0; k < blkSize + 1; k++)
+    {
+        topRow[k] = pSrc[k - srcStride];
+        leftColumn[k] = pSrc[k * srcStride - 1];
+    }
+
+    // Prepare intermediate variables used in interpolation
+    bottomLeft = leftColumn[blkSize];
+    topRight   = topRow[blkSize];
+    for (k = 0; k < blkSize; k++)
+    {
+        bottomRow[k]   = bottomLeft - topRow[k];
+        rightColumn[k] = topRight   - leftColumn[k];
+        topRow[k]      <<= shift1D;
+        leftColumn[k]  <<= shift1D;
+    }
+
+    // Generate prediction signal
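+    // Incremental form of the planar closed form:
+    //   P[k][l] = ((blkSize-1-l)*leftColumn[k] + (l+1)*topRight
+    //             + (blkSize-1-k)*topRow[l] + (k+1)*bottomLeft + offset2D) >> shift2D
+    // horPred accumulates the horizontal terms; topRow[l] the vertical ones.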
+    for (k = 0; k < blkSize; k++)
+    {
+        horPred = leftColumn[k] + offset2D;
+        for (l = 0; l < blkSize; l++)
+        {
+            horPred += rightColumn[k];
+            topRow[l] += bottomRow[l];
+            rpDst[k * dstStride + l] = ((horPred + topRow[l]) >> shift2D);
+        }
+    }
+
+#endif /* if (!HIGH_BIT_DEPTH) && (INSTRSET >= 5) */
+}
+
+#if HIGH_BIT_DEPTH
+void xPredIntraAng4x4(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove)
+{
+    int blkSize        = width;
+
+    // Map the mode index to main prediction direction and angle
+    assert(dirMode > 1); // no planar or DC
+    bool modeHor       = (dirMode < 18);
+    bool modeVer       = !modeHor;
+    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+    int lookIdx = intraPredAngle;
+    int absAng         = abs(intraPredAngle);
+    int signAng        = intraPredAngle < 0 ? -1 : 1;
+
+    // Set bitshifts and scale the angle parameter to block size
+    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
+    int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
+    int invAngle       = invAngTable[absAng];
+    absAng             = angTable[absAng];
+    intraPredAngle     = signAng * absAng;
+
+    // Do angular predictions
+
+    pixel* refMain;
+    pixel* refSide;
+
+    // Initialise the Main and Left reference array.
+    if (intraPredAngle < 0)
+    {
+        refMain = (modeVer ? refAbove : refLeft);     // + (blkSize - 1);
+        refSide = (modeVer ? refLeft : refAbove);     // + (blkSize - 1);
+
+        // Extend the Main reference to the left.
+        int invAngleSum    = 128;     // rounding for (shift by 8)
+        for (int k = -1; k > blkSize * intraPredAngle >> 5; k--)
+        {
+            invAngleSum += invAngle;
+            refMain[k] = refSide[invAngleSum >> 8];
+        }
+    }
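+    // Example: for intraPredAngle == -32, invAngle == 256, so invAngleSum
+    // steps through 384, 640, ... and (invAngleSum >> 8) walks 1, 2, ...,
+    // i.e. refMain[-k] is filled from refSide[k], mirroring the side
+    // reference into the main one.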
+    else
+    {
+        refMain = modeVer ? refAbove : refLeft;
+        refSide = modeVer ? refLeft  : refAbove;
+    }
+
+    // bFilter will always be true for block size 4
+    if (intraPredAngle == 0)  // Exactly horizontal/vertical angles
+    {
+        if (modeHor)
+        {
+            Vec8s v_temp;
+            Vec8s v_side_0; // refSide[0] value in a vector
+            v_temp.load((void*)refSide);
+            v_side_0 = broadcast(const_int(0), (Vec8s)v_temp);
+
+            Vec8s v_side;
+            v_side.load(refSide + 1);
+
+            Vec8s v_main;
+            v_main = load_partial(const_int(8), (void*)(refMain + 1));
+
+            Vec8s tmp1, tmp2;
+            tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(v_main, v_main);
+            tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp1);
+            tmp1 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp1);
+
+            Vec8s row0;
+            v_side -= v_side_0;
+            v_side = v_side >> 1;
+            row0 = tmp2 + v_side;
+            row0 = min(max(0, row0), (1 << bitDepth) - 1);
+
+            store_partial(const_int(8), pDst, row0);                //row0
+            store_partial(const_int(8), pDst + (2 * dstStride), tmp1); //row2
+
+            tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp2, tmp2);
+            tmp1 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp1);
+
+            store_partial(const_int(8), pDst + (3 * dstStride), tmp1); //row3
+            store_partial(const_int(8), pDst + (dstStride), tmp2);    //row1
+        }
+        else
+        {
+            Vec16uc v_main;
+            v_main = load_partial(const_int(8), refMain + 1);
+            store_partial(const_int(8), pDst, v_main);
+            store_partial(const_int(8), pDst + dstStride, v_main);
+            store_partial(const_int(8), pDst + (2 * dstStride), v_main);
+            store_partial(const_int(8), pDst + (3 * dstStride), v_main);
+
+            for (int k = 0; k < 4; k++)
+            {
+                pDst[k * dstStride] = (pixel)Clip3((short)0, (short)((1 << bitDepth) - 1), static_cast<short>((pDst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1)));
+            }
+        }
+    }
+    else if (intraPredAngle == -32)
+    {
+        Vec8s tmp;
+        tmp = load_partial(const_int(8), refMain);        //-1,0,1,2
+        store_partial(const_int(8), pDst, tmp);
+        tmp = load_partial(const_int(8), refMain - 1);     //-2,-1,0,1
+        store_partial(const_int(8), pDst + dstStride, tmp);
+        tmp = load_partial(const_int(8), refMain - 2);
+        store_partial(const_int(8), pDst + 2 * dstStride, tmp);
+        tmp = load_partial(const_int(8), refMain - 3);
+        store_partial(const_int(8), pDst + 3 * dstStride, tmp);
+        return;
+    }
+    else if (intraPredAngle == 32)
+    {
+        Vec8s tmp;
+        tmp = load_partial(const_int(8), refMain + 2);     //1,2,3,4
+        store_partial(const_int(8), pDst, tmp);
+        tmp = load_partial(const_int(8), refMain + 3);     //2,3,4,5
+        store_partial(const_int(8), pDst + dstStride, tmp);
+        tmp = load_partial(const_int(8), refMain + 4);
+        store_partial(const_int(8), pDst + 2 * dstStride, tmp);
+        tmp = load_partial(const_int(8), refMain + 5);
+        store_partial(const_int(8), pDst + 3 * dstStride, tmp);
+        return;
+    }
+    else
+    {
+        Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
+        Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
+
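+        // GETAP(lookIdx, y) is assumed to return the precomputed per-row
+        // integer offset ((y + 1) * intraPredAngle) >> 5, so each rowN1/rowN2
+        // pair is the bracketing reference-pixel pair to blend for row y.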
+        row11 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 0));
+        row12 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 0) + 1);
+
+        row21 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 1));
+        row22 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 1) + 1);
+
+        row31 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 2));
+        row32 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 2) + 1);
+
+        row41 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 3));
+        row42 = (Vec8s)load_partial(const_int(8), refMain + 1 + GETAP(lookIdx, 3) + 1);
+
+        v_deltaPos = v_ipAngle = intraPredAngle;
+
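+        // Two-tap blend per row: with f = deltaPos & 31,
+        //   pred = ((32 - f) * ref[i] + f * ref[i + 1] + 16) >> 5,
+        // and deltaPos advances by intraPredAngle from one row to the next.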
+        //row1
+        v_deltaFract = v_deltaPos & thirty1;
+        row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) + 16) >> 5;
+
+        //row2
+        v_deltaPos += v_ipAngle;
+        v_deltaFract = v_deltaPos & thirty1;
+        row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) + 16) >> 5;
+
+        //row3
+        v_deltaPos += v_ipAngle;
+        v_deltaFract = v_deltaPos & thirty1;
+        row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) + 16) >> 5;
+
+        //row4
+        v_deltaPos += v_ipAngle;
+        v_deltaFract = v_deltaPos & thirty1;
+        row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) + 16) >> 5;
+
+        // Flip the block
+
+        if (modeHor)
+        {
+            Vec8s tmp1, tmp2, tmp3, tmp4;
+
+            tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
+            tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
+
+            tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
+            tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
+
+            //tmp16_1 = compress(tmp3, tmp3);
+            store_partial(const_int(8), pDst, tmp3);
+
+            store_partial(const_int(8), pDst + (2 * dstStride), tmp4);  //row2
+
+            tmp3 = blend2q<1, 3>((Vec2q)tmp3, (Vec2q)tmp3);
+            tmp4 = blend2q<1, 3>((Vec2q)tmp4, (Vec2q)tmp4);
+
+            store_partial(const_int(8), pDst + (3 * dstStride), tmp4);   //row3
+            store_partial(const_int(8), pDst + (dstStride), tmp3);       //row1
+        }
+        else
+        {
+            store_partial(const_int(8), pDst, row11);
+            store_partial(const_int(8), pDst + (dstStride), row21);
+            store_partial(const_int(8), pDst + (2 * dstStride), row31);
+            store_partial(const_int(8), pDst + (3 * dstStride), row41);
+        }
+    }
+}
+
+#else /* if HIGH_BIT_DEPTH */
+void xPredIntraAng4x4(int /*bitDepth*/, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove, bool bFilter = true)
+{
+    int blkSize        = width;
+
+    // Map the mode index to main prediction direction and angle
+    assert(dirMode > 1); // no planar or DC
+    bool modeHor       = (dirMode < 18);
+    bool modeVer       = !modeHor;
+    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+    int absAng         = abs(intraPredAngle);
+    int signAng        = intraPredAngle < 0 ? -1 : 1;
+
+    // Set bitshifts and scale the angle parameter to block size
+    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
+    int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
+    int invAngle       = invAngTable[absAng];
+    absAng             = angTable[absAng];
+    intraPredAngle     = signAng * absAng;
+
+    // Do angular predictions
+
+    pixel* refMain;
+    pixel* refSide;
+
+    // Initialise the Main and Left reference array.
+    if (intraPredAngle < 0)
+    {
+        refMain = (modeVer ? refAbove : refLeft);     // + (blkSize - 1);
+        refSide = (modeVer ? refLeft : refAbove);     // + (blkSize - 1);
+
+        // Extend the Main reference to the left.
+        int invAngleSum    = 128;     // rounding for (shift by 8)
+        for (int k = -1; k > blkSize * intraPredAngle >> 5; k--)
+        {
+            invAngleSum += invAngle;
+            refMain[k] = refSide[invAngleSum >> 8];
+        }
+    }
+    else
+    {
+        refMain = modeVer ? refAbove : refLeft;
+        refSide = modeVer ? refLeft  : refAbove;
+    }
+
+    // bFilter will always be true for exactly vertical/horizontal modes
+    if (intraPredAngle == 0)  // Exactly horizontal/vertical angles
+    {
+        if (modeHor)
+        {
+            Vec16uc v_main;
+            v_main = load_partial(const_int(4), (void*)(refMain + 1));
+
+            Vec16uc tmp16;
+            tmp16 = blend16c<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(v_main, v_main);
+            tmp16 = blend16c<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(tmp16, tmp16);
+            Vec2uq tmp;
+
+            if (bFilter)
+            {
+                Vec16uc v_temp;
+                Vec8s v_side_0; // refSide[0] value in a vector
+                v_temp = load_partial(const_int(8), (void*)refSide);
+                v_side_0 = broadcast(const_int(0), (Vec8s)v_temp);
+                v_side_0 = v_side_0 & 0x00FF;
+
+                //shift v_side by 1 element (1 byte)
+                tmp = reinterpret_i(v_temp);
+                tmp = tmp >> 8;
+                v_temp = reinterpret_i(tmp);
+                Vec8s v_side = extend_low(v_temp);
+
+                Vec8s row0 = extend_low(tmp16);
+                v_side -= v_side_0;
+                v_side = v_side >> 1;
+                row0 += v_side;
+                row0 = min(max(0, row0), 255);
+                Vec16uc v_res(compress_unsafe(row0, 0));
+                store_partial(const_int(4), pDst, v_res);
+            }
+            else
+            {
+                store_partial(const_int(4), pDst, tmp16);
+            }
+
+            tmp = (Vec2uq)tmp16;
+            tmp >>= 32;
+            store_partial(const_int(4), pDst + dstStride, tmp);
+
+            tmp = blend2q<1, 3>(reinterpret_i(tmp16), reinterpret_i(tmp16));
+            store_partial(const_int(4), pDst + (2 * dstStride), tmp);
+
+            tmp >>= 32;
+            store_partial(const_int(4), pDst + (3 * dstStride), tmp);
+        }
+        else
+        {
+            Vec16uc v_main;
+            v_main = load_partial(const_int(4), refMain + 1);
+            store_partial(const_int(4), pDst, v_main);
+            store_partial(const_int(4), pDst + dstStride, v_main);
+            store_partial(const_int(4), pDst + (2 * dstStride), v_main);
+            store_partial(const_int(4), pDst + (3 * dstStride), v_main);
+            if (bFilter)
+            {
+                for (int k = 0; k < 4; k++)
+                {
+                    pDst[k * dstStride] = (pixel)Clip3((short)0, (short)((1 << 8) - 1), static_cast<short>((pDst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1)));
+                }
+            }
+        }
+    }
+    else
+    {
+        Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
+        Vec16uc tmp16_1, tmp16_2;
+        Vec2uq tmp2uq;
+        Vec8s v_deltaFract, v_deltaPos(0), thirty2(32), thirty1(31), v_ipAngle(0);
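+        // Each case below hand-unrolls the row loads: for row k the integer
+        // offset is ((k + 1) * intraPredAngle) >> 5, so rowK1/rowK2 hold the
+        // reference pixels at that offset and the next; the //offsets()
+        // comments are relative to refMain + 1.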
+        switch (intraPredAngle)
+        {
+        case -32:
+            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain);    //-1,0,1,2
+            store_partial(const_int(4), pDst, tmp16_1);
+            tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain - 1); //-2,-1,0,1
+            store_partial(const_int(4), pDst + dstStride, tmp16_2);
+            tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain - 2);
+            store_partial(const_int(4), pDst + 2 * dstStride, tmp16_2);
+            tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain - 3);
+            store_partial(const_int(4), pDst + 3 * dstStride, tmp16_2);
+            return;
+
+        case -26:
+            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 3);
+            row41 = extend_low(tmp16_1);    //offsets(-4,-3,-2,-1)
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 8;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row42 = extend_low(tmp16_2);    //offsets(-3,-2,-1,0)
+
+            row31 = row42;                  //offsets(-3,-2,-1,0)
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 16;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row32 = extend_low(tmp16_2);    //offsets(-2,-1,0,1)
+
+            row21 = row32;                  //offsets(-2,-1,0,1)
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 24;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row22 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
+
+            row11 = row22;                  //offsets(-1,0,1,2)
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 32;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row12 = extend_low(tmp16_2);    //offsets(0,1,2,3)
+
+            v_deltaPos = v_ipAngle = -26;
+            break;
+
+        case -21:
+            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 2);
+            row41 = extend_low(tmp16_1);    //offsets(-3,-2,-1,0)
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 8;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row42 = extend_low(tmp16_2);    //offsets(-2,-1,0,1)
+
+            row31 = row42;                  //offsets(-2,-1,0,1)
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 16;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row32 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
+
+            row21 = row31;                  //offsets(-2,-1,0,1)
+            row22 = row32;
+
+            row11 = row32;
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 24;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row12 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
+
+            v_deltaPos = v_ipAngle = -21;
+            break;
+
+        case -17:
+            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 2);
+            row41 = extend_low(tmp16_1);    //offsets(-3,-2,-1,0)
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 8;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row42 = extend_low(tmp16_2);    //offsets(-2,-1,0,1)
+
+            row31 = row42;                  //offsets(-2,-1,0,1)
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 16;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row32 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
+
+            row21 = row31;                  //offsets(-2,-1,0,1)
+            row22 = row32;
+
+            row11 = row32;
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 24;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row12 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
+
+            v_deltaPos = v_ipAngle = -17;
+            break;
+
+        case -13:
+            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 1);
+            row41 = extend_low(tmp16_1);    //offsets(-2,-1,0,1)
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 8;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row42 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
+
+            row11 = row42;
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 16;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row12 = extend_low(tmp16_2);    //offsets(0,1,2,3)
+
+            row21 = row42;                  //offsets(-1,0,1,2)
+            row22 = row12;
+            row31 = row41;
+            row32 = row42;
+
+            v_deltaPos = v_ipAngle = -13;
+            break;
+
+        case -9:
+            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 1);
+            row41 = extend_low(tmp16_1);    //offsets(-2,-1,0,1)
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 8;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row42 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
+
+            row11 = row42;
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 16;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row12 = extend_low(tmp16_2);    //offsets(0,1,2,3)
+
+            row21 = row42;                  //offsets(-1,0,1,2)
+            row22 = row12;
+            row31 = row42;
+            row32 = row12;
+
+            v_deltaPos = v_ipAngle = -9;
+            break;
+
+        case -5:
+            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain);
+            row11 = extend_low(tmp16_1);    //offsets(-1,0,1,2)
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 8;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row12 = extend_low(tmp16_2);    //offsets(0,1,2,3)
+            row21 = row11;                  //offsets(-1,0,1,2)
+            row22 = row12;
+            row31 = row11;
+            row32 = row12;
+            row41 = row11;
+            row42 = row12;
+
+            v_deltaPos = v_ipAngle = -5;
+            break;
+
+        case -2:
+            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain);
+            row11 = extend_low(tmp16_1);    //offsets(-1,0,1,2)
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 8;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row12 = extend_low(tmp16_2);    //offsets(0,1,2,3)
+            row21 = row11;                  //offsets(-1,0,1,2)
+            row22 = row12;
+            row31 = row11;
+            row32 = row12;
+            row41 = row11;
+            row42 = row12;
+
+            v_deltaPos = v_ipAngle = -2;
+            break;
+
+        case 2:
+            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
+            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 8;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
+            row21 = row11;                  //offsets(0,1,2,3)
+            row22 = row12;
+            row31 = row11;
+            row32 = row12;
+            row41 = row11;
+            row42 = row12;
+
+            v_deltaPos = v_ipAngle = 2;
+            break;
+
+        case 5:
+            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
+            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 8;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
+            row21 = row11;                  //offsets(0,1,2,3)
+            row22 = row12;
+            row31 = row11;
+            row32 = row12;
+            row41 = row11;
+            row42 = row12;
+
+            v_deltaPos = v_ipAngle = 5;
+            break;
+
+        case 9:
+            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
+            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 8;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
+            row21 = row11;                  //offsets(0,1,2,3)
+            row22 = row12;
+            row31 = row11;
+            row32 = row12;
+            row41 = row12;
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 16;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row42 = extend_low(tmp16_2);    //offsets(2,3,4,5)
+
+            v_deltaPos = v_ipAngle = 9;
+            break;
+
+        case 13:
+            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
+
+            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
+
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 8;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
+
+            row21 = row11;                  //offsets(0,1,2,3)
+            row22 = row12;
+            row31 = row12;                  //offsets(1,2,3,4)
+
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 16;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row32 = extend_low(tmp16_2);    //offsets(2,3,4,5)
+
+            row41 = row31;                  //offsets(1,2,3,4)
+            row42 = row32;
+
+            v_deltaPos = v_ipAngle = 13;
+            break;
+
+        case 17:
+            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
+
+            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
+
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 8;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
+
+            row21 = row12;
+
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 16;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row22 = extend_low(tmp16_2);    //offsets(2,3,4,5)
+
+            row31 = row21;
+            row32 = row22;
+
+            row41 = row22;
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 24;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row42 = extend_low(tmp16_2);    //offsets(3,4,5,6)
+
+            v_deltaPos = v_ipAngle = 17;
+            break;
+
+        case 21:
+            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
+
+            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
+
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 8;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
+
+            row21 = row12;
+
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 16;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row22 = extend_low(tmp16_2);    //offsets(2,3,4,5)
+
+            row31 = row21;
+            row32 = row22;
+
+            row41 = row22;
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 24;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row42 = extend_low(tmp16_2);    //offsets(3,4,5,6)
+
+            v_deltaPos = v_ipAngle = 21;
+            break;
+
+        case 26:
+            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
+
+            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
+
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 8;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
+
+            row21 = row12;
+
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 16;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row22 = extend_low(tmp16_2);    //offsets(2,3,4,5)
+
+            row31 = row22;
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 24;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row32 = extend_low(tmp16_2);    //offsets(3,4,5,6)
+
+            row41 = row32;
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq = tmp2uq >> 32;
+            tmp16_2 = reinterpret_i(tmp2uq);
+            row42 = extend_low(tmp16_2);    //offsets(4,5,6,7)
+
+            v_deltaPos = v_ipAngle = 26;
+            break;
+
+        case 32:
+            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 2);
+            store_partial(const_int(4), pDst, tmp16_1);
+            tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain + 3);
+            store_partial(const_int(4), pDst + dstStride, tmp16_2);
+            tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain + 4);
+            store_partial(const_int(4), pDst + 2 * dstStride, tmp16_2);
+            tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain + 5);
+            store_partial(const_int(4), pDst + 3 * dstStride, tmp16_2);
+            return;
+        }
+
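+        // The four rows below apply the HEVC two-tap angular filter.  As a
+        // scalar sketch (illustrative only), row k (k = 1..4) computes
+        //     deltaFract = (k * ipAngle) & 31;
+        //     pred[x] = ((32 - deltaFract) * rowK1[x] + deltaFract * rowK2[x] + 16) >> 5;
+        // where rowK1/rowK2 are the reference pairs selected in the switch above.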
+        //row1
+        v_deltaFract = v_deltaPos & thirty1;
+        row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) + 16) >> 5;
+
+        //row2
+        v_deltaPos += v_ipAngle;
+        v_deltaFract = v_deltaPos & thirty1;
+        row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) + 16) >> 5;
+
+        //row3
+        v_deltaPos += v_ipAngle;
+        v_deltaFract = v_deltaPos & thirty1;
+        row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) + 16) >> 5;
+
+        //row4
+        v_deltaPos += v_ipAngle;
+        v_deltaFract = v_deltaPos & thirty1;
+        row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) + 16) >> 5;
+
+        // Flip the block
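+        // For horizontal modes the 4x4 result is transposed: the two rounds
+        // of 16-bit interleaves below reorder rows into columns before the
+        // final pack back to 8 bits.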
+
+        if (modeHor)
+        {
+            Vec8s tmp1, tmp2, tmp3, tmp4;
+
+            tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
+            tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
+
+            tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
+            tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
+
+            tmp16_1 = compress_unsafe(tmp3, tmp3);
+            store_partial(const_int(4), pDst, tmp16_1);
+
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq >>= 32;
+            store_partial(const_int(4), pDst + dstStride, tmp2uq);
+
+            tmp16_1 = compress_unsafe(tmp4, tmp4);
+            store_partial(const_int(4), pDst + (2 * dstStride), tmp16_1);
+
+            tmp2uq = reinterpret_i(tmp16_1);
+            tmp2uq >>= 32;
+            store_partial(const_int(4), pDst + (3 * dstStride), tmp2uq);
+        }
+        else
+        {
+            store_partial(const_int(4), pDst, compress_unsafe(row11, row11));
+            store_partial(const_int(4), pDst + (dstStride), compress_unsafe(row21, row21));
+            store_partial(const_int(4), pDst + (2 * dstStride), compress_unsafe(row31, row31));
+            store_partial(const_int(4), pDst + (3 * dstStride), compress_unsafe(row41, row41));
+        }
+    }
+}
+
+#endif /* if HIGH_BIT_DEPTH */
+
+#if HIGH_BIT_DEPTH
+#else
+#define PREDANG_CALCROW_VER(X) { \
+        LOADROW(row11, GETAP(lookIdx, X)); \
+        LOADROW(row12, GETAP(lookIdx, X) + 1); \
+        CALCROW(row11, row11, row12); \
+        store_partial(const_int(8), pDst + (X * dstStride), compress(row11, row11)); \
+}
+
+#define PREDANG_CALCROW_HOR(X, rowx) { \
+        LOADROW(row11, GETAP(lookIdx, X)); \
+        LOADROW(row12, GETAP(lookIdx, X) + 1); \
+        CALCROW(rowx, row11, row12); \
+}
+
+// ROW is a Vec8s variable, X is the offset of the data to be loaded (relative to refMain + 1)
+#define LOADROW(ROW, X) { \
+        tmp = load_partial(const_int(8), refMain + 1 + X); \
+        ROW = extend_low(tmp); \
+}
+
+#define CALCROW(RES, ROW1, ROW2) { \
+        v_deltaPos += v_ipAngle; \
+        v_deltaFract = v_deltaPos & thirty1; \
+        RES = ((thirty2 - v_deltaFract) * ROW1 + (v_deltaFract * ROW2) + 16) >> 5; \
+}
+
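+// Scalar sketch of what LOADROW + CALCROW compute for one destination row
+// (illustrative only; the macros vectorize this across the 8 pixels):
+//
+//     deltaPos += ipAngle;
+//     deltaFract = deltaPos & 31;
+//     for (x = 0; x < 8; x++)
+//         res[x] = ((32 - deltaFract) * refMain[1 + X + x]
+//                   + deltaFract * refMain[2 + X + x] + 16) >> 5;
+//
+// GETAP(lookIdx, X) is assumed to return the integer sample offset for row X
+// of the given angle, i.e. angAP[8 - lookIdx][X] in the table used by the
+// 32x32 kernel below.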
+void xPredIntraAng8x8(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove, bool bFilter = true)
+{
+    int k;
+    int blkSize        = width;
+
+    // Map the mode index to main prediction direction and angle
+    assert(dirMode > 1); //no planar and dc
+    bool modeHor       = (dirMode < 18);
+    bool modeVer       = !modeHor;
+    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+    int lookIdx = intraPredAngle;
+    int absAng         = abs(intraPredAngle);
+    int signAng        = intraPredAngle < 0 ? -1 : 1;
+
+    // Set bitshifts and scale the angle parameter to block size
+    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
+    int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
+    int invAngle       = invAngTable[absAng];
+    absAng             = angTable[absAng];
+    intraPredAngle     = signAng * absAng;
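+    // e.g. mode 26 (VER_IDX) maps to angle 0 and mode 2 maps to +32 (the
+    // 45-degree diagonal); invAngTable holds (256 * 32) / angle, so absAng 2
+    // gives invAngle 8192 / 2 = 4096 and absAng 32 gives 8192 / 32 = 256.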
+
+    // Do angular predictions
+
+    pixel* refMain;
+    pixel* refSide;
+
+    // Initialise the Main and Left reference array.
+    if (intraPredAngle < 0)
+    {
+        refMain = (modeVer ? refAbove : refLeft);     // + (blkSize - 1);
+        refSide = (modeVer ? refLeft : refAbove);     // + (blkSize - 1);
+
+        // Extend the Main reference to the left.
+        int invAngleSum    = 128;     // rounding for (shift by 8)
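+        // The loop below projects side-reference samples onto the main
+        // reference: refMain[k] = refSide[(-k * invAngle + 128) >> 8] for
+        // k < 0, evaluated incrementally via invAngleSum.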
+        for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
+        {
+            invAngleSum += invAngle;
+            refMain[k] = refSide[invAngleSum >> 8];
+        }
+    }
+    else
+    {
+        refMain = modeVer ? refAbove : refLeft;
+        refSide = modeVer ? refLeft  : refAbove;
+    }
+
+    // bFilter will always be true for blocksize 8
+    if (intraPredAngle == 0)  // Exactly horizontal/vertical angles
+    {
+        if (modeHor)
+        {
+            Vec16uc v_temp;
+            Vec16uc tmp1;
+
+            v_temp.load(refMain + 1);
+            Vec8s v_main;
+            v_main = extend_low(v_temp);
+
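+            // With bFilter, the top row additionally gets HEVC's boundary
+            // smoothing; a scalar sketch of what the vector code computes:
+            //     dst[0][x] = Clip(refMain[1] + ((refSide[1 + x] - refSide[0]) >> 1))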
+            if (bFilter)
+            {
+                Vec8s v_side_0(refSide[0]); // refSide[0] value in a vector
+                Vec16uc v_temp16;
+                v_temp16.load(refSide + 1);
+                Vec8s v_side;
+                v_side = extend_low(v_temp16);
+
+                Vec8s row0;
+                row0 = permute8s<0, 0, 0, 0, 0, 0, 0, 0>(v_main);
+                v_side -= v_side_0;
+                v_side = v_side >> 1;
+                row0 = row0 + v_side;
+                row0 = min(max(0, row0), (1 << bitDepth) - 1);
+
+                tmp1 = compress(row0, row0);
+                store_partial(const_int(8), pDst, tmp1);            //row0
+            }
+            else
+            {
+                tmp1 = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
+                store_partial(const_int(8), pDst, tmp1); //row0
+            }
+            tmp1 = permute16uc<1, 1, 1, 1, 1, 1, 1, 1, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
+            store_partial(const_int(8), pDst + (1 * dstStride), tmp1); //row1
+
+            tmp1 = permute16uc<2, 2, 2, 2, 2, 2, 2, 2, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
+            store_partial(const_int(8), pDst + (2 * dstStride), tmp1); //row2
+
+            tmp1 = permute16uc<3, 3, 3, 3, 3, 3, 3, 3, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
+            store_partial(const_int(8), pDst + (3 * dstStride), tmp1); //row3
+
+            tmp1 = permute16uc<4, 4, 4, 4, 4, 4, 4, 4, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
+            store_partial(const_int(8), pDst + (4 * dstStride), tmp1); //row4
+
+            tmp1 = permute16uc<5, 5, 5, 5, 5, 5, 5, 5, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
+            store_partial(const_int(8), pDst + (5 * dstStride), tmp1); //row5
+
+            tmp1 = permute16uc<6, 6, 6, 6, 6, 6, 6, 6, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
+            store_partial(const_int(8), pDst + (6 * dstStride), tmp1); //row6
+
+            tmp1 = permute16uc<7, 7, 7, 7, 7, 7, 7, 7, -256, -256, -256, -256, -256, -256, -256, -256>(v_temp);
+            store_partial(const_int(8), pDst + (7 * dstStride), tmp1); //row7
+        }
+        else
+        {
+            Vec16uc v_main;
+            v_main = load_partial(const_int(8), refMain + 1);
+            store_partial(const_int(8), pDst, v_main);
+            store_partial(const_int(8), pDst + dstStride, v_main);
+            store_partial(const_int(8), pDst + (2 * dstStride), v_main);
+            store_partial(const_int(8), pDst + (3 * dstStride), v_main);
+            store_partial(const_int(8), pDst + (4 * dstStride), v_main);
+            store_partial(const_int(8), pDst + (5 * dstStride), v_main);
+            store_partial(const_int(8), pDst + (6 * dstStride), v_main);
+            store_partial(const_int(8), pDst + (7 * dstStride), v_main);
+
+            if (bFilter)
+            {
+                Vec16uc v_temp;
+                Vec8s v_side_0(refSide[0]); // refSide[0] value in a vector
+
+                v_temp.load(refSide + 1);
+                Vec8s v_side;
+                v_side = extend_low(v_temp);
+
+                v_temp.load(refMain + 1);
+                Vec8s row0;
+                row0 = permute16uc<0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1>(v_temp);
+                v_side -= v_side_0;
+                v_side = v_side >> 1;
+                row0 = row0 + v_side;
+                row0 = min(max(0, row0), (1 << bitDepth) - 1);
+
+                pDst[0 * dstStride] = row0[0];
+                pDst[1 * dstStride] = row0[1];
+                pDst[2 * dstStride] = row0[2];
+                pDst[3 * dstStride] = row0[3];
+                pDst[4 * dstStride] = row0[4];
+                pDst[5 * dstStride] = row0[5];
+                pDst[6 * dstStride] = row0[6];
+                pDst[7 * dstStride] = row0[7];
+            }
+        }
+    }
+    else if (intraPredAngle == -32)
+    {
+        Vec16uc tmp;
+        tmp = load_partial(const_int(8), refMain);        //offsets(-1..6)
+        store_partial(const_int(8), pDst, tmp);
+        tmp = load_partial(const_int(8), refMain - 1);    //offsets(-2..5)
+        store_partial(const_int(8), pDst + dstStride, tmp);
+        tmp = load_partial(const_int(8), refMain - 2);
+        store_partial(const_int(8), pDst + 2 * dstStride, tmp);
+        tmp = load_partial(const_int(8), refMain - 3);
+        store_partial(const_int(8), pDst + 3 * dstStride, tmp);
+        tmp = load_partial(const_int(8), refMain - 4);
+        store_partial(const_int(8), pDst + 4 * dstStride, tmp);
+        tmp = load_partial(const_int(8), refMain - 5);
+        store_partial(const_int(8), pDst + 5 * dstStride, tmp);
+        tmp = load_partial(const_int(8), refMain - 6);
+        store_partial(const_int(8), pDst + 6 * dstStride, tmp);
+        tmp = load_partial(const_int(8), refMain - 7);
+        store_partial(const_int(8), pDst + 7 * dstStride, tmp);
+        return;
+    }
+    else if (intraPredAngle == 32)
+    {
+        Vec16uc tmp;
+        tmp = load_partial(const_int(8), refMain + 2);    //offsets(1..8)
+        store_partial(const_int(8), pDst, tmp);
+        tmp = load_partial(const_int(8), refMain + 3);    //offsets(2..9)
+        store_partial(const_int(8), pDst + dstStride, tmp);
+        tmp = load_partial(const_int(8), refMain + 4);
+        store_partial(const_int(8), pDst + 2 * dstStride, tmp);
+        tmp = load_partial(const_int(8), refMain + 5);
+        store_partial(const_int(8), pDst + 3 * dstStride, tmp);
+        tmp = load_partial(const_int(8), refMain + 6);
+        store_partial(const_int(8), pDst + 4 * dstStride, tmp);
+        tmp = load_partial(const_int(8), refMain + 7);
+        store_partial(const_int(8), pDst + 5 * dstStride, tmp);
+        tmp = load_partial(const_int(8), refMain + 8);
+        store_partial(const_int(8), pDst + 6 * dstStride, tmp);
+        tmp = load_partial(const_int(8), refMain + 9);
+        store_partial(const_int(8), pDst + 7 * dstStride, tmp);
+        return;
+    }
+    else
+    {
+        if (modeHor)         // Near horizontal modes
+        {
+            Vec16uc tmp;
+            Vec8s row11, row12;
+            Vec16uc row1, row2, row3, row4, tmp16_1, tmp16_2;
+            Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
+            Vec8s tmp1, tmp2;
+            v_deltaPos = 0;
+            v_ipAngle = intraPredAngle;
+            switch (intraPredAngle)
+            {
+            case -5:
+                LOADROW(row11, -1);
+                LOADROW(row12, 0);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                row1 = compress(tmp1, tmp2);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                row2 = compress(tmp1, tmp2);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                row3 = compress(tmp1, tmp2);
+                row12 = row11;
+                LOADROW(row11, -2);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                row4 = compress(tmp1, tmp2);
+                break;
+
+            case -2:
+                LOADROW(row11, -1);
+                LOADROW(row12, 0);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                row1 = compress(tmp1, tmp2);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                row2 = compress(tmp1, tmp2);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                row3 = compress(tmp1, tmp2);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                row4 = compress(tmp1, tmp2);
+                break;
+
+            case 2:
+                LOADROW(row11, 0);
+                LOADROW(row12, 1);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                row1 = compress(tmp1, tmp2);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                row2 = compress(tmp1, tmp2);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                row3 = compress(tmp1, tmp2);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                row4 = compress(tmp1, tmp2);
+                break;
+
+            case 5:
+                LOADROW(row11, 0);
+                LOADROW(row12, 1);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                row1 = compress(tmp1, tmp2);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                row2 = compress(tmp1, tmp2);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                row3 = compress(tmp1, tmp2);
+                row11 = row12;
+                LOADROW(row12, 2);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                row4 = compress(tmp1, tmp2);
+                break;
+
+            default:               // these cases use the lookup table to identify access patterns
+
+                PREDANG_CALCROW_HOR(0, tmp1);
+                PREDANG_CALCROW_HOR(1, tmp2);
+                row1 = compress(tmp1, tmp2);
+                PREDANG_CALCROW_HOR(2, tmp1);
+                PREDANG_CALCROW_HOR(3, tmp2);
+                row2 = compress(tmp1, tmp2);
+                PREDANG_CALCROW_HOR(4, tmp1);
+                PREDANG_CALCROW_HOR(5, tmp2);
+                row3 = compress(tmp1, tmp2);
+                PREDANG_CALCROW_HOR(6, tmp1);
+                PREDANG_CALCROW_HOR(7, tmp2);
+                row4 = compress(tmp1, tmp2);
+            }
+
+            // Flip the block
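+            // The interleave network below is effectively an 8x8 byte
+            // transpose: horizontal modes predict along the left reference,
+            // so the eight computed rows become the eight output columns.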
+            tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row1, row2);
+            tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row1, row2);
+            row1 = tmp16_1;
+            row2 = tmp16_2;
+
+            tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row3, row4);
+            tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row3, row4);
+            row3 = tmp16_1;
+            row4 = tmp16_2;
+
+            tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row1, row2);
+            tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row1, row2);
+            row1 = tmp16_1;
+            row2 = tmp16_2;
+
+            tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row3, row4);
+            tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row3, row4);
+            row3 = tmp16_1;
+            row4 = tmp16_2;
+
+            tmp16_1 = blend4i<0, 4, 1, 5>((Vec4i)row1, (Vec4i)row3);
+            tmp16_2 = blend4i<2, 6, 3, 7>((Vec4i)row1, (Vec4i)row3);
+            row1 = tmp16_1;
+            row3 = tmp16_2;
+
+            tmp16_1 = blend4i<0, 4, 1, 5>((Vec4i)row2, (Vec4i)row4);
+            tmp16_2 = blend4i<2, 6, 3, 7>((Vec4i)row2, (Vec4i)row4);
+            row2 = tmp16_1;
+            row4 = tmp16_2;
+
+            store_partial(const_int(8), pDst, row1);   //row1
+            store_partial(const_int(8), pDst + (2 * dstStride), row3);   //row3
+            store_partial(const_int(8), pDst + (4 * dstStride), row2);   //row5
+            store_partial(const_int(8), pDst + (6 * dstStride), row4);   //row7
+
+            row1 = blend2q<1, 3>((Vec2q)row1, (Vec2q)row1);
+            store_partial(const_int(8), pDst + (1 * dstStride), row1);   //row2
+
+            row1 = blend2q<1, 3>((Vec2q)row3, (Vec2q)row3);
+            store_partial(const_int(8), pDst + (3 * dstStride), row1);   //row4
+
+            row1 = blend2q<1, 3>((Vec2q)row2, (Vec2q)row2);
+            store_partial(const_int(8), pDst + (5 * dstStride), row1);   //row6
+
+            row1 = blend2q<1, 3>((Vec2q)row4, (Vec2q)row4);
+            store_partial(const_int(8), pDst + (7 * dstStride), row1);   //row8
+        }
+        else                         // Vertical modes
+        {
+            Vec8s row11, row12;
+            Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
+            Vec16uc tmp;
+            Vec8s tmp1, tmp2;
+            v_deltaPos = 0;
+            v_ipAngle = intraPredAngle;
+            switch (intraPredAngle)
+            {
+            case -5:
+                LOADROW(row11, -1);
+                LOADROW(row12, 0);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                store_partial(const_int(8), pDst, compress(tmp1, tmp1));
+                store_partial(const_int(8), pDst + (dstStride), compress(tmp2, tmp2));
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                store_partial(const_int(8), pDst + (2 * dstStride), compress(tmp1, tmp1));
+                store_partial(const_int(8), pDst + (3 * dstStride), compress(tmp2, tmp2));
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                store_partial(const_int(8), pDst + (4 * dstStride), compress(tmp1, tmp1));
+                store_partial(const_int(8), pDst + (5 * dstStride), compress(tmp2, tmp2));
+                row12 = row11;
+                LOADROW(row11, -2);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                store_partial(const_int(8), pDst + (6 * dstStride), compress(tmp1, tmp1));
+                store_partial(const_int(8), pDst + (7 * dstStride), compress(tmp2, tmp2));
+                break;
+
+            case -2:
+                LOADROW(row11, -1);
+                LOADROW(row12, 0);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                store_partial(const_int(8), pDst, compress(tmp1, tmp1));
+                store_partial(const_int(8), pDst + (dstStride), compress(tmp2, tmp2));
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                store_partial(const_int(8), pDst + (2 * dstStride), compress(tmp1, tmp1));
+                store_partial(const_int(8), pDst + (3 * dstStride), compress(tmp2, tmp2));
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                store_partial(const_int(8), pDst + (4 * dstStride), compress(tmp1, tmp1));
+                store_partial(const_int(8), pDst + (5 * dstStride), compress(tmp2, tmp2));
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                store_partial(const_int(8), pDst + (6 * dstStride), compress(tmp1, tmp1));
+                store_partial(const_int(8), pDst + (7 * dstStride), compress(tmp2, tmp2));
+                break;
+
+            case 2:
+                LOADROW(row11, 0);
+                LOADROW(row12, 1);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                store_partial(const_int(8), pDst, compress(tmp1, tmp1));
+                store_partial(const_int(8), pDst + dstStride, compress(tmp2, tmp2));
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                store_partial(const_int(8), pDst + (2 * dstStride), compress(tmp1, tmp1));
+                store_partial(const_int(8), pDst + (3 * dstStride), compress(tmp2, tmp2));
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                store_partial(const_int(8), pDst + (4 * dstStride), compress(tmp1, tmp1));
+                store_partial(const_int(8), pDst + (5 * dstStride), compress(tmp2, tmp2));
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                store_partial(const_int(8), pDst + (6 * dstStride), compress(tmp1, tmp1));
+                store_partial(const_int(8), pDst + (7 * dstStride), compress(tmp2, tmp2));
+                break;
+
+            case 5:
+                LOADROW(row11, 0);
+                LOADROW(row12, 1);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                store_partial(const_int(8), pDst, compress(tmp1, tmp1));
+                store_partial(const_int(8), pDst + dstStride, compress(tmp2, tmp2));
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                store_partial(const_int(8), pDst + (2 * dstStride), compress(tmp1, tmp1));
+                store_partial(const_int(8), pDst + (3 * dstStride), compress(tmp2, tmp2));
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                store_partial(const_int(8), pDst + (4 * dstStride), compress(tmp1, tmp1));
+                store_partial(const_int(8), pDst + (5 * dstStride), compress(tmp2, tmp2));
+                row11 = row12;
+                LOADROW(row12, 2);
+                CALCROW(tmp1, row11, row12);
+                CALCROW(tmp2, row11, row12);
+                store_partial(const_int(8), pDst + (6 * dstStride), compress(tmp1, tmp1));
+                store_partial(const_int(8), pDst + (7 * dstStride), compress(tmp2, tmp2));
+                break;
+
+            default:                   // these cases use the lookup table to identify access patterns
+                PREDANG_CALCROW_VER(0);
+                PREDANG_CALCROW_VER(1);
+                PREDANG_CALCROW_VER(2);
+                PREDANG_CALCROW_VER(3);
+                PREDANG_CALCROW_VER(4);
+                PREDANG_CALCROW_VER(5);
+                PREDANG_CALCROW_VER(6);
+                PREDANG_CALCROW_VER(7);
+            }
+        }
+    }
+}
+
+#undef PREDANG_CALCROW_VER
+#undef PREDANG_CALCROW_HOR
+#undef LOADROW
+#undef CALCROW
+#endif /* if HIGH_BIT_DEPTH */
+
+//16x16
+#if HIGH_BIT_DEPTH
+#else
+#define PREDANG_CALCROW_VER(X) { \
+        LOADROW(row11L, row11H, GETAP(lookIdx, X)); \
+        LOADROW(row12L, row12H, GETAP(lookIdx, X) + 1); \
+        CALCROW(row11L, row11H, row11L, row11H, row12L, row12H); \
+        /*compress(row11L, row11H).store(pDst + ((X)*dstStride));*/ \
+        itmp = _mm_packus_epi16(row11L, row11H); \
+        _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride)), itmp); \
+}
+
+#define PREDANG_CALCROW_HOR(X, rowx) { \
+        LOADROW(row11L, row11H, GETAP(lookIdx, (X))); \
+        LOADROW(row12L, row12H, GETAP(lookIdx, (X)) + 1); \
+        CALCROW(row11L, row11H, row11L, row11H, row12L, row12H); \
+        /*rowx = compress(row11L, row11H);*/  \
+        rowx = _mm_packus_epi16(row11L, row11H); \
+}
+
+// ROWL/H are Vec8s variables, X is the offset of the data to be loaded (relative to refMain + 1)
+#define LOADROW(ROWL, ROWH, X) { \
+        /*tmp.load(refMain + 1 + (X)); */ \
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (X))); \
+        /* ROWL = extend_low(tmp);*/  \
+        ROWL = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+        /*ROWH = extend_high(tmp);*/  \
+        ROWH = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+}
+
+#define CALCROW(RESL, RESH, ROW1L, ROW1H, ROW2L, ROW2H) { \
+        /*v_deltaPos += v_ipAngle; \
+        v_deltaFract = v_deltaPos & thirty1;*/ \
+        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
+        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
+        /*RESL = ((thirty2 - v_deltaFract) * ROW1L + (v_deltaFract * ROW2L) + 16) >> 5; \
+        RESH = ((thirty2 - v_deltaFract) * ROW1H + (v_deltaFract * ROW2H) + 16) >> 5;*/ \
+        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+        it2 = _mm_mullo_epi16(it1, ROW1L); \
+        it3 = _mm_mullo_epi16(v_deltaFract, ROW2L); \
+        it2 = _mm_add_epi16(it2, it3); \
+        i16 = _mm_set1_epi16(16); \
+        it2 = _mm_add_epi16(it2, i16); \
+        RESL = _mm_srai_epi16(it2, 5); \
+        \
+        it2 = _mm_mullo_epi16(it1, ROW1H); \
+        it3 = _mm_mullo_epi16(v_deltaFract, ROW2H); \
+        it2 = _mm_add_epi16(it2, it3); \
+        it2 = _mm_add_epi16(it2, i16); \
+        RESH = _mm_srai_epi16(it2, 5); \
+}
+
+#define  BLND2_16(R1, R2) { \
+        /*tmp1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(R1, R2); */ \
+        itmp1 = _mm_unpacklo_epi8(R1, R2); \
+        /*tmp2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(R1, R2);*/ \
+        itmp2 = _mm_unpackhi_epi8(R1, R2); \
+        R1 = itmp1; \
+        R2 = itmp2; \
+}
+
+#define MB4(R1, R2, R3, R4) { \
+        BLND2_16(R1, R2) \
+        BLND2_16(R3, R4) \
+        /*tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>((Vec8s)R1, (Vec8s)R3);*/  \
+        itmp1 = _mm_unpacklo_epi16(R1, R3); \
+        /* tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>((Vec8s)R1, (Vec8s)R3);*/ \
+        itmp2 = _mm_unpackhi_epi16(R1, R3); \
+        R1 = itmp1; \
+        R3 = itmp2; \
+        /*R1 = tmp1; \
+        R3 = tmp2;*/ \
+        /*tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>((Vec8s)R2, (Vec8s)R4); \
+        tmp2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>((Vec8s)R2, (Vec8s)R4);*/ \
+        itmp1 = _mm_unpacklo_epi16(R2, R4); \
+        itmp2 = _mm_unpackhi_epi16(R2, R4); \
+        R2 = itmp1; \
+        R4 = itmp2; \
+        /*R2 = tmp1; \
+        R4 = tmp2;*/ \
+}
+
+#define BLND2_4(R1, R2) { \
+        /* tmp1 = blend4i<0, 4, 1, 5>((Vec4i)R1, (Vec4i)R2); \
+        tmp2 = blend4i<2, 6, 3, 7>((Vec4i)R1, (Vec4i)R2); */ \
+        itmp1 = _mm_unpacklo_epi32(R1, R2); \
+        itmp2 = _mm_unpackhi_epi32(R1, R2); \
+        R1 = itmp1; \
+        R2 = itmp2; \
+        /*R1 = tmp1; \
+        R2 = tmp2; */\
+}
+
+#define BLND2_2(R1, R2) { \
+        /*tmp1 = blend2q<0, 2>((Vec2q)R1, (Vec2q)R2); \
+        tmp2 = blend2q<1, 3>((Vec2q)R1, (Vec2q)R2);*/ \
+        itmp1 = _mm_unpacklo_epi64(R1, R2); \
+        itmp2 = _mm_unpackhi_epi64(R1, R2); \
+        /*tmp1.store(pDst); */ \
+        _mm_storeu_si128((__m128i*)pDst, itmp1); \
+        pDst += dstStride; \
+        /*tmp2.store(pDst);*/ \
+        _mm_storeu_si128((__m128i*)pDst, itmp2); \
+        pDst += dstStride; \
+}
+
+#define CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, X) { \
+        PREDANG_CALCROW_HOR(0 + X, R1) \
+        PREDANG_CALCROW_HOR(1 + X, R2) \
+        PREDANG_CALCROW_HOR(2 + X, R3) \
+        PREDANG_CALCROW_HOR(3 + X, R4) \
+        PREDANG_CALCROW_HOR(4 + X, R5) \
+        PREDANG_CALCROW_HOR(5 + X, R6) \
+        PREDANG_CALCROW_HOR(6 + X, R7) \
+        PREDANG_CALCROW_HOR(7 + X, R8) \
+        MB4(R1, R2, R3, R4) \
+        MB4(R5, R6, R7, R8) \
+        BLND2_4(R1, R5); \
+        BLND2_4(R2, R6); \
+        BLND2_4(R3, R7); \
+        BLND2_4(R4, R8); \
+}
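+
+// CALC_BLND_8ROWS predicts eight rows of a horizontal-mode block and runs
+// the first interleave stages (MB4, then BLND2_4) of a 16x16 byte transpose;
+// the caller finishes the flip with BLND2_2, which also stores the
+// transposed rows, so no separate transpose pass is needed.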
+
+void xPredIntraAng16x16(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove, bool bFilter = true)
+{
+    int k;
+    int blkSize        = width;
+
+    // Map the mode index to main prediction direction and angle
+    assert(dirMode > 1); //no planar and dc
+    bool modeHor       = (dirMode < 18);
+    bool modeVer       = !modeHor;
+    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+    int lookIdx = intraPredAngle;
+    int absAng         = abs(intraPredAngle);
+    int signAng        = intraPredAngle < 0 ? -1 : 1;
+
+    // Set bitshifts and scale the angle parameter to block size
+    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
+    int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
+    int invAngle       = invAngTable[absAng];
+    absAng             = angTable[absAng];
+    intraPredAngle     = signAng * absAng;
+
+    // Do angular predictions
+
+    pixel* refMain;
+    pixel* refSide;
+
+    // Initialise the Main and Left reference array.
+    if (intraPredAngle < 0)
+    {
+        refMain = (modeVer ? refAbove : refLeft);     // + (blkSize - 1);
+        refSide = (modeVer ? refLeft : refAbove);     // + (blkSize - 1);
+
+        // Extend the Main reference to the left.
+        int invAngleSum    = 128;     // rounding for (shift by 8)
+        if (intraPredAngle != -32)
+            for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
+            {
+                invAngleSum += invAngle;
+                refMain[k] = refSide[invAngleSum >> 8];
+            }
+    }
+    else
+    {
+        refMain = modeVer ? refAbove : refLeft;
+        refSide = modeVer ? refLeft  : refAbove;
+    }
+
+    // bFilter will always be true for blocksize 16
+    if (intraPredAngle == 0)  // Exactly horizontal/vertical angles
+    {
+        if (modeHor)
+        {
+            Vec16uc v_temp;
+            Vec16uc tmp1;
+            v_temp.load(refMain + 1);
+
+            if (bFilter)
+            {
+                Vec8s v_side_0(refSide[0]); // refSide[0] value in a vector
+                Vec16uc v_temp16;
+                v_temp16.load(refSide + 1);
+                Vec8s v_side;
+                v_side = extend_low(v_temp16);
+
+                Vec8s row01, row02, ref(refMain[1]);
+                v_side -= v_side_0;
+                v_side = v_side >> 1;
+                row01 = ref + v_side;
+                row01 = min(max(0, row01), (1 << bitDepth) - 1);
+
+                v_side = extend_high(v_temp16);
+                v_side -= v_side_0;
+                v_side = v_side >> 1;
+                row02 = ref + v_side;
+                row02 = min(max(0, row02), (1 << bitDepth) - 1);
+
+                tmp1 = compress_unsafe(row01, row02);
+                tmp1.store(pDst);            //row0
+            }
+            else
+            {
+                tmp1 = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(v_temp);
+                tmp1.store(pDst); //row0
+            }
+
+            tmp1 = permute16uc<1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>(v_temp);
+            tmp1.store(pDst + (1 * dstStride)); //row1
+
+            tmp1 = permute16uc<2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2>(v_temp);
+            tmp1.store(pDst + (2 * dstStride)); //row2
+
+            tmp1 = permute16uc<3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3>(v_temp);
+            tmp1.store(pDst + (3 * dstStride)); //row3
+
+            tmp1 = permute16uc<4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4>(v_temp);
+            tmp1.store(pDst + (4 * dstStride)); //row4
+
+            tmp1 = permute16uc<5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5>(v_temp);
+            tmp1.store(pDst + (5 * dstStride)); //row5
+
+            tmp1 = permute16uc<6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6>(v_temp);
+            tmp1.store(pDst + (6 * dstStride)); //row6
+
+            tmp1 = permute16uc<7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7>(v_temp);
+            tmp1.store(pDst + (7 * dstStride)); //row7
+
+            tmp1 = permute16uc<8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8>(v_temp);
+            tmp1.store(pDst + (8 * dstStride)); //row8
+
+            tmp1 = permute16uc<9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9>(v_temp);
+            tmp1.store(pDst + (9 * dstStride)); //row9
+
+            tmp1 = permute16uc<10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10>(v_temp);
+            tmp1.store(pDst + (10 * dstStride)); //row10
+
+            tmp1 = permute16uc<11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11>(v_temp);
+            tmp1.store(pDst + (11 * dstStride)); //row11
+
+            tmp1 = permute16uc<12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12>(v_temp);
+            tmp1.store(pDst + (12 * dstStride)); //row12
+
+            tmp1 = permute16uc<13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13>(v_temp);
+            tmp1.store(pDst + (13 * dstStride)); //row13
+
+            tmp1 = permute16uc<14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14>(v_temp);
+            tmp1.store(pDst + (14 * dstStride)); //row14
+
+            tmp1 = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>(v_temp);
+            tmp1.store(pDst + (15 * dstStride)); //row15
+        }
+        else
+        {
+            Vec16uc v_main;
+//            v_main.load(refMain + 1);
+            v_main = _mm_loadu_si128((__m128i const*)(refMain + 1));
+
+            _mm_storeu_si128((__m128i*)pDst, v_main);
+            _mm_storeu_si128((__m128i*)(pDst + dstStride), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (2 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (3 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (4 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (5 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (6 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (7 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (8 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (9 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (10 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (11 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (12 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (13 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (14 * dstStride)), v_main);
+            _mm_storeu_si128((__m128i*)(pDst + (15 * dstStride)), v_main);
+
+            if (bFilter)
+            {
+                Vec16uc v_temp;
+                Vec8s v_side_0(refSide[0]); // refSide[0] value in a vector
+
+                v_temp.load(refSide + 1);
+                Vec8s v_side;
+                v_side = extend_low(v_temp);
+
+                Vec8s row0, ref(refMain[1]);
+                v_side -= v_side_0;
+                v_side = v_side >> 1;
+                row0 = ref + v_side;
+                row0 = min(max(0, row0), (1 << bitDepth) - 1);
+
+                pDst[0 * dstStride] = row0[0];
+                pDst[1 * dstStride] = row0[1];
+                pDst[2 * dstStride] = row0[2];
+                pDst[3 * dstStride] = row0[3];
+                pDst[4 * dstStride] = row0[4];
+                pDst[5 * dstStride] = row0[5];
+                pDst[6 * dstStride] = row0[6];
+                pDst[7 * dstStride] = row0[7];
+
+                v_side = extend_high(v_temp);
+                v_side -= v_side_0;
+                v_side = v_side >> 1;
+                row0 = ref + v_side;
+                row0 = min(max(0, row0), (1 << bitDepth) - 1);
+                pDst[8 * dstStride] = row0[0];
+                pDst[9 * dstStride] = row0[1];
+                pDst[10 * dstStride] = row0[2];
+                pDst[11 * dstStride] = row0[3];
+                pDst[12 * dstStride] = row0[4];
+                pDst[13 * dstStride] = row0[5];
+                pDst[14 * dstStride] = row0[6];
+                pDst[15 * dstStride] = row0[7];
+            }
+        }
+    }
+    else if (intraPredAngle == -32)
+    {
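+        // For angle -32 invAngle is 256, so the inverse-angle projection
+        // refMain[-k] = refSide[(k * 256 + 128) >> 8] = refSide[k] is a plain
+        // reversal; one byte-reverse permute below produces all 15 extended
+        // samples instead of running the scalar extension loop (skipped above).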
+        Vec16uc v_refSide;
+        v_refSide.load(refSide);
+        v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
+        pixel refMain0 = refMain[0];
+
+        v_refSide.store(refMain - 15);
+        refMain[0] = refMain0;
+
+        Vec16uc tmp;
+        __m128i itmp;
+//        tmp.load(refMain);        //-1,0,1,2
+//        tmp.store(pDst);
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)--refMain);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+
+/*
+        tmp.load(--refMain);
+        pDst += dstStride;
+        tmp.store(pDst);
+        ... 14 times more
+*/
+        return;
+    }
+    else if (intraPredAngle == 32)
+    {
+        Vec8s tmp;
+        __m128i itmp;
+        refMain += 2;
+
+//        tmp.load(refMain++);
+//        tmp.store(pDst);
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+
+/*
+        tmp.load(refMain++);
+        pDst += dstStride;
+        tmp.store(pDst);
+        ... 14 times more
+*/
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        pDst += dstStride;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+
+        return;
+    }
+    else
+    {
+        if (modeHor)
+        {
+            Vec8s row11L, row12L, row11H, row12H;
+            Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
+            Vec16uc tmp;
+            Vec16uc R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16;
+            Vec16uc tmp1, tmp2;
+            v_deltaPos = 0;
+            v_ipAngle = intraPredAngle;
+            __m128i itmp, itmp1, itmp2, it1, it2, it3, i16;
+//            MB16;
+            CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
+            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
+            BLND2_2(R1, R9)
+            BLND2_2(R5, R13)
+            BLND2_2(R3, R11)
+            BLND2_2(R7, R15)
+            BLND2_2(R2, R10)
+            BLND2_2(R6, R14)
+            BLND2_2(R4, R12)
+            BLND2_2(R8, R16)
+        }
+        else
+        {
+            Vec8s row11L, row12L, row11H, row12H;
+            Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;
+            Vec16uc tmp;
+            Vec8s tmp1, tmp2;
+            v_deltaPos = 0;
+            v_ipAngle = intraPredAngle;
+            __m128i itmp, it1, it2, it3, i16;
+
+            PREDANG_CALCROW_VER(0);
+            PREDANG_CALCROW_VER(1);
+            PREDANG_CALCROW_VER(2);
+            PREDANG_CALCROW_VER(3);
+            PREDANG_CALCROW_VER(4);
+            PREDANG_CALCROW_VER(5);
+            PREDANG_CALCROW_VER(6);
+            PREDANG_CALCROW_VER(7);
+            PREDANG_CALCROW_VER(8);
+            PREDANG_CALCROW_VER(9);
+            PREDANG_CALCROW_VER(10);
+            PREDANG_CALCROW_VER(11);
+            PREDANG_CALCROW_VER(12);
+            PREDANG_CALCROW_VER(13);
+            PREDANG_CALCROW_VER(14);
+            PREDANG_CALCROW_VER(15);
+        }
+    }
+}
+
+#undef PREDANG_CALCROW_VER
+#undef PREDANG_CALCROW_HOR
+#undef LOADROW
+#undef CALCROW
+#undef BLND2_16
+#undef BLND2_2
+#undef BLND2_4
+#undef MB4
+#undef CALC_BLND_8ROWS
+#endif /* if HIGH_BIT_DEPTH */
+
+//32x32
+#if HIGH_BIT_DEPTH
+#else
+#define PREDANG_CALCROW_VER(X) { \
+        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
+        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)]))); \
+        row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+        row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+  \
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 1))); \
+        row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+        row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+  \
+        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+        it2 = _mm_mullo_epi16(it1, row11L); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
+        it2 = _mm_add_epi16(it2, it3); \
+        i16 = _mm_set1_epi16(16); \
+        it2 = _mm_add_epi16(it2, i16); \
+        row11L = _mm_srai_epi16(it2, 5); \
+        it2 = _mm_mullo_epi16(it1, row11H); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
+        it2 = _mm_add_epi16(it2, it3); \
+        it2 = _mm_add_epi16(it2, i16); \
+        row11H = _mm_srai_epi16(it2, 5); \
+  \
+        itmp = _mm_packus_epi16(row11L, row11H); \
+        _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride)), itmp); \
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 16))); \
+        row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+        row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+  \
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][(X)] + 17))); \
+        row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+        row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+  \
+        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+        it2 = _mm_mullo_epi16(it1, row11L); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
+        it2 = _mm_add_epi16(it2, it3); \
+        i16 = _mm_set1_epi16(16); \
+        it2 = _mm_add_epi16(it2, i16); \
+        row11L = _mm_srai_epi16(it2, 5); \
+        it2 = _mm_mullo_epi16(it1, row11H); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
+        it2 = _mm_add_epi16(it2, it3); \
+        it2 = _mm_add_epi16(it2, i16); \
+        row11H = _mm_srai_epi16(it2, 5); \
+  \
+        itmp = _mm_packus_epi16(row11L, row11H); \
+        _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride) + 16), itmp); \
+}
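+
+// The macro above runs the same 16-pixel interpolation twice per destination
+// row, once for each half of the row (offsets +0 and +16 into refMain),
+// storing 16 bytes at pDst and another 16 at pDst + 16.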
+
+#define PREDANG_CALCROW_VER_MODE2(X) { \
+        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
+        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
+        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+        it2 = _mm_mullo_epi16(it1, row11); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row21); \
+        it2 = _mm_add_epi16(it2, it3); \
+        i16 = _mm_set1_epi16(16); \
+        it2 = _mm_add_epi16(it2, i16); \
+        res1 = _mm_srai_epi16(it2, 5); \
+        it2 = _mm_mullo_epi16(it1, row12); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row22); \
+        it2 = _mm_add_epi16(it2, it3); \
+        it2 = _mm_add_epi16(it2, i16); \
+        res2 = _mm_srai_epi16(it2, 5); \
+  \
+        itmp = _mm_packus_epi16(res1, res2); \
+        _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride)), itmp); \
+        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+        it2 = _mm_mullo_epi16(it1, row13); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row23); \
+        it2 = _mm_add_epi16(it2, it3); \
+        i16 = _mm_set1_epi16(16); \
+        it2 = _mm_add_epi16(it2, i16); \
+        res1 = _mm_srai_epi16(it2, 5); \
+        it2 = _mm_mullo_epi16(it1, row14); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row24); \
+        it2 = _mm_add_epi16(it2, it3); \
+        it2 = _mm_add_epi16(it2, i16); \
+        res2 = _mm_srai_epi16(it2, 5); \
+  \
+        itmp = _mm_packus_epi16(res1, res2); \
+        _mm_storeu_si128((__m128i*)(pDst + ((X)*dstStride) + 16), itmp); \
+}
+
+#define PREDANG_CALCROW_HOR(X, rowx) { \
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][((X))]))); \
+        row11L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+        row11H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+  \
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (angAP[8 - (lookIdx)][((X))] + 1))); \
+        row12L = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+        row12H = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+  \
+        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
+        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
+        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+        it2 = _mm_mullo_epi16(it1, row11L); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
+        it2 = _mm_add_epi16(it2, it3); \
+        i16 = _mm_set1_epi16(16); \
+        it2 = _mm_add_epi16(it2, i16); \
+        row11L = _mm_srai_epi16(it2, 5); \
+        it2 = _mm_mullo_epi16(it1, row11H); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
+        it2 = _mm_add_epi16(it2, it3); \
+        it2 = _mm_add_epi16(it2, i16); \
+        row11H = _mm_srai_epi16(it2, 5); \
+  \
+        rowx = _mm_packus_epi16(row11L, row11H); \
+}
+
+#define PREDANG_CALCROW_HOR_MODE2(rowx) { \
+        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle); \
+        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1); \
+        it1 = _mm_sub_epi16(thirty2, v_deltaFract); \
+        it2 = _mm_mullo_epi16(it1, row11L); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row12L); \
+        it2 = _mm_add_epi16(it2, it3); \
+        i16 = _mm_set1_epi16(16); \
+        it2 = _mm_add_epi16(it2, i16); \
+        res1 = _mm_srai_epi16(it2, 5); \
+        it2 = _mm_mullo_epi16(it1, row11H); \
+        it3 = _mm_mullo_epi16(v_deltaFract, row12H); \
+        it2 = _mm_add_epi16(it2, it3); \
+        it2 = _mm_add_epi16(it2, i16); \
+        res2 = _mm_srai_epi16(it2, 5); \
+  \
+        rowx = _mm_packus_epi16(res1, res2); \
+}
+
+// ROWL/H are __m128i rows of 16-bit samples; X is the index of the data to be loaded
+#define LOADROW(ROWL, ROWH, X) { \
+/*        tmp.load(refMain + 1 + (X)); \
+        ROWL = extend_low(tmp); \
+        ROWH = extend_high(tmp); */\
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + (X))); \
+        ROWL = _mm_unpacklo_epi8(itmp, _mm_setzero_si128()); \
+        ROWH = _mm_unpackhi_epi8(itmp, _mm_setzero_si128()); \
+}
+
+#define BLND2_2(R1, R2) { \
+/*        tmp1 = blend2q<0, 2>((Vec2q)R1, (Vec2q)R2); \
+        tmp2 = blend2q<1, 3>((Vec2q)R1, (Vec2q)R2); \
+        tmp1.store(pDst);   pDst += dstStride; \
+        tmp2.store(pDst);   pDst += dstStride; */\
+        itmp1 = _mm_unpacklo_epi64(R1, R2); \
+        itmp2 = _mm_unpackhi_epi64(R1, R2); \
+        _mm_storeu_si128((__m128i*)pDst, itmp1); \
+        pDst += dstStride; \
+        _mm_storeu_si128((__m128i*)pDst, itmp2); \
+        pDst += dstStride; \
+}
+
+#define MB8(R1, R2, R3, R4, R5, R6, R7, R8) { \
+        itmp1 = _mm_unpacklo_epi8(R1, R2); \
+        itmp2 = _mm_unpackhi_epi8(R1, R2); \
+        R1 = itmp1; \
+        R2 = itmp2; \
+        itmp1 = _mm_unpacklo_epi8(R3, R4); \
+        itmp2 = _mm_unpackhi_epi8(R3, R4); \
+        R3 = itmp1; \
+        R4 = itmp2; \
+        itmp1 = _mm_unpacklo_epi16(R1, R3); \
+        itmp2 = _mm_unpackhi_epi16(R1, R3); \
+        R1 = itmp1; \
+        R3 = itmp2; \
+        itmp1 = _mm_unpacklo_epi16(R2, R4); \
+        itmp2 = _mm_unpackhi_epi16(R2, R4); \
+        R2 = itmp1; \
+        R4 = itmp2; \
+        itmp1 = _mm_unpacklo_epi8(R5, R6); \
+        itmp2 = _mm_unpackhi_epi8(R5, R6); \
+        R5 = itmp1; \
+        R6 = itmp2; \
+        itmp1 = _mm_unpacklo_epi8(R7, R8); \
+        itmp2 = _mm_unpackhi_epi8(R7, R8); \
+        R7 = itmp1; \
+        R8 = itmp2; \
+        itmp1 = _mm_unpacklo_epi16(R5, R7); \
+        itmp2 = _mm_unpackhi_epi16(R5, R7); \
+        R5 = itmp1; \
+        R7 = itmp2; \
+        itmp1 = _mm_unpacklo_epi16(R6, R8); \
+        itmp2 = _mm_unpackhi_epi16(R6, R8); \
+        R6 = itmp1; \
+        R8 = itmp2; \
+        itmp1 = _mm_unpacklo_epi32(R1, R5); \
+        itmp2 = _mm_unpackhi_epi32(R1, R5); \
+        R1 = itmp1; \
+        R5 = itmp2; \
+  \
+        itmp1 = _mm_unpacklo_epi32(R2, R6); \
+        itmp2 = _mm_unpackhi_epi32(R2, R6); \
+        R2 = itmp1; \
+        R6 = itmp2; \
+  \
+        itmp1 = _mm_unpacklo_epi32(R3, R7); \
+        itmp2 = _mm_unpackhi_epi32(R3, R7); \
+        R3 = itmp1; \
+        R7 = itmp2; \
+  \
+        itmp1 = _mm_unpacklo_epi32(R4, R8); \
+        itmp2 = _mm_unpackhi_epi32(R4, R8); \
+        R4 = itmp1; \
+        R8 = itmp2; \
+}
+
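+// MB8 interleaves eight byte-rows through epi8/epi16/epi32 unpack stages; combined with
+// the final epi64 unpack in BLND2_2 it forms a 16x16 byte transpose, letting the
+// horizontal modes be written out column-wise without a separate flip pass.
+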
+#define CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, X) { \
+        PREDANG_CALCROW_HOR(0 + X, R1) \
+        PREDANG_CALCROW_HOR(1 + X, R2) \
+        PREDANG_CALCROW_HOR(2 + X, R3) \
+        PREDANG_CALCROW_HOR(3 + X, R4) \
+        PREDANG_CALCROW_HOR(4 + X, R5) \
+        PREDANG_CALCROW_HOR(5 + X, R6) \
+        PREDANG_CALCROW_HOR(6 + X, R7) \
+}
+
+#define CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8) { \
+        PREDANG_CALCROW_HOR_MODE2(R1) \
+        PREDANG_CALCROW_HOR_MODE2(R2) \
+        PREDANG_CALCROW_HOR_MODE2(R3) \
+        PREDANG_CALCROW_HOR_MODE2(R4) \
+        PREDANG_CALCROW_HOR_MODE2(R5) \
+        PREDANG_CALCROW_HOR_MODE2(R6) \
+        PREDANG_CALCROW_HOR_MODE2(R7) \
+}
+
+void xPredIntraAng32x32(int /*bitDepth*/, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove)
+{
+    int k;
+    int blkSize        = width;
+
+    // Map the mode index to main prediction direction and angle
+    assert(dirMode > 1); // no planar and DC
+    bool modeHor       = (dirMode < 18);
+    bool modeVer       = !modeHor;
+    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+    int lookIdx = intraPredAngle;
+    int absAng         = abs(intraPredAngle);
+    int signAng        = intraPredAngle < 0 ? -1 : 1;
+
+    // Set bitshifts and scale the angle parameter to block size
+    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
+    int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
+    int invAngle       = invAngTable[absAng];
+    absAng             = angTable[absAng];
+    intraPredAngle     = signAng * absAng;
+
+    // Do angular predictions
+
+    pixel* refMain;
+    pixel* refSide;
+
+    // Initialise the Main and Left reference array.
+    if (intraPredAngle < 0)
+    {
+        refMain = (modeVer ? refAbove : refLeft);     // + (blkSize - 1);
+        refSide = (modeVer ? refLeft : refAbove);     // + (blkSize - 1);
+
+        // Extend the Main reference to the left.
+        int invAngleSum    = 128;     // rounding for (shift by 8)
+        if (intraPredAngle != -32)
+            for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
+            {
+                invAngleSum += invAngle;
+                refMain[k] = refSide[invAngleSum >> 8];
+            }
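+        // The loop projects the negative main-reference samples from the side
+        // reference: refMain[k] = refSide[(128 + (-k) * invAngle) >> 8], with
+        // invAngle = (256 * 32) / absAng. For -32, invAngle = 256 makes this an
+        // exact reversal, which the dedicated -32 branch below performs instead.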
+    }
+    else
+    {
+        refMain = modeVer ? refAbove : refLeft;
+        refSide = modeVer ? refLeft  : refAbove;
+    }
+
+    // edge filtering (bFilter) never applies to 32x32 blocks, so it is omitted here
+    if (intraPredAngle == 0)  // Exactly horizontal/vertical angles
+    {
+        if (modeHor)
+        {
+            Vec16uc v_temp, tmp1;
+
+            v_temp.load(refMain + 1);
+            /*BROADSTORE16ROWS;*/
+            tmp1 = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(v_temp);
+            tmp1.store(pDst + (0 * dstStride));
+            tmp1.store(pDst + (0 * dstStride) + 16);
+            tmp1 = permute16uc<1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>(v_temp);
+            tmp1.store(pDst + (1 * dstStride));
+            tmp1.store(pDst + (1 * dstStride) + 16);
+            tmp1 = permute16uc<2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2>(v_temp);
+            tmp1.store(pDst + (2 * dstStride));
+            tmp1.store(pDst + (2 * dstStride) + 16);
+            tmp1 = permute16uc<3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3>(v_temp);
+            tmp1.store(pDst + (3 * dstStride));
+            tmp1.store(pDst + (3 * dstStride) + 16);
+            tmp1 = permute16uc<4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4>(v_temp);
+            tmp1.store(pDst + (4 * dstStride));
+            tmp1.store(pDst + (4 * dstStride) + 16);
+            tmp1 = permute16uc<5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5>(v_temp);
+            tmp1.store(pDst + (5 * dstStride));
+            tmp1.store(pDst + (5 * dstStride) + 16);
+            tmp1 = permute16uc<6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6>(v_temp);
+            tmp1.store(pDst + (6 * dstStride));
+            tmp1.store(pDst + (6 * dstStride) + 16);
+            tmp1 = permute16uc<7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7>(v_temp);
+            tmp1.store(pDst + (7 * dstStride));
+            tmp1.store(pDst + (7 * dstStride) + 16);
+            tmp1 = permute16uc<8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8>(v_temp);
+            tmp1.store(pDst + (8 * dstStride));
+            tmp1.store(pDst + (8 * dstStride) + 16);
+            tmp1 = permute16uc<9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9>(v_temp);
+            tmp1.store(pDst + (9 * dstStride));
+            tmp1.store(pDst + (9 * dstStride) + 16);
+            tmp1 = permute16uc<10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10>(v_temp);
+            tmp1.store(pDst + (10 * dstStride));
+            tmp1.store(pDst + (10 * dstStride) + 16);
+            tmp1 = permute16uc<11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11>(v_temp);
+            tmp1.store(pDst + (11 * dstStride));
+            tmp1.store(pDst + (11 * dstStride) + 16);
+            tmp1 = permute16uc<12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12>(v_temp);
+            tmp1.store(pDst + (12 * dstStride));
+            tmp1.store(pDst + (12 * dstStride) + 16);
+            tmp1 = permute16uc<13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13>(v_temp);
+            tmp1.store(pDst + (13 * dstStride));
+            tmp1.store(pDst + (13 * dstStride) + 16);
+            tmp1 = permute16uc<14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14>(v_temp);
+            tmp1.store(pDst + (14 * dstStride));
+            tmp1.store(pDst + (14 * dstStride) + 16);
+            tmp1 = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>(v_temp);
+            tmp1.store(pDst + (15 * dstStride));
+            tmp1.store(pDst + (15 * dstStride) + 16);
+
+            pDst += 16 * dstStride;
+            v_temp.load(refMain + 1 + 16);
+            /*BROADSTORE16ROWS;*/
+            tmp1 = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>(v_temp);
+            tmp1.store(pDst + (0 * dstStride));
+            tmp1.store(pDst + (0 * dstStride) + 16);
+            tmp1 = permute16uc<1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>(v_temp);
+            tmp1.store(pDst + (1 * dstStride));
+            tmp1.store(pDst + (1 * dstStride) + 16);
+            tmp1 = permute16uc<2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2>(v_temp);
+            tmp1.store(pDst + (2 * dstStride));
+            tmp1.store(pDst + (2 * dstStride) + 16);
+            tmp1 = permute16uc<3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3>(v_temp);
+            tmp1.store(pDst + (3 * dstStride));
+            tmp1.store(pDst + (3 * dstStride) + 16);
+            tmp1 = permute16uc<4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4>(v_temp);
+            tmp1.store(pDst + (4 * dstStride));
+            tmp1.store(pDst + (4 * dstStride) + 16);
+            tmp1 = permute16uc<5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5>(v_temp);
+            tmp1.store(pDst + (5 * dstStride));
+            tmp1.store(pDst + (5 * dstStride) + 16);
+            tmp1 = permute16uc<6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6>(v_temp);
+            tmp1.store(pDst + (6 * dstStride));
+            tmp1.store(pDst + (6 * dstStride) + 16);
+            tmp1 = permute16uc<7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7>(v_temp);
+            tmp1.store(pDst + (7 * dstStride));
+            tmp1.store(pDst + (7 * dstStride) + 16);
+            tmp1 = permute16uc<8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8>(v_temp);
+            tmp1.store(pDst + (8 * dstStride));
+            tmp1.store(pDst + (8 * dstStride) + 16);
+            tmp1 = permute16uc<9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9>(v_temp);
+            tmp1.store(pDst + (9 * dstStride));
+            tmp1.store(pDst + (9 * dstStride) + 16);
+            tmp1 = permute16uc<10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10>(v_temp);
+            tmp1.store(pDst + (10 * dstStride));
+            tmp1.store(pDst + (10 * dstStride) + 16);
+            tmp1 = permute16uc<11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11>(v_temp);
+            tmp1.store(pDst + (11 * dstStride));
+            tmp1.store(pDst + (11 * dstStride) + 16);
+            tmp1 = permute16uc<12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12>(v_temp);
+            tmp1.store(pDst + (12 * dstStride));
+            tmp1.store(pDst + (12 * dstStride) + 16);
+            tmp1 = permute16uc<13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13>(v_temp);
+            tmp1.store(pDst + (13 * dstStride));
+            tmp1.store(pDst + (13 * dstStride) + 16);
+            tmp1 = permute16uc<14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14>(v_temp);
+            tmp1.store(pDst + (14 * dstStride));
+            tmp1.store(pDst + (14 * dstStride) + 16);
+            tmp1 = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>(v_temp);
+            tmp1.store(pDst + (15 * dstStride));
+            tmp1.store(pDst + (15 * dstStride) + 16);
+        }
+        else
+        {
+            __m128i v_main;
+            Pel *dstOriginal = pDst;
+//            v_main.load(refMain + 1);
+            v_main = _mm_loadu_si128((__m128i const*)(refMain + 1));
+//            v_main.store(pDst);
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+
+            pDst = dstOriginal + 16;
+            v_main = _mm_loadu_si128((__m128i const*)(refMain + 17));
+//            v_main.store(pDst);
+
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+            pDst += dstStride;
+            _mm_storeu_si128((__m128i*)(pDst), v_main);
+        }
+    }
+    else if (intraPredAngle == -32)
+    {
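+        // Angle -32 projects refMain[k] = refSide[-k] exactly (invAngle = 256): copy in
+        // the byte-reversed side reference, then emit each of the 32 rows as a copy of
+        // the main reference shifted left by one sample per row.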
+        Vec16uc v_refSide;
+        pixel refMain0 = refMain[0];
+
+        v_refSide.load(refSide);
+        v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
+        v_refSide.store(refMain - 15);
+
+        v_refSide.load(refSide + 16);
+        v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
+        v_refSide.store(refMain - 31);
+
+        refMain[0] = refMain0;
+
+        __m128i itmp;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain--;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+
+        return;
+    }
+    else if (intraPredAngle == 32)
+    {
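+        // Angle +32 keeps deltaFract == 0 on every row, so row k is a pure copy of
+        // refMain + k + 2; no blending is needed, only shifted 16-byte loads/stores.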
+        __m128i itmp;
+        refMain += 2;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain++;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain++;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain++;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain++;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain++;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain++;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain++;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain++);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain++;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain++;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain++;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain++;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain++;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain++;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain++;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        pDst += dstStride;
+        refMain++;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        refMain++;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        refMain++;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        refMain++;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        refMain++;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        refMain++;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        refMain++;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        refMain++;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        refMain++;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        refMain++;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        refMain++;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        refMain++;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        refMain++;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        refMain++;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        refMain++;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        refMain++;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+
+        itmp = _mm_loadu_si128((__m128i const*)refMain);
+        refMain++;
+        _mm_storeu_si128((__m128i*)pDst, itmp);
+        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+        pDst += dstStride;
+
+        return;
+    }
+    else
+    {
+        if (modeHor)
+        {
+            __m128i row11L, row12L, row11H, row12H, res1, res2;
+            __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
+            __m128i R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16;
+
+            Pel * original_pDst = pDst;
+            v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
+            v_ipAngle = _mm_set1_epi16(intraPredAngle);
+            thirty2 = _mm_set1_epi16(32);
+            thirty1 = _mm_set1_epi16(31);
+            __m128i itmp, itmp1, itmp2, it1, it2, it3, i16;
+
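+            // For angles +/-2, deltaInt advances only at the 16th row (deltaPos reaches
+            // 32 there), so each 16-row half blends just two preloaded reference rows;
+            // the vertical branch below special-cases +/-2 the same way.
+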
+            switch (intraPredAngle)
+            {
+            case -2:
+                LOADROW(row11L, row11H, -1)
+                LOADROW(row12L, row12H,  0)
+                R16 = _mm_packus_epi16(row11L, row11H); //R16 = compress(row11L, row11H);
+
+                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+                PREDANG_CALCROW_HOR_MODE2(R8)
+                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+                BLND2_2(R1, R9)
+                BLND2_2(R5, R13)
+                BLND2_2(R3, R11)
+                BLND2_2(R7, R15)
+                BLND2_2(R2, R10)
+                BLND2_2(R6, R14)
+                BLND2_2(R4, R12)
+                BLND2_2(R8, R16)
+
+                v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+                v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+                row12L = row11L;
+                row12H = row11H;
+                LOADROW(row11L, row11H, -2)
+                R16 = _mm_packus_epi16(row11L, row11H);
+                pDst = original_pDst + 16;
+
+                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+                PREDANG_CALCROW_HOR_MODE2(R8)
+                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+                BLND2_2(R1, R9)
+                BLND2_2(R5, R13)
+                BLND2_2(R3, R11)
+                BLND2_2(R7, R15)
+                BLND2_2(R2, R10)
+                BLND2_2(R6, R14)
+                BLND2_2(R4, R12)
+                BLND2_2(R8, R16)
+
+                pDst = original_pDst + (16 * dstStride);
+                refMain += 16;
+
+                v_deltaPos = _mm_setzero_si128();
+                v_ipAngle = _mm_set1_epi16(intraPredAngle);
+                LOADROW(row11L, row11H, -1)
+                LOADROW(row12L, row12H,  0)
+                R16 = _mm_packus_epi16(row11L, row11H);
+
+                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+                PREDANG_CALCROW_HOR_MODE2(R8)
+                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+                BLND2_2(R1, R9)
+                BLND2_2(R5, R13)
+                BLND2_2(R3, R11)
+                BLND2_2(R7, R15)
+                BLND2_2(R2, R10)
+                BLND2_2(R6, R14)
+                BLND2_2(R4, R12)
+                BLND2_2(R8, R16)
+
+                v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+                v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+                row12L = row11L;
+                row12H = row11H;
+                LOADROW(row11L, row11H, -2)
+                R16 = _mm_packus_epi16(row11L, row11H);
+                pDst = original_pDst + (16 * dstStride) + 16;
+
+                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+                PREDANG_CALCROW_HOR_MODE2(R8)
+                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+                BLND2_2(R1, R9)
+                BLND2_2(R5, R13)
+                BLND2_2(R3, R11)
+                BLND2_2(R7, R15)
+                BLND2_2(R2, R10)
+                BLND2_2(R6, R14)
+                BLND2_2(R4, R12)
+                BLND2_2(R8, R16)
+                return;
+
+            case  2:
+                LOADROW(row11L, row11H, 0)
+                LOADROW(row12L, row12H, 1)
+                R16 = _mm_packus_epi16(row12L, row12H);
+
+                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+                PREDANG_CALCROW_HOR_MODE2(R8)
+                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+                BLND2_2(R1, R9)
+                BLND2_2(R5, R13)
+                BLND2_2(R3, R11)
+                BLND2_2(R7, R15)
+                BLND2_2(R2, R10)
+                BLND2_2(R6, R14)
+                BLND2_2(R4, R12)
+                BLND2_2(R8, R16)
+
+                v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+                v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+                row11L = row12L;
+                row11H = row12H;
+                LOADROW(row12L, row12H, 2)
+                R16 = _mm_packus_epi16(row12L, row12H);
+                pDst = original_pDst + 16;
+
+                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+                PREDANG_CALCROW_HOR_MODE2(R8)
+                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+                BLND2_2(R1, R9)
+                BLND2_2(R5, R13)
+                BLND2_2(R3, R11)
+                BLND2_2(R7, R15)
+                BLND2_2(R2, R10)
+                BLND2_2(R6, R14)
+                BLND2_2(R4, R12)
+                BLND2_2(R8, R16)
+
+                pDst = original_pDst + (16 * dstStride);
+                refMain += 16;
+                v_deltaPos = _mm_setzero_si128();
+
+                v_ipAngle = _mm_set1_epi16(intraPredAngle);
+                LOADROW(row11L, row11H, 0)
+                LOADROW(row12L, row12H, 1)
+                R16 = _mm_packus_epi16(row12L, row12H);
+
+                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+                PREDANG_CALCROW_HOR_MODE2(R8)
+                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+                BLND2_2(R1, R9)
+                BLND2_2(R5, R13)
+                BLND2_2(R3, R11)
+                BLND2_2(R7, R15)
+                BLND2_2(R2, R10)
+                BLND2_2(R6, R14)
+                BLND2_2(R4, R12)
+                BLND2_2(R8, R16)
+
+                v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+                v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+                row11L = row12L;
+                row11H = row12H;
+                LOADROW(row12L, row12H, 2)
+                R16 = _mm_packus_epi16(row12L, row12H);
+                pDst = original_pDst + (16 * dstStride) + 16;
+
+                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+                PREDANG_CALCROW_HOR_MODE2(R8)
+                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+                BLND2_2(R1, R9)
+                BLND2_2(R5, R13)
+                BLND2_2(R3, R11)
+                BLND2_2(R7, R15)
+                BLND2_2(R2, R10)
+                BLND2_2(R6, R14)
+                BLND2_2(R4, R12)
+                BLND2_2(R8, R16)
+                return;
+            }
+
+            CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
+            PREDANG_CALCROW_HOR(7 + 0, R8)
+            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
+            PREDANG_CALCROW_HOR(7 + 8, R16)
+            MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+            BLND2_2(R1, R9)
+            BLND2_2(R5, R13)
+            BLND2_2(R3, R11)
+            BLND2_2(R7, R15)
+            BLND2_2(R2, R10)
+            BLND2_2(R6, R14)
+            BLND2_2(R4, R12)
+            BLND2_2(R8, R16)
+
+            pDst = original_pDst + 16;
+
+            CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 16)
+            PREDANG_CALCROW_HOR(7 + 16, R8)
+            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 24)
+            R16 = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
+            MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+            BLND2_2(R1, R9)
+            BLND2_2(R5, R13)
+            BLND2_2(R3, R11)
+            BLND2_2(R7, R15)
+            BLND2_2(R2, R10)
+            BLND2_2(R6, R14)
+            BLND2_2(R4, R12)
+            BLND2_2(R8, R16)
+
+            pDst = original_pDst + (16 * dstStride);
+            refMain += 16;
+            v_deltaPos = _mm_setzero_si128();
+            v_ipAngle = _mm_set1_epi16(intraPredAngle);
+
+            CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
+            PREDANG_CALCROW_HOR(7 + 0, R8)
+            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
+            PREDANG_CALCROW_HOR(7 + 8, R16)
+            MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+            BLND2_2(R1, R9)
+            BLND2_2(R5, R13)
+            BLND2_2(R3, R11)
+            BLND2_2(R7, R15)
+            BLND2_2(R2, R10)
+            BLND2_2(R6, R14)
+            BLND2_2(R4, R12)
+            BLND2_2(R8, R16)
+            pDst = original_pDst + (16 * dstStride) + 16;
+
+            CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 16)
+            PREDANG_CALCROW_HOR(7 + 16, R8)
+            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 24)
+            R16 = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
+            MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+            BLND2_2(R1, R9)
+            BLND2_2(R5, R13)
+            BLND2_2(R3, R11)
+            BLND2_2(R7, R15)
+            BLND2_2(R2, R10)
+            BLND2_2(R6, R14)
+            BLND2_2(R4, R12)
+            BLND2_2(R8, R16)
+        }
+        else
+        {
+            __m128i row11L, row12L, row11H, row12H;
+            __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
+            __m128i row11, row12, row13, row14, row21, row22, row23, row24;
+            __m128i res1, res2;
+
+            v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
+            v_ipAngle = _mm_set1_epi16(intraPredAngle);
+            thirty2 = _mm_set1_epi16(32);
+            thirty1 = _mm_set1_epi16(31);
+            __m128i itmp, it1, it2, it3, i16;
+
+            switch (intraPredAngle)
+            {
+            case -2:
+                LOADROW(row11, row12, -1)
+                LOADROW(row21, row22,  0)
+                LOADROW(row13, row14, 15)
+                LOADROW(row23, row24, 16)
+                for (int i = 0; i <= 14; i++)
+                {
+                    PREDANG_CALCROW_VER_MODE2(i);
+                }
+
+                // deltaFract == 0 for the 16th row
+                v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+                v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+                itmp = _mm_packus_epi16(row11, row12);
+                _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride)), itmp);
+                itmp = _mm_packus_epi16(row13, row14);
+                _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride) + 16), itmp);
+
+                row21 = row11;
+                row22 = row12;
+                row23 = row13;
+                row24 = row14;
+
+                LOADROW(row11, row12, -2)
+                LOADROW(row13, row14, 14)
+                for (int i = 16; i <= 30; i++)
+                {
+                    PREDANG_CALCROW_VER_MODE2(i);
+                }
+
+                itmp = _mm_packus_epi16(row11, row12);
+                _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
+                itmp = _mm_packus_epi16(row13, row14);
+                _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
+
+                return;
+
+            case  2:
+
+                LOADROW(row11, row12, 0)
+                LOADROW(row21, row22, 1)
+                LOADROW(row13, row14, 16)
+                LOADROW(row23, row24, 17)
+                for (int i = 0; i <= 14; i++)
+                {
+                    PREDANG_CALCROW_VER_MODE2(i);
+                }
+
+                // deltaFract == 0 for the 16th row
+
+                v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+                v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+                itmp = _mm_packus_epi16(row21, row22);
+                _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride)), itmp);
+                itmp = _mm_packus_epi16(row23, row24);
+                _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride) + 16), itmp);
+
+                row11 = row21;
+                row12 = row22;
+                row13 = row23;
+                row14 = row24;
+
+                LOADROW(row21, row22, 2)
+                LOADROW(row23, row24, 18)
+                for (int i = 16; i <= 30; i++)
+                {
+                    PREDANG_CALCROW_VER_MODE2(i);
+                }
+
+                itmp = _mm_packus_epi16(row21, row22);
+                _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
+                itmp = _mm_packus_epi16(row23, row24);
+                _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
+
+                return;
+            }
+
+            for (int i = 0; i <= 30; i++)
+            {
+                PREDANG_CALCROW_VER(i);
+            }
+
+            itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
+            _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
+            itmp = _mm_loadu_si128((__m128i const*)(refMain + 17 + GETAP(lookIdx, 31)));
+            _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
+        }
+    }
+}
+
+#endif /* if HIGH_BIT_DEPTH */
+
+void xPredIntraAngBufRef(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, bool bFilter, pixel *refLeft, pixel *refAbove)
+{
+#if HIGH_BIT_DEPTH
+#else
+    switch (width)
+    {
+    case 4:
+        xPredIntraAng4x4(bitDepth, pDst, dstStride, width, dirMode, refLeft, refAbove, bFilter);
+        return;
+    case 8:
+        xPredIntraAng8x8(bitDepth, pDst, dstStride, width, dirMode, refLeft, refAbove, bFilter);
+        return;
+    case 16:
+        xPredIntraAng16x16(bitDepth, pDst, dstStride, width, dirMode, refLeft, refAbove, bFilter);
+        return;
+    case 32:
+        xPredIntraAng32x32(bitDepth, pDst, dstStride, width, dirMode, refLeft, refAbove);
+        return;
+    }
+
+#endif /* if HIGH_BIT_DEPTH */
+
+    int k, l;
+    int blkSize        = width;
+
+    // Map the mode index to main prediction direction and angle
+    assert(dirMode > 1); // no planar and DC
+    bool modeHor       = (dirMode < 18);
+    bool modeVer       = !modeHor;
+    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+    int absAng         = abs(intraPredAngle);
+    int signAng        = intraPredAngle < 0 ? -1 : 1;
+
+    // Set bitshifts and scale the angle parameter to block size
+    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
+    int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
+    int invAngle       = invAngTable[absAng];
+    absAng             = angTable[absAng];
+    intraPredAngle     = signAng * absAng;
+
+    // Do angular predictions
+    {
+        pixel* refMain;
+        pixel* refSide;
+
+        // Initialise the Main and Left reference array.
+        if (intraPredAngle < 0)
+        {
+            refMain = (modeVer ? refAbove : refLeft); // + (blkSize - 1);
+            refSide = (modeVer ? refLeft : refAbove); // + (blkSize - 1);
+
+            // Extend the Main reference to the left.
+            int invAngleSum    = 128; // rounding for (shift by 8)
+            for (k = -1; k > blkSize * intraPredAngle >> 5; k--)
+            {
+                invAngleSum += invAngle;
+                refMain[k] = refSide[invAngleSum >> 8];
+            }
+        }
+        else
+        {
+            refMain = modeVer ? refAbove : refLeft;
+            refSide = modeVer ? refLeft  : refAbove;
+        }
+
+        if (intraPredAngle == 0)
+        {
+            for (k = 0; k < blkSize; k++)
+            {
+                for (l = 0; l < blkSize; l++)
+                {
+                    pDst[k * dstStride + l] = refMain[l + 1];
+                }
+            }
+
+            if (bFilter)
+            {
+                for (k = 0; k < blkSize; k++)
+                {
+                    pDst[k * dstStride] = (pixel)Clip3(0, (1 << bitDepth) - 1, static_cast<short>(pDst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1));
+                }
+            }
+        }
+        else
+        {
+            int deltaPos = 0;
+            int deltaInt;
+            int deltaFract;
+            int refMainIndex;
+
+            for (k = 0; k < blkSize; k++)
+            {
+                deltaPos += intraPredAngle;
+                deltaInt   = deltaPos >> 5;
+                deltaFract = deltaPos & (32 - 1);
+
+                if (deltaFract)
+                {
+                    // Do linear filtering
+                    for (l = 0; l < blkSize; l++)
+                    {
+                        refMainIndex        = l + deltaInt + 1;
+                        pDst[k * dstStride + l] = (pixel)(((32 - deltaFract) * refMain[refMainIndex] + deltaFract * refMain[refMainIndex + 1] + 16) >> 5);
+                    }
+                }
+                else
+                {
+                    // Just copy the integer samples
+                    for (l = 0; l < blkSize; l++)
+                    {
+                        pDst[k * dstStride + l] = refMain[l + deltaInt + 1];
+                    }
+                }
+            }
+        }
+
+        // Flip the block if this is the horizontal mode
+        if (modeHor)
+        {
+            pixel  tmp;
+            for (k = 0; k < blkSize - 1; k++)
+            {
+                for (l = k + 1; l < blkSize; l++)
+                {
+                    tmp                 = pDst[k * dstStride + l];
+                    pDst[k * dstStride + l] = pDst[l * dstStride + k];
+                    pDst[l * dstStride + k] = tmp;
+                }
+            }
+        }
+    }
+}
+
+#if HIGH_BIT_DEPTH
+#else // HIGH_BIT_DEPTH
+
+#if INSTRSET < 40 // always true: keeps the reference path until the intrinsic version below is implemented
+void xPredIntraAngs4(pixel *pDst0, pixel *pAbove0, pixel *pLeft0, pixel *pAbove1, pixel *pLeft1, bool bLuma)
+{
+    int iMode;
+
+    // suppress unused-parameter warnings
+    (void)pLeft1;
+    (void)pAbove1;
+
+    for (iMode = 2; iMode <= 34; iMode++)
+    {
+        pixel *pLeft = pLeft0;
+        pixel *pAbove = pAbove0;
+        pixel *pDst = pDst0 + (iMode-2) * (4 * 4);
+        xPredIntraAngBufRef(8, pDst, 4, 4, iMode, bLuma, pLeft, pAbove);
+
+        // Optimization: callers expect horizontal modes un-flipped, so transpose
+        // again here to undo the flip done inside xPredIntraAngBufRef
+        bool modeHor = (iMode < 18);
+        if (modeHor)
+        {
+            pixel  tmp;
+            const int width = 4;
+            for (int k = 0; k < width - 1; k++)
+            {
+                for (int l = k + 1; l < width; l++)
+                {
+                    tmp                 = pDst[k * width + l];
+                    pDst[k * width + l] = pDst[l * width + k];
+                    pDst[l * width + k] = tmp;
+                }
+            }
+        }
+    }
+}
+
+#else // INSTRSET >= 4
+
+void xPredIntraAngs4(pixel *pDst, pixel *pAbove0, pixel *pLeft0, pixel *pAbove1, pixel *pLeft1, bool bLuma)
+{
+    // stub: the vectorized version is not implemented yet
+    (void)pDst; (void)pAbove0; (void)pLeft0; (void)pAbove1; (void)pLeft1; (void)bLuma;
+}
+#endif // INSTRSET < 4
+
+#endif // HIGH_BIT_DEPTH
+
+}
+
+#include "utils.h"
+
+namespace x265 {
+void NAME(Setup_Vec_IPredPrimitives)(EncoderPrimitives& p)
+{
+    initFileStaticVars();
+    p.getIPredDC = predIntraDC;
+    p.getIPredPlanar = predIntraPlanar;
+    p.getIPredAng = xPredIntraAngBufRef;
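+    // getIPredAngs4 generates all 33 angular predictions for a 4x4 block in a
+    // single call, packed mode-by-mode (16 pixels per mode) into the output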
+    p.getIPredAngs4 = xPredIntraAngs4;
+}
+
+}
diff --git a/source/test/intrapredharness.cpp b/source/test/intrapredharness.cpp
index 1f6bcf6..42b6dbe 100644
--- a/source/test/intrapredharness.cpp
+++ b/source/test/intrapredharness.cpp
@@ -47,6 +47,8 @@ IntraPredHarness::IntraPredHarness()
 
     pixel_out_C   = (pixel*)malloc(out_size * sizeof(pixel));
     pixel_out_Vec = (pixel*)malloc(out_size * sizeof(pixel));
+    pixel_out_33_C   = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), out_size_33, 32);
+    pixel_out_33_Vec = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), out_size_33, 32);
 
     if (!pixel_out_C || !pixel_out_Vec)
     {
@@ -62,6 +64,8 @@ IntraPredHarness::~IntraPredHarness()
     free(pixel_buff);
     free(pixel_out_C);
     free(pixel_out_Vec);
+    TestHarness::alignedFree(pixel_out_33_C);
+    TestHarness::alignedFree(pixel_out_33_Vec);
 }
 
 bool IntraPredHarness::check_getIPredDC_primitive(x265::getIPredDC_t ref, x265::getIPredDC_t opt)
@@ -167,6 +171,52 @@ bool IntraPredHarness::check_getIPredAng_primitive(x265::getIPredAng_p ref, x265
     return true;
 }
 
+bool IntraPredHarness::check_getIPredAngs4_primitive(x265::getIPredAngs_t ref, x265::getIPredAngs_t opt)
+{
+    int j = ADI_BUF_STRIDE;
+
+    bool isLuma;
+
+    for (int width = 4; width <= 4; width <<= 1)  // only 4x4 is implemented so far
+    {
+        for (int i = 0; i <= 100; i++)
+        {
+            isLuma = (width <= 16) && (rand()%2);
+
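+            // two independent sets of above/left reference rows feed the
+            // pAbove0/pLeft0 and pAbove1/pLeft1 arguments; each set shares its
+            // corner sample between the above and left arrays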
+            pixel * refAbove0 = pixel_buff + j;
+            pixel * refLeft0 = refAbove0 + 3 * width;
+            refLeft0[0] = refAbove0[0];
+
+            pixel * refAbove1 = pixel_buff + j + FENC_STRIDE;
+            pixel * refLeft1 = refAbove1 + 3 * width + FENC_STRIDE;
+            refLeft1[0] = refAbove1[0];
+
+#if _DEBUG
+            // poison the full 33-mode buffers so unwritten bytes are detectable
+            memset(pixel_out_33_Vec, 0xCD, out_size_33);
+            memset(pixel_out_33_C, 0xCD, out_size_33);
+#endif
+
+            ref(pixel_out_33_C,   refAbove0, refLeft0, refAbove1, refLeft1, isLuma);
+            opt(pixel_out_33_Vec, refAbove0, refLeft0, refAbove1, refLeft1, isLuma);
+            for (int p = 0; p <= 32; p++)  // angular modes 2..34, stored from index 0
+            {
+                for (int k = 0; k < width; k++)
+                {
+                    if (memcmp(pixel_out_33_C + p * (width * width) + k * width, pixel_out_33_Vec + p * (width * width) + k * width, width * sizeof(pixel)))
+                    {
+                        printf("\nFailed: row [%2d]: width=%d, mode=%d, isLuma=%d\n", k, width, p + 2, isLuma);
+                        return false;
+                    }
+                }
+            }
+
+            j += FENC_STRIDE;
+        }
+    }
+
+    return true;
+}
+
 bool IntraPredHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     if (opt.getIPredDC)
@@ -193,6 +243,14 @@ bool IntraPredHarness::testCorrectness(const EncoderPrimitives& ref, const Encod
             return false;
         }
     }
+    if (opt.getIPredAngs4)
+    {
+        if (!check_getIPredAngs4_primitive(ref.getIPredAngs4, opt.getIPredAngs4))
+        {
+            printf("intrapred_angular_4x4_33_modes failed\n");
+            return false;
+        }
+    }
 
     return true;
 }
@@ -239,4 +297,18 @@ void IntraPredHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderP
             }
         }
     }
+    if (opt.getIPredAngs4)
+    {
+        for (int ii = 4; ii <= 4; ii <<= 1)  // only 4x4 is implemented so far
+        {
+            width = ii;
+            bool isLuma = (width <= 16);
+            pixel * refAbove = pixel_buff + srcStride;
+            pixel * refLeft = refAbove + 3 * width;
+            refLeft[0] = refAbove[0];
+            printf("IPred_getIPredAngs4\t\t");
+            REPORT_SPEEDUP(opt.getIPredAngs4, ref.getIPredAngs4,
+                           pixel_out_33_Vec, refAbove, refLeft, refAbove, refLeft, isLuma);
+        }
+    }
 }
diff --git a/source/test/intrapredharness.h b/source/test/intrapredharness.h
index 5f138d3..9c144f1 100644
--- a/source/test/intrapredharness.h
+++ b/source/test/intrapredharness.h
@@ -34,15 +34,19 @@ protected:
     pixel *pixel_buff;
     pixel *pixel_out_C;
     pixel *pixel_out_Vec;
+    pixel *pixel_out_33_C;
+    pixel *pixel_out_33_Vec;
 
     pixel *IP_vec_output_p, *IP_C_output_p;
 
     static const int ip_t_size = 4 * 65 * 65 * 100;
     static const int out_size = 64 * FENC_STRIDE;
+    static const int out_size_33 = 33 * 64 * FENC_STRIDE;  // one out_size buffer per angular mode
 
     bool check_getIPredDC_primitive(x265::getIPredDC_t ref, x265::getIPredDC_t opt);
     bool check_getIPredPlanar_primitive(x265::getIPredPlanar_t ref, x265::getIPredPlanar_t opt);
     bool check_getIPredAng_primitive(x265::getIPredAng_p ref, x265::getIPredAng_p opt);
+    bool check_getIPredAngs4_primitive(x265::getIPredAngs_t ref, x265::getIPredAngs_t opt);
 
 public:
 
-- 
1.8.3.msysgit.0



