[x265] [PATCH 2 of 2] primitives: 8 bit : PredIntraAng32x32 function table implementation

mandar at multicorewareinc.com mandar at multicorewareinc.com
Fri Jun 28 14:32:23 CEST 2013


# HG changeset patch
# User Mandar Gurav
# Date 1372422711 25200
# Node ID af5d7a6d24858af71355964374213adbb654b4f3
# Parent  124a8b402cd671dfd76bfebcdd0ebad8136b0da1
primitives: 8 bit : PredIntraAng32x32 function table implementation

diff -r 124a8b402cd6 -r af5d7a6d2485 source/common/vec/intrapred.inc
--- a/source/common/vec/intrapred.inc	Thu Jun 27 23:24:42 2013 -0700
+++ b/source/common/vec/intrapred.inc	Fri Jun 28 05:31:51 2013 -0700
@@ -2218,7 +2218,10 @@
 
 typedef void (*PredIntraAng4x4_table)(pixel* pDst, int dstStride, pixel *refMain, int dirMode);
 PredIntraAng4x4_table PredIntraAng4[] = {
-    /* PredIntraAng4_0 is replaced with PredIntraAng4_2. For PredIntraAng4_0 we are going through default path in the xPredIntraAng4x4 because we cannot afford to pass large number arguments for this function. */
+    /*
+    * PredIntraAng4_0 is replaced with PredIntraAng4_2. For PredIntraAng4_0 we go through the default path in xPredIntraAng4x4
+    because we cannot afford to pass a large number of arguments to this function.
+    */
     PredIntraAng4_32,
     PredIntraAng4_26,
     PredIntraAng4_21,
@@ -2227,7 +2230,7 @@
     PredIntraAng4_9,
     PredIntraAng4_5,
     PredIntraAng4_2,
-    PredIntraAng4_2,    //Intentionally wrong! It should be "PredIntraAng4_0" here. 
+    PredIntraAng4_2,    //0 can be done by 2- will never be called
     PredIntraAng4_m_2,
     PredIntraAng4_m_5,
     PredIntraAng4_m_9,
@@ -2243,7 +2246,7 @@
     PredIntraAng4_m_9,
     PredIntraAng4_m_5,
     PredIntraAng4_m_2,
-    PredIntraAng4_2,    //Intentionally wrong! It should be "PredIntraAng4_0" here. 
+    PredIntraAng4_2,    //0 can be done by 2 - will never be called 
     PredIntraAng4_2,
     PredIntraAng4_5,
     PredIntraAng4_9,
@@ -2664,40 +2667,42 @@
 typedef void (*PredIntraAng8x8_table)(pixel* pDst, int dstStride, pixel *refMain, int dirMode);
 PredIntraAng8x8_table PredIntraAng8[] = {
     /* 
-    PredIntraAng8_0 is replaced with PredIntraAng8_2. For PredIntraAng8_0 we are going through default path in the xPredIntraAng8x8 because we cannot afford to pass large number arguments for this function. 
-    Path for PredIntraAng8_21, PredIntraAng8_m_21, PredIntraAng8_17, PredIntraAng8_m_17, PredIntraAng8_13, PredIntraAng8_m_13, PredIntraAng8_9, PredIntraAng8_m_9 is same as PredIntraAng8_26.
+    * PredIntraAng8_0 is replaced with PredIntraAng8_2. For PredIntraAng8_0 we go through the default path in xPredIntraAng8x8
+    because we cannot afford to pass a large number of arguments to this function.
+    * The path for PredIntraAng8_21, PredIntraAng8_m_21, PredIntraAng8_17, PredIntraAng8_m_17, PredIntraAng8_13, PredIntraAng8_m_13, PredIntraAng8_9,
+    PredIntraAng8_m_9 is the same as PredIntraAng8_26.
     */
     PredIntraAng8_32,
     PredIntraAng8_26,
-    PredIntraAng8_26,       //Intentionally wrong! It should be "PredIntraAng8_21" here.      
-    PredIntraAng8_26,       //Intentionally wrong! It should be "PredIntraAng8_17" here.
-    PredIntraAng8_26,       //Intentionally wrong! It should be "PredIntraAng8_13" here.
-    PredIntraAng8_26,       //Intentionally wrong! It should be "PredIntraAng8_9" here.
+    PredIntraAng8_26,       //21 can be done by 26      
+    PredIntraAng8_26,       //17 can be done by 26
+    PredIntraAng8_26,       //13 can be done by 26
+    PredIntraAng8_26,       //9 can be done by 26
     PredIntraAng8_5,    
     PredIntraAng8_2,
-    PredIntraAng8_2,        //Intentionally wrong! It should be "PredIntraAng8_0" here.
+    PredIntraAng8_2,        //0 can be done by 2 - will never be called
     PredIntraAng8_m_2,
     PredIntraAng8_m_5,
-    PredIntraAng8_26,       //Intentionally wrong! It should be "PredIntraAng8_m_9" here.
-    PredIntraAng8_26,       //Intentionally wrong! It should be "PredIntraAng8_m_13" here.
-    PredIntraAng8_26,       //Intentionally wrong! It should be "PredIntraAng8_m_17" here.
-    PredIntraAng8_26,       //Intentionally wrong! It should be "PredIntraAng8_m_21" here.
-    PredIntraAng8_26,       //Intentionally wrong! It should be "PredIntraAng8_m_26" here.
-    PredIntraAng8_26,       //Intentionally wrong! It should be "PredIntraAng8_m_32" here.
-    PredIntraAng8_26,       //Intentionally wrong! It should be "PredIntraAng8_m_26" here.
-    PredIntraAng8_26,       //Intentionally wrong! It should be "PredIntraAng8_m_21" here.
-    PredIntraAng8_26,       //Intentionally wrong! It should be "PredIntraAng8_m_17" here.
-    PredIntraAng8_26,       //Intentionally wrong! It should be "PredIntraAng8_m_13" here.
-    PredIntraAng8_26,       //Intentionally wrong! It should be "PredIntraAng8_m_9" here.
+    PredIntraAng8_26,       //-9 can be done by 26
+    PredIntraAng8_26,       //-13 can be done by 26
+    PredIntraAng8_26,       //-17 can be done by 26
+    PredIntraAng8_26,       //-21 can be done by 26
+    PredIntraAng8_26,       //-26 can be done by 26
+    PredIntraAng8_26,       //-32 can be done by 26
+    PredIntraAng8_26,       //-26 can be done by 26
+    PredIntraAng8_26,       //-21 can be done by 26
+    PredIntraAng8_26,       //-17 can be done by 26
+    PredIntraAng8_26,       //-13 can be done by 26
+    PredIntraAng8_26,       //-9 can be done by 26
     PredIntraAng8_m_5,
     PredIntraAng8_m_2,
-    PredIntraAng8_2,        //Intentionally wrong! It should be "PredIntraAng8_0" here.
+    PredIntraAng8_2,        //0 can be done by 2 - will never be called
     PredIntraAng8_2,
     PredIntraAng8_5,
-    PredIntraAng8_26,       //Intentionally wrong! It should be "PredIntraAng8_9" here.
-    PredIntraAng8_26,       //Intentionally wrong! It should be "PredIntraAng8_13" here.
-    PredIntraAng8_26,       //Intentionally wrong! It should be "PredIntraAng8_17" here.
-    PredIntraAng8_26,       //Intentionally wrong! It should be "PredIntraAng8_21" here.
+    PredIntraAng8_26,       //9 can be done by 26
+    PredIntraAng8_26,       //13 can be done by 26
+    PredIntraAng8_26,       //17 can be done by 26
+    PredIntraAng8_26,       //21 can be done by 26
     PredIntraAng8_26,
     PredIntraAng8_32
 };
@@ -3155,46 +3160,49 @@
     }
 }
 
-typedef void (*PredIntraAng16x16_table)(pixel* pDst, int dstStride, pixel *refMain, pixel *refSide, int dirMode);
-PredIntraAng16x16_table PredIntraAng16[] = {
-    /* 
-    PredIntraAng16_0 is replaced with PredIntraAng16_26. For PredIntraAng8_0 we are going through default path in the xPredIntraAng8x8 because we cannot afford to pass large number arguments for this function. 
-    Path for PredIntraAng16_26, PredIntraAng16_m_26, PredIntraAng16_21, PredIntraAng16_m_21, PredIntraAng16_17, PredIntraAng16_m_17, PredIntraAng16_13, PredIntraAng16_m_13, PredIntraAng16_9, PredIntraAng16_m_9, PredIntraAng16_5, PredIntraAng16_m_5, PredIntraAng16_2, PredIntraAng16_m_2 is same as PredIntraAng16_26.
-    */
-    PredIntraAng16_32,
-    PredIntraAng16_26,
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_21" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_17" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_13" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_9" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_5" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_2" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_0" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_m_2" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_m_5" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_m_9" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_m_13" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_m_17" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_m_21" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_m_26" here.
-    PredIntraAng16_m_32,      
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_m_26" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_m_21" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_m_17" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_m_13" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_m_9" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_m_5" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_m_2" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_0" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_2" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_5" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_9" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_13" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_17" here.
-    PredIntraAng16_26,      //Intentionally wrong! It should be "PredIntraAng16_21" here.
-    PredIntraAng16_26,
-    PredIntraAng16_32
-};
+//typedef void (*PredIntraAng16x16_table)(pixel* pDst, int dstStride, pixel *refMain, pixel *refSide, int dirMode);
+//PredIntraAng16x16_table PredIntraAng16[] = {
+//    /* 
+//    * PredIntraAng16_0 is replaced with PredIntraAng16_26. For PredIntraAng16_0 we go through the default path in xPredIntraAng16x16
+//    because we cannot afford to pass a large number of arguments to this function.
+//    * Path for PredIntraAng16_26, PredIntraAng16_m_26, PredIntraAng16_21, PredIntraAng16_m_21, PredIntraAng16_17, PredIntraAng16_m_17, 
+//    PredIntraAng16_13, PredIntraAng16_m_13, PredIntraAng16_9, PredIntraAng16_m_9, PredIntraAng16_5, PredIntraAng16_m_5, PredIntraAng16_2,
+//    PredIntraAng16_m_2 is same as PredIntraAng16_26.
+//    */
+//    PredIntraAng16_32,
+//    PredIntraAng16_26,
+//    PredIntraAng16_26,      //21 can be done by 26
+//    PredIntraAng16_26,      //17 can be done by 26
+//    PredIntraAng16_26,      //13 can be done by 26
+//    PredIntraAng16_26,      //9 can be done by 26
+//    PredIntraAng16_26,      //5 can be done by 26
+//    PredIntraAng16_26,      //2 can be done by 26
+//    PredIntraAng16_26,      //0 can be done by 26 - will never be called
+//    PredIntraAng16_26,      //-2 can be done by 26
+//    PredIntraAng16_26,      //-5 can be done by 26
+//    PredIntraAng16_26,      //-9 can be done by 26
+//    PredIntraAng16_26,      //-13 can be done by 26
+//    PredIntraAng16_26,      //-17 can be done by 26
+//    PredIntraAng16_26,      //-21 can be done by 26
+//    PredIntraAng16_26,      //-26 can be done by 26
+//    PredIntraAng16_m_32,      
+//    PredIntraAng16_26,      //-26 can be done by 26
+//    PredIntraAng16_26,      //-21 can be done by 26
+//    PredIntraAng16_26,      //-17 can be done by 26
+//    PredIntraAng16_26,      //-13 can be done by 26
+//    PredIntraAng16_26,      //-9 can be done by 26
+//    PredIntraAng16_26,      //-5 can be done by 26
+//    PredIntraAng16_26,      //-2 can be done by 26
+//    PredIntraAng16_26,      //0 can be done by 26 - will never be called
+//    PredIntraAng16_26,      //2 can be done by 26
+//    PredIntraAng16_26,      //5 can be done by 26
+//    PredIntraAng16_26,      //9 can be done by 26
+//    PredIntraAng16_26,      //13 can be done by 26
+//    PredIntraAng16_26,      //17 can be done by 26
+//    PredIntraAng16_26,      //21 can be done by 26
+//    PredIntraAng16_26,
+//    PredIntraAng16_32
+//};
 
 void xPredIntraAng16x16(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove, bool bFilter = true)
 {
@@ -3382,9 +3390,17 @@
             }
         }
     }
+    else if(intraPredAngle==32)
+    {
+        PredIntraAng16_32(pDst, dstStride, refMain, refSide, dirMode);        
+    }
+    else if(intraPredAngle==-32)
+    {
+        PredIntraAng16_m_32(pDst, dstStride, refMain, refSide, dirMode);        
+    }
     else
     {
-        PredIntraAng16[dirMode-2](pDst, dstStride, refMain, refSide, dirMode);        
+        PredIntraAng16_26(pDst, dstStride, refMain, refSide, dirMode);        
     }
 }
 
@@ -3631,13 +3647,445 @@
         PREDANG_CALCROW_HOR_MODE2(R7) \
 }
 
-void xPredIntraAng32x32(int /*bitDepth*/, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove)
+void PredIntraAng32_32(pixel* pDst, int dstStride, pixel *refMain, pixel * /*refSide*/, int /*dirMode*/) // Fully unrolled 32-row fill: row r is a plain copy of 32 values starting at refMain + 2 + r, with no interpolation. NOTE(review): the 16-byte loads equal 16 pixels only if pixel is one byte (8-bit build) - confirm.
+{    
+    __m128i itmp;
+    refMain += 2; // row 0 starts two pixels past the incoming refMain
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain); // rows 0..15: only the left 16 bytes are written here; their right 16 bytes are filled in later, while rows 16..31 are produced
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain++; // each successive row reads one pixel further along refMain
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain++;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain++;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain++;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain++;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain++;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain++;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain++); // post-increment folded into the load; same net effect as the separate refMain++ used by the other rows
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain++;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain++;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain++;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain++;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain++;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain++;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain++;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain++;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain); // rows 16..31 start here; refMain has advanced 16 times since row 0
+    refMain++;
+    _mm_storeu_si128((__m128i*)pDst, itmp); // left half of the current row
+    _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp); // the same 16 bytes are also the right half of the row 16 lines above, so that row is completed now
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 15)); // refMain was already advanced, so +15 lands 16 pixels past this row's left half
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp); // right half of the current row
+    pDst += dstStride;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    refMain++;
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    refMain++;
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    refMain++;
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    refMain++;
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    refMain++;
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    refMain++;
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    refMain++;
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    refMain++;
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    refMain++;
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    refMain++;
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    refMain++;
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    refMain++;
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    refMain++;
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    refMain++;
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    refMain++;
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+}
+
+void PredIntraAng32_m_32(pixel* pDst, int dstStride, pixel *refMain, pixel *refSide, int /*dirMode*/) // Fully unrolled 32-row fill: row r is a plain copy of 32 values starting at refMain - r, after refMain has been extended to the left from refSide. NOTE(review): writes into the caller's buffer at refMain[-31..-1]; assumes that space is writable and that pixel is one byte - confirm.
 {
-    int k;
-    int blkSize        = width;
-
-    // Map the mode index to main prediction direction and angle
-    assert(dirMode > 1); //no planar and dc
+    Vec16uc v_refSide;
+    pixel refMain0 = refMain[0]; // saved because the first store below overwrites refMain[0]
+
+    v_refSide.load(refSide);
+    v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide); // index list 15..0 reverses the byte order
+    v_refSide.store(refMain - 15); // reversed refSide[0..15] lands in refMain[-15..0]
+
+    v_refSide.load(refSide + 16);
+    v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
+    v_refSide.store(refMain - 31); // reversed refSide[16..31] lands in refMain[-31..-16]
+
+    refMain[0] = refMain0; // restore the value clobbered by the first store
+
+    __m128i itmp;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain); // rows 0..15: each iteration writes a full 32-byte row plus the right half of the row 16 lines below
+    _mm_storeu_si128((__m128i*)pDst, itmp); // left half of the current row
+    _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp); // the same 16 bytes are the right half of the row 16 lines below
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp); // right half of the current row
+    pDst += dstStride;
+    refMain--; // each successive row reads one pixel earlier in refMain
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+    itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+    _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain); // rows 16..31: the right halves were already written above, so only the left 16 bytes remain
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain);
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+    pDst += dstStride;
+    refMain--;
+
+    itmp = _mm_loadu_si128((__m128i const*)refMain); // last row (31): reads from the original refMain - 31, the lowest address filled from refSide above
+    _mm_storeu_si128((__m128i*)pDst, itmp);
+}
+
+void PredIntraAng32_26(pixel* pDst, int dstStride, pixel *refMain, pixel * /*refSide*/, int dirMode)
+{
     bool modeHor       = (dirMode < 18);
     bool modeVer       = !modeHor;
     int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
@@ -3647,16 +4095,496 @@
 
     // Set bitshifts and scale the angle parameter to block size
     int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
-    int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
-    int invAngle       = invAngTable[absAng];
     absAng             = angTable[absAng];
     intraPredAngle     = signAng * absAng;
 
-    // Do angular predictions
+    if (modeHor)
+    {
+        __m128i row11L, row12L, row11H, row12H;
+        __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
+        __m128i R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16;
+
+        Pel * original_pDst = pDst;
+        v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
+        v_ipAngle = _mm_set1_epi16(intraPredAngle);
+        thirty2 = _mm_set1_epi16(32);
+        thirty1 = _mm_set1_epi16(31);
+        __m128i itmp, itmp1, itmp2, it1, it2, it3, i16;            
+
+        CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
+            PREDANG_CALCROW_HOR(7 + 0, R8)
+            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
+            PREDANG_CALCROW_HOR(7 + 8, R16)
+            MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+            BLND2_2(R1, R9)
+            BLND2_2(R5, R13)
+            BLND2_2(R3, R11)
+            BLND2_2(R7, R15)
+            BLND2_2(R2, R10)
+            BLND2_2(R6, R14)
+            BLND2_2(R4, R12)
+            BLND2_2(R8, R16)
+
+            pDst = original_pDst + 16;
+
+        CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 16)
+            PREDANG_CALCROW_HOR(7 + 16, R8)
+            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 24)
+            R16 = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
+        MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+            BLND2_2(R1, R9)
+            BLND2_2(R5, R13)
+            BLND2_2(R3, R11)
+            BLND2_2(R7, R15)
+            BLND2_2(R2, R10)
+            BLND2_2(R6, R14)
+            BLND2_2(R4, R12)
+            BLND2_2(R8, R16)
+
+            pDst = original_pDst + (16 * dstStride);
+        refMain += 16;
+        v_deltaPos = _mm_setzero_si128();
+        v_ipAngle = _mm_set1_epi16(intraPredAngle);
+
+        CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
+            PREDANG_CALCROW_HOR(7 + 0, R8)
+            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
+            PREDANG_CALCROW_HOR(7 + 8, R16)
+            MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+            BLND2_2(R1, R9)
+            BLND2_2(R5, R13)
+            BLND2_2(R3, R11)
+            BLND2_2(R7, R15)
+            BLND2_2(R2, R10)
+            BLND2_2(R6, R14)
+            BLND2_2(R4, R12)
+            BLND2_2(R8, R16)
+            pDst = original_pDst + (16 * dstStride) + 16;
+
+        CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 16)
+            PREDANG_CALCROW_HOR(7 + 16, R8)
+            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 24)
+            R16 = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
+        MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+            BLND2_2(R1, R9)
+            BLND2_2(R5, R13)
+            BLND2_2(R3, R11)
+            BLND2_2(R7, R15)
+            BLND2_2(R2, R10)
+            BLND2_2(R6, R14)
+            BLND2_2(R4, R12)
+            BLND2_2(R8, R16)
+    }
+    else
+    {
+        __m128i row11L, row12L, row11H, row12H;
+        __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
+        //__m128i row14, row22;
+        //__m128i res1, res2;
+
+        v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
+        v_ipAngle = _mm_set1_epi16(intraPredAngle);
+        thirty2 = _mm_set1_epi16(32);
+        thirty1 = _mm_set1_epi16(31);
+        __m128i itmp, it1, it2, it3, i16;
+
+        for (int i = 0; i <= 30; i++)
+        {
+            PREDANG_CALCROW_VER(i);
+        }
+
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
+        _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
+        itmp = _mm_loadu_si128((__m128i const*)(refMain + 17 + GETAP(lookIdx, 31)));
+        _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
+    }
+}
+// PredIntraAng32_2: 32x32 angular intra prediction routine placed in the angle +2 slots of PredIntraAng32[].
+void PredIntraAng32_2(pixel* pDst, int dstStride, pixel *refMain, pixel * /*refSide*/, int dirMode)
+{
+    bool modeHor       = (dirMode < 18);   // dirMode below 18 selects the modeHor branch further down
+    bool modeVer       = !modeHor;
+    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+    //int lookIdx = intraPredAngle;
+    int absAng         = abs(intraPredAngle);
+    int signAng        = intraPredAngle < 0 ? -1 : 1;
+
+    // Turn the angle index into the signed prediction angle through angTable (assumes absAng is within 0..8 - TODO confirm with callers)
+    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
+    absAng             = angTable[absAng];
+    intraPredAngle     = signAng * absAng;
+    // NOTE(review): LOADROW, CALC_BLND_8ROWS_MODE2, PREDANG_CALCROW_*_MODE2, MB8 and BLND2_2 are macros defined outside this function; presumably they use the locals below by name, so names and statement order must stay as they are.
+    if (modeHor)
+    {
+        __m128i row11L, row12L, row11H, row12H, res1, res2;
+        __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
+        __m128i R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16;
+
+        Pel * original_pDst = pDst;   // block origin, used to derive the destination of each 16x16 tile
+        v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
+        v_ipAngle = _mm_set1_epi16(intraPredAngle);   // angle replicated into every 16-bit lane
+        thirty2 = _mm_set1_epi16(32);
+        thirty1 = _mm_set1_epi16(31);   // mask applied to v_deltaPos to obtain v_deltaFract (low five bits)
+        __m128i itmp, itmp1, itmp2, it1, it2, it3, i16;
+
+        // Tile 0: pDst still at the block origin; row pairs loaded with LOADROW offsets 0 and 1, R16 preset from the second pair
+        LOADROW(row11L, row11H, 0)
+        LOADROW(row12L, row12H, 1)
+        R16 = _mm_packus_epi16(row12L, row12H);
+
+        CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+        PREDANG_CALCROW_HOR_MODE2(R8)
+        MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+        CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+        MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+        BLND2_2(R1, R9)
+        BLND2_2(R5, R13)
+        BLND2_2(R3, R11)
+        BLND2_2(R7, R15)
+        BLND2_2(R2, R10)
+        BLND2_2(R6, R14)
+        BLND2_2(R4, R12)
+        BLND2_2(R8, R16)
+        // Step the position once more, slide the row pair (row12 -> row11) and load offset 2 as the new second pair for tile 1
+        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+        row11L = row12L;
+        row11H = row12H;
+        LOADROW(row12L, row12H, 2)
+        R16 = _mm_packus_epi16(row12L, row12H);
+        pDst = original_pDst + 16;   // tile 1: 16 pixels to the right of the block origin
+
+        CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+        PREDANG_CALCROW_HOR_MODE2(R8)
+        MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+        CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+        MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+        BLND2_2(R1, R9)
+        BLND2_2(R5, R13)
+        BLND2_2(R3, R11)
+        BLND2_2(R7, R15)
+        BLND2_2(R2, R10)
+        BLND2_2(R6, R14)
+        BLND2_2(R4, R12)
+        BLND2_2(R8, R16)
+
+        pDst = original_pDst + (16 * dstStride);   // tile 2: 16 rows below the block origin
+        refMain += 16;                             // reference window moved on by 16 pixels for the two lower tiles
+        v_deltaPos = _mm_setzero_si128();          // position accumulator restarted, then the tile 0 sequence is repeated
+
+        v_ipAngle = _mm_set1_epi16(intraPredAngle);
+        LOADROW(row11L, row11H, 0)
+        LOADROW(row12L, row12H, 1)
+        R16 = _mm_packus_epi16(row12L, row12H);
+
+        CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+        PREDANG_CALCROW_HOR_MODE2(R8)
+        MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+        CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+        MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+        BLND2_2(R1, R9)
+        BLND2_2(R5, R13)
+        BLND2_2(R3, R11)
+        BLND2_2(R7, R15)
+        BLND2_2(R2, R10)
+        BLND2_2(R6, R14)
+        BLND2_2(R4, R12)
+        BLND2_2(R8, R16)
+        // Same hand-over as between tiles 0 and 1: one more position step, row12 -> row11, new second pair from offset 2
+        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+        row11L = row12L;
+        row11H = row12H;
+        LOADROW(row12L, row12H, 2)
+        R16 = _mm_packus_epi16(row12L, row12H);
+        pDst = original_pDst + (16 * dstStride) + 16;   // tile 3: 16 rows down and 16 pixels right
+
+        CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+        PREDANG_CALCROW_HOR_MODE2(R8)
+        MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+        CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+        MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+        BLND2_2(R1, R9)
+        BLND2_2(R5, R13)
+        BLND2_2(R3, R11)
+        BLND2_2(R7, R15)
+        BLND2_2(R2, R10)
+        BLND2_2(R6, R14)
+        BLND2_2(R4, R12)
+        BLND2_2(R8, R16)
+        return;
+    }
+    else
+    {
+        __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
+        __m128i row11, row12, row13, row14, row21, row22, row23, row24;
+        __m128i res1, res2;
+
+        v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
+        v_ipAngle = _mm_set1_epi16(intraPredAngle);
+        thirty2 = _mm_set1_epi16(32);
+        thirty1 = _mm_set1_epi16(31);
+        __m128i itmp, it1, it2, it3, i16;
+        // LOADROW offsets 0/1 fill row11,row12 / row21,row22; offsets 16/17 fill row13,row14 / row23,row24 (stored below at pDst and pDst + 16 respectively)
+        LOADROW(row11, row12, 0)
+        LOADROW(row21, row22, 1)
+        LOADROW(row13, row14, 16)
+        LOADROW(row23, row24, 17)
+        for (int i = 0; i <= 14; i++)   // rows 0..14 are produced by the per-row macro
+        {
+            PREDANG_CALCROW_VER_MODE2(i);
+        }
+
+        // Row 15 (the 16th row): deltaFract == 0, so the packed row21..row24 values are stored as they are
+        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+        itmp = _mm_packus_epi16(row21, row22);
+        _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride)), itmp);
+        itmp = _mm_packus_epi16(row23, row24);
+        _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride) + 16), itmp);
+        // The second row set becomes the first one; the new second set is loaded from offsets 2 and 18
+        row11 = row21;
+        row12 = row22;
+        row13 = row23;
+        row14 = row24;
+
+        LOADROW(row21, row22, 2)
+        LOADROW(row23, row24, 18)
+        for (int i = 16; i <= 30; i++)   // rows 16..30 via the per-row macro
+        {
+            PREDANG_CALCROW_VER_MODE2(i);
+        }
+        // Row 31 is written directly from the packed second row set, like row 15
+        itmp = _mm_packus_epi16(row21, row22);
+        _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
+        itmp = _mm_packus_epi16(row23, row24);
+        _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
+    }
+}
+// PredIntraAng32_m_2: 32x32 angular intra prediction routine placed in the angle -2 slots of PredIntraAng32[].
+void PredIntraAng32_m_2(pixel* pDst, int dstStride, pixel *refMain, pixel * /*refSide*/, int dirMode)
+{
+    bool modeHor       = (dirMode < 18);   // dirMode below 18 selects the modeHor branch further down
+    bool modeVer       = !modeHor;
+    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+    //int lookIdx = intraPredAngle;
+    int absAng         = abs(intraPredAngle);
+    int signAng        = intraPredAngle < 0 ? -1 : 1;
+
+    // Turn the angle index into the signed prediction angle through angTable (assumes absAng is within 0..8 - TODO confirm with callers)
+    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
+    absAng             = angTable[absAng];
+    intraPredAngle     = signAng * absAng;
+    // NOTE(review): LOADROW, CALC_BLND_8ROWS_MODE2, PREDANG_CALCROW_*_MODE2, MB8 and BLND2_2 are macros defined outside this function; presumably they use the locals below by name, so names and statement order must stay as they are.
+    if (modeHor)
+    {
+        __m128i row11L, row12L, row11H, row12H, res1, res2;
+        __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
+        __m128i R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16;
+
+        Pel * original_pDst = pDst;   // block origin, used to derive the destination of each 16x16 tile
+        v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
+        v_ipAngle = _mm_set1_epi16(intraPredAngle);   // angle replicated into every 16-bit lane
+        thirty2 = _mm_set1_epi16(32);
+        thirty1 = _mm_set1_epi16(31);   // mask applied to v_deltaPos to obtain v_deltaFract (low five bits)
+        __m128i itmp, itmp1, itmp2, it1, it2, it3, i16;
+        // Tile 0: pDst still at the block origin; row pairs loaded with LOADROW offsets -1 and 0, R16 preset from the first pair
+        LOADROW(row11L, row11H, -1)
+        LOADROW(row12L, row12H,  0)
+        R16 = _mm_packus_epi16(row11L, row11H); //R16 = compress(row11L, row11H);
+
+        CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+        PREDANG_CALCROW_HOR_MODE2(R8)
+        MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+        CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+        MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+        BLND2_2(R1, R9)
+        BLND2_2(R5, R13)
+        BLND2_2(R3, R11)
+        BLND2_2(R7, R15)
+        BLND2_2(R2, R10)
+        BLND2_2(R6, R14)
+        BLND2_2(R4, R12)
+        BLND2_2(R8, R16)
+        // Step the position once more, slide the row pair the other way (row11 -> row12) and load offset -2 as the new first pair for tile 1
+        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+        row12L = row11L;
+        row12H = row11H;
+        LOADROW(row11L, row11H, -2)
+        R16 = _mm_packus_epi16(row11L, row11H);
+        pDst = original_pDst + 16;   // tile 1: 16 pixels to the right of the block origin
+
+        CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+        PREDANG_CALCROW_HOR_MODE2(R8)
+        MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+        CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+        MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+        BLND2_2(R1, R9)
+        BLND2_2(R5, R13)
+        BLND2_2(R3, R11)
+        BLND2_2(R7, R15)
+        BLND2_2(R2, R10)
+        BLND2_2(R6, R14)
+        BLND2_2(R4, R12)
+        BLND2_2(R8, R16)
+
+        pDst = original_pDst + (16 * dstStride);   // tile 2: 16 rows below the block origin
+        refMain += 16;                             // reference window moved on by 16 pixels for the two lower tiles
+        // Position accumulator restarted, then the tile 0 sequence is repeated
+        v_deltaPos = _mm_setzero_si128();
+        v_ipAngle = _mm_set1_epi16(intraPredAngle);
+        LOADROW(row11L, row11H, -1)
+        LOADROW(row12L, row12H,  0)
+        R16 = _mm_packus_epi16(row11L, row11H);
+
+        CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+        PREDANG_CALCROW_HOR_MODE2(R8)
+        MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+        CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+        MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+        BLND2_2(R1, R9)
+        BLND2_2(R5, R13)
+        BLND2_2(R3, R11)
+        BLND2_2(R7, R15)
+        BLND2_2(R2, R10)
+        BLND2_2(R6, R14)
+        BLND2_2(R4, R12)
+        BLND2_2(R8, R16)
+        // Same hand-over as between tiles 0 and 1: one more position step, row11 -> row12, new first pair from offset -2
+        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+        row12L = row11L;
+        row12H = row11H;
+        LOADROW(row11L, row11H, -2)
+        R16 = _mm_packus_epi16(row11L, row11H);
+        pDst = original_pDst + (16 * dstStride) + 16;   // tile 3: 16 rows down and 16 pixels right
+
+        CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+        PREDANG_CALCROW_HOR_MODE2(R8)
+        MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+        CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+        MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+        BLND2_2(R1, R9)
+        BLND2_2(R5, R13)
+        BLND2_2(R3, R11)
+        BLND2_2(R7, R15)
+        BLND2_2(R2, R10)
+        BLND2_2(R6, R14)
+        BLND2_2(R4, R12)
+        BLND2_2(R8, R16)
+
+    }
+    else
+    {
+        __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
+        __m128i row11, row12, row13, row14, row21, row22, row23, row24;
+        __m128i res1, res2;
+
+        v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
+        v_ipAngle = _mm_set1_epi16(intraPredAngle);
+        thirty2 = _mm_set1_epi16(32);
+        thirty1 = _mm_set1_epi16(31);
+        __m128i itmp, it1, it2, it3, i16;
+
+        // LOADROW offsets -1/0 fill row11,row12 / row21,row22; offsets 15/16 fill row13,row14 / row23,row24 (stored below at pDst and pDst + 16 respectively)
+        LOADROW(row11, row12, -1)
+        LOADROW(row21, row22,  0)
+        LOADROW(row13, row14, 15)
+        LOADROW(row23, row24, 16)
+        for (int i = 0; i <= 14; i++)   // rows 0..14 are produced by the per-row macro
+        {
+            PREDANG_CALCROW_VER_MODE2(i);
+        }
+
+        // Row 15 (the 16th row): deltaFract == 0, so the packed row11..row14 values are stored as they are
+        v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+        v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+        itmp = _mm_packus_epi16(row11, row12);
+        _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride)), itmp);
+        itmp = _mm_packus_epi16(row13, row14);
+        _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride) + 16), itmp);
+        // The first row set becomes the second one; the new first set is loaded from offsets -2 and 14
+        row21 = row11;
+        row22 = row12;
+        row23 = row13;
+        row24 = row14;
+
+        LOADROW(row11, row12, -2)
+        LOADROW(row13, row14, 14)
+        for (int i = 16; i <= 30; i++)   // rows 16..30 via the per-row macro
+        {
+            PREDANG_CALCROW_VER_MODE2(i);
+        }
+        // Row 31 is written directly from the packed first row set, like row 15
+        itmp = _mm_packus_epi16(row11, row12);
+        _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
+        itmp = _mm_packus_epi16(row13, row14);
+        _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
+    }
+}
+
+typedef void (*PredIntraAng32x32_table)(pixel* pDst, int dstStride, pixel *refMain, pixel *refSide, int dirMode);
+PredIntraAng32x32_table PredIntraAng32[] = {
+    /* 
+    Entries are ordered by angle: 32, 26, 21, ..., 2, 0, -2, ..., -32, ..., -2, 0, 2, ..., 32 (the same sequence as mode_to_angle_table in xPredIntraAng32x32; presumably indexed by dirMode - 2 - TODO confirm). The angle-0 slots hold PredIntraAng32_26 only as a placeholder and, per the entry comments, are never called.
+    Angles +/-26, +/-21, +/-17, +/-13, +/-9 and +/-5 all go through PredIntraAng32_26; only +32, -32, +2 and -2 have dedicated functions (PredIntraAng32_32, PredIntraAng32_m_32, PredIntraAng32_2, PredIntraAng32_m_2).
+    */    
+    PredIntraAng32_32,
+    PredIntraAng32_26,
+    PredIntraAng32_26,  //21 can be done by 26
+    PredIntraAng32_26,  //17 can be done by 26
+    PredIntraAng32_26,  //13 can be done by 26
+    PredIntraAng32_26,  //9 can be done by 26   
+    PredIntraAng32_26,  //5 can be done by 26
+    PredIntraAng32_2,
+    PredIntraAng32_26,  //0 can be done by 26 - will never be called
+    PredIntraAng32_m_2,
+    PredIntraAng32_26,  //-5 can be done by 26
+    PredIntraAng32_26,  //-9 can be done by 26
+    PredIntraAng32_26,  //-13 can be done by 26
+    PredIntraAng32_26,  //-17 can be done by 26
+    PredIntraAng32_26,  //-21 can be done by 26
+    PredIntraAng32_26,  //-26 can be done by 26
+    PredIntraAng32_m_32,
+    PredIntraAng32_26,  //-26 can be done by 26
+    PredIntraAng32_26,  //-21 can be done by 26
+    PredIntraAng32_26,  //-17 can be done by 26
+    PredIntraAng32_26,  //-13 can be done by 26
+    PredIntraAng32_26,  //-9 can be done by 26
+    PredIntraAng32_26,  //-5 can be done by 26
+    PredIntraAng32_m_2,
+    PredIntraAng32_26,  //0 can be done by 26 - will never be called
+    PredIntraAng32_2,
+    PredIntraAng32_26,  //5 can be done by 26
+    PredIntraAng32_26,  //9 can be done by 26
+    PredIntraAng32_26,  //13 can be done by 26
+    PredIntraAng32_26,  //17 can be done by 26
+    PredIntraAng32_26,  //21 can be done by 26
+    PredIntraAng32_26,
+    PredIntraAng32_32
+};
+
+void xPredIntraAng32x32(int /*bitDepth*/, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove)
+{
+    int k;
+    int blkSize        = width;
+
+    // Map the mode index to main prediction direction and angle
+    assert(dirMode > 1); //no planar and dc
+    static const int mode_to_angle_table[] = {32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32};
+    static const int mode_to_invAng_table[] = {256, 315, 390, 482, 630, 910, 1638, 4096, 0, 4096, 1638, 910, 630, 482, 390, 315, 256, 315, 390, 482, 630, 910, 1638, 4096, 0, 4096, 1638, 910, 630, 482, 390, 315, 256};
+    int intraPredAngle = mode_to_angle_table[dirMode-2];
+    int invAngle       = mode_to_invAng_table[dirMode-2];
+    bool modeHor       = (dirMode < 18);
+    bool modeVer       = !modeHor;
 
     pixel* refMain;
     pixel* refSide;
-
     // Initialise the Main and Left reference array.
     if (intraPredAngle < 0)
     {
@@ -3928,808 +4856,10 @@
             _mm_storeu_si128((__m128i*)(pDst), v_main);
         }
     }
-    else if (intraPredAngle == -32)
-    {
-        Vec16uc v_refSide;
-        pixel refMain0 = refMain[0];
-
-        v_refSide.load(refSide);
-        v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
-        v_refSide.store(refMain - 15);
-
-        v_refSide.load(refSide + 16);
-        v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
-        v_refSide.store(refMain - 31);
-
-        refMain[0] = refMain0;
-
-        __m128i itmp;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain--;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-
-        return;
-    }
-    else if (intraPredAngle == 32)
-    {
-        __m128i itmp;
-        refMain += 2;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain++);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        pDst += dstStride;
-        refMain++;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        itmp = _mm_loadu_si128((__m128i const*)refMain);
-        refMain++;
-        _mm_storeu_si128((__m128i*)pDst, itmp);
-        _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
-        itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
-        _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
-        pDst += dstStride;
-
-        return;
-    }
     else
     {
-        if (modeHor)
-        {
-            __m128i row11L, row12L, row11H, row12H, res1, res2;
-            __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
-            __m128i R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16;
-
-            Pel * original_pDst = pDst;
-            v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
-            v_ipAngle = _mm_set1_epi16(intraPredAngle);
-            thirty2 = _mm_set1_epi16(32);
-            thirty1 = _mm_set1_epi16(31);
-            __m128i itmp, itmp1, itmp2, it1, it2, it3, i16;
-
-            switch (intraPredAngle)
-            {
-            case -2:
-                LOADROW(row11L, row11H, -1)
-                LOADROW(row12L, row12H,  0)
-                R16 = _mm_packus_epi16(row11L, row11H); //R16 = compress(row11L, row11H);
-
-                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
-                PREDANG_CALCROW_HOR_MODE2(R8)
-                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
-                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-                BLND2_2(R1, R9)
-                BLND2_2(R5, R13)
-                BLND2_2(R3, R11)
-                BLND2_2(R7, R15)
-                BLND2_2(R2, R10)
-                BLND2_2(R6, R14)
-                BLND2_2(R4, R12)
-                BLND2_2(R8, R16)
-
-                v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
-                v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
-                row12L = row11L;
-                row12H = row11H;
-                LOADROW(row11L, row11H, -2)
-                R16 = _mm_packus_epi16(row11L, row11H);
-                pDst = original_pDst + 16;
-
-                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
-                PREDANG_CALCROW_HOR_MODE2(R8)
-                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
-                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-                BLND2_2(R1, R9)
-                BLND2_2(R5, R13)
-                BLND2_2(R3, R11)
-                BLND2_2(R7, R15)
-                BLND2_2(R2, R10)
-                BLND2_2(R6, R14)
-                BLND2_2(R4, R12)
-                BLND2_2(R8, R16)
-
-                pDst = original_pDst + (16 * dstStride);
-                refMain += 16;
-
-                v_deltaPos = _mm_setzero_si128();
-                v_ipAngle = _mm_set1_epi16(intraPredAngle);
-                LOADROW(row11L, row11H, -1)
-                LOADROW(row12L, row12H,  0)
-                R16 = _mm_packus_epi16(row11L, row11H);
-
-                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
-                PREDANG_CALCROW_HOR_MODE2(R8)
-                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
-                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-                BLND2_2(R1, R9)
-                BLND2_2(R5, R13)
-                BLND2_2(R3, R11)
-                BLND2_2(R7, R15)
-                BLND2_2(R2, R10)
-                BLND2_2(R6, R14)
-                BLND2_2(R4, R12)
-                BLND2_2(R8, R16)
-
-                v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
-                v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
-                row12L = row11L;
-                row12H = row11H;
-                LOADROW(row11L, row11H, -2)
-                R16 = _mm_packus_epi16(row11L, row11H);
-                pDst = original_pDst + (16 * dstStride) + 16;
-
-                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
-                PREDANG_CALCROW_HOR_MODE2(R8)
-                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
-                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-                BLND2_2(R1, R9)
-                BLND2_2(R5, R13)
-                BLND2_2(R3, R11)
-                BLND2_2(R7, R15)
-                BLND2_2(R2, R10)
-                BLND2_2(R6, R14)
-                BLND2_2(R4, R12)
-                BLND2_2(R8, R16)
-                return;
-
-            case  2:
-                LOADROW(row11L, row11H, 0)
-                LOADROW(row12L, row12H, 1)
-                R16 = _mm_packus_epi16(row12L, row12H);
-
-                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
-                PREDANG_CALCROW_HOR_MODE2(R8)
-                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
-                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-                BLND2_2(R1, R9)
-                BLND2_2(R5, R13)
-                BLND2_2(R3, R11)
-                BLND2_2(R7, R15)
-                BLND2_2(R2, R10)
-                BLND2_2(R6, R14)
-                BLND2_2(R4, R12)
-                BLND2_2(R8, R16)
-
-                v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
-                v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
-                row11L = row12L;
-                row11H = row12H;
-                LOADROW(row12L, row12H, 2)
-                R16 = _mm_packus_epi16(row12L, row12H);
-                pDst = original_pDst + 16;
-
-                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
-                PREDANG_CALCROW_HOR_MODE2(R8)
-                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
-                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-                BLND2_2(R1, R9)
-                BLND2_2(R5, R13)
-                BLND2_2(R3, R11)
-                BLND2_2(R7, R15)
-                BLND2_2(R2, R10)
-                BLND2_2(R6, R14)
-                BLND2_2(R4, R12)
-                BLND2_2(R8, R16)
-
-                pDst = original_pDst + (16 * dstStride);
-                refMain += 16;
-                v_deltaPos = _mm_setzero_si128();
-
-                v_ipAngle = _mm_set1_epi16(intraPredAngle);
-                LOADROW(row11L, row11H, 0)
-                LOADROW(row12L, row12H, 1)
-                R16 = _mm_packus_epi16(row12L, row12H);
-
-                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
-                PREDANG_CALCROW_HOR_MODE2(R8)
-                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
-                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-                BLND2_2(R1, R9)
-                BLND2_2(R5, R13)
-                BLND2_2(R3, R11)
-                BLND2_2(R7, R15)
-                BLND2_2(R2, R10)
-                BLND2_2(R6, R14)
-                BLND2_2(R4, R12)
-                BLND2_2(R8, R16)
-
-                v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
-                v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
-                row11L = row12L;
-                row11H = row12H;
-                LOADROW(row12L, row12H, 2)
-                R16 = _mm_packus_epi16(row12L, row12H);
-                pDst = original_pDst + (16 * dstStride) + 16;
-
-                CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
-                PREDANG_CALCROW_HOR_MODE2(R8)
-                MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-                CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
-                MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-                BLND2_2(R1, R9)
-                BLND2_2(R5, R13)
-                BLND2_2(R3, R11)
-                BLND2_2(R7, R15)
-                BLND2_2(R2, R10)
-                BLND2_2(R6, R14)
-                BLND2_2(R4, R12)
-                BLND2_2(R8, R16)
-                return;
-            }
-
-            CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
-            PREDANG_CALCROW_HOR(7 + 0, R8)
-            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
-            PREDANG_CALCROW_HOR(7 + 8, R16)
-            MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-            BLND2_2(R1, R9)
-            BLND2_2(R5, R13)
-            BLND2_2(R3, R11)
-            BLND2_2(R7, R15)
-            BLND2_2(R2, R10)
-            BLND2_2(R6, R14)
-            BLND2_2(R4, R12)
-            BLND2_2(R8, R16)
-
-            pDst = original_pDst + 16;
-
-            CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 16)
-            PREDANG_CALCROW_HOR(7 + 16, R8)
-            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 24)
-            R16 = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
-            MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-            BLND2_2(R1, R9)
-            BLND2_2(R5, R13)
-            BLND2_2(R3, R11)
-            BLND2_2(R7, R15)
-            BLND2_2(R2, R10)
-            BLND2_2(R6, R14)
-            BLND2_2(R4, R12)
-            BLND2_2(R8, R16)
-
-            pDst = original_pDst + (16 * dstStride);
-            refMain += 16;
-            v_deltaPos = _mm_setzero_si128();
-            v_ipAngle = _mm_set1_epi16(intraPredAngle);
-
-            CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
-            PREDANG_CALCROW_HOR(7 + 0, R8)
-            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
-            PREDANG_CALCROW_HOR(7 + 8, R16)
-            MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-            BLND2_2(R1, R9)
-            BLND2_2(R5, R13)
-            BLND2_2(R3, R11)
-            BLND2_2(R7, R15)
-            BLND2_2(R2, R10)
-            BLND2_2(R6, R14)
-            BLND2_2(R4, R12)
-            BLND2_2(R8, R16)
-            pDst = original_pDst + (16 * dstStride) + 16;
-
-            CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 16)
-            PREDANG_CALCROW_HOR(7 + 16, R8)
-            MB8(R1, R2, R3, R4, R5, R6, R7, R8)
-            CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 24)
-            R16 = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
-            MB8(R9, R10, R11, R12, R13, R14, R15, R16)
-            BLND2_2(R1, R9)
-            BLND2_2(R5, R13)
-            BLND2_2(R3, R11)
-            BLND2_2(R7, R15)
-            BLND2_2(R2, R10)
-            BLND2_2(R6, R14)
-            BLND2_2(R4, R12)
-            BLND2_2(R8, R16)
-        }
-        else
-        {
-            __m128i row11L, row12L, row11H, row12H;
-            __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
-            __m128i row11, row12, row13, row14, row21, row22, row23, row24;
-            __m128i res1, res2;
-
-            v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
-            v_ipAngle = _mm_set1_epi16(intraPredAngle);
-            thirty2 = _mm_set1_epi16(32);
-            thirty1 = _mm_set1_epi16(31);
-            __m128i itmp, it1, it2, it3, i16;
-
-            switch (intraPredAngle)
-            {
-            case -2:
-                LOADROW(row11, row12, -1)
-                LOADROW(row21, row22,  0)
-                LOADROW(row13, row14, 15)
-                LOADROW(row23, row24, 16)
-                for (int i = 0; i <= 14; i++)
-                {
-                    PREDANG_CALCROW_VER_MODE2(i);
-                }
-
-                //deltaFract == 0 for 16th row
-                v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
-                v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
-                itmp = _mm_packus_epi16(row11, row12);
-                _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride)), itmp);
-                itmp = _mm_packus_epi16(row13, row14);
-                _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride) + 16), itmp);
-
-                row21 = row11;
-                row22 = row12;
-                row23 = row13;
-                row24 = row14;
-
-                LOADROW(row11, row12, -2)
-                LOADROW(row13, row14, 14)
-                for (int i = 16; i <= 30; i++)
-                {
-                    PREDANG_CALCROW_VER_MODE2(i);
-                }
-
-                itmp = _mm_packus_epi16(row11, row12);
-                _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
-                itmp = _mm_packus_epi16(row13, row14);
-                _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
-
-                return;
-
-            case  2:
-
-                LOADROW(row11, row12, 0)
-                LOADROW(row21, row22, 1)
-                LOADROW(row13, row14, 16)
-                LOADROW(row23, row24, 17)
-                for (int i = 0; i <= 14; i++)
-                {
-                    PREDANG_CALCROW_VER_MODE2(i);
-                }
-
-                //deltaFract == 0 for 16th row
-
-                v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
-                v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
-                itmp = _mm_packus_epi16(row21, row22);
-                _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride)), itmp);
-                itmp = _mm_packus_epi16(row23, row24);
-                _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride) + 16), itmp);
-
-                row11 = row21;
-                row12 = row22;
-                row13 = row23;
-                row14 = row24;
-
-                LOADROW(row21, row22, 2)
-                LOADROW(row23, row24, 18)
-                for (int i = 16; i <= 30; i++)
-                {
-                    PREDANG_CALCROW_VER_MODE2(i);
-                }
-
-                itmp = _mm_packus_epi16(row21, row22);
-                _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
-                itmp = _mm_packus_epi16(row23, row24);
-                _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
-
-                return;
-            }
-
-            for (int i = 0; i <= 30; i++)
-            {
-                PREDANG_CALCROW_VER(i);
-            }
-
-            itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
-            _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
-            itmp = _mm_loadu_si128((__m128i const*)(refMain + 17 + GETAP(lookIdx, 31)));
-            _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
-        }
-    }
+        PredIntraAng32[dirMode-2](pDst, dstStride, refMain, refSide, dirMode);        
+    }    
 }
 
 #endif /* if HIGH_BIT_DEPTH */
-------------- next part --------------
A non-text attachment was scrubbed...
Name: xhevc_27June-2.patch
Type: text/x-patch
Size: 79744 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20130628/4298e479/attachment-0001.bin>


More information about the x265-devel mailing list