[x265] [PATCH 2 of 2] primitives: 8 bit : PredIntraAng32x32 function table implementation
mandar at multicorewareinc.com
mandar at multicorewareinc.com
Fri Jun 28 14:32:23 CEST 2013
# HG changeset patch
# User Mandar Gurav
# Date 1372422711 25200
# Node ID af5d7a6d24858af71355964374213adbb654b4f3
# Parent 124a8b402cd671dfd76bfebcdd0ebad8136b0da1
primitives: 8 bit : PredIntraAng32x32 function table implementation
diff -r 124a8b402cd6 -r af5d7a6d2485 source/common/vec/intrapred.inc
--- a/source/common/vec/intrapred.inc Thu Jun 27 23:24:42 2013 -0700
+++ b/source/common/vec/intrapred.inc Fri Jun 28 05:31:51 2013 -0700
@@ -2218,7 +2218,10 @@
typedef void (*PredIntraAng4x4_table)(pixel* pDst, int dstStride, pixel *refMain, int dirMode);
PredIntraAng4x4_table PredIntraAng4[] = {
- /* PredIntraAng4_0 is replaced with PredIntraAng4_2. For PredIntraAng4_0 we are going through default path in the xPredIntraAng4x4 because we cannot afford to pass large number arguments for this function. */
+ /*
+ * PredIntraAng4_0 is replaced with PredIntraAng4_2. For PredIntraAng4_0 we are going through default path in the xPredIntraAng4x4
+ because we cannot afford to pass large number arguments for this function.
+ */
PredIntraAng4_32,
PredIntraAng4_26,
PredIntraAng4_21,
@@ -2227,7 +2230,7 @@
PredIntraAng4_9,
PredIntraAng4_5,
PredIntraAng4_2,
- PredIntraAng4_2, //Intentionally wrong! It should be "PredIntraAng4_0" here.
+ PredIntraAng4_2, //0 can be done by 2- will never be called
PredIntraAng4_m_2,
PredIntraAng4_m_5,
PredIntraAng4_m_9,
@@ -2243,7 +2246,7 @@
PredIntraAng4_m_9,
PredIntraAng4_m_5,
PredIntraAng4_m_2,
- PredIntraAng4_2, //Intentionally wrong! It should be "PredIntraAng4_0" here.
+ PredIntraAng4_2, //0 can be done by 2 - will never be called
PredIntraAng4_2,
PredIntraAng4_5,
PredIntraAng4_9,
@@ -2664,40 +2667,42 @@
typedef void (*PredIntraAng8x8_table)(pixel* pDst, int dstStride, pixel *refMain, int dirMode);
PredIntraAng8x8_table PredIntraAng8[] = {
/*
- PredIntraAng8_0 is replaced with PredIntraAng8_2. For PredIntraAng8_0 we are going through default path in the xPredIntraAng8x8 because we cannot afford to pass large number arguments for this function.
- Path for PredIntraAng8_21, PredIntraAng8_m_21, PredIntraAng8_17, PredIntraAng8_m_17, PredIntraAng8_13, PredIntraAng8_m_13, PredIntraAng8_9, PredIntraAng8_m_9 is same as PredIntraAng8_26.
+ * PredIntraAng8_0 is replaced with PredIntraAng8_2. For PredIntraAng8_0 we are going through default path in the xPredIntraAng8x8
+ because we cannot afford to pass large number arguments for this function.
+ * Path for PredIntraAng8_21, PredIntraAng8_m_21, PredIntraAng8_17, PredIntraAng8_m_17, PredIntraAng8_13, PredIntraAng8_m_13, PredIntraAng8_9,
+ PredIntraAng8_m_9 is same as PredIntraAng8_26.
*/
PredIntraAng8_32,
PredIntraAng8_26,
- PredIntraAng8_26, //Intentionally wrong! It should be "PredIntraAng8_21" here.
- PredIntraAng8_26, //Intentionally wrong! It should be "PredIntraAng8_17" here.
- PredIntraAng8_26, //Intentionally wrong! It should be "PredIntraAng8_13" here.
- PredIntraAng8_26, //Intentionally wrong! It should be "PredIntraAng8_9" here.
+ PredIntraAng8_26, //21 can be done by 26
+ PredIntraAng8_26, //17 can be done by 26
+ PredIntraAng8_26, //13 can be done by 26
+ PredIntraAng8_26, //9 can be done by 26
PredIntraAng8_5,
PredIntraAng8_2,
- PredIntraAng8_2, //Intentionally wrong! It should be "PredIntraAng8_0" here.
+ PredIntraAng8_2, //0 can be done by 2 - will never be called
PredIntraAng8_m_2,
PredIntraAng8_m_5,
- PredIntraAng8_26, //Intentionally wrong! It should be "PredIntraAng8_m_9" here.
- PredIntraAng8_26, //Intentionally wrong! It should be "PredIntraAng8_m_13" here.
- PredIntraAng8_26, //Intentionally wrong! It should be "PredIntraAng8_m_17" here.
- PredIntraAng8_26, //Intentionally wrong! It should be "PredIntraAng8_m_21" here.
- PredIntraAng8_26, //Intentionally wrong! It should be "PredIntraAng8_m_26" here.
- PredIntraAng8_26, //Intentionally wrong! It should be "PredIntraAng8_m_32" here.
- PredIntraAng8_26, //Intentionally wrong! It should be "PredIntraAng8_m_26" here.
- PredIntraAng8_26, //Intentionally wrong! It should be "PredIntraAng8_m_21" here.
- PredIntraAng8_26, //Intentionally wrong! It should be "PredIntraAng8_m_17" here.
- PredIntraAng8_26, //Intentionally wrong! It should be "PredIntraAng8_m_13" here.
- PredIntraAng8_26, //Intentionally wrong! It should be "PredIntraAng8_m_9" here.
+ PredIntraAng8_26, //-9 can be done by 26
+ PredIntraAng8_26, //-13 can be done by 26
+ PredIntraAng8_26, //-17 can be done by 26
+ PredIntraAng8_26, //-21 can be done by 26
+ PredIntraAng8_26, //-26 can be done by 26
+ PredIntraAng8_26, //-32 can be done by 26
+ PredIntraAng8_26, //-26 can be done by 26
+ PredIntraAng8_26, //-21 can be done by 26
+ PredIntraAng8_26, //-17 can be done by 26
+ PredIntraAng8_26, //-13 can be done by 26
+ PredIntraAng8_26, //-9 can be done by 26
PredIntraAng8_m_5,
PredIntraAng8_m_2,
- PredIntraAng8_2, //Intentionally wrong! It should be "PredIntraAng8_0" here.
+ PredIntraAng8_2, //0 can be done by 2 - will never be called
PredIntraAng8_2,
PredIntraAng8_5,
- PredIntraAng8_26, //Intentionally wrong! It should be "PredIntraAng8_9" here.
- PredIntraAng8_26, //Intentionally wrong! It should be "PredIntraAng8_13" here.
- PredIntraAng8_26, //Intentionally wrong! It should be "PredIntraAng8_17" here.
- PredIntraAng8_26, //Intentionally wrong! It should be "PredIntraAng8_21" here.
+ PredIntraAng8_26, //9 can be done by 26
+ PredIntraAng8_26, //13 can be done by 26
+ PredIntraAng8_26, //17 can be done by 26
+ PredIntraAng8_26, //21 can be done by 26
PredIntraAng8_26,
PredIntraAng8_32
};
@@ -3155,46 +3160,49 @@
}
}
-typedef void (*PredIntraAng16x16_table)(pixel* pDst, int dstStride, pixel *refMain, pixel *refSide, int dirMode);
-PredIntraAng16x16_table PredIntraAng16[] = {
- /*
- PredIntraAng16_0 is replaced with PredIntraAng16_26. For PredIntraAng8_0 we are going through default path in the xPredIntraAng8x8 because we cannot afford to pass large number arguments for this function.
- Path for PredIntraAng16_26, PredIntraAng16_m_26, PredIntraAng16_21, PredIntraAng16_m_21, PredIntraAng16_17, PredIntraAng16_m_17, PredIntraAng16_13, PredIntraAng16_m_13, PredIntraAng16_9, PredIntraAng16_m_9, PredIntraAng16_5, PredIntraAng16_m_5, PredIntraAng16_2, PredIntraAng16_m_2 is same as PredIntraAng16_26.
- */
- PredIntraAng16_32,
- PredIntraAng16_26,
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_21" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_17" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_13" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_9" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_5" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_2" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_0" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_m_2" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_m_5" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_m_9" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_m_13" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_m_17" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_m_21" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_m_26" here.
- PredIntraAng16_m_32,
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_m_26" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_m_21" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_m_17" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_m_13" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_m_9" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_m_5" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_m_2" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_0" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_2" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_5" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_9" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_13" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_17" here.
- PredIntraAng16_26, //Intentionally wrong! It should be "PredIntraAng16_21" here.
- PredIntraAng16_26,
- PredIntraAng16_32
-};
+//typedef void (*PredIntraAng16x16_table)(pixel* pDst, int dstStride, pixel *refMain, pixel *refSide, int dirMode);
+//PredIntraAng16x16_table PredIntraAng16[] = {
+// /*
+// * PredIntraAng16_0 is replaced with PredIntraAng16_26. For PredIntraAng8_0 we are going through default path in the xPredIntraAng8x8
+// because we cannot afford to pass large number arguments for this function.
+// * Path for PredIntraAng16_26, PredIntraAng16_m_26, PredIntraAng16_21, PredIntraAng16_m_21, PredIntraAng16_17, PredIntraAng16_m_17,
+// PredIntraAng16_13, PredIntraAng16_m_13, PredIntraAng16_9, PredIntraAng16_m_9, PredIntraAng16_5, PredIntraAng16_m_5, PredIntraAng16_2,
+// PredIntraAng16_m_2 is same as PredIntraAng16_26.
+// */
+// PredIntraAng16_32,
+// PredIntraAng16_26,
+// PredIntraAng16_26, //21 can be done by 26
+// PredIntraAng16_26, //17 can be done by 26
+// PredIntraAng16_26, //13 can be done by 26
+// PredIntraAng16_26, //9 can be done by 26
+// PredIntraAng16_26, //5 can be done by 26
+// PredIntraAng16_26, //2 can be done by 26
+// PredIntraAng16_26, //0 can be done by 26 - will never be called
+// PredIntraAng16_26, //-2 can be done by 26
+// PredIntraAng16_26, //-5 can be done by 26
+// PredIntraAng16_26, //-9 can be done by 26
+// PredIntraAng16_26, //-13 can be done by 26
+// PredIntraAng16_26, //-17 can be done by 26
+// PredIntraAng16_26, //-21 can be done by 26
+// PredIntraAng16_26, //-26 can be done by 26
+// PredIntraAng16_m_32,
+// PredIntraAng16_26, //-26 can be done by 26
+// PredIntraAng16_26, //-21 can be done by 26
+// PredIntraAng16_26, //-17 can be done by 26
+// PredIntraAng16_26, //-13 can be done by 26
+// PredIntraAng16_26, //-9 can be done by 26
+// PredIntraAng16_26, //-5 can be done by 26
+// PredIntraAng16_26, //-2 can be done by 26
+// PredIntraAng16_26, //0 can be done by 26 - will never be called
+// PredIntraAng16_26, //2 can be done by 26
+// PredIntraAng16_26, //5 can be done by 26
+// PredIntraAng16_26, //9 can be done by 26
+// PredIntraAng16_26, //13 can be done by 26
+// PredIntraAng16_26, //17 can be done by 26
+// PredIntraAng16_26, //21 can be done by 26
+// PredIntraAng16_26,
+// PredIntraAng16_32
+//};
void xPredIntraAng16x16(int bitDepth, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove, bool bFilter = true)
{
@@ -3382,9 +3390,17 @@
}
}
}
+ else if(intraPredAngle==32)
+ {
+ PredIntraAng16_32(pDst, dstStride, refMain, refSide, dirMode);
+ }
+ else if(intraPredAngle==-32)
+ {
+ PredIntraAng16_m_32(pDst, dstStride, refMain, refSide, dirMode);
+ }
else
{
- PredIntraAng16[dirMode-2](pDst, dstStride, refMain, refSide, dirMode);
+ PredIntraAng16_26(pDst, dstStride, refMain, refSide, dirMode);
}
}
@@ -3631,13 +3647,445 @@
PREDANG_CALCROW_HOR_MODE2(R7) \
}
-void xPredIntraAng32x32(int /*bitDepth*/, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove)
+void PredIntraAng32_32(pixel* pDst, int dstStride, pixel *refMain, pixel * /*refSide*/, int /*dirMode*/)
+{
+ __m128i itmp;
+ refMain += 2;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain++);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain++;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ refMain++;
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+}
+
+void PredIntraAng32_m_32(pixel* pDst, int dstStride, pixel *refMain, pixel *refSide, int /*dirMode*/)
{
- int k;
- int blkSize = width;
-
- // Map the mode index to main prediction direction and angle
- assert(dirMode > 1); //no planar and dc
+ Vec16uc v_refSide;
+ pixel refMain0 = refMain[0];
+
+ v_refSide.load(refSide);
+ v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
+ v_refSide.store(refMain - 15);
+
+ v_refSide.load(refSide + 16);
+ v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
+ v_refSide.store(refMain - 31);
+
+ refMain[0] = refMain0;
+
+ __m128i itmp;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
+ _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+ pDst += dstStride;
+ refMain--;
+
+ itmp = _mm_loadu_si128((__m128i const*)refMain);
+ _mm_storeu_si128((__m128i*)pDst, itmp);
+}
+
+void PredIntraAng32_26(pixel* pDst, int dstStride, pixel *refMain, pixel * /*refSide*/, int dirMode)
+{
bool modeHor = (dirMode < 18);
bool modeVer = !modeHor;
int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
@@ -3647,16 +4095,496 @@
// Set bitshifts and scale the angle parameter to block size
int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
- int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
- int invAngle = invAngTable[absAng];
absAng = angTable[absAng];
intraPredAngle = signAng * absAng;
- // Do angular predictions
+ if (modeHor)
+ {
+ __m128i row11L, row12L, row11H, row12H;
+ __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
+ __m128i R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16;
+
+ Pel * original_pDst = pDst;
+ v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
+ v_ipAngle = _mm_set1_epi16(intraPredAngle);
+ thirty2 = _mm_set1_epi16(32);
+ thirty1 = _mm_set1_epi16(31);
+ __m128i itmp, itmp1, itmp2, it1, it2, it3, i16;
+
+ CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
+ PREDANG_CALCROW_HOR(7 + 0, R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
+ PREDANG_CALCROW_HOR(7 + 8, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+
+ pDst = original_pDst + 16;
+
+ CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 16)
+ PREDANG_CALCROW_HOR(7 + 16, R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 24)
+ R16 = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+
+ pDst = original_pDst + (16 * dstStride);
+ refMain += 16;
+ v_deltaPos = _mm_setzero_si128();
+ v_ipAngle = _mm_set1_epi16(intraPredAngle);
+
+ CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
+ PREDANG_CALCROW_HOR(7 + 0, R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
+ PREDANG_CALCROW_HOR(7 + 8, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+ pDst = original_pDst + (16 * dstStride) + 16;
+
+ CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 16)
+ PREDANG_CALCROW_HOR(7 + 16, R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 24)
+ R16 = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+ }
+ else
+ {
+ __m128i row11L, row12L, row11H, row12H;
+ __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
+ //__m128i row14, row22;
+ //__m128i res1, res2;
+
+ v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
+ v_ipAngle = _mm_set1_epi16(intraPredAngle);
+ thirty2 = _mm_set1_epi16(32);
+ thirty1 = _mm_set1_epi16(31);
+ __m128i itmp, it1, it2, it3, i16;
+
+ for (int i = 0; i <= 30; i++)
+ {
+ PREDANG_CALCROW_VER(i);
+ }
+
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
+ _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
+ itmp = _mm_loadu_si128((__m128i const*)(refMain + 17 + GETAP(lookIdx, 31)));
+ _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
+ }
+}
+
+/*
+ * PredIntraAng32_2: 32x32 angular intra prediction specialized for
+ * intraPredAngle == +2 (dirMode 9 horizontal / dirMode 27 vertical).
+ * refSide is unused for this angle.  The work is done by SIMD helper
+ * macros defined earlier in this file (LOADROW, CALC_BLND_8ROWS_MODE2,
+ * PREDANG_CALCROW_HOR_MODE2, PREDANG_CALCROW_VER_MODE2, MB8, BLND2_2);
+ * several __m128i locals below are referenced implicitly by those
+ * macros, which is why they may look unused here.
+ */
+void PredIntraAng32_2(pixel* pDst, int dstStride, pixel *refMain, pixel * /*refSide*/, int dirMode)
+{
+ bool modeHor = (dirMode < 18);
+ bool modeVer = !modeHor;
+ int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+ //int lookIdx = intraPredAngle;
+ int absAng = abs(intraPredAngle);
+ int signAng = intraPredAngle < 0 ? -1 : 1;
+
+ // Set bitshifts and scale the angle parameter to block size
+ int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+ absAng = angTable[absAng];
+ intraPredAngle = signAng * absAng;
+
+ if (modeHor)
+ {
+ __m128i row11L, row12L, row11H, row12H, res1, res2;
+ __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
+ __m128i R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16;
+
+ Pel * original_pDst = pDst;
+ v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
+ v_ipAngle = _mm_set1_epi16(intraPredAngle);
+ thirty2 = _mm_set1_epi16(32);
+ thirty1 = _mm_set1_epi16(31);
+ __m128i itmp, itmp1, itmp2, it1, it2, it3, i16;
+
+
+ // First 16x16 quadrant (top-left): blend reference rows 0 and 1.
+ LOADROW(row11L, row11H, 0)
+ LOADROW(row12L, row12H, 1)
+ R16 = _mm_packus_epi16(row12L, row12H);
+
+ CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+ PREDANG_CALCROW_HOR_MODE2(R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+
+ // Advance the fractional position and slide the row window down by one.
+ v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+ v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+ row11L = row12L;
+ row11H = row12H;
+ LOADROW(row12L, row12H, 2)
+ R16 = _mm_packus_epi16(row12L, row12H);
+ // Second quadrant (top-right half of the first 16 destination lines).
+ pDst = original_pDst + 16;
+
+ CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+ PREDANG_CALCROW_HOR_MODE2(R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+
+ // Lower half: restart the angular walk with the next 16 reference pixels.
+ pDst = original_pDst + (16 * dstStride);
+ refMain += 16;
+ v_deltaPos = _mm_setzero_si128();
+
+ v_ipAngle = _mm_set1_epi16(intraPredAngle);
+ LOADROW(row11L, row11H, 0)
+ LOADROW(row12L, row12H, 1)
+ R16 = _mm_packus_epi16(row12L, row12H);
+
+ CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+ PREDANG_CALCROW_HOR_MODE2(R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+
+ v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+ v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+ row11L = row12L;
+ row11H = row12H;
+ LOADROW(row12L, row12H, 2)
+ R16 = _mm_packus_epi16(row12L, row12H);
+ // Fourth quadrant (bottom-right).
+ pDst = original_pDst + (16 * dstStride) + 16;
+
+ CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+ PREDANG_CALCROW_HOR_MODE2(R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+ return;
+ }
+ else
+ {
+ // Vertical mode: each PREDANG_CALCROW_VER_MODE2(i) emits destination
+ // row i, built from two sliding 16-pixel halves of the reference.
+ __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
+ __m128i row11, row12, row13, row14, row21, row22, row23, row24;
+ __m128i res1, res2;
+
+ v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
+ v_ipAngle = _mm_set1_epi16(intraPredAngle);
+ thirty2 = _mm_set1_epi16(32);
+ thirty1 = _mm_set1_epi16(31);
+ __m128i itmp, it1, it2, it3, i16;
+
+ LOADROW(row11, row12, 0)
+ LOADROW(row21, row22, 1)
+ LOADROW(row13, row14, 16)
+ LOADROW(row23, row24, 17)
+ for (int i = 0; i <= 14; i++)
+ {
+ PREDANG_CALCROW_VER_MODE2(i);
+ }
+
+ //deltaFract == 0 for 16th row
+ v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+ v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+ itmp = _mm_packus_epi16(row21, row22);
+ _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride)), itmp);
+ itmp = _mm_packus_epi16(row23, row24);
+ _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride) + 16), itmp);
+
+ row11 = row21;
+ row12 = row22;
+ row13 = row23;
+ row14 = row24;
+
+ LOADROW(row21, row22, 2)
+ LOADROW(row23, row24, 18)
+ for (int i = 16; i <= 30; i++)
+ {
+ PREDANG_CALCROW_VER_MODE2(i);
+ }
+
+ // Row 31 is stored directly from the packed reference rows (no blend).
+ itmp = _mm_packus_epi16(row21, row22);
+ _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
+ itmp = _mm_packus_epi16(row23, row24);
+ _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
+ }
+}
+
+/*
+ * PredIntraAng32_m_2: 32x32 angular intra prediction specialized for
+ * intraPredAngle == -2 (dirMode 11 horizontal / dirMode 25 vertical).
+ * refSide is unused for this angle.  Mirrors PredIntraAng32_2 but walks
+ * the reference in the opposite direction, hence the negative LOADROW
+ * offsets (-1, -2).  Depends on the SIMD helper macros defined earlier
+ * in this file (LOADROW, CALC_BLND_8ROWS_MODE2, PREDANG_CALCROW_HOR_MODE2,
+ * PREDANG_CALCROW_VER_MODE2, MB8, BLND2_2); several __m128i locals below
+ * are referenced implicitly by those macros.
+ */
+void PredIntraAng32_m_2(pixel* pDst, int dstStride, pixel *refMain, pixel * /*refSide*/, int dirMode)
+{
+ bool modeHor = (dirMode < 18);
+ bool modeVer = !modeHor;
+ int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+ //int lookIdx = intraPredAngle;
+ int absAng = abs(intraPredAngle);
+ int signAng = intraPredAngle < 0 ? -1 : 1;
+
+ // Set bitshifts and scale the angle parameter to block size
+ int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+ absAng = angTable[absAng];
+ intraPredAngle = signAng * absAng;
+
+ if (modeHor)
+ {
+ __m128i row11L, row12L, row11H, row12H, res1, res2;
+ __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
+ __m128i R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16;
+
+ Pel * original_pDst = pDst;
+ v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
+ v_ipAngle = _mm_set1_epi16(intraPredAngle);
+ thirty2 = _mm_set1_epi16(32);
+ thirty1 = _mm_set1_epi16(31);
+ __m128i itmp, itmp1, itmp2, it1, it2, it3, i16;
+
+ // First 16x16 quadrant: blend reference rows -1 and 0 (negative angle
+ // walks the reference backwards).
+ LOADROW(row11L, row11H, -1)
+ LOADROW(row12L, row12H, 0)
+ R16 = _mm_packus_epi16(row11L, row11H); //R16 = compress(row11L, row11H);
+
+ CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+ PREDANG_CALCROW_HOR_MODE2(R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+
+ // Advance the fractional position and slide the row window up by one.
+ v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+ v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+ row12L = row11L;
+ row12H = row11H;
+ LOADROW(row11L, row11H, -2)
+ R16 = _mm_packus_epi16(row11L, row11H);
+ pDst = original_pDst + 16;
+
+ CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+ PREDANG_CALCROW_HOR_MODE2(R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+
+ // Lower half: restart the angular walk with the next 16 reference pixels.
+ pDst = original_pDst + (16 * dstStride);
+ refMain += 16;
+
+ v_deltaPos = _mm_setzero_si128();
+ v_ipAngle = _mm_set1_epi16(intraPredAngle);
+ LOADROW(row11L, row11H, -1)
+ LOADROW(row12L, row12H, 0)
+ R16 = _mm_packus_epi16(row11L, row11H);
+
+ CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+ PREDANG_CALCROW_HOR_MODE2(R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+
+ v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+ v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+ row12L = row11L;
+ row12H = row11H;
+ LOADROW(row11L, row11H, -2)
+ R16 = _mm_packus_epi16(row11L, row11H);
+ // Fourth quadrant (bottom-right).
+ pDst = original_pDst + (16 * dstStride) + 16;
+
+ CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
+ PREDANG_CALCROW_HOR_MODE2(R8)
+ MB8(R1, R2, R3, R4, R5, R6, R7, R8)
+ CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
+ MB8(R9, R10, R11, R12, R13, R14, R15, R16)
+ BLND2_2(R1, R9)
+ BLND2_2(R5, R13)
+ BLND2_2(R3, R11)
+ BLND2_2(R7, R15)
+ BLND2_2(R2, R10)
+ BLND2_2(R6, R14)
+ BLND2_2(R4, R12)
+ BLND2_2(R8, R16)
+
+ }
+ else
+ {
+ // Vertical mode: each PREDANG_CALCROW_VER_MODE2(i) emits destination
+ // row i; the reference window slides backwards (offsets -1/-2, 15/14).
+ __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
+ __m128i row11, row12, row13, row14, row21, row22, row23, row24;
+ __m128i res1, res2;
+
+ v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
+ v_ipAngle = _mm_set1_epi16(intraPredAngle);
+ thirty2 = _mm_set1_epi16(32);
+ thirty1 = _mm_set1_epi16(31);
+ __m128i itmp, it1, it2, it3, i16;
+
+
+ LOADROW(row11, row12, -1)
+ LOADROW(row21, row22, 0)
+ LOADROW(row13, row14, 15)
+ LOADROW(row23, row24, 16)
+ for (int i = 0; i <= 14; i++)
+ {
+ PREDANG_CALCROW_VER_MODE2(i);
+ }
+
+ //deltaFract == 0 for 16th row
+ v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
+ v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
+ itmp = _mm_packus_epi16(row11, row12);
+ _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride)), itmp);
+ itmp = _mm_packus_epi16(row13, row14);
+ _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride) + 16), itmp);
+
+ row21 = row11;
+ row22 = row12;
+ row23 = row13;
+ row24 = row14;
+
+ LOADROW(row11, row12, -2)
+ LOADROW(row13, row14, 14)
+ for (int i = 16; i <= 30; i++)
+ {
+ PREDANG_CALCROW_VER_MODE2(i);
+ }
+
+ // Row 31 is stored directly from the packed reference rows (no blend).
+ itmp = _mm_packus_epi16(row11, row12);
+ _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
+ itmp = _mm_packus_epi16(row13, row14);
+ _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
+ }
+}
+
+typedef void (*PredIntraAng32x32_table)(pixel* pDst, int dstStride, pixel *refMain, pixel *refSide, int dirMode);
+// Dispatch table for 32x32 angular intra prediction, indexed by
+// dirMode - 2 (angular modes 2..34, angles +32 .. 0 .. -32 .. 0 .. +32).
+PredIntraAng32x32_table PredIntraAng32[] = {
+ /*
+ There is no PredIntraAng32_0: for angle 0 the caller takes the default path in
+ xPredIntraAng32x32 because we cannot afford to pass the large number of arguments
+ that case would need, so the two angle-0 slots below are never called.
+ Angles +/-5, +/-9, +/-13, +/-17, +/-21 and -26 all share the generic
+ PredIntraAng32_26 path; only |angle| == 2 and |angle| == 32 have dedicated
+ specializations (PredIntraAng32_2 / PredIntraAng32_m_2 / PredIntraAng32_32 /
+ PredIntraAng32_m_32).
+ */
+ PredIntraAng32_32,
+ PredIntraAng32_26,
+ PredIntraAng32_26, //21 can be done by 26
+ PredIntraAng32_26, //17 can be done by 26
+ PredIntraAng32_26, //13 can be done by 26
+ PredIntraAng32_26, //9 can be done by 26
+ PredIntraAng32_26, //5 can be done by 26
+ PredIntraAng32_2,
+ PredIntraAng32_26, //0 can be done by 26 - will never be called
+ PredIntraAng32_m_2,
+ PredIntraAng32_26, //-5 can be done by 26
+ PredIntraAng32_26, //-9 can be done by 26
+ PredIntraAng32_26, //-13 can be done by 26
+ PredIntraAng32_26, //-17 can be done by 26
+ PredIntraAng32_26, //-21 can be done by 26
+ PredIntraAng32_26, //-26 can be done by 26
+ PredIntraAng32_m_32,
+ PredIntraAng32_26, //-26 can be done by 26
+ PredIntraAng32_26, //-21 can be done by 26
+ PredIntraAng32_26, //-17 can be done by 26
+ PredIntraAng32_26, //-13 can be done by 26
+ PredIntraAng32_26, //-9 can be done by 26
+ PredIntraAng32_26, //-5 can be done by 26
+ PredIntraAng32_m_2,
+ PredIntraAng32_26, //0 can be done by 26 - will never be called
+ PredIntraAng32_2,
+ PredIntraAng32_26, //5 can be done by 26
+ PredIntraAng32_26, //9 can be done by 26
+ PredIntraAng32_26, //13 can be done by 26
+ PredIntraAng32_26, //17 can be done by 26
+ PredIntraAng32_26, //21 can be done by 26
+ PredIntraAng32_26, //26
+ PredIntraAng32_32
+};
+
+void xPredIntraAng32x32(int /*bitDepth*/, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove)
+{
+ int k;
+ int blkSize = width;
+
+ // Map the mode index to main prediction direction and angle
+ assert(dirMode > 1); //no planar and dc
+ static const int mode_to_angle_table[] = {32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32};
+ static const int mode_to_invAng_table[] = {256, 315, 390, 482, 630, 910, 1638, 4096, 0, 4096, 1638, 910, 630, 482, 390, 315, 256, 315, 390, 482, 630, 910, 1638, 4096, 0, 4096, 1638, 910, 630, 482, 390, 315, 256};
+ int intraPredAngle = mode_to_angle_table[dirMode-2];
+ int invAngle = mode_to_invAng_table[dirMode-2];
+ bool modeHor = (dirMode < 18);
+ bool modeVer = !modeHor;
pixel* refMain;
pixel* refSide;
-
// Initialise the Main and Left reference array.
if (intraPredAngle < 0)
{
@@ -3928,808 +4856,10 @@
_mm_storeu_si128((__m128i*)(pDst), v_main);
}
}
- else if (intraPredAngle == -32)
- {
- Vec16uc v_refSide;
- pixel refMain0 = refMain[0];
-
- v_refSide.load(refSide);
- v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
- v_refSide.store(refMain - 15);
-
- v_refSide.load(refSide + 16);
- v_refSide = permute16uc<15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v_refSide);
- v_refSide.store(refMain - 31);
-
- refMain[0] = refMain0;
-
- __m128i itmp;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst + (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 16));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain--;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
-
- return;
- }
- else if (intraPredAngle == 32)
- {
- __m128i itmp;
- refMain += 2;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain++);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- _mm_storeu_si128((__m128i*)pDst, itmp);
- pDst += dstStride;
- refMain++;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- itmp = _mm_loadu_si128((__m128i const*)refMain);
- refMain++;
- _mm_storeu_si128((__m128i*)pDst, itmp);
- _mm_storeu_si128((__m128i*)(pDst - (16 * dstStride) + 16), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 15));
- _mm_storeu_si128((__m128i*)(pDst + 16), itmp);
- pDst += dstStride;
-
- return;
- }
else
{
- if (modeHor)
- {
- __m128i row11L, row12L, row11H, row12H, res1, res2;
- __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
- __m128i R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16;
-
- Pel * original_pDst = pDst;
- v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
- v_ipAngle = _mm_set1_epi16(intraPredAngle);
- thirty2 = _mm_set1_epi16(32);
- thirty1 = _mm_set1_epi16(31);
- __m128i itmp, itmp1, itmp2, it1, it2, it3, i16;
-
- switch (intraPredAngle)
- {
- case -2:
- LOADROW(row11L, row11H, -1)
- LOADROW(row12L, row12H, 0)
- R16 = _mm_packus_epi16(row11L, row11H); //R16 = compress(row11L, row11H);
-
- CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
- PREDANG_CALCROW_HOR_MODE2(R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
-
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
- row12L = row11L;
- row12H = row11H;
- LOADROW(row11L, row11H, -2)
- R16 = _mm_packus_epi16(row11L, row11H);
- pDst = original_pDst + 16;
-
- CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
- PREDANG_CALCROW_HOR_MODE2(R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
-
- pDst = original_pDst + (16 * dstStride);
- refMain += 16;
-
- v_deltaPos = _mm_setzero_si128();
- v_ipAngle = _mm_set1_epi16(intraPredAngle);
- LOADROW(row11L, row11H, -1)
- LOADROW(row12L, row12H, 0)
- R16 = _mm_packus_epi16(row11L, row11H);
-
- CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
- PREDANG_CALCROW_HOR_MODE2(R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
-
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
- row12L = row11L;
- row12H = row11H;
- LOADROW(row11L, row11H, -2)
- R16 = _mm_packus_epi16(row11L, row11H);
- pDst = original_pDst + (16 * dstStride) + 16;
-
- CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
- PREDANG_CALCROW_HOR_MODE2(R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
- return;
-
- case 2:
- LOADROW(row11L, row11H, 0)
- LOADROW(row12L, row12H, 1)
- R16 = _mm_packus_epi16(row12L, row12H);
-
- CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
- PREDANG_CALCROW_HOR_MODE2(R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
-
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
- row11L = row12L;
- row11H = row12H;
- LOADROW(row12L, row12H, 2)
- R16 = _mm_packus_epi16(row12L, row12H);
- pDst = original_pDst + 16;
-
- CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
- PREDANG_CALCROW_HOR_MODE2(R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
-
- pDst = original_pDst + (16 * dstStride);
- refMain += 16;
- v_deltaPos = _mm_setzero_si128();
-
- v_ipAngle = _mm_set1_epi16(intraPredAngle);
- LOADROW(row11L, row11H, 0)
- LOADROW(row12L, row12H, 1)
- R16 = _mm_packus_epi16(row12L, row12H);
-
- CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
- PREDANG_CALCROW_HOR_MODE2(R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
-
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
- row11L = row12L;
- row11H = row12H;
- LOADROW(row12L, row12H, 2)
- R16 = _mm_packus_epi16(row12L, row12H);
- pDst = original_pDst + (16 * dstStride) + 16;
-
- CALC_BLND_8ROWS_MODE2(R1, R2, R3, R4, R5, R6, R7, R8)
- PREDANG_CALCROW_HOR_MODE2(R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS_MODE2(R9, R10, R11, R12, R13, R14, R15, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
- return;
- }
-
- CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
- PREDANG_CALCROW_HOR(7 + 0, R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
- PREDANG_CALCROW_HOR(7 + 8, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
-
- pDst = original_pDst + 16;
-
- CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 16)
- PREDANG_CALCROW_HOR(7 + 16, R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 24)
- R16 = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
-
- pDst = original_pDst + (16 * dstStride);
- refMain += 16;
- v_deltaPos = _mm_setzero_si128();
- v_ipAngle = _mm_set1_epi16(intraPredAngle);
-
- CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 0)
- PREDANG_CALCROW_HOR(7 + 0, R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 8)
- PREDANG_CALCROW_HOR(7 + 8, R16)
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
- pDst = original_pDst + (16 * dstStride) + 16;
-
- CALC_BLND_8ROWS(R1, R2, R3, R4, R5, R6, R7, R8, 16)
- PREDANG_CALCROW_HOR(7 + 16, R8)
- MB8(R1, R2, R3, R4, R5, R6, R7, R8)
- CALC_BLND_8ROWS(R9, R10, R11, R12, R13, R14, R15, R16, 24)
- R16 = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
- MB8(R9, R10, R11, R12, R13, R14, R15, R16)
- BLND2_2(R1, R9)
- BLND2_2(R5, R13)
- BLND2_2(R3, R11)
- BLND2_2(R7, R15)
- BLND2_2(R2, R10)
- BLND2_2(R6, R14)
- BLND2_2(R4, R12)
- BLND2_2(R8, R16)
- }
- else
- {
- __m128i row11L, row12L, row11H, row12H;
- __m128i v_deltaFract, v_deltaPos, thirty2, thirty1, v_ipAngle;
- __m128i row11, row12, row13, row14, row21, row22, row23, row24;
- __m128i res1, res2;
-
- v_deltaPos = _mm_setzero_si128(); //v_deltaPos = 0;
- v_ipAngle = _mm_set1_epi16(intraPredAngle);
- thirty2 = _mm_set1_epi16(32);
- thirty1 = _mm_set1_epi16(31);
- __m128i itmp, it1, it2, it3, i16;
-
- switch (intraPredAngle)
- {
- case -2:
- LOADROW(row11, row12, -1)
- LOADROW(row21, row22, 0)
- LOADROW(row13, row14, 15)
- LOADROW(row23, row24, 16)
- for (int i = 0; i <= 14; i++)
- {
- PREDANG_CALCROW_VER_MODE2(i);
- }
-
- //deltaFract == 0 for 16th row
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
- itmp = _mm_packus_epi16(row11, row12);
- _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride)), itmp);
- itmp = _mm_packus_epi16(row13, row14);
- _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride) + 16), itmp);
-
- row21 = row11;
- row22 = row12;
- row23 = row13;
- row24 = row14;
-
- LOADROW(row11, row12, -2)
- LOADROW(row13, row14, 14)
- for (int i = 16; i <= 30; i++)
- {
- PREDANG_CALCROW_VER_MODE2(i);
- }
-
- itmp = _mm_packus_epi16(row11, row12);
- _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
- itmp = _mm_packus_epi16(row13, row14);
- _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
-
- return;
-
- case 2:
-
- LOADROW(row11, row12, 0)
- LOADROW(row21, row22, 1)
- LOADROW(row13, row14, 16)
- LOADROW(row23, row24, 17)
- for (int i = 0; i <= 14; i++)
- {
- PREDANG_CALCROW_VER_MODE2(i);
- }
-
- //deltaFract == 0 for 16th row
-
- v_deltaPos = _mm_add_epi16(v_deltaPos, v_ipAngle);
- v_deltaFract = _mm_and_si128(v_deltaPos, thirty1);
- itmp = _mm_packus_epi16(row21, row22);
- _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride)), itmp);
- itmp = _mm_packus_epi16(row23, row24);
- _mm_storeu_si128((__m128i*)(pDst + ((15) * dstStride) + 16), itmp);
-
- row11 = row21;
- row12 = row22;
- row13 = row23;
- row14 = row24;
-
- LOADROW(row21, row22, 2)
- LOADROW(row23, row24, 18)
- for (int i = 16; i <= 30; i++)
- {
- PREDANG_CALCROW_VER_MODE2(i);
- }
-
- itmp = _mm_packus_epi16(row21, row22);
- _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
- itmp = _mm_packus_epi16(row23, row24);
- _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
-
- return;
- }
-
- for (int i = 0; i <= 30; i++)
- {
- PREDANG_CALCROW_VER(i);
- }
-
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 1 + GETAP(lookIdx, 31)));
- _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride)), itmp);
- itmp = _mm_loadu_si128((__m128i const*)(refMain + 17 + GETAP(lookIdx, 31)));
- _mm_storeu_si128((__m128i*)(pDst + ((31) * dstStride) + 16), itmp);
- }
- }
+ PredIntraAng32[dirMode-2](pDst, dstStride, refMain, refSide, dirMode);
+ }
}
#endif /* if HIGH_BIT_DEPTH */
-------------- next part --------------
A non-text attachment was scrubbed...
Name: xhevc_27June-2.patch
Type: text/x-patch
Size: 79744 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20130628/4298e479/attachment-0001.bin>
More information about the x265-devel
mailing list