[x265] [PATCH] intrapred: generate all 33 IntraAngle-4x4 modes at once

Min Chen chenm003 at 163.com
Wed Jun 19 14:28:41 CEST 2013


From 7b03306aaa06b92405e87534fdb346dce0741140 Mon Sep 17 00:00:00 2001
From: Min Chen <chenm003 at 163.com>
Date: Wed, 19 Jun 2013 20:02:12 +0800
Subject: [PATCH] intrapred: generate all 33 IntraAngle-4x4 modes at once

---
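Each of the 33 stored 4x4 blocks is the standard HEVC two-tap angular
interpolation; the SIMD code below computes exactly this with byte
multiplies. As a minimal scalar model of what one block contains (the
names here are illustrative, and the per-mode angle step is assumed to
follow the usual HEVC angle table, which this patch does not show):

    #include <stdint.h>

    // Hedged scalar reference for one angular 4x4 block. 'ref' is the
    // projected reference array; 'angle' is the per-mode step (assumed
    // HEVC convention; positive angles shown, horizontal modes would be
    // produced with x and y swapped, i.e. transposed).
    static void referenceAngular4x4(uint8_t dst[16], const uint8_t *ref, int angle)
    {
        for (int y = 0; y < 4; y++)
        {
            int pos  = (y + 1) * angle;
            int idx  = pos >> 5;  // integer part selects the reference pixel
            int fact = pos & 31;  // fractional part is the blend weight
            for (int x = 0; x < 4; x++)
                dst[y * 4 + x] = (uint8_t)(((32 - fact) * ref[x + idx + 1]
                                          + fact * ref[x + idx + 2] + 16) >> 5);
        }
    }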
 source/Lib/TLibEncoder/TEncSearch.cpp |  13 +-
 source/common/vec/intrapred.inc       | 418 +++++++++++++++++++++++++++++++++-
 2 files changed, 415 insertions(+), 16 deletions(-)
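The TEncSearch.cpp hunk folds the two per-mode SATD loops into one; the
loops differed only in the comparison source, which becomes a pair of
selects. A condensed view of the merged loop (buf1 is assumed to hold a
transposed copy of the source block packed at stride uiWidth, since the
horizontal modes 2..17 are produced transposed):

    // Condensed view; same logic as the hunk below.
    for (UInt uiMode = 2; uiMode < numModesAvailable; uiMode++)
    {
        bool     modeHor   = (uiMode < 18);           // horizontal modes 2..17
        Pel     *pSrc      = modeHor ? buf1 : piOrg;  // transposed copy vs. source
        intptr_t srcStride = modeHor ? uiWidth : uiStride;
        uiSads[uiMode] = sa8d((pixel*)pSrc, srcStride,
                              (pixel*)&tmp[(uiMode - 2) * (uiWidth * uiWidth)], uiWidth);
    }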

diff --git a/source/Lib/TLibEncoder/TEncSearch.cpp b/source/Lib/TLibEncoder/TEncSearch.cpp
index 761fee6..fd25b32 100644
--- a/source/Lib/TLibEncoder/TEncSearch.cpp
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp
@@ -2256,7 +2256,7 @@ Void TEncSearch::estIntraPredQT(TComDataCU* pcCU,
                 x265::primitives.getIPredAngs4(tmp, pAbove0, pLeft0, pAbove1, pLeft1, (uiWidth<16));
 
                 // TODO: We need SATD_x4 here
-                for (UInt uiMode = 2; uiMode < 18; uiMode++)
+                for (UInt uiMode = 2; uiMode < numModesAvailable; uiMode++)
                 {
                     //predIntraLumaAng(pcCU->getPattern(), uiMode, piPred, uiStride, uiWidth);
                     //for (int k = 0; k < uiWidth; k++)
@@ -2268,15 +2268,12 @@ Void TEncSearch::estIntraPredQT(TComDataCU* pcCU,
                     //    }
                     //}
                     //UInt uiSad = sa8d((pixel*)piOrg, uiStride, (pixel*)piPred, uiStride);
+                    bool modeHor = (uiMode < 18);
+                    Pel *pSrc = (modeHor ? buf1 : piOrg);
+                    intptr_t srcStride = (modeHor ? uiWidth : uiStride);
 
                     // use hadamard transform here
-                    UInt uiSad = sa8d((pixel*)buf1, uiWidth, (pixel*)&tmp[(uiMode - 2) * (uiWidth * uiWidth)], uiWidth);
-                    uiSads[uiMode] = uiSad;
-                }
-                for (UInt uiMode = 18; uiMode < numModesAvailable; uiMode++)
-                {
-                    // use hadamard transform here
-                    UInt uiSad = sa8d((pixel*)piOrg, uiStride, (pixel*)&tmp[(uiMode - 2) * (uiWidth * uiWidth)], uiWidth);
+                    UInt uiSad = sa8d((pixel*)pSrc, srcStride, (pixel*)&tmp[(uiMode - 2) * (uiWidth * uiWidth)], uiWidth);
                     uiSads[uiMode] = uiSad;
                 }
             }
diff --git a/source/common/vec/intrapred.inc b/source/common/vec/intrapred.inc
index 775f4e9..da9f1e1 100644
--- a/source/common/vec/intrapred.inc
+++ b/source/common/vec/intrapred.inc
@@ -1672,6 +1672,7 @@ void xPredIntraAng4x4(int bitDepth, pixel* pDst, int dstStride, int width, int d
 }
 
 #else /* if HIGH_BIT_DEPTH */
+__m128i _tmp0, _tmp1; // file-scope scratch: written by the stores below, never read
 void xPredIntraAng4x4(int /*bitDepth*/, pixel* pDst, int dstStride, int width, int dirMode, pixel *refLeft, pixel *refAbove, bool bFilter = true)
 {
     int blkSize        = width;
@@ -1746,6 +1747,8 @@ void xPredIntraAng4x4(int /*bitDepth*/, pixel* pDst, int dstStride, int width, i
                 Vec8s row0 = extend_low(tmp16);
                 v_side -= v_side_0;
                 v_side = v_side >> 1;
+                _tmp0 = row0;   // stored but never read; looks like debug residue
+                _tmp1 = v_side; // stored but never read; looks like debug residue
                 row0 += v_side;
                 row0 = min(max(0, row0), 255);
                 Vec16uc v_res(compress_unsafe(row0, 0));
@@ -4665,10 +4668,9 @@ void xPredIntraAngBufRef(int bitDepth, pixel* pDst, int dstStride, int width, in
     }
 }
 
-#if HIGH_BIT_DEPTH
-#else // HIGH_BIT_DEPTH
+#if 0 // HIGH_BIT_DEPTH || (INSTRSET < 4)
 
-#if INSTRSET < 40
+// TODO: reference code, please optimize it
 void xPredIntraAngs4(pixel *pDst0, pixel *pAbove0, pixel *pLeft0, pixel *pAbove1, pixel *pLeft1, bool bLuma)
 {
     int iMode;
@@ -4704,15 +4706,415 @@ void xPredIntraAngs4(pixel *pDst0, pixel *pAbove0, pixel *pLeft0, pixel *pAbove1
     }
 }
 
-#else // INSTRSET >= 4
+#else // HIGH_BIT_DEPTH || (INSTRSET < 4)
+
+ALIGN_VAR_32(static const unsigned char, tab_angle_0[][16]) =
+{
+    { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 },         //  0
+    { 15, 0, 0, 1, 2, 3, 4, 5, 7, 0, 0, 9, 10, 11, 12, 13 },    //  1
+    { 12, 0, 0, 1, 2, 3, 4, 5, 3, 0, 0, 9, 10, 11, 12, 13 },    //  2
+    { 15, 11, 12, 0, 0, 1, 2, 3, 7, 3, 4, 0, 0, 9, 10, 11 },    //  3
+    { 13, 12, 11, 8, 8, 1, 2, 3, 5, 4, 3, 0, 0, 9, 10, 11 },    //  4
+    { 9, 0, 0, 1, 2, 3, 4, 5, 1, 0, 0, 9, 10, 11, 12, 13 },     //  5
+    { 11, 10, 9, 0, 0, 1, 2, 3, 4, 2, 1, 0, 0, 9, 10, 11 },     //  6
+    { 15, 12, 11, 10, 9, 0, 0, 1, 7, 4, 3, 2, 1, 0, 0, 9 },     //  7
+    { 0, 10, 11, 13, 1, 0, 10, 11, 3, 2, 0, 10, 5, 4, 2, 0 },   //  8
+};
 
-void xPredIntraAngs4(pixel *pDst, pixel *pAbove0, pixel *pLeft0, pixel *pAbove1, pixel *pLeft1, bool bLuma)
+ALIGN_VAR_32(static const char, tab_angle_1[][16]) =
 {
-    
+#define MAKE_COEF8(a) \
+    { 32-(a), (a), 32-(a), (a), 32-(a), (a), 32-(a), (a), 32-(a), (a), 32-(a), (a), 32-(a), (a), 32-(a), (a) },
+
+    MAKE_COEF8(  0 )
+    MAKE_COEF8(  1 )
+    MAKE_COEF8(  2 )
+    MAKE_COEF8(  3 )
+    MAKE_COEF8(  4 )
+    MAKE_COEF8(  5 )
+    MAKE_COEF8(  6 )
+    MAKE_COEF8(  7 )
+    MAKE_COEF8(  8 )
+    MAKE_COEF8(  9 )
+    MAKE_COEF8( 10 )
+    MAKE_COEF8( 11 )
+    MAKE_COEF8( 12 )
+    MAKE_COEF8( 13 )
+    MAKE_COEF8( 14 )
+    MAKE_COEF8( 15 )
+    MAKE_COEF8( 16 )
+    MAKE_COEF8( 17 )
+    MAKE_COEF8( 18 )
+    MAKE_COEF8( 19 )
+    MAKE_COEF8( 20 )
+    MAKE_COEF8( 21 )
+    MAKE_COEF8( 22 )
+    MAKE_COEF8( 23 )
+    MAKE_COEF8( 24 )
+    MAKE_COEF8( 25 )
+    MAKE_COEF8( 26 )
+    MAKE_COEF8( 27 )
+    MAKE_COEF8( 28 )
+    MAKE_COEF8( 29 )
+    MAKE_COEF8( 30 )
+    MAKE_COEF8( 31 )
+    MAKE_COEF8( 32 )
+
+#undef MAKE_COEF8
+};
+
+void xPredIntraAngs4(pixel *pDst0, pixel *pAbove0, pixel *pLeft0, pixel *pAbove1, pixel *pLeft1, bool bLuma)
+{
+    // pLeft1/pAbove1 are unused in this 8-bit path; void-cast to silence warnings
+    (void)pLeft1;
+    (void)pAbove1;
+
+    pixel (*pDstN)[4*4] = (pixel(*)[4*4])pDst0;
+
+    __m128i T00, T01, T02, T03, T04, T05, T06, T07;
+    __m128i T10, T11, T12, T13;
+    __m128i T20, T21, T22, T23;
+    __m128i T30, T31, T32;
+    __m128i R00, R10, R20, R30;
+    __m128i R01, R11, R21, R31;
+
+    R00 = _mm_loadu_si128((__m128i*)(pLeft0+1));    // [-- -- -- -- -- -- --  -- -08 -07 -06 -05 -04 -03 -02 -01]
+    R10 = _mm_srli_si128(R00, 1);                   // [-- -- -- -- -- -- --  --  -- -08 -07 -06 -05 -04 -03 -02]
+    R20 = _mm_srli_si128(R00, 2);                   // [-- -- -- -- -- -- --  --  --  -- -08 -07 -06 -05 -04 -03]
+    R30 = _mm_srli_si128(R00, 3);                   // [-- -- -- -- -- -- --  --  --  --  -- -08 -07 -06 -05 -04]
+
+    R01 = _mm_loadu_si128((__m128i*)(pAbove0+1));   // [-- -- -- -- -- -- --  --  08  07  06  05  04  03  02  01]
+    R11 = _mm_srli_si128(R01, 1);                   // [-- -- -- -- -- -- --  --  --  08  07  06  05  04  03  02]
+    R21 = _mm_srli_si128(R01, 2);                   // [-- -- -- -- -- -- --  --  --  --  08  07  06  05  04  03]
+    R31 = _mm_srli_si128(R01, 3);                   // [-- -- -- -- -- -- --  --  --  --  --  08  07  06  05  04]
+
+    T00 = _mm_unpacklo_epi32(R00, R00);
+    T00 = _mm_unpacklo_epi64(T00, T00);
+    _mm_store_si128((__m128i*)pDstN[ 8], T00);
+
+    T00 = _mm_unpacklo_epi32(R01, R01);
+    T00 = _mm_unpacklo_epi64(T00, T00);
+    _mm_store_si128((__m128i*)pDstN[24], T00);
+
+    if (bLuma)
+    {
+        __m128i roundH, roundV;
+        __m128i pL = _mm_set1_epi16(pLeft0[1]);
+        __m128i pT = _mm_set1_epi16(pAbove0[1]);
+        roundH = _mm_set1_epi16(pAbove0[0]);
+        roundV = roundH;
+
+        roundH = _mm_srai_epi16(_mm_sub_epi16(_mm_unpacklo_epi8(R01, _mm_setzero_si128()), roundH), 1);
+        roundV = _mm_srai_epi16(_mm_sub_epi16(_mm_unpacklo_epi8(R00, _mm_setzero_si128()), roundV), 1);
+
+        T00 = _mm_add_epi16(roundH, pL);
+        T00 = _mm_packus_epi16(T00, T00);
+        T01 = _mm_add_epi16(roundV, pT);
+        T01 = _mm_packus_epi16(T01, T01);
+
+        int tmp0;
+        tmp0 = _mm_cvtsi128_si32(T00);
+        pDstN[ 8][0*4] = tmp0 & 0xFF;
+        pDstN[ 8][1*4] = (tmp0>>8) & 0xFF;
+        pDstN[ 8][2*4] = (tmp0>>16) & 0xFF;
+        pDstN[ 8][3*4] = (tmp0>>24) & 0xFF;
+
+        tmp0 = _mm_cvtsi128_si32(T01);
+        pDstN[24][0*4] = tmp0 & 0xFF;
+        pDstN[24][1*4] = (tmp0>>8) & 0xFF;
+        pDstN[24][2*4] = (tmp0>>16) & 0xFF;
+        pDstN[24][3*4] = (tmp0>>24) & 0xFF;
+    }
+
+    const __m128i c_16 = _mm_set1_epi16(16);
+
+    T00 = _mm_shufflelo_epi16(R10, 0x94);
+    T01 = _mm_shufflelo_epi16(R20, 0x94);
+    T00 = _mm_unpacklo_epi32(T00, T01);
+    _mm_store_si128((__m128i*)pDstN[ 0], T00);
+
+    T00 = _mm_shufflelo_epi16(R11, 0x94);
+    T01 = _mm_shufflelo_epi16(R21, 0x94);
+    T00 = _mm_unpacklo_epi32(T00, T01);
+    _mm_store_si128((__m128i*)pDstN[32], T00);
+
+    T00 = _mm_shuffle_epi8( R00, _mm_load_si128((__m128i*)tab_angle_0[ 0]));
+    T01 = _mm_shuffle_epi8( R10, _mm_load_si128((__m128i*)tab_angle_0[ 0]));
+    T02 = _mm_shuffle_epi8( R20, _mm_load_si128((__m128i*)tab_angle_0[ 0]));
+    T03 = _mm_shuffle_epi8( R30, _mm_load_si128((__m128i*)tab_angle_0[ 0]));
+    T04 = _mm_shuffle_epi8( R01, _mm_load_si128((__m128i*)tab_angle_0[ 0]));
+    T05 = _mm_shuffle_epi8( R11, _mm_load_si128((__m128i*)tab_angle_0[ 0]));
+    T06 = _mm_shuffle_epi8( R21, _mm_load_si128((__m128i*)tab_angle_0[ 0]));
+    T07 = _mm_shuffle_epi8( R31, _mm_load_si128((__m128i*)tab_angle_0[ 0]));
+    T00 = _mm_unpacklo_epi64(T00, T04);
+    T01 = _mm_unpacklo_epi64(T01, T05);
+    T02 = _mm_unpacklo_epi64(T02, T06);
+    T03 = _mm_unpacklo_epi64(T03, T07);
+
+    T10 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[26]));
+    T11 = _mm_maddubs_epi16(T01, _mm_load_si128((__m128i*)tab_angle_1[20]));
+    T12 = _mm_maddubs_epi16(T02, _mm_load_si128((__m128i*)tab_angle_1[14]));
+    T13 = _mm_maddubs_epi16(T03, _mm_load_si128((__m128i*)tab_angle_1[ 8]));
+    T20 = _mm_unpacklo_epi64(T10, T11);
+    T21 = _mm_unpacklo_epi64(T12, T13);
+    T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_16), 5);
+    T21 = _mm_srai_epi16(_mm_add_epi16(T21, c_16), 5);
+    T20 = _mm_packus_epi16(T20, T21);
+    _mm_store_si128((__m128i*)pDstN[ 1], T20);
+    T22 = _mm_unpackhi_epi64(T10, T11);
+    T23 = _mm_unpackhi_epi64(T12, T13);
+    T22 = _mm_srai_epi16(_mm_add_epi16(T22, c_16), 5);
+    T23 = _mm_srai_epi16(_mm_add_epi16(T23, c_16), 5);
+    T22 = _mm_packus_epi16(T22, T23);
+    _mm_store_si128((__m128i*)pDstN[31], T22);
+
+    T10 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[21]));
+    T11 = _mm_maddubs_epi16(T01, _mm_load_si128((__m128i*)tab_angle_1[10]));
+    T12 = _mm_maddubs_epi16(T01, _mm_load_si128((__m128i*)tab_angle_1[31]));
+    T13 = _mm_maddubs_epi16(T02, _mm_load_si128((__m128i*)tab_angle_1[20]));
+    T20 = _mm_unpacklo_epi64(T10, T11);
+    T21 = _mm_unpacklo_epi64(T12, T13);
+    T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_16), 5);
+    T21 = _mm_srai_epi16(_mm_add_epi16(T21, c_16), 5);
+    T20 = _mm_packus_epi16(T20, T21);
+    _mm_store_si128((__m128i*)pDstN[ 2], T20);
+    T22 = _mm_unpackhi_epi64(T10, T11);
+    T23 = _mm_unpackhi_epi64(T12, T13);
+    T22 = _mm_srai_epi16(_mm_add_epi16(T22, c_16), 5);
+    T23 = _mm_srai_epi16(_mm_add_epi16(T23, c_16), 5);
+    T22 = _mm_packus_epi16(T22, T23);
+    _mm_store_si128((__m128i*)pDstN[30], T22);
+
+    T10 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[17]));
+    T11 = _mm_maddubs_epi16(T01, _mm_load_si128((__m128i*)tab_angle_1[ 2]));
+    T12 = _mm_maddubs_epi16(T01, _mm_load_si128((__m128i*)tab_angle_1[19]));
+    T13 = _mm_maddubs_epi16(T02, _mm_load_si128((__m128i*)tab_angle_1[ 4]));
+    T20 = _mm_unpacklo_epi64(T10, T11);
+    T21 = _mm_unpacklo_epi64(T12, T13);
+    T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_16), 5);
+    T21 = _mm_srai_epi16(_mm_add_epi16(T21, c_16), 5);
+    T20 = _mm_packus_epi16(T20, T21);
+    _mm_store_si128((__m128i*)pDstN[ 3], T20);
+    T22 = _mm_unpackhi_epi64(T10, T11);
+    T23 = _mm_unpackhi_epi64(T12, T13);
+    T22 = _mm_srai_epi16(_mm_add_epi16(T22, c_16), 5);
+    T23 = _mm_srai_epi16(_mm_add_epi16(T23, c_16), 5);
+    T22 = _mm_packus_epi16(T22, T23);
+    _mm_store_si128((__m128i*)pDstN[29], T22);
+
+    T10 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[13]));
+    T11 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[26]));
+    T12 = _mm_maddubs_epi16(T01, _mm_load_si128((__m128i*)tab_angle_1[ 7]));
+    T13 = _mm_maddubs_epi16(T01, _mm_load_si128((__m128i*)tab_angle_1[20]));
+    T20 = _mm_unpacklo_epi64(T10, T11);
+    T21 = _mm_unpacklo_epi64(T12, T13);
+    T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_16), 5);
+    T21 = _mm_srai_epi16(_mm_add_epi16(T21, c_16), 5);
+    T20 = _mm_packus_epi16(T20, T21);
+    _mm_store_si128((__m128i*)pDstN[ 4], T20);
+    T22 = _mm_unpackhi_epi64(T10, T11);
+    T23 = _mm_unpackhi_epi64(T12, T13);
+    T22 = _mm_srai_epi16(_mm_add_epi16(T22, c_16), 5);
+    T23 = _mm_srai_epi16(_mm_add_epi16(T23, c_16), 5);
+    T22 = _mm_packus_epi16(T22, T23);
+    _mm_store_si128((__m128i*)pDstN[28], T22);
+
+    T10 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[ 9]));
+    T11 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[18]));
+    T12 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[27]));
+    T13 = _mm_maddubs_epi16(T01, _mm_load_si128((__m128i*)tab_angle_1[ 4]));
+    T20 = _mm_unpacklo_epi64(T10, T11);
+    T21 = _mm_unpacklo_epi64(T12, T13);
+    T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_16), 5);
+    T21 = _mm_srai_epi16(_mm_add_epi16(T21, c_16), 5);
+    T20 = _mm_packus_epi16(T20, T21);
+    _mm_store_si128((__m128i*)pDstN[ 5], T20);
+    T22 = _mm_unpackhi_epi64(T10, T11);
+    T23 = _mm_unpackhi_epi64(T12, T13);
+    T22 = _mm_srai_epi16(_mm_add_epi16(T22, c_16), 5);
+    T23 = _mm_srai_epi16(_mm_add_epi16(T23, c_16), 5);
+    T22 = _mm_packus_epi16(T22, T23);
+    _mm_store_si128((__m128i*)pDstN[27], T22);
+
+    T10 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[ 5]));
+    T11 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[10]));
+    T12 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[15]));
+    T13 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[20]));
+    T20 = _mm_unpacklo_epi64(T10, T11);
+    T21 = _mm_unpacklo_epi64(T12, T13);
+    T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_16), 5);
+    T21 = _mm_srai_epi16(_mm_add_epi16(T21, c_16), 5);
+    T20 = _mm_packus_epi16(T20, T21);
+    _mm_store_si128((__m128i*)pDstN[ 6], T20);
+    T22 = _mm_unpackhi_epi64(T10, T11);
+    T23 = _mm_unpackhi_epi64(T12, T13);
+    T22 = _mm_srai_epi16(_mm_add_epi16(T22, c_16), 5);
+    T23 = _mm_srai_epi16(_mm_add_epi16(T23, c_16), 5);
+    T22 = _mm_packus_epi16(T22, T23);
+    _mm_store_si128((__m128i*)pDstN[26], T22);
+
+    T10 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[ 2]));
+    T11 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[ 4]));
+    T12 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[ 6]));
+    T13 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[ 8]));
+    T20 = _mm_unpacklo_epi64(T10, T11);
+    T21 = _mm_unpacklo_epi64(T12, T13);
+    T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_16), 5);
+    T21 = _mm_srai_epi16(_mm_add_epi16(T21, c_16), 5);
+    T20 = _mm_packus_epi16(T20, T21);
+    _mm_store_si128((__m128i*)pDstN[ 7], T20);
+    T22 = _mm_unpackhi_epi64(T10, T11);
+    T23 = _mm_unpackhi_epi64(T12, T13);
+    T22 = _mm_srai_epi16(_mm_add_epi16(T22, c_16), 5);
+    T23 = _mm_srai_epi16(_mm_add_epi16(T23, c_16), 5);
+    T22 = _mm_packus_epi16(T22, T23);
+    _mm_store_si128((__m128i*)pDstN[25], T22);
+
+    R00 = _mm_loadu_si128((__m128i*)(pLeft0));      // [-- -- -- -- -- --  -- -08 -07 -06 -05 -04 -03 -02 -01  00]
+    R10 = _mm_srli_si128(R00, 1);                   // [-- -- -- -- -- --  --  -- -08 -07 -06 -05 -04 -03 -02 -01]
+    R20 = _mm_srli_si128(R00, 2);                   // [-- -- -- -- -- --  --  --  -- -08 -07 -06 -05 -04 -03 -02]
+    R30 = _mm_srli_si128(R00, 3);                   // [-- -- -- -- -- --  --  --  --  -- -08 -07 -06 -05 -04 -03]
+
+    R01 = _mm_loadu_si128((__m128i*)(pAbove0));     // [-- -- -- -- -- -- --   08  07  06  05  04  03  02  01  00]
+    R11 = _mm_srli_si128(R01, 1);                   // [-- -- -- -- -- -- --   --  08  07  06  05  04  03  02  01]
+    R21 = _mm_srli_si128(R01, 2);                   // [-- -- -- -- -- -- --   --  --  08  07  06  05  04  03  02]
+    R31 = _mm_srli_si128(R01, 3);                   // [-- -- -- -- -- -- --   --  --  --  08  07  06  05  04  03]
+
+    T00 = _mm_shuffle_epi8( R00, _mm_load_si128((__m128i*)tab_angle_0[ 0]));    // [ -- -08 -07 -06 -06 -05 -05 -04 -04 -03 -03 -02 -02 -01 -01  00]
+    T04 = _mm_shuffle_epi8( R01, _mm_load_si128((__m128i*)tab_angle_0[ 0]));    // [ --  08  07  06  06  05  05  04  04  03  03  02  02  01  01  00]
+    T00 = _mm_unpacklo_epi64(T00, T04);     // [ 04  03  03  02  02  01  01  00 -04 -03 -03 -02 -02 -01 -01  00]
+
+    T10 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[30]));
+    T11 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[28]));
+    T12 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[26]));
+    T13 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[24]));
+    T20 = _mm_unpacklo_epi64(T10, T11);
+    T21 = _mm_unpacklo_epi64(T12, T13);
+    T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_16), 5);
+    T21 = _mm_srai_epi16(_mm_add_epi16(T21, c_16), 5);
+    T20 = _mm_packus_epi16(T20, T21);
+    _mm_store_si128((__m128i*)pDstN[ 9], T20);
+    T22 = _mm_unpackhi_epi64(T10, T11);
+    T23 = _mm_unpackhi_epi64(T12, T13);
+    T22 = _mm_srai_epi16(_mm_add_epi16(T22, c_16), 5);
+    T23 = _mm_srai_epi16(_mm_add_epi16(T23, c_16), 5);
+    T22 = _mm_packus_epi16(T22, T23);
+    _mm_store_si128((__m128i*)pDstN[23], T22);
+
+    T10 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[27]));
+    T11 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[22]));
+    T12 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[17]));
+    T13 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[12]));
+    T20 = _mm_unpacklo_epi64(T10, T11);
+    T21 = _mm_unpacklo_epi64(T12, T13);
+    T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_16), 5);
+    T21 = _mm_srai_epi16(_mm_add_epi16(T21, c_16), 5);
+    T20 = _mm_packus_epi16(T20, T21);
+    _mm_store_si128((__m128i*)pDstN[10], T20);
+    T22 = _mm_unpackhi_epi64(T10, T11);
+    T23 = _mm_unpackhi_epi64(T12, T13);
+    T22 = _mm_srai_epi16(_mm_add_epi16(T22, c_16), 5);
+    T23 = _mm_srai_epi16(_mm_add_epi16(T23, c_16), 5);
+    T22 = _mm_packus_epi16(T22, T23);
+    _mm_store_si128((__m128i*)pDstN[22], T22);
+
+    T30 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)tab_angle_0[1]));
+    T10 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[23]));
+    T11 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[14]));
+    T12 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[ 5]));
+    T13 = _mm_maddubs_epi16(T30, _mm_load_si128((__m128i*)tab_angle_1[28]));
+    T20 = _mm_unpacklo_epi64(T10, T11);
+    T21 = _mm_unpacklo_epi64(T12, T13);
+    T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_16), 5);
+    T21 = _mm_srai_epi16(_mm_add_epi16(T21, c_16), 5);
+    T20 = _mm_packus_epi16(T20, T21);
+    _mm_store_si128((__m128i*)pDstN[11], T20);
+    T22 = _mm_unpackhi_epi64(T10, T11);
+    T23 = _mm_unpackhi_epi64(T12, T13);
+    T22 = _mm_srai_epi16(_mm_add_epi16(T22, c_16), 5);
+    T23 = _mm_srai_epi16(_mm_add_epi16(T23, c_16), 5);
+    T22 = _mm_packus_epi16(T22, T23);
+    _mm_store_si128((__m128i*)pDstN[21], T22);
+
+    T30 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)tab_angle_0[2]));
+    T10 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[19]));
+    T11 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[ 6]));
+    T12 = _mm_maddubs_epi16(T30, _mm_load_si128((__m128i*)tab_angle_1[25]));
+    T13 = _mm_maddubs_epi16(T30, _mm_load_si128((__m128i*)tab_angle_1[12]));
+    T20 = _mm_unpacklo_epi64(T10, T11);
+    T21 = _mm_unpacklo_epi64(T12, T13);
+    T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_16), 5);
+    T21 = _mm_srai_epi16(_mm_add_epi16(T21, c_16), 5);
+    T20 = _mm_packus_epi16(T20, T21);
+    _mm_store_si128((__m128i*)pDstN[12], T20);
+    T22 = _mm_unpackhi_epi64(T10, T11);
+    T23 = _mm_unpackhi_epi64(T12, T13);
+    T22 = _mm_srai_epi16(_mm_add_epi16(T22, c_16), 5);
+    T23 = _mm_srai_epi16(_mm_add_epi16(T23, c_16), 5);
+    T22 = _mm_packus_epi16(T22, T23);
+    _mm_store_si128((__m128i*)pDstN[20], T22);
+
+    T31 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)tab_angle_0[3]));
+    T10 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[15]));
+    T11 = _mm_maddubs_epi16(T30, _mm_load_si128((__m128i*)tab_angle_1[30]));
+    T12 = _mm_maddubs_epi16(T30, _mm_load_si128((__m128i*)tab_angle_1[13]));
+    T13 = _mm_maddubs_epi16(T31, _mm_load_si128((__m128i*)tab_angle_1[28]));
+    T20 = _mm_unpacklo_epi64(T10, T11);
+    T21 = _mm_unpacklo_epi64(T12, T13);
+    T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_16), 5);
+    T21 = _mm_srai_epi16(_mm_add_epi16(T21, c_16), 5);
+    T20 = _mm_packus_epi16(T20, T21);
+    _mm_store_si128((__m128i*)pDstN[13], T20);
+    T22 = _mm_unpackhi_epi64(T10, T11);
+    T23 = _mm_unpackhi_epi64(T12, T13);
+    T22 = _mm_srai_epi16(_mm_add_epi16(T22, c_16), 5);
+    T23 = _mm_srai_epi16(_mm_add_epi16(T23, c_16), 5);
+    T22 = _mm_packus_epi16(T22, T23);
+    _mm_store_si128((__m128i*)pDstN[19], T22);
+
+    T31 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)tab_angle_0[4]));
+    T10 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[11]));
+    T11 = _mm_maddubs_epi16(T30, _mm_load_si128((__m128i*)tab_angle_1[22]));
+    T12 = _mm_maddubs_epi16(T30, _mm_load_si128((__m128i*)tab_angle_1[ 1]));
+    T13 = _mm_maddubs_epi16(T31, _mm_load_si128((__m128i*)tab_angle_1[12]));
+    T20 = _mm_unpacklo_epi64(T10, T11);
+    T21 = _mm_unpacklo_epi64(T12, T13);
+    T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_16), 5);
+    T21 = _mm_srai_epi16(_mm_add_epi16(T21, c_16), 5);
+    T20 = _mm_packus_epi16(T20, T21);
+    _mm_store_si128((__m128i*)pDstN[14], T20);
+    T22 = _mm_unpackhi_epi64(T10, T11);
+    T23 = _mm_unpackhi_epi64(T12, T13);
+    T22 = _mm_srai_epi16(_mm_add_epi16(T22, c_16), 5);
+    T23 = _mm_srai_epi16(_mm_add_epi16(T23, c_16), 5);
+    T22 = _mm_packus_epi16(T22, T23);
+    _mm_store_si128((__m128i*)pDstN[18], T22);
+
+    T30 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)tab_angle_0[5]));
+    T31 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)tab_angle_0[6]));
+    T32 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)tab_angle_0[7]));
+    T10 = _mm_maddubs_epi16(T00, _mm_load_si128((__m128i*)tab_angle_1[ 6]));
+    T11 = _mm_maddubs_epi16(T30, _mm_load_si128((__m128i*)tab_angle_1[12]));
+    T12 = _mm_maddubs_epi16(T31, _mm_load_si128((__m128i*)tab_angle_1[18]));
+    T13 = _mm_maddubs_epi16(T32, _mm_load_si128((__m128i*)tab_angle_1[24]));
+    T20 = _mm_unpacklo_epi64(T10, T11);
+    T21 = _mm_unpacklo_epi64(T12, T13);
+    T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_16), 5);
+    T21 = _mm_srai_epi16(_mm_add_epi16(T21, c_16), 5);
+    T20 = _mm_packus_epi16(T20, T21);
+    _mm_store_si128((__m128i*)pDstN[15], T20);
+    T22 = _mm_unpackhi_epi64(T10, T11);
+    T23 = _mm_unpackhi_epi64(T12, T13);
+    T22 = _mm_srai_epi16(_mm_add_epi16(T22, c_16), 5);
+    T23 = _mm_srai_epi16(_mm_add_epi16(T23, c_16), 5);
+    T22 = _mm_packus_epi16(T22, T23);
+    _mm_store_si128((__m128i*)pDstN[17], T22);
+
+    T30 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)tab_angle_0[8]));
+    _mm_store_si128((__m128i*)pDstN[16], T30);
 }
-#endif // INSTRSET < 4
 
-#endif // HIGH_BIT_DEPTH
+#endif // HIGH_BIT_DEPTH || (INSTRSET < 4)
 
 }
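
The new xPredIntraAngs4 repeats one idiom for every fractional angle;
pulled out on its own, under the tables added above, it is roughly the
sketch below ('refs' and the helper name are illustrative):

    #include <tmmintrin.h>  // SSSE3: _mm_shuffle_epi8, _mm_maddubs_epi16

    // tab_angle_0[0] = {0,1,1,2,...} pairs each reference pixel with its
    // right neighbour, so every 16-bit maddubs lane sees (ref[i], ref[i+1]);
    // tab_angle_1[a] = {32-a, a, ...} supplies the two-tap weights, so one
    // maddubs computes (32-a)*ref[i] + a*ref[i+1] in each lane.
    static __m128i angularRow(__m128i refs, int a)
    {
        __m128i pairs = _mm_shuffle_epi8(refs, _mm_load_si128((const __m128i*)tab_angle_0[0]));
        __m128i rows  = _mm_maddubs_epi16(pairs, _mm_load_si128((const __m128i*)tab_angle_1[a]));
        rows = _mm_srai_epi16(_mm_add_epi16(rows, _mm_set1_epi16(16)), 5); // HEVC rounding: (x+16)>>5
        return _mm_packus_epi16(rows, rows);  // saturate back to 8-bit pixels
    }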
 
-- 
1.8.3.msysgit.0



