[x265] [PATCH] intra-sse3.cpp: Create common macros PRED_INTRA_ANGLE_4_START and PRED_INTRA_ANGLE_4_END for the PredIntraAng4_[ANGLE] functions

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Fri Oct 11 09:13:27 CEST 2013


# HG changeset patch
# User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
# Date 1381475507 -19800
#      Fri Oct 11 12:41:47 2013 +0530
# Node ID 0f5a6fd46f0acb3b401f9fe8c026ec97e1021cfd
# Parent  7320ecd0901c8338b5a1e1c121d2da44619ee17d
intra-sse3.cpp: Create common macros PRED_INTRA_ANGLE_4_START and PRED_INTRA_ANGLE_4_END for the PredIntraAng4_[ANGLE] functions.

diff -r 7320ecd0901c -r 0f5a6fd46f0a source/common/vec/intra-sse3.cpp
--- a/source/common/vec/intra-sse3.cpp	Thu Oct 10 22:11:58 2013 -0500
+++ b/source/common/vec/intra-sse3.cpp	Fri Oct 11 12:41:47 2013 +0530
@@ -39,6 +39,67 @@
 
 extern unsigned char IntraFilterType[][35];
 
+#define PRED_INTRA_ANGLE_4_START /* Prologue: declares the locals consumed by PRED_INTRA_ANGLE_4_END; reads dirMode from the expansion site. */ \
+        __m128i row11, row12, row21, row22, row31, row32, row41, row42; /* rowN1 / rowN2: the two operands blended for output row N; left unset here */ \
+        __m128i tmp16_1, tmp16_2, tmp2, deltaFract; /* scratch registers, uninitialized; tmp16_2 is not used by PRED_INTRA_ANGLE_4_END */ \
+        __m128i deltaPos = _mm_set1_epi16(0); /* zero in every 16-bit lane; presumably set by the expanding function before the epilogue -- confirm */ \
+        __m128i ipAngle  = _mm_set1_epi16(0); /* increment added to deltaPos between rows in the epilogue; zero until the caller assigns it */ \
+        __m128i thirty1  = _mm_set1_epi16(31); /* mask the epilogue ANDs with deltaPos to obtain deltaFract */ \
+        __m128i thirty2  = _mm_set1_epi16(32); /* the epilogue weights rowN1 by (32 - deltaFract) and rowN2 by deltaFract */ \
+        bool modeHor     = (dirMode < 18); /* true when dirMode is below 18; selects the transposing store path in the epilogue */
+
+#define PRED_INTRA_ANGLE_4_END /* Epilogue: blends rowN1/rowN2 for four rows and stores four bytes per row at dst; needs the PRED_INTRA_ANGLE_4_START locals plus dst and dstStride in scope. */ \
+        deltaFract = _mm_and_si128(deltaPos, thirty1); /* row 1 weight: deltaPos & 31 in each 16-bit lane */ \
+        __m128i mullo = _mm_mullo_epi16(row11, _mm_sub_epi16(thirty2, deltaFract)); /* row11 * (32 - deltaFract); note mullo is declared in the expansion scope */ \
+        __m128i sum = _mm_add_epi16(_mm_set1_epi16(16), _mm_mullo_epi16(deltaFract, row12)); /* 16 + deltaFract * row12; the 16 rounds the shift by 5 below */ \
+        row11 = _mm_sra_epi16(_mm_add_epi16(mullo, sum), _mm_cvtsi32_si128(5)); /* (weighted sum + 16) >> 5, arithmetic shift */ \
+        /* Row 2: step deltaPos by ipAngle, then apply the same blend to row21/row22. */ \
+        deltaPos = _mm_add_epi16(deltaPos, ipAngle); \
+        deltaFract = _mm_and_si128(deltaPos, thirty1); \
+        mullo = _mm_mullo_epi16(row21, _mm_sub_epi16(thirty2, deltaFract)); \
+        sum = _mm_add_epi16(_mm_set1_epi16(16), _mm_mullo_epi16(deltaFract, row22)); \
+        row21 = _mm_sra_epi16(_mm_add_epi16(mullo, sum), _mm_cvtsi32_si128(5)); \
+        /* Row 3: step deltaPos again and blend row31/row32. */ \
+        deltaPos = _mm_add_epi16(deltaPos, ipAngle); \
+        deltaFract = _mm_and_si128(deltaPos, thirty1); \
+        mullo = _mm_mullo_epi16(row31, _mm_sub_epi16(thirty2, deltaFract)); \
+        sum = _mm_add_epi16(_mm_set1_epi16(16), _mm_mullo_epi16(deltaFract, row32)); \
+        row31 = _mm_sra_epi16(_mm_add_epi16(mullo, sum), _mm_cvtsi32_si128(5)); \
+        /* Row 4: step deltaPos a third time and blend row41/row42. */ \
+        deltaPos = _mm_add_epi16(deltaPos, ipAngle); \
+        deltaFract = _mm_and_si128(deltaPos, thirty1); \
+        mullo = _mm_mullo_epi16(row41, _mm_sub_epi16(thirty2, deltaFract)); \
+        sum = _mm_add_epi16(_mm_set1_epi16(16), _mm_mullo_epi16(deltaFract, row42)); \
+        row41 = _mm_sra_epi16(_mm_add_epi16(mullo, sum), _mm_cvtsi32_si128(5)); \
+        /* Store: when modeHor is set, the low four lanes of the four rows are transposed before being written. */ \
+        if (modeHor) \
+        { \
+            __m128i tmp1, tmp2, tmp3, tmp4; /* NOTE(review): this tmp2 shadows the tmp2 declared by PRED_INTRA_ANGLE_4_START */ \
+             \
+            tmp1 =   _mm_unpacklo_epi16(row11, row31); /* interleave rows 1 and 3: r1[0] r3[0] r1[1] r3[1] ... */ \
+            tmp2 =   _mm_unpacklo_epi16(row21, row41); /* interleave rows 2 and 4: r2[0] r4[0] r2[1] r4[1] ... */ \
+            tmp3 =   _mm_unpacklo_epi16(tmp1, tmp2); /* columns 0 and 1: r1[0] r2[0] r3[0] r4[0] r1[1] r2[1] r3[1] r4[1] */ \
+            tmp4 =   _mm_unpackhi_epi16(tmp1, tmp2); /* columns 2 and 3, same layout */ \
+             \
+            tmp16_1 = _mm_packus_epi16(tmp3, tmp3); /* saturate to unsigned bytes: bytes 0-3 = column 0, bytes 4-7 = column 1 */ \
+            *(uint32_t*)(dst) = _mm_cvtsi128_si32(tmp16_1); /* output row 0 <- column 0 */ \
+            tmp2 = tmp16_1; \
+            tmp2 = _mm_srl_epi64(tmp2, _mm_cvtsi32_si128(32)); /* shift each 64-bit half right by 32 bits so column 1 lands in the low dword */ \
+            *(uint32_t*)(dst + dstStride) = _mm_cvtsi128_si32(tmp2); /* output row 1 <- column 1 */ \
+            tmp16_1 = _mm_packus_epi16(tmp4, tmp4); /* bytes 0-3 = column 2, bytes 4-7 = column 3 */ \
+            *(uint32_t*)(dst + (2 * dstStride)) = _mm_cvtsi128_si32(tmp16_1); /* output row 2 <- column 2 */ \
+            tmp2 = tmp16_1; \
+            tmp2 = _mm_srl_epi64(tmp2, _mm_cvtsi32_si128(32)); \
+            *(uint32_t*)(dst + (3 * dstStride)) = _mm_cvtsi128_si32(tmp2); /* output row 3 <- column 3 */ \
+        } \
+        else \
+        { \
+            *(uint32_t*)(dst) = _mm_cvtsi128_si32(_mm_packus_epi16(row11,row11)); /* no transpose: pack each row to unsigned bytes and store its low four bytes */ \
+            *(uint32_t*)(dst + dstStride) = _mm_cvtsi128_si32(_mm_packus_epi16(row21,row21)); \
+            *(uint32_t*)(dst + (2 * dstStride)) = _mm_cvtsi128_si32(_mm_packus_epi16(row31,row31)); \
+            *(uint32_t*)(dst + (3 * dstStride)) = _mm_cvtsi128_si32(_mm_packus_epi16(row41,row41)); \
+        }
+
 #define PRED_INTRA_ANG4_START   \
     Vec8s row11, row12, row21, row22, row31, row32, row41, row42;   \
     Vec16uc tmp16_1, tmp16_2;   \


More information about the x265-devel mailing list