[x265] [PATCH] intra-sse3.cpp: Created common macros PRED_INTRA_ANGLE_4_START, PRED_INTRA_ANGLE_4_END for PredIntraAng4_[ANGLE] function
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Fri Oct 11 09:13:27 CEST 2013
# HG changeset patch
# User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
# Date 1381475507 -19800
# Fri Oct 11 12:41:47 2013 +0530
# Node ID 0f5a6fd46f0acb3b401f9fe8c026ec97e1021cfd
# Parent 7320ecd0901c8338b5a1e1c121d2da44619ee17d
intra-sse3.cpp: Created common macros PRED_INTRA_ANGLE_4_START, PRED_INTRA_ANGLE_4_END for PredIntraAng4_[ANGLE] function.
diff -r 7320ecd0901c -r 0f5a6fd46f0a source/common/vec/intra-sse3.cpp
--- a/source/common/vec/intra-sse3.cpp Thu Oct 10 22:11:58 2013 -0500
+++ b/source/common/vec/intra-sse3.cpp Fri Oct 11 12:41:47 2013 +0530
@@ -39,6 +39,67 @@
extern unsigned char IntraFilterType[][35];
+#define PRED_INTRA_ANGLE_4_START /* Opening half of a PredIntraAng4_[ANGLE] body: declares the SSE locals that PRED_INTRA_ANGLE_4_END reads. Needs `dirMode` in scope where it is expanded. */ \
+ __m128i row11, row12, row21, row22, row31, row32, row41, row42; /* rowN1/rowN2: the two operand vectors blended for output row N; declared uninitialized, so they must be assigned before PRED_INTRA_ANGLE_4_END runs */ \
+ __m128i tmp16_1, tmp16_2, tmp2, deltaFract; /* tmp16_1 and deltaFract are written by PRED_INTRA_ANGLE_4_END; tmp16_2 and this tmp2 are not referenced there (its modeHor branch declares its own tmp2) */ \
+ __m128i deltaPos = _mm_set1_epi16(0); /* position accumulator, eight 16-bit lanes, starts at 0; presumably overwritten by the per-angle code - confirm at the call sites */ \
+ __m128i ipAngle = _mm_set1_epi16(0); /* per-row increment added to deltaPos; 0 here, presumably replaced by the per-angle code - confirm at the call sites */ \
+ __m128i thirty1 = _mm_set1_epi16(31); /* mask 31: keeps the low 5 bits of deltaPos as the blend fraction */ \
+ __m128i thirty2 = _mm_set1_epi16(32); /* 32: the fraction's complement is computed as 32 - fraction */ \
+ bool modeHor = (dirMode < 18); /* true for dirMode below 18 (presumably the horizontal modes); selects the transposed store in PRED_INTRA_ANGLE_4_END */
+
+#define PRED_INTRA_ANGLE_4_END /* Closing half of a PredIntraAng4_[ANGLE] body: blends four rows as (rowN1*(32-f) + rowN2*f + 16) >> 5 per 16-bit lane, f = deltaPos & 31, then stores 4 bytes per row. Needs `dst`, `dstStride` and the locals of PRED_INTRA_ANGLE_4_START in scope. */ \
+ deltaFract = _mm_and_si128(deltaPos, thirty1); /* row 1: f = deltaPos & 31; deltaPos is used as-is, it is not advanced before the first row */ \
+ __m128i mullo = _mm_mullo_epi16(row11, _mm_sub_epi16(thirty2, deltaFract)); /* row11 * (32 - f), low 16 bits of each product */ \
+ __m128i sum = _mm_add_epi16(_mm_set1_epi16(16), _mm_mullo_epi16(deltaFract, row12)); /* 16 + f * row12; the 16 is half of the 32 divided out by the shift below */ \
+ row11 = _mm_sra_epi16(_mm_add_epi16(mullo, sum), _mm_cvtsi32_si128(5)); /* arithmetic >> 5; the blended row overwrites row11 */ \
+ /* row 2: advance deltaPos by ipAngle, then the same blend on row21/row22 */ \
+ deltaPos = _mm_add_epi16(deltaPos, ipAngle); \
+ deltaFract = _mm_and_si128(deltaPos, thirty1); \
+ mullo = _mm_mullo_epi16(row21, _mm_sub_epi16(thirty2, deltaFract)); \
+ sum = _mm_add_epi16(_mm_set1_epi16(16), _mm_mullo_epi16(deltaFract, row22)); \
+ row21 = _mm_sra_epi16(_mm_add_epi16(mullo, sum), _mm_cvtsi32_si128(5)); \
+ /* row 3: advance deltaPos again, blend row31/row32 */ \
+ deltaPos = _mm_add_epi16(deltaPos, ipAngle); \
+ deltaFract = _mm_and_si128(deltaPos, thirty1); \
+ mullo = _mm_mullo_epi16(row31, _mm_sub_epi16(thirty2, deltaFract)); \
+ sum = _mm_add_epi16(_mm_set1_epi16(16), _mm_mullo_epi16(deltaFract, row32)); \
+ row31 = _mm_sra_epi16(_mm_add_epi16(mullo, sum), _mm_cvtsi32_si128(5)); \
+ /* row 4: advance deltaPos a third time, blend row41/row42 */ \
+ deltaPos = _mm_add_epi16(deltaPos, ipAngle); \
+ deltaFract = _mm_and_si128(deltaPos, thirty1); \
+ mullo = _mm_mullo_epi16(row41, _mm_sub_epi16(thirty2, deltaFract)); \
+ sum = _mm_add_epi16(_mm_set1_epi16(16), _mm_mullo_epi16(deltaFract, row42)); \
+ row41 = _mm_sra_epi16(_mm_add_epi16(mullo, sum), _mm_cvtsi32_si128(5)); \
+ /* store phase: 4 bytes to each of dst, dst + dstStride, dst + 2*dstStride, dst + 3*dstStride */ \
+ if (modeHor) /* transposed store: computed row N becomes output column N */ \
+ { \
+ __m128i tmp1, tmp2, tmp3, tmp4; /* this tmp2 shadows the tmp2 declared by PRED_INTRA_ANGLE_4_START */ \
+ /* 4x4 transpose of the low four 16-bit lanes of the four rows */ \
+ tmp1 = _mm_unpacklo_epi16(row11, row31); /* r1[0] r3[0] r1[1] r3[1] r1[2] r3[2] r1[3] r3[3] */ \
+ tmp2 = _mm_unpacklo_epi16(row21, row41); /* r2[0] r4[0] r2[1] r4[1] r2[2] r4[2] r2[3] r4[3] */ \
+ tmp3 = _mm_unpacklo_epi16(tmp1, tmp2); /* lane 0 of rows 1..4, then lane 1 of rows 1..4 */ \
+ tmp4 = _mm_unpackhi_epi16(tmp1, tmp2); /* lane 2 of rows 1..4, then lane 3 of rows 1..4 */ \
+ /* narrow to bytes and write out one transposed line at a time */ \
+ tmp16_1 = _mm_packus_epi16(tmp3, tmp3); /* unsigned-saturating pack to 8 bits: bytes 0-3 = lane 0 of rows 1..4, bytes 4-7 = lane 1 */ \
+ *(uint32_t*)(dst) = _mm_cvtsi128_si32(tmp16_1); /* low 4 bytes to the first output line; NOTE(review): 32-bit store through a cast pointer - confirm dst's type and alignment make this acceptable */ \
+ tmp2 = tmp16_1; \
+ tmp2 = _mm_srl_epi64(tmp2, _mm_cvtsi32_si128(32)); /* shift each 64-bit half right by 32 bits so bytes 4-7 land in the low dword */ \
+ *(uint32_t*)(dst + dstStride) = _mm_cvtsi128_si32(tmp2); /* second output line */ \
+ tmp16_1 = _mm_packus_epi16(tmp4, tmp4); /* same pack for lanes 2 and 3 */ \
+ *(uint32_t*)(dst + (2 * dstStride)) = _mm_cvtsi128_si32(tmp16_1); /* third output line */ \
+ tmp2 = tmp16_1; \
+ tmp2 = _mm_srl_epi64(tmp2, _mm_cvtsi32_si128(32)); \
+ *(uint32_t*)(dst + (3 * dstStride)) = _mm_cvtsi128_si32(tmp2); /* fourth output line */ \
+ } \
+ else /* no transpose: the low four lanes of each computed row are packed to bytes and stored as that output line */ \
+ { \
+ *(uint32_t*)(dst) = _mm_cvtsi128_si32(_mm_packus_epi16(row11,row11)); \
+ *(uint32_t*)(dst + dstStride) = _mm_cvtsi128_si32(_mm_packus_epi16(row21,row21)); \
+ *(uint32_t*)(dst + (2 * dstStride)) = _mm_cvtsi128_si32(_mm_packus_epi16(row31,row31)); \
+ *(uint32_t*)(dst + (3 * dstStride)) = _mm_cvtsi128_si32(_mm_packus_epi16(row41,row41)); \
+ }
+
#define PRED_INTRA_ANG4_START \
Vec8s row11, row12, row21, row22, row31, row32, row41, row42; \
Vec16uc tmp16_1, tmp16_2; \
More information about the x265-devel
mailing list