[x265] [PATCH] dct: modified block copy used in dct8 with convert16to32 inline function
yuvaraj at multicorewareinc.com
yuvaraj at multicorewareinc.com
Fri Oct 11 09:13:15 CEST 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1381475536 -19800
# Fri Oct 11 12:42:16 2013 +0530
# Node ID 8bb743458331d7cdc1008e217542e406818c5a7a
# Parent 7134a091a71d7acf08e80392bf8c0f6f0dcec700
dct: modified block copy used in dct8 with convert16to32 inline function
diff -r 7134a091a71d -r 8bb743458331 source/common/vec/dct-sse41.cpp
--- a/source/common/vec/dct-sse41.cpp Thu Oct 10 17:21:36 2013 -0500
+++ b/source/common/vec/dct-sse41.cpp Fri Oct 11 12:42:16 2013 +0530
@@ -402,6 +402,28 @@
}
}
+inline void convert16to32(short *org, int *dst, int num) /* Copy num 16-bit values from org to dst, sign-extending each one to 32 bits. */
+{
+ int i;
+
+ for (i = 0; i < num; i += 8) /* 8 elements per pass. NOTE(review): if num is not a multiple of 8 the last pass touches elements beyond num; the caller in this patch passes N*N = 64. */
+ {
+ __m128i im16;
+ __m128i im32L, im32H;
+ __m128i sign;
+
+ im16 = _mm_loadu_si128((__m128i*)org); /* unaligned load of eight 16-bit inputs */
+ sign = _mm_srai_epi16(im16, 15); /* arithmetic shift by 15: each lane becomes 0x0000 or 0xFFFF depending on its sign bit */
+ im32L = _mm_unpacklo_epi16(im16, sign); /* interleave value/sign words: lower four inputs as sign-extended 32-bit lanes */
+ im32H = _mm_unpackhi_epi16(im16, sign); /* same for the upper four inputs */
+ _mm_storeu_si128((__m128i*)dst, im32L); /* unaligned store of outputs 0..3 */
+ _mm_storeu_si128((__m128i*)(dst + 4), im32H); /* unaligned store of outputs 4..7 */
+
+ org += 8; /* advance both pointers past the 8 elements just converted */
+ dst += 8;
+ }
+}
+
void dct8(short *src, int *dst, intptr_t stride)
{
const int shift_1st = 2;
@@ -418,15 +440,9 @@
partialButterfly8(block, coef, shift_1st, 8);
partialButterfly8(coef, block, shift_2nd, 8);
- /* TODO: inline cvt16to32 once it is intrinsic based */
#define N (8)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- dst[i * N + j] = block[i * N + j];
- }
- }
+
+ convert16to32(block, dst, N*N);
#undef N
}
More information about the x265-devel
mailing list