[x265] [PATCH] dct: modified block copy used in dct8 with convert16to32 inline function

yuvaraj at multicorewareinc.com yuvaraj at multicorewareinc.com
Fri Oct 11 09:13:15 CEST 2013


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1381475536 -19800
#      Fri Oct 11 12:42:16 2013 +0530
# Node ID 8bb743458331d7cdc1008e217542e406818c5a7a
# Parent  7134a091a71d7acf08e80392bf8c0f6f0dcec700
dct: modified block copy used in dct8 with convert16to32 inline function

diff -r 7134a091a71d -r 8bb743458331 source/common/vec/dct-sse41.cpp
--- a/source/common/vec/dct-sse41.cpp	Thu Oct 10 17:21:36 2013 -0500
+++ b/source/common/vec/dct-sse41.cpp	Fri Oct 11 12:42:16 2013 +0530
@@ -402,6 +402,28 @@
     }
 }
 
+inline void convert16to32(short *org, int *dst, int num)
+{   // Sign-extend 16-bit values from org into 32-bit ints at dst, 8 elements per pass.
+    int i;
+    // Every pass loads/stores a full group of 8, so a num that is not a multiple of 8 is effectively rounded up.
+    for (i = 0; i < num; i += 8)
+    {
+        __m128i im16;
+        __m128i im32L, im32H;
+        __m128i sign;
+        // Unaligned load of 8 shorts; no alignment requirement is placed on org.
+        im16 = _mm_loadu_si128((__m128i*)org);
+        sign = _mm_srai_epi16(im16, 15);            // each 16-bit lane becomes 0xFFFF if negative, else 0
+        im32L = _mm_unpacklo_epi16(im16, sign);     // elements 0..3 interleaved with their sign words -> 4 x int32
+        im32H = _mm_unpackhi_epi16(im16, sign);     // elements 4..7 interleaved with their sign words -> 4 x int32
+        _mm_storeu_si128((__m128i*)dst, im32L);     // unaligned store of dst[0..3]
+        _mm_storeu_si128((__m128i*)(dst + 4), im32H); // unaligned store of dst[4..7]
+        // Advance both pointers by the 8 elements just converted.
+        org += 8;
+        dst += 8;
+    }
+}
+
 void dct8(short *src, int *dst, intptr_t stride)
 {
     const int shift_1st = 2;
@@ -418,15 +440,9 @@
     partialButterfly8(block, coef, shift_1st, 8);
     partialButterfly8(coef, block, shift_2nd, 8);
 
-    /* TODO: inline cvt16to32 once it is intrinsic based */
 #define N (8)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            dst[i * N + j] = block[i * N + j];
-        }
-    }
+
+    convert16to32(block, dst, N*N);
 
 #undef N
 }


More information about the x265-devel mailing list