[x265] [PATCH] improve cvt32to16_shr by merging the width and height loops
Min Chen
chenm003 at 163.com
Thu Oct 17 16:15:31 CEST 2013
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1382019277 -28800
# Node ID 6c3d24d5f3882c5ea570da94f4275428107460ed
# Parent cc4f3a436f46c3eef3e2030696c2eaa2a7a9d63d
improve cvt32to16_shr by merging the width and height loops
diff -r cc4f3a436f46 -r 6c3d24d5f388 source/Lib/TLibCommon/TComTrQuant.cpp
--- a/source/Lib/TLibCommon/TComTrQuant.cpp Thu Oct 17 22:13:21 2013 +0800
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp Thu Oct 17 22:14:37 2013 +0800
@@ -490,11 +490,9 @@
int j, k;
if (shift > 0)
{
+ assert(width == height);
transformSkipShift = shift;
- for (j = 0; j < height; j++)
- {
- primitives.cvt32to16_shr(&residual[j * stride], &coef[j * width], shift, width);
- }
+ primitives.cvt32to16_shr(residual, coef, stride, shift, width);
}
else
{
diff -r cc4f3a436f46 -r 6c3d24d5f388 source/common/pixel.cpp
--- a/source/common/pixel.cpp Thu Oct 17 22:13:21 2013 +0800
+++ b/source/common/pixel.cpp Thu Oct 17 22:14:37 2013 +0800
@@ -439,13 +439,18 @@
}
}
-void convert32to16_shr(short *dst, int *src, int shift, int num)
+void convert32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
{
int round = 1 << (shift - 1);
- for (int i = 0; i < num; i++)
+ for (int i = 0; i < size; i++)
{
- dst[i] = (short)((src[i] + round) >> shift);
+ for (int j = 0; j < size; j++)
+ {
+ dst[j] = (short)((src[j] + round) >> shift);
+ }
+ src += size;
+ dst += stride;
}
}
diff -r cc4f3a436f46 -r 6c3d24d5f388 source/common/primitives.h
--- a/source/common/primitives.h Thu Oct 17 22:13:21 2013 +0800
+++ b/source/common/primitives.h Thu Oct 17 22:14:37 2013 +0800
@@ -179,7 +179,7 @@
typedef void (*cvt16to32_shl_t)(int *dst, short *src, intptr_t, int, int);
typedef void (*cvt16to16_shl_t)(short *dst, short *src, int, int, intptr_t, int);
-typedef void (*cvt32to16_shr_t)(short *dst, int *src, int, int);
+typedef void (*cvt32to16_shr_t)(short *dst, int *src, intptr_t, int, int);
typedef void (*dct_t)(short *src, int *dst, intptr_t stride);
typedef void (*idct_t)(int *src, short *dst, intptr_t stride);
diff -r cc4f3a436f46 -r 6c3d24d5f388 source/common/vec/pixel-sse3.cpp
--- a/source/common/vec/pixel-sse3.cpp Thu Oct 17 22:13:21 2013 +0800
+++ b/source/common/vec/pixel-sse3.cpp Thu Oct 17 22:14:37 2013 +0800
@@ -31,23 +31,25 @@
using namespace x265;
namespace {
-void convert32to16_shr(short *dst, int *org, int shift, int num)
+void convert32to16_shr(short *dst, int *org, intptr_t stride, int shift, int size)
{
- int i;
+ int i, j;
__m128i round = _mm_set1_epi32(1 << (shift - 1));
- for (i = 0; i < num; i += 4)
+ for (i = 0; i < size; i++)
{
- __m128i im32;
- __m128i im16;
+ for (j = 0; j < size; j += 4)
+ {
+ __m128i im32;
+ __m128i im16;
- im32 = _mm_loadu_si128((__m128i const*)org);
- im32 = _mm_sra_epi32(_mm_add_epi32(im32, round), _mm_cvtsi32_si128(shift));
- im16 = _mm_packs_epi32(im32, im32);
- _mm_storel_epi64((__m128i*)dst, im16);
-
- org += 4;
- dst += 4;
+ im32 = _mm_loadu_si128((__m128i const*)(org + j));
+ im32 = _mm_sra_epi32(_mm_add_epi32(im32, round), _mm_cvtsi32_si128(shift));
+ im16 = _mm_packs_epi32(im32, im32);
+ _mm_storel_epi64((__m128i*)(dst + j), im16);
+ }
+ org += size;
+ dst += stride;
}
}
More information about the x265-devel
mailing list