[x265] [PATCH] improve cvt32to16_shr by merging the width and height loops

Thu Oct 17 16:15:31 CEST 2013

# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1382019277 -28800
# Node ID 6c3d24d5f3882c5ea570da94f4275428107460ed
# Parent  cc4f3a436f46c3eef3e2030696c2eaa2a7a9d63d
improvement cvt32to16_shr by merge width and height loop

diff -r cc4f3a436f46 -r 6c3d24d5f388 source/Lib/TLibCommon/TComTrQuant.cpp

--- a/source/Lib/TLibCommon/TComTrQuant.cpp	Thu Oct 17 22:13:21 2013 +0800
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp	Thu Oct 17 22:14:37 2013 +0800
@@ -490,11 +490,9 @@
     int  j, k;
     if (shift > 0)
     {
+        assert(width == height);
         transformSkipShift = shift;
-        for (j = 0; j < height; j++)
-        {
-            primitives.cvt32to16_shr(&residual[j * stride], &coef[j * width], shift, width);
-        }
+        primitives.cvt32to16_shr(residual, coef, stride, shift, width);
     }
     else
     {
diff -r cc4f3a436f46 -r 6c3d24d5f388 source/common/pixel.cpp
--- a/source/common/pixel.cpp	Thu Oct 17 22:13:21 2013 +0800
+++ b/source/common/pixel.cpp	Thu Oct 17 22:14:37 2013 +0800
@@ -439,13 +439,18 @@
     }
 }
 
-void convert32to16_shr(short *dst, int *src, int shift, int num)
+void convert32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
 {
     int round = 1 << (shift - 1);
 
-    for (int i = 0; i < num; i++)
+    for (int i = 0; i < size; i++)
     {
-        dst[i] = (short)((src[i] + round) >> shift);
+        for (int j = 0; j < size; j++)
+        {
+            dst[j] = (short)((src[j] + round) >> shift);
+        }
+        src += size;
+        dst += stride;
     }
 }
 
diff -r cc4f3a436f46 -r 6c3d24d5f388 source/common/primitives.h
--- a/source/common/primitives.h	Thu Oct 17 22:13:21 2013 +0800
+++ b/source/common/primitives.h	Thu Oct 17 22:14:37 2013 +0800
@@ -179,7 +179,7 @@
 
 typedef void (*cvt16to32_shl_t)(int *dst, short *src, intptr_t, int, int);
 typedef void (*cvt16to16_shl_t)(short *dst, short *src, int, int, intptr_t, int);
-typedef void (*cvt32to16_shr_t)(short *dst, int *src, int, int);
+typedef void (*cvt32to16_shr_t)(short *dst, int *src, intptr_t, int, int);
 
 typedef void (*dct_t)(short *src, int *dst, intptr_t stride);
 typedef void (*idct_t)(int *src, short *dst, intptr_t stride);
diff -r cc4f3a436f46 -r 6c3d24d5f388 source/common/vec/pixel-sse3.cpp
--- a/source/common/vec/pixel-sse3.cpp	Thu Oct 17 22:13:21 2013 +0800
+++ b/source/common/vec/pixel-sse3.cpp	Thu Oct 17 22:14:37 2013 +0800
@@ -31,23 +31,25 @@
 using namespace x265;
 
 namespace {
-void convert32to16_shr(short *dst, int *org, int shift, int num)
+void convert32to16_shr(short *dst, int *org, intptr_t stride, int shift, int size)
 {
-    int i;
+    int i, j;
     __m128i round = _mm_set1_epi32(1 << (shift - 1));
 
-    for (i = 0; i < num; i += 4)
+    for (i = 0; i < size; i++)
     {
-        __m128i im32;
-        __m128i im16;
+        for (j = 0; j < size; j += 4)
+        {
+            __m128i im32;
+            __m128i im16;
 
-        im32 = _mm_loadu_si128((__m128i const*)org);
-        im32 = _mm_sra_epi32(_mm_add_epi32(im32, round), _mm_cvtsi32_si128(shift));
-        im16 = _mm_packs_epi32(im32, im32);
-        _mm_storel_epi64((__m128i*)dst, im16);
-
-        org += 4;
-        dst += 4;
+            im32 = _mm_loadu_si128((__m128i const*)(org + j));
+            im32 = _mm_sra_epi32(_mm_add_epi32(im32, round), _mm_cvtsi32_si128(shift));
+            im16 = _mm_packs_epi32(im32, im32);
+            _mm_storel_epi64((__m128i*)(dst + j), im16);
+        }
+        org += size;
+        dst += stride;
     }
 }