[x265] [PATCH] quant_c optimization, downscaling qCoef from int32_t* to int16_t*

Tue Sep 2 16:14:16 CEST 2014

# HG changeset patch
# User Praveen Tiwari
# Date 1409661936 -19800
# Node ID 40e242e316b962116d64fb43444029c5c6546484
# Parent  e1b2ab942177bc9f67547a61c365c6167b5cee38
quant_c optimization, downscaling qCoef from int32_t* to int16_t*

diff -r e1b2ab942177 -r 40e242e316b9 source/common/dct.cpp

--- a/source/common/dct.cpp	Tue Sep 02 17:52:33 2014 +0530
+++ b/source/common/dct.cpp	Tue Sep 02 18:15:36 2014 +0530
@@ -769,7 +769,7 @@
     }
 }
 
-uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int32_t* qCoef, int qBits, int add, int numCoeff)
+uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
 {
     int qBits8 = qBits - 8;
     uint32_t numSig = 0;
@@ -785,7 +785,7 @@
         if (level)
             ++numSig;
         level *= sign;
-        qCoef[blockpos] = Clip3(-32768, 32767, level);
+        qCoef[blockpos] = (int16_t)Clip3(-32768, 32767, level);
     }
 
     return numSig;
diff -r e1b2ab942177 -r 40e242e316b9 source/common/primitives.h
--- a/source/common/primitives.h	Tue Sep 02 17:52:33 2014 +0530
+++ b/source/common/primitives.h	Tue Sep 02 18:15:36 2014 +0530
@@ -164,7 +164,7 @@
 typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
 typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
 typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
-typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
+typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
 typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
 typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
 typedef void (*dequant_normal_t)(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
diff -r e1b2ab942177 -r 40e242e316b9 source/common/quant.cpp
--- a/source/common/quant.cpp	Tue Sep 02 17:52:33 2014 +0530
+++ b/source/common/quant.cpp	Tue Sep 02 18:15:36 2014 +0530
@@ -409,7 +409,21 @@
         int add = (cu->m_slice->m_sliceType == I_SLICE ? 171 : 85) << (qbits - 9);
         int numCoeff = 1 << log2TrSize * 2;
 
-        uint32_t numSig = primitives.quant(m_resiDctCoeff, quantCoeff, deltaU, coeff, qbits, add, numCoeff);
+        /* This section of code safely converts the int32_t coefficients to int16_t; once the caller function is
+         * optimized to take coefficients as int16_t*, it will be removed. */
+        ALIGN_VAR_16(int16_t, qCoeff[32 * 32]);
+        for (int i = 0; i < numCoeff; i++)
+        {
+             qCoeff[i] = (int16_t)Clip3(-32768, 32767, coeff[i]);
+        }
+        uint32_t numSig = primitives.quant(m_resiDctCoeff, quantCoeff, deltaU, qCoeff, qbits, add, numCoeff);
+
+        /* This section of code copies the int16_t results back into the int32_t coefficient buffer; once the
+         * caller function is optimized to take coefficients as int16_t*, it will be removed. */
+        for (int i = 0; i < numCoeff; i++)
+        {
+             coeff[i] = qCoeff[i];
+        }
 
         if (numSig >= 2 && cu->m_slice->m_pps->bSignHideEnabled)
         {
diff -r e1b2ab942177 -r 40e242e316b9 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Tue Sep 02 17:52:33 2014 +0530
+++ b/source/common/x86/pixel-util.h	Tue Sep 02 18:15:36 2014 +0530
@@ -44,7 +44,7 @@
 void x265_transpose32_sse2(pixel *dest, pixel *src, intptr_t stride);
 void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);
 
-uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
+uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
 uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
 void x265_dequant_normal_sse4(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
 int x265_count_nonzero_ssse3(const int16_t *quantCoeff, int numCoeff);
diff -r e1b2ab942177 -r 40e242e316b9 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Tue Sep 02 17:52:33 2014 +0530
+++ b/source/common/x86/pixel-util8.asm	Tue Sep 02 18:15:36 2014 +0530
@@ -855,7 +855,7 @@
 
 
 ;-----------------------------------------------------------------------------
-; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
+; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal quant, 5,6,8
@@ -895,8 +895,7 @@
     pxor        m2, m1
     psubd       m2, m1
     packssdw    m2, m2
-    pmovsxwd    m2, m2
-    movu        [r3], m2
+    movh        [r3], m2
     ; 4 coeff
     movu        m0, [r0 + 16]   ; m0 = level
     pxor        m1, m1
@@ -917,13 +916,12 @@
     pxor        m2, m1
     psubd       m2, m1
     packssdw    m2, m2
-    pmovsxwd    m2, m2
-    movu        [r3 + 16], m2
+    movh        [r3 + 8], m2
 
     add         r0, 32
     add         r1, 32
     add         r2, 32
-    add         r3, 32
+    add         r3, 16
 
     dec         r4d
     jnz        .loop
diff -r e1b2ab942177 -r 40e242e316b9 source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp	Tue Sep 02 17:52:33 2014 +0530
+++ b/source/test/mbdstharness.cpp	Tue Sep 02 18:15:36 2014 +0530
@@ -214,18 +214,19 @@
         int bits = rand() % 32;
         int valueToAdd = rand() % (32 * 1024);
         int cmp_size = sizeof(int) * height * width;
+        int cmp_size1 = sizeof(short) * height * width;
         int numCoeff = height * width;
 
         int index1 = rand() % TEST_CASES;
         int index2 = rand() % TEST_CASES;
 
-        refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf1, mintbuf2, bits, valueToAdd, numCoeff);
-        optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mintbuf4, bits, valueToAdd, numCoeff);
+        refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf1, mshortbuf2, bits, valueToAdd, numCoeff);
+        optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mshortbuf3, bits, valueToAdd, numCoeff);
 
-        if (memcmp(mintbuf3, mintbuf1, cmp_size))
+        if (memcmp(mintbuf1, mintbuf3, cmp_size))
             return false;
 
-        if (memcmp(mintbuf4, mintbuf2, cmp_size))
+        if (memcmp(mshortbuf2, mshortbuf3, cmp_size1))
             return false;
 
         if (optReturnValue != refReturnValue)
@@ -430,7 +431,7 @@
     if (opt.quant)
     {
         printf("quant\t\t");
-        REPORT_SPEEDUP(opt.quant, ref.quant, int_test_buff[0], int_test_buff[1], mintbuf3, mintbuf4, 23, 23785, 32 * 32);
+        REPORT_SPEEDUP(opt.quant, ref.quant, int_test_buff[0], int_test_buff[1], mintbuf3, mshortbuf2, 23, 23785, 32 * 32);
     }
 
     if (opt.nquant)