[x265] [PATCH] count_nonzero primitive optimization, downscaling quantCoef from int32_t* to int16_t*

Tue Sep 2 16:08:04 CEST 2014

# HG changeset patch
# User Praveen Tiwari
# Date 1408951177 -19800
# Node ID 380a796052afc62cac7e480fde70e3766a940246
# Parent  c5624effb73c74e63fd2e42d2a48ea4490074dce
count_nonzero primitive optimization, downscaling quantCoef from int32_t* to int16_t*

diff -r c5624effb73c -r 380a796052af source/common/dct.cpp

--- a/source/common/dct.cpp	Mon Sep 01 14:13:37 2014 +0530
+++ b/source/common/dct.cpp	Mon Aug 25 12:49:37 2014 +0530
@@ -817,7 +817,7 @@
     return numSig;
 }
 
-int  count_nonzero_c(const int32_t *quantCoeff, int numCoeff)
+int  count_nonzero_c(const int16_t *quantCoeff, int numCoeff)
 {
     X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
     X265_CHECK(numCoeff > 0 && (numCoeff & 15) == 0, "numCoeff invalid %d\n", numCoeff);
diff -r c5624effb73c -r 380a796052af source/common/primitives.h
--- a/source/common/primitives.h	Mon Sep 01 14:13:37 2014 +0530
+++ b/source/common/primitives.h	Mon Aug 25 12:49:37 2014 +0530
@@ -166,7 +166,7 @@
 typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
 typedef void (*dequant_scaling_t)(const int32_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
 typedef void (*dequant_normal_t)(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
-typedef int  (*count_nonzero_t)(const int32_t *quantCoeff, int numCoeff);
+typedef int  (*count_nonzero_t)(const int16_t *quantCoeff, int numCoeff);
 
 typedef void (*weightp_pp_t)(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
 typedef void (*weightp_sp_t)(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
diff -r c5624effb73c -r 380a796052af source/common/quant.cpp
--- a/source/common/quant.cpp	Mon Sep 01 14:13:37 2014 +0530
+++ b/source/common/quant.cpp	Mon Aug 25 12:49:37 2014 +0530
@@ -441,7 +441,15 @@
         const uint32_t sizeIdx = log2TrSize - 2;
         int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra;
 
-        X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 << log2TrSize * 2), "numSig differ\n");
+        int numCoeff = (1 << (log2TrSize * 2));
+        assert(numCoeff <= 1024);
+        ALIGN_VAR_16(int16_t, qCoeff[1024]);
+        for (int i = 0; i < numCoeff; i++)
+        {
+            qCoeff[i] = (int16_t)Clip3(-32768, 32767, coeff[i]);
+        }
+
+        X265_CHECK((int)numSig == primitives.count_nonzero(qCoeff, numCoeff), "numSig differ\n"); // same length as the fill loop
 
         // DC only
         if (numSig == 1 && coeff[0] != 0 && !useDST)
@@ -479,7 +487,14 @@
     int numCoeff = 1 << log2TrSize * 2;
     uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff);
 
-    X265_CHECK((int)numSig == primitives.count_nonzero(dstCoeff, numCoeff), "numSig differ\n");
+    assert(numCoeff <= 1024);
+    ALIGN_VAR_16(int16_t, qCoeff[1024]);
+    for (int i = 0; i < numCoeff; i++)
+    {
+        qCoeff[i] = (int16_t)Clip3(-32768, 32767, dstCoeff[i]);
+    }
+
+    X265_CHECK((int)numSig == primitives.count_nonzero(qCoeff, numCoeff), "numSig differ\n"); // same length as the fill loop
     if (!numSig)
         return 0;
 
diff -r c5624effb73c -r 380a796052af source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Mon Sep 01 14:13:37 2014 +0530
+++ b/source/common/x86/pixel-util.h	Mon Aug 25 12:49:37 2014 +0530
@@ -47,7 +47,7 @@
 uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
 uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
 void x265_dequant_normal_sse4(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
-int x265_count_nonzero_ssse3(const int32_t *quantCoeff, int numCoeff);
+int x265_count_nonzero_ssse3(const int16_t *quantCoeff, int numCoeff);
 
 void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
 void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
diff -r c5624effb73c -r 380a796052af source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Mon Sep 01 14:13:37 2014 +0530
+++ b/source/common/x86/pixel-util8.asm	Mon Aug 25 12:49:37 2014 +0530
@@ -1051,10 +1051,10 @@
 
 
 ;-----------------------------------------------------------------------------
-; int count_nonzero(const int32_t *quantCoeff, int numCoeff);
+; int count_nonzero(const int16_t *quantCoeff, int numCoeff);
 ;-----------------------------------------------------------------------------
 INIT_XMM ssse3
-cglobal count_nonzero, 2,2,5
+cglobal count_nonzero, 2,2,4
     pxor        m0, m0
     shr         r1d, 4
     movd        m1, r1d
@@ -1063,12 +1063,8 @@
 .loop:
     mova        m2, [r0 +  0]
     mova        m3, [r0 + 16]
-    packssdw    m2, m3
-    mova        m3, [r0 + 32]
-    mova        m4, [r0 + 48]
-    add         r0, 64
-    packssdw    m3, m4
     packsswb    m2, m3
+    add         r0, 32
     pcmpeqb     m2, m0
     paddb       m1, m2
     dec         r1d
diff -r c5624effb73c -r 380a796052af source/encoder/entropy.cpp
--- a/source/encoder/entropy.cpp	Mon Sep 01 14:13:37 2014 +0530
+++ b/source/encoder/entropy.cpp	Mon Aug 25 12:49:37 2014 +0530
@@ -1598,8 +1598,16 @@
 {
     uint32_t trSize = 1 << log2TrSize;
 
+    int numCoeff = (1 << (log2TrSize << 1));
+    assert(numCoeff <= 1024);
+    ALIGN_VAR_16(int16_t, qCoeff[1024]);
+    for (int i = 0; i < numCoeff; i++)
+    {
+        qCoeff[i] = (int16_t)Clip3(-32768, 32767, coeff[i]);
+    }
+
     // compute number of significant coefficients
-    uint32_t numSig = primitives.count_nonzero(coeff, (1 << (log2TrSize << 1)));
+    uint32_t numSig = primitives.count_nonzero(qCoeff, numCoeff); // numCoeff entries were just narrowed into qCoeff
 
     X265_CHECK(numSig > 0, "cbf check fail\n");
 
diff -r c5624effb73c -r 380a796052af source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp	Mon Sep 01 14:13:37 2014 +0530
+++ b/source/test/mbdstharness.cpp	Mon Aug 25 12:49:37 2014 +0530
@@ -277,7 +277,7 @@
 
 bool MBDstHarness::check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt)
 {
-    ALIGN_VAR_32(int32_t, qcoeff[32 * 32]);
+    ALIGN_VAR_32(int16_t, qcoeff[32 * 32]);
 
     for (int i = 0; i < 4; i++)
     {
@@ -287,7 +287,7 @@
 
         for (int n = 0; n <= num; n++)
         {
-            memset(qcoeff, 0, num * sizeof(int32_t));
+            memset(qcoeff, 0, num * sizeof(int16_t));
 
             for (int j = 0; j < n; j++)
             {
@@ -297,7 +297,7 @@
                     k = (k + 11) & mask;
                 }
 
-                qcoeff[k] = rand() - RAND_MAX / 2;
+                qcoeff[k] = (int16_t)(rand() - RAND_MAX / 2); // cast the whole difference, not just rand()
             }
 
             int refval = ref(qcoeff, num);
@@ -436,7 +436,7 @@
         for (int i = 4; i <= 32; i <<= 1)
         {
             printf("count_nonzero[%dx%d]", i, i);
-            REPORT_SPEEDUP(opt.count_nonzero, ref.count_nonzero, mbufidct, i * i)
+            REPORT_SPEEDUP(opt.count_nonzero, ref.count_nonzero, mbuf1, i * i)
         }
     }
 }