[x265] asm: update count_nonzero, add testbench

Satoshi Nakagawa nakagawa424 at oki.com
Fri Feb 21 04:27:22 CET 2014


>>+    pshufd      m1, m1, 0
>>+    packssdw    m1, m1
> packssdw is an expensive instruction; pshuflw+punpcklqdq is better.

revised, thanks.


# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1392953002 -32400
#      Fri Feb 21 12:23:22 2014 +0900
# Node ID e4a80e46bd80e7d516dc881da7f38737c0071ccf
# Parent  894bde574bc1678471e0c23ceb381a806768ea95
asm: update count_nonzero, add testbench

diff -r 894bde574bc1 -r e4a80e46bd80 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Thu Feb 20 17:18:42 2014 -0600
+++ b/source/common/x86/pixel-util8.asm	Fri Feb 21 12:23:22 2014 +0900
@@ -1240,11 +1240,12 @@
 ; int count_nonzero(const int32_t *quantCoeff, int numCoeff);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal count_nonzero, 2,3,4
+cglobal count_nonzero, 2,2,4
     pxor        m0, m0
-    pxor        m1, m1
-    mov         r2d, r1d
     shr         r1d, 3
+    movd        m1, r1d
+    pshuflw     m1, m1, 0
+    punpcklqdq  m1, m1
 
 .loop
     mova        m2, [r0]
@@ -1252,16 +1253,13 @@
     add         r0, 32
     packssdw    m2, m3
     pcmpeqw     m2, m0
-    psrlw       m2, 15
-    packsswb    m2, m2
-    psadbw      m2, m0
-    paddd       m1, m2
+    paddw       m1, m2
     dec         r1d
-    jnz        .loop
-
-    movd        r1d, m1
-    sub         r2d, r1d
-    mov         eax, r2d
+    jnz         .loop
+
+    packuswb    m1, m1
+    psadbw      m1, m0
+    movd        eax, m1
 
     RET
 
diff -r 894bde574bc1 -r e4a80e46bd80 source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp	Thu Feb 20 17:18:42 2014 -0600
+++ b/source/test/mbdstharness.cpp	Fri Feb 21 12:23:22 2014 +0900
@@ -380,6 +380,41 @@
     return true;
 }
 
+bool MBDstHarness::check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt)
+{
+    ALIGN_VAR_32(int32_t, qcoeff[32 * 32]);
+
+    for (int i = 0; i < 4; i++)
+    {
+        int log2TrSize = i + 2;
+        int num = 1 << (log2TrSize * 2);
+        int mask = num - 1;
+
+        for (int n = 0; n <= num; n++)
+        {
+            memset(qcoeff, 0, num * sizeof(int32_t));
+
+            for (int j = 0; j < n; j++)
+            {
+                int k = rand() & mask;
+                while (qcoeff[k])
+                {
+                    k = (k + 11) & mask;
+                }
+                qcoeff[k] = rand() - RAND_MAX / 2;
+            }
+
+            int refval = ref(qcoeff, num);
+            int optval = opt(qcoeff, num);
+
+            if (refval != optval)
+                return false;
+        }
+    }
+
+    return true;
+}
+
 bool MBDstHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     for (int i = 0; i < NUM_DCTS; i++)
@@ -424,6 +459,15 @@
         }
     }
 
+    if (opt.count_nonzero)
+    {
+        if (!check_count_nonzero_primitive(ref.count_nonzero, opt.count_nonzero))
+        {
+            printf("count_nonzero: Failed!\n");
+            return false;
+        }
+    }
+
     return true;
 }
 
@@ -465,4 +509,13 @@
         int dummy = -1;
         REPORT_SPEEDUP(opt.quant, ref.quant, mintbuf1, mintbuf2, mintbuf3, mintbuf4, 23, 23785, 32 * 32, &dummy);
     }
+
+    if (opt.count_nonzero)
+    {
+        for (int i = 4; i <= 32; i <<= 1)
+        {
+            printf("count_nonzero[%dx%d]", i, i);
+            REPORT_SPEEDUP(opt.count_nonzero, ref.count_nonzero, mbufidct, i * i)
+        }
+    }
 }
diff -r 894bde574bc1 -r e4a80e46bd80 source/test/mbdstharness.h
--- a/source/test/mbdstharness.h	Thu Feb 20 17:18:42 2014 -0600
+++ b/source/test/mbdstharness.h	Fri Feb 21 12:23:22 2014 +0900
@@ -43,6 +43,7 @@
     bool check_quant_primitive(quant_t ref, quant_t opt);
     bool check_dct_primitive(dct_t ref, dct_t opt, int width);
     bool check_idct_primitive(idct_t ref, idct_t opt, int width);
+    bool check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt);
 
 public:
 


More information about the x265-devel mailing list