[x265] [PATCH 1 of 2] improve count_nonzero by SSSE3

Min Chen chenm003 at 163.com
Fri Jun 27 02:19:20 CEST 2014


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1403828330 25200
# Node ID 5bb8cc1ce689c5fc353809662a1af557e4a9e087
# Parent  1b669c33ff3a8d8f6c9bd1e18979c009baed2433
improve count_nonzero by SSSE3

diff -r 1b669c33ff3a -r 5bb8cc1ce689 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jun 25 22:46:45 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Jun 26 17:18:50 2014 -0700
@@ -1037,7 +1037,6 @@
         p.dct[DCT_4x4] = x265_dct4_sse2;
         p.idct[IDCT_4x4] = x265_idct4_sse2;
         p.idct[IDST_4x4] = x265_idst4_sse2;
-        p.count_nonzero = x265_count_nonzero_sse2;
 
         LUMA_SS_FILTERS(_sse2);
     }
@@ -1050,6 +1049,7 @@
 
         p.dct[DST_4x4] = x265_dst4_ssse3;
         p.idct[IDCT_8x8] = x265_idct8_ssse3;
+        p.count_nonzero = x265_count_nonzero_ssse3;
     }
     if (cpuMask & X265_CPU_SSE4)
     {
@@ -1173,7 +1173,6 @@
         p.dct[DCT_4x4] = x265_dct4_sse2;
         p.idct[IDCT_4x4] = x265_idct4_sse2;
         p.idct[IDST_4x4] = x265_idst4_sse2;
-        p.count_nonzero = x265_count_nonzero_sse2;
         p.planecopy_sp = x265_downShift_16_sse2;
     }
     if (cpuMask & X265_CPU_SSSE3)
@@ -1208,6 +1207,7 @@
 
         p.dct[DST_4x4] = x265_dst4_ssse3;
         p.idct[IDCT_8x8] = x265_idct8_ssse3;
+        p.count_nonzero = x265_count_nonzero_ssse3;
     }
     if (cpuMask & X265_CPU_SSE4)
     {
diff -r 1b669c33ff3a -r 5bb8cc1ce689 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Wed Jun 25 22:46:45 2014 +0530
+++ b/source/common/x86/pixel-util.h	Thu Jun 26 17:18:50 2014 -0700
@@ -46,7 +46,7 @@
 
 uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
 void x265_dequant_normal_sse4(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
-int x265_count_nonzero_sse2(const int32_t *quantCoeff, int numCoeff);
+int x265_count_nonzero_ssse3(const int32_t *quantCoeff, int numCoeff);
 
 void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
 void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
diff -r 1b669c33ff3a -r 5bb8cc1ce689 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Wed Jun 25 22:46:45 2014 +0530
+++ b/source/common/x86/pixel-util8.asm	Thu Jun 26 17:18:50 2014 -0700
@@ -1070,28 +1070,31 @@
 ;-----------------------------------------------------------------------------
 ; int count_nonzero(const int32_t *quantCoeff, int numCoeff);
 ;-----------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal count_nonzero, 2,2,4
+INIT_XMM ssse3                         ; pshufb below is an SSSE3 instruction
+cglobal count_nonzero, 2,2,5           ; r0 = quantCoeff, r1d = numCoeff; uses m0-m4
     pxor        m0, m0
-    shr         r1d, 3
+    shr         r1d, 4                 ; r1d = numCoeff / 16 = loop passes; no guard for 0 (assumes a nonzero multiple of 16)
     movd        m1, r1d
-    pshuflw     m1, m1, 0
-    punpcklqdq  m1, m1
+    pshufb      m1, m0                 ; all-zero mask: broadcast the pass count to all 16 byte lanes (count must fit in 8 bits)
 
 .loop:
-    mova        m2, [r0]
+    mova        m2, [r0 +  0]          ; coefficients 0-3 (aligned load)
     mova        m3, [r0 + 16]
-    add         r0, 32
     packssdw    m2, m3
-    pcmpeqw     m2, m0
-    paddw       m1, m2
+    mova        m3, [r0 + 32]          ; coefficients 8-11
+    mova        m4, [r0 + 48]          ; coefficients 12-15
+    add         r0, 64                 ; advance past the 16 int32 coefficients just loaded
+    packssdw    m3, m4                 ; coefficients 8-15 as saturated int16
+    packsswb    m2, m3                 ; all 16 as saturated int8; saturation keeps nonzero values nonzero
+    pcmpeqb     m2, m0                 ; byte = 0xFF (-1) where the coefficient is zero
+    paddb       m1, m2                 ; lane counter -= 1 per zero; ends as the lane's nonzero count
     dec         r1d
     jnz         .loop
 
-    packuswb    m1, m1
     psadbw      m1, m0
-    movd        eax, m1
-
+    pshufd      m0, m1, 2              ; psadbw leaves one sum per 64-bit half; move the upper sum to dword 0
+    paddd       m0, m1                 ; dword 0 = upper sum + lower sum = total nonzero count
+    movd        eax, m0                ; return the total
     RET
 
 



More information about the x265-devel mailing list