[x265] [PATCH] asm: count_nonzero ssse3 to sse2

dtyx265 at gmail.com dtyx265 at gmail.com
Wed Jun 10 18:02:12 CEST 2015


# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1433952103 25200
# Node ID c3da462abd1ff1341e43081fd651591e43fc79f2
# Parent  6245476add8f0562e3ccb657f572ff94fe96adf0
asm: count_nonzero ssse3 to sse2

The ssse3 count_nonzero primitives only use instructions up to sse2.
This patch just converts them to sse2, so on sse2 they replace the C code.

64-bit

./test/TestBench --testbench transforms | grep nonzero
count_nonzero[4x4]	2.83x 	 140.00   	 396.24
count_nonzero[8x8]	2.88x 	 307.48   	 885.47
count_nonzero[16x16]	2.91x 	 914.96   	 2662.87
count_nonzero[32x32]	2.83x 	 3314.98  	 9383.80

32-bit

./test/TestBench --testbench transforms | grep nonzero
count_nonzero[4x4]	1.80x 	 162.50   	 292.38
count_nonzero[8x8]	7.10x 	 305.00   	 2164.98
count_nonzero[16x16]	8.76x 	 905.00   	 7925.22
count_nonzero[32x32]	9.37x 	 3305.02  	 30965.26

10bpp

./test/TestBench --testbench transforms | grep nonzero
count_nonzero[4x4]	2.82x 	 139.99   	 395.13
count_nonzero[8x8]	2.88x 	 307.49   	 885.31
count_nonzero[16x16]	2.91x 	 914.97   	 2663.42
count_nonzero[32x32]	2.83x 	 3314.99  	 9382.70

diff -r 6245476add8f -r c3da462abd1f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jun 10 11:54:27 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Jun 10 09:01:43 2015 -0700
@@ -939,6 +939,7 @@
         ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
         ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
         ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
+        ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
     }
     if (cpuMask & X265_CPU_SSE3)
     {
@@ -961,10 +962,6 @@
 
         p.dst4x4 = x265_dst4_ssse3;
         p.cu[BLOCK_8x8].idct = x265_idct8_ssse3;
-        p.cu[BLOCK_4x4].count_nonzero = x265_count_nonzero_4x4_ssse3;
-        p.cu[BLOCK_8x8].count_nonzero = x265_count_nonzero_8x8_ssse3;
-        p.cu[BLOCK_16x16].count_nonzero = x265_count_nonzero_16x16_ssse3;
-        p.cu[BLOCK_32x32].count_nonzero = x265_count_nonzero_32x32_ssse3;
         p.frameInitLowres = x265_frame_init_lowres_core_ssse3;
 
         p.pu[LUMA_4x4].convert_p2s = x265_filterPixelToShort_4x4_ssse3;
@@ -2055,6 +2052,7 @@
         ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
         ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
         ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
+        ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
     }
     if (cpuMask & X265_CPU_SSE3)
     {
@@ -2094,8 +2092,6 @@
         p.dst4x4 = x265_dst4_ssse3;
         p.cu[BLOCK_8x8].idct = x265_idct8_ssse3;
 
-        ALL_LUMA_TU(count_nonzero, count_nonzero, ssse3);
-
         // MUST be done after LUMA_FILTERS() to overwrite default version
         p.pu[LUMA_8x8].luma_hvpp = x265_interp_8tap_hv_pp_8x8_ssse3;
 
diff -r 6245476add8f -r c3da462abd1f source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Wed Jun 10 11:54:27 2015 +0530
+++ b/source/common/x86/pixel-util.h	Wed Jun 10 09:01:43 2015 -0700
@@ -51,10 +51,10 @@
 void x265_dequant_normal_sse4(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
 void x265_dequant_normal_avx2(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
 
-int x265_count_nonzero_4x4_ssse3(const int16_t* quantCoeff);
-int x265_count_nonzero_8x8_ssse3(const int16_t* quantCoeff);
-int x265_count_nonzero_16x16_ssse3(const int16_t* quantCoeff);
-int x265_count_nonzero_32x32_ssse3(const int16_t* quantCoeff);
+int x265_count_nonzero_4x4_sse2(const int16_t* quantCoeff);
+int x265_count_nonzero_8x8_sse2(const int16_t* quantCoeff);
+int x265_count_nonzero_16x16_sse2(const int16_t* quantCoeff);
+int x265_count_nonzero_32x32_sse2(const int16_t* quantCoeff);
 int x265_count_nonzero_4x4_avx2(const int16_t* quantCoeff);
 int x265_count_nonzero_8x8_avx2(const int16_t* quantCoeff);
 int x265_count_nonzero_16x16_avx2(const int16_t* quantCoeff);
diff -r 6245476add8f -r c3da462abd1f source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Wed Jun 10 11:54:27 2015 +0530
+++ b/source/common/x86/pixel-util8.asm	Wed Jun 10 09:01:43 2015 -0700
@@ -948,11 +948,11 @@
     jnz            .loop
     RET
 
-
+
 ;-----------------------------------------------------------------------------
-; int x265_count_nonzero_4x4_ssse3(const int16_t *quantCoeff);
+; int x265_count_nonzero_4x4_sse2(const int16_t *quantCoeff);
 ;-----------------------------------------------------------------------------
-INIT_XMM ssse3
+INIT_XMM sse2
 cglobal count_nonzero_4x4, 1,1,2
     pxor            m0, m0
 
@@ -988,9 +988,9 @@
 
 
 ;-----------------------------------------------------------------------------
-; int x265_count_nonzero_8x8_ssse3(const int16_t *quantCoeff);
+; int x265_count_nonzero_8x8_sse2(const int16_t *quantCoeff);
 ;-----------------------------------------------------------------------------
-INIT_XMM ssse3
+INIT_XMM sse2
 cglobal count_nonzero_8x8, 1,1,3
     pxor            m0, m0
     movu            m1, [pb_4]
@@ -1038,9 +1038,9 @@
 
 
 ;-----------------------------------------------------------------------------
-; int x265_count_nonzero_16x16_ssse3(const int16_t *quantCoeff);
+; int x265_count_nonzero_16x16_sse2(const int16_t *quantCoeff);
 ;-----------------------------------------------------------------------------
-INIT_XMM ssse3
+INIT_XMM sse2
 cglobal count_nonzero_16x16, 1,1,3
     pxor            m0, m0
     movu            m1, [pb_16]
@@ -1087,9 +1087,9 @@
 
 
 ;-----------------------------------------------------------------------------
-; int x265_count_nonzero_32x32_ssse3(const int16_t *quantCoeff);
+; int x265_count_nonzero_32x32_sse2(const int16_t *quantCoeff);
 ;-----------------------------------------------------------------------------
-INIT_XMM ssse3
+INIT_XMM sse2
 cglobal count_nonzero_32x32, 1,1,3
     pxor            m0, m0
     movu            m1, [pb_64]


More information about the x265-devel mailing list