[x265] [PATCH] asm: count_nonzero ssse3 to sse2
dtyx265 at gmail.com
dtyx265 at gmail.com
Wed Jun 10 18:02:12 CEST 2015
# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1433952103 25200
# Node ID c3da462abd1ff1341e43081fd651591e43fc79f2
# Parent 6245476add8f0562e3ccb657f572ff94fe96adf0
asm: count_nonzero ssse3 to sse2
The ssse3 count_nonzero primitives only use up to sse2 instructions.
This patch just converts them to sse2, so that on sse2 they replace the C code.
64-bit
./test/TestBench --testbench transforms | grep nonzero
count_nonzero[4x4] 2.83x 140.00 396.24
count_nonzero[8x8] 2.88x 307.48 885.47
count_nonzero[16x16] 2.91x 914.96 2662.87
count_nonzero[32x32] 2.83x 3314.98 9383.80
32-bit
./test/TestBench --testbench transforms | grep nonzero
count_nonzero[4x4] 1.80x 162.50 292.38
count_nonzero[8x8] 7.10x 305.00 2164.98
count_nonzero[16x16] 8.76x 905.00 7925.22
count_nonzero[32x32] 9.37x 3305.02 30965.26
10bpp
./test/TestBench --testbench transforms | grep nonzero
count_nonzero[4x4] 2.82x 139.99 395.13
count_nonzero[8x8] 2.88x 307.49 885.31
count_nonzero[16x16] 2.91x 914.97 2663.42
count_nonzero[32x32] 2.83x 3314.99 9382.70
diff -r 6245476add8f -r c3da462abd1f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Jun 10 11:54:27 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Jun 10 09:01:43 2015 -0700
@@ -939,6 +939,7 @@
ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
+ ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
}
if (cpuMask & X265_CPU_SSE3)
{
@@ -961,10 +962,6 @@
p.dst4x4 = x265_dst4_ssse3;
p.cu[BLOCK_8x8].idct = x265_idct8_ssse3;
- p.cu[BLOCK_4x4].count_nonzero = x265_count_nonzero_4x4_ssse3;
- p.cu[BLOCK_8x8].count_nonzero = x265_count_nonzero_8x8_ssse3;
- p.cu[BLOCK_16x16].count_nonzero = x265_count_nonzero_16x16_ssse3;
- p.cu[BLOCK_32x32].count_nonzero = x265_count_nonzero_32x32_ssse3;
p.frameInitLowres = x265_frame_init_lowres_core_ssse3;
p.pu[LUMA_4x4].convert_p2s = x265_filterPixelToShort_4x4_ssse3;
@@ -2055,6 +2052,7 @@
ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
+ ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
}
if (cpuMask & X265_CPU_SSE3)
{
@@ -2094,8 +2092,6 @@
p.dst4x4 = x265_dst4_ssse3;
p.cu[BLOCK_8x8].idct = x265_idct8_ssse3;
- ALL_LUMA_TU(count_nonzero, count_nonzero, ssse3);
-
// MUST be done after LUMA_FILTERS() to overwrite default version
p.pu[LUMA_8x8].luma_hvpp = x265_interp_8tap_hv_pp_8x8_ssse3;
diff -r 6245476add8f -r c3da462abd1f source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Wed Jun 10 11:54:27 2015 +0530
+++ b/source/common/x86/pixel-util.h Wed Jun 10 09:01:43 2015 -0700
@@ -51,10 +51,10 @@
void x265_dequant_normal_sse4(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
void x265_dequant_normal_avx2(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
-int x265_count_nonzero_4x4_ssse3(const int16_t* quantCoeff);
-int x265_count_nonzero_8x8_ssse3(const int16_t* quantCoeff);
-int x265_count_nonzero_16x16_ssse3(const int16_t* quantCoeff);
-int x265_count_nonzero_32x32_ssse3(const int16_t* quantCoeff);
+int x265_count_nonzero_4x4_sse2(const int16_t* quantCoeff);
+int x265_count_nonzero_8x8_sse2(const int16_t* quantCoeff);
+int x265_count_nonzero_16x16_sse2(const int16_t* quantCoeff);
+int x265_count_nonzero_32x32_sse2(const int16_t* quantCoeff);
int x265_count_nonzero_4x4_avx2(const int16_t* quantCoeff);
int x265_count_nonzero_8x8_avx2(const int16_t* quantCoeff);
int x265_count_nonzero_16x16_avx2(const int16_t* quantCoeff);
diff -r 6245476add8f -r c3da462abd1f source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Jun 10 11:54:27 2015 +0530
+++ b/source/common/x86/pixel-util8.asm Wed Jun 10 09:01:43 2015 -0700
@@ -948,11 +948,11 @@
jnz .loop
RET
-
+
;-----------------------------------------------------------------------------
-; int x265_count_nonzero_4x4_ssse3(const int16_t *quantCoeff);
+; int x265_count_nonzero_4x4_sse2(const int16_t *quantCoeff);
;-----------------------------------------------------------------------------
-INIT_XMM ssse3
+INIT_XMM sse2
cglobal count_nonzero_4x4, 1,1,2
pxor m0, m0
@@ -988,9 +988,9 @@
;-----------------------------------------------------------------------------
-; int x265_count_nonzero_8x8_ssse3(const int16_t *quantCoeff);
+; int x265_count_nonzero_8x8_sse2(const int16_t *quantCoeff);
;-----------------------------------------------------------------------------
-INIT_XMM ssse3
+INIT_XMM sse2
cglobal count_nonzero_8x8, 1,1,3
pxor m0, m0
movu m1, [pb_4]
@@ -1038,9 +1038,9 @@
;-----------------------------------------------------------------------------
-; int x265_count_nonzero_16x16_ssse3(const int16_t *quantCoeff);
+; int x265_count_nonzero_16x16_sse2(const int16_t *quantCoeff);
;-----------------------------------------------------------------------------
-INIT_XMM ssse3
+INIT_XMM sse2
cglobal count_nonzero_16x16, 1,1,3
pxor m0, m0
movu m1, [pb_16]
@@ -1087,9 +1087,9 @@
;-----------------------------------------------------------------------------
-; int x265_count_nonzero_32x32_ssse3(const int16_t *quantCoeff);
+; int x265_count_nonzero_32x32_sse2(const int16_t *quantCoeff);
;-----------------------------------------------------------------------------
-INIT_XMM ssse3
+INIT_XMM sse2
cglobal count_nonzero_32x32, 1,1,3
pxor m0, m0
movu m1, [pb_64]
More information about the x265-devel
mailing list