[x265] [PATCH 1 of 2] improve count_nonzero by SSSE3
Min Chen
chenm003 at 163.com
Fri Jun 27 02:19:20 CEST 2014
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1403828330 25200
# Node ID 5bb8cc1ce689c5fc353809662a1af557e4a9e087
# Parent 1b669c33ff3a8d8f6c9bd1e18979c009baed2433
improve count_nonzero by SSSE3
diff -r 1b669c33ff3a -r 5bb8cc1ce689 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Jun 25 22:46:45 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Jun 26 17:18:50 2014 -0700
@@ -1037,7 +1037,6 @@
p.dct[DCT_4x4] = x265_dct4_sse2;
p.idct[IDCT_4x4] = x265_idct4_sse2;
p.idct[IDST_4x4] = x265_idst4_sse2;
- p.count_nonzero = x265_count_nonzero_sse2;
LUMA_SS_FILTERS(_sse2);
}
@@ -1050,6 +1049,7 @@
p.dct[DST_4x4] = x265_dst4_ssse3;
p.idct[IDCT_8x8] = x265_idct8_ssse3;
+ p.count_nonzero = x265_count_nonzero_ssse3;
}
if (cpuMask & X265_CPU_SSE4)
{
@@ -1173,7 +1173,6 @@
p.dct[DCT_4x4] = x265_dct4_sse2;
p.idct[IDCT_4x4] = x265_idct4_sse2;
p.idct[IDST_4x4] = x265_idst4_sse2;
- p.count_nonzero = x265_count_nonzero_sse2;
p.planecopy_sp = x265_downShift_16_sse2;
}
if (cpuMask & X265_CPU_SSSE3)
@@ -1208,6 +1207,7 @@
p.dct[DST_4x4] = x265_dst4_ssse3;
p.idct[IDCT_8x8] = x265_idct8_ssse3;
+ p.count_nonzero = x265_count_nonzero_ssse3;
}
if (cpuMask & X265_CPU_SSE4)
{
diff -r 1b669c33ff3a -r 5bb8cc1ce689 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Wed Jun 25 22:46:45 2014 +0530
+++ b/source/common/x86/pixel-util.h Thu Jun 26 17:18:50 2014 -0700
@@ -46,7 +46,7 @@
uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
void x265_dequant_normal_sse4(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
-int x265_count_nonzero_sse2(const int32_t *quantCoeff, int numCoeff);
+int x265_count_nonzero_ssse3(const int32_t *quantCoeff, int numCoeff);
void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
diff -r 1b669c33ff3a -r 5bb8cc1ce689 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Jun 25 22:46:45 2014 +0530
+++ b/source/common/x86/pixel-util8.asm Thu Jun 26 17:18:50 2014 -0700
@@ -1070,28 +1070,31 @@
;-----------------------------------------------------------------------------
; int count_nonzero(const int32_t *quantCoeff, int numCoeff);
;-----------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal count_nonzero, 2,2,4
+INIT_XMM ssse3 ; pshufb below is an SSSE3 instruction
+cglobal count_nonzero, 2,2,5 ; r0 = quantCoeff, r1d = numCoeff; m0-m4 are used below
pxor m0, m0
- shr r1d, 3
+ shr r1d, 4 ; loop count = numCoeff / 16 (16 int32 coefficients per iteration); loop is do-while, so numCoeff must be >= 16
movd m1, r1d
- pshuflw m1, m1, 0
- punpcklqdq m1, m1
+ pshufb m1, m0 ; all-zero shuffle mask: broadcast the low byte of the loop count to all 16 byte lanes (count must fit in a byte)
.loop:
- mova m2, [r0]
+ mova m2, [r0 + 0] ; coefficients 0-3
mova m3, [r0 + 16]
- add r0, 32
packssdw m2, m3
- pcmpeqw m2, m0
- paddw m1, m2
+ mova m3, [r0 + 32] ; coefficients 8-11
+ mova m4, [r0 + 48] ; coefficients 12-15
+ add r0, 64 ; advance past the 16 int32 values just loaded
+ packssdw m3, m4 ; signed saturation: a nonzero int32 stays nonzero as int16
+ packsswb m2, m3 ; 16 coefficients as int8, one per byte lane; saturation again keeps nonzero values nonzero
+ pcmpeqb m2, m0 ; lane = 0xFF (-1) where the coefficient is zero, 0 otherwise
+ paddb m1, m2 ; each lane starts at the loop count and loses 1 per zero seen, so it ends as that lane's nonzero count
dec r1d
jnz .loop
- packuswb m1, m1
psadbw m1, m0
-
- movd eax, m1
+ pshufd m0, m1, 2 ; bring dword 2 (byte sum of the high 8 lanes) down to dword 0
+ paddd m0, m1 ; dword 0 = high-half sum + low-half sum = total nonzero count
+ movd eax, m0 ; return the total
RET
More information about the x265-devel mailing list