[x265] [PATCH 1 of 3] asm: optimize nquant by PSIGND, improve 11k cycles -> 9.8k cycles
Min Chen
chenm003 at 163.com
Fri Sep 5 03:59:47 CEST 2014
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1409882310 25200
# Node ID 809782d90535d2b44b97361ba9d94e5103c13138
# Parent fb7b890c5265d17033828718c294f1843d7da6a6
asm: optimize nquant by PSIGND, improve 11k cycles -> 9.8k cycles
diff -r fb7b890c5265 -r 809782d90535 source/common/dct.cpp
--- a/source/common/dct.cpp Thu Sep 04 16:42:24 2014 -0700
+++ b/source/common/dct.cpp Thu Sep 04 18:58:30 2014 -0700
@@ -793,6 +793,10 @@
uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
{
+ X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
+ X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n");
+ X265_CHECK(((intptr_t)quantCoeff & 31) == 0, "quantCoeff buffer not aligned\n");
+
uint32_t numSig = 0;
for (int blockpos = 0; blockpos < numCoeff; blockpos++)
diff -r fb7b890c5265 -r 809782d90535 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Thu Sep 04 16:42:24 2014 -0700
+++ b/source/common/x86/pixel-util8.asm Thu Sep 04 18:58:30 2014 -0700
@@ -939,50 +939,43 @@
; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal nquant, 4,5,8
+cglobal nquant, 3,5,8
movd m6, r4m
mov r4d, r5m
pxor m7, m7 ; m7 = numZero
- movd m5, r3d ; m5 = qbits
+ movd m5, r3m ; m5 = qbits
pshufd m6, m6, 0 ; m6 = add
mov r3d, r4d ; r3 = numCoeff
shr r4d, 3
+
.loop:
movu m0, [r0] ; m0 = level
movu m1, [r0 + 16] ; m1 = level
- movu m2, [r1] ; m2 = qcoeff
- movu m3, [r1 + 16] ; m3 = qcoeff
+
+ pabsd m2, m0 ; m2 = |level| (m0 is kept for the sign)
+ pmulld m2, [r1] ; m2 = tmpLevel1 * qcoeff
+ paddd m2, m6
+ psrad m2, m5 ; m2 = level1
+ psignd m2, m0 ; m2 = level1 with the sign of m0 (lanes where m0 == 0 become 0)
+
+ pabsd m3, m1 ; m3 = |level| (m1 is kept for the sign)
+ pmulld m3, [r1 + 16] ; m3 = tmpLevel1 * qcoeff
+ paddd m3, m6
+ psrad m3, m5 ; m3 = level1
+ psignd m3, m1 ; m3 = level1 with the sign of m1 (lanes where m1 == 0 become 0)
+
+ packssdw m2, m3 ; m2 = 8 x int16 levels (signed saturation)
+
+ movu [r2], m2
add r0, 32
add r1, 32
+ add r2, 16
pxor m4, m4
- pcmpgtd m4, m0 ; m4 = sign
- pabsd m0, m0
- pmulld m0, m2 ; m0 = tmpLevel1
- paddd m0, m6
- psrad m0, m5 ; m0 = level1
- pxor m0, m4
- psubd m0, m4
-
- pxor m4, m4
- pcmpgtd m4, m1 ; m4 = sign
- pabsd m1, m1
- pmulld m1, m3 ; m1 = tmpLevel1
- paddd m1, m6
- psrad m1, m5 ; m1 = level1
- pxor m1, m4
- psubd m1, m4
-
- packssdw m0, m1
-
- movu [r2], m0
- add r2, 16
+ pcmpeqw m2, m4 ; m2 = 0xFFFF in every word lane whose level is zero
+ psubw m7, m2 ; m7 += 1 per zero lane (subtracting -1)
+
dec r4d
-
- pxor m4, m4
- pcmpeqw m0, m4
- psubw m7, m0
-
jnz .loop
packuswb m7, m7
@@ -990,7 +983,6 @@
mov eax, r3d
movd r4d, m7
sub eax, r4d ; numSig
-
RET
More information about the x265-devel mailing list