[x265] [PATCH 2 of 3] asm: optimize nquant by PSIGND, improve 13k cycles -> 11k cycles
Min Chen
chenm003 at 163.com
Thu Sep 4 01:37:48 CEST 2014
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1409787419 25200
# Node ID 4ca9e972f48cb4530ca7181ad7cec351568a99b3
# Parent 94bd00d1af5d8c5f6f26f97c50a727588a860714
asm: optimize nquant by PSIGND, improve 13k cycles -> 11k cycles
diff -r 94bd00d1af5d -r 4ca9e972f48c source/common/dct.cpp
--- a/source/common/dct.cpp Wed Sep 03 16:36:44 2014 -0700
+++ b/source/common/dct.cpp Wed Sep 03 16:36:59 2014 -0700
@@ -801,6 +801,10 @@
{
uint32_t numSig = 0;
+ X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
+ X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n");
+ X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quantCoeff buffer not aligned\n");
+
for (int blockpos = 0; blockpos < numCoeff; blockpos++)
{
int level = coef[blockpos];
diff -r 94bd00d1af5d -r 4ca9e972f48c source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Sep 03 16:36:44 2014 -0700
+++ b/source/common/x86/pixel-util8.asm Wed Sep 03 16:36:59 2014 -0700
@@ -941,55 +941,47 @@
; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal nquant, 4,5,8
+cglobal nquant, 3,5,8
movd m6, r4m
mov r4d, r5m
pxor m7, m7 ; m7 = numZero
- movd m5, r3d ; m5 = qbits
+ movd m5, r3m ; m5 = qbits
pshufd m6, m6, 0 ; m6 = add
mov r3d, r4d ; r3 = numCoeff
shr r4d, 3
+
.loop:
movu m0, [r0] ; m0 = level
movu m1, [r0 + 16] ; m1 = level
- movu m2, [r1] ; m2 = qcoeff
- movu m3, [r1 + 16] ; m3 = qcoeff
+
+ pabsd m2, m0
+ pmulld m2, [r1] ; m2 = tmpLevel1
+ paddd m2, m6
+ psrad m2, m5 ; m2 = level1
+ psignd m2, m0 ; restore sign
+
+ pabsd m3, m1
+ pmulld m3, [r1 + 16] ; m3 = tmpLevel1
+ paddd m3, m6
+ psrad m3, m5 ; m3 = level1
+ psignd m3, m1 ; restore sign
add r0, 32
add r1, 32
- pxor m4, m4
- pcmpgtd m4, m0 ; m4 = sign
- pabsd m0, m0
- pmulld m0, m2 ; m0 = tmpLevel1
- paddd m0, m6
- psrad m0, m5 ; m0 = level1
- pxor m0, m4
- psubd m0, m4
-
- pxor m4, m4
- pcmpgtd m4, m1 ; m4 = sign
- pabsd m1, m1
- pmulld m1, m3 ; m1 = tmpLevel1
- paddd m1, m6
- psrad m1, m5 ; m1 = level1
- pxor m1, m4
- psubd m1, m4
-
- packssdw m0, m0
- packssdw m1, m1
- pmovsxwd m0, m0
+ packssdw m2, m3
+ pmovsxwd m0, m2
+ movhlps m1, m2
pmovsxwd m1, m1
- movu [r2], m0
+ movu [r2 ], m0
movu [r2 + 16], m1
add r2, 32
+
+ pxor m4, m4
+ pcmpeqw m2, m4
+ psubw m7, m2
+
dec r4d
-
- packssdw m0, m1
- pxor m4, m4
- pcmpeqw m0, m4
- psubw m7, m0
-
jnz .loop
packuswb m7, m7
@@ -997,10 +989,8 @@
mov eax, r3d
movd r4d, m7
sub eax, r4d ; numSig
-
RET
-
;-----------------------------------------------------------------------------
; void dequant_normal(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list