[x265] [PATCH 3 of 5] asm: modify nquant() output to reduce abs operator in rdoQuant()
Min Chen
chenm003 at 163.com
Fri Sep 25 02:15:35 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1443138813 18000
# Node ID 89bb8d801d1a8dbb5461f5b4d5376d5d0b75bd90
# Parent bcd926000cd8fecf2c2c52af49e94d4405b66280
asm: modify nquant() output to reduce abs operator in rdoQuant()
---
source/common/dct.cpp | 5 ++++-
source/common/quant.cpp | 2 +-
source/common/x86/pixel-util8.asm | 7 +++++--
3 files changed, 10 insertions(+), 4 deletions(-)
diff -r bcd926000cd8 -r 89bb8d801d1a source/common/dct.cpp
--- a/source/common/dct.cpp Thu Sep 24 18:53:31 2015 -0500
+++ b/source/common/dct.cpp Thu Sep 24 18:53:33 2015 -0500
@@ -703,7 +703,10 @@
if (level)
++numSig;
level *= sign;
- qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level);
+
+ // TODO: if we limit the range to [-32767, 32767], we can gain more performance by changing the output format.
+ // However, nquant accounts for only a small percentage of rdoQuant, so the old dynamic range is kept for compatibility.
+ qCoef[blockpos] = (int16_t)abs(x265_clip3(-32768, 32767, level));
}
return numSig;
diff -r bcd926000cd8 -r 89bb8d801d1a source/common/quant.cpp
--- a/source/common/quant.cpp Thu Sep 24 18:53:31 2015 -0500
+++ b/source/common/quant.cpp Thu Sep 24 18:53:33 2015 -0500
@@ -841,7 +841,7 @@
{
scanPos = (cgScanPos << MLS_CG_SIZE) + scanPosinCG;
uint32_t blkPos = codeParams.scan[scanPos];
- uint32_t maxAbsLevel = abs(dstCoeff[blkPos]); /* abs(quantized coeff) */
+ uint32_t maxAbsLevel = dstCoeff[blkPos]; /* abs(quantized coeff) */
int signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */
int predictedCoef = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/
diff -r bcd926000cd8 -r 89bb8d801d1a source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Thu Sep 24 18:53:31 2015 -0500
+++ b/source/common/x86/pixel-util8.asm Thu Sep 24 18:53:33 2015 -0500
@@ -792,6 +792,7 @@
pshufd m6, m6, 0 ; m6 = add
mov r3d, r4d ; r3 = numCoeff
shr r4d, 3
+ pxor m4, m4
.loop:
pmovsxwd m0, [r0] ; m0 = level
@@ -810,13 +811,13 @@
psignd m3, m1
packssdw m2, m3
+ pabsw m2, m2
movu [r2], m2
add r0, 16
add r1, 32
add r2, 16
- pxor m4, m4
pcmpeqw m2, m4
psubw m7, m2
@@ -862,9 +863,11 @@
psignd m2, m0
packssdw m1, m2
+ pabsw m1, m1
+
vpermq m2, m1, q3120
-
movu [r2], m2
+
add r0, mmsize
add r1, mmsize * 2
add r2, mmsize
More information about the x265-devel
mailing list