[x265] add primitives.nquant for RDOQ
Satoshi Nakagawa
nakagawa424 at oki.com
Wed Jul 2 09:41:32 CEST 2014
# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1404286661 -32400
# Wed Jul 02 16:37:41 2014 +0900
# Node ID 3f25ca9b5addda057040a5e1a544b9ede9afc509
# Parent a18972fd05b1d6242a881bef979b9e1ff17543d9
add primitives.nquant for RDOQ
diff -r a18972fd05b1 -r 3f25ca9b5add source/Lib/TLibCommon/TComTrQuant.cpp
--- a/source/Lib/TLibCommon/TComTrQuant.cpp Tue Jul 01 14:58:35 2014 -0500
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp Wed Jul 02 16:37:41 2014 +0900
@@ -508,23 +508,30 @@
uint32_t TComTrQuant::xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t trSize,
TextType ttype, uint32_t absPartIdx, int32_t *lastPos)
{
- x265_emms();
- selectLambda(ttype);
-
const uint32_t log2TrSize = g_convertToBit[trSize] + 2;
- uint32_t absSum = 0;
int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform
- uint32_t goRiceParam = 0;
- double blockUncodedCost = 0;
int scalingListType = (cu->isIntra(absPartIdx) ? 0 : 3) + ttype;
X265_CHECK(scalingListType < 6, "scaling list type out of range\n");
int qbits = QUANT_SHIFT + m_qpParam.m_per + transformShift; // Right shift of non-RDOQ quantizer; level = (coeff*Q + offset)>>q_bits
int add = (1 << (qbits - 1));
- double *errScale = getErrScaleCoeff(scalingListType, log2TrSize - 2, m_qpParam.m_rem);
int32_t *qCoef = getQuantCoeff(scalingListType, m_qpParam.m_rem, log2TrSize - 2);
+ int numCoeff = 1 << log2TrSize * 2;
+ int scaledCoeff[32 * 32];
+ uint32_t numSig = primitives.nquant(srcCoeff, qCoef, scaledCoeff, dstCoeff, qbits, add, numCoeff);
+
+ X265_CHECK(numSig == primitives.count_nonzero(dstCoeff, numCoeff), "numSig differ\n");
+ if (numSig == 0)
+ return 0;
+
+ x265_emms();
+ selectLambda(ttype);
+
+ double *errScale = getErrScaleCoeff(scalingListType, log2TrSize - 2, m_qpParam.m_rem);
+
+ double blockUncodedCost = 0;
double costCoeff[32 * 32];
double costSig[32 * 32];
double costCoeff0[32 * 32];
@@ -544,6 +551,7 @@
int c2 = 0;
double baseCost = 0;
int lastScanPos = -1;
+ uint32_t goRiceParam = 0;
uint32_t c1Idx = 0;
uint32_t c2Idx = 0;
int cgLastScanPos = -1;
@@ -567,16 +575,13 @@
//===== quantization =====
uint32_t blkPos = codingParameters.scan[scanPos];
// set coeff
- int Q = qCoef[blkPos];
double scaleFactor = errScale[blkPos];
- int levelDouble = srcCoeff[blkPos];
- levelDouble = (int)std::min<int64_t>((int64_t)abs((int)levelDouble) * Q, MAX_INT - add);
- uint32_t maxAbsLevel = (levelDouble + add) >> qbits;
+ int levelDouble = scaledCoeff[blkPos];
+ uint32_t maxAbsLevel = abs(dstCoeff[blkPos]);
costCoeff0[scanPos] = ((uint64_t)levelDouble * levelDouble) * scaleFactor;
blockUncodedCost += costCoeff0[scanPos];
- dstCoeff[blkPos] = maxAbsLevel;
if (maxAbsLevel > 0 && lastScanPos < 0)
{
@@ -776,7 +781,7 @@
//===== estimate last position =====
if (lastScanPos < 0)
{
- return absSum;
+ return 0;
}
double bestCost = 0;
@@ -840,6 +845,7 @@
} // end if (sigCoeffGroupFlag[ cgBlkPos ])
} // end for
+ uint32_t absSum = 0;
for (int pos = 0; pos < bestLastIdxp1; pos++)
{
int blkPos = codingParameters.scan[pos];
diff -r a18972fd05b1 -r 3f25ca9b5add source/common/dct.cpp
--- a/source/common/dct.cpp Tue Jul 01 14:58:35 2014 -0500
+++ b/source/common/dct.cpp Wed Jul 02 16:37:41 2014 +0900
@@ -780,10 +780,8 @@
for (int blockpos = 0; blockpos < numCoeff; blockpos++)
{
- int level;
- int sign;
- level = coef[blockpos];
- sign = (level < 0 ? -1 : 1);
+ int level = coef[blockpos];
+ int sign = (level < 0 ? -1 : 1);
int tmplevel = abs(level) * quantCoeff[blockpos];
level = ((tmplevel + add) >> qBits);
@@ -798,6 +796,27 @@
return acSum;
}
+uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int32_t* scaledCoeff, int32_t* qCoef, int qBits, int add, int numCoeff) // quantizes numCoeff coefficients: fills scaledCoeff[] and qCoef[], returns how many levels are nonzero
+{
+ uint32_t numSig = 0; // running count of coefficients whose quantized level is nonzero
+
+ for (int blockpos = 0; blockpos < numCoeff; blockpos++)
+ {
+ int level = coef[blockpos];
+ int sign = (level < 0 ? -1 : 1); // keep the sign aside; the magnitude is quantized below
+
+ int tmplevel = abs(level) * quantCoeff[blockpos]; // plain int multiply with no overflow guard - NOTE(review): confirm inputs keep the product within int range
+ scaledCoeff[blockpos] = tmplevel; // unrounded, unshifted scaled magnitude, exported for the caller
+ level = ((tmplevel + add) >> qBits); // add the offset, then shift right by qBits to get the quantized magnitude
+ if (level)
+ ++numSig; // counted on the magnitude, before sign and clipping are applied
+ level *= sign; // restore the original sign
+ qCoef[blockpos] = Clip3(-32768, 32767, level); // clamp the signed level to the 16-bit range
+ }
+
+ return numSig;
+}
+
int count_nonzero_c(const int32_t *quantCoeff, int numCoeff)
{
X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
@@ -822,6 +841,7 @@
p.dequant_scaling = dequant_scaling_c;
p.dequant_normal = dequant_normal_c;
p.quant = quant_c;
+ p.nquant = nquant_c;
p.dct[DST_4x4] = dst4_c;
p.dct[DCT_4x4] = dct4_c;
p.dct[DCT_8x8] = dct8_c;
diff -r a18972fd05b1 -r 3f25ca9b5add source/common/primitives.h
--- a/source/common/primitives.h Tue Jul 01 14:58:35 2014 -0500
+++ b/source/common/primitives.h Wed Jul 02 16:37:41 2014 +0900
@@ -147,6 +147,7 @@
typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
+typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *scaledCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
typedef void (*dequant_scaling_t)(const int32_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
typedef void (*dequant_normal_t)(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
typedef int (*count_nonzero_t)(const int32_t *quantCoeff, int numCoeff);
@@ -242,6 +243,7 @@
dct_t dct[NUM_DCTS];
idct_t idct[NUM_IDCTS];
quant_t quant;
+ nquant_t nquant;
dequant_scaling_t dequant_scaling;
dequant_normal_t dequant_normal;
count_nonzero_t count_nonzero;
diff -r a18972fd05b1 -r 3f25ca9b5add source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jul 01 14:58:35 2014 -0500
+++ b/source/common/x86/asm-primitives.cpp Wed Jul 02 16:37:41 2014 +0900
@@ -1061,6 +1061,7 @@
p.dct[DCT_8x8] = x265_dct8_sse4;
p.quant = x265_quant_sse4;
+ p.nquant = x265_nquant_sse4;
p.dequant_normal = x265_dequant_normal_sse4;
p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
p.intra_pred[BLOCK_4x4][0] = x265_intra_pred_planar4_sse4;
@@ -1257,6 +1258,7 @@
p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;
p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse4;
p.quant = x265_quant_sse4;
+ p.nquant = x265_nquant_sse4;
p.dequant_normal = x265_dequant_normal_sse4;
p.weight_pp = x265_weight_pp_sse4;
p.weight_sp = x265_weight_sp_sse4;
diff -r a18972fd05b1 -r 3f25ca9b5add source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Tue Jul 01 14:58:35 2014 -0500
+++ b/source/common/x86/pixel-util.h Wed Jul 02 16:37:41 2014 +0900
@@ -45,6 +45,7 @@
void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);
uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
+uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *scaledCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
void x265_dequant_normal_sse4(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
int x265_count_nonzero_ssse3(const int32_t *quantCoeff, int numCoeff);
diff -r a18972fd05b1 -r 3f25ca9b5add source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Jul 01 14:58:35 2014 -0500
+++ b/source/common/x86/pixel-util8.asm Wed Jul 02 16:37:41 2014 +0900
@@ -879,7 +879,7 @@
%define qbits8 [rsp + 2 * mmsize]
%endif
- ; fill qbits-8
+ ; fill qbits
movd m0, r4d
mova qbits, m0
@@ -979,6 +979,81 @@
;-----------------------------------------------------------------------------
+; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int32_t *scaledCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal nquant, 5,6,8 ; presumably x86inc's (args in regs, GPRs, XMM regs) counts - TODO confirm against x86inc.asm
+
+ ; fill qbits
+ movd m5, r4d ; m5 = qbits (shift count, low dword only)
+
+ ; fill offset
+ movd m6, r5m ; load 'add' (6th argument in the prototype)
+ pshufd m6, m6, 0 ; m6 = add, broadcast to all four dwords
+
+ mov r4d, r6m ; r4d = numCoeff (7th argument in the prototype)
+ shr r4d, 3 ; loop count = numCoeff / 8, eight coefficients per iteration
+ pxor m7, m7 ; m7 = numZero (eight 16-bit counters, cleared)
+.loop:
+ ; first 4 coeff
+ movu m0, [r0] ; m0 = level
+ pxor m1, m1 ; m1 = 0
+ pcmpgtd m1, m0 ; m1 = sign mask (all ones where level < 0)
+ movu m2, [r1] ; m2 = qcoeff
+ pabsd m0, m0 ; m0 = abs(level)
+ pmulld m0, m2 ; m0 = tmpLevel1 (low 32 bits of the product)
+ movu [r2], m0 ; store tmpLevel1 through r2 (scaledCoeff in the prototype)
+ paddd m2, m0, m6 ; m2 = tmpLevel1 + add
+ psrad m2, m5 ; m2 = level1 (arithmetic shift right by qbits)
+ pxor m4, m4 ; m4 = 0
+ pcmpeqd m4, m2 ; m4 = mask4 (all ones where level1 == 0)
+
+ pxor m2, m1 ; (level1 ^ sign) ...
+ psubd m2, m1 ; ... - sign: negates the lanes whose sign mask is set
+ packssdw m2, m2 ; saturate to signed 16-bit
+ pmovsxwd m2, m2 ; sign-extend back to 32-bit
+ movu [r3], m2 ; store the signed, clipped levels through r3 (qCoef in the prototype)
+ ; second 4 coeff
+ movu m0, [r0 + 16] ; m0 = level
+ pxor m1, m1 ; m1 = 0
+ pcmpgtd m1, m0 ; m1 = sign mask (all ones where level < 0)
+ movu m2, [r1 + 16] ; m2 = qcoeff
+ pabsd m0, m0 ; m0 = abs(level)
+ pmulld m0, m2 ; m0 = tmpLevel1 (low 32 bits of the product)
+ movu [r2 + 16], m0 ; store tmpLevel1 through r2 (scaledCoeff in the prototype)
+ paddd m2, m0, m6 ; m2 = tmpLevel1 + add
+ psrad m2, m5 ; m2 = level1 (arithmetic shift right by qbits)
+ pxor m0, m0 ; m0 = 0
+ pcmpeqd m0, m2 ; m0 = mask4 (all ones where level1 == 0)
+
+ pxor m2, m1 ; (level1 ^ sign) ...
+ psubd m2, m1 ; ... - sign: negates the lanes whose sign mask is set
+ packssdw m2, m2 ; saturate to signed 16-bit
+ pmovsxwd m2, m2 ; sign-extend back to 32-bit
+ movu [r3 + 16], m2 ; store the signed, clipped levels through r3 (qCoef in the prototype)
+
+ packssdw m4, m0 ; m4 = mask8 (both zero masks narrowed to eight words of 0 or -1)
+ psubw m7, m4 ; m7 = numZero (subtracting -1 adds one per zero level)
+
+ add r0, 32 ; advance all four pointers by 8 dwords
+ add r1, 32
+ add r2, 32
+ add r3, 32
+
+ dec r4d
+ jnz .loop ; repeat until the loop count reaches zero
+
+ packuswb m7, m7 ; narrow the word counters to bytes (unsigned saturation at 255)
+ pxor m0, m0 ; m0 = 0
+ psadbw m0, m7 ; sum of absolute differences against zero = sum of the count bytes
+ mov eax, r6m ; eax = numCoeff
+ movd r4d, m0 ; r4d = total number of zero levels
+ sub eax, r4d ; numSig = numCoeff - numZero
+
+ RET
+
+
+;-----------------------------------------------------------------------------
; void dequant_normal(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
;-----------------------------------------------------------------------------
INIT_XMM sse4
diff -r a18972fd05b1 -r 3f25ca9b5add source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp Tue Jul 01 14:58:35 2014 -0500
+++ b/source/test/mbdstharness.cpp Wed Jul 02 16:37:41 2014 +0900
@@ -327,6 +327,50 @@
return true;
}
+bool MBDstHarness::check_nquant_primitive(nquant_t ref, nquant_t opt) // runs ref and opt on the same random inputs; returns false at the first mismatch, true otherwise
+{
+ int j = 0; // offset into the shared input buffers, advanced each iteration
+
+ for (int i = 0; i <= ITERS; i++)
+ {
+ int width = (rand() % 4 + 1) * 4; // one of 4, 8, 12, 16
+
+ if (width == 12)
+ {
+ width = 32; // 12 is remapped, so the tested widths are 4, 8, 16 and 32
+ }
+ int height = width; // blocks are square
+
+ uint32_t optReturnValue = 0;
+ uint32_t refReturnValue = 0;
+
+ int bits = rand() % 32; // random shift amount in [0, 31]
+ int valueToAdd = rand() % (32 * 1024); // random offset in [0, 32767]
+ int cmp_size = sizeof(int) * height * width; // number of bytes compared by the memcmp calls below
+ int numCoeff = height * width;
+
+ int index1 = rand() % TEST_CASES; // random rows of int_test_buff used as coef and quantCoeff
+ int index2 = rand() % TEST_CASES;
+
+ refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf5, mintbuf6, bits, valueToAdd, numCoeff); // reference: scaledCoeff -> mintbuf5, qCoef -> mintbuf6
+ optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mintbuf4, bits, valueToAdd, numCoeff); // optimized: scaledCoeff -> mintbuf3, qCoef -> mintbuf4
+
+ if (memcmp(mintbuf3, mintbuf5, cmp_size)) // scaledCoeff outputs must be byte-identical
+ return false;
+
+ if (memcmp(mintbuf4, mintbuf6, cmp_size)) // qCoef outputs must be byte-identical
+ return false;
+
+ if (optReturnValue != refReturnValue) // returned counts must agree
+ return false;
+
+ reportfail(); // NOTE(review): defined outside this view; presumably reports a failure detected by checked() - confirm
+ j += 16; // shift the input window for the next iteration
+ }
+
+ return true;
+}
+
bool MBDstHarness::check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt)
{
ALIGN_VAR_32(int32_t, qcoeff[32 * 32]);
@@ -409,6 +453,15 @@
}
}
+ if (opt.nquant)
+ {
+ if (!check_nquant_primitive(ref.nquant, opt.nquant))
+ {
+ printf("nquant: Failed!\n");
+ return false;
+ }
+ }
+
if (opt.count_nonzero)
{
if (!check_count_nonzero_primitive(ref.count_nonzero, opt.count_nonzero))
@@ -460,6 +513,12 @@
REPORT_SPEEDUP(opt.quant, ref.quant, mintbuf1, mintbuf2, mintbuf3, mintbuf4, 23, 23785, 32 * 32, &dummy);
}
+ if (opt.nquant)
+ {
+ printf("nquant\t\t");
+ REPORT_SPEEDUP(opt.nquant, ref.nquant, mintbuf1, mintbuf2, mintbuf3, mintbuf4, 23, 23785, 32 * 32);
+ }
+
if (opt.count_nonzero)
{
for (int i = 4; i <= 32; i <<= 1)
diff -r a18972fd05b1 -r 3f25ca9b5add source/test/mbdstharness.h
--- a/source/test/mbdstharness.h Tue Jul 01 14:58:35 2014 -0500
+++ b/source/test/mbdstharness.h Wed Jul 02 16:37:41 2014 +0900
@@ -44,6 +44,7 @@
bool check_dequant_primitive(dequant_scaling_t ref, dequant_scaling_t opt);
bool check_dequant_primitive(dequant_normal_t ref, dequant_normal_t opt);
bool check_quant_primitive(quant_t ref, quant_t opt);
+ bool check_nquant_primitive(nquant_t ref, nquant_t opt);
bool check_dct_primitive(dct_t ref, dct_t opt, intptr_t width);
bool check_idct_primitive(idct_t ref, idct_t opt, intptr_t width);
bool check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt);
More information about the x265-devel
mailing list