[x265] [PATCH 268 of 307] x86: nonPsyRdoQuant primitive
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:26 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar <vignesh at multicorewareinc.com>
# Date 1513752346 -19800
# Wed Dec 20 12:15:46 2017 +0530
# Node ID 9a2c5411769847c4283594b99c1b07a99e92ea4a
# Parent a2224f4d257cf5f5cd391f455aae3117b7fe65ab
x86: nonPsyRdoQuant primitive
This patch also adds AVX512 assembly code for this primitive.
C code : 586.00c
AVX512 : 182.62c
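(These figures read as per-call cycle counts from the x265 test bench; taken at face value they put the AVX512 routine at roughly 586.00 / 182.62 ≈ 3.2x the speed of the C reference.)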
diff -r a2224f4d257c -r 9a2c54117698 source/common/dct.cpp
--- a/source/common/dct.cpp Fri Dec 22 13:52:16 2017 +0530
+++ b/source/common/dct.cpp Wed Dec 20 12:15:46 2017 +0530
@@ -980,10 +980,27 @@
sum += sbacGetEntropyBits(mstate, firstC2Flag);
}
}
-
return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
}
+template<int log2TrSize>
+static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
+{
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+ const int scaleBits = SCALE_BITS - 2 * transformShift;
+ const uint32_t trSize = 1 << log2TrSize;
+ for (int y = 0; y < MLS_CG_SIZE; y++)
+ {
+ for (int x = 0; x < MLS_CG_SIZE; x++)
+ {
+ int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
+ costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
+ *totalUncodedCost += costUncoded[blkPos + x];
+ *totalRdCost += costUncoded[blkPos + x];
+ }
+ blkPos += trSize;
+ }
+}
namespace X265_NS {
// x265 private namespace
@@ -993,6 +1010,10 @@
p.dequant_normal = dequant_normal_c;
p.quant = quant_c;
p.nquant = nquant_c;
+ p.cu[BLOCK_4x4].nonPsyRdoQuant = nonPsyRdoQuant_c<2>;
+ p.cu[BLOCK_8x8].nonPsyRdoQuant = nonPsyRdoQuant_c<3>;
+ p.cu[BLOCK_16x16].nonPsyRdoQuant = nonPsyRdoQuant_c<4>;
+ p.cu[BLOCK_32x32].nonPsyRdoQuant = nonPsyRdoQuant_c<5>;
p.dst4x4 = dst4_c;
p.cu[BLOCK_4x4].dct = dct4_c;
p.cu[BLOCK_8x8].dct = dct8_c;
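As background for the constants above, here is a minimal standalone sketch (illustrative only, not part of the patch) of how the (transformShift, scaleBits) pairs fall out per TU size, assuming the usual x265 values MAX_TR_DYNAMIC_RANGE = 15 and SCALE_BITS = 15. Its output for depths 8/10/12 matches the tab_nonpsyRdo8/10/12 tables added to dct8.asm below.

#include <cstdio>

int main()
{
    const int MAX_TR_DYNAMIC_RANGE = 15; // assumed x265 constant
    const int SCALE_BITS = 15;           // assumed x265 constant
    const int depths[] = { 8, 10, 12 };
    for (int depth : depths)
        for (int log2TrSize = 2; log2TrSize <= 5; log2TrSize++)
        {
            // same formulas as nonPsyRdoQuant_c above
            int transformShift = MAX_TR_DYNAMIC_RANGE - depth - log2TrSize;
            int scaleBits = SCALE_BITS - 2 * transformShift;
            printf("depth %2d, %2dx%-2d: transformShift = %2d, scaleBits = %2d\n",
                   depth, 1 << log2TrSize, 1 << log2TrSize,
                   transformShift, scaleBits);
        }
    return 0;
}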
diff -r a2224f4d257c -r 9a2c54117698 source/common/primitives.h
--- a/source/common/primitives.h Fri Dec 22 13:52:16 2017 +0530
+++ b/source/common/primitives.h Wed Dec 20 12:15:46 2017 +0530
@@ -223,7 +223,7 @@
typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);
typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
-
+typedef void (*nonPsyRdoQuant_t)(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
/* Function pointers to optimized encoder primitives. Each pointer can reference
* either an assembly routine, a SIMD intrinsic primitive, or a C function */
struct EncoderPrimitives
@@ -299,9 +299,9 @@
intra_allangs_t intra_pred_allangs;
intra_filter_t intra_filter;
intra_pred_t intra_pred[NUM_INTRA_MODE];
+ nonPsyRdoQuant_t nonPsyRdoQuant;
}
cu[NUM_CU_SIZES];
-
/* These remaining primitives work on either fixed block sizes or take
* block dimensions as arguments and thus do not belong in either the PU or
* the CU arrays */
diff -r a2224f4d257c -r 9a2c54117698 source/common/quant.cpp
--- a/source/common/quant.cpp Fri Dec 22 13:52:16 2017 +0530
+++ b/source/common/quant.cpp Wed Dec 20 12:15:46 2017 +0530
@@ -734,12 +734,9 @@
{
int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
-
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
-
+ costUncoded[blkPos + x] = static_cast<int64_t>((double)(((int64_t)signCoef * signCoef) << scaleBits));
/* when no residual coefficient is coded, predicted coef == recon coef */
costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
-
totalUncodedCost += costUncoded[blkPos + x];
totalRdCost += costUncoded[blkPos + x];
}
@@ -753,25 +750,11 @@
for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
{
X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
-
uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
uint32_t blkPos = codeParams.scan[scanPosBase];
-
- for (int y = 0; y < MLS_CG_SIZE; y++)
- {
- for (int x = 0; x < MLS_CG_SIZE; x++)
- {
- int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
-
- totalUncodedCost += costUncoded[blkPos + x];
- totalRdCost += costUncoded[blkPos + x];
- }
- blkPos += trSize;
- }
+ primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
}
}
-
static const uint8_t table_cnt[5][SCAN_SET_SIZE] =
{
// patternSigCtx = 0
@@ -841,12 +824,9 @@
{
int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
-
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
-
+ costUncoded[blkPos + x] = static_cast<int64_t>((double)(((int64_t)signCoef * signCoef) << scaleBits));
/* when no residual coefficient is coded, predicted coef == recon coef */
costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
-
totalUncodedCost += costUncoded[blkPos + x];
totalRdCost += costUncoded[blkPos + x];
@@ -865,16 +845,12 @@
else
{
// non-psy path
+ primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
+ blkPos = codeParams.scan[scanPosBase];
for (int y = 0; y < MLS_CG_SIZE; y++)
{
for (int x = 0; x < MLS_CG_SIZE; x++)
{
- int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
-
- totalUncodedCost += costUncoded[blkPos + x];
- totalRdCost += costUncoded[blkPos + x];
-
const uint32_t scanPosOffset = y * MLS_CG_SIZE + x;
const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
X265_CHECK(trSize > 4, "trSize check failure\n");
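Two details in the quant.cpp hunks are worth noting. First, costUncoded now round-trips through double, evidently so that the C path rounds the same way as the AVX512 kernel, which squares the coefficients in double precision. Second, in the context-bits path blkPos is reloaded from codeParams.scan[scanPosBase] after the primitive call: the primitive walks all four rows of the coefficient group (advancing blkPos by trSize per row), while the significance-context loop that follows still needs the group's base position.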
diff -r a2224f4d257c -r 9a2c54117698 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Dec 22 13:52:16 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Dec 20 12:15:46 2017 +0530
@@ -3083,6 +3083,12 @@
p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx512);
p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx512);
p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx512);
+
+ p.cu[BLOCK_4x4].nonPsyRdoQuant = PFX(nonPsyRdoQuant4_avx512);
+ p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx512);
+ p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx512);
+ p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx512);
+
}
#endif
}
@@ -5265,7 +5271,10 @@
p.pu[LUMA_16x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x32>;
p.pu[LUMA_16x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x64>;
p.pu[LUMA_48x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_48x64>;
-
+ p.cu[BLOCK_4x4].nonPsyRdoQuant = PFX(nonPsyRdoQuant4_avx512);
+ p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx512);
+ p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx512);
+ p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx512);
}
#endif
}
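The pointers are assigned twice because asm-primitives.cpp has separate AVX512 setup blocks; the two hunks appear to correspond to the high-bit-depth and the main (8-bit) builds, so both pick up the new kernels.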
diff -r a2224f4d257c -r 9a2c54117698 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Fri Dec 22 13:52:16 2017 +0530
+++ b/source/common/x86/dct8.asm Wed Dec 20 12:15:46 2017 +0530
@@ -510,8 +510,22 @@
tab_idct8_2: times 1 dw 89, 75, 50, 18, 75, -18, -89, -50
times 1 dw 50, -89, 18, 75, 18, -50, 75, -89
-
pb_idct8odd: db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15
+;Transform shift and scale bits table for rdoQuant
+tab_nonpsyRdo8: dq 5, 5
+ dq 4, 7
+ dq 3, 9
+ dq 2, 11
+
+tab_nonpsyRdo10: dq 3, 9
+ dq 2, 11
+ dq 1, 13
+ dq 0, 15
+
+tab_nonpsyRdo12: dq 1, 13
+ dq 0, 15
+ dq -1, 17
+ dq -2, 19
SECTION .text
cextern pd_1
@@ -6399,4 +6413,319 @@
movhps [r1 + 2 * r2], xm0
movhps [r1 + r3], xm1
RET
+
+;static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
+;{
+; const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+; const int scaleBits = SCALE_BITS - 2 * transformShift;
+; const uint32_t trSize = 1 << log2TrSize;
+
+; for (int y = 0; y < MLS_CG_SIZE; y++)
+; {
+; for (int x = 0; x < MLS_CG_SIZE; x++)
+; {
+; int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
+; costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
+; *totalUncodedCost += costUncoded[blkPos + x];
+; *totalRdCost += costUncoded[blkPos + x];
+; }
+; blkPos += trSize;
+; }
+;}
+
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+; void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal nonPsyRdoQuant4, 5, 8, 8
+
+ mov r4d, r4m
+ lea r0, [r0 + 2 * r4]
+ lea r7, [4 * r4]
+ lea r1, [r1 + 2 * r7]
+
+%if BIT_DEPTH == 12
+ mov r5q, [tab_nonpsyRdo12] ; transformShift
+ mov r6q, [tab_nonpsyRdo12 + 8] ; scaleBits
+%elif BIT_DEPTH == 10
+ mov r5q, [tab_nonpsyRdo10]
+ mov r6q, [tab_nonpsyRdo10 + 8]
+%elif BIT_DEPTH == 8
+ mov r5q, [tab_nonpsyRdo8]
+ mov r6q, [tab_nonpsyRdo8 + 8]
+%else
+ %error Unsupported BIT_DEPTH!
+ %endif
+
+ movq xm3, r6
+ movq xm6, [r2]
+ movq xm7, [r3]
+ vpxor m4, m4
+ vpxor m5, m5
+
+;Row 1, 2
+ movq xm0, [r0]
+ pinsrq xm0, [r0 + 8], 1
+ vpmovsxwq m1, xm0
+ vcvtqq2pd m2, m1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
+
+ vfmadd132pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
+ vfmadd213pd m2, m2, m5
+ vfmadd231pd m2, m2, m5
+
+ vcvtpd2qq m1, m2
+ vpsllq m1, xm3 ; costUncoded
+ paddq m4, m1
+ movu [r1], ym1
+ vextracti32x8 [r1 + 32], m1, 1
+
+ ;Row 3, 4
+ movq xm0, [r0 + 16]
+ pinsrq xm0, [r0 + 24], 1
+ vpmovsxwq m1, xm0
+ vcvtqq2pd m2, m1
+
+ vfmadd132pd m2, m2, m5
+ vfmadd213pd m2, m2, m5
+ vfmadd231pd m2, m2, m5
+
+ vcvtpd2qq m1, m2
+ vpsllq m1, xm3 ; costUncoded
+ paddq m4, m1
+ movu [r1 + 64], ym1
+ vextracti32x8 [r1 + 96], m1, 1
+
+ vextracti32x8 ym2, m4, 1
+ paddq ym4, ym2
+ vextracti32x4 xm2, m4, 1
+ paddq xm4, xm2
+ punpckhqdq xm2, xm4, xm5
+ paddq xm4, xm2
+
+ paddq xm6, xm4
+ paddq xm7, xm4
+
+ movq [r2], xm6
+ movq [r3], xm7
+ RET
+
+INIT_ZMM avx512
+cglobal nonPsyRdoQuant8, 5, 8, 8
+
+ mov r4d, r4m
+ lea r0, [r0 + 2 * r4]
+ lea r7, [4 * r4]
+ lea r1, [r1 + 2 * r7]
+
+%if BIT_DEPTH == 12
+ mov r5q, [tab_nonpsyRdo12 + 16] ; transformShift
+ mov r6q, [tab_nonpsyRdo12 + 24] ; scaleBits
+%elif BIT_DEPTH == 10
+ mov r5q, [tab_nonpsyRdo10 + 16]
+ mov r6q, [tab_nonpsyRdo10 + 24]
+%elif BIT_DEPTH == 8
+ mov r5q, [tab_nonpsyRdo8 + 16]
+ mov r6q, [tab_nonpsyRdo8 + 24]
+%else
+ %error Unsupported BIT_DEPTH!
+ %endif
+
+ movq xm3, r6
+ movq xm6, [r2]
+ movq xm7, [r3]
+ vpxor m4, m4
+ vpxor m5, m5
+
+;Row 1, 2
+ movq xm0, [r0]
+ pinsrq xm0, [r0 + mmsize/4], 1
+ vpmovsxwq m1, xm0
+ vcvtqq2pd m2, m1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
+
+ vfmadd132pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
+ vfmadd213pd m2, m2, m5
+ vfmadd231pd m2, m2, m5
+
+ vcvtpd2qq m1, m2
+ vpsllq m1, xm3 ; costUncoded
+ paddq m4, m1
+ movu [r1], ym1
+ vextracti32x8 [r1 + mmsize], m1, 1
+
+ ;Row 3, 4
+ movq xm0, [r0 + mmsize/2]
+ pinsrq xm0, [r0 + 3 * mmsize/4], 1
+ vpmovsxwq m1, xm0
+ vcvtqq2pd m2, m1
+
+ vfmadd132pd m2, m2, m5
+ vfmadd213pd m2, m2, m5
+ vfmadd231pd m2, m2, m5
+
+ vcvtpd2qq m1, m2
+ vpsllq m1, xm3 ; costUncoded
+ paddq m4, m1
+ movu [r1 + 2 * mmsize], ym1
+ vextracti32x8 [r1 + 3 * mmsize], m1, 1
+
+ vextracti32x8 ym2, m4, 1
+ paddq ym4, ym2
+ vextracti32x4 xm2, m4, 1
+ paddq xm4, xm2
+ punpckhqdq xm2, xm4, xm5
+ paddq xm4, xm2
+
+ paddq xm6, xm4
+ paddq xm7, xm4
+
+ movq [r2], xm6
+ movq [r3], xm7
+ RET
+
+INIT_ZMM avx512
+cglobal nonPsyRdoQuant16, 5, 8, 8
+
+ mov r4d, r4m
+ lea r0, [r0 + 2 * r4]
+ lea r7, [4 * r4]
+ lea r1, [r1 + 2 * r7]
+
+%if BIT_DEPTH == 12
+ mov r5q, [tab_nonpsyRdo12 + 32] ; transformShift
+ mov r6q, [tab_nonpsyRdo12 + 40] ; scaleBits
+%elif BIT_DEPTH == 10
+ mov r5q, [tab_nonpsyRdo10 + 32]
+ mov r6q, [tab_nonpsyRdo10 + 40]
+%elif BIT_DEPTH == 8
+ mov r5q, [tab_nonpsyRdo8 + 32]
+ mov r6q, [tab_nonpsyRdo8 + 40]
+%else
+ %error Unsupported BIT_DEPTH!
+ %endif
+
+ movq xm3, r6
+ movq xm6, [r2]
+ movq xm7, [r3]
+ vpxor m4, m4
+ vpxor m5, m5
+
+;Row 1, 2
+ movq xm0, [r0]
+ pinsrq xm0, [r0 + mmsize/2], 1
+ vpmovsxwq m1, xm0
+ vcvtqq2pd m2, m1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
+
+ vfmadd132pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
+ vfmadd213pd m2, m2, m5
+ vfmadd231pd m2, m2, m5
+
+ vcvtpd2qq m1, m2
+ vpsllq m1, xm3 ; costUncoded
+ paddq m4, m1
+ movu [r1], ym1
+ vextracti32x8 [r1 + 2 * mmsize], m1, 1
+
+ ;Row 3, 4
+ movq xm0, [r0 + mmsize]
+ pinsrq xm0, [r0 + 3 * mmsize/2], 1
+ vpmovsxwq m1, xm0
+ vcvtqq2pd m2, m1
+
+ vfmadd132pd m2, m2, m5
+ vfmadd213pd m2, m2, m5
+ vfmadd231pd m2, m2, m5
+
+ vcvtpd2qq m1, m2
+ vpsllq m1, xm3 ; costUncoded
+ paddq m4, m1
+ movu [r1 + 4 * mmsize], ym1
+ vextracti32x8 [r1 + 6 * mmsize], m1, 1
+
+ vextracti32x8 ym2, m4, 1
+ paddq ym4, ym2
+ vextracti32x4 xm2, m4, 1
+ paddq xm4, xm2
+ punpckhqdq xm2, xm4, xm5
+ paddq xm4, xm2
+
+ paddq xm6, xm4
+ paddq xm7, xm4
+
+ movq [r2], xm6
+ movq [r3], xm7
+ RET
+
+INIT_ZMM avx512
+cglobal nonPsyRdoQuant32, 5, 8, 8
+
+ mov r4d, r4m
+ lea r0, [r0 + 2 * r4]
+ lea r7, [4 * r4]
+ lea r1, [r1 + 2 * r7]
+
+%if BIT_DEPTH == 12
+ mov r5q, [tab_nonpsyRdo12 + 48] ; transformShift
+ mov r6q, [tab_nonpsyRdo12 + 56] ; scaleBits
+%elif BIT_DEPTH == 10
+ mov r5q, [tab_nonpsyRdo10 + 48]
+ mov r6q, [tab_nonpsyRdo10 + 56]
+%elif BIT_DEPTH == 8
+ mov r5q, [tab_nonpsyRdo8 + 48]
+ mov r6q, [tab_nonpsyRdo8 + 56]
+%else
+ %error Unsupported BIT_DEPTH!
+ %endif
+
+ movq xm3, r6
+ movq xm6, [r2]
+ movq xm7, [r3]
+ vpxor m4, m4
+ vpxor m5, m5
+
+;Row 1, 2
+ movq xm0, [r0]
+ pinsrq xm0, [r0 + mmsize], 1
+ vpmovsxwq m1, xm0
+ vcvtqq2pd m2, m1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
+
+ vfmadd132pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
+ vfmadd213pd m2, m2, m5
+ vfmadd231pd m2, m2, m5
+
+ vcvtpd2qq m1, m2
+ vpsllq m1, xm3 ; costUncoded
+ paddq m4, m1
+ movu [r1], ym1
+ vextracti32x8 [r1 + 4 * mmsize], m1, 1
+
+ ;Row 3, 4
+ movq xm0, [r0 + 2 * mmsize]
+ pinsrq xm0, [r0 + 3 * mmsize], 1
+ vpmovsxwq m1, xm0
+ vcvtqq2pd m2, m1
+
+ vfmadd132pd m2, m2, m5
+ vfmadd213pd m2, m2, m5
+ vfmadd231pd m2, m2, m5
+
+ vcvtpd2qq m1, m2
+ vpsllq m1, xm3 ; costUncoded
+ paddq m4, m1
+ movu [r1 + 8 * mmsize], ym1
+ vextracti32x8 [r1 + 12 * mmsize], m1, 1
+
+ vextracti32x8 ym2, m4, 1
+ paddq ym4, ym2
+ vextracti32x4 xm2, m4, 1
+ paddq xm4, xm2
+ punpckhqdq xm2, xm4, xm5
+ paddq xm4, xm2
+
+ paddq xm6, xm4
+ paddq xm7, xm4
+
+ movq [r2], xm6
+ movq [r3], xm7
+ RET
+
%endif
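A note on reading the kernels: since m5 is zeroed, the three chained FMAs reduce to m2 = m2 * m2 (with a zero multiplier the 132 and 231 forms are identities; only the 213 form performs the squaring). vcvtpd2qq then converts the squares back to int64 and vpsllq applies scaleBits, which is held in xm3. Each tab_nonpsyRdo* table stores one (transformShift, scaleBits) qword pair per TU size from 4x4 to 32x32, matching the constants computed in nonPsyRdoQuant_c; the transformShift value loaded into r5, as far as these listings show, is never used.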
diff -r a2224f4d257c -r 9a2c54117698 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Fri Dec 22 13:52:16 2017 +0530
+++ b/source/common/x86/dct8.h Wed Dec 20 12:15:46 2017 +0530
@@ -34,7 +34,7 @@
FUNCDEF_TU_S2(void, idct, ssse3, const int16_t* src, int16_t* dst, intptr_t dstStride);
FUNCDEF_TU_S2(void, idct, sse4, const int16_t* src, int16_t* dst, intptr_t dstStride);
FUNCDEF_TU_S2(void, idct, avx2, const int16_t* src, int16_t* dst, intptr_t dstStride);
-
+FUNCDEF_TU_S2(void, nonPsyRdoQuant, avx512, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
void PFX(dst4_ssse3)(const int16_t* src, int16_t* dst, intptr_t srcStride);
void PFX(dst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
void PFX(idst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
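FUNCDEF_TU_S2 presumably expands the declaration across the four TU sizes, producing prototypes for the PFX(nonPsyRdoQuant4/8/16/32_avx512) symbols defined in dct8.asm.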
diff -r a2224f4d257c -r 9a2c54117698 source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp Fri Dec 22 13:52:16 2017 +0530
+++ b/source/test/mbdstharness.cpp Wed Dec 20 12:15:46 2017 +0530
@@ -279,9 +279,52 @@
reportfail();
j += INCR;
}
+ return true;
+}
+
+bool MBDstHarness::check_nonPsyRdoQuant_primitive(nonPsyRdoQuant_t ref, nonPsyRdoQuant_t opt)
+{
+ int j = 0;
+ int trSize[4] = { 16, 64, 256, 1024 };
+
+ ALIGN_VAR_32(int64_t, ref_dest[4 * MAX_TU_SIZE]);
+ ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
+
+ for (int i = 0; i < ITERS; i++)
+ {
+ int64_t totalRdCostRef = rand();
+ int64_t totalUncodedCostRef = rand();
+ int64_t totalRdCostOpt = totalRdCostRef;
+ int64_t totalUncodedCostOpt = totalUncodedCostRef;
+
+ int index = rand() % 4;
+ uint32_t blkPos = trSize[index];
+ int cmp_size = 4 * MAX_TU_SIZE * sizeof(int64_t);
+
+ memset(ref_dest, 0, cmp_size);
+ memset(opt_dest, 0, cmp_size);
+
+ int index1 = rand() % TEST_CASES;
+
+ ref(short_test_buff[index1] + j, ref_dest, &totalUncodedCostRef, &totalRdCostRef, blkPos);
+ checked(opt, short_test_buff[index1] + j, opt_dest, &totalUncodedCostOpt, &totalRdCostOpt, blkPos);
+
+ if (memcmp(ref_dest, opt_dest, cmp_size))
+ return false;
+
+ if (totalUncodedCostRef != totalUncodedCostOpt)
+ return false;
+
+ if (totalRdCostRef != totalRdCostOpt)
+ return false;
+
+ reportfail();
+ j += INCR;
+ }
return true;
}
+
bool MBDstHarness::check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt)
{
int j = 0;
@@ -418,6 +461,19 @@
return false;
}
}
+
+ for (int i = 0; i < NUM_TR_SIZE; i++)
+ {
+ if (opt.cu[i].nonPsyRdoQuant)
+ {
+ if (!check_nonPsyRdoQuant_primitive(ref.cu[i].nonPsyRdoQuant, opt.cu[i].nonPsyRdoQuant))
+ {
+ printf("nonPsyRdoQuant[%dx%d]: Failed!\n", 4 << i, 4 << i);
+ return false;
+ }
+ }
+ }
+
for (int i = 0; i < NUM_TR_SIZE; i++)
{
if (opt.cu[i].count_nonzero)
@@ -505,6 +561,19 @@
printf("nquant\t\t");
REPORT_SPEEDUP(opt.nquant, ref.nquant, short_test_buff[0], int_test_buff[1], mshortbuf2, 23, 23785, 32 * 32);
}
+
+ for (int value = 0; value < NUM_TR_SIZE; value++)
+ {
+ if (opt.cu[value].nonPsyRdoQuant)
+ {
+ ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
+ int64_t totalRdCost = 0;
+ int64_t totalUncodedCost = 0;
+ printf("nonPsyRdoQuant[%dx%d]", 4 << value, 4 << value);
+ REPORT_SPEEDUP(opt.cu[value].nonPsyRdoQuant, ref.cu[value].nonPsyRdoQuant, short_test_buff[0], opt_dest, &totalUncodedCost, &totalRdCost, 0);
+ }
+ }
+
for (int value = 0; value < NUM_TR_SIZE; value++)
{
if (opt.cu[value].count_nonzero)
diff -r a2224f4d257c -r 9a2c54117698 source/test/mbdstharness.h
--- a/source/test/mbdstharness.h Fri Dec 22 13:52:16 2017 +0530
+++ b/source/test/mbdstharness.h Wed Dec 20 12:15:46 2017 +0530
@@ -62,9 +62,9 @@
int16_t short_denoise_test_buff1[TEST_CASES][TEST_BUF_SIZE];
int16_t short_denoise_test_buff2[TEST_CASES][TEST_BUF_SIZE];
-
bool check_dequant_primitive(dequant_scaling_t ref, dequant_scaling_t opt);
bool check_dequant_primitive(dequant_normal_t ref, dequant_normal_t opt);
+ bool check_nonPsyRdoQuant_primitive(nonPsyRdoQuant_t ref, nonPsyRdoQuant_t opt);
bool check_quant_primitive(quant_t ref, quant_t opt);
bool check_nquant_primitive(nquant_t ref, nquant_t opt);
bool check_dct_primitive(dct_t ref, dct_t opt, intptr_t width);
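The correctness check seeds totalUncodedCost/totalRdCost with identical random starting values for the ref and opt runs, so a divergence in the accumulated sums is caught along with any mismatch in the costUncoded output; blkPos is drawn at random from {16, 64, 256, 1024}, the coefficient counts of the four TU sizes.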