[x265] [PATCH 277 of 307] x86: psyRdoQuant primitive
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:35 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1514521347 -19800
# Fri Dec 29 09:52:27 2017 +0530
# Node ID 4e9f2efdfd097910aa5bf704a4bbf38b0a28f2a5
# Parent 80775bda5ec16735e7b1de97dedeb7f7ed391c8f
x86: psyRdoQuant primitive
This patch also adds AVX512 assembly code for this primitive
AVX512 : 231.20 cycles
C code : 1060.74 cycles
diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/common/dct.cpp
--- a/source/common/dct.cpp Tue Jan 02 15:21:08 2018 +0530
+++ b/source/common/dct.cpp Fri Dec 29 09:52:27 2017 +0530
@@ -1001,9 +1001,34 @@
blkPos += trSize;
}
}
+template<int log2TrSize>
+static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
+{
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+ const int scaleBits = SCALE_BITS - 2 * transformShift;
+ const uint32_t trSize = 1 << log2TrSize;
+ int max = X265_MAX(0, (2 * transformShift + 1));
+
+ for (int y = 0; y < MLS_CG_SIZE; y++)
+ {
+ for (int x = 0; x < MLS_CG_SIZE; x++)
+ {
+ int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
+ int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
+
+ costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
+
+ /* when no residual coefficient is coded, predicted coef == recon coef */
+ costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
+
+ *totalUncodedCost += costUncoded[blkPos + x];
+ *totalRdCost += costUncoded[blkPos + x];
+ }
+ blkPos += trSize;
+ }
+}
namespace X265_NS {
// x265 private namespace
-
void setupDCTPrimitives_c(EncoderPrimitives& p)
{
p.dequant_scaling = dequant_scaling_c;
@@ -1014,6 +1039,10 @@
p.cu[BLOCK_8x8].nonPsyRdoQuant = nonPsyRdoQuant_c<3>;
p.cu[BLOCK_16x16].nonPsyRdoQuant = nonPsyRdoQuant_c<4>;
p.cu[BLOCK_32x32].nonPsyRdoQuant = nonPsyRdoQuant_c<5>;
+ p.cu[BLOCK_4x4].psyRdoQuant = psyRdoQuant_c<2>;
+ p.cu[BLOCK_8x8].psyRdoQuant = psyRdoQuant_c<3>;
+ p.cu[BLOCK_16x16].psyRdoQuant = psyRdoQuant_c<4>;
+ p.cu[BLOCK_32x32].psyRdoQuant = psyRdoQuant_c<5>;
p.dst4x4 = dst4_c;
p.cu[BLOCK_4x4].dct = dct4_c;
p.cu[BLOCK_8x8].dct = dct8_c;
diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/common/primitives.h
--- a/source/common/primitives.h Tue Jan 02 15:21:08 2018 +0530
+++ b/source/common/primitives.h Fri Dec 29 09:52:27 2017 +0530
@@ -224,6 +224,8 @@
typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);
typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
typedef void(*nonPsyRdoQuant_t)(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
+typedef void(*psyRdoQuant_t)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
+
/* Function pointers to optimized encoder primitives. Each pointer can reference
* either an assembly routine, a SIMD intrinsic primitive, or a C function */
struct EncoderPrimitives
@@ -297,6 +299,7 @@
intra_filter_t intra_filter;
intra_pred_t intra_pred[NUM_INTRA_MODE];
nonPsyRdoQuant_t nonPsyRdoQuant;
+ psyRdoQuant_t psyRdoQuant;
}
cu[NUM_CU_SIZES];
/* These remaining primitives work on either fixed block sizes or take
diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/common/quant.cpp
--- a/source/common/quant.cpp Tue Jan 02 15:21:08 2018 +0530
+++ b/source/common/quant.cpp Fri Dec 29 09:52:27 2017 +0530
@@ -642,11 +642,9 @@
X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n");
if (!numSig)
return 0;
-
const uint32_t trSize = 1 << log2TrSize;
int64_t lambda2 = m_qpParam[ttype].lambda2;
- const int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
-
+ int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
/* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4)
* scale applied that must be removed during unquant. Note that in real dequant there is clipping
* at several stages. We skip the clipping for simplicity when measuring RD cost */
@@ -723,25 +721,9 @@
for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
{
X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
-
uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
uint32_t blkPos = codeParams.scan[scanPosBase];
-
- // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA
- for (int y = 0; y < MLS_CG_SIZE; y++)
- {
- for (int x = 0; x < MLS_CG_SIZE; x++)
- {
- int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
- int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
- costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
- /* when no residual coefficient is coded, predicted coef == recon coef */
- costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
- totalUncodedCost += costUncoded[blkPos + x];
- totalRdCost += costUncoded[blkPos + x];
- }
- blkPos += trSize;
- }
+ primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
}
}
else
@@ -814,22 +796,14 @@
// TODO: does we need zero-coeff cost?
const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
uint32_t blkPos = codeParams.scan[scanPosBase];
-
if (usePsyMask)
{
- // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA
+ primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
+ blkPos = codeParams.scan[scanPosBase];
for (int y = 0; y < MLS_CG_SIZE; y++)
{
for (int x = 0; x < MLS_CG_SIZE; x++)
{
- int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
- int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
- costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
- /* when no residual coefficient is coded, predicted coef == recon coef */
- costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
- totalUncodedCost += costUncoded[blkPos + x];
- totalRdCost += costUncoded[blkPos + x];
-
const uint32_t scanPosOffset = y * MLS_CG_SIZE + x;
const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
X265_CHECK(trSize > 4, "trSize check failure\n");
diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jan 02 15:21:08 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Dec 29 09:52:27 2017 +0530
@@ -3120,7 +3120,10 @@
p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx512);
p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx512);
p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx512);
-
+ p.cu[BLOCK_4x4].psyRdoQuant = PFX(psyRdoQuant4_avx512);
+ p.cu[BLOCK_8x8].psyRdoQuant = PFX(psyRdoQuant8_avx512);
+ p.cu[BLOCK_16x16].psyRdoQuant = PFX(psyRdoQuant16_avx512);
+ p.cu[BLOCK_32x32].psyRdoQuant = PFX(psyRdoQuant32_avx512);
}
#endif
}
@@ -5302,10 +5305,16 @@
p.pu[LUMA_16x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x32>;
p.pu[LUMA_16x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x64>;
p.pu[LUMA_48x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_48x64>;
+
p.cu[BLOCK_4x4].nonPsyRdoQuant = PFX(nonPsyRdoQuant4_avx512);
p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx512);
p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx512);
p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx512);
+ p.cu[BLOCK_4x4].psyRdoQuant = PFX(psyRdoQuant4_avx512);
+ p.cu[BLOCK_8x8].psyRdoQuant = PFX(psyRdoQuant8_avx512);
+ p.cu[BLOCK_16x16].psyRdoQuant = PFX(psyRdoQuant16_avx512);
+ p.cu[BLOCK_32x32].psyRdoQuant = PFX(psyRdoQuant32_avx512);
+
}
#endif
}
diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Tue Jan 02 15:21:08 2018 +0530
+++ b/source/common/x86/dct8.asm Fri Dec 29 09:52:27 2017 +0530
@@ -516,6 +516,7 @@
tab_nonpsyRdo8 : dq 5, 7, 9, 11
tab_nonpsyRdo10: dq 9, 11, 13, 15
tab_nonpsyRdo12: dq 13, 15, 17, 19
+
SECTION .text
cextern pd_1
cextern pd_2
@@ -542,6 +543,10 @@
%define DST4_ROUND 16
%define DCT8_SHIFT1 6
%define DCT8_ROUND1 32
+ %define RDO_MAX_4 3
+ %define RDO_MAX_8 1
+ %define RDO_MAX_16 0
+ %define RDO_MAX_32 0
%elif BIT_DEPTH == 10
%define DCT4_SHIFT 3
%define DCT4_ROUND 4
@@ -551,6 +556,10 @@
%define DST4_ROUND 4
%define DCT8_SHIFT1 4
%define DCT8_ROUND1 8
+ %define RDO_MAX_4 7
+ %define RDO_MAX_8 5
+ %define RDO_MAX_16 3
+ %define RDO_MAX_32 1
%elif BIT_DEPTH == 8
%define DCT4_SHIFT 1
%define DCT4_ROUND 1
@@ -560,6 +569,10 @@
%define DST4_ROUND 1
%define DCT8_SHIFT1 2
%define DCT8_ROUND1 2
+ %define RDO_MAX_4 11
+ %define RDO_MAX_8 9
+ %define RDO_MAX_16 7
+ %define RDO_MAX_32 5
%else
%error Unsupported BIT_DEPTH!
%endif
@@ -6650,5 +6663,391 @@
movq [r2], xm6
movq [r3], xm7
RET
-
+;static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
+;{
+; const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+; const int scaleBits = SCALE_BITS - 2 * transformShift;
+; const uint32_t trSize = 1 << log2TrSize;
+; int max = X265_MAX(0, (2 * transformShift + 1));
+;
+; for (int y = 0; y < MLS_CG_SIZE; y++)
+; {
+; for (int x = 0; x < MLS_CG_SIZE; x++)
+; {
+; int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
+; int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
+;
+; costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
+;
+; /* when no residual coefficient is coded, predicted coef == recon coef */
+; costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
+;
+; *totalUncodedCost += costUncoded[blkPos + x];
+; *totalRdCost += costUncoded[blkPos + x];
+; }
+; blkPos += trSize;
+; }
+;}
+
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal psyRdoQuant4, 5, 9, 13
+ mov r5, r5m
+ mov r6d, r6m
+ vpbroadcastq m12, [r5] ; psyScale
+ lea r0, [r0 + 2 * r6]
+ lea r1, [r1 + 2 * r6]
+ lea r6, [4 * r6]
+ lea r2, [r2 + 2 * r6]
+ movq xm0, [r3]
+ movq xm1, [r4]
+
+%if BIT_DEPTH == 12
+ mov r5, [tab_nonpsyRdo12] ; scaleBits
+%elif BIT_DEPTH == 10
+ mov r5, [tab_nonpsyRdo10]
+%elif BIT_DEPTH == 8
+ mov r5, [tab_nonpsyRdo8]
+%else
+ %error Unsupported BIT_DEPTH!
%endif
+
+ movq xm2, r5
+ vpxor m4, m4
+ vpxor m3, m3
+
+;Row 1, 2
+ vpmovsxwq m6, [r0]
+ vpmovsxwq m7, [r1]
+ psubq m7, m6 ; predictedCoef
+
+ vcvtqq2pd m9, m6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2qq m8, m9
+ vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits
+
+ vcvtqq2pd m10, m7
+ vcvtqq2pd m11, m12
+ vfmadd213pd m10, m11, m3
+ vcvtpd2qq m9, m10
+ vpsraq m9, RDO_MAX_4 ;(psyScale * predictedCoef) >> max
+
+ psubq m8, m9
+ paddq m4, m8
+ movu [r2], m8
+
+ ;Row 3, 4
+ vpmovsxwq m6, [r0 + 16]
+ vpmovsxwq m7, [r1 + 16]
+ psubq m7, m6 ; predictedCoef
+
+ vcvtqq2pd m9, m6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2qq m8, m9
+ vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits
+
+ vcvtqq2pd m10, m7
+ vcvtqq2pd m11, m12
+ vfmadd213pd m10, m11, m3
+ vcvtpd2qq m9, m10
+ vpsraq m9, RDO_MAX_4 ;(psyScale * predictedCoef) >> max
+
+ psubq m8, m9
+ paddq m4, m8
+ movu [r2 + 64], m8
+
+ vextracti32x8 ym2, m4, 1
+ paddq ym4, ym2
+ vextracti32x4 xm2, m4, 1
+ paddq xm4, xm2
+ punpckhqdq xm2, xm4, xm3
+ paddq xm4, xm2
+
+ paddq xm0, xm4
+ paddq xm1, xm4
+
+ movq [r3], xm0
+ movq [r4], xm1
+ RET
+
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal psyRdoQuant8, 5, 9, 15
+ mov r5, r5m
+ mov r6d, r6m
+ vpbroadcastq m12, [r5] ; psyScale
+ lea r0, [r0 + 2 * r6]
+ lea r1, [r1 + 2 * r6]
+ lea r6, [4 * r6]
+ lea r2, [r2 + 2 * r6]
+ movq xm0, [r3]
+ movq xm1, [r4]
+
+%if BIT_DEPTH == 12
+ mov r5, [tab_nonpsyRdo12 + 8] ; scaleBits
+%elif BIT_DEPTH == 10
+ mov r5, [tab_nonpsyRdo10 + 8]
+%elif BIT_DEPTH == 8
+ mov r5, [tab_nonpsyRdo8 + 8]
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+ movq xm2, r5
+ vpxor m4, m4
+ vpxor m3, m3
+
+;Row 1, 2
+ movq xm13, [r0]
+ movq xm14, [r1]
+ pinsrq xm13, [r0 + mmsize/4], 1
+ pinsrq xm14, [r1 + mmsize/4], 1
+ vpmovsxwq m6, xm13
+ vpmovsxwq m7, xm14
+ psubq m7, m6 ; predictedCoef
+
+ vcvtqq2pd m9, m6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2qq m8, m9
+ vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits
+
+ vcvtqq2pd m10, m7
+ vcvtqq2pd m11, m12
+ vfmadd213pd m10, m11, m3
+ vcvtpd2qq m9, m10
+ vpsraq m9, RDO_MAX_8 ;(psyScale * predictedCoef) >> max
+
+ psubq m8, m9
+ paddq m4, m8
+ movu [r2], ym8
+ vextracti32x8 [r2 + mmsize], m8 , 1
+
+ ;Row 3, 4
+ movq xm13, [r0 + mmsize/2]
+ movq xm14, [r1 + mmsize/2]
+ pinsrq xm13, [r0 + 3 * mmsize/4], 1
+ pinsrq xm14, [r1 + 3 * mmsize/4], 1
+ vpmovsxwq m6, xm13
+ vpmovsxwq m7, xm14
+ psubq m7, m6 ; predictedCoef
+
+ vcvtqq2pd m9, m6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2qq m8, m9
+ vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits
+
+ vcvtqq2pd m10, m7
+ vcvtqq2pd m11, m12
+ vfmadd213pd m10, m11, m3
+ vcvtpd2qq m9, m10
+ vpsraq m9, RDO_MAX_8 ;(psyScale * predictedCoef) >> max
+
+ psubq m8, m9
+ paddq m4, m8
+ movu [r2 + 2 * mmsize], ym8
+ vextracti32x8 [r2 + 3 * mmsize], m8 , 1
+
+ vextracti32x8 ym2, m4, 1
+ paddq ym4, ym2
+ vextracti32x4 xm2, m4, 1
+ paddq xm4, xm2
+ punpckhqdq xm2, xm4, xm3
+ paddq xm4, xm2
+
+ paddq xm0, xm4
+ paddq xm1, xm4
+
+ movq [r3], xm0
+ movq [r4], xm1
+ RET
+
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal psyRdoQuant16, 5, 9, 15
+ mov r5, r5m
+ mov r6d, r6m
+ vpbroadcastq m12, [r5] ; psyScale
+ lea r0, [r0 + 2 * r6]
+ lea r1, [r1 + 2 * r6]
+ lea r6, [4 * r6]
+ lea r2, [r2 + 2 * r6]
+ movq xm0, [r3]
+ movq xm1, [r4]
+
+%if BIT_DEPTH == 12
+ mov r5, [tab_nonpsyRdo12 + 16] ; scaleBits
+%elif BIT_DEPTH == 10
+ mov r5, [tab_nonpsyRdo10 + 16]
+%elif BIT_DEPTH == 8
+ mov r5, [tab_nonpsyRdo8 + 16]
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+ movq xm2, r5
+ vpxor m4, m4
+ vpxor m3, m3
+
+;Row 1, 2
+ movq xm13, [r0]
+ movq xm14, [r1]
+ pinsrq xm13, [r0 + mmsize/2], 1
+ pinsrq xm14, [r1 + mmsize/2], 1
+ vpmovsxwq m6, xm13
+ vpmovsxwq m7, xm14
+ psubq m7, m6 ; predictedCoef
+
+ vcvtqq2pd m9, m6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2qq m8, m9
+ vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits
+
+ vcvtqq2pd m10, m7
+ vcvtqq2pd m11, m12
+ vfmadd213pd m10, m11, m3
+ vcvtpd2qq m9, m10
+ vpsraq m9, RDO_MAX_16 ;(psyScale * predictedCoef) >> max
+
+ psubq m8, m9
+ paddq m4, m8
+ movu [r2], ym8
+ vextracti32x8 [r2 + 2 * mmsize], m8 , 1
+
+ ;Row 3, 4
+ movq xm13, [r0 + mmsize]
+ movq xm14, [r1 + mmsize]
+ pinsrq xm13, [r0 + 3 * mmsize/2], 1
+ pinsrq xm14, [r1 + 3 * mmsize/2], 1
+ vpmovsxwq m6, xm13
+ vpmovsxwq m7, xm14
+ psubq m7, m6 ; predictedCoef
+
+ vcvtqq2pd m9, m6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2qq m8, m9
+ vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits
+
+ vcvtqq2pd m10, m7
+ vcvtqq2pd m11, m12
+ vfmadd213pd m10, m11, m3
+ vcvtpd2qq m9, m10
+ vpsraq m9, RDO_MAX_16 ;(psyScale * predictedCoef) >> max
+
+ psubq m8, m9
+ paddq m4, m8
+ movu [r2 + 4 * mmsize], ym8
+ vextracti32x8 [r2 + 6 * mmsize], m8 , 1
+
+ vextracti32x8 ym2, m4, 1
+ paddq ym4, ym2
+ vextracti32x4 xm2, m4, 1
+ paddq xm4, xm2
+ punpckhqdq xm2, xm4, xm3
+ paddq xm4, xm2
+
+ paddq xm0, xm4
+ paddq xm1, xm4
+
+ movq [r3], xm0
+ movq [r4], xm1
+ RET
+
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal psyRdoQuant32, 5, 9, 15
+ mov r5, r5m
+ mov r6d, r6m
+ vpbroadcastq m12, [r5] ; psyScale
+ lea r0, [r0 + 2 * r6]
+ lea r1, [r1 + 2 * r6]
+ lea r6, [4 * r6]
+ lea r2, [r2 + 2 * r6]
+ movq xm0, [r3]
+ movq xm1, [r4]
+
+%if BIT_DEPTH == 12
+ mov r5, [tab_nonpsyRdo12 + 24] ; scaleBits
+%elif BIT_DEPTH == 10
+ mov r5, [tab_nonpsyRdo10 + 24]
+%elif BIT_DEPTH == 8
+ mov r5, [tab_nonpsyRdo8 + 24]
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+ movq xm2, r5
+ vpxor m4, m4
+ vpxor m3, m3
+
+;Row 1, 2
+ movq xm13, [r0]
+ movq xm14, [r1]
+ pinsrq xm13, [r0 + mmsize], 1
+ pinsrq xm14, [r1 + mmsize], 1
+ vpmovsxwq m6, xm13
+ vpmovsxwq m7, xm14
+ psubq m7, m6 ; predictedCoef
+
+ vcvtqq2pd m9, m6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2qq m8, m9
+ vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits
+
+ vcvtqq2pd m10, m7
+ vcvtqq2pd m11, m12
+ vfmadd213pd m10, m11, m3
+ vcvtpd2qq m9, m10
+ vpsraq m9, RDO_MAX_32 ;(psyScale * predictedCoef) >> max
+
+ psubq m8, m9
+ paddq m4, m8
+ movu [r2], ym8
+ vextracti32x8 [r2 + 4 * mmsize], m8 , 1
+
+ ;Row 3, 4
+ movq xm13, [r0 + 2 * mmsize]
+ movq xm14, [r1 + 2 * mmsize]
+ pinsrq xm13, [r0 + 3 * mmsize], 1
+ pinsrq xm14, [r1 + 3 * mmsize], 1
+ vpmovsxwq m6, xm13
+ vpmovsxwq m7, xm14
+ psubq m7, m6 ; predictedCoef
+
+ vcvtqq2pd m9, m6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2qq m8, m9
+ vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits
+
+ vcvtqq2pd m10, m7
+ vcvtqq2pd m11, m12
+ vfmadd213pd m10, m11, m3
+ vcvtpd2qq m9, m10
+ vpsraq m9, RDO_MAX_32 ;(psyScale * predictedCoef) >> max
+
+ psubq m8, m9
+ paddq m4, m8
+ movu [r2 + 8 * mmsize], ym8
+ vextracti32x8 [r2 + 12 * mmsize], m8 , 1
+
+ vextracti32x8 ym2, m4, 1
+ paddq ym4, ym2
+ vextracti32x4 xm2, m4, 1
+ paddq xm4, xm2
+ punpckhqdq xm2, xm4, xm3
+ paddq xm4, xm2
+
+ paddq xm0, xm4
+ paddq xm1, xm4
+
+ movq [r3], xm0
+ movq [r4], xm1
+ RET
+%endif
diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Tue Jan 02 15:21:08 2018 +0530
+++ b/source/common/x86/dct8.h Fri Dec 29 09:52:27 2017 +0530
@@ -35,6 +35,8 @@
FUNCDEF_TU_S2(void, idct, sse4, const int16_t* src, int16_t* dst, intptr_t dstStride);
FUNCDEF_TU_S2(void, idct, avx2, const int16_t* src, int16_t* dst, intptr_t dstStride);
FUNCDEF_TU_S2(void, nonPsyRdoQuant, avx512, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
+FUNCDEF_TU_S2(void, psyRdoQuant, avx512, int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost, int64_t *psyScale, uint32_t blkPos);
+
void PFX(dst4_ssse3)(const int16_t* src, int16_t* dst, intptr_t srcStride);
void PFX(dst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
void PFX(idst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp Tue Jan 02 15:21:08 2018 +0530
+++ b/source/test/mbdstharness.cpp Fri Dec 29 09:52:27 2017 +0530
@@ -61,16 +61,17 @@
for (int i = 0; i < TEST_BUF_SIZE; i++)
{
short_test_buff[0][i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
+ short_test_buff1[0][i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
int_test_buff[0][i] = rand() % PIXEL_MAX;
int_idct_test_buff[0][i] = (rand() % (SHORT_MAX - SHORT_MIN)) - SHORT_MAX;
short_denoise_test_buff1[0][i] = short_denoise_test_buff2[0][i] = (rand() & SHORT_MAX) - (rand() & SHORT_MAX);
-
short_test_buff[1][i] = -PIXEL_MAX;
+ short_test_buff1[1][i] = -PIXEL_MAX;
int_test_buff[1][i] = -PIXEL_MAX;
int_idct_test_buff[1][i] = SHORT_MIN;
short_denoise_test_buff1[1][i] = short_denoise_test_buff2[1][i] = -SHORT_MAX;
-
short_test_buff[2][i] = PIXEL_MAX;
+ short_test_buff1[2][i] = PIXEL_MAX;
int_test_buff[2][i] = PIXEL_MAX;
int_idct_test_buff[2][i] = SHORT_MAX;
short_denoise_test_buff1[2][i] = short_denoise_test_buff2[2][i] = SHORT_MAX;
@@ -324,6 +325,51 @@
return true;
}
+bool MBDstHarness::check_psyRdoQuant_primitive(psyRdoQuant_t ref, psyRdoQuant_t opt)
+{
+ int j = 0;
+ int trSize[4] = { 16, 64, 256, 1024 };
+
+ ALIGN_VAR_32(int64_t, ref_dest[4 * MAX_TU_SIZE]);
+ ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
+
+ for (int i = 0; i < ITERS; i++)
+ {
+ int64_t totalRdCostRef = rand();
+ int64_t totalUncodedCostRef = rand();
+ int64_t totalRdCostOpt = totalRdCostRef;
+ int64_t totalUncodedCostOpt = totalUncodedCostRef;
+ int64_t *psyScale = X265_MALLOC(int64_t, 1);
+ *psyScale = rand();
+
+ int index = rand() % 4;
+ uint32_t blkPos = trSize[index];
+ int cmp_size = 4 * MAX_TU_SIZE * sizeof(int64_t);
+
+ memset(ref_dest, 0, 4 * MAX_TU_SIZE * sizeof(int64_t));
+ memset(opt_dest, 0, 4 * MAX_TU_SIZE * sizeof(int64_t));
+
+ int index1 = rand() % TEST_CASES;
+
+ ref(short_test_buff[index1] + j, short_test_buff1[index1] + j, ref_dest, &totalUncodedCostRef, &totalRdCostRef, psyScale, blkPos);
+ checked(opt, short_test_buff[index1] + j, short_test_buff1[index1] + j, opt_dest, &totalUncodedCostOpt, &totalRdCostOpt, psyScale, blkPos);
+
+ X265_FREE(psyScale);
+ if (memcmp(ref_dest, opt_dest, cmp_size))
+ return false;
+
+ if (totalUncodedCostRef != totalUncodedCostOpt)
+ return false;
+
+ if (totalRdCostRef != totalRdCostOpt)
+ return false;
+
+ reportfail();
+ j += INCR;
+ }
+
+ return true;
+}
bool MBDstHarness::check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt)
{
@@ -473,6 +519,17 @@
}
}
}
+ for (int i = 0; i < NUM_TR_SIZE; i++)
+ {
+ if (opt.cu[i].psyRdoQuant)
+ {
+ if (!check_psyRdoQuant_primitive(ref.cu[i].psyRdoQuant, opt.cu[i].psyRdoQuant))
+ {
+ printf("psyRdoQuant[%dx%d]: Failed!\n", 4 << i, 4 << i);
+ return false;
+ }
+ }
+ }
for (int i = 0; i < NUM_TR_SIZE; i++)
{
@@ -573,6 +630,19 @@
REPORT_SPEEDUP(opt.cu[value].nonPsyRdoQuant, ref.cu[value].nonPsyRdoQuant, short_test_buff[0], opt_dest, &totalUncodedCost, &totalRdCost, 0);
}
}
+ for (int value = 0; value < NUM_TR_SIZE; value++)
+ {
+ if (opt.cu[value].psyRdoQuant)
+ {
+ ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
+ int64_t totalRdCost = 0;
+ int64_t totalUncodedCost = 0;
+ int64_t *psyScale = X265_MALLOC(int64_t, 1);
+ *psyScale = 0;
+ printf("psyRdoQuant[%dx%d]", 4 << value, 4 << value);
+ REPORT_SPEEDUP(opt.cu[value].psyRdoQuant, ref.cu[value].psyRdoQuant, short_test_buff[0], short_test_buff1[0], opt_dest, &totalUncodedCost, &totalRdCost, psyScale, 0);
+ }
+ }
for (int value = 0; value < NUM_TR_SIZE; value++)
{
diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/test/mbdstharness.h
--- a/source/test/mbdstharness.h Tue Jan 02 15:21:08 2018 +0530
+++ b/source/test/mbdstharness.h Fri Dec 29 09:52:27 2017 +0530
@@ -51,11 +51,10 @@
int mintbuf2[MAX_TU_SIZE];
int mintbuf3[MAX_TU_SIZE];
int mintbuf4[MAX_TU_SIZE];
-
int16_t short_test_buff[TEST_CASES][TEST_BUF_SIZE];
+ int16_t short_test_buff1[TEST_CASES][TEST_BUF_SIZE];
int int_test_buff[TEST_CASES][TEST_BUF_SIZE];
int int_idct_test_buff[TEST_CASES][TEST_BUF_SIZE];
-
uint32_t mubuf1[MAX_TU_SIZE];
uint32_t mubuf2[MAX_TU_SIZE];
uint16_t mushortbuf1[MAX_TU_SIZE];
@@ -65,6 +64,7 @@
bool check_dequant_primitive(dequant_scaling_t ref, dequant_scaling_t opt);
bool check_dequant_primitive(dequant_normal_t ref, dequant_normal_t opt);
bool check_nonPsyRdoQuant_primitive(nonPsyRdoQuant_t ref, nonPsyRdoQuant_t opt);
+ bool check_psyRdoQuant_primitive(psyRdoQuant_t ref, psyRdoQuant_t opt);
bool check_quant_primitive(quant_t ref, quant_t opt);
bool check_nquant_primitive(nquant_t ref, nquant_t opt);
bool check_dct_primitive(dct_t ref, dct_t opt, intptr_t width);
More information about the x265-devel
mailing list