[x265] [PATCH 02 of 13] x86: AVX2 psyRdoQuant primitive for all sizes
vignesh at multicorewareinc.com
Fri Jul 6 11:18:02 CEST 2018
# HG changeset patch
# User Jayashree <jayashree.c at multicorewareinc.com>
# Date 1523609472 -19800
# Fri Apr 13 14:21:12 2018 +0530
# Node ID d55132eb42010fe0cbb95cd245d3d1fff69743c4
# Parent c9f622347ce51cf90b593e8500ee5a40888c6f29
x86: AVX2 psyRdoQuant primitive for all sizes
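
This splits the fused psyRdoQuant primitive into two per-coding-group passes: psyRdoQuant_1p computes the uncoded SSD cost of each pre-quantization DCT coefficient, and psyRdoQuant_2p subtracts the psychovisual energy term derived from the predicted coefficient. AVX2 kernels are added for the first pass at all transform sizes (4x4 through 32x32); builds that detect full AVX-512 support keep using the fused kernel. Per coefficient, the split cost looks like the following sketch of the C reference added to dct.cpp below (names are illustrative, not x265 API):

    #include <cstdint>

    // Minimal sketch of the split cost computation: psyRdoQuant_1p
    // produces the ssd term, psyRdoQuant_2p then subtracts the psy term.
    static inline int64_t uncodedCost(int16_t resi, int16_t fenc,
                                      int64_t psyScale, int scaleBits, int max)
    {
        int64_t signCoef      = resi;            // pre-quantization DCT coeff
        int64_t predictedCoef = fenc - signCoef; // predicted DCT = source - residual
        int64_t ssd = (signCoef * signCoef) << scaleBits; // pass 1 (1p)
        return ssd - ((psyScale * predictedCoef) >> max); // pass 2 (2p)
    }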
diff -r c9f622347ce5 -r d55132eb4201 source/common/cpu.cpp
--- a/source/common/cpu.cpp Wed Mar 07 09:34:37 2018 +0530
+++ b/source/common/cpu.cpp Fri Apr 13 14:21:12 2018 +0530
@@ -58,6 +58,7 @@
#endif // if X265_ARCH_ARM
namespace X265_NS {
+static bool enable512 = false;
const cpu_name_t cpu_names[] =
{
#if X265_ARCH_X86
@@ -122,10 +123,14 @@
#pragma warning(disable: 4309) // truncation of constant value
#endif
+bool detect512()
+{
+ return(enable512);
+}
uint32_t cpu_detect(bool benableavx512 )
{
- uint32_t cpu = 0;
+ uint32_t cpu = 0;
uint32_t eax, ebx, ecx, edx;
uint32_t vendor[4] = { 0 };
uint32_t max_extended_cap, max_basic_cap;
@@ -189,7 +194,10 @@
if ((xcr0 & 0xE0) == 0xE0) /* OPMASK/ZMM state */
{
if ((ebx & 0xD0030000) == 0xD0030000)
+ {
cpu |= X265_CPU_AVX512;
+ enable512 = true;
+ }
}
}
}
@@ -390,3 +398,4 @@
#endif // if X265_ARCH_X86
}
+
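
Note that the AVX-512 decision is made once, inside cpu_detect() (which issues CPUID/XGETBV), and cached in the file-scope enable512 flag; detect512() is then a plain load, cheap enough for quant.cpp to call per coding group. A minimal usage sketch, assuming cpu_detect() has already run at encoder initialization:

    // Hypothetical hot-loop caller: detect512() only reads the cached flag,
    // so querying it per coding group costs almost nothing.
    for (uint32_t cg = 0; cg < numCGs; cg++)
        dispatchPsyRdoQuant(detect512(), cg); // dispatchPsyRdoQuant is illustrative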
diff -r c9f622347ce5 -r d55132eb4201 source/common/cpu.h
--- a/source/common/cpu.h Wed Mar 07 09:34:37 2018 +0530
+++ b/source/common/cpu.h Fri Apr 13 14:21:12 2018 +0530
@@ -26,7 +26,6 @@
#define X265_CPU_H
#include "common.h"
-
/* All assembly functions are prefixed with X265_NS (macro expanded) */
#define PFX3(prefix, name) prefix ## _ ## name
#define PFX2(prefix, name) PFX3(prefix, name)
@@ -51,6 +50,7 @@
namespace X265_NS {
uint32_t cpu_detect(bool);
+bool detect512();
struct cpu_name_t
{
diff -r c9f622347ce5 -r d55132eb4201 source/common/dct.cpp
--- a/source/common/dct.cpp Wed Mar 07 09:34:37 2018 +0530
+++ b/source/common/dct.cpp Fri Apr 13 14:21:12 2018 +0530
@@ -1027,6 +1027,47 @@
blkPos += trSize;
}
}
+template<int log2TrSize>
+static void psyRdoQuant_c_1(int16_t *m_resiDctCoeff, /*int16_t *m_fencDctCoeff, */ int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, /* int64_t *psyScale,*/ uint32_t blkPos)
+{
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+ const int scaleBits = SCALE_BITS - 2 * transformShift;
+ const uint32_t trSize = 1 << log2TrSize;
+
+ for (int y = 0; y < MLS_CG_SIZE; y++)
+ {
+ for (int x = 0; x < MLS_CG_SIZE; x++)
+ {
+ int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
+ costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
+ *totalUncodedCost += costUncoded[blkPos + x];
+ *totalRdCost += costUncoded[blkPos + x];
+ }
+ blkPos += trSize;
+ }
+}
+template<int log2TrSize>
+static void psyRdoQuant_c_2(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
+{
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+
+ const uint32_t trSize = 1 << log2TrSize;
+ int max = X265_MAX(0, (2 * transformShift + 1));
+
+ for (int y = 0; y < MLS_CG_SIZE; y++)
+ {
+ for (int x = 0; x < MLS_CG_SIZE; x++)
+ {
+ int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
+ int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT */
+ costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
+ *totalUncodedCost += costUncoded[blkPos + x];
+ *totalRdCost += costUncoded[blkPos + x];
+ }
+ blkPos += trSize;
+ }
+}
+
namespace X265_NS {
// x265 private namespace
void setupDCTPrimitives_c(EncoderPrimitives& p)
@@ -1063,7 +1104,14 @@
p.cu[BLOCK_8x8].copy_cnt = copy_count<8>;
p.cu[BLOCK_16x16].copy_cnt = copy_count<16>;
p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
-
+ p.cu[BLOCK_4x4].psyRdoQuant_1p = psyRdoQuant_c_1<2>;
+ p.cu[BLOCK_4x4].psyRdoQuant_2p = psyRdoQuant_c_2<2>;
+ p.cu[BLOCK_8x8].psyRdoQuant_1p = psyRdoQuant_c_1<3>;
+ p.cu[BLOCK_8x8].psyRdoQuant_2p = psyRdoQuant_c_2<3>;
+ p.cu[BLOCK_16x16].psyRdoQuant_1p = psyRdoQuant_c_1<4>;
+ p.cu[BLOCK_16x16].psyRdoQuant_2p = psyRdoQuant_c_2<4>;
+ p.cu[BLOCK_32x32].psyRdoQuant_1p = psyRdoQuant_c_1<5>;
+ p.cu[BLOCK_32x32].psyRdoQuant_2p = psyRdoQuant_c_2<5>;
p.scanPosLast = scanPosLast_c;
p.findPosFirstLast = findPosFirstLast_c;
p.costCoeffNxN = costCoeffNxN_c;
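
For orientation, the shift constants in these templates collapse to small per-size values; the AVX2 kernels in dct8.asm read the same per-size scaleBits from the tab_nonpsyRdo8/10/12 tables instead of recomputing them. A worked sketch, assuming an 8-bit build (X265_DEPTH == 8) and the HEVC value MAX_TR_DYNAMIC_RANGE == 15:

    // Worked shift constants per TU size under the assumptions above.
    const int SCALE_BITS = 15; // assumption: x265's RDOQ precision constant
    for (int log2TrSize = 2; log2TrSize <= 5; log2TrSize++)
    {
        int transformShift = 15 - 8 - log2TrSize;        // 5, 4, 3, 2
        int scaleBits = SCALE_BITS - 2 * transformShift; // 5, 7, 9, 11
        int max = 2 * transformShift + 1;                // 11, 9, 7, 5 (always > 0 here)
    }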
diff -r c9f622347ce5 -r d55132eb4201 source/common/primitives.h
--- a/source/common/primitives.h Wed Mar 07 09:34:37 2018 +0530
+++ b/source/common/primitives.h Fri Apr 13 14:21:12 2018 +0530
@@ -225,7 +225,8 @@
typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
typedef void(*nonPsyRdoQuant_t)(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
typedef void(*psyRdoQuant_t)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
-
+typedef void(*psyRdoQuant_t1)(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
+typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
/* Function pointers to optimized encoder primitives. Each pointer can reference
* either an assembly routine, a SIMD intrinsic primitive, or a C function */
struct EncoderPrimitives
@@ -300,6 +301,8 @@
intra_pred_t intra_pred[NUM_INTRA_MODE];
nonPsyRdoQuant_t nonPsyRdoQuant;
psyRdoQuant_t psyRdoQuant;
+ psyRdoQuant_t1 psyRdoQuant_1p;
+ psyRdoQuant_t2 psyRdoQuant_2p;
}
cu[NUM_CU_SIZES];
/* These remaining primitives work on either fixed block sizes or take
diff -r c9f622347ce5 -r d55132eb4201 source/common/quant.cpp
--- a/source/common/quant.cpp Wed Mar 07 09:34:37 2018 +0530
+++ b/source/common/quant.cpp Fri Apr 13 14:21:12 2018 +0530
@@ -723,7 +723,14 @@
X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
uint32_t blkPos = codeParams.scan[scanPosBase];
- primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
+ bool enable512 = detect512();
+ if (enable512)
+ primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
+ else
+ {
+ primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
+ primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
+ }
}
}
else
@@ -798,7 +805,15 @@
uint32_t blkPos = codeParams.scan[scanPosBase];
if (usePsyMask)
{
- primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
+ bool enable512 = detect512();
+
+ if (enable512)
+ primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
+ else
+ {
+ primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
+ primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
+ }
blkPos = codeParams.scan[scanPosBase];
for (int y = 0; y < MLS_CG_SIZE; y++)
{
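
The composition on the split path is: psyRdoQuant_1p writes the SSD cost into costUncoded, then psyRdoQuant_2p subtracts the psy term in place, leaving costUncoded identical to the fused kernel's output; as posted, both passes also fold their running sums into totalUncodedCost and totalRdCost. A condensed view of the dispatch (an illustrative wrapper; quant.cpp calls the primitives directly):

    // Sketch of the per-CG dispatch added above.
    void psyRdoQuantCG(const EncoderPrimitives &p, int sizeIdx,
                       int16_t *resi, int16_t *fenc, int64_t *costUncoded,
                       int64_t *uncoded, int64_t *rd, int64_t *psyScale,
                       uint32_t blkPos)
    {
        if (detect512()) // fused AVX-512 kernel: one pass over the CG
            p.cu[sizeIdx].psyRdoQuant(resi, fenc, costUncoded, uncoded, rd, psyScale, blkPos);
        else             // split path: AVX2 SSD pass, then psy-term pass
        {
            p.cu[sizeIdx].psyRdoQuant_1p(resi, costUncoded, uncoded, rd, blkPos);
            p.cu[sizeIdx].psyRdoQuant_2p(resi, fenc, costUncoded, uncoded, rd, psyScale, blkPos);
        }
    }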
diff -r c9f622347ce5 -r d55132eb4201 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Mar 07 09:34:37 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Apr 13 14:21:12 2018 +0530
@@ -2314,6 +2314,10 @@
p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx2);
p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx2);
p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx2);
+ p.cu[BLOCK_4x4].psyRdoQuant_1p = PFX(psyRdoQuant_1p4_avx2);
+ p.cu[BLOCK_8x8].psyRdoQuant_1p = PFX(psyRdoQuant_1p8_avx2);
+ p.cu[BLOCK_16x16].psyRdoQuant_1p = PFX(psyRdoQuant_1p16_avx2);
+ p.cu[BLOCK_32x32].psyRdoQuant_1p = PFX(psyRdoQuant_1p32_avx2);
/* TODO: This kernel needs to be modified to work with HIGH_BIT_DEPTH only
p.planeClipAndMax = PFX(planeClipAndMax_avx2); */
@@ -4697,6 +4701,10 @@
p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx2);
p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx2);
p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx2);
+ p.cu[BLOCK_4x4].psyRdoQuant_1p = PFX(psyRdoQuant_1p4_avx2);
+ p.cu[BLOCK_8x8].psyRdoQuant_1p = PFX(psyRdoQuant_1p8_avx2);
+ p.cu[BLOCK_16x16].psyRdoQuant_1p = PFX(psyRdoQuant_1p16_avx2);
+ p.cu[BLOCK_32x32].psyRdoQuant_1p = PFX(psyRdoQuant_1p32_avx2);
}
if (cpuMask & X265_CPU_AVX512)
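
Only the first-pass kernels are wired up here, in both the HIGH_BIT_DEPTH and 8-bit AVX2 blocks; psyRdoQuant_2p still resolves to the C reference from dct.cpp, presumably until a later patch in this series provides its assembly. That hookup would mirror the _1p lines above (hypothetical, shown for symmetry only):

    // Hypothetical second-pass hookup; not part of this patch.
    p.cu[BLOCK_4x4].psyRdoQuant_2p   = PFX(psyRdoQuant_2p4_avx2);
    p.cu[BLOCK_8x8].psyRdoQuant_2p   = PFX(psyRdoQuant_2p8_avx2);
    p.cu[BLOCK_16x16].psyRdoQuant_2p = PFX(psyRdoQuant_2p16_avx2);
    p.cu[BLOCK_32x32].psyRdoQuant_2p = PFX(psyRdoQuant_2p32_avx2);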
diff -r c9f622347ce5 -r d55132eb4201 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Wed Mar 07 09:34:37 2018 +0530
+++ b/source/common/x86/dct8.asm Fri Apr 13 14:21:12 2018 +0530
@@ -7353,4 +7353,289 @@
movq [r2], xm6
movq [r3], xm7
RET
+
+INIT_YMM avx2
+cglobal psyRdoQuant_1p4, 5, 9, 16
+ mov r4d, r4m
+ lea r0, [r0 + 2 * r4]
+ lea r4, [4 * r4]
+ lea r1, [r1 + 2 * r4]
+ movq xm0, [r2]
+ movq xm1, [r3]
+
+%if BIT_DEPTH == 12
+ mov r5, [tab_nonpsyRdo12] ; scaleBits
+%elif BIT_DEPTH == 10
+ mov r5, [tab_nonpsyRdo10]
+%elif BIT_DEPTH == 8
+ mov r5, [tab_nonpsyRdo8]
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+ movq xm2, r5
+ vpxor m4, m4
+ vpxor m3, m3
+ vpxor m13, m13
+
+ vpmovsxwd m6, [r0]
+ vcvtdq2pd m9, xm6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2dq xm8, m9
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13
+ movu [r1], m13
+
+ vpmovsxwd m6, [r0 + 8]
+ vcvtdq2pd m9, xm6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2dq xm8, m9
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13
+ movu [r1 + 32], m13
+
+ vpmovsxwd m6, [r0 + 16]
+ vcvtdq2pd m9, xm6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2dq xm8, m9
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13
+ movu [r1 + 64], m13
+
+ vpmovsxwd m6, [r0 + 24]
+ vcvtdq2pd m9, xm6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2dq xm8, m9
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13
+ movu [r1 + 96], m13
+
+
+ vextracti128 xm2, m4, 1
+ paddq xm4, xm2
+ punpckhqdq xm2, xm4, xm3
+ paddq xm4, xm2
+
+ paddq xm0, xm4
+ paddq xm1, xm4
+
+ movq [r2], xm0
+ movq [r3], xm1
+ RET
+INIT_YMM avx2
+cglobal psyRdoQuant_1p8, 7, 9, 16
+ mov r4d, r4m
+ lea r0, [r0 + 2 * r4]
+ lea r4, [4 * r4]
+ lea r1, [r1 + 2 * r4]
+ movq xm0, [r2]
+ movq xm1, [r3]
+%if BIT_DEPTH == 12
+ mov r5, [tab_nonpsyRdo12 + 8] ; scaleBits
+%elif BIT_DEPTH == 10
+ mov r5, [tab_nonpsyRdo10 + 8]
+%elif BIT_DEPTH == 8
+ mov r5, [tab_nonpsyRdo8 + 8]
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+ movq xm2, r5
+ vpxor m4, m4
+ vpxor m3, m3
+ vpxor m13, m13
+
+
+ vpmovsxwd m6, [r0]
+ vcvtdq2pd m9, xm6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2dq xm8, m9
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13
+ movu [r1], m13
+
+ vpmovsxwd m6, [r0 + 16]
+ vcvtdq2pd m9, xm6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2dq xm8, m9
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13
+ movu [r1 + 64], m13
+
+ vpmovsxwd m6, [r0 + 32]
+ vcvtdq2pd m9, xm6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2dq xm8, m9
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13
+ movu [r1 + 128], m13
+
+ vpmovsxwd m6, [r0 + 48]
+ vcvtdq2pd m9, xm6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2dq xm8, m9
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13
+ movu [r1 + 192], m13
+
+ vextracti128 xm2, m4, 1
+ paddq xm4, xm2
+ punpckhqdq xm2, xm4, xm3
+ paddq xm4, xm2
+
+ paddq xm0, xm4
+ paddq xm1, xm4
+
+ movq [r2], xm0
+ movq [r3], xm1
+ RET
+
+INIT_YMM avx2
+cglobal psyRdoQuant_1p16, 7, 9, 16
+ mov r4d, r4m
+ lea r0, [r0 + 2 * r4]
+ lea r4, [4 * r4]
+ lea r1, [r1 + 2 * r4]
+ movq xm0, [r2]
+ movq xm1, [r3]
+%if BIT_DEPTH == 12
+ mov r5, [tab_nonpsyRdo12 + 16] ; scaleBits
+%elif BIT_DEPTH == 10
+ mov r5, [tab_nonpsyRdo10 + 16]
+%elif BIT_DEPTH == 8
+ mov r5, [tab_nonpsyRdo8 + 16]
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+ movq xm2, r5
+ vpxor m4, m4
+ vpxor m3, m3
+ vpxor m13, m13
+
+ vpmovsxwd m6, [r0]
+ vcvtdq2pd m9, xm6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2dq xm8, m9
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13
+ movu [r1], m13
+
+ vpmovsxwd m6, [r0 + mmsize]
+
+ vcvtdq2pd m9, xm6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2dq xm8, m9
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13
+ movu [r1 + 4 * mmsize], m13
+
+ vpmovsxwd m6, [r0 + 2 * mmsize]
+ vcvtdq2pd m9, xm6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2dq xm8, m9
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13
+ movu [r1 + 8 * mmsize], m13
+
+ vpmovsxwd m6, [r0 + 3 * mmsize]
+ vcvtdq2pd m9, xm6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2dq xm8, m9
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13
+ movu [r1 + 12 * mmsize], m13
+
+ vextracti128 xm2, m4, 1
+ paddq xm4, xm2
+ punpckhqdq xm2, xm4, xm3
+ paddq xm4, xm2
+
+ paddq xm0, xm4
+ paddq xm1, xm4
+
+ movq [r2], xm0
+ movq [r3], xm1
+ RET
+
+INIT_YMM avx2
+cglobal psyRdoQuant_1p32, 7, 9, 16
+ mov r4d, r4m
+ lea r0, [r0 + 2 * r4]
+ lea r4, [4 * r4]
+ lea r1, [r1 + 2 * r4]
+ movq xm0, [r2]
+ movq xm1, [r3]
+%if BIT_DEPTH == 12
+ mov r5, [tab_nonpsyRdo12 + 24] ; scaleBits
+%elif BIT_DEPTH == 10
+ mov r5, [tab_nonpsyRdo10 + 24]
+%elif BIT_DEPTH == 8
+ mov r5, [tab_nonpsyRdo8 + 24]
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+ movq xm2, r5
+ vpxor m4, m4
+ vpxor m3, m3
+ vpxor m13, m13
+
+
+ vpmovsxwd m6, [r0]
+ vcvtdq2pd m9, xm6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2dq xm8, m9
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13
+ movu [r1], m13
+
+ vpmovsxwd m6, [r0 + 2 * mmsize]
+ vcvtdq2pd m9, xm6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2dq xm8, m9
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13
+ movu [r1 + 8 * mmsize], m13
+
+ vpmovsxwd m6, [r0 + 4 * mmsize]
+ vcvtdq2pd m9, xm6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2dq xm8, m9
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13
+ movu [r1 + 16 * mmsize], m13
+
+ vpmovsxwd m6, [r0 + 6 * mmsize]
+ vcvtdq2pd m9, xm6
+ vfmadd213pd m9, m9, m3
+ vcvtpd2dq xm8, m9
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
+ paddq m4, m13
+ movu [r1 + 24 * mmsize], m13
+
+ vextracti128 xm2, m4, 1
+ paddq xm4, xm2
+ punpckhqdq xm2, xm4, xm3
+ paddq xm4, xm2
+
+ paddq xm0, xm4
+ paddq xm1, xm4
+
+ movq [r2], xm0
+ movq [r3], xm1
+ RET
+
%endif
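
Each unrolled block above handles four coefficients (one coding-group row) with the same pipeline: sign-extend four int16 coefficients (vpmovsxwd), widen to double (vcvtdq2pd), square via an FMA with a zero addend (vfmadd213pd m9, m9, m3 computes m9 * m9 + 0, mirroring the (double) cast in the C reference), convert back to int32 (vcvtpd2dq), widen to int64 (vpmovsxdq), apply the per-size scaleBits shift (vpsllq), then accumulate (paddq) and store. Only the address offsets differ across the four kernels, since the row stride is trSize * 2 bytes on the int16 input and trSize * 8 bytes on the int64 output. A scalar C++ rendering of one such block, under those assumptions:

    #include <cstdint>

    // Scalar sketch of one unrolled 4-lane block of psyRdoQuant_1p
    // (illustrative; the asm does all four lanes in one YMM pass).
    static void onePass4(const int16_t *resi, int64_t *costUncoded,
                         int64_t &acc, int scaleBits)
    {
        for (int lane = 0; lane < 4; lane++)
        {
            // square in double precision, as the (double) cast does in C
            double sq = (double)resi[lane] * (double)resi[lane];
            // back to int32, then int64: exact, since a squared int16
            // always fits in int32
            int64_t cost = (int64_t)(int32_t)sq << scaleBits; // vpsllq
            costUncoded[lane] = cost;                         // movu
            acc += cost;        // paddq; folded into the totals at the end
        }
    }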
diff -r c9f622347ce5 -r d55132eb4201 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Wed Mar 07 09:34:37 2018 +0530
+++ b/source/common/x86/dct8.h Fri Apr 13 14:21:12 2018 +0530
@@ -37,6 +37,8 @@
FUNCDEF_TU_S2(void, nonPsyRdoQuant, avx512, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
FUNCDEF_TU_S2(void, psyRdoQuant, avx512, int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost, int64_t *psyScale, uint32_t blkPos);
FUNCDEF_TU_S2(void, nonPsyRdoQuant, avx2, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
+FUNCDEF_TU_S2(void, psyRdoQuant_1p, avx2, int16_t* m_resiDctCoeff, int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost, uint32_t blkPos);
+FUNCDEF_TU_S2(void, psyRdoQuant_2p, avx2, int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost, int64_t *psyScale, uint32_t blkPos);
void PFX(dst4_ssse3)(const int16_t* src, int16_t* dst, intptr_t srcStride);
void PFX(dst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
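
FUNCDEF_TU_S2 expands to one prototype per transform-unit size, matching the symbols registered in asm-primitives.cpp. The effect of the psyRdoQuant_1p line is roughly the following (illustrative expansion, not the literal macro text):

    // One declaration per TU size, as consumed by asm-primitives.cpp.
    void PFX(psyRdoQuant_1p4_avx2)(int16_t*, int64_t*, int64_t*, int64_t*, uint32_t);
    void PFX(psyRdoQuant_1p8_avx2)(int16_t*, int64_t*, int64_t*, int64_t*, uint32_t);
    void PFX(psyRdoQuant_1p16_avx2)(int16_t*, int64_t*, int64_t*, int64_t*, uint32_t);
    void PFX(psyRdoQuant_1p32_avx2)(int16_t*, int64_t*, int64_t*, int64_t*, uint32_t);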
diff -r c9f622347ce5 -r d55132eb4201 source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp Wed Mar 07 09:34:37 2018 +0530
+++ b/source/test/mbdstharness.cpp Fri Apr 13 14:21:12 2018 +0530
@@ -370,7 +370,49 @@
return true;
}
+bool MBDstHarness::check_psyRdoQuant_primitive_avx2(psyRdoQuant_t1 ref, psyRdoQuant_t1 opt)
+{
+ int j = 0;
+ int trSize[4] = { 16, 64, 256, 1024 };
+ ALIGN_VAR_32(int64_t, ref_dest[4 * MAX_TU_SIZE]);
+ ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
+
+ for (int i = 0; i < ITERS; i++)
+ {
+ int64_t totalRdCostRef = rand();
+ int64_t totalUncodedCostRef = rand();
+ int64_t totalRdCostOpt = totalRdCostRef;
+ int64_t totalUncodedCostOpt = totalUncodedCostRef;
+
+ int index = rand() % 4;
+ uint32_t blkPos = trSize[index];
+ int cmp_size = 4 * MAX_TU_SIZE;
+
+ memset(ref_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
+ memset(opt_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
+
+ int index1 = rand() % TEST_CASES;
+
+ ref(short_test_buff[index1] + j, ref_dest, &totalUncodedCostRef, &totalRdCostRef, blkPos);
+ checked(opt, short_test_buff[index1] + j, opt_dest, &totalUncodedCostOpt, &totalRdCostOpt, blkPos);
+
+
+ if (memcmp(ref_dest, opt_dest, cmp_size))
+ return false;
+
+ if (totalUncodedCostRef != totalUncodedCostOpt)
+ return false;
+
+ if (totalRdCostRef != totalRdCostOpt)
+ return false;
+
+ reportfail();
+ j += INCR;
+ }
+
+ return true;
+}
bool MBDstHarness::check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt)
{
int j = 0;
@@ -530,7 +572,17 @@
}
}
}
-
+ for (int i = 0; i < NUM_TR_SIZE; i++)
+ {
+ if (opt.cu[i].psyRdoQuant_1p)
+ {
+ if (!check_psyRdoQuant_primitive_avx2(ref.cu[i].psyRdoQuant_1p, opt.cu[i].psyRdoQuant_1p))
+ {
+ printf("psyRdoQuant_1p[%dx%d]: Failed!\n", 4 << i, 4 << i);
+ return false;
+ }
+ }
+ }
for (int i = 0; i < NUM_TR_SIZE; i++)
{
if (opt.cu[i].count_nonzero)
@@ -643,7 +695,17 @@
REPORT_SPEEDUP(opt.cu[value].psyRdoQuant, ref.cu[value].psyRdoQuant, short_test_buff[0], short_test_buff1[0], opt_dest, &totalUncodedCost, &totalRdCost, psyScale, 0);
}
}
-
+ for (int value = 0; value < NUM_TR_SIZE; value++)
+ {
+ if (opt.cu[value].psyRdoQuant_1p)
+ {
+ ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
+ int64_t totalRdCost = 0;
+ int64_t totalUncodedCost = 0;
+ printf("psyRdoQuant_1p[%dx%d]", 4 << value, 4 << value);
+ REPORT_SPEEDUP(opt.cu[value].psyRdoQuant_1p, ref.cu[value].psyRdoQuant_1p, short_test_buff[0], opt_dest, &totalUncodedCost, &totalRdCost, 0);
+ }
+ }
for (int value = 0; value < NUM_TR_SIZE; value++)
{
if (opt.cu[value].count_nonzero)
diff -r c9f622347ce5 -r d55132eb4201 source/test/mbdstharness.h
--- a/source/test/mbdstharness.h Wed Mar 07 09:34:37 2018 +0530
+++ b/source/test/mbdstharness.h Fri Apr 13 14:21:12 2018 +0530
@@ -71,6 +71,7 @@
bool check_idct_primitive(idct_t ref, idct_t opt, intptr_t width);
bool check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt);
bool check_denoise_dct_primitive(denoiseDct_t ref, denoiseDct_t opt);
+ bool check_psyRdoQuant_primitive_avx2(psyRdoQuant_t1 ref, psyRdoQuant_t1 opt);
public: