[x265] [PATCH 115 of 307] x86: Aligned routine encoder integration for blockfill_s primitive
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:53 CEST 2018
# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1507182370 -19800
# Thu Oct 05 11:16:10 2017 +0530
# Node ID d4ee703039c6cde39312a596cee019c346a8381b
# Parent 14c93ddbd598128b43a96ff21221e2dbb189d275
x86: Aligned routine encoder integration for blockfill_s primitive
diff -r 14c93ddbd598 -r d4ee703039c6 source/common/quant.cpp
--- a/source/common/quant.cpp Wed Oct 04 15:55:03 2017 +0530
+++ b/source/common/quant.cpp Thu Oct 05 11:16:10 2017 +0530
@@ -188,8 +188,9 @@
m_nr = NULL;
}
-bool Quant::init(double psyScale, const ScalingList& scalingList, Entropy& entropy)
+bool Quant::init(double psyScale, const ScalingList& scalingList, Entropy& entropy, int cpuid)
{
+ m_cpuid = cpuid;
m_entropyCoder = &entropy;
m_psyRdoqScale = (int32_t)(psyScale * 256.0);
X265_CHECK((psyScale * 256.0) < (double)MAX_INT, "psyScale value too large\n");
@@ -611,7 +612,10 @@
const int add_2nd = 1 << (shift_2nd - 1);
int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd;
- primitives.cu[sizeIdx].blockfill_s(residual, resiStride, (int16_t)dc_val);
+ if ((resiStride % 64 == 0) && (m_cpuid & X265_CPU_AVX512))
+ primitives.cu[sizeIdx].blockfill_s_aligned(residual, resiStride, (int16_t)dc_val);
+ else
+ primitives.cu[sizeIdx].blockfill_s(residual, resiStride, (int16_t)dc_val);
return;
}
diff -r 14c93ddbd598 -r d4ee703039c6 source/common/quant.h
--- a/source/common/quant.h Wed Oct 04 15:55:03 2017 +0530
+++ b/source/common/quant.h Thu Oct 05 11:16:10 2017 +0530
@@ -93,6 +93,7 @@
public:
+ int m_cpuid;
NoiseReduction* m_nr;
NoiseReduction* m_frameNr; // Array of NR structures, one for each frameEncoder
@@ -100,7 +101,7 @@
~Quant();
/* one-time setup */
- bool init(double psyScale, const ScalingList& scalingList, Entropy& entropy);
+ bool init(double psyScale, const ScalingList& scalingList, Entropy& entropy, int cpuid);
bool allocNoiseReduction(const x265_param& param);
/* CU setup */
diff -r 14c93ddbd598 -r d4ee703039c6 source/encoder/search.cpp
--- a/source/encoder/search.cpp Wed Oct 04 15:55:03 2017 +0530
+++ b/source/encoder/search.cpp Thu Oct 05 11:16:10 2017 +0530
@@ -81,7 +81,7 @@
m_rdCost.setSsimRd(param.bSsimRd);
m_me.init(param.internalCsp);
- bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder);
+ bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder, param.cpuid);
if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
ok &= m_quant.allocNoiseReduction(param);
@@ -2914,7 +2914,10 @@
}
else
{
- primitives.cu[sizeIdx].blockfill_s(curResiY, strideResiY, 0);
+ if ((strideResiY % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+ primitives.cu[sizeIdx].blockfill_s_aligned(curResiY, strideResiY, 0);
+ else
+ primitives.cu[sizeIdx].blockfill_s(curResiY, strideResiY, 0);
cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth);
}
@@ -2947,7 +2950,10 @@
}
else
{
- primitives.cu[sizeIdxC].blockfill_s(curResiU, strideResiC, 0);
+ if ((strideResiC % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+ primitives.cu[sizeIdxC].blockfill_s_aligned(curResiU, strideResiC, 0);
+ else
+ primitives.cu[sizeIdxC].blockfill_s(curResiU, strideResiC, 0);
cu.setCbfPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
}
@@ -2961,7 +2967,11 @@
}
else
{
- primitives.cu[sizeIdxC].blockfill_s(curResiV, strideResiC, 0);
+ if ((strideResiC % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+ primitives.cu[sizeIdxC].blockfill_s_aligned(curResiV, strideResiC, 0);
+ else
+ primitives.cu[sizeIdxC].blockfill_s(curResiV, strideResiC, 0);
+
cu.setCbfPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
}
}
@@ -3229,7 +3239,10 @@
{
cbfFlag[TEXT_LUMA][0] = 0;
singleBits[TEXT_LUMA][0] = 0;
- primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
+ if ((strideResiY % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+ primitives.cu[partSize].blockfill_s_aligned(curResiY, strideResiY, 0);
+ else
+ primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
#if CHECKED_BUILD || _DEBUG
uint32_t numCoeffY = 1 << (log2TrSize << 1);
memset(coeffCurY, 0, sizeof(coeff_t)* numCoeffY);
@@ -3252,7 +3265,10 @@
{
if (checkTransformSkipY)
minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
- primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
+ if ((strideResiY % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+ primitives.cu[partSize].blockfill_s_aligned(curResiY, strideResiY, 0);
+ else
+ primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
singleDist[TEXT_LUMA][0] = zeroDistY;
singleBits[TEXT_LUMA][0] = 0;
singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
@@ -3341,7 +3357,10 @@
{
cbfFlag[chromaId][tuIterator.section] = 0;
singleBits[chromaId][tuIterator.section] = 0;
- primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
+ if ((strideResiC % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+ primitives.cu[partSizeC].blockfill_s_aligned(curResiC, strideResiC, 0);
+ else
+ primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
#if CHECKED_BUILD || _DEBUG
uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
@@ -3364,7 +3383,10 @@
{
if (checkTransformSkipC)
minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId);
- primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
+ if ((strideResiC % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+ primitives.cu[partSizeC].blockfill_s_aligned(curResiC, strideResiC, 0);
+ else
+ primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
singleBits[chromaId][tuIterator.section] = 0;
singleDist[chromaId][tuIterator.section] = zeroDistC;
singleEnergy[chromaId][tuIterator.section] = zeroEnergyC;
More information about the x265-devel mailing list