[x265] [PATCH 120 of 307] x86: Link add_ps_aligned primitive to encoder
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:58 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1507092075 -19800
# Wed Oct 04 10:11:15 2017 +0530
# Node ID a78accbf7387dfe43ba59367b286af31d77e7c8f
# Parent 44433ded38d00c79fa52e69e7c5c5127009f9ede
x86: Link add_ps_aligned primitive to encoder
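
For reference, here is a minimal standalone sketch of the dispatch pattern this patch repeats at every add_ps call site: the aligned AVX-512 kernel is used only when the block start offsets and all three strides are multiples of 64 and the CPU flag is set. This is not x265 code; the function-pointer type, the dispatcher and the hasAVX512 flag are hypothetical stand-ins for primitives.cu[sizeIdx].add_ps / add_ps_aligned and (m_param->cpuid & X265_CPU_AVX512), and it assumes the plane buffers themselves are allocated on 64-byte boundaries, so that offsets and strides that are multiples of 64 keep every row 64-byte aligned at 8-bit depth.

/* Sketch only: aligned/unaligned residual-add dispatch, hypothetical names. */
#include <cstdint>

typedef uint8_t pixel;   /* stand-in; x265 uses uint16_t at high bit depth */

typedef void (*add_ps_fn)(pixel* dst, intptr_t dstStride,
                          const pixel* pred, const int16_t* resi,
                          intptr_t predStride, intptr_t resiStride);

static inline bool aligned64(intptr_t v) { return (v % 64) == 0; }

static void addResidualDispatch(add_ps_fn add_ps, add_ps_fn add_ps_aligned, bool hasAVX512,
                                pixel* dst, intptr_t dstOffset, intptr_t dstStride,
                                const pixel* pred, intptr_t predOffset, intptr_t predStride,
                                const int16_t* resi, intptr_t resiOffset, intptr_t resiStride)
{
    /* Every row of every operand must start on a 64-byte boundary before the
     * aligned ZMM loads/stores are safe, hence offsets and strides are all
     * checked against 64. */
    bool ok = aligned64(dstOffset)  && aligned64(dstStride)  &&
              aligned64(predOffset) && aligned64(predStride) &&
              aligned64(resiOffset) && aligned64(resiStride);

    if (ok && hasAVX512)
        add_ps_aligned(dst, dstStride, pred, resi, predStride, resiStride);
    else
        add_ps(dst, dstStride, pred, resi, predStride, resiStride);
}

In the patch itself the same check is written inline at each call site, with the offsets taken from getAddrOffset()/getChromaAddrOffset() or from the reconstructed picture's m_cuOffset/m_buOffset tables.
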
diff -r 44433ded38d0 -r a78accbf7387 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp Fri Oct 06 14:00:56 2017 +0530
+++ b/source/encoder/analysis.cpp Wed Oct 04 10:11:15 2017 +0530
@@ -3325,8 +3325,17 @@
* resiYuv. Generate the recon pixels by adding it to the prediction */
if (cu.m_cbf[0][0])
- primitives.cu[sizeIdx].add_ps(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
- predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
+ {
+ bool reconPicAlign = (reconPic.m_cuOffsetY[cu.m_cuAddr] + reconPic.m_buOffsetY[absPartIdx]) % 64 == 0;
+ bool predYalign = predYuv.getAddrOffset(absPartIdx, predYuv.m_size) % 64 == 0;
+ if (reconPicAlign && predYalign && (reconPic.m_stride % 64 == 0) && (predYuv.m_size % 64 == 0) && (resiYuv.m_size % 64 == 0) &&
+ reconPic.m_param->cpuid & X265_CPU_AVX512)
+ primitives.cu[sizeIdx].add_ps_aligned(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
+ predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
+ else
+ primitives.cu[sizeIdx].add_ps(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
+ predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
+ }
else
primitives.cu[sizeIdx].copy_pp(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
predY, predYuv.m_size);
@@ -3334,16 +3343,34 @@
{
pixel* predU = predYuv.getCbAddr(absPartIdx);
pixel* predV = predYuv.getCrAddr(absPartIdx);
- if (cu.m_cbf[1][0])
- primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
- predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
+ if (cu.m_cbf[1][0])
+ {
+ bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
+ bool predUalign = predYuv.getChromaAddrOffset(absPartIdx) % 64 == 0;
+ if (reconPicAlign && predUalign && (reconPic.m_strideC % 64 == 0) && (predYuv.m_csize % 64 == 0) && (resiYuv.m_csize % 64 == 0) &&
+ reconPic.m_param->cpuid & X265_CPU_AVX512)
+ primitives.chroma[m_csp].cu[sizeIdx].add_ps_aligned(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
+ predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
+ else
+ primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
+ predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
+ }
else
primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
predU, predYuv.m_csize);
- if (cu.m_cbf[2][0])
- primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
- predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
+ if (cu.m_cbf[2][0])
+ {
+ bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
+ bool predValign = predYuv.getChromaAddrOffset(absPartIdx) % 64 == 0;
+ if (reconPicAlign && predValign && (reconPic.m_strideC % 64 == 0) && (predYuv.m_csize % 64 == 0) && (resiYuv.m_csize % 64 == 0) &&
+ reconPic.m_param->cpuid & X265_CPU_AVX512)
+ primitives.chroma[m_csp].cu[sizeIdx].add_ps_aligned(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
+ predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
+ else
+ primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
+ predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
+ }
else
primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
predV, predYuv.m_csize);
diff -r 44433ded38d0 -r a78accbf7387 source/encoder/search.cpp
--- a/source/encoder/search.cpp Fri Oct 06 14:00:56 2017 +0530
+++ b/source/encoder/search.cpp Wed Oct 04 10:11:15 2017 +0530
@@ -363,7 +363,13 @@
if (numSig)
{
m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
- primitives.cu[sizeIdx].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
+ bool reconQtYuvAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
+ bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
+ bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
+ if (reconQtStride % 64 == 0 && stride % 64 == 0 && reconQtYuvAlign && predAlign && residualAlign && cu.m_encData->m_param->cpuid & X265_CPU_AVX512)
+ primitives.cu[sizeIdx].add_ps_aligned(reconQt, reconQtStride, pred, residual, stride, stride);
+ else
+ primitives.cu[sizeIdx].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
}
else
// no coded residual, recon = pred
@@ -561,6 +567,7 @@
coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffY);
pixel* tmpRecon = (useTSkip ? m_tsRecon : reconQt);
+ bool tmpReconAlign = (useTSkip ? 1 : (m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0));
uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
if ((stride % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
@@ -572,7 +579,12 @@
if (numSig)
{
m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
- primitives.cu[sizeIdx].add_ps(tmpRecon, tmpReconStride, pred, residual, stride, stride);
+ bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size) % 64 == 0;
+ bool predAlign = predYuv->getAddrOffset(absPartIdx, predYuv->m_size) % 64 == 0;
+ if (stride % 64 == 0 && tmpReconStride % 64 == 0 && tmpReconAlign && residualAlign && predAlign && m_param->cpuid & X265_CPU_AVX512)
+ primitives.cu[sizeIdx].add_ps_aligned(tmpRecon, tmpReconStride, pred, residual, stride, stride);
+ else
+ primitives.cu[sizeIdx].add_ps(tmpRecon, tmpReconStride, pred, residual, stride, stride);
}
else if (useTSkip)
{
@@ -732,7 +744,13 @@
if (numSig)
{
m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
- primitives.cu[sizeIdx].add_ps(picReconY, picStride, pred, residual, stride, stride);
+ bool picReconYAlign = (reconPic->m_cuOffsetY[cu.m_cuAddr] + reconPic->m_buOffsetY[cuGeom.absPartIdx + absPartIdx]) % 64 == 0;
+ bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
+ bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size) % 64 == 0;
+ if (picStride % 64 == 0 && stride % 64 == 0 && picReconYAlign && predAlign && residualAlign && m_param->cpuid & X265_CPU_AVX512)
+ primitives.cu[sizeIdx].add_ps_aligned(picReconY, picStride, pred, residual, stride, stride);
+ else
+ primitives.cu[sizeIdx].add_ps(picReconY, picStride, pred, residual, stride, stride);
cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
}
else
@@ -910,7 +928,13 @@
if (numSig)
{
m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
- primitives.cu[sizeIdxC].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
+ bool reconQtAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
+ bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
+ bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
+ if (reconQtAlign && predAlign && residualAlign && (reconQtStride % 64 == 0) && (stride % 64 == 0) && m_param->cpuid & X265_CPU_AVX512)
+ primitives.cu[sizeIdxC].add_ps_aligned(reconQt, reconQtStride, pred, residual, stride, stride);
+ else
+ primitives.cu[sizeIdxC].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
else
@@ -1013,7 +1037,13 @@
if (numSig)
{
m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
- primitives.cu[sizeIdxC].add_ps(recon, reconStride, pred, residual, stride, stride);
+ bool reconAlign = (useTSkip ? 1 : (m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0));
+ bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
+ bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
+ if (reconAlign && predYuvAlign && residualAlign && (reconStride % 64 == 0) && (stride % 64 == 0) && m_param->cpuid & X265_CPU_AVX512)
+ primitives.cu[sizeIdxC].add_ps_aligned(recon, reconStride, pred, residual, stride, stride);
+ else
+ primitives.cu[sizeIdxC].add_ps(recon, reconStride, pred, residual, stride, stride);
cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
else if (useTSkip)
@@ -1207,7 +1237,13 @@
if (numSig)
{
m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
- primitives.cu[sizeIdxC].add_ps(picReconC, picStride, pred, residual, stride, stride);
+ bool picReconCAlign = (reconPic->m_cuOffsetC[cu.m_cuAddr] + reconPic->m_buOffsetC[cuGeom.absPartIdx + absPartIdxC]) % 64 == 0;
+ bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
+ bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
+ if (picReconCAlign && predAlign && residualAlign && (picStride % 64 == 0) && (stride % 64 == 0) && m_param->cpuid & X265_CPU_AVX512)
+ primitives.cu[sizeIdxC].add_ps_aligned(picReconC, picStride, pred, residual, stride, stride);
+ else
+ primitives.cu[sizeIdxC].add_ps(picReconC, picStride, pred, residual, stride, stride);
cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
else
@@ -3223,8 +3259,14 @@
// non-zero cost calculation for luma - This is an approximation
// finally we have to encode correct cbf after comparing with null cost
pixel* curReconY = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
+ bool curReconYAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0;
uint32_t strideReconY = m_rqt[qtLayer].reconQtYuv.m_size;
- primitives.cu[partSize].add_ps(curReconY, strideReconY, mode.predYuv.getLumaAddr(absPartIdx), curResiY, mode.predYuv.m_size, strideResiY);
+ bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
+ bool curResiYAlign = m_rqt[qtLayer].resiQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].resiQtYuv.m_size) % 64 == 0;
+ if (curReconYAlign && predYuvAlign && curResiYAlign && (strideReconY % 64 == 0) && (mode.predYuv.m_size % 64 == 0) && (strideResiY % 64 == 0) && m_param->cpuid & X265_CPU_AVX512)
+ primitives.cu[partSize].add_ps_aligned(curReconY, strideReconY, mode.predYuv.getLumaAddr(absPartIdx), curResiY, mode.predYuv.m_size, strideResiY);
+ else
+ primitives.cu[partSize].add_ps(curReconY, strideReconY, mode.predYuv.getLumaAddr(absPartIdx), curResiY, mode.predYuv.m_size, strideResiY);
const sse_t nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, curReconY, strideReconY);
uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
@@ -3345,7 +3387,13 @@
// finally we have to encode correct cbf after comparing with null cost
pixel* curReconC = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t strideReconC = m_rqt[qtLayer].reconQtYuv.m_csize;
- primitives.cu[partSizeC].add_ps(curReconC, strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, mode.predYuv.m_csize, strideResiC);
+ bool curReconCAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
+ bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
+ bool curResiCAlign = m_rqt[qtLayer].resiQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
+ if (curReconCAlign && predYuvAlign && curResiCAlign && (strideReconC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (strideResiC % 64 == 0) && m_param->cpuid & X265_CPU_AVX512)
+ primitives.cu[partSizeC].add_ps_aligned(curReconC, strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, mode.predYuv.m_csize, strideResiC);
+ else
+ primitives.cu[partSizeC].add_ps(curReconC, strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, mode.predYuv.m_csize, strideResiC);
sse_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, curReconC, strideReconC));
uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
uint32_t nonZeroEnergyC = 0; uint64_t singleCostC = 0;
@@ -3455,8 +3503,12 @@
const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();
m_quant.invtransformNxN(cu, m_tsResidual, trSize, m_tsCoeff, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
-
- primitives.cu[partSize].add_ps(m_tsRecon, trSize, mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, trSize);
+ bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
+
+ if (predYuvAlign && (trSize % 64 == 0) && (mode.predYuv.m_size % 64 == 0) && m_param->cpuid & X265_CPU_AVX512)
+ primitives.cu[partSize].add_ps_aligned(m_tsRecon, trSize, mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, trSize);
+ else
+ primitives.cu[partSize].add_ps(m_tsRecon, trSize, mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, trSize);
nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, m_tsRecon, trSize);
if (m_rdCost.m_psyRd)
@@ -3533,7 +3585,11 @@
m_quant.invtransformNxN(cu, m_tsResidual, trSizeC, m_tsCoeff,
log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
- primitives.cu[partSizeC].add_ps(m_tsRecon, trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, mode.predYuv.m_csize, trSizeC);
+ bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
+ if (predYuvAlign && (trSizeC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && m_param->cpuid & X265_CPU_AVX512)
+ primitives.cu[partSizeC].add_ps_aligned(m_tsRecon, trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, mode.predYuv.m_csize, trSizeC);
+ else
+ primitives.cu[partSizeC].add_ps(m_tsRecon, trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, mode.predYuv.m_csize, trSizeC);
nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, m_tsRecon, trSizeC));
if (m_rdCost.m_psyRd)
{