[x265] [PATCH 12/14] Implement ASM for SAD used for motion estimation
Snehaa Giridharan
snehaa at multicorewareinc.com
Wed Oct 19 07:31:55 UTC 2022
From d4c3b24ffa43d283b4b92b64a164c9c7de878c7f Mon Sep 17 00:00:00 2001
From: ashok2022 <ashok at multicorewareinc.com>
Date: Fri, 14 Oct 2022 11:30:23 +0530
Subject: [PATCH] Implement ASM for SAD used for motion estimation
---
source/common/temporalfilter.cpp | 185 ++++++++++++++++++++++++++++---
source/common/temporalfilter.h | 14 ++-
2 files changed, 184 insertions(+), 15 deletions(-)
diff --git a/source/common/temporalfilter.cpp b/source/common/temporalfilter.cpp
index f3e2dc73c..72edaaac1 100644
--- a/source/common/temporalfilter.cpp
+++ b/source/common/temporalfilter.cpp
@@ -144,6 +144,7 @@ TemporalFilter::TemporalFilter()
m_QP = 0;
m_sliceTypeConfig = 3;
m_numRef = 0;
+ m_useSADinME = 1;
m_range = 2;
m_chromaFactor = 0.55;
@@ -190,7 +191,7 @@ fail:
return 0;
}
-int TemporalFilter::motionErrorLuma(
+int TemporalFilter::motionErrorLumaSAD(
PicYuv *orig,
PicYuv *buffer,
int x,
@@ -215,6 +216,110 @@ int TemporalFilter::motionErrorLuma(
#if 0
const pixel* origRowStart = origOrigin + y *origStride + x;
+ for (int y1 = 0; y1 < bs; y1++)
+ {
+ for (int x1 = 0; x1 < bs; x1++)
+ {
+ int diff = origRowStart[x1] - bufferRowStart[x1];
+ error += abs(diff);
+ }
+
+ origRowStart += origStride;
+ bufferRowStart += buffStride;
+ }
+#else
+ int partEnum = partitionFromSizes(bs, bs);
+ /* copy PU block into cache */
+ primitives.pu[partEnum].copy_pp(predPUYuv.m_buf[0], FENC_STRIDE,
bufferRowStart, buffStride);
+
+ error = m_metld->me.bufSAD(predPUYuv.m_buf[0], FENC_STRIDE);
+#endif
+ if (error > besterror)
+ {
+ return error;
+ }
+ }
+ else
+ {
+ const int *xFilter = s_interpolationFilter[dx & 0xF];
+ const int *yFilter = s_interpolationFilter[dy & 0xF];
+ int tempArray[64 + 8][64];
+
+ int iSum, iBase;
+ for (int y1 = 1; y1 < bs + 7; y1++)
+ {
+ const int yOffset = y + y1 + (dy >> 4) - 3;
+ const pixel *sourceRow = buffOrigin + (yOffset)*buffStride + 0;
+ for (int x1 = 0; x1 < bs; x1++)
+ {
+ iSum = 0;
+ iBase = x + x1 + (dx >> 4) - 3;
+ const pixel *rowStart = sourceRow + iBase;
+
+ iSum += xFilter[1] * rowStart[1];
+ iSum += xFilter[2] * rowStart[2];
+ iSum += xFilter[3] * rowStart[3];
+ iSum += xFilter[4] * rowStart[4];
+ iSum += xFilter[5] * rowStart[5];
+ iSum += xFilter[6] * rowStart[6];
+
+ tempArray[y1][x1] = iSum;
+ }
+ }
+
+ const pixel maxSampleValue = (1 << m_bitDepth) - 1;
+ for (int y1 = 0; y1 < bs; y1++)
+ {
+ const pixel *origRow = origOrigin + (y + y1)*origStride + 0;
+ for (int x1 = 0; x1 < bs; x1++)
+ {
+ iSum = 0;
+ iSum += yFilter[1] * tempArray[y1 + 1][x1];
+ iSum += yFilter[2] * tempArray[y1 + 2][x1];
+ iSum += yFilter[3] * tempArray[y1 + 3][x1];
+ iSum += yFilter[4] * tempArray[y1 + 4][x1];
+ iSum += yFilter[5] * tempArray[y1 + 5][x1];
+ iSum += yFilter[6] * tempArray[y1 + 6][x1];
+
+ iSum = (iSum + (1 << 11)) >> 12;
+ iSum = iSum < 0 ? 0 : (iSum > maxSampleValue ?
maxSampleValue : iSum);
+
+ error += abs(iSum - origRow[x + x1]);
+ }
+ if (error > besterror)
+ {
+ return error;
+ }
+ }
+ }
+ return error;
+}
+
+int TemporalFilter::motionErrorLumaSSD(
+ PicYuv *orig,
+ PicYuv *buffer,
+ int x,
+ int y,
+ int dx,
+ int dy,
+ int bs,
+ int besterror)
+{
+
+ pixel* origOrigin = orig->m_picOrg[0];
+ intptr_t origStride = orig->m_stride;
+ pixel *buffOrigin = buffer->m_picOrg[0];
+ intptr_t buffStride = buffer->m_stride;
+ int error = 0;// dx * 10 + dy * 10;
+ if (((dx | dy) & 0xF) == 0)
+ {
+ dx /= m_motionVectorFactor;
+ dy /= m_motionVectorFactor;
+
+ const pixel* bufferRowStart = buffOrigin + (y + dy) * buffStride +
(x + dx);
+#if 0
+ const pixel* origRowStart = origOrigin + y * origStride + x;
+
for (int y1 = 0; y1 < bs; y1++)
{
for (int x1 = 0; x1 < bs; x1++)
@@ -771,6 +876,7 @@ void TemporalFilter::motionEstimationLuma(MV *mvs,
uint32_t mvStride, PicYuv *or
const int origWidth = orig->m_picWidth;
const int origHeight = orig->m_picHeight;
+ int error;
for (int blockY = 0; blockY + blockSize <= origHeight; blockY +=
stepSize)
{
@@ -802,7 +908,12 @@ void TemporalFilter::motionEstimationLuma(MV *mvs,
uint32_t mvStride, PicYuv *or
{
int mvIdx = testy * prevMvStride + testx;
MV old = previous[mvIdx];
- int error = motionErrorLuma(orig, buffer,
blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);
+
+ if (m_useSADinME)
+ error = motionErrorLumaSAD(orig, buffer,
blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);
+ else
+ error = motionErrorLumaSSD(orig, buffer,
blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);
+
if (error < leastError)
{
best.set(old.x * factor, old.y * factor);
@@ -812,7 +923,11 @@ void TemporalFilter::motionEstimationLuma(MV *mvs,
uint32_t mvStride, PicYuv *or
}
}
- int error = motionErrorLuma(orig, buffer, blockX, blockY,
0, 0, blockSize, leastError);
+ if (m_useSADinME)
+ error = motionErrorLumaSAD(orig, buffer, blockX,
blockY, 0, 0, blockSize, leastError);
+ else
+ error = motionErrorLumaSSD(orig, buffer, blockX,
blockY, 0, 0, blockSize, leastError);
+
if (error < leastError)
{
best.set(0, 0);
@@ -826,7 +941,10 @@ void TemporalFilter::motionEstimationLuma(MV *mvs,
uint32_t mvStride, PicYuv *or
{
for (int x2 = prevBest.x / m_motionVectorFactor - range;
x2 <= prevBest.x / m_motionVectorFactor + range; x2++)
{
- int error = motionErrorLuma(orig, buffer, blockX,
blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize,
leastError);
+ if (m_useSADinME)
+ error = motionErrorLumaSAD(orig, buffer, blockX,
blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize,
leastError);
+ else
+ error = motionErrorLumaSSD(orig, buffer, blockX,
blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize,
leastError);
if (error < leastError)
{
best.set(x2 * m_motionVectorFactor, y2 *
m_motionVectorFactor);
@@ -839,7 +957,12 @@ void TemporalFilter::motionEstimationLuma(MV *mvs,
uint32_t mvStride, PicYuv *or
{
int idx = ((blockY - stepSize) / stepSize) * mvStride +
(blockX / stepSize);
MV aboveMV = mvs[idx];
- int error = motionErrorLuma(orig, buffer, blockX, blockY,
aboveMV.x, aboveMV.y, blockSize, leastError);
+
+ if (m_useSADinME)
+ error = motionErrorLumaSAD(orig, buffer, blockX,
blockY, aboveMV.x, aboveMV.y, blockSize, leastError);
+ else
+ error = motionErrorLumaSSD(orig, buffer, blockX,
blockY, aboveMV.x, aboveMV.y, blockSize, leastError);
+
if (error < leastError)
{
best.set(aboveMV.x, aboveMV.y);
@@ -852,7 +975,11 @@ void TemporalFilter::motionEstimationLuma(MV *mvs,
uint32_t mvStride, PicYuv *or
int idx = ((blockY / stepSize) * mvStride + (blockX -
stepSize) / stepSize);
MV leftMV = mvs[idx];
- int error = motionErrorLuma(orig, buffer, blockX, blockY,
leftMV.x, leftMV.y, blockSize, leastError);
+ if (m_useSADinME)
+ error = motionErrorLumaSAD(orig, buffer, blockX,
blockY, leftMV.x, leftMV.y, blockSize, leastError);
+ else
+ error = motionErrorLumaSSD(orig, buffer, blockX,
blockY, leftMV.x, leftMV.y, blockSize, leastError);
+
if (error < leastError)
{
best.set(leftMV.x, leftMV.y);
@@ -903,6 +1030,7 @@ void TemporalFilter::motionEstimationLumaDoubleRes(MV
*mvs, uint32_t mvStride, P
const int origWidth = orig->m_picWidth;
const int origHeight = orig->m_picHeight;
+ int error;
for (int blockY = 0; blockY + blockSize <= origHeight; blockY +=
stepSize)
{
@@ -934,7 +1062,12 @@ void TemporalFilter::motionEstimationLumaDoubleRes(MV
*mvs, uint32_t mvStride, P
{
int mvIdx = testy * prevMvStride + testx;
MV old = previous[mvIdx];
- int error = motionErrorLuma(orig, buffer,
blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);
+
+ if (m_useSADinME)
+ error = motionErrorLumaSAD(orig, buffer,
blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);
+ else
+ error = motionErrorLumaSSD(orig, buffer,
blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);
+
if (error < leastError)
{
best.set(old.x * factor, old.y * factor);
@@ -944,7 +1077,11 @@ void TemporalFilter::motionEstimationLumaDoubleRes(MV
*mvs, uint32_t mvStride, P
}
}
- int error = motionErrorLuma(orig, buffer, blockX, blockY,
0, 0, blockSize, leastError);
+ if (m_useSADinME)
+ error = motionErrorLumaSAD(orig, buffer, blockX,
blockY, 0, 0, blockSize, leastError);
+ else
+ error = motionErrorLumaSSD(orig, buffer, blockX,
blockY, 0, 0, blockSize, leastError);
+
if (error < leastError)
{
best.set(0, 0);
@@ -958,7 +1095,11 @@ void TemporalFilter::motionEstimationLumaDoubleRes(MV
*mvs, uint32_t mvStride, P
{
for (int x2 = prevBest.x / m_motionVectorFactor - range;
x2 <= prevBest.x / m_motionVectorFactor + range; x2++)
{
- int error = motionErrorLuma(orig, buffer, blockX,
blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize,
leastError);
+ if (m_useSADinME)
+ error = motionErrorLumaSAD(orig, buffer, blockX,
blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize,
leastError);
+ else
+ error = motionErrorLumaSSD(orig, buffer, blockX,
blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize,
leastError);
+
if (error < leastError)
{
best.set(x2 * m_motionVectorFactor, y2 *
m_motionVectorFactor);
@@ -973,7 +1114,11 @@ void TemporalFilter::motionEstimationLumaDoubleRes(MV
*mvs, uint32_t mvStride, P
{
for (int x2 = prevBest.x - doubleRange; x2 <= prevBest.x +
doubleRange; x2 += 4)
{
- int error = motionErrorLuma(orig, buffer, blockX,
blockY, x2, y2, blockSize, leastError);
+ if (m_useSADinME)
+ error = motionErrorLumaSAD(orig, buffer, blockX,
blockY, x2, y2, blockSize, leastError);
+ else
+ error = motionErrorLumaSSD(orig, buffer, blockX,
blockY, x2, y2, blockSize, leastError);
+
if (error < leastError)
{
best.set(x2, y2);
@@ -988,7 +1133,11 @@ void TemporalFilter::motionEstimationLumaDoubleRes(MV
*mvs, uint32_t mvStride, P
{
for (int x2 = prevBest.x - doubleRange; x2 <= prevBest.x +
doubleRange; x2++)
{
- int error = motionErrorLuma(orig, buffer, blockX,
blockY, x2, y2, blockSize, leastError);
+ if (m_useSADinME)
+ error = motionErrorLumaSAD(orig, buffer, blockX,
blockY, x2, y2, blockSize, leastError);
+ else
+ error = motionErrorLumaSSD(orig, buffer, blockX,
blockY, x2, y2, blockSize, leastError);
+
if (error < leastError)
{
best.set(x2, y2);
@@ -1002,7 +1151,12 @@ void
TemporalFilter::motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, P
{
int idx = ((blockY - stepSize) / stepSize) * mvStride +
(blockX / stepSize);
MV aboveMV = mvs[idx];
- int error = motionErrorLuma(orig, buffer, blockX, blockY,
aboveMV.x, aboveMV.y, blockSize, leastError);
+
+ if (m_useSADinME)
+ error = motionErrorLumaSAD(orig, buffer, blockX,
blockY, aboveMV.x, aboveMV.y, blockSize, leastError);
+ else
+ error = motionErrorLumaSSD(orig, buffer, blockX,
blockY, aboveMV.x, aboveMV.y, blockSize, leastError);
+
if (error < leastError)
{
best.set(aboveMV.x, aboveMV.y);
@@ -1014,7 +1168,12 @@ void
TemporalFilter::motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, P
{
int idx = ((blockY / stepSize) * mvStride + (blockX -
stepSize) / stepSize);
MV leftMV = mvs[idx];
- int error = motionErrorLuma(orig, buffer, blockX, blockY,
leftMV.x, leftMV.y, blockSize, leastError);
+
+ if (m_useSADinME)
+ error = motionErrorLumaSAD(orig, buffer, blockX,
blockY, leftMV.x, leftMV.y, blockSize, leastError);
+ else
+ error = motionErrorLumaSSD(orig, buffer, blockX,
blockY, leftMV.x, leftMV.y, blockSize, leastError);
+
if (error < leastError)
{
best.set(leftMV.x, leftMV.y);
diff --git a/source/common/temporalfilter.h b/source/common/temporalfilter.h
index bdefa2824..f49fae3c5 100644
--- a/source/common/temporalfilter.h
+++ b/source/common/temporalfilter.h
@@ -159,7 +159,8 @@ public:
uint8_t m_sliceTypeConfig;
MotionEstimatorTLD* m_metld;
- Yuv predPUYuv;
+ Yuv predPUYuv;
+ int m_useSADinME;
void subsampleLuma(PicYuv *input, PicYuv *output, int factor = 2);
@@ -173,7 +174,16 @@ public:
void motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, PicYuv
*orig, PicYuv *buffer, int blockSize,
MV *previous, uint32_t prevMvStride, int factor, int* minError);
- int motionErrorLuma(PicYuv *orig,
+ int motionErrorLumaSSD(PicYuv *orig,
+ PicYuv *buffer,
+ int x,
+ int y,
+ int dx,
+ int dy,
+ int bs,
+ int besterror = 8 * 8 * 1024 * 1024);
+
+ int motionErrorLumaSAD(PicYuv *orig,
PicYuv *buffer,
int x,
int y,
--
2.34.1.windows.1
*Thanks and Regards,*
*Snehaa.G, Video Codec Engineer, Media & AI analytics
<https://multicorewareinc.com/>*
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20221019/215f8ef3/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: mcstf_patch_12.diff
Type: application/octet-stream
Size: 15818 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20221019/215f8ef3/attachment-0001.obj>
More information about the x265-devel
mailing list