<div dir="ltr"><div dir="ltr"><div>From d4c3b24ffa43d283b4b92b64a164c9c7de878c7f Mon Sep 17 00:00:00 2001</div><div>From: ashok2022 <<a href="mailto:ashok@multicorewareinc.com">ashok@multicorewareinc.com</a>></div><div>Date: Fri, 14 Oct 2022 11:30:23 +0530</div><div>Subject: [PATCH] Implement ASM for SAD used for motion estimation</div><div><br></div><div>---</div><div> source/common/temporalfilter.cpp | 185 ++++++++++++++++++++++++++++---</div><div> source/common/temporalfilter.h   |  14 ++-</div><div> 2 files changed, 184 insertions(+), 15 deletions(-)</div><div><br></div><div>diff --git a/source/common/temporalfilter.cpp b/source/common/temporalfilter.cpp</div><div>index f3e2dc73c..72edaaac1 100644</div><div>--- a/source/common/temporalfilter.cpp</div><div>+++ b/source/common/temporalfilter.cpp</div><div>@@ -144,6 +144,7 @@ TemporalFilter::TemporalFilter()</div><div>     m_QP = 0;</div><div>     m_sliceTypeConfig = 3;</div><div>     m_numRef = 0;</div><div>+    m_useSADinME = 1;</div><div> </div><div>     m_range = 2;</div><div>     m_chromaFactor = 0.55;</div><div>@@ -190,7 +191,7 @@ fail:</div><div>     return 0;</div><div> }</div><div> </div><div>-int TemporalFilter::motionErrorLuma(</div><div>+int TemporalFilter::motionErrorLumaSAD(</div><div>     PicYuv *orig,</div><div>     PicYuv *buffer,</div><div>     int x,</div><div>@@ -215,6 +216,110 @@ int TemporalFilter::motionErrorLuma(</div><div> #if 0</div><div>         const pixel* origRowStart = origOrigin + y *origStride + x;</div><div> </div><div>+        for (int y1 = 0; y1 < bs; y1++)</div><div>+        {</div><div>+            for (int x1 = 0; x1 < bs; x1++)</div><div>+            {</div><div>+                int diff = origRowStart[x1] - bufferRowStart[x1];</div><div>+                error += abs(diff);</div><div>+            }</div><div>+</div><div>+            origRowStart += origStride;</div><div>+            bufferRowStart += buffStride;</div><div>+        }</div><div>+#else</div><div>+        int partEnum = partitionFromSizes(bs, bs);</div><div>+        /* copy PU block into cache */</div><div>+        primitives.pu[partEnum].copy_pp(predPUYuv.m_buf[0], FENC_STRIDE, bufferRowStart, buffStride);</div><div>+</div><div>+        error = m_metld->me.bufSAD(predPUYuv.m_buf[0], FENC_STRIDE);</div><div>+#endif</div><div>+        if (error > besterror)</div><div>+        {</div><div>+            return error;</div><div>+        }</div><div>+    }</div><div>+    else</div><div>+    {</div><div>+        const int *xFilter = s_interpolationFilter[dx & 0xF];</div><div>+        const int *yFilter = s_interpolationFilter[dy & 0xF];</div><div>+        int tempArray[64 + 8][64];</div><div>+</div><div>+        int iSum, iBase;</div><div>+        for (int y1 = 1; y1 < bs + 7; y1++)</div><div>+        {</div><div>+            const int yOffset = y + y1 + (dy >> 4) - 3;</div><div>+            const pixel *sourceRow = buffOrigin + (yOffset)*buffStride + 0;</div><div>+            for (int x1 = 0; x1 < bs; x1++)</div><div>+            {</div><div>+                iSum = 0;</div><div>+                iBase = x + x1 + (dx >> 4) - 3;</div><div>+                const pixel *rowStart = sourceRow + iBase;</div><div>+</div><div>+                iSum += xFilter[1] * rowStart[1];</div><div>+                iSum += xFilter[2] * rowStart[2];</div><div>+                iSum += xFilter[3] * rowStart[3];</div><div>+                iSum += xFilter[4] * rowStart[4];</div><div>+                iSum += xFilter[5] * rowStart[5];</div><div>+                iSum += xFilter[6] * rowStart[6];</div><div>+</div><div>+                tempArray[y1][x1] = iSum;</div><div>+            }</div><div>+        }</div><div>+</div><div>+        const pixel maxSampleValue = (1 << m_bitDepth) - 1;</div><div>+        for (int y1 = 0; y1 < bs; y1++)</div><div>+        {</div><div>+            const pixel *origRow = origOrigin + (y + y1)*origStride + 0;</div><div>+            for (int x1 = 0; x1 < bs; x1++)</div><div>+            {</div><div>+                iSum = 0;</div><div>+                iSum += yFilter[1] * tempArray[y1 + 1][x1];</div><div>+                iSum += yFilter[2] * tempArray[y1 + 2][x1];</div><div>+                iSum += yFilter[3] * tempArray[y1 + 3][x1];</div><div>+                iSum += yFilter[4] * tempArray[y1 + 4][x1];</div><div>+                iSum += yFilter[5] * tempArray[y1 + 5][x1];</div><div>+                iSum += yFilter[6] * tempArray[y1 + 6][x1];</div><div>+</div><div>+                iSum = (iSum + (1 << 11)) >> 12;</div><div>+                iSum = iSum < 0 ? 0 : (iSum > maxSampleValue ? maxSampleValue : iSum);</div><div>+</div><div>+                error += abs(iSum - origRow[x + x1]);</div><div>+            }</div><div>+            if (error > besterror)</div><div>+            {</div><div>+                return error;</div><div>+            }</div><div>+        }</div><div>+    }</div><div>+    return error;</div><div>+}</div><div>+</div><div>+int TemporalFilter::motionErrorLumaSSD(</div><div>+    PicYuv *orig,</div><div>+    PicYuv *buffer,</div><div>+    int x,</div><div>+    int y,</div><div>+    int dx,</div><div>+    int dy,</div><div>+    int bs,</div><div>+    int besterror)</div><div>+{</div><div>+</div><div>+    pixel* origOrigin = orig->m_picOrg[0];</div><div>+    intptr_t origStride = orig->m_stride;</div><div>+    pixel *buffOrigin = buffer->m_picOrg[0];</div><div>+    intptr_t buffStride = buffer->m_stride;</div><div>+    int error = 0;// dx * 10 + dy * 10;</div><div>+    if (((dx | dy) & 0xF) == 0)</div><div>+    {</div><div>+        dx /= m_motionVectorFactor;</div><div>+        dy /= m_motionVectorFactor;</div><div>+</div><div>+        const pixel* bufferRowStart = buffOrigin + (y + dy) * buffStride + (x + dx);</div><div>+#if 0</div><div>+        const pixel* origRowStart = origOrigin + y * origStride + x;</div><div>+</div><div>         for (int y1 = 0; y1 < bs; y1++)</div><div>         {</div><div>             for (int x1 = 0; x1 < bs; x1++)</div><div>@@ -771,6 +876,7 @@ void TemporalFilter::motionEstimationLuma(MV *mvs, uint32_t mvStride, PicYuv *or</div><div>     const int origWidth = orig->m_picWidth;</div><div>     const int origHeight = orig->m_picHeight;</div><div> </div><div>+    int error;</div><div> </div><div>     for (int blockY = 0; blockY + blockSize <= origHeight; blockY += stepSize)</div><div>     {</div><div>@@ -802,7 +908,12 @@ void TemporalFilter::motionEstimationLuma(MV *mvs, uint32_t mvStride, PicYuv *or</div><div>                         {</div><div>                             int mvIdx = testy * prevMvStride + testx;</div><div>                             MV old = previous[mvIdx];</div><div>-                            int error = motionErrorLuma(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);</div><div>+</div><div>+                            if (m_useSADinME)</div><div>+                                error = motionErrorLumaSAD(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);</div><div>+                            else</div><div>+                                error = motionErrorLumaSSD(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);</div><div>+</div><div>                             if (error < leastError)</div><div>                             {</div><div>                                 best.set(old.x * factor, old.y * factor);</div><div>@@ -812,7 +923,11 @@ void TemporalFilter::motionEstimationLuma(MV *mvs, uint32_t mvStride, PicYuv *or</div><div>                     }</div><div>                 }</div><div> </div><div>-                int error = motionErrorLuma(orig, buffer, blockX, blockY, 0, 0, blockSize, leastError);</div><div>+                if (m_useSADinME)</div><div>+                    error = motionErrorLumaSAD(orig, buffer, blockX, blockY, 0, 0, blockSize, leastError);</div><div>+                else</div><div>+                    error = motionErrorLumaSSD(orig, buffer, blockX, blockY, 0, 0, blockSize, leastError);</div><div>+</div><div>                 if (error < leastError)</div><div>                 {</div><div>                     best.set(0, 0);</div><div>@@ -826,7 +941,10 @@ void TemporalFilter::motionEstimationLuma(MV *mvs, uint32_t mvStride, PicYuv *or</div><div>             {</div><div>                 for (int x2 = prevBest.x / m_motionVectorFactor - range; x2 <= prevBest.x / m_motionVectorFactor + range; x2++)</div><div>                 {</div><div>-                    int error = motionErrorLuma(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, leastError);</div><div>+                    if (m_useSADinME)</div><div>+                        error = motionErrorLumaSAD(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, leastError);</div><div>+                    else</div><div>+                        error = motionErrorLumaSSD(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, leastError);</div><div>                     if (error < leastError)</div><div>                     {</div><div>                         best.set(x2 * m_motionVectorFactor, y2 * m_motionVectorFactor);</div><div>@@ -839,7 +957,12 @@ void TemporalFilter::motionEstimationLuma(MV *mvs, uint32_t mvStride, PicYuv *or</div><div>             {</div><div>                 int idx = ((blockY - stepSize) / stepSize) * mvStride + (blockX / stepSize);</div><div>                 MV aboveMV = mvs[idx];</div><div>-                int error = motionErrorLuma(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, leastError);</div><div>+</div><div>+                if (m_useSADinME)</div><div>+                    error = motionErrorLumaSAD(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, leastError);</div><div>+                else</div><div>+                    error = motionErrorLumaSSD(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, leastError);</div><div>+</div><div>                 if (error < leastError)</div><div>                 {</div><div>                     best.set(aboveMV.x, aboveMV.y);</div><div>@@ -852,7 +975,11 @@ void TemporalFilter::motionEstimationLuma(MV *mvs, uint32_t mvStride, PicYuv *or</div><div>                 int idx = ((blockY / stepSize) * mvStride + (blockX - stepSize) / stepSize);</div><div>                 MV leftMV = mvs[idx];</div><div> </div><div>-                int error = motionErrorLuma(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, leastError);</div><div>+                if (m_useSADinME)</div><div>+                    error = motionErrorLumaSAD(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, leastError);</div><div>+                else</div><div>+                    error = motionErrorLumaSSD(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, leastError);</div><div>+</div><div>                 if (error < leastError)</div><div>                 {</div><div>                     best.set(leftMV.x, leftMV.y);</div><div>@@ -903,6 +1030,7 @@ void TemporalFilter::motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, P</div><div>     const int origWidth = orig->m_picWidth;</div><div>     const int origHeight = orig->m_picHeight;</div><div> </div><div>+    int error;</div><div> </div><div>     for (int blockY = 0; blockY + blockSize <= origHeight; blockY += stepSize)</div><div>     {</div><div>@@ -934,7 +1062,12 @@ void TemporalFilter::motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, P</div><div>                         {</div><div>                             int mvIdx = testy * prevMvStride + testx;</div><div>                             MV old = previous[mvIdx];</div><div>-                            int error = motionErrorLuma(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);</div><div>+</div><div>+                            if (m_useSADinME)</div><div>+                                error = motionErrorLumaSAD(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);</div><div>+                            else</div><div>+                                error = motionErrorLumaSSD(orig, buffer, blockX, blockY, old.x * factor, old.y * factor, blockSize, leastError);</div><div>+</div><div>                             if (error < leastError)</div><div>                             {</div><div>                                 best.set(old.x * factor, old.y * factor);</div><div>@@ -944,7 +1077,11 @@ void TemporalFilter::motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, P</div><div>                     }</div><div>                 }</div><div> </div><div>-                int error = motionErrorLuma(orig, buffer, blockX, blockY, 0, 0, blockSize, leastError);</div><div>+                if (m_useSADinME)</div><div>+                    error = motionErrorLumaSAD(orig, buffer, blockX, blockY, 0, 0, blockSize, leastError);</div><div>+                else</div><div>+                    error = motionErrorLumaSSD(orig, buffer, blockX, blockY, 0, 0, blockSize, leastError);</div><div>+</div><div>                 if (error < leastError)</div><div>                 {</div><div>                     best.set(0, 0);</div><div>@@ -958,7 +1095,11 @@ void TemporalFilter::motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, P</div><div>             {</div><div>                 for (int x2 = prevBest.x / m_motionVectorFactor - range; x2 <= prevBest.x / m_motionVectorFactor + range; x2++)</div><div>                 {</div><div>-                    int error = motionErrorLuma(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, leastError);</div><div>+                    if (m_useSADinME)</div><div>+                        error = motionErrorLumaSAD(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, leastError);</div><div>+                    else</div><div>+                        error = motionErrorLumaSSD(orig, buffer, blockX, blockY, x2 * m_motionVectorFactor, y2 * m_motionVectorFactor, blockSize, leastError);</div><div>+</div><div>                     if (error < leastError)</div><div>                     {</div><div>                         best.set(x2 * m_motionVectorFactor, y2 * m_motionVectorFactor);</div><div>@@ -973,7 +1114,11 @@ void TemporalFilter::motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, P</div><div>             {</div><div>                 for (int x2 = prevBest.x - doubleRange; x2 <= prevBest.x + doubleRange; x2 += 4)</div><div>                 {</div><div>-                    int error = motionErrorLuma(orig, buffer, blockX, blockY, x2, y2, blockSize, leastError);</div><div>+                    if (m_useSADinME)</div><div>+                        error = motionErrorLumaSAD(orig, buffer, blockX, blockY, x2, y2, blockSize, leastError);</div><div>+                    else</div><div>+                        error = motionErrorLumaSSD(orig, buffer, blockX, blockY, x2, y2, blockSize, leastError);</div><div>+</div><div>                     if (error < leastError)</div><div>                     {</div><div>                         best.set(x2, y2);</div><div>@@ -988,7 +1133,11 @@ void TemporalFilter::motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, P</div><div>             {</div><div>                 for (int x2 = prevBest.x - doubleRange; x2 <= prevBest.x + doubleRange; x2++)</div><div>                 {</div><div>-                    int error = motionErrorLuma(orig, buffer, blockX, blockY, x2, y2, blockSize, leastError);</div><div>+                    if (m_useSADinME)</div><div>+                        error = motionErrorLumaSAD(orig, buffer, blockX, blockY, x2, y2, blockSize, leastError);</div><div>+                    else</div><div>+                        error = motionErrorLumaSSD(orig, buffer, blockX, blockY, x2, y2, blockSize, leastError);</div><div>+</div><div>                     if (error < leastError)</div><div>                     {</div><div>                         best.set(x2, y2);</div><div>@@ -1002,7 +1151,12 @@ void TemporalFilter::motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, P</div><div>             {</div><div>                 int idx = ((blockY - stepSize) / stepSize) * mvStride + (blockX / stepSize);</div><div>                 MV aboveMV = mvs[idx];</div><div>-                int error = motionErrorLuma(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, leastError);</div><div>+</div><div>+                if (m_useSADinME)</div><div>+                    error = motionErrorLumaSAD(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, leastError);</div><div>+                else</div><div>+                    error = motionErrorLumaSSD(orig, buffer, blockX, blockY, aboveMV.x, aboveMV.y, blockSize, leastError);</div><div>+</div><div>                 if (error < leastError)</div><div>                 {</div><div>                     best.set(aboveMV.x, aboveMV.y);</div><div>@@ -1014,7 +1168,12 @@ void TemporalFilter::motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, P</div><div>             {</div><div>                 int idx = ((blockY / stepSize) * mvStride + (blockX - stepSize) / stepSize);</div><div>                 MV leftMV = mvs[idx];</div><div>-                int error = motionErrorLuma(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, leastError);</div><div>+</div><div>+                if (m_useSADinME)</div><div>+                    error = motionErrorLumaSAD(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, leastError);</div><div>+                else</div><div>+                    error = motionErrorLumaSSD(orig, buffer, blockX, blockY, leftMV.x, leftMV.y, blockSize, leastError);</div><div>+</div><div>                 if (error < leastError)</div><div>                 {</div><div>                     best.set(leftMV.x, leftMV.y);</div><div>diff --git a/source/common/temporalfilter.h b/source/common/temporalfilter.h</div><div>index bdefa2824..f49fae3c5 100644</div><div>--- a/source/common/temporalfilter.h</div><div>+++ b/source/common/temporalfilter.h</div><div>@@ -159,7 +159,8 @@ public:</div><div>     uint8_t m_sliceTypeConfig;</div><div> </div><div>     MotionEstimatorTLD* m_metld;</div><div>-    Yuv        predPUYuv;</div><div>+    Yuv  predPUYuv;</div><div>+    int m_useSADinME;</div><div> </div><div>     void subsampleLuma(PicYuv *input, PicYuv *output, int factor = 2);</div><div> </div><div>@@ -173,7 +174,16 @@ public:</div><div>     void motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, PicYuv *orig, PicYuv *buffer, int blockSize,</div><div>         MV *previous, uint32_t prevMvStride, int factor, int* minError);</div><div> </div><div>-    int motionErrorLuma(PicYuv *orig,</div><div>+    int motionErrorLumaSSD(PicYuv *orig,</div><div>+        PicYuv *buffer,</div><div>+        int x,</div><div>+        int y,</div><div>+        int dx,</div><div>+        int dy,</div><div>+        int bs,</div><div>+        int besterror = 8 * 8 * 1024 * 1024);</div><div>+</div><div>+    int motionErrorLumaSAD(PicYuv *orig,</div><div>         PicYuv *buffer,</div><div>         int x,</div><div>         int y,</div><div>-- </div><div>2.34.1.windows.1</div><div><br></div><div><div dir="ltr" class="gmail_signature"><div dir="ltr"><div><i><font face="georgia, serif">Thanks and Regards,</font></i></div><div><i><font face="georgia, serif"><b>Snehaa.G</b><br>Video Codec Engineer,<br>Media & AI analytics<br><a href="https://multicorewareinc.com/" target="_blank"><img src="https://ci3.googleusercontent.com/mail-sig/AIorK4yEumXeQ2mgcFAR2us9INa7z3rCbl8ordut3fbdeIbuPv0n3EA75Or1rHs0neGaI0WM8mFPz1g"></a><br><span></span><span></span><br></font></i></div></div></div></div></div></div>