[x265] [PATCH 10/14] Implement ASM for SSD used for motion estimation

Snehaa Giridharan snehaa at multicorewareinc.com
Wed Oct 19 07:31:20 UTC 2022


>From 0d1748cbf58e83b6357e4ff3c9696687e3c30ddd Mon Sep 17 00:00:00 2001
From: ashok2022 <ashok at multicorewareinc.com>
Date: Thu, 13 Oct 2022 20:22:07 +0530
Subject: [PATCH] Implement ASM for SSD used for motion estimation

---
 source/common/temporalfilter.cpp | 47 +++++++++++++++++++++++++-------
 source/common/temporalfilter.h   | 31 ++++++++++++++++-----
 source/encoder/frameencoder.cpp  |  2 ++
 source/encoder/motion.cpp        | 25 +++++++++++++++++
 source/encoder/motion.h          |  2 +-
 5 files changed, 89 insertions(+), 18 deletions(-)

diff --git a/source/common/temporalfilter.cpp b/source/common/temporalfilter.cpp
index 1d5a7d076..a937e2a67 100644
--- a/source/common/temporalfilter.cpp
+++ b/source/common/temporalfilter.cpp
@@ -1,6 +1,8 @@
 /*****************************************************************************
 * Copyright (C) 2013-2021 MulticoreWare, Inc
 *
+ * Authors: Ashok Kumar Mishra <ashok at multicorewareinc.com>
+ *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
@@ -18,8 +20,9 @@
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/
-
+#include "common.h"
 #include "temporalfilter.h"
+#include "primitives.h"

 #include "frame.h"
 #include "slice.h"
@@ -160,6 +163,10 @@ void TemporalFilter::init(const x265_param* param)
     m_sourceHeight = param->sourceHeight;
     m_internalCsp = param->internalCsp;
     m_numComponents = (m_internalCsp != X265_CSP_I400) ? MAX_NUM_COMPONENT
: 1;
+
+    m_metld = new MotionEstimatorTLD;
+
+    predPUYuv.create(FENC_STRIDE, X265_CSP_I400);
 }

 int TemporalFilter::createRefPicInfo(MCTFReferencePicInfo* refFrame,
x265_param* param)
@@ -206,21 +213,33 @@ int TemporalFilter::motionErrorLuma(
     {
         dx /= s_motionVectorFactor;
         dy /= s_motionVectorFactor;
+
+        const pixel* bufferRowStart = buffOrigin + (y + dy) * buffStride +
(x + dx);
+#if 0
+        const pixel* origRowStart = origOrigin + y *origStride + x;
+
         for (int y1 = 0; y1 < bs; y1++)
         {
-            const pixel* origRowStart = origOrigin + (y + y1)*origStride +
x;
-            const pixel* bufferRowStart = buffOrigin + (y + y1 +
dy)*buffStride + (x + dx);
-            for (int x1 = 0; x1 < bs; x1 += 2)
+            for (int x1 = 0; x1 < bs; x1++)
             {
                 int diff = origRowStart[x1] - bufferRowStart[x1];
                 error += diff * diff;
-                diff = origRowStart[x1 + 1] - bufferRowStart[x1 + 1];
-                error += diff * diff;
-            }
-            if (error > besterror)
-            {
-                return error;
             }
+
+            origRowStart += origStride;
+            bufferRowStart += buffStride;
+        }
+#else
+        int partEnum = partitionFromSizes(bs, bs);
+        /* copy PU block into cache */
+        primitives.pu[partEnum].copy_pp(predPUYuv.m_buf[0], FENC_STRIDE,
bufferRowStart, buffStride);
+
+        error = primitives.cu[partEnum].sse_pp(m_metld->me.fencPUYuv.m_buf[0],
FENC_STRIDE, predPUYuv.m_buf[0], FENC_STRIDE);
+
+#endif
+        if (error > besterror)
+        {
+            return error;
         }
     }
     else
@@ -761,6 +780,10 @@ void TemporalFilter::motionEstimationLuma(MV *mvs, uint32_t mvStride, PicYuv *or
     {
         for (int blockX = 0; blockX + blockSize <= origWidth; blockX +=
stepSize)
         {
+            const intptr_t pelOffset = blockY * orig->m_stride + blockX;
+            m_metld->me.setSourcePU(orig->m_picOrg[0], orig->m_stride,
pelOffset, blockSize, blockSize, X265_HEX_SEARCH, 1);
+
+
             MV best(0, 0);
             int leastError = INT_MAX;

@@ -889,6 +912,10 @@ void TemporalFilter::motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, P
     {
         for (int blockX = 0; blockX + blockSize <= origWidth; blockX +=
stepSize)
         {
+
+            const intptr_t pelOffset = blockY * orig->m_stride + blockX;
+            m_metld->me.setSourcePU(orig->m_picOrg[0], orig->m_stride,
pelOffset, blockSize, blockSize, X265_HEX_SEARCH, 1);
+
             MV best(0, 0);
             int leastError = INT_MAX;

diff --git a/source/common/temporalfilter.h b/source/common/temporalfilter.h
index 003630994..801359914 100644
--- a/source/common/temporalfilter.h
+++ b/source/common/temporalfilter.h
@@ -29,6 +29,7 @@
 #include <deque>
 #include "piclist.h"
 #include "yuv.h"
+#include "motion.h"

 using namespace X265_NS;

@@ -94,6 +95,19 @@ struct TemporalFilterRefPicInfo
     int        origOffset;
 };

+struct MotionEstimatorTLD
+{
+    MotionEstimate  me;
+
+    MotionEstimatorTLD()
+    {
+        me.init(X265_CSP_I400);
+        me.setQP(X265_LOOKAHEAD_QP);
+    }
+
+    ~MotionEstimatorTLD() {}
+};
+
 struct MCTFReferencePicInfo
 {
     PicYuv*    picBuffer;
@@ -103,16 +117,16 @@ struct MCTFReferencePicInfo
     MV*        mvs0;
     MV*        mvs1;
     MV*        mvs2;
-    uint32_t mvsStride;
-    uint32_t mvsStride0;
-    uint32_t mvsStride1;
-    uint32_t mvsStride2;
-    int*     error;
-    int*     noise;
+    uint32_t   mvsStride;
+    uint32_t   mvsStride0;
+    uint32_t   mvsStride1;
+    uint32_t   mvsStride2;
+    int*       error;
+    int*       noise;

     int16_t    origOffset;
     bool       isFilteredFrame;
-    PicYuv*       compensatedPic;
+    PicYuv*    compensatedPic;

     int*       isSubsampled;

@@ -154,6 +168,9 @@ public:
     int m_numComponents;
     uint8_t m_sliceTypeConfig;

+    MotionEstimatorTLD* m_metld;
+    Yuv        predPUYuv;
+
     void subsampleLuma(PicYuv *input, PicYuv *output, int factor = 2);

     int createRefPicInfo(MCTFReferencePicInfo* refFrame, x265_param*
param);
diff --git a/source/encoder/frameencoder.cpp b/source/encoder/frameencoder.cpp
index 0a44eb22f..ec78fc9f2 100644
--- a/source/encoder/frameencoder.cpp
+++ b/source/encoder/frameencoder.cpp
@@ -105,6 +105,8 @@ void FrameEncoder::destroy()

     if (m_param->bEnableGopBasedTemporalFilter)
     {
+        delete m_frameEncTF->m_metld;
+
         for (int i = 0; i < (m_frameEncTF->s_range << 1); i++)
             m_frameEncTF->destroyRefPicInfo(&m_mcstfRefList[i]);

diff --git a/source/encoder/motion.cpp b/source/encoder/motion.cpp
index f10db884e..2bb613ec0 100644
--- a/source/encoder/motion.cpp
+++ b/source/encoder/motion.cpp
@@ -190,6 +190,31 @@ void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset,
     X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in
this code path\n");
 }

+/* Called by TemporalFilter motion estimation, luma only, no use of PicYuv */
+void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t
offset, int pwidth, int pheight, const int method, const int refine)
+{
+    partEnum = partitionFromSizes(pwidth, pheight);
+    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
+    sad = primitives.pu[partEnum].sad;
+    ads = primitives.pu[partEnum].ads;
+    satd = primitives.pu[partEnum].satd;
+    sad_x3 = primitives.pu[partEnum].sad_x3;
+    sad_x4 = primitives.pu[partEnum].sad_x4;
+
+
+    blockwidth = pwidth;
+    blockOffset = offset;
+    absPartIdx = ctuAddr = -1;
+
+    /* Search params */
+    searchMethod = method;
+    subpelRefine = refine;
+
+    /* copy PU block into cache */
+    primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY
+ offset, stride);
+    X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in
this code path\n");
+}
+
 /* Called by Search::predInterSearch() or --pme equivalent, chroma
residual might be considered */
 void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int
cuPartIdx, int puPartIdx, int pwidth, int pheight, const int method, const
int refine, bool bChroma)
 {
diff --git a/source/encoder/motion.h b/source/encoder/motion.h
index d306230b4..790bc5fb4 100644
--- a/source/encoder/motion.h
+++ b/source/encoder/motion.h
@@ -77,7 +77,7 @@ public:
     void init(int csp);

     /* Methods called at slice setup */
-
+    void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int
pwidth, int pheight, const int searchMethod, const int subpelRefine);
     void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int
pwidth, int pheight, const int searchMethod, const int searchL0, const int
searchL1, const int subpelRefine);
     void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx,
int puPartIdx, int pwidth, int pheight, const int searchMethod, const int
subpelRefine, bool bChroma);

-- 
2.34.1.windows.1

*Thanks and Regards,*





*Snehaa.G, Video Codec Engineer, Media & AI analytics
<https://multicorewareinc.com/>*
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20221019/608be3b4/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: mcstf_patch_10.diff
Type: application/octet-stream
Size: 8790 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20221019/608be3b4/attachment-0001.obj>


More information about the x265-devel mailing list