<div dir="ltr">Thanks, this is certainly an enhancement to x265 lookahead. We would be interested in this - especially if you can also include some efficiency (bitrate vs SSIM) metrics that describe the penalty moving from X265_LOWRES_SCALE of 4 to higher scales.<br>


<div><div class="gmail_extra"><br><br><div class="gmail_quote">On Mon, Jul 21, 2014 at 8:49 PM, Nicolas Morey-Chaisemartin <span dir="ltr"><<a href="mailto:nmorey@kalray.eu" target="_blank">nmorey@kalray.eu</a>></span> wrote:<br>


<blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">Hi,<br>

<br>

We recently profiled x265 pre-analysis to estimate what performance we could reach using our accelerator and I was quite disappointed by the performance.<br>When running on a Core-i7 with AVX at roughly 2.7GHz, we barely reached the 30fps mark using ultrafast preset on a 4K video.<br>


</blockquote><div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">

After a little bit of browsing I realized that work in LowRes is always done at 1/4th of the final resolution which seems fair but requires a huge amount of work for 4K.<br>

It seemed straightforward enough to change the divider at LowRes initialization but it seems there are a lot of hard-coded values that depend both on the LowRes divider and the LowRes CU Size.<br>

<br>

Here's a patch (definitely not applicable like this but just to give an idea of where I'm going) that seems to fix most of the hard-coded values.<br>

It still works with an X265_LOWRES_SCALE of 4 and the perf is definitely improving (29fps => 40fps on a 2048x1024 medium preset on an E5504).<br>

<br>

Would you be interested in a clean version of this? At least the hard-coded CU_SIZE part?<br>

IMHO it would be better to have a "dynamic" value for LowRes depending on preset (or equivalent) and the input resolution...<br>

1/4th is fast enough in HD not to be an issue but for RT stream in 4K or more, 1/16 will be compulsory.<br>

<br>

Nicolas<br>

<br>

---<br>

 x265/source/common/common.h          |  1 +<br>

 x265/source/common/lowres.cpp        |  4 ++--<br>

 x265/source/encoder/<u></u>frameencoder.cpp |  7 ++++---<br>

 x265/source/encoder/<u></u>ratecontrol.cpp  | 16 ++++++++--------<br>

 x265/source/encoder/slicetype.<u></u>cpp    |  8 ++++----<br>

 5 files changed, 19 insertions(+), 17 deletions(-)<br>

<br>

diff --git a/x265/source/common/common.h b/x265/source/common/common.h<br>

index 06f60e7..00e73fc 100644<br>

--- a/x265/source/common/common.h<br>

+++ b/x265/source/common/common.h<br>

@@ -156,6 +156,7 @@ typedef int32_t  coeff_t;      // transform coefficient<br>

 // high cost estimates (intra and inter both suffer)<br>

 #define X265_LOWRES_CU_SIZE   8<br>

 #define X265_LOWRES_CU_BITS   3<br>

+#define X265_LOWRES_SCALE     2<br>

  #define X265_MALLOC(type, count)    (type*)x265_malloc(sizeof(<u></u>type) * (count))<br>

 #define X265_FREE(ptr)              x265_free(ptr)<br>

diff --git a/x265/source/common/lowres.<u></u>cpp b/x265/source/common/lowres.<u></u>cpp<br>

index 5fc2f6b..6138023 100644<br>

--- a/x265/source/common/lowres.<u></u>cpp<br>

+++ b/x265/source/common/lowres.<u></u>cpp<br>

@@ -31,8 +31,8 @@ bool Lowres::create(TComPicYuv *orig, int _bframes, bool bAQEnabled)<br>

 {<br>

     isLowres = true;<br>

     bframes = _bframes;<br>

-    width = orig->getWidth() / 2;<br>

-    lines = orig->getHeight() / 2;<br>

+    width = orig->getWidth() / X265_LOWRES_SCALE;<br>

+    lines = orig->getHeight() / X265_LOWRES_SCALE;<br>

     lumaStride = width + 2 * orig->getLumaMarginX();<br>

     if (lumaStride & 31)<br>

         lumaStride += 32 - (lumaStride & 31);<br>

diff --git a/x265/source/encoder/<u></u>frameencoder.cpp b/x265/source/encoder/<u></u>frameencoder.cpp<br>

index 8c3ee26..7213f60 100644<br>

--- a/x265/source/encoder/<u></u>frameencoder.cpp<br>

+++ b/x265/source/encoder/<u></u>frameencoder.cpp<br>

@@ -1300,9 +1300,10 @@ int FrameEncoder::calcQpForCu(<u></u>uint32_t cuAddr, double baseQp)<br>

      /* Derive qpOffet for each CU by averaging offsets for all 16x16 blocks in the cu. */<br>

     double qp_offset = 0;<br>

-    int maxBlockCols = (m_frame->getPicYuvOrg()-><u></u>getWidth() + (16 - 1)) / 16;<br>

-    int maxBlockRows = (m_frame->getPicYuvOrg()-><u></u>getHeight() + (16 - 1)) / 16;<br>

-    int noOfBlocks = g_maxCUSize / 16;<br>

+    int lowResCu = (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE);<br>

+    int maxBlockCols = (m_frame->getPicYuvOrg()-><u></u>getWidth() + (lowResCu - 1)) / lowResCu;<br>

+    int maxBlockRows = (m_frame->getPicYuvOrg()-><u></u>getHeight() + (lowResCu - 1)) / lowResCu;<br>

+    int noOfBlocks = g_maxCUSize / lowResCu;<br>

     int block_y = (cuAddr / m_frame->getPicSym()-><u></u>getFrameWidthInCU()) * noOfBlocks;<br>

     int block_x = (cuAddr * noOfBlocks) - block_y * m_frame->getPicSym()-><u></u>getFrameWidthInCU();<br>

 diff --git a/x265/source/encoder/<u></u>ratecontrol.cpp b/x265/source/encoder/<u></u>ratecontrol.cpp<br>

index 4358994..5fcc27a 100644<br>

--- a/x265/source/encoder/<u></u>ratecontrol.cpp<br>

+++ b/x265/source/encoder/<u></u>ratecontrol.cpp<br>

@@ -161,8 +161,8 @@ void RateControl::<u></u>calcAdaptiveQuantFrame(Frame *pic)<br>

     if (m_param->rc.aqMode == X265_AQ_NONE || m_param->rc.aqStrength == 0)<br>

     {<br>

         /* Need to init it anyways for CU tree */<br>

-        int cuWidth = ((maxCol / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>

-        int cuHeight = ((maxRow / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>

+        int cuWidth = ((maxCol / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>

+        int cuHeight = ((maxRow / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>

         int cuCount = cuWidth * cuHeight;<br>

          if (m_param->rc.aqMode && m_param->rc.aqStrength == 0)<br>

@@ -194,9 +194,9 @@ void RateControl::<u></u>calcAdaptiveQuantFrame(Frame *pic)<br>

         if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE)<br>

         {<br>

             double bit_depth_correction = pow(1 << (X265_DEPTH - 8), 0.5);<br>

-            for (block_y = 0; block_y < maxRow; block_y += 16)<br>

+            for (block_y = 0; block_y < maxRow; block_y += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))<br>

             {<br>

-                for (block_x = 0; block_x < maxCol; block_x += 16)<br>

+                for (block_x = 0; block_x < maxCol; block_x += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))<br>

                 {<br>

                     uint32_t energy = acEnergyCu(pic, block_x, block_y);<br>

                     qp_adj = pow(energy + 1, 0.1);<br>

@@ -216,9 +216,9 @@ void RateControl::<u></u>calcAdaptiveQuantFrame(Frame *pic)<br>

             strength = m_param->rc.aqStrength * 1.0397f;<br>

          block_xy = 0;<br>

-        for (block_y = 0; block_y < maxRow; block_y += 16)<br>

+        for (block_y = 0; block_y < maxRow; block_y += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))<br>

         {<br>

-            for (block_x = 0; block_x < maxCol; block_x += 16)<br>

+            for (block_x = 0; block_x < maxCol; block_x +=  (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))<br>

             {<br>

                 if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE)<br>

                 {<br>

@@ -260,8 +260,8 @@ void RateControl::<u></u>calcAdaptiveQuantFrame(Frame *pic)<br>

 RateControl::RateControl(x265_<u></u>param *p)<br>

 {<br>

     m_param = p;<br>

-    int lowresCuWidth = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>

-    int lowresCuHeight = ((m_param->sourceHeight / 2)  + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>

+    int lowresCuWidth = ((m_param->sourceWidth / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>

+    int lowresCuHeight = ((m_param->sourceHeight / X265_LOWRES_SCALE)  + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>

     m_ncu = lowresCuWidth * lowresCuHeight;<br>

      if (m_param->rc.cuTree)<br>

diff --git a/x265/source/encoder/<u></u>slicetype.cpp b/x265/source/encoder/<u></u>slicetype.cpp<br>

index 34d0b3b..4a2f2cb 100644<br>

--- a/x265/source/encoder/<u></u>slicetype.cpp<br>

+++ b/x265/source/encoder/<u></u>slicetype.cpp<br>

@@ -65,8 +65,8 @@ Lookahead::Lookahead(x265_<u></u>param *param, ThreadPool* pool)<br>

     m_lastNonB = NULL;<br>

     m_bFilling = true;<br>

     m_bFlushed = false;<br>

-    m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>

-    m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>

+    m_widthInCU = ((m_param->sourceWidth / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>

+    m_heightInCU = ((m_param->sourceHeight / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>

     m_scratch = (int*)x265_malloc(m_widthInCU * sizeof(int));<br>

     memset(m_histogram, 0, sizeof(m_histogram));<br>

 }<br>

@@ -1201,8 +1201,8 @@ CostEstimate::~CostEstimate()<br>

 void CostEstimate::init(x265_param *_param, Frame *pic)<br>

 {<br>

     m_param = _param;<br>

-    m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>

-    m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>

+    m_widthInCU = ((m_param->sourceWidth / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>

+    m_heightInCU = ((m_param->sourceHeight / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;<br>

      m_rows = new EstimateRow[m_heightInCU];<br>

     for (int i = 0; i < m_heightInCU; i++)<br>

______________________________<u></u>_________________<br>

x265-devel mailing list<br>

<a href="mailto:x265-devel@videolan.org" target="_blank">x265-devel@videolan.org</a><br>

<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/<u></u>listinfo/x265-devel</a><br>

</blockquote></div><br></div></div></div>