<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><div> </div><pre><br>At 2015-05-06 08:38:24,"Steve Borho" <steve@borho.org> wrote:

>On 05/05, Min Chen wrote:

>> # HG changeset patch

>> # User Min Chen <chenm003@163.com>

>> # Date 1430862259 25200

>> # Node ID 50ce2c0ddfbb743b45f678ee2e6b796762ad868f

>> # Parent  f32e6464225afa02983af1b1905f50cdccae5244

>> inline mvcost() to reduce address operators

>

>I'm skeptical that this is a good idea. have you measured the difference

>in performance with encoders built with profile-guided optimizations?

</pre><pre>I got this idea from a VTune assembly report (preset ultrafast): it shows a bottleneck in signed address extension, because the offset is a signed integer.</pre><pre>With ICL, using the 'restrict' keyword can avoid some of these address operations.</pre><pre> </pre><pre>After this patch, I got a 10% improvement in the ME module, or about 2% in total encode time.</pre><pre> </pre><pre>>

>>  source/encoder/motion.cpp |   48 ++++++++++++++++++++++++++++++--------------

>>  1 files changed, 33 insertions(+), 15 deletions(-)

>> 

>> diff -r f32e6464225a -r 50ce2c0ddfbb source/encoder/motion.cpp

>> --- a/source/encoder/motion.cpp      Mon May 04 15:15:42 2015 -0500

>> +++ b/source/encoder/motion.cpp      Tue May 05 14:44:19 2015 -0700

>> @@ -234,9 +234,14 @@

>>                 pix_base + (m1x) + (m1y) * stride, \

>>                 pix_base + (m2x) + (m2y) * stride, \

>>                 stride, costs); \

>> -        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \

>> -        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \

>> -        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \

>> +        const uint16_t *base_mvx = &m_cost_mvx[(bmv.x + (m0x)) << 2]; \

>> +        const uint16_t *base_mvy = &m_cost_mvy[(bmv.y + (m0y)) << 2]; \

>> +        X265_CHECK(mvcost((bmv + MV(m0x, m0y)) << 2) == (base_mvx[((m0x) - (m0x)) << 2] + base_mvy[((m0y) - (m0y)) << 2]), "mvcost() check failure\n"); \

>> +        X265_CHECK(mvcost((bmv + MV(m1x, m1y)) << 2) == (base_mvx[((m1x) - (m0x)) << 2] + base_mvy[((m1y) - (m0y)) << 2]), "mvcost() check failure\n"); \

>> +        X265_CHECK(mvcost((bmv + MV(m2x, m2y)) << 2) == (base_mvx[((m2x) - (m0x)) << 2] + base_mvy[((m2y) - (m0y)) << 2]), "mvcost() check failure\n"); \

>> +        (costs)[0] += (base_mvx[((m0x) - (m0x)) << 2] + base_mvy[((m0y) - (m0y)) << 2]); \

>> +        (costs)[1] += (base_mvx[((m1x) - (m0x)) << 2] + base_mvy[((m1y) - (m0y)) << 2]); \

>> +        (costs)[2] += (base_mvx[((m2x) - (m0x)) << 2] + base_mvy[((m2y) - (m0y)) << 2]); \

>>      }

>>  

>>  #define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \

>> @@ -247,10 +252,10 @@

>>                 fref + (m2x) + (m2y) * stride, \

>>                 fref + (m3x) + (m3y) * stride, \

>>                 stride, costs); \

>> -        costs[0] += mvcost(MV(m0x, m0y) << 2); \

>> -        costs[1] += mvcost(MV(m1x, m1y) << 2); \

>> -        costs[2] += mvcost(MV(m2x, m2y) << 2); \

>> -        costs[3] += mvcost(MV(m3x, m3y) << 2); \

>> +        (costs)[0] += mvcost(MV(m0x, m0y) << 2); \

>> +        (costs)[1] += mvcost(MV(m1x, m1y) << 2); \

>> +        (costs)[2] += mvcost(MV(m2x, m2y) << 2); \

>> +        (costs)[3] += mvcost(MV(m3x, m3y) << 2); \

>>          COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0, bDistance, d0); \

>>          COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1, bDistance, d1); \

>>          COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2, bDistance, d2); \

>> @@ -266,10 +271,16 @@

>>                 pix_base + (m2x) + (m2y) * stride, \

>>                 pix_base + (m3x) + (m3y) * stride, \

>>                 stride, costs); \

>> -        costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \

>> -        costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \

>> -        costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \

>> -        costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \

>> +        const uint16_t *base_mvx = &m_cost_mvx[(omv.x << 2)]; \

>> +        const uint16_t *base_mvy = &m_cost_mvy[(omv.y << 2)]; \

>> +        X265_CHECK(mvcost((omv + MV(m0x, m0y)) << 2) == (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]), "mvcost() check failure\n"); \

>> +        X265_CHECK(mvcost((omv + MV(m1x, m1y)) << 2) == (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]), "mvcost() check failure\n"); \

>> +        X265_CHECK(mvcost((omv + MV(m2x, m2y)) << 2) == (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]), "mvcost() check failure\n"); \

>> +        X265_CHECK(mvcost((omv + MV(m3x, m3y)) << 2) == (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]), "mvcost() check failure\n"); \

>> +        costs[0] += (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]); \

>> +        costs[1] += (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]); \

>> +        costs[2] += (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]); \

>> +        costs[3] += (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]); \

>>          COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \

>>          COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \

>>          COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \

>> @@ -285,10 +296,17 @@

>>                 pix_base + (m2x) + (m2y) * stride, \

>>                 pix_base + (m3x) + (m3y) * stride, \

>>                 stride, costs); \

>> -        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \

>> -        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \

>> -        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \

>> -        (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \

>> +        /* TODO: use restrict keyword in ICL */ \

>> +        const uint16_t *base_mvx = &m_cost_mvx[(bmv.x << 2)]; \

>> +        const uint16_t *base_mvy = &m_cost_mvy[(bmv.y << 2)]; \

>> +        X265_CHECK(mvcost((bmv + MV(m0x, m0y)) << 2) == (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]), "mvcost() check failure\n"); \

>> +        X265_CHECK(mvcost((bmv + MV(m1x, m1y)) << 2) == (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]), "mvcost() check failure\n"); \

>> +        X265_CHECK(mvcost((bmv + MV(m2x, m2y)) << 2) == (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]), "mvcost() check failure\n"); \

>> +        X265_CHECK(mvcost((bmv + MV(m3x, m3y)) << 2) == (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]), "mvcost() check failure\n"); \

>> +        (costs)[0] += (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]); \

>> +        (costs)[1] += (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]); \

>> +        (costs)[2] += (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]); \

>> +        (costs)[3] += (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]); \

>>      }

>>  

>>  #define DIA1_ITER(mx, my) \

>> 

>> _______________________________________________

>> x265-devel mailing list

>> x265-devel@videolan.org

>> https://mailman.videolan.org/listinfo/x265-devel

>

>-- 

>Steve Borho

>_______________________________________________

>x265-devel mailing list

>x265-devel@videolan.org

>https://mailman.videolan.org/listinfo/x265-devel

</pre></div>