[x265] [PATCH] inline mvcost() to reduce address operators

Wed May 6 16:28:53 CEST 2015

At 2015-05-06 08:38:24,"Steve Borho" <steve at borho.org> wrote:
>On 05/05, Min Chen wrote:
>> # HG changeset patch
>> # User Min Chen <chenm003 at 163.com>
>> # Date 1430862259 25200
>> # Node ID 50ce2c0ddfbb743b45f678ee2e6b796762ad868f
>> # Parent  f32e6464225afa02983af1b1905f50cdccae5244
>> inline mvcost() to reduce address operators
>
>I'm skeptical that this is a good idea. have you measured the difference
>in performance with encoders built with profile-guided optimizations?

I found this idea from vtune assembly report (preset ultrafast), it show bottleneck in signed address extendsion because offset is signed integer.
In the ICL, when we use keyword 'restrict' can avoid part of these reduce operators.

after this patch,  I got 10% improve in ME module or call ~2% in total encode.

>
>>  source/encoder/motion.cpp |   48 ++++++++++++++++++++++++++++++--------------
>>  1 files changed, 33 insertions(+), 15 deletions(-)
>> 
>> diff -r f32e6464225a -r 50ce2c0ddfbb source/encoder/motion.cpp
>> --- a/source/encoder/motion.cpp	Mon May 04 15:15:42 2015 -0500
>> +++ b/source/encoder/motion.cpp	Tue May 05 14:44:19 2015 -0700
>> @@ -234,9 +234,14 @@
>>                 pix_base + (m1x) + (m1y) * stride, \
>>                 pix_base + (m2x) + (m2y) * stride, \
>>                 stride, costs); \
>> -        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
>> -        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
>> -        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
>> +        const uint16_t *base_mvx = &m_cost_mvx[(bmv.x + (m0x)) << 2]; \
>> +        const uint16_t *base_mvy = &m_cost_mvy[(bmv.y + (m0y)) << 2]; \
>> +        X265_CHECK(mvcost((bmv + MV(m0x, m0y)) << 2) == (base_mvx[((m0x) - (m0x)) << 2] + base_mvy[((m0y) - (m0y)) << 2]), "mvcost() check failure\n"); \
>> +        X265_CHECK(mvcost((bmv + MV(m1x, m1y)) << 2) == (base_mvx[((m1x) - (m0x)) << 2] + base_mvy[((m1y) - (m0y)) << 2]), "mvcost() check failure\n"); \
>> +        X265_CHECK(mvcost((bmv + MV(m2x, m2y)) << 2) == (base_mvx[((m2x) - (m0x)) << 2] + base_mvy[((m2y) - (m0y)) << 2]), "mvcost() check failure\n"); \
>> +        (costs)[0] += (base_mvx[((m0x) - (m0x)) << 2] + base_mvy[((m0y) - (m0y)) << 2]); \
>> +        (costs)[1] += (base_mvx[((m1x) - (m0x)) << 2] + base_mvy[((m1y) - (m0y)) << 2]); \
>> +        (costs)[2] += (base_mvx[((m2x) - (m0x)) << 2] + base_mvy[((m2y) - (m0y)) << 2]); \
>>      }
>>  
>>  #define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
>> @@ -247,10 +252,10 @@
>>                 fref + (m2x) + (m2y) * stride, \
>>                 fref + (m3x) + (m3y) * stride, \
>>                 stride, costs); \
>> -        costs[0] += mvcost(MV(m0x, m0y) << 2); \
>> -        costs[1] += mvcost(MV(m1x, m1y) << 2); \
>> -        costs[2] += mvcost(MV(m2x, m2y) << 2); \
>> -        costs[3] += mvcost(MV(m3x, m3y) << 2); \
>> +        (costs)[0] += mvcost(MV(m0x, m0y) << 2); \
>> +        (costs)[1] += mvcost(MV(m1x, m1y) << 2); \
>> +        (costs)[2] += mvcost(MV(m2x, m2y) << 2); \
>> +        (costs)[3] += mvcost(MV(m3x, m3y) << 2); \
>>          COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0, bDistance, d0); \
>>          COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1, bDistance, d1); \
>>          COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2, bDistance, d2); \
>> @@ -266,10 +271,16 @@
>>                 pix_base + (m2x) + (m2y) * stride, \
>>                 pix_base + (m3x) + (m3y) * stride, \
>>                 stride, costs); \
>> -        costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \
>> -        costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
>> -        costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
>> -        costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
>> +        const uint16_t *base_mvx = &m_cost_mvx[(omv.x << 2)]; \
>> +        const uint16_t *base_mvy = &m_cost_mvy[(omv.y << 2)]; \
>> +        X265_CHECK(mvcost((omv + MV(m0x, m0y)) << 2) == (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]), "mvcost() check failure\n"); \
>> +        X265_CHECK(mvcost((omv + MV(m1x, m1y)) << 2) == (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]), "mvcost() check failure\n"); \
>> +        X265_CHECK(mvcost((omv + MV(m2x, m2y)) << 2) == (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]), "mvcost() check failure\n"); \
>> +        X265_CHECK(mvcost((omv + MV(m3x, m3y)) << 2) == (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]), "mvcost() check failure\n"); \
>> +        costs[0] += (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]); \
>> +        costs[1] += (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]); \
>> +        costs[2] += (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]); \
>> +        costs[3] += (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]); \
>>          COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
>>          COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
>>          COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
>> @@ -285,10 +296,17 @@
>>                 pix_base + (m2x) + (m2y) * stride, \
>>                 pix_base + (m3x) + (m3y) * stride, \
>>                 stride, costs); \
>> -        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
>> -        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
>> -        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
>> -        (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \
>> +        /* TODO: use restrict keyword in ICL */ \
>> +        const uint16_t *base_mvx = &m_cost_mvx[(bmv.x << 2)]; \
>> +        const uint16_t *base_mvy = &m_cost_mvy[(bmv.y << 2)]; \
>> +        X265_CHECK(mvcost((bmv + MV(m0x, m0y)) << 2) == (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]), "mvcost() check failure\n"); \
>> +        X265_CHECK(mvcost((bmv + MV(m1x, m1y)) << 2) == (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]), "mvcost() check failure\n"); \
>> +        X265_CHECK(mvcost((bmv + MV(m2x, m2y)) << 2) == (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]), "mvcost() check failure\n"); \
>> +        X265_CHECK(mvcost((bmv + MV(m3x, m3y)) << 2) == (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]), "mvcost() check failure\n"); \
>> +        (costs)[0] += (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]); \
>> +        (costs)[1] += (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]); \
>> +        (costs)[2] += (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]); \
>> +        (costs)[3] += (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]); \
>>      }
>>  
>>  #define DIA1_ITER(mx, my) \
>> 
>> _______________________________________________
>> x265-devel mailing list
>> x265-devel at videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>
>-- 
>Steve Borho
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150506/5bb9d212/attachment-0001.html>