[x265] [PATCH] fix bug in satd_4x4 for psyCost_ss

Thu Jan 8 12:08:57 CET 2015

OK - with HBD (high bit depth) enabled, the intermediate values need to be
64-bit, so this patch can be pushed. The asm will use 32-bit or 64-bit as
appropriate.

On Thu, Jan 8, 2015 at 3:44 PM, Deepthi Nandakumar <
deepthi at multicorewareinc.com> wrote:

> Divya will send a corrected patch for this; the intermediate values can
> stay as 32-bit.
>
> On Thu, Jan 8, 2015 at 2:01 PM, Divya Manivannan <
> divya at multicorewareinc.com> wrote:
>
>> # HG changeset patch
>> # User Divya Manivannan <divya at multicorewareinc.com>
>> # Date 1420705817 -19800
>> #      Thu Jan 08 14:00:17 2015 +0530
>> # Node ID 188e42417b37cc5ab473f8ba51a351f4fd663082
>> # Parent  6dce2b87f0fe4aa37f9c7d66ec99447919b19c64
>> fix bug in satd_4x4 for psyCost_ss
>>
>> diff -r 6dce2b87f0fe -r 188e42417b37 source/common/pixel.cpp
>> --- a/source/common/pixel.cpp   Thu Jan 08 10:29:09 2015 +0530
>> +++ b/source/common/pixel.cpp   Thu Jan 08 14:00:17 2015 +0530
>> @@ -241,32 +241,35 @@
>>      return (int)(sum >> 1);
>>  }
>>
>> -int satd_4x4(const int16_t* pix1, intptr_t stride_pix1, const int16_t*
>> pix2, intptr_t stride_pix2)
>> +static int satd_4x4(const int16_t* pix1, intptr_t stride_pix1)
>>  {
>> -    int64_t tmp[4][2];
>> -    int64_t a0, a1, a2, a3, b0, b1;
>> -    int64_t sum = 0;
>> -
>> -    for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
>> -    {
>> -        a0 = pix1[0] - pix2[0];
>> -        a1 = pix1[1] - pix2[1];
>> -        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
>> -        a2 = pix1[2] - pix2[2];
>> -        a3 = pix1[3] - pix2[3];
>> -        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
>> -        tmp[i][0] = b0 + b1;
>> -        tmp[i][1] = b0 - b1;
>> -    }
>> -
>> -    for (int i = 0; i < 2; i++)
>> -    {
>> -        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i],
>> tmp[3][i]);
>> -        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
>> -        sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);
>> -    }
>> -
>> -    return (int)(sum >> 1);
>> +    int64_t tmp[4][4];
>> +    int64_t s01, s23, d01, d23;
>> +    int64_t satd = 0;
>> +    int d;
>> +
>> +    for (d = 0; d < 4; d++, pix1 += stride_pix1)
>> +    {
>> +        s01 = pix1[0] + pix1[1];
>> +        s23 = pix1[2] + pix1[3];
>> +        d01 = pix1[0] - pix1[1];
>> +        d23 = pix1[2] - pix1[3];
>> +
>> +        tmp[d][0] = s01 + s23;
>> +        tmp[d][1] = s01 - s23;
>> +        tmp[d][2] = d01 - d23;
>> +        tmp[d][3] = d01 + d23;
>> +    }
>> +
>> +    for (d = 0; d < 4; d++)
>> +    {
>> +        s01 = tmp[0][d] + tmp[1][d];
>> +        s23 = tmp[2][d] + tmp[3][d];
>> +        d01 = tmp[0][d] - tmp[1][d];
>> +        d23 = tmp[2][d] - tmp[3][d];
>> +        satd += abs(s01 + s23) + abs(s01 - s23) + abs(d01 - d23) +
>> abs(d01 + d23);
>> +    }
>> +    return (int)(satd / 2);
>>  }
>>
>>  // x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once
>> @@ -832,8 +835,8 @@
>>      else
>>      {
>>          /* 4x4 is too small for sa8d */
>> -        int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) -
>> (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
>> -        int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4,
>> 4>(recon, rstride, zeroBuf, 0) >> 2);
>> +        int sourceEnergy = satd_4x4(source, sstride) - (sad<4,
>> 4>(source, sstride, zeroBuf, 0) >> 2);
>> +        int reconEnergy = satd_4x4(recon, rstride) - (sad<4, 4>(recon,
>> rstride, zeroBuf, 0) >> 2);
>>          return abs(sourceEnergy - reconEnergy);
>>      }
>>  }
>> _______________________________________________
>> x265-devel mailing list
>> x265-devel at videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150108/7137c3f2/attachment.html>