[x265] [PATCH] Vector code for xCalQuantCoefEAdp
Praveen Tiwari
praveen at multicorewareinc.com
Fri Jun 28 06:29:19 CEST 2013
--> This still doesn't look optimized at all, it's just unrolled.
We unrolled it because of the __int64 used in the HM code: our data loading
and unloading takes more time than we save by reducing the calculations, so
we have unrolled the loop and vectorized only the few suitable calculations.
--> use uint64_t here. __int64 is a Microsoft data type. Is 64bits really
necessary?
Initially I replaced __int64 with int, and it seems __int64 is not required,
but I need to run a few more tests; then we can vectorize the whole code.
On Fri, Jun 28, 2013 at 12:36 AM, Steve Borho <steve at borho.org> wrote:
>
>
>
> On Thu, Jun 27, 2013 at 7:27 AM, <praveen at multicorewareinc.com> wrote:
>
>> # HG changeset patch
>> # User praveentiwari
>> # Date 1372336062 -19800
>> # Node ID 2e227fd23fe25e9fe6dfcca2f1dac21474f4a7a0
>> # Parent 321b2fd70a1bd58b2bb1c2351f49766709a15770
>> Vector code for xCalQuantCoefEAdp
>>
>
> This looks ok, but it doesn't match the fixes I made to the C primitive.
> Details below.
>
>
>> diff -r 321b2fd70a1b -r 2e227fd23fe2 source/common/vec/dct.inc
>> --- a/source/common/vec/dct.inc Wed Jun 26 17:42:39 2013 +0530
>> +++ b/source/common/vec/dct.inc Thu Jun 27 17:57:42 2013 +0530
>> @@ -39,7 +39,6 @@
>> extern void fastForwardDst(Short *block, Short *coeff, Int shift);
>>
>> namespace {
>> -
>> /* Used for filter */
>> #define IF_INTERNAL_PREC 14 ///< Number of bits for internal precision
>> #define IF_FILTER_PREC 6 ///< Log2 of sum of filter taps
>> @@ -3938,6 +3937,131 @@
>> #undef STROE_LINE
>> }
>> }
>>
>>
> use uint32_t for unsigned int. and xCalQuantCoefEAdp should be quantaq
>
>
>> +unsigned int xCalQuantCoefEAdp(int * coef,
>> + int * quantCoeff,
>> + int * deltaU,
>> + int * qCoef,
>> + int * arlCCoef,
>> + int qBitsC,
>> + int qBits,
>> + int add,
>> + int numCoeff)
>> +{
>> + int addc = 1 << (qBitsC - 1);
>> + int qBits8 = qBits - 8;
>> + unsigned int acSum = 0;
>> + int dstOffset = 0;
>> +
>> + for (int blockpos = 0; blockpos < numCoeff; blockpos++)
>> + {
>> + int level1;
>> + int sign1;
>> + level1 = coef[blockpos];
>> + sign1 = (level1 < 0 ? -1 : 1);
>> +
>> + __int64 tmplevel1 = (__int64)abs(level1) * quantCoeff[blockpos];
>>
>
> use uint64_t here. __int64 is a Microsoft data type. Is 64bits really
> necessary?
>
>
>> + arlCCoef[blockpos] = (int)((tmplevel1 + addc) >> qBitsC);
>> + level1 = (int)((tmplevel1 + add) >> qBits);
>> + deltaU[blockpos] = (int)((tmplevel1 - (level1 << qBits)) >>
>> qBits8);
>> + blockpos++;
>>
>
> This still doesn't look optimized at all, it's just unrolled.
>
>
>> + int level2;
>> + int sign2;
>> + level2 = coef[blockpos];
>> + sign2 = (level2 < 0 ? -1 : 1);
>> +
>> + __int64 tmplevel2 = (__int64)abs(level2) * quantCoeff[blockpos];
>> + arlCCoef[blockpos] = (int)((tmplevel2 + addc) >> qBitsC);
>> + level2 = (int)((tmplevel2 + add) >> qBits);
>> + deltaU[blockpos] = (int)((tmplevel2 - (level2 << qBits)) >>
>> qBits8);
>> + blockpos++;
>> +
>> + int level3;
>> + int sign3;
>> + level3 = coef[blockpos];
>> + sign3 = (level3 < 0 ? -1 : 1);
>> +
>> + __int64 tmplevel3 = (__int64)abs(level3) * quantCoeff[blockpos];
>> + arlCCoef[blockpos] = (int)((tmplevel3 + addc) >> qBitsC);
>> + level3 = (int)((tmplevel3 + add) >> qBits);
>> + deltaU[blockpos] = (int)((tmplevel3 - (level3 << qBits)) >>
>> qBits8);
>> + blockpos++;
>> +
>> + int level4;
>> + int sign4;
>> + level4 = coef[blockpos];
>> + sign4 = (level4 < 0 ? -1 : 1);
>> +
>> + __int64 tmplevel4 = (__int64)abs(level4) * quantCoeff[blockpos];
>> + arlCCoef[blockpos] = (int)((tmplevel4 + addc) >> qBitsC);
>> + level4 = (int)((tmplevel4 + add) >> qBits);
>> + deltaU[blockpos] = (int)((tmplevel4 - (level4 << qBits)) >>
>> qBits8);
>> + blockpos++;
>> +
>> + Vec4i qLevel1(level1, level2, level3, level4);
>> + Vec4i qSign1(sign1, sign2, sign3, sign4);
>> + acSum += horizontal_add(qLevel1);
>> + qLevel1 = qLevel1 * qSign1;
>> +
>> + int level5;
>> + int sign5;
>> + level5 = coef[blockpos];
>> + sign5 = (level5 < 0 ? -1 : 1);
>> +
>> + __int64 tmplevel5 = (__int64)abs(level5) * quantCoeff[blockpos];
>> + arlCCoef[blockpos] = (int)((tmplevel5 + addc) >> qBitsC);
>> + level5 = (int)((tmplevel5 + add) >> qBits);
>> + deltaU[blockpos] = (int)((tmplevel5 - (level5 << qBits)) >>
>> qBits8);
>> + blockpos++;
>> +
>> + int level6;
>> + int sign6;
>> + level6 = coef[blockpos];
>> + sign6 = (level6 < 0 ? -1 : 1);
>> +
>> + __int64 tmplevel6 = (__int64)abs(level6) * quantCoeff[blockpos];
>> + arlCCoef[blockpos] = (int)((tmplevel6 + addc) >> qBitsC);
>> + level6 = (int)((tmplevel6 + add) >> qBits);
>> + deltaU[blockpos] = (int)((tmplevel6 - (level6 << qBits)) >>
>> qBits8);
>> + blockpos++;
>> +
>> + int level7;
>> + int sign7;
>> + level7 = coef[blockpos];
>> + sign7 = (level7 < 0 ? -1 : 1);
>> +
>> + __int64 tmplevel7 = (__int64)abs(level7) * quantCoeff[blockpos];
>> + arlCCoef[blockpos] = (int)((tmplevel7 + addc) >> qBitsC);
>> + level7 = (int)((tmplevel7 + add) >> qBits);
>> + deltaU[blockpos] = (int)((tmplevel7 - (level7 << qBits)) >>
>> qBits8);
>> + blockpos++;
>> +
>> + int level8;
>> + int sign8;
>> + level8 = coef[blockpos];
>> + sign8 = (level8 < 0 ? -1 : 1);
>> +
>> + __int64 tmplevel8 = (__int64)abs(level8) * quantCoeff[blockpos];
>> + arlCCoef[blockpos] = (int)((tmplevel8 + addc) >> qBitsC);
>> + level8 = (int)((tmplevel8 + add) >> qBits);
>> + deltaU[blockpos] = (int)((tmplevel8 - (level8 << qBits)) >>
>> qBits8);
>> +
>> + Vec4i qLevel2(level5, level6, level7, level8);
>> + Vec4i qSign2(sign5, sign6, sign7, sign8);
>> + acSum += horizontal_add(qLevel2);
>> + qLevel2 = qLevel2 * qSign2;
>> + Vec8s quantCoef = compress_saturated(qLevel1, qLevel2);
>> + Vec4i quantCoef1 = extend_low(quantCoef);
>> + Vec4i quantCoef2 = extend_high(quantCoef);
>> + quantCoef1.store(qCoef + dstOffset);
>> + dstOffset += 4;
>> + quantCoef2.store(qCoef + dstOffset);
>> + dstOffset += 4;
>> + }
>> +
>> + return acSum;
>> +}
>> }
>>
>> #include "utils.h"
>> @@ -3948,6 +4072,7 @@
>> void NAME(Setup_Vec_DCTPrimitives)(EncoderPrimitives &p)
>> {
>> p.deQuant = xDeQuant;
>> + p.calQuantCoefEAdp = xCalQuantCoefEAdp;
>>
>
> this primitive is now just called quantaq
>
>
>>
>> // TODO: in 16bpp mode, the intermediate must be 32-bits
>> #if !HIGH_BIT_DEPTH && INSTRSET > 4
>> _______________________________________________
>> x265-devel mailing list
>> x265-devel at videolan.org
>> http://mailman.videolan.org/listinfo/x265-devel
>>
>
>
>
> --
> Steve Borho
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> http://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20130628/8e066be0/attachment-0001.html>
More information about the x265-devel
mailing list