[x265] [PATCH] Vector code for xCalQuantCoefEAdp
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Thu Jun 27 14:27:54 CEST 2013
# HG changeset patch
# User praveentiwari
# Date 1372336062 -19800
# Node ID 2e227fd23fe25e9fe6dfcca2f1dac21474f4a7a0
# Parent 321b2fd70a1bd58b2bb1c2351f49766709a15770
Vector code for xCalQuantCoefEAdp
diff -r 321b2fd70a1b -r 2e227fd23fe2 source/common/vec/dct.inc
--- a/source/common/vec/dct.inc Wed Jun 26 17:42:39 2013 +0530
+++ b/source/common/vec/dct.inc Thu Jun 27 17:57:42 2013 +0530
@@ -39,7 +39,6 @@
extern void fastForwardDst(Short *block, Short *coeff, Int shift);
namespace {
-
/* Used for filter */
#define IF_INTERNAL_PREC 14 ///< Number of bits for internal precision
#define IF_FILTER_PREC 6 ///< Log2 of sum of filter taps
@@ -3938,6 +3937,131 @@
#undef STROE_LINE
}
}
+
+unsigned int xCalQuantCoefEAdp(int * coef, ///< input coefficients; only read here
+ int * quantCoeff, ///< per-position multipliers applied to abs(coef); only read here
+ int * deltaU, ///< output: remainder left after the qBits shift, scaled down by (qBits - 8) bits
+ int * qCoef, ///< output: quantized levels with the sign of coef reapplied
+ int * arlCCoef, ///< output: abs(coef) * quantCoeff, offset by half a step and shifted right by qBitsC
+ int qBitsC, ///< shift used for arlCCoef
+ int qBits, ///< shift used for the quantized level
+ int add, ///< offset added to the product before the qBits shift
+ int numCoeff) ///< number of coefficients to process
+{ // Returns the sum of the quantized levels as computed before the sign is reapplied.
+ int addc = 1 << (qBitsC - 1); // half of (1 << qBitsC): rounding offset for the arlCCoef shift
+ int qBits8 = qBits - 8; // shift applied to the remainder stored in deltaU
+ unsigned int acSum = 0; // running sum of the unsigned levels; this is the return value
+ int dstOffset = 0; // write index into qCoef, advanced by 4 after each vector store
+ // Each loop pass handles eight consecutive coefficients, unrolled by hand.
+ for (int blockpos = 0; blockpos < numCoeff; blockpos++) // NOTE(review): no tail handling; looks like numCoeff must be a multiple of 8 - confirm with callers
+ {
+ int level1;
+ int sign1;
+ level1 = coef[blockpos];
+ sign1 = (level1 < 0 ? -1 : 1); // zero is treated as positive
+ // Coefficient 1 of 8: the 64-bit product feeds all three per-coefficient outputs below.
+ __int64 tmplevel1 = (__int64)abs(level1) * quantCoeff[blockpos];
+ arlCCoef[blockpos] = (int)((tmplevel1 + addc) >> qBitsC);
+ level1 = (int)((tmplevel1 + add) >> qBits); // level1 now holds the unsigned quantized level
+ deltaU[blockpos] = (int)((tmplevel1 - (level1 << qBits)) >> qBits8); // product minus the rescaled level
+ blockpos++; // manual advance inside the unrolled body
+ // Coefficient 2 of 8: same steps as coefficient 1.
+ int level2;
+ int sign2;
+ level2 = coef[blockpos];
+ sign2 = (level2 < 0 ? -1 : 1);
+
+ __int64 tmplevel2 = (__int64)abs(level2) * quantCoeff[blockpos];
+ arlCCoef[blockpos] = (int)((tmplevel2 + addc) >> qBitsC);
+ level2 = (int)((tmplevel2 + add) >> qBits);
+ deltaU[blockpos] = (int)((tmplevel2 - (level2 << qBits)) >> qBits8);
+ blockpos++;
+ // Coefficient 3 of 8: same steps as coefficient 1.
+ int level3;
+ int sign3;
+ level3 = coef[blockpos];
+ sign3 = (level3 < 0 ? -1 : 1);
+
+ __int64 tmplevel3 = (__int64)abs(level3) * quantCoeff[blockpos];
+ arlCCoef[blockpos] = (int)((tmplevel3 + addc) >> qBitsC);
+ level3 = (int)((tmplevel3 + add) >> qBits);
+ deltaU[blockpos] = (int)((tmplevel3 - (level3 << qBits)) >> qBits8);
+ blockpos++;
+ // Coefficient 4 of 8: same steps as coefficient 1.
+ int level4;
+ int sign4;
+ level4 = coef[blockpos];
+ sign4 = (level4 < 0 ? -1 : 1);
+
+ __int64 tmplevel4 = (__int64)abs(level4) * quantCoeff[blockpos];
+ arlCCoef[blockpos] = (int)((tmplevel4 + addc) >> qBitsC);
+ level4 = (int)((tmplevel4 + add) >> qBits);
+ deltaU[blockpos] = (int)((tmplevel4 - (level4 << qBits)) >> qBits8);
+ blockpos++;
+ // Pack the first four unsigned levels and their signs into vectors.
+ Vec4i qLevel1(level1, level2, level3, level4);
+ Vec4i qSign1(sign1, sign2, sign3, sign4);
+ acSum += horizontal_add(qLevel1); // presumably the sum of the four lanes; taken before signs are applied
+ qLevel1 = qLevel1 * qSign1; // reapply the original coefficient signs
+ // Coefficient 5 of 8: same steps as coefficient 1.
+ int level5;
+ int sign5;
+ level5 = coef[blockpos];
+ sign5 = (level5 < 0 ? -1 : 1);
+
+ __int64 tmplevel5 = (__int64)abs(level5) * quantCoeff[blockpos];
+ arlCCoef[blockpos] = (int)((tmplevel5 + addc) >> qBitsC);
+ level5 = (int)((tmplevel5 + add) >> qBits);
+ deltaU[blockpos] = (int)((tmplevel5 - (level5 << qBits)) >> qBits8);
+ blockpos++;
+ // Coefficient 6 of 8: same steps as coefficient 1.
+ int level6;
+ int sign6;
+ level6 = coef[blockpos];
+ sign6 = (level6 < 0 ? -1 : 1);
+
+ __int64 tmplevel6 = (__int64)abs(level6) * quantCoeff[blockpos];
+ arlCCoef[blockpos] = (int)((tmplevel6 + addc) >> qBitsC);
+ level6 = (int)((tmplevel6 + add) >> qBits);
+ deltaU[blockpos] = (int)((tmplevel6 - (level6 << qBits)) >> qBits8);
+ blockpos++;
+ // Coefficient 7 of 8: same steps as coefficient 1.
+ int level7;
+ int sign7;
+ level7 = coef[blockpos];
+ sign7 = (level7 < 0 ? -1 : 1);
+
+ __int64 tmplevel7 = (__int64)abs(level7) * quantCoeff[blockpos];
+ arlCCoef[blockpos] = (int)((tmplevel7 + addc) >> qBitsC);
+ level7 = (int)((tmplevel7 + add) >> qBits);
+ deltaU[blockpos] = (int)((tmplevel7 - (level7 << qBits)) >> qBits8);
+ blockpos++;
+ // Coefficient 8 of 8: same steps as coefficient 1.
+ int level8;
+ int sign8;
+ level8 = coef[blockpos];
+ sign8 = (level8 < 0 ? -1 : 1);
+
+ __int64 tmplevel8 = (__int64)abs(level8) * quantCoeff[blockpos];
+ arlCCoef[blockpos] = (int)((tmplevel8 + addc) >> qBitsC);
+ level8 = (int)((tmplevel8 + add) >> qBits);
+ deltaU[blockpos] = (int)((tmplevel8 - (level8 << qBits)) >> qBits8);
+ // No blockpos++ here: the for-loop increment performs the eighth advance.
+ Vec4i qLevel2(level5, level6, level7, level8);
+ Vec4i qSign2(sign5, sign6, sign7, sign8);
+ acSum += horizontal_add(qLevel2);
+ qLevel2 = qLevel2 * qSign2;
+ Vec8s quantCoef = compress_saturated(qLevel1, qLevel2); // presumably narrows to 16-bit lanes with saturation - TODO confirm against the vector library
+ Vec4i quantCoef1 = extend_low(quantCoef); // presumably widens the low four lanes back to 32-bit
+ Vec4i quantCoef2 = extend_high(quantCoef); // presumably widens the high four lanes back to 32-bit
+ quantCoef1.store(qCoef + dstOffset); // first four results of this pass
+ dstOffset += 4;
+ quantCoef2.store(qCoef + dstOffset); // last four results of this pass
+ dstOffset += 4;
+ }
+
+ return acSum;
+}
}
#include "utils.h"
@@ -3948,6 +4072,7 @@
void NAME(Setup_Vec_DCTPrimitives)(EncoderPrimitives &p)
{
p.deQuant = xDeQuant;
+ p.calQuantCoefEAdp = xCalQuantCoefEAdp;
// TODO: in 16bpp mode, the intermediate must be 32-bits
#if !HIGH_BIT_DEPTH && INSTRSET > 4
More information about the x265-devel
mailing list