[x265] [PATCH 2 of 3] Merge: (common files) check need of signed/unsigned int
kavitha at multicorewareinc.com
Thu Oct 31 14:48:03 CET 2013
# HG changeset patch
# User Kavitha Sampath <kavitha at multicorewareinc.com>
# Date 1383225183 -19800
# Thu Oct 31 18:43:03 2013 +0530
# Node ID 2cdef1dd17b2d66dc5a84f2e40ae3130a3f9e325
# Parent 9bff4295adfc760e9fdebb6c9499e4a3b2cb7fab
# Parent 9a0da4e6d9e363e383eae7243f0c64026a5f6d00
Merge: (common files) check need of signed/unsigned int
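
Beyond the signed/unsigned cleanup in the common files (the UShort typedef is dropped in favour of uint16_t, and the interpolation-filter clip bound maxVal becomes uint16_t), this merge also pulls in the ipfilter_sp coefficient-index change, the per-depth m_avgCost/m_count accumulators in TComDataCU, the new luma/chroma blockcopy, luma_hvpp and luma_p2s primitives, and the m_Bitstream -> m_bitstream rename. A rough sketch of the maxVal intent follows; clip_pixel is illustrative only, not a function in the tree:

    #include <stdint.h>

    // Keep the clipping bound unsigned: it is a non-negative pixel maximum,
    // and uint16_t covers the full range up to a 16-bit depth, which a
    // signed 16-bit bound cannot.
    static inline uint16_t clip_pixel(int val, int bitDepth)
    {
        const uint16_t maxVal = (uint16_t)((1 << bitDepth) - 1);
        if (val < 0)
            return 0;
        if (val > (int)maxVal)
            return maxVal;
        return (uint16_t)val;
    }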
diff -r 9bff4295adfc -r 2cdef1dd17b2 .hgtags
--- a/.hgtags Thu Oct 31 15:40:28 2013 +0530
+++ b/.hgtags Thu Oct 31 18:43:03 2013 +0530
@@ -6,3 +6,4 @@
3767fbfa970ff4b2dc2e8647db0274168727147e 0.3
2ba6ec553f218d2b06ad803b87d6ec751fd639f7 0.4
93707bc4fccdaa89a1f2da11db8808ca912a691c 0.4.1
+69acb3cb777f977f5edde908069ac565915dd366 0.5
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibCommon/TComDataCU.cpp
--- a/source/Lib/TLibCommon/TComDataCU.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibCommon/TComDataCU.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -247,6 +247,12 @@
m_totalBits = 0;
m_numPartitions = pic->getNumPartInCU();
+ for (int i = 0; i < 4; i++)
+ {
+ m_avgCost[i] = 0;
+ m_count[i] = 0;
+ }
+
// CHECK_ME: why partStartIdx always negative
int partStartIdx = 0 - (cuAddr) * pic->getNumPartInCU();
@@ -287,7 +293,6 @@
if (numElements > 0)
{
memset(m_skipFlag + firstElement, false, numElements * sizeof(*m_skipFlag));
- memset(m_partSizes + firstElement, SIZE_NONE, numElements * sizeof(*m_partSizes));
memset(m_predModes + firstElement, MODE_NONE, numElements * sizeof(*m_predModes));
memset(m_cuTransquantBypass + firstElement, false, numElements * sizeof(*m_cuTransquantBypass));
memset(m_depth + firstElement, 0, numElements * sizeof(*m_depth));
@@ -297,8 +302,6 @@
memset(m_transformSkip[2] + firstElement, 0, numElements * sizeof(*m_transformSkip[2]));
memset(m_width + firstElement, g_maxCUWidth, numElements * sizeof(*m_width));
memset(m_height + firstElement, g_maxCUHeight, numElements * sizeof(*m_height));
- memset(m_mvpIdx[0] + firstElement, -1, numElements * sizeof(*m_mvpIdx[0]));
- memset(m_mvpIdx[1] + firstElement, -1, numElements * sizeof(*m_mvpIdx[1]));
memset(m_mvpNum[0] + firstElement, -1, numElements * sizeof(*m_mvpNum[0]));
memset(m_mvpNum[1] + firstElement, -1, numElements * sizeof(*m_mvpNum[1]));
memset(m_qp + firstElement, getSlice()->getSliceQp(), numElements * sizeof(*m_qp));
@@ -470,6 +473,12 @@
m_totalBits = 0;
m_numPartitions = cu->getTotalNumPart() >> 2;
+ for (int i = 0; i < 4; i++)
+ {
+ m_avgCost[i] = cu->m_avgCost[i];
+ m_count[i] = cu->m_count[i];
+ }
+
int iSizeInUchar = sizeof(UChar) * m_numPartitions;
int iSizeInBool = sizeof(bool) * m_numPartitions;
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibCommon/TComDataCU.h
--- a/source/Lib/TLibCommon/TComDataCU.h Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibCommon/TComDataCU.h Thu Oct 31 18:43:03 2013 +0530
@@ -178,6 +178,8 @@
UInt64 m_totalCost; ///< sum of partition RD costs
uint32_t m_totalDistortion; ///< sum of partition distortion
uint32_t m_totalBits; ///< sum of partition signal bits
+ UInt64 m_avgCost[4]; // stores the average cost of CUs in the frame at each depth
+ uint32_t m_count[4];
// -------------------------------------------------------------------------------------------------------------------
// create / destroy / initialize / copy
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibCommon/TComMotionInfo.cpp
--- a/source/Lib/TLibCommon/TComMotionInfo.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibCommon/TComMotionInfo.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -89,12 +89,6 @@
void TComCUMvField::clearMvField()
{
- for (int i = 0; i < m_numPartitions; i++)
- {
- m_mv[i] = 0;
- m_mvd[i] = 0;
- }
-
assert(sizeof(*m_refIdx) == 1);
memset(m_refIdx, NOT_VALID, m_numPartitions * sizeof(*m_refIdx));
}
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibCommon/TComMotionInfo.h
--- a/source/Lib/TLibCommon/TComMotionInfo.h Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibCommon/TComMotionInfo.h Thu Oct 31 18:43:03 2013 +0530
@@ -84,7 +84,7 @@
/// class for motion information in one CU
class TComCUMvField
{
-private:
+public:
MV* m_mv;
MV* m_mvd;
@@ -95,8 +95,6 @@
template<typename T>
void setAll(T *p, T const & val, PartSize cuMode, int partAddr, uint32_t depth, int partIdx);
-public:
-
TComCUMvField() : m_mv(NULL), m_mvd(NULL), m_refIdx(NULL), m_numPartitions(0) {}
~TComCUMvField() {}
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibCommon/TComPrediction.cpp
--- a/source/Lib/TLibCommon/TComPrediction.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibCommon/TComPrediction.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -499,7 +499,7 @@
int filterSize = NTAPS_LUMA;
int halfFilterSize = (filterSize >> 1);
primitives.ipfilter_ps[FILTER_H_P_S_8](src - (halfFilterSize - 1) * srcStride, srcStride, m_immedVals, tmpStride, width, height + filterSize - 1, g_lumaFilter[xFrac]);
- primitives.ipfilter_sp[FILTER_V_S_P_8](m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, width, height, g_lumaFilter[yFrac]);
+ primitives.ipfilter_sp[FILTER_V_S_P_8](m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, width, height, yFrac);
}
}
@@ -516,6 +516,9 @@
int xFrac = mv->x & 0x3;
int yFrac = mv->y & 0x3;
+ assert((width % 4) + (height % 4) == 0);
+ assert(dstStride == MAX_CU_SIZE);
+
if ((yFrac | xFrac) == 0)
{
primitives.ipfilter_p2s(ref, refStride, dst, dstStride, width, height);
@@ -590,10 +593,10 @@
int halfFilterSize = (filterSize >> 1);
primitives.ipfilter_ps[FILTER_H_P_S_4](refCb - (halfFilterSize - 1) * refStride, refStride, m_immedVals, extStride, cxWidth, cxHeight + filterSize - 1, g_chromaFilter[xFrac]);
- primitives.ipfilter_sp[FILTER_V_S_P_4](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, cxWidth, cxHeight, g_chromaFilter[yFrac]);
+ primitives.ipfilter_sp[FILTER_V_S_P_4](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, cxWidth, cxHeight, yFrac);
primitives.ipfilter_ps[FILTER_H_P_S_4](refCr - (halfFilterSize - 1) * refStride, refStride, m_immedVals, extStride, cxWidth, cxHeight + filterSize - 1, g_chromaFilter[xFrac]);
- primitives.ipfilter_sp[FILTER_V_S_P_4](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, cxWidth, cxHeight, g_chromaFilter[yFrac]);
+ primitives.ipfilter_sp[FILTER_V_S_P_4](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, cxWidth, cxHeight, yFrac);
}
}
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibCommon/TComTrQuant.cpp
--- a/source/Lib/TLibCommon/TComTrQuant.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -640,7 +640,7 @@
{
uint32_t posY = blkPos >> log2BlkSize;
uint32_t posX = blkPos - (posY << log2BlkSize);
- UShort ctxSig = getSigCtxInc(patternSigCtx, scanIdx, posX, posY, log2BlkSize, ttype);
+ uint16_t ctxSig = getSigCtxInc(patternSigCtx, scanIdx, posX, posY, log2BlkSize, ttype);
level = xGetCodedLevel(costCoeff[scanPos], costCoeff0[scanPos], costSig[scanPos],
levelDouble, maxAbsLevel, ctxSig, oneCtx, absCtx, goRiceParam,
c1Idx, c2Idx, qbits, scaleFactor, 0);
@@ -1149,10 +1149,10 @@
double& codedCostSig,
int levelDouble,
uint32_t maxAbsLevel,
- UShort ctxNumSig,
- UShort ctxNumOne,
- UShort ctxNumAbs,
- UShort absGoRice,
+ uint16_t ctxNumSig,
+ uint16_t ctxNumOne,
+ uint16_t ctxNumAbs,
+ uint16_t absGoRice,
uint32_t c1Idx,
uint32_t c2Idx,
int qbits,
@@ -1207,9 +1207,9 @@
* \returns cost of given absolute transform level
*/
inline double TComTrQuant::xGetICRateCost(uint32_t absLevel,
- UShort ctxNumOne,
- UShort ctxNumAbs,
- UShort absGoRice,
+ uint16_t ctxNumOne,
+ uint16_t ctxNumAbs,
+ uint16_t absGoRice,
uint32_t c1Idx,
uint32_t c2Idx) const
{
@@ -1263,9 +1263,9 @@
}
inline int TComTrQuant::xGetICRate(uint32_t absLevel,
- UShort ctxNumOne,
- UShort ctxNumAbs,
- UShort absGoRice,
+ uint16_t ctxNumOne,
+ uint16_t ctxNumAbs,
+ uint16_t absGoRice,
uint32_t c1Idx,
uint32_t c2Idx) const
{
@@ -1290,8 +1290,8 @@
symbol = std::min<uint32_t>(symbol, (maxVlc + 1));
}
- UShort prefLen = UShort(symbol >> absGoRice) + 1;
- UShort numBins = std::min<uint32_t>(prefLen, g_goRicePrefixLen[absGoRice]) + absGoRice;
+ uint16_t prefLen = uint16_t(symbol >> absGoRice) + 1;
+ uint16_t numBins = std::min<uint32_t>(prefLen, g_goRicePrefixLen[absGoRice]) + absGoRice;
rate += numBins << 15;
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibCommon/TComTrQuant.h
--- a/source/Lib/TLibCommon/TComTrQuant.h Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibCommon/TComTrQuant.h Thu Oct 31 18:43:03 2013 +0530
@@ -200,18 +200,18 @@
uint32_t xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, TCoeff* dstCoeff, uint32_t width, uint32_t height, TextType ttype, uint32_t absPartIdx, int32_t *lastPos);
inline uint32_t xGetCodedLevel(double& codedCost, double& codedCost0, double& codedCostSig, int levelDouble,
- uint32_t maxAbsLevel, UShort ctxNumSig, UShort ctxNumOne, UShort ctxNumAbs, UShort absGoRice,
+ uint32_t maxAbsLevel, uint16_t ctxNumSig, uint16_t ctxNumOne, uint16_t ctxNumAbs, uint16_t absGoRice,
uint32_t c1Idx, uint32_t c2Idx, int qbits, double scale, bool bLast) const;
- inline double xGetICRateCost(uint32_t absLevel, UShort ctxNumOne, UShort ctxNumAbs, UShort absGoRice, uint32_t c1Idx, uint32_t c2Idx) const;
+ inline double xGetICRateCost(uint32_t absLevel, uint16_t ctxNumOne, uint16_t ctxNumAbs, uint16_t absGoRice, uint32_t c1Idx, uint32_t c2Idx) const;
- inline int xGetICRate(uint32_t absLevel, UShort ctxNumOne, UShort ctxNumAbs, UShort absGoRice, uint32_t c1Idx, uint32_t c2Idx) const;
+ inline int xGetICRate(uint32_t absLevel, uint16_t ctxNumOne, uint16_t ctxNumAbs, uint16_t absGoRice, uint32_t c1Idx, uint32_t c2Idx) const;
inline double xGetRateLast(uint32_t posx, uint32_t posy) const;
- inline double xGetRateSigCoeffGroup(UShort sigCoeffGroup, UShort ctxNumSig) const { return m_lambda * m_estBitsSbac->significantCoeffGroupBits[ctxNumSig][sigCoeffGroup]; }
+ inline double xGetRateSigCoeffGroup(uint16_t sigCoeffGroup, uint16_t ctxNumSig) const { return m_lambda * m_estBitsSbac->significantCoeffGroupBits[ctxNumSig][sigCoeffGroup]; }
- inline double xGetRateSigCoef(UShort sig, UShort ctxNumSig) const { return m_lambda * m_estBitsSbac->significantBits[ctxNumSig][sig]; }
+ inline double xGetRateSigCoef(uint16_t sig, uint16_t ctxNumSig) const { return m_lambda * m_estBitsSbac->significantBits[ctxNumSig][sig]; }
inline double xGetICost(double rage) const { return m_lambda * rage; } ///< Get the cost for a specific rate
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibCommon/TypeDef.h
--- a/source/Lib/TLibCommon/TypeDef.h Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibCommon/TypeDef.h Thu Oct 31 18:43:03 2013 +0530
@@ -52,7 +52,6 @@
// ====================================================================================================================
typedef unsigned char UChar;
-typedef unsigned short UShort;
// ====================================================================================================================
// 64-bit integer type
@@ -71,7 +70,7 @@
// ====================================================================================================================
#if HIGH_BIT_DEPTH
-typedef UShort Pel; // 16-bit pixel type
+typedef uint16_t Pel; // 16-bit pixel type
#define X265_DEPTH x265::g_bitDepth // runtime configurable bit depth
extern uint32_t g_bitDepth;
#else
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibEncoder/NALwrite.cpp
--- a/source/Lib/TLibEncoder/NALwrite.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibEncoder/NALwrite.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -82,8 +82,8 @@
* - 0x00000302
* - 0x00000303
*/
- uint32_t fsize = nalu.m_Bitstream.getByteStreamLength();
- uint8_t* fifo = nalu.m_Bitstream.getFIFO();
+ uint32_t fsize = nalu.m_bitstream.getByteStreamLength();
+ uint8_t* fifo = nalu.m_bitstream.getFIFO();
uint8_t* emulation = (uint8_t*)X265_MALLOC(uint8_t, fsize + EMULATION_SIZE);
uint32_t nalsize = 0;
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibEncoder/NALwrite.h
--- a/source/Lib/TLibEncoder/NALwrite.h Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibEncoder/NALwrite.h Thu Oct 31 18:43:03 2013 +0530
@@ -61,17 +61,17 @@
uint32_t temporalID = 0,
uint32_t reserved_zero_6bits = 0)
: NALUnit(nalUnitType, temporalID, reserved_zero_6bits)
- , m_Bitstream()
+ , m_bitstream()
{}
OutputNALUnit& operator =(const NALUnit& src)
{
- m_Bitstream.clear();
+ m_bitstream.clear();
static_cast<NALUnit*>(this)->operator =(src);
return *this;
}
- TComOutputBitstream m_Bitstream;
+ TComOutputBitstream m_bitstream;
};
void write(uint8_t*& out, OutputNALUnit& nalu, uint32_t& packetSize);
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -2115,7 +2115,7 @@
* \param bValid
* \returns void
*/
-void TEncSearch::xMergeEstimation(TComDataCU* cu, int puIdx, uint32_t& interDir, TComMvField* mvField, uint32_t& mergeIndex, uint32_t& outCost, TComMvField* mvFieldNeighbours, UChar* interDirNeighbours, int& numValidMergeCand)
+void TEncSearch::xMergeEstimation(TComDataCU* cu, int puIdx, uint32_t& interDir, TComMvField* mvField, uint32_t& mergeIndex, uint32_t& outCost, uint32_t& outbits, TComMvField* mvFieldNeighbours, UChar* interDirNeighbours, int& numValidMergeCand)
{
uint32_t absPartIdx = 0;
int width = 0;
@@ -2145,10 +2145,10 @@
uint32_t costCand = MAX_UINT;
uint32_t bitsCand = 0;
- PartSize size = cu->getPartitionSize(0);
-
- cu->getCUMvField(REF_PIC_LIST_0)->setAllMvField(mvFieldNeighbours[0 + 2 * mergeCand], size, absPartIdx, 0, puIdx);
- cu->getCUMvField(REF_PIC_LIST_1)->setAllMvField(mvFieldNeighbours[1 + 2 * mergeCand], size, absPartIdx, 0, puIdx);
+ cu->getCUMvField(REF_PIC_LIST_0)->m_mv[absPartIdx] = mvFieldNeighbours[0 + 2 * mergeCand].mv;
+ cu->getCUMvField(REF_PIC_LIST_0)->m_refIdx[absPartIdx] = mvFieldNeighbours[0 + 2 * mergeCand].refIdx;
+ cu->getCUMvField(REF_PIC_LIST_1)->m_mv[absPartIdx] = mvFieldNeighbours[1 + 2 * mergeCand].mv;
+ cu->getCUMvField(REF_PIC_LIST_1)->m_refIdx[absPartIdx] = mvFieldNeighbours[1 + 2 * mergeCand].refIdx;
costCand = xGetInterPredictionError(cu, puIdx);
bitsCand = mergeCand + 1;
@@ -2160,6 +2160,7 @@
if (costCand < outCost)
{
outCost = costCand;
+ outbits = bitsCand;
mvField[0] = mvFieldNeighbours[0 + 2 * mergeCand];
mvField[1] = mvFieldNeighbours[1 + 2 * mergeCand];
interDir = interDirNeighbours[mergeCand];
@@ -2226,6 +2227,8 @@
UChar interDirNeighbours[MRG_MAX_NUM_CANDS];
int numValidMergeCand = 0;
+ int totalmebits = 0;
+
for (int partIdx = 0; partIdx < numPart; partIdx++)
{
uint32_t listCost[2] = { MAX_UINT, MAX_UINT };
@@ -2495,7 +2498,8 @@
// find Merge result
uint32_t mrgCost = MAX_UINT;
- xMergeEstimation(cu, partIdx, mrgInterDir, mrgMvField, mrgIndex, mrgCost, mvFieldNeighbours, interDirNeighbours, numValidMergeCand);
+ uint32_t mrgBits = 0;
+ xMergeEstimation(cu, partIdx, mrgInterDir, mrgMvField, mrgIndex, mrgCost, mrgBits, mvFieldNeighbours, interDirNeighbours, numValidMergeCand);
if (mrgCost < meCost)
{
// set Merge result
@@ -2517,6 +2521,7 @@
#if CU_STAT_LOGFILE
meCost += mrgCost;
#endif
+ totalmebits += mrgBits;
}
else
{
@@ -2530,11 +2535,18 @@
#if CU_STAT_LOGFILE
meCost += meCost;
#endif
+ totalmebits += mebits;
}
}
+ else
+ {
+ totalmebits += mebits;
+ }
motionCompensation(cu, predYuv, REF_PIC_LIST_X, partIdx, bLuma, bChroma);
}
+ cu->m_totalBits = totalmebits;
+
setWpScalingDistParam(cu, -1, REF_PIC_LIST_X);
}
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibEncoder/TEncSearch.h
--- a/source/Lib/TLibEncoder/TEncSearch.h Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.h Thu Oct 31 18:43:03 2013 +0530
@@ -211,7 +211,7 @@
void xGetBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3]);
void xMergeEstimation(TComDataCU* cu, int partIdx, uint32_t& uiInterDir,
- TComMvField* pacMvField, uint32_t& mergeIndex, uint32_t& outCost,
+ TComMvField* pacMvField, uint32_t& mergeIndex, uint32_t& outCost, uint32_t& outbits,
TComMvField* mvFieldNeighbors, UChar* interDirNeighbors, int& numValidMergeCand);
void xRestrictBipredMergeCand(TComDataCU* cu, uint32_t puIdx, TComMvField* mvFieldNeighbours,
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/common.cpp
--- a/source/common/common.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/common.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -186,7 +186,7 @@
/* Rate control options */
param->rc.bitrate = 0;
- param->rc.rateTolerance = 0.1;
+ param->rc.rateTolerance = 1.0;
param->rc.qCompress = 0.6;
param->rc.ipFactor = 1.4f;
param->rc.pbFactor = 1.3f;
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/ipfilter.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -37,12 +37,14 @@
namespace {
template<int N>
-void filterVertical_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int16_t const *coeff)
+void filterVertical_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int coeffIdx)
{
int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
int shift = IF_FILTER_PREC + headRoom;
int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
- int16_t maxVal = (1 << X265_DEPTH) - 1;
+ uint16_t maxVal = (1 << X265_DEPTH) - 1;
+ const int16_t *coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
+
src -= (N / 2 - 1) * srcStride;
int row, col;
@@ -82,7 +84,7 @@
{
int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
int offset = (1 << (headRoom - 1));
- int16_t maxVal = (1 << X265_DEPTH) - 1;
+ uint16_t maxVal = (1 << X265_DEPTH) - 1;
const int cStride = 1;
src -= (N / 2 - 1) * cStride;
@@ -226,7 +228,7 @@
{
int shift = IF_INTERNAL_PREC - X265_DEPTH;
int16_t offset = IF_INTERNAL_OFFS + (shift ? (1 << (shift - 1)) : 0);
- int16_t maxVal = (1 << X265_DEPTH) - 1;
+ uint16_t maxVal = (1 << X265_DEPTH) - 1;
int row, col;
for (row = 0; row < height; row++)
{
@@ -262,12 +264,30 @@
}
}
+void filterConvertPelToShort_c(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+{
+ int shift = IF_INTERNAL_PREC - X265_DEPTH;
+ int row, col;
+
+ for (row = 0; row < height; row++)
+ {
+ for (col = 0; col < width; col++)
+ {
+ int16_t val = src[col] << shift;
+ dst[col] = val - (int16_t)IF_INTERNAL_OFFS;
+ }
+
+ src += srcStride;
+ dst += MAX_CU_SIZE;
+ }
+}
+
template<int N>
void filterVertical_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int16_t const *c)
{
int shift = IF_FILTER_PREC;
int offset = 1 << (shift - 1);
- int16_t maxVal = (1 << X265_DEPTH) - 1;
+ uint16_t maxVal = (1 << X265_DEPTH) - 1;
src -= (N / 2 - 1) * srcStride;
int row, col;
@@ -328,7 +348,7 @@
int16_t const * coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
int offset = (1 << (headRoom - 1));
- int16_t maxVal = (1 << X265_DEPTH) - 1;
+ uint16_t maxVal = (1 << X265_DEPTH) - 1;
int cStride = 1;
src -= (N / 2 - 1) * cStride;
@@ -368,7 +388,7 @@
int16_t const * c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
int shift = IF_FILTER_PREC;
int offset = 1 << (shift - 1);
- int16_t maxVal = (1 << X265_DEPTH) - 1;
+ uint16_t maxVal = (1 << X265_DEPTH) - 1;
src -= (N / 2 - 1) * srcStride;
int row, col;
@@ -401,6 +421,17 @@
dst += dstStride;
}
}
+typedef void (*ipfilter_ps_t)(pixel *src, intptr_t srcStride, short *dst, intptr_t dstStride, int width, int height, const short *coeff);
+typedef void (*ipfilter_sp_t)(short *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const short *coeff);
+
+template<int N, int width, int height>
+void interp_hv_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
+{
+ short m_immedVals[(64 + 8) * (64 + 8)];
+ filterHorizontal_ps_c<N>(src - 3 * srcStride, srcStride, m_immedVals, width, width, height + 7, g_lumaFilter[idxX]);
+ filterVertical_sp_c<N>(m_immedVals + 3 * width, width, dst, dstStride, width, height, idxY);
+}
+
}
namespace x265 {
@@ -412,7 +443,8 @@
#define LUMA(W, H) \
p.luma_hpp[LUMA_ ## W ## x ## H] = interp_horiz_pp_c<8, W, H>;\
- p.luma_vpp[LUMA_ ## W ## x ## H] = interp_vert_pp_c<8, W, H>
+ p.luma_vpp[LUMA_ ## W ## x ## H] = interp_vert_pp_c<8, W, H>; \
+ p.luma_hvpp[LUMA_ ## W ## x ## H] = interp_hv_pp_c<8, W, H>;
void Setup_C_IPFilterPrimitives(EncoderPrimitives& p)
{
@@ -457,6 +489,7 @@
p.ipfilter_p2s = filterConvertPelToShort_c;
p.ipfilter_s2p = filterConvertShortToPel_c;
+ p.luma_p2s = filterConvertPelToShort_c;
p.extendRowBorder = extendCURowColBorder;
}
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/pixel.cpp
--- a/source/common/pixel.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/pixel.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -758,6 +758,21 @@
}
}
}
+
+template<int bx, int by>
+void blockcopy_pp_c(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb)
+{
+ for (int y = 0; y < by; y++)
+ {
+ for (int x = 0; x < bx; x++)
+ {
+ a[x] = b[x];
+ }
+
+ a += stridea;
+ b += strideb;
+ }
+}
} // end anonymous namespace
namespace x265 {
@@ -798,6 +813,37 @@
p.satd[LUMA_64x16] = satd8<64, 16>;
p.satd[LUMA_16x64] = satd8<16, 64>;
+#define CHROMA(W, H) \
+ p.chroma_copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>
+#define LUMA(W, H) \
+ p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>
+
+ LUMA(4, 4);
+ LUMA(8, 8); CHROMA(4, 4);
+ LUMA(4, 8); CHROMA(2, 4);
+ LUMA(8, 4); CHROMA(4, 2);
+ LUMA(16, 16); CHROMA(8, 8);
+ LUMA(16, 8); CHROMA(8, 4);
+ LUMA( 8, 16); CHROMA(4, 8);
+ LUMA(16, 12); CHROMA(8, 6);
+ LUMA(12, 16); CHROMA(6, 8);
+ LUMA(16, 4); CHROMA(8, 2);
+ LUMA( 4, 16); CHROMA(2, 8);
+ LUMA(32, 32); CHROMA(16, 16);
+ LUMA(32, 16); CHROMA(16, 8);
+ LUMA(16, 32); CHROMA(8, 16);
+ LUMA(32, 24); CHROMA(16, 12);
+ LUMA(24, 32); CHROMA(12, 16);
+ LUMA(32, 8); CHROMA(16, 4);
+ LUMA( 8, 32); CHROMA(4, 16);
+ LUMA(64, 64); CHROMA(32, 32);
+ LUMA(64, 32); CHROMA(32, 16);
+ LUMA(32, 64); CHROMA(16, 32);
+ LUMA(64, 48); CHROMA(32, 24);
+ LUMA(48, 64); CHROMA(24, 32);
+ LUMA(64, 16); CHROMA(32, 8);
+ LUMA(16, 64); CHROMA(8, 32);
+
//sse
#if HIGH_BIT_DEPTH
SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixelcmp_t, int16_t, int16_t)
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/primitives.h
--- a/source/common/primitives.h Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/primitives.h Thu Oct 31 18:43:03 2013 +0530
@@ -66,7 +66,7 @@
{ // Square Rectangular Asymmetrical (0.75, 0.25)
LUMA_4x4,
LUMA_8x8, LUMA_8x4, LUMA_4x8,
- LUMA_16x16, LUMA_16x8, LUMA_8x16, LUMA_16x12, LUMA_12x16, LUMA_4x16, LUMA_16x4,
+ LUMA_16x16, LUMA_16x8, LUMA_8x16, LUMA_16x12, LUMA_12x16, LUMA_16x4, LUMA_4x16,
LUMA_32x32, LUMA_32x16, LUMA_16x32, LUMA_32x24, LUMA_24x32, LUMA_32x8, LUMA_8x32,
LUMA_64x64, LUMA_64x32, LUMA_32x64, LUMA_64x48, LUMA_48x64, LUMA_64x16, LUMA_16x64,
NUM_LUMA_PARTITIONS
@@ -165,7 +165,7 @@
typedef void (*pixelcmp_x3_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, intptr_t frefstride, int32_t *res);
typedef void (*ipfilter_pp_t)(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
typedef void (*ipfilter_ps_t)(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
-typedef void (*ipfilter_sp_t)(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
+typedef void (*ipfilter_sp_t)(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
typedef void (*ipfilter_ss_t)(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
typedef void (*ipfilter_p2s_t)(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height);
typedef void (*ipfilter_s2p_t)(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height);
@@ -209,6 +209,10 @@
typedef void (*plane_copy_deinterleave_t)(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride, pixel *src, intptr_t srcStride, int w, int h);
typedef void (*filter_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_hv_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY);
+typedef void (*filter_p2s_t)(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+
+typedef void (*copy_pp_t)(pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned
/* Define a structure containing function pointers to optimized encoder
* primitives. Each pointer can reference either an assembly routine,
@@ -234,6 +238,9 @@
cvt16to16_shl_t cvt16to16_shl;
cvt32to16_shr_t cvt32to16_shr;
+ copy_pp_t luma_copy_pp[NUM_LUMA_PARTITIONS];
+ copy_pp_t chroma_copy_pp[NUM_CHROMA_PARTITIONS];
+
ipfilter_pp_t ipfilter_pp[NUM_IPFILTER_P_P];
ipfilter_ps_t ipfilter_ps[NUM_IPFILTER_P_S];
ipfilter_sp_t ipfilter_sp[NUM_IPFILTER_S_P];
@@ -245,6 +252,8 @@
filter_pp_t luma_hpp[NUM_LUMA_PARTITIONS];
filter_pp_t chroma_vpp[NUM_CHROMA_PARTITIONS];
filter_pp_t luma_vpp[NUM_LUMA_PARTITIONS];
+ filter_hv_pp_t luma_hvpp[NUM_LUMA_PARTITIONS];
+ filter_p2s_t luma_p2s;
intra_dc_t intra_pred_dc;
intra_planar_t intra_pred_planar;
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/threadpool.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -369,7 +369,7 @@
static int get_cpu_count()
{
-#if WIN32
+#if _WIN32
SYSTEM_INFO sysinfo;
GetSystemInfo(&sysinfo);
return sysinfo.dwNumberOfProcessors;
@@ -393,8 +393,8 @@
}
return count;
-#else // if WIN32
+#else // if _WIN32
return 2; // default to 2 threads, everywhere else
-#endif // if WIN32
+#endif // if _WIN32
}
} // end namespace x265
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/vec/ipfilter-sse41.cpp
--- a/source/common/vec/ipfilter-sse41.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/vec/ipfilter-sse41.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -34,6 +34,8 @@
#include <assert.h>
#include <string.h>
+using namespace x265;
+
#if !HIGH_BIT_DEPTH
namespace {
ALIGN_VAR_32(const uint16_t, c_512[16]) =
@@ -42,8 +44,10 @@
};
template<int N>
-void filterVertical_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int16_t const *coeff)
+void filterVertical_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int coeffIdx)
{
+ const int16_t *coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
+
src -= (N / 2 - 1) * srcStride;
int offset;
@@ -677,8 +681,9 @@
#include "vectorclass.h"
namespace {
template<int N>
-void filterVertical_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int block_width, int block_height, const int16_t *coeff)
+void filterVertical_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int block_width, int block_height, int coeffIdx)
{
+ const int16_t *coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
int row, col;
src -= (N / 2 - 1) * srcStride;
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/vec/pixel-sse41.cpp
--- a/source/common/vec/pixel-sse41.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/vec/pixel-sse41.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -31,1335 +31,8 @@
using namespace x265;
-#if defined(_MSC_VER)
-#pragma warning(disable: 4799) // MMX warning EMMS
-#endif
-
-#if defined(__INTEL_COMPILER) || defined(__GCC__)
-#define HAVE_MMX 1
-#elif defined(_MSC_VER) && defined(X86_64)
-#define HAVE_MMX 0
-#else
-#define HAVE_MMX 1
-#endif
-
namespace {
#if !HIGH_BIT_DEPTH
-#if HAVE_MMX
-template<int ly>
-// ly will always be 32
-int sad_8(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
-{
- __m64 sum0 = _mm_setzero_si64();
-
- __m64 T00, T01, T02, T03;
- __m64 T10, T11, T12, T13;
- __m64 T20, T21, T22, T23;
-
- for (int i = 0; i < ly; i += 16)
- {
- T00 = (*(__m64*)(fenc + (i + 0) * fencstride));
- T01 = (*(__m64*)(fenc + (i + 1) * fencstride));
- T02 = (*(__m64*)(fenc + (i + 2) * fencstride));
- T03 = (*(__m64*)(fenc + (i + 3) * fencstride));
-
- T10 = (*(__m64*)(fref + (i + 0) * frefstride));
- T11 = (*(__m64*)(fref + (i + 1) * frefstride));
- T12 = (*(__m64*)(fref + (i + 2) * frefstride));
- T13 = (*(__m64*)(fref + (i + 3) * frefstride));
-
- T20 = _mm_sad_pu8(T00, T10);
- T21 = _mm_sad_pu8(T01, T11);
- T22 = _mm_sad_pu8(T02, T12);
- T23 = _mm_sad_pu8(T03, T13);
-
- sum0 = _mm_add_pi16(sum0, T20);
- sum0 = _mm_add_pi16(sum0, T21);
- sum0 = _mm_add_pi16(sum0, T22);
- sum0 = _mm_add_pi16(sum0, T23);
-
- T00 = (*(__m64*)(fenc + (i + 4) * fencstride));
- T01 = (*(__m64*)(fenc + (i + 5) * fencstride));
- T02 = (*(__m64*)(fenc + (i + 6) * fencstride));
- T03 = (*(__m64*)(fenc + (i + 7) * fencstride));
-
- T10 = (*(__m64*)(fref + (i + 4) * frefstride));
- T11 = (*(__m64*)(fref + (i + 5) * frefstride));
- T12 = (*(__m64*)(fref + (i + 6) * frefstride));
- T13 = (*(__m64*)(fref + (i + 7) * frefstride));
-
- T20 = _mm_sad_pu8(T00, T10);
- T21 = _mm_sad_pu8(T01, T11);
- T22 = _mm_sad_pu8(T02, T12);
- T23 = _mm_sad_pu8(T03, T13);
-
- sum0 = _mm_add_pi16(sum0, T20);
- sum0 = _mm_add_pi16(sum0, T21);
- sum0 = _mm_add_pi16(sum0, T22);
- sum0 = _mm_add_pi16(sum0, T23);
-
- T00 = (*(__m64*)(fenc + (i + 8) * fencstride));
- T01 = (*(__m64*)(fenc + (i + 9) * fencstride));
- T02 = (*(__m64*)(fenc + (i + 10) * fencstride));
- T03 = (*(__m64*)(fenc + (i + 11) * fencstride));
-
- T10 = (*(__m64*)(fref + (i + 8) * frefstride));
- T11 = (*(__m64*)(fref + (i + 9) * frefstride));
- T12 = (*(__m64*)(fref + (i + 10) * frefstride));
- T13 = (*(__m64*)(fref + (i + 11) * frefstride));
-
- T20 = _mm_sad_pu8(T00, T10);
- T21 = _mm_sad_pu8(T01, T11);
- T22 = _mm_sad_pu8(T02, T12);
- T23 = _mm_sad_pu8(T03, T13);
-
- sum0 = _mm_add_pi16(sum0, T20);
- sum0 = _mm_add_pi16(sum0, T21);
- sum0 = _mm_add_pi16(sum0, T22);
- sum0 = _mm_add_pi16(sum0, T23);
-
- T00 = (*(__m64*)(fenc + (i + 12) * fencstride));
- T01 = (*(__m64*)(fenc + (i + 13) * fencstride));
- T02 = (*(__m64*)(fenc + (i + 14) * fencstride));
- T03 = (*(__m64*)(fenc + (i + 15) * fencstride));
-
- T10 = (*(__m64*)(fref + (i + 12) * frefstride));
- T11 = (*(__m64*)(fref + (i + 13) * frefstride));
- T12 = (*(__m64*)(fref + (i + 14) * frefstride));
- T13 = (*(__m64*)(fref + (i + 15) * frefstride));
-
- T20 = _mm_sad_pu8(T00, T10);
- T21 = _mm_sad_pu8(T01, T11);
- T22 = _mm_sad_pu8(T02, T12);
- T23 = _mm_sad_pu8(T03, T13);
-
- sum0 = _mm_add_pi16(sum0, T20);
- sum0 = _mm_add_pi16(sum0, T21);
- sum0 = _mm_add_pi16(sum0, T22);
- sum0 = _mm_add_pi16(sum0, T23);
- }
-
- // 8 * 255 -> 11 bits x 8 -> 14 bits
- return _m_to_int(sum0);
-}
-
-#else /* if HAVE_MMX */
-
-template<int ly>
-// ly will always be 32
-int sad_8(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
-{
- __m128i sum0 = _mm_setzero_si128();
- __m128i sum1 = _mm_setzero_si128();
- __m128i T00, T01, T02, T03;
- __m128i T10, T11, T12, T13;
- __m128i T20, T21;
-
- for (int i = 0; i < ly; i += 8)
- {
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (i + 0) * fencstride));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (i + 1) * fencstride));
- T01 = _mm_unpacklo_epi64(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (i + 2) * fencstride));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (i + 3) * fencstride));
- T03 = _mm_unpacklo_epi64(T02, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref + (i + 0) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref + (i + 1) * frefstride));
- T11 = _mm_unpacklo_epi64(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref + (i + 2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref + (i + 3) * frefstride));
- T13 = _mm_unpacklo_epi64(T12, T13);
- T20 = _mm_sad_epu8(T01, T11);
- T21 = _mm_sad_epu8(T03, T13);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum1 = _mm_add_epi32(sum1, T21);
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (i + 4) * fencstride));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (i + 5) * fencstride));
- T01 = _mm_unpacklo_epi64(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (i + 6) * fencstride));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (i + 7) * fencstride));
- T03 = _mm_unpacklo_epi64(T02, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref + (i + 4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref + (i + 5) * frefstride));
- T11 = _mm_unpacklo_epi64(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref + (i + 6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref + (i + 7) * frefstride));
- T13 = _mm_unpacklo_epi64(T12, T13);
- T20 = _mm_sad_epu8(T01, T11);
- T21 = _mm_sad_epu8(T03, T13);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum1 = _mm_add_epi32(sum1, T21);
- }
-
- // [0 x 0 x]
- sum0 = _mm_add_epi32(sum0, sum1);
- sum1 = _mm_shuffle_epi32(sum0, 2);
- sum0 = _mm_add_epi32(sum0, sum1);
- return _mm_cvtsi128_si32(sum0);
-}
-
-#endif /* if HAVE_MMX */
-
-template<int ly>
-// will only be instanced with ly == 16
-int sad_12(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride)
-{
- assert(ly == 16);
- __m128i sum0 = _mm_setzero_si128();
- __m128i sum1 = _mm_setzero_si128();
- __m128i T00, T01, T02, T03;
- __m128i T10, T11, T12, T13;
- __m128i T20, T21, T22, T23;
-
-#define MASK _mm_set_epi32(0x00000000, 0xffffffff, 0xffffffff, 0xffffffff)
-
-#define PROCESS_12x4(BASE) \
- T00 = _mm_load_si128((__m128i*)(fenc + (BASE + 0) * fencstride)); \
- T00 = _mm_and_si128(T00, MASK); \
- T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * fencstride)); \
- T01 = _mm_and_si128(T01, MASK); \
- T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * fencstride)); \
- T02 = _mm_and_si128(T02, MASK); \
- T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * fencstride)); \
- T03 = _mm_and_si128(T03, MASK); \
- T10 = _mm_loadu_si128((__m128i*)(fref + (BASE + 0) * frefstride)); \
- T10 = _mm_and_si128(T10, MASK); \
- T11 = _mm_loadu_si128((__m128i*)(fref + (BASE + 1) * frefstride)); \
- T11 = _mm_and_si128(T11, MASK); \
- T12 = _mm_loadu_si128((__m128i*)(fref + (BASE + 2) * frefstride)); \
- T12 = _mm_and_si128(T12, MASK); \
- T13 = _mm_loadu_si128((__m128i*)(fref + (BASE + 3) * frefstride)); \
- T13 = _mm_and_si128(T13, MASK); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- sum0 = _mm_add_epi16(sum0, T20); \
- sum0 = _mm_add_epi16(sum0, T21); \
- sum0 = _mm_add_epi16(sum0, T22); \
- sum0 = _mm_add_epi16(sum0, T23)
-
- PROCESS_12x4(0);
- PROCESS_12x4(4);
- PROCESS_12x4(8);
- PROCESS_12x4(12);
-
- sum1 = _mm_shuffle_epi32(sum0, 2);
- sum0 = _mm_add_epi32(sum0, sum1);
-
- return _mm_cvtsi128_si32(sum0);
-}
-
-template<int ly>
-int sad_16(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
-{
- __m128i sum0 = _mm_setzero_si128();
- __m128i sum1 = _mm_setzero_si128();
- __m128i T00, T01, T02, T03;
- __m128i T10, T11, T12, T13;
- __m128i T20, T21, T22, T23;
-
-#define PROCESS_16x4(BASE) \
- T00 = _mm_load_si128((__m128i*)(fenc + (BASE + 0) * fencstride)); \
- T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * fencstride)); \
- T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * fencstride)); \
- T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * fencstride)); \
- T10 = _mm_loadu_si128((__m128i*)(fref + (BASE + 0) * frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref + (BASE + 3) * frefstride)); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- sum0 = _mm_add_epi16(sum0, T20); \
- sum0 = _mm_add_epi16(sum0, T21); \
- sum0 = _mm_add_epi16(sum0, T22); \
- sum0 = _mm_add_epi16(sum0, T23)
-
- PROCESS_16x4(0);
- if (ly >= 8)
- {
- PROCESS_16x4(4);
- }
- if (ly >= 12)
- {
- PROCESS_16x4(8);
- }
- if (ly >= 16)
- {
- PROCESS_16x4(12);
- }
- if (ly > 16)
- {
- for (int i = 16; i < ly; i += 8)
- {
- PROCESS_16x4(i);
- PROCESS_16x4(i + 4);
- }
- }
-
- sum1 = _mm_shuffle_epi32(sum0, 2);
- sum0 = _mm_add_epi32(sum0, sum1);
-
- return _mm_cvtsi128_si32(sum0);
-}
-
-template<int ly>
-// always instanced for 32 rows
-int sad_24(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride)
-{
- __m128i sum0 = _mm_setzero_si128();
- __m128i sum1 = _mm_setzero_si128();
- __m128i T00, T01, T02, T03;
- __m128i T10, T11, T12, T13;
- __m128i T20, T21, T22, T23;
-
-#define PROCESS_24x4(BASE) \
- T00 = _mm_load_si128((__m128i*)(fenc + (BASE + 0) * fencstride)); \
- T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * fencstride)); \
- T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * fencstride)); \
- T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * fencstride)); \
- T10 = _mm_loadu_si128((__m128i*)(fref + (BASE + 0) * frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref + (BASE + 3) * frefstride)); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- sum0 = _mm_add_epi32(sum0, T20); \
- sum0 = _mm_add_epi32(sum0, T21); \
- sum0 = _mm_add_epi32(sum0, T22); \
- sum0 = _mm_add_epi32(sum0, T23); \
- T00 = _mm_loadl_epi64((__m128i*)(fenc + 16 + ((BASE + 0) * fencstride))); \
- T01 = _mm_loadl_epi64((__m128i*)(fenc + 16 + ((BASE + 1) * fencstride))); \
- T01 = _mm_unpacklo_epi64(T00, T01); \
- T02 = _mm_loadl_epi64((__m128i*)(fenc + 16 + ((BASE + 2) * fencstride))); \
- T03 = _mm_loadl_epi64((__m128i*)(fenc + 16 + ((BASE + 3) * fencstride))); \
- T03 = _mm_unpacklo_epi64(T02, T03); \
- T10 = _mm_loadl_epi64((__m128i*)(fref + 16 + ((BASE + 0) * frefstride))); \
- T11 = _mm_loadl_epi64((__m128i*)(fref + 16 + ((BASE + 1) * frefstride))); \
- T11 = _mm_unpacklo_epi64(T10, T11); \
- T12 = _mm_loadl_epi64((__m128i*)(fref + 16 + ((BASE + 2) * frefstride))); \
- T13 = _mm_loadl_epi64((__m128i*)(fref + 16 + ((BASE + 3) * frefstride))); \
- T13 = _mm_unpacklo_epi64(T12, T13); \
- T20 = _mm_setzero_si128(); \
- T21 = _mm_setzero_si128(); \
- T20 = _mm_sad_epu8(T01, T11); \
- T21 = _mm_sad_epu8(T03, T13); \
- sum0 = _mm_add_epi32(sum0, T20); \
- sum0 = _mm_add_epi32(sum0, T21);
-
- for (int i = 0; i < ly; i += 8)
- {
- PROCESS_24x4(i);
- PROCESS_24x4(i + 4);
- }
-
- sum1 = _mm_shuffle_epi32(sum0, 2);
- sum0 = _mm_add_epi32(sum0, sum1);
-
- return _mm_cvtsi128_si32(sum0);
-}
-
-template<int ly>
-// ly will be 8, 16, 24, or 32
-int sad_32(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
-{
- __m128i sum0 = _mm_setzero_si128();
- __m128i sum1 = _mm_setzero_si128();
- __m128i T00, T01, T02, T03;
- __m128i T10, T11, T12, T13;
- __m128i T20, T21, T22, T23;
-
-#define PROCESS_32x4(BASE) \
- T00 = _mm_load_si128((__m128i*)(fenc + (BASE + 0) * fencstride)); \
- T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * fencstride)); \
- T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * fencstride)); \
- T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * fencstride)); \
- T10 = _mm_loadu_si128((__m128i*)(fref + (BASE + 0) * frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref + (BASE + 3) * frefstride)); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- sum0 = _mm_add_epi32(sum0, T20); \
- sum0 = _mm_add_epi32(sum0, T21); \
- sum0 = _mm_add_epi32(sum0, T22); \
- sum0 = _mm_add_epi32(sum0, T23); \
- T00 = _mm_load_si128((__m128i*)(fenc + 16 + (BASE + 0) * fencstride)); \
- T01 = _mm_load_si128((__m128i*)(fenc + 16 + (BASE + 1) * fencstride)); \
- T02 = _mm_load_si128((__m128i*)(fenc + 16 + (BASE + 2) * fencstride)); \
- T03 = _mm_load_si128((__m128i*)(fenc + 16 + (BASE + 3) * fencstride)); \
- T10 = _mm_loadu_si128((__m128i*)(fref + 16 + (BASE + 0) * frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref + 16 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref + 16 + (BASE + 3) * frefstride)); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- sum0 = _mm_add_epi32(sum0, T20); \
- sum0 = _mm_add_epi32(sum0, T21); \
- sum0 = _mm_add_epi32(sum0, T22); \
- sum0 = _mm_add_epi32(sum0, T23);
-
- for (int i = 0; i < ly; i += 8)
- {
- PROCESS_32x4(i);
- PROCESS_32x4(i + 4);
- }
-
- sum1 = _mm_shuffle_epi32(sum0, 2);
- sum0 = _mm_add_epi32(sum0, sum1);
-
- return _mm_cvtsi128_si32(sum0);
-}
-
-template<int ly>
-int sad_48(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
-{
- __m128i sum0 = _mm_setzero_si128();
- __m128i sum1 = _mm_setzero_si128();
-
- /* for ly = 64 */
- for (int i = 0; i < ly; i += 8)
- {
- __m128i T00, T01, T02;
- __m128i T10, T11, T12;
- __m128i T20, T21, T22;
-
- T00 = _mm_load_si128((__m128i*)(fenc + (i + 0) * fencstride));
- T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 0) * fencstride));
- T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 0) * fencstride));
-
- T10 = _mm_loadu_si128((__m128i*)(fref + (i + 0) * frefstride));
- T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 0) * frefstride));
- T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 0) * frefstride));
-
- T20 = _mm_sad_epu8(T00, T10);
- T21 = _mm_sad_epu8(T01, T11);
- T22 = _mm_sad_epu8(T02, T12);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
- sum0 = _mm_add_epi32(sum0, T22);
-
- T00 = _mm_load_si128((__m128i*)(fenc + (i + 1) * fencstride));
- T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 1) * fencstride));
- T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 1) * fencstride));
-
- T10 = _mm_loadu_si128((__m128i*)(fref + (i + 1) * frefstride));
- T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 1) * frefstride));
- T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 1) * frefstride));
-
- T20 = _mm_sad_epu8(T00, T10);
- T21 = _mm_sad_epu8(T01, T11);
- T22 = _mm_sad_epu8(T02, T12);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
- sum0 = _mm_add_epi32(sum0, T22);
-
- T00 = _mm_load_si128((__m128i*)(fenc + (i + 2) * fencstride));
- T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 2) * fencstride));
- T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 2) * fencstride));
-
- T10 = _mm_loadu_si128((__m128i*)(fref + (i + 2) * frefstride));
- T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 2) * frefstride));
- T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 2) * frefstride));
-
- T20 = _mm_sad_epu8(T00, T10);
- T21 = _mm_sad_epu8(T01, T11);
- T22 = _mm_sad_epu8(T02, T12);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
- sum0 = _mm_add_epi32(sum0, T22);
-
- T00 = _mm_load_si128((__m128i*)(fenc + (i + 3) * fencstride));
- T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 3) * fencstride));
- T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 3) * fencstride));
-
- T10 = _mm_loadu_si128((__m128i*)(fref + (i + 3) * frefstride));
- T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 3) * frefstride));
- T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 3) * frefstride));
-
- T20 = _mm_sad_epu8(T00, T10);
- T21 = _mm_sad_epu8(T01, T11);
- T22 = _mm_sad_epu8(T02, T12);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
- sum0 = _mm_add_epi32(sum0, T22);
-
- T00 = _mm_load_si128((__m128i*)(fenc + (i + 4) * fencstride));
- T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 4) * fencstride));
- T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 4) * fencstride));
-
- T10 = _mm_loadu_si128((__m128i*)(fref + (i + 4) * frefstride));
- T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 4) * frefstride));
- T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 4) * frefstride));
-
- T20 = _mm_sad_epu8(T00, T10);
- T21 = _mm_sad_epu8(T01, T11);
- T22 = _mm_sad_epu8(T02, T12);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
- sum0 = _mm_add_epi32(sum0, T22);
-
- T00 = _mm_load_si128((__m128i*)(fenc + (i + 5) * fencstride));
- T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 5) * fencstride));
- T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 5) * fencstride));
-
- T10 = _mm_loadu_si128((__m128i*)(fref + (i + 5) * frefstride));
- T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 5) * frefstride));
- T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 5) * frefstride));
-
- T20 = _mm_sad_epu8(T00, T10);
- T21 = _mm_sad_epu8(T01, T11);
- T22 = _mm_sad_epu8(T02, T12);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
- sum0 = _mm_add_epi32(sum0, T22);
-
- T00 = _mm_load_si128((__m128i*)(fenc + (i + 6) * fencstride));
- T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 6) * fencstride));
- T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 6) * fencstride));
-
- T10 = _mm_loadu_si128((__m128i*)(fref + (i + 6) * frefstride));
- T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 6) * frefstride));
- T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 6) * frefstride));
-
- T20 = _mm_sad_epu8(T00, T10);
- T21 = _mm_sad_epu8(T01, T11);
- T22 = _mm_sad_epu8(T02, T12);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
- sum0 = _mm_add_epi32(sum0, T22);
-
- T00 = _mm_load_si128((__m128i*)(fenc + (i + 7) * fencstride));
- T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 7) * fencstride));
- T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 7) * fencstride));
-
- T10 = _mm_loadu_si128((__m128i*)(fref + (i + 7) * frefstride));
- T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 7) * frefstride));
- T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 7) * frefstride));
-
- T20 = _mm_sad_epu8(T00, T10);
- T21 = _mm_sad_epu8(T01, T11);
- T22 = _mm_sad_epu8(T02, T12);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
- sum0 = _mm_add_epi32(sum0, T22);
- }
-
- sum1 = _mm_shuffle_epi32(sum0, 2);
- sum0 = _mm_add_epi32(sum0, sum1);
- return _mm_cvtsi128_si32(sum0);
-}
-
-template<int ly>
-// ly will be 16, 32, 48, or 64
-int sad_64(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
-{
- __m128i sum0 = _mm_setzero_si128();
- __m128i sum1 = _mm_setzero_si128();
-
- for (int i = 0; i < ly; i += 8)
- {
- __m128i T00, T01, T02, T03;
- __m128i T10, T11, T12, T13;
- __m128i T20, T21, T22, T23;
-
- T00 = _mm_load_si128((__m128i*)(fenc + (i + 0) * fencstride));
- T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 0) * fencstride));
- T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 0) * fencstride));
- T03 = _mm_load_si128((__m128i*)(fenc + 48 + (i + 0) * fencstride));
-
- T10 = _mm_loadu_si128((__m128i*)(fref + (i + 0) * frefstride));
- T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 0) * frefstride));
- T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 0) * frefstride));
- T13 = _mm_loadu_si128((__m128i*)(fref + 48 + (i + 0) * frefstride));
-
- T20 = _mm_sad_epu8(T00, T10);
- T21 = _mm_sad_epu8(T01, T11);
- T22 = _mm_sad_epu8(T02, T12);
- T23 = _mm_sad_epu8(T03, T13);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
- sum0 = _mm_add_epi32(sum0, T22);
- sum0 = _mm_add_epi32(sum0, T23);
-
- T00 = _mm_load_si128((__m128i*)(fenc + (i + 1) * fencstride));
- T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 1) * fencstride));
- T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 1) * fencstride));
- T03 = _mm_load_si128((__m128i*)(fenc + 48 + (i + 1) * fencstride));
-
- T10 = _mm_loadu_si128((__m128i*)(fref + (i + 1) * frefstride));
- T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 1) * frefstride));
- T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 1) * frefstride));
- T13 = _mm_loadu_si128((__m128i*)(fref + 48 + (i + 1) * frefstride));
-
- T20 = _mm_sad_epu8(T00, T10);
- T21 = _mm_sad_epu8(T01, T11);
- T22 = _mm_sad_epu8(T02, T12);
- T23 = _mm_sad_epu8(T03, T13);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
- sum0 = _mm_add_epi32(sum0, T22);
- sum0 = _mm_add_epi32(sum0, T23);
-
- T00 = _mm_load_si128((__m128i*)(fenc + (i + 2) * fencstride));
- T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 2) * fencstride));
- T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 2) * fencstride));
- T03 = _mm_load_si128((__m128i*)(fenc + 48 + (i + 2) * fencstride));
-
- T10 = _mm_loadu_si128((__m128i*)(fref + (i + 2) * frefstride));
- T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 2) * frefstride));
- T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 2) * frefstride));
- T13 = _mm_loadu_si128((__m128i*)(fref + 48 + (i + 2) * frefstride));
-
- T20 = _mm_sad_epu8(T00, T10);
- T21 = _mm_sad_epu8(T01, T11);
- T22 = _mm_sad_epu8(T02, T12);
- T23 = _mm_sad_epu8(T03, T13);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
- sum0 = _mm_add_epi32(sum0, T22);
- sum0 = _mm_add_epi32(sum0, T23);
-
- T00 = _mm_load_si128((__m128i*)(fenc + (i + 3) * fencstride));
- T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 3) * fencstride));
- T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 3) * fencstride));
- T03 = _mm_load_si128((__m128i*)(fenc + 48 + (i + 3) * fencstride));
-
- T10 = _mm_loadu_si128((__m128i*)(fref + (i + 3) * frefstride));
- T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 3) * frefstride));
- T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 3) * frefstride));
- T13 = _mm_loadu_si128((__m128i*)(fref + 48 + (i + 3) * frefstride));
-
- T20 = _mm_sad_epu8(T00, T10);
- T21 = _mm_sad_epu8(T01, T11);
- T22 = _mm_sad_epu8(T02, T12);
- T23 = _mm_sad_epu8(T03, T13);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
- sum0 = _mm_add_epi32(sum0, T22);
- sum0 = _mm_add_epi32(sum0, T23);
-
- T00 = _mm_load_si128((__m128i*)(fenc + (i + 4) * fencstride));
- T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 4) * fencstride));
- T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 4) * fencstride));
- T03 = _mm_load_si128((__m128i*)(fenc + 48 + (i + 4) * fencstride));
-
- T10 = _mm_loadu_si128((__m128i*)(fref + (i + 4) * frefstride));
- T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 4) * frefstride));
- T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 4) * frefstride));
- T13 = _mm_loadu_si128((__m128i*)(fref + 48 + (i + 4) * frefstride));
-
- T20 = _mm_sad_epu8(T00, T10);
- T21 = _mm_sad_epu8(T01, T11);
- T22 = _mm_sad_epu8(T02, T12);
- T23 = _mm_sad_epu8(T03, T13);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
- sum0 = _mm_add_epi32(sum0, T22);
- sum0 = _mm_add_epi32(sum0, T23);
-
- T00 = _mm_load_si128((__m128i*)(fenc + (i + 5) * fencstride));
- T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 5) * fencstride));
- T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 5) * fencstride));
- T03 = _mm_load_si128((__m128i*)(fenc + 48 + (i + 5) * fencstride));
-
- T10 = _mm_loadu_si128((__m128i*)(fref + (i + 5) * frefstride));
- T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 5) * frefstride));
- T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 5) * frefstride));
- T13 = _mm_loadu_si128((__m128i*)(fref + 48 + (i + 5) * frefstride));
-
- T20 = _mm_sad_epu8(T00, T10);
- T21 = _mm_sad_epu8(T01, T11);
- T22 = _mm_sad_epu8(T02, T12);
- T23 = _mm_sad_epu8(T03, T13);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
- sum0 = _mm_add_epi32(sum0, T22);
- sum0 = _mm_add_epi32(sum0, T23);
-
- T00 = _mm_load_si128((__m128i*)(fenc + (i + 6) * fencstride));
- T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 6) * fencstride));
- T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 6) * fencstride));
- T03 = _mm_load_si128((__m128i*)(fenc + 48 + (i + 6) * fencstride));
-
- T10 = _mm_loadu_si128((__m128i*)(fref + (i + 6) * frefstride));
- T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 6) * frefstride));
- T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 6) * frefstride));
- T13 = _mm_loadu_si128((__m128i*)(fref + 48 + (i + 6) * frefstride));
-
- T20 = _mm_sad_epu8(T00, T10);
- T21 = _mm_sad_epu8(T01, T11);
- T22 = _mm_sad_epu8(T02, T12);
- T23 = _mm_sad_epu8(T03, T13);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
- sum0 = _mm_add_epi32(sum0, T22);
- sum0 = _mm_add_epi32(sum0, T23);
-
- T00 = _mm_load_si128((__m128i*)(fenc + (i + 7) * fencstride));
- T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 7) * fencstride));
- T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 7) * fencstride));
- T03 = _mm_load_si128((__m128i*)(fenc + 48 + (i + 7) * fencstride));
-
- T10 = _mm_loadu_si128((__m128i*)(fref + (i + 7) * frefstride));
- T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 7) * frefstride));
- T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 7) * frefstride));
- T13 = _mm_loadu_si128((__m128i*)(fref + 48 + (i + 7) * frefstride));
-
- T20 = _mm_sad_epu8(T00, T10);
- T21 = _mm_sad_epu8(T01, T11);
- T22 = _mm_sad_epu8(T02, T12);
- T23 = _mm_sad_epu8(T03, T13);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
- sum0 = _mm_add_epi32(sum0, T22);
- sum0 = _mm_add_epi32(sum0, T23);
- }
-
- sum1 = _mm_shuffle_epi32(sum0, 2);
- sum0 = _mm_add_epi32(sum0, sum1);
- return _mm_cvtsi128_si32(sum0);
-}
-
-#if HAVE_MMX
-void sad_x3_4x16(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res)
-{
- __m128i sum0, sum1, sum2;
-
- __m128i T00, T01, T02, T03;
- __m128i T10, T11, T12, T13;
- __m128i R00, R01, R02, R03;
- __m128i T20;
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + FENC_STRIDE));
- T01 = _mm_unpacklo_epi32(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (2) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (3) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi32(T02, T03);
- R00 = _mm_unpacklo_epi64(T01, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (3) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R01 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (3) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R02 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (3) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R03 = _mm_unpacklo_epi64(T11, T13);
-
- T20 = _mm_sad_epu8(R00, R01);
- sum0 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
- T20 = _mm_sad_epu8(R00, R02);
- sum1 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
- T20 = _mm_sad_epu8(R00, R03);
- sum2 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (4) * FENC_STRIDE));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (5) * FENC_STRIDE));
- T01 = _mm_unpacklo_epi32(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (6) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (7) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi32(T02, T03);
- R00 = _mm_unpacklo_epi64(T01, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1 + (4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + (5) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (7) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R01 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2 + (4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + (5) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (7) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R02 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3 + (4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + (5) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (7) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R03 = _mm_unpacklo_epi64(T11, T13);
-
- T20 = _mm_sad_epu8(R00, R01);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum0 = _mm_add_epi32(sum0, T20);
-
- T20 = _mm_sad_epu8(R00, R02);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum1 = _mm_add_epi32(sum1, T20);
-
- T20 = _mm_sad_epu8(R00, R03);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum2 = _mm_add_epi32(sum2, T20);
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (8) * FENC_STRIDE));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (9) * FENC_STRIDE));
- T01 = _mm_unpacklo_epi32(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (10) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (11) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi32(T02, T03);
- R00 = _mm_unpacklo_epi64(T01, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1 + (8) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + (9) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (10) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (11) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R01 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2 + (8) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + (9) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (10) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (11) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R02 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3 + (8) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + (9) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (10) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (11) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R03 = _mm_unpacklo_epi64(T11, T13);
-
- T20 = _mm_sad_epu8(R00, R01);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum0 = _mm_add_epi32(sum0, T20);
-
- T20 = _mm_sad_epu8(R00, R02);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum1 = _mm_add_epi32(sum1, T20);
-
- T20 = _mm_sad_epu8(R00, R03);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum2 = _mm_add_epi32(sum2, T20);
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (12) * FENC_STRIDE));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (13) * FENC_STRIDE));
- T01 = _mm_unpacklo_epi32(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (14) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (15) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi32(T02, T03);
- R00 = _mm_unpacklo_epi64(T01, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1 + (12) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + (13) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (14) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (15) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R01 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2 + (12) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + (13) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (14) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (15) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R02 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3 + (12) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + (13) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (14) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (15) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R03 = _mm_unpacklo_epi64(T11, T13);
-
- T20 = _mm_sad_epu8(R00, R01);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum0 = _mm_add_epi32(sum0, T20);
-
- T20 = _mm_sad_epu8(R00, R02);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum1 = _mm_add_epi32(sum1, T20);
-
- T20 = _mm_sad_epu8(R00, R03);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum2 = _mm_add_epi32(sum2, T20);
-
- res[0] = _mm_cvtsi128_si32(sum0);
- res[1] = _mm_cvtsi128_si32(sum1);
- res[2] = _mm_cvtsi128_si32(sum2);
-}
-
-#else /* if HAVE_MMX */
-
-void sad_x3_4x16(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res)
-{
- __m128i sum0 = _mm_setzero_si128();
- __m128i sum1 = _mm_setzero_si128();
- __m128i sum2 = _mm_setzero_si128();
-
- __m128i T00, T01, T02, T03;
- __m128i T10, T11, T12, T13;
- __m128i R00, R01, R02, R03;
- __m128i T20;
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (0) * FENC_STRIDE));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (1) * FENC_STRIDE));
- T01 = _mm_unpacklo_epi32(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (2) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (3) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi32(T02, T03);
- R00 = _mm_unpacklo_epi64(T01, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1 + (0) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + (1) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (3) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R01 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2 + (0) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + (1) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (3) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R02 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3 + (0) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + (1) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (3) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R03 = _mm_unpacklo_epi64(T11, T13);
-
- T20 = _mm_sad_epu8(R00, R01);
- sum0 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
- T20 = _mm_sad_epu8(R00, R02);
- sum1 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
- T20 = _mm_sad_epu8(R00, R03);
- sum2 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (4) * FENC_STRIDE));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (5) * FENC_STRIDE));
- T01 = _mm_unpacklo_epi32(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (6) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (7) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi32(T02, T03);
- R00 = _mm_unpacklo_epi64(T01, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1 + (4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + (5) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (7) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R01 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2 + (4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + (5) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (7) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R02 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3 + (4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + (5) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (7) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R03 = _mm_unpacklo_epi64(T11, T13);
-
- T20 = _mm_sad_epu8(R00, R01);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum0 = _mm_add_epi32(sum0, T20);
-
- T20 = _mm_sad_epu8(R00, R02);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum1 = _mm_add_epi32(sum1, T20);
-
- T20 = _mm_sad_epu8(R00, R03);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum2 = _mm_add_epi32(sum2, T20);
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (8) * FENC_STRIDE));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (9) * FENC_STRIDE));
- T01 = _mm_unpacklo_epi32(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (10) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (11) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi32(T02, T03);
- R00 = _mm_unpacklo_epi64(T01, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1 + (8) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + (9) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (10) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (11) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R01 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2 + (8) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + (9) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (10) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (11) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R02 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3 + (8) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + (9) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (10) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (11) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R03 = _mm_unpacklo_epi64(T11, T13);
-
- T20 = _mm_sad_epu8(R00, R01);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum0 = _mm_add_epi32(sum0, T20);
-
- T20 = _mm_sad_epu8(R00, R02);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum1 = _mm_add_epi32(sum1, T20);
-
- T20 = _mm_sad_epu8(R00, R03);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum2 = _mm_add_epi32(sum2, T20);
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (12) * FENC_STRIDE));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (13) * FENC_STRIDE));
- T01 = _mm_unpacklo_epi32(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (14) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (15) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi32(T02, T03);
- R00 = _mm_unpacklo_epi64(T01, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1 + (12) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + (13) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (14) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (15) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R01 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2 + (12) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + (13) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (14) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (15) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R02 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3 + (12) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + (13) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (14) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (15) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R03 = _mm_unpacklo_epi64(T11, T13);
-
- T20 = _mm_sad_epu8(R00, R01);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum0 = _mm_add_epi32(sum0, T20);
-
- T20 = _mm_sad_epu8(R00, R02);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum1 = _mm_add_epi32(sum1, T20);
-
- T20 = _mm_sad_epu8(R00, R03);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum2 = _mm_add_epi32(sum2, T20);
-
- res[0] = _mm_cvtsi128_si32(sum0);
- res[1] = _mm_cvtsi128_si32(sum1);
- res[2] = _mm_cvtsi128_si32(sum2);
-}
-
-#endif /* if HAVE_MMX */
-
-#if HAVE_MMX
-template<int ly>
-// ly will always be 32
-void sad_x3_8(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res)
-{
- __m64 sum0 = _mm_setzero_si64();
- __m64 sum1 = _mm_setzero_si64();
- __m64 sum2 = _mm_setzero_si64();
-
- __m64 T00, T01, T02, T03, T04, T05, T06, T07;
- __m64 T10, T11, T12, T13, T14, T15, T16, T17;
- __m64 T20, T21, T22, T23, T24, T25, T26, T27;
-
- for (int i = 0; i < ly; i += 8)
- {
- T00 = (*(__m64*)(fenc + (i + 0) * FENC_STRIDE));
- T01 = (*(__m64*)(fenc + (i + 1) * FENC_STRIDE));
- T02 = (*(__m64*)(fenc + (i + 2) * FENC_STRIDE));
- T03 = (*(__m64*)(fenc + (i + 3) * FENC_STRIDE));
- T04 = (*(__m64*)(fenc + (i + 4) * FENC_STRIDE));
- T05 = (*(__m64*)(fenc + (i + 5) * FENC_STRIDE));
- T06 = (*(__m64*)(fenc + (i + 6) * FENC_STRIDE));
- T07 = (*(__m64*)(fenc + (i + 7) * FENC_STRIDE));
-
- T10 = (*(__m64*)(fref1 + (i + 0) * frefstride));
- T11 = (*(__m64*)(fref1 + (i + 1) * frefstride));
- T12 = (*(__m64*)(fref1 + (i + 2) * frefstride));
- T13 = (*(__m64*)(fref1 + (i + 3) * frefstride));
- T14 = (*(__m64*)(fref1 + (i + 4) * frefstride));
- T15 = (*(__m64*)(fref1 + (i + 5) * frefstride));
- T16 = (*(__m64*)(fref1 + (i + 6) * frefstride));
- T17 = (*(__m64*)(fref1 + (i + 7) * frefstride));
-
- T20 = _mm_sad_pu8(T00, T10);
- T21 = _mm_sad_pu8(T01, T11);
- T22 = _mm_sad_pu8(T02, T12);
- T23 = _mm_sad_pu8(T03, T13);
- T24 = _mm_sad_pu8(T04, T14);
- T25 = _mm_sad_pu8(T05, T15);
- T26 = _mm_sad_pu8(T06, T16);
- T27 = _mm_sad_pu8(T07, T17);
-
- sum0 = _mm_add_pi16(sum0, T20);
- sum0 = _mm_add_pi16(sum0, T21);
- sum0 = _mm_add_pi16(sum0, T22);
- sum0 = _mm_add_pi16(sum0, T23);
- sum0 = _mm_add_pi16(sum0, T24);
- sum0 = _mm_add_pi16(sum0, T25);
- sum0 = _mm_add_pi16(sum0, T26);
- sum0 = _mm_add_pi16(sum0, T27);
-
- T10 = (*(__m64*)(fref2 + (i + 0) * frefstride));
- T11 = (*(__m64*)(fref2 + (i + 1) * frefstride));
- T12 = (*(__m64*)(fref2 + (i + 2) * frefstride));
- T13 = (*(__m64*)(fref2 + (i + 3) * frefstride));
- T14 = (*(__m64*)(fref2 + (i + 4) * frefstride));
- T15 = (*(__m64*)(fref2 + (i + 5) * frefstride));
- T16 = (*(__m64*)(fref2 + (i + 6) * frefstride));
- T17 = (*(__m64*)(fref2 + (i + 7) * frefstride));
-
- T20 = _mm_sad_pu8(T00, T10);
- T21 = _mm_sad_pu8(T01, T11);
- T22 = _mm_sad_pu8(T02, T12);
- T23 = _mm_sad_pu8(T03, T13);
- T24 = _mm_sad_pu8(T04, T14);
- T25 = _mm_sad_pu8(T05, T15);
- T26 = _mm_sad_pu8(T06, T16);
- T27 = _mm_sad_pu8(T07, T17);
-
- sum1 = _mm_add_pi16(sum1, T20);
- sum1 = _mm_add_pi16(sum1, T21);
- sum1 = _mm_add_pi16(sum1, T22);
- sum1 = _mm_add_pi16(sum1, T23);
- sum1 = _mm_add_pi16(sum1, T24);
- sum1 = _mm_add_pi16(sum1, T25);
- sum1 = _mm_add_pi16(sum1, T26);
- sum1 = _mm_add_pi16(sum1, T27);
-
- T10 = (*(__m64*)(fref3 + (i + 0) * frefstride));
- T11 = (*(__m64*)(fref3 + (i + 1) * frefstride));
- T12 = (*(__m64*)(fref3 + (i + 2) * frefstride));
- T13 = (*(__m64*)(fref3 + (i + 3) * frefstride));
- T14 = (*(__m64*)(fref3 + (i + 4) * frefstride));
- T15 = (*(__m64*)(fref3 + (i + 5) * frefstride));
- T16 = (*(__m64*)(fref3 + (i + 6) * frefstride));
- T17 = (*(__m64*)(fref3 + (i + 7) * frefstride));
-
- T20 = _mm_sad_pu8(T00, T10);
- T21 = _mm_sad_pu8(T01, T11);
- T22 = _mm_sad_pu8(T02, T12);
- T23 = _mm_sad_pu8(T03, T13);
- T24 = _mm_sad_pu8(T04, T14);
- T25 = _mm_sad_pu8(T05, T15);
- T26 = _mm_sad_pu8(T06, T16);
- T27 = _mm_sad_pu8(T07, T17);
-
- sum2 = _mm_add_pi16(sum2, T20);
- sum2 = _mm_add_pi16(sum2, T21);
- sum2 = _mm_add_pi16(sum2, T22);
- sum2 = _mm_add_pi16(sum2, T23);
- sum2 = _mm_add_pi16(sum2, T24);
- sum2 = _mm_add_pi16(sum2, T25);
- sum2 = _mm_add_pi16(sum2, T26);
- sum2 = _mm_add_pi16(sum2, T27);
- }
-
- res[0] = _m_to_int(sum0);
- res[1] = _m_to_int(sum1);
- res[2] = _m_to_int(sum2);
-}
-
-#else /* if HAVE_MMX */
-
-template<int ly>
-// ly will always be 32
-void sad_x3_8(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res)
-{
- __m128i T00, T01, T02, T03;
- __m128i T10, T11, T12, T13;
- __m128i T20, T21;
- __m128i sum0 = _mm_setzero_si128();
-
- res[0] = res[1] = res[2] = 0;
- for (int i = 0; i < ly; i += 8)
- {
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (i + 0) * FENC_STRIDE));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (i + 1) * FENC_STRIDE));
- T01 = _mm_unpacklo_epi64(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (i + 2) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (i + 3) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi64(T02, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 0) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 1) * frefstride));
- T11 = _mm_unpacklo_epi64(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 3) * frefstride));
- T13 = _mm_unpacklo_epi64(T12, T13);
-
- T20 = _mm_sad_epu8(T01, T11);
- T21 = _mm_sad_epu8(T03, T13);
- T21 = _mm_add_epi32(T20, T21);
- sum0 = _mm_shuffle_epi32(T21, 2);
- sum0 = _mm_add_epi32(sum0, T21);
- res[0] = res[0] + _mm_cvtsi128_si32(sum0);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 0) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 1) * frefstride));
- T11 = _mm_unpacklo_epi64(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 3) * frefstride));
- T13 = _mm_unpacklo_epi64(T12, T13);
-
- T20 = _mm_sad_epu8(T01, T11);
- T21 = _mm_sad_epu8(T03, T13);
- T21 = _mm_add_epi32(T20, T21);
- sum0 = _mm_shuffle_epi32(T21, 2);
- sum0 = _mm_add_epi32(sum0, T21);
- res[1] = res[1] + _mm_cvtsi128_si32(sum0);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 0) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 1) * frefstride));
- T11 = _mm_unpacklo_epi64(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 3) * frefstride));
- T13 = _mm_unpacklo_epi64(T12, T13);
-
- T20 = _mm_sad_epu8(T01, T11);
- T21 = _mm_sad_epu8(T03, T13);
- T21 = _mm_add_epi32(T20, T21);
- sum0 = _mm_shuffle_epi32(T21, 2);
- sum0 = _mm_add_epi32(sum0, T21);
- res[2] = res[2] + _mm_cvtsi128_si32(sum0);
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (i + 4) * FENC_STRIDE));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (i + 5) * FENC_STRIDE));
- T01 = _mm_unpacklo_epi64(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (i + 6) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (i + 7) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi64(T02, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 5) * frefstride));
- T11 = _mm_unpacklo_epi64(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 7) * frefstride));
- T13 = _mm_unpacklo_epi64(T12, T13);
-
- T20 = _mm_sad_epu8(T01, T11);
- T21 = _mm_sad_epu8(T03, T13);
- T21 = _mm_add_epi32(T20, T21);
- sum0 = _mm_shuffle_epi32(T21, 2);
- sum0 = _mm_add_epi32(sum0, T21);
- res[0] = res[0] + _mm_cvtsi128_si32(sum0);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 5) * frefstride));
- T11 = _mm_unpacklo_epi64(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 7) * frefstride));
- T13 = _mm_unpacklo_epi64(T12, T13);
-
- T20 = _mm_sad_epu8(T01, T11);
- T21 = _mm_sad_epu8(T03, T13);
- T21 = _mm_add_epi32(T20, T21);
- sum0 = _mm_shuffle_epi32(T21, 2);
- sum0 = _mm_add_epi32(sum0, T21);
- res[1] = res[1] + _mm_cvtsi128_si32(sum0);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 5) * frefstride));
- T11 = _mm_unpacklo_epi64(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 7) * frefstride));
- T13 = _mm_unpacklo_epi64(T12, T13);
-
- T20 = _mm_sad_epu8(T01, T11);
- T21 = _mm_sad_epu8(T03, T13);
- T21 = _mm_add_epi32(T20, T21);
- sum0 = _mm_shuffle_epi32(T21, 2);
- sum0 = _mm_add_epi32(sum0, T21);
- res[2] = res[2] + _mm_cvtsi128_si32(sum0);
- }
-}
-
-#endif /* if HAVE_MMX */
-
-/* For performance - This function assumes that the *last load* can access 16 elements. */
-
template<int ly>
void sad_x3_12(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res)
{
@@ -1445,295 +118,6 @@
}
template<int ly>
-void sad_x3_16(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res)
-{
-#define PROCESS_16x4x3(BASE) \
- T00 = _mm_load_si128((__m128i*)(fenc + (BASE + 0) * FENC_STRIDE)); \
- T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE)); \
- T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE)); \
- T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE)); \
- T10 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 0) * frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 3) * frefstride)); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res0 += _mm_cvtsi128_si32(sum0); \
- T10 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 0) * frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 3) * frefstride)); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res1 += _mm_cvtsi128_si32(sum0); \
- T10 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 0) * frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 3) * frefstride)); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res2 += _mm_cvtsi128_si32(sum0); \
-
- __m128i T00, T01, T02, T03;
- __m128i T10, T11, T12, T13;
- __m128i T20, T21, T22, T23;
- __m128i sum0, sum1;
- int res0 = 0, res1 = 0, res2 = 0;
-
- // ly == 4, 12, 32, 64
- PROCESS_16x4x3(0);
- if (ly >= 8)
- {
- PROCESS_16x4x3(4);
- }
- if (ly >= 12)
- {
- PROCESS_16x4x3(8);
- }
- if (ly > 12)
- {
- PROCESS_16x4x3(12);
- for (int i = 16; i < ly; i += 16)
- {
- PROCESS_16x4x3(i);
- PROCESS_16x4x3(i + 4);
- PROCESS_16x4x3(i + 8);
- PROCESS_16x4x3(i + 12);
- }
- }
- res[0] = res0;
- res[1] = res1;
- res[2] = res2;
-}
-
-template<int ly>
-void sad_x3_24(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res)
-{
- res[0] = res[1] = res[2] = 0;
- __m128i T00, T01, T02, T03, T04, T05;
- __m128i T10, T11, T12, T13, T14, T15;
- __m128i T20, T21, T22, T23;
- __m128i T30, T31;
- __m128i sum0, sum1;
-
-#define PROCESS_24x4x3(BASE) \
- T00 = _mm_load_si128((__m128i*)(fenc + (BASE)*FENC_STRIDE)); \
- T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE)); \
- T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE)); \
- T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE)); \
- T10 = _mm_loadl_epi64((__m128i*)(fenc + (BASE)*FENC_STRIDE + 16)); \
- T11 = _mm_loadl_epi64((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE + 16)); \
- T04 = _mm_unpacklo_epi64(T10, T11); \
- T12 = _mm_loadl_epi64((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE + 16)); \
- T13 = _mm_loadl_epi64((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE + 16)); \
- T05 = _mm_unpacklo_epi64(T12, T13); \
- T10 = _mm_loadu_si128((__m128i*)(fref1 + (BASE)*frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 3) * frefstride)); \
- T20 = _mm_loadl_epi64((__m128i*)(fref1 + (BASE)*frefstride + 16)); \
- T21 = _mm_loadl_epi64((__m128i*)(fref1 + (BASE + 1) * frefstride + 16)); \
- T14 = _mm_unpacklo_epi64(T20, T21); \
- T22 = _mm_loadl_epi64((__m128i*)(fref1 + (BASE + 2) * frefstride + 16)); \
- T23 = _mm_loadl_epi64((__m128i*)(fref1 + (BASE + 3) * frefstride + 16)); \
- T15 = _mm_unpacklo_epi64(T22, T23); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T30 = _mm_sad_epu8(T04, T14); \
- T31 = _mm_sad_epu8(T05, T15); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum1 = _mm_add_epi16(T30, T31); \
- sum0 = _mm_add_epi16(sum0, sum1); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res[0] += _mm_cvtsi128_si32(sum0); \
- T10 = _mm_loadu_si128((__m128i*)(fref2 + (BASE)*frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 3) * frefstride)); \
- T20 = _mm_loadl_epi64((__m128i*)(fref2 + (BASE)*frefstride + 16)); \
- T21 = _mm_loadl_epi64((__m128i*)(fref2 + (BASE + 1) * frefstride + 16)); \
- T14 = _mm_unpacklo_epi64(T20, T21); \
- T22 = _mm_loadl_epi64((__m128i*)(fref2 + (BASE + 2) * frefstride + 16)); \
- T23 = _mm_loadl_epi64((__m128i*)(fref2 + (BASE + 3) * frefstride + 16)); \
- T15 = _mm_unpacklo_epi64(T22, T23); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T30 = _mm_sad_epu8(T04, T14); \
- T31 = _mm_sad_epu8(T05, T15); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum1 = _mm_add_epi16(T30, T31); \
- sum0 = _mm_add_epi16(sum0, sum1); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res[1] += _mm_cvtsi128_si32(sum0); \
- T10 = _mm_loadu_si128((__m128i*)(fref3 + (BASE)*frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 3) * frefstride)); \
- T20 = _mm_loadl_epi64((__m128i*)(fref3 + (BASE)*frefstride + 16)); \
- T21 = _mm_loadl_epi64((__m128i*)(fref3 + (BASE + 1) * frefstride + 16)); \
- T14 = _mm_unpacklo_epi64(T20, T21); \
- T22 = _mm_loadl_epi64((__m128i*)(fref3 + (BASE + 2) * frefstride + 16)); \
- T23 = _mm_loadl_epi64((__m128i*)(fref3 + (BASE + 3) * frefstride + 16)); \
- T15 = _mm_unpacklo_epi64(T22, T23); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T30 = _mm_sad_epu8(T04, T14); \
- T31 = _mm_sad_epu8(T05, T15); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum1 = _mm_add_epi16(T30, T31); \
- sum0 = _mm_add_epi16(sum0, sum1); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res[2] += _mm_cvtsi128_si32(sum0);
-
- for (int i = 0; i < ly; i += 8)
- {
- PROCESS_24x4x3(i);
- PROCESS_24x4x3(i + 4);
- }
-}
-
-template<int ly>
-// ly will be 8, 16, 24, or 32
-void sad_x3_32(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res)
-{
- res[0] = res[1] = res[2] = 0;
- __m128i T00, T01, T02, T03, T04, T05, T06, T07;
- __m128i T10, T11, T12, T13, T14, T15, T16, T17;
- __m128i T20, T21, T22, T23, T24, T25, T26, T27;
- __m128i sum0, sum1;
-
-#define PROCESS_32x4x3(BASE) \
- T00 = _mm_load_si128((__m128i*)(fenc + (BASE)*FENC_STRIDE)); \
- T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE)); \
- T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE)); \
- T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE)); \
- T04 = _mm_load_si128((__m128i*)(fenc + (BASE)*FENC_STRIDE + 16)); \
- T05 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE + 16)); \
- T06 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE + 16)); \
- T07 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE + 16)); \
- T10 = _mm_loadu_si128((__m128i*)(fref1 + (BASE)*frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 3) * frefstride)); \
- T14 = _mm_loadu_si128((__m128i*)(fref1 + (BASE)*frefstride + 16)); \
- T15 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 1) * frefstride + 16)); \
- T16 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 2) * frefstride + 16)); \
- T17 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 3) * frefstride + 16)); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T24 = _mm_sad_epu8(T04, T14); \
- T25 = _mm_sad_epu8(T05, T15); \
- T26 = _mm_sad_epu8(T06, T16); \
- T27 = _mm_sad_epu8(T07, T17); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- T24 = _mm_add_epi16(T24, T25); \
- T26 = _mm_add_epi16(T26, T27); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum0 = _mm_add_epi16(sum0, T24); \
- sum0 = _mm_add_epi16(sum0, T26); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res[0] += _mm_cvtsi128_si32(sum0); \
- T10 = _mm_loadu_si128((__m128i*)(fref2 + (BASE)*frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 3) * frefstride)); \
- T14 = _mm_loadu_si128((__m128i*)(fref2 + (BASE)*frefstride + 16)); \
- T15 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 1) * frefstride + 16)); \
- T16 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 2) * frefstride + 16)); \
- T17 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 3) * frefstride + 16)); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T24 = _mm_sad_epu8(T04, T14); \
- T25 = _mm_sad_epu8(T05, T15); \
- T26 = _mm_sad_epu8(T06, T16); \
- T27 = _mm_sad_epu8(T07, T17); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- T24 = _mm_add_epi16(T24, T25); \
- T26 = _mm_add_epi16(T26, T27); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum0 = _mm_add_epi16(sum0, T24); \
- sum0 = _mm_add_epi16(sum0, T26); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res[1] += _mm_cvtsi128_si32(sum0); \
- T10 = _mm_loadu_si128((__m128i*)(fref3 + (BASE)*frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 3) * frefstride)); \
- T14 = _mm_loadu_si128((__m128i*)(fref3 + (BASE)*frefstride + 16)); \
- T15 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 1) * frefstride + 16)); \
- T16 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 2) * frefstride + 16)); \
- T17 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 3) * frefstride + 16)); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T24 = _mm_sad_epu8(T04, T14); \
- T25 = _mm_sad_epu8(T05, T15); \
- T26 = _mm_sad_epu8(T06, T16); \
- T27 = _mm_sad_epu8(T07, T17); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- T24 = _mm_add_epi16(T24, T25); \
- T26 = _mm_add_epi16(T26, T27); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum0 = _mm_add_epi16(sum0, T24); \
- sum0 = _mm_add_epi16(sum0, T26); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res[2] += _mm_cvtsi128_si32(sum0);
-
- for (int i = 0; i < ly; i += 8)
- {
- PROCESS_32x4x3(i);
- PROCESS_32x4x3(i + 4);
- }
-}
-
-template<int ly>
void sad_x3_48(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res)
{
__m128i sum0 = _mm_setzero_si128();
@@ -2490,770 +874,6 @@
res[2] = _mm_cvtsi128_si32(sum2); /*Extracting sad value for reference frame 3*/
}
-#if HAVE_MMX
-void sad_x4_4x16(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, pixel *fref4, intptr_t frefstride, int32_t *res)
-{
- __m128i sum0, sum1, sum2, sum3;
-
- __m128i T00, T01, T02, T03;
- __m128i T10, T11, T12, T13;
- __m128i R00, R01, R02, R03, R04;
- __m128i T20;
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + FENC_STRIDE));
- T01 = _mm_unpacklo_epi32(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (2) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (3) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi32(T02, T03);
- R00 = _mm_unpacklo_epi64(T01, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (3) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R01 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (3) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R02 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (3) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R03 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref4));
- T11 = _mm_loadl_epi64((__m128i*)(fref4 + frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref4 + (2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref4 + (3) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R04 = _mm_unpacklo_epi64(T11, T13);
-
- T20 = _mm_sad_epu8(R00, R01);
- sum0 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
- T20 = _mm_sad_epu8(R00, R02);
- sum1 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
- T20 = _mm_sad_epu8(R00, R03);
- sum2 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
- T20 = _mm_sad_epu8(R00, R04);
- sum3 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (4) * FENC_STRIDE));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (5) * FENC_STRIDE));
- T01 = _mm_unpacklo_epi32(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (6) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (7) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi32(T02, T03);
- R00 = _mm_unpacklo_epi64(T01, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1 + (4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + (5) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (7) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R01 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2 + (4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + (5) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (7) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R02 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3 + (4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + (5) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (7) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R03 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref4 + (4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref4 + (5) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref4 + (6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref4 + (7) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R04 = _mm_unpacklo_epi64(T11, T13);
-
- T20 = _mm_sad_epu8(R00, R01);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum0 = _mm_add_epi32(sum0, T20);
-
- T20 = _mm_sad_epu8(R00, R02);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum1 = _mm_add_epi32(sum1, T20);
-
- T20 = _mm_sad_epu8(R00, R03);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum2 = _mm_add_epi32(sum2, T20);
-
- T20 = _mm_sad_epu8(R00, R04);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum3 = _mm_add_epi32(sum3, T20);
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (8) * FENC_STRIDE));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (9) * FENC_STRIDE));
- T01 = _mm_unpacklo_epi32(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (10) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (11) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi32(T02, T03);
- R00 = _mm_unpacklo_epi64(T01, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1 + (8) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + (9) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (10) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (11) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R01 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2 + (8) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + (9) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (10) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (11) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R02 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3 + (8) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + (9) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (10) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (11) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R03 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref4 + (8) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref4 + (9) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref4 + (10) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref4 + (11) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R04 = _mm_unpacklo_epi64(T11, T13);
-
- T20 = _mm_sad_epu8(R00, R01);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum0 = _mm_add_epi32(sum0, T20);
-
- T20 = _mm_sad_epu8(R00, R02);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum1 = _mm_add_epi32(sum1, T20);
-
- T20 = _mm_sad_epu8(R00, R03);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum2 = _mm_add_epi32(sum2, T20);
-
- T20 = _mm_sad_epu8(R00, R04);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum3 = _mm_add_epi32(sum3, T20);
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (12) * FENC_STRIDE));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (13) * FENC_STRIDE));
- T01 = _mm_unpacklo_epi32(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (14) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (15) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi32(T02, T03);
- R00 = _mm_unpacklo_epi64(T01, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1 + (12) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + (13) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (14) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (15) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R01 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2 + (12) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + (13) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (14) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (15) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R02 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3 + (12) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + (13) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (14) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (15) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R03 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref4 + (12) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref4 + (13) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref4 + (14) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref4 + (15) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R04 = _mm_unpacklo_epi64(T11, T13);
-
- T20 = _mm_sad_epu8(R00, R01);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum0 = _mm_add_epi32(sum0, T20);
-
- T20 = _mm_sad_epu8(R00, R02);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum1 = _mm_add_epi32(sum1, T20);
-
- T20 = _mm_sad_epu8(R00, R03);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum2 = _mm_add_epi32(sum2, T20);
-
- T20 = _mm_sad_epu8(R00, R04);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum3 = _mm_add_epi32(sum3, T20);
-
- res[0] = _mm_cvtsi128_si32(sum0);
- res[1] = _mm_cvtsi128_si32(sum1);
- res[2] = _mm_cvtsi128_si32(sum2);
- res[3] = _mm_cvtsi128_si32(sum3);
-}
-
-#else /* if HAVE_MMX */
-
-void sad_x4_4x16(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, pixel *fref4, intptr_t frefstride, int32_t *res)
-{
- __m128i sum0 = _mm_setzero_si128();
- __m128i sum1 = _mm_setzero_si128();
- __m128i sum2 = _mm_setzero_si128();
- __m128i sum3 = _mm_setzero_si128();
-
- __m128i T00, T01, T02, T03;
- __m128i T10, T11, T12, T13;
- __m128i R00, R01, R02, R03, R04;
- __m128i T20;
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (0) * FENC_STRIDE));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (1) * FENC_STRIDE));
- T01 = _mm_unpacklo_epi32(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (2) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (3) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi32(T02, T03);
- R00 = _mm_unpacklo_epi64(T01, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1 + (0) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + (1) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (3) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R01 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2 + (0) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + (1) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (3) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R02 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3 + (0) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + (1) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (3) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R03 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref4 + (0) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref4 + (1) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref4 + (2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref4 + (3) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R04 = _mm_unpacklo_epi64(T11, T13);
-
- T20 = _mm_sad_epu8(R00, R01);
- sum0 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
- T20 = _mm_sad_epu8(R00, R02);
- sum1 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
- T20 = _mm_sad_epu8(R00, R03);
- sum2 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
- T20 = _mm_sad_epu8(R00, R04);
- sum3 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (4) * FENC_STRIDE));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (5) * FENC_STRIDE));
- T01 = _mm_unpacklo_epi32(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (6) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (7) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi32(T02, T03);
- R00 = _mm_unpacklo_epi64(T01, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1 + (4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + (5) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (7) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R01 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2 + (4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + (5) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (7) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R02 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3 + (4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + (5) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (7) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R03 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref4 + (4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref4 + (5) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref4 + (6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref4 + (7) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R04 = _mm_unpacklo_epi64(T11, T13);
-
- T20 = _mm_sad_epu8(R00, R01);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum0 = _mm_add_epi32(sum0, T20);
-
- T20 = _mm_sad_epu8(R00, R02);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum1 = _mm_add_epi32(sum1, T20);
-
- T20 = _mm_sad_epu8(R00, R03);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum2 = _mm_add_epi32(sum2, T20);
-
- T20 = _mm_sad_epu8(R00, R04);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum3 = _mm_add_epi32(sum3, T20);
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (8) * FENC_STRIDE));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (9) * FENC_STRIDE));
- T01 = _mm_unpacklo_epi32(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (10) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (11) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi32(T02, T03);
- R00 = _mm_unpacklo_epi64(T01, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1 + (8) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + (9) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (10) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (11) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R01 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2 + (8) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + (9) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (10) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (11) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R02 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3 + (8) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + (9) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (10) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (11) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R03 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref4 + (8) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref4 + (9) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref4 + (10) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref4 + (11) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R04 = _mm_unpacklo_epi64(T11, T13);
-
- T20 = _mm_sad_epu8(R00, R01);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum0 = _mm_add_epi32(sum0, T20);
-
- T20 = _mm_sad_epu8(R00, R02);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum1 = _mm_add_epi32(sum1, T20);
-
- T20 = _mm_sad_epu8(R00, R03);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum2 = _mm_add_epi32(sum2, T20);
-
- T20 = _mm_sad_epu8(R00, R04);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum3 = _mm_add_epi32(sum3, T20);
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (12) * FENC_STRIDE));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (13) * FENC_STRIDE));
- T01 = _mm_unpacklo_epi32(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (14) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (15) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi32(T02, T03);
- R00 = _mm_unpacklo_epi64(T01, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1 + (12) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + (13) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (14) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (15) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R01 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2 + (12) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + (13) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (14) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (15) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R02 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3 + (12) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + (13) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (14) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (15) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R03 = _mm_unpacklo_epi64(T11, T13);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref4 + (12) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref4 + (13) * frefstride));
- T11 = _mm_unpacklo_epi32(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref4 + (14) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref4 + (15) * frefstride));
- T13 = _mm_unpacklo_epi32(T12, T13);
- R04 = _mm_unpacklo_epi64(T11, T13);
-
- T20 = _mm_sad_epu8(R00, R01);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum0 = _mm_add_epi32(sum0, T20);
-
- T20 = _mm_sad_epu8(R00, R02);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum1 = _mm_add_epi32(sum1, T20);
-
- T20 = _mm_sad_epu8(R00, R03);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum2 = _mm_add_epi32(sum2, T20);
-
- T20 = _mm_sad_epu8(R00, R04);
- T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
- sum3 = _mm_add_epi32(sum3, T20);
-
- res[0] = _mm_cvtsi128_si32(sum0);
- res[1] = _mm_cvtsi128_si32(sum1);
- res[2] = _mm_cvtsi128_si32(sum2);
- res[3] = _mm_cvtsi128_si32(sum3);
-}
-
-#endif /* if HAVE_MMX */
-
-#if HAVE_MMX
-template<int ly>
-// ly will always be 32
-void sad_x4_8(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, pixel *fref4, intptr_t frefstride, int32_t *res)
-{
- __m64 sum0 = _mm_setzero_si64();
- __m64 sum1 = _mm_setzero_si64();
- __m64 sum2 = _mm_setzero_si64();
- __m64 sum3 = _mm_setzero_si64();
-
- __m64 T00, T01, T02, T03, T04, T05, T06, T07;
- __m64 T10, T11, T12, T13, T14, T15, T16, T17;
- __m64 T20, T21, T22, T23, T24, T25, T26, T27;
-
- for (int i = 0; i < ly; i += 8)
- {
- T00 = (*(__m64*)(fenc + (i + 0) * FENC_STRIDE));
- T01 = (*(__m64*)(fenc + (i + 1) * FENC_STRIDE));
- T02 = (*(__m64*)(fenc + (i + 2) * FENC_STRIDE));
- T03 = (*(__m64*)(fenc + (i + 3) * FENC_STRIDE));
- T04 = (*(__m64*)(fenc + (i + 4) * FENC_STRIDE));
- T05 = (*(__m64*)(fenc + (i + 5) * FENC_STRIDE));
- T06 = (*(__m64*)(fenc + (i + 6) * FENC_STRIDE));
- T07 = (*(__m64*)(fenc + (i + 7) * FENC_STRIDE));
-
- T10 = (*(__m64*)(fref1 + (i + 0) * frefstride));
- T11 = (*(__m64*)(fref1 + (i + 1) * frefstride));
- T12 = (*(__m64*)(fref1 + (i + 2) * frefstride));
- T13 = (*(__m64*)(fref1 + (i + 3) * frefstride));
- T14 = (*(__m64*)(fref1 + (i + 4) * frefstride));
- T15 = (*(__m64*)(fref1 + (i + 5) * frefstride));
- T16 = (*(__m64*)(fref1 + (i + 6) * frefstride));
- T17 = (*(__m64*)(fref1 + (i + 7) * frefstride));
-
- T20 = _mm_sad_pu8(T00, T10);
- T21 = _mm_sad_pu8(T01, T11);
- T22 = _mm_sad_pu8(T02, T12);
- T23 = _mm_sad_pu8(T03, T13);
- T24 = _mm_sad_pu8(T04, T14);
- T25 = _mm_sad_pu8(T05, T15);
- T26 = _mm_sad_pu8(T06, T16);
- T27 = _mm_sad_pu8(T07, T17);
-
- sum0 = _mm_add_pi16(sum0, T20);
- sum0 = _mm_add_pi16(sum0, T21);
- sum0 = _mm_add_pi16(sum0, T22);
- sum0 = _mm_add_pi16(sum0, T23);
- sum0 = _mm_add_pi16(sum0, T24);
- sum0 = _mm_add_pi16(sum0, T25);
- sum0 = _mm_add_pi16(sum0, T26);
- sum0 = _mm_add_pi16(sum0, T27);
-
- T10 = (*(__m64*)(fref2 + (i + 0) * frefstride));
- T11 = (*(__m64*)(fref2 + (i + 1) * frefstride));
- T12 = (*(__m64*)(fref2 + (i + 2) * frefstride));
- T13 = (*(__m64*)(fref2 + (i + 3) * frefstride));
- T14 = (*(__m64*)(fref2 + (i + 4) * frefstride));
- T15 = (*(__m64*)(fref2 + (i + 5) * frefstride));
- T16 = (*(__m64*)(fref2 + (i + 6) * frefstride));
- T17 = (*(__m64*)(fref2 + (i + 7) * frefstride));
-
- T20 = _mm_sad_pu8(T00, T10);
- T21 = _mm_sad_pu8(T01, T11);
- T22 = _mm_sad_pu8(T02, T12);
- T23 = _mm_sad_pu8(T03, T13);
- T24 = _mm_sad_pu8(T04, T14);
- T25 = _mm_sad_pu8(T05, T15);
- T26 = _mm_sad_pu8(T06, T16);
- T27 = _mm_sad_pu8(T07, T17);
-
- sum1 = _mm_add_pi16(sum1, T20);
- sum1 = _mm_add_pi16(sum1, T21);
- sum1 = _mm_add_pi16(sum1, T22);
- sum1 = _mm_add_pi16(sum1, T23);
- sum1 = _mm_add_pi16(sum1, T24);
- sum1 = _mm_add_pi16(sum1, T25);
- sum1 = _mm_add_pi16(sum1, T26);
- sum1 = _mm_add_pi16(sum1, T27);
-
- T10 = (*(__m64*)(fref3 + (i + 0) * frefstride));
- T11 = (*(__m64*)(fref3 + (i + 1) * frefstride));
- T12 = (*(__m64*)(fref3 + (i + 2) * frefstride));
- T13 = (*(__m64*)(fref3 + (i + 3) * frefstride));
- T14 = (*(__m64*)(fref3 + (i + 4) * frefstride));
- T15 = (*(__m64*)(fref3 + (i + 5) * frefstride));
- T16 = (*(__m64*)(fref3 + (i + 6) * frefstride));
- T17 = (*(__m64*)(fref3 + (i + 7) * frefstride));
-
- T20 = _mm_sad_pu8(T00, T10);
- T21 = _mm_sad_pu8(T01, T11);
- T22 = _mm_sad_pu8(T02, T12);
- T23 = _mm_sad_pu8(T03, T13);
- T24 = _mm_sad_pu8(T04, T14);
- T25 = _mm_sad_pu8(T05, T15);
- T26 = _mm_sad_pu8(T06, T16);
- T27 = _mm_sad_pu8(T07, T17);
-
- sum2 = _mm_add_pi16(sum2, T20);
- sum2 = _mm_add_pi16(sum2, T21);
- sum2 = _mm_add_pi16(sum2, T22);
- sum2 = _mm_add_pi16(sum2, T23);
- sum2 = _mm_add_pi16(sum2, T24);
- sum2 = _mm_add_pi16(sum2, T25);
- sum2 = _mm_add_pi16(sum2, T26);
- sum2 = _mm_add_pi16(sum2, T27);
-
- T10 = (*(__m64*)(fref4 + (i + 0) * frefstride));
- T11 = (*(__m64*)(fref4 + (i + 1) * frefstride));
- T12 = (*(__m64*)(fref4 + (i + 2) * frefstride));
- T13 = (*(__m64*)(fref4 + (i + 3) * frefstride));
- T14 = (*(__m64*)(fref4 + (i + 4) * frefstride));
- T15 = (*(__m64*)(fref4 + (i + 5) * frefstride));
- T16 = (*(__m64*)(fref4 + (i + 6) * frefstride));
- T17 = (*(__m64*)(fref4 + (i + 7) * frefstride));
-
- T20 = _mm_sad_pu8(T00, T10);
- T21 = _mm_sad_pu8(T01, T11);
- T22 = _mm_sad_pu8(T02, T12);
- T23 = _mm_sad_pu8(T03, T13);
- T24 = _mm_sad_pu8(T04, T14);
- T25 = _mm_sad_pu8(T05, T15);
- T26 = _mm_sad_pu8(T06, T16);
- T27 = _mm_sad_pu8(T07, T17);
-
- sum3 = _mm_add_pi16(sum3, T20);
- sum3 = _mm_add_pi16(sum3, T21);
- sum3 = _mm_add_pi16(sum3, T22);
- sum3 = _mm_add_pi16(sum3, T23);
- sum3 = _mm_add_pi16(sum3, T24);
- sum3 = _mm_add_pi16(sum3, T25);
- sum3 = _mm_add_pi16(sum3, T26);
- sum3 = _mm_add_pi16(sum3, T27);
- }
-
- res[0] = _m_to_int(sum0);
- res[1] = _m_to_int(sum1);
- res[2] = _m_to_int(sum2);
- res[3] = _m_to_int(sum3);
-}
-
-#else /* if HAVE_MMX */
-
-template<int ly>
-// ly will always be 32
-void sad_x4_8(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, pixel *fref4, intptr_t frefstride, int32_t *res)
-{
- __m128i sum0 = _mm_setzero_si128();
- __m128i sum1 = _mm_setzero_si128();
- __m128i sum2 = _mm_setzero_si128();
- __m128i sum3 = _mm_setzero_si128();
-
- __m128i T00, T01, T02, T03;
- __m128i T10, T11, T12, T13;
- __m128i T20, T21;
-
- for (int i = 0; i < ly; i += 8)
- {
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (i + 0) * FENC_STRIDE));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (i + 1) * FENC_STRIDE));
- T01 = _mm_unpacklo_epi64(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (i + 2) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (i + 3) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi64(T02, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 0) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 1) * frefstride));
- T11 = _mm_unpacklo_epi64(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 3) * frefstride));
- T13 = _mm_unpacklo_epi64(T12, T13);
-
- T20 = _mm_sad_epu8(T01, T11);
- T21 = _mm_sad_epu8(T03, T13);
- T21 = _mm_add_epi32(T20, T21);
- T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
- sum0 = _mm_add_epi32(sum0, T21);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 0) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 1) * frefstride));
- T11 = _mm_unpacklo_epi64(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 3) * frefstride));
- T13 = _mm_unpacklo_epi64(T12, T13);
-
- T20 = _mm_sad_epu8(T01, T11);
- T21 = _mm_sad_epu8(T03, T13);
- T21 = _mm_add_epi32(T20, T21);
- T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
- sum1 = _mm_add_epi32(sum1, T21);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 0) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 1) * frefstride));
- T11 = _mm_unpacklo_epi64(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 3) * frefstride));
- T13 = _mm_unpacklo_epi64(T12, T13);
-
- T20 = _mm_sad_epu8(T01, T11);
- T21 = _mm_sad_epu8(T03, T13);
- T21 = _mm_add_epi32(T20, T21);
- T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
- sum2 = _mm_add_epi32(sum2, T21);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 0) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 1) * frefstride));
- T11 = _mm_unpacklo_epi64(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 2) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 3) * frefstride));
- T13 = _mm_unpacklo_epi64(T12, T13);
-
- T20 = _mm_sad_epu8(T01, T11);
- T21 = _mm_sad_epu8(T03, T13);
- T21 = _mm_add_epi32(T20, T21);
- T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
- sum3 = _mm_add_epi32(sum3, T21);
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc + (i + 4) * FENC_STRIDE));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + (i + 5) * FENC_STRIDE));
- T01 = _mm_unpacklo_epi64(T00, T01);
- T02 = _mm_loadl_epi64((__m128i*)(fenc + (i + 6) * FENC_STRIDE));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + (i + 7) * FENC_STRIDE));
- T03 = _mm_unpacklo_epi64(T02, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 5) * frefstride));
- T11 = _mm_unpacklo_epi64(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 7) * frefstride));
- T13 = _mm_unpacklo_epi64(T12, T13);
-
- T20 = _mm_sad_epu8(T01, T11);
- T21 = _mm_sad_epu8(T03, T13);
- T21 = _mm_add_epi32(T20, T21);
- T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
- sum0 = _mm_add_epi32(sum0, T21);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 5) * frefstride));
- T11 = _mm_unpacklo_epi64(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 7) * frefstride));
- T13 = _mm_unpacklo_epi64(T12, T13);
-
- T20 = _mm_sad_epu8(T01, T11);
- T21 = _mm_sad_epu8(T03, T13);
- T21 = _mm_add_epi32(T20, T21);
- T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
- sum1 = _mm_add_epi32(sum1, T21);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 5) * frefstride));
- T11 = _mm_unpacklo_epi64(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 7) * frefstride));
- T13 = _mm_unpacklo_epi64(T12, T13);
-
- T20 = _mm_sad_epu8(T01, T11);
- T21 = _mm_sad_epu8(T03, T13);
- T21 = _mm_add_epi32(T20, T21);
- T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
- sum2 = _mm_add_epi32(sum2, T21);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 4) * frefstride));
- T11 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 5) * frefstride));
- T11 = _mm_unpacklo_epi64(T10, T11);
- T12 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 6) * frefstride));
- T13 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 7) * frefstride));
- T13 = _mm_unpacklo_epi64(T12, T13);
-
- T20 = _mm_sad_epu8(T01, T11);
- T21 = _mm_sad_epu8(T03, T13);
- T21 = _mm_add_epi32(T20, T21);
- T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
- sum3 = _mm_add_epi32(sum3, T21);
- }
-
- res[0] = _mm_cvtsi128_si32(sum0);
- res[1] = _mm_cvtsi128_si32(sum1);
- res[2] = _mm_cvtsi128_si32(sum2);
- res[3] = _mm_cvtsi128_si32(sum3);
-}
-
-#endif /* if HAVE_MMX */
-
-/* For performance - This function assumes that the *last load* can access 16 elements. */
-
template<int ly>
void sad_x4_12(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, pixel *fref4, intptr_t frefstride, int32_t *res)
{
@@ -3357,360 +977,6 @@
}
template<int ly>
-void sad_x4_16(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, pixel *fref4, intptr_t frefstride, int32_t *res)
-{
-#define PROCESS_16x4x4(BASE) \
- T00 = _mm_load_si128((__m128i*)(fenc + (BASE + 0) * FENC_STRIDE)); \
- T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE)); \
- T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE)); \
- T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE)); \
- T10 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 0) * frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 3) * frefstride)); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res0 += _mm_cvtsi128_si32(sum0); \
- T10 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 0) * frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 3) * frefstride)); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res1 += _mm_cvtsi128_si32(sum0); \
- T10 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 0) * frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 3) * frefstride)); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res2 += _mm_cvtsi128_si32(sum0); \
- T10 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 0) * frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 3) * frefstride)); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res3 += _mm_cvtsi128_si32(sum0); \
-
- __m128i T00, T01, T02, T03;
- __m128i T10, T11, T12, T13;
- __m128i T20, T21, T22, T23;
- __m128i sum0, sum1;
- int res0 = 0, res1 = 0, res2 = 0, res3 = 0;
-
- // ly == 4, 12, 32, 64
- PROCESS_16x4x4(0);
- if (ly >= 8)
- {
- PROCESS_16x4x4(4);
- }
- if (ly >= 12)
- {
- PROCESS_16x4x4(8);
- }
- if (ly > 12)
- {
- PROCESS_16x4x4(12);
- for (int i = 16; i < ly; i += 16)
- {
- PROCESS_16x4x4(i);
- PROCESS_16x4x4(i + 4);
- PROCESS_16x4x4(i + 8);
- PROCESS_16x4x4(i + 12);
- }
- }
- res[0] = res0;
- res[1] = res1;
- res[2] = res2;
- res[3] = res3;
-}
-
-template<int ly>
-void sad_x4_24(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, pixel *fref4, intptr_t frefstride, int32_t *res)
-{
- res[0] = res[1] = res[2] = res[3] = 0;
- __m128i T00, T01, T02, T03, T04, T05;
- __m128i T10, T11, T12, T13, T14, T15;
- __m128i T20, T21, T22, T23;
- __m128i T30, T31;
- __m128i sum0, sum1;
-
-#define PROCESS_24x4x4(BASE) \
- T00 = _mm_load_si128((__m128i*)(fenc + (BASE)*FENC_STRIDE)); \
- T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE)); \
- T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE)); \
- T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE)); \
- T10 = _mm_loadl_epi64((__m128i*)(fenc + (BASE)*FENC_STRIDE + 16)); \
- T11 = _mm_loadl_epi64((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE + 16)); \
- T04 = _mm_unpacklo_epi64(T10, T11); \
- T12 = _mm_loadl_epi64((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE + 16)); \
- T13 = _mm_loadl_epi64((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE + 16)); \
- T05 = _mm_unpacklo_epi64(T12, T13); \
- T10 = _mm_loadu_si128((__m128i*)(fref1 + (BASE)*frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 3) * frefstride)); \
- T20 = _mm_loadl_epi64((__m128i*)(fref1 + (BASE)*frefstride + 16)); \
- T21 = _mm_loadl_epi64((__m128i*)(fref1 + (BASE + 1) * frefstride + 16)); \
- T14 = _mm_unpacklo_epi64(T20, T21); \
- T22 = _mm_loadl_epi64((__m128i*)(fref1 + (BASE + 2) * frefstride + 16)); \
- T23 = _mm_loadl_epi64((__m128i*)(fref1 + (BASE + 3) * frefstride + 16)); \
- T15 = _mm_unpacklo_epi64(T22, T23); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T30 = _mm_sad_epu8(T04, T14); \
- T31 = _mm_sad_epu8(T05, T15); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum1 = _mm_add_epi16(T30, T31); \
- sum0 = _mm_add_epi16(sum0, sum1); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res[0] += _mm_cvtsi128_si32(sum0); \
- T10 = _mm_loadu_si128((__m128i*)(fref2 + (BASE)*frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 3) * frefstride)); \
- T20 = _mm_loadl_epi64((__m128i*)(fref2 + (BASE)*frefstride + 16)); \
- T21 = _mm_loadl_epi64((__m128i*)(fref2 + (BASE + 1) * frefstride + 16)); \
- T14 = _mm_unpacklo_epi64(T20, T21); \
- T22 = _mm_loadl_epi64((__m128i*)(fref2 + (BASE + 2) * frefstride + 16)); \
- T23 = _mm_loadl_epi64((__m128i*)(fref2 + (BASE + 3) * frefstride + 16)); \
- T15 = _mm_unpacklo_epi64(T22, T23); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T30 = _mm_sad_epu8(T04, T14); \
- T31 = _mm_sad_epu8(T05, T15); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum1 = _mm_add_epi16(T30, T31); \
- sum0 = _mm_add_epi16(sum0, sum1); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res[1] += _mm_cvtsi128_si32(sum0); \
- T10 = _mm_loadu_si128((__m128i*)(fref3 + (BASE)*frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 3) * frefstride)); \
- T20 = _mm_loadl_epi64((__m128i*)(fref3 + (BASE)*frefstride + 16)); \
- T21 = _mm_loadl_epi64((__m128i*)(fref3 + (BASE + 1) * frefstride + 16)); \
- T14 = _mm_unpacklo_epi64(T20, T21); \
- T22 = _mm_loadl_epi64((__m128i*)(fref3 + (BASE + 2) * frefstride + 16)); \
- T23 = _mm_loadl_epi64((__m128i*)(fref3 + (BASE + 3) * frefstride + 16)); \
- T15 = _mm_unpacklo_epi64(T22, T23); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T30 = _mm_sad_epu8(T04, T14); \
- T31 = _mm_sad_epu8(T05, T15); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum1 = _mm_add_epi16(T30, T31); \
- sum0 = _mm_add_epi16(sum0, sum1); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res[2] += _mm_cvtsi128_si32(sum0); \
- T10 = _mm_loadu_si128((__m128i*)(fref4 + (BASE)*frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 3) * frefstride)); \
- T20 = _mm_loadl_epi64((__m128i*)(fref4 + (BASE)*frefstride + 16)); \
- T21 = _mm_loadl_epi64((__m128i*)(fref4 + (BASE + 1) * frefstride + 16)); \
- T14 = _mm_unpacklo_epi64(T20, T21); \
- T22 = _mm_loadl_epi64((__m128i*)(fref4 + (BASE + 2) * frefstride + 16)); \
- T23 = _mm_loadl_epi64((__m128i*)(fref4 + (BASE + 3) * frefstride + 16)); \
- T15 = _mm_unpacklo_epi64(T22, T23); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T30 = _mm_sad_epu8(T04, T14); \
- T31 = _mm_sad_epu8(T05, T15); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum1 = _mm_add_epi16(T30, T31); \
- sum0 = _mm_add_epi16(sum0, sum1); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res[3] += _mm_cvtsi128_si32(sum0)
-
- for (int i = 0; i < ly; i += 8)
- {
- PROCESS_24x4x4(i);
- PROCESS_24x4x4(i + 4);
- }
-}
-
-template<int ly>
-// ly will be 8, 16, 24, or 32
-void sad_x4_32(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, pixel *fref4, intptr_t frefstride, int32_t *res)
-{
- res[0] = res[1] = res[2] = res[3] = 0;
- __m128i T00, T01, T02, T03, T04, T05, T06, T07;
- __m128i T10, T11, T12, T13, T14, T15, T16, T17;
- __m128i T20, T21, T22, T23, T24, T25, T26, T27;
- __m128i sum0, sum1;
-
-#define PROCESS_32x4x4(BASE) \
- T00 = _mm_load_si128((__m128i*)(fenc + (BASE)*FENC_STRIDE)); \
- T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE)); \
- T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE)); \
- T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE)); \
- T04 = _mm_load_si128((__m128i*)(fenc + (BASE)*FENC_STRIDE + 16)); \
- T05 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE + 16)); \
- T06 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE + 16)); \
- T07 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE + 16)); \
- T10 = _mm_loadu_si128((__m128i*)(fref1 + (BASE)*frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 3) * frefstride)); \
- T14 = _mm_loadu_si128((__m128i*)(fref1 + (BASE)*frefstride + 16)); \
- T15 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 1) * frefstride + 16)); \
- T16 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 2) * frefstride + 16)); \
- T17 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 3) * frefstride + 16)); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T24 = _mm_sad_epu8(T04, T14); \
- T25 = _mm_sad_epu8(T05, T15); \
- T26 = _mm_sad_epu8(T06, T16); \
- T27 = _mm_sad_epu8(T07, T17); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- T24 = _mm_add_epi16(T24, T25); \
- T26 = _mm_add_epi16(T26, T27); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum0 = _mm_add_epi16(sum0, T24); \
- sum0 = _mm_add_epi16(sum0, T26); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res[0] += _mm_cvtsi128_si32(sum0); \
- T10 = _mm_loadu_si128((__m128i*)(fref2 + (BASE)*frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 3) * frefstride)); \
- T14 = _mm_loadu_si128((__m128i*)(fref2 + (BASE)*frefstride + 16)); \
- T15 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 1) * frefstride + 16)); \
- T16 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 2) * frefstride + 16)); \
- T17 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 3) * frefstride + 16)); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T24 = _mm_sad_epu8(T04, T14); \
- T25 = _mm_sad_epu8(T05, T15); \
- T26 = _mm_sad_epu8(T06, T16); \
- T27 = _mm_sad_epu8(T07, T17); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- T24 = _mm_add_epi16(T24, T25); \
- T26 = _mm_add_epi16(T26, T27); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum0 = _mm_add_epi16(sum0, T24); \
- sum0 = _mm_add_epi16(sum0, T26); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res[1] += _mm_cvtsi128_si32(sum0); \
- T10 = _mm_loadu_si128((__m128i*)(fref3 + (BASE)*frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 3) * frefstride)); \
- T14 = _mm_loadu_si128((__m128i*)(fref3 + (BASE)*frefstride + 16)); \
- T15 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 1) * frefstride + 16)); \
- T16 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 2) * frefstride + 16)); \
- T17 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 3) * frefstride + 16)); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T24 = _mm_sad_epu8(T04, T14); \
- T25 = _mm_sad_epu8(T05, T15); \
- T26 = _mm_sad_epu8(T06, T16); \
- T27 = _mm_sad_epu8(T07, T17); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- T24 = _mm_add_epi16(T24, T25); \
- T26 = _mm_add_epi16(T26, T27); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum0 = _mm_add_epi16(sum0, T24); \
- sum0 = _mm_add_epi16(sum0, T26); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res[2] += _mm_cvtsi128_si32(sum0); \
- T10 = _mm_loadu_si128((__m128i*)(fref4 + (BASE)*frefstride)); \
- T11 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 1) * frefstride)); \
- T12 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 2) * frefstride)); \
- T13 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 3) * frefstride)); \
- T14 = _mm_loadu_si128((__m128i*)(fref4 + (BASE)*frefstride + 16)); \
- T15 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 1) * frefstride + 16)); \
- T16 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 2) * frefstride + 16)); \
- T17 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 3) * frefstride + 16)); \
- T20 = _mm_sad_epu8(T00, T10); \
- T21 = _mm_sad_epu8(T01, T11); \
- T22 = _mm_sad_epu8(T02, T12); \
- T23 = _mm_sad_epu8(T03, T13); \
- T24 = _mm_sad_epu8(T04, T14); \
- T25 = _mm_sad_epu8(T05, T15); \
- T26 = _mm_sad_epu8(T06, T16); \
- T27 = _mm_sad_epu8(T07, T17); \
- T20 = _mm_add_epi16(T20, T21); \
- T22 = _mm_add_epi16(T22, T23); \
- T24 = _mm_add_epi16(T24, T25); \
- T26 = _mm_add_epi16(T26, T27); \
- sum0 = _mm_add_epi16(T20, T22); \
- sum0 = _mm_add_epi16(sum0, T24); \
- sum0 = _mm_add_epi16(sum0, T26); \
- sum1 = _mm_shuffle_epi32(sum0, 2); \
- sum0 = _mm_add_epi32(sum0, sum1); \
- res[3] += _mm_cvtsi128_si32(sum0)
-
- for (int i = 0; i < ly; i += 8)
- {
- PROCESS_32x4x4(i);
- PROCESS_32x4x4(i + 4);
- }
-}
-
-template<int ly>
void sad_x4_48(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, pixel *fref4, intptr_t frefstride, int32_t *res)
{
__m128i sum0 = _mm_setzero_si128();
@@ -5601,7 +2867,6 @@
p.sse_ss[LUMA_ ## W ## x ## H] = sse_ss ## W < H >
#else
#define SETUP_PARTITION(W, H) \
- p.sad[LUMA_ ## W ## x ## H] = sad_ ## W<H>; \
p.sad_x3[LUMA_ ## W ## x ## H] = sad_x3_ ## W<H>; \
p.sad_x4[LUMA_ ## W ## x ## H] = sad_x4_ ## W<H>; \
p.sse_sp[LUMA_ ## W ## x ## H] = sse_sp ## W<H>; \
@@ -5616,25 +2881,25 @@
/* 2Nx2N, 2NxN, Nx2N, 4Ax3A, 4AxA, 3Ax4A, Ax4A */
SETUP_PARTITION(64, 64);
SETUP_PARTITION(64, 32);
- SETUP_PARTITION(32, 64);
+ SETUP_NONSAD(32, 64);
SETUP_PARTITION(64, 16);
SETUP_PARTITION(64, 48);
- SETUP_PARTITION(16, 64);
+ SETUP_NONSAD(16, 64);
SETUP_PARTITION(48, 64);
- SETUP_PARTITION(32, 32);
- SETUP_PARTITION(32, 16);
- SETUP_PARTITION(16, 32);
- SETUP_PARTITION(32, 8);
- SETUP_PARTITION(32, 24);
- SETUP_PARTITION(8, 32);
- SETUP_PARTITION(24, 32);
+ SETUP_NONSAD(32, 32);
+ SETUP_NONSAD(32, 16);
+ SETUP_NONSAD(16, 32);
+ SETUP_NONSAD(32, 8);
+ SETUP_NONSAD(32, 24);
+ SETUP_NONSAD(8, 32);
+ SETUP_NONSAD(24, 32);
SETUP_NONSAD(16, 16); // 16x16 SAD covered by assembly
SETUP_NONSAD(16, 8); // 16x8 SAD covered by assembly
SETUP_NONSAD(8, 16); // 8x16 SAD covered by assembly
- SETUP_PARTITION(16, 4);
- SETUP_PARTITION(16, 12);
+ SETUP_NONSAD(16, 4);
+ SETUP_NONSAD(16, 12);
SETUP_NONSAD(4, 16); // 4x16 SAD covered by assembly
#if !defined(__clang__)
SETUP_PARTITION(12, 16);
@@ -5652,8 +2917,6 @@
Setup_Vec_Pixel16Primitives_sse41(p);
#else
// These are the only SSE primitives uncovered by assembly
- p.sad_x3[LUMA_4x16] = sad_x3_4x16;
- p.sad_x4[LUMA_4x16] = sad_x4_4x16;
p.sse_pp[LUMA_12x16] = sse_pp_12x16;
p.sse_pp[LUMA_24x32] = sse_pp_24x32;
p.sse_pp[LUMA_48x64] = sse_pp_48x64;
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/vec/vec-primitives.cpp
--- a/source/common/vec/vec-primitives.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/vec/vec-primitives.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -43,10 +43,30 @@
*edx = output[3];
}
+#if defined(_MSC_VER)
+#pragma warning(disable: 4100)
+#endif
void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx)
{
+#if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)
+
+    // MSVC 2010 SP1 or later, or Intel C++ Compiler 12.0 or later; both provide _xgetbv()
uint64_t out = _xgetbv(op);
+#elif defined(__GNUC__) // use inline assembly, Gnu/AT&T syntax
+
+    uint32_t a, d;
+    __asm("xgetbv" : "=a" (a), "=d" (d) : "c" (op) :);
+    uint64_t out = ((uint64_t)d << 32) | a;
+
+#elif defined(_WIN64) // Win64 with an older compiler: no _xgetbv() and no GNU inline asm, so report zero
+
+ uint64_t out = 0;
+
+#endif
+
*eax = (uint32_t)out;
*edx = (uint32_t)(out >> 32);
}
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -126,7 +126,8 @@
p.pixelavg_pp[LUMA_8x4] = x265_pixel_avg_8x4_ ## cpu;
#define SETUP_CHROMA_FUNC_DEF(W, H, cpu) \
- p.chroma_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu
+ p.chroma_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu;\
+ p.chroma_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu;
#define CHROMA_FILTERS(cpu) \
SETUP_CHROMA_FUNC_DEF(4, 4, cpu); \
@@ -221,8 +222,8 @@
{
INIT8_NAME(sse_pp, ssd, _mmx);
INIT8(sad, _mmx2);
- INIT7(sad_x3, _mmx2);
- INIT7(sad_x4, _mmx2);
+ INIT8(sad_x3, _mmx2);
+ INIT8(sad_x4, _mmx2);
INIT8(satd, _mmx2);
HEVC_SATD(mmx2);
p.satd[LUMA_12x16] = cmp<12, 16, 4, 16, x265_pixel_satd_4x16_mmx2>;
@@ -235,6 +236,27 @@
//p.pixelavg_pp[LUMA_4x4] = x265_pixel_avg_4x4_mmx2;
//PIXEL_AVE(sse2);
+ p.sad[LUMA_8x32] = x265_pixel_sad_8x32_sse2;
+ p.sad[LUMA_16x4 ] = x265_pixel_sad_16x4_sse2;
+ p.sad[LUMA_16x12] = x265_pixel_sad_16x12_sse2;
+ p.sad[LUMA_16x32] = x265_pixel_sad_16x32_sse2;
+ p.sad[LUMA_16x64] = x265_pixel_sad_16x64_sse2;
+
+ p.sad[LUMA_32x8 ] = x265_pixel_sad_32x8_sse2;
+ p.sad[LUMA_32x16] = x265_pixel_sad_32x16_sse2;
+ p.sad[LUMA_32x24] = x265_pixel_sad_32x24_sse2;
+ p.sad[LUMA_32x32] = x265_pixel_sad_32x32_sse2;
+ p.sad[LUMA_32x64] = x265_pixel_sad_32x64_sse2;
+
+ p.sad[LUMA_64x16] = x265_pixel_sad_64x16_sse2;
+ p.sad[LUMA_64x32] = x265_pixel_sad_64x32_sse2;
+ p.sad[LUMA_64x48] = x265_pixel_sad_64x48_sse2;
+ p.sad[LUMA_64x64] = x265_pixel_sad_64x64_sse2;
+
+ p.sad[LUMA_48x64] = x265_pixel_sad_48x64_sse2;
+ p.sad[LUMA_24x32] = x265_pixel_sad_24x32_sse2;
+ p.sad[LUMA_12x16] = x265_pixel_sad_12x16_sse2;
+
ASSGN_SSE(sse2);
INIT2(sad, _sse2);
INIT2(sad_x3, _sse2);
@@ -280,6 +302,22 @@
p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ssse3;
p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_ssse3;
p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_ssse3;
+ p.sad_x3[LUMA_24x32] = x265_pixel_sad_x3_24x32_ssse3;
+ p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_ssse3;
+ p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_ssse3;
+ p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_ssse3;
+ p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_ssse3;
+ p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_ssse3;
+ p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_ssse3;
+ p.sad_x4[LUMA_32x8] = x265_pixel_sad_x4_32x8_ssse3;
+ p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_ssse3;
+ p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_ssse3;
+ p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_ssse3;
+ p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_ssse3;
+
+ p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
+ p.ipfilter_sp[FILTER_V_S_P_8] = x265_interp_8tap_v_sp_ssse3;
+ p.luma_p2s = x265_luma_p2s_ssse3;
}
if (cpuMask & X265_CPU_SSE4)
{
@@ -310,6 +348,18 @@
p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx;
p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_avx;
p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_avx;
+ p.sad_x3[LUMA_24x32] = x265_pixel_sad_x3_24x32_avx;
+ p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_avx;
+ p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_avx;
+ p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_avx;
+ p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_avx;
+ p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_avx;
+ p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_avx;
+ p.sad_x4[LUMA_32x8] = x265_pixel_sad_x4_32x8_avx;
+ p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_avx;
+ p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_avx;
+ p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_avx;
+ p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_avx;
}
if (cpuMask & X265_CPU_XOP)
{
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/x86/ipfilter8.asm Thu Oct 31 18:43:03 2013 +0530
@@ -35,7 +35,14 @@
db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12
db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14
-tab_c_512: times 8 dw 512
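+; shuffle masks for the 4-tap vertical chroma filters: tab_Vm broadcasts the
+; (c0,c1) and (c2,c3) coefficient pairs for pmaddubsw, tab_Cm reorders the taps
+; to (c0,c2,c1,c3) to match the row interleave used by the 2- and 4-wide paths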
+tab_Vm: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+ db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
+
+tab_Cm: db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3
+
+tab_c_512: times 8 dw 512
+tab_c_8192: times 8 dw 8192
+tab_c_526336: times 4 dd 8192*64+2048
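+; 526336 = 8192*64 + 2048: cancels the -8192 bias carried by the int16
+; intermediates (scaled by the 64-sum filter taps) and rounds before the
+; 12-bit shift in the vertical pass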
tab_ChromaCoeff: db 0, 64, 0, 0
db -2, 58, 10, -2
@@ -51,21 +58,46 @@
db -1, 4, -11, 40, 40, -11, 4, -1
db 0, 1, -5, 17, 58, -10, 4, -1
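+; vertical luma taps stored as four dw pairs per filter (taps 0-1, 2-3, 4-5, 6-7),
+; each pair replicated 4x for pmaddwd; one filter occupies 64 bytes, so the code
+; indexes this table with coeffIdx << 6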
+tab_LumaCoeffV: times 4 dw 0, 0
+ times 4 dw 0, 64
+ times 4 dw 0, 0
+ times 4 dw 0, 0
+
+ times 4 dw -1, 4
+ times 4 dw -10, 58
+ times 4 dw 17, -5
+ times 4 dw 1, 0
+
+ times 4 dw -1, 4
+ times 4 dw -11, 40
+ times 4 dw 40, -11
+ times 4 dw 4, -1
+
+ times 4 dw 0, 1
+ times 4 dw -5, 17
+ times 4 dw 58, -10
+ times 4 dw 4, -1
+
+tab_c_128: times 16 db 0x80
+tab_c_64_n64: times 8 db 64, -64
+
SECTION .text
%macro FILTER_H4_w2_2 3
- movu %2, [srcq - 1]
+ movh %2, [srcq - 1]
pshufb %2, %2, Tm0
+ movh %1, [srcq + srcstrideq - 1]
+ pshufb %1, %1, Tm0
+ punpcklqdq %2, %1
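+    ; rows 0 and 1 are packed into one register, so a single pmaddubsw/phaddw
+    ; pair now filters both output rows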
pmaddubsw %2, coef2
- movu %1, [srcq + srcstrideq - 1]
- pshufb %1, %1, Tm0
- pmaddubsw %1, coef2
- phaddw %2, %1
+ phaddw %2, %2
pmulhrsw %2, %3
packuswb %2, %2
- pextrw [dstq], %2, 0
- pextrw [dstq + dststrideq], %2, 2
+ movd r4, %2
+ mov [dstq], r4w
+ shr r4, 16
+ mov [dstq + dststrideq], r4w
%endmacro
;-----------------------------------------------------------------------------
@@ -137,17 +169,18 @@
RET
%macro FILTER_H4_w4_2 3
- movu %2, [srcq - 1]
+ movh %2, [srcq - 1]
pshufb %2, %2, Tm0
pmaddubsw %2, coef2
- movu %1, [srcq + srcstrideq - 1]
+ movh %1, [srcq + srcstrideq - 1]
pshufb %1, %1, Tm0
pmaddubsw %1, coef2
phaddw %2, %1
pmulhrsw %2, %3
packuswb %2, %2
- movd [dstq], %2
- pextrd [dstq + dststrideq], %2, 1
+ movd [dstq], %2
+ palignr %2, %2, 4
+ movd [dstq + dststrideq], %2
%endmacro
;-----------------------------------------------------------------------------
@@ -523,8 +556,8 @@
pmaddubsw %1, %5
phaddw %4, %1
phaddw %2, %4
+ %if %0 == 8
pmulhrsw %2, %6
- %if %0 == 8
packuswb %2, %2
movh %8, %2
%endif
@@ -623,3 +656,1474 @@
IPFILTER_LUMA 48, 64
IPFILTER_LUMA 64, 16
IPFILTER_LUMA 16, 64
+
+
+;-----------------------------------------------------------------------------
+; Interpolate HV
+;-----------------------------------------------------------------------------
+%macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) -> (t3, t5), (t4, t1), [2]
+ mova %5, [r0 + (%6 + 0) * 16]
+ mova %1, [r0 + (%6 + 1) * 16]
+ mova %2, [r0 + (%6 + 2) * 16]
+ punpcklwd %3, %5, %1
+ punpckhwd %5, %1
+ pmaddwd %3, [r5 + (%7) * 16] ; R3 = L[0+1] -- Row 0
+ pmaddwd %5, [r5 + (%7) * 16] ; R0 = H[0+1]
+ punpcklwd %4, %1, %2
+ punpckhwd %1, %2
+ pmaddwd %4, [r5 + (%7) * 16] ; R4 = L[1+2] -- Row 1
+ pmaddwd %1, [r5 + (%7) * 16] ; R1 = H[1+2]
+%endmacro ; FILTER_HV8_START
+
+%macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff) -> [6]
+ mova %8, [r0 + (%9 + 0) * 16]
+ mova %1, [r0 + (%9 + 1) * 16]
+ punpcklwd %7, %2, %8
+ punpckhwd %2, %8
+ pmaddwd %7, [r5 + %10 * 16]
+ pmaddwd %2, [r5 + %10 * 16]
+ paddd %3, %7 ; R3 = L[0+1+2+3] -- Row 0
+ paddd %5, %2 ; R0 = H[0+1+2+3]
+ punpcklwd %7, %8, %1
+ punpckhwd %8, %1
+ pmaddwd %7, [r5 + %10 * 16]
+ pmaddwd %8, [r5 + %10 * 16]
+ paddd %4, %7 ; R4 = L[1+2+3+4] -- Row 1
+ paddd %6, %8 ; R1 = H[1+2+3+4]
+%endmacro ; FILTER_HV8_MID
+
+; Round and Saturate
+%macro FILTER_HV8_END 4 ; output in [1, 3]
+ paddd %1, [tab_c_526336]
+ paddd %2, [tab_c_526336]
+ paddd %3, [tab_c_526336]
+ paddd %4, [tab_c_526336]
+ psrad %1, 12
+ psrad %2, 12
+ psrad %3, 12
+ psrad %4, 12
+ packssdw %1, %2
+ packssdw %3, %4
+
+    ; TODO: would a single merge be better? This form keeps the dependency chain short
+ packuswb %1, %1
+ packuswb %3, %3
+%endmacro ; FILTER_HV8_END
+
+;-----------------------------------------------------------------------------
+; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
+%define coef m7
+%define stk_buf rsp
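+; the 15*16-byte stack buffer holds 8+7 rows of horizontally filtered int16
+; intermediates (the 8-tap vertical pass needs 7 extra rows of context)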
+
+ mov r4d, r4m
+ mov r5d, r5m
+
+%ifdef PIC
+ lea r6, [tab_LumaCoeff]
+ movh coef, [r6 + r4 * 8]
+%else
+ movh coef, [tab_LumaCoeff + r4 * 8]
+%endif
+ punpcklqdq coef, coef
+
+ ; move to row -3
+ lea r6, [r1 + r1 * 2]
+ sub r0, r6
+
+ xor r6, r6
+ mov r4, rsp
+
+.loopH:
+ FILTER_H8_W8 m0, m1, m2, m3, coef, [tab_c_512], [r0 - 3]
+ psubw m1, [tab_c_8192]
+ mova [r4], m1
+
+ add r0, r1
+ add r4, 16
+ inc r6
+ cmp r6, 8+7
+ jnz .loopH
+
+    ; ready for the vertical phase
+    ; all mN registers are free here
+
+ ; load coeff table
+ shl r5, 6
+ lea r6, [tab_LumaCoeffV]
+ lea r5, [r5 + r6]
+
+    ; load intermediate buffer
+ mov r0, stk_buf
+
+ ; register mapping
+ ; r0 - src
+ ; r5 - coeff
+ ; r6 - loop_i
+
+ ; let's go
+ xor r6, r6
+
+    ; TODO: this loop has more than 70 instructions, likely exceeding the Intel loop decode cache
+.loopV:
+
+ FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0
+ FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
+ FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
+ FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
+ FILTER_HV8_END m3, m0, m4, m1
+
+ movq [r2], m3
+ movq [r2 + r3], m4
+
+ lea r0, [r0 + 16 * 2]
+ lea r2, [r2 + r3 * 2]
+
+ inc r6
+ cmp r6, 8/2
+ jnz .loopV
+
+ RET
+
+
+;-----------------------------------------------------------------------------
+; void interp_8tap_v_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+
+%if ARCH_X86_64
+cglobal interp_8tap_v_sp, 4, 7+5, 8
+%define tmp_r0 r7
+%define tmp_r2 r8
+%define tmp_r3 r9
+%define tmp_r4d r10d
+%define tmp_6rows r11
+
+%else ; ARCH_X86_64 = 0
+
+cglobal interp_8tap_v_sp, 4, 7, 8, 0-(5*4)
+%define tmp_r0 [(rsp + 0 * 4)]
+%define tmp_r2 [(rsp + 1 * 4)]
+%define tmp_r3 [(rsp + 2 * 4)]
+%define tmp_r4d [(rsp + 3 * 4)]
+%define tmp_6rows [(rsp + 4 * 4)]
+%endif ; ARCH_X86_64
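+; on x86-64 the loop temporaries live in r7-r11; on x86-32 they spill to the
+; 5*4-byte stack area reserved above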
+
+ mov r4d, r4m
+ mov r5d, r5m
+
+ mov tmp_r4d, r4d
+ mov tmp_r2, r2
+
+ ; load coeff table
+ mov r6d, r6m
+ shl r6, 6
+ lea r4, [tab_LumaCoeffV]
+ lea r6, [r4 + r6]
+
+    ; move back to row -3
+ lea r1, [r1 * 2]
+ lea r4, [r1 + r1 * 2]
+ sub r0, r4
+ lea r4, [r4 * 2]
+ mov tmp_6rows, r4
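+    ; tmp_6rows = 6 source rows in bytes; each 8-wide column advances r0 by
+    ; 6 rows, so this amount is subtracted to rewind to the top of the row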
+
+.loopH:
+
+ ; load width
+ mov r4d, tmp_r4d
+
+ ; save old src
+ mov tmp_r0, r0
+
+.loopW:
+
+ movu m0, [r0]
+ movu m1, [r0 + r1]
+ lea r0, [r0 + r1 * 2]
+ punpcklwd m2, m0, m1
+ pmaddwd m2, [r6 + 0 * 16]
+ punpckhwd m0, m1
+ pmaddwd m0, [r6 + 0 * 16]
+
+ movu m3, [r0]
+ movu m4, [r0 + r1]
+ lea r0, [r0 + r1 * 2]
+ punpcklwd m1, m3, m4
+ pmaddwd m1, [r6 + 1 * 16]
+ paddd m2, m1
+ punpckhwd m3, m4
+ pmaddwd m3, [r6 + 1 * 16]
+ paddd m0, m3
+
+ movu m3, [r0]
+ movu m4, [r0 + r1]
+ lea r0, [r0 + r1 * 2]
+ punpcklwd m1, m3, m4
+ pmaddwd m1, [r6 + 2 * 16]
+ paddd m2, m1
+ punpckhwd m3, m4
+ pmaddwd m3, [r6 + 2 * 16]
+ paddd m0, m3
+
+ movu m3, [r0]
+ movu m4, [r0 + r1]
+ punpcklwd m1, m3, m4
+ pmaddwd m1, [r6 + 3 * 16]
+ paddd m2, m1
+ punpckhwd m3, m4
+ pmaddwd m3, [r6 + 3 * 16]
+ paddd m0, m3
+
+ paddd m2, [tab_c_526336]
+ paddd m0, [tab_c_526336]
+ psrad m2, 12
+ psrad m0, 12
+ packssdw m2, m0
+ packuswb m2, m2
+
+    ; rewind the source before stepping to the next 8 columns
+ sub r0, tmp_6rows
+
+ sub r4, 8
+ jl .width4
+ movq [r2], m2
+ je .nextH
+ lea r0, [r0 + 16]
+ lea r2, [r2 + 8]
+ jmp .loopW
+
+.width4:
+ movd [r2], m2
+ lea r0, [r0 + 4]
+
+.nextH:
+ ; move to next row
+ mov r0, tmp_r0
+ lea r0, [r0 + r1]
+ add tmp_r2, r3
+ mov r2, tmp_r2
+
+ dec r5d
+ jnz .loopH
+
+ RET
+
+;-----------------------------------------------------------------------------
+;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_2x4, 4, 7, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m0, [tab_Cm]
+
+mova m1, [tab_c_512]
+
+movd m2, [r0]
+movd m3, [r0 + r1]
+movd m4, [r0 + 2 * r1]
+lea r5, [r0 + 2 * r1]
+movd m5, [r5 + r1]
+
+punpcklbw m2, m3
+punpcklbw m6, m4, m5
+punpcklbw m2, m6
+
+pmaddubsw m2, m0
+
+movd m6, [r0 + 4 * r1]
+
+punpcklbw m3, m4
+punpcklbw m7, m5, m6
+punpcklbw m3, m7
+
+pmaddubsw m3, m0
+
+phaddw m2, m3
+
+pmulhrsw m2, m1
+packuswb m2, m2
+
+pextrw [r2], m2, 0
+pextrw [r2 + r3], m2, 2
+
+lea r5, [r0 + 4 * r1]
+movd m2, [r5 + r1]
+
+punpcklbw m4, m5
+punpcklbw m3, m6, m2
+punpcklbw m4, m3
+
+pmaddubsw m4, m0
+
+movd m3, [r5 + 2 * r1]
+
+punpcklbw m5, m6
+punpcklbw m2, m3
+punpcklbw m5, m2
+
+pmaddubsw m5, m0
+
+phaddw m4, m5
+
+pmulhrsw m4, m1
+packuswb m4, m4
+
+pextrw [r2 + 2 * r3], m4, 0
+lea r6, [r2 + 2 * r3]
+pextrw [r6 + r3], m4, 2
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W2_H4 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_2x8, 4, 7, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m0, [tab_Cm]
+
+mova m1, [tab_c_512]
+
+mov r4d, %2
+
+.loop
+movd m2, [r0]
+movd m3, [r0 + r1]
+movd m4, [r0 + 2 * r1]
+lea r5, [r0 + 2 * r1]
+movd m5, [r5 + r1]
+
+punpcklbw m2, m3
+punpcklbw m6, m4, m5
+punpcklbw m2, m6
+
+pmaddubsw m2, m0
+
+movd m6, [r0 + 4 * r1]
+
+punpcklbw m3, m4
+punpcklbw m7, m5, m6
+punpcklbw m3, m7
+
+pmaddubsw m3, m0
+
+phaddw m2, m3
+
+pmulhrsw m2, m1
+packuswb m2, m2
+
+pextrw [r2], m2, 0
+pextrw [r2 + r3], m2, 2
+
+lea r5, [r0 + 4 * r1]
+movd m2, [r5 + r1]
+
+punpcklbw m4, m5
+punpcklbw m3, m6, m2
+punpcklbw m4, m3
+
+pmaddubsw m4, m0
+
+movd m3, [r5 + 2 * r1]
+
+punpcklbw m5, m6
+punpcklbw m2, m3
+punpcklbw m5, m2
+
+pmaddubsw m5, m0
+
+phaddw m4, m5
+
+pmulhrsw m4, m1
+packuswb m4, m4
+
+pextrw [r2 + 2 * r3], m4, 0
+lea r6, [r2 + 2 * r3]
+pextrw [r6 + r3], m4, 2
+
+lea r0, [r0 + 4 * r1]
+lea r2, [r2 + 4 * r3]
+
+sub r4, 4
+jnz .loop
+RET
+%endmacro
+
+FILTER_V4_W2_H4 2, 8
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_4x2, 4, 6, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m0, [tab_Cm]
+
+mova m1, [tab_c_512]
+
+movd m2, [r0]
+movd m3, [r0 + r1]
+movd m4, [r0 + 2 * r1]
+lea r5, [r0 + 2 * r1]
+movd m5, [r5 + r1]
+
+punpcklbw m2, m3
+punpcklbw m6, m4, m5
+punpcklbw m2, m6
+
+pmaddubsw m2, m0
+
+movd m6, [r0 + 4 * r1]
+
+punpcklbw m3, m4
+punpcklbw m5, m6
+punpcklbw m3, m5
+
+pmaddubsw m3, m0
+
+phaddw m2, m3
+
+pmulhrsw m2, m1
+packuswb m2, m2
+movd [r2], m2
+pextrd [r2 + r3], m2, 1
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_4x4, 4, 7, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m0, [tab_Cm]
+
+mova m1, [tab_c_512]
+
+movd m2, [r0]
+movd m3, [r0 + r1]
+movd m4, [r0 + 2 * r1]
+lea r5, [r0 + 2 * r1]
+movd m5, [r5 + r1]
+
+punpcklbw m2, m3
+punpcklbw m6, m4, m5
+punpcklbw m2, m6
+
+pmaddubsw m2, m0
+
+movd m6, [r0 + 4 * r1]
+
+punpcklbw m3, m4
+punpcklbw m7, m5, m6
+punpcklbw m3, m7
+
+pmaddubsw m3, m0
+
+phaddw m2, m3
+
+pmulhrsw m2, m1
+packuswb m2, m2
+movd [r2], m2
+pextrd [r2 + r3], m2, 1
+
+lea r5, [r0 + 4 * r1]
+movd m2, [r5 + r1]
+
+punpcklbw m4, m5
+punpcklbw m3, m6, m2
+punpcklbw m4, m3
+
+pmaddubsw m4, m0
+
+movd m3, [r5 + 2 * r1]
+
+punpcklbw m5, m6
+punpcklbw m2, m3
+punpcklbw m5, m2
+
+pmaddubsw m5, m0
+
+phaddw m4, m5
+
+pmulhrsw m4, m1
+packuswb m4, m4
+movd [r2 + 2 * r3], m4
+lea r6, [r2 + 2 * r3]
+pextrd [r6 + r3], m4, 1
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W4_H4 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m0, [tab_Cm]
+
+mova m1, [tab_c_512]
+
+mov r4d, %2
+
+.loop
+movd m2, [r0]
+movd m3, [r0 + r1]
+movd m4, [r0 + 2 * r1]
+lea r5, [r0 + 2 * r1]
+movd m5, [r5 + r1]
+
+punpcklbw m2, m3
+punpcklbw m6, m4, m5
+punpcklbw m2, m6
+
+pmaddubsw m2, m0
+
+movd m6, [r0 + 4 * r1]
+
+punpcklbw m3, m4
+punpcklbw m7, m5, m6
+punpcklbw m3, m7
+
+pmaddubsw m3, m0
+
+phaddw m2, m3
+
+pmulhrsw m2, m1
+packuswb m2, m2
+movd [r2], m2
+pextrd [r2 + r3], m2, 1
+
+lea r5, [r0 + 4 * r1]
+movd m2, [r5 + r1]
+
+punpcklbw m4, m5
+punpcklbw m3, m6, m2
+punpcklbw m4, m3
+
+pmaddubsw m4, m0
+
+movd m3, [r5 + 2 * r1]
+
+punpcklbw m5, m6
+punpcklbw m2, m3
+punpcklbw m5, m2
+
+pmaddubsw m5, m0
+
+phaddw m4, m5
+
+pmulhrsw m4, m1
+packuswb m4, m4
+movd [r2 + 2 * r3], m4
+lea r6, [r2 + 2 * r3]
+pextrd [r6 + r3], m4, 1
+
+lea r0, [r0 + 4 * r1]
+lea r2, [r2 + 4 * r3]
+
+sub r4, 4
+jnz .loop
+RET
+%endmacro
+
+FILTER_V4_W4_H4 4, 8
+FILTER_V4_W4_H4 4, 16
+
+%macro FILTER_V4_W8_H2 0
+punpcklbw m1, m2
+punpcklbw m7, m3, m0
+
+pmaddubsw m1, m6
+pmaddubsw m7, m5
+
+paddw m1, m7
+
+pmulhrsw m1, m4
+packuswb m1, m1
+%endmacro
+
+%macro FILTER_V4_W8_H3 0
+punpcklbw m2, m3
+punpcklbw m7, m0, m1
+
+pmaddubsw m2, m6
+pmaddubsw m7, m5
+
+paddw m2, m7
+
+pmulhrsw m2, m4
+packuswb m2, m2
+%endmacro
+
+%macro FILTER_V4_W8_H4 0
+punpcklbw m3, m0
+punpcklbw m7, m1, m2
+
+pmaddubsw m3, m6
+pmaddubsw m7, m5
+
+paddw m3, m7
+
+pmulhrsw m3, m4
+packuswb m3, m3
+%endmacro
+
+%macro FILTER_V4_W8_H5 0
+punpcklbw m0, m1
+punpcklbw m7, m2, m3
+
+pmaddubsw m0, m6
+pmaddubsw m7, m5
+
+paddw m0, m7
+
+pmulhrsw m0, m4
+packuswb m0, m0
+%endmacro
+
+%macro FILTER_V4_W8_8x2 2
+FILTER_V4_W8 %1, %2
+movq m0, [r0 + 4 * r1]
+
+FILTER_V4_W8_H2
+
+movh [r2 + r3], m1
+%endmacro
+
+%macro FILTER_V4_W8_8x4 2
+FILTER_V4_W8_8x2 %1, %2
+;8x3
+lea r6, [r0 + 4 * r1]
+movq m1, [r6 + r1]
+
+FILTER_V4_W8_H3
+
+movh [r2 + 2 * r3], m2
+
+;8x4
+movq m2, [r6 + 2 * r1]
+
+FILTER_V4_W8_H4
+
+lea r5, [r2 + 2 * r3]
+movh [r5 + r3], m3
+%endmacro
+
+%macro FILTER_V4_W8_8x6 2
+FILTER_V4_W8_8x4 %1, %2
+;8x5
+lea r6, [r6 + 2 * r1]
+movq m3, [r6 + r1]
+
+FILTER_V4_W8_H5
+
+movh [r2 + 4 * r3], m0
+
+;8x6
+movq m0, [r0 + 8 * r1]
+
+FILTER_V4_W8_H2
+
+lea r5, [r2 + 4 * r3]
+movh [r5 + r3], m1
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W8 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
+
+mov r4d, r4m
+
+sub r0, r1
+movq m0, [r0]
+movq m1, [r0 + r1]
+movq m2, [r0 + 2 * r1]
+lea r5, [r0 + 2 * r1]
+movq m3, [r5 + r1]
+
+punpcklbw m0, m1
+punpcklbw m4, m2, m3
+
+%ifdef PIC
+lea r6, [tab_ChromaCoeff]
+movd m5, [r6 + r4 * 4]
+%else
+movd m5, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m6, m5, [tab_Vm]
+pmaddubsw m0, m6
+
+pshufb m5, [tab_Vm + 16]
+pmaddubsw m4, m5
+
+paddw m0, m4
+
+mova m4, [tab_c_512]
+
+pmulhrsw m0, m4
+packuswb m0, m0
+movh [r2], m0
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+FILTER_V4_W8_8x2 8, 2
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+FILTER_V4_W8_8x4 8, 4
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+FILTER_V4_W8_8x6 8, 6
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W8_H8_H16_H32 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r6, [tab_ChromaCoeff]
+movd m5, [r6 + r4 * 4]
+%else
+movd m5, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m6, m5, [tab_Vm]
+pshufb m5, [tab_Vm + 16]
+mova m4, [tab_c_512]
+
+mov r4d, %2
+
+.loop
+movq m0, [r0]
+movq m1, [r0 + r1]
+movq m2, [r0 + 2 * r1]
+lea r5, [r0 + 2 * r1]
+movq m3, [r5 + r1]
+
+punpcklbw m0, m1
+punpcklbw m7, m2, m3
+
+pmaddubsw m0, m6
+pmaddubsw m7, m5
+
+paddw m0, m7
+
+pmulhrsw m0, m4
+packuswb m0, m0
+movh [r2], m0
+
+movq m0, [r0 + 4 * r1]
+
+punpcklbw m1, m2
+punpcklbw m7, m3, m0
+
+pmaddubsw m1, m6
+pmaddubsw m7, m5
+
+paddw m1, m7
+
+pmulhrsw m1, m4
+packuswb m1, m1
+movh [r2 + r3], m1
+
+lea r6, [r0 + 4 * r1]
+movq m1, [r6 + r1]
+
+punpcklbw m2, m3
+punpcklbw m7, m0, m1
+
+pmaddubsw m2, m6
+pmaddubsw m7, m5
+
+paddw m2, m7
+
+pmulhrsw m2, m4
+packuswb m2, m2
+movh [r2 + 2 * r3], m2
+
+movq m2, [r6 + 2 * r1]
+
+punpcklbw m3, m0
+punpcklbw m1, m2
+
+pmaddubsw m3, m6
+pmaddubsw m1, m5
+
+paddw m3, m1
+
+pmulhrsw m3, m4
+packuswb m3, m3
+
+lea r5, [r2 + 2 * r3]
+movh [r5 + r3], m3
+
+lea r0, [r0 + 4 * r1]
+lea r2, [r2 + 4 * r3]
+
+sub r4, 4
+jnz .loop
+RET
+%endmacro
+
+FILTER_V4_W8_H8_H16_H32 8, 8
+FILTER_V4_W8_H8_H16_H32 8, 16
+FILTER_V4_W8_H8_H16_H32 8, 32
+
+;-----------------------------------------------------------------------------
+;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W6_H4 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_6x8, 4, 7, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r6, [tab_ChromaCoeff]
+movd m5, [r6 + r4 * 4]
+%else
+movd m5, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m6, m5, [tab_Vm]
+pshufb m5, [tab_Vm + 16]
+mova m4, [tab_c_512]
+
+mov r4d, %2
+
+.loop
+movq m0, [r0]
+movq m1, [r0 + r1]
+movq m2, [r0 + 2 * r1]
+lea r5, [r0 + 2 * r1]
+movq m3, [r5 + r1]
+
+punpcklbw m0, m1
+punpcklbw m7, m2, m3
+
+pmaddubsw m0, m6
+pmaddubsw m7, m5
+
+paddw m0, m7
+
+pmulhrsw m0, m4
+packuswb m0, m0
+movd [r2], m0
+pextrw [r2 + 4], m0, 2
+
+movq m0, [r0 + 4 * r1]
+
+punpcklbw m1, m2
+punpcklbw m7, m3, m0
+
+pmaddubsw m1, m6
+pmaddubsw m7, m5
+
+paddw m1, m7
+
+pmulhrsw m1, m4
+packuswb m1, m1
+movd [r2 + r3], m1
+pextrw [r2 + r3 + 4], m1, 2
+
+lea r6, [r0 + 4 * r1]
+movq m1, [r6 + r1]
+
+punpcklbw m2, m3
+punpcklbw m7, m0, m1
+
+pmaddubsw m2, m6
+pmaddubsw m7, m5
+
+paddw m2, m7
+
+pmulhrsw m2, m4
+packuswb m2, m2
+movd [r2 + 2 * r3], m2
+pextrw [r2 + 2 * r3 + 4], m2, 2
+
+movq m2, [r6 + 2 * r1]
+
+punpcklbw m3, m0
+punpcklbw m1, m2
+
+pmaddubsw m3, m6
+pmaddubsw m1, m5
+
+paddw m3, m1
+
+pmulhrsw m3, m4
+packuswb m3, m3
+
+lea r5, [r2 + 2 * r3]
+movd [r5 + r3], m3
+pextrw [r5 + r3 + 4], m3, 2
+
+lea r0, [r0 + 4 * r1]
+lea r2, [r2 + 4 * r3]
+
+sub r4, 4
+jnz .loop
+RET
+%endmacro
+
+FILTER_V4_W6_H4 6, 8
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W12_H2 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_12x16, 4, 6, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m1, m0, [tab_Vm]
+pshufb m0, [tab_Vm + 16]
+
+mova m7, [tab_c_512]
+
+mov r4d, %2
+
+.loop
+movu m2, [r0]
+movu m3, [r0 + r1]
+
+punpcklbw m4, m2, m3,
+punpckhbw m2, m3,
+
+pmaddubsw m4, m1
+pmaddubsw m2, m1
+
+movu m5, [r0 + 2 * r1]
+lea r5, [r0 + 2 * r1]
+movu m3, [r5 + r1]
+
+punpcklbw m6, m5, m3,
+punpckhbw m5, m3,
+
+pmaddubsw m6, m0
+pmaddubsw m5, m0
+
+paddw m4, m6
+paddw m2, m5
+
+pmulhrsw m4, m7
+pmulhrsw m2, m7
+
+packuswb m4, m2
+
+movh [r2], m4
+pextrd [r2 + 8], m4, 2
+
+movu m2, [r0 + r1]
+movu m3, [r0 + 2 * r1]
+
+punpcklbw m4, m2, m3,
+punpckhbw m2, m3,
+
+pmaddubsw m4, m1
+pmaddubsw m2, m1
+
+lea r5, [r0 + 2 * r1]
+movu m5, [r5 + r1]
+movu m3, [r5 + 2 * r1]
+
+punpcklbw m6, m5, m3,
+punpckhbw m5, m3,
+
+pmaddubsw m6, m0
+pmaddubsw m5, m0
+
+paddw m4, m6
+paddw m2, m5
+
+pmulhrsw m4, m7
+pmulhrsw m2, m7
+
+packuswb m4, m2
+
+movh [r2 + r3], m4
+pextrd [r2 + r3 + 8], m4, 2
+
+lea r0, [r0 + 2 * r1]
+lea r2, [r2 + 2 * r3]
+
+sub r4, 2
+jnz .loop
+RET
+%endmacro
+
+FILTER_V4_W12_H2 12, 16
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W16_H2 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m1, m0, [tab_Vm]
+pshufb m0, [tab_Vm + 16]
+
+mov r4d, %2
+
+.loop
+movu m2, [r0]
+movu m3, [r0 + r1]
+
+punpcklbw m4, m2, m3,
+punpckhbw m5, m2, m3,
+
+pmaddubsw m4, m1
+pmaddubsw m5, m1
+
+movu m2, [r0 + 2 * r1]
+lea r5, [r0 + 2 * r1]
+movu m3, [r5 + r1]
+
+punpcklbw m6, m2, m3,
+punpckhbw m7, m2, m3,
+
+pmaddubsw m6, m0
+pmaddubsw m7, m0
+
+paddw m4, m6;
+paddw m5, m7;
+
+mova m6, [tab_c_512]
+
+pmulhrsw m4, m6
+pmulhrsw m5, m6
+
+packuswb m4, m5
+
+movu [r2], m4
+
+movu m2, [r0 + r1]
+movu m3, [r0 + 2 * r1]
+
+punpcklbw m4, m2, m3,
+punpckhbw m5, m2, m3,
+
+pmaddubsw m4, m1
+pmaddubsw m5, m1
+
+lea r5, [r0 + 2 * r1]
+movu m2, [r5 + r1]
+movu m3, [r5 + 2 * r1]
+
+punpcklbw m6, m2, m3,
+punpckhbw m7, m2, m3,
+
+pmaddubsw m6, m0
+pmaddubsw m7, m0
+
+paddw m4, m6
+paddw m5, m7
+
+mova m6, [tab_c_512]
+
+pmulhrsw m4, m6
+pmulhrsw m5, m6
+
+packuswb m4, m5
+
+movu [r2 + r3], m4
+
+lea r0, [r0 + 2 * r1]
+lea r2, [r2 + 2 * r3]
+
+sub r4, 2
+jnz .loop
+RET
+%endmacro
+
+FILTER_V4_W16_H2 16, 4
+FILTER_V4_W16_H2 16, 8
+FILTER_V4_W16_H2 16, 12
+FILTER_V4_W16_H2 16, 16
+FILTER_V4_W16_H2 16, 32
+
+;-----------------------------------------------------------------------------
+;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W24 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_24x32, 4, 6, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m1, m0, [tab_Vm]
+pshufb m0, [tab_Vm + 16]
+
+mova m7, [tab_c_512]
+
+mov r4d, %2
+
+.loop
+movu m2, [r0]
+movu m3, [r0 + r1]
+
+punpcklbw m4, m2, m3,
+punpckhbw m2, m3,
+
+pmaddubsw m4, m1
+pmaddubsw m2, m1
+
+movu m5, [r0 + 2 * r1]
+lea r5, [r0 + 2 * r1]
+movu m3, [r5 + r1]
+
+punpcklbw m6, m5, m3,
+punpckhbw m5, m3
+
+pmaddubsw m6, m0
+pmaddubsw m5, m0
+
+paddw m4, m6
+paddw m2, m5
+
+pmulhrsw m4, m7
+pmulhrsw m2, m7
+
+packuswb m4, m2
+
+movu [r2], m4
+
+movq m2, [r0 + 16]
+movq m3, [r0 + r1 + 16]
+movq m4, [r0 + 2 * r1 + 16]
+movq m5, [r5 + r1 + 16]
+
+punpcklbw m2, m3
+punpcklbw m4, m5
+
+pmaddubsw m2, m1
+pmaddubsw m4, m0
+
+paddw m2, m4
+
+pmulhrsw m2, m7
+packuswb m2, m2
+movh [r2 + 16], m2
+
+movu m2, [r0 + r1]
+movu m3, [r0 + 2 * r1]
+
+punpcklbw m4, m2, m3,
+punpckhbw m2, m3,
+
+pmaddubsw m4, m1
+pmaddubsw m2, m1
+
+lea r5, [r0 + 2 * r1]
+movu m5, [r5 + r1]
+movu m3, [r5 + 2 * r1]
+
+punpcklbw m6, m5, m3,
+punpckhbw m5, m3
+
+pmaddubsw m6, m0
+pmaddubsw m5, m0
+
+paddw m4, m6
+paddw m2, m5
+
+pmulhrsw m4, m7
+pmulhrsw m2, m7
+
+packuswb m4, m2
+
+movu [r2 + r3], m4
+
+movq m2, [r0 + r1 + 16]
+movq m3, [r0 + 2 * r1 + 16]
+movq m4, [r5 + r1 + 16]
+movq m5, [r5 + 2 * r1 + 16]
+
+punpcklbw m2, m3
+punpcklbw m4, m5
+
+pmaddubsw m2, m1
+pmaddubsw m4, m0
+
+paddw m2, m4
+
+pmulhrsw m2, m7
+packuswb m2, m2
+movh [r2 + r3 + 16], m2
+
+lea r0, [r0 + 2 * r1]
+lea r2, [r2 + 2 * r3]
+
+sub r4, 2
+jnz .loop
+RET
+%endmacro
+
+FILTER_V4_W24 24, 32
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W32 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m1, m0, [tab_Vm]
+pshufb m0, [tab_Vm + 16]
+
+mova m7, [tab_c_512]
+
+mov r4d, %2
+
+.loop
+movu m2, [r0]
+movu m3, [r0 + r1]
+
+punpcklbw m4, m2, m3,
+punpckhbw m2, m3,
+
+pmaddubsw m4, m1
+pmaddubsw m2, m1
+
+movu m3, [r0 + 2 * r1]
+lea r5, [r0 + 2 * r1]
+movu m5, [r5 + r1]
+
+punpcklbw m6, m3, m5
+punpckhbw m3, m5,
+
+pmaddubsw m6, m0
+pmaddubsw m3, m0
+
+paddw m4, m6
+paddw m2, m3
+
+pmulhrsw m4, m7
+pmulhrsw m2, m7
+
+packuswb m4, m2
+
+movu [r2], m4
+
+movu m2, [r0 + 16]
+movu m3, [r0 + r1 + 16]
+
+punpcklbw m4, m2, m3,
+punpckhbw m2, m3,
+
+pmaddubsw m4, m1
+pmaddubsw m2, m1
+
+movu m3, [r0 + 2 * r1 + 16]
+movu m5, [r5 + r1 + 16]
+
+punpcklbw m6, m3, m5
+punpckhbw m3, m5,
+
+pmaddubsw m6, m0
+pmaddubsw m3, m0
+
+paddw m4, m6
+paddw m2, m3
+
+pmulhrsw m4, m7
+pmulhrsw m2, m7
+
+packuswb m4, m2
+
+movu [r2 + 16], m4
+
+lea r0, [r0 + r1]
+lea r2, [r2 + r3]
+
+dec r4
+jnz .loop
+RET
+%endmacro
+
+FILTER_V4_W32 32, 8
+FILTER_V4_W32 32, 16
+FILTER_V4_W32 32, 24
+FILTER_V4_W32 32, 32
+
+
+;-----------------------------------------------------------------------------
+; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal luma_p2s, 3, 7, 8
+
+ ; load width and height
+ mov r3d, r3m
+ mov r4d, r4m
+
+ ; load constant
+ mova m6, [tab_c_128]
+ mova m7, [tab_c_64_n64]
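+    ; interleaving each pixel with 0x80 and multiplying by (64, -64) via
+    ; pmaddubsw gives pel*64 - 8192: the pixel-to-short scale and bias in one step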
+
+ ;shr r4d, 2
+ lea r2, [r2 - 16]
+.loopH:
+
+ xor r5d, r5d
+.loopW:
+ lea r6, [r0 + r5]
+
+ movh m0, [r6]
+ punpcklbw m0, m6
+ pmaddubsw m0, m7
+
+ movh m1, [r6 + r1]
+ punpcklbw m1, m6
+ pmaddubsw m1, m7
+
+ movh m2, [r6 + r1 * 2]
+ punpcklbw m2, m6
+ pmaddubsw m2, m7
+
+ lea r6, [r6 + r1 * 2]
+ movh m3, [r6 + r1]
+ punpcklbw m3, m6
+ pmaddubsw m3, m7
+
+ add r5, 8
+ cmp r5, r3
+ jg .width4
+ movu [r2 + r5 * 2 + FENC_STRIDE * 0], m0
+ movu [r2 + r5 * 2 + FENC_STRIDE * 2], m1
+ movu [r2 + r5 * 2 + FENC_STRIDE * 4], m2
+ movu [r2 + r5 * 2 + FENC_STRIDE * 6], m3
+ je .nextH
+ jmp .loopW
+
+.width4:
+ movh [r2 + r5 * 2 + FENC_STRIDE * 0], m0
+ movh [r2 + r5 * 2 + FENC_STRIDE * 2], m1
+ movh [r2 + r5 * 2 + FENC_STRIDE * 4], m2
+ movh [r2 + r5 * 2 + FENC_STRIDE * 6], m3
+
+.nextH:
+ lea r0, [r0 + r1 * 4]
+ add r2, FENC_STRIDE * 2 * 4
+
+ sub r4, 4
+ jnz .loopH
+
+ RET
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/x86/ipfilter8.h Thu Oct 31 18:43:03 2013 +0530
@@ -88,6 +88,10 @@
CHROMA_FILTERS(_sse4);
LUMA_FILTERS(_sse4);
+void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY);
+void x265_interp_8tap_v_sp_ssse3(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
+void x265_luma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+
#undef SETUP_CHROMA_FUNC_DEF
#undef SETUP_LUMA_FUNC_DEF
#undef CHROMA_FILTERS
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/x86/pixel.h Thu Oct 31 18:43:03 2013 +0530
@@ -42,6 +42,19 @@
ret x265_pixel_ ## name ## _4x16_ ## suffix args; \
ret x265_pixel_ ## name ## _4x8_ ## suffix args; \
ret x265_pixel_ ## name ## _4x4_ ## suffix args; \
+ ret x265_pixel_ ## name ## _32x8_ ## suffix args; \
+ ret x265_pixel_ ## name ## _32x16_ ## suffix args; \
+ ret x265_pixel_ ## name ## _32x24_ ## suffix args; \
+ ret x265_pixel_ ## name ## _24x32_ ## suffix args; \
+ ret x265_pixel_ ## name ## _32x32_ ## suffix args; \
+ ret x265_pixel_ ## name ## _32x64_ ## suffix args; \
+ ret x265_pixel_ ## name ## _64x16_ ## suffix args; \
+ ret x265_pixel_ ## name ## _64x32_ ## suffix args; \
+ ret x265_pixel_ ## name ## _64x48_ ## suffix args; \
+ ret x265_pixel_ ## name ## _64x64_ ## suffix args; \
+ ret x265_pixel_ ## name ## _48x64_ ## suffix args; \
+ ret x265_pixel_ ## name ## _24x32_ ## suffix args; \
+ ret x265_pixel_ ## name ## _12x16_ ## suffix args; \
#define DECL_X1(name, suffix) \
DECL_PIXELS(int, name, suffix, (pixel *, intptr_t, pixel *, intptr_t))
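
With the added block sizes, DECL_X1(sad, sse2) and the other users of DECL_PIXELS now also emit prototypes of the following form, expanded by hand here for a few of the new partitions (pixel and intptr_t are the project's usual typedefs):

    int x265_pixel_sad_32x8_sse2(pixel *, intptr_t, pixel *, intptr_t);
    int x265_pixel_sad_32x16_sse2(pixel *, intptr_t, pixel *, intptr_t);
    int x265_pixel_sad_48x64_sse2(pixel *, intptr_t, pixel *, intptr_t);
    int x265_pixel_sad_64x64_sse2(pixel *, intptr_t, pixel *, intptr_t);
    /* ...likewise for 32x24, 32x32, 32x64, 64x16, 64x32, 64x48, 24x32 and 12x16 */
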
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/x86/sad-a.asm Thu Oct 31 18:43:03 2013 +0530
@@ -31,8 +31,9 @@
SECTION_RODATA 32
+MSK: db 255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0
pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1
-hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11
+hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11
SECTION .text
@@ -119,6 +120,263 @@
RET
%endmacro
+%macro PROCESS_SAD_12x4 0
+ movu m1, [r2]
+ movu m2, [r0]
+ pand m1, m4
+ pand m2, m4
+ psadbw m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ movu m1, [r2]
+ movu m2, [r0]
+ pand m1, m4
+ pand m2, m4
+ psadbw m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ movu m1, [r2]
+ movu m2, [r0]
+ pand m1, m4
+ pand m2, m4
+ psadbw m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ movu m1, [r2]
+ movu m2, [r0]
+ pand m1, m4
+ pand m2, m4
+ psadbw m1, m2
+ paddd m0, m1
+%endmacro
+
+%macro PROCESS_SAD_16x4 0
+ movu m1, [r2]
+ movu m2, [r2 + r3]
+ psadbw m1, [r0]
+ psadbw m2, [r0 + r1]
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
+ movu m1, [r2]
+ movu m2, [r2 + r3]
+ psadbw m1, [r0]
+ psadbw m2, [r0 + r1]
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
+%endmacro
+
+%macro PROCESS_SAD_24x4 0
+ movu m1, [r2]
+ movq m2, [r2 + 16]
+ lea r2, [r2 + r3]
+ movu m3, [r2]
+ movq m4, [r2 + 16]
+ psadbw m1, [r0]
+ psadbw m3, [r0 + r1]
+ paddd m0, m1
+ paddd m0, m3
+ movq m1, [r0 + 16]
+ lea r0, [r0 + r1]
+ movq m3, [r0 + 16]
+ punpcklqdq m2, m4
+ punpcklqdq m1, m3
+ psadbw m2, m1
+ paddd m0, m2
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ movu m1, [r2]
+ movq m2, [r2 + 16]
+ lea r2, [r2 + r3]
+ movu m3, [r2]
+ movq m4, [r2 + 16]
+ psadbw m1, [r0]
+ psadbw m3, [r0 + r1]
+ paddd m0, m1
+ paddd m0, m3
+ movq m1, [r0 + 16]
+ lea r0, [r0 + r1]
+ movq m3, [r0 + 16]
+ punpcklqdq m2, m4
+ punpcklqdq m1, m3
+ psadbw m2, m1
+ paddd m0, m2
+%endmacro
+
+%macro PROCESS_SAD_32x4 0
+ movu m1, [r2]
+ movu m2, [r2 + 16]
+ psadbw m1, [r0]
+ psadbw m2, [r0 + 16]
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ movu m1, [r2]
+ movu m2, [r2 + 16]
+ psadbw m1, [r0]
+ psadbw m2, [r0 + 16]
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ movu m1, [r2]
+ movu m2, [r2 + 16]
+ psadbw m1, [r0]
+ psadbw m2, [r0 + 16]
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ movu m1, [r2]
+ movu m2, [r2 + 16]
+ psadbw m1, [r0]
+ psadbw m2, [r0 + 16]
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+%endmacro
+
+%macro PROCESS_SAD_48x4 0
+ movu m1, [r2]
+ movu m2, [r2 + 16]
+ movu m3, [r2 + 32]
+ psadbw m1, [r0]
+ psadbw m2, [r0 + 16]
+ psadbw m3, [r0 + 32]
+ paddd m1, m2
+ paddd m0, m1
+ paddd m0, m3
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ movu m1, [r2]
+ movu m2, [r2 + 16]
+ movu m3, [r2 + 32]
+ psadbw m1, [r0]
+ psadbw m2, [r0 + 16]
+ psadbw m3, [r0 + 32]
+ paddd m1, m2
+ paddd m0, m1
+ paddd m0, m3
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ movu m1, [r2]
+ movu m2, [r2 + 16]
+ movu m3, [r2 + 32]
+ psadbw m1, [r0]
+ psadbw m2, [r0 + 16]
+ psadbw m3, [r0 + 32]
+ paddd m1, m2
+ paddd m0, m1
+ paddd m0, m3
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ movu m1, [r2]
+ movu m2, [r2 + 16]
+ movu m3, [r2 + 32]
+ psadbw m1, [r0]
+ psadbw m2, [r0 + 16]
+ psadbw m3, [r0 + 32]
+ paddd m1, m2
+ paddd m0, m1
+ paddd m0, m3
+%endmacro
+
+%macro PROCESS_SAD_8x4 0
+ movq m1, [r2]
+ movq m2, [r2 + r3]
+ lea r2, [r2 + 2 * r3]
+ movq m3, [r0]
+ movq m4, [r0 + r1]
+ lea r0, [r0 + 2 * r1]
+ punpcklqdq m1, m2
+ punpcklqdq m3, m4
+ psadbw m1, m3
+ paddd m0, m1
+ movq m1, [r2]
+ movq m2, [r2 + r3]
+ lea r2, [r2 + 2 * r3]
+ movq m3, [r0]
+ movq m4, [r0 + r1]
+ lea r0, [r0 + 2 * r1]
+ punpcklqdq m1, m2
+ punpcklqdq m3, m4
+ psadbw m1, m3
+ paddd m0, m1
+%endmacro
+
+%macro PROCESS_SAD_64x4 0
+ movu m1, [r2]
+ movu m2, [r2 + 16]
+ movu m3, [r2 + 32]
+ movu m4, [r2 + 48]
+ psadbw m1, [r0]
+ psadbw m2, [r0 + 16]
+ psadbw m3, [r0 + 32]
+ psadbw m4, [r0 + 48]
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ movu m1, [r2]
+ movu m2, [r2 + 16]
+ movu m3, [r2 + 32]
+ movu m4, [r2 + 48]
+ psadbw m1, [r0]
+ psadbw m2, [r0 + 16]
+ psadbw m3, [r0 + 32]
+ psadbw m4, [r0 + 48]
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ movu m1, [r2]
+ movu m2, [r2 + 16]
+ movu m3, [r2 + 32]
+ movu m4, [r2 + 48]
+ psadbw m1, [r0]
+ psadbw m2, [r0 + 16]
+ psadbw m3, [r0 + 32]
+ psadbw m4, [r0 + 48]
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ movu m1, [r2]
+ movu m2, [r2 + 16]
+ movu m3, [r2 + 32]
+ movu m4, [r2 + 48]
+ psadbw m1, [r0]
+ psadbw m2, [r0 + 16]
+ psadbw m3, [r0 + 32]
+ psadbw m4, [r0 + 48]
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+
+%endmacro
+
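All of the PROCESS_SAD_* kernels above follow the same pattern: psadbw produces two partial sums per 16-byte segment of a row, paddd accumulates them in m0, and each function ends with movhlps/paddd/movd to fold the two 64-bit halves into eax. For width 12 a full 16-byte load is taken and the last four lanes are cleared with the MSK constant before psadbw. A scalar sketch of what every variant computes (the name is illustrative):

    #include <cstdint>
    #include <cstdlib>

    typedef uint8_t pixel;

    static int sad_ref(const pixel *pix1, intptr_t stride1,
                       const pixel *pix2, intptr_t stride2,
                       int width, int height)
    {
        int sum = 0;
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)       // for 12xN the asm masks columns 12..15 instead
                sum += std::abs(pix1[x] - pix2[x]);
            pix1 += stride1;
            pix2 += stride2;
        }
        return sum;
    }
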
%macro SAD_W16 0
;-----------------------------------------------------------------------------
; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
@@ -223,6 +481,376 @@
paddw m0, m1
paddw m0, m3
SAD_END_SSE2
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_16x12( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_16x12, 4,4,3
+ pxor m0, m0
+
+ PROCESS_SAD_16x4
+ PROCESS_SAD_16x4
+ PROCESS_SAD_16x4
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_16x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_16x32, 4,5,3
+ pxor m0, m0
+ mov r4d, 4
+.loop:
+ PROCESS_SAD_16x4
+ PROCESS_SAD_16x4
+ dec r4d
+ jnz .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_16x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_16x64, 4,5,3
+ pxor m0, m0
+ mov r4d, 8
+.loop:
+ PROCESS_SAD_16x4
+ PROCESS_SAD_16x4
+ dec r4d
+ jnz .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_16x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_16x4, 4,4,3
+
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ psadbw m0, [r0]
+ psadbw m1, [r0 + r1]
+ paddd m0, m1
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
+ movu m1, [r2]
+ movu m2, [r2 + r3]
+ psadbw m1, [r0]
+ psadbw m2, [r0 + r1]
+ paddd m1, m2
+ paddd m0, m1
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_32x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_32x8, 4,4,3
+ pxor m0, m0
+
+ PROCESS_SAD_32x4
+ PROCESS_SAD_32x4
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_32x24( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_32x24, 4,5,3
+ pxor m0, m0
+ mov r4d, 3
+.loop:
+ PROCESS_SAD_32x4
+ PROCESS_SAD_32x4
+ dec r4d
+ jnz .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_32x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_32x32, 4,5,3
+ pxor m0, m0
+ mov r4d, 4
+.loop:
+ PROCESS_SAD_32x4
+ PROCESS_SAD_32x4
+ dec r4d
+ jnz .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_32x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_32x16, 4,4,3
+ pxor m0, m0
+
+ PROCESS_SAD_32x4
+ PROCESS_SAD_32x4
+ PROCESS_SAD_32x4
+ PROCESS_SAD_32x4
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_32x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_32x64, 4,5,3
+ pxor m0, m0
+ mov r4d, 8
+.loop:
+ PROCESS_SAD_32x4
+ PROCESS_SAD_32x4
+ dec r4d
+ jnz .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_8x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_8x32, 4,5,3
+ pxor m0, m0
+ mov r4d, 4
+.loop:
+ PROCESS_SAD_8x4
+ PROCESS_SAD_8x4
+ dec r4d
+ jnz .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_64x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_64x16, 4,4,5
+ pxor m0, m0
+
+ PROCESS_SAD_64x4
+
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ PROCESS_SAD_64x4
+
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ PROCESS_SAD_64x4
+
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ PROCESS_SAD_64x4
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_64x32, 4,5,5
+ pxor m0, m0
+ mov r4, 32
+
+.loop:
+ PROCESS_SAD_64x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ PROCESS_SAD_64x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ sub r4, 8
+ cmp r4, 8
+
+jnz .loop
+ PROCESS_SAD_64x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ PROCESS_SAD_64x4
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_64x48, 4,5,5
+ pxor m0, m0
+ mov r4, 48
+
+.loop:
+ PROCESS_SAD_64x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ PROCESS_SAD_64x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ sub r4, 8
+ cmp r4, 8
+
+jnz .loop
+ PROCESS_SAD_64x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ PROCESS_SAD_64x4
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_64x64, 4,5,5
+ pxor m0, m0
+ mov r4, 64
+
+.loop:
+ PROCESS_SAD_64x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ PROCESS_SAD_64x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ sub r4, 8
+ cmp r4, 8
+
+jnz .loop
+ PROCESS_SAD_64x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ PROCESS_SAD_64x4
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_48x64, 4,5,5
+ pxor m0, m0
+ mov r4, 64
+
+.loop:
+ PROCESS_SAD_48x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ PROCESS_SAD_48x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ sub r4, 8
+ cmp r4, 8
+
+jnz .loop
+ PROCESS_SAD_48x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ PROCESS_SAD_48x4
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_24x32, 4,5,4
+ pxor m0, m0
+ mov r4, 32
+
+.loop:
+ PROCESS_SAD_24x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ PROCESS_SAD_24x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ sub r4, 8
+ cmp r4, 8
+jnz .loop
+ PROCESS_SAD_24x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ PROCESS_SAD_24x4
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_12x16, 4,4,4
+ mova m4, [MSK]
+ pxor m0, m0
+
+ PROCESS_SAD_12x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ PROCESS_SAD_12x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ PROCESS_SAD_12x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ PROCESS_SAD_12x4
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+
%endmacro
INIT_XMM sse2
@@ -972,6 +1600,486 @@
RET
%endmacro
+%macro SAD_X3_24x4 0
+ mova m3, [r0]
+ mova m4, [r0 + 16]
+ movu m5, [r1]
+ movu m6, [r1 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ pshufd m6, m6, 84
+ paddd m5, m6
+ paddd m0, m5
+ movu m5, [r2]
+ movu m6, [r2 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ pshufd m6, m6, 84
+ paddd m5, m6
+ paddd m1, m5
+ movu m5, [r3]
+ movu m6, [r3 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ pshufd m6, m6, 84
+ paddd m5, m6
+ paddd m2, m5
+ lea r0, [r0 + FENC_STRIDE]
+ lea r1, [r1 + r4]
+ lea r2, [r2 + r4]
+ lea r3, [r3 + r4]
+ mova m3, [r0]
+ mova m4, [r0 + 16]
+ movu m5, [r1]
+ movu m6, [r1 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ pshufd m6, m6, 84
+ paddd m5, m6
+ paddd m0, m5
+ movu m5, [r2]
+ movu m6, [r2 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ pshufd m6, m6, 84
+ paddd m5, m6
+ paddd m1, m5
+ movu m5, [r3]
+ movu m6, [r3 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ pshufd m6, m6, 84
+ paddd m5, m6
+ paddd m2, m5
+ lea r0, [r0 + FENC_STRIDE]
+ lea r1, [r1 + r4]
+ lea r2, [r2 + r4]
+ lea r3, [r3 + r4]
+ mova m3, [r0]
+ mova m4, [r0 + 16]
+ movu m5, [r1]
+ movu m6, [r1 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ pshufd m6, m6, 84
+ paddd m5, m6
+ paddd m0, m5
+ movu m5, [r2]
+ movu m6, [r2 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ pshufd m6, m6, 84
+ paddd m5, m6
+ paddd m1, m5
+ movu m5, [r3]
+ movu m6, [r3 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ pshufd m6, m6, 84
+ paddd m5, m6
+ paddd m2, m5
+ lea r0, [r0 + FENC_STRIDE]
+ lea r1, [r1 + r4]
+ lea r2, [r2 + r4]
+ lea r3, [r3 + r4]
+ mova m3, [r0]
+ mova m4, [r0 + 16]
+ movu m5, [r1]
+ movu m6, [r1 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ pshufd m6, m6, 84
+ paddd m5, m6
+ paddd m0, m5
+ movu m5, [r2]
+ movu m6, [r2 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ pshufd m6, m6, 84
+ paddd m5, m6
+ paddd m1, m5
+ movu m5, [r3]
+ movu m6, [r3 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ pshufd m6, m6, 84
+ paddd m5, m6
+ paddd m2, m5
+ lea r0, [r0 + FENC_STRIDE]
+ lea r1, [r1 + r4]
+ lea r2, [r2 + r4]
+ lea r3, [r3 + r4]
+%endmacro
+
+%macro SAD_X4_24x4 0
+ mova m4, [r0]
+ mova m5, [r0 + 16]
+ movu m6, [r1]
+ movu m7, [r1 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ pshufd m7, m7, 84
+ paddd m6, m7
+ paddd m0, m6
+ movu m6, [r2]
+ movu m7, [r2 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ pshufd m7, m7, 84
+ paddd m6, m7
+ paddd m1, m6
+ movu m6, [r3]
+ movu m7, [r3 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ pshufd m7, m7, 84
+ paddd m6, m7
+ paddd m2, m6
+ movu m6, [r4]
+ movu m7, [r4 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ pshufd m7, m7, 84
+ paddd m6, m7
+ paddd m3, m6
+ lea r0, [r0 + FENC_STRIDE]
+ lea r1, [r1 + r5]
+ lea r2, [r2 + r5]
+ lea r3, [r3 + r5]
+ lea r4, [r4 + r5]
+ mova m4, [r0]
+ mova m5, [r0 + 16]
+ movu m6, [r1]
+ movu m7, [r1 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ pshufd m7, m7, 84
+ paddd m6, m7
+ paddd m0, m6
+ movu m6, [r2]
+ movu m7, [r2 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ pshufd m7, m7, 84
+ paddd m6, m7
+ paddd m1, m6
+ movu m6, [r3]
+ movu m7, [r3 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ pshufd m7, m7, 84
+ paddd m6, m7
+ paddd m2, m6
+ movu m6, [r4]
+ movu m7, [r4 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ pshufd m7, m7, 84
+ paddd m6, m7
+ paddd m3, m6
+ lea r0, [r0 + FENC_STRIDE]
+ lea r1, [r1 + r5]
+ lea r2, [r2 + r5]
+ lea r3, [r3 + r5]
+ lea r4, [r4 + r5]
+ mova m4, [r0]
+ mova m5, [r0 + 16]
+ movu m6, [r1]
+ movu m7, [r1 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ pshufd m7, m7, 84
+ paddd m6, m7
+ paddd m0, m6
+ movu m6, [r2]
+ movu m7, [r2 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ pshufd m7, m7, 84
+ paddd m6, m7
+ paddd m1, m6
+ movu m6, [r3]
+ movu m7, [r3 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ pshufd m7, m7, 84
+ paddd m6, m7
+ paddd m2, m6
+ movu m6, [r4]
+ movu m7, [r4 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ pshufd m7, m7, 84
+ paddd m6, m7
+ paddd m3, m6
+ lea r0, [r0 + FENC_STRIDE]
+ lea r1, [r1 + r5]
+ lea r2, [r2 + r5]
+ lea r3, [r3 + r5]
+ lea r4, [r4 + r5]
+ mova m4, [r0]
+ mova m5, [r0 + 16]
+ movu m6, [r1]
+ movu m7, [r1 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ pshufd m7, m7, 84
+ paddd m6, m7
+ paddd m0, m6
+ movu m6, [r2]
+ movu m7, [r2 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ pshufd m7, m7, 84
+ paddd m6, m7
+ paddd m1, m6
+ movu m6, [r3]
+ movu m7, [r3 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ pshufd m7, m7, 84
+ paddd m6, m7
+ paddd m2, m6
+ movu m6, [r4]
+ movu m7, [r4 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ pshufd m7, m7, 84
+ paddd m6, m7
+ paddd m3, m6
+ lea r0, [r0 + FENC_STRIDE]
+ lea r1, [r1 + r5]
+ lea r2, [r2 + r5]
+ lea r3, [r3 + r5]
+ lea r4, [r4 + r5]
+%endmacro
+
+%macro SAD_X3_32x4 0
+ mova m3, [r0]
+ mova m4, [r0 + 16]
+ movu m5, [r1]
+ movu m6, [r1 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ paddd m5, m6
+ paddd m0, m5
+ movu m5, [r2]
+ movu m6, [r2 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ paddd m5, m6
+ paddd m1, m5
+ movu m5, [r3]
+ movu m6, [r3 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ paddd m5, m6
+ paddd m2, m5
+ lea r0, [r0 + FENC_STRIDE]
+ lea r1, [r1 + r4]
+ lea r2, [r2 + r4]
+ lea r3, [r3 + r4]
+ mova m3, [r0]
+ mova m4, [r0 + 16]
+ movu m5, [r1]
+ movu m6, [r1 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ paddd m5, m6
+ paddd m0, m5
+ movu m5, [r2]
+ movu m6, [r2 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ paddd m5, m6
+ paddd m1, m5
+ movu m5, [r3]
+ movu m6, [r3 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ paddd m5, m6
+ paddd m2, m5
+ lea r0, [r0 + FENC_STRIDE]
+ lea r1, [r1 + r4]
+ lea r2, [r2 + r4]
+ lea r3, [r3 + r4]
+ mova m3, [r0]
+ mova m4, [r0 + 16]
+ movu m5, [r1]
+ movu m6, [r1 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ paddd m5, m6
+ paddd m0, m5
+ movu m5, [r2]
+ movu m6, [r2 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ paddd m5, m6
+ paddd m1, m5
+ movu m5, [r3]
+ movu m6, [r3 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ paddd m5, m6
+ paddd m2, m5
+ lea r0, [r0 + FENC_STRIDE]
+ lea r1, [r1 + r4]
+ lea r2, [r2 + r4]
+ lea r3, [r3 + r4]
+ mova m3, [r0]
+ mova m4, [r0 + 16]
+ movu m5, [r1]
+ movu m6, [r1 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ paddd m5, m6
+ paddd m0, m5
+ movu m5, [r2]
+ movu m6, [r2 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ paddd m5, m6
+ paddd m1, m5
+ movu m5, [r3]
+ movu m6, [r3 + 16]
+ psadbw m5, m3
+ psadbw m6, m4
+ paddd m5, m6
+ paddd m2, m5
+ lea r0, [r0 + FENC_STRIDE]
+ lea r1, [r1 + r4]
+ lea r2, [r2 + r4]
+ lea r3, [r3 + r4]
+%endmacro
+
+%macro SAD_X4_32x4 0
+ mova m4, [r0]
+ mova m5, [r0 + 16]
+ movu m6, [r1]
+ movu m7, [r1 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ paddd m6, m7
+ paddd m0, m6
+ movu m6, [r2]
+ movu m7, [r2 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ paddd m6, m7
+ paddd m1, m6
+ movu m6, [r3]
+ movu m7, [r3 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ paddd m6, m7
+ paddd m2, m6
+ movu m6, [r4]
+ movu m7, [r4 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ paddd m6, m7
+ paddd m3, m6
+ lea r0, [r0 + FENC_STRIDE]
+ lea r1, [r1 + r5]
+ lea r2, [r2 + r5]
+ lea r3, [r3 + r5]
+ lea r4, [r4 + r5]
+ mova m4, [r0]
+ mova m5, [r0 + 16]
+ movu m6, [r1]
+ movu m7, [r1 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ paddd m6, m7
+ paddd m0, m6
+ movu m6, [r2]
+ movu m7, [r2 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ paddd m6, m7
+ paddd m1, m6
+ movu m6, [r3]
+ movu m7, [r3 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ paddd m6, m7
+ paddd m2, m6
+ movu m6, [r4]
+ movu m7, [r4 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ paddd m6, m7
+ paddd m3, m6
+ lea r0, [r0 + FENC_STRIDE]
+ lea r1, [r1 + r5]
+ lea r2, [r2 + r5]
+ lea r3, [r3 + r5]
+ lea r4, [r4 + r5]
+ mova m4, [r0]
+ mova m5, [r0 + 16]
+ movu m6, [r1]
+ movu m7, [r1 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ paddd m6, m7
+ paddd m0, m6
+ movu m6, [r2]
+ movu m7, [r2 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ paddd m6, m7
+ paddd m1, m6
+ movu m6, [r3]
+ movu m7, [r3 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ paddd m6, m7
+ paddd m2, m6
+ movu m6, [r4]
+ movu m7, [r4 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ paddd m6, m7
+ paddd m3, m6
+ lea r0, [r0 + FENC_STRIDE]
+ lea r1, [r1 + r5]
+ lea r2, [r2 + r5]
+ lea r3, [r3 + r5]
+ lea r4, [r4 + r5]
+ mova m4, [r0]
+ mova m5, [r0 + 16]
+ movu m6, [r1]
+ movu m7, [r1 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ paddd m6, m7
+ paddd m0, m6
+ movu m6, [r2]
+ movu m7, [r2 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ paddd m6, m7
+ paddd m1, m6
+ movu m6, [r3]
+ movu m7, [r3 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ paddd m6, m7
+ paddd m2, m6
+ movu m6, [r4]
+ movu m7, [r4 + 16]
+ psadbw m6, m4
+ psadbw m7, m5
+ paddd m6, m7
+ paddd m3, m6
+ lea r0, [r0 + FENC_STRIDE]
+ lea r1, [r1 + r5]
+ lea r2, [r2 + r5]
+ lea r3, [r3 + r5]
+ lea r4, [r4 + r5]
+%endmacro
+
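The SAD_X3/SAD_X4 macros score one encode block against three or four reference candidates in a single pass, so each fenc row is loaded once (aligned, from the FENC_STRIDE-pitched cache) and reused for every candidate. A scalar sketch of the x4 variant, with an illustrative name and an assumed FENC_STRIDE of 64:

    #include <cstdint>
    #include <cstdlib>

    typedef uint8_t pixel;

    static void sad_x4_ref(const pixel *fenc, const pixel *ref0, const pixel *ref1,
                           const pixel *ref2, const pixel *ref3, intptr_t refStride,
                           int width, int height, int scores[4])
    {
        const int fencStride = 64;   // assumed to match FENC_STRIDE
        scores[0] = scores[1] = scores[2] = scores[3] = 0;
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int p = fenc[x];
                scores[0] += std::abs(p - ref0[x]);
                scores[1] += std::abs(p - ref1[x]);
                scores[2] += std::abs(p - ref2[x]);
                scores[3] += std::abs(p - ref3[x]);
            }
            fenc += fencStride;
            ref0 += refStride;
            ref1 += refStride;
            ref2 += refStride;
            ref3 += refStride;
        }
    }
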
;-----------------------------------------------------------------------------
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
; uint8_t *pix2, intptr_t i_stride, int scores[3] )
@@ -991,6 +2099,7 @@
SAD_X 3, 8, 16
SAD_X 3, 8, 8
SAD_X 3, 8, 4
+SAD_X 3, 4, 16
SAD_X 3, 4, 8
SAD_X 3, 4, 4
SAD_X 4, 16, 16
@@ -998,6 +2107,7 @@
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
+SAD_X 4, 4, 16
SAD_X 4, 4, 8
SAD_X 4, 4, 4
@@ -1513,6 +2623,206 @@
%endif
%endmacro
+%macro SAD_X3_W24 0
+cglobal pixel_sad_x3_24x32, 5, 7, 8
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ mov r6, 32
+
+.loop:
+ SAD_X3_24x4
+ SAD_X3_24x4
+ SAD_X3_24x4
+ SAD_X3_24x4
+
+ sub r6, 16
+ cmp r6, 0
+jnz .loop
+ SAD_X3_END_SSE2 1
+%endmacro
+
+%macro SAD_X4_W24 0
+%if ARCH_X86_64 == 1
+cglobal pixel_sad_x4_24x32, 6, 8, 8
+%define count r7
+%else
+cglobal pixel_sad_x4_24x32, 6, 7, 8, 0-4
+%define count dword [rsp]
+%endif
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+ mov count, 32
+
+.loop:
+ SAD_X4_24x4
+ SAD_X4_24x4
+ SAD_X4_24x4
+ SAD_X4_24x4
+
+ sub count, 16
+ jnz .loop
+ SAD_X4_END_SSE2 1
+
+%endmacro
+
+%macro SAD_X3_W32 0
+cglobal pixel_sad_x3_32x8, 5, 6, 8
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+
+ SAD_X3_32x4
+ SAD_X3_32x4
+ SAD_X3_END_SSE2 1
+
+cglobal pixel_sad_x3_32x16, 5, 6, 8
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+
+ SAD_X3_32x4
+ SAD_X3_32x4
+ SAD_X3_32x4
+ SAD_X3_32x4
+ SAD_X3_END_SSE2 1
+
+cglobal pixel_sad_x3_32x24, 5, 6, 8
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+
+ SAD_X3_32x4
+ SAD_X3_32x4
+ SAD_X3_32x4
+ SAD_X3_32x4
+ SAD_X3_32x4
+ SAD_X3_32x4
+ SAD_X3_END_SSE2 1
+
+cglobal pixel_sad_x3_32x32, 5, 7, 8
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ mov r6, 32
+
+.loop:
+ SAD_X3_32x4
+ SAD_X3_32x4
+ SAD_X3_32x4
+ SAD_X3_32x4
+
+ sub r6, 16
+ cmp r6, 0
+jnz .loop
+ SAD_X3_END_SSE2 1
+
+cglobal pixel_sad_x3_32x64, 5, 7, 8
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ mov r6, 64
+
+.loop1:
+ SAD_X3_32x4
+ SAD_X3_32x4
+ SAD_X3_32x4
+ SAD_X3_32x4
+
+ sub r6, 16
+ cmp r6, 0
+jnz .loop1
+ SAD_X3_END_SSE2 1
+%endmacro
+
+%macro SAD_X4_W32 0
+cglobal pixel_sad_x4_32x8, 6, 7, 8
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+
+ SAD_X4_32x4
+ SAD_X4_32x4
+ SAD_X4_END_SSE2 1
+
+cglobal pixel_sad_x4_32x16, 6, 7, 8
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+
+ SAD_X4_32x4
+ SAD_X4_32x4
+ SAD_X4_32x4
+ SAD_X4_32x4
+ SAD_X4_END_SSE2 1
+
+cglobal pixel_sad_x4_32x24, 6, 7, 8
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+
+ SAD_X4_32x4
+ SAD_X4_32x4
+ SAD_X4_32x4
+ SAD_X4_32x4
+ SAD_X4_32x4
+ SAD_X4_32x4
+ SAD_X4_END_SSE2 1
+
+%if ARCH_X86_64 == 1
+cglobal pixel_sad_x4_32x32, 6, 8, 8
+%define count r7
+%else
+cglobal pixel_sad_x4_32x32, 6, 7, 8, 0-4
+%define count dword [rsp]
+%endif
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+ mov count, 32
+
+.loop:
+ SAD_X4_32x4
+ SAD_X4_32x4
+ SAD_X4_32x4
+ SAD_X4_32x4
+
+ sub count, 16
+ jnz .loop
+ SAD_X4_END_SSE2 1
+
+%if ARCH_X86_64 == 1
+cglobal pixel_sad_x4_32x64, 6, 8, 8
+%define count r7
+%else
+cglobal pixel_sad_x4_32x64, 6, 7, 8, 0-4
+%define count dword [rsp]
+%endif
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+ mov count, 64
+
+.loop:
+ SAD_X4_32x4
+ SAD_X4_32x4
+ SAD_X4_32x4
+ SAD_X4_32x4
+
+ sub count, 16
+ jnz .loop
+ SAD_X4_END_SSE2 1
+
+%endmacro
+
+
INIT_XMM sse2
SAD_X_SSE2 3, 16, 16, 7
SAD_X_SSE2 3, 16, 8, 7
@@ -1544,6 +2854,8 @@
%endmacro
INIT_XMM ssse3
+SAD_X3_W32
+SAD_X3_W24
SAD_X_SSE2 3, 16, 64, 7
SAD_X_SSE2 3, 16, 32, 7
SAD_X_SSE2 3, 16, 16, 7
@@ -1551,6 +2863,8 @@
SAD_X_SSE2 3, 16, 8, 7
SAD_X_SSE2 3, 8, 32, 7
SAD_X_SSE2 3, 8, 16, 7
+SAD_X4_W24
+SAD_X4_W32
SAD_X_SSE2 4, 16, 64, 7
SAD_X_SSE2 4, 16, 32, 7
SAD_X_SSE2 4, 16, 16, 7
@@ -1562,12 +2876,16 @@
SAD_X_SSSE3 4, 8, 4
INIT_XMM avx
+SAD_X3_W32
+SAD_X3_W24
SAD_X_SSE2 3, 16, 64, 7
SAD_X_SSE2 3, 16, 32, 6
SAD_X_SSE2 3, 16, 16, 6
SAD_X_SSE2 3, 16, 12, 6
SAD_X_SSE2 3, 16, 8, 6
SAD_X_SSE2 3, 16, 4, 6
+SAD_X4_W24
+SAD_X4_W32
SAD_X_SSE2 4, 16, 64, 7
SAD_X_SSE2 4, 16, 32, 7
SAD_X_SSE2 4, 16, 16, 7
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/encoder/compress.cpp
--- a/source/encoder/compress.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/encoder/compress.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -26,6 +26,7 @@
/* Lambda Partition Select adjusts the threshold value for Early Exit in No-RDO flow */
#define LAMBDA_PARTITION_SELECT 0.9
+#define EARLY_EXIT 1
using namespace x265;
@@ -222,10 +223,12 @@
m_tmpResiYuv[depth]->clear();
//do motion compensation only for Luma since luma cost alone is calculated
+ outTempCU->m_totalBits = 0;
m_search->predInterSearch(outTempCU, outPredYuv, bUseMRG, true, false);
int part = partitionFromSizes(outTempCU->getWidth(0), outTempCU->getHeight(0));
- outTempCU->m_totalCost = primitives.sse_pp[part](m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
- outPredYuv->getLumaAddr(), outPredYuv->getStride());
+ uint32_t distortion = primitives.sse_pp[part](m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
+ outPredYuv->getLumaAddr(), outPredYuv->getStride());
+ outTempCU->m_totalCost = m_rdCost->calcRdCost(distortion, outTempCU->m_totalBits);
}
void TEncCu::xComputeCostMerge2Nx2N(TComDataCU*& outBestCU, TComDataCU*& outTempCU, bool* earlyDetectionSkip, TComYuv*& bestPredYuv, TComYuv*& yuvReconBest)
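
The hunk above stops using the raw luma SSE as the inter 2Nx2N estimate and folds the prediction bits back in through m_rdCost->calcRdCost(), so the value stays comparable with the RD costs of the other candidate modes. Conceptually the combination is the usual Lagrangian D + lambda*R; a minimal sketch of that idea (the member function exists in the encoder, the body below is illustrative only):

    #include <cstdint>

    typedef uint64_t UInt64;

    // Illustrative stand-in for m_rdCost->calcRdCost(distortion, bits):
    // a Lagrangian cost D + lambda * R (x265 keeps lambda in fixed point internally).
    static UInt64 calcRdCostSketch(uint32_t distortion, uint32_t bits, double lambda)
    {
        return (UInt64)(distortion + lambda * bits + 0.5);
    }
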
@@ -245,6 +248,7 @@
outTempCU->setCUTransquantBypassSubParts(m_cfg->getCUTransquantBypassFlagValue(), 0, depth);
outTempCU->getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours, numValidMergeCand);
+ int bestMergeCand = 0;
for (int mergeCand = 0; mergeCand < numValidMergeCand; ++mergeCand)
{
// set MC parameters, interprets depth relative to LCU level
@@ -268,6 +272,7 @@
if (outTempCU->m_totalCost < outBestCU->m_totalCost)
{
+ bestMergeCand = mergeCand;
TComDataCU* tmp = outTempCU;
outTempCU = outBestCU;
outBestCU = tmp;
@@ -286,7 +291,44 @@
{
m_search->motionCompensation(outBestCU, bestPredYuv, REF_PIC_LIST_X, partIdx, false, true);
}
- m_search->encodeResAndCalcRdInterCU(outBestCU, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], yuvReconBest, false);
+
+ TComDataCU* tmp;
+ TComYuv *yuv;
+
+ outTempCU->setPredModeSubParts(MODE_INTER, 0, depth);
+ outTempCU->setCUTransquantBypassSubParts(m_cfg->getCUTransquantBypassFlagValue(), 0, depth);
+ outTempCU->setPartSizeSubParts(SIZE_2Nx2N, 0, depth);
+ outTempCU->setMergeFlagSubParts(true, 0, 0, depth);
+ outTempCU->setMergeIndexSubParts(bestMergeCand, 0, 0, depth);
+ outTempCU->setInterDirSubParts(interDirNeighbours[bestMergeCand], 0, 0, depth);
+ outTempCU->getCUMvField(REF_PIC_LIST_0)->setAllMvField(mvFieldNeighbours[0 + 2 * bestMergeCand], SIZE_2Nx2N, 0, 0);
+ outTempCU->getCUMvField(REF_PIC_LIST_1)->setAllMvField(mvFieldNeighbours[1 + 2 * bestMergeCand], SIZE_2Nx2N, 0, 0);
+
+ //No-residue mode
+ m_search->encodeResAndCalcRdInterCU(outTempCU, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth], true);
+
+ tmp = outTempCU;
+ outTempCU = outBestCU;
+ outBestCU = tmp;
+
+ yuv = yuvReconBest;
+ yuvReconBest = m_tmpRecoYuv[depth];
+ m_tmpRecoYuv[depth] = yuv;
+
+ //Encode with residue
+ m_search->encodeResAndCalcRdInterCU(outTempCU, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth], false);
+
+ if (outTempCU->m_totalCost < outBestCU->m_totalCost) //Choose best from no-residue mode and residue mode
+ {
+ tmp = outTempCU;
+ outTempCU = outBestCU;
+ outBestCU = tmp;
+
+ yuv = yuvReconBest;
+ yuvReconBest = m_tmpRecoYuv[depth];
+ m_tmpRecoYuv[depth] = yuv;
+ }
+
if (m_cfg->param.bEnableEarlySkip)
{
if (outBestCU->getQtRootCbf(0) == 0)
@@ -531,7 +573,78 @@
// further split
if (bSubBranch && bTrySplitDQP && depth < g_maxCUDepth - g_addCUDepth)
{
+#if EARLY_EXIT // early exit is enabled by default; set EARLY_EXIT to 0 to disable it
+ // early exit when the RD cost of the best mode at depth n is less than the average RD cost of the
+ // neighboring CUs (above, above-left, above-right, left, colocated) at depth n of previously coded CUs
+ if (outBestCU != 0)
+ {
+ UInt64 costCU = 0, costCUAbove = 0, costCUAboveLeft = 0, costCUAboveRight = 0, costCULeft = 0, costCUColocated0 = 0, costCUColocated1 = 0, totalCost = 0, avgCost = 0;
+ UInt64 countCU = 0, countCUAbove = 0, countCUAboveLeft = 0, countCUAboveRight = 0, countCULeft = 0, countCUColocated0 = 0, countCUColocated1 = 0;
+ UInt64 totalCount = 0;
+ TComDataCU* above = outTempCU->getCUAbove();
+ TComDataCU* aboveLeft = outTempCU->getCUAboveLeft();
+ TComDataCU* aboveRight = outTempCU->getCUAboveRight();
+ TComDataCU* left = outTempCU->getCULeft();
+ TComDataCU* colocated0 = outTempCU->getCUColocated(REF_PIC_LIST_0);
+ TComDataCU* colocated1 = outTempCU->getCUColocated(REF_PIC_LIST_1);
+
+ costCU = outTempCU->m_avgCost[depth] * outTempCU->m_count[depth];
+ countCU = outTempCU->m_count[depth];
+ if (above)
+ {
+ costCUAbove = above->m_avgCost[depth] * above->m_count[depth];
+ countCUAbove = above->m_count[depth];
+ }
+ if (aboveLeft)
+ {
+ costCUAboveLeft = aboveLeft->m_avgCost[depth] * aboveLeft->m_count[depth];
+ countCUAboveLeft = aboveLeft->m_count[depth];
+ }
+ if (aboveRight)
+ {
+ costCUAboveRight = aboveRight->m_avgCost[depth] * aboveRight->m_count[depth];
+ countCUAboveRight = aboveRight->m_count[depth];
+ }
+ if (left)
+ {
+ costCULeft = left->m_avgCost[depth] * left->m_count[depth];
+ countCULeft = left->m_count[depth];
+ }
+ if (colocated0)
+ {
+ costCUColocated0 = colocated0->m_avgCost[depth] * colocated0->m_count[depth];
+ countCUColocated0 = colocated0->m_count[depth];
+ }
+ if (colocated1)
+ {
+ costCUColocated1 = colocated1->m_avgCost[depth] * colocated1->m_count[depth];
+ countCUColocated1 = colocated1->m_count[depth];
+ }
+
+ totalCost = costCU + costCUAbove + costCUAboveLeft + costCUAboveRight + costCULeft + costCUColocated0 + costCUColocated1;
+ totalCount = countCU + countCUAbove + countCUAboveLeft + countCUAboveRight + countCULeft + countCUColocated0 + countCUColocated1;
+ if (totalCount != 0)
+ avgCost = totalCost / totalCount;
+
+ float lambda = 1.0f;
+
+ if (outBestCU->m_totalCost < lambda * avgCost && avgCost != 0 && depth != 0)
+ {
+ m_entropyCoder->resetBits();
+ m_entropyCoder->encodeSplitFlag(outBestCU, 0, depth, true);
+ outBestCU->m_totalBits += m_entropyCoder->getNumberOfWrittenBits(); // split bits
+ outBestCU->m_totalCost = m_rdCost->calcRdCost(outBestCU->m_totalDistortion, outBestCU->m_totalBits);
+ /* Copy Best data to Picture for next partition prediction. */
+ outBestCU->copyToPic((UChar)depth);
+
+ /* Copy Yuv data to picture Yuv */
+ xCopyYuv2Pic(outBestCU->getPic(), outBestCU->getAddr(), outBestCU->getZorderIdxInCU(), depth, depth, outBestCU, lpelx, tpely);
+ return;
+ }
+ }
+#endif
#if 0 // turn ON this to enable early exit
+ // early exit when the RD cost of the best mode is less than the cumulative RD cost of the 4 subpartitions
UInt64 nxnCost = 0;
if (outBestCU != 0 && depth > 0)
{
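
Summarizing the EARLY_EXIT logic added above: each CU keeps a running average RD cost (m_avgCost) and a sample count (m_count) per depth; before recursing, the current best cost is compared against the count-weighted average taken over the current CU and its above, above-left, above-right, left and colocated neighbours, and the split is skipped (after charging the split-flag bits) when it is already cheaper. A compact sketch of that decision with illustrative names:

    #include <cstdint>

    typedef uint64_t UInt64;

    struct CUStats
    {
        UInt64 avgCost[4];   // running average RD cost per depth
        UInt64 count[4];     // number of CUs contributing to the average
    };

    // Count-weighted average over the available neighbours; returns 0 if nothing
    // has been coded yet at this depth (the caller then never takes the early exit).
    static UInt64 neighbourAvgCost(const CUStats *neighbours[], int numNeighbours, int depth)
    {
        UInt64 totalCost = 0, totalCount = 0;
        for (int i = 0; i < numNeighbours; i++)
        {
            if (!neighbours[i])
                continue;
            totalCost  += neighbours[i]->avgCost[depth] * neighbours[i]->count[depth];
            totalCount += neighbours[i]->count[depth];
        }
        return totalCount ? totalCost / totalCount : 0;
    }

    static bool takeEarlyExit(UInt64 bestCost, UInt64 avgCost, int depth, double lambda = 1.0)
    {
        return depth != 0 && avgCost != 0 && bestCost < lambda * avgCost;
    }
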
@@ -612,7 +725,22 @@
m_rdSbacCoders[nextDepth][CI_CURR_BEST]->load(m_rdSbacCoders[nextDepth][CI_NEXT_BEST]);
}
xCompressInterCU(subBestPartCU, subTempPartCU, outTempCU, nextDepth, nextDepth_partIndex);
-
+#if EARLY_EXIT
+ for (int k = 0; k < 4; k++)
+ {
+ outTempCU->m_avgCost[k] = subTempPartCU->m_avgCost[k];
+ outTempCU->m_count[k] = subTempPartCU->m_count[k];
+ }
+ if (subBestPartCU->getPredictionMode(0) != MODE_INTRA)
+ {
+ UInt64 tempavgCost = subBestPartCU->m_totalCost;
+ UInt64 temp = outTempCU->m_avgCost[depth + 1] * outTempCU->m_count[depth + 1];
+ outTempCU->m_count[depth + 1] += 1;
+ outTempCU->getPic()->getPicSym()->getCU(outTempCU->getAddr())->m_count[depth + 1] += 1;
+ outTempCU->m_avgCost[depth + 1] = (temp + tempavgCost) / outTempCU->m_count[depth + 1];
+ outTempCU->getPic()->getPicSym()->getCU(outTempCU->getAddr())->m_avgCost[depth + 1] = outTempCU->m_avgCost[depth + 1];
+ }
+#endif
/* Adding costs from best SUbCUs */
outTempCU->copyPartFrom(subBestPartCU, nextDepth_partIndex, nextDepth, true); // Keep best part data to current temporary data.
xCopyYuv2Tmp(subBestPartCU->getTotalNumPart() * nextDepth_partIndex, nextDepth);
@@ -708,6 +836,16 @@
* Copy recon data from Temp structure to Best structure */
if (outBestCU)
{
+ if (depth == 0)
+ {
+ UInt64 tempavgCost = outBestCU->m_totalCost;
+ UInt64 temp = outTempCU->m_avgCost[depth] * outTempCU->m_count[depth];
+ outTempCU->m_count[depth] += 1;
+ outTempCU->getPic()->getPicSym()->getCU(outTempCU->getAddr())->m_count[depth] += 1;
+
+ outTempCU->m_avgCost[depth] = (temp + tempavgCost) / outTempCU->m_count[depth];
+ outTempCU->getPic()->getPicSym()->getCU(outTempCU->getAddr())->m_avgCost[depth] = outTempCU->m_avgCost[depth];
+ }
if (outTempCU->m_totalCost < outBestCU->m_totalCost)
{
outBestCU = outTempCU;
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/encoder/encoder.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -494,52 +494,18 @@
const char* digestStr = NULL;
if (param.decodedPictureHashSEI)
{
- SEIDecodedPictureHash sei_recon_picture_digest;
if (param.decodedPictureHashSEI == 1)
{
- /* calculate MD5sum for entire reconstructed picture */
- sei_recon_picture_digest.method = SEIDecodedPictureHash::MD5;
- for (int i = 0; i < 3; i++)
- {
- MD5Final(&(pic->m_state[i]), sei_recon_picture_digest.digest[i]);
- }
- digestStr = digestToString(sei_recon_picture_digest.digest, 16);
+ digestStr = digestToString(m_frameEncoder->m_seiReconPictureDigest.digest, 16);
}
else if (param.decodedPictureHashSEI == 2)
{
- sei_recon_picture_digest.method = SEIDecodedPictureHash::CRC;
- for (int i = 0; i < 3; i++)
- {
- crcFinish((pic->m_crc[i]), sei_recon_picture_digest.digest[i]);
- }
- digestStr = digestToString(sei_recon_picture_digest.digest, 2);
+ digestStr = digestToString(m_frameEncoder->m_seiReconPictureDigest.digest, 2);
}
else if (param.decodedPictureHashSEI == 3)
{
- sei_recon_picture_digest.method = SEIDecodedPictureHash::CHECKSUM;
- for (int i = 0; i < 3; i++)
- {
- checksumFinish(pic->m_checksum[i], sei_recon_picture_digest.digest[i]);
- }
- digestStr = digestToString(sei_recon_picture_digest.digest, 4);
+ digestStr = digestToString(m_frameEncoder->m_seiReconPictureDigest.digest, 4);
}
-
- /* write the SEI messages */
- OutputNALUnit onalu(NAL_UNIT_SUFFIX_SEI, 0);
- m_frameEncoder->m_seiWriter.writeSEImessage(onalu.m_Bitstream, sei_recon_picture_digest, pic->getSlice()->getSPS());
- writeRBSPTrailingBits(onalu.m_Bitstream);
-
- int count = 0;
- while (nalunits[count] != NULL)
- {
- count++;
- }
-
- nalunits[count] = (NALUnitEBSP*)X265_MALLOC(NALUnitEBSP, 1);
- if (nalunits[count])
- nalunits[count]->init(onalu);
- else
- digestStr = NULL;
}
/* calculate the size of the access unit, excluding:
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/encoder/frameencoder.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -188,25 +188,25 @@
/* headers for start of bitstream */
OutputNALUnit nalu(NAL_UNIT_VPS);
- entropyCoder->setBitstream(&nalu.m_Bitstream);
+ entropyCoder->setBitstream(&nalu.m_bitstream);
entropyCoder->encodeVPS(m_cfg->getVPS());
- writeRBSPTrailingBits(nalu.m_Bitstream);
+ writeRBSPTrailingBits(nalu.m_bitstream);
CHECKED_MALLOC(nalunits[count], NALUnitEBSP, 1);
nalunits[count]->init(nalu);
count++;
nalu = NALUnit(NAL_UNIT_SPS);
- entropyCoder->setBitstream(&nalu.m_Bitstream);
+ entropyCoder->setBitstream(&nalu.m_bitstream);
entropyCoder->encodeSPS(&m_sps);
- writeRBSPTrailingBits(nalu.m_Bitstream);
+ writeRBSPTrailingBits(nalu.m_bitstream);
CHECKED_MALLOC(nalunits[count], NALUnitEBSP, 1);
nalunits[count]->init(nalu);
count++;
nalu = NALUnit(NAL_UNIT_PPS);
- entropyCoder->setBitstream(&nalu.m_Bitstream);
+ entropyCoder->setBitstream(&nalu.m_bitstream);
entropyCoder->encodePPS(&m_pps);
- writeRBSPTrailingBits(nalu.m_Bitstream);
+ writeRBSPTrailingBits(nalu.m_bitstream);
CHECKED_MALLOC(nalunits[count], NALUnitEBSP, 1);
nalunits[count]->init(nalu);
count++;
@@ -220,9 +220,9 @@
sei.numSpsIdsMinus1 = 0;
sei.activeSeqParamSetId = m_sps.getSPSId();
- entropyCoder->setBitstream(&nalu.m_Bitstream);
- m_seiWriter.writeSEImessage(nalu.m_Bitstream, sei, &m_sps);
- writeRBSPTrailingBits(nalu.m_Bitstream);
+ entropyCoder->setBitstream(&nalu.m_bitstream);
+ m_seiWriter.writeSEImessage(nalu.m_bitstream, sei, &m_sps);
+ writeRBSPTrailingBits(nalu.m_bitstream);
CHECKED_MALLOC(nalunits[count], NALUnitEBSP, 1);
nalunits[count]->init(nalu);
count++;
@@ -237,9 +237,9 @@
sei.anticlockwiseRotation = m_cfg->getDisplayOrientationSEIAngle();
nalu = NALUnit(NAL_UNIT_PREFIX_SEI);
- entropyCoder->setBitstream(&nalu.m_Bitstream);
- m_seiWriter.writeSEImessage(nalu.m_Bitstream, sei, &m_sps);
- writeRBSPTrailingBits(nalu.m_Bitstream);
+ entropyCoder->setBitstream(&nalu.m_bitstream);
+ m_seiWriter.writeSEImessage(nalu.m_bitstream, sei, &m_sps);
+ writeRBSPTrailingBits(nalu.m_bitstream);
CHECKED_MALLOC(nalunits[count], NALUnitEBSP, 1);
nalunits[count]->init(nalu);
}
@@ -499,8 +499,8 @@
SEIGradualDecodingRefreshInfo seiGradualDecodingRefreshInfo;
seiGradualDecodingRefreshInfo.m_gdrForegroundFlag = true; // Indicating all "foreground"
- m_seiWriter.writeSEImessage(nalu.m_Bitstream, seiGradualDecodingRefreshInfo, slice->getSPS());
- writeRBSPTrailingBits(nalu.m_Bitstream);
+ m_seiWriter.writeSEImessage(nalu.m_bitstream, seiGradualDecodingRefreshInfo, slice->getSPS());
+ writeRBSPTrailingBits(nalu.m_bitstream);
m_nalList[m_nalCount] = (NALUnitEBSP*)X265_MALLOC(NALUnitEBSP, 1);
if (m_nalList[m_nalCount])
{
@@ -516,8 +516,8 @@
sei_recovery_point.m_exactMatchingFlag = (slice->getPOC() == 0) ? (true) : (false);
sei_recovery_point.m_brokenLinkFlag = false;
- m_seiWriter.writeSEImessage(nalu.m_Bitstream, sei_recovery_point, slice->getSPS());
- writeRBSPTrailingBits(nalu.m_Bitstream);
+ m_seiWriter.writeSEImessage(nalu.m_bitstream, sei_recovery_point, slice->getSPS());
+ writeRBSPTrailingBits(nalu.m_bitstream);
m_nalList[m_nalCount] = (NALUnitEBSP*)X265_MALLOC(NALUnitEBSP, 1);
if (m_nalList[m_nalCount])
{
@@ -565,7 +565,7 @@
/* start slice NALunit */
bool sliceSegment = !slice->isNextSlice();
OutputNALUnit nalu(slice->getNalUnitType(), 0);
- entropyCoder->setBitstream(&nalu.m_Bitstream);
+ entropyCoder->setBitstream(&nalu.m_bitstream);
entropyCoder->encodeSliceHeader(slice);
// is it needed?
@@ -601,7 +601,7 @@
}
else
{
- entropyCoder->setBitstream(&nalu.m_Bitstream);
+ entropyCoder->setBitstream(&nalu.m_bitstream);
}
// for now, override the TILES_DECODER setting in order to write substreams.
@@ -616,7 +616,7 @@
{
// Construct the final bitstream by flushing and concatenating substreams.
- // The final bitstream is either nalu.m_Bitstream or pcBitstreamRedirect;
+ // The final bitstream is either nalu.m_bitstream or pcBitstreamRedirect;
uint32_t* substreamSizes = slice->getSubstreamSizes();
for (int i = 0; i < numSubstreams; i++)
{
@@ -638,7 +638,7 @@
// Complete the slice header info.
entropyCoder->setEntropyCoder(&m_sbacCoder, slice);
- entropyCoder->setBitstream(&nalu.m_Bitstream);
+ entropyCoder->setBitstream(&nalu.m_bitstream);
entropyCoder->encodeTilesWPPEntryPoint(slice);
// Substreams...
@@ -654,14 +654,14 @@
// current NALU is the last NALU of slice and a NALU was buffered, then (a)
// Write current NALU (b) Update an write buffered NALU at appropriate
// location in NALU list.
- nalu.m_Bitstream.writeByteAlignment(); // Slice header byte-alignment
+ nalu.m_bitstream.writeByteAlignment(); // Slice header byte-alignment
// Perform bitstream concatenation
if (bitstreamRedirect->getNumberOfWrittenBits() > 0)
{
- nalu.m_Bitstream.addSubstream(bitstreamRedirect);
+ nalu.m_bitstream.addSubstream(bitstreamRedirect);
}
- entropyCoder->setBitstream(&nalu.m_Bitstream);
+ entropyCoder->setBitstream(&nalu.m_bitstream);
bitstreamRedirect->clear();
m_nalList[m_nalCount] = (NALUnitEBSP*)X265_MALLOC(NALUnitEBSP, 1);
if (m_nalList[m_nalCount])
@@ -670,6 +670,45 @@
m_nalCount++;
}
+ /* write decoded picture hash SEI messages */
+ if (m_cfg->param.decodedPictureHashSEI)
+ {
+ if (m_cfg->param.decodedPictureHashSEI == 1)
+ {
+ m_seiReconPictureDigest.method = SEIDecodedPictureHash::MD5;
+ for (int i = 0; i < 3; i++)
+ {
+ MD5Final(&(m_pic->m_state[i]), m_seiReconPictureDigest.digest[i]);
+ }
+ }
+ else if (m_cfg->param.decodedPictureHashSEI == 2)
+ {
+ m_seiReconPictureDigest.method = SEIDecodedPictureHash::CRC;
+ for (int i = 0; i < 3; i++)
+ {
+ crcFinish((m_pic->m_crc[i]), m_seiReconPictureDigest.digest[i]);
+ }
+ }
+ else if (m_cfg->param.decodedPictureHashSEI == 3)
+ {
+ m_seiReconPictureDigest.method = SEIDecodedPictureHash::CHECKSUM;
+ for (int i = 0; i < 3; i++)
+ {
+ checksumFinish(m_pic->m_checksum[i], m_seiReconPictureDigest.digest[i]);
+ }
+ }
+ OutputNALUnit onalu(NAL_UNIT_SUFFIX_SEI, 0);
+ m_seiWriter.writeSEImessage(onalu.m_bitstream, m_seiReconPictureDigest, slice->getSPS());
+ writeRBSPTrailingBits(onalu.m_bitstream);
+
+ m_nalList[m_nalCount] = (NALUnitEBSP*)X265_MALLOC(NALUnitEBSP, 1);
+ if (m_nalList[m_nalCount])
+ {
+ m_nalList[m_nalCount]->init(onalu);
+ m_nalCount++;
+ }
+ }
+
if (m_sps.getUseSAO())
{
m_frameFilter.end();
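
With this hunk the decoded-picture-hash SEI is finalized and emitted by the frame encoder itself, as a suffix SEI right after the frame is reconstructed, while encoder.cpp only formats the already computed digest for logging. The decodedPictureHashSEI value selects the hash and, with it, the per-plane digest size passed to digestToString; a small sketch of that mapping (the helper name is illustrative):

    // 1 -> MD5 (16 bytes/plane), 2 -> CRC (2 bytes/plane), 3 -> checksum (4 bytes/plane)
    static int digestSizeForHashType(int decodedPictureHashSEI)
    {
        switch (decodedPictureHashSEI)
        {
        case 1: return 16;  // SEIDecodedPictureHash::MD5
        case 2: return 2;   // SEIDecodedPictureHash::CRC
        case 3: return 4;   // SEIDecodedPictureHash::CHECKSUM
        default: return 0;  // hash SEI disabled
        }
    }
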
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/encoder/frameencoder.h
--- a/source/encoder/frameencoder.h Thu Oct 31 15:40:28 2013 +0530
+++ b/source/encoder/frameencoder.h Thu Oct 31 18:43:03 2013 +0530
@@ -161,6 +161,7 @@
TComSPS m_sps;
TComPPS m_pps;
RateControlEntry m_rce;
+ SEIDecodedPictureHash m_seiReconPictureDigest;
protected:
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/encoder/framefilter.cpp
--- a/source/encoder/framefilter.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/encoder/framefilter.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -277,7 +277,6 @@
int cuAddr = lineStartCUAddr;
if (m_cfg->param.bEnablePsnr)
{
- TComPicYuv* recon = m_pic->getPicYuvRec();
TComPicYuv* orig = m_pic->getPicYuvOrg();
intptr_t stride = recon->getStride();
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/encoder/motion.cpp
--- a/source/encoder/motion.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/encoder/motion.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -116,7 +116,7 @@
X265_FREE(immedVal2);
}
-void MotionEstimate::setSourcePU(int offset, uint32_t width, uint32_t height)
+void MotionEstimate::setSourcePU(int offset, int width, int height)
{
/* copy PU block into cache */
primitives.blockcpy_pp(width, height, fenc, FENC_STRIDE, fencplane + offset, fencLumaStride);
@@ -300,7 +300,7 @@
MV & outQMv)
{
ALIGN_VAR_16(int, costs[16]);
- intptr_t stride = ref->lumaStride;
+ size_t stride = ref->lumaStride;
pixel *fref = ref->fpelPlane + blockOffset;
setMVP(qmvp);
@@ -561,7 +561,7 @@
omv = bmv;
const uint16_t *p_cost_omvx = m_cost_mvx + omv.x * 4;
const uint16_t *p_cost_omvy = m_cost_mvy + omv.y * 4;
- int16_t i = 1;
+ uint16_t i = 1;
do
{
if (4 * i > X265_MIN4(mvmax.x - omv.x, omv.x - mvmin.x,
@@ -854,9 +854,19 @@
}
else
{
- subpelInterpolate(ref, qmv0, dir);
- cost0 = hpelcomp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE + (dir == 2)) + mvcost0;
- cost1 = hpelcomp(fenc, FENC_STRIDE, subpelbuf + (dir == 2) + (dir == 1 ? FENC_STRIDE : 0), FENC_STRIDE + (dir == 2)) + mvcost1;
+ if (dir == 1)
+ {
+ subpelInterpolate(ref, qmv0, 1);
+ cost0 = hpelcomp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE + (dir == 2)) + mvcost0;
+ cost1 = hpelcomp(fenc, FENC_STRIDE, subpelbuf + (dir == 2) + (dir == 1 ? FENC_STRIDE : 0), FENC_STRIDE + (dir == 2)) + mvcost1;
+ }
+ else
+ {
+ subpelInterpolate(ref, qmv0, 0);
+ cost0 = hpelcomp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE) + mvcost0;
+ subpelInterpolate(ref, qmv1, 0);
+ cost1 = hpelcomp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE) + mvcost1;
+ }
}
COPY2_IF_LT(bcost, cost0, bdir, i + 0);
COPY2_IF_LT(bcost, cost1, bdir, i + 1);
@@ -899,7 +909,7 @@
{
ALIGN_VAR_16(int, costs[16]);
pixel *fref = ref->fpelPlane + blockOffset;
- intptr_t stride = ref->lumaStride;
+ size_t stride = ref->lumaStride;
MV omv = bmv;
int saved = bcost;
@@ -1179,9 +1189,9 @@
int yFrac = qmv.y & 0x3;
assert(yFrac | xFrac);
- uint32_t realWidth = blockwidth + (dir == 2);
- uint32_t realHeight = blockheight + (dir == 1);
- intptr_t realStride = FENC_STRIDE + (dir == 2);
+ assert(dir != 2);
+ assert((blockwidth % 4) == 0);
+ int realHeight = blockheight + (dir == 1);
pixel *fref = ref->unweightedFPelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride;
int shiftNum = IF_INTERNAL_PREC - X265_DEPTH;
int local_shift = ref->shift + shiftNum;
@@ -1190,39 +1200,39 @@
{
if (yFrac == 0)
{
- primitives.ipfilter_ps[FILTER_H_P_S_8](fref, ref->lumaStride, immedVal, realStride, realWidth, realHeight, g_lumaFilter[xFrac]);
- primitives.weightpUni(immedVal, subpelbuf, realStride, realStride, realWidth, realHeight, ref->weight, local_round, local_shift, ref->offset);
+ primitives.ipfilter_ps[FILTER_H_P_S_8](fref, ref->lumaStride, immedVal, FENC_STRIDE, blockwidth, realHeight, g_lumaFilter[xFrac]);
+ primitives.weightpUni(immedVal, subpelbuf, FENC_STRIDE, FENC_STRIDE, blockwidth, realHeight, ref->weight, local_round, local_shift, ref->offset);
}
else if (xFrac == 0)
{
- primitives.ipfilter_ps[FILTER_V_P_S_8](fref, ref->lumaStride, immedVal, realStride, realWidth, realHeight, g_lumaFilter[yFrac]);
- primitives.weightpUni(immedVal, subpelbuf, realStride, realStride, realWidth, realHeight, ref->weight, local_round, local_shift, ref->offset);
+ primitives.ipfilter_ps[FILTER_V_P_S_8](fref, ref->lumaStride, immedVal, FENC_STRIDE, blockwidth, realHeight, g_lumaFilter[yFrac]);
+ primitives.weightpUni(immedVal, subpelbuf, FENC_STRIDE, FENC_STRIDE, blockwidth, realHeight, ref->weight, local_round, local_shift, ref->offset);
}
else
{
int filterSize = NTAPS_LUMA;
int halfFilterSize = (filterSize >> 1);
- primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * ref->lumaStride, ref->lumaStride, immedVal, realWidth, realWidth, realHeight + filterSize - 1, g_lumaFilter[xFrac]);
- primitives.ipfilter_ss[FILTER_V_S_S_8](immedVal + (halfFilterSize - 1) * realWidth, realWidth, immedVal2, realStride, realWidth, realHeight, g_lumaFilter[yFrac]);
- primitives.weightpUni(immedVal2, subpelbuf, realStride, realStride, realWidth, realHeight, ref->weight, local_round, local_shift, ref->offset);
+ primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * ref->lumaStride, ref->lumaStride, immedVal, blockwidth, blockwidth, realHeight + filterSize - 1, g_lumaFilter[xFrac]);
+ primitives.ipfilter_ss[FILTER_V_S_S_8](immedVal + (halfFilterSize - 1) * blockwidth, blockwidth, immedVal2, FENC_STRIDE, blockwidth, realHeight, g_lumaFilter[yFrac]);
+ primitives.weightpUni(immedVal2, subpelbuf, FENC_STRIDE, FENC_STRIDE, blockwidth, realHeight, ref->weight, local_round, local_shift, ref->offset);
}
}
else
{
if (yFrac == 0)
{
- primitives.ipfilter_pp[FILTER_H_P_P_8](fref, ref->lumaStride, subpelbuf, realStride, realWidth, realHeight, g_lumaFilter[xFrac]);
+ primitives.ipfilter_pp[FILTER_H_P_P_8](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, blockwidth, realHeight, g_lumaFilter[xFrac]);
}
else if (xFrac == 0)
{
- primitives.ipfilter_pp[FILTER_V_P_P_8](fref, ref->lumaStride, subpelbuf, realStride, realWidth, realHeight, g_lumaFilter[yFrac]);
+ primitives.ipfilter_pp[FILTER_V_P_P_8](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, blockwidth, realHeight, g_lumaFilter[yFrac]);
}
else
{
int filterSize = NTAPS_LUMA;
int halfFilterSize = (filterSize >> 1);
- primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * ref->lumaStride, ref->lumaStride, immedVal, realWidth, realWidth, realHeight + filterSize - 1, g_lumaFilter[xFrac]);
- primitives.ipfilter_sp[FILTER_V_S_P_8](immedVal + (halfFilterSize - 1) * realWidth, realWidth, subpelbuf, realStride, realWidth, realHeight, g_lumaFilter[yFrac]);
+ primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * ref->lumaStride, ref->lumaStride, immedVal, blockwidth, blockwidth, realHeight + filterSize - 1, g_lumaFilter[xFrac]);
+ primitives.ipfilter_sp[FILTER_V_S_P_8](immedVal + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, FENC_STRIDE, blockwidth, realHeight, yFrac);
}
}
}
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/encoder/ratecontrol.cpp
--- a/source/encoder/ratecontrol.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/encoder/ratecontrol.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -124,7 +124,7 @@
this->cfg = _cfg;
bitrate = cfg->param.rc.bitrate * 1000;
frameDuration = 1.0 / cfg->param.frameRate;
- ncu = (int)((cfg->param.sourceHeight * cfg->param.sourceWidth) / pow((int)cfg->param.maxCUSize, 2.0));
+ ncu = (int)((cfg->param.sourceHeight * cfg->param.sourceWidth) / pow((int)16, 2.0));
lastNonBPictType = -1;
baseQp = cfg->param.rc.qp;
qp = baseQp;
@@ -142,7 +142,7 @@
accumPNorm = .01;
accumPQp = (ABR_INIT_QP_MIN)*accumPNorm;
/* estimated ratio that produces a reasonable QP for the first I-frame */
- cplxrSum = .01 * pow(7.0e5, cfg->param.rc.qCompress) * pow(2 * ncu, 0.5);
+ cplxrSum = .01 * pow(7.0e5, cfg->param.rc.qCompress) * pow(ncu, 0.5);
wantedBitsWindow = bitrate * frameDuration;
lastNonBPictType = I_SLICE;
}
@@ -253,7 +253,7 @@
}
else
{
- double abrBuffer = 1.5 * cfg->param.rc.rateTolerance * bitrate;
+ double abrBuffer = 2 * cfg->param.rc.rateTolerance * bitrate;
/* 1pass ABR */
@@ -299,46 +299,30 @@
q = qp2qScale(accumPQp / accumPNorm);
q /= fabs(cfg->param.rc.ipFactor);
}
- if (cfg->param.rc.rateControlMode != X265_RC_CRF)
- {
- double lqmin = 0, lqmax = 0;
+ else if (framesDone > 0)
+ {
+ if (cfg->param.rc.rateControlMode != X265_RC_CRF)
+ {
+ double lqmin = 0, lqmax = 0;
+ if (totalBits == 0)
+ {
+ lqmin = qp2qScale(ABR_INIT_QP_MIN) / lstep;
+ lqmax = qp2qScale(ABR_INIT_QP_MAX) * lstep;
+ }
+ else
+ {
+ lqmin = lastQScaleFor[sliceType] / lstep;
+ lqmax = lastQScaleFor[sliceType] * lstep;
+ }
- /* Clip the qp of 1st 'N' frames running parallely to ensure it doesnt detoriate
- * the quality */
- if (totalBits == 0)
- {
- lqmin = qp2qScale(ABR_INIT_QP_MIN) / lstep;
- lqmax = qp2qScale(ABR_INIT_QP_MAX) * lstep;
- }
+ if (overflow > 1.1 && framesDone > 3)
+ lqmax *= lstep;
+ else if (overflow < 0.9)
+ lqmin /= lstep;
- /* Asymmetric clipping, because symmetric would prevent
- * overflow control in areas of rapidly oscillating complexity */
- else
- {
- lqmin = lastQScaleFor[sliceType] / lstep;
- lqmax = lastQScaleFor[sliceType] * lstep;
- }
-
- /* Rate control needs to be more aggressive based on actual costs obtained for
- * previous encoded frame */
- int rfAdapt = 1;
- if (overflow > 1.1 && framesDone > 3)
- {
- /* Control propagation of excessive overflow / underfow */
- if (overflow > 1.5)
- rfAdapt = 2;
- lqmax *= pow(lstep, rfAdapt);
- lqmin /= pow(lstep, rfAdapt / cfg->param.frameNumThreads);
- }
- else if (overflow < 0.9)
- {
- if (overflow < 0.6)
- rfAdapt = 2;
- lqmin /= pow(lstep, rfAdapt);
- lqmax /= pow(lstep, rfAdapt / cfg->param.frameNumThreads);
- }
- q = Clip3(lqmin, lqmax, q);
- }
+ q = Clip3(lqmin, lqmax, q);
+ }
+ }
double lmin1 = lmin[sliceType];
double lmax1 = lmax[sliceType];
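
The rewritten ABR clamp above only applies once at least one frame has completed (framesDone > 0): q is restricted to one lstep around the last qscale used for this slice type (or around the ABR_INIT_QP bounds while totalBits is still zero), and the window is opened by one extra lstep upward when overflow exceeds 1.1 or downward when it drops below 0.9. A sketch of the steady-state clamp, with an illustrative helper name and the totalBits == 0 special case omitted:

    #include <algorithm>

    // overflow > 1 means the encoder is spending more bits than budgeted.
    static double clampAbrQScale(double q, double lastQScale, double lstep,
                                 double overflow, int framesDone)
    {
        double lqmin = lastQScale / lstep;
        double lqmax = lastQScale * lstep;

        if (overflow > 1.1 && framesDone > 3)
            lqmax *= lstep;          // allow a larger upward step to catch an overshoot
        else if (overflow < 0.9)
            lqmin /= lstep;          // allow a larger drop to use spare bits

        return std::min(lqmax, std::max(lqmin, q));   // equivalent to Clip3(lqmin, lqmax, q)
    }
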
@@ -378,7 +362,7 @@
if (rce->sliceType != B_SLICE)
/* The factor 1.5 is to tune up the actual bits, otherwise the cplxrSum is scaled too low
* to improve short term compensation for next frame. */
- cplxrSum += 1.5 * bits * qp2qScale(rce->qpaRc) / rce->qRceq;
+ cplxrSum += bits * qp2qScale(rce->qpaRc) / rce->qRceq;
else
{
/* Depends on the fact that B-frame's QP is an offset from the following P-frame's.
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/input/y4m.cpp
--- a/source/input/y4m.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/input/y4m.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -28,7 +28,7 @@
#include <string.h>
#include <iostream>
-#if WIN32
+#if _WIN32
#include "io.h"
#include "fcntl.h"
#if defined(_MSC_VER)
@@ -53,7 +53,7 @@
if (!strcmp(filename, "-"))
{
ifs = &cin;
-#if WIN32
+#if _WIN32
setmode(fileno(stdin), O_BINARY);
#endif
}
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/input/yuv.cpp
--- a/source/input/yuv.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/input/yuv.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -28,7 +28,7 @@
#include <string.h>
#include <iostream>
-#if WIN32
+#if _WIN32
#include "io.h"
#include "fcntl.h"
#if defined(_MSC_VER)
@@ -55,7 +55,7 @@
if (!strcmp(filename, "-"))
{
ifs = &cin;
-#if WIN32
+#if _WIN32
setmode(fileno(stdin), O_BINARY);
#endif
}
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/test/intrapredharness.cpp
--- a/source/test/intrapredharness.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/test/intrapredharness.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -284,7 +284,7 @@
void IntraPredHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
int width = 64;
- int16_t srcStride = 96;
+ uint16_t srcStride = 96;
if (opt.intra_pred_dc)
{
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/test/ipfilterharness.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -160,12 +160,13 @@
bool IPFilterHarness::check_IPFilter_primitive(ipfilter_sp_t ref, ipfilter_sp_t opt)
{
- int rand_height = rand() % 100; // Randomly generated Height
- int rand_width = rand() % 100; // Randomly generated Width
- int16_t rand_val, rand_srcStride, rand_dstStride;
+ int rand_val, rand_srcStride, rand_dstStride;
- for (int i = 0; i <= 100; i++)
+ for (int i = 0; i <= 1000; i++)
{
+ int rand_height = rand() % 100; // Randomly generated Height
+ int rand_width = rand() % 100; // Randomly generated Width
+
memset(IPF_vec_output_p, 0, ipf_t_size); // Initialize output buffer to zero
memset(IPF_C_output_p, 0, ipf_t_size); // Initialize output buffer to zero
@@ -173,19 +174,29 @@
rand_srcStride = rand() % 100; // Randomly generated srcStride
rand_dstStride = rand() % 100; // Randomly generated dstStride
+ rand_width &= ~3;
+ if (rand_width < 4)
+ rand_width = 4;
+
+ if (rand_height <= 0)
+ rand_height = 1;
+
+ if (rand_dstStride < rand_width)
+ rand_dstStride = rand_width;
+
+ ref(short_buff + 3 * rand_srcStride,
+ rand_srcStride,
+ IPF_C_output_p,
+ rand_dstStride,
+ rand_width,
+ rand_height, rand_val
+ );
opt(short_buff + 3 * rand_srcStride,
rand_srcStride,
IPF_vec_output_p,
rand_dstStride,
rand_width,
- rand_height, g_lumaFilter[rand_val]
- );
- ref(short_buff + 3 * rand_srcStride,
- rand_srcStride,
- IPF_C_output_p,
- rand_dstStride,
- rand_width,
- rand_height, g_lumaFilter[rand_val]
+ rand_height, rand_val
);
if (memcmp(IPF_vec_output_p, IPF_C_output_p, ipf_t_size))
@@ -229,6 +240,48 @@
return true;
}
+bool IPFilterHarness::check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt)
+{
+ int16_t rand_srcStride;
+
+ for (int i = 0; i <= 1000; i++)
+ {
+ int16_t rand_height = (int16_t)rand() % 100; // Randomly generated Height
+ int16_t rand_width = (int16_t)rand() % 100; // Randomly generated Width
+
+ memset(IPF_vec_output_s, 0, ipf_t_size); // Initialize output buffer to zero
+ memset(IPF_C_output_s, 0, ipf_t_size); // Initialize output buffer to zero
+
+ rand_srcStride = rand_width + rand() % 100; // Randomly generated srcStride
+ if (rand_srcStride < rand_width)
+ rand_srcStride = rand_width;
+
+ rand_width %= 4;
+ if (rand_width < 4)
+ rand_width = 4;
+
+ rand_height %= 4;
+ if (rand_height < 4)
+ rand_height = 4;
+
+ ref(pixel_buff,
+ rand_srcStride,
+ IPF_C_output_s,
+ rand_width,
+ rand_height);
+ opt(pixel_buff,
+ rand_srcStride,
+ IPF_vec_output_s,
+ rand_width,
+ rand_height);
+
+ if (memcmp(IPF_vec_output_s, IPF_C_output_s, ipf_t_size))
+ return false;
+ }
+
+ return true;
+}
+
bool IPFilterHarness::check_IPFilter_primitive(ipfilter_s2p_t ref, ipfilter_s2p_t opt)
{
int16_t rand_height = (int16_t)rand() % 100; // Randomly generated Height
@@ -325,6 +378,40 @@
return true;
}
+bool IPFilterHarness::check_IPFilterLumaHV_primitive(filter_hv_pp_t ref, filter_hv_pp_t opt)
+{
+ int rand_srcStride, rand_dstStride, rand_coeffIdxX, rand_coeffIdxY;
+
+ for (int i = 0; i <= 1000; i++)
+ {
+ rand_coeffIdxX = rand() % 3; // Random coeff index in the filter
+ rand_coeffIdxY = rand() % 3; // Random coeff index in the filter
+
+ rand_srcStride = rand() % 100; // Randomly generated srcStride
+ rand_dstStride = rand() % 100; // Randomly generated dstStride
+
+ ref(pixel_buff + 3 * rand_srcStride,
+ rand_srcStride,
+ IPF_C_output_p,
+ rand_dstStride,
+ rand_coeffIdxX,
+ rand_coeffIdxY
+ );
+ opt(pixel_buff + 3 * rand_srcStride,
+ rand_srcStride,
+ IPF_vec_output_p,
+ rand_dstStride,
+ rand_coeffIdxX,
+ rand_coeffIdxY
+ );
+
+ if (memcmp(IPF_vec_output_p, IPF_C_output_p, ipf_t_size))
+ return false;
+ }
+
+ return true;
+}
+
bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
for (int value = 0; value < NUM_IPFILTER_P_P; value++)
@@ -372,6 +459,15 @@
}
}
+ if (opt.luma_p2s)
+ {
+ if (!check_IPFilter_primitive(ref.luma_p2s, opt.luma_p2s))
+ {
+ printf("ipfilter_p2s failed\n");
+ return false;
+ }
+ }
+
if (opt.ipfilter_s2p)
{
if (!check_IPFilter_primitive(ref.ipfilter_s2p, opt.ipfilter_s2p))
@@ -421,6 +517,18 @@
}
}
+ for (int value = 0; value < NUM_LUMA_PARTITIONS; value++)
+ {
+ if (opt.luma_hvpp[value])
+ {
+ if (!check_IPFilterLumaHV_primitive(ref.luma_hvpp[value], opt.luma_hvpp[value]))
+ {
+ printf("luma_hvpp[%s]", lumaPartStr[value]);
+ return false;
+ }
+ }
+ }
+
return true;
}
@@ -460,7 +568,7 @@
printf("ipfilter_sp %d\t", 8 / (value + 1));
REPORT_SPEEDUP(opt.ipfilter_sp[value], ref.ipfilter_sp[value],
short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
- IPF_vec_output_p, dstStride, width, height, g_lumaFilter[val]);
+ IPF_vec_output_p, dstStride, width, height, val);
}
}
@@ -486,6 +594,7 @@
REPORT_SPEEDUP(opt.luma_hpp[value], ref.luma_hpp[value],
pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
}
+
if (opt.luma_vpp[value])
{
printf("luma_vpp[%s]\t", lumaPartStr[value]);
@@ -493,6 +602,13 @@
pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
IPF_vec_output_p, dstStride, 1);
}
+
+ if (opt.luma_hvpp[value])
+ {
+ printf("luma_hv [%s]\t", lumaPartStr[value]);
+ REPORT_SPEEDUP(opt.luma_hvpp[value], ref.luma_hvpp[value],
+ pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1, 3);
+ }
}
for (int value = 0; value < NUM_CHROMA_PARTITIONS; value++)
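The new check_IPFilter_primitive (filter_p2s_t) and check_IPFilterLumaHV_primitive overloads above follow the harness's standard pattern: run the C reference and the optimized primitive on the same randomized inputs, then memcmp the whole output buffer. A generic sketch of that pattern with a placeholder function-pointer type and caller-provided buffers (not the harness's real typedefs):

#include <cstdint>
#include <cstdlib>
#include <cstring>

// Placeholder typedef for illustration; models a generic filter primitive.
typedef void (*filter_fn)(const int16_t* src, int srcStride,
                          int16_t* dst, int dstStride, int width, int height);

static bool checkPrimitive(filter_fn ref, filter_fn opt,
                           const int16_t* src, int16_t* refOut,
                           int16_t* optOut, size_t bufSize)
{
    for (int i = 0; i < 1000; i++)
    {
        memset(refOut, 0, bufSize);           // both outputs start identical
        memset(optOut, 0, bufSize);

        int width  = 4 * (1 + rand() % 16);   // multiples of 4, as the checks enforce
        int height = 1 + rand() % 64;
        int stride = width + rand() % 64;     // stride covers at least one row

        ref(src, stride, refOut, stride, width, height);
        opt(src, stride, optOut, stride, width, height);

        if (memcmp(refOut, optOut, bufSize))  // any byte difference fails the primitive
            return false;
    }

    return true;
}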
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/test/ipfilterharness.h
--- a/source/test/ipfilterharness.h Thu Oct 31 15:40:28 2013 +0530
+++ b/source/test/ipfilterharness.h Thu Oct 31 18:43:03 2013 +0530
@@ -45,9 +45,11 @@
bool check_IPFilter_primitive(ipfilter_ps_t ref, ipfilter_ps_t opt);
bool check_IPFilter_primitive(ipfilter_sp_t ref, ipfilter_sp_t opt);
bool check_IPFilter_primitive(ipfilter_p2s_t ref, ipfilter_p2s_t opt);
+ bool check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt);
bool check_IPFilter_primitive(ipfilter_s2p_t ref, ipfilter_s2p_t opt);
bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt);
bool check_IPFilterLuma_primitive(filter_pp_t ref, filter_pp_t opt);
+ bool check_IPFilterLumaHV_primitive(filter_hv_pp_t ref, filter_hv_pp_t opt);
public:
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/test/pixelharness.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -528,6 +528,31 @@
return true;
}
+bool PixelHarness::check_block_copy_pp(copy_pp_t ref, copy_pp_t opt)
+{
+ ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
+ ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+
+ // we don't know the partition size, so we compare the entire output buffer
+ // and must therefore initialize both buffers
+ memset(ref_dest, 0, sizeof(ref_dest));
+ memset(opt_dest, 0, sizeof(opt_dest));
+
+ int j = 0;
+ for (int i = 0; i < ITERS; i++)
+ {
+ opt(opt_dest, STRIDE, pbuf2 + j, STRIDE);
+ ref(ref_dest, STRIDE, pbuf2 + j, STRIDE);
+
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+ return false;
+
+ j += INCR;
+ }
+
+ return true;
+}
+
bool PixelHarness::testPartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
if (opt.satd[part])
@@ -611,6 +636,24 @@
}
}
+ if (opt.luma_copy_pp[part])
+ {
+ if (!check_block_copy_pp(ref.luma_copy_pp[part], opt.luma_copy_pp[part]))
+ {
+ printf("luma_copy_pp[%s] failed\n", lumaPartStr[part]);
+ return false;
+ }
+ }
+
+ if (opt.chroma_copy_pp[part])
+ {
+ if (!check_block_copy_pp(ref.chroma_copy_pp[part], opt.chroma_copy_pp[part]))
+ {
+ printf("chroma_copy_pp[%s] failed\n", chromaPartStr[part]);
+ return false;
+ }
+ }
+
return true;
}
@@ -769,6 +812,7 @@
return false;
}
}
+
return true;
}
@@ -830,6 +874,18 @@
printf("sse_ss[%s]", lumaPartStr[part]);
REPORT_SPEEDUP(opt.sse_ss[part], ref.sse_ss[part], (int16_t*)pbuf1, STRIDE, (int16_t*)fref, STRIDE);
}
+
+ if (opt.luma_copy_pp[part])
+ {
+ printf("luma_copy_pp[%s]", lumaPartStr[part]);
+ REPORT_SPEEDUP(opt.luma_copy_pp[part], ref.luma_copy_pp[part], pbuf1, 64, pbuf2, 128);
+ }
+
+ if (opt.chroma_copy_pp[part])
+ {
+ printf("chroma_copy_pp[%s]", chromaPartStr[part]);
+ REPORT_SPEEDUP(opt.chroma_copy_pp[part], ref.chroma_copy_pp[part], pbuf1, 64, pbuf2, 128);
+ }
}
void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
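The speed side reuses the harness's REPORT_SPEEDUP macro for the new copy primitives; conceptually it is a timed ref-vs-opt comparison over many iterations, roughly as in the simplified stand-in below (wall-clock based, not the macro's actual cycle-counting implementation):

#include <chrono>

// Simplified stand-in: time repeated calls of a routine and return seconds.
// The real harness measures cycles and prints the opt/ref speedup instead.
template <typename Fn, typename... Args>
static double timeCalls(Fn fn, int iters, Args&&... args)
{
    auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < iters; i++)
        fn(args...);                          // arguments are reused on every call
    auto end = std::chrono::steady_clock::now();
    return std::chrono::duration<double>(end - start).count();
}

With the new primitives this would be driven the same way REPORT_SPEEDUP is used above, e.g. comparing timeCalls(ref.luma_copy_pp[part], N, pbuf1, 64, pbuf2, 128) against the opt variant.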
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/test/pixelharness.h
--- a/source/test/pixelharness.h Thu Oct 31 15:40:28 2013 +0530
+++ b/source/test/pixelharness.h Thu Oct 31 18:43:03 2013 +0530
@@ -57,6 +57,7 @@
bool check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt);
bool check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt);
+ bool check_block_copy_pp(copy_pp_t ref, copy_pp_t opt);
public:
PixelHarness();
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/test/testpool.cpp
--- a/source/test/testpool.cpp Thu Oct 31 15:40:28 2013 +0530
+++ b/source/test/testpool.cpp Thu Oct 31 18:43:03 2013 +0530
@@ -30,6 +30,7 @@
#include <time.h>
#include <assert.h>
#include <string.h>
+#include <stdio.h>
#include <sstream>
#include <iostream>