[x265] [PATCH 2 of 3] Merge: (common files) check need of signed/unsigned int

kavitha at multicorewareinc.com
Thu Oct 31 14:48:03 CET 2013


# HG changeset patch
# User Kavitha Sampath <kavitha at multicorewareinc.com>
# Date 1383225183 -19800
#      Thu Oct 31 18:43:03 2013 +0530
# Node ID 2cdef1dd17b2d66dc5a84f2e40ae3130a3f9e325
# Parent  9bff4295adfc760e9fdebb6c9499e4a3b2cb7fab
# Parent  9a0da4e6d9e363e383eae7243f0c64026a5f6d00
Merge: (common files) check need of signed/unsigned int

diff -r 9bff4295adfc -r 2cdef1dd17b2 .hgtags
--- a/.hgtags	Thu Oct 31 15:40:28 2013 +0530
+++ b/.hgtags	Thu Oct 31 18:43:03 2013 +0530
@@ -6,3 +6,4 @@
 3767fbfa970ff4b2dc2e8647db0274168727147e 0.3
 2ba6ec553f218d2b06ad803b87d6ec751fd639f7 0.4
 93707bc4fccdaa89a1f2da11db8808ca912a691c 0.4.1
+69acb3cb777f977f5edde908069ac565915dd366 0.5
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibCommon/TComDataCU.cpp
--- a/source/Lib/TLibCommon/TComDataCU.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibCommon/TComDataCU.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -247,6 +247,12 @@
     m_totalBits        = 0;
     m_numPartitions    = pic->getNumPartInCU();
 
+    for (int i = 0; i < 4; i++)
+    {
+        m_avgCost[i] = 0;
+        m_count[i] = 0;
+    }
+
     // CHECK_ME: why partStartIdx always negative
     int partStartIdx = 0 - (cuAddr) * pic->getNumPartInCU();
 
@@ -287,7 +293,6 @@
     if (numElements > 0)
     {
         memset(m_skipFlag         + firstElement, false,                    numElements * sizeof(*m_skipFlag));
-        memset(m_partSizes        + firstElement, SIZE_NONE,                numElements * sizeof(*m_partSizes));
         memset(m_predModes        + firstElement, MODE_NONE,                numElements * sizeof(*m_predModes));
         memset(m_cuTransquantBypass + firstElement, false,                  numElements * sizeof(*m_cuTransquantBypass));
         memset(m_depth            + firstElement, 0,                        numElements * sizeof(*m_depth));
@@ -297,8 +302,6 @@
         memset(m_transformSkip[2] + firstElement, 0,                        numElements * sizeof(*m_transformSkip[2]));
         memset(m_width            + firstElement, g_maxCUWidth,             numElements * sizeof(*m_width));
         memset(m_height           + firstElement, g_maxCUHeight,            numElements * sizeof(*m_height));
-        memset(m_mvpIdx[0]        + firstElement, -1,                       numElements * sizeof(*m_mvpIdx[0]));
-        memset(m_mvpIdx[1]        + firstElement, -1,                       numElements * sizeof(*m_mvpIdx[1]));
         memset(m_mvpNum[0]        + firstElement, -1,                       numElements * sizeof(*m_mvpNum[0]));
         memset(m_mvpNum[1]        + firstElement, -1,                       numElements * sizeof(*m_mvpNum[1]));
         memset(m_qp               + firstElement, getSlice()->getSliceQp(), numElements * sizeof(*m_qp));
@@ -470,6 +473,12 @@
     m_totalBits        = 0;
     m_numPartitions    = cu->getTotalNumPart() >> 2;
 
+    for (int i = 0; i < 4; i++)
+    {
+        m_avgCost[i] = cu->m_avgCost[i];
+        m_count[i] = cu->m_count[i];
+    }
+
     int iSizeInUchar = sizeof(UChar) * m_numPartitions;
     int iSizeInBool  = sizeof(bool) * m_numPartitions;
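
The new m_avgCost[]/m_count[] pair carries a per-depth running average of CU costs across the frame; this patch only initializes the fields and copies them from the parent CU, so the update site presumably lives elsewhere in the series. A minimal sketch of how such a running average is typically maintained, with updateAvgCost as a hypothetical helper that is not part of this patch:

    // Hypothetical helper: fold one more CU cost into the per-depth average.
    void updateAvgCost(TComDataCU* cu, uint32_t depth, UInt64 cost)
    {
        cu->m_avgCost[depth] = (cu->m_avgCost[depth] * cu->m_count[depth] + cost)
                               / (cu->m_count[depth] + 1);
        cu->m_count[depth]++;
    }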
 
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibCommon/TComDataCU.h
--- a/source/Lib/TLibCommon/TComDataCU.h	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibCommon/TComDataCU.h	Thu Oct 31 18:43:03 2013 +0530
@@ -178,6 +178,8 @@
     UInt64        m_totalCost;       ///< sum of partition RD costs
     uint32_t      m_totalDistortion; ///< sum of partition distortion
     uint32_t      m_totalBits;       ///< sum of partition signal bits
+    UInt64        m_avgCost[4];      ///< average cost of CUs in the frame, indexed by depth
+    uint32_t      m_count[4];        ///< number of CUs contributing to each average
 
     // -------------------------------------------------------------------------------------------------------------------
     // create / destroy / initialize / copy
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibCommon/TComMotionInfo.cpp
--- a/source/Lib/TLibCommon/TComMotionInfo.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibCommon/TComMotionInfo.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -89,12 +89,6 @@
 
 void TComCUMvField::clearMvField()
 {
-    for (int i = 0; i < m_numPartitions; i++)
-    {
-        m_mv[i] = 0;
-        m_mvd[i] = 0;
-    }
-
     assert(sizeof(*m_refIdx) == 1);
     memset(m_refIdx, NOT_VALID, m_numPartitions * sizeof(*m_refIdx));
 }
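
Dropping the per-partition zeroing leaves stale values behind in m_mv[] and m_mvd[], which is only safe if every reader gates on the reference index first; the memset of m_refIdx to NOT_VALID is what preserves that invariant. A sketch of the assumed access pattern:

    // Assumed invariant: a partition with refIdx == NOT_VALID is never used,
    // so its (stale) m_mv/m_mvd entries are never read without this check.
    if (cuMvField->getRefIdx(partAddr) >= 0)
        mv = cuMvField->getMv(partAddr);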
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibCommon/TComMotionInfo.h
--- a/source/Lib/TLibCommon/TComMotionInfo.h	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibCommon/TComMotionInfo.h	Thu Oct 31 18:43:03 2013 +0530
@@ -84,7 +84,7 @@
 /// class for motion information in one CU
 class TComCUMvField
 {
-private:
+public:
 
     MV* m_mv;
     MV* m_mvd;
@@ -95,8 +95,6 @@
     template<typename T>
     void setAll(T *p, T const & val, PartSize cuMode, int partAddr, uint32_t depth, int partIdx);
 
-public:
-
     TComCUMvField() : m_mv(NULL), m_mvd(NULL), m_refIdx(NULL), m_numPartitions(0) {}
 
     ~TComCUMvField() {}
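
Promoting the members to public is what lets the merge-estimation loop later in this patch write candidate MVs directly instead of going through the setAll*() helpers, e.g. (from the TEncSearch.cpp hunk below):

    cu->getCUMvField(REF_PIC_LIST_0)->m_mv[absPartIdx] = mvFieldNeighbours[0 + 2 * mergeCand].mv;
    cu->getCUMvField(REF_PIC_LIST_0)->m_refIdx[absPartIdx] = mvFieldNeighbours[0 + 2 * mergeCand].refIdx;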
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibCommon/TComPrediction.cpp
--- a/source/Lib/TLibCommon/TComPrediction.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibCommon/TComPrediction.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -499,7 +499,7 @@
         int filterSize = NTAPS_LUMA;
         int halfFilterSize = (filterSize >> 1);
         primitives.ipfilter_ps[FILTER_H_P_S_8](src - (halfFilterSize - 1) * srcStride,  srcStride, m_immedVals, tmpStride, width, height + filterSize - 1, g_lumaFilter[xFrac]);
-        primitives.ipfilter_sp[FILTER_V_S_P_8](m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, width, height, g_lumaFilter[yFrac]);
+        primitives.ipfilter_sp[FILTER_V_S_P_8](m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, width, height, yFrac);
     }
 }
 
@@ -516,6 +516,9 @@
     int xFrac = mv->x & 0x3;
     int yFrac = mv->y & 0x3;
 
+    assert((width % 4) + (height % 4) == 0);
+    assert(dstStride == MAX_CU_SIZE);
+
     if ((yFrac | xFrac) == 0)
     {
         primitives.ipfilter_p2s(ref, refStride, dst, dstStride, width, height);
@@ -590,10 +593,10 @@
         int halfFilterSize = (filterSize >> 1);
 
         primitives.ipfilter_ps[FILTER_H_P_S_4](refCb - (halfFilterSize - 1) * refStride, refStride, m_immedVals, extStride, cxWidth, cxHeight + filterSize - 1, g_chromaFilter[xFrac]);
-        primitives.ipfilter_sp[FILTER_V_S_P_4](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, cxWidth, cxHeight, g_chromaFilter[yFrac]);
+        primitives.ipfilter_sp[FILTER_V_S_P_4](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, cxWidth, cxHeight, yFrac);
 
         primitives.ipfilter_ps[FILTER_H_P_S_4](refCr - (halfFilterSize - 1) * refStride, refStride, m_immedVals, extStride, cxWidth, cxHeight + filterSize - 1, g_chromaFilter[xFrac]);
-        primitives.ipfilter_sp[FILTER_V_S_P_4](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, cxWidth, cxHeight, g_chromaFilter[yFrac]);
+        primitives.ipfilter_sp[FILTER_V_S_P_4](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, cxWidth, cxHeight, yFrac);
     }
 }
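
The vertical short-to-pel filter now receives the fractional offset itself rather than a pointer into the coefficient table; the g_lumaFilter/g_chromaFilter lookup moves inside the primitive (see the ipfilter.cpp hunk below), which also leaves assembly implementations free to select coefficients however suits them. At the call site the change is just:

    // before:
    primitives.ipfilter_sp[FILTER_V_S_P_8](..., width, height, g_lumaFilter[yFrac]);
    // after:
    primitives.ipfilter_sp[FILTER_V_S_P_8](..., width, height, yFrac);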
 
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibCommon/TComTrQuant.cpp
--- a/source/Lib/TLibCommon/TComTrQuant.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -640,7 +640,7 @@
                 {
                     uint32_t   posY   = blkPos >> log2BlkSize;
                     uint32_t   posX   = blkPos - (posY << log2BlkSize);
-                    UShort ctxSig = getSigCtxInc(patternSigCtx, scanIdx, posX, posY, log2BlkSize, ttype);
+                    uint16_t ctxSig = getSigCtxInc(patternSigCtx, scanIdx, posX, posY, log2BlkSize, ttype);
                     level         = xGetCodedLevel(costCoeff[scanPos], costCoeff0[scanPos], costSig[scanPos],
                                                    levelDouble, maxAbsLevel, ctxSig, oneCtx, absCtx, goRiceParam,
                                                    c1Idx, c2Idx, qbits, scaleFactor, 0);
@@ -1149,10 +1149,10 @@
                                         double& codedCostSig,
                                         int     levelDouble,
                                         uint32_t    maxAbsLevel,
-                                        UShort  ctxNumSig,
-                                        UShort  ctxNumOne,
-                                        UShort  ctxNumAbs,
-                                        UShort  absGoRice,
+                                        uint16_t  ctxNumSig,
+                                        uint16_t  ctxNumOne,
+                                        uint16_t  ctxNumAbs,
+                                        uint16_t  absGoRice,
                                         uint32_t    c1Idx,
                                         uint32_t    c2Idx,
                                         int     qbits,
@@ -1207,9 +1207,9 @@
  * \returns cost of given absolute transform level
  */
 inline double TComTrQuant::xGetICRateCost(uint32_t   absLevel,
-                                          UShort ctxNumOne,
-                                          UShort ctxNumAbs,
-                                          UShort absGoRice,
+                                          uint16_t ctxNumOne,
+                                          uint16_t ctxNumAbs,
+                                          uint16_t absGoRice,
                                           uint32_t   c1Idx,
                                           uint32_t   c2Idx) const
 {
@@ -1263,9 +1263,9 @@
 }
 
 inline int TComTrQuant::xGetICRate(uint32_t   absLevel,
-                                   UShort ctxNumOne,
-                                   UShort ctxNumAbs,
-                                   UShort absGoRice,
+                                   uint16_t ctxNumOne,
+                                   uint16_t ctxNumAbs,
+                                   uint16_t absGoRice,
                                    uint32_t   c1Idx,
                                    uint32_t   c2Idx) const
 {
@@ -1290,8 +1290,8 @@
             symbol = std::min<uint32_t>(symbol, (maxVlc + 1));
         }
 
-        UShort prefLen = UShort(symbol >> absGoRice) + 1;
-        UShort numBins = std::min<uint32_t>(prefLen, g_goRicePrefixLen[absGoRice]) + absGoRice;
+        uint16_t prefLen = uint16_t(symbol >> absGoRice) + 1;
+        uint16_t numBins = std::min<uint32_t>(prefLen, g_goRicePrefixLen[absGoRice]) + absGoRice;
 
         rate += numBins << 15;
 
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibCommon/TComTrQuant.h
--- a/source/Lib/TLibCommon/TComTrQuant.h	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibCommon/TComTrQuant.h	Thu Oct 31 18:43:03 2013 +0530
@@ -200,18 +200,18 @@
     uint32_t xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, TCoeff* dstCoeff, uint32_t width, uint32_t height, TextType ttype, uint32_t absPartIdx, int32_t *lastPos);
 
     inline uint32_t xGetCodedLevel(double& codedCost, double& codedCost0, double& codedCostSig, int levelDouble,
-                               uint32_t maxAbsLevel, UShort ctxNumSig, UShort ctxNumOne, UShort ctxNumAbs, UShort absGoRice,
+                               uint32_t maxAbsLevel, uint16_t ctxNumSig, uint16_t ctxNumOne, uint16_t ctxNumAbs, uint16_t absGoRice,
                                uint32_t c1Idx, uint32_t c2Idx, int qbits, double scale, bool bLast) const;
 
-    inline double xGetICRateCost(uint32_t absLevel, UShort ctxNumOne, UShort ctxNumAbs, UShort absGoRice, uint32_t c1Idx, uint32_t c2Idx) const;
+    inline double xGetICRateCost(uint32_t absLevel, uint16_t ctxNumOne, uint16_t ctxNumAbs, uint16_t absGoRice, uint32_t c1Idx, uint32_t c2Idx) const;
 
-    inline int    xGetICRate(uint32_t absLevel, UShort ctxNumOne, UShort ctxNumAbs, UShort absGoRice, uint32_t c1Idx, uint32_t c2Idx) const;
+    inline int    xGetICRate(uint32_t absLevel, uint16_t ctxNumOne, uint16_t ctxNumAbs, uint16_t absGoRice, uint32_t c1Idx, uint32_t c2Idx) const;
 
     inline double xGetRateLast(uint32_t posx, uint32_t posy) const;
 
-    inline double xGetRateSigCoeffGroup(UShort sigCoeffGroup, UShort ctxNumSig) const { return m_lambda * m_estBitsSbac->significantCoeffGroupBits[ctxNumSig][sigCoeffGroup]; }
+    inline double xGetRateSigCoeffGroup(uint16_t sigCoeffGroup, uint16_t ctxNumSig) const { return m_lambda * m_estBitsSbac->significantCoeffGroupBits[ctxNumSig][sigCoeffGroup]; }
 
-    inline double xGetRateSigCoef(UShort sig, UShort ctxNumSig) const { return m_lambda * m_estBitsSbac->significantBits[ctxNumSig][sig]; }
+    inline double xGetRateSigCoef(uint16_t sig, uint16_t ctxNumSig) const { return m_lambda * m_estBitsSbac->significantBits[ctxNumSig][sig]; }
 
     inline double xGetICost(double rage) const { return m_lambda * rage; } ///< Get the cost for a specific rate
 
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibCommon/TypeDef.h
--- a/source/Lib/TLibCommon/TypeDef.h	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibCommon/TypeDef.h	Thu Oct 31 18:43:03 2013 +0530
@@ -52,7 +52,6 @@
 // ====================================================================================================================
 
 typedef unsigned char  UChar;
-typedef unsigned short UShort;
 
 // ====================================================================================================================
 // 64-bit integer type
@@ -71,7 +70,7 @@
 // ====================================================================================================================
 
 #if HIGH_BIT_DEPTH
-typedef UShort Pel;            // 16-bit pixel type
+typedef uint16_t Pel;            // 16-bit pixel type
 #define X265_DEPTH x265::g_bitDepth  // runtime configurable bit depth
 extern uint32_t g_bitDepth;
 #else
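
Removing the project-local UShort typedef forces every call site onto the <stdint.h> fixed-width names, making width and signedness explicit, which is the stated point of this series. Hence the mechanical UShort -> uint16_t churn in TComTrQuant above; under HIGH_BIT_DEPTH the pixel type is now spelled directly:

    #if HIGH_BIT_DEPTH
    typedef uint16_t Pel;  // 16-bit pixel type
    #endif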
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibEncoder/NALwrite.cpp
--- a/source/Lib/TLibEncoder/NALwrite.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibEncoder/NALwrite.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -82,8 +82,8 @@
      *  - 0x00000302
      *  - 0x00000303
      */
-    uint32_t fsize = nalu.m_Bitstream.getByteStreamLength();
-    uint8_t* fifo = nalu.m_Bitstream.getFIFO();
+    uint32_t fsize = nalu.m_bitstream.getByteStreamLength();
+    uint8_t* fifo = nalu.m_bitstream.getFIFO();
     uint8_t* emulation = (uint8_t*)X265_MALLOC(uint8_t, fsize + EMULATION_SIZE);
     uint32_t nalsize = 0;
 
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibEncoder/NALwrite.h
--- a/source/Lib/TLibEncoder/NALwrite.h	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibEncoder/NALwrite.h	Thu Oct 31 18:43:03 2013 +0530
@@ -61,17 +61,17 @@
                   uint32_t        temporalID = 0,
                   uint32_t        reserved_zero_6bits = 0)
         : NALUnit(nalUnitType, temporalID, reserved_zero_6bits)
-        , m_Bitstream()
+        , m_bitstream()
     {}
 
     OutputNALUnit& operator =(const NALUnit& src)
     {
-        m_Bitstream.clear();
+        m_bitstream.clear();
         static_cast<NALUnit*>(this)->operator =(src);
         return *this;
     }
 
-    TComOutputBitstream m_Bitstream;
+    TComOutputBitstream m_bitstream;
 };
 
 void write(uint8_t*& out, OutputNALUnit& nalu, uint32_t& packetSize);
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -2115,7 +2115,7 @@
  * \param bValid
  * \returns void
  */
-void TEncSearch::xMergeEstimation(TComDataCU* cu, int puIdx, uint32_t& interDir, TComMvField* mvField, uint32_t& mergeIndex, uint32_t& outCost, TComMvField* mvFieldNeighbours, UChar* interDirNeighbours, int& numValidMergeCand)
+void TEncSearch::xMergeEstimation(TComDataCU* cu, int puIdx, uint32_t& interDir, TComMvField* mvField, uint32_t& mergeIndex, uint32_t& outCost, uint32_t& outbits, TComMvField* mvFieldNeighbours, UChar* interDirNeighbours, int& numValidMergeCand)
 {
     uint32_t absPartIdx = 0;
     int width = 0;
@@ -2145,10 +2145,10 @@
         uint32_t costCand = MAX_UINT;
         uint32_t bitsCand = 0;
 
-        PartSize size = cu->getPartitionSize(0);
-
-        cu->getCUMvField(REF_PIC_LIST_0)->setAllMvField(mvFieldNeighbours[0 + 2 * mergeCand], size, absPartIdx, 0, puIdx);
-        cu->getCUMvField(REF_PIC_LIST_1)->setAllMvField(mvFieldNeighbours[1 + 2 * mergeCand], size, absPartIdx, 0, puIdx);
+        cu->getCUMvField(REF_PIC_LIST_0)->m_mv[absPartIdx] = mvFieldNeighbours[0 + 2 * mergeCand].mv;
+        cu->getCUMvField(REF_PIC_LIST_0)->m_refIdx[absPartIdx] = mvFieldNeighbours[0 + 2 * mergeCand].refIdx;
+        cu->getCUMvField(REF_PIC_LIST_1)->m_mv[absPartIdx] = mvFieldNeighbours[1 + 2 * mergeCand].mv;
+        cu->getCUMvField(REF_PIC_LIST_1)->m_refIdx[absPartIdx] = mvFieldNeighbours[1 + 2 * mergeCand].refIdx;
 
         costCand = xGetInterPredictionError(cu, puIdx);
         bitsCand = mergeCand + 1;
@@ -2160,6 +2160,7 @@
         if (costCand < outCost)
         {
             outCost = costCand;
+            outbits = bitsCand;
             mvField[0] = mvFieldNeighbours[0 + 2 * mergeCand];
             mvField[1] = mvFieldNeighbours[1 + 2 * mergeCand];
             interDir = interDirNeighbours[mergeCand];
@@ -2226,6 +2227,8 @@
     UChar interDirNeighbours[MRG_MAX_NUM_CANDS];
     int numValidMergeCand = 0;
 
+    int totalmebits = 0;
+
     for (int partIdx = 0; partIdx < numPart; partIdx++)
     {
         uint32_t listCost[2] = { MAX_UINT, MAX_UINT };
@@ -2495,7 +2498,8 @@
 
             // find Merge result
             uint32_t mrgCost = MAX_UINT;
-            xMergeEstimation(cu, partIdx, mrgInterDir, mrgMvField, mrgIndex, mrgCost, mvFieldNeighbours, interDirNeighbours, numValidMergeCand);
+            uint32_t mrgBits = 0;
+            xMergeEstimation(cu, partIdx, mrgInterDir, mrgMvField, mrgIndex, mrgCost, mrgBits, mvFieldNeighbours, interDirNeighbours, numValidMergeCand);
             if (mrgCost < meCost)
             {
                 // set Merge result
@@ -2517,6 +2521,7 @@
 #if CU_STAT_LOGFILE
                 meCost += mrgCost;
 #endif
+                totalmebits += mrgBits;
             }
             else
             {
@@ -2530,11 +2535,18 @@
 #if CU_STAT_LOGFILE
                 meCost += meCost;
 #endif
+                totalmebits += mebits;
             }
         }
+        else
+        {
+            totalmebits += mebits;
+        }
         motionCompensation(cu, predYuv, REF_PIC_LIST_X, partIdx, bLuma, bChroma);
     }
 
+    cu->m_totalBits = totalmebits;
+
     setWpScalingDistParam(cu, -1, REF_PIC_LIST_X);
 }
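
Net effect of the outbits plumbing: each partition contributes either its merge bits or its regular ME bits, and the per-CU sum lands in m_totalBits. Condensed from the hunks above:

    uint32_t mrgBits = 0;
    xMergeEstimation(cu, partIdx, ..., mrgCost, mrgBits, ...);
    if (mrgCost < meCost)
        totalmebits += mrgBits;   // merge candidate won
    else
        totalmebits += mebits;    // regular motion estimation won
    ...
    cu->m_totalBits = totalmebits;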
 
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/Lib/TLibEncoder/TEncSearch.h
--- a/source/Lib/TLibEncoder/TEncSearch.h	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.h	Thu Oct 31 18:43:03 2013 +0530
@@ -211,7 +211,7 @@
     void xGetBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3]);
 
     void xMergeEstimation(TComDataCU* cu, int partIdx, uint32_t& uiInterDir,
-                          TComMvField* pacMvField, uint32_t& mergeIndex, uint32_t& outCost,
+                          TComMvField* pacMvField, uint32_t& mergeIndex, uint32_t& outCost, uint32_t& outbits,
                           TComMvField* mvFieldNeighbors, UChar* interDirNeighbors, int& numValidMergeCand);
 
     void xRestrictBipredMergeCand(TComDataCU* cu, uint32_t puIdx, TComMvField* mvFieldNeighbours,
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/common.cpp
--- a/source/common/common.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/common.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -186,7 +186,7 @@
 
     /* Rate control options */
     param->rc.bitrate = 0;
-    param->rc.rateTolerance = 0.1;
+    param->rc.rateTolerance = 1.0;
     param->rc.qCompress = 0.6;
     param->rc.ipFactor = 1.4f;
     param->rc.pbFactor = 1.3f;
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/ipfilter.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -37,12 +37,14 @@
 
 namespace {
 template<int N>
-void filterVertical_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int16_t const *coeff)
+void filterVertical_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int coeffIdx)
 {
     int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
     int shift = IF_FILTER_PREC + headRoom;
     int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
-    int16_t maxVal = (1 << X265_DEPTH) - 1;
+    uint16_t maxVal = (1 << X265_DEPTH) - 1;
+    const int16_t *coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
+
     src -= (N / 2 - 1) * srcStride;
 
     int row, col;
@@ -82,7 +84,7 @@
 {
     int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
     int offset =  (1 << (headRoom - 1));
-    int16_t maxVal = (1 << X265_DEPTH) - 1;
+    uint16_t maxVal = (1 << X265_DEPTH) - 1;
     const int cStride = 1;
     src -= (N / 2 - 1) * cStride;
 
@@ -226,7 +228,7 @@
 {
     int shift = IF_INTERNAL_PREC - X265_DEPTH;
     int16_t offset = IF_INTERNAL_OFFS + (shift ? (1 << (shift - 1)) : 0);
-    int16_t maxVal = (1 << X265_DEPTH) - 1;
+    uint16_t maxVal = (1 << X265_DEPTH) - 1;
     int row, col;
     for (row = 0; row < height; row++)
     {
@@ -262,12 +264,30 @@
     }
 }
 
+void filterConvertPelToShort_c(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+{
+    int shift = IF_INTERNAL_PREC - X265_DEPTH;
+    int row, col;
+
+    for (row = 0; row < height; row++)
+    {
+        for (col = 0; col < width; col++)
+        {
+            int16_t val = src[col] << shift;
+            dst[col] = val - (int16_t)IF_INTERNAL_OFFS;
+        }
+
+        src += srcStride;
+        dst += MAX_CU_SIZE;
+    }
+}
+
 template<int N>
 void filterVertical_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int16_t const *c)
 {
     int shift = IF_FILTER_PREC;
     int offset = 1 << (shift - 1);
-    int16_t maxVal = (1 << X265_DEPTH) - 1;
+    uint16_t maxVal = (1 << X265_DEPTH) - 1;
     src -= (N / 2 - 1) * srcStride;
 
     int row, col;
@@ -328,7 +348,7 @@
     int16_t const * coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
     int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
     int offset =  (1 << (headRoom - 1));
-    int16_t maxVal = (1 << X265_DEPTH) - 1;
+    uint16_t maxVal = (1 << X265_DEPTH) - 1;
     int cStride = 1;
     src -= (N / 2 - 1) * cStride;
 
@@ -368,7 +388,7 @@
     int16_t const * c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
     int shift = IF_FILTER_PREC;
     int offset = 1 << (shift - 1);
-    int16_t maxVal = (1 << X265_DEPTH) - 1;
+    uint16_t maxVal = (1 << X265_DEPTH) - 1;
     src -= (N / 2 - 1) * srcStride;
 
     int row, col;
@@ -401,6 +421,17 @@
         dst += dstStride;
     }
 }
+typedef void (*ipfilter_ps_t)(pixel *src, intptr_t srcStride, short *dst, intptr_t dstStride, int width, int height, const short *coeff);
+typedef void (*ipfilter_sp_t)(short *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const short *coeff);
+
+template<int N, int width, int height>
+void interp_hv_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
+{
+    short m_immedVals[(64 + 8) * (64 + 8)];
+    filterHorizontal_ps_c<N>(src - 3 * srcStride, srcStride, m_immedVals, width, width, height + 7, g_lumaFilter[idxX]);
+    filterVertical_sp_c<N>(m_immedVals + 3 * width, width, dst, dstStride, width, height, idxY);
+}
+
 }
 
 namespace x265 {
@@ -412,7 +443,8 @@
 
 #define LUMA(W, H) \
     p.luma_hpp[LUMA_ ## W ## x ## H]     = interp_horiz_pp_c<8, W, H>;\
-    p.luma_vpp[LUMA_ ## W ## x ## H]     = interp_vert_pp_c<8, W, H>
+    p.luma_vpp[LUMA_ ## W ## x ## H]     = interp_vert_pp_c<8, W, H>; \
+    p.luma_hvpp[LUMA_ ## W ## x ## H]    = interp_hv_pp_c<8, W, H>;
 
 void Setup_C_IPFilterPrimitives(EncoderPrimitives& p)
 {
@@ -457,6 +489,7 @@
 
     p.ipfilter_p2s = filterConvertPelToShort_c;
     p.ipfilter_s2p = filterConvertShortToPel_c;
+    p.luma_p2s = filterConvertPelToShort_c;
 
     p.extendRowBorder = extendCURowColBorder;
 }
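
Two additions here beyond the maxVal signedness fixes: the new filterConvertPelToShort_c (apparently an overload of the existing conversion routine, registered as luma_p2s while the stride-taking variant stays on ipfilter_p2s) writes its output with a fixed MAX_CU_SIZE row stride, which is what the new asserts in TComPrediction.cpp above are guarding; and interp_hv_pp_c fuses the horizontal and vertical passes into a single luma H+V primitive. A hypothetical luma_p2s caller, assuming dstShort points into a MAX_CU_SIZE-strided buffer:

    // Note the missing dstStride parameter: rows advance by MAX_CU_SIZE.
    primitives.luma_p2s(fencY, fencStride, dstShort, width, height);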
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/pixel.cpp
--- a/source/common/pixel.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/pixel.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -758,6 +758,21 @@
         }
     }
 }
+
+template<int bx, int by>
+void blockcopy_pp_c(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb)
+{
+    for (int y = 0; y < by; y++)
+    {
+        for (int x = 0; x < bx; x++)
+        {
+            a[x] = b[x];
+        }
+
+        a += stridea;
+        b += strideb;
+    }
+}
 }  // end anonymous namespace
 
 namespace x265 {
@@ -798,6 +813,37 @@
     p.satd[LUMA_64x16] = satd8<64, 16>;
     p.satd[LUMA_16x64] = satd8<16, 64>;
 
+#define CHROMA(W, H) \
+    p.chroma_copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>
+#define LUMA(W, H) \
+    p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>
+
+    LUMA(4, 4);
+    LUMA(8, 8);   CHROMA(4, 4);
+    LUMA(4, 8);   CHROMA(2, 4);
+    LUMA(8, 4);   CHROMA(4, 2);
+    LUMA(16, 16); CHROMA(8, 8);
+    LUMA(16,  8); CHROMA(8, 4);
+    LUMA( 8, 16); CHROMA(4, 8);
+    LUMA(16, 12); CHROMA(8, 6);
+    LUMA(12, 16); CHROMA(6, 8);
+    LUMA(16,  4); CHROMA(8, 2);
+    LUMA( 4, 16); CHROMA(2, 8);
+    LUMA(32, 32); CHROMA(16, 16);
+    LUMA(32, 16); CHROMA(16, 8);
+    LUMA(16, 32); CHROMA(8, 16);
+    LUMA(32, 24); CHROMA(16, 12);
+    LUMA(24, 32); CHROMA(12, 16);
+    LUMA(32,  8); CHROMA(16, 4);
+    LUMA( 8, 32); CHROMA(4, 16);
+    LUMA(64, 64); CHROMA(32, 32);
+    LUMA(64, 32); CHROMA(32, 16);
+    LUMA(32, 64); CHROMA(16, 32);
+    LUMA(64, 48); CHROMA(32, 24);
+    LUMA(48, 64); CHROMA(24, 32);
+    LUMA(64, 16); CHROMA(32, 8);
+    LUMA(16, 64); CHROMA(8, 32);
+
     //sse
 #if HIGH_BIT_DEPTH
     SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixelcmp_t, int16_t, int16_t)
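
The new copy tables give every luma/chroma partition size a plain pixel block copy, selected by partition enum. Hypothetical use, copying a 16x16 luma block (dst assumed aligned, per the copy_pp_t note in primitives.h):

    primitives.luma_copy_pp[LUMA_16x16](dst, dstStride, src, srcStride);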
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/primitives.h
--- a/source/common/primitives.h	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/primitives.h	Thu Oct 31 18:43:03 2013 +0530
@@ -66,7 +66,7 @@
 { // Square     Rectangular             Asymmetrical (0.75, 0.25)
     LUMA_4x4,
     LUMA_8x8,   LUMA_8x4,   LUMA_4x8,
-    LUMA_16x16, LUMA_16x8,  LUMA_8x16,  LUMA_16x12, LUMA_12x16, LUMA_4x16,  LUMA_16x4,
+    LUMA_16x16, LUMA_16x8,  LUMA_8x16,  LUMA_16x12, LUMA_12x16, LUMA_16x4,  LUMA_4x16,
     LUMA_32x32, LUMA_32x16, LUMA_16x32, LUMA_32x24, LUMA_24x32, LUMA_32x8,  LUMA_8x32,
     LUMA_64x64, LUMA_64x32, LUMA_32x64, LUMA_64x48, LUMA_48x64, LUMA_64x16, LUMA_16x64,
     NUM_LUMA_PARTITIONS
@@ -165,7 +165,7 @@
 typedef void (*pixelcmp_x3_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, intptr_t frefstride, int32_t *res);
 typedef void (*ipfilter_pp_t)(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
 typedef void (*ipfilter_ps_t)(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
-typedef void (*ipfilter_sp_t)(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
+typedef void (*ipfilter_sp_t)(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
 typedef void (*ipfilter_ss_t)(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
 typedef void (*ipfilter_p2s_t)(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height);
 typedef void (*ipfilter_s2p_t)(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height);
@@ -209,6 +209,10 @@
 typedef void (*plane_copy_deinterleave_t)(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride, pixel *src,  intptr_t srcStride, int w, int h);
 
 typedef void (*filter_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_hv_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY);
+typedef void (*filter_p2s_t)(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+
+typedef void (*copy_pp_t)(pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned
 
 /* Define a structure containing function pointers to optimized encoder
  * primitives.  Each pointer can reference either an assembly routine,
@@ -234,6 +238,9 @@
     cvt16to16_shl_t cvt16to16_shl;
     cvt32to16_shr_t cvt32to16_shr;
 
+    copy_pp_t       luma_copy_pp[NUM_LUMA_PARTITIONS];
+    copy_pp_t       chroma_copy_pp[NUM_CHROMA_PARTITIONS];
+
     ipfilter_pp_t   ipfilter_pp[NUM_IPFILTER_P_P];
     ipfilter_ps_t   ipfilter_ps[NUM_IPFILTER_P_S];
     ipfilter_sp_t   ipfilter_sp[NUM_IPFILTER_S_P];
@@ -245,6 +252,8 @@
     filter_pp_t     luma_hpp[NUM_LUMA_PARTITIONS];
     filter_pp_t     chroma_vpp[NUM_CHROMA_PARTITIONS];
     filter_pp_t     luma_vpp[NUM_LUMA_PARTITIONS];
+    filter_hv_pp_t  luma_hvpp[NUM_LUMA_PARTITIONS];
+    filter_p2s_t    luma_p2s;
 
     intra_dc_t      intra_pred_dc;
     intra_planar_t  intra_pred_planar;
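
luma_hvpp gives callers a single entry point for the combined horizontal+vertical fractional case, replacing the two-call ipfilter_ps/ipfilter_sp sequence visible in TComPrediction.cpp. Hypothetical use for an 8x8 luma block with quarter-pel offsets xFrac, yFrac in [0, 3]:

    primitives.luma_hvpp[LUMA_8x8](src, srcStride, dst, dstStride, xFrac, yFrac);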
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/threadpool.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -369,7 +369,7 @@
 
 static int get_cpu_count()
 {
-#if WIN32
+#if _WIN32
     SYSTEM_INFO sysinfo;
     GetSystemInfo(&sysinfo);
     return sysinfo.dwNumberOfProcessors;
@@ -393,8 +393,8 @@
     }
 
     return count;
-#else // if WIN32
+#else // if _WIN32
     return 2; // default to 2 threads, everywhere else
-#endif // if WIN32
+#endif // if _WIN32
 }
 } // end namespace x265
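
WIN32 (no underscore) is only defined by certain headers and build systems, whereas _WIN32 is predefined by the compiler itself on every Windows target (MSVC, MinGW, clang-cl), so the old check could silently fall through to the 2-thread default on Windows builds that never defined WIN32:

    #if _WIN32   // compiler-predefined; WIN32 would need <windows.h> or the build system to define it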
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/vec/ipfilter-sse41.cpp
--- a/source/common/vec/ipfilter-sse41.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/vec/ipfilter-sse41.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -34,6 +34,8 @@
 #include <assert.h>
 #include <string.h>
 
+using namespace x265;
+
 #if !HIGH_BIT_DEPTH
 namespace {
 ALIGN_VAR_32(const uint16_t, c_512[16]) =
@@ -42,8 +44,10 @@
 };
 
 template<int N>
-void filterVertical_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int16_t const *coeff)
+void filterVertical_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int coeffIdx)
 {
+    const int16_t *coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
+
     src -= (N / 2 - 1) * srcStride;
 
     int offset;
@@ -677,8 +681,9 @@
 #include "vectorclass.h"
 namespace {
 template<int N>
-void filterVertical_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int block_width, int block_height, const int16_t *coeff)
+void filterVertical_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int block_width, int block_height, int coeffIdx)
 {
+    const int16_t *coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
     int row, col;
 
     src -= (N / 2 - 1) * srcStride;
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/vec/pixel-sse41.cpp
--- a/source/common/vec/pixel-sse41.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/vec/pixel-sse41.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -31,1335 +31,8 @@
 
 using namespace x265;
 
-#if defined(_MSC_VER)
-#pragma warning(disable: 4799) // MMX warning EMMS
-#endif
-
-#if defined(__INTEL_COMPILER) || defined(__GCC__)
-#define HAVE_MMX 1
-#elif defined(_MSC_VER) && defined(X86_64)
-#define HAVE_MMX 0
-#else
-#define HAVE_MMX 1
-#endif
-
 namespace {
 #if !HIGH_BIT_DEPTH
-#if HAVE_MMX
-template<int ly>
-// ly will always be 32
-int sad_8(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
-{
-    __m64 sum0 = _mm_setzero_si64();
-
-    __m64 T00, T01, T02, T03;
-    __m64 T10, T11, T12, T13;
-    __m64 T20, T21, T22, T23;
-
-    for (int i = 0; i < ly; i += 16)
-    {
-        T00 = (*(__m64*)(fenc + (i + 0) * fencstride));
-        T01 = (*(__m64*)(fenc + (i + 1) * fencstride));
-        T02 = (*(__m64*)(fenc + (i + 2) * fencstride));
-        T03 = (*(__m64*)(fenc + (i + 3) * fencstride));
-
-        T10 = (*(__m64*)(fref + (i + 0) * frefstride));
-        T11 = (*(__m64*)(fref + (i + 1) * frefstride));
-        T12 = (*(__m64*)(fref + (i + 2) * frefstride));
-        T13 = (*(__m64*)(fref + (i + 3) * frefstride));
-
-        T20 = _mm_sad_pu8(T00, T10);
-        T21 = _mm_sad_pu8(T01, T11);
-        T22 = _mm_sad_pu8(T02, T12);
-        T23 = _mm_sad_pu8(T03, T13);
-
-        sum0 = _mm_add_pi16(sum0, T20);
-        sum0 = _mm_add_pi16(sum0, T21);
-        sum0 = _mm_add_pi16(sum0, T22);
-        sum0 = _mm_add_pi16(sum0, T23);
-
-        T00 = (*(__m64*)(fenc + (i + 4) * fencstride));
-        T01 = (*(__m64*)(fenc + (i + 5) * fencstride));
-        T02 = (*(__m64*)(fenc + (i + 6) * fencstride));
-        T03 = (*(__m64*)(fenc + (i + 7) * fencstride));
-
-        T10 = (*(__m64*)(fref + (i + 4) * frefstride));
-        T11 = (*(__m64*)(fref + (i + 5) * frefstride));
-        T12 = (*(__m64*)(fref + (i + 6) * frefstride));
-        T13 = (*(__m64*)(fref + (i + 7) * frefstride));
-
-        T20 = _mm_sad_pu8(T00, T10);
-        T21 = _mm_sad_pu8(T01, T11);
-        T22 = _mm_sad_pu8(T02, T12);
-        T23 = _mm_sad_pu8(T03, T13);
-
-        sum0 = _mm_add_pi16(sum0, T20);
-        sum0 = _mm_add_pi16(sum0, T21);
-        sum0 = _mm_add_pi16(sum0, T22);
-        sum0 = _mm_add_pi16(sum0, T23);
-
-        T00 = (*(__m64*)(fenc + (i + 8) * fencstride));
-        T01 = (*(__m64*)(fenc + (i + 9) * fencstride));
-        T02 = (*(__m64*)(fenc + (i + 10) * fencstride));
-        T03 = (*(__m64*)(fenc + (i + 11) * fencstride));
-
-        T10 = (*(__m64*)(fref + (i + 8) * frefstride));
-        T11 = (*(__m64*)(fref + (i + 9) * frefstride));
-        T12 = (*(__m64*)(fref + (i + 10) * frefstride));
-        T13 = (*(__m64*)(fref + (i + 11) * frefstride));
-
-        T20 = _mm_sad_pu8(T00, T10);
-        T21 = _mm_sad_pu8(T01, T11);
-        T22 = _mm_sad_pu8(T02, T12);
-        T23 = _mm_sad_pu8(T03, T13);
-
-        sum0 = _mm_add_pi16(sum0, T20);
-        sum0 = _mm_add_pi16(sum0, T21);
-        sum0 = _mm_add_pi16(sum0, T22);
-        sum0 = _mm_add_pi16(sum0, T23);
-
-        T00 = (*(__m64*)(fenc + (i + 12) * fencstride));
-        T01 = (*(__m64*)(fenc + (i + 13) * fencstride));
-        T02 = (*(__m64*)(fenc + (i + 14) * fencstride));
-        T03 = (*(__m64*)(fenc + (i + 15) * fencstride));
-
-        T10 = (*(__m64*)(fref + (i + 12) * frefstride));
-        T11 = (*(__m64*)(fref + (i + 13) * frefstride));
-        T12 = (*(__m64*)(fref + (i + 14) * frefstride));
-        T13 = (*(__m64*)(fref + (i + 15) * frefstride));
-
-        T20 = _mm_sad_pu8(T00, T10);
-        T21 = _mm_sad_pu8(T01, T11);
-        T22 = _mm_sad_pu8(T02, T12);
-        T23 = _mm_sad_pu8(T03, T13);
-
-        sum0 = _mm_add_pi16(sum0, T20);
-        sum0 = _mm_add_pi16(sum0, T21);
-        sum0 = _mm_add_pi16(sum0, T22);
-        sum0 = _mm_add_pi16(sum0, T23);
-    }
-
-    // 8 * 255 -> 11 bits x 8 -> 14 bits
-    return _m_to_int(sum0);
-}
-
-#else /* if HAVE_MMX */
-
-template<int ly>
-// ly will always be 32
-int sad_8(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
-{
-    __m128i sum0 = _mm_setzero_si128();
-    __m128i sum1 = _mm_setzero_si128();
-    __m128i T00, T01, T02, T03;
-    __m128i T10, T11, T12, T13;
-    __m128i T20, T21;
-
-    for (int i = 0; i < ly; i += 8)
-    {
-        T00 = _mm_loadl_epi64((__m128i*)(fenc + (i + 0) * fencstride));
-        T01 = _mm_loadl_epi64((__m128i*)(fenc + (i + 1) * fencstride));
-        T01 = _mm_unpacklo_epi64(T00, T01);
-        T02 = _mm_loadl_epi64((__m128i*)(fenc + (i + 2) * fencstride));
-        T03 = _mm_loadl_epi64((__m128i*)(fenc + (i + 3) * fencstride));
-        T03 = _mm_unpacklo_epi64(T02, T03);
-
-        T10 = _mm_loadl_epi64((__m128i*)(fref + (i + 0) * frefstride));
-        T11 = _mm_loadl_epi64((__m128i*)(fref + (i + 1) * frefstride));
-        T11 = _mm_unpacklo_epi64(T10, T11);
-        T12 = _mm_loadl_epi64((__m128i*)(fref + (i + 2) * frefstride));
-        T13 = _mm_loadl_epi64((__m128i*)(fref + (i + 3) * frefstride));
-        T13 = _mm_unpacklo_epi64(T12, T13);
-        T20 = _mm_sad_epu8(T01, T11);
-        T21 = _mm_sad_epu8(T03, T13);
-
-        sum0 = _mm_add_epi32(sum0, T20);
-        sum1 = _mm_add_epi32(sum1, T21);
-
-        T00 = _mm_loadl_epi64((__m128i*)(fenc + (i + 4) * fencstride));
-        T01 = _mm_loadl_epi64((__m128i*)(fenc + (i + 5) * fencstride));
-        T01 = _mm_unpacklo_epi64(T00, T01);
-        T02 = _mm_loadl_epi64((__m128i*)(fenc + (i + 6) * fencstride));
-        T03 = _mm_loadl_epi64((__m128i*)(fenc + (i + 7) * fencstride));
-        T03 = _mm_unpacklo_epi64(T02, T03);
-
-        T10 = _mm_loadl_epi64((__m128i*)(fref + (i + 4) * frefstride));
-        T11 = _mm_loadl_epi64((__m128i*)(fref + (i + 5) * frefstride));
-        T11 = _mm_unpacklo_epi64(T10, T11);
-        T12 = _mm_loadl_epi64((__m128i*)(fref + (i + 6) * frefstride));
-        T13 = _mm_loadl_epi64((__m128i*)(fref + (i + 7) * frefstride));
-        T13 = _mm_unpacklo_epi64(T12, T13);
-        T20 = _mm_sad_epu8(T01, T11);
-        T21 = _mm_sad_epu8(T03, T13);
-
-        sum0 = _mm_add_epi32(sum0, T20);
-        sum1 = _mm_add_epi32(sum1, T21);
-    }
-
-    // [0 x 0 x]
-    sum0 = _mm_add_epi32(sum0, sum1);
-    sum1 = _mm_shuffle_epi32(sum0, 2);
-    sum0 = _mm_add_epi32(sum0, sum1);
-    return _mm_cvtsi128_si32(sum0);
-}
-
-#endif /* if HAVE_MMX */
-
-template<int ly>
-// will only be instanced with ly == 16
-int sad_12(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride)
-{
-    assert(ly == 16);
-    __m128i sum0 = _mm_setzero_si128();
-    __m128i sum1 = _mm_setzero_si128();
-    __m128i T00, T01, T02, T03;
-    __m128i T10, T11, T12, T13;
-    __m128i T20, T21, T22, T23;
-
-#define MASK _mm_set_epi32(0x00000000, 0xffffffff, 0xffffffff, 0xffffffff)
-
-#define PROCESS_12x4(BASE) \
-    T00 = _mm_load_si128((__m128i*)(fenc + (BASE + 0) * fencstride)); \
-    T00 = _mm_and_si128(T00, MASK); \
-    T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * fencstride)); \
-    T01 = _mm_and_si128(T01, MASK); \
-    T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * fencstride)); \
-    T02 = _mm_and_si128(T02, MASK); \
-    T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * fencstride)); \
-    T03 = _mm_and_si128(T03, MASK); \
-    T10 = _mm_loadu_si128((__m128i*)(fref + (BASE + 0) * frefstride)); \
-    T10 = _mm_and_si128(T10, MASK); \
-    T11 = _mm_loadu_si128((__m128i*)(fref + (BASE + 1) * frefstride)); \
-    T11 = _mm_and_si128(T11, MASK); \
-    T12 = _mm_loadu_si128((__m128i*)(fref + (BASE + 2) * frefstride)); \
-    T12 = _mm_and_si128(T12, MASK); \
-    T13 = _mm_loadu_si128((__m128i*)(fref + (BASE + 3) * frefstride)); \
-    T13 = _mm_and_si128(T13, MASK); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    sum0 = _mm_add_epi16(sum0, T20); \
-    sum0 = _mm_add_epi16(sum0, T21); \
-    sum0 = _mm_add_epi16(sum0, T22); \
-    sum0 = _mm_add_epi16(sum0, T23)
-
-    PROCESS_12x4(0);
-    PROCESS_12x4(4);
-    PROCESS_12x4(8);
-    PROCESS_12x4(12);
-
-    sum1 = _mm_shuffle_epi32(sum0, 2);
-    sum0 = _mm_add_epi32(sum0, sum1);
-
-    return _mm_cvtsi128_si32(sum0);
-}
-
-template<int ly>
-int sad_16(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
-{
-    __m128i sum0 = _mm_setzero_si128();
-    __m128i sum1 = _mm_setzero_si128();
-    __m128i T00, T01, T02, T03;
-    __m128i T10, T11, T12, T13;
-    __m128i T20, T21, T22, T23;
-
-#define PROCESS_16x4(BASE) \
-    T00 = _mm_load_si128((__m128i*)(fenc + (BASE + 0) * fencstride)); \
-    T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * fencstride)); \
-    T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * fencstride)); \
-    T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * fencstride)); \
-    T10 = _mm_loadu_si128((__m128i*)(fref + (BASE + 0) * frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref + (BASE + 3) * frefstride)); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    sum0 = _mm_add_epi16(sum0, T20); \
-    sum0 = _mm_add_epi16(sum0, T21); \
-    sum0 = _mm_add_epi16(sum0, T22); \
-    sum0 = _mm_add_epi16(sum0, T23)
-
-    PROCESS_16x4(0);
-    if (ly >= 8)
-    {
-        PROCESS_16x4(4);
-    }
-    if (ly >= 12)
-    {
-        PROCESS_16x4(8);
-    }
-    if (ly >= 16)
-    {
-        PROCESS_16x4(12);
-    }
-    if (ly > 16)
-    {
-        for (int i = 16; i < ly; i += 8)
-        {
-            PROCESS_16x4(i);
-            PROCESS_16x4(i + 4);
-        }
-    }
-
-    sum1 = _mm_shuffle_epi32(sum0, 2);
-    sum0 = _mm_add_epi32(sum0, sum1);
-
-    return _mm_cvtsi128_si32(sum0);
-}
-
-template<int ly>
-// always instanced for 32 rows
-int sad_24(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride)
-{
-    __m128i sum0 = _mm_setzero_si128();
-    __m128i sum1 = _mm_setzero_si128();
-    __m128i T00, T01, T02, T03;
-    __m128i T10, T11, T12, T13;
-    __m128i T20, T21, T22, T23;
-
-#define PROCESS_24x4(BASE) \
-    T00 = _mm_load_si128((__m128i*)(fenc + (BASE + 0) * fencstride)); \
-    T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * fencstride)); \
-    T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * fencstride)); \
-    T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * fencstride)); \
-    T10 = _mm_loadu_si128((__m128i*)(fref + (BASE + 0) * frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref + (BASE + 3) * frefstride)); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    sum0 = _mm_add_epi32(sum0, T20); \
-    sum0 = _mm_add_epi32(sum0, T21); \
-    sum0 = _mm_add_epi32(sum0, T22); \
-    sum0 = _mm_add_epi32(sum0, T23); \
-    T00 = _mm_loadl_epi64((__m128i*)(fenc + 16 + ((BASE + 0) * fencstride))); \
-    T01 = _mm_loadl_epi64((__m128i*)(fenc + 16 + ((BASE + 1) * fencstride))); \
-    T01 = _mm_unpacklo_epi64(T00, T01); \
-    T02 = _mm_loadl_epi64((__m128i*)(fenc + 16 + ((BASE + 2) * fencstride))); \
-    T03 = _mm_loadl_epi64((__m128i*)(fenc + 16 + ((BASE + 3) * fencstride))); \
-    T03 = _mm_unpacklo_epi64(T02, T03); \
-    T10 = _mm_loadl_epi64((__m128i*)(fref + 16 + ((BASE + 0) * frefstride))); \
-    T11 = _mm_loadl_epi64((__m128i*)(fref + 16 + ((BASE + 1) * frefstride))); \
-    T11 = _mm_unpacklo_epi64(T10, T11); \
-    T12 = _mm_loadl_epi64((__m128i*)(fref + 16 + ((BASE + 2) * frefstride))); \
-    T13 = _mm_loadl_epi64((__m128i*)(fref + 16 + ((BASE + 3) * frefstride))); \
-    T13 = _mm_unpacklo_epi64(T12, T13); \
-    T20 = _mm_setzero_si128(); \
-    T21 = _mm_setzero_si128(); \
-    T20 = _mm_sad_epu8(T01, T11); \
-    T21 = _mm_sad_epu8(T03, T13); \
-    sum0 = _mm_add_epi32(sum0, T20); \
-    sum0 = _mm_add_epi32(sum0, T21);
-
-    for (int i = 0; i < ly; i += 8)
-    {
-        PROCESS_24x4(i);
-        PROCESS_24x4(i + 4);
-    }
-
-    sum1 = _mm_shuffle_epi32(sum0, 2);
-    sum0 = _mm_add_epi32(sum0, sum1);
-
-    return _mm_cvtsi128_si32(sum0);
-}
-
-template<int ly>
-// ly will be 8, 16, 24, or 32
-int sad_32(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
-{
-    __m128i sum0 = _mm_setzero_si128();
-    __m128i sum1 = _mm_setzero_si128();
-    __m128i T00, T01, T02, T03;
-    __m128i T10, T11, T12, T13;
-    __m128i T20, T21, T22, T23;
-
-#define PROCESS_32x4(BASE) \
-    T00 = _mm_load_si128((__m128i*)(fenc + (BASE + 0) * fencstride)); \
-    T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * fencstride)); \
-    T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * fencstride)); \
-    T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * fencstride)); \
-    T10 = _mm_loadu_si128((__m128i*)(fref + (BASE + 0) * frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref + (BASE + 3) * frefstride)); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    sum0 = _mm_add_epi32(sum0, T20); \
-    sum0 = _mm_add_epi32(sum0, T21); \
-    sum0 = _mm_add_epi32(sum0, T22); \
-    sum0 = _mm_add_epi32(sum0, T23); \
-    T00 = _mm_load_si128((__m128i*)(fenc + 16 + (BASE + 0) * fencstride)); \
-    T01 = _mm_load_si128((__m128i*)(fenc + 16 + (BASE + 1) * fencstride)); \
-    T02 = _mm_load_si128((__m128i*)(fenc + 16 + (BASE + 2) * fencstride)); \
-    T03 = _mm_load_si128((__m128i*)(fenc + 16 + (BASE + 3) * fencstride)); \
-    T10 = _mm_loadu_si128((__m128i*)(fref + 16 + (BASE + 0) * frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref + 16 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref + 16 + (BASE + 3) * frefstride)); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    sum0 = _mm_add_epi32(sum0, T20); \
-    sum0 = _mm_add_epi32(sum0, T21); \
-    sum0 = _mm_add_epi32(sum0, T22); \
-    sum0 = _mm_add_epi32(sum0, T23);
-
-    for (int i = 0; i < ly; i += 8)
-    {
-        PROCESS_32x4(i);
-        PROCESS_32x4(i + 4);
-    }
-
-    sum1 = _mm_shuffle_epi32(sum0, 2);
-    sum0 = _mm_add_epi32(sum0, sum1);
-
-    return _mm_cvtsi128_si32(sum0);
-}
-
-template<int ly>
-int sad_48(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
-{
-    __m128i sum0 = _mm_setzero_si128();
-    __m128i sum1 = _mm_setzero_si128();
-
-    /* for ly = 64 */
-    for (int i = 0; i < ly; i += 8)
-    {
-        __m128i T00, T01, T02;
-        __m128i T10, T11, T12;
-        __m128i T20, T21, T22;
-
-        T00 = _mm_load_si128((__m128i*)(fenc + (i + 0) * fencstride));
-        T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 0) * fencstride));
-        T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 0) * fencstride));
-
-        T10 = _mm_loadu_si128((__m128i*)(fref + (i + 0) * frefstride));
-        T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 0) * frefstride));
-        T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 0) * frefstride));
-
-        T20 = _mm_sad_epu8(T00, T10);
-        T21 = _mm_sad_epu8(T01, T11);
-        T22 = _mm_sad_epu8(T02, T12);
-
-        sum0 = _mm_add_epi32(sum0, T20);
-        sum0 = _mm_add_epi32(sum0, T21);
-        sum0 = _mm_add_epi32(sum0, T22);
-
-        T00 = _mm_load_si128((__m128i*)(fenc + (i + 1) * fencstride));
-        T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 1) * fencstride));
-        T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 1) * fencstride));
-
-        T10 = _mm_loadu_si128((__m128i*)(fref + (i + 1) * frefstride));
-        T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 1) * frefstride));
-        T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 1) * frefstride));
-
-        T20 = _mm_sad_epu8(T00, T10);
-        T21 = _mm_sad_epu8(T01, T11);
-        T22 = _mm_sad_epu8(T02, T12);
-
-        sum0 = _mm_add_epi32(sum0, T20);
-        sum0 = _mm_add_epi32(sum0, T21);
-        sum0 = _mm_add_epi32(sum0, T22);
-
-        T00 = _mm_load_si128((__m128i*)(fenc + (i + 2) * fencstride));
-        T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 2) * fencstride));
-        T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 2) * fencstride));
-
-        T10 = _mm_loadu_si128((__m128i*)(fref + (i + 2) * frefstride));
-        T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 2) * frefstride));
-        T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 2) * frefstride));
-
-        T20 = _mm_sad_epu8(T00, T10);
-        T21 = _mm_sad_epu8(T01, T11);
-        T22 = _mm_sad_epu8(T02, T12);
-
-        sum0 = _mm_add_epi32(sum0, T20);
-        sum0 = _mm_add_epi32(sum0, T21);
-        sum0 = _mm_add_epi32(sum0, T22);
-
-        T00 = _mm_load_si128((__m128i*)(fenc + (i + 3) * fencstride));
-        T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 3) * fencstride));
-        T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 3) * fencstride));
-
-        T10 = _mm_loadu_si128((__m128i*)(fref + (i + 3) * frefstride));
-        T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 3) * frefstride));
-        T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 3) * frefstride));
-
-        T20 = _mm_sad_epu8(T00, T10);
-        T21 = _mm_sad_epu8(T01, T11);
-        T22 = _mm_sad_epu8(T02, T12);
-
-        sum0 = _mm_add_epi32(sum0, T20);
-        sum0 = _mm_add_epi32(sum0, T21);
-        sum0 = _mm_add_epi32(sum0, T22);
-
-        T00 = _mm_load_si128((__m128i*)(fenc + (i + 4) * fencstride));
-        T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 4) * fencstride));
-        T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 4) * fencstride));
-
-        T10 = _mm_loadu_si128((__m128i*)(fref + (i + 4) * frefstride));
-        T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 4) * frefstride));
-        T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 4) * frefstride));
-
-        T20 = _mm_sad_epu8(T00, T10);
-        T21 = _mm_sad_epu8(T01, T11);
-        T22 = _mm_sad_epu8(T02, T12);
-
-        sum0 = _mm_add_epi32(sum0, T20);
-        sum0 = _mm_add_epi32(sum0, T21);
-        sum0 = _mm_add_epi32(sum0, T22);
-
-        T00 = _mm_load_si128((__m128i*)(fenc + (i + 5) * fencstride));
-        T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 5) * fencstride));
-        T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 5) * fencstride));
-
-        T10 = _mm_loadu_si128((__m128i*)(fref + (i + 5) * frefstride));
-        T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 5) * frefstride));
-        T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 5) * frefstride));
-
-        T20 = _mm_sad_epu8(T00, T10);
-        T21 = _mm_sad_epu8(T01, T11);
-        T22 = _mm_sad_epu8(T02, T12);
-
-        sum0 = _mm_add_epi32(sum0, T20);
-        sum0 = _mm_add_epi32(sum0, T21);
-        sum0 = _mm_add_epi32(sum0, T22);
-
-        T00 = _mm_load_si128((__m128i*)(fenc + (i + 6) * fencstride));
-        T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 6) * fencstride));
-        T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 6) * fencstride));
-
-        T10 = _mm_loadu_si128((__m128i*)(fref + (i + 6) * frefstride));
-        T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 6) * frefstride));
-        T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 6) * frefstride));
-
-        T20 = _mm_sad_epu8(T00, T10);
-        T21 = _mm_sad_epu8(T01, T11);
-        T22 = _mm_sad_epu8(T02, T12);
-
-        sum0 = _mm_add_epi32(sum0, T20);
-        sum0 = _mm_add_epi32(sum0, T21);
-        sum0 = _mm_add_epi32(sum0, T22);
-
-        T00 = _mm_load_si128((__m128i*)(fenc + (i + 7) * fencstride));
-        T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 7) * fencstride));
-        T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 7) * fencstride));
-
-        T10 = _mm_loadu_si128((__m128i*)(fref + (i + 7) * frefstride));
-        T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 7) * frefstride));
-        T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 7) * frefstride));
-
-        T20 = _mm_sad_epu8(T00, T10);
-        T21 = _mm_sad_epu8(T01, T11);
-        T22 = _mm_sad_epu8(T02, T12);
-
-        sum0 = _mm_add_epi32(sum0, T20);
-        sum0 = _mm_add_epi32(sum0, T21);
-        sum0 = _mm_add_epi32(sum0, T22);
-    }
-
-    sum1 = _mm_shuffle_epi32(sum0, 2);
-    sum0 = _mm_add_epi32(sum0, sum1);
-    return _mm_cvtsi128_si32(sum0);
-}
-
-template<int ly>
-// ly will be 16, 32, 48, or 64
-int sad_64(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
-{
-    __m128i sum0 = _mm_setzero_si128();
-    __m128i sum1 = _mm_setzero_si128();
-
-    for (int i = 0; i < ly; i += 8)
-    {
-        __m128i T00, T01, T02, T03;
-        __m128i T10, T11, T12, T13;
-        __m128i T20, T21, T22, T23;
-
-        T00 = _mm_load_si128((__m128i*)(fenc + (i + 0) * fencstride));
-        T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 0) * fencstride));
-        T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 0) * fencstride));
-        T03 = _mm_load_si128((__m128i*)(fenc + 48 + (i + 0) * fencstride));
-
-        T10 = _mm_loadu_si128((__m128i*)(fref + (i + 0) * frefstride));
-        T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 0) * frefstride));
-        T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 0) * frefstride));
-        T13 = _mm_loadu_si128((__m128i*)(fref + 48 + (i + 0) * frefstride));
-
-        T20 = _mm_sad_epu8(T00, T10);
-        T21 = _mm_sad_epu8(T01, T11);
-        T22 = _mm_sad_epu8(T02, T12);
-        T23 = _mm_sad_epu8(T03, T13);
-
-        sum0 = _mm_add_epi32(sum0, T20);
-        sum0 = _mm_add_epi32(sum0, T21);
-        sum0 = _mm_add_epi32(sum0, T22);
-        sum0 = _mm_add_epi32(sum0, T23);
-
-        T00 = _mm_load_si128((__m128i*)(fenc + (i + 1) * fencstride));
-        T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 1) * fencstride));
-        T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 1) * fencstride));
-        T03 = _mm_load_si128((__m128i*)(fenc + 48 + (i + 1) * fencstride));
-
-        T10 = _mm_loadu_si128((__m128i*)(fref + (i + 1) * frefstride));
-        T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 1) * frefstride));
-        T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 1) * frefstride));
-        T13 = _mm_loadu_si128((__m128i*)(fref + 48 + (i + 1) * frefstride));
-
-        T20 = _mm_sad_epu8(T00, T10);
-        T21 = _mm_sad_epu8(T01, T11);
-        T22 = _mm_sad_epu8(T02, T12);
-        T23 = _mm_sad_epu8(T03, T13);
-
-        sum0 = _mm_add_epi32(sum0, T20);
-        sum0 = _mm_add_epi32(sum0, T21);
-        sum0 = _mm_add_epi32(sum0, T22);
-        sum0 = _mm_add_epi32(sum0, T23);
-
-        T00 = _mm_load_si128((__m128i*)(fenc + (i + 2) * fencstride));
-        T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 2) * fencstride));
-        T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 2) * fencstride));
-        T03 = _mm_load_si128((__m128i*)(fenc + 48 + (i + 2) * fencstride));
-
-        T10 = _mm_loadu_si128((__m128i*)(fref + (i + 2) * frefstride));
-        T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 2) * frefstride));
-        T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 2) * frefstride));
-        T13 = _mm_loadu_si128((__m128i*)(fref + 48 + (i + 2) * frefstride));
-
-        T20 = _mm_sad_epu8(T00, T10);
-        T21 = _mm_sad_epu8(T01, T11);
-        T22 = _mm_sad_epu8(T02, T12);
-        T23 = _mm_sad_epu8(T03, T13);
-
-        sum0 = _mm_add_epi32(sum0, T20);
-        sum0 = _mm_add_epi32(sum0, T21);
-        sum0 = _mm_add_epi32(sum0, T22);
-        sum0 = _mm_add_epi32(sum0, T23);
-
-        T00 = _mm_load_si128((__m128i*)(fenc + (i + 3) * fencstride));
-        T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 3) * fencstride));
-        T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 3) * fencstride));
-        T03 = _mm_load_si128((__m128i*)(fenc + 48 + (i + 3) * fencstride));
-
-        T10 = _mm_loadu_si128((__m128i*)(fref + (i + 3) * frefstride));
-        T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 3) * frefstride));
-        T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 3) * frefstride));
-        T13 = _mm_loadu_si128((__m128i*)(fref + 48 + (i + 3) * frefstride));
-
-        T20 = _mm_sad_epu8(T00, T10);
-        T21 = _mm_sad_epu8(T01, T11);
-        T22 = _mm_sad_epu8(T02, T12);
-        T23 = _mm_sad_epu8(T03, T13);
-
-        sum0 = _mm_add_epi32(sum0, T20);
-        sum0 = _mm_add_epi32(sum0, T21);
-        sum0 = _mm_add_epi32(sum0, T22);
-        sum0 = _mm_add_epi32(sum0, T23);
-
-        T00 = _mm_load_si128((__m128i*)(fenc + (i + 4) * fencstride));
-        T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 4) * fencstride));
-        T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 4) * fencstride));
-        T03 = _mm_load_si128((__m128i*)(fenc + 48 + (i + 4) * fencstride));
-
-        T10 = _mm_loadu_si128((__m128i*)(fref + (i + 4) * frefstride));
-        T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 4) * frefstride));
-        T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 4) * frefstride));
-        T13 = _mm_loadu_si128((__m128i*)(fref + 48 + (i + 4) * frefstride));
-
-        T20 = _mm_sad_epu8(T00, T10);
-        T21 = _mm_sad_epu8(T01, T11);
-        T22 = _mm_sad_epu8(T02, T12);
-        T23 = _mm_sad_epu8(T03, T13);
-
-        sum0 = _mm_add_epi32(sum0, T20);
-        sum0 = _mm_add_epi32(sum0, T21);
-        sum0 = _mm_add_epi32(sum0, T22);
-        sum0 = _mm_add_epi32(sum0, T23);
-
-        T00 = _mm_load_si128((__m128i*)(fenc + (i + 5) * fencstride));
-        T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 5) * fencstride));
-        T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 5) * fencstride));
-        T03 = _mm_load_si128((__m128i*)(fenc + 48 + (i + 5) * fencstride));
-
-        T10 = _mm_loadu_si128((__m128i*)(fref + (i + 5) * frefstride));
-        T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 5) * frefstride));
-        T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 5) * frefstride));
-        T13 = _mm_loadu_si128((__m128i*)(fref + 48 + (i + 5) * frefstride));
-
-        T20 = _mm_sad_epu8(T00, T10);
-        T21 = _mm_sad_epu8(T01, T11);
-        T22 = _mm_sad_epu8(T02, T12);
-        T23 = _mm_sad_epu8(T03, T13);
-
-        sum0 = _mm_add_epi32(sum0, T20);
-        sum0 = _mm_add_epi32(sum0, T21);
-        sum0 = _mm_add_epi32(sum0, T22);
-        sum0 = _mm_add_epi32(sum0, T23);
-
-        T00 = _mm_load_si128((__m128i*)(fenc + (i + 6) * fencstride));
-        T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 6) * fencstride));
-        T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 6) * fencstride));
-        T03 = _mm_load_si128((__m128i*)(fenc + 48 + (i + 6) * fencstride));
-
-        T10 = _mm_loadu_si128((__m128i*)(fref + (i + 6) * frefstride));
-        T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 6) * frefstride));
-        T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 6) * frefstride));
-        T13 = _mm_loadu_si128((__m128i*)(fref + 48 + (i + 6) * frefstride));
-
-        T20 = _mm_sad_epu8(T00, T10);
-        T21 = _mm_sad_epu8(T01, T11);
-        T22 = _mm_sad_epu8(T02, T12);
-        T23 = _mm_sad_epu8(T03, T13);
-
-        sum0 = _mm_add_epi32(sum0, T20);
-        sum0 = _mm_add_epi32(sum0, T21);
-        sum0 = _mm_add_epi32(sum0, T22);
-        sum0 = _mm_add_epi32(sum0, T23);
-
-        T00 = _mm_load_si128((__m128i*)(fenc + (i + 7) * fencstride));
-        T01 = _mm_load_si128((__m128i*)(fenc + 16 + (i + 7) * fencstride));
-        T02 = _mm_load_si128((__m128i*)(fenc + 32 + (i + 7) * fencstride));
-        T03 = _mm_load_si128((__m128i*)(fenc + 48 + (i + 7) * fencstride));
-
-        T10 = _mm_loadu_si128((__m128i*)(fref + (i + 7) * frefstride));
-        T11 = _mm_loadu_si128((__m128i*)(fref + 16 + (i + 7) * frefstride));
-        T12 = _mm_loadu_si128((__m128i*)(fref + 32 + (i + 7) * frefstride));
-        T13 = _mm_loadu_si128((__m128i*)(fref + 48 + (i + 7) * frefstride));
-
-        T20 = _mm_sad_epu8(T00, T10);
-        T21 = _mm_sad_epu8(T01, T11);
-        T22 = _mm_sad_epu8(T02, T12);
-        T23 = _mm_sad_epu8(T03, T13);
-
-        sum0 = _mm_add_epi32(sum0, T20);
-        sum0 = _mm_add_epi32(sum0, T21);
-        sum0 = _mm_add_epi32(sum0, T22);
-        sum0 = _mm_add_epi32(sum0, T23);
-    }
-
-    sum1 = _mm_shuffle_epi32(sum0, 2);
-    sum0 = _mm_add_epi32(sum0, sum1);
-    return _mm_cvtsi128_si32(sum0);
-}
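
The closing lines of the function above are the usual horizontal reduction
for _mm_sad_epu8, which leaves one partial sum in each 64-bit lane of the
accumulator. A minimal self-contained sketch of that reduction (the
hsum_sad name is illustrative, not x265 API):

    #include <emmintrin.h>  // SSE2

    // Collapse the two 64-bit lane sums produced by _mm_sad_epu8
    // into a single scalar SAD value.
    static inline int hsum_sad(__m128i sad)
    {
        __m128i hi = _mm_shuffle_epi32(sad, 2);           // upper lane -> low
        return _mm_cvtsi128_si32(_mm_add_epi32(sad, hi)); // low + high
    }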
-
-#if HAVE_MMX
-void sad_x3_4x16(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res)
-{
-    __m128i sum0, sum1, sum2;
-
-    __m128i T00, T01, T02, T03;
-    __m128i T10, T11, T12, T13;
-    __m128i R00, R01, R02, R03;
-    __m128i T20;
-
-    T00 = _mm_loadl_epi64((__m128i*)(fenc));
-    T01 = _mm_loadl_epi64((__m128i*)(fenc + FENC_STRIDE));
-    T01 = _mm_unpacklo_epi32(T00, T01);
-    T02 = _mm_loadl_epi64((__m128i*)(fenc + (2) * FENC_STRIDE));
-    T03 = _mm_loadl_epi64((__m128i*)(fenc + (3) * FENC_STRIDE));
-    T03 = _mm_unpacklo_epi32(T02, T03);
-    R00 = _mm_unpacklo_epi64(T01, T03);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref1));
-    T11 = _mm_loadl_epi64((__m128i*)(fref1 + frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref1 + (2) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref1 + (3) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R01 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref2));
-    T11 = _mm_loadl_epi64((__m128i*)(fref2 + frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref2 + (2) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref2 + (3) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R02 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref3));
-    T11 = _mm_loadl_epi64((__m128i*)(fref3 + frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref3 + (2) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref3 + (3) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R03 = _mm_unpacklo_epi64(T11, T13);
-
-    T20 = _mm_sad_epu8(R00, R01);
-    sum0 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
-    T20 = _mm_sad_epu8(R00, R02);
-    sum1 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
-    T20 = _mm_sad_epu8(R00, R03);
-    sum2 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
-    T00 = _mm_loadl_epi64((__m128i*)(fenc + (4) * FENC_STRIDE));
-    T01 = _mm_loadl_epi64((__m128i*)(fenc + (5) * FENC_STRIDE));
-    T01 = _mm_unpacklo_epi32(T00, T01);
-    T02 = _mm_loadl_epi64((__m128i*)(fenc + (6) * FENC_STRIDE));
-    T03 = _mm_loadl_epi64((__m128i*)(fenc + (7) * FENC_STRIDE));
-    T03 = _mm_unpacklo_epi32(T02, T03);
-    R00 = _mm_unpacklo_epi64(T01, T03);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref1 + (4) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref1 + (5) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref1 + (6) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref1 + (7) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R01 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref2 + (4) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref2 + (5) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref2 + (6) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref2 + (7) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R02 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref3 + (4) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref3 + (5) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref3 + (6) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref3 + (7) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R03 = _mm_unpacklo_epi64(T11, T13);
-
-    T20 = _mm_sad_epu8(R00, R01);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum0 = _mm_add_epi32(sum0, T20);
-
-    T20 = _mm_sad_epu8(R00, R02);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum1 = _mm_add_epi32(sum1, T20);
-
-    T20 = _mm_sad_epu8(R00, R03);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum2 = _mm_add_epi32(sum2, T20);
-
-    T00 = _mm_loadl_epi64((__m128i*)(fenc + (8) * FENC_STRIDE));
-    T01 = _mm_loadl_epi64((__m128i*)(fenc + (9) * FENC_STRIDE));
-    T01 = _mm_unpacklo_epi32(T00, T01);
-    T02 = _mm_loadl_epi64((__m128i*)(fenc + (10) * FENC_STRIDE));
-    T03 = _mm_loadl_epi64((__m128i*)(fenc + (11) * FENC_STRIDE));
-    T03 = _mm_unpacklo_epi32(T02, T03);
-    R00 = _mm_unpacklo_epi64(T01, T03);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref1 + (8) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref1 + (9) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref1 + (10) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref1 + (11) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R01 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref2 + (8) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref2 + (9) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref2 + (10) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref2 + (11) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R02 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref3 + (8) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref3 + (9) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref3 + (10) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref3 + (11) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R03 = _mm_unpacklo_epi64(T11, T13);
-
-    T20 = _mm_sad_epu8(R00, R01);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum0 = _mm_add_epi32(sum0, T20);
-
-    T20 = _mm_sad_epu8(R00, R02);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum1 = _mm_add_epi32(sum1, T20);
-
-    T20 = _mm_sad_epu8(R00, R03);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum2 = _mm_add_epi32(sum2, T20);
-
-    T00 = _mm_loadl_epi64((__m128i*)(fenc + (12) * FENC_STRIDE));
-    T01 = _mm_loadl_epi64((__m128i*)(fenc + (13) * FENC_STRIDE));
-    T01 = _mm_unpacklo_epi32(T00, T01);
-    T02 = _mm_loadl_epi64((__m128i*)(fenc + (14) * FENC_STRIDE));
-    T03 = _mm_loadl_epi64((__m128i*)(fenc + (15) * FENC_STRIDE));
-    T03 = _mm_unpacklo_epi32(T02, T03);
-    R00 = _mm_unpacklo_epi64(T01, T03);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref1 + (12) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref1 + (13) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref1 + (14) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref1 + (15) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R01 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref2 + (12) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref2 + (13) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref2 + (14) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref2 + (15) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R02 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref3 + (12) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref3 + (13) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref3 + (14) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref3 + (15) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R03 = _mm_unpacklo_epi64(T11, T13);
-
-    T20 = _mm_sad_epu8(R00, R01);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum0 = _mm_add_epi32(sum0, T20);
-
-    T20 = _mm_sad_epu8(R00, R02);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum1 = _mm_add_epi32(sum1, T20);
-
-    T20 = _mm_sad_epu8(R00, R03);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum2 = _mm_add_epi32(sum2, T20);
-
-    res[0] = _mm_cvtsi128_si32(sum0);
-    res[1] = _mm_cvtsi128_si32(sum1);
-    res[2] = _mm_cvtsi128_si32(sum2);
-}
-
-#else /* if HAVE_MMX */
-
-void sad_x3_4x16(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res)
-{
-    __m128i sum0 = _mm_setzero_si128();
-    __m128i sum1 = _mm_setzero_si128();
-    __m128i sum2 = _mm_setzero_si128();
-
-    __m128i T00, T01, T02, T03;
-    __m128i T10, T11, T12, T13;
-    __m128i R00, R01, R02, R03;
-    __m128i T20;
-
-    T00 = _mm_loadl_epi64((__m128i*)(fenc + (0) * FENC_STRIDE));
-    T01 = _mm_loadl_epi64((__m128i*)(fenc + (1) * FENC_STRIDE));
-    T01 = _mm_unpacklo_epi32(T00, T01);
-    T02 = _mm_loadl_epi64((__m128i*)(fenc + (2) * FENC_STRIDE));
-    T03 = _mm_loadl_epi64((__m128i*)(fenc + (3) * FENC_STRIDE));
-    T03 = _mm_unpacklo_epi32(T02, T03);
-    R00 = _mm_unpacklo_epi64(T01, T03);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref1 + (0) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref1 + (1) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref1 + (2) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref1 + (3) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R01 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref2 + (0) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref2 + (1) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref2 + (2) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref2 + (3) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R02 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref3 + (0) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref3 + (1) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref3 + (2) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref3 + (3) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R03 = _mm_unpacklo_epi64(T11, T13);
-
-    T20 = _mm_sad_epu8(R00, R01);
-    sum0 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
-    T20 = _mm_sad_epu8(R00, R02);
-    sum1 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
-    T20 = _mm_sad_epu8(R00, R03);
-    sum2 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
-    T00 = _mm_loadl_epi64((__m128i*)(fenc + (4) * FENC_STRIDE));
-    T01 = _mm_loadl_epi64((__m128i*)(fenc + (5) * FENC_STRIDE));
-    T01 = _mm_unpacklo_epi32(T00, T01);
-    T02 = _mm_loadl_epi64((__m128i*)(fenc + (6) * FENC_STRIDE));
-    T03 = _mm_loadl_epi64((__m128i*)(fenc + (7) * FENC_STRIDE));
-    T03 = _mm_unpacklo_epi32(T02, T03);
-    R00 = _mm_unpacklo_epi64(T01, T03);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref1 + (4) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref1 + (5) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref1 + (6) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref1 + (7) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R01 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref2 + (4) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref2 + (5) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref2 + (6) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref2 + (7) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R02 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref3 + (4) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref3 + (5) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref3 + (6) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref3 + (7) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R03 = _mm_unpacklo_epi64(T11, T13);
-
-    T20 = _mm_sad_epu8(R00, R01);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum0 = _mm_add_epi32(sum0, T20);
-
-    T20 = _mm_sad_epu8(R00, R02);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum1 = _mm_add_epi32(sum1, T20);
-
-    T20 = _mm_sad_epu8(R00, R03);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum2 = _mm_add_epi32(sum2, T20);
-
-    T00 = _mm_loadl_epi64((__m128i*)(fenc + (8) * FENC_STRIDE));
-    T01 = _mm_loadl_epi64((__m128i*)(fenc + (9) * FENC_STRIDE));
-    T01 = _mm_unpacklo_epi32(T00, T01);
-    T02 = _mm_loadl_epi64((__m128i*)(fenc + (10) * FENC_STRIDE));
-    T03 = _mm_loadl_epi64((__m128i*)(fenc + (11) * FENC_STRIDE));
-    T03 = _mm_unpacklo_epi32(T02, T03);
-    R00 = _mm_unpacklo_epi64(T01, T03);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref1 + (8) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref1 + (9) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref1 + (10) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref1 + (11) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R01 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref2 + (8) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref2 + (9) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref2 + (10) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref2 + (11) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R02 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref3 + (8) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref3 + (9) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref3 + (10) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref3 + (11) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R03 = _mm_unpacklo_epi64(T11, T13);
-
-    T20 = _mm_sad_epu8(R00, R01);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum0 = _mm_add_epi32(sum0, T20);
-
-    T20 = _mm_sad_epu8(R00, R02);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum1 = _mm_add_epi32(sum1, T20);
-
-    T20 = _mm_sad_epu8(R00, R03);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum2 = _mm_add_epi32(sum2, T20);
-
-    T00 = _mm_loadl_epi64((__m128i*)(fenc + (12) * FENC_STRIDE));
-    T01 = _mm_loadl_epi64((__m128i*)(fenc + (13) * FENC_STRIDE));
-    T01 = _mm_unpacklo_epi32(T00, T01);
-    T02 = _mm_loadl_epi64((__m128i*)(fenc + (14) * FENC_STRIDE));
-    T03 = _mm_loadl_epi64((__m128i*)(fenc + (15) * FENC_STRIDE));
-    T03 = _mm_unpacklo_epi32(T02, T03);
-    R00 = _mm_unpacklo_epi64(T01, T03);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref1 + (12) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref1 + (13) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref1 + (14) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref1 + (15) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R01 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref2 + (12) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref2 + (13) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref2 + (14) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref2 + (15) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R02 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref3 + (12) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref3 + (13) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref3 + (14) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref3 + (15) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R03 = _mm_unpacklo_epi64(T11, T13);
-
-    T20 = _mm_sad_epu8(R00, R01);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum0 = _mm_add_epi32(sum0, T20);
-
-    T20 = _mm_sad_epu8(R00, R02);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum1 = _mm_add_epi32(sum1, T20);
-
-    T20 = _mm_sad_epu8(R00, R03);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum2 = _mm_add_epi32(sum2, T20);
-
-    res[0] = _mm_cvtsi128_si32(sum0);
-    res[1] = _mm_cvtsi128_si32(sum1);
-    res[2] = _mm_cvtsi128_si32(sum2);
-}
-
-#endif /* if HAVE_MMX */
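
Both 4x16 branches above use SSE2 registers despite the HAVE_MMX guard and
differ only in how the first loads are addressed. They share one packing
trick: four 4-byte rows are gathered into a single 16-byte register so that
one psadbw covers four rows at a time. A sketch of that packing step,
assuming 8-bit pixels (the pack4rows helper is illustrative):

    #include <emmintrin.h>
    #include <stdint.h>

    typedef uint8_t pixel;  // assumption: 8-bit pixel type

    // Pack rows 0..3 of a 4-pixel-wide block into one __m128i.
    static inline __m128i pack4rows(const pixel *p, intptr_t stride)
    {
        __m128i r0 = _mm_loadl_epi64((const __m128i*)(p + 0 * stride));
        __m128i r1 = _mm_loadl_epi64((const __m128i*)(p + 1 * stride));
        __m128i r2 = _mm_loadl_epi64((const __m128i*)(p + 2 * stride));
        __m128i r3 = _mm_loadl_epi64((const __m128i*)(p + 3 * stride));
        __m128i lo = _mm_unpacklo_epi32(r0, r1);  // r0[0..3] r1[0..3] ...
        __m128i hi = _mm_unpacklo_epi32(r2, r3);  // r2[0..3] r3[0..3] ...
        return _mm_unpacklo_epi64(lo, hi);        // 4 rows x 4 bytes
    }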
-
-#if HAVE_MMX
-template<int ly>
-// ly will always be 32
-void sad_x3_8(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res)
-{
-    __m64 sum0 = _mm_setzero_si64();
-    __m64 sum1 = _mm_setzero_si64();
-    __m64 sum2 = _mm_setzero_si64();
-
-    __m64 T00, T01, T02, T03, T04, T05, T06, T07;
-    __m64 T10, T11, T12, T13, T14, T15, T16, T17;
-    __m64 T20, T21, T22, T23, T24, T25, T26, T27;
-
-    for (int i = 0; i < ly; i += 8)
-    {
-        T00 = (*(__m64*)(fenc + (i + 0) * FENC_STRIDE));
-        T01 = (*(__m64*)(fenc + (i + 1) * FENC_STRIDE));
-        T02 = (*(__m64*)(fenc + (i + 2) * FENC_STRIDE));
-        T03 = (*(__m64*)(fenc + (i + 3) * FENC_STRIDE));
-        T04 = (*(__m64*)(fenc + (i + 4) * FENC_STRIDE));
-        T05 = (*(__m64*)(fenc + (i + 5) * FENC_STRIDE));
-        T06 = (*(__m64*)(fenc + (i + 6) * FENC_STRIDE));
-        T07 = (*(__m64*)(fenc + (i + 7) * FENC_STRIDE));
-
-        T10 = (*(__m64*)(fref1 + (i + 0) * frefstride));
-        T11 = (*(__m64*)(fref1 + (i + 1) * frefstride));
-        T12 = (*(__m64*)(fref1 + (i + 2) * frefstride));
-        T13 = (*(__m64*)(fref1 + (i + 3) * frefstride));
-        T14 = (*(__m64*)(fref1 + (i + 4) * frefstride));
-        T15 = (*(__m64*)(fref1 + (i + 5) * frefstride));
-        T16 = (*(__m64*)(fref1 + (i + 6) * frefstride));
-        T17 = (*(__m64*)(fref1 + (i + 7) * frefstride));
-
-        T20 = _mm_sad_pu8(T00, T10);
-        T21 = _mm_sad_pu8(T01, T11);
-        T22 = _mm_sad_pu8(T02, T12);
-        T23 = _mm_sad_pu8(T03, T13);
-        T24 = _mm_sad_pu8(T04, T14);
-        T25 = _mm_sad_pu8(T05, T15);
-        T26 = _mm_sad_pu8(T06, T16);
-        T27 = _mm_sad_pu8(T07, T17);
-
-        sum0 = _mm_add_pi16(sum0, T20);
-        sum0 = _mm_add_pi16(sum0, T21);
-        sum0 = _mm_add_pi16(sum0, T22);
-        sum0 = _mm_add_pi16(sum0, T23);
-        sum0 = _mm_add_pi16(sum0, T24);
-        sum0 = _mm_add_pi16(sum0, T25);
-        sum0 = _mm_add_pi16(sum0, T26);
-        sum0 = _mm_add_pi16(sum0, T27);
-
-        T10 = (*(__m64*)(fref2 + (i + 0) * frefstride));
-        T11 = (*(__m64*)(fref2 + (i + 1) * frefstride));
-        T12 = (*(__m64*)(fref2 + (i + 2) * frefstride));
-        T13 = (*(__m64*)(fref2 + (i + 3) * frefstride));
-        T14 = (*(__m64*)(fref2 + (i + 4) * frefstride));
-        T15 = (*(__m64*)(fref2 + (i + 5) * frefstride));
-        T16 = (*(__m64*)(fref2 + (i + 6) * frefstride));
-        T17 = (*(__m64*)(fref2 + (i + 7) * frefstride));
-
-        T20 = _mm_sad_pu8(T00, T10);
-        T21 = _mm_sad_pu8(T01, T11);
-        T22 = _mm_sad_pu8(T02, T12);
-        T23 = _mm_sad_pu8(T03, T13);
-        T24 = _mm_sad_pu8(T04, T14);
-        T25 = _mm_sad_pu8(T05, T15);
-        T26 = _mm_sad_pu8(T06, T16);
-        T27 = _mm_sad_pu8(T07, T17);
-
-        sum1 = _mm_add_pi16(sum1, T20);
-        sum1 = _mm_add_pi16(sum1, T21);
-        sum1 = _mm_add_pi16(sum1, T22);
-        sum1 = _mm_add_pi16(sum1, T23);
-        sum1 = _mm_add_pi16(sum1, T24);
-        sum1 = _mm_add_pi16(sum1, T25);
-        sum1 = _mm_add_pi16(sum1, T26);
-        sum1 = _mm_add_pi16(sum1, T27);
-
-        T10 = (*(__m64*)(fref3 + (i + 0) * frefstride));
-        T11 = (*(__m64*)(fref3 + (i + 1) * frefstride));
-        T12 = (*(__m64*)(fref3 + (i + 2) * frefstride));
-        T13 = (*(__m64*)(fref3 + (i + 3) * frefstride));
-        T14 = (*(__m64*)(fref3 + (i + 4) * frefstride));
-        T15 = (*(__m64*)(fref3 + (i + 5) * frefstride));
-        T16 = (*(__m64*)(fref3 + (i + 6) * frefstride));
-        T17 = (*(__m64*)(fref3 + (i + 7) * frefstride));
-
-        T20 = _mm_sad_pu8(T00, T10);
-        T21 = _mm_sad_pu8(T01, T11);
-        T22 = _mm_sad_pu8(T02, T12);
-        T23 = _mm_sad_pu8(T03, T13);
-        T24 = _mm_sad_pu8(T04, T14);
-        T25 = _mm_sad_pu8(T05, T15);
-        T26 = _mm_sad_pu8(T06, T16);
-        T27 = _mm_sad_pu8(T07, T17);
-
-        sum2 = _mm_add_pi16(sum2, T20);
-        sum2 = _mm_add_pi16(sum2, T21);
-        sum2 = _mm_add_pi16(sum2, T22);
-        sum2 = _mm_add_pi16(sum2, T23);
-        sum2 = _mm_add_pi16(sum2, T24);
-        sum2 = _mm_add_pi16(sum2, T25);
-        sum2 = _mm_add_pi16(sum2, T26);
-        sum2 = _mm_add_pi16(sum2, T27);
-    }
-
-    res[0] = _m_to_int(sum0);
-    res[1] = _m_to_int(sum1);
-    res[2] = _m_to_int(sum2);
-}
-
-#else /* if HAVE_MMX */
-
-template<int ly>
-// ly will always be 32
-void sad_x3_8(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res)
-{
-    __m128i T00, T01, T02, T03;
-    __m128i T10, T11, T12, T13;
-    __m128i T20, T21;
-    __m128i sum0 = _mm_setzero_si128();
-
-    res[0] = res[1] = res[2] = 0;
-    for (int i = 0; i < ly; i += 8)
-    {
-        T00 = _mm_loadl_epi64((__m128i*)(fenc + (i + 0) * FENC_STRIDE));
-        T01 = _mm_loadl_epi64((__m128i*)(fenc + (i + 1) * FENC_STRIDE));
-        T01 = _mm_unpacklo_epi64(T00, T01);
-        T02 = _mm_loadl_epi64((__m128i*)(fenc + (i + 2) * FENC_STRIDE));
-        T03 = _mm_loadl_epi64((__m128i*)(fenc + (i + 3) * FENC_STRIDE));
-        T03 = _mm_unpacklo_epi64(T02, T03);
-
-        T10 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 0) * frefstride));
-        T11 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 1) * frefstride));
-        T11 = _mm_unpacklo_epi64(T10, T11);
-        T12 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 2) * frefstride));
-        T13 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 3) * frefstride));
-        T13 = _mm_unpacklo_epi64(T12, T13);
-
-        T20 = _mm_sad_epu8(T01, T11);
-        T21 = _mm_sad_epu8(T03, T13);
-        T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[0] = res[0] + _mm_cvtsi128_si32(sum0);
-
-        T10 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 0) * frefstride));
-        T11 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 1) * frefstride));
-        T11 = _mm_unpacklo_epi64(T10, T11);
-        T12 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 2) * frefstride));
-        T13 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 3) * frefstride));
-        T13 = _mm_unpacklo_epi64(T12, T13);
-
-        T20 = _mm_sad_epu8(T01, T11);
-        T21 = _mm_sad_epu8(T03, T13);
-        T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[1] = res[1] + _mm_cvtsi128_si32(sum0);
-
-        T10 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 0) * frefstride));
-        T11 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 1) * frefstride));
-        T11 = _mm_unpacklo_epi64(T10, T11);
-        T12 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 2) * frefstride));
-        T13 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 3) * frefstride));
-        T13 = _mm_unpacklo_epi64(T12, T13);
-
-        T20 = _mm_sad_epu8(T01, T11);
-        T21 = _mm_sad_epu8(T03, T13);
-        T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[2] = res[2] + _mm_cvtsi128_si32(sum0);
-
-        T00 = _mm_loadl_epi64((__m128i*)(fenc + (i + 4) * FENC_STRIDE));
-        T01 = _mm_loadl_epi64((__m128i*)(fenc + (i + 5) * FENC_STRIDE));
-        T01 = _mm_unpacklo_epi64(T00, T01);
-        T02 = _mm_loadl_epi64((__m128i*)(fenc + (i + 6) * FENC_STRIDE));
-        T03 = _mm_loadl_epi64((__m128i*)(fenc + (i + 7) * FENC_STRIDE));
-        T03 = _mm_unpacklo_epi64(T02, T03);
-
-        T10 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 4) * frefstride));
-        T11 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 5) * frefstride));
-        T11 = _mm_unpacklo_epi64(T10, T11);
-        T12 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 6) * frefstride));
-        T13 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 7) * frefstride));
-        T13 = _mm_unpacklo_epi64(T12, T13);
-
-        T20 = _mm_sad_epu8(T01, T11);
-        T21 = _mm_sad_epu8(T03, T13);
-        T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[0] = res[0] + _mm_cvtsi128_si32(sum0);
-
-        T10 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 4) * frefstride));
-        T11 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 5) * frefstride));
-        T11 = _mm_unpacklo_epi64(T10, T11);
-        T12 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 6) * frefstride));
-        T13 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 7) * frefstride));
-        T13 = _mm_unpacklo_epi64(T12, T13);
-
-        T20 = _mm_sad_epu8(T01, T11);
-        T21 = _mm_sad_epu8(T03, T13);
-        T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[1] = res[1] + _mm_cvtsi128_si32(sum0);
-
-        T10 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 4) * frefstride));
-        T11 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 5) * frefstride));
-        T11 = _mm_unpacklo_epi64(T10, T11);
-        T12 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 6) * frefstride));
-        T13 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 7) * frefstride));
-        T13 = _mm_unpacklo_epi64(T12, T13);
-
-        T20 = _mm_sad_epu8(T01, T11);
-        T21 = _mm_sad_epu8(T03, T13);
-        T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[2] = res[2] + _mm_cvtsi128_si32(sum0);
-    }
-}
-
-#endif /* if HAVE_MMX */
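
The SSE2 fallback above pairs two 8-byte rows per register with
_mm_unpacklo_epi64 before each psadbw, while the __m64 branch issues one
_mm_sad_pu8 per row (code using __m64 is normally expected to run
_mm_empty() before any later x87 float use, a detail outside this hunk).
A stand-alone sketch of the row pairing, with illustrative names:

    #include <emmintrin.h>
    #include <stdint.h>

    typedef uint8_t pixel;  // assumption: 8-bit pixel type

    // SAD of two 8-wide rows at once: pair them into one 128-bit register.
    static inline __m128i pair_rows_sad(const pixel *a, intptr_t astride,
                                        const pixel *b, intptr_t bstride)
    {
        __m128i a01 = _mm_unpacklo_epi64(
            _mm_loadl_epi64((const __m128i*)a),
            _mm_loadl_epi64((const __m128i*)(a + astride)));
        __m128i b01 = _mm_unpacklo_epi64(
            _mm_loadl_epi64((const __m128i*)b),
            _mm_loadl_epi64((const __m128i*)(b + bstride)));
        return _mm_sad_epu8(a01, b01);  // one 64-bit lane sum per row
    }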
-
-/* For performance, this function assumes that the *last load* can access 16 elements. */
-
 template<int ly>
 void sad_x3_12(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res)
 {
@@ -1445,295 +118,6 @@
 }
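
The 12-wide body (unchanged by this patch, and elided between the hunks)
is where the *last load* comment applies: each row can be fetched with a
full 16-byte load and the top four bytes masked away before psadbw, so the
final load may read four bytes past the block. A minimal sketch under that
assumption (helper name and mask construction are illustrative, not
necessarily the exact x265 code):

    #include <emmintrin.h>
    #include <stdint.h>

    typedef uint8_t pixel;  // assumption: 8-bit pixel type

    // SAD of one 12-pixel row: load 16 bytes and zero the last 4 in both
    // inputs so they cancel out of the psadbw result. The caller must
    // guarantee the extra 4 bytes are readable.
    static inline int sad_row12(const pixel *a, const pixel *b)
    {
        const __m128i mask = _mm_setr_epi32(-1, -1, -1, 0); // keep bytes 0..11
        __m128i va = _mm_and_si128(_mm_loadu_si128((const __m128i*)a), mask);
        __m128i vb = _mm_and_si128(_mm_loadu_si128((const __m128i*)b), mask);
        __m128i s  = _mm_sad_epu8(va, vb);
        return _mm_cvtsi128_si32(_mm_add_epi32(s, _mm_shuffle_epi32(s, 2)));
    }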
 
 template<int ly>
-void sad_x3_16(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res)
-{
-#define PROCESS_16x4x3(BASE) \
-    T00 = _mm_load_si128((__m128i*)(fenc + (BASE + 0) * FENC_STRIDE)); \
-    T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE)); \
-    T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE)); \
-    T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE)); \
-    T10 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 0) * frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 3) * frefstride)); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res0 += _mm_cvtsi128_si32(sum0); \
-    T10 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 0) * frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 3) * frefstride)); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res1 += _mm_cvtsi128_si32(sum0); \
-    T10 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 0) * frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 3) * frefstride)); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res2 += _mm_cvtsi128_si32(sum0);
-
-    __m128i T00, T01, T02, T03;
-    __m128i T10, T11, T12, T13;
-    __m128i T20, T21, T22, T23;
-    __m128i sum0, sum1;
-    int res0 = 0, res1 = 0, res2 = 0;
-
-    // ly == 4, 12, 32, 64
-    PROCESS_16x4x3(0);
-    if (ly >= 8)
-    {
-        PROCESS_16x4x3(4);
-    }
-    if (ly >= 12)
-    {
-        PROCESS_16x4x3(8);
-    }
-    if (ly > 12)
-    {
-        PROCESS_16x4x3(12);
-        for (int i = 16; i < ly; i += 16)
-        {
-            PROCESS_16x4x3(i);
-            PROCESS_16x4x3(i + 4);
-            PROCESS_16x4x3(i + 8);
-            PROCESS_16x4x3(i + 12);
-        }
-    }
-    res[0] = res0;
-    res[1] = res1;
-    res[2] = res2;
-}
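
Because ly is a template parameter, the height checks in sad_x3_16 are
compile-time constants and the dead branches disappear entirely. A sketch
of how the branch ladder covers the instantiated heights (illustrative
code, not part of x265):

    // For ly in {4, 12, 32, 64} this mirrors the PROCESS_16x4x3 calls
    // above and returns exactly ly rows.
    template<int ly>
    int rows_processed()
    {
        int rows = 4;                 // PROCESS_16x4x3(0)
        if (ly >= 8)  rows += 4;      // PROCESS_16x4x3(4)
        if (ly >= 12) rows += 4;      // PROCESS_16x4x3(8)
        if (ly > 12)                  // PROCESS_16x4x3(12) + 16-row loop
            rows += 4 + ((ly - 16) / 16) * 16;
        return rows;
    }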
-
-template<int ly>
-void sad_x3_24(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res)
-{
-    res[0] = res[1] = res[2] = 0;
-    __m128i T00, T01, T02, T03, T04, T05;
-    __m128i T10, T11, T12, T13, T14, T15;
-    __m128i T20, T21, T22, T23;
-    __m128i T30, T31;
-    __m128i sum0, sum1;
-
-#define PROCESS_24x4x3(BASE) \
-    T00 = _mm_load_si128((__m128i*)(fenc + (BASE)*FENC_STRIDE)); \
-    T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE)); \
-    T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE)); \
-    T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE)); \
-    T10 = _mm_loadl_epi64((__m128i*)(fenc + (BASE)*FENC_STRIDE + 16)); \
-    T11 = _mm_loadl_epi64((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE + 16)); \
-    T04 = _mm_unpacklo_epi64(T10, T11); \
-    T12 = _mm_loadl_epi64((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE + 16)); \
-    T13 = _mm_loadl_epi64((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE + 16)); \
-    T05 = _mm_unpacklo_epi64(T12, T13); \
-    T10 = _mm_loadu_si128((__m128i*)(fref1 + (BASE)*frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 3) * frefstride)); \
-    T20 = _mm_loadl_epi64((__m128i*)(fref1 + (BASE)*frefstride + 16)); \
-    T21 = _mm_loadl_epi64((__m128i*)(fref1 + (BASE + 1) * frefstride + 16)); \
-    T14 = _mm_unpacklo_epi64(T20, T21); \
-    T22 = _mm_loadl_epi64((__m128i*)(fref1 + (BASE + 2) * frefstride + 16)); \
-    T23 = _mm_loadl_epi64((__m128i*)(fref1 + (BASE + 3) * frefstride + 16)); \
-    T15 = _mm_unpacklo_epi64(T22, T23); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T30 = _mm_sad_epu8(T04, T14); \
-    T31 = _mm_sad_epu8(T05, T15); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum1 = _mm_add_epi16(T30, T31); \
-    sum0 = _mm_add_epi16(sum0, sum1); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res[0] += _mm_cvtsi128_si32(sum0); \
-    T10 = _mm_loadu_si128((__m128i*)(fref2 + (BASE)*frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 3) * frefstride)); \
-    T20 = _mm_loadl_epi64((__m128i*)(fref2 + (BASE)*frefstride + 16)); \
-    T21 = _mm_loadl_epi64((__m128i*)(fref2 + (BASE + 1) * frefstride + 16)); \
-    T14 = _mm_unpacklo_epi64(T20, T21); \
-    T22 = _mm_loadl_epi64((__m128i*)(fref2 + (BASE + 2) * frefstride + 16)); \
-    T23 = _mm_loadl_epi64((__m128i*)(fref2 + (BASE + 3) * frefstride + 16)); \
-    T15 = _mm_unpacklo_epi64(T22, T23); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T30 = _mm_sad_epu8(T04, T14); \
-    T31 = _mm_sad_epu8(T05, T15); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum1 = _mm_add_epi16(T30, T31); \
-    sum0 = _mm_add_epi16(sum0, sum1); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res[1] += _mm_cvtsi128_si32(sum0); \
-    T10 = _mm_loadu_si128((__m128i*)(fref3 + (BASE)*frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 3) * frefstride)); \
-    T20 = _mm_loadl_epi64((__m128i*)(fref3 + (BASE)*frefstride + 16)); \
-    T21 = _mm_loadl_epi64((__m128i*)(fref3 + (BASE + 1) * frefstride + 16)); \
-    T14 = _mm_unpacklo_epi64(T20, T21); \
-    T22 = _mm_loadl_epi64((__m128i*)(fref3 + (BASE + 2) * frefstride + 16)); \
-    T23 = _mm_loadl_epi64((__m128i*)(fref3 + (BASE + 3) * frefstride + 16)); \
-    T15 = _mm_unpacklo_epi64(T22, T23); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T30 = _mm_sad_epu8(T04, T14); \
-    T31 = _mm_sad_epu8(T05, T15); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum1 = _mm_add_epi16(T30, T31); \
-    sum0 = _mm_add_epi16(sum0, sum1); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res[2] += _mm_cvtsi128_si32(sum0);
-
-    for (int i = 0; i < ly; i += 8)
-    {
-        PROCESS_24x4x3(i);
-        PROCESS_24x4x3(i + 4);
-    }
-}
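
The 24-wide macro above splits each row as 16 + 8: the first 16 bytes take
a full-width load, while the 8-byte tails of two adjacent rows are paired
into one register so a single psadbw handles both tails. A sketch of the
tail pairing, assuming 8-bit pixels (pack_two_tails is an illustrative
name):

    #include <emmintrin.h>
    #include <stdint.h>

    typedef uint8_t pixel;  // assumption: 8-bit pixel type

    // Pair the 8-byte tails (bytes 16..23) of rows n and n+1 so one
    // psadbw covers both.
    static inline __m128i pack_two_tails(const pixel *row, intptr_t stride)
    {
        __m128i t0 = _mm_loadl_epi64((const __m128i*)(row + 16));
        __m128i t1 = _mm_loadl_epi64((const __m128i*)(row + stride + 16));
        return _mm_unpacklo_epi64(t0, t1);  // tail(n) | tail(n+1)
    }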
-
-template<int ly>
-// ly will be 8, 16, 24, or 32
-void sad_x3_32(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res)
-{
-    res[0] = res[1] = res[2] = 0;
-    __m128i T00, T01, T02, T03, T04, T05, T06, T07;
-    __m128i T10, T11, T12, T13, T14, T15, T16, T17;
-    __m128i T20, T21, T22, T23, T24, T25, T26, T27;
-    __m128i sum0, sum1;
-
-#define PROCESS_32x4x3(BASE) \
-    T00 = _mm_load_si128((__m128i*)(fenc + (BASE)*FENC_STRIDE)); \
-    T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE)); \
-    T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE)); \
-    T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE)); \
-    T04 = _mm_load_si128((__m128i*)(fenc + (BASE)*FENC_STRIDE + 16)); \
-    T05 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE + 16)); \
-    T06 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE + 16)); \
-    T07 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE + 16)); \
-    T10 = _mm_loadu_si128((__m128i*)(fref1 + (BASE)*frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 3) * frefstride)); \
-    T14 = _mm_loadu_si128((__m128i*)(fref1 + (BASE)*frefstride + 16)); \
-    T15 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 1) * frefstride + 16)); \
-    T16 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 2) * frefstride + 16)); \
-    T17 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 3) * frefstride + 16)); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T24 = _mm_sad_epu8(T04, T14); \
-    T25 = _mm_sad_epu8(T05, T15); \
-    T26 = _mm_sad_epu8(T06, T16); \
-    T27 = _mm_sad_epu8(T07, T17); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    T24 = _mm_add_epi16(T24, T25); \
-    T26 = _mm_add_epi16(T26, T27); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum0 = _mm_add_epi16(sum0, T24); \
-    sum0 = _mm_add_epi16(sum0, T26); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res[0] += _mm_cvtsi128_si32(sum0); \
-    T10 = _mm_loadu_si128((__m128i*)(fref2 + (BASE)*frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 3) * frefstride)); \
-    T14 = _mm_loadu_si128((__m128i*)(fref2 + (BASE)*frefstride + 16)); \
-    T15 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 1) * frefstride + 16)); \
-    T16 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 2) * frefstride + 16)); \
-    T17 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 3) * frefstride + 16)); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T24 = _mm_sad_epu8(T04, T14); \
-    T25 = _mm_sad_epu8(T05, T15); \
-    T26 = _mm_sad_epu8(T06, T16); \
-    T27 = _mm_sad_epu8(T07, T17); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    T24 = _mm_add_epi16(T24, T25); \
-    T26 = _mm_add_epi16(T26, T27); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum0 = _mm_add_epi16(sum0, T24); \
-    sum0 = _mm_add_epi16(sum0, T26); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res[1] += _mm_cvtsi128_si32(sum0); \
-    T10 = _mm_loadu_si128((__m128i*)(fref3 + (BASE)*frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 3) * frefstride)); \
-    T14 = _mm_loadu_si128((__m128i*)(fref3 + (BASE)*frefstride + 16)); \
-    T15 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 1) * frefstride + 16)); \
-    T16 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 2) * frefstride + 16)); \
-    T17 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 3) * frefstride + 16)); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T24 = _mm_sad_epu8(T04, T14); \
-    T25 = _mm_sad_epu8(T05, T15); \
-    T26 = _mm_sad_epu8(T06, T16); \
-    T27 = _mm_sad_epu8(T07, T17); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    T24 = _mm_add_epi16(T24, T25); \
-    T26 = _mm_add_epi16(T26, T27); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum0 = _mm_add_epi16(sum0, T24); \
-    sum0 = _mm_add_epi16(sum0, T26); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res[2] += _mm_cvtsi128_si32(sum0);
-
-    for (int i = 0; i < ly; i += 8)
-    {
-        PROCESS_32x4x3(i);
-        PROCESS_32x4x3(i + 4);
-    }
-}
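
A quick overflow check on the 16-bit accumulation in PROCESS_32x4x3: each
psadbw leaves at most 8 x 255 = 2040 per 64-bit lane, and at most eight
such values are folded into a lane with _mm_add_epi16 before the 32-bit
reduction, so the worst case is 8 x 2040 = 16320 < 65535; the 16-bit
partial sums cannot wrap, and the running totals then live in the 32-bit
res[...] scalars.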
-
-template<int ly>
 void sad_x3_48(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res)
 {
     __m128i sum0 = _mm_setzero_si128();
@@ -2490,770 +874,6 @@
     res[2] = _mm_cvtsi128_si32(sum2);       /* Extracting SAD value for reference frame 3 */
 }
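
The sad_x4 family removed below is the same pattern extended to four
references: the encoder block is packed once and compared against four
packed reference blocks, sharing the packing cost four ways. A sketch of
the per-reference core (the sad4 helper is illustrative, not x265 API):

    #include <emmintrin.h>
    #include <stdint.h>

    // Compare one packed encoder block against four packed references
    // and write the four SAD totals.
    static inline void sad4(__m128i enc, const __m128i ref[4], int32_t res[4])
    {
        for (int k = 0; k < 4; k++)
        {
            __m128i s = _mm_sad_epu8(enc, ref[k]);
            res[k] = _mm_cvtsi128_si32(
                _mm_add_epi32(s, _mm_shuffle_epi32(s, 2)));
        }
    }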
 
-#if HAVE_MMX
-void sad_x4_4x16(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, pixel *fref4, intptr_t frefstride, int32_t *res)
-{
-    __m128i sum0, sum1, sum2, sum3;
-
-    __m128i T00, T01, T02, T03;
-    __m128i T10, T11, T12, T13;
-    __m128i R00, R01, R02, R03, R04;
-    __m128i T20;
-
-    T00 = _mm_loadl_epi64((__m128i*)(fenc));
-    T01 = _mm_loadl_epi64((__m128i*)(fenc + FENC_STRIDE));
-    T01 = _mm_unpacklo_epi32(T00, T01);
-    T02 = _mm_loadl_epi64((__m128i*)(fenc + (2) * FENC_STRIDE));
-    T03 = _mm_loadl_epi64((__m128i*)(fenc + (3) * FENC_STRIDE));
-    T03 = _mm_unpacklo_epi32(T02, T03);
-    R00 = _mm_unpacklo_epi64(T01, T03);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref1));
-    T11 = _mm_loadl_epi64((__m128i*)(fref1 + frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref1 + (2) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref1 + (3) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R01 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref2));
-    T11 = _mm_loadl_epi64((__m128i*)(fref2 + frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref2 + (2) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref2 + (3) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R02 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref3));
-    T11 = _mm_loadl_epi64((__m128i*)(fref3 + frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref3 + (2) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref3 + (3) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R03 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref4));
-    T11 = _mm_loadl_epi64((__m128i*)(fref4 + frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref4 + (2) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref4 + (3) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R04 = _mm_unpacklo_epi64(T11, T13);
-
-    T20 = _mm_sad_epu8(R00, R01);
-    sum0 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
-    T20 = _mm_sad_epu8(R00, R02);
-    sum1 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
-    T20 = _mm_sad_epu8(R00, R03);
-    sum2 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
-    T20 = _mm_sad_epu8(R00, R04);
-    sum3 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
-    T00 = _mm_loadl_epi64((__m128i*)(fenc + (4) * FENC_STRIDE));
-    T01 = _mm_loadl_epi64((__m128i*)(fenc + (5) * FENC_STRIDE));
-    T01 = _mm_unpacklo_epi32(T00, T01);
-    T02 = _mm_loadl_epi64((__m128i*)(fenc + (6) * FENC_STRIDE));
-    T03 = _mm_loadl_epi64((__m128i*)(fenc + (7) * FENC_STRIDE));
-    T03 = _mm_unpacklo_epi32(T02, T03);
-    R00 = _mm_unpacklo_epi64(T01, T03);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref1 + (4) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref1 + (5) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref1 + (6) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref1 + (7) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R01 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref2 + (4) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref2 + (5) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref2 + (6) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref2 + (7) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R02 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref3 + (4) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref3 + (5) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref3 + (6) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref3 + (7) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R03 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref4 + (4) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref4 + (5) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref4 + (6) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref4 + (7) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R04 = _mm_unpacklo_epi64(T11, T13);
-
-    T20 = _mm_sad_epu8(R00, R01);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum0 = _mm_add_epi32(sum0, T20);
-
-    T20 = _mm_sad_epu8(R00, R02);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum1 = _mm_add_epi32(sum1, T20);
-
-    T20 = _mm_sad_epu8(R00, R03);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum2 = _mm_add_epi32(sum2, T20);
-
-    T20 = _mm_sad_epu8(R00, R04);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum3 = _mm_add_epi32(sum3, T20);
-
-    T00 = _mm_loadl_epi64((__m128i*)(fenc + (8) * FENC_STRIDE));
-    T01 = _mm_loadl_epi64((__m128i*)(fenc + (9) * FENC_STRIDE));
-    T01 = _mm_unpacklo_epi32(T00, T01);
-    T02 = _mm_loadl_epi64((__m128i*)(fenc + (10) * FENC_STRIDE));
-    T03 = _mm_loadl_epi64((__m128i*)(fenc + (11) * FENC_STRIDE));
-    T03 = _mm_unpacklo_epi32(T02, T03);
-    R00 = _mm_unpacklo_epi64(T01, T03);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref1 + (8) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref1 + (9) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref1 + (10) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref1 + (11) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R01 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref2 + (8) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref2 + (9) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref2 + (10) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref2 + (11) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R02 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref3 + (8) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref3 + (9) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref3 + (10) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref3 + (11) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R03 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref4 + (8) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref4 + (9) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref4 + (10) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref4 + (11) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R04 = _mm_unpacklo_epi64(T11, T13);
-
-    T20 = _mm_sad_epu8(R00, R01);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum0 = _mm_add_epi32(sum0, T20);
-
-    T20 = _mm_sad_epu8(R00, R02);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum1 = _mm_add_epi32(sum1, T20);
-
-    T20 = _mm_sad_epu8(R00, R03);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum2 = _mm_add_epi32(sum2, T20);
-
-    T20 = _mm_sad_epu8(R00, R04);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum3 = _mm_add_epi32(sum3, T20);
-
-    T00 = _mm_loadl_epi64((__m128i*)(fenc + (12) * FENC_STRIDE));
-    T01 = _mm_loadl_epi64((__m128i*)(fenc + (13) * FENC_STRIDE));
-    T01 = _mm_unpacklo_epi32(T00, T01);
-    T02 = _mm_loadl_epi64((__m128i*)(fenc + (14) * FENC_STRIDE));
-    T03 = _mm_loadl_epi64((__m128i*)(fenc + (15) * FENC_STRIDE));
-    T03 = _mm_unpacklo_epi32(T02, T03);
-    R00 = _mm_unpacklo_epi64(T01, T03);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref1 + (12) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref1 + (13) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref1 + (14) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref1 + (15) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R01 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref2 + (12) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref2 + (13) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref2 + (14) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref2 + (15) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R02 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref3 + (12) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref3 + (13) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref3 + (14) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref3 + (15) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R03 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref4 + (12) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref4 + (13) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref4 + (14) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref4 + (15) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R04 = _mm_unpacklo_epi64(T11, T13);
-
-    T20 = _mm_sad_epu8(R00, R01);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum0 = _mm_add_epi32(sum0, T20);
-
-    T20 = _mm_sad_epu8(R00, R02);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum1 = _mm_add_epi32(sum1, T20);
-
-    T20 = _mm_sad_epu8(R00, R03);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum2 = _mm_add_epi32(sum2, T20);
-
-    T20 = _mm_sad_epu8(R00, R04);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum3 = _mm_add_epi32(sum3, T20);
-
-    res[0] = _mm_cvtsi128_si32(sum0);
-    res[1] = _mm_cvtsi128_si32(sum1);
-    res[2] = _mm_cvtsi128_si32(sum2);
-    res[3] = _mm_cvtsi128_si32(sum3);
-}
-
-#else /* if HAVE_MMX */
-
-void sad_x4_4x16(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, pixel *fref4, intptr_t frefstride, int32_t *res)
-{
-    __m128i sum0 = _mm_setzero_si128();
-    __m128i sum1 = _mm_setzero_si128();
-    __m128i sum2 = _mm_setzero_si128();
-    __m128i sum3 = _mm_setzero_si128();
-
-    __m128i T00, T01, T02, T03;
-    __m128i T10, T11, T12, T13;
-    __m128i R00, R01, R02, R03, R04;
-    __m128i T20;
-
-    T00 = _mm_loadl_epi64((__m128i*)(fenc + (0) * FENC_STRIDE));
-    T01 = _mm_loadl_epi64((__m128i*)(fenc + (1) * FENC_STRIDE));
-    T01 = _mm_unpacklo_epi32(T00, T01);
-    T02 = _mm_loadl_epi64((__m128i*)(fenc + (2) * FENC_STRIDE));
-    T03 = _mm_loadl_epi64((__m128i*)(fenc + (3) * FENC_STRIDE));
-    T03 = _mm_unpacklo_epi32(T02, T03);
-    R00 = _mm_unpacklo_epi64(T01, T03);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref1 + (0) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref1 + (1) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref1 + (2) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref1 + (3) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R01 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref2 + (0) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref2 + (1) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref2 + (2) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref2 + (3) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R02 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref3 + (0) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref3 + (1) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref3 + (2) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref3 + (3) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R03 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref4 + (0) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref4 + (1) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref4 + (2) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref4 + (3) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R04 = _mm_unpacklo_epi64(T11, T13);
-
-    T20 = _mm_sad_epu8(R00, R01);
-    sum0 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
-    T20 = _mm_sad_epu8(R00, R02);
-    sum1 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
-    T20 = _mm_sad_epu8(R00, R03);
-    sum2 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
-    T20 = _mm_sad_epu8(R00, R04);
-    sum3 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-
-    T00 = _mm_loadl_epi64((__m128i*)(fenc + (4) * FENC_STRIDE));
-    T01 = _mm_loadl_epi64((__m128i*)(fenc + (5) * FENC_STRIDE));
-    T01 = _mm_unpacklo_epi32(T00, T01);
-    T02 = _mm_loadl_epi64((__m128i*)(fenc + (6) * FENC_STRIDE));
-    T03 = _mm_loadl_epi64((__m128i*)(fenc + (7) * FENC_STRIDE));
-    T03 = _mm_unpacklo_epi32(T02, T03);
-    R00 = _mm_unpacklo_epi64(T01, T03);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref1 + (4) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref1 + (5) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref1 + (6) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref1 + (7) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R01 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref2 + (4) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref2 + (5) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref2 + (6) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref2 + (7) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R02 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref3 + (4) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref3 + (5) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref3 + (6) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref3 + (7) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R03 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref4 + (4) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref4 + (5) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref4 + (6) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref4 + (7) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R04 = _mm_unpacklo_epi64(T11, T13);
-
-    T20 = _mm_sad_epu8(R00, R01);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum0 = _mm_add_epi32(sum0, T20);
-
-    T20 = _mm_sad_epu8(R00, R02);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum1 = _mm_add_epi32(sum1, T20);
-
-    T20 = _mm_sad_epu8(R00, R03);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum2 = _mm_add_epi32(sum2, T20);
-
-    T20 = _mm_sad_epu8(R00, R04);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum3 = _mm_add_epi32(sum3, T20);
-
-    T00 = _mm_loadl_epi64((__m128i*)(fenc + (8) * FENC_STRIDE));
-    T01 = _mm_loadl_epi64((__m128i*)(fenc + (9) * FENC_STRIDE));
-    T01 = _mm_unpacklo_epi32(T00, T01);
-    T02 = _mm_loadl_epi64((__m128i*)(fenc + (10) * FENC_STRIDE));
-    T03 = _mm_loadl_epi64((__m128i*)(fenc + (11) * FENC_STRIDE));
-    T03 = _mm_unpacklo_epi32(T02, T03);
-    R00 = _mm_unpacklo_epi64(T01, T03);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref1 + (8) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref1 + (9) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref1 + (10) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref1 + (11) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R01 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref2 + (8) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref2 + (9) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref2 + (10) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref2 + (11) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R02 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref3 + (8) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref3 + (9) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref3 + (10) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref3 + (11) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R03 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref4 + (8) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref4 + (9) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref4 + (10) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref4 + (11) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R04 = _mm_unpacklo_epi64(T11, T13);
-
-    T20 = _mm_sad_epu8(R00, R01);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum0 = _mm_add_epi32(sum0, T20);
-
-    T20 = _mm_sad_epu8(R00, R02);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum1 = _mm_add_epi32(sum1, T20);
-
-    T20 = _mm_sad_epu8(R00, R03);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum2 = _mm_add_epi32(sum2, T20);
-
-    T20 = _mm_sad_epu8(R00, R04);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum3 = _mm_add_epi32(sum3, T20);
-
-    T00 = _mm_loadl_epi64((__m128i*)(fenc + (12) * FENC_STRIDE));
-    T01 = _mm_loadl_epi64((__m128i*)(fenc + (13) * FENC_STRIDE));
-    T01 = _mm_unpacklo_epi32(T00, T01);
-    T02 = _mm_loadl_epi64((__m128i*)(fenc + (14) * FENC_STRIDE));
-    T03 = _mm_loadl_epi64((__m128i*)(fenc + (15) * FENC_STRIDE));
-    T03 = _mm_unpacklo_epi32(T02, T03);
-    R00 = _mm_unpacklo_epi64(T01, T03);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref1 + (12) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref1 + (13) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref1 + (14) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref1 + (15) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R01 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref2 + (12) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref2 + (13) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref2 + (14) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref2 + (15) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R02 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref3 + (12) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref3 + (13) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref3 + (14) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref3 + (15) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R03 = _mm_unpacklo_epi64(T11, T13);
-
-    T10 = _mm_loadl_epi64((__m128i*)(fref4 + (12) * frefstride));
-    T11 = _mm_loadl_epi64((__m128i*)(fref4 + (13) * frefstride));
-    T11 = _mm_unpacklo_epi32(T10, T11);
-    T12 = _mm_loadl_epi64((__m128i*)(fref4 + (14) * frefstride));
-    T13 = _mm_loadl_epi64((__m128i*)(fref4 + (15) * frefstride));
-    T13 = _mm_unpacklo_epi32(T12, T13);
-    R04 = _mm_unpacklo_epi64(T11, T13);
-
-    T20 = _mm_sad_epu8(R00, R01);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum0 = _mm_add_epi32(sum0, T20);
-
-    T20 = _mm_sad_epu8(R00, R02);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum1 = _mm_add_epi32(sum1, T20);
-
-    T20 = _mm_sad_epu8(R00, R03);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum2 = _mm_add_epi32(sum2, T20);
-
-    T20 = _mm_sad_epu8(R00, R04);
-    T20 = _mm_add_epi32(T20, _mm_shuffle_epi32(T20, 2));
-    sum3 = _mm_add_epi32(sum3, T20);
-
-    res[0] = _mm_cvtsi128_si32(sum0);
-    res[1] = _mm_cvtsi128_si32(sum1);
-    res[2] = _mm_cvtsi128_si32(sum2);
-    res[3] = _mm_cvtsi128_si32(sum3);
-}
-
-#endif /* if HAVE_MMX */
-
-#if HAVE_MMX
-template<int ly>
-// ly will always be 32
-void sad_x4_8(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, pixel *fref4, intptr_t frefstride, int32_t *res)
-{
-    __m64 sum0 = _mm_setzero_si64();
-    __m64 sum1 = _mm_setzero_si64();
-    __m64 sum2 = _mm_setzero_si64();
-    __m64 sum3 = _mm_setzero_si64();
-
-    __m64 T00, T01, T02, T03, T04, T05, T06, T07;
-    __m64 T10, T11, T12, T13, T14, T15, T16, T17;
-    __m64 T20, T21, T22, T23, T24, T25, T26, T27;
-
-    for (int i = 0; i < ly; i += 8)
-    {
-        T00 = (*(__m64*)(fenc + (i + 0) * FENC_STRIDE));
-        T01 = (*(__m64*)(fenc + (i + 1) * FENC_STRIDE));
-        T02 = (*(__m64*)(fenc + (i + 2) * FENC_STRIDE));
-        T03 = (*(__m64*)(fenc + (i + 3) * FENC_STRIDE));
-        T04 = (*(__m64*)(fenc + (i + 4) * FENC_STRIDE));
-        T05 = (*(__m64*)(fenc + (i + 5) * FENC_STRIDE));
-        T06 = (*(__m64*)(fenc + (i + 6) * FENC_STRIDE));
-        T07 = (*(__m64*)(fenc + (i + 7) * FENC_STRIDE));
-
-        T10 = (*(__m64*)(fref1 + (i + 0) * frefstride));
-        T11 = (*(__m64*)(fref1 + (i + 1) * frefstride));
-        T12 = (*(__m64*)(fref1 + (i + 2) * frefstride));
-        T13 = (*(__m64*)(fref1 + (i + 3) * frefstride));
-        T14 = (*(__m64*)(fref1 + (i + 4) * frefstride));
-        T15 = (*(__m64*)(fref1 + (i + 5) * frefstride));
-        T16 = (*(__m64*)(fref1 + (i + 6) * frefstride));
-        T17 = (*(__m64*)(fref1 + (i + 7) * frefstride));
-
-        T20 = _mm_sad_pu8(T00, T10);
-        T21 = _mm_sad_pu8(T01, T11);
-        T22 = _mm_sad_pu8(T02, T12);
-        T23 = _mm_sad_pu8(T03, T13);
-        T24 = _mm_sad_pu8(T04, T14);
-        T25 = _mm_sad_pu8(T05, T15);
-        T26 = _mm_sad_pu8(T06, T16);
-        T27 = _mm_sad_pu8(T07, T17);
-
-        sum0 = _mm_add_pi16(sum0, T20);
-        sum0 = _mm_add_pi16(sum0, T21);
-        sum0 = _mm_add_pi16(sum0, T22);
-        sum0 = _mm_add_pi16(sum0, T23);
-        sum0 = _mm_add_pi16(sum0, T24);
-        sum0 = _mm_add_pi16(sum0, T25);
-        sum0 = _mm_add_pi16(sum0, T26);
-        sum0 = _mm_add_pi16(sum0, T27);
-
-        T10 = (*(__m64*)(fref2 + (i + 0) * frefstride));
-        T11 = (*(__m64*)(fref2 + (i + 1) * frefstride));
-        T12 = (*(__m64*)(fref2 + (i + 2) * frefstride));
-        T13 = (*(__m64*)(fref2 + (i + 3) * frefstride));
-        T14 = (*(__m64*)(fref2 + (i + 4) * frefstride));
-        T15 = (*(__m64*)(fref2 + (i + 5) * frefstride));
-        T16 = (*(__m64*)(fref2 + (i + 6) * frefstride));
-        T17 = (*(__m64*)(fref2 + (i + 7) * frefstride));
-
-        T20 = _mm_sad_pu8(T00, T10);
-        T21 = _mm_sad_pu8(T01, T11);
-        T22 = _mm_sad_pu8(T02, T12);
-        T23 = _mm_sad_pu8(T03, T13);
-        T24 = _mm_sad_pu8(T04, T14);
-        T25 = _mm_sad_pu8(T05, T15);
-        T26 = _mm_sad_pu8(T06, T16);
-        T27 = _mm_sad_pu8(T07, T17);
-
-        sum1 = _mm_add_pi16(sum1, T20);
-        sum1 = _mm_add_pi16(sum1, T21);
-        sum1 = _mm_add_pi16(sum1, T22);
-        sum1 = _mm_add_pi16(sum1, T23);
-        sum1 = _mm_add_pi16(sum1, T24);
-        sum1 = _mm_add_pi16(sum1, T25);
-        sum1 = _mm_add_pi16(sum1, T26);
-        sum1 = _mm_add_pi16(sum1, T27);
-
-        T10 = (*(__m64*)(fref3 + (i + 0) * frefstride));
-        T11 = (*(__m64*)(fref3 + (i + 1) * frefstride));
-        T12 = (*(__m64*)(fref3 + (i + 2) * frefstride));
-        T13 = (*(__m64*)(fref3 + (i + 3) * frefstride));
-        T14 = (*(__m64*)(fref3 + (i + 4) * frefstride));
-        T15 = (*(__m64*)(fref3 + (i + 5) * frefstride));
-        T16 = (*(__m64*)(fref3 + (i + 6) * frefstride));
-        T17 = (*(__m64*)(fref3 + (i + 7) * frefstride));
-
-        T20 = _mm_sad_pu8(T00, T10);
-        T21 = _mm_sad_pu8(T01, T11);
-        T22 = _mm_sad_pu8(T02, T12);
-        T23 = _mm_sad_pu8(T03, T13);
-        T24 = _mm_sad_pu8(T04, T14);
-        T25 = _mm_sad_pu8(T05, T15);
-        T26 = _mm_sad_pu8(T06, T16);
-        T27 = _mm_sad_pu8(T07, T17);
-
-        sum2 = _mm_add_pi16(sum2, T20);
-        sum2 = _mm_add_pi16(sum2, T21);
-        sum2 = _mm_add_pi16(sum2, T22);
-        sum2 = _mm_add_pi16(sum2, T23);
-        sum2 = _mm_add_pi16(sum2, T24);
-        sum2 = _mm_add_pi16(sum2, T25);
-        sum2 = _mm_add_pi16(sum2, T26);
-        sum2 = _mm_add_pi16(sum2, T27);
-
-        T10 = (*(__m64*)(fref4 + (i + 0) * frefstride));
-        T11 = (*(__m64*)(fref4 + (i + 1) * frefstride));
-        T12 = (*(__m64*)(fref4 + (i + 2) * frefstride));
-        T13 = (*(__m64*)(fref4 + (i + 3) * frefstride));
-        T14 = (*(__m64*)(fref4 + (i + 4) * frefstride));
-        T15 = (*(__m64*)(fref4 + (i + 5) * frefstride));
-        T16 = (*(__m64*)(fref4 + (i + 6) * frefstride));
-        T17 = (*(__m64*)(fref4 + (i + 7) * frefstride));
-
-        T20 = _mm_sad_pu8(T00, T10);
-        T21 = _mm_sad_pu8(T01, T11);
-        T22 = _mm_sad_pu8(T02, T12);
-        T23 = _mm_sad_pu8(T03, T13);
-        T24 = _mm_sad_pu8(T04, T14);
-        T25 = _mm_sad_pu8(T05, T15);
-        T26 = _mm_sad_pu8(T06, T16);
-        T27 = _mm_sad_pu8(T07, T17);
-
-        sum3 = _mm_add_pi16(sum3, T20);
-        sum3 = _mm_add_pi16(sum3, T21);
-        sum3 = _mm_add_pi16(sum3, T22);
-        sum3 = _mm_add_pi16(sum3, T23);
-        sum3 = _mm_add_pi16(sum3, T24);
-        sum3 = _mm_add_pi16(sum3, T25);
-        sum3 = _mm_add_pi16(sum3, T26);
-        sum3 = _mm_add_pi16(sum3, T27);
-    }
-
-    res[0] = _m_to_int(sum0);
-    res[1] = _m_to_int(sum1);
-    res[2] = _m_to_int(sum2);
-    res[3] = _m_to_int(sum3);
-}
-
-#else /* if HAVE_MMX */
-
-template<int ly>
-// ly will always be 32
-void sad_x4_8(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, pixel *fref4, intptr_t frefstride, int32_t *res)
-{
-    __m128i sum0 = _mm_setzero_si128();
-    __m128i sum1 = _mm_setzero_si128();
-    __m128i sum2 = _mm_setzero_si128();
-    __m128i sum3 = _mm_setzero_si128();
-
-    __m128i T00, T01, T02, T03;
-    __m128i T10, T11, T12, T13;
-    __m128i T20, T21;
-
-    for (int i = 0; i < ly; i += 8)
-    {
-        T00 = _mm_loadl_epi64((__m128i*)(fenc + (i + 0) * FENC_STRIDE));
-        T01 = _mm_loadl_epi64((__m128i*)(fenc + (i + 1) * FENC_STRIDE));
-        T01 = _mm_unpacklo_epi64(T00, T01);
-        T02 = _mm_loadl_epi64((__m128i*)(fenc + (i + 2) * FENC_STRIDE));
-        T03 = _mm_loadl_epi64((__m128i*)(fenc + (i + 3) * FENC_STRIDE));
-        T03 = _mm_unpacklo_epi64(T02, T03);
-
-        T10 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 0) * frefstride));
-        T11 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 1) * frefstride));
-        T11 = _mm_unpacklo_epi64(T10, T11);
-        T12 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 2) * frefstride));
-        T13 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 3) * frefstride));
-        T13 = _mm_unpacklo_epi64(T12, T13);
-
-        T20 = _mm_sad_epu8(T01, T11);
-        T21 = _mm_sad_epu8(T03, T13);
-        T21 = _mm_add_epi32(T20, T21);
-        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
-        sum0 = _mm_add_epi32(sum0, T21);
-
-        T10 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 0) * frefstride));
-        T11 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 1) * frefstride));
-        T11 = _mm_unpacklo_epi64(T10, T11);
-        T12 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 2) * frefstride));
-        T13 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 3) * frefstride));
-        T13 = _mm_unpacklo_epi64(T12, T13);
-
-        T20 = _mm_sad_epu8(T01, T11);
-        T21 = _mm_sad_epu8(T03, T13);
-        T21 = _mm_add_epi32(T20, T21);
-        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
-        sum1 = _mm_add_epi32(sum1, T21);
-
-        T10 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 0) * frefstride));
-        T11 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 1) * frefstride));
-        T11 = _mm_unpacklo_epi64(T10, T11);
-        T12 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 2) * frefstride));
-        T13 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 3) * frefstride));
-        T13 = _mm_unpacklo_epi64(T12, T13);
-
-        T20 = _mm_sad_epu8(T01, T11);
-        T21 = _mm_sad_epu8(T03, T13);
-        T21 = _mm_add_epi32(T20, T21);
-        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
-        sum2 = _mm_add_epi32(sum2, T21);
-
-        T10 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 0) * frefstride));
-        T11 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 1) * frefstride));
-        T11 = _mm_unpacklo_epi64(T10, T11);
-        T12 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 2) * frefstride));
-        T13 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 3) * frefstride));
-        T13 = _mm_unpacklo_epi64(T12, T13);
-
-        T20 = _mm_sad_epu8(T01, T11);
-        T21 = _mm_sad_epu8(T03, T13);
-        T21 = _mm_add_epi32(T20, T21);
-        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
-        sum3 = _mm_add_epi32(sum3, T21);
-
-        T00 = _mm_loadl_epi64((__m128i*)(fenc + (i + 4) * FENC_STRIDE));
-        T01 = _mm_loadl_epi64((__m128i*)(fenc + (i + 5) * FENC_STRIDE));
-        T01 = _mm_unpacklo_epi64(T00, T01);
-        T02 = _mm_loadl_epi64((__m128i*)(fenc + (i + 6) * FENC_STRIDE));
-        T03 = _mm_loadl_epi64((__m128i*)(fenc + (i + 7) * FENC_STRIDE));
-        T03 = _mm_unpacklo_epi64(T02, T03);
-
-        T10 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 4) * frefstride));
-        T11 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 5) * frefstride));
-        T11 = _mm_unpacklo_epi64(T10, T11);
-        T12 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 6) * frefstride));
-        T13 = _mm_loadl_epi64((__m128i*)(fref1 + (i + 7) * frefstride));
-        T13 = _mm_unpacklo_epi64(T12, T13);
-
-        T20 = _mm_sad_epu8(T01, T11);
-        T21 = _mm_sad_epu8(T03, T13);
-        T21 = _mm_add_epi32(T20, T21);
-        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
-        sum0 = _mm_add_epi32(sum0, T21);
-
-        T10 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 4) * frefstride));
-        T11 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 5) * frefstride));
-        T11 = _mm_unpacklo_epi64(T10, T11);
-        T12 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 6) * frefstride));
-        T13 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 7) * frefstride));
-        T13 = _mm_unpacklo_epi64(T12, T13);
-
-        T20 = _mm_sad_epu8(T01, T11);
-        T21 = _mm_sad_epu8(T03, T13);
-        T21 = _mm_add_epi32(T20, T21);
-        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
-        sum1 = _mm_add_epi32(sum1, T21);
-
-        T10 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 4) * frefstride));
-        T11 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 5) * frefstride));
-        T11 = _mm_unpacklo_epi64(T10, T11);
-        T12 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 6) * frefstride));
-        T13 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 7) * frefstride));
-        T13 = _mm_unpacklo_epi64(T12, T13);
-
-        T20 = _mm_sad_epu8(T01, T11);
-        T21 = _mm_sad_epu8(T03, T13);
-        T21 = _mm_add_epi32(T20, T21);
-        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
-        sum2 = _mm_add_epi32(sum2, T21);
-
-        T10 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 4) * frefstride));
-        T11 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 5) * frefstride));
-        T11 = _mm_unpacklo_epi64(T10, T11);
-        T12 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 6) * frefstride));
-        T13 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 7) * frefstride));
-        T13 = _mm_unpacklo_epi64(T12, T13);
-
-        T20 = _mm_sad_epu8(T01, T11);
-        T21 = _mm_sad_epu8(T03, T13);
-        T21 = _mm_add_epi32(T20, T21);
-        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
-        sum3 = _mm_add_epi32(sum3, T21);
-    }
-
-    res[0] = _mm_cvtsi128_si32(sum0);
-    res[1] = _mm_cvtsi128_si32(sum1);
-    res[2] = _mm_cvtsi128_si32(sum2);
-    res[3] = _mm_cvtsi128_si32(sum3);
-}
-
-#endif /* if HAVE_MMX */
-
-/* For performance - This function assumes that the *last load* can access 16 elements. */
-
 template<int ly>
 void sad_x4_12(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, pixel *fref4, intptr_t frefstride, int32_t *res)
 {
@@ -3357,360 +977,6 @@
 }
 
 template<int ly>
-void sad_x4_16(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, pixel *fref4, intptr_t frefstride, int32_t *res)
-{
-#define PROCESS_16x4x4(BASE) \
-    T00 = _mm_load_si128((__m128i*)(fenc + (BASE + 0) * FENC_STRIDE)); \
-    T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE)); \
-    T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE)); \
-    T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE)); \
-    T10 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 0) * frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 3) * frefstride)); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res0 += _mm_cvtsi128_si32(sum0); \
-    T10 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 0) * frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 3) * frefstride)); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res1 += _mm_cvtsi128_si32(sum0); \
-    T10 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 0) * frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 3) * frefstride)); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res2 += _mm_cvtsi128_si32(sum0); \
-    T10 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 0) * frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 3) * frefstride)); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res3 += _mm_cvtsi128_si32(sum0); \
-
-    __m128i T00, T01, T02, T03;
-    __m128i T10, T11, T12, T13;
-    __m128i T20, T21, T22, T23;
-    __m128i sum0, sum1;
-    int res0 = 0, res1 = 0, res2 = 0, res3 = 0;
-
-    // ly == 4, 12, 32, 64
-    PROCESS_16x4x4(0);
-    if (ly >= 8)
-    {
-        PROCESS_16x4x4(4);
-    }
-    if (ly >= 12)
-    {
-        PROCESS_16x4x4(8);
-    }
-    if (ly > 12)
-    {
-        PROCESS_16x4x4(12);
-        for (int i = 16; i < ly; i += 16)
-        {
-            PROCESS_16x4x4(i);
-            PROCESS_16x4x4(i + 4);
-            PROCESS_16x4x4(i + 8);
-            PROCESS_16x4x4(i + 12);
-        }
-    }
-    res[0] = res0;
-    res[1] = res1;
-    res[2] = res2;
-    res[3] = res3;
-}
-
-template<int ly>
-void sad_x4_24(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, pixel *fref4, intptr_t frefstride, int32_t *res)
-{
-    res[0] = res[1] = res[2] = res[3] = 0;
-    __m128i T00, T01, T02, T03, T04, T05;
-    __m128i T10, T11, T12, T13, T14, T15;
-    __m128i T20, T21, T22, T23;
-    __m128i T30, T31;
-    __m128i sum0, sum1;
-
-#define PROCESS_24x4x4(BASE) \
-    T00 = _mm_load_si128((__m128i*)(fenc + (BASE)*FENC_STRIDE)); \
-    T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE)); \
-    T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE)); \
-    T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE)); \
-    T10 = _mm_loadl_epi64((__m128i*)(fenc + (BASE)*FENC_STRIDE + 16)); \
-    T11 = _mm_loadl_epi64((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE + 16)); \
-    T04 = _mm_unpacklo_epi64(T10, T11); \
-    T12 = _mm_loadl_epi64((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE + 16)); \
-    T13 = _mm_loadl_epi64((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE + 16)); \
-    T05 = _mm_unpacklo_epi64(T12, T13); \
-    T10 = _mm_loadu_si128((__m128i*)(fref1 + (BASE)*frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 3) * frefstride)); \
-    T20 = _mm_loadl_epi64((__m128i*)(fref1 + (BASE)*frefstride + 16)); \
-    T21 = _mm_loadl_epi64((__m128i*)(fref1 + (BASE + 1) * frefstride + 16)); \
-    T14 = _mm_unpacklo_epi64(T20, T21); \
-    T22 = _mm_loadl_epi64((__m128i*)(fref1 + (BASE + 2) * frefstride + 16)); \
-    T23 = _mm_loadl_epi64((__m128i*)(fref1 + (BASE + 3) * frefstride + 16)); \
-    T15 = _mm_unpacklo_epi64(T22, T23); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T30 = _mm_sad_epu8(T04, T14); \
-    T31 = _mm_sad_epu8(T05, T15); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum1 = _mm_add_epi16(T30, T31); \
-    sum0 = _mm_add_epi16(sum0, sum1); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res[0] += _mm_cvtsi128_si32(sum0); \
-    T10 = _mm_loadu_si128((__m128i*)(fref2 + (BASE)*frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 3) * frefstride)); \
-    T20 = _mm_loadl_epi64((__m128i*)(fref2 + (BASE)*frefstride + 16)); \
-    T21 = _mm_loadl_epi64((__m128i*)(fref2 + (BASE + 1) * frefstride + 16)); \
-    T14 = _mm_unpacklo_epi64(T20, T21); \
-    T22 = _mm_loadl_epi64((__m128i*)(fref2 + (BASE + 2) * frefstride + 16)); \
-    T23 = _mm_loadl_epi64((__m128i*)(fref2 + (BASE + 3) * frefstride + 16)); \
-    T15 = _mm_unpacklo_epi64(T22, T23); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T30 = _mm_sad_epu8(T04, T14); \
-    T31 = _mm_sad_epu8(T05, T15); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum1 = _mm_add_epi16(T30, T31); \
-    sum0 = _mm_add_epi16(sum0, sum1); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res[1] += _mm_cvtsi128_si32(sum0); \
-    T10 = _mm_loadu_si128((__m128i*)(fref3 + (BASE)*frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 3) * frefstride)); \
-    T20 = _mm_loadl_epi64((__m128i*)(fref3 + (BASE)*frefstride + 16)); \
-    T21 = _mm_loadl_epi64((__m128i*)(fref3 + (BASE + 1) * frefstride + 16)); \
-    T14 = _mm_unpacklo_epi64(T20, T21); \
-    T22 = _mm_loadl_epi64((__m128i*)(fref3 + (BASE + 2) * frefstride + 16)); \
-    T23 = _mm_loadl_epi64((__m128i*)(fref3 + (BASE + 3) * frefstride + 16)); \
-    T15 = _mm_unpacklo_epi64(T22, T23); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T30 = _mm_sad_epu8(T04, T14); \
-    T31 = _mm_sad_epu8(T05, T15); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum1 = _mm_add_epi16(T30, T31); \
-    sum0 = _mm_add_epi16(sum0, sum1); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res[2] += _mm_cvtsi128_si32(sum0); \
-    T10 = _mm_loadu_si128((__m128i*)(fref4 + (BASE)*frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 3) * frefstride)); \
-    T20 = _mm_loadl_epi64((__m128i*)(fref4 + (BASE)*frefstride + 16)); \
-    T21 = _mm_loadl_epi64((__m128i*)(fref4 + (BASE + 1) * frefstride + 16)); \
-    T14 = _mm_unpacklo_epi64(T20, T21); \
-    T22 = _mm_loadl_epi64((__m128i*)(fref4 + (BASE + 2) * frefstride + 16)); \
-    T23 = _mm_loadl_epi64((__m128i*)(fref4 + (BASE + 3) * frefstride + 16)); \
-    T15 = _mm_unpacklo_epi64(T22, T23); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T30 = _mm_sad_epu8(T04, T14); \
-    T31 = _mm_sad_epu8(T05, T15); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum1 = _mm_add_epi16(T30, T31); \
-    sum0 = _mm_add_epi16(sum0, sum1); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res[3] += _mm_cvtsi128_si32(sum0)
-
-    for (int i = 0; i < ly; i += 8)
-    {
-        PROCESS_24x4x4(i);
-        PROCESS_24x4x4(i + 4);
-    }
-}
-
-template<int ly>
-// ly will be 8, 16, 24, or 32
-void sad_x4_32(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, pixel *fref4, intptr_t frefstride, int32_t *res)
-{
-    res[0] = res[1] = res[2] = res[3] = 0;
-    __m128i T00, T01, T02, T03, T04, T05, T06, T07;
-    __m128i T10, T11, T12, T13, T14, T15, T16, T17;
-    __m128i T20, T21, T22, T23, T24, T25, T26, T27;
-    __m128i sum0, sum1;
-
-#define PROCESS_32x4x4(BASE) \
-    T00 = _mm_load_si128((__m128i*)(fenc + (BASE)*FENC_STRIDE)); \
-    T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE)); \
-    T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE)); \
-    T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE)); \
-    T04 = _mm_load_si128((__m128i*)(fenc + (BASE)*FENC_STRIDE + 16)); \
-    T05 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * FENC_STRIDE + 16)); \
-    T06 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * FENC_STRIDE + 16)); \
-    T07 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * FENC_STRIDE + 16)); \
-    T10 = _mm_loadu_si128((__m128i*)(fref1 + (BASE)*frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 3) * frefstride)); \
-    T14 = _mm_loadu_si128((__m128i*)(fref1 + (BASE)*frefstride + 16)); \
-    T15 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 1) * frefstride + 16)); \
-    T16 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 2) * frefstride + 16)); \
-    T17 = _mm_loadu_si128((__m128i*)(fref1 + (BASE + 3) * frefstride + 16)); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T24 = _mm_sad_epu8(T04, T14); \
-    T25 = _mm_sad_epu8(T05, T15); \
-    T26 = _mm_sad_epu8(T06, T16); \
-    T27 = _mm_sad_epu8(T07, T17); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    T24 = _mm_add_epi16(T24, T25); \
-    T26 = _mm_add_epi16(T26, T27); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum0 = _mm_add_epi16(sum0, T24); \
-    sum0 = _mm_add_epi16(sum0, T26); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res[0] += _mm_cvtsi128_si32(sum0); \
-    T10 = _mm_loadu_si128((__m128i*)(fref2 + (BASE)*frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 3) * frefstride)); \
-    T14 = _mm_loadu_si128((__m128i*)(fref2 + (BASE)*frefstride + 16)); \
-    T15 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 1) * frefstride + 16)); \
-    T16 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 2) * frefstride + 16)); \
-    T17 = _mm_loadu_si128((__m128i*)(fref2 + (BASE + 3) * frefstride + 16)); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T24 = _mm_sad_epu8(T04, T14); \
-    T25 = _mm_sad_epu8(T05, T15); \
-    T26 = _mm_sad_epu8(T06, T16); \
-    T27 = _mm_sad_epu8(T07, T17); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    T24 = _mm_add_epi16(T24, T25); \
-    T26 = _mm_add_epi16(T26, T27); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum0 = _mm_add_epi16(sum0, T24); \
-    sum0 = _mm_add_epi16(sum0, T26); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res[1] += _mm_cvtsi128_si32(sum0); \
-    T10 = _mm_loadu_si128((__m128i*)(fref3 + (BASE)*frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 3) * frefstride)); \
-    T14 = _mm_loadu_si128((__m128i*)(fref3 + (BASE)*frefstride + 16)); \
-    T15 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 1) * frefstride + 16)); \
-    T16 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 2) * frefstride + 16)); \
-    T17 = _mm_loadu_si128((__m128i*)(fref3 + (BASE + 3) * frefstride + 16)); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T24 = _mm_sad_epu8(T04, T14); \
-    T25 = _mm_sad_epu8(T05, T15); \
-    T26 = _mm_sad_epu8(T06, T16); \
-    T27 = _mm_sad_epu8(T07, T17); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    T24 = _mm_add_epi16(T24, T25); \
-    T26 = _mm_add_epi16(T26, T27); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum0 = _mm_add_epi16(sum0, T24); \
-    sum0 = _mm_add_epi16(sum0, T26); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res[2] += _mm_cvtsi128_si32(sum0); \
-    T10 = _mm_loadu_si128((__m128i*)(fref4 + (BASE)*frefstride)); \
-    T11 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 1) * frefstride)); \
-    T12 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 2) * frefstride)); \
-    T13 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 3) * frefstride)); \
-    T14 = _mm_loadu_si128((__m128i*)(fref4 + (BASE)*frefstride + 16)); \
-    T15 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 1) * frefstride + 16)); \
-    T16 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 2) * frefstride + 16)); \
-    T17 = _mm_loadu_si128((__m128i*)(fref4 + (BASE + 3) * frefstride + 16)); \
-    T20 = _mm_sad_epu8(T00, T10); \
-    T21 = _mm_sad_epu8(T01, T11); \
-    T22 = _mm_sad_epu8(T02, T12); \
-    T23 = _mm_sad_epu8(T03, T13); \
-    T24 = _mm_sad_epu8(T04, T14); \
-    T25 = _mm_sad_epu8(T05, T15); \
-    T26 = _mm_sad_epu8(T06, T16); \
-    T27 = _mm_sad_epu8(T07, T17); \
-    T20 = _mm_add_epi16(T20, T21); \
-    T22 = _mm_add_epi16(T22, T23); \
-    T24 = _mm_add_epi16(T24, T25); \
-    T26 = _mm_add_epi16(T26, T27); \
-    sum0 = _mm_add_epi16(T20, T22); \
-    sum0 = _mm_add_epi16(sum0, T24); \
-    sum0 = _mm_add_epi16(sum0, T26); \
-    sum1 = _mm_shuffle_epi32(sum0, 2); \
-    sum0 = _mm_add_epi32(sum0, sum1); \
-    res[3] += _mm_cvtsi128_si32(sum0)
-
-    for (int i = 0; i < ly; i += 8)
-    {
-        PROCESS_32x4x4(i);
-        PROCESS_32x4x4(i + 4);
-    }
-}
-
-template<int ly>
 void sad_x4_48(pixel *fenc, pixel *fref1, pixel *fref2, pixel *fref3, pixel *fref4, intptr_t frefstride, int32_t *res)
 {
     __m128i sum0 = _mm_setzero_si128();
@@ -5601,7 +2867,6 @@
     p.sse_ss[LUMA_ ## W ## x ## H] = sse_ss ## W < H >
 #else
 #define SETUP_PARTITION(W, H) \
-    p.sad[LUMA_ ## W ## x ## H] = sad_ ## W<H>; \
     p.sad_x3[LUMA_ ## W ## x ## H] = sad_x3_ ## W<H>; \
     p.sad_x4[LUMA_ ## W ## x ## H] = sad_x4_ ## W<H>; \
     p.sse_sp[LUMA_ ## W ## x ## H] = sse_sp ## W<H>; \
@@ -5616,25 +2881,25 @@
     /* 2Nx2N, 2NxN, Nx2N, 4Ax3A, 4AxA, 3Ax4A, Ax4A */
     SETUP_PARTITION(64, 64);
     SETUP_PARTITION(64, 32);
-    SETUP_PARTITION(32, 64);
+    SETUP_NONSAD(32, 64);
     SETUP_PARTITION(64, 16);
     SETUP_PARTITION(64, 48);
-    SETUP_PARTITION(16, 64);
+    SETUP_NONSAD(16, 64);
     SETUP_PARTITION(48, 64);
 
-    SETUP_PARTITION(32, 32);
-    SETUP_PARTITION(32, 16);
-    SETUP_PARTITION(16, 32);
-    SETUP_PARTITION(32, 8);
-    SETUP_PARTITION(32, 24);
-    SETUP_PARTITION(8, 32);
-    SETUP_PARTITION(24, 32);
+    SETUP_NONSAD(32, 32);
+    SETUP_NONSAD(32, 16);
+    SETUP_NONSAD(16, 32);
+    SETUP_NONSAD(32, 8);
+    SETUP_NONSAD(32, 24);
+    SETUP_NONSAD(8, 32);
+    SETUP_NONSAD(24, 32);
 
     SETUP_NONSAD(16, 16); // 16x16 SAD covered by assembly
     SETUP_NONSAD(16, 8);  // 16x8 SAD covered by assembly
     SETUP_NONSAD(8, 16);  // 8x16 SAD covered by assembly
-    SETUP_PARTITION(16, 4);
-    SETUP_PARTITION(16, 12);
+    SETUP_NONSAD(16, 4);
+    SETUP_NONSAD(16, 12);
     SETUP_NONSAD(4, 16); // 4x16 SAD covered by assembly
 #if !defined(__clang__)
     SETUP_PARTITION(12, 16);
@@ -5652,8 +2917,6 @@
     Setup_Vec_Pixel16Primitives_sse41(p);
 #else
     // These are the only SSE primitives uncovered by assembly
-    p.sad_x3[LUMA_4x16] = sad_x3_4x16;
-    p.sad_x4[LUMA_4x16] = sad_x4_4x16;
     p.sse_pp[LUMA_12x16] = sse_pp_12x16;
     p.sse_pp[LUMA_24x32] = sse_pp_24x32;
     p.sse_pp[LUMA_48x64] = sse_pp_48x64;
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/vec/vec-primitives.cpp
--- a/source/common/vec/vec-primitives.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/vec/vec-primitives.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -43,10 +43,30 @@
     *edx = output[3];
 }
 
+#if defined(_MSC_VER)
+#pragma warning(disable: 4100)
+#endif
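+// XGETBV reads an extended control register; op == 0 selects XCR0, whose low
+// bits report which register states (x87/SSE/AVX) the OS saves and restores.
+// Callers typically test (eax & 0x6) == 0x6 before enabling AVX code paths.
+// (C4100 is disabled above because 'op' is unused on the _WIN64 fallback.)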
 void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx)
 {
+#if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)
+
+    // MSVC 2010 SP1 or later, or Intel C++ 12.0 or later; both provide _xgetbv()
     uint64_t out = _xgetbv(op);
 
+#elif defined(__GNUC__)    // use inline assembly, GNU/AT&T syntax
+
+    uint32_t a, d;
+    __asm("xgetbv" : "=a" (a), "=d" (d) : "c" (op));
+    *eax = a;
+    *edx = d;
+    return;
+
+#elif defined(_WIN64)      // x64 with an older compiler: no intrinsic and no inline asm, so report zero
+
+    uint64_t out = 0;
+
+#endif
+
     *eax = (uint32_t)out;
     *edx = (uint32_t)(out >> 32);
 }
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -126,7 +126,8 @@
     p.pixelavg_pp[LUMA_8x4]   = x265_pixel_avg_8x4_ ## cpu;
 
 #define SETUP_CHROMA_FUNC_DEF(W, H, cpu) \
-    p.chroma_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu
+    p.chroma_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
+    p.chroma_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu
 
 #define CHROMA_FILTERS(cpu) \
     SETUP_CHROMA_FUNC_DEF(4, 4, cpu); \
@@ -221,8 +222,8 @@
     {
         INIT8_NAME(sse_pp, ssd, _mmx);
         INIT8(sad, _mmx2);
-        INIT7(sad_x3, _mmx2);
-        INIT7(sad_x4, _mmx2);
+        INIT8(sad_x3, _mmx2);
+        INIT8(sad_x4, _mmx2);
         INIT8(satd, _mmx2);
         HEVC_SATD(mmx2);
         p.satd[LUMA_12x16] = cmp<12, 16, 4, 16, x265_pixel_satd_4x16_mmx2>;
@@ -235,6 +236,27 @@
         //p.pixelavg_pp[LUMA_4x4]  = x265_pixel_avg_4x4_mmx2;
         //PIXEL_AVE(sse2);
 
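+        // SSE2 SAD assembly for the HEVC block sizes that the INIT2/INIT8
+        // macros below do not cover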
+        p.sad[LUMA_8x32]   = x265_pixel_sad_8x32_sse2;
+        p.sad[LUMA_16x4 ]  = x265_pixel_sad_16x4_sse2;
+        p.sad[LUMA_16x12]  = x265_pixel_sad_16x12_sse2;
+        p.sad[LUMA_16x32]  = x265_pixel_sad_16x32_sse2;
+        p.sad[LUMA_16x64]  = x265_pixel_sad_16x64_sse2;
+
+        p.sad[LUMA_32x8 ]  = x265_pixel_sad_32x8_sse2;
+        p.sad[LUMA_32x16]  = x265_pixel_sad_32x16_sse2;
+        p.sad[LUMA_32x24]  = x265_pixel_sad_32x24_sse2;
+        p.sad[LUMA_32x32]  = x265_pixel_sad_32x32_sse2;
+        p.sad[LUMA_32x64]  = x265_pixel_sad_32x64_sse2;
+
+        p.sad[LUMA_64x16]  = x265_pixel_sad_64x16_sse2;
+        p.sad[LUMA_64x32]  = x265_pixel_sad_64x32_sse2;
+        p.sad[LUMA_64x48]  = x265_pixel_sad_64x48_sse2;
+        p.sad[LUMA_64x64]  = x265_pixel_sad_64x64_sse2;
+
+        p.sad[LUMA_48x64]  = x265_pixel_sad_48x64_sse2;
+        p.sad[LUMA_24x32]  = x265_pixel_sad_24x32_sse2;
+        p.sad[LUMA_12x16]  = x265_pixel_sad_12x16_sse2;
+
         ASSGN_SSE(sse2);
         INIT2(sad, _sse2);
         INIT2(sad_x3, _sse2);
@@ -280,6 +302,22 @@
         p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ssse3;
         p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_ssse3;
         p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_ssse3;
+        p.sad_x3[LUMA_24x32] = x265_pixel_sad_x3_24x32_ssse3;
+        p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_ssse3;
+        p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_ssse3;
+        p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_ssse3;
+        p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_ssse3;
+        p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_ssse3;
+        p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_ssse3;
+        p.sad_x4[LUMA_32x8] = x265_pixel_sad_x4_32x8_ssse3;
+        p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_ssse3;
+        p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_ssse3;
+        p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_ssse3;
+        p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_ssse3;
+
+        p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
+        p.ipfilter_sp[FILTER_V_S_P_8] = x265_interp_8tap_v_sp_ssse3;
+        p.luma_p2s = x265_luma_p2s_ssse3;
     }
     if (cpuMask & X265_CPU_SSE4)
     {
@@ -310,6 +348,18 @@
         p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx;
         p.sad_x3[LUMA_16x64] = x265_pixel_sad_x3_16x64_avx;
         p.sad_x4[LUMA_16x64] = x265_pixel_sad_x4_16x64_avx;
+        p.sad_x3[LUMA_24x32] = x265_pixel_sad_x3_24x32_avx;
+        p.sad_x4[LUMA_24x32] = x265_pixel_sad_x4_24x32_avx;
+        p.sad_x3[LUMA_32x8] = x265_pixel_sad_x3_32x8_avx;
+        p.sad_x3[LUMA_32x16] = x265_pixel_sad_x3_32x16_avx;
+        p.sad_x3[LUMA_32x24] = x265_pixel_sad_x3_32x24_avx;
+        p.sad_x3[LUMA_32x32] = x265_pixel_sad_x3_32x32_avx;
+        p.sad_x3[LUMA_32x64] = x265_pixel_sad_x3_32x64_avx;
+        p.sad_x4[LUMA_32x8] = x265_pixel_sad_x4_32x8_avx;
+        p.sad_x4[LUMA_32x16] = x265_pixel_sad_x4_32x16_avx;
+        p.sad_x4[LUMA_32x24] = x265_pixel_sad_x4_32x24_avx;
+        p.sad_x4[LUMA_32x32] = x265_pixel_sad_x4_32x32_avx;
+        p.sad_x4[LUMA_32x64] = x265_pixel_sad_x4_32x64_avx;
     }
     if (cpuMask & X265_CPU_XOP)
     {
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/x86/ipfilter8.asm	Thu Oct 31 18:43:03 2013 +0530
@@ -35,7 +35,14 @@
            db 4, 5, 6, 7, 8,  9,  10, 11, 5, 6, 7, 8,  9,  10, 11, 12
            db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14
 
-tab_c_512:  times 8 dw 512
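+; pshufb masks: tab_Vm broadcasts the (c0,c1) and (c2,c3) coefficient word
+; pairs; tab_Cm replicates the four chroma taps as (c0,c2,c1,c3) to match the
+; r0,r2,r1,r3 byte order produced by the punpcklbw sequences below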
+tab_Vm:    db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+           db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
+
+tab_Cm:    db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3
+
+tab_c_512:      times 8 dw 512
+tab_c_8192:     times 8 dw 8192
+tab_c_526336:   times 4 dd 8192*64+2048
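+; 526336 = 8192*64 + 2048: re-adds the 8192 bias subtracted from the
+; horizontal intermediates (scaled by the vertical taps' sum of 64), plus
+; 2048 to round the final 12-bit shift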
 
 tab_ChromaCoeff: db  0, 64,  0,  0
                  db -2, 58, 10, -2
@@ -51,21 +58,46 @@
                  db  -1, 4, -11, 40,  40, -11, 4, -1
                  db   0, 1, -5,  17,  58, -10, 4, -1
 
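+; vertical luma coefficients stored as (even,odd) word pairs, each repeated
+; 4 times, so pmaddwd on punpcklwd-interleaved rows computes
+; c[2i]*p[j] + c[2i+1]*p[j+1] per lane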
+tab_LumaCoeffV: times 4 dw 0, 0
+                times 4 dw 0, 64
+                times 4 dw 0, 0
+                times 4 dw 0, 0
+
+                times 4 dw -1, 4
+                times 4 dw -10, 58
+                times 4 dw 17, -5
+                times 4 dw 1, 0
+
+                times 4 dw -1, 4
+                times 4 dw -11, 40
+                times 4 dw 40, -11
+                times 4 dw 4, -1
+
+                times 4 dw 0, 1
+                times 4 dw -5, 17
+                times 4 dw 58, -10
+                times 4 dw 4, -1
+
+tab_c_128:      times 16 db 0x80
+tab_c_64_n64:   times 8 db 64, -64
+
 
 SECTION .text
 
 %macro FILTER_H4_w2_2 3
-    movu        %2, [srcq - 1]
+    movh        %2, [srcq - 1]
     pshufb      %2, %2, Tm0
+    movh        %1, [srcq + srcstrideq - 1]
+    pshufb      %1, %1, Tm0
+    punpcklqdq  %2, %1
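+    ; with both rows packed into one register, a single pmaddubsw/phaddw
+    ; filters them together; the packed results are stored below through a GPR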
     pmaddubsw   %2, coef2
-    movu        %1, [srcq + srcstrideq - 1]
-    pshufb      %1, %1, Tm0
-    pmaddubsw   %1, coef2
-    phaddw      %2, %1
+    phaddw      %2, %2
     pmulhrsw    %2, %3
     packuswb    %2, %2
-    pextrw      [dstq], %2, 0
-    pextrw      [dstq + dststrideq], %2, 2
+    movd        r4, %2
+    mov         [dstq], r4w
+    shr         r4, 16
+    mov         [dstq + dststrideq], r4w
 %endmacro
 
 ;-----------------------------------------------------------------------------
@@ -137,17 +169,18 @@
 RET
 
 %macro FILTER_H4_w4_2 3
-    movu        %2, [srcq - 1]
+    movh        %2, [srcq - 1]
     pshufb      %2, %2, Tm0
     pmaddubsw   %2, coef2
-    movu        %1, [srcq + srcstrideq - 1]
+    movh        %1, [srcq + srcstrideq - 1]
     pshufb      %1, %1, Tm0
     pmaddubsw   %1, coef2
     phaddw      %2, %1
     pmulhrsw    %2, %3
     packuswb    %2, %2
-    movd        [dstq],      %2
-    pextrd      [dstq + dststrideq], %2,  1
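+    ; palignr rotates the second row's result into lane 0 so both stores can
+    ; use a plain movd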
+    movd        [dstq], %2
+    palignr     %2, %2, 4
+    movd        [dstq + dststrideq], %2
 %endmacro
 
 ;-----------------------------------------------------------------------------
@@ -523,8 +556,8 @@
     pmaddubsw   %1, %5
     phaddw      %4, %1
     phaddw      %2, %4
+  %if %0 == 8
     pmulhrsw    %2, %6
-  %if %0 == 8
     packuswb    %2, %2
     movh        %8, %2
   %endif
@@ -623,3 +656,1474 @@
     IPFILTER_LUMA 48, 64
     IPFILTER_LUMA 64, 16
     IPFILTER_LUMA 16, 64
+
+
+;-----------------------------------------------------------------------------
+; Interpolate HV
+;-----------------------------------------------------------------------------
+%macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) -> (t3, t5), (t4, t1), [2]
+    mova        %5, [r0 +  (%6 + 0) * 16]
+    mova        %1, [r0 +  (%6 + 1) * 16]
+    mova        %2, [r0 +  (%6 + 2) * 16]
+    punpcklwd   %3, %5, %1
+    punpckhwd   %5, %1
+    pmaddwd     %3, [r5 + (%7) * 16]   ; R3 = L[0+1] -- Row 0
+    pmaddwd     %5, [r5 + (%7) * 16]   ; R0 = H[0+1]
+    punpcklwd   %4, %1, %2
+    punpckhwd   %1, %2
+    pmaddwd     %4, [r5 + (%7) * 16]   ; R4 = L[1+2] -- Row 1
+    pmaddwd     %1, [r5 + (%7) * 16]   ; R1 = H[1+2]
+%endmacro ; FILTER_HV8_START
+
+%macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff) -> [6]
+    mova        %8, [r0 +  (%9 + 0) * 16]
+    mova        %1, [r0 +  (%9 + 1) * 16]
+    punpcklwd   %7, %2, %8
+    punpckhwd   %2, %8
+    pmaddwd     %7, [r5 + %10 * 16]
+    pmaddwd     %2, [r5 + %10 * 16]
+    paddd       %3, %7              ; R3 = L[0+1+2+3] -- Row 0
+    paddd       %5, %2              ; R0 = H[0+1+2+3]
+    punpcklwd   %7, %8, %1
+    punpckhwd   %8, %1
+    pmaddwd     %7, [r5 + %10 * 16]
+    pmaddwd     %8, [r5 + %10 * 16]
+    paddd       %4, %7              ; R4 = L[1+2+3+4] -- Row 1
+    paddd       %6, %8              ; R1 = H[1+2+3+4]
+%endmacro ; FILTER_HV8_MID
+
+; Round and Saturate
+%macro FILTER_HV8_END 4 ; output in [1, 3]
+    paddd       %1, [tab_c_526336]
+    paddd       %2, [tab_c_526336]
+    paddd       %3, [tab_c_526336]
+    paddd       %4, [tab_c_526336]
+    psrad       %1, 12
+    psrad       %2, 12
+    psrad       %3, 12
+    psrad       %4, 12
+    packssdw    %1, %2
+    packssdw    %3, %4
+
+    ; TODO: would a single merged pack be better? Two separate packs keep the dependency chains short
+    packuswb    %1, %1
+    packuswb    %3, %3
+%endmacro ; FILTER_HV8_END
+
+;-----------------------------------------------------------------------------
+; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
+%define coef        m7
+%define stk_buf     rsp
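+; two-pass filter: the horizontal pass writes 8+7 rows of 16-bit intermediates
+; into the 15*16-byte stack buffer, then the vertical pass reads them back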
+
+    mov         r4d,        r4m
+    mov         r5d,        r5m
+
+%ifdef PIC
+    lea         r6,         [tab_LumaCoeff]
+    movh        coef,       [r6 + r4 * 8]
+%else
+    movh        coef,       [tab_LumaCoeff + r4 * 8]
+%endif
+    punpcklqdq  coef,       coef
+
+    ; move to row -3
+    lea         r6,         [r1 + r1 * 2]
+    sub         r0,         r6
+
+    xor         r6,         r6
+    mov         r4,         rsp
+
+.loopH:
+    FILTER_H8_W8 m0, m1, m2, m3, coef, [tab_c_512], [r0 - 3]
+    psubw       m1,         [tab_c_8192]
+    mova        [r4],       m1
+
+    add         r0,         r1
+    add         r4,         16
+    inc         r6
+    cmp         r6,         8+7
+    jnz         .loopH
+
+    ; ready for the vertical phase
+    ; all of m0-m7 are free at this point
+
+    ; load coeff table
+    shl         r5,         6
+    lea         r6,         [tab_LumaCoeffV]
+    lea         r5,         [r5 + r6]
+
+    ; load the intermediate buffer
+    mov         r0,         stk_buf
+
+    ; register mapping
+    ; r0 - src
+    ; r5 - coeff
+    ; r6 - loop_i
+
+    ; let's go
+    xor         r6,         r6
+
+    ; TODO: this loop has more than 70 instructions, which likely exceeds the Intel loop stream decoder cache
+.loopV:
+
+    FILTER_HV8_START    m1, m2, m3, m4, m0,             0, 0
+    FILTER_HV8_MID      m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
+    FILTER_HV8_MID      m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
+    FILTER_HV8_MID      m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
+    FILTER_HV8_END      m3, m0, m4, m1
+
+    movq        [r2],       m3
+    movq        [r2 + r3],  m4
+
+    lea         r0,         [r0 + 16 * 2]
+    lea         r2,         [r2 + r3 * 2]
+
+    inc         r6
+    cmp         r6,         8/2
+    jnz         .loopV
+
+    RET
+
+
+;-----------------------------------------------------------------------------
+; void interp_8tap_v_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+
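+; x86-64 keeps the cross-loop temporaries in spare GPRs; x86-32 spills them
+; to five dword stack slots instead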
+%if ARCH_X86_64
+cglobal interp_8tap_v_sp, 4, 7+5, 8
+%define tmp_r0      r7
+%define tmp_r2      r8
+%define tmp_r3      r9
+%define tmp_r4d     r10d
+%define tmp_6rows   r11
+
+%else ; ARCH_X86_64 = 0
+
+cglobal interp_8tap_v_sp, 4, 7, 8, 0-(5*4)
+%define tmp_r0      [(rsp + 0 * 4)]
+%define tmp_r2      [(rsp + 1 * 4)]
+%define tmp_r3      [(rsp + 2 * 4)]
+%define tmp_r4d     [(rsp + 3 * 4)]
+%define tmp_6rows   [(rsp + 4 * 4)]
+%endif ; ARCH_X86_64
+
+    mov         r4d,        r4m
+    mov         r5d,        r5m
+
+    mov         tmp_r4d, r4d
+    mov         tmp_r2, r2
+
+    ; load coeff table
+    mov         r6d,        r6m
+    shl         r6,         6
+    lea         r4,         [tab_LumaCoeffV]
+    lea         r6,         [r4 + r6]
+
+    ; move src to row -3
+    lea         r1, [r1 * 2]
+    lea         r4, [r1 + r1 * 2]
+    sub         r0, r4
+    lea         r4, [r4 * 2]
+    mov         tmp_6rows, r4
+
+.loopH:
+
+    ; load width
+    mov         r4d, tmp_r4d
+
+    ; save old src
+    mov         tmp_r0, r0
+
+.loopW:
+
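+    ; accumulate the 8-tap sum two rows at a time: punpcklwd pairs the rows so
+    ; each pmaddwd applies one (even,odd) coefficient pair per lane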
+    movu        m0, [r0]
+    movu        m1, [r0 + r1]
+    lea         r0, [r0 + r1 * 2]
+    punpcklwd   m2, m0, m1
+    pmaddwd     m2, [r6 + 0 * 16]
+    punpckhwd   m0, m1
+    pmaddwd     m0, [r6 + 0 * 16]
+
+    movu        m3, [r0]
+    movu        m4, [r0 + r1]
+    lea         r0, [r0 + r1 * 2]
+    punpcklwd   m1, m3, m4
+    pmaddwd     m1, [r6 + 1 * 16]
+    paddd       m2, m1
+    punpckhwd   m3, m4
+    pmaddwd     m3, [r6 + 1 * 16]
+    paddd       m0, m3
+
+    movu        m3, [r0]
+    movu        m4, [r0 + r1]
+    lea         r0, [r0 + r1 * 2]
+    punpcklwd   m1, m3, m4
+    pmaddwd     m1, [r6 + 2 * 16]
+    paddd       m2, m1
+    punpckhwd   m3, m4
+    pmaddwd     m3, [r6 + 2 * 16]
+    paddd       m0, m3
+
+    movu        m3, [r0]
+    movu        m4, [r0 + r1]
+    punpcklwd   m1, m3, m4
+    pmaddwd     m1, [r6 + 3 * 16]
+    paddd       m2, m1
+    punpckhwd   m3, m4
+    pmaddwd     m3, [r6 + 3 * 16]
+    paddd       m0, m3
+
+    paddd       m2, [tab_c_526336]
+    paddd       m0, [tab_c_526336]
+    psrad       m2, 12
+    psrad       m0, 12
+    packssdw    m2, m0
+    packuswb    m2, m2
+
+    ; move to next 8 col
+    sub         r0, tmp_6rows
+
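+    ; 8 output pixels per pass; jl means only a 4-wide tail remains, je means
+    ; the row is complete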
+    sub         r4, 8
+    jl          .width4
+    movq        [r2], m2
+    je          .nextH
+    lea         r0, [r0 + 16]
+    lea         r2, [r2 + 8]
+    jmp         .loopW
+
+.width4:
+    movd        [r2], m2
+    lea         r0, [r0 + 4]
+
+.nextH:
+    ; move to next row
+    mov         r0, tmp_r0
+    lea         r0, [r0 + r1]
+    add         tmp_r2, r3
+    mov         r2, tmp_r2
+
+    dec         r5d
+    jnz         .loopH
+
+    RET
+
+;-----------------------------------------------------------------------------
+;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_2x4, 4, 7, 8
+
+mov         r4d,       r4m
+sub         r0,        r1
+
+%ifdef PIC
+lea         r5,        [tab_ChromaCoeff]
+movd        m0,        [r5 + r4 * 4]
+%else
+movd        m0,        [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb      m0,        [tab_Cm]
+
+mova        m1,        [tab_c_512]
+
+movd        m2,        [r0]
+movd        m3,        [r0 + r1]
+movd        m4,        [r0 + 2 * r1]
+lea         r5,        [r0 + 2 * r1]
+movd        m5,        [r5 + r1]
+
+punpcklbw   m2,        m3
+punpcklbw   m6,        m4,        m5
+punpcklbw   m2,        m6
+
+pmaddubsw   m2,        m0
+
+movd        m6,        [r0 + 4 * r1]
+
+punpcklbw   m3,        m4
+punpcklbw   m7,        m5,        m6
+punpcklbw   m3,        m7
+
+pmaddubsw   m3,        m0
+
+phaddw      m2,        m3
+
+pmulhrsw    m2,        m1
+packuswb    m2,        m2
+
+pextrw      [r2],      m2,  0
+pextrw      [r2 + r3], m2,  2
+
+lea         r5,        [r0 + 4 * r1]
+movd        m2,        [r5 + r1]
+
+punpcklbw   m4,        m5
+punpcklbw   m3,        m6,        m2
+punpcklbw   m4,        m3
+
+pmaddubsw   m4,        m0
+
+movd        m3,        [r5 + 2 * r1]
+
+punpcklbw   m5,        m6
+punpcklbw   m2,        m3
+punpcklbw   m5,        m2
+
+pmaddubsw   m5,        m0
+
+phaddw      m4,        m5
+
+pmulhrsw    m4,        m1
+packuswb    m4,        m4
+
+pextrw      [r2 + 2 * r3],    m4,    0
+lea         r6,               [r2 + 2 * r3]
+pextrw      [r6 + r3],        m4,    2
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W2_H4 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
+
+mov         r4d,       r4m
+sub         r0,        r1
+
+%ifdef PIC
+lea         r5,        [tab_ChromaCoeff]
+movd        m0,        [r5 + r4 * 4]
+%else
+movd        m0,        [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb      m0,        [tab_Cm]
+
+mova        m1,        [tab_c_512]
+
+mov         r4d,       %2
+
+.loop:
+movd        m2,        [r0]
+movd        m3,        [r0 + r1]
+movd        m4,        [r0 + 2 * r1]
+lea         r5,        [r0 + 2 * r1]
+movd        m5,        [r5 + r1]
+
+punpcklbw   m2,        m3
+punpcklbw   m6,        m4,        m5
+punpcklbw   m2,        m6
+
+pmaddubsw   m2,        m0
+
+movd        m6,        [r0 + 4 * r1]
+
+punpcklbw   m3,        m4
+punpcklbw   m7,        m5,        m6
+punpcklbw   m3,        m7
+
+pmaddubsw   m3,        m0
+
+phaddw      m2,        m3
+
+pmulhrsw    m2,        m1
+packuswb    m2,        m2
+
+pextrw      [r2],      m2,  0
+pextrw      [r2 + r3], m2,  2
+
+lea         r5,        [r0 + 4 * r1]
+movd        m2,        [r5 + r1]
+
+punpcklbw   m4,        m5
+punpcklbw   m3,        m6,        m2
+punpcklbw   m4,        m3
+
+pmaddubsw   m4,        m0
+
+movd        m3,        [r5 + 2 * r1]
+
+punpcklbw   m5,        m6
+punpcklbw   m2,        m3
+punpcklbw   m5,        m2
+
+pmaddubsw   m5,        m0
+
+phaddw      m4,        m5
+
+pmulhrsw    m4,        m1
+packuswb    m4,        m4
+
+pextrw      [r2 + 2 * r3],    m4,    0
+lea         r6,               [r2 + 2 * r3]
+pextrw      [r6 + r3],        m4,    2
+
+lea         r0,        [r0 + 4 * r1]
+lea         r2,        [r2 + 4 * r3]
+
+sub         r4,        4
+jnz        .loop
+RET
+%endmacro
+
+FILTER_V4_W2_H4 2, 8
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_4x2, 4, 6, 8
+
+mov         r4d,       r4m
+sub         r0,        r1
+
+%ifdef PIC
+lea         r5,        [tab_ChromaCoeff]
+movd        m0,        [r5 + r4 * 4]
+%else
+movd        m0,        [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb      m0,        [tab_Cm]
+
+mova        m1,        [tab_c_512]
+
+movd        m2,        [r0]
+movd        m3,        [r0 + r1]
+movd        m4,        [r0 + 2 * r1]
+lea         r5,        [r0 + 2 * r1]
+movd        m5,        [r5 + r1]
+
+punpcklbw   m2,        m3
+punpcklbw   m6,        m4,        m5
+punpcklbw   m2,        m6
+
+pmaddubsw   m2,        m0
+
+movd        m6,        [r0 + 4 * r1]
+
+punpcklbw   m3,        m4
+punpcklbw   m5,        m6
+punpcklbw   m3,        m5
+
+pmaddubsw   m3,        m0
+
+phaddw      m2,        m3
+
+pmulhrsw    m2,        m1
+packuswb    m2,        m2
+movd        [r2],      m2
+pextrd      [r2 + r3], m2,  1
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_4x4, 4, 7, 8
+
+mov         r4d,       r4m
+sub         r0,        r1
+
+%ifdef PIC
+lea         r5,        [tab_ChromaCoeff]
+movd        m0,        [r5 + r4 * 4]
+%else
+movd        m0,        [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb      m0,        [tab_Cm]
+
+mova        m1,        [tab_c_512]
+
+movd        m2,        [r0]
+movd        m3,        [r0 + r1]
+movd        m4,        [r0 + 2 * r1]
+lea         r5,        [r0 + 2 * r1]
+movd        m5,        [r5 + r1]
+
+punpcklbw   m2,        m3
+punpcklbw   m6,        m4,        m5
+punpcklbw   m2,        m6
+
+pmaddubsw   m2,        m0
+
+movd        m6,        [r0 + 4 * r1]
+
+punpcklbw   m3,        m4
+punpcklbw   m7,        m5,        m6
+punpcklbw   m3,        m7
+
+pmaddubsw   m3,        m0
+
+phaddw      m2,        m3
+
+pmulhrsw    m2,        m1
+packuswb    m2,        m2
+movd        [r2],      m2
+pextrd      [r2 + r3], m2,  1
+
+lea         r5,        [r0 + 4 * r1]
+movd        m2,        [r5 + r1]
+
+punpcklbw   m4,        m5
+punpcklbw   m3,        m6,        m2
+punpcklbw   m4,        m3
+
+pmaddubsw   m4,        m0
+
+movd        m3,        [r5 + 2 * r1]
+
+punpcklbw   m5,        m6
+punpcklbw   m2,        m3
+punpcklbw   m5,        m2
+
+pmaddubsw   m5,        m0
+
+phaddw      m4,        m5
+
+pmulhrsw    m4,        m1
+packuswb    m4,        m4
+movd        [r2 + 2 * r3],      m4
+lea         r6,        [r2 + 2 * r3]
+pextrd      [r6 + r3], m4,  1
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W4_H4 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
+
+mov         r4d,       r4m
+sub         r0,        r1
+
+%ifdef PIC
+lea         r5,        [tab_ChromaCoeff]
+movd        m0,        [r5 + r4 * 4]
+%else
+movd        m0,        [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb      m0,        [tab_Cm]
+
+mova        m1,        [tab_c_512]
+
+mov         r4d,       %2
+
+.loop:
+movd        m2,        [r0]
+movd        m3,        [r0 + r1]
+movd        m4,        [r0 + 2 * r1]
+lea         r5,        [r0 + 2 * r1]
+movd        m5,        [r5 + r1]
+
+punpcklbw   m2,        m3
+punpcklbw   m6,        m4,        m5
+punpcklbw   m2,        m6
+
+pmaddubsw   m2,        m0
+
+movd        m6,        [r0 + 4 * r1]
+
+punpcklbw   m3,        m4
+punpcklbw   m7,        m5,        m6
+punpcklbw   m3,        m7
+
+pmaddubsw   m3,        m0
+
+phaddw      m2,        m3
+
+pmulhrsw    m2,        m1
+packuswb    m2,        m2
+movd        [r2],      m2
+pextrd      [r2 + r3], m2,  1
+
+lea         r5,        [r0 + 4 * r1]
+movd        m2,        [r5 + r1]
+
+punpcklbw   m4,        m5
+punpcklbw   m3,        m6,        m2
+punpcklbw   m4,        m3
+
+pmaddubsw   m4,        m0
+
+movd        m3,        [r5 + 2 * r1]
+
+punpcklbw   m5,        m6
+punpcklbw   m2,        m3
+punpcklbw   m5,        m2
+
+pmaddubsw   m5,        m0
+
+phaddw      m4,        m5
+
+pmulhrsw    m4,        m1
+packuswb    m4,        m4
+movd        [r2 + 2 * r3],      m4
+lea         r6,        [r2 + 2 * r3]
+pextrd      [r6 + r3], m4,  1
+
+lea         r0,        [r0 + 4 * r1]
+lea         r2,        [r2 + 4 * r3]
+
+sub         r4,        4
+jnz        .loop
+RET
+%endmacro
+
+FILTER_V4_W4_H4 4,  8
+FILTER_V4_W4_H4 4, 16
+
+%macro FILTER_V4_W8_H2 0
+punpcklbw   m1,        m2
+punpcklbw   m7,        m3,        m0
+
+pmaddubsw   m1,        m6
+pmaddubsw   m7,        m5
+
+paddw       m1,        m7
+
+pmulhrsw    m1,        m4
+packuswb    m1,        m1
+%endmacro
+
+%macro FILTER_V4_W8_H3 0
+punpcklbw   m2,        m3
+punpcklbw   m7,        m0,        m1
+
+pmaddubsw   m2,        m6
+pmaddubsw   m7,        m5
+
+paddw       m2,        m7
+
+pmulhrsw    m2,        m4
+packuswb    m2,        m2
+%endmacro
+
+%macro FILTER_V4_W8_H4 0
+punpcklbw   m3,        m0
+punpcklbw   m7,        m1,        m2
+
+pmaddubsw   m3,        m6
+pmaddubsw   m7,        m5
+
+paddw       m3,        m7
+
+pmulhrsw    m3,        m4
+packuswb    m3,        m3
+%endmacro
+
+%macro FILTER_V4_W8_H5 0
+punpcklbw   m0,        m1
+punpcklbw   m7,        m2,        m3
+
+pmaddubsw   m0,        m6
+pmaddubsw   m7,        m5
+
+paddw       m0,        m7
+
+pmulhrsw    m0,        m4
+packuswb    m0,        m0
+%endmacro
+
+%macro FILTER_V4_W8_8x2 2
+FILTER_V4_W8 %1, %2
+movq        m0,        [r0 + 4 * r1]
+
+FILTER_V4_W8_H2
+
+movh        [r2 + r3], m1
+%endmacro
+
+%macro FILTER_V4_W8_8x4 2
+FILTER_V4_W8_8x2 %1, %2
+;8x3
+lea         r6,        [r0 + 4 * r1]
+movq        m1,        [r6 + r1]
+
+FILTER_V4_W8_H3
+
+movh        [r2 + 2 * r3], m2
+
+;8x4
+movq        m2,        [r6 + 2 * r1]
+
+FILTER_V4_W8_H4
+
+lea         r5,        [r2 + 2 * r3]
+movh        [r5 + r3], m3
+%endmacro
+
+%macro FILTER_V4_W8_8x6 2
+FILTER_V4_W8_8x4 %1, %2
+;8x5
+lea         r6,        [r6 + 2 * r1]
+movq        m3,        [r6 + r1]
+
+FILTER_V4_W8_H5
+
+movh        [r2 + 4 * r3], m0
+
+;8x6
+movq        m0,        [r0 + 8 * r1]
+
+FILTER_V4_W8_H2
+
+lea         r5,        [r2 + 4 * r3]
+movh        [r5 + r3], m1
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W8 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
+
+mov         r4d,       r4m
+
+sub         r0,        r1
+movq        m0,        [r0]
+movq        m1,        [r0 + r1]
+movq        m2,        [r0 + 2 * r1]
+lea         r5,        [r0 + 2 * r1]
+movq        m3,        [r5 + r1]
+
+punpcklbw   m0,        m1
+punpcklbw   m4,        m2,          m3
+
+%ifdef PIC
+lea         r6,        [tab_ChromaCoeff]
+movd        m5,        [r6 + r4 * 4]
+%else
+movd        m5,        [tab_ChromaCoeff + r4 * 4]
+%endif
+
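+; tab_Vm splits the coefficient dword into the {c0,c1} pair (m6) and the
+; {c2,c3} pair (m5); each pmaddubsw applies two taps to a pair of
+; interleaved rows and paddw merges the two halves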
+pshufb      m6,        m5,       [tab_Vm]
+pmaddubsw   m0,        m6
+
+pshufb      m5,        [tab_Vm + 16]
+pmaddubsw   m4,        m5
+
+paddw       m0,        m4
+
+mova        m4,        [tab_c_512]
+
+pmulhrsw    m0,        m4
+packuswb    m0,        m0
+movh        [r2],      m0
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+FILTER_V4_W8_8x2 8, 2
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+FILTER_V4_W8_8x4 8, 4
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+FILTER_V4_W8_8x6 8, 6
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W8_H8_H16_H32 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
+
+mov         r4d,       r4m
+sub         r0,        r1
+
+%ifdef PIC
+lea         r6,        [tab_ChromaCoeff]
+movd        m5,        [r6 + r4 * 4]
+%else
+movd        m5,        [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb      m6,        m5,       [tab_Vm]
+pshufb      m5,        [tab_Vm + 16]
+mova        m4,        [tab_c_512]
+
+mov         r4d,       %2
+
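+; each pass emits 4 output rows from 7 row loads, rotating m0-m3 as a
+; sliding window over the source rows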
+.loop:
+movq        m0,        [r0]
+movq        m1,        [r0 + r1]
+movq        m2,        [r0 + 2 * r1]
+lea         r5,        [r0 + 2 * r1]
+movq        m3,        [r5 + r1]
+
+punpcklbw   m0,        m1
+punpcklbw   m7,        m2,        m3
+
+pmaddubsw   m0,        m6
+pmaddubsw   m7,        m5
+
+paddw       m0,        m7
+
+pmulhrsw    m0,        m4
+packuswb    m0,        m0
+movh        [r2],      m0
+
+movq        m0,        [r0 + 4 * r1]
+
+punpcklbw   m1,        m2
+punpcklbw   m7,        m3,        m0
+
+pmaddubsw   m1,        m6
+pmaddubsw   m7,        m5
+
+paddw       m1,        m7
+
+pmulhrsw    m1,        m4
+packuswb    m1,        m1
+movh        [r2 + r3], m1
+
+lea         r6,        [r0 + 4 * r1]
+movq        m1,        [r6 + r1]
+
+punpcklbw   m2,        m3
+punpcklbw   m7,        m0,        m1
+
+pmaddubsw   m2,        m6
+pmaddubsw   m7,        m5
+
+paddw       m2,        m7
+
+pmulhrsw    m2,        m4
+packuswb    m2,        m2
+movh        [r2 + 2 * r3], m2
+
+movq        m2,        [r6 + 2 * r1]
+
+punpcklbw   m3,        m0
+punpcklbw   m1,        m2
+
+pmaddubsw   m3,        m6
+pmaddubsw   m1,        m5
+
+paddw       m3,        m1
+
+pmulhrsw    m3,        m4
+packuswb    m3,        m3
+
+lea         r5,        [r2 + 2 * r3]
+movh        [r5 + r3], m3
+
+lea         r0,        [r0 + 4 * r1]
+lea         r2,        [r2 + 4 * r3]
+
+sub         r4,         4
+jnz        .loop
+RET
+%endmacro
+
+FILTER_V4_W8_H8_H16_H32 8,  8
+FILTER_V4_W8_H8_H16_H32 8, 16
+FILTER_V4_W8_H8_H16_H32 8, 32
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W6_H4 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
+
+mov         r4d,       r4m
+sub         r0,        r1
+
+%ifdef PIC
+lea         r6,        [tab_ChromaCoeff]
+movd        m5,        [r6 + r4 * 4]
+%else
+movd        m5,        [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb      m6,        m5,       [tab_Vm]
+pshufb      m5,        [tab_Vm + 16]
+mova        m4,        [tab_c_512]
+
+mov         r4d,       %2
+
+.loop:
+movq        m0,        [r0]
+movq        m1,        [r0 + r1]
+movq        m2,        [r0 + 2 * r1]
+lea         r5,        [r0 + 2 * r1]
+movq        m3,        [r5 + r1]
+
+punpcklbw   m0,        m1
+punpcklbw   m7,        m2,        m3
+
+pmaddubsw   m0,        m6
+pmaddubsw   m7,        m5
+
+paddw       m0,        m7
+
+pmulhrsw    m0,        m4
+packuswb    m0,        m0
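+; 6-wide store: 4 pixels with movd plus 2 more with pextrw of word 2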
+movd        [r2],      m0
+pextrw      [r2 + 4],  m0,    2
+
+movq        m0,        [r0 + 4 * r1]
+
+punpcklbw   m1,        m2
+punpcklbw   m7,        m3,        m0
+
+pmaddubsw   m1,        m6
+pmaddubsw   m7,        m5
+
+paddw       m1,        m7
+
+pmulhrsw    m1,        m4
+packuswb    m1,        m1
+movd        [r2 + r3],      m1
+pextrw      [r2 + r3 + 4],  m1,    2
+
+lea         r6,        [r0 + 4 * r1]
+movq        m1,        [r6 + r1]
+
+punpcklbw   m2,        m3
+punpcklbw   m7,        m0,        m1
+
+pmaddubsw   m2,        m6
+pmaddubsw   m7,        m5
+
+paddw       m2,        m7
+
+pmulhrsw    m2,        m4
+packuswb    m2,        m2
+movd        [r2 + 2 * r3],     m2
+pextrw      [r2 + 2 * r3 + 4], m2,    2
+
+movq        m2,        [r6 + 2 * r1]
+
+punpcklbw   m3,        m0
+punpcklbw   m1,        m2
+
+pmaddubsw   m3,        m6
+pmaddubsw   m1,        m5
+
+paddw       m3,        m1
+
+pmulhrsw    m3,        m4
+packuswb    m3,        m3
+
+lea         r5,               [r2 + 2 * r3]
+movd        [r5 + r3],        m3
+pextrw      [r5 + r3 + 4],    m3,    2
+
+lea         r0,        [r0 + 4 * r1]
+lea         r2,        [r2 + 4 * r3]
+
+sub         r4,         4
+jnz        .loop
+RET
+%endmacro
+
+FILTER_V4_W6_H4 6, 8
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W12_H2 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
+
+mov         r4d,       r4m
+sub         r0,        r1
+
+%ifdef PIC
+lea         r5,        [tab_ChromaCoeff]
+movd        m0,        [r5 + r4 * 4]
+%else
+movd        m0,        [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb      m1,        m0,       [tab_Vm]
+pshufb      m0,        [tab_Vm + 16]
+
+mova        m7,        [tab_c_512]
+
+mov          r4d,       %2
+
+.loop:
+movu        m2,        [r0]
+movu        m3,        [r0 + r1]
+
+punpcklbw   m4,        m2,        m3
+punpckhbw   m2,        m3
+
+pmaddubsw   m4,        m1
+pmaddubsw   m2,        m1
+
+movu        m5,        [r0 + 2 * r1]
+lea         r5,        [r0 + 2 * r1]
+movu        m3,        [r5 + r1]
+
+punpcklbw   m6,        m5,        m3
+punpckhbw   m5,        m3
+
+pmaddubsw   m6,        m0
+pmaddubsw   m5,        m0
+
+paddw       m4,        m6
+paddw       m2,        m5
+
+pmulhrsw    m4,        m7
+pmulhrsw    m2,        m7
+
+packuswb    m4,        m2
+
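+; 12-wide store: 8 pixels with movh plus 4 more with pextrd of dword 2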
+movh         [r2],     m4
+pextrd       [r2 + 8], m4,  2
+
+movu        m2,        [r0 + r1]
+movu        m3,        [r0 + 2 * r1]
+
+punpcklbw   m4,        m2,        m3
+punpckhbw   m2,        m3
+
+pmaddubsw   m4,        m1
+pmaddubsw   m2,        m1
+
+lea         r5,        [r0 + 2 * r1]
+movu        m5,        [r5 + r1]
+movu        m3,        [r5 + 2 * r1]
+
+punpcklbw   m6,        m5,        m3
+punpckhbw   m5,        m3
+
+pmaddubsw   m6,        m0
+pmaddubsw   m5,        m0
+
+paddw       m4,        m6
+paddw       m2,        m5
+
+pmulhrsw    m4,        m7
+pmulhrsw    m2,        m7
+
+packuswb    m4,        m2
+
+movh        [r2 + r3],      m4
+pextrd      [r2 + r3 + 8],  m4,  2
+
+lea         r0,        [r0 + 2 * r1]
+lea         r2,        [r2 + 2 * r3]
+
+sub         r4,        2
+jnz        .loop
+RET
+%endmacro
+
+FILTER_V4_W12_H2 12, 16
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W16_H2 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
+
+mov         r4d,       r4m
+sub         r0,        r1
+
+%ifdef PIC
+lea         r5,        [tab_ChromaCoeff]
+movd        m0,        [r5 + r4 * 4]
+%else
+movd        m0,        [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb      m1,        m0,       [tab_Vm]
+pshufb      m0,        [tab_Vm + 16]
+
+mov         r4d,       %2
+
+.loop:
+movu        m2,        [r0]
+movu        m3,        [r0 + r1]
+
+punpcklbw   m4,        m2,        m3
+punpckhbw   m5,        m2,        m3
+
+pmaddubsw   m4,        m1
+pmaddubsw   m5,        m1
+
+movu        m2,        [r0 + 2 * r1]
+lea         r5,        [r0 + 2 * r1]
+movu        m3,        [r5 + r1]
+
+punpcklbw   m6,        m2,        m3
+punpckhbw   m7,        m2,        m3
+
+pmaddubsw   m6,        m0
+pmaddubsw   m7,        m0
+
+paddw       m4,        m6
+paddw       m5,        m7
+
+mova        m6,        [tab_c_512]
+
+pmulhrsw    m4,        m6
+pmulhrsw    m5,        m6
+
+packuswb    m4,        m5
+
+movu        [r2],      m4
+
+movu        m2,        [r0 + r1]
+movu        m3,        [r0 + 2 * r1]
+
+punpcklbw   m4,        m2,        m3
+punpckhbw   m5,        m2,        m3
+
+pmaddubsw   m4,        m1
+pmaddubsw   m5,        m1
+
+lea         r5,        [r0 + 2 * r1]
+movu        m2,        [r5 + r1]
+movu        m3,        [r5 + 2 * r1]
+
+punpcklbw   m6,        m2,        m3
+punpckhbw   m7,        m2,        m3
+
+pmaddubsw   m6,        m0
+pmaddubsw   m7,        m0
+
+paddw       m4,        m6
+paddw       m5,        m7
+
+mova        m6,        [tab_c_512]
+
+pmulhrsw    m4,        m6
+pmulhrsw    m5,        m6
+
+packuswb    m4,        m5
+
+movu        [r2 + r3],      m4
+
+lea         r0,        [r0 + 2 * r1]
+lea         r2,        [r2 + 2 * r3]
+
+sub         r4,        2
+jnz        .loop
+RET
+%endmacro
+
+FILTER_V4_W16_H2 16,  4
+FILTER_V4_W16_H2 16,  8
+FILTER_V4_W16_H2 16, 12
+FILTER_V4_W16_H2 16, 16
+FILTER_V4_W16_H2 16, 32
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W24 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
+
+mov         r4d,       r4m
+sub         r0,        r1
+
+%ifdef PIC
+lea         r5,        [tab_ChromaCoeff]
+movd        m0,        [r5 + r4 * 4]
+%else
+movd        m0,        [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb      m1,        m0,       [tab_Vm]
+pshufb      m0,        [tab_Vm + 16]
+
+mova        m7,        [tab_c_512]
+
+mov         r4d,       %2
+
+.loop:
+movu        m2,        [r0]
+movu        m3,        [r0 + r1]
+
+punpcklbw   m4,        m2,        m3
+punpckhbw   m2,        m3
+
+pmaddubsw   m4,        m1
+pmaddubsw   m2,        m1
+
+movu        m5,        [r0 + 2 * r1]
+lea         r5,        [r0 + 2 * r1]
+movu        m3,        [r5 + r1]
+
+punpcklbw   m6,        m5,        m3
+punpckhbw   m5,        m3
+
+pmaddubsw   m6,        m0
+pmaddubsw   m5,        m0
+
+paddw       m4,        m6
+paddw       m2,        m5
+
+pmulhrsw    m4,        m7
+pmulhrsw    m2,        m7
+
+packuswb    m4,        m2
+
+movu        [r2],      m4
+
+movq        m2,        [r0 + 16]
+movq        m3,        [r0 + r1 + 16]
+movq        m4,        [r0 + 2 * r1 + 16]
+movq        m5,        [r5 + r1 + 16]
+
+punpcklbw   m2,        m3
+punpcklbw   m4,        m5
+
+pmaddubsw   m2,        m1
+pmaddubsw   m4,        m0
+
+paddw       m2,        m4
+
+pmulhrsw    m2,        m7
+packuswb    m2,        m2
+movh        [r2 + 16], m2
+
+movu        m2,        [r0 + r1]
+movu        m3,        [r0 + 2 * r1]
+
+punpcklbw   m4,        m2,        m3
+punpckhbw   m2,        m3
+
+pmaddubsw   m4,        m1
+pmaddubsw   m2,        m1
+
+lea         r5,        [r0 + 2 * r1]
+movu        m5,        [r5 +  r1]
+movu        m3,        [r5 + 2 * r1]
+
+punpcklbw   m6,        m5,        m3
+punpckhbw   m5,        m3
+
+pmaddubsw   m6,        m0
+pmaddubsw   m5,        m0
+
+paddw       m4,        m6
+paddw       m2,        m5
+
+pmulhrsw    m4,        m7
+pmulhrsw    m2,        m7
+
+packuswb    m4,        m2
+
+movu        [r2 + r3],      m4
+
+movq        m2,        [r0 + r1 + 16]
+movq        m3,        [r0 + 2 * r1 + 16]
+movq        m4,        [r5 + r1 + 16]
+movq        m5,        [r5 + 2 * r1 + 16]
+
+punpcklbw   m2,        m3
+punpcklbw   m4,        m5
+
+pmaddubsw   m2,        m1
+pmaddubsw   m4,        m0
+
+paddw       m2,        m4
+
+pmulhrsw    m2,        m7
+packuswb    m2,        m2
+movh        [r2 + r3 + 16], m2
+
+lea         r0,        [r0 + 2 * r1]
+lea         r2,        [r2 + 2 * r3]
+
+sub         r4,        2
+jnz        .loop
+RET
+%endmacro
+
+FILTER_V4_W24 24, 32
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W32 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
+
+mov         r4d,       r4m
+sub         r0,        r1
+
+%ifdef PIC
+lea         r5,        [tab_ChromaCoeff]
+movd        m0,        [r5 + r4 * 4]
+%else
+movd        m0,        [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb      m1,        m0,       [tab_Vm]
+pshufb      m0,        [tab_Vm + 16]
+
+mova        m7,        [tab_c_512]
+
+mov         r4d,       %2
+
+.loop:
+movu        m2,        [r0]
+movu        m3,        [r0 + r1]
+
+punpcklbw   m4,        m2,        m3
+punpckhbw   m2,        m3
+
+pmaddubsw   m4,        m1
+pmaddubsw   m2,        m1
+
+movu        m3,        [r0 + 2 * r1]
+lea         r5,        [r0 + 2 * r1]
+movu        m5,        [r5 + r1]
+
+punpcklbw   m6,        m3,        m5
+punpckhbw   m3,        m5
+
+pmaddubsw   m6,        m0
+pmaddubsw   m3,        m0
+
+paddw       m4,        m6
+paddw       m2,        m3
+
+pmulhrsw    m4,        m7
+pmulhrsw    m2,        m7
+
+packuswb    m4,        m2
+
+movu        [r2],      m4
+
+movu        m2,        [r0 + 16]
+movu        m3,        [r0 + r1 + 16]
+
+punpcklbw   m4,        m2,        m3
+punpckhbw   m2,        m3
+
+pmaddubsw   m4,        m1
+pmaddubsw   m2,        m1
+
+movu        m3,        [r0 + 2 * r1 + 16]
+movu        m5,        [r5 + r1 + 16]
+
+punpcklbw   m6,        m3,        m5
+punpckhbw   m3,        m5
+
+pmaddubsw   m6,        m0
+pmaddubsw   m3,        m0
+
+paddw       m4,        m6
+paddw       m2,        m3
+
+pmulhrsw    m4,        m7
+pmulhrsw    m2,        m7
+
+packuswb    m4,        m2
+
+movu        [r2 + 16], m4
+
+lea         r0,        [r0 + r1]
+lea         r2,        [r2 + r3]
+
+dec         r4
+jnz        .loop
+RET
+%endmacro
+
+FILTER_V4_W32 32,  8
+FILTER_V4_W32 32, 16
+FILTER_V4_W32 32, 24
+FILTER_V4_W32 32, 32
+
+
+;-----------------------------------------------------------------------------
+; void luma_p2s(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal luma_p2s, 3, 7, 8
+
+    ; load width and height
+    mov         r3d, r3m
+    mov         r4d, r4m
+
+    ; load constant
+    mova        m6, [tab_c_128]
+    mova        m7, [tab_c_64_n64]
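+    ; interleaving each pixel with 128 and multiplying by {64, -64} gives
+    ; (p << 6) - 8192: the 14-bit internal sample representation with
+    ; IF_INTERNAL_OFFS already subtracted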
+
+    lea         r2, [r2 - 16]
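+    ; r5 is advanced by 8 before the stores below, so bias the dst
+    ; pointer by -16 bytes (8 int16 columns) to compensate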
+.loopH:
+
+    xor         r5d, r5d
+.loopW:
+    lea         r6, [r0 + r5]
+
+    movh        m0, [r6]
+    punpcklbw   m0, m6
+    pmaddubsw   m0, m7
+
+    movh        m1, [r6 + r1]
+    punpcklbw   m1, m6
+    pmaddubsw   m1, m7
+
+    movh        m2, [r6 + r1 * 2]
+    punpcklbw   m2, m6
+    pmaddubsw   m2, m7
+
+    lea         r6, [r6 + r1 * 2]
+    movh        m3, [r6 + r1]
+    punpcklbw   m3, m6
+    pmaddubsw   m3, m7
+
+    add         r5, 8
+    cmp         r5, r3
+    jg          .width4
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 0], m0
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 2], m1
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 4], m2
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 6], m3
+    ; r5 was already advanced by 8 above; ZF from the cmp decides whether
+    ; the row is complete
+    je          .nextH
+    jmp         .loopW
+
+.width4:
+    movh        [r2 + r5 * 2 + FENC_STRIDE * 0], m0
+    movh        [r2 + r5 * 2 + FENC_STRIDE * 2], m1
+    movh        [r2 + r5 * 2 + FENC_STRIDE * 4], m2
+    movh        [r2 + r5 * 2 + FENC_STRIDE * 6], m3
+
+.nextH:
+    lea         r0, [r0 + r1 * 4]
+    add         r2, FENC_STRIDE * 2 * 4
+
+    sub         r4, 4
+    jnz         .loopH
+
+    RET
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/x86/ipfilter8.h	Thu Oct 31 18:43:03 2013 +0530
@@ -88,6 +88,10 @@
 CHROMA_FILTERS(_sse4);
 LUMA_FILTERS(_sse4);
 
+void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY);
+void x265_interp_8tap_v_sp_ssse3(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
+void x265_luma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+
 #undef SETUP_CHROMA_FUNC_DEF
 #undef SETUP_LUMA_FUNC_DEF
 #undef CHROMA_FILTERS
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/x86/pixel.h	Thu Oct 31 18:43:03 2013 +0530
@@ -42,6 +42,19 @@
     ret x265_pixel_ ## name ## _4x16_ ## suffix args; \
     ret x265_pixel_ ## name ## _4x8_ ## suffix args; \
     ret x265_pixel_ ## name ## _4x4_ ## suffix args; \
+    ret x265_pixel_ ## name ## _32x8_ ## suffix args; \
+    ret x265_pixel_ ## name ## _32x16_ ## suffix args; \
+    ret x265_pixel_ ## name ## _32x24_ ## suffix args; \
+    ret x265_pixel_ ## name ## _24x32_ ## suffix args; \
+    ret x265_pixel_ ## name ## _32x32_ ## suffix args; \
+    ret x265_pixel_ ## name ## _32x64_ ## suffix args; \
+    ret x265_pixel_ ## name ## _64x16_ ## suffix args; \
+    ret x265_pixel_ ## name ## _64x32_ ## suffix args; \
+    ret x265_pixel_ ## name ## _64x48_ ## suffix args; \
+    ret x265_pixel_ ## name ## _64x64_ ## suffix args; \
+    ret x265_pixel_ ## name ## _48x64_ ## suffix args; \
+    ret x265_pixel_ ## name ## _12x16_ ## suffix args; \
 
 #define DECL_X1(name, suffix) \
     DECL_PIXELS(int, name, suffix, (pixel *, intptr_t, pixel *, intptr_t))
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/common/x86/sad-a.asm	Thu Oct 31 18:43:03 2013 +0530
@@ -31,8 +31,9 @@
 
 SECTION_RODATA 32
 
+MSK:                  db 255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0
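+; MSK keeps the low 12 of 16 bytes; pixel_sad_12x16 masks both inputs
+; with it so the 4 columns outside the block never reach psadbw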
 pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1
-hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11
+hpred_shuf:           db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11
 
 SECTION .text
 
@@ -119,6 +120,263 @@
     RET
 %endmacro
 
+%macro PROCESS_SAD_12x4 0
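+    ; psadbw produces one 16-bit |a - b| sum per 8-byte half, zero-extended
+    ; into each qword; the movhlps/paddd pair at each function's end folds
+    ; the two halves into the final SAD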
+    movu    m1,  [r2]
+    movu    m2,  [r0]
+    pand    m1,  m4
+    pand    m2,  m4
+    psadbw  m1,  m2
+    paddd   m0,  m1
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+    movu    m1,  [r2]
+    movu    m2,  [r0]
+    pand    m1,  m4
+    pand    m2,  m4
+    psadbw  m1,  m2
+    paddd   m0,  m1
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+    movu    m1,  [r2]
+    movu    m2,  [r0]
+    pand    m1,  m4
+    pand    m2,  m4
+    psadbw  m1,  m2
+    paddd   m0,  m1
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+    movu    m1,  [r2]
+    movu    m2,  [r0]
+    pand    m1,  m4
+    pand    m2,  m4
+    psadbw  m1,  m2
+    paddd   m0,  m1
+%endmacro
+
+%macro PROCESS_SAD_16x4 0
+    movu    m1,  [r2]
+    movu    m2,  [r2 + r3]
+    psadbw  m1,  [r0]
+    psadbw  m2,  [r0 + r1]
+    paddd   m1,  m2
+    paddd   m0,  m1
+    lea     r2,  [r2 + 2 * r3]
+    lea     r0,  [r0 + 2 * r1]
+    movu    m1,  [r2]
+    movu    m2,  [r2 + r3]
+    psadbw  m1,  [r0]
+    psadbw  m2,  [r0 + r1]
+    paddd   m1,  m2
+    paddd   m0,  m1
+    lea     r2,  [r2 + 2 * r3]
+    lea     r0,  [r0 + 2 * r1]
+%endmacro
+
+%macro PROCESS_SAD_24x4 0
+    movu        m1,  [r2]
+    movq        m2,  [r2 + 16]
+    lea         r2,  [r2 + r3]
+    movu        m3,  [r2]
+    movq        m4,  [r2 + 16]
+    psadbw      m1,  [r0]
+    psadbw      m3,  [r0 + r1]
+    paddd       m0,  m1
+    paddd       m0,  m3
+    movq        m1,  [r0 + 16]
+    lea         r0,  [r0 + r1]
+    movq        m3,  [r0 + 16]
+    punpcklqdq  m2,  m4
+    punpcklqdq  m1,  m3
+    psadbw      m2, m1
+    paddd       m0, m2
+    lea         r2,  [r2 + r3]
+    lea         r0,  [r0 + r1]
+
+    movu        m1,  [r2]
+    movq        m2,  [r2 + 16]
+    lea         r2,  [r2 + r3]
+    movu        m3,  [r2]
+    movq        m4,  [r2 + 16]
+    psadbw      m1,  [r0]
+    psadbw      m3,  [r0 + r1]
+    paddd       m0,  m1
+    paddd       m0,  m3
+    movq        m1,  [r0 + 16]
+    lea         r0,  [r0 + r1]
+    movq        m3,  [r0 + 16]
+    punpcklqdq  m2,  m4
+    punpcklqdq  m1,  m3
+    psadbw      m2, m1
+    paddd       m0, m2
+%endmacro
+
+%macro PROCESS_SAD_32x4 0
+    movu    m1,  [r2]
+    movu    m2,  [r2 + 16]
+    psadbw  m1,  [r0]
+    psadbw  m2,  [r0 + 16]
+    paddd   m1,  m2
+    paddd   m0,  m1
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+    movu    m1,  [r2]
+    movu    m2,  [r2 + 16]
+    psadbw  m1,  [r0]
+    psadbw  m2,  [r0 + 16]
+    paddd   m1,  m2
+    paddd   m0,  m1
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+    movu    m1,  [r2]
+    movu    m2,  [r2 + 16]
+    psadbw  m1,  [r0]
+    psadbw  m2,  [r0 + 16]
+    paddd   m1,  m2
+    paddd   m0,  m1
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+    movu    m1,  [r2]
+    movu    m2,  [r2 + 16]
+    psadbw  m1,  [r0]
+    psadbw  m2,  [r0 + 16]
+    paddd   m1,  m2
+    paddd   m0,  m1
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+%endmacro
+
+%macro PROCESS_SAD_48x4 0
+    movu    m1,  [r2]
+    movu    m2,  [r2 + 16]
+    movu    m3,  [r2 + 32]
+    psadbw  m1,  [r0]
+    psadbw  m2,  [r0 + 16]
+    psadbw  m3,  [r0 + 32]
+    paddd   m1,  m2
+    paddd   m0,  m1
+    paddd   m0,  m3
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+
+    movu    m1,  [r2]
+    movu    m2,  [r2 + 16]
+    movu    m3,  [r2 + 32]
+    psadbw  m1,  [r0]
+    psadbw  m2,  [r0 + 16]
+    psadbw  m3,  [r0 + 32]
+    paddd   m1,  m2
+    paddd   m0,  m1
+    paddd   m0,  m3
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+
+    movu    m1,  [r2]
+    movu    m2,  [r2 + 16]
+    movu    m3,  [r2 + 32]
+    psadbw  m1,  [r0]
+    psadbw  m2,  [r0 + 16]
+    psadbw  m3,  [r0 + 32]
+    paddd   m1,  m2
+    paddd   m0,  m1
+    paddd   m0,  m3
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+
+    movu    m1,  [r2]
+    movu    m2,  [r2 + 16]
+    movu    m3,  [r2 + 32]
+    psadbw  m1,  [r0]
+    psadbw  m2,  [r0 + 16]
+    psadbw  m3,  [r0 + 32]
+    paddd   m1,  m2
+    paddd   m0,  m1
+    paddd   m0,  m3
+%endmacro
+
+%macro PROCESS_SAD_8x4 0
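+    ; punpcklqdq packs two 8-byte rows into one register so a single
+    ; psadbw covers both rows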
+    movq        m1, [r2]
+    movq        m2, [r2 + r3]
+    lea         r2, [r2 + 2 * r3]
+    movq        m3, [r0]
+    movq        m4, [r0 + r1]
+    lea         r0, [r0 + 2 * r1]
+    punpcklqdq  m1, m2
+    punpcklqdq  m3, m4
+    psadbw      m1, m3
+    paddd       m0, m1
+    movq        m1, [r2]
+    movq        m2, [r2 + r3]
+    lea         r2, [r2 + 2 * r3]
+    movq        m3, [r0]
+    movq        m4, [r0 + r1]
+    lea         r0, [r0 + 2 * r1]
+    punpcklqdq  m1, m2
+    punpcklqdq  m3, m4
+    psadbw      m1, m3
+    paddd       m0, m1
+%endmacro
+
+%macro PROCESS_SAD_64x4 0
+    movu    m1,  [r2]
+    movu    m2,  [r2 + 16]
+    movu    m3,  [r2 + 32]
+    movu    m4,  [r2 + 48]
+    psadbw  m1,  [r0]
+    psadbw  m2,  [r0 + 16]
+    psadbw  m3,  [r0 + 32]
+    psadbw  m4,  [r0 + 48]
+    paddd   m1,  m2
+    paddd   m3,  m4
+    paddd   m0,  m1
+    paddd   m0,  m3
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+
+    movu    m1,  [r2]
+    movu    m2,  [r2 + 16]
+    movu    m3,  [r2 + 32]
+    movu    m4,  [r2 + 48]
+    psadbw  m1,  [r0]
+    psadbw  m2,  [r0 + 16]
+    psadbw  m3,  [r0 + 32]
+    psadbw  m4,  [r0 + 48]
+    paddd   m1,  m2
+    paddd   m3,  m4
+    paddd   m0,  m1
+    paddd   m0,  m3
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+
+    movu    m1,  [r2]
+    movu    m2,  [r2 + 16]
+    movu    m3,  [r2 + 32]
+    movu    m4,  [r2 + 48]
+    psadbw  m1,  [r0]
+    psadbw  m2,  [r0 + 16]
+    psadbw  m3,  [r0 + 32]
+    psadbw  m4,  [r0 + 48]
+    paddd   m1,  m2
+    paddd   m3,  m4
+    paddd   m0,  m1
+    paddd   m0,  m3
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+
+    movu    m1,  [r2]
+    movu    m2,  [r2 + 16]
+    movu    m3,  [r2 + 32]
+    movu    m4,  [r2 + 48]
+    psadbw  m1,  [r0]
+    psadbw  m2,  [r0 + 16]
+    psadbw  m3,  [r0 + 32]
+    psadbw  m4,  [r0 + 48]
+    paddd   m1,  m2
+    paddd   m3,  m4
+    paddd   m0,  m1
+    paddd   m0,  m3
+
+%endmacro
+
 %macro SAD_W16 0
 ;-----------------------------------------------------------------------------
 ; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
@@ -223,6 +481,376 @@
     paddw   m0, m1
     paddw   m0, m3
     SAD_END_SSE2
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_16x12( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_16x12, 4,4,3
+    pxor m0, m0
+
+    PROCESS_SAD_16x4
+    PROCESS_SAD_16x4
+    PROCESS_SAD_16x4
+
+    movhlps m1, m0
+    paddd   m0, m1
+    movd    eax, m0
+    RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_16x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_16x32, 4,5,3
+    pxor m0,  m0
+    mov  r4d, 4
+.loop:
+    PROCESS_SAD_16x4
+    PROCESS_SAD_16x4
+    dec  r4d
+    jnz .loop
+
+    movhlps m1, m0
+    paddd   m0, m1
+    movd    eax, m0
+    RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_16x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_16x64, 4,5,3
+    pxor m0,  m0
+    mov  r4d, 8
+.loop:
+    PROCESS_SAD_16x4
+    PROCESS_SAD_16x4
+    dec  r4d
+    jnz .loop
+
+    movhlps m1, m0
+    paddd   m0, m1
+    movd    eax, m0
+    RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_16x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_16x4, 4,4,3
+
+    movu    m0,  [r2]
+    movu    m1,  [r2 + r3]
+    psadbw  m0,  [r0]
+    psadbw  m1,  [r0 + r1]
+    paddd   m0,  m1
+    lea     r2,  [r2 + 2 * r3]
+    lea     r0,  [r0 + 2 * r1]
+    movu    m1,  [r2]
+    movu    m2,  [r2 + r3]
+    psadbw  m1,  [r0]
+    psadbw  m2,  [r0 + r1]
+    paddd   m1,  m2
+    paddd   m0,  m1
+
+    movhlps m1,  m0
+    paddd   m0,  m1
+    movd    eax, m0
+    RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_32x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_32x8, 4,4,3
+    pxor  m0,  m0
+
+    PROCESS_SAD_32x4
+    PROCESS_SAD_32x4
+
+    movhlps m1,  m0
+    paddd   m0,  m1
+    movd    eax, m0
+    RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_32x24( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_32x24, 4,5,3
+    pxor  m0,  m0
+    mov   r4d, 3
+.loop:
+    PROCESS_SAD_32x4
+    PROCESS_SAD_32x4
+    dec r4d
+    jnz .loop
+
+    movhlps m1,  m0
+    paddd   m0,  m1
+    movd    eax, m0
+    RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_32x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_32x32, 4,5,3
+    pxor  m0,  m0
+    mov   r4d, 4
+.loop:
+    PROCESS_SAD_32x4
+    PROCESS_SAD_32x4
+    dec r4d
+    jnz .loop
+
+    movhlps m1,  m0
+    paddd   m0,  m1
+    movd    eax, m0
+    RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_32x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_32x16, 4,4,3
+    pxor  m0,  m0
+
+    PROCESS_SAD_32x4
+    PROCESS_SAD_32x4
+    PROCESS_SAD_32x4
+    PROCESS_SAD_32x4
+
+    movhlps m1,  m0
+    paddd   m0,  m1
+    movd    eax, m0
+    RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_32x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_32x64, 4,5,3
+    pxor  m0,  m0
+    mov   r4d, 8
+.loop:
+    PROCESS_SAD_32x4
+    PROCESS_SAD_32x4
+    dec  r4d
+    jnz .loop
+
+    movhlps m1,  m0
+    paddd   m0,  m1
+    movd    eax, m0
+    RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_8x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_8x32, 4,5,5
+    pxor  m0,  m0
+    mov   r4d, 4
+.loop:
+    PROCESS_SAD_8x4
+    PROCESS_SAD_8x4
+    dec  r4d
+    jnz .loop
+
+    movhlps m1,  m0
+    paddd   m0,  m1
+    movd    eax, m0
+    RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_64x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_64x16, 4,4,5
+    pxor  m0,  m0
+
+    PROCESS_SAD_64x4
+
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+
+    PROCESS_SAD_64x4
+
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+
+    PROCESS_SAD_64x4
+
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+
+    PROCESS_SAD_64x4
+
+    movhlps m1,  m0
+    paddd   m0,  m1
+    movd    eax, m0
+    RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_64x32, 4,5,5
+    pxor  m0,  m0
+    mov   r4,  32
+
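+    ; 8 rows per pass; the cmp below exits the loop when 8 rows remain,
+    ; and those are finished by the trailing copies after it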
+.loop:
+    PROCESS_SAD_64x4
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+
+    PROCESS_SAD_64x4
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+
+    sub   r4,  8
+    cmp   r4,  8
+    jnz   .loop
+    PROCESS_SAD_64x4
+    lea   r2,  [r2 + r3]
+    lea   r0,  [r0 + r1]
+    PROCESS_SAD_64x4
+
+    movhlps m1,  m0
+    paddd   m0,  m1
+    movd    eax, m0
+    RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_64x48, 4,5,5
+    pxor  m0,  m0
+    mov   r4,  48
+
+.loop:
+    PROCESS_SAD_64x4
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+
+    PROCESS_SAD_64x4
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+
+    sub   r4,  8
+    cmp   r4,  8
+    jnz   .loop
+    PROCESS_SAD_64x4
+    lea   r2,  [r2 + r3]
+    lea   r0,  [r0 + r1]
+    PROCESS_SAD_64x4
+
+    movhlps m1,  m0
+    paddd   m0,  m1
+    movd    eax, m0
+    RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_64x64, 4,5,5
+    pxor  m0,  m0
+    mov   r4,  64
+
+.loop:
+    PROCESS_SAD_64x4
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+
+    PROCESS_SAD_64x4
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+
+    sub   r4,  8
+    cmp   r4,  8
+    jnz   .loop
+    PROCESS_SAD_64x4
+    lea   r2,  [r2 + r3]
+    lea   r0,  [r0 + r1]
+    PROCESS_SAD_64x4
+
+    movhlps m1,  m0
+    paddd   m0,  m1
+    movd    eax, m0
+    RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_48x64, 4,5,5
+    pxor  m0,  m0
+    mov   r4,  64
+
+.loop:
+    PROCESS_SAD_48x4
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+
+    PROCESS_SAD_48x4
+    lea     r2,  [r2 + r3]
+    lea     r0,  [r0 + r1]
+
+    sub   r4,  8
+    cmp   r4,  8
+    jnz   .loop
+    PROCESS_SAD_48x4
+    lea   r2,  [r2 + r3]
+    lea   r0,  [r0 + r1]
+    PROCESS_SAD_48x4
+
+    movhlps m1,  m0
+    paddd   m0,  m1
+    movd    eax, m0
+    RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_24x32, 4,5,5
+    pxor  m0,  m0
+    mov   r4,  32
+
+.loop:
+    PROCESS_SAD_24x4
+    lea         r2,  [r2 + r3]
+    lea         r0,  [r0 + r1]
+    PROCESS_SAD_24x4
+    lea         r2,  [r2 + r3]
+    lea         r0,  [r0 + r1]
+    sub   r4,  8
+    cmp   r4,  8
+    jnz   .loop
+    PROCESS_SAD_24x4
+    lea         r2,  [r2 + r3]
+    lea         r0,  [r0 + r1]
+    PROCESS_SAD_24x4
+
+    movhlps m1,  m0
+    paddd   m0,  m1
+    movd    eax, m0
+    RET
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_12x16, 4,4,5
+    mova  m4,  [MSK]
+    pxor  m0,  m0
+
+    PROCESS_SAD_12x4
+    lea         r2,  [r2 + r3]
+    lea         r0,  [r0 + r1]
+    PROCESS_SAD_12x4
+    lea         r2,  [r2 + r3]
+    lea         r0,  [r0 + r1]
+    PROCESS_SAD_12x4
+    lea         r2,  [r2 + r3]
+    lea         r0,  [r0 + r1]
+    PROCESS_SAD_12x4
+
+    movhlps m1,  m0
+    paddd   m0,  m1
+    movd    eax, m0
+    RET
+
 %endmacro
 
 INIT_XMM sse2
@@ -972,6 +1600,486 @@
     RET
 %endmacro
 
+%macro SAD_X3_24x4 0
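+    ; the +16 loads cover columns 16-31, but only 16-23 belong to the
+    ; block; pshufd 84 (0x54) keeps the valid low-qword SAD in dword 0 and
+    ; overwrites the stale high-qword SAD (dword 2) with the zero in dword 1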
+    mova    m3,  [r0]
+    mova    m4,  [r0 + 16]
+    movu    m5,  [r1]
+    movu    m6,  [r1 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    pshufd  m6,  m6, 84
+    paddd   m5,  m6
+    paddd   m0,  m5
+    movu    m5,  [r2]
+    movu    m6,  [r2 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    pshufd  m6,  m6, 84
+    paddd   m5,  m6
+    paddd   m1,  m5
+    movu    m5,  [r3]
+    movu    m6,  [r3 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    pshufd  m6,  m6, 84
+    paddd   m5,  m6
+    paddd   m2,  m5
+    lea     r0,  [r0 + FENC_STRIDE]
+    lea     r1,  [r1 + r4]
+    lea     r2,  [r2 + r4]
+    lea     r3,  [r3 + r4]
+    mova    m3,  [r0]
+    mova    m4,  [r0 + 16]
+    movu    m5,  [r1]
+    movu    m6,  [r1 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    pshufd  m6,  m6, 84
+    paddd   m5,  m6
+    paddd   m0,  m5
+    movu    m5,  [r2]
+    movu    m6,  [r2 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    pshufd  m6,  m6, 84
+    paddd   m5,  m6
+    paddd   m1,  m5
+    movu    m5,  [r3]
+    movu    m6,  [r3 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    pshufd  m6,  m6, 84
+    paddd   m5,  m6
+    paddd   m2,  m5
+    lea     r0,  [r0 + FENC_STRIDE]
+    lea     r1,  [r1 + r4]
+    lea     r2,  [r2 + r4]
+    lea     r3,  [r3 + r4]
+    mova    m3,  [r0]
+    mova    m4,  [r0 + 16]
+    movu    m5,  [r1]
+    movu    m6,  [r1 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    pshufd  m6,  m6, 84
+    paddd   m5,  m6
+    paddd   m0,  m5
+    movu    m5,  [r2]
+    movu    m6,  [r2 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    pshufd  m6,  m6, 84
+    paddd   m5,  m6
+    paddd   m1,  m5
+    movu    m5,  [r3]
+    movu    m6,  [r3 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    pshufd  m6,  m6, 84
+    paddd   m5,  m6
+    paddd   m2,  m5
+    lea     r0,  [r0 + FENC_STRIDE]
+    lea     r1,  [r1 + r4]
+    lea     r2,  [r2 + r4]
+    lea     r3,  [r3 + r4]
+    mova    m3,  [r0]
+    mova    m4,  [r0 + 16]
+    movu    m5,  [r1]
+    movu    m6,  [r1 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    pshufd  m6,  m6, 84
+    paddd   m5,  m6
+    paddd   m0,  m5
+    movu    m5,  [r2]
+    movu    m6,  [r2 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    pshufd  m6,  m6, 84
+    paddd   m5,  m6
+    paddd   m1,  m5
+    movu    m5,  [r3]
+    movu    m6,  [r3 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    pshufd  m6,  m6, 84
+    paddd   m5,  m6
+    paddd   m2,  m5
+    lea     r0,  [r0 + FENC_STRIDE]
+    lea     r1,  [r1 + r4]
+    lea     r2,  [r2 + r4]
+    lea     r3,  [r3 + r4]
+%endmacro
+
+%macro SAD_X4_24x4 0
+    mova    m4,  [r0]
+    mova    m5,  [r0 + 16]
+    movu    m6,  [r1]
+    movu    m7,  [r1 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    pshufd  m7,  m7, 84
+    paddd   m6,  m7
+    paddd   m0,  m6
+    movu    m6,  [r2]
+    movu    m7,  [r2 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    pshufd  m7,  m7, 84
+    paddd   m6,  m7
+    paddd   m1,  m6
+    movu    m6,  [r3]
+    movu    m7,  [r3 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    pshufd  m7,  m7, 84
+    paddd   m6,  m7
+    paddd   m2,  m6
+    movu    m6,  [r4]
+    movu    m7,  [r4 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    pshufd  m7,  m7, 84
+    paddd   m6,  m7
+    paddd   m3,  m6
+    lea     r0,  [r0 + FENC_STRIDE]
+    lea     r1,  [r1 + r5]
+    lea     r2,  [r2 + r5]
+    lea     r3,  [r3 + r5]
+    lea     r4,  [r4 + r5]
+    mova    m4,  [r0]
+    mova    m5,  [r0 + 16]
+    movu    m6,  [r1]
+    movu    m7,  [r1 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    pshufd  m7,  m7, 84
+    paddd   m6,  m7
+    paddd   m0,  m6
+    movu    m6,  [r2]
+    movu    m7,  [r2 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    pshufd  m7,  m7, 84
+    paddd   m6,  m7
+    paddd   m1,  m6
+    movu    m6,  [r3]
+    movu    m7,  [r3 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    pshufd  m7,  m7, 84
+    paddd   m6,  m7
+    paddd   m2,  m6
+    movu    m6,  [r4]
+    movu    m7,  [r4 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    pshufd  m7,  m7, 84
+    paddd   m6,  m7
+    paddd   m3,  m6
+    lea     r0,  [r0 + FENC_STRIDE]
+    lea     r1,  [r1 + r5]
+    lea     r2,  [r2 + r5]
+    lea     r3,  [r3 + r5]
+    lea     r4,  [r4 + r5]
+    mova    m4,  [r0]
+    mova    m5,  [r0 + 16]
+    movu    m6,  [r1]
+    movu    m7,  [r1 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    pshufd  m7,  m7, 84
+    paddd   m6,  m7
+    paddd   m0,  m6
+    movu    m6,  [r2]
+    movu    m7,  [r2 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    pshufd  m7,  m7, 84
+    paddd   m6,  m7
+    paddd   m1,  m6
+    movu    m6,  [r3]
+    movu    m7,  [r3 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    pshufd  m7,  m7, 84
+    paddd   m6,  m7
+    paddd   m2,  m6
+    movu    m6,  [r4]
+    movu    m7,  [r4 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    pshufd  m7,  m7, 84
+    paddd   m6,  m7
+    paddd   m3,  m6
+    lea     r0,  [r0 + FENC_STRIDE]
+    lea     r1,  [r1 + r5]
+    lea     r2,  [r2 + r5]
+    lea     r3,  [r3 + r5]
+    lea     r4,  [r4 + r5]
+    mova    m4,  [r0]
+    mova    m5,  [r0 + 16]
+    movu    m6,  [r1]
+    movu    m7,  [r1 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    pshufd  m7,  m7, 84
+    paddd   m6,  m7
+    paddd   m0,  m6
+    movu    m6,  [r2]
+    movu    m7,  [r2 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    pshufd  m7,  m7, 84
+    paddd   m6,  m7
+    paddd   m1,  m6
+    movu    m6,  [r3]
+    movu    m7,  [r3 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    pshufd  m7,  m7, 84
+    paddd   m6,  m7
+    paddd   m2,  m6
+    movu    m6,  [r4]
+    movu    m7,  [r4 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    pshufd  m7,  m7, 84
+    paddd   m6,  m7
+    paddd   m3,  m6
+    lea     r0,  [r0 + FENC_STRIDE]
+    lea     r1,  [r1 + r5]
+    lea     r2,  [r2 + r5]
+    lea     r3,  [r3 + r5]
+    lea     r4,  [r4 + r5]
+%endmacro
+
+%macro SAD_X3_32x4 0
+    mova    m3,  [r0]
+    mova    m4,  [r0 + 16]
+    movu    m5,  [r1]
+    movu    m6,  [r1 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    paddd   m5,  m6
+    paddd   m0,  m5
+    movu    m5,  [r2]
+    movu    m6,  [r2 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    paddd   m5,  m6
+    paddd   m1,  m5
+    movu    m5,  [r3]
+    movu    m6,  [r3 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    paddd   m5,  m6
+    paddd   m2,  m5
+    lea     r0,  [r0 + FENC_STRIDE]
+    lea     r1,  [r1 + r4]
+    lea     r2,  [r2 + r4]
+    lea     r3,  [r3 + r4]
+    mova    m3,  [r0]
+    mova    m4,  [r0 + 16]
+    movu    m5,  [r1]
+    movu    m6,  [r1 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    paddd   m5,  m6
+    paddd   m0,  m5
+    movu    m5,  [r2]
+    movu    m6,  [r2 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    paddd   m5,  m6
+    paddd   m1,  m5
+    movu    m5,  [r3]
+    movu    m6,  [r3 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    paddd   m5,  m6
+    paddd   m2,  m5
+    lea     r0,  [r0 + FENC_STRIDE]
+    lea     r1,  [r1 + r4]
+    lea     r2,  [r2 + r4]
+    lea     r3,  [r3 + r4]
+    mova    m3,  [r0]
+    mova    m4,  [r0 + 16]
+    movu    m5,  [r1]
+    movu    m6,  [r1 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    paddd   m5,  m6
+    paddd   m0,  m5
+    movu    m5,  [r2]
+    movu    m6,  [r2 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    paddd   m5,  m6
+    paddd   m1,  m5
+    movu    m5,  [r3]
+    movu    m6,  [r3 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    paddd   m5,  m6
+    paddd   m2,  m5
+    lea     r0,  [r0 + FENC_STRIDE]
+    lea     r1,  [r1 + r4]
+    lea     r2,  [r2 + r4]
+    lea     r3,  [r3 + r4]
+    mova    m3,  [r0]
+    mova    m4,  [r0 + 16]
+    movu    m5,  [r1]
+    movu    m6,  [r1 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    paddd   m5,  m6
+    paddd   m0,  m5
+    movu    m5,  [r2]
+    movu    m6,  [r2 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    paddd   m5,  m6
+    paddd   m1,  m5
+    movu    m5,  [r3]
+    movu    m6,  [r3 + 16]
+    psadbw  m5,  m3
+    psadbw  m6,  m4
+    paddd   m5,  m6
+    paddd   m2,  m5
+    lea     r0,  [r0 + FENC_STRIDE]
+    lea     r1,  [r1 + r4]
+    lea     r2,  [r2 + r4]
+    lea     r3,  [r3 + r4]
+%endmacro
+
+%macro SAD_X4_32x4 0
+    mova    m4,  [r0]
+    mova    m5,  [r0 + 16]
+    movu    m6,  [r1]
+    movu    m7,  [r1 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    paddd   m6,  m7
+    paddd   m0,  m6
+    movu    m6,  [r2]
+    movu    m7,  [r2 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    paddd   m6,  m7
+    paddd   m1,  m6
+    movu    m6,  [r3]
+    movu    m7,  [r3 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    paddd   m6,  m7
+    paddd   m2,  m6
+    movu    m6,  [r4]
+    movu    m7,  [r4 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    paddd   m6,  m7
+    paddd   m3,  m6
+    lea     r0,  [r0 + FENC_STRIDE]
+    lea     r1,  [r1 + r5]
+    lea     r2,  [r2 + r5]
+    lea     r3,  [r3 + r5]
+    lea     r4,  [r4 + r5]
+    mova    m4,  [r0]
+    mova    m5,  [r0 + 16]
+    movu    m6,  [r1]
+    movu    m7,  [r1 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    paddd   m6,  m7
+    paddd   m0,  m6
+    movu    m6,  [r2]
+    movu    m7,  [r2 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    paddd   m6,  m7
+    paddd   m1,  m6
+    movu    m6,  [r3]
+    movu    m7,  [r3 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    paddd   m6,  m7
+    paddd   m2,  m6
+    movu    m6,  [r4]
+    movu    m7,  [r4 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    paddd   m6,  m7
+    paddd   m3,  m6
+    lea     r0,  [r0 + FENC_STRIDE]
+    lea     r1,  [r1 + r5]
+    lea     r2,  [r2 + r5]
+    lea     r3,  [r3 + r5]
+    lea     r4,  [r4 + r5]
+    mova    m4,  [r0]
+    mova    m5,  [r0 + 16]
+    movu    m6,  [r1]
+    movu    m7,  [r1 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    paddd   m6,  m7
+    paddd   m0,  m6
+    movu    m6,  [r2]
+    movu    m7,  [r2 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    paddd   m6,  m7
+    paddd   m1,  m6
+    movu    m6,  [r3]
+    movu    m7,  [r3 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    paddd   m6,  m7
+    paddd   m2,  m6
+    movu    m6,  [r4]
+    movu    m7,  [r4 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    paddd   m6,  m7
+    paddd   m3,  m6
+    lea     r0,  [r0 + FENC_STRIDE]
+    lea     r1,  [r1 + r5]
+    lea     r2,  [r2 + r5]
+    lea     r3,  [r3 + r5]
+    lea     r4,  [r4 + r5]
+    mova    m4,  [r0]
+    mova    m5,  [r0 + 16]
+    movu    m6,  [r1]
+    movu    m7,  [r1 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    paddd   m6,  m7
+    paddd   m0,  m6
+    movu    m6,  [r2]
+    movu    m7,  [r2 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    paddd   m6,  m7
+    paddd   m1,  m6
+    movu    m6,  [r3]
+    movu    m7,  [r3 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    paddd   m6,  m7
+    paddd   m2,  m6
+    movu    m6,  [r4]
+    movu    m7,  [r4 + 16]
+    psadbw  m6,  m4
+    psadbw  m7,  m5
+    paddd   m6,  m7
+    paddd   m3,  m6
+    lea     r0,  [r0 + FENC_STRIDE]
+    lea     r1,  [r1 + r5]
+    lea     r2,  [r2 + r5]
+    lea     r3,  [r3 + r5]
+    lea     r4,  [r4 + r5]
+%endmacro
+
 ;-----------------------------------------------------------------------------
 ; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
 ;                          uint8_t *pix2, intptr_t i_stride, int scores[3] )
@@ -991,6 +2099,7 @@
 SAD_X 3,  8, 16
 SAD_X 3,  8,  8
 SAD_X 3,  8,  4
+SAD_X 3,  4, 16
 SAD_X 3,  4,  8
 SAD_X 3,  4,  4
 SAD_X 4, 16, 16
@@ -998,6 +2107,7 @@
 SAD_X 4,  8, 16
 SAD_X 4,  8,  8
 SAD_X 4,  8,  4
+SAD_X 4,  4, 16
 SAD_X 4,  4,  8
 SAD_X 4,  4,  4
 
@@ -1513,6 +2623,206 @@
 %endif
 %endmacro
 
+%macro SAD_X3_W24 0
+cglobal pixel_sad_x3_24x32, 5, 7, 8
+    pxor  m0, m0
+    pxor  m1, m1
+    pxor  m2, m2
+    mov   r6, 32
+
+.loop:
+    SAD_X3_24x4
+    SAD_X3_24x4
+    SAD_X3_24x4
+    SAD_X3_24x4
+
+    sub r6,  16
+    jnz .loop
+    SAD_X3_END_SSE2 1
+%endmacro
+
+%macro SAD_X4_W24 0
+%if ARCH_X86_64 == 1
+cglobal pixel_sad_x4_24x32, 6, 8, 8
+%define count r7
+%else
+cglobal pixel_sad_x4_24x32, 6, 7, 8, 0-4
+%define count dword [rsp]
+%endif
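+; x86-32 has no spare GPR for the row counter, so it lives in a stack
+; slot ("count") instead of r7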
+    pxor  m0, m0
+    pxor  m1, m1
+    pxor  m2, m2
+    pxor  m3, m3
+    mov   count, 32
+
+.loop:
+    SAD_X4_24x4
+    SAD_X4_24x4
+    SAD_X4_24x4
+    SAD_X4_24x4
+
+    sub count,  16
+    jnz .loop
+    SAD_X4_END_SSE2 1
+
+%endmacro
+
+%macro SAD_X3_W32 0
+cglobal pixel_sad_x3_32x8, 5, 6, 8
+    pxor  m0, m0
+    pxor  m1, m1
+    pxor  m2, m2
+
+    SAD_X3_32x4
+    SAD_X3_32x4
+    SAD_X3_END_SSE2 1
+
+cglobal pixel_sad_x3_32x16, 5, 6, 8
+    pxor  m0, m0
+    pxor  m1, m1
+    pxor  m2, m2
+
+    SAD_X3_32x4
+    SAD_X3_32x4
+    SAD_X3_32x4
+    SAD_X3_32x4
+    SAD_X3_END_SSE2 1
+
+cglobal pixel_sad_x3_32x24, 5, 6, 8
+    pxor  m0, m0
+    pxor  m1, m1
+    pxor  m2, m2
+
+    SAD_X3_32x4
+    SAD_X3_32x4
+    SAD_X3_32x4
+    SAD_X3_32x4
+    SAD_X3_32x4
+    SAD_X3_32x4
+    SAD_X3_END_SSE2 1
+
+cglobal pixel_sad_x3_32x32, 5, 7, 8
+    pxor  m0, m0
+    pxor  m1, m1
+    pxor  m2, m2
+    mov   r6, 32
+
+.loop:
+    SAD_X3_32x4
+    SAD_X3_32x4
+    SAD_X3_32x4
+    SAD_X3_32x4
+
+    sub r6,  16
+    jnz .loop
+    SAD_X3_END_SSE2 1
+
+cglobal pixel_sad_x3_32x64, 5, 7, 8
+    pxor  m0, m0
+    pxor  m1, m1
+    pxor  m2, m2
+    mov   r6, 64
+
+.loop1:
+    SAD_X3_32x4
+    SAD_X3_32x4
+    SAD_X3_32x4
+    SAD_X3_32x4
+
+    sub r6,  16
+    jnz .loop1
+    SAD_X3_END_SSE2 1
+%endmacro
+
+%macro SAD_X4_W32 0
+cglobal pixel_sad_x4_32x8, 6, 7, 8
+    pxor  m0, m0
+    pxor  m1, m1
+    pxor  m2, m2
+    pxor  m3, m3
+
+    SAD_X4_32x4
+    SAD_X4_32x4
+    SAD_X4_END_SSE2 1
+
+cglobal pixel_sad_x4_32x16, 6, 7, 8
+    pxor  m0, m0
+    pxor  m1, m1
+    pxor  m2, m2
+    pxor  m3, m3
+
+    SAD_X4_32x4
+    SAD_X4_32x4
+    SAD_X4_32x4
+    SAD_X4_32x4
+    SAD_X4_END_SSE2 1
+
+cglobal pixel_sad_x4_32x24, 6, 7, 8
+    pxor  m0, m0
+    pxor  m1, m1
+    pxor  m2, m2
+    pxor  m3, m3
+
+    SAD_X4_32x4
+    SAD_X4_32x4
+    SAD_X4_32x4
+    SAD_X4_32x4
+    SAD_X4_32x4
+    SAD_X4_32x4
+    SAD_X4_END_SSE2 1
+
+%if ARCH_X86_64 == 1
+cglobal pixel_sad_x4_32x32, 6, 8, 8
+%define count r7
+%else
+cglobal pixel_sad_x4_32x32, 6, 7, 8, 0-4
+%define count dword [rsp]
+%endif
+    pxor  m0, m0
+    pxor  m1, m1
+    pxor  m2, m2
+    pxor  m3, m3
+    mov   count, 32
+
+.loop:
+    SAD_X4_32x4
+    SAD_X4_32x4
+    SAD_X4_32x4
+    SAD_X4_32x4
+
+    sub count,  16
+    jnz .loop
+    SAD_X4_END_SSE2 1
+
+%if ARCH_X86_64 == 1
+cglobal pixel_sad_x4_32x64, 6, 8, 8
+%define count r7
+%else
+cglobal pixel_sad_x4_32x64, 6, 7, 8, 0-4
+%define count dword [rsp]
+%endif
+    pxor  m0, m0
+    pxor  m1, m1
+    pxor  m2, m2
+    pxor  m3, m3
+    mov   count, 64
+
+.loop:
+    SAD_X4_32x4
+    SAD_X4_32x4
+    SAD_X4_32x4
+    SAD_X4_32x4
+
+    sub count,  16
+    jnz .loop
+    SAD_X4_END_SSE2 1
+
+%endmacro
+
+
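If it helps to map the new widths to the dispatch table: the macros above are
instantiated per ISA by the INIT_XMM blocks below, and the resulting symbols
would be wired up in asm-primitives.cpp along these lines (a sketch only; the
type, index and symbol names are assumed from x265's cglobal conventions and
are not part of this patch):

    #include <cstdint>

    typedef void (*sad_x3_t)(const uint8_t*, const uint8_t*, const uint8_t*,
                             const uint8_t*, intptr_t, int32_t*);

    // Symbol name assumed from the cglobal prefix convention.
    extern "C" void x265_pixel_sad_x3_32x32_ssse3(const uint8_t*, const uint8_t*,
                                                  const uint8_t*, const uint8_t*,
                                                  intptr_t, int32_t*);

    struct Primitives { sad_x3_t sad_x3[25 /* NUM_LUMA_PARTITIONS, assumed */]; };

    void setupSadX3(Primitives& p, int luma32x32)
    {
        p.sad_x3[luma32x32] = x265_pixel_sad_x3_32x32_ssse3;
    }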
 INIT_XMM sse2
 SAD_X_SSE2 3, 16, 16, 7
 SAD_X_SSE2 3, 16,  8, 7
@@ -1544,6 +2854,8 @@
 %endmacro
 
 INIT_XMM ssse3
+SAD_X3_W32
+SAD_X3_W24
 SAD_X_SSE2  3, 16, 64, 7
 SAD_X_SSE2  3, 16, 32, 7
 SAD_X_SSE2  3, 16, 16, 7
@@ -1551,6 +2863,8 @@
 SAD_X_SSE2  3, 16,  8, 7
 SAD_X_SSE2  3,  8, 32, 7
 SAD_X_SSE2  3,  8, 16, 7
+SAD_X4_W24
+SAD_X4_W32
 SAD_X_SSE2  4, 16, 64, 7
 SAD_X_SSE2  4, 16, 32, 7
 SAD_X_SSE2  4, 16, 16, 7
@@ -1562,12 +2876,16 @@
 SAD_X_SSSE3 4,  8,  4
 
 INIT_XMM avx
+SAD_X3_W32
+SAD_X3_W24
 SAD_X_SSE2 3, 16, 64, 7
 SAD_X_SSE2 3, 16, 32, 6
 SAD_X_SSE2 3, 16, 16, 6
 SAD_X_SSE2 3, 16, 12, 6
 SAD_X_SSE2 3, 16,  8, 6
 SAD_X_SSE2 3, 16,  4, 6
+SAD_X4_W24
+SAD_X4_W32
 SAD_X_SSE2 4, 16, 64, 7
 SAD_X_SSE2 4, 16, 32, 7
 SAD_X_SSE2 4, 16, 16, 7
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/encoder/compress.cpp
--- a/source/encoder/compress.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/encoder/compress.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -26,6 +26,7 @@
 
 /* Lambda Partition Select adjusts the threshold value for Early Exit in No-RDO flow */
 #define LAMBDA_PARTITION_SELECT     0.9
+#define EARLY_EXIT                  1
 
 using namespace x265;
 
@@ -222,10 +223,12 @@
     m_tmpResiYuv[depth]->clear();
 
     //do motion compensation only for Luma since luma cost alone is calculated
+    outTempCU->m_totalBits = 0;
     m_search->predInterSearch(outTempCU, outPredYuv, bUseMRG, true, false);
     int part = partitionFromSizes(outTempCU->getWidth(0), outTempCU->getHeight(0));
-    outTempCU->m_totalCost = primitives.sse_pp[part](m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
-                                                     outPredYuv->getLumaAddr(), outPredYuv->getStride());
+    uint32_t distortion = primitives.sse_pp[part](m_origYuv[depth]->getLumaAddr(), m_origYuv[depth]->getStride(),
+                                                  outPredYuv->getLumaAddr(), outPredYuv->getStride());
+    outTempCU->m_totalCost = m_rdCost->calcRdCost(distortion, outTempCU->m_totalBits);
 }
 
 void TEncCu::xComputeCostMerge2Nx2N(TComDataCU*& outBestCU, TComDataCU*& outTempCU, bool* earlyDetectionSkip, TComYuv*& bestPredYuv, TComYuv*& yuvReconBest)
@@ -245,6 +248,7 @@
     outTempCU->setCUTransquantBypassSubParts(m_cfg->getCUTransquantBypassFlagValue(), 0, depth);
     outTempCU->getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours, numValidMergeCand);
 
+    int bestMergeCand = 0;
     for (int mergeCand = 0; mergeCand < numValidMergeCand; ++mergeCand)
     {
         // set MC parameters, interprets depth relative to LCU level
@@ -268,6 +272,7 @@
 
         if (outTempCU->m_totalCost < outBestCU->m_totalCost)
         {
+            bestMergeCand = mergeCand;
             TComDataCU* tmp = outTempCU;
             outTempCU = outBestCU;
             outBestCU = tmp;
@@ -286,7 +291,44 @@
     {
         m_search->motionCompensation(outBestCU, bestPredYuv, REF_PIC_LIST_X, partIdx, false, true);
     }
-    m_search->encodeResAndCalcRdInterCU(outBestCU, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], yuvReconBest, false);
+
+    TComDataCU* tmp;
+    TComYuv* yuv;
+
+    outTempCU->setPredModeSubParts(MODE_INTER, 0, depth);
+    outTempCU->setCUTransquantBypassSubParts(m_cfg->getCUTransquantBypassFlagValue(), 0, depth);
+    outTempCU->setPartSizeSubParts(SIZE_2Nx2N, 0, depth);
+    outTempCU->setMergeFlagSubParts(true, 0, 0, depth);
+    outTempCU->setMergeIndexSubParts(bestMergeCand, 0, 0, depth);
+    outTempCU->setInterDirSubParts(interDirNeighbours[bestMergeCand], 0, 0, depth);
+    outTempCU->getCUMvField(REF_PIC_LIST_0)->setAllMvField(mvFieldNeighbours[0 + 2 * bestMergeCand], SIZE_2Nx2N, 0, 0);
+    outTempCU->getCUMvField(REF_PIC_LIST_1)->setAllMvField(mvFieldNeighbours[1 + 2 * bestMergeCand], SIZE_2Nx2N, 0, 0);
+
+    // No-residue mode
+    m_search->encodeResAndCalcRdInterCU(outTempCU, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth], true);
+
+    tmp = outTempCU;
+    outTempCU = outBestCU;
+    outBestCU = tmp;
+
+    yuv = yuvReconBest;
+    yuvReconBest = m_tmpRecoYuv[depth];
+    m_tmpRecoYuv[depth] = yuv;
+
+    // Encode with residue
+    m_search->encodeResAndCalcRdInterCU(outTempCU, m_origYuv[depth], bestPredYuv, m_tmpResiYuv[depth], m_bestResiYuv[depth], m_tmpRecoYuv[depth], false);
+
+    if (outTempCU->m_totalCost < outBestCU->m_totalCost)    // choose the better of the no-residue and residue modes
+    {
+        tmp = outTempCU;
+        outTempCU = outBestCU;
+        outBestCU = tmp;
+
+        yuv = yuvReconBest;
+        yuvReconBest = m_tmpRecoYuv[depth];
+        m_tmpRecoYuv[depth] = yuv;
+    }
+
     if (m_cfg->param.bEnableEarlySkip)
     {
         if (outBestCU->getQtRootCbf(0) == 0)
@@ -531,7 +573,78 @@
     // further split
     if (bSubBranch && bTrySplitDQP && depth < g_maxCUDepth - g_addCUDepth)
     {
+#if EARLY_EXIT // set the EARLY_EXIT define above to 0 to disable this early exit
+        // early exit when the RD cost of the best mode at depth "n" is less than the average RD cost
+        // at depth "n" of the neighboring CUs (above, above-left, above-right, left, colocated) in previously coded CUs
+        if (outBestCU != 0)
+        {
+            UInt64 costCU = 0, costCUAbove = 0, costCUAboveLeft = 0, costCUAboveRight = 0, costCULeft = 0, costCUColocated0 = 0, costCUColocated1 = 0, totalCost = 0, avgCost = 0;
+            UInt64 countCU = 0, countCUAbove = 0, countCUAboveLeft = 0, countCUAboveRight = 0, countCULeft = 0, countCUColocated0 = 0, countCUColocated1 = 0;
+            UInt64 totalCount = 0;
+            TComDataCU* above = outTempCU->getCUAbove();
+            TComDataCU* aboveLeft = outTempCU->getCUAboveLeft();
+            TComDataCU* aboveRight = outTempCU->getCUAboveRight();
+            TComDataCU* left = outTempCU->getCULeft();
+            TComDataCU* colocated0 = outTempCU->getCUColocated(REF_PIC_LIST_0);
+            TComDataCU* colocated1 = outTempCU->getCUColocated(REF_PIC_LIST_1);
+
+            costCU = outTempCU->m_avgCost[depth] * outTempCU->m_count[depth];
+            countCU = outTempCU->m_count[depth];
+            if (above)
+            {
+                costCUAbove = above->m_avgCost[depth] * above->m_count[depth];
+                countCUAbove = above->m_count[depth];
+            }
+            if (aboveLeft)
+            {
+                costCUAboveLeft = aboveLeft->m_avgCost[depth] * aboveLeft->m_count[depth];
+                countCUAboveLeft = aboveLeft->m_count[depth];
+            }
+            if (aboveRight)
+            {
+                costCUAboveRight = aboveRight->m_avgCost[depth] * aboveRight->m_count[depth];
+                countCUAboveRight = aboveRight->m_count[depth];
+            }
+            if (left)
+            {
+                costCULeft = left->m_avgCost[depth] * left->m_count[depth];
+                countCULeft = left->m_count[depth];
+            }
+            if (colocated0)
+            {
+                costCUColocated0 = colocated0->m_avgCost[depth] * colocated0->m_count[depth];
+                countCUColocated0 = colocated0->m_count[depth];
+            }
+            if (colocated1)
+            {
+                costCUColocated1 = colocated1->m_avgCost[depth] * colocated1->m_count[depth];
+                countCUColocated1 = colocated1->m_count[depth];
+            }
+
+            totalCost = costCU + costCUAbove + costCUAboveLeft + costCUAboveRight + costCULeft + costCUColocated0 + costCUColocated1;
+            totalCount = countCU + countCUAbove + countCUAboveLeft + countCUAboveRight + countCULeft + countCUColocated0 + countCUColocated1;
+            if (totalCount != 0)
+                avgCost = totalCost / totalCount;
+
+            float lambda = 1.0f;
+
+            if (outBestCU->m_totalCost < lambda * avgCost && avgCost != 0 && depth != 0)
+            {
+                m_entropyCoder->resetBits();
+                m_entropyCoder->encodeSplitFlag(outBestCU, 0, depth, true);
+                outBestCU->m_totalBits += m_entropyCoder->getNumberOfWrittenBits();        // split bits
+                outBestCU->m_totalCost  = m_rdCost->calcRdCost(outBestCU->m_totalDistortion, outBestCU->m_totalBits);
+                /* Copy Best data to Picture for next partition prediction. */
+                outBestCU->copyToPic((UChar)depth);
+
+                /* Copy Yuv data to picture Yuv */
+                xCopyYuv2Pic(outBestCU->getPic(), outBestCU->getAddr(), outBestCU->getZorderIdxInCU(), depth, depth, outBestCU, lpelx, tpely);
+                return;
+            }
+        }
+#endif
 #if 0 // turn ON this to enable early exit
+        // early exit when the RD cost of the best mode is less than the cumulative RD cost of the 4 subpartitions
         UInt64 nxnCost = 0;
         if (outBestCU != 0 && depth > 0)
         {
@@ -612,7 +725,22 @@
                     m_rdSbacCoders[nextDepth][CI_CURR_BEST]->load(m_rdSbacCoders[nextDepth][CI_NEXT_BEST]);
                 }
                 xCompressInterCU(subBestPartCU, subTempPartCU, outTempCU, nextDepth, nextDepth_partIndex);
-
+#if EARLY_EXIT
+                for (int k = 0; k < 4; k++)
+                {
+                    outTempCU->m_avgCost[k] = subTempPartCU->m_avgCost[k];
+                    outTempCU->m_count[k] = subTempPartCU->m_count[k];
+                }
+                if (subBestPartCU->getPredictionMode(0) != MODE_INTRA)
+                {
+                    UInt64 tempavgCost = subBestPartCU->m_totalCost;
+                    UInt64 temp = outTempCU->m_avgCost[depth + 1] * outTempCU->m_count[depth + 1];
+                    outTempCU->m_count[depth + 1] += 1;
+                    outTempCU->getPic()->getPicSym()->getCU(outTempCU->getAddr())->m_count[depth + 1] += 1;
+                    outTempCU->m_avgCost[depth + 1] = (temp + tempavgCost) / outTempCU->m_count[depth + 1];
+                    outTempCU->getPic()->getPicSym()->getCU(outTempCU->getAddr())->m_avgCost[depth + 1] = outTempCU->m_avgCost[depth + 1];
+                }
+#endif
                 /* Adding costs from best SUbCUs */
                 outTempCU->copyPartFrom(subBestPartCU, nextDepth_partIndex, nextDepth, true); // Keep best part data to current temporary data.
                 xCopyYuv2Tmp(subBestPartCU->getTotalNumPart() * nextDepth_partIndex, nextDepth);
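The m_avgCost/m_count bookkeeping above is an incremental integer mean: the
old mean is expanded back into a running sum, the new subCU cost is added,
and the sum is divided by the incremented count. Isolated as a sketch (UInt64
here stands in for x265's typedef):

    #include <cstdint>

    typedef uint64_t UInt64; // stands in for x265's UInt64

    // Incremental mean as used for m_avgCost[depth + 1].
    static UInt64 updateAvgCost(UInt64 avg, UInt64& count, UInt64 cost)
    {
        UInt64 sum = avg * count + cost; // recover the running sum
        count += 1;
        return sum / count;              // integer division, as in the patch
    }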
@@ -708,6 +836,16 @@
          * Copy recon data from Temp structure to Best structure */
         if (outBestCU)
         {
+            if (depth == 0)
+            {
+                UInt64 tempavgCost = outBestCU->m_totalCost;
+                UInt64 temp = outTempCU->m_avgCost[depth] * outTempCU->m_count[depth];
+                outTempCU->m_count[depth] += 1;
+                outTempCU->getPic()->getPicSym()->getCU(outTempCU->getAddr())->m_count[depth] += 1;
+
+                outTempCU->m_avgCost[depth] = (temp + tempavgCost) / outTempCU->m_count[depth];
+                outTempCU->getPic()->getPicSym()->getCU(outTempCU->getAddr())->m_avgCost[depth] = outTempCU->m_avgCost[depth];
+            }
             if (outTempCU->m_totalCost < outBestCU->m_totalCost)
             {
                 outBestCU = outTempCU;
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/encoder/encoder.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -494,52 +494,18 @@
     const char* digestStr = NULL;
     if (param.decodedPictureHashSEI)
     {
-        SEIDecodedPictureHash sei_recon_picture_digest;
         if (param.decodedPictureHashSEI == 1)
         {
-            /* calculate MD5sum for entire reconstructed picture */
-            sei_recon_picture_digest.method = SEIDecodedPictureHash::MD5;
-            for (int i = 0; i < 3; i++)
-            {
-                MD5Final(&(pic->m_state[i]), sei_recon_picture_digest.digest[i]);
-            }
-            digestStr = digestToString(sei_recon_picture_digest.digest, 16);
+            digestStr = digestToString(m_frameEncoder->m_seiReconPictureDigest.digest, 16);
         }
         else if (param.decodedPictureHashSEI == 2)
         {
-            sei_recon_picture_digest.method = SEIDecodedPictureHash::CRC;
-            for (int i = 0; i < 3; i++)
-            {
-                crcFinish((pic->m_crc[i]), sei_recon_picture_digest.digest[i]);
-            }
-            digestStr = digestToString(sei_recon_picture_digest.digest, 2);
+            digestStr = digestToString(m_frameEncoder->m_seiReconPictureDigest.digest, 2);
         }
         else if (param.decodedPictureHashSEI == 3)
         {
-            sei_recon_picture_digest.method = SEIDecodedPictureHash::CHECKSUM;
-            for (int i = 0; i < 3; i++)
-            {
-                checksumFinish(pic->m_checksum[i], sei_recon_picture_digest.digest[i]);
-            }
-            digestStr = digestToString(sei_recon_picture_digest.digest, 4);
+            digestStr = digestToString(m_frameEncoder->m_seiReconPictureDigest.digest, 4);
         }
-
-        /* write the SEI messages */
-        OutputNALUnit onalu(NAL_UNIT_SUFFIX_SEI, 0);
-        m_frameEncoder->m_seiWriter.writeSEImessage(onalu.m_Bitstream, sei_recon_picture_digest, pic->getSlice()->getSPS());
-        writeRBSPTrailingBits(onalu.m_Bitstream);
-
-        int count = 0;
-        while (nalunits[count] != NULL)
-        {
-            count++;
-        }
-
-        nalunits[count] = (NALUnitEBSP*)X265_MALLOC(NALUnitEBSP, 1);
-        if (nalunits[count])
-            nalunits[count]->init(onalu);
-        else
-            digestStr = NULL;
     }
 
     /* calculate the size of the access unit, excluding:
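A note for reviewers: the second argument of digestToString() is the per-plane
digest length in bytes, which is why the three branches above pass 16, 2 and 4.
Compactly (the helper name is hypothetical, for illustration only):

    // Bytes of digest printed per plane for each decodedPictureHashSEI mode,
    // matching the digestToString() calls above.
    static int digestBytes(int hashMode)
    {
        switch (hashMode)
        {
        case 1: return 16; // SEIDecodedPictureHash::MD5
        case 2: return 2;  // SEIDecodedPictureHash::CRC
        case 3: return 4;  // SEIDecodedPictureHash::CHECKSUM
        default: return 0;
        }
    }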
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/encoder/frameencoder.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -188,25 +188,25 @@
 
     /* headers for start of bitstream */
     OutputNALUnit nalu(NAL_UNIT_VPS);
-    entropyCoder->setBitstream(&nalu.m_Bitstream);
+    entropyCoder->setBitstream(&nalu.m_bitstream);
     entropyCoder->encodeVPS(m_cfg->getVPS());
-    writeRBSPTrailingBits(nalu.m_Bitstream);
+    writeRBSPTrailingBits(nalu.m_bitstream);
     CHECKED_MALLOC(nalunits[count], NALUnitEBSP, 1);
     nalunits[count]->init(nalu);
     count++;
 
     nalu = NALUnit(NAL_UNIT_SPS);
-    entropyCoder->setBitstream(&nalu.m_Bitstream);
+    entropyCoder->setBitstream(&nalu.m_bitstream);
     entropyCoder->encodeSPS(&m_sps);
-    writeRBSPTrailingBits(nalu.m_Bitstream);
+    writeRBSPTrailingBits(nalu.m_bitstream);
     CHECKED_MALLOC(nalunits[count], NALUnitEBSP, 1);
     nalunits[count]->init(nalu);
     count++;
 
     nalu = NALUnit(NAL_UNIT_PPS);
-    entropyCoder->setBitstream(&nalu.m_Bitstream);
+    entropyCoder->setBitstream(&nalu.m_bitstream);
     entropyCoder->encodePPS(&m_pps);
-    writeRBSPTrailingBits(nalu.m_Bitstream);
+    writeRBSPTrailingBits(nalu.m_bitstream);
     CHECKED_MALLOC(nalunits[count], NALUnitEBSP, 1);
     nalunits[count]->init(nalu);
     count++;
@@ -220,9 +220,9 @@
         sei.numSpsIdsMinus1 = 0;
         sei.activeSeqParamSetId = m_sps.getSPSId();
 
-        entropyCoder->setBitstream(&nalu.m_Bitstream);
-        m_seiWriter.writeSEImessage(nalu.m_Bitstream, sei, &m_sps);
-        writeRBSPTrailingBits(nalu.m_Bitstream);
+        entropyCoder->setBitstream(&nalu.m_bitstream);
+        m_seiWriter.writeSEImessage(nalu.m_bitstream, sei, &m_sps);
+        writeRBSPTrailingBits(nalu.m_bitstream);
         CHECKED_MALLOC(nalunits[count], NALUnitEBSP, 1);
         nalunits[count]->init(nalu);
         count++;
@@ -237,9 +237,9 @@
         sei.anticlockwiseRotation = m_cfg->getDisplayOrientationSEIAngle();
 
         nalu = NALUnit(NAL_UNIT_PREFIX_SEI);
-        entropyCoder->setBitstream(&nalu.m_Bitstream);
-        m_seiWriter.writeSEImessage(nalu.m_Bitstream, sei, &m_sps);
-        writeRBSPTrailingBits(nalu.m_Bitstream);
+        entropyCoder->setBitstream(&nalu.m_bitstream);
+        m_seiWriter.writeSEImessage(nalu.m_bitstream, sei, &m_sps);
+        writeRBSPTrailingBits(nalu.m_bitstream);
         CHECKED_MALLOC(nalunits[count], NALUnitEBSP, 1);
         nalunits[count]->init(nalu);
     }
@@ -499,8 +499,8 @@
             SEIGradualDecodingRefreshInfo seiGradualDecodingRefreshInfo;
             seiGradualDecodingRefreshInfo.m_gdrForegroundFlag = true; // Indicating all "foreground"
 
-            m_seiWriter.writeSEImessage(nalu.m_Bitstream, seiGradualDecodingRefreshInfo, slice->getSPS());
-            writeRBSPTrailingBits(nalu.m_Bitstream);
+            m_seiWriter.writeSEImessage(nalu.m_bitstream, seiGradualDecodingRefreshInfo, slice->getSPS());
+            writeRBSPTrailingBits(nalu.m_bitstream);
             m_nalList[m_nalCount] = (NALUnitEBSP*)X265_MALLOC(NALUnitEBSP, 1);
             if (m_nalList[m_nalCount])
             {
@@ -516,8 +516,8 @@
         sei_recovery_point.m_exactMatchingFlag = (slice->getPOC() == 0) ? (true) : (false);
         sei_recovery_point.m_brokenLinkFlag    = false;
 
-        m_seiWriter.writeSEImessage(nalu.m_Bitstream, sei_recovery_point, slice->getSPS());
-        writeRBSPTrailingBits(nalu.m_Bitstream);
+        m_seiWriter.writeSEImessage(nalu.m_bitstream, sei_recovery_point, slice->getSPS());
+        writeRBSPTrailingBits(nalu.m_bitstream);
         m_nalList[m_nalCount] = (NALUnitEBSP*)X265_MALLOC(NALUnitEBSP, 1);
         if (m_nalList[m_nalCount])
         {
@@ -565,7 +565,7 @@
     /* start slice NALunit */
     bool sliceSegment = !slice->isNextSlice();
     OutputNALUnit nalu(slice->getNalUnitType(), 0);
-    entropyCoder->setBitstream(&nalu.m_Bitstream);
+    entropyCoder->setBitstream(&nalu.m_bitstream);
     entropyCoder->encodeSliceHeader(slice);
 
     // is it needed?
@@ -601,7 +601,7 @@
         }
         else
         {
-            entropyCoder->setBitstream(&nalu.m_Bitstream);
+            entropyCoder->setBitstream(&nalu.m_bitstream);
         }
 
         // for now, override the TILES_DECODER setting in order to write substreams.
@@ -616,7 +616,7 @@
 
     {
         // Construct the final bitstream by flushing and concatenating substreams.
-        // The final bitstream is either nalu.m_Bitstream or pcBitstreamRedirect;
+        // The final bitstream is either nalu.m_bitstream or pcBitstreamRedirect;
         uint32_t* substreamSizes = slice->getSubstreamSizes();
         for (int i = 0; i < numSubstreams; i++)
         {
@@ -638,7 +638,7 @@
 
         // Complete the slice header info.
         entropyCoder->setEntropyCoder(&m_sbacCoder, slice);
-        entropyCoder->setBitstream(&nalu.m_Bitstream);
+        entropyCoder->setBitstream(&nalu.m_bitstream);
         entropyCoder->encodeTilesWPPEntryPoint(slice);
 
         // Substreams...
@@ -654,14 +654,14 @@
     // current NALU is the last NALU of slice and a NALU was buffered, then (a)
     // Write current NALU (b) Update an write buffered NALU at appropriate
     // location in NALU list.
-    nalu.m_Bitstream.writeByteAlignment(); // Slice header byte-alignment
+    nalu.m_bitstream.writeByteAlignment(); // Slice header byte-alignment
 
     // Perform bitstream concatenation
     if (bitstreamRedirect->getNumberOfWrittenBits() > 0)
     {
-        nalu.m_Bitstream.addSubstream(bitstreamRedirect);
+        nalu.m_bitstream.addSubstream(bitstreamRedirect);
     }
-    entropyCoder->setBitstream(&nalu.m_Bitstream);
+    entropyCoder->setBitstream(&nalu.m_bitstream);
     bitstreamRedirect->clear();
     m_nalList[m_nalCount] = (NALUnitEBSP*)X265_MALLOC(NALUnitEBSP, 1);
     if (m_nalList[m_nalCount])
@@ -670,6 +670,45 @@
         m_nalCount++;
     }
 
+    /* write decoded picture hash SEI messages */
+    if (m_cfg->param.decodedPictureHashSEI)
+    {
+        if (m_cfg->param.decodedPictureHashSEI == 1)
+        {
+            m_seiReconPictureDigest.method = SEIDecodedPictureHash::MD5;
+            for (int i = 0; i < 3; i++)
+            {
+                MD5Final(&(m_pic->m_state[i]), m_seiReconPictureDigest.digest[i]);
+            }
+        }
+        else if (m_cfg->param.decodedPictureHashSEI == 2)
+        {
+            m_seiReconPictureDigest.method = SEIDecodedPictureHash::CRC;
+            for (int i = 0; i < 3; i++)
+            {
+                crcFinish((m_pic->m_crc[i]), m_seiReconPictureDigest.digest[i]);
+            }
+        }
+        else if (m_cfg->param.decodedPictureHashSEI == 3)
+        {
+            m_seiReconPictureDigest.method = SEIDecodedPictureHash::CHECKSUM;
+            for (int i = 0; i < 3; i++)
+            {
+                checksumFinish(m_pic->m_checksum[i], m_seiReconPictureDigest.digest[i]);
+            }
+        }
+        OutputNALUnit onalu(NAL_UNIT_SUFFIX_SEI, 0);
+        m_seiWriter.writeSEImessage(onalu.m_bitstream, m_seiReconPictureDigest, slice->getSPS());
+        writeRBSPTrailingBits(onalu.m_bitstream);
+
+        m_nalList[m_nalCount] = (NALUnitEBSP*)X265_MALLOC(NALUnitEBSP, 1);
+        if (m_nalList[m_nalCount])
+        {
+            m_nalList[m_nalCount]->init(onalu);
+            m_nalCount++;
+        }
+    }
+
     if (m_sps.getUseSAO())
     {
         m_frameFilter.end();
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/encoder/frameencoder.h
--- a/source/encoder/frameencoder.h	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/encoder/frameencoder.h	Thu Oct 31 18:43:03 2013 +0530
@@ -161,6 +161,7 @@
     TComSPS                  m_sps;
     TComPPS                  m_pps;
     RateControlEntry         m_rce;
+    SEIDecodedPictureHash    m_seiReconPictureDigest;
 
 protected:
 
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/encoder/framefilter.cpp
--- a/source/encoder/framefilter.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/encoder/framefilter.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -277,7 +277,6 @@
     int cuAddr = lineStartCUAddr;
     if (m_cfg->param.bEnablePsnr)
     {
-        TComPicYuv* recon = m_pic->getPicYuvRec();
         TComPicYuv* orig  = m_pic->getPicYuvOrg();
 
         intptr_t stride = recon->getStride();
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/encoder/motion.cpp
--- a/source/encoder/motion.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/encoder/motion.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -116,7 +116,7 @@
     X265_FREE(immedVal2);
 }
 
-void MotionEstimate::setSourcePU(int offset, uint32_t width, uint32_t height)
+void MotionEstimate::setSourcePU(int offset, int width, int height)
 {
     /* copy PU block into cache */
     primitives.blockcpy_pp(width, height, fenc, FENC_STRIDE, fencplane + offset, fencLumaStride);
@@ -300,7 +300,7 @@
                                    MV &             outQMv)
 {
     ALIGN_VAR_16(int, costs[16]);
-    intptr_t stride = ref->lumaStride;
+    size_t stride = ref->lumaStride;
     pixel *fref = ref->fpelPlane + blockOffset;
 
     setMVP(qmvp);
@@ -561,7 +561,7 @@
         omv = bmv;
         const uint16_t *p_cost_omvx = m_cost_mvx + omv.x * 4;
         const uint16_t *p_cost_omvy = m_cost_mvy + omv.y * 4;
-        int16_t i = 1;
+        uint16_t i = 1;
         do
         {
             if (4 * i > X265_MIN4(mvmax.x - omv.x, omv.x - mvmin.x,
@@ -854,9 +854,19 @@
                 }
                 else
                 {
-                    subpelInterpolate(ref, qmv0, dir);
-                    cost0 = hpelcomp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE + (dir == 2)) + mvcost0;
-                    cost1 = hpelcomp(fenc, FENC_STRIDE, subpelbuf + (dir == 2) + (dir == 1 ? FENC_STRIDE : 0), FENC_STRIDE + (dir == 2)) + mvcost1;
+                    if (dir == 1)
+                    {
+                        subpelInterpolate(ref, qmv0, 1);
+                        cost0 = hpelcomp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE + (dir == 2)) + mvcost0;
+                        cost1 = hpelcomp(fenc, FENC_STRIDE, subpelbuf + (dir == 2) + (dir == 1 ? FENC_STRIDE : 0), FENC_STRIDE + (dir == 2)) + mvcost1;
+                    }
+                    else
+                    {
+                        subpelInterpolate(ref, qmv0, 0);
+                        cost0 = hpelcomp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE) + mvcost0;
+                        subpelInterpolate(ref, qmv1, 0);
+                        cost1 = hpelcomp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE) + mvcost1;
+                    }
                 }
                 COPY2_IF_LT(bcost, cost0, bdir, i + 0);
                 COPY2_IF_LT(bcost, cost1, bdir, i + 1);
@@ -899,7 +909,7 @@
 {
     ALIGN_VAR_16(int, costs[16]);
     pixel *fref = ref->fpelPlane + blockOffset;
-    intptr_t stride = ref->lumaStride;
+    size_t stride = ref->lumaStride;
 
     MV omv = bmv;
     int saved = bcost;
@@ -1179,9 +1189,9 @@
     int yFrac = qmv.y & 0x3;
 
     assert(yFrac | xFrac);
-    uint32_t realWidth = blockwidth + (dir == 2);
-    uint32_t realHeight = blockheight + (dir == 1);
-    intptr_t realStride = FENC_STRIDE + (dir == 2);
+    assert(dir != 2);
+    assert((blockwidth % 4) == 0);
+    int realHeight = blockheight + (dir == 1);
     pixel *fref = ref->unweightedFPelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride;
     int shiftNum = IF_INTERNAL_PREC - X265_DEPTH;
     int local_shift = ref->shift + shiftNum;
@@ -1190,39 +1200,39 @@
     {
         if (yFrac == 0)
         {
-            primitives.ipfilter_ps[FILTER_H_P_S_8](fref, ref->lumaStride, immedVal, realStride, realWidth, realHeight, g_lumaFilter[xFrac]);
-            primitives.weightpUni(immedVal, subpelbuf, realStride, realStride, realWidth, realHeight, ref->weight, local_round, local_shift, ref->offset);
+            primitives.ipfilter_ps[FILTER_H_P_S_8](fref, ref->lumaStride, immedVal, FENC_STRIDE, blockwidth, realHeight, g_lumaFilter[xFrac]);
+            primitives.weightpUni(immedVal, subpelbuf, FENC_STRIDE, FENC_STRIDE, blockwidth, realHeight, ref->weight, local_round, local_shift, ref->offset);
         }
         else if (xFrac == 0)
         {
-            primitives.ipfilter_ps[FILTER_V_P_S_8](fref, ref->lumaStride, immedVal, realStride, realWidth, realHeight, g_lumaFilter[yFrac]);
-            primitives.weightpUni(immedVal, subpelbuf, realStride, realStride, realWidth, realHeight, ref->weight, local_round, local_shift, ref->offset);
+            primitives.ipfilter_ps[FILTER_V_P_S_8](fref, ref->lumaStride, immedVal, FENC_STRIDE, blockwidth, realHeight, g_lumaFilter[yFrac]);
+            primitives.weightpUni(immedVal, subpelbuf, FENC_STRIDE, FENC_STRIDE, blockwidth, realHeight, ref->weight, local_round, local_shift, ref->offset);
         }
         else
         {
             int filterSize = NTAPS_LUMA;
             int halfFilterSize = (filterSize >> 1);
-            primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * ref->lumaStride, ref->lumaStride, immedVal, realWidth, realWidth, realHeight + filterSize - 1, g_lumaFilter[xFrac]);
-            primitives.ipfilter_ss[FILTER_V_S_S_8](immedVal + (halfFilterSize - 1) * realWidth, realWidth, immedVal2, realStride, realWidth, realHeight, g_lumaFilter[yFrac]);
-            primitives.weightpUni(immedVal2, subpelbuf, realStride, realStride, realWidth, realHeight, ref->weight, local_round, local_shift, ref->offset);
+            primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * ref->lumaStride, ref->lumaStride, immedVal, blockwidth, blockwidth, realHeight + filterSize - 1, g_lumaFilter[xFrac]);
+            primitives.ipfilter_ss[FILTER_V_S_S_8](immedVal + (halfFilterSize - 1) * blockwidth, blockwidth, immedVal2, FENC_STRIDE, blockwidth, realHeight, g_lumaFilter[yFrac]);
+            primitives.weightpUni(immedVal2, subpelbuf, FENC_STRIDE, FENC_STRIDE, blockwidth, realHeight, ref->weight, local_round, local_shift, ref->offset);
         }
     }
     else
     {
         if (yFrac == 0)
         {
-            primitives.ipfilter_pp[FILTER_H_P_P_8](fref, ref->lumaStride, subpelbuf, realStride, realWidth, realHeight, g_lumaFilter[xFrac]);
+            primitives.ipfilter_pp[FILTER_H_P_P_8](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, blockwidth, realHeight, g_lumaFilter[xFrac]);
         }
         else if (xFrac == 0)
         {
-            primitives.ipfilter_pp[FILTER_V_P_P_8](fref, ref->lumaStride, subpelbuf, realStride, realWidth, realHeight, g_lumaFilter[yFrac]);
+            primitives.ipfilter_pp[FILTER_V_P_P_8](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, blockwidth, realHeight, g_lumaFilter[yFrac]);
         }
         else
         {
             int filterSize = NTAPS_LUMA;
             int halfFilterSize = (filterSize >> 1);
-            primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * ref->lumaStride, ref->lumaStride, immedVal, realWidth, realWidth, realHeight + filterSize - 1, g_lumaFilter[xFrac]);
-            primitives.ipfilter_sp[FILTER_V_S_P_8](immedVal + (halfFilterSize - 1) * realWidth, realWidth, subpelbuf, realStride, realWidth, realHeight, g_lumaFilter[yFrac]);
+            primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * ref->lumaStride, ref->lumaStride, immedVal, blockwidth, blockwidth, realHeight + filterSize - 1, g_lumaFilter[xFrac]);
+            primitives.ipfilter_sp[FILTER_V_S_P_8](immedVal + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, FENC_STRIDE, blockwidth, realHeight, yFrac);
         }
     }
 }
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/encoder/ratecontrol.cpp
--- a/source/encoder/ratecontrol.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/encoder/ratecontrol.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -124,7 +124,7 @@
     this->cfg = _cfg;
     bitrate = cfg->param.rc.bitrate * 1000;
     frameDuration = 1.0 / cfg->param.frameRate;
-    ncu = (int)((cfg->param.sourceHeight * cfg->param.sourceWidth) / pow((int)cfg->param.maxCUSize, 2.0));
+    ncu = (int)((cfg->param.sourceHeight * cfg->param.sourceWidth) / pow((int)16, 2.0));
     lastNonBPictType = -1;
     baseQp = cfg->param.rc.qp;
     qp = baseQp;
@@ -142,7 +142,7 @@
         accumPNorm = .01;
         accumPQp = (ABR_INIT_QP_MIN)*accumPNorm;
         /* estimated ratio that produces a reasonable QP for the first I-frame */
-        cplxrSum = .01 * pow(7.0e5, cfg->param.rc.qCompress) * pow(2 * ncu, 0.5);
+        cplxrSum = .01 * pow(7.0e5, cfg->param.rc.qCompress) * pow(ncu, 0.5);
         wantedBitsWindow = bitrate * frameDuration;
         lastNonBPictType = I_SLICE;
     }
@@ -253,7 +253,7 @@
     }
     else
     {
-        double abrBuffer = 1.5 * cfg->param.rc.rateTolerance * bitrate;
+        double abrBuffer = 2 * cfg->param.rc.rateTolerance * bitrate;
 
         /* 1pass ABR */
 
@@ -299,46 +299,30 @@
             q = qp2qScale(accumPQp / accumPNorm);
             q /= fabs(cfg->param.rc.ipFactor);
         }
-        if (cfg->param.rc.rateControlMode != X265_RC_CRF)
-        {
-            double lqmin = 0, lqmax = 0;
+        else if (framesDone > 0)
+        {
+            if (cfg->param.rc.rateControlMode != X265_RC_CRF)
+            {
+                double lqmin = 0, lqmax = 0;
+                if (totalBits == 0)
+                {
+                    lqmin = qp2qScale(ABR_INIT_QP_MIN) / lstep;
+                    lqmax = qp2qScale(ABR_INIT_QP_MAX) * lstep;
+                }
+                else
+                {
+                    lqmin = lastQScaleFor[sliceType] / lstep;
+                    lqmax = lastQScaleFor[sliceType] * lstep;
+                }
 
-            /* Clip the qp of 1st 'N' frames running parallely to ensure it doesnt detoriate
-             * the quality */
-            if (totalBits == 0)
-            {
-                lqmin = qp2qScale(ABR_INIT_QP_MIN) / lstep;
-                lqmax = qp2qScale(ABR_INIT_QP_MAX) * lstep;
-            }
+                if (overflow > 1.1 && framesDone > 3)
+                    lqmax *= lstep;
+                else if (overflow < 0.9)
+                    lqmin /= lstep;
 
-            /* Asymmetric clipping, because symmetric would prevent
-             * overflow control in areas of rapidly oscillating complexity */
-            else
-            {
-                lqmin = lastQScaleFor[sliceType] / lstep;
-                lqmax = lastQScaleFor[sliceType] * lstep;
-            }
-
-            /* Rate control needs to be more aggressive based on actual costs obtained for
-             * previous encoded frame */
-            int rfAdapt = 1;
-            if (overflow > 1.1 && framesDone > 3)
-            {
-                /* Control propagation of excessive overflow / underfow */
-                if (overflow > 1.5)
-                    rfAdapt = 2;
-                lqmax *= pow(lstep, rfAdapt);
-                lqmin /= pow(lstep, rfAdapt / cfg->param.frameNumThreads);
-            }
-            else if (overflow < 0.9)
-            {
-                if (overflow < 0.6)
-                    rfAdapt = 2;
-                lqmin /= pow(lstep, rfAdapt);
-                lqmax /= pow(lstep, rfAdapt / cfg->param.frameNumThreads);
-            }
-            q = Clip3(lqmin, lqmax, q);
-        }
+                q = Clip3(lqmin, lqmax, q);
+            }
+        }
 
         double lmin1 = lmin[sliceType];
         double lmax1 = lmax[sliceType];
@@ -378,7 +362,7 @@
         if (rce->sliceType != B_SLICE)
             /* The factor 1.5 is to tune up the actual bits, otherwise the cplxrSum is scaled too low
              * to improve short term compensation for next frame. */
-            cplxrSum += 1.5 * bits * qp2qScale(rce->qpaRc) / rce->qRceq;
+            cplxrSum += bits * qp2qScale(rce->qpaRc) / rce->qRceq;
         else
         {
             /* Depends on the fact that B-frame's QP is an offset from the following P-frame's.
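The simplified ABR clamp introduced above keeps q within one lstep of the last
q-scale for this slice type and widens the window by one extra step on strong
overflow or underflow. As a standalone sketch (a model only; qp2qScale(),
lstep and the ABR_INIT_QP_* bounds are assumed to behave as in ratecontrol.cpp):

    #include <algorithm>

    static double clampQScale(double q, double lastQScale, double lstep,
                              double overflow, int framesDone, bool anyBitsYet,
                              double qScaleMinInit, double qScaleMaxInit)
    {
        double lqmin = anyBitsYet ? lastQScale / lstep : qScaleMinInit / lstep;
        double lqmax = anyBitsYet ? lastQScale * lstep : qScaleMaxInit * lstep;
        if (overflow > 1.1 && framesDone > 3)
            lqmax *= lstep;      // allow a larger step up when overshooting
        else if (overflow < 0.9)
            lqmin /= lstep;      // allow a larger step down when undershooting
        return std::min(std::max(q, lqmin), lqmax);
    }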
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/input/y4m.cpp
--- a/source/input/y4m.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/input/y4m.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -28,7 +28,7 @@
 #include <string.h>
 #include <iostream>
 
-#if WIN32
+#if _WIN32
 #include "io.h"
 #include "fcntl.h"
 #if defined(_MSC_VER)
@@ -53,7 +53,7 @@
     if (!strcmp(filename, "-"))
     {
         ifs = &cin;
-#if WIN32
+#if _WIN32
         setmode(fileno(stdin), O_BINARY);
 #endif
     }
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/input/yuv.cpp
--- a/source/input/yuv.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/input/yuv.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -28,7 +28,7 @@
 #include <string.h>
 #include <iostream>
 
-#if WIN32
+#if _WIN32
 #include "io.h"
 #include "fcntl.h"
 #if defined(_MSC_VER)
@@ -55,7 +55,7 @@
     if (!strcmp(filename, "-"))
     {
         ifs = &cin;
-#if WIN32
+#if _WIN32
         setmode(fileno(stdin), O_BINARY);
 #endif
     }
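The WIN32 -> _WIN32 change in both readers matters because _WIN32 is
predefined by the compiler, while WIN32 generally is not, so the old spelling
silently compiled out the switch of stdin to binary mode. The portable idiom,
as a standalone sketch mirroring the patch's setmode/fileno usage:

    #include <cstdio>
    #if _WIN32
    #include <io.h>
    #include <fcntl.h>
    #endif

    // Put stdin into binary mode on Windows so raw YUV/Y4M bytes are not
    // CRLF-translated; a no-op on other platforms.
    static void setStdinBinary()
    {
    #if _WIN32
        setmode(fileno(stdin), O_BINARY);
    #endif
    }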
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/test/intrapredharness.cpp
--- a/source/test/intrapredharness.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/test/intrapredharness.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -284,7 +284,7 @@
 void IntraPredHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     int width = 64;
-    int16_t srcStride = 96;
+    uint16_t srcStride = 96;
 
     if (opt.intra_pred_dc)
     {
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/test/ipfilterharness.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -160,12 +160,13 @@
 
 bool IPFilterHarness::check_IPFilter_primitive(ipfilter_sp_t ref, ipfilter_sp_t opt)
 {
-    int rand_height = rand() % 100;                 // Randomly generated Height
-    int rand_width = rand() % 100;                  // Randomly generated Width
-    int16_t rand_val, rand_srcStride, rand_dstStride;
+    int rand_val, rand_srcStride, rand_dstStride;
 
-    for (int i = 0; i <= 100; i++)
+    for (int i = 0; i <= 1000; i++)
     {
+        int rand_height = rand() % 100;                 // Randomly generated Height
+        int rand_width = rand() % 100;                  // Randomly generated Width
+
         memset(IPF_vec_output_p, 0, ipf_t_size);      // Initialize output buffer to zero
         memset(IPF_C_output_p, 0, ipf_t_size);        // Initialize output buffer to zero
 
@@ -173,19 +174,29 @@
         rand_srcStride = rand() % 100;              // Randomly generated srcStride
         rand_dstStride = rand() % 100;              // Randomly generated dstStride
 
+        rand_width &= ~3;
+        if (rand_width < 4)
+            rand_width = 4;
+
+        if (rand_height <= 0)
+            rand_height = 1;
+
+        if (rand_dstStride < rand_width)
+            rand_dstStride = rand_width;
+
+        ref(short_buff + 3 * rand_srcStride,
+            rand_srcStride,
+            IPF_C_output_p,
+            rand_dstStride,
+            rand_width,
+            rand_height, rand_val
+            );
         opt(short_buff + 3 * rand_srcStride,
             rand_srcStride,
             IPF_vec_output_p,
             rand_dstStride,
             rand_width,
-            rand_height, g_lumaFilter[rand_val]
-            );
-        ref(short_buff + 3 * rand_srcStride,
-            rand_srcStride,
-            IPF_C_output_p,
-            rand_dstStride,
-            rand_width,
-            rand_height, g_lumaFilter[rand_val]
+            rand_height, rand_val
             );
 
         if (memcmp(IPF_vec_output_p, IPF_C_output_p, ipf_t_size))
@@ -229,6 +240,48 @@
     return true;
 }
 
+bool IPFilterHarness::check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt)
+{
+    int16_t rand_srcStride;
+
+    for (int i = 0; i <= 1000; i++)
+    {
+        int16_t rand_height = (int16_t)rand() % 100;                 // Randomly generated Height
+        int16_t rand_width = (int16_t)rand() % 100;                  // Randomly generated Width
+
+        memset(IPF_vec_output_s, 0, ipf_t_size);      // Initialize output buffer to zero
+        memset(IPF_C_output_s, 0, ipf_t_size);        // Initialize output buffer to zero
+
+        rand_srcStride = rand_width + rand() % 100;              // Randomly generated srcStride
+        if (rand_srcStride < rand_width)
+            rand_srcStride = rand_width;
+
+        rand_width &= ~3;
+        if (rand_width < 4)
+            rand_width = 4;
+
+        rand_height &= ~3;
+        if (rand_height < 4)
+            rand_height = 4;
+
+        ref(pixel_buff,
+            rand_srcStride,
+            IPF_C_output_s,
+            rand_width,
+            rand_height);
+        opt(pixel_buff,
+            rand_srcStride,
+            IPF_vec_output_s,
+            rand_width,
+            rand_height);
+
+        if (memcmp(IPF_vec_output_s, IPF_C_output_s, ipf_t_size))
+            return false;
+    }
+
+    return true;
+}
+
 bool IPFilterHarness::check_IPFilter_primitive(ipfilter_s2p_t ref, ipfilter_s2p_t opt)
 {
     int16_t rand_height = (int16_t)rand() % 100;                 // Randomly generated Height
@@ -325,6 +378,40 @@
     return true;
 }
 
+bool IPFilterHarness::check_IPFilterLumaHV_primitive(filter_hv_pp_t ref, filter_hv_pp_t opt)
+{
+    int rand_srcStride, rand_dstStride, rand_coeffIdxX, rand_coeffIdxY;
+
+    for (int i = 0; i <= 1000; i++)
+    {
+        rand_coeffIdxX = rand() % 3;                // random coeffIdx in the filter
+        rand_coeffIdxY = rand() % 3;                // random coeffIdx in the filter
+
+        rand_srcStride = rand() % 100;             // Randomly generated srcStride
+        rand_dstStride = rand() % 100;             // Randomly generated dstStride
+
+        ref(pixel_buff + 3 * rand_srcStride,
+            rand_srcStride,
+            IPF_C_output_p,
+            rand_dstStride,
+            rand_coeffIdxX,
+            rand_coeffIdxY
+        );
+        opt(pixel_buff + 3 * rand_srcStride,
+            rand_srcStride,
+            IPF_vec_output_p,
+            rand_dstStride,
+            rand_coeffIdxX,
+            rand_coeffIdxY
+        );
+
+        if (memcmp(IPF_vec_output_p, IPF_C_output_p, ipf_t_size))
+            return false;
+    }
+
+    return true;
+}
+
 bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     for (int value = 0; value < NUM_IPFILTER_P_P; value++)
@@ -372,6 +459,15 @@
         }
     }
 
+    if (opt.luma_p2s)
+    {
+        if (!check_IPFilter_primitive(ref.luma_p2s, opt.luma_p2s))
+        {
+            printf("ipfilter_p2s failed\n");
+            return false;
+        }
+    }
+
     if (opt.ipfilter_s2p)
     {
         if (!check_IPFilter_primitive(ref.ipfilter_s2p, opt.ipfilter_s2p))
@@ -421,6 +517,18 @@
         }
     }
 
+    for (int value = 0; value < NUM_LUMA_PARTITIONS; value++)
+    {
+        if (opt.luma_hvpp[value])
+        {
+            if (!check_IPFilterLumaHV_primitive(ref.luma_hvpp[value], opt.luma_hvpp[value]))
+            {
+                printf("luma_hvpp[%s]", lumaPartStr[value]);
+                return false;
+            }
+        }
+    }
+
     return true;
 }
 
@@ -460,7 +568,7 @@
             printf("ipfilter_sp %d\t", 8 / (value + 1));
             REPORT_SPEEDUP(opt.ipfilter_sp[value], ref.ipfilter_sp[value],
                            short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
-                           IPF_vec_output_p, dstStride, width, height, g_lumaFilter[val]);
+                           IPF_vec_output_p, dstStride, width, height, val);
         }
     }
 
@@ -486,6 +594,7 @@
             REPORT_SPEEDUP(opt.luma_hpp[value], ref.luma_hpp[value],
                            pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
         }
+
         if (opt.luma_vpp[value])
         {
             printf("luma_vpp[%s]\t", lumaPartStr[value]);
@@ -493,6 +602,13 @@
                            pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
                            IPF_vec_output_p, dstStride, 1);
         }
+
+        if (opt.luma_hvpp[value])
+        {
+            printf("luma_hv [%s]\t", lumaPartStr[value]);
+            REPORT_SPEEDUP(opt.luma_hvpp[value], ref.luma_hvpp[value],
+                           pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1, 3);
+        }
     }
 
     for (int value = 0; value < NUM_CHROMA_PARTITIONS; value++)
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/test/ipfilterharness.h
--- a/source/test/ipfilterharness.h	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/test/ipfilterharness.h	Thu Oct 31 18:43:03 2013 +0530
@@ -45,9 +45,11 @@
     bool check_IPFilter_primitive(ipfilter_ps_t ref, ipfilter_ps_t opt);
     bool check_IPFilter_primitive(ipfilter_sp_t ref, ipfilter_sp_t opt);
     bool check_IPFilter_primitive(ipfilter_p2s_t ref, ipfilter_p2s_t opt);
+    bool check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt);
     bool check_IPFilter_primitive(ipfilter_s2p_t ref, ipfilter_s2p_t opt);
     bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt);
     bool check_IPFilterLuma_primitive(filter_pp_t ref, filter_pp_t opt);
+    bool check_IPFilterLumaHV_primitive(filter_hv_pp_t ref, filter_hv_pp_t opt);
 
 public:
 
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/test/pixelharness.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -528,6 +528,31 @@
     return true;
 }
 
+bool PixelHarness::check_block_copy_pp(copy_pp_t ref, copy_pp_t opt)
+{
+    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
+    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+
+    // the partition size is not known here, so the entire output buffer is
+    // compared; both buffers must therefore be initialized
+    memset(ref_dest, 0, sizeof(ref_dest));
+    memset(opt_dest, 0, sizeof(opt_dest));
+
+    int j = 0;
+    for (int i = 0; i < ITERS; i++)
+    {
+        opt(opt_dest, STRIDE, pbuf2 + j, STRIDE);
+        ref(ref_dest, STRIDE, pbuf2 + j, STRIDE);
+
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+            return false;
+
+        j += INCR;
+    }
+
+    return true;
+}
+
 bool PixelHarness::testPartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     if (opt.satd[part])
@@ -611,6 +636,24 @@
         }
     }
 
+    if (opt.luma_copy_pp[part])
+    {
+        if (!check_block_copy_pp(ref.luma_copy_pp[part], opt.luma_copy_pp[part]))
+        {
+            printf("luma_copy_pp[%s] failed\n", lumaPartStr[part]);
+            return false;
+        }
+    }
+
+    if (opt.chroma_copy_pp[part])
+    {
+        if (!check_block_copy_pp(ref.chroma_copy_pp[part], opt.chroma_copy_pp[part]))
+        {
+            printf("chroma_copy_pp[%s] failed\n", chromaPartStr[part]);
+            return false;
+        }
+    }
+
     return true;
 }
 
@@ -769,6 +812,7 @@
             return false;
         }
     }
+
     return true;
 }
 
@@ -830,6 +874,18 @@
         printf("sse_ss[%s]", lumaPartStr[part]);
         REPORT_SPEEDUP(opt.sse_ss[part], ref.sse_ss[part], (int16_t*)pbuf1, STRIDE, (int16_t*)fref, STRIDE);
     }
+
+    if (opt.luma_copy_pp[part])
+    {
+        printf("luma_copy_pp[%s]", lumaPartStr[part]);
+        REPORT_SPEEDUP(opt.luma_copy_pp[part], ref.luma_copy_pp[part], pbuf1, 64, pbuf2, 128);
+    }
+
+    if (opt.chroma_copy_pp[part])
+    {
+        printf("chroma_copy_pp[%s]", chromaPartStr[part]);
+        REPORT_SPEEDUP(opt.chroma_copy_pp[part], ref.chroma_copy_pp[part], pbuf1, 64, pbuf2, 128);
+    }
 }
 
 void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/test/pixelharness.h
--- a/source/test/pixelharness.h	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/test/pixelharness.h	Thu Oct 31 18:43:03 2013 +0530
@@ -57,6 +57,7 @@
     bool check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt);
     bool check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt);
 
+    bool check_block_copy_pp(copy_pp_t ref, copy_pp_t opt);
 public:
 
     PixelHarness();
diff -r 9bff4295adfc -r 2cdef1dd17b2 source/test/testpool.cpp
--- a/source/test/testpool.cpp	Thu Oct 31 15:40:28 2013 +0530
+++ b/source/test/testpool.cpp	Thu Oct 31 18:43:03 2013 +0530
@@ -30,6 +30,7 @@
 #include <time.h>
 #include <assert.h>
 #include <string.h>
+#include <stdio.h>
 #include <sstream>
 #include <iostream>
 