[x265] [PATCH] predict: introduce PredictionUnit structure to maintain current PU geometry

Tue Feb 24 18:43:01 CET 2015

# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1424799770 21600
#      Tue Feb 24 11:42:50 2015 -0600
# Node ID 9e2a83c99a2a53fc102a04bdbd327ae0d2967087
# Parent  7252c10278a1e1aeddbe489180aad35196262919
predict: introduce PredictionUnit structure to maintain current PU geometry

The whole idea of prepMotionCompensation was to cache some data that could be
shared between motionCompensation() calls but since the motion vectors and refs
were included in that data, it had to be re-done each time any way.  In the
mean-time, much code was taking advantage of the PU data members but in a
fragile way, data hazards were too common.

The PredictionUnit struct is intended to be stack allocated and the PU geometry
is valid so long as the structure is within scope, making the data safe.

diff -r 7252c10278a1 -r 9e2a83c99a2a source/common/predict.cpp

--- a/source/common/predict.cpp	Mon Feb 23 23:11:49 2015 -0600
+++ b/source/common/predict.cpp	Tue Feb 24 11:42:50 2015 -0600
@@ -34,6 +34,19 @@
 #pragma warning(disable: 4127) // conditional expression is constant
 #endif
 
+PredictionUnit::PredictionUnit(const CUData& cu, const CUGeom& cuGeom, int puIdx)
+{
+    /* address of CTU */
+    slice = cu.m_slice;
+    ctuAddr = cu.m_cuAddr;
+
+    /* address of CU */
+    cuAbsPartIdx = cuGeom.absPartIdx;
+
+    /* address of PU */
+    cu.getPartIndexAndSize(puIdx, puAbsPartIdx, width, height);
+}
+
 namespace
 {
 inline pixel weightBidir(int w0, int16_t P0, int w1, int16_t P1, int round, int shift, int offset)
@@ -112,37 +125,25 @@
     primitives.cu[sizeIdx].intra_pred[dirMode](dst, stride, srcBuf, dirMode, 0);
 }
 
-void Predict::initMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx)
+
+void Predict::motionCompensation(const CUData& cu, const PredictionUnit& pu, Yuv& predYuv, bool bLuma, bool bChroma)
 {
-    m_predSlice = cu.m_slice;
-    cu.getPartIndexAndSize(partIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
-    m_ctuAddr = cu.m_cuAddr;
-    m_cuAbsPartIdx = cuGeom.absPartIdx;
-}
+    int refIdx0 = cu.m_refIdx[0][pu.puAbsPartIdx];
+    int refIdx1 = cu.m_refIdx[1][pu.puAbsPartIdx];
 
-void Predict::prepMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx)
-{
-    initMotionCompensation(cu, cuGeom, partIdx);
-
-    m_refIdx0      = cu.m_refIdx[0][m_puAbsPartIdx];
-    m_clippedMv[0] = cu.m_mv[0][m_puAbsPartIdx];
-    m_refIdx1      = cu.m_refIdx[1][m_puAbsPartIdx];
-    m_clippedMv[1] = cu.m_mv[1][m_puAbsPartIdx];
-    cu.clipMv(m_clippedMv[0]);
-    cu.clipMv(m_clippedMv[1]);
-}
-
-void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma)
-{
-    if (m_predSlice->isInterP())
+    if (cu.m_slice->isInterP())
     {
         /* P Slice */
         WeightValues wv0[3];
-        X265_CHECK(m_refIdx0 >= 0, "invalid P refidx\n");
-        X265_CHECK(m_refIdx0 < m_predSlice->m_numRefIdx[0], "P refidx out of range\n");
-        const WeightParam *wp0 = m_predSlice->m_weightPredTable[0][m_refIdx0];
 
-        if (m_predSlice->m_pps->bUseWeightPred && wp0->bPresentFlag)
+        X265_CHECK(refIdx0 >= 0, "invalid P refidx\n");
+        X265_CHECK(refIdx0 < cu.m_slice->m_numRefIdx[0], "P refidx out of range\n");
+        const WeightParam *wp0 = cu.m_slice->m_weightPredTable[0][refIdx0];
+
+        MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];
+        cu.clipMv(mv0);
+
+        if (cu.m_slice->m_pps->bUseWeightPred && wp0->bPresentFlag)
         {
             for (int plane = 0; plane < 3; plane++)
             {
@@ -155,18 +156,18 @@
             ShortYuv& shortYuv = m_predShortYuv[0];
 
             if (bLuma)
-                predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+                predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
             if (bChroma)
-                predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+                predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
 
-            addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma);
+            addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
         }
         else
         {
             if (bLuma)
-                predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+                predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
             if (bChroma)
-                predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+                predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
         }
     }
     else
@@ -176,10 +177,10 @@
         WeightValues wv0[3], wv1[3];
         const WeightParam *pwp0, *pwp1;
 
-        if (m_predSlice->m_pps->bUseWeightedBiPred)
+        if (cu.m_slice->m_pps->bUseWeightedBiPred)
         {
-            pwp0 = m_refIdx0 >= 0 ? m_predSlice->m_weightPredTable[0][m_refIdx0] : NULL;
-            pwp1 = m_refIdx1 >= 0 ? m_predSlice->m_weightPredTable[1][m_refIdx1] : NULL;
+            pwp0 = refIdx0 >= 0 ? cu.m_slice->m_weightPredTable[0][refIdx0] : NULL;
+            pwp1 = refIdx1 >= 0 ? cu.m_slice->m_weightPredTable[1][refIdx1] : NULL;
 
             if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
             {
@@ -200,7 +201,7 @@
             else
             {
                 /* uniprediction weighting, always outputs to wv0 */
-                const WeightParam* pwp = (m_refIdx0 >= 0) ? pwp0 : pwp1;
+                const WeightParam* pwp = (refIdx0 >= 0) ? pwp0 : pwp1;
                 for (int plane = 0; plane < 3; plane++)
                 {
                     wv0[plane].w = pwp[plane].inputWeight;
@@ -213,89 +214,100 @@
         else
             pwp0 = pwp1 = NULL;
 
-        if (m_refIdx0 >= 0 && m_refIdx1 >= 0)
+        if (refIdx0 >= 0 && refIdx1 >= 0)
         {
+            MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];
+            MV mv1 = cu.m_mv[1][pu.puAbsPartIdx];
+            cu.clipMv(mv0);
+            cu.clipMv(mv1);
+
             /* Biprediction */
-            X265_CHECK(m_refIdx0 < m_predSlice->m_numRefIdx[0], "bidir refidx0 out of range\n");
-            X265_CHECK(m_refIdx1 < m_predSlice->m_numRefIdx[1], "bidir refidx1 out of range\n");
+            X265_CHECK(refIdx0 < cu.m_slice->m_numRefIdx[0], "bidir refidx0 out of range\n");
+            X265_CHECK(refIdx1 < cu.m_slice->m_numRefIdx[1], "bidir refidx1 out of range\n");
 
             if (bLuma)
             {
-                predInterLumaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
-                predInterLumaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
+                predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterLumaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
             }
             if (bChroma)
             {
-                predInterChromaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
-                predInterChromaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
+                predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
             }
 
             if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
-                addWeightBi(predYuv, m_predShortYuv[0], m_predShortYuv[1], wv0, wv1, bLuma, bChroma);
+                addWeightBi(pu, predYuv, m_predShortYuv[0], m_predShortYuv[1], wv0, wv1, bLuma, bChroma);
             else
-                predYuv.addAvg(m_predShortYuv[0], m_predShortYuv[1], m_puAbsPartIdx, m_puWidth, m_puHeight, bLuma, bChroma);
+                predYuv.addAvg(m_predShortYuv[0], m_predShortYuv[1], pu.puAbsPartIdx, pu.width, pu.height, bLuma, bChroma);
         }
-        else if (m_refIdx0 >= 0)
+        else if (refIdx0 >= 0)
         {
+            MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];
+            cu.clipMv(mv0);
+
             /* uniprediction to L0 */
-            X265_CHECK(m_refIdx0 < m_predSlice->m_numRefIdx[0], "unidir refidx0 out of range\n");
+            X265_CHECK(refIdx0 < cu.m_slice->m_numRefIdx[0], "unidir refidx0 out of range\n");
 
             if (pwp0 && pwp0->bPresentFlag)
             {
                 ShortYuv& shortYuv = m_predShortYuv[0];
 
                 if (bLuma)
-                    predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
                 if (bChroma)
-                    predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
 
-                addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma);
+                addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
             }
             else
             {
                 if (bLuma)
-                    predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
                 if (bChroma)
-                    predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
             }
         }
         else
         {
+            MV mv1 = cu.m_mv[1][pu.puAbsPartIdx];
+            cu.clipMv(mv1);
+
             /* uniprediction to L1 */
-            X265_CHECK(m_refIdx1 >= 0, "refidx1 was not positive\n");
-            X265_CHECK(m_refIdx1 < m_predSlice->m_numRefIdx[1], "unidir refidx1 out of range\n");
+            X265_CHECK(refIdx1 >= 0, "refidx1 was not positive\n");
+            X265_CHECK(refIdx1 < cu.m_slice->m_numRefIdx[1], "unidir refidx1 out of range\n");
 
             if (pwp1 && pwp1->bPresentFlag)
             {
                 ShortYuv& shortYuv = m_predShortYuv[0];
 
                 if (bLuma)
-                    predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
+                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
                 if (bChroma)
-                    predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
+                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
 
-                addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma);
+                addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
             }
             else
             {
                 if (bLuma)
-                    predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
+                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
                 if (bChroma)
-                    predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
+                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
             }
         }
     }
 }
 
-void Predict::predInterLumaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const
+void Predict::predInterLumaPixel(const PredictionUnit& pu, Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const
 {
-    pixel* dst = dstYuv.getLumaAddr(m_puAbsPartIdx);
+    pixel* dst = dstYuv.getLumaAddr(pu.puAbsPartIdx);
     intptr_t dstStride = dstYuv.m_size;
 
     intptr_t srcStride = refPic.m_stride;
     intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride;
-    int partEnum = partitionFromSizes(m_puWidth, m_puHeight);
-    const pixel* src = refPic.getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset;
+    int partEnum = partitionFromSizes(pu.width, pu.height);
+    const pixel* src = refPic.getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + srcOffset;
 
     int xFrac = mv.x & 0x3;
     int yFrac = mv.y & 0x3;
@@ -310,32 +322,32 @@
         primitives.pu[partEnum].luma_hvpp(src, srcStride, dst, dstStride, xFrac, yFrac);
 }
 
-void Predict::predInterLumaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const
+void Predict::predInterLumaShort(const PredictionUnit& pu, ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const
 {
-    int16_t* dst = dstSYuv.getLumaAddr(m_puAbsPartIdx);
+    int16_t* dst = dstSYuv.getLumaAddr(pu.puAbsPartIdx);
     int dstStride = dstSYuv.m_size;
 
     intptr_t srcStride = refPic.m_stride;
     intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride;
-    const pixel* src = refPic.getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset;
+    const pixel* src = refPic.getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + srcOffset;
 
     int xFrac = mv.x & 0x3;
     int yFrac = mv.y & 0x3;
 
-    int partEnum = partitionFromSizes(m_puWidth, m_puHeight);
+    int partEnum = partitionFromSizes(pu.width, pu.height);
 
-    X265_CHECK((m_puWidth % 4) + (m_puHeight % 4) == 0, "width or height not divisible by 4\n");
+    X265_CHECK((pu.width % 4) + (pu.height % 4) == 0, "width or height not divisible by 4\n");
     X265_CHECK(dstStride == MAX_CU_SIZE, "stride expected to be max cu size\n");
 
     if (!(yFrac | xFrac))
-        primitives.luma_p2s(src, srcStride, dst, m_puWidth, m_puHeight);
+        primitives.luma_p2s(src, srcStride, dst, pu.width, pu.height);
     else if (!yFrac)
         primitives.pu[partEnum].luma_hps(src, srcStride, dst, dstStride, xFrac, 0);
     else if (!xFrac)
         primitives.pu[partEnum].luma_vps(src, srcStride, dst, dstStride, yFrac);
     else
     {
-        int tmpStride = m_puWidth;
+        int tmpStride = pu.width;
         int filterSize = NTAPS_LUMA;
         int halfFilterSize = (filterSize >> 1);
         primitives.pu[partEnum].luma_hps(src, srcStride, m_immedVals, tmpStride, xFrac, 1);
@@ -343,7 +355,7 @@
     }
 }
 
-void Predict::predInterChromaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const
+void Predict::predInterChromaPixel(const PredictionUnit& pu, Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const
 {
     intptr_t dstStride = dstYuv.m_csize;
     intptr_t refStride = refPic.m_strideC;
@@ -353,16 +365,16 @@
 
     intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride;
 
-    const pixel* refCb = refPic.getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
-    const pixel* refCr = refPic.getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
+    const pixel* refCb = refPic.getCbAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
+    const pixel* refCr = refPic.getCrAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
 
-    pixel* dstCb = dstYuv.getCbAddr(m_puAbsPartIdx);
-    pixel* dstCr = dstYuv.getCrAddr(m_puAbsPartIdx);
+    pixel* dstCb = dstYuv.getCbAddr(pu.puAbsPartIdx);
+    pixel* dstCr = dstYuv.getCrAddr(pu.puAbsPartIdx);
 
     int xFrac = mv.x & ((1 << shiftHor) - 1);
     int yFrac = mv.y & ((1 << shiftVer) - 1);
 
-    int partEnum = partitionFromSizes(m_puWidth, m_puHeight);
+    int partEnum = partitionFromSizes(pu.width, pu.height);
     
     if (!(yFrac | xFrac))
     {
@@ -381,7 +393,7 @@
     }
     else
     {
-        int extStride = m_puWidth >> m_hChromaShift;
+        int extStride = pu.width >> m_hChromaShift;
         int filterSize = NTAPS_CHROMA;
         int halfFilterSize = (filterSize >> 1);
 
@@ -393,7 +405,7 @@
     }
 }
 
-void Predict::predInterChromaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const
+void Predict::predInterChromaShort(const PredictionUnit& pu, ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const
 {
     intptr_t refStride = refPic.m_strideC;
     intptr_t dstStride = dstSYuv.m_csize;
@@ -403,19 +415,19 @@
 
     intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride;
 
-    const pixel* refCb = refPic.getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
-    const pixel* refCr = refPic.getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
+    const pixel* refCb = refPic.getCbAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
+    const pixel* refCr = refPic.getCrAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
 
-    int16_t* dstCb = dstSYuv.getCbAddr(m_puAbsPartIdx);
-    int16_t* dstCr = dstSYuv.getCrAddr(m_puAbsPartIdx);
+    int16_t* dstCb = dstSYuv.getCbAddr(pu.puAbsPartIdx);
+    int16_t* dstCr = dstSYuv.getCrAddr(pu.puAbsPartIdx);
 
     int xFrac = mv.x & ((1 << shiftHor) - 1);
     int yFrac = mv.y & ((1 << shiftVer) - 1);
 
-    int partEnum = partitionFromSizes(m_puWidth, m_puHeight);
+    int partEnum = partitionFromSizes(pu.width, pu.height);
     
-    uint32_t cxWidth  = m_puWidth >> m_hChromaShift;
-    uint32_t cxHeight = m_puHeight >> m_vChromaShift;
+    uint32_t cxWidth  = pu.width >> m_hChromaShift;
+    uint32_t cxHeight = pu.height >> m_vChromaShift;
 
     X265_CHECK(((cxWidth | cxHeight) % 2) == 0, "chroma block size expected to be multiple of 2\n");
 
@@ -447,7 +459,7 @@
 }
 
 /* weighted averaging for bi-pred */
-void Predict::addWeightBi(Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, const WeightValues wp0[3], const WeightValues wp1[3], bool bLuma, bool bChroma) const
+void Predict::addWeightBi(const PredictionUnit& pu, Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, const WeightValues wp0[3], const WeightValues wp1[3], bool bLuma, bool bChroma) const
 {
     int x, y;
 
@@ -456,9 +468,9 @@
 
     if (bLuma)
     {
-        pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx);
-        const int16_t* srcY0 = srcYuv0.getLumaAddr(m_puAbsPartIdx);
-        const int16_t* srcY1 = srcYuv1.getLumaAddr(m_puAbsPartIdx);
+        pixel* dstY = predYuv.getLumaAddr(pu.puAbsPartIdx);
+        const int16_t* srcY0 = srcYuv0.getLumaAddr(pu.puAbsPartIdx);
+        const int16_t* srcY1 = srcYuv1.getLumaAddr(pu.puAbsPartIdx);
 
         // Luma
         w0      = wp0[0].w;
@@ -473,9 +485,9 @@
         dststride = predYuv.m_size;
 
         // TODO: can we use weight_sp here?
-        for (y = m_puHeight - 1; y >= 0; y--)
+        for (y = pu.height - 1; y >= 0; y--)
         {
-            for (x = m_puWidth - 1; x >= 0; )
+            for (x = pu.width - 1; x >= 0; )
             {
                 // note: luma min width is 4
                 dstY[x] = weightBidir(w0, srcY0[x], w1, srcY1[x], round, shift, offset);
@@ -496,12 +508,12 @@
 
     if (bChroma)
     {
-        pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx);
-        pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx);
-        const int16_t* srcU0 = srcYuv0.getCbAddr(m_puAbsPartIdx);
-        const int16_t* srcV0 = srcYuv0.getCrAddr(m_puAbsPartIdx);
-        const int16_t* srcU1 = srcYuv1.getCbAddr(m_puAbsPartIdx);
-        const int16_t* srcV1 = srcYuv1.getCrAddr(m_puAbsPartIdx);
+        pixel* dstU = predYuv.getCbAddr(pu.puAbsPartIdx);
+        pixel* dstV = predYuv.getCrAddr(pu.puAbsPartIdx);
+        const int16_t* srcU0 = srcYuv0.getCbAddr(pu.puAbsPartIdx);
+        const int16_t* srcV0 = srcYuv0.getCrAddr(pu.puAbsPartIdx);
+        const int16_t* srcU1 = srcYuv1.getCbAddr(pu.puAbsPartIdx);
+        const int16_t* srcV1 = srcYuv1.getCrAddr(pu.puAbsPartIdx);
 
         // Chroma U
         w0      = wp0[1].w;
@@ -515,8 +527,8 @@
         src1Stride = srcYuv1.m_csize;
         dststride  = predYuv.m_csize;
 
-        uint32_t cwidth = m_puWidth >> srcYuv0.m_hChromaShift;
-        uint32_t cheight = m_puHeight >> srcYuv0.m_vChromaShift;
+        uint32_t cwidth = pu.width >> srcYuv0.m_hChromaShift;
+        uint32_t cheight = pu.height >> srcYuv0.m_vChromaShift;
 
         // TODO: can we use weight_sp here?
         for (y = cheight - 1; y >= 0; y--)
@@ -561,15 +573,15 @@
 }
 
 /* weighted averaging for uni-pred */
-void Predict::addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const
+void Predict::addWeightUni(const PredictionUnit& pu, Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const
 {
     int w0, offset, shiftNum, shift, round;
     uint32_t srcStride, dstStride;
 
     if (bLuma)
     {
-        pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx);
-        const int16_t* srcY0 = srcYuv.getLumaAddr(m_puAbsPartIdx);
+        pixel* dstY = predYuv.getLumaAddr(pu.puAbsPartIdx);
+        const int16_t* srcY0 = srcYuv.getLumaAddr(pu.puAbsPartIdx);
 
         // Luma
         w0      = wp[0].w;
@@ -580,15 +592,15 @@
         srcStride = srcYuv.m_size;
         dstStride = predYuv.m_size;
 
-        primitives.weight_sp(srcY0, dstY, srcStride, dstStride, m_puWidth, m_puHeight, w0, round, shift, offset);
+        primitives.weight_sp(srcY0, dstY, srcStride, dstStride, pu.width, pu.height, w0, round, shift, offset);
     }
 
     if (bChroma)
     {
-        pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx);
-        pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx);
-        const int16_t* srcU0 = srcYuv.getCbAddr(m_puAbsPartIdx);
-        const int16_t* srcV0 = srcYuv.getCrAddr(m_puAbsPartIdx);
+        pixel* dstU = predYuv.getCbAddr(pu.puAbsPartIdx);
+        pixel* dstV = predYuv.getCrAddr(pu.puAbsPartIdx);
+        const int16_t* srcU0 = srcYuv.getCbAddr(pu.puAbsPartIdx);
+        const int16_t* srcV0 = srcYuv.getCrAddr(pu.puAbsPartIdx);
 
         // Chroma U
         w0      = wp[1].w;
@@ -600,8 +612,8 @@
         srcStride = srcYuv.m_csize;
         dstStride = predYuv.m_csize;
 
-        uint32_t cwidth = m_puWidth >> srcYuv.m_hChromaShift;
-        uint32_t cheight = m_puHeight >> srcYuv.m_vChromaShift;
+        uint32_t cwidth = pu.width >> srcYuv.m_hChromaShift;
+        uint32_t cheight = pu.height >> srcYuv.m_vChromaShift;
 
         primitives.weight_sp(srcU0, dstU, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset);
 
diff -r 7252c10278a1 -r 9e2a83c99a2a source/common/predict.h
--- a/source/common/predict.h	Mon Feb 23 23:11:49 2015 -0600
+++ b/source/common/predict.h	Tue Feb 24 11:42:50 2015 -0600
@@ -36,6 +36,18 @@
 class Slice;
 struct CUGeom;
 
+struct PredictionUnit
+{
+    const Slice* slice;
+    uint32_t     ctuAddr;      // raster index of current CTU within its picture
+    uint32_t     cuAbsPartIdx; // z-order index of current CU within its CTU
+    uint32_t     puAbsPartIdx; // z-order index of current PU with its CU
+    int          width;
+    int          height;
+
+    PredictionUnit(const CUData& cu, const CUGeom& cuGeom, int puIdx);
+};
+
 class Predict
 {
 public:
@@ -65,38 +77,34 @@
 
     // Unfiltered/filtered neighbours of the current partition.
     pixel     intraNeighbourBuf[2][258];
+
     /* Slice information */
-    const Slice* m_predSlice;
     int       m_csp;
     int       m_hChromaShift;
     int       m_vChromaShift;
 
-    /* cached CU information for prediction */
-    uint32_t  m_ctuAddr;      // raster index of current CTU within its picture
-    uint32_t  m_cuAbsPartIdx; // z-order index of current CU within its CTU
-    uint32_t  m_puAbsPartIdx; // z-order index of current PU with its CU
-    int       m_puWidth;
-    int       m_puHeight;
-    int       m_refIdx0;
-    int       m_refIdx1;
-
-    /* TODO: Need to investigate clipping while writing into the TComDataCU fields itself */
-    MV        m_clippedMv[2];
-
     Predict();
     ~Predict();
 
     bool allocBuffers(int csp);
 
     // motion compensation functions
-    void predInterLumaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const;
-    void predInterChromaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const;
+    void predInterLumaPixel(const PredictionUnit& pu, Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const;
+    void predInterChromaPixel(const PredictionUnit& pu, Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const;
 
-    void predInterLumaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const;
-    void predInterChromaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const;
+    void predInterLumaShort(const PredictionUnit& pu, ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const;
+    void predInterChromaShort(const PredictionUnit& pu, ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const;
 
-    void addWeightBi(Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, const WeightValues wp0[3], const WeightValues wp1[3], bool bLuma, bool bChroma) const;
-    void addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const;
+    void addWeightBi(const PredictionUnit& pu, Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, const WeightValues wp0[3], const WeightValues wp1[3], bool bLuma, bool bChroma) const;
+    void addWeightUni(const PredictionUnit& pu, Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const;
+
+    void motionCompensation(const CUData& cu, const PredictionUnit& pu, Yuv& predYuv, bool bLuma, bool bChroma);
+
+    /* Angular Intra */
+    void predIntraLumaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSize);
+    void predIntraChromaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC, int chFmt);
+    void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode);
+    void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId);
 
     /* Intra prediction helper functions */
     static void initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *IntraNeighbors);
@@ -111,19 +119,6 @@
     static int  isAboveRightAvailable(const CUData& cu, uint32_t partIdxRT, bool* bValidFlags, uint32_t numUnits);
     template<bool cip>
     static int  isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLB, bool* bValidFlags, uint32_t numUnits);
-
-public:
-
-    /* prepMotionCompensation needs to be called to prepare MC with CU-relevant data */
-    void initMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx);
-    void prepMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx);
-    void motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma);
-
-    /* Angular Intra */
-    void predIntraLumaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSize);
-    void predIntraChromaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC, int chFmt);
-    void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode);
-    void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId);
 };
 }
 
diff -r 7252c10278a1 -r 9e2a83c99a2a source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp	Mon Feb 23 23:11:49 2015 -0600
+++ b/source/encoder/analysis.cpp	Tue Feb 24 11:42:50 2015 -0600
@@ -574,8 +574,8 @@
                 {
                     for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
                     {
-                        prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
-                        motionCompensation(bestInter->predYuv, false, true);
+                        PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
+                        motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
                     }
                 }
                 encodeResAndCalcRdInterCU(*bestInter, cuGeom);
@@ -610,8 +610,8 @@
                     /* finally code the best mode selected from SA8D costs */
                     for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
                     {
-                        prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx);
-                        motionCompensation(md.bestMode->predYuv, false, true);
+                        PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
+                        motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
                     }
                     encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                 }
@@ -828,8 +828,8 @@
                 {
                     for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
                     {
-                        prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
-                        motionCompensation(bestInter->predYuv, false, true);
+                        PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
+                        motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
                     }
                 }
                 encodeResAndCalcRdInterCU(*bestInter, cuGeom);
@@ -883,8 +883,8 @@
                 {
                     for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
                     {
-                        prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx);
-                        motionCompensation(md.bestMode->predYuv, false, true);
+                        PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
+                        motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
                     }
                     if (m_param->rdLevel == 2)
                         encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
@@ -1220,6 +1220,7 @@
     MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
     uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
     uint32_t maxNumMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours);
+    PredictionUnit pu(merge.cu, cuGeom, 0);
 
     bestPred->sa8dCost = MAX_INT64;
     int bestSadCand = -1;
@@ -1239,8 +1240,7 @@
         tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
         tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
 
-        prepMotionCompensation(tempPred->cu, cuGeom, 0);
-        motionCompensation(tempPred->predYuv, true, m_bChromaSa8d);
+        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_bChromaSa8d);
 
         tempPred->sa8dBits = getTUBits(i, maxNumMergeCand);
         tempPred->distortion = primitives.cu[sizeIdx].sa8d(fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
@@ -1264,10 +1264,7 @@
 
     /* calculate the motion compensation for chroma for the best mode selected */
     if (!m_bChromaSa8d) /* Chroma MC was done above */
-    {
-        prepMotionCompensation(bestPred->cu, cuGeom, 0);
-        motionCompensation(bestPred->predYuv, false, true);
-    }
+        motionCompensation(bestPred->cu, pu, bestPred->predYuv, false, true);
 
     if (m_param->rdLevel)
     {
@@ -1322,6 +1319,7 @@
     MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
     uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
     uint32_t maxNumMergeCand = merge.cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours);
+    PredictionUnit pu(merge.cu, cuGeom, 0);
 
     bool foundCbf0Merge = false;
     bool triedPZero = false, triedBZero = false;
@@ -1338,8 +1336,7 @@
         tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
         tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */
 
-        prepMotionCompensation(tempPred->cu, cuGeom, 0);
-        motionCompensation(tempPred->predYuv, true, true);
+        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, true);
 
         encodeResAndCalcRdSkipCU(*tempPred);
 
@@ -1379,8 +1376,7 @@
             tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
             tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */
 
-            prepMotionCompensation(tempPred->cu, cuGeom, 0);
-            motionCompensation(tempPred->predYuv, true, true);
+            motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, true);
 
             uint8_t hasCbf = true;
             bool swapped = false;
@@ -1572,8 +1568,8 @@
     cu.setPUMv(1, bestME[1].mv, 0, 0);
     cu.m_mvd[1][0] = bestME[1].mv - mvp1;
 
-    prepMotionCompensation(cu, cuGeom, 0);
-    motionCompensation(bidir2Nx2N.predYuv, true, m_bChromaSa8d);
+    PredictionUnit pu(cu, cuGeom, 0);
+    motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_bChromaSa8d);
 
     int sa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
     if (m_bChromaSa8d)
@@ -1612,8 +1608,7 @@
             cu.m_mv[0][0] = mvzero;
             cu.m_mv[1][0] = mvzero;
 
-            prepMotionCompensation(cu, cuGeom, 0);
-            motionCompensation(tmpPredYuv, true, true);
+            motionCompensation(cu, pu, tmpPredYuv, true, true);
 
             zsa8d  = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
             zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
@@ -1621,8 +1616,8 @@
         }
         else
         {
-            pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx);
-            pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx);
+            pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
+            pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
             intptr_t refStride = m_slice->m_mref[0][0].lumaStride;
 
             primitives.pu[partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
@@ -1657,10 +1652,7 @@
                 /* real MC was already performed */
                 bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
             else
-            {
-                prepMotionCompensation(cu, cuGeom, 0);
-                motionCompensation(bidir2Nx2N.predYuv, true, true);
-            }
+                motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, true);
         }
         else if (m_bChromaSa8d)
         {
diff -r 7252c10278a1 -r 9e2a83c99a2a source/encoder/search.cpp
--- a/source/encoder/search.cpp	Mon Feb 23 23:11:49 2015 -0600
+++ b/source/encoder/search.cpp	Tue Feb 24 11:42:50 2015 -0600
@@ -1827,6 +1827,7 @@
     }
 
     Yuv& tempYuv = m_rqt[cuGeom.depth].tmpPredYuv;
+    PredictionUnit pu(cu, cuGeom, puIdx); /* could be passed in */
 
     uint32_t outCost = MAX_UINT;
     for (uint32_t mergeCand = 0; mergeCand < m.maxNumMergeCand; ++mergeCand)
@@ -1842,8 +1843,7 @@
         cu.m_mv[1][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][1].mv;
         cu.m_refIdx[1][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][1].refIdx;
 
-        prepMotionCompensation(cu, cuGeom, puIdx);
-        motionCompensation(tempYuv, true, m_me.bChromaSATD);
+        motionCompensation(cu, pu, tempYuv, true, m_me.bChromaSATD);
 
         uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(m.absPartIdx), tempYuv.m_size);
         if (m_me.bChromaSATD)
@@ -1899,17 +1899,17 @@
         slave.setQP(*m_slice, m_rdCost.m_qp);
         slave.m_slice = m_slice;
         slave.m_frame = m_frame;
-        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.mode.cu.m_cuAddr, pme.cuGeom.absPartIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
-        slave.prepMotionCompensation(pme.mode.cu, pme.cuGeom, pme.puIdx);
+
+        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx, pme.pu.width, pme.pu.height);
     }
 
     /* Perform ME, repeat until no more work is available */
     do
     {
         if (meId < m_slice->m_numRefIdx[0])
-            slave.singleMotionEstimation(*this, pme.mode, pme.cuGeom, pme.puIdx, 0, meId);
+            slave.singleMotionEstimation(*this, pme.mode, pme.cuGeom, pme.pu, pme.puIdx, 0, meId);
         else
-            slave.singleMotionEstimation(*this, pme.mode, pme.cuGeom, pme.puIdx, 1, meId - m_slice->m_numRefIdx[0]);
+            slave.singleMotionEstimation(*this, pme.mode, pme.cuGeom, pme.pu, pme.puIdx, 1, meId - m_slice->m_numRefIdx[0]);
 
         meId = -1;
         pme.m_lock.acquire();
@@ -1920,16 +1920,14 @@
     while (meId >= 0);
 }
 
-/* this function assumes the caller has configured its MotionEstimation engine with the
- * correct source plane and source PU, and has called prepMotionCompensation() to set
- * m_puAbsPartIdx, m_puWidth, and m_puHeight */
-void Search::singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref)
+void Search::singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, const PredictionUnit& pu,
+                                    int part, int list, int ref)
 {
     uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
     bits += getTUBits(ref, m_slice->m_numRefIdx[list]);
 
     MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
-    int numMvc = interMode.cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, interMode.amvpCand[list][ref], mvc);
+    int numMvc = interMode.cu.fillMvpCand(part, pu.puAbsPartIdx, list, ref, interMode.amvpCand[list][ref], mvc);
 
     int mvpIdx = 0;
     int merange = m_param->searchRange;
@@ -1949,8 +1947,8 @@
             interMode.cu.clipMv(mvCand);
 
             Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
-            predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPic, mvCand);
-            uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
+            predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPic, mvCand);
+            uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size);
 
             if (bestCost > cost)
             {
@@ -2013,35 +2011,32 @@
     for (int puIdx = 0; puIdx < numPart; puIdx++)
     {
         MotionData* bestME = interMode.bestME[puIdx];
-
-        /* sets m_puAbsPartIdx, m_puWidth, m_puHeight */
-        initMotionCompensation(cu, cuGeom, puIdx);
-
-        m_me.setSourcePU(*interMode.fencYuv, cu.m_cuAddr, cuGeom.absPartIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
+        PredictionUnit pu(cu, cuGeom, puIdx);
+
+        m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height);
 
         uint32_t mrgCost = MAX_UINT;
 
         /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
         if (cu.m_partSize[0] != SIZE_2Nx2N)
         {
-            merge.absPartIdx = m_puAbsPartIdx;
-            merge.width      = m_puWidth;
-            merge.height     = m_puHeight;
+            merge.absPartIdx = pu.puAbsPartIdx;
+            merge.width      = pu.width;
+            merge.height     = pu.height;
             mrgCost = mergeEstimation(cu, cuGeom, puIdx, merge);
 
             if (bMergeOnly && mrgCost != MAX_UINT)
             {
-                cu.m_mergeFlag[m_puAbsPartIdx] = true;
-                cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
-                cu.setPUInterDir(merge.interDir, m_puAbsPartIdx, puIdx);
-                cu.setPUMv(0, merge.mvField[0].mv, m_puAbsPartIdx, puIdx);
-                cu.setPURefIdx(0, merge.mvField[0].refIdx, m_puAbsPartIdx, puIdx);
-                cu.setPUMv(1, merge.mvField[1].mv, m_puAbsPartIdx, puIdx);
-                cu.setPURefIdx(1, merge.mvField[1].refIdx, m_puAbsPartIdx, puIdx);
+                cu.m_mergeFlag[pu.puAbsPartIdx] = true;
+                cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
+                cu.setPUInterDir(merge.interDir, pu.puAbsPartIdx, puIdx);
+                cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx);
+                cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
+                cu.setPUMv(1, merge.mvField[1].mv, pu.puAbsPartIdx, puIdx);
+                cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx);
                 totalmebits += merge.bits;
 
-                prepMotionCompensation(cu, cuGeom, puIdx);
-                motionCompensation(*predYuv, true, bChromaSA8D);
+                motionCompensation(cu, pu, *predYuv, true, bChromaSA8D);
                 continue;
             }
         }
@@ -2061,7 +2056,7 @@
                 uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
                 bits += getTUBits(ref, numRefIdx[l]);
 
-                int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
+                int numMvc = cu.fillMvpCand(puIdx, pu.puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
 
                 // Pick the best possible MVP from AMVP candidates based on least residual
                 int mvpIdx = 0;
@@ -2079,8 +2074,8 @@
                             continue;
 
                         cu.clipMv(mvCand);
-                        predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
-                        uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
+                        predInterLumaPixel(pu, tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
+                        uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size);
 
                         if (bestCost > cost)
                         {
@@ -2116,7 +2111,7 @@
         }
         else if (bTryDistributed)
         {
-            PME pme(*this, interMode, cuGeom, puIdx);
+            PME pme(*this, interMode, cuGeom, pu, puIdx);
             pme.m_jobTotal = numME;
             pme.m_jobAcquired = 1; /* reserve L0-0 */
 
@@ -2124,7 +2119,7 @@
             {
                 processPME(pme, *this);
 
-                singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, 0); /* L0-0 */
+                singleMotionEstimation(*this, interMode, cuGeom, pu, puIdx, 0, 0); /* L0-0 */
 
                 bDoUnidir = false;
 
@@ -2144,7 +2139,7 @@
                     uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
                     bits += getTUBits(ref, numRefIdx[l]);
 
-                    int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
+                    int numMvc = cu.fillMvpCand(puIdx, pu.puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
 
                     // Pick the best possible MVP from AMVP candidates based on least residual
                     int mvpIdx = 0;
@@ -2162,8 +2157,8 @@
                                 continue;
 
                             cu.clipMv(mvCand);
-                            predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
-                            uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
+                            predInterLumaPixel(pu, tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
+                            uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size);
 
                             if (bestCost > cost)
                             {
@@ -2204,7 +2199,7 @@
         int bidirBits = 0;
 
         if (slice->isInterB() && !cu.isBipredRestriction() &&  /* biprediction is possible for this PU */
-            cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N &&     /* 2Nx2N biprediction is handled elsewhere */
+            cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N &&    /* 2Nx2N biprediction is handled elsewhere */
             bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
         {
             bidir[0] = bestME[0];
@@ -2214,16 +2209,14 @@
 
             if (m_me.bChromaSATD)
             {
-                cu.m_mv[0][m_puAbsPartIdx] = bidir[0].mv;
-                cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
-                cu.m_mv[1][m_puAbsPartIdx] = bidir[1].mv;
-                cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
-
-                prepMotionCompensation(cu, cuGeom, puIdx);
-                motionCompensation(tmpPredYuv, true, true);
-
-                satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
-                           m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
+                cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv;
+                cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
+                cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv;
+                cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
+                motionCompensation(cu, pu, tmpPredYuv, true, true);
+
+                satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
+                           m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
             }
             else
             {
@@ -2232,11 +2225,11 @@
                 Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
 
                 /* Generate reference subpels */
-                predInterLumaPixel(bidirYuv[0], *refPic0, bestME[0].mv);
-                predInterLumaPixel(bidirYuv[1], *refPic1, bestME[1].mv);
-
-                primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(m_puAbsPartIdx), bidirYuv[0].m_size,
-                                                                                              bidirYuv[1].getLumaAddr(m_puAbsPartIdx), bidirYuv[1].m_size, 32);
+                predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
+                predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);
+
+                primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
+                                                                                                 bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
                 satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
             }
 
@@ -2263,21 +2256,19 @@
                 /* coincident blocks of the two reference pictures */
                 if (m_me.bChromaSATD)
                 {
-                    cu.m_mv[0][m_puAbsPartIdx] = mvzero;
-                    cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
-                    cu.m_mv[1][m_puAbsPartIdx] = mvzero;
-                    cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
-
-                    prepMotionCompensation(cu, cuGeom, puIdx);
-                    motionCompensation(tmpPredYuv, true, true);
-
-                    satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
-                               m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
+                    cu.m_mv[0][pu.puAbsPartIdx] = mvzero;
+                    cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
+                    cu.m_mv[1][pu.puAbsPartIdx] = mvzero;
+                    cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
+                    motionCompensation(cu, pu, tmpPredYuv, true, true);
+
+                    satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
+                               m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
                 }
                 else
                 {
-                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + m_puAbsPartIdx);
-                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + m_puAbsPartIdx);
+                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
+                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
                     intptr_t refStride = slice->m_mref[0][0].lumaStride;
 
                     primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
@@ -2315,13 +2306,13 @@
         /* select best option and store into CU */
         if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost)
         {
-            cu.m_mergeFlag[m_puAbsPartIdx] = true;
-            cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
-            cu.setPUInterDir(merge.interDir, m_puAbsPartIdx, puIdx);
-            cu.setPUMv(0, merge.mvField[0].mv, m_puAbsPartIdx, puIdx);
-            cu.setPURefIdx(0, merge.mvField[0].refIdx, m_puAbsPartIdx, puIdx);
-            cu.setPUMv(1, merge.mvField[1].mv, m_puAbsPartIdx, puIdx);
-            cu.setPURefIdx(1, merge.mvField[1].refIdx, m_puAbsPartIdx, puIdx);
+            cu.m_mergeFlag[pu.puAbsPartIdx] = true;
+            cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index; /* merge candidate ID is stored in L0 MVP idx */
+            cu.setPUInterDir(merge.interDir, pu.puAbsPartIdx, puIdx);
+            cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx);
+            cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
+            cu.setPUMv(1, merge.mvField[1].mv, pu.puAbsPartIdx, puIdx);
+            cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx);
 
             totalmebits += merge.bits;
         }
@@ -2329,17 +2320,17 @@
         {
             lastMode = 2;
 
-            cu.m_mergeFlag[m_puAbsPartIdx] = false;
-            cu.setPUInterDir(3, m_puAbsPartIdx, puIdx);
-            cu.setPUMv(0, bidir[0].mv, m_puAbsPartIdx, puIdx);
-            cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx);
-            cu.m_mvd[0][m_puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
-            cu.m_mvpIdx[0][m_puAbsPartIdx] = bidir[0].mvpIdx;
-
-            cu.setPUMv(1, bidir[1].mv, m_puAbsPartIdx, puIdx);
-            cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx);
-            cu.m_mvd[1][m_puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
-            cu.m_mvpIdx[1][m_puAbsPartIdx] = bidir[1].mvpIdx;
+            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
+            cu.setPUInterDir(3, pu.puAbsPartIdx, puIdx);
+            cu.setPUMv(0, bidir[0].mv, pu.puAbsPartIdx, puIdx);
+            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
+            cu.m_mvd[0][pu.puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
+            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bidir[0].mvpIdx;
+
+            cu.setPUMv(1, bidir[1].mv, pu.puAbsPartIdx, puIdx);
+            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
+            cu.m_mvd[1][pu.puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
+            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bidir[1].mvpIdx;
 
             totalmebits += bidirBits;
         }
@@ -2347,15 +2338,15 @@
         {
             lastMode = 0;
 
-            cu.m_mergeFlag[m_puAbsPartIdx] = false;
-            cu.setPUInterDir(1, m_puAbsPartIdx, puIdx);
-            cu.setPUMv(0, bestME[0].mv, m_puAbsPartIdx, puIdx);
-            cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx);
-            cu.m_mvd[0][m_puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
-            cu.m_mvpIdx[0][m_puAbsPartIdx] = bestME[0].mvpIdx;
-
-            cu.setPURefIdx(1, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
-            cu.setPUMv(1, mvzero, m_puAbsPartIdx, puIdx);
+            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
+            cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
+            cu.setPUMv(0, bestME[0].mv, pu.puAbsPartIdx, puIdx);
+            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
+            cu.m_mvd[0][pu.puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
+            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bestME[0].mvpIdx;
+
+            cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
+            cu.setPUMv(1, mvzero, pu.puAbsPartIdx, puIdx);
 
             totalmebits += bestME[0].bits;
         }
@@ -2363,21 +2354,20 @@
         {
             lastMode = 1;
 
-            cu.m_mergeFlag[m_puAbsPartIdx] = false;
-            cu.setPUInterDir(2, m_puAbsPartIdx, puIdx);
-            cu.setPUMv(1, bestME[1].mv, m_puAbsPartIdx, puIdx);
-            cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx);
-            cu.m_mvd[1][m_puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
-            cu.m_mvpIdx[1][m_puAbsPartIdx] = bestME[1].mvpIdx;
-
-            cu.setPURefIdx(0, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
-            cu.setPUMv(0, mvzero, m_puAbsPartIdx, puIdx);
+            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
+            cu.setPUInterDir(2, pu.puAbsPartIdx, puIdx);
+            cu.setPUMv(1, bestME[1].mv, pu.puAbsPartIdx, puIdx);
+            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
+            cu.m_mvd[1][pu.puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
+            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bestME[1].mvpIdx;
+
+            cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
+            cu.setPUMv(0, mvzero, pu.puAbsPartIdx, puIdx);
 
             totalmebits += bestME[1].bits;
         }
 
-        prepMotionCompensation(cu, cuGeom, puIdx);
-        motionCompensation(*predYuv, true, bChromaSA8D);
+        motionCompensation(cu, pu, *predYuv, true, bChromaSA8D);
     }
 
     interMode.sa8dBits += totalmebits;
diff -r 7252c10278a1 -r 9e2a83c99a2a source/encoder/search.h
--- a/source/encoder/search.h	Mon Feb 23 23:11:49 2015 -0600
+++ b/source/encoder/search.h	Tue Feb 24 11:42:50 2015 -0600
@@ -288,9 +288,10 @@
         Search&       master;
         Mode&         mode;
         const CUGeom& cuGeom;
+        const PredictionUnit& pu;
         int           puIdx;
 
-        PME(Search& s, Mode& m, const CUGeom& g, int p) : master(s), mode(m), cuGeom(g), puIdx(p) {}
+        PME(Search& s, Mode& m, const CUGeom& g, const PredictionUnit& u, int p) : master(s), mode(m), cuGeom(g), pu(u), puIdx(p) {}
 
         void processTasks(int workerThreadId);
 
@@ -300,7 +301,7 @@
     };
 
     void     processPME(PME& pme, Search& slave);
-    void     singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref);
+    void     singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, const PredictionUnit& pu, int part, int list, int ref);
 
 protected: