[x265] [PATCH 3 of 3] slicetype: refactor lookahead to use bonded task groups

Thu Feb 19 20:40:44 CET 2015

# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1424360526 21600
#      Thu Feb 19 09:42:06 2015 -0600
# Node ID 8faf10074f9fae32f90ec3d214f9048c96eefd27
# Parent  bc969aedef71552dfd6a914d99543f3ac4eb16e3
slicetype: refactor lookahead to use bonded task groups

The lowres downscale, adaptive quant calculations, and lowres intra analysis
are all moved into a per-frame task which is performed by worker threads when
a worker pool is available to the encoder.  Similarly, slicetypeDecide() is also
performed by worker threads if available.

Individual frame cost estimations no longer use wave-front scheduling. Instead
the frames are divided into slices which are distributed to workers via a
bonded task group. This change reduces the accuracy of P frame decisions, and
will likely require further tuning. It improves overall work efficiency but it
lowers compression efficiency.

When --b-adapt 2 is used on large core systems, the initial batch of frame cost
estimates is performed by the entire thread pool at once, again using a bonded
task group.

diff -r bc969aedef71 -r 8faf10074f9f doc/reST/threading.rst

--- a/doc/reST/threading.rst	Thu Feb 19 09:39:48 2015 -0600
+++ b/doc/reST/threading.rst	Thu Feb 19 09:42:06 2015 -0600
@@ -222,13 +222,12 @@
 
 The lookahead module of x265 (the lowres pre-encode which determines
 scene cuts and slice types) uses the thread pool to distribute the
-lowres cost analysis to worker threads. It follows the same wave-front
-pattern as the main encoder except it works in reverse-scan order.
+lowres cost analysis to worker threads. It will use bonded task groups
+to perform batches of frame cost estimates.
 
-The function slicetypeDecide() itself may also be performed by a worker
-thread if your system has enough CPU cores to make this a beneficial
-trade-off, else it runs within the context of the thread which calls the
-x265_encoder_encode().
+The function slicetypeDecide() itself is also performed by a worker
+thread if your encoder has a thread pool, else it runs within the
+context of the thread which calls the x265_encoder_encode().
 
 SAO
 ===
diff -r bc969aedef71 -r 8faf10074f9f source/common/lowres.cpp
--- a/source/common/lowres.cpp	Thu Feb 19 09:39:48 2015 -0600
+++ b/source/common/lowres.cpp	Thu Feb 19 09:42:06 2015 -0600
@@ -56,12 +56,14 @@
     CHECKED_MALLOC(propagateCost, uint16_t, cuCount);
 
     /* allocate lowres buffers */
-    for (int i = 0; i < 4; i++)
-    {
-        CHECKED_MALLOC(buffer[i], pixel, planesize);
-        /* initialize the whole buffer to prevent valgrind warnings on right edge */
-        memset(buffer[i], 0, sizeof(pixel) * planesize);
-    }
+    CHECKED_MALLOC(buffer[0], pixel, 4 * planesize);
+
+    /* initialize the whole buffer to prevent valgrind warnings on right edge */
+    memset(buffer[0], 0, 4 * sizeof(pixel) * planesize);
+
+    buffer[1] = buffer[0] + planesize;
+    buffer[2] = buffer[1] + planesize;
+    buffer[3] = buffer[2] + planesize;
 
     lowresPlane[0] = buffer[0] + padoffset;
     lowresPlane[1] = buffer[1] + padoffset;
@@ -96,9 +98,7 @@
 
 void Lowres::destroy()
 {
-    for (int i = 0; i < 4; i++)
-        X265_FREE(buffer[i]);
-
+    X265_FREE(buffer[0]);
     X265_FREE(intraCost);
     X265_FREE(intraMode);
 
@@ -126,13 +126,11 @@
 }
 
 // (re) initialize lowres state
-void Lowres::init(PicYuv *origPic, int poc, int type)
+void Lowres::init(PicYuv *origPic, int poc)
 {
-    bIntraCalculated = false;
     bLastMiniGopBFrame = false;
     bScenecut = true;  // could be a scene-cut, until ruled out by flash detection
     bKeyframe = false; // Not a keyframe unless identified by lookahead
-    sliceType = type;
     frameNum = poc;
     leadingBframes = 0;
     indB = 0;
@@ -158,8 +156,8 @@
 
     /* downscale and generate 4 hpel planes for lookahead */
     primitives.frameInitLowres(origPic->m_picOrg[0],
-                                      lowresPlane[0], lowresPlane[1], lowresPlane[2], lowresPlane[3],
-                                      origPic->m_stride, lumaStride, width, lines);
+                               lowresPlane[0], lowresPlane[1], lowresPlane[2], lowresPlane[3],
+                               origPic->m_stride, lumaStride, width, lines);
 
     /* extend hpel planes for motion search */
     extendPicBorder(lowresPlane[0], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY);
diff -r bc969aedef71 -r 8faf10074f9f source/common/lowres.h
--- a/source/common/lowres.h	Thu Feb 19 09:39:48 2015 -0600
+++ b/source/common/lowres.h	Thu Feb 19 09:42:06 2015 -0600
@@ -114,7 +114,6 @@
     int    lines;            // height of lowres frame in pixel lines
     int    leadingBframes;   // number of leading B frames for P or I
 
-    bool   bIntraCalculated;
     bool   bScenecut;        // Set to false if the frame cannot possibly be part of a real scenecut.
     bool   bKeyframe;
     bool   bLastMiniGopBFrame;
@@ -151,7 +150,7 @@
 
     bool create(PicYuv *origPic, int _bframes, bool bAqEnabled);
     void destroy();
-    void init(PicYuv *origPic, int poc, int sliceType);
+    void init(PicYuv *origPic, int poc);
 };
 }
 
diff -r bc969aedef71 -r 8faf10074f9f source/encoder/ratecontrol.cpp
--- a/source/encoder/ratecontrol.cpp	Thu Feb 19 09:39:48 2015 -0600
+++ b/source/encoder/ratecontrol.cpp	Thu Feb 19 09:42:06 2015 -0600
@@ -145,30 +145,6 @@
 }
 
 }  // end anonymous namespace
-/* Compute variance to derive AC energy of each block */
-static inline uint32_t acEnergyVar(Frame *curFrame, uint64_t sum_ssd, int shift, int i)
-{
-    uint32_t sum = (uint32_t)sum_ssd;
-    uint32_t ssd = (uint32_t)(sum_ssd >> 32);
-
-    curFrame->m_lowres.wp_sum[i] += sum;
-    curFrame->m_lowres.wp_ssd[i] += ssd;
-    return ssd - ((uint64_t)sum * sum >> shift);
-}
-
-/* Find the energy of each block in Y/Cb/Cr plane */
-static inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcStride, int bChroma, int colorFormat)
-{
-    if ((colorFormat != X265_CSP_I444) && bChroma)
-    {
-        ALIGN_VAR_8(pixel, pix[8 * 8]);
-        primitives.cu[BLOCK_8x8].copy_pp(pix, 8, src, srcStride);
-        return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(pix, 8), 6, bChroma);
-    }
-    else
-        return acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(src, srcStride), 8, bChroma);
-}
-
 /* Returns the zone for the current frame */
 x265_zone* RateControl::getZone()
 {
@@ -181,134 +157,6 @@
     return NULL;
 }
 
-/* Find the total AC energy of each block in all planes */
-uint32_t RateControl::acEnergyCu(Frame* curFrame, uint32_t block_x, uint32_t block_y)
-{
-    intptr_t stride = curFrame->m_fencPic->m_stride;
-    intptr_t cStride = curFrame->m_fencPic->m_strideC;
-    intptr_t blockOffsetLuma = block_x + (block_y * stride);
-    int colorFormat = m_param->internalCsp;
-    int hShift = CHROMA_H_SHIFT(colorFormat);
-    int vShift = CHROMA_V_SHIFT(colorFormat);
-    intptr_t blockOffsetChroma = (block_x >> hShift) + ((block_y >> vShift) * cStride);
-
-    uint32_t var;
-
-    var  = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, colorFormat);
-    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, colorFormat);
-    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, colorFormat);
-    x265_emms();
-    return var;
-}
-
-void RateControl::calcAdaptiveQuantFrame(Frame *curFrame)
-{
-    /* Actual adaptive quantization */
-    int maxCol = curFrame->m_fencPic->m_picWidth;
-    int maxRow = curFrame->m_fencPic->m_picHeight;
-
-    for (int y = 0; y < 3; y++)
-    {
-        curFrame->m_lowres.wp_ssd[y] = 0;
-        curFrame->m_lowres.wp_sum[y] = 0;
-    }
-
-    /* Calculate Qp offset for each 16x16 block in the frame */
-    int block_xy = 0;
-    int block_x = 0, block_y = 0;
-    double strength = 0.f;
-    if (m_param->rc.aqMode == X265_AQ_NONE || m_param->rc.aqStrength == 0)
-    {
-        /* Need to init it anyways for CU tree */
-        int cuWidth = ((maxCol / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-        int cuHeight = ((maxRow / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-        int cuCount = cuWidth * cuHeight;
-
-        if (m_param->rc.aqMode && m_param->rc.aqStrength == 0)
-        {
-            memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
-            memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
-            for (int cuxy = 0; cuxy < cuCount; cuxy++)
-                curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
-        }
-
-        /* Need variance data for weighted prediction */
-        if (m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred)
-        {
-            for (block_y = 0; block_y < maxRow; block_y += 16)
-                for (block_x = 0; block_x < maxCol; block_x += 16)
-                    acEnergyCu(curFrame, block_x, block_y);
-        }
-    }
-    else
-    {
-        block_xy = 0;
-        double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
-        if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
-        {
-            double bit_depth_correction = pow(1 << (X265_DEPTH - 8), 0.5);
-            for (block_y = 0; block_y < maxRow; block_y += 16)
-            {
-                for (block_x = 0; block_x < maxCol; block_x += 16)
-                {
-                    uint32_t energy = acEnergyCu(curFrame, block_x, block_y);
-                    qp_adj = pow(energy + 1, 0.1);
-                    curFrame->m_lowres.qpCuTreeOffset[block_xy] = qp_adj;
-                    avg_adj += qp_adj;
-                    avg_adj_pow2 += qp_adj * qp_adj;
-                    block_xy++;
-                }
-            }
-
-            avg_adj /= m_ncu;
-            avg_adj_pow2 /= m_ncu;
-            strength = m_param->rc.aqStrength * avg_adj / bit_depth_correction;
-            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f * bit_depth_correction)) / avg_adj;
-        }
-        else
-            strength = m_param->rc.aqStrength * 1.0397f;
-
-        block_xy = 0;
-        for (block_y = 0; block_y < maxRow; block_y += 16)
-        {
-            for (block_x = 0; block_x < maxCol; block_x += 16)
-            {
-                if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
-                {
-                    qp_adj = curFrame->m_lowres.qpCuTreeOffset[block_xy];
-                    qp_adj = strength * (qp_adj - avg_adj);
-                }
-                else
-                {
-                    uint32_t energy = acEnergyCu(curFrame, block_x, block_y);
-                    qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (14.427f + 2 * (X265_DEPTH - 8)));
-                }
-                curFrame->m_lowres.qpAqOffset[block_xy] = qp_adj;
-                curFrame->m_lowres.qpCuTreeOffset[block_xy] = qp_adj;
-                curFrame->m_lowres.invQscaleFactor[block_xy] = x265_exp2fix8(qp_adj);
-                block_xy++;
-            }
-        }
-    }
-
-    if (m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred)
-    {
-        int hShift = CHROMA_H_SHIFT(m_param->internalCsp);
-        int vShift = CHROMA_V_SHIFT(m_param->internalCsp);
-        maxCol = ((maxCol + 8) >> 4) << 4;
-        maxRow = ((maxRow + 8) >> 4) << 4;
-        int width[3]  = { maxCol, maxCol >> hShift, maxCol >> hShift };
-        int height[3] = { maxRow, maxRow >> vShift, maxRow >> vShift };
-
-        for (int i = 0; i < 3; i++)
-        {
-            uint64_t sum, ssd;
-            sum = curFrame->m_lowres.wp_sum[i];
-            ssd = curFrame->m_lowres.wp_ssd[i];
-            curFrame->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]);
-        }
-    }
-}
 
 RateControl::RateControl(x265_param *p)
 {
@@ -1351,6 +1199,8 @@
 
     if (m_rce2Pass[frame->m_poc].keptAsRef)
     {
+        /* TODO: We don't need pre-lookahead to measure AQ offsets, but there is currently
+         * no way to signal this */
         uint8_t type;
         if (m_cuTreeStats.qpBufPos < 0)
         {
@@ -1379,8 +1229,6 @@
         }
         m_cuTreeStats.qpBufPos--;
     }
-    else
-        calcAdaptiveQuantFrame(frame);
     return true;
 
 fail:
diff -r bc969aedef71 -r 8faf10074f9f source/encoder/ratecontrol.h
--- a/source/encoder/ratecontrol.h	Thu Feb 19 09:39:48 2015 -0600
+++ b/source/encoder/ratecontrol.h	Thu Feb 19 09:42:06 2015 -0600
@@ -226,7 +226,6 @@
 
     // to be called for each curFrame to process RateControl and set QP
     int rateControlStart(Frame* curFrame, RateControlEntry* rce, Encoder* enc);
-    void calcAdaptiveQuantFrame(Frame *curFrame);
     void rateControlUpdateStats(RateControlEntry* rce);
     int rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* rce, FrameStats* stats);
     int rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv);
@@ -253,7 +252,6 @@
     double getQScale(RateControlEntry *rce, double rateFactor);
     double rateEstimateQscale(Frame* pic, RateControlEntry *rce); // main logic for calculating QP based on ABR
     void accumPQpUpdate();
-    uint32_t acEnergyCu(Frame* pic, uint32_t block_x, uint32_t block_y);
 
     void updateVbv(int64_t bits, RateControlEntry* rce);
     void updatePredictor(Predictor *p, double q, double var, double bits);
diff -r bc969aedef71 -r 8faf10074f9f source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Thu Feb 19 09:39:48 2015 -0600
+++ b/source/encoder/slicetype.cpp	Thu Feb 19 09:42:06 2015 -0600
@@ -34,11 +34,11 @@
 #include "motion.h"
 #include "ratecontrol.h"
 
-#define NUM_CUS (m_widthInCU > 2 && m_heightInCU > 2 ? (m_widthInCU - 2) * (m_heightInCU - 2) : m_widthInCU * m_heightInCU)
-
 using namespace x265;
 
-static inline int16_t median(int16_t a, int16_t b, int16_t c)
+namespace {
+
+inline int16_t median(int16_t a, int16_t b, int16_t c)
 {
     int16_t t = (a - b) & ((a - b) >> 31);
 
@@ -49,61 +49,505 @@
     return b;
 }
 
-static inline void median_mv(MV &dst, MV a, MV b, MV c)
+inline void median_mv(MV &dst, MV a, MV b, MV c)
 {
     dst.x = median(a.x, b.x, c.x);
     dst.y = median(a.y, b.y, c.y);
 }
 
+/* Compute variance to derive AC energy of each block */
+inline uint32_t acEnergyVar(Frame *curFrame, uint64_t sum_ssd, int shift, int plane)
+{
+    uint32_t sum = (uint32_t)sum_ssd;
+    uint32_t ssd = (uint32_t)(sum_ssd >> 32);
+
+    curFrame->m_lowres.wp_sum[plane] += sum;
+    curFrame->m_lowres.wp_ssd[plane] += ssd;
+    return ssd - ((uint64_t)sum * sum >> shift);
+}
+
+/* Find the energy of each block in Y/Cb/Cr plane */
+inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcStride, int plane, int colorFormat)
+{
+    if ((colorFormat != X265_CSP_I444) && plane)
+    {
+        ALIGN_VAR_8(pixel, pix[8 * 8]);
+        primitives.cu[BLOCK_8x8].copy_pp(pix, 8, src, srcStride);
+        return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(pix, 8), 6, plane);
+    }
+    else
+        return acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(src, srcStride), 8, plane);
+}
+
+} // end anonymous namespace
+
+/* Find the total AC energy of each block in all planes */
+uint32_t LookaheadTLD::acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp)
+{
+    intptr_t stride = curFrame->m_fencPic->m_stride;
+    intptr_t cStride = curFrame->m_fencPic->m_strideC;
+    intptr_t blockOffsetLuma = blockX + (blockY * stride);
+    int hShift = CHROMA_H_SHIFT(csp);
+    int vShift = CHROMA_V_SHIFT(csp);
+    intptr_t blockOffsetChroma = (blockX >> hShift) + ((blockY >> vShift) * cStride);
+
+    uint32_t var;
+
+    var  = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, csp);
+    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp);
+    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp);
+    x265_emms();
+    return var;
+}
+
+void LookaheadTLD::calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param)
+{
+    /* Actual adaptive quantization */
+    int maxCol = curFrame->m_fencPic->m_picWidth;
+    int maxRow = curFrame->m_fencPic->m_picHeight;
+
+    for (int y = 0; y < 3; y++)
+    {
+        curFrame->m_lowres.wp_ssd[y] = 0;
+        curFrame->m_lowres.wp_sum[y] = 0;
+    }
+
+    /* Calculate Qp offset for each 16x16 block in the frame */
+    int blockXY = 0;
+    int blockX = 0, blockY = 0;
+    double strength = 0.f;
+    if (param->rc.aqMode == X265_AQ_NONE || param->rc.aqStrength == 0)
+    {
+        /* Need to init it anyways for CU tree */
+        int cuCount = widthInCU * heightInCU;
+
+        if (param->rc.aqMode && param->rc.aqStrength == 0)
+        {
+            memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
+            memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
+            for (int cuxy = 0; cuxy < cuCount; cuxy++)
+                curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
+        }
+
+        /* Need variance data for weighted prediction */
+        if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
+        {
+            for (blockY = 0; blockY < maxRow; blockY += 16)
+                for (blockX = 0; blockX < maxCol; blockX += 16)
+                    acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
+        }
+    }
+    else
+    {
+        blockXY = 0;
+        double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
+        if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
+        {
+            double bit_depth_correction = pow(1 << (X265_DEPTH - 8), 0.5);
+            for (blockY = 0; blockY < maxRow; blockY += 16)
+            {
+                for (blockX = 0; blockX < maxCol; blockX += 16)
+                {
+                    uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
+                    qp_adj = pow(energy + 1, 0.1);
+                    curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
+                    avg_adj += qp_adj;
+                    avg_adj_pow2 += qp_adj * qp_adj;
+                    blockXY++;
+                }
+            }
+
+            avg_adj /= ncu;
+            avg_adj_pow2 /= ncu;
+            strength = param->rc.aqStrength * avg_adj / bit_depth_correction;
+            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f * bit_depth_correction)) / avg_adj;
+        }
+        else
+            strength = param->rc.aqStrength * 1.0397f;
+
+        blockXY = 0;
+        for (blockY = 0; blockY < maxRow; blockY += 16)
+        {
+            for (blockX = 0; blockX < maxCol; blockX += 16)
+            {
+                if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
+                {
+                    qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
+                    qp_adj = strength * (qp_adj - avg_adj);
+                }
+                else
+                {
+                    uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
+                    qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (14.427f + 2 * (X265_DEPTH - 8)));
+                }
+                curFrame->m_lowres.qpAqOffset[blockXY] = qp_adj;
+                curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
+                curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(qp_adj);
+                blockXY++;
+            }
+        }
+    }
+
+    if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
+    {
+        int hShift = CHROMA_H_SHIFT(param->internalCsp);
+        int vShift = CHROMA_V_SHIFT(param->internalCsp);
+        maxCol = ((maxCol + 8) >> 4) << 4;
+        maxRow = ((maxRow + 8) >> 4) << 4;
+        int width[3]  = { maxCol, maxCol >> hShift, maxCol >> hShift };
+        int height[3] = { maxRow, maxRow >> vShift, maxRow >> vShift };
+
+        for (int i = 0; i < 3; i++)
+        {
+            uint64_t sum, ssd;
+            sum = curFrame->m_lowres.wp_sum[i];
+            ssd = curFrame->m_lowres.wp_ssd[i];
+            curFrame->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]);
+        }
+    }
+}
+
+void LookaheadTLD::lowresIntraEstimate(Lowres& fenc)
+{
+    ALIGN_VAR_32(pixel, prediction[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
+    ALIGN_VAR_32(pixel, fencIntra[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
+    pixel neighbours[2][X265_LOWRES_CU_SIZE * 4 + 1];
+
+    const int lookAheadLambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
+    const int intraPenalty = 5 * lookAheadLambda;
+    const int lowresPenalty = 4; /* fixed CU cost overhead */
+
+    const int cuSize  = X265_LOWRES_CU_SIZE;
+    const int cuSize2 = cuSize << 1;
+    const int sizeIdx = X265_LOWRES_CU_BITS - 2;
+
+    pixel *planar = (cuSize >= 8) ? neighbours[1] : neighbours[0];
+    pixelcmp_t satd = primitives.pu[sizeIdx].satd;
+
+    for (int cuY = 0; cuY < heightInCU; cuY++)
+    {
+        fenc.rowSatds[0][0][cuY] = 0;
+
+        for (int cuX = 0; cuX < widthInCU; cuX++)
+        {
+            const int cuXY = cuX + cuY * widthInCU;
+            const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * fenc.lumaStride;
+
+            /* Prep reference pixels */
+            pixel *pixCur = fenc.lowresPlane[0] + pelOffset;
+            primitives.cu[sizeIdx].copy_pp(fencIntra, cuSize, pixCur, fenc.lumaStride);
+
+            memcpy(neighbours[0], pixCur - 1 - fenc.lumaStride, (cuSize + 1) * sizeof(pixel));
+            for (int i = 1; i < cuSize + 1; i++)
+                neighbours[0][i + cuSize2] = pixCur[-1 - fenc.lumaStride + i * fenc.lumaStride]; /* todo: fixme */
+
+            for (int i = 0; i < cuSize; i++)
+            {
+                neighbours[0][i + cuSize + 1] = neighbours[0][cuSize];                     // Copy above-last pixel
+                neighbours[0][i + cuSize2 + cuSize + 1] = neighbours[0][cuSize2 + cuSize]; // Copy left-last pixel
+            }
+
+            neighbours[1][0]  = neighbours[0][0];                      // Copy top-left pixel 
+            neighbours[1][cuSize2] = neighbours[0][cuSize2];           // Copy top-right pixel
+            neighbours[1][cuSize2 << 1] = neighbours[0][cuSize2 << 1]; // Bottom-left pixel
+
+            // Filter neighbour pixels with [1-2-1]
+            neighbours[1][1]           = (neighbours[0][0] + (neighbours[0][1] << 1)           + neighbours[0][2] + 2)               >> 2;
+            neighbours[1][cuSize2 + 1] = (neighbours[0][0] + (neighbours[0][cuSize2 + 1] << 1) + neighbours[0][cuSize2 + 1 + 1] + 2) >> 2;
+            for (int i = 2; i < cuSize2; i++)
+            {
+                neighbours[1][i]           = (neighbours[0][i - 1]           + (neighbours[0][i] << 1)           + neighbours[0][i + 1]      + 2) >> 2;
+                neighbours[1][cuSize2 + i] = (neighbours[0][cuSize2 + i - 1] + (neighbours[0][cuSize2 + i] << 1) + neighbours[0][cuSize2 + i + 1] + 2) >> 2;
+            }
+
+            int cost, icost = me.COST_MAX;
+            uint32_t ilowmode = 0;
+
+            /* DC and planar */
+            primitives.cu[sizeIdx].intra_pred[DC_IDX](prediction, cuSize, neighbours[0], 0, (cuSize <= 16));
+            cost = satd(fencIntra, cuSize, prediction, cuSize);
+            COPY2_IF_LT(icost, cost, ilowmode, DC_IDX);
+
+            primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](prediction, cuSize, planar, 0, 0);
+            cost = satd(fencIntra, cuSize, prediction, cuSize);
+            COPY2_IF_LT(icost, cost, ilowmode, PLANAR_IDX);
+
+            /* scan angular predictions */
+            int filter, acost = me.COST_MAX;
+            uint32_t mode, alowmode = 4;
+            for (mode = 5; mode < 35; mode += 5)
+            {
+                filter = !!(g_intraFilterFlags[mode] & cuSize);
+                primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16);
+                cost = satd(fencIntra, cuSize, prediction, cuSize);
+                COPY2_IF_LT(acost, cost, alowmode, mode);
+            }
+            for (uint32_t dist = 2; dist >= 1; dist--)
+            {
+                int minusmode = alowmode - dist;
+                int plusmode = alowmode + dist;
+
+                mode = minusmode;
+                filter = !!(g_intraFilterFlags[mode] & cuSize);
+                primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16);
+                cost = satd(fencIntra, cuSize, prediction, cuSize);
+                COPY2_IF_LT(acost, cost, alowmode, mode);
+
+                mode = plusmode;
+                filter = !!(g_intraFilterFlags[mode] & cuSize);
+                primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16);
+                cost = satd(fencIntra, cuSize, prediction, cuSize);
+                COPY2_IF_LT(acost, cost, alowmode, mode);
+            }
+            COPY2_IF_LT(icost, acost, ilowmode, alowmode);
+
+            icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */
+
+            fenc.lowresCosts[0][0][cuXY] = (uint16_t)(X265_MIN(icost, LOWRES_COST_MASK) | (0 << LOWRES_COST_SHIFT));
+            fenc.intraCost[cuXY] = icost;
+            fenc.intraMode[cuXY] = (uint8_t)ilowmode;
+            fenc.rowSatds[0][0][cuY] += icost;
+            fenc.costEst[0][0] += icost;
+        }
+    }
+}
+
+uint32_t LookaheadTLD::weightCostLuma(Lowres& fenc, Lowres& ref, WeightParam& wp)
+{
+    pixel *src = ref.fpelPlane[0];
+    intptr_t stride = fenc.lumaStride;
+
+    if (wp.bPresentFlag)
+    {
+        int offset = wp.inputOffset << (X265_DEPTH - 8);
+        int scale = wp.inputWeight;
+        int denom = wp.log2WeightDenom;
+        int round = denom ? 1 << (denom - 1) : 0;
+        int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
+        int widthHeight = (int)stride;
+
+        primitives.weight_pp(ref.buffer[0], wbuffer[0], stride, widthHeight, paddedLines,
+            scale, round << correction, denom + correction, offset);
+        src = weightedRef.fpelPlane[0];
+    }
+
+    uint32_t cost = 0;
+    intptr_t pixoff = 0;
+    int mb = 0;
+
+    for (int y = 0; y < fenc.lines; y += 8, pixoff = y * stride)
+    {
+        for (int x = 0; x < fenc.width; x += 8, mb++, pixoff += 8)
+        {
+            int satd = primitives.pu[LUMA_8x8].satd(src + pixoff, stride, fenc.fpelPlane[0] + pixoff, stride);
+            cost += X265_MIN(satd, fenc.intraCost[mb]);
+        }
+    }
+
+    return cost;
+}
+
+bool LookaheadTLD::allocWeightedRef(Lowres& fenc)
+{
+    intptr_t planesize = fenc.buffer[1] - fenc.buffer[0];
+    intptr_t padoffset = fenc.lowresPlane[0] - fenc.buffer[0];
+    paddedLines = (int)(planesize / fenc.lumaStride);
+
+    wbuffer[0] = X265_MALLOC(pixel, 4 * planesize);
+    if (wbuffer[0])
+    {
+        wbuffer[1] = wbuffer[0] + planesize;
+        wbuffer[2] = wbuffer[1] + planesize;
+        wbuffer[3] = wbuffer[2] + planesize;
+    }
+    else
+        return false;
+
+    for (int i = 0; i < 4; i++)
+        weightedRef.lowresPlane[i] = wbuffer[i] + padoffset;
+
+    weightedRef.fpelPlane[0] = weightedRef.lowresPlane[0];
+    weightedRef.lumaStride = fenc.lumaStride;
+    weightedRef.isLowres = true;
+    weightedRef.isWeighted = false;
+
+    return true;
+}
+
+void LookaheadTLD::weightsAnalyse(Lowres& fenc, Lowres& ref)
+{
+    static const float epsilon = 1.f / 128.f;
+    int deltaIndex = fenc.frameNum - ref.frameNum;
+
+    WeightParam wp;
+    wp.bPresentFlag = false;
+
+    if (!wbuffer[0])
+    {
+        if (!allocWeightedRef(fenc))
+            return;
+    }
+
+    /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
+    float guessScale, fencMean, refMean;
+    x265_emms();
+    if (fenc.wp_ssd[0] && ref.wp_ssd[0])
+        guessScale = sqrtf((float)fenc.wp_ssd[0] / ref.wp_ssd[0]);
+    else
+        guessScale = 1.0f;
+    fencMean = (float)fenc.wp_sum[0] / (fenc.lines * fenc.width) / (1 << (X265_DEPTH - 8));
+    refMean = (float)ref.wp_sum[0] / (fenc.lines * fenc.width) / (1 << (X265_DEPTH - 8));
+
+    /* Early termination */
+    if (fabsf(refMean - fencMean) < 0.5f && fabsf(1.f - guessScale) < epsilon)
+        return;
+
+    int minoff = 0, minscale, mindenom;
+    unsigned int minscore = 0, origscore = 1;
+    int found = 0;
+
+    wp.setFromWeightAndOffset((int)(guessScale * 128 + 0.5f), 0, 7, true);
+    mindenom = wp.log2WeightDenom;
+    minscale = wp.inputWeight;
+
+    origscore = minscore = weightCostLuma(fenc, ref, wp);
+
+    if (!minscore)
+        return;
+
+    unsigned int s = 0;
+    int curScale = minscale;
+    int curOffset = (int)(fencMean - refMean * curScale / (1 << mindenom) + 0.5f);
+    if (curOffset < -128 || curOffset > 127)
+    {
+        /* Rescale considering the constraints on curOffset. We do it in this order
+        * because scale has a much wider range than offset (because of denom), so
+        * it should almost never need to be clamped. */
+        curOffset = x265_clip3(-128, 127, curOffset);
+        curScale = (int)((1 << mindenom) * (fencMean - curOffset) / refMean + 0.5f);
+        curScale = x265_clip3(0, 127, curScale);
+    }
+    SET_WEIGHT(wp, true, curScale, mindenom, curOffset);
+    s = weightCostLuma(fenc, ref, wp);
+    COPY4_IF_LT(minscore, s, minscale, curScale, minoff, curOffset, found, 1);
+
+    /* Use a smaller denominator if possible */
+    while (mindenom > 0 && !(minscale & 1))
+    {
+        mindenom--;
+        minscale >>= 1;
+    }
+
+    if (!found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f)
+        return;
+    else
+    {
+        SET_WEIGHT(wp, true, minscale, mindenom, minoff);
+
+        // set weighted delta cost
+        fenc.weightedCostDelta[deltaIndex] = minscore / origscore;
+
+        int offset = wp.inputOffset << (X265_DEPTH - 8);
+        int scale = wp.inputWeight;
+        int denom = wp.log2WeightDenom;
+        int round = denom ? 1 << (denom - 1) : 0;
+        int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
+        intptr_t stride = ref.lumaStride;
+        int widthHeight = (int)stride;
+
+        for (int i = 0; i < 4; i++)
+            primitives.weight_pp(ref.buffer[i], wbuffer[i], stride, widthHeight, paddedLines,
+            scale, round << correction, denom + correction, offset);
+
+        weightedRef.isWeighted = true;
+    }
+}
+
 Lookahead::Lookahead(x265_param *param, ThreadPool* pool)
-    : JobProvider(pool)
-    , m_est(pool)
 {
-    m_bReady = false;
-    m_bBusy = false;
     m_param = param;
+    m_pool  = pool;
+
+    m_lastNonB = NULL;
+    m_scratch  = NULL;
+    m_tld      = NULL;
+    m_filled   = false;
+    m_outputSignalRequired = false;
+    m_isActive = true;
+
+    m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    m_ncu = m_widthInCU > 2 && m_heightInCU > 2 ? (m_widthInCU - 2) * (m_heightInCU - 2) : m_widthInCU * m_heightInCU;
+
     m_lastKeyframe = -m_param->keyframeMax;
-    m_lastNonB = NULL;
-    m_bFilled = false;
-    m_bFlushed = false;
-    m_bFlush = false;
+    memset(m_preframes, 0, sizeof(m_preframes));
+    m_preTotal = m_preAcquired = m_preCompleted = 0;
+    m_sliceTypeBusy = false;
+    m_fullQueueSize = m_param->lookaheadDepth;
+    m_bAdaptiveQuant = m_param->rc.aqMode || m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred;
+
+    if (m_pool && m_pool->m_numWorkers > 2)
+    {
+        m_numRowsPerSlice = m_heightInCU / (m_pool->m_numWorkers - 1);   // default to numWorkers - 1 slices
+        m_numRowsPerSlice = X265_MAX(m_numRowsPerSlice, 10);             // at least 10 rows per slice
+        m_numRowsPerSlice = X265_MIN(m_numRowsPerSlice, m_heightInCU);   // but no more than the full picture
+        m_numCoopSlices = m_heightInCU / m_numRowsPerSlice;
+    }
+    else
+    {
+        m_numRowsPerSlice = m_heightInCU;
+        m_numCoopSlices = 1;
+    }
 
 #if DETAILED_CU_STATS
     m_slicetypeDecideElapsedTime = 0;
+    m_preLookaheadElapsedTime = 0;
     m_countSlicetypeDecide = 0;
+    m_countPreLookahead = 0;
 #endif
 
-    m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-    m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-    m_scratch = (int*)x265_malloc(m_widthInCU * sizeof(int));
     memset(m_histogram, 0, sizeof(m_histogram));
 }
 
-Lookahead::~Lookahead() { }
+#if DETAILED_CU_STATS
+void Lookahead::getWorkerStats(int64_t& batchElapsedTime, uint64_t& batchCount, int64_t& coopSliceElapsedTime, uint64_t& coopSliceCount)
+{
+    batchElapsedTime = coopSliceElapsedTime = 0;
+    coopSliceCount = batchCount = 0;
+    int tldCount = m_pool ? m_pool->m_numWorkers : 1;
+    for (int i = 0; i < tldCount; i++)
+    {
+        batchElapsedTime += m_tld[i].batchElapsedTime;
+        coopSliceElapsedTime += m_tld[i].coopSliceElapsedTime;
+        batchCount += m_tld[i].countBatches;
+        coopSliceCount += m_tld[i].countCoopSlices;
+    }
+}
+#endif
 
-void Lookahead::init()
+bool Lookahead::create()
 {
-    if (m_pool && m_pool->getThreadCount() >= 4 &&
-        ((m_param->bFrameAdaptive && m_param->bframes) ||
-         m_param->rc.cuTree || m_param->scenecutThreshold ||
-         (m_param->lookaheadDepth && m_param->rc.vbvBufferSize)))
-    {
-        JobProvider::enqueue();
-    }
-    else
-        m_pool = NULL; /* disable use of worker thread */
+    int numTLD = 1 + (m_pool ? m_pool->m_numWorkers : 0);
+    m_tld = new LookaheadTLD[numTLD];
+    for (int i = 0; i < numTLD; i++)
+        m_tld[i].init(m_widthInCU, m_heightInCU, m_ncu);
+    m_scratch = X265_MALLOC(int, m_tld[0].widthInCU);
+
+    return m_tld && m_scratch;
 }
 
 void Lookahead::stop()
 {
-    /* do not allow slicetypeDecide() to get started again */
-    m_bReady = false;
-    m_bFlushed = false;
-    m_bFlush = false;
-    m_bBusy = false;
+    if (m_pool && !m_inputQueue.empty())
+    {
+        m_preLookaheadLock.acquire();
+        m_isActive = false;
+        bool wait = m_outputSignalRequired = m_sliceTypeBusy;
+        m_preLookaheadLock.release();
 
-    if (m_pool)
-        JobProvider::flush(); // flush will dequeue, if it is necessary
+        if (wait)
+            m_outputSignal.wait();
+    }
 }
 
 void Lookahead::destroy()
@@ -123,132 +567,165 @@
         delete curFrame;
     }
 
-    x265_free(m_scratch);
+    X265_FREE(m_scratch);
+
+    delete [] m_tld;
 }
 
+/* The synchronization of slicetypeDecide is managed here.  The findJob() method
+ * polls the occupancy of the input queue. If the queue is
+ * full, it will run slicetypeDecide() and output a mini-gop of frames to the
+ * output queue. If the flush() method has been called (implying no new pictures
+ * will be received) then the input queue is considered full if it has even one
+ * picture left. getDecidedPicture() removes pictures from the output queue and
+ * only blocks as a last resort. It does not start removing pictures until
+ * m_filled is true, which occurs after *more than* the lookahead depth of
+ * pictures have been input so slicetypeDecide() should have started prior to
+ * output pictures being withdrawn. The first slicetypeDecide() will obviously
+ * still require a blocking wait, but after this slicetypeDecide() will maintain
+ * its lead over the encoder (because one picture is added to the input queue
+ * each time one is removed from the output) and decides slice types of pictures
+ * just ahead of when the encoder needs them */
+
 /* Called by API thread */
-void Lookahead::addPicture(Frame *curFrame, int sliceType)
+void Lookahead::addPicture(Frame& curFrame, int sliceType)
 {
-    {
-        ProfileScopeEvent(prelookahead);
-        PicYuv *orig = curFrame->m_fencPic;
-        curFrame->m_lowres.init(orig, curFrame->m_poc, sliceType);
-    }
-
-    m_inputQueueLock.acquire();
-    m_inputQueue.pushBack(*curFrame);
-
-    if (m_inputQueue.size() >= m_param->lookaheadDepth)
-    {
-        if (m_pool)
-        {
-            m_bReady = !m_bBusy;
-            m_inputQueueLock.release();
-            m_pool->pokeIdleThread();
-        }
-        else
-            slicetypeDecide();
-    }
-    else
-        m_inputQueueLock.release();
+    curFrame.m_lowres.sliceType = sliceType; /* only the slice type is recorded here; m_lowres.init() is called later from findJob() */
 
     /* determine if the lookahead is (over) filled enough for frames to begin to
      * be consumed by frame encoders */
-    if (!m_bFilled)
+    if (!m_filled)
     {
         if (!m_param->bframes & !m_param->lookaheadDepth)
-            m_bFilled = true; /* zero-latency */
-        else if (curFrame->m_poc >= m_param->lookaheadDepth + 2 + m_param->bframes)
-            m_bFilled = true; /* full capacity plus mini-gop lag */
+            m_filled = true; /* zero-latency */
+        else if (curFrame.m_poc >= m_param->lookaheadDepth + 2 + m_param->bframes)
+            m_filled = true; /* full capacity plus mini-gop lag */
     }
+
+    m_preLookaheadLock.acquire(); /* taken before m_inputLock, the same nesting order findJob() uses */
+
+    m_inputLock.acquire();
+    m_inputQueue.pushBack(curFrame);
+    m_inputLock.release();
+
+    m_preframes[m_preTotal++] = &curFrame; /* queue the frame for the pre-lookahead pass in findJob() */
+    X265_CHECK(m_preTotal <= X265_LOOKAHEAD_MAX, "prelookahead overflow\n"); /* NOTE(review): this check runs after the store above */
+    
+    m_preLookaheadLock.release();
+
+    if (m_pool)
+        tryWakeOne(); /* presumably wakes a pool worker to call findJob() -- tryWakeOne() is defined outside this view */
 }
 
 /* Called by API thread */
 void Lookahead::flush()
 {
-    m_bFlush = true;
-    m_bFilled = true;
+    /* force slicetypeDecide to run until the input queue is empty */
+    m_fullQueueSize = 1; /* findJob() compares m_inputQueue.size() against this threshold */
+    m_filled = true; /* getDecidedPicture() returns NULL while this is false */
+}
 
-    /* just in case the input queue is never allowed to fill */
-    m_inputQueueLock.acquire();
-    if (m_inputQueue.empty())
+void Lookahead::findJob(int workerThreadID)
+{ /* loops over pending pre-lookahead frames, then runs slicetypeDecide() once none remain and the input queue holds at least m_fullQueueSize frames */
+    Frame* preFrame;
+    bool   doDecide;
+
+    if (!m_isActive) /* cleared by stop() */
+        return;
+
+    int tld = workerThreadID;
+    if (workerThreadID < 0)
+        tld = m_pool ? m_pool->m_numWorkers : 0; /* a negative ID selects the extra TLD that create() allocates after the per-worker ones */
+
+    m_preLookaheadLock.acquire();
+    do
     {
-        m_bFlushed = true;
-        m_inputQueueLock.release();
+        preFrame = NULL;
+        doDecide = false;
+
+        if (m_preTotal > m_preAcquired)
+            preFrame = m_preframes[m_preAcquired++]; /* claim the next unprocessed frame while holding the lock */
+        else
+        {
+            if (m_preTotal == m_preCompleted)
+                m_preAcquired = m_preTotal = m_preCompleted = 0; /* every queued frame has been processed: reset the counters */
+
+            /* the worker thread that performs the last pre-lookahead will generally get to run
+             * slicetypeDecide() */
+            m_inputLock.acquire();
+            if (!m_sliceTypeBusy && !m_preTotal && m_inputQueue.size() >= m_fullQueueSize && m_isActive)
+                 doDecide = m_sliceTypeBusy = true; /* m_sliceTypeBusy keeps other callers from starting a second slicetypeDecide() */
+            m_inputLock.release();
+        }
+        m_preLookaheadLock.release(); /* the work below runs without the lock */
+
+        if (preFrame)
+        {
+#if DETAILED_CU_STATS
+            ScopedElapsedTime filterPerfScope(m_preLookaheadElapsedTime);
+            m_countPreLookahead++;
+#endif
+            ProfileScopeEvent(prelookahead);
+
+            preFrame->m_lowres.init(preFrame->m_fencPic, preFrame->m_poc);
+            if (m_bAdaptiveQuant)
+                m_tld[tld].calcAdaptiveQuantFrame(preFrame, m_param);
+            m_tld[tld].lowresIntraEstimate(preFrame->m_lowres);
+
+            m_preLookaheadLock.acquire(); /* re-acquire for next pass */
+            m_preCompleted++;
+        }
+        else if (doDecide)
+        {
+#if DETAILED_CU_STATS
+            ScopedElapsedTime filterPerfScope(m_slicetypeDecideElapsedTime);
+            m_countSlicetypeDecide++;
+#endif
+            ProfileScopeEvent(slicetypeDecideEV);
+
+            slicetypeDecide();
+
+            m_preLookaheadLock.acquire(); /* re-acquire for next pass */
+            if (m_outputSignalRequired) /* set by stop() or getDecidedPicture() when a thread intends to wait */
+            {
+                m_outputSignal.trigger();
+                m_outputSignalRequired = false;
+            }
+            m_sliceTypeBusy = false;
+        }
+    }
+    while (preFrame || doDecide); /* exits with m_preLookaheadLock released: the last pass took neither branch */
+
+    m_helpWanted = false; /* NOTE(review): m_helpWanted is declared outside this view; it is written here without m_preLookaheadLock held */
+}
+
+/* Called by API thread */
+Frame* Lookahead::getDecidedPicture()
+{ /* pops the next frame from the output queue; returns NULL while m_filled is false */
+    if (m_filled)
+    {
+        m_outputLock.acquire();
+        Frame *out = m_outputQueue.popFront();
+        m_outputLock.release();
+
+        if (out)
+            return out;
+
+        /* process all pending pre-lookahead frames and run slicetypeDecide() if
+         * necessary */
+        findJob(-1); /* -1 makes findJob() use the non-worker TLD */
+
+        m_preLookaheadLock.acquire();
+        bool wait = m_outputSignalRequired = m_sliceTypeBusy || m_preTotal; /* request the signal findJob() triggers after slicetypeDecide() returns */
+        m_preLookaheadLock.release();
+
+        if (wait)
+            m_outputSignal.wait();
+
+        return m_outputQueue.popFront(); /* NOTE(review): unlike the first pop above, this one is not guarded by m_outputLock */
     }
     else
-    {
-        if (m_pool)
-        {
-            m_bReady = !m_bBusy;
-            m_inputQueueLock.release();
-            m_pool->pokeIdleThread();
-        }
-        else
-            slicetypeDecide();
-    }
-}
-
-/* Called by API thread. If the lookahead queue has not yet been filled the
- * first time, it immediately returns NULL.  Else the function blocks until
- * outputs are available and then pops the first frame from the output queue. If
- * flush() has been called and the output queue is empty, NULL is returned. */
-Frame* Lookahead::getDecidedPicture()
-{
-    if (!m_bFilled)
         return NULL;
-
-    m_outputQueueLock.acquire();
-    Frame *fenc = m_outputQueue.popFront();
-    m_outputQueueLock.release();
-
-    if (fenc || m_bFlushed)
-        return fenc;
-
-    do
-    {
-        m_outputAvailable.wait();
-
-        m_outputQueueLock.acquire();
-        fenc = m_outputQueue.popFront();
-        m_outputQueueLock.release();
-    }
-    while (!fenc);
-
-    return fenc;
-}
-
-/* Called by pool worker threads */
-bool Lookahead::findJob(int)
-{
-    if (!m_bReady)
-        return false;
-
-    m_inputQueueLock.acquire();
-    if (!m_bReady)
-    {
-        m_inputQueueLock.release();
-        return false;
-    }
-
-    m_bReady = false;
-    m_bBusy = true;
-
-    do
-    {
-        slicetypeDecide(); // releases input queue lock
-
-        m_inputQueueLock.acquire();
-
-        if (!m_bBusy)
-            break;
-    }
-    while (m_inputQueue.size() >= m_param->lookaheadDepth ||
-           (m_bFlush && m_inputQueue.size()));
-
-    m_bBusy = false;
-    m_inputQueueLock.release();
-    return true;
 }
 
 /* Called by rate-control to calculate the estimated SATD cost for a given
@@ -339,12 +816,6 @@
 /* called by API thread or worker thread with inputQueueLock acquired */
 void Lookahead::slicetypeDecide()
 {
-    ProfileScopeEvent(slicetypeDecideEV);
-#if DETAILED_CU_STATS
-    ScopedElapsedTime filterPerfScope(m_slicetypeDecideElapsedTime);
-    m_countSlicetypeDecide++;
-#endif
-
     Lowres *frames[X265_LOOKAHEAD_MAX];
     Frame *list[X265_LOOKAHEAD_MAX];
     int maxSearch = X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX);
@@ -352,6 +823,7 @@
     memset(frames, 0, sizeof(frames));
     memset(list, 0, sizeof(list));
     {
+        ScopedLock lock(m_inputLock);
         Frame *curFrame = m_inputQueue.first();
         int j;
         for (j = 0; j < m_param->bframes + 2; j++)
@@ -373,11 +845,6 @@
         maxSearch = j;
     }
 
-    m_inputQueueLock.release();
-
-    if (!m_est.m_rows && list[0])
-        m_est.init(m_param, list[0]);
-
     if (m_lastNonB && !m_param->rc.bStatRead &&
         ((m_param->bFrameAdaptive && m_param->bframes) ||
          m_param->rc.cuTree || m_param->scenecutThreshold ||
@@ -399,7 +866,7 @@
         }
 
         /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available.
-           smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it.*/
+         * smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it. */
         else if (frm.sliceType == X265_TYPE_BREF && m_param->bBPyramid && brefs &&
                  m_param->maxNumReferences <= (brefs + 3))
         {
@@ -408,7 +875,7 @@
                      frm.sliceType, m_param->maxNumReferences);
         }
 
-        if ( /*(!param->intraRefresh || frm.frameNum == 0) && */ frm.frameNum - m_lastKeyframe >= m_param->keyframeMax)
+        if (/* (!param->intraRefresh || frm.frameNum == 0) && */ frm.frameNum - m_lastKeyframe >= m_param->keyframeMax)
         {
             if (frm.sliceType == X265_TYPE_AUTO || frm.sliceType == X265_TYPE_I)
                 frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
@@ -484,7 +951,10 @@
         /* estimate new non-B cost */
         p1 = b = bframes + 1;
         p0 = (IS_X265_TYPE_I(frames[bframes + 1]->sliceType)) ? b : 0;
-        m_est.estimateFrameCost(frames, p0, p1, b, 0);
+
+        CostEstimateGroup estGroup(*this, frames);
+
+        estGroup.singleCost(p0, p1, b);
 
         if (bframes)
         {
@@ -497,7 +967,7 @@
                 else
                     p1 = bframes + 1;
 
-                m_est.estimateFrameCost(frames, p0, p1, b, 0);
+                estGroup.singleCost(p0, p1, b);
 
                 if (frames[b]->sliceType == X265_TYPE_BREF)
                     p0 = b;
@@ -505,8 +975,7 @@
         }
     }
 
-    m_inputQueueLock.acquire();
-
+    m_inputLock.acquire();
     /* dequeue all frames from inputQueue that are about to be enqueued
      * in the output queue. The order is important because Frame can
      * only be in one list at a time */
@@ -518,10 +987,9 @@
         pts[i] = curFrame->m_pts;
         maxSearch--;
     }
+    m_inputLock.release();
 
-    m_inputQueueLock.release();
-
-    m_outputQueueLock.acquire();
+    m_outputLock.acquire();
     /* add non-B to output queue */
     int idx = 0;
     list[bframes]->m_reorderedPts = pts[idx++];
@@ -543,18 +1011,19 @@
     /* add B frames to output queue */
     for (int i = 0; i < bframes; i++)
     {
-        /* push all the B frames into output queue except B-ref, which already pushed into output queue*/
+        /* push all the B frames into output queue except B-ref, which was already pushed into the output queue */
         if (list[i]->m_lowres.sliceType != X265_TYPE_BREF)
         {
             list[i]->m_reorderedPts = pts[idx++];
             m_outputQueue.pushBack(*list[i]);
         }
     }
+    m_outputLock.release();
 
     bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth)) && !m_param->rc.bStatRead;
     if (isKeyFrameAnalyse && IS_X265_TYPE_I(m_lastNonB->sliceType))
     {
-        m_inputQueueLock.acquire();
+        m_inputLock.acquire();
         Frame *curFrame = m_inputQueue.first();
         frames[0] = m_lastNonB;
         int j;
@@ -563,14 +1032,11 @@
             frames[j + 1] = &curFrame->m_lowres;
             curFrame = curFrame->m_next;
         }
+        m_inputLock.release();
 
         frames[j + 1] = NULL;
-        m_inputQueueLock.release();
         slicetypeAnalyse(frames, true);
     }
-
-    m_outputQueueLock.release();
-    m_outputAvailable.trigger();
 }
 
 void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe)
@@ -592,6 +1058,7 @@
             int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB;
             frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, p0, curNonB, curNonB);
             frames[nextNonB]->plannedType[idx] = frames[curNonB]->sliceType;
+
             /* Save the nextNonB Cost in each B frame of the current miniGop */
             if (curNonB > miniGopEnd)
             {
@@ -603,13 +1070,15 @@
             }
             idx++;
         }
+
         /* Handle the B-frames: coded order */
         if (m_param->bBPyramid && curNonB - prevNonB > 1)
             nextBRef = (prevNonB + curNonB + 1) / 2;
 
         for (int i = prevNonB + 1; i < curNonB; i++, idx++)
         {
-            int64_t satdCost = 0; int type = X265_TYPE_B;
+            int64_t satdCost = 0;
+            int type = X265_TYPE_B;
             if (nextBRef)
             {
                 if (i == nextBRef)
@@ -649,7 +1118,8 @@
 
 int64_t Lookahead::vbvFrameCost(Lowres **frames, int p0, int p1, int b)
 {
-    int64_t cost = m_est.estimateFrameCost(frames, p0, p1, b, 0);
+    CostEstimateGroup estGroup(*this, frames);
+    int64_t cost = estGroup.singleCost(p0, p1, b);
 
     if (m_param->rc.aqMode)
     {
@@ -658,6 +1128,7 @@
         else
             return frames[b]->costEstAq[b - p0][p1 - b];
     }
+
     return cost;
 }
 
@@ -665,7 +1136,7 @@
 {
     int numFrames, origNumFrames, keyintLimit, framecnt;
     int maxSearch = X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX);
-    int cuCount = NUM_CUS;
+    int cuCount = m_ncu;
     int resetStart;
     bool bIsVbvLookahead = m_param->rc.vbvBufferSize && m_param->lookaheadDepth;
 
@@ -699,6 +1170,41 @@
         return;
     }
 
+    if (m_pool && m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS && numFrames > 1 && frames[1]->frameNum < m_param->lookaheadDepth)
+    {
+        /* pre-calculate all motion searches, using many worker threads */
+        CostEstimateGroup estGroup(*this, frames);
+        for (int b = 2; b < numFrames; b++)
+        {
+            for (int i = 1; i <= m_param->bframes + 1; i++)
+            {
+                if (b >= i && frames[b]->lowresMvs[0][i - 1][0].x == 0x7FFF)
+                    estGroup.add(b - i, b + i < numFrames ? b + i : b, b);
+            }
+        }
+        estGroup.finishBatch();
+
+        if (m_pool->m_numWorkers >= 16)
+        {
+            /* pre-calculate all frame cost estimates, using many worker threads */
+            for (int b = 2; b < numFrames; b++)
+            {
+                for (int i = 1; i <= m_param->bframes + 1; i++)
+                {
+                    if (b < i)
+                        continue;
+
+                    for (int j = 0; j <= m_param->bframes; j++)
+                    {
+                        if (b + j < numFrames && frames[b]->costEst[i][j] < 0)
+                            estGroup.add(b - i, b + j, b);
+                    }
+                }
+            }
+            estGroup.finishBatch();
+        }
+    }
+
     int numBFrames = 0;
     int numAnalyzed = numFrames;
     if (m_param->scenecutThreshold && scenecut(frames, 0, 1, true, origNumFrames, maxSearch))
@@ -716,29 +1222,27 @@
                 char best_paths[X265_BFRAME_MAX + 1][X265_LOOKAHEAD_MAX + 1] = { "", "P" };
                 int best_path_index = numFrames % (X265_BFRAME_MAX + 1);
 
-                /* Perform the frametype analysis. */
+                /* Perform the frame type analysis. */
                 for (int j = 2; j <= numFrames; j++)
-                {
                     slicetypePath(frames, j, best_paths);
-                }
 
                 numBFrames = (int)strspn(best_paths[best_path_index], "B");
 
                 /* Load the results of the analysis into the frame types. */
                 for (int j = 1; j < numFrames; j++)
-                {
                     frames[j]->sliceType = best_paths[best_path_index][j - 1] == 'B' ? X265_TYPE_B : X265_TYPE_P;
-                }
             }
             frames[numFrames]->sliceType = X265_TYPE_P;
         }
         else if (m_param->bFrameAdaptive == X265_B_ADAPT_FAST)
         {
+            CostEstimateGroup estGroup(*this, frames);
+
             int64_t cost1p0, cost2p0, cost1b1, cost2p1;
 
             for (int i = 0; i <= numFrames - 2; )
             {
-                cost2p1 = m_est.estimateFrameCost(frames, i + 0, i + 2, i + 2, 1);
+                cost2p1 = estGroup.singleCost(i + 0, i + 2, i + 2, true);
                 if (frames[i + 2]->intraMbs[2] > cuCount / 2)
                 {
                     frames[i + 1]->sliceType = X265_TYPE_P;
@@ -747,9 +1251,9 @@
                     continue;
                 }
 
-                cost1b1 = m_est.estimateFrameCost(frames, i + 0, i + 2, i + 1, 0);
-                cost1p0 = m_est.estimateFrameCost(frames, i + 0, i + 1, i + 1, 0);
-                cost2p0 = m_est.estimateFrameCost(frames, i + 1, i + 2, i + 2, 0);
+                cost1b1 = estGroup.singleCost(i + 0, i + 2, i + 1);
+                cost1p0 = estGroup.singleCost(i + 0, i + 1, i + 1);
+                cost2p0 = estGroup.singleCost(i + 1, i + 2, i + 2);
 
                 if (cost1p0 + cost2p0 < cost1b1 + cost2p1)
                 {
@@ -767,7 +1271,7 @@
                 for (j = i + 2; j <= X265_MIN(i + m_param->bframes, numFrames - 1); j++)
                 {
                     int64_t pthresh = X265_MAX(INTER_THRESH - P_SENS_BIAS * (j - i - 1), INTER_THRESH / 10);
-                    int64_t pcost = m_est.estimateFrameCost(frames, i + 0, j + 1, j + 1, 1);
+                    int64_t pcost = estGroup.singleCost(i + 0, j + 1, j + 1, true);
                     if (pcost > pthresh * cuCount || frames[j + 1]->intraMbs[j - i + 1] > cuCount / 3)
                         break;
                     frames[j]->sliceType = X265_TYPE_B;
@@ -779,20 +1283,17 @@
             frames[numFrames]->sliceType = X265_TYPE_P;
             numBFrames = 0;
             while (numBFrames < numFrames && frames[numBFrames + 1]->sliceType == X265_TYPE_B)
-            {
                 numBFrames++;
-            }
         }
         else
         {
             numBFrames = X265_MIN(numFrames - 1, m_param->bframes);
             for (int j = 1; j < numFrames; j++)
-            {
                 frames[j]->sliceType = (j % (numBFrames + 1)) ? X265_TYPE_B : X265_TYPE_P;
-            }
 
             frames[numFrames]->sliceType = X265_TYPE_P;
         }
+
         /* Check scenecut on the first minigop. */
         for (int j = 1; j < numBFrames + 1; j++)
         {
@@ -809,9 +1310,7 @@
     else
     {
         for (int j = 1; j <= numFrames; j++)
-        {
             frames[j]->sliceType = X265_TYPE_P;
-        }
 
         resetStart = bKeyframe ? 1 : 2;
     }
@@ -829,11 +1328,9 @@
     if (bIsVbvLookahead)
         vbvLookahead(frames, numFrames, bKeyframe);
 
-    /* Restore frametypes for all frames that haven't actually been decided yet. */
+    /* Restore frame types for all frames that haven't actually been decided yet. */
     for (int j = resetStart; j <= numFrames; j++)
-    {
         frames[j]->sliceType = X265_TYPE_AUTO;
-    }
 }
 
 bool Lookahead::scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames, int maxSearch)
@@ -858,9 +1355,7 @@
             if (!scenecutInternal(frames, p0, cp1, false))
                 /* Any frame in between p0 and cur_p1 cannot be a real scenecut. */
                 for (int i = cp1; i > p0; i--)
-                {
                     frames[i]->bScenecut = false;
-                }
         }
 
         /* Where A-F are scenes: AAAAABBCCDDEEFFFFFF
@@ -886,7 +1381,8 @@
 {
     Lowres *frame = frames[p1];
 
-    m_est.estimateFrameCost(frames, p0, p1, p1, 0);
+    CostEstimateGroup estGroup(*this, frames);
+    estGroup.singleCost(p0, p1, p1);
 
     int64_t icost = frame->costEst[0][0];
     int64_t pcost = frame->costEst[p1 - p0][0];
@@ -915,7 +1411,7 @@
     if (res && bRealScenecut)
     {
         int imb = frame->intraMbs[p1 - p0];
-        int pmb = NUM_CUS - imb;
+        int pmb = m_ncu - imb;
         x265_log(m_param, X265_LOG_DEBUG, "scene cut at %d Icost:%d Pcost:%d ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d)\n",
                  frame->frameNum, icost, pcost, 1. - (double)pcost / icost, bias, gopSize, imb, pmb);
     }
@@ -957,18 +1453,19 @@
     int loc = 1;
     int cur_p = 0;
 
+    CostEstimateGroup estGroup(*this, frames);
+
     path--; /* Since the 1st path element is really the second frame */
     while (path[loc])
     {
         int next_p = loc;
         /* Find the location of the next P-frame. */
         while (path[next_p] != 'P')
-        {
             next_p++;
-        }
 
         /* Add the cost of the P-frame found above */
-        cost += m_est.estimateFrameCost(frames, cur_p, next_p, next_p, 0);
+        cost += estGroup.singleCost(cur_p, next_p, next_p);
+
         /* Early terminate if the cost we have found is larger than the best path cost so far */
         if (cost > threshold)
             break;
@@ -976,23 +1473,18 @@
         if (m_param->bBPyramid && next_p - cur_p > 2)
         {
             int middle = cur_p + (next_p - cur_p) / 2;
-            cost += m_est.estimateFrameCost(frames, cur_p, next_p, middle, 0);
+            cost += estGroup.singleCost(cur_p, next_p, middle);
+
             for (int next_b = loc; next_b < middle && cost < threshold; next_b++)
-            {
-                cost += m_est.estimateFrameCost(frames, cur_p, middle, next_b, 0);
-            }
+                cost += estGroup.singleCost(cur_p, middle, next_b);
 
             for (int next_b = middle + 1; next_b < next_p && cost < threshold; next_b++)
-            {
-                cost += m_est.estimateFrameCost(frames, middle, next_p, next_b, 0);
-            }
+                cost += estGroup.singleCost(middle, next_p, next_b);
         }
         else
         {
             for (int next_b = loc; next_b < next_p && cost < threshold; next_b++)
-            {
-                cost += m_est.estimateFrameCost(frames, cur_p, next_p, next_b, 0);
-            }
+                cost += estGroup.singleCost(cur_p, next_p, next_b);
         }
 
         loc = next_p + 1;
@@ -1018,9 +1510,6 @@
     int i = numframes;
     int cuCount = m_widthInCU * m_heightInCU;
 
-    if (bIntra)
-        m_est.estimateFrameCost(frames, 0, 0, 0, 0);
-
     while (i > 0 && frames[i]->sliceType == X265_TYPE_B)
         i--;
 
@@ -1047,6 +1536,8 @@
         memset(frames[lastnonb]->propagateCost, 0, cuCount * sizeof(uint16_t));
     }
 
+    CostEstimateGroup estGroup(*this, frames);
+
     while (i-- > idx)
     {
         curnonb = i;
@@ -1056,13 +1547,14 @@
         if (curnonb < idx)
             break;
 
-        m_est.estimateFrameCost(frames, curnonb, lastnonb, lastnonb, 0);
+        estGroup.singleCost(curnonb, lastnonb, lastnonb);
+
         memset(frames[curnonb]->propagateCost, 0, cuCount * sizeof(uint16_t));
         bframes = lastnonb - curnonb - 1;
         if (m_param->bBPyramid && bframes > 1)
         {
             int middle = (bframes + 1) / 2 + curnonb;
-            m_est.estimateFrameCost(frames, curnonb, lastnonb, middle, 0);
+            estGroup.singleCost(curnonb, lastnonb, middle);
             memset(frames[middle]->propagateCost, 0, cuCount * sizeof(uint16_t));
             while (i > curnonb)
             {
@@ -1070,7 +1562,7 @@
                 int p1 = i < middle ? middle : lastnonb;
                 if (i != middle)
                 {
-                    m_est.estimateFrameCost(frames, p0, p1, i, 0);
+                    estGroup.singleCost(p0, p1, i);
                     estimateCUPropagate(frames, averageDuration, p0, p1, i, 0);
                 }
                 i--;
@@ -1082,7 +1574,7 @@
         {
             while (i > curnonb)
             {
-                m_est.estimateFrameCost(frames, curnonb, lastnonb, i, 0);
+                estGroup.singleCost(curnonb, lastnonb, i);
                 estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, i, 0);
                 i--;
             }
@@ -1093,7 +1585,7 @@
 
     if (!m_param->lookaheadDepth)
     {
-        m_est.estimateFrameCost(frames, 0, lastnonb, lastnonb, 0);
+        estGroup.singleCost(0, lastnonb, lastnonb);
         estimateCUPropagate(frames, averageDuration, 0, lastnonb, lastnonb, 1);
         std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
     }
@@ -1118,7 +1610,7 @@
     x265_emms();
     double fpsFactor = CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) / CLIP_DURATION(averageDuration);
 
-    /* For non-refferd frames the source costs are always zero, so just memset one row and re-use it. */
+    /* For non-referenced frames the source costs are always zero, so just memset one row and re-use it. */
     if (!referenced)
         memset(frames[b]->propagateCost, 0, m_widthInCU * sizeof(uint16_t));
 
@@ -1132,6 +1624,7 @@
 
         if (referenced)
             propagateCost += m_widthInCU;
+
         for (uint16_t blockx = 0; blockx < m_widthInCU; blockx++, cuIndex++)
         {
             int32_t propagate_amount = m_scratch[blockx];
@@ -1211,8 +1704,8 @@
     if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0)
         weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]);
 
-    /* Allow the strength to be adjusted via qcompress, since the two
-     * concepts are very similar. */
+    /* Allow the strength to be adjusted via qcompress, since the two concepts
+     * are very similar. */
 
     int cuCount = m_widthInCU * m_heightInCU;
     double strength = 5.0 * (1.0 - m_param->rc.qCompress);
@@ -1260,557 +1753,307 @@
     return score;
 }
 
-CostEstimate::CostEstimate(ThreadPool *p)
-    : WaveFront(p)
+
+int64_t CostEstimateGroup::singleCost(int p0, int p1, int b, bool intraPenalty)
 {
-    m_param = NULL;
-    m_curframes = NULL;
-    m_wbuffer[0] = m_wbuffer[1] = m_wbuffer[2] = m_wbuffer[3] = 0;
-    m_rows = NULL;
-    m_paddedLines = m_widthInCU = m_heightInCU = 0;
-    m_bDoSearch[0] = m_bDoSearch[1] = false;
-    m_curb = m_curp0 = m_curp1 = 0;
-    m_bFrameCompleted = false;
+    LookaheadTLD& tld = m_lookahead.m_tld[m_lookahead.m_pool ? m_lookahead.m_pool->m_numWorkers : 0];
+    return estimateFrameCost(tld, p0, p1, b, intraPenalty);
 }
 
-CostEstimate::~CostEstimate()
+void CostEstimateGroup::add(int p0, int p1, int b, bool intraPenalty)
 {
-    for (int i = 0; i < 4; i++)
-        X265_FREE(m_wbuffer[i]);
+    X265_CHECK(m_batchMode || !m_jobTotal, "single CostEstimateGroup instance cannot mix batch modes\n");
+    m_batchMode = true;
 
-    delete[] m_rows;
+    Estimate& e = m_estimates[m_jobTotal++];
+    e.p0 = p0;
+    e.p1 = p1;
+    e.b = b;
+    e.bIntraPenalty = intraPenalty;
+
+    if (m_jobTotal == MAX_BATCH_SIZE)
+        finishBatch();
 }
 
-void CostEstimate::init(x265_param *_param, Frame *curFrame)
+void CostEstimateGroup::finishBatch()
 {
-    m_param = _param;
-    m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-    m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    if (m_lookahead.m_pool)
+        tryBondPeers(*m_lookahead.m_pool, m_jobTotal);
+    processTasks(-1);
+    waitForExit();
+    m_jobTotal = m_jobAcquired = 0;
+}
 
+void CostEstimateGroup::processTasks(int workerThreadID)
+{
+    ThreadPool* pool = m_lookahead.m_pool;
+    int id = workerThreadID;
+    if (workerThreadID < 0)
+        id = pool ? pool->m_numWorkers : 0;
+    LookaheadTLD& tld = m_lookahead.m_tld[id];
+
+    m_lock.acquire();
+    while (m_jobAcquired < m_jobTotal)
+    {
+        int i = m_jobAcquired++;
+        m_lock.release();
+
+        if (m_batchMode)
+        {
 #if DETAILED_CU_STATS
-    m_processRowElapsedTime = 0;
-    m_countProcessRow = 0;
+            ScopedElapsedTime filterPerfScope(tld.batchElapsedTime);
+            tld.countBatches++;
 #endif
+            ProfileScopeEvent(estCostSingle);
+            Estimate& e = m_estimates[i];
 
-    m_rows = new EstimateRow[m_heightInCU];
-    for (int i = 0; i < m_heightInCU; i++)
-    {
-        m_rows[i].m_widthInCU = m_widthInCU;
-        m_rows[i].m_heightInCU = m_heightInCU;
-        m_rows[i].m_param = m_param;
-    }
+            estimateFrameCost(tld, e.p0, e.p1, e.b, e.bIntraPenalty);
+        }
+        else
+        {
+#if DETAILED_CU_STATS
+            ScopedElapsedTime filterPerfScope(tld.coopSliceElapsedTime);
+            tld.countCoopSlices++;
+#endif
+            ProfileScopeEvent(estCostCoop);
+            X265_CHECK(i < MAX_COOP_SLICES, "impossible number of coop slices\n");
 
-    if (WaveFront::init(m_heightInCU))
-        WaveFront::enableAllRows();
-    else
-        m_pool = NULL;
+            int firstY = m_lookahead.m_numRowsPerSlice * i;
+            int lastY = (i == m_jobTotal - 1) ? m_lookahead.m_heightInCU - 1 : m_lookahead.m_numRowsPerSlice * (i + 1) - 1;
 
-    if (m_param->bEnableWeightedPred)
-    {
-        PicYuv *orig = curFrame->m_fencPic;
-        m_paddedLines = curFrame->m_lowres.lines + 2 * orig->m_lumaMarginY;
-        intptr_t padoffset = curFrame->m_lowres.lumaStride * orig->m_lumaMarginY + orig->m_lumaMarginX;
+            bool lastRow = true;
+            for (int cuY = lastY; cuY >= firstY; cuY--)
+            {
+                m_frames[m_coop.b]->rowSatds[m_coop.b - m_coop.p0][m_coop.p1 - m_coop.b][cuY] = 0;
 
-        /* allocate weighted lowres buffers */
-        for (int i = 0; i < 4; i++)
-        {
-            m_wbuffer[i] = X265_MALLOC(pixel, curFrame->m_lowres.lumaStride * m_paddedLines);
-            m_weightedRef.lowresPlane[i] = m_wbuffer[i] + padoffset;
+                for (int cuX = m_lookahead.m_widthInCU - 1; cuX >= 0; cuX--)
+                    estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1, m_coop.b, m_coop.bDoSearch, lastRow, i);
+
+                lastRow = false;
+            }
         }
 
-        m_weightedRef.fpelPlane[0] = m_weightedRef.lowresPlane[0];
-        m_weightedRef.lumaStride = curFrame->m_lowres.lumaStride;
-        m_weightedRef.isLowres = true;
-        m_weightedRef.isWeighted = false;
+        m_lock.acquire();
     }
+    m_lock.release();
 }
 
-int64_t CostEstimate::estimateFrameCost(Lowres **frames, int p0, int p1, int b, bool bIntraPenalty)
+int64_t CostEstimateGroup::estimateFrameCost(LookaheadTLD& tld, int p0, int p1, int b, bool bIntraPenalty)
 {
-    int64_t score = 0;
-    Lowres *fenc = frames[b];
+    Lowres*     fenc  = m_frames[b];
+    x265_param* param = m_lookahead.m_param;
+    int64_t     score = 0;
 
     if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 - b][0] != -1)
         score = fenc->costEst[b - p0][p1 - b];
     else
     {
-        m_weightedRef.isWeighted = false;
-        if (m_param->bEnableWeightedPred && b == p1 && b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF)
-        {
-            if (!fenc->bIntraCalculated)
-                estimateFrameCost(frames, b, b, b, 0);
-            weightsAnalyse(frames, b, p0);
-        }
+        X265_CHECK(p0 != b, "I frame estimates should always be pre-calculated\n");
 
-        /* For each list, check to see whether we have lowres motion-searched this reference */
-        m_bDoSearch[0] = b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF;
-        m_bDoSearch[1] = b != p1 && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFF;
+        bool bDoSearch[2];
+        bDoSearch[0] = p0 < b && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF;
+        bDoSearch[1] = p1 > b && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFF;
 
-        if (m_bDoSearch[0]) fenc->lowresMvs[0][b - p0 - 1][0].x = 0;
-        if (m_bDoSearch[1]) fenc->lowresMvs[1][p1 - b - 1][0].x = 0;
+        tld.weightedRef.isWeighted = false;
+        if (param->bEnableWeightedPred && bDoSearch[0])
+            tld.weightsAnalyse(*m_frames[b], *m_frames[p0]);
 
-        m_curb = b;
-        m_curp0 = p0;
-        m_curp1 = p1;
-        m_curframes = frames;
         fenc->costEst[b - p0][p1 - b] = 0;
         fenc->costEstAq[b - p0][p1 - b] = 0;
 
-        for (int i = 0; i < m_heightInCU; i++)
+        ThreadPool* pool = m_lookahead.m_pool;
+        if (!m_batchMode && pool && pool->m_numWorkers > 2 && ((p1 > b) || bDoSearch[0] || bDoSearch[1]))
         {
-            m_rows[i].init();
-            if (!fenc->bIntraCalculated)
-                fenc->rowSatds[0][0][i] = 0;
-            fenc->rowSatds[b - p0][p1 - b][i] = 0;
-#if DETAILED_CU_STATS
-            m_rows[i].m_processRowElapsedTime = 0;
-            m_rows[i].m_countProcessRow = 0;
-#endif
-        }
+            /* Use cooperative mode if a thread pool is available and the cost estimate is
+             * going to need motion searches or bidir measurements */
 
-        m_bFrameCompleted = false;
+            memset(&m_slice, 0, sizeof(Slice) * m_lookahead.m_numCoopSlices);
 
-        if (m_pool)
-        {
-            WaveFront::enqueue();
+            m_lock.acquire();
+            X265_CHECK(!m_batchMode, "single CostEstimateGroup instance cannot mix batch modes\n");
+            m_coop.p0 = p0;
+            m_coop.p1 = p1;
+            m_coop.b = b;
+            m_coop.bDoSearch[0] = bDoSearch[0];
+            m_coop.bDoSearch[1] = bDoSearch[1];
+            m_jobTotal = m_lookahead.m_numCoopSlices;
+            m_jobAcquired = 0;
+            m_lock.release();
 
-            // enableAllRows must be already called
-            enqueueRow(0);
-            while (!m_bFrameCompleted)
-                WaveFront::findJob(-1);
+            tryBondPeers(*m_lookahead.m_pool, m_jobTotal);
 
-            WaveFront::dequeue();
+            processTasks(-1);
+
+            waitForExit();
+
+            for (int i = 0; i < m_lookahead.m_numCoopSlices; i++)
+            {
+                fenc->costEst[b - p0][p1 - b] += m_slice[i].costEst;
+                fenc->costEstAq[b - p0][p1 - b] += m_slice[i].costEstAq;
+                fenc->intraMbs[b - p0] += m_slice[i].intraMbs;
+            }
         }
         else
         {
-            for (int row = 0; row < m_heightInCU; row++)
-                processRow(row, -1);
+            bool lastRow = true;
+            for (int cuY = m_lookahead.m_heightInCU - 1; cuY >= 0; cuY--)
+            {
+                fenc->rowSatds[b - p0][p1 - b][cuY] = 0;
 
-            x265_emms();
+                for (int cuX = m_lookahead.m_widthInCU - 1; cuX >= 0; cuX--)
+                    estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1);
+
+                lastRow = false;
+            }
         }
 
-        // Accumulate cost from each row
-        for (int row = 0; row < m_heightInCU; row++)
-        {
-#if DETAILED_CU_STATS
-            m_processRowElapsedTime += m_rows[row].m_processRowElapsedTime;
-            m_countProcessRow += m_rows[row].m_countProcessRow;
-#endif
-            score += m_rows[row].m_costEst;
-            fenc->costEst[0][0] += m_rows[row].m_costIntra;
-            if (m_param->rc.aqMode)
-            {
-                fenc->costEstAq[0][0] += m_rows[row].m_costIntraAq;
-                fenc->costEstAq[b - p0][p1 - b] += m_rows[row].m_costEstAq;
-            }
-            fenc->intraMbs[b - p0] += m_rows[row].m_intraMbs;
-        }
-
-        fenc->bIntraCalculated = true;
+        score = fenc->costEst[b - p0][p1 - b];
 
         if (b != p1)
-            score = (uint64_t)score * 100 / (130 + m_param->bFrameBias);
-        if (b != p0 || b != p1) //Not Intra cost
-            fenc->costEst[b - p0][p1 - b] = score;
+            score = score * 100 / (130 + param->bFrameBias);
+
+        fenc->costEst[b - p0][p1 - b] = score;
     }
 
     if (bIntraPenalty)
-    {
         // arbitrary penalty for I-blocks after B-frames
-        int ncu = NUM_CUS;
-        score += (uint64_t)score * fenc->intraMbs[b - p0] / (ncu * 8);
-    }
+        score += score * fenc->intraMbs[b - p0] / (tld.ncu * 8);
+
     return score;
 }
 
-uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightParam *wp)
+void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice)
 {
-    Lowres *fenc = frames[b];
-    Lowres *ref  = frames[p0];
-    pixel *src = ref->fpelPlane[0];
-    intptr_t stride = fenc->lumaStride;
+    Lowres *fref0 = m_frames[p0];
+    Lowres *fref1 = m_frames[p1];
+    Lowres *fenc  = m_frames[b];
 
-    if (wp)
-    {
-        int offset = wp->inputOffset << (X265_DEPTH - 8);
-        int scale = wp->inputWeight;
-        int denom = wp->log2WeightDenom;
-        int round = denom ? 1 << (denom - 1) : 0;
-        int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
-        int widthHeight = (int)stride;
+    ReferencePlanes *wfref0 = tld.weightedRef.isWeighted ? &tld.weightedRef : fref0;
 
-        primitives.weight_pp(ref->buffer[0], m_wbuffer[0], stride, widthHeight, m_paddedLines,
-                             scale, round << correction, denom + correction, offset);
-        src = m_weightedRef.fpelPlane[0];
-    }
+    const int widthInCU = m_lookahead.m_widthInCU;
+    const int heightInCU = m_lookahead.m_heightInCU;
+    const int bBidir = (b < p1);
+    const int cuXY = cuX + cuY * widthInCU;
+    const int cuSize = X265_LOWRES_CU_SIZE;
+    const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * fenc->lumaStride;
 
-    uint32_t cost = 0;
-    intptr_t pixoff = 0;
-    int mb = 0;
-
-    for (int y = 0; y < fenc->lines; y += 8, pixoff = y * stride)
-    {
-        for (int x = 0; x < fenc->width; x += 8, mb++, pixoff += 8)
-        {
-            int satd = primitives.pu[LUMA_8x8].satd(src + pixoff, stride, fenc->fpelPlane[0] + pixoff, stride);
-            cost += X265_MIN(satd, fenc->intraCost[mb]);
-        }
-    }
-
-    return cost;
-}
-
-void CostEstimate::weightsAnalyse(Lowres **frames, int b, int p0)
-{
-    static const float epsilon = 1.f / 128.f;
-    Lowres *fenc, *ref;
-
-    fenc = frames[b];
-    ref  = frames[p0];
-    int deltaIndex = fenc->frameNum - ref->frameNum;
-
-    /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
-    float guessScale, fencMean, refMean;
-    x265_emms();
-    if (fenc->wp_ssd[0] && ref->wp_ssd[0])
-        guessScale = sqrtf((float)fenc->wp_ssd[0] / ref->wp_ssd[0]);
-    else
-        guessScale = 1.0f;
-    fencMean = (float)fenc->wp_sum[0] / (fenc->lines * fenc->width) / (1 << (X265_DEPTH - 8));
-    refMean  = (float)ref->wp_sum[0] / (fenc->lines * fenc->width) / (1 << (X265_DEPTH - 8));
-
-    /* Early termination */
-    if (fabsf(refMean - fencMean) < 0.5f && fabsf(1.f - guessScale) < epsilon)
-        return;
-
-    int minoff = 0, minscale, mindenom;
-    unsigned int minscore = 0, origscore = 1;
-    int found = 0;
-
-    m_w.setFromWeightAndOffset((int)(guessScale * 128 + 0.5f), 0, 7, true);
-    mindenom = m_w.log2WeightDenom;
-    minscale = m_w.inputWeight;
-
-    origscore = minscore = weightCostLuma(frames, b, p0, NULL);
-
-    if (!minscore)
-        return;
-
-    unsigned int s = 0;
-    int curScale = minscale;
-    int curOffset = (int)(fencMean - refMean * curScale / (1 << mindenom) + 0.5f);
-    if (curOffset < -128 || curOffset > 127)
-    {
-        /* Rescale considering the constraints on curOffset. We do it in this order
-         * because scale has a much wider range than offset (because of denom), so
-         * it should almost never need to be clamped. */
-        curOffset = x265_clip3(-128, 127, curOffset);
-        curScale = (int)((1 << mindenom) * (fencMean - curOffset) / refMean + 0.5f);
-        curScale = x265_clip3(0, 127, curScale);
-    }
-    SET_WEIGHT(m_w, 1, curScale, mindenom, curOffset);
-    s = weightCostLuma(frames, b, p0, &m_w);
-    COPY4_IF_LT(minscore, s, minscale, curScale, minoff, curOffset, found, 1);
-
-    /* Use a smaller denominator if possible */
-    while (mindenom > 0 && !(minscale & 1))
-    {
-        mindenom--;
-        minscale >>= 1;
-    }
-
-    if (!found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f)
-        return;
-    else
-    {
-        SET_WEIGHT(m_w, 1, minscale, mindenom, minoff);
-        // set weighted delta cost
-        fenc->weightedCostDelta[deltaIndex] = minscore / origscore;
-
-        int offset = m_w.inputOffset << (X265_DEPTH - 8);
-        int scale = m_w.inputWeight;
-        int denom = m_w.log2WeightDenom;
-        int round = denom ? 1 << (denom - 1) : 0;
-        int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
-        intptr_t stride = ref->lumaStride;
-        int widthHeight = (int)stride;
-
-        for (int i = 0; i < 4; i++)
-            primitives.weight_pp(ref->buffer[i], m_wbuffer[i], stride, widthHeight, m_paddedLines,
-                                 scale, round << correction, denom + correction, offset);
-
-        m_weightedRef.isWeighted = true;
-    }
-}
-
-void CostEstimate::processRow(int row, int /*threadId*/)
-{
-    ProfileScopeEvent(costEstimateRow);
-#if DETAILED_CU_STATS
-    ScopedElapsedTime filterPerfScope(m_processRowElapsedTime);
-    m_countProcessRow++;
-#endif
-
-    int realrow = m_heightInCU - 1 - row;
-    Lowres **frames = m_curframes;
-    ReferencePlanes *wfref0 = m_weightedRef.isWeighted ? &m_weightedRef : frames[m_curp0];
-
-    /* Lowres lookahead goes backwards because the MVs are used as
-     * predictors in the main encode.  This considerably improves MV
-     * prediction overall. */
-    for (int i = m_widthInCU - 1 - m_rows[row].m_completed; i >= 0; i--)
-    {
-        // TODO: use lowres MVs as motion candidates in full-res search
-        m_rows[row].estimateCUCost(frames, wfref0, i, realrow, m_curp0, m_curp1, m_curb, m_bDoSearch);
-        m_rows[row].m_completed++;
-
-        if (m_rows[row].m_completed >= 2 && row < m_heightInCU - 1)
-        {
-            ScopedLock below(m_rows[row + 1].m_lock);
-            if (m_rows[row + 1].m_active == false &&
-                m_rows[row + 1].m_completed + 2 <= m_rows[row].m_completed)
-            {
-                m_rows[row + 1].m_active = true;
-                enqueueRow(row + 1);
-            }
-        }
-
-        ScopedLock self(m_rows[row].m_lock);
-        if (row > 0 && (int32_t)m_rows[row].m_completed < m_widthInCU - 1 &&
-            m_rows[row - 1].m_completed < m_rows[row].m_completed + 2)
-        {
-            m_rows[row].m_active = false;
-            return;
-        }
-    }
-
-    if (row == m_heightInCU - 1)
-        m_bFrameCompleted = true;
-}
-
-void EstimateRow::init()
-{
-    m_costEst = 0;
-    m_costEstAq = 0;
-    m_costIntra = 0;
-    m_costIntraAq = 0;
-    m_intraMbs = 0;
-    m_active = false;
-    m_completed = 0;
-}
-
-void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2])
-{
-    Lowres *fref1 = frames[p1];
-    Lowres *fenc  = frames[b];
-
-    const int bBidir = (b < p1);
-    const int cuXY = cux + cuy * m_widthInCU;
-    const int cuSize = X265_LOWRES_CU_SIZE;
-    const intptr_t pelOffset = cuSize * cux + cuSize * cuy * fenc->lumaStride;
-
-    // should this CU's cost contribute to the frame cost?
-    const bool bFrameScoreCU = (cux > 0 && cux < m_widthInCU - 1 &&
-                                cuy > 0 && cuy < m_heightInCU - 1) || m_widthInCU <= 2 || m_heightInCU <= 2;
-
-    m_me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize);
+    if (bBidir || bDoSearch[0] || bDoSearch[1])
+        tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize);
 
     /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
     int lowresPenalty = 4;
 
-    MV(*fenc_mvs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY],
-                         &fenc->lowresMvs[1][p1 - b - 1][cuXY] };
-    int(*fenc_costs[2]) = { &fenc->lowresMvCosts[0][b - p0 - 1][cuXY],
-                            &fenc->lowresMvCosts[1][p1 - b - 1][cuXY] };
+    MV(*fencMVs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY],
+                        &fenc->lowresMvs[1][p1 - b - 1][cuXY] };
+    int(*fencCosts[2]) = { &fenc->lowresMvCosts[0][b - p0 - 1][cuXY],
+                           &fenc->lowresMvCosts[1][p1 - b - 1][cuXY] };
 
     MV mvmin, mvmax;
-    int bcost = m_me.COST_MAX;
+    int bcost = tld.me.COST_MAX;
     int listused = 0;
 
     // establish search bounds that don't cross extended frame boundaries
-    mvmin.x = (int16_t)(-cux * cuSize - 8);
-    mvmin.y = (int16_t)(-cuy * cuSize - 8);
-    mvmax.x = (int16_t)((m_widthInCU - cux - 1) * cuSize + 8);
-    mvmax.y = (int16_t)((m_heightInCU - cuy - 1) * cuSize + 8);
+    mvmin.x = (int16_t)(-cuX * cuSize - 8);
+    mvmin.y = (int16_t)(-cuY * cuSize - 8);
+    mvmax.x = (int16_t)((widthInCU - cuX - 1) * cuSize + 8);
+    mvmax.y = (int16_t)((heightInCU - cuY - 1) * cuSize + 8);
 
-    if (p0 != p1)
+    for (int i = 0; i < 1 + bBidir; i++)
     {
-        for (int i = 0; i < 1 + bBidir; i++)
+        if (!bDoSearch[i])
         {
-            if (!bDoSearch[i])
-            {
-                /* Use previously calculated cost */
-                COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1);
-                continue;
-            }
-            int numc = 0;
-            MV mvc[4], mvp;
-            MV *fenc_mv = fenc_mvs[i];
+            COPY2_IF_LT(bcost, *fencCosts[i], listused, i + 1);
+            continue;
+        }
 
-            /* Reverse-order MV prediction. */
-            mvc[0] = 0;
-            mvc[2] = 0;
+        int numc = 0;
+        MV mvc[4], mvp;
+        MV *fencMV = fencMVs[i];
+
+        /* Reverse-order MV prediction */
+        mvc[0] = 0;
+        mvc[2] = 0;
 #define MVC(mv) mvc[numc++] = mv;
-            if (cux < m_widthInCU - 1)
-                MVC(fenc_mv[1]);
-            if (cuy < m_heightInCU - 1)
-            {
-                MVC(fenc_mv[m_widthInCU]);
-                if (cux > 0)
-                    MVC(fenc_mv[m_widthInCU - 1]);
-                if (cux < m_widthInCU - 1)
-                    MVC(fenc_mv[m_widthInCU + 1]);
-            }
+        if (cuX < widthInCU - 1)
+            MVC(fencMV[1]);
+        if (!lastRow)
+        {
+            MVC(fencMV[widthInCU]);
+            if (cuX > 0)
+                MVC(fencMV[widthInCU - 1]);
+            if (cuX < widthInCU - 1)
+                MVC(fencMV[widthInCU + 1]);
+        }
 #undef MVC
-            if (numc <= 1)
-                mvp = mvc[0];
-            else
-            {
-                median_mv(mvp, mvc[0], mvc[1], mvc[2]);
-            }
+        if (numc <= 1)
+            mvp = mvc[0];
+        else
+            median_mv(mvp, mvc[0], mvc[1], mvc[2]);
 
-            *fenc_costs[i] = m_me.motionEstimate(i ? fref1 : wfref0, mvmin, mvmax, mvp, numc, mvc, m_merange, *fenc_mvs[i]);
-            COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1);
-        }
-        if (bBidir)
-        {
-            ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
-            ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
-            intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
-            pixel *src0 = wfref0->lowresMC(pelOffset, *fenc_mvs[0], subpelbuf0, stride0);
-            pixel *src1 = fref1->lowresMC(pelOffset, *fenc_mvs[1], subpelbuf1, stride1);
-
-            ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
-            primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
-            int bicost = primitives.pu[LUMA_8x8].satd(fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE);
-            COPY2_IF_LT(bcost, bicost, listused, 3);
-
-            // Try 0,0 candidates
-            src0 = wfref0->lowresPlane[0] + pelOffset;
-            src1 = fref1->lowresPlane[0] + pelOffset;
-            primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, wfref0->lumaStride, src1, fref1->lumaStride, 32);
-            bicost = primitives.pu[LUMA_8x8].satd(fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE);
-            COPY2_IF_LT(bcost, bicost, listused, 3);
-        }
+        *fencCosts[i] = tld.me.motionEstimate(i ? fref1 : wfref0, mvmin, mvmax, mvp, numc, mvc, s_merange, *fencMVs[i]);
+        COPY2_IF_LT(bcost, *fencCosts[i], listused, i + 1);
     }
 
-    if (!fenc->bIntraCalculated)
+    if (bBidir) /* B, also consider bidir */
     {
-        ALIGN_VAR_32(pixel, prediction[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
-        pixel neighbours[2][X265_LOWRES_CU_SIZE * 4 + 1];
-        const int sizeIdx = X265_LOWRES_CU_BITS - 2; // partition size
-        const int cuSize2 = cuSize << 1;
+        /* NOTE: the wfref0 (weightp) is not used for BIDIR */
 
-        pixel *pixCur = fenc->lowresPlane[0] + pelOffset;
+        /* avg(l0-mv, l1-mv) candidate */
+        ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
+        ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
+        intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
+        pixel *src0 = fref0->lowresMC(pelOffset, *fencMVs[0], subpelbuf0, stride0);
+        pixel *src1 = fref1->lowresMC(pelOffset, *fencMVs[1], subpelbuf1, stride1);
 
-        // Copy Above
-        memcpy(neighbours[0], pixCur - 1 - fenc->lumaStride, (cuSize + 1) * sizeof(pixel));
+        ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
+        primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
+        int bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
+        COPY2_IF_LT(bcost, bicost, listused, 3);
 
-        // Copy Left
-        for (int i = 1; i < cuSize + 1; i++)
-            neighbours[0][i + cuSize2] = pixCur[-1 - fenc->lumaStride + i * fenc->lumaStride];
+        /* coloc candidate */
+        src0 = fref0->lowresPlane[0] + pelOffset;
+        src1 = fref1->lowresPlane[0] + pelOffset;
+        primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, fref0->lumaStride, src1, fref1->lumaStride, 32);
+        bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
+        COPY2_IF_LT(bcost, bicost, listused, 3);
 
-        for (int i = 0; i < cuSize; i++)
-        {
-            // Copy above-last pixel
-            neighbours[0][i + cuSize + 1] = neighbours[0][cuSize]; //neighbours[0][i + 9] = neighbours[0][8]
-            // Copy left-last pixel
-            neighbours[0][i + cuSize2 + cuSize + 1] = neighbours[0][cuSize2 + cuSize]; //neighbours[0][i + 25] = neighbours[0][24]
-        }
+        bcost += lowresPenalty;
+    }
+    else /* P, also consider intra */
+    {
+        bcost += lowresPenalty;
 
-        // Filter neighbour pixels with [1-2-1]
-        neighbours[1][0]  = neighbours[0][0];  // Copy top-left pixel 
-        neighbours[1][cuSize2] = neighbours[0][cuSize2]; //Copy top-right pixel
-        neighbours[1][cuSize2 << 1] = neighbours[0][cuSize2 << 1]; // Bottom-left pixel
-
-        neighbours[1][1]           = (neighbours[0][0] + (neighbours[0][1] << 1)           + neighbours[0][2] + 2)               >> 2;
-        neighbours[1][cuSize2 + 1] = (neighbours[0][0] + (neighbours[0][cuSize2 + 1] << 1) + neighbours[0][cuSize2 + 1 + 1] + 2) >> 2;
-        for (int i = 2; i < cuSize2; i++)
-        {
-            neighbours[1][i]           = (neighbours[0][i - 1]      + (neighbours[0][i] << 1)      + neighbours[0][i + 1]      + 2) >> 2;
-            neighbours[1][cuSize2 + i] = (neighbours[0][cuSize2 + i - 1] + (neighbours[0][cuSize2 + i] << 1) + neighbours[0][cuSize2 + i + 1] + 2) >> 2;
-        }
-
-        int icost = m_me.COST_MAX, ilowmode;
-        primitives.cu[sizeIdx].intra_pred[DC_IDX](prediction, cuSize, neighbours[0], 0, (cuSize <= 16));
-        int cost = m_me.bufSATD(prediction, cuSize);
-        COPY2_IF_LT(icost, cost, ilowmode, DC_IDX);
-
-        pixel *planar = (cuSize >= 8) ? neighbours[1] : neighbours[0];
-        primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](prediction, cuSize, planar, 0, 0);
-        cost = m_me.bufSATD(prediction, cuSize);
-        COPY2_IF_LT(icost, cost, ilowmode, PLANAR_IDX);
-
-        uint32_t mode, lowmode = 4;
-        int acost = m_me.COST_MAX, filter;
-        for (mode = 5; mode < 35; mode += 5)
-        {
-            filter = !!(g_intraFilterFlags[mode] & cuSize);
-            primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16);
-            cost = m_me.bufSATD(prediction, cuSize);
-            COPY2_IF_LT(acost, cost, lowmode, mode);
-        }
-        for (uint32_t dist = 2; dist >= 1; dist--)
-        {
-            int minusmode = lowmode - dist;
-            int plusmode = lowmode + dist;
-
-            mode = minusmode;
-            filter = !!(g_intraFilterFlags[mode] & cuSize);
-            primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16);
-            cost = m_me.bufSATD(prediction, cuSize);
-            COPY2_IF_LT(acost, cost, lowmode, mode);
-
-            mode = plusmode;
-            filter = !!(g_intraFilterFlags[mode] & cuSize);
-            primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16);
-            cost = m_me.bufSATD(prediction, cuSize);
-            COPY2_IF_LT(acost, cost, lowmode, mode);
-        }
-        COPY2_IF_LT(icost, acost, ilowmode, lowmode);
-
-        const int intraPenalty = 5 * m_lookAheadLambda;
-        icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */
-        fenc->intraCost[cuXY] = icost;
-        fenc->intraMode[cuXY] = (uint8_t)ilowmode;
-
-        int icostAq = icost;
-        if (bFrameScoreCU)
-        {
-            m_costIntra += icost;
-            if (fenc->invQscaleFactor)
-            {
-                icostAq = (icost * fenc->invQscaleFactor[cuXY] + 128) >> 8;
-                m_costIntraAq += icostAq;
-            }
-        }
-        fenc->rowSatds[0][0][cuy] += icostAq;
-    }
-    bcost += lowresPenalty;
-    if (!bBidir)
-    {
         if (fenc->intraCost[cuXY] < bcost)
         {
-            if (bFrameScoreCU) m_intraMbs++;
             bcost = fenc->intraCost[cuXY];
             listused = 0;
         }
     }
 
-    /* For I frames these costs were accumulated earlier */
-    if (p0 != p1)
+    /* do not include edge blocks in the frame cost estimates; they are not very accurate */
+    const bool bFrameScoreCU = (cuX > 0 && cuX < widthInCU - 1 &&
+                                cuY > 0 && cuY < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
+
+    int bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * fenc->invQscaleFactor[cuXY] + 128) >> 8) : bcost;
+
+    if (bFrameScoreCU)
     {
-        int bcostAq = bcost;
-        if (bFrameScoreCU)
+        if (slice < 0)
         {
-            m_costEst += bcost;
-            if (fenc->invQscaleFactor)
-            {
-                bcostAq = (bcost * fenc->invQscaleFactor[cuXY] + 128) >> 8;
-                m_costEstAq += bcostAq;
-            }
+            fenc->costEst[b - p0][p1 - b] += bcost;
+            fenc->costEstAq[b - p0][p1 - b] += bcostAq;
+            if (!listused)
+                fenc->intraMbs[b - p0]++;
         }
-        fenc->rowSatds[b - p0][p1 - b][cuy] += bcostAq;
+        else
+        {
+            m_slice[slice].costEst += bcost;
+            m_slice[slice].costEstAq += bcostAq;
+            if (!listused)
+                m_slice[slice].intraMbs++;
+        }
     }
+
+    fenc->rowSatds[b - p0][p1 - b][cuY] += bcostAq;
     fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT));
 }
diff -r bc969aedef71 -r 8faf10074f9f source/encoder/slicetype.h
--- a/source/encoder/slicetype.h	Thu Feb 19 09:39:48 2015 -0600
+++ b/source/encoder/slicetype.h	Thu Feb 19 09:42:06 2015 -0600
@@ -28,148 +28,133 @@
 #include "slice.h"
 #include "motion.h"
 #include "piclist.h"
-#include "wavefront.h"
+#include "threadpool.h"
 
 namespace x265 {
 // private namespace
 
 struct Lowres;
 class Frame;
+class Lookahead;
 
 #define LOWRES_COST_MASK  ((1 << 14) - 1)
 #define LOWRES_COST_SHIFT 14
 
-class EstimateRow
+/* Thread local data for lookahead tasks */
+struct LookaheadTLD
 {
-public:
-    x265_param*         m_param;
-    MotionEstimate      m_me;
-    Lock                m_lock;
-
-    volatile uint32_t   m_completed;      // Number of CUs in this row for which cost estimation is completed
-    volatile bool       m_active;
-
-    uint64_t            m_costEst;        // Estimated cost for all CUs in a row
-    uint64_t            m_costEstAq;      // Estimated weight Aq cost for all CUs in a row
-    uint64_t            m_costIntraAq;    // Estimated weighted Aq Intra cost for all CUs in a row
-    int                 m_intraMbs;       // Number of Intra CUs
-    int                 m_costIntra;      // Estimated Intra cost for all CUs in a row
-
-    int                 m_merange;
-    int                 m_lookAheadLambda;
-
-    int                 m_widthInCU;
-    int                 m_heightInCU;
+    MotionEstimate  me;
+    ReferencePlanes weightedRef;
+    pixel*          wbuffer[4];
+    int             widthInCU;
+    int             heightInCU;
+    int             ncu;
+    int             paddedLines;
 
 #if DETAILED_CU_STATS
-    int64_t             m_processRowElapsedTime;
-    uint64_t            m_countProcessRow;
+    int64_t         batchElapsedTime;
+    int64_t         coopSliceElapsedTime;
+    uint64_t        countBatches;
+    uint64_t        countCoopSlices;
 #endif
 
-    EstimateRow()
+    LookaheadTLD()
     {
-        m_me.setQP(X265_LOOKAHEAD_QP);
-        m_me.init(X265_HEX_SEARCH, 1, X265_CSP_I400);
-        m_merange = 16;
-        m_lookAheadLambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
+        me.setQP(X265_LOOKAHEAD_QP);
+        me.init(X265_HEX_SEARCH, 1, X265_CSP_I400);
+        for (int i = 0; i < 4; i++)
+            wbuffer[i] = NULL;
+        widthInCU = heightInCU = ncu = paddedLines = 0;
+
+#if DETAILED_CU_STATS
+        batchElapsedTime = 0;
+        coopSliceElapsedTime = 0;
+        countBatches = 0;
+        countCoopSlices = 0;
+#endif
     }
 
-    void init();
+    void init(int w, int h, int n)
+    {
+        widthInCU = w;
+        heightInCU = h;
+        ncu = n;
+    }
 
-    void estimateCUCost(Lowres * *frames, ReferencePlanes * wfref0, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2]);
-};
+    ~LookaheadTLD() { X265_FREE(wbuffer[0]); }
 
-/* CostEstimate manages the cost estimation of a single frame, ie:
- * estimateFrameCost() and everything below it in the call graph */
-class CostEstimate : public WaveFront
-{
-public:
-    CostEstimate(ThreadPool *p);
-    ~CostEstimate();
-    void init(x265_param *, Frame *);
+    void calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param);
+    void lowresIntraEstimate(Lowres& fenc);
 
-    x265_param      *m_param;
-    EstimateRow     *m_rows;
-    pixel           *m_wbuffer[4];
-    Lowres         **m_curframes;
-
-    ReferencePlanes  m_weightedRef;
-    WeightParam      m_w;
-
-    int              m_paddedLines;     // number of lines in padded frame
-    int              m_widthInCU;       // width of lowres frame in downscale CUs
-    int              m_heightInCU;      // height of lowres frame in downscale CUs
-
-    bool             m_bDoSearch[2];
-    volatile bool    m_bFrameCompleted;
-    int              m_curb, m_curp0, m_curp1;
-
-#if DETAILED_CU_STATS
-    int64_t          m_processRowElapsedTime;
-    uint64_t         m_countProcessRow;
-#endif
-
-    void     processRow(int row, int threadId);
-    int64_t  estimateFrameCost(Lowres **frames, int p0, int p1, int b, bool bIntraPenalty);
+    void weightsAnalyse(Lowres& fenc, Lowres& ref);
 
 protected:
 
-    void     weightsAnalyse(Lowres **frames, int b, int p0);
-    uint32_t weightCostLuma(Lowres **frames, int b, int p0, WeightParam *w);
+    uint32_t acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp);
+    uint32_t weightCostLuma(Lowres& fenc, Lowres& ref, WeightParam& wp);
+    bool     allocWeightedRef(Lowres& fenc);
 };
 
 class Lookahead : public JobProvider
 {
 public:
 
+    PicList       m_inputQueue;      // input pictures in order received
+    PicList       m_outputQueue;     // pictures to be encoded, in encode order
+    Lock          m_inputLock;
+    Lock          m_outputLock;
+
+    /* pre-lookahead */
+    Frame*        m_preframes[X265_LOOKAHEAD_MAX];
+    int           m_preTotal, m_preAcquired, m_preCompleted;
+    int           m_fullQueueSize;
+    bool          m_isActive;
+    bool          m_sliceTypeBusy;
+    bool          m_bAdaptiveQuant;
+    bool          m_outputSignalRequired;
+    Lock          m_preLookaheadLock;
+    Event         m_outputSignal;
+
+    LookaheadTLD* m_tld;
+    x265_param*   m_param;
+    Lowres*       m_lastNonB;
+    int*          m_scratch;         // temp buffer for cutree propagate
+    
+    int           m_histogram[X265_BFRAME_MAX + 1];
+    int           m_lastKeyframe;
+    int           m_widthInCU;
+    int           m_heightInCU;
+    int           m_ncu;
+    int           m_numCoopSlices;
+    int           m_numRowsPerSlice;
+    bool          m_filled;
+
     Lookahead(x265_param *param, ThreadPool *pool);
-    ~Lookahead();
-    void init();
-    void destroy();
-
-    CostEstimate     m_est;             // Frame cost estimator
-    PicList          m_inputQueue;      // input pictures in order received
-    PicList          m_outputQueue;     // pictures to be encoded, in encode order
-
-    x265_param      *m_param;
-    Lowres          *m_lastNonB;
-    int             *m_scratch;         // temp buffer
-
-    int              m_widthInCU;       // width of lowres frame in downscale CUs
-    int              m_heightInCU;      // height of lowres frame in downscale CUs
-    int              m_lastKeyframe;
-    int              m_histogram[X265_BFRAME_MAX + 1];
 
 #if DETAILED_CU_STATS
-    int64_t          m_slicetypeDecideElapsedTime;
-    uint64_t         m_countSlicetypeDecide;
-    bool             usingWorkerThreads() const { return !!m_pool; }
+    int64_t       m_slicetypeDecideElapsedTime;
+    int64_t       m_preLookaheadElapsedTime;
+    uint64_t      m_countSlicetypeDecide;
+    uint64_t      m_countPreLookahead;
+    void          getWorkerStats(int64_t& batchElapsedTime, uint64_t& batchCount, int64_t& coopSliceElapsedTime, uint64_t& coopSliceCount);
 #endif
 
-    void addPicture(Frame*, int sliceType);
-    void flush();
-    void stop();
-    Frame* getDecidedPicture();
+    bool    create();
+    void    destroy();
+    void    stop();
 
-    void getEstimatedPictureCost(Frame *pic);
+    void    addPicture(Frame&, int sliceType);
+    void    flush();
+    Frame*  getDecidedPicture();
+
+    void    getEstimatedPictureCost(Frame *pic);
+
 
 protected:
 
-    Lock  m_inputQueueLock;
-    Lock  m_outputQueueLock;
-    Event m_outputAvailable;
-
-    bool  m_bReady;   /* input lock - slicetypeDecide() can be started */
-    bool  m_bBusy;    /* input lock - slicetypeDecide() is running */
-    bool  m_bFilled;  /* enough frames in lookahead for output to be available */
-    bool  m_bFlushed; /* all frames have been decided, lookahead is finished */
-    bool  m_bFlush;   /* no more frames will be received, empty the input queue */
-
-    bool  findJob(int);
-
-    /* called by addPicture() or flush() to trigger slice decisions */
-    void slicetypeDecide();
-    void slicetypeAnalyse(Lowres **frames, bool bKeyframe);
+    void    findJob(int workerThreadID);
+    void    slicetypeDecide();
+    void    slicetypeAnalyse(Lowres **frames, bool bKeyframe);
 
     /* called by slicetypeAnalyse() to make slice decisions */
     bool    scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames, int maxSearch);
@@ -181,13 +166,64 @@
 
     /* called by slicetypeAnalyse() to effect cuTree adjustments to adaptive
      * quant offsets */
-    void cuTree(Lowres **frames, int numframes, bool bintra);
-    void estimateCUPropagate(Lowres **frames, double average_duration, int p0, int p1, int b, int referenced);
-    void cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance);
+    void    cuTree(Lowres **frames, int numframes, bool bintra);
+    void    estimateCUPropagate(Lowres **frames, double average_duration, int p0, int p1, int b, int referenced);
+    void    cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance);
 
     /* called by getEstimatedPictureCost() to finalize cuTree costs */
     int64_t frameCostRecalculate(Lowres **frames, int p0, int p1, int b);
 };
+
+class CostEstimateGroup : public BondedTaskGroup
+{
+public:
+
+    Lookahead& m_lookahead;
+    Lowres**   m_frames;
+    bool       m_batchMode;
+
+    CostEstimateGroup(Lookahead& l, Lowres** f) : m_lookahead(l), m_frames(f), m_batchMode(false) {}
+
+    /* Cooperative cost estimate using multiple slices of downscaled frame */
+    struct Coop
+    {
+        int  p0, b, p1;
+        bool bDoSearch[2];
+    } m_coop;
+
+    enum { MAX_COOP_SLICES = 32 };
+    struct Slice
+    {
+        int  costEst;
+        int  costEstAq;
+        int  intraMbs;
+    } m_slice[MAX_COOP_SLICES];
+
+    int64_t singleCost(int p0, int p1, int b, bool intraPenalty = false);
+
+    /* Batch cost estimates, using one worker thread per estimateFrameCost() call */
+    enum { MAX_BATCH_SIZE = 2048 };
+    struct Estimate
+    {
+        int  p0, b, p1;
+        bool bIntraPenalty;
+    } m_estimates[MAX_BATCH_SIZE];
+
+    void add(int p0, int p1, int b, bool intraPenalty = false);
+    void finishBatch();
+
+protected:
+
+    static const int s_merange = 16;
+
+    void    processTasks(int workerThreadID);
+
+    int64_t estimateFrameCost(LookaheadTLD& tld, int p0, int p1, int b, bool intraPenalty);
+    void    estimateCUCost(LookaheadTLD& tld, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice);
+
+    CostEstimateGroup& operator=(const CostEstimateGroup&);
+};
+
 }
 
 #endif // ifndef X265_SLICETYPE_H
diff -r bc969aedef71 -r 8faf10074f9f source/profile/cpuEvents.h
--- a/source/profile/cpuEvents.h	Thu Feb 19 09:39:48 2015 -0600
+++ b/source/profile/cpuEvents.h	Thu Feb 19 09:42:06 2015 -0600
@@ -5,6 +5,7 @@
 CPU_EVENT(filterCTURow)
 CPU_EVENT(slicetypeDecideEV)
 CPU_EVENT(prelookahead)
-CPU_EVENT(costEstimateRow)
+CPU_EVENT(estCostSingle)
+CPU_EVENT(estCostCoop)
 CPU_EVENT(pmode)
 CPU_EVENT(pme)