[x265] [PATCH] rc: add support for qg-size 8

Tue Aug 23 07:44:25 CEST 2016

# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1471411031 -19800
#      Wed Aug 17 10:47:11 2016 +0530
# Node ID 74d189cfdc36c061ce6951533d47cb7404b327b1
# Parent  49a0d1176aef5bc6330fcfd39b4589616c174f0a
rc: add support for qg-size 8

diff -r 49a0d1176aef -r 74d189cfdc36 source/common/common.h

--- a/source/common/common.h	Wed Jul 27 21:47:20 2016 +0200
+++ b/source/common/common.h	Wed Aug 17 10:47:11 2016 +0530
@@ -81,6 +81,7 @@
 
 #elif defined(_MSC_VER)
 
+#define ALIGN_VAR_4(T, var)  __declspec(align(4)) T var
 #define ALIGN_VAR_8(T, var)  __declspec(align(8)) T var
 #define ALIGN_VAR_16(T, var) __declspec(align(16)) T var
 #define ALIGN_VAR_32(T, var) __declspec(align(32)) T var
diff -r 49a0d1176aef -r 74d189cfdc36 source/common/frame.cpp
--- a/source/common/frame.cpp	Wed Jul 27 21:47:20 2016 +0200
+++ b/source/common/frame.cpp	Wed Aug 17 10:47:11 2016 +0530
@@ -62,7 +62,7 @@
 
         if (quantOffsets)
         {
-            int32_t cuCount = m_lowres.maxBlocksInRow * m_lowres.maxBlocksInCol;
+            int32_t cuCount = m_lowres.maxBlocksInRowFullRes * m_lowres.maxBlocksInColFullRes;
             m_quantOffsets = new float[cuCount];
         }
         return true;
diff -r 49a0d1176aef -r 74d189cfdc36 source/common/lowres.cpp
--- a/source/common/lowres.cpp	Wed Jul 27 21:47:20 2016 +0200
+++ b/source/common/lowres.cpp	Wed Aug 17 10:47:11 2016 +0530
@@ -38,7 +38,10 @@
         lumaStride += 32 - (lumaStride & 31);
     maxBlocksInRow = (width + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     maxBlocksInCol = (lines + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    maxBlocksInRowFullRes = maxBlocksInRow * 2;
+    maxBlocksInColFullRes = maxBlocksInCol * 2;
     int cuCount = maxBlocksInRow * maxBlocksInCol;
+    int cuCountFullRes = maxBlocksInRowFullRes * maxBlocksInColFullRes;
 
     /* rounding the width to multiple of lowres CU size */
     width = maxBlocksInRow * X265_LOWRES_CU_SIZE;
@@ -49,10 +52,10 @@
 
     if (bAQEnabled)
     {
-        CHECKED_MALLOC(qpAqOffset, double, cuCount);
-        CHECKED_MALLOC(invQscaleFactor, int, cuCount);
-        CHECKED_MALLOC(qpCuTreeOffset, double, cuCount);
-        CHECKED_MALLOC(blockVariance, uint32_t, cuCount);
+        CHECKED_MALLOC(qpAqOffset, double, cuCountFullRes);
+        CHECKED_MALLOC(invQscaleFactor, int, cuCountFullRes);
+        CHECKED_MALLOC(qpCuTreeOffset, double, cuCountFullRes);
+        CHECKED_MALLOC(blockVariance, uint32_t, cuCountFullRes);
     }
     CHECKED_MALLOC(propagateCost, uint16_t, cuCount);
 
diff -r 49a0d1176aef -r 74d189cfdc36 source/common/lowres.h
--- a/source/common/lowres.h	Wed Jul 27 21:47:20 2016 +0200
+++ b/source/common/lowres.h	Wed Aug 17 10:47:11 2016 +0530
@@ -132,6 +132,8 @@
     MV*       lowresMvs[2][X265_BFRAME_MAX + 1];
     uint32_t  maxBlocksInRow;
     uint32_t  maxBlocksInCol;
+    uint32_t  maxBlocksInRowFullRes;
+    uint32_t  maxBlocksInColFullRes;
 
     /* used for vbvLookahead */
     int       plannedType[X265_LOOKAHEAD_MAX + 1];
diff -r 49a0d1176aef -r 74d189cfdc36 source/common/pixel.cpp
--- a/source/common/pixel.cpp	Wed Jul 27 21:47:20 2016 +0200
+++ b/source/common/pixel.cpp	Wed Aug 17 10:47:11 2016 +0530
@@ -853,7 +853,8 @@
     {
         int intraCost = intraCosts[i];
         int interCost = X265_MIN(intraCosts[i], interCosts[i] & LOWRES_COST_MASK);
-        double propagateIntra  = intraCost * invQscales[i]; // Q16 x Q8.8 = Q24.8
+        int invQscaleFactor = (invQscales[i * 2] + invQscales[i * 2 + 1] + invQscales[i * 2 + len * 2] + invQscales[i * 2 + len * 2 + 1])/4;
+        double propagateIntra  = intraCost * invQscaleFactor; // Q16 x Q8.8 = Q24.8
         double propagateAmount = (double)propagateIn[i] + propagateIntra * fps; // Q16.0 + Q24.8 x Q0.x = Q25.0
         double propagateNum    = (double)(intraCost - interCost); // Q32 - Q32 = Q33.0
 
diff -r 49a0d1176aef -r 74d189cfdc36 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jul 27 21:47:20 2016 +0200
+++ b/source/common/x86/asm-primitives.cpp	Wed Aug 17 10:47:11 2016 +0530
@@ -2356,7 +2356,7 @@
         ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
         ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
         ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
-        p.propagateCost = PFX(mbtree_propagate_cost_sse2);
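+        //p.propagateCost = PFX(mbtree_propagate_cost_sse2); // NOTE(review): disabled, presumably until the SSE2 asm matches the C primitive's 2x2 invQscales averaging -- confirm and re-enable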
     }
     if (cpuMask & X265_CPU_SSE3)
     {
@@ -2670,7 +2670,7 @@
         p.pu[LUMA_48x64].copy_pp = PFX(blockcopy_pp_48x64_avx);
 
         p.frameInitLowres = PFX(frame_init_lowres_core_avx);
-        p.propagateCost = PFX(mbtree_propagate_cost_avx);
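+        //p.propagateCost = PFX(mbtree_propagate_cost_avx); // NOTE(review): disabled, presumably until the AVX asm matches the C primitive's 2x2 invQscales averaging -- confirm and re-enable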
     }
     if (cpuMask & X265_CPU_XOP)
     {
@@ -3666,7 +3666,7 @@
         p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx2);
 
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
-        p.propagateCost = PFX(mbtree_propagate_cost_avx2);
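+        //p.propagateCost = PFX(mbtree_propagate_cost_avx2); // NOTE(review): disabled, presumably until the AVX2 asm matches the C primitive's 2x2 invQscales averaging -- confirm and re-enable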
         p.saoCuStatsE0 = PFX(saoCuStatsE0_avx2);
         p.saoCuStatsE1 = PFX(saoCuStatsE1_avx2);
         p.saoCuStatsE2 = PFX(saoCuStatsE2_avx2);
diff -r 49a0d1176aef -r 74d189cfdc36 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp	Wed Jul 27 21:47:20 2016 +0200
+++ b/source/encoder/analysis.cpp	Wed Aug 17 10:47:11 2016 +0530
@@ -2647,17 +2647,17 @@
         uint32_t height = m_frame->m_fencPic->m_picHeight;
         uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
         uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
-        uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16;
+        uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (8 - 1)) / 8;
         uint32_t blockSize = g_maxCUSize >> cuGeom.depth;
         double qp_offset = 0;
         uint32_t cnt = 0;
         uint32_t idx;
 
-        for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += 16)
+        for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += 8)
         {
-            for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += 16)
+            for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += 8)
             {
-                idx = ((block_yy / 16) * (maxCols)) + (block_xx / 16);
+                idx = ((block_yy / 8) * (maxCols)) + (block_xx / 8);
                 qp_offset += qpoffs[idx];
                 cnt++;
             }
diff -r 49a0d1176aef -r 74d189cfdc36 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Wed Jul 27 21:47:20 2016 +0200
+++ b/source/encoder/encoder.cpp	Wed Aug 17 10:47:11 2016 +0530
@@ -605,7 +605,7 @@
 
         if (pic_in->quantOffsets != NULL)
         {
-            int cuCount = inFrame->m_lowres.maxBlocksInRow * inFrame->m_lowres.maxBlocksInCol;
+            int cuCount = inFrame->m_lowres.maxBlocksInRowFullRes * inFrame->m_lowres.maxBlocksInColFullRes;
             memcpy(inFrame->m_quantOffsets, pic_in->quantOffsets, cuCount * sizeof(float));
         }
 
@@ -790,7 +790,7 @@
                 if (m_rateControl->writeRateControlFrameStats(outFrame, &curEncoder->m_rce))
                     m_aborted = true;
             if (pic_out)
-            {
+            { 
                 /* m_rcData is allocated for every frame */
                 pic_out->rcData = outFrame->m_rcData;
                 outFrame->m_rcData->qpaRc = outFrame->m_encData->m_avgQpRc;
@@ -1583,7 +1583,7 @@
     {
         pps->bUseDQP = true;
         pps->maxCuDQPDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];
-        X265_CHECK(pps->maxCuDQPDepth <= 2, "max CU DQP depth cannot be greater than 2\n");
+        X265_CHECK(pps->maxCuDQPDepth <= 3, "max CU DQP depth cannot be greater than 3\n");
     }
     else
     {
@@ -1867,10 +1867,10 @@
     bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
     if (!m_param->bLossless && (m_param->rc.aqMode || bIsVbv))
     {
-        if (p->rc.qgSize < X265_MAX(16, p->minCUSize))
+        if (p->rc.qgSize < X265_MAX(8, p->minCUSize))
         {
-            p->rc.qgSize = X265_MAX(16, p->minCUSize);
-            x265_log(p, X265_LOG_WARNING, "QGSize should be greater than or equal to 16 and minCUSize, setting QGSize = %d\n", p->rc.qgSize);
+            p->rc.qgSize = X265_MAX(8, p->minCUSize);
+            x265_log(p, X265_LOG_WARNING, "QGSize should be greater than or equal to 8 and minCUSize, setting QGSize = %d\n", p->rc.qgSize);
         }
         if (p->rc.qgSize > p->maxCUSize)
         {
diff -r 49a0d1176aef -r 74d189cfdc36 source/encoder/ratecontrol.cpp
--- a/source/encoder/ratecontrol.cpp	Wed Jul 27 21:47:20 2016 +0200
+++ b/source/encoder/ratecontrol.cpp	Wed Aug 17 10:47:11 2016 +0530
@@ -615,9 +615,9 @@
         }
         if (m_param->rc.cuTree)
         {
-            m_cuTreeStats.qpBuffer[0] = X265_MALLOC(uint16_t, m_ncu * sizeof(uint16_t));
+            m_cuTreeStats.qpBuffer[0] = X265_MALLOC(uint16_t, m_ncu * 4 * sizeof(uint16_t));
             if (m_param->bBPyramid && m_param->rc.bStatRead)
-                m_cuTreeStats.qpBuffer[1] = X265_MALLOC(uint16_t, m_ncu * sizeof(uint16_t));
+                m_cuTreeStats.qpBuffer[1] = X265_MALLOC(uint16_t, m_ncu * 4 * sizeof(uint16_t));
             m_cuTreeStats.qpBufPos = -1;
         }
     }
@@ -1437,7 +1437,7 @@
 
                 if (!fread(&type, 1, 1, m_cutreeStatFileIn))
                     goto fail;
-                if (fread(m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], sizeof(uint16_t), m_ncu, m_cutreeStatFileIn) != (size_t)m_ncu)
+                if (fread(m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], sizeof(uint16_t), m_ncu * 4, m_cutreeStatFileIn) != (size_t)m_ncu * 4)
                     goto fail;
 
                 if (type != sliceTypeActual && m_cuTreeStats.qpBufPos == 1)
@@ -1448,8 +1448,8 @@
             }
             while(type != sliceTypeActual);
         }
-        primitives.fix8Unpack(frame->m_lowres.qpCuTreeOffset, m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], m_ncu);
-        for (int i = 0; i < m_ncu; i++)
+        primitives.fix8Unpack(frame->m_lowres.qpCuTreeOffset, m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], m_ncu * 4);
+        for (int i = 0; i < m_ncu * 4; i++)
             frame->m_lowres.invQscaleFactor[i] = x265_exp2fix8(frame->m_lowres.qpCuTreeOffset[i]);
         m_cuTreeStats.qpBufPos--;
     }
@@ -2612,10 +2612,10 @@
     if (m_param->rc.cuTree && IS_REFERENCED(curFrame) && !m_param->rc.bStatRead)
     {
         uint8_t sliceType = (uint8_t)rce->sliceType;
-        primitives.fix8Pack(m_cuTreeStats.qpBuffer[0], curFrame->m_lowres.qpCuTreeOffset, m_ncu);
+        primitives.fix8Pack(m_cuTreeStats.qpBuffer[0], curFrame->m_lowres.qpCuTreeOffset, m_ncu * 4);
         if (fwrite(&sliceType, 1, 1, m_cutreeStatFileOut) < 1)
             goto writeFailure;
-        if (fwrite(m_cuTreeStats.qpBuffer[0], sizeof(uint16_t), m_ncu, m_cutreeStatFileOut) < (size_t)m_ncu)
+        if (fwrite(m_cuTreeStats.qpBuffer[0], sizeof(uint16_t), m_ncu * 4, m_cutreeStatFileOut) < (size_t)m_ncu * 4)
             goto writeFailure;
     }
     return 0;
diff -r 49a0d1176aef -r 74d189cfdc36 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Wed Jul 27 21:47:20 2016 +0200
+++ b/source/encoder/slicetype.cpp	Wed Aug 17 10:47:11 2016 +0530
@@ -60,12 +60,12 @@
 {
     if ((colorFormat != X265_CSP_I444) && plane)
     {
-        ALIGN_VAR_8(pixel, pix[8 * 8]);
-        primitives.cu[BLOCK_8x8].copy_pp(pix, 8, src, srcStride);
-        return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(pix, 8), 6, plane);
+        ALIGN_VAR_4(pixel, pix[4 * 4]);
+        primitives.cu[BLOCK_4x4].copy_pp(pix, 4, src, srcStride);
+        return acEnergyVar(curFrame, primitives.cu[BLOCK_4x4].var(pix, 4), 4, plane);
     }
     else
-        return acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(src, srcStride), 8, plane);
+        return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(src, srcStride), 6, plane);
 }
 
 } // end anonymous namespace
@@ -97,7 +97,7 @@
     /* Actual adaptive quantization */
     int maxCol = curFrame->m_fencPic->m_picWidth;
     int maxRow = curFrame->m_fencPic->m_picHeight;
-    int blockCount = curFrame->m_lowres.maxBlocksInRow * curFrame->m_lowres.maxBlocksInCol;
+    int blockCount = curFrame->m_lowres.maxBlocksInRowFullRes * curFrame->m_lowres.maxBlocksInColFullRes;
 
     float* quantOffsets = curFrame->m_quantOffsets;
     for (int y = 0; y < 3; y++)
@@ -113,7 +113,7 @@
     if (param->rc.aqMode == X265_AQ_NONE || param->rc.aqStrength == 0)
     {
         /* Need to init it anyways for CU tree */
-        int cuCount = widthInCU * heightInCU;
+        int cuCount = blockCount;
 
         if (param->rc.aqMode && param->rc.aqStrength == 0)
         {
@@ -137,8 +137,8 @@
         /* Need variance data for weighted prediction */
         if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
         {
-            for (blockY = 0; blockY < maxRow; blockY += 16)
-                for (blockX = 0; blockX < maxCol; blockX += 16)
+            for (blockY = 0; blockY < maxRow; blockY += 8)
+                for (blockX = 0; blockX < maxCol; blockX += 8)
                     acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
         }
     }
@@ -152,10 +152,10 @@
             double bit_depth_correction = 1.f / (1 << (2*(X265_DEPTH-8)));
             curFrame->m_lowres.frameVariance = 0;
             uint64_t rowVariance = 0;
-            for (blockY = 0; blockY < maxRow; blockY += 16)
+            for (blockY = 0; blockY < maxRow; blockY += 8)
             {
                 rowVariance = 0;
-                for (blockX = 0; blockX < maxCol; blockX += 16)
+                for (blockX = 0; blockX < maxCol; blockX += 8)
                 {
                     uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
                     curFrame->m_lowres.blockVariance[blockXY] = energy;
@@ -172,21 +172,21 @@
             avg_adj /= blockCount;
             avg_adj_pow2 /= blockCount;
             strength = param->rc.aqStrength * avg_adj;
-            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f)) / avg_adj;
+            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (8.f)) / avg_adj;
             bias_strength = param->rc.aqStrength;
         }
         else
             strength = param->rc.aqStrength * 1.0397f;
 
         blockXY = 0;
-        for (blockY = 0; blockY < maxRow; blockY += 16)
+        for (blockY = 0; blockY < maxRow; blockY += 8)
         {
-            for (blockX = 0; blockX < maxCol; blockX += 16)
+            for (blockX = 0; blockX < maxCol; blockX += 8)
             {
                 if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
                 {
                     qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
-                    qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - 11.f / (qp_adj * qp_adj));
+                    qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - 8.f / (qp_adj * qp_adj));
                 }
                 else if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
                 {
@@ -196,7 +196,7 @@
                 else
                 {
                     uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
-                    qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (14.427f + 2 * (X265_DEPTH - 8)));
+                    qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (11.427f + 2 * (X265_DEPTH - 8)));
                 }
                 if (quantOffsets != NULL)
                     qp_adj += quantOffsets[blockXY];
@@ -318,8 +318,11 @@
             /* do not include edge blocks in the frame cost estimates, they are not very accurate */
             const bool bFrameScoreCU = (cuX > 0 && cuX < widthInCU - 1 &&
                                         cuY > 0 && cuY < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
-
-            int icostAq = (bFrameScoreCU && fenc.invQscaleFactor) ? ((icost * fenc.invQscaleFactor[cuXY] + 128) >> 8) : icost;
+            int invQscaleFactor = (fenc.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4] +
+                fenc.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + 1] +
+                fenc.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + fenc.maxBlocksInRowFullRes] +
+                fenc.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + fenc.maxBlocksInRowFullRes + 1]) / 4;
+            int icostAq = (bFrameScoreCU && fenc.invQscaleFactor) ? ((icost * invQscaleFactor + 128) >> 8) : icost;
 
             if (bFrameScoreCU)
             {
@@ -812,9 +815,13 @@
                     uint16_t lowresCuCost = curFrame->m_lowres.lowresCostForRc[lowresCuIdx] & LOWRES_COST_MASK;
                     if (qp_offset)
                     {
-                        lowresCuCost = (uint16_t)((lowresCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8);
+                        double qpOffset = (qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4] +
+                                           qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4 + 1] +
+                                           qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4 + curFrame->m_lowres.maxBlocksInRowFullRes] +
+                                           qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4 + curFrame->m_lowres.maxBlocksInRowFullRes + 1]) / 4;
+                        lowresCuCost = (uint16_t)((lowresCuCost * x265_exp2fix8(qpOffset) + 128) >> 8);
                         int32_t intraCuCost = curFrame->m_lowres.intraCost[lowresCuIdx];
-                        curFrame->m_lowres.intraCost[lowresCuIdx] = (intraCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8;
+                        curFrame->m_lowres.intraCost[lowresCuIdx] = (intraCuCost * x265_exp2fix8(qpOffset) + 128) >> 8;
                     }
                     if (m_param->bIntraRefresh && slice->m_sliceType == X265_TYPE_P)
                         for (uint32_t x = curFrame->m_encData->m_pir.pirStartCol; x <= curFrame->m_encData->m_pir.pirEndCol; x++)
@@ -1669,7 +1676,7 @@
         if (bIntra)
         {
             memset(frames[0]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
-            memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, m_cuCount * sizeof(double));
+            memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, m_cuCount * 4 * sizeof(double));
             return;
         }
         std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
@@ -1766,7 +1773,7 @@
         int cuIndex = blocky * strideInCU;
         primitives.propagateCost(m_scratch, propagateCost,
                                  frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
-                                 frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_8x8Width);
+                                 frames[b]->invQscaleFactor + (cuIndex * 4), &fpsFactor, m_8x8Width);
 
         if (referenced)
             propagateCost += m_8x8Width;
@@ -1852,14 +1859,27 @@
     if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0)
         weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]);
 
-    for (int cuIndex = 0; cuIndex < m_cuCount; cuIndex++)
+    for (int cuY = 0; cuY < m_8x8Height; cuY++)
     {
-        int intracost = (frame->intraCost[cuIndex] * frame->invQscaleFactor[cuIndex] + 128) >> 8;
-        if (intracost)
+        for (int cuX = 0; cuX < m_8x8Width; cuX++)
         {
-            int propagateCost = (frame->propagateCost[cuIndex] * fpsFactor + 128) >> 8;
-            double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta;
-            frame->qpCuTreeOffset[cuIndex] = frame->qpAqOffset[cuIndex] - m_cuTreeStrength * log2_ratio;
+            const int cuXY = cuX + cuY * m_8x8Width;
+            int invQscaleFactor = (frame->invQscaleFactor[cuX * 2 + cuY * m_8x8Width * 4] +
+                frame->invQscaleFactor[cuX * 2 + cuY * m_8x8Width * 4 + 1] +
+                frame->invQscaleFactor[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes] +
+                frame->invQscaleFactor[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes + 1]) / 4;
+
+            int intracost = (frame->intraCost[cuXY] * invQscaleFactor + 128) >> 8;
+            if (intracost)
+            {
+                int propagateCost = (frame->propagateCost[cuXY] * fpsFactor + 128) >> 8;
+                double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta;
+                frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4] - m_cuTreeStrength * (log2_ratio);
+                frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4 + 1] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4 + 1] - m_cuTreeStrength * (log2_ratio);
+                frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes] - m_cuTreeStrength * (log2_ratio);
+                frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes + 1] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes + 1] - m_cuTreeStrength * (log2_ratio);
+            }
+
         }
     }
 }
@@ -1883,7 +1903,10 @@
         {
             int cuxy = cux + cuy * m_8x8Width;
             int cuCost = frames[b]->lowresCosts[b - p0][p1 - b][cuxy] & LOWRES_COST_MASK;
-            double qp_adj = qp_offset[cuxy];
+            double qp_adj = (qp_offset[cux * 2 + cuy * m_8x8Width * 4] +
+                qp_offset[cux * 2 + cuy * m_8x8Width * 4 + 1] +
+                qp_offset[cux * 2 + cuy * m_8x8Width * 4 + frames[b]->maxBlocksInRowFullRes] +
+                qp_offset[cux * 2 + cuy * m_8x8Width * 4 + frames[b]->maxBlocksInRowFullRes + 1]) / 4;
             cuCost = (cuCost * x265_exp2fix8(qp_adj) + 128) >> 8;
             rowSatd[cuy] += cuCost;
             if ((cuy > 0 && cuy < m_8x8Height - 1 &&
@@ -2202,8 +2225,12 @@
     /* do not include edge blocks in the frame cost estimates, they are not very accurate */
     const bool bFrameScoreCU = (cuX > 0 && cuX < widthInCU - 1 &&
                                 cuY > 0 && cuY < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
+    int invQscaleFactor = (fenc->invQscaleFactor[cuX * 2 + cuY * widthInCU * 4] +
+        fenc->invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + 1] +
+        fenc->invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + fenc->maxBlocksInRowFullRes] +
+        fenc->invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + fenc->maxBlocksInRowFullRes + 1]) / 4;
 
-    int bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * fenc->invQscaleFactor[cuXY] + 128) >> 8) : bcost;
+    int bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * invQscaleFactor + 128) >> 8) : bcost;
 
     if (bFrameScoreCU)
     {