[x265] [PATCH] rc: add support for qg-size 8

Fri Aug 26 06:49:51 CEST 2016

# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1471411031 -19800
#      Wed Aug 17 10:47:11 2016 +0530
# Node ID ab205f07f87b6a8485732e11cbef67c10eaf9b7a
# Parent  215eedc9ecc0570baaf8189eda7b96f1df89bd22
rc: add support for qg-size 8

diff -r 215eedc9ecc0 -r ab205f07f87b doc/reST/cli.rst

--- a/doc/reST/cli.rst	Wed Aug 24 13:17:45 2016 +0530
+++ b/doc/reST/cli.rst	Wed Aug 17 10:47:11 2016 +0530
@@ -1328,11 +1328,11 @@
 	Default 1.0.
 	**Range of values:** 0.0 to 3.0
 
-.. option:: --qg-size <64|32|16>
+.. option:: --qg-size <64|32|16|8>
 
 	Enable adaptive quantization for sub-CTUs. This parameter specifies 
 	the minimum CU size at which QP can be adjusted, ie. Quantization Group
-	size. Allowed range of values are 64, 32, 16 provided this falls within 
+	size. Allowed values are 64, 32, 16 and 8, provided the value falls within
 	the inclusive range [maxCUSize, minCUSize]. Experimental.
 	Default: same as maxCUSize
 
diff -r 215eedc9ecc0 -r ab205f07f87b source/common/common.h
--- a/source/common/common.h	Wed Aug 24 13:17:45 2016 +0530
+++ b/source/common/common.h	Wed Aug 17 10:47:11 2016 +0530
@@ -71,6 +71,7 @@
 #define NUM_INTRA_MODE 35
 
 #if defined(__GNUC__)
+#define ALIGN_VAR_4(T, var)  T var __attribute__((aligned(4)))
 #define ALIGN_VAR_8(T, var)  T var __attribute__((aligned(8)))
 #define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16)))
 #define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32)))
@@ -81,6 +82,7 @@
 
 #elif defined(_MSC_VER)
 
+#define ALIGN_VAR_4(T, var)  __declspec(align(4)) T var
 #define ALIGN_VAR_8(T, var)  __declspec(align(8)) T var
 #define ALIGN_VAR_16(T, var) __declspec(align(16)) T var
 #define ALIGN_VAR_32(T, var) __declspec(align(32)) T var
diff -r 215eedc9ecc0 -r ab205f07f87b source/common/frame.cpp
--- a/source/common/frame.cpp	Wed Aug 24 13:17:45 2016 +0530
+++ b/source/common/frame.cpp	Wed Aug 17 10:47:11 2016 +0530
@@ -54,7 +54,7 @@
     CHECKED_MALLOC_ZERO(m_rcData, RcStats, 1);
 
     if (m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
-        m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode))
+        m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode, param->rc.qgSize))
     {
         X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized");
         m_numRows = (m_fencPic->m_picHeight + g_maxCUSize - 1)  / g_maxCUSize;
@@ -62,7 +62,11 @@
 
         if (quantOffsets)
         {
-            int32_t cuCount = m_lowres.maxBlocksInRow * m_lowres.maxBlocksInCol;
+            int32_t cuCount;
+            if (param->rc.qgSize == 8 )
+                cuCount = m_lowres.maxBlocksInRowFullRes * m_lowres.maxBlocksInColFullRes;
+            else
+                cuCount = m_lowres.maxBlocksInRow * m_lowres.maxBlocksInCol;
             m_quantOffsets = new float[cuCount];
         }
         return true;
diff -r 215eedc9ecc0 -r ab205f07f87b source/common/lowres.cpp
--- a/source/common/lowres.cpp	Wed Aug 24 13:17:45 2016 +0530
+++ b/source/common/lowres.cpp	Wed Aug 17 10:47:11 2016 +0530
@@ -27,7 +27,7 @@
 
 using namespace X265_NS;
 
-bool Lowres::create(PicYuv *origPic, int _bframes, bool bAQEnabled)
+bool Lowres::create(PicYuv *origPic, int _bframes, bool bAQEnabled, uint32_t qgSize)
 {
     isLowres = true;
     bframes = _bframes;
@@ -38,7 +38,14 @@
         lumaStride += 32 - (lumaStride & 31);
     maxBlocksInRow = (width + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     maxBlocksInCol = (lines + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    maxBlocksInRowFullRes = maxBlocksInRow * 2;
+    maxBlocksInColFullRes = maxBlocksInCol * 2;
     int cuCount = maxBlocksInRow * maxBlocksInCol;
+    int cuCountFullRes;
+    if (qgSize == 8)
+        cuCountFullRes = maxBlocksInRowFullRes * maxBlocksInColFullRes;
+    else
+        cuCountFullRes = maxBlocksInRow * maxBlocksInCol;
 
     /* rounding the width to multiple of lowres CU size */
     width = maxBlocksInRow * X265_LOWRES_CU_SIZE;
@@ -49,10 +56,10 @@
 
     if (bAQEnabled)
     {
-        CHECKED_MALLOC(qpAqOffset, double, cuCount);
-        CHECKED_MALLOC(invQscaleFactor, int, cuCount);
-        CHECKED_MALLOC(qpCuTreeOffset, double, cuCount);
-        CHECKED_MALLOC(blockVariance, uint32_t, cuCount);
+        CHECKED_MALLOC(qpAqOffset, double, cuCountFullRes);
+        CHECKED_MALLOC(invQscaleFactor, int, cuCountFullRes);
+        CHECKED_MALLOC(qpCuTreeOffset, double, cuCountFullRes);
+        CHECKED_MALLOC(blockVariance, uint32_t, cuCountFullRes);
     }
     CHECKED_MALLOC(propagateCost, uint16_t, cuCount);
 
diff -r 215eedc9ecc0 -r ab205f07f87b source/common/lowres.h
--- a/source/common/lowres.h	Wed Aug 24 13:17:45 2016 +0530
+++ b/source/common/lowres.h	Wed Aug 17 10:47:11 2016 +0530
@@ -132,6 +132,8 @@
     MV*       lowresMvs[2][X265_BFRAME_MAX + 1];
     uint32_t  maxBlocksInRow;
     uint32_t  maxBlocksInCol;
+    uint32_t  maxBlocksInRowFullRes;
+    uint32_t  maxBlocksInColFullRes;
 
     /* used for vbvLookahead */
     int       plannedType[X265_LOOKAHEAD_MAX + 1];
@@ -153,7 +155,7 @@
     double    weightedCostDelta[X265_BFRAME_MAX + 2];
     ReferencePlanes weightedRef[X265_BFRAME_MAX + 2];
 
-    bool create(PicYuv *origPic, int _bframes, bool bAqEnabled);
+    bool create(PicYuv *origPic, int _bframes, bool bAqEnabled, uint32_t qgSize);
     void destroy();
     void init(PicYuv *origPic, int poc);
 };
diff -r 215eedc9ecc0 -r ab205f07f87b source/common/pixel.cpp
--- a/source/common/pixel.cpp	Wed Aug 24 13:17:45 2016 +0530
+++ b/source/common/pixel.cpp	Wed Aug 17 10:47:11 2016 +0530
@@ -845,30 +845,41 @@
 /* Estimate the total amount of influence on future quality that could be had if we
  * were to improve the reference samples used to inter predict any given CU. */
 static void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
-                                    const int32_t* invQscales, const double* fpsFactor, int len)
+                                    const int32_t* invQscales, const double* fpsFactor, int len, uint32_t qgSize)
 {
     double fps = *fpsFactor / 256;  // range[0.01, 1.00]
 
     for (int i = 0; i < len; i++)
     {
         int intraCost = intraCosts[i];
         int interCost = X265_MIN(intraCosts[i], interCosts[i] & LOWRES_COST_MASK);
-        double propagateIntra  = intraCost * invQscales[i]; // Q16 x Q8.8 = Q24.8
+        int invQscale;
+        if (qgSize == 8)
+        {
+            /* qg-size 8: average the 2x2 group of entries at columns 2i, 2i+1 of two consecutive
+             * rows of len * 2 entries (assumes invQscales points at the upper row -- confirm in caller) */
+            const int32_t* top = invQscales + i * 2;
+            const int32_t* bottom = top + len * 2;
+            invQscale = (top[0] + top[1] + bottom[0] + bottom[1]) / 4;
+        }
+        else
+            invQscale = invQscales[i];
+        double propagateIntra  = intraCost * invQscale; // Q16 x Q8.8 = Q24.8
         double propagateAmount = (double)propagateIn[i] + propagateIntra * fps; // Q16.0 + Q24.8 x Q0.x = Q25.0
         double propagateNum    = (double)(intraCost - interCost); // Q32 - Q32 = Q33.0
 
 #if 0
         // algorithm that output match to asm
         float intraRcp = (float)1.0f / intraCost;   // VC can't mapping this into RCPPS
         float intraRcpError1 = (float)intraCost * (float)intraRcp;
         intraRcpError1 *= (float)intraRcp;
         float intraRcpError2 = intraRcp + intraRcp;
         float propagateDenom = intraRcpError2 - intraRcpError1;
         dst[i] = (int)(propagateAmount * propagateNum * (double)propagateDenom + 0.5);
 #else
         double propagateDenom  = (double)intraCost;             // Q32
         dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
 #endif
     }
 }
 
diff -r 215eedc9ecc0 -r ab205f07f87b source/common/primitives.h
--- a/source/common/primitives.h	Wed Aug 24 13:17:45 2016 +0530
+++ b/source/common/primitives.h	Wed Aug 17 10:47:11 2016 +0530
@@ -187,7 +187,7 @@
 typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
 typedef pixel (*planeClipAndMax_t)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
 
-typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
+typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len, uint32_t qgSize);
 
 typedef void (*cutree_fix8_unpack)(double *dst, uint16_t *src, int count);
 typedef void (*cutree_fix8_pack)(uint16_t *dst, double *src, int count);
diff -r 215eedc9ecc0 -r ab205f07f87b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Aug 24 13:17:45 2016 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Aug 17 10:47:11 2016 +0530
@@ -1027,7 +1027,7 @@
         ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
         ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
         ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
-        p.propagateCost = PFX(mbtree_propagate_cost_sse2);
+        //p.propagateCost = PFX(mbtree_propagate_cost_sse2);
     }
     if (cpuMask & X265_CPU_SSE3)
     {
@@ -1312,7 +1312,7 @@
         p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx);
         p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
         p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
-        p.propagateCost = PFX(mbtree_propagate_cost_avx);
+        //p.propagateCost = PFX(mbtree_propagate_cost_avx);
     }
     if (cpuMask & X265_CPU_XOP)
     {
@@ -2153,7 +2153,7 @@
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx2);
 
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
-        p.propagateCost = PFX(mbtree_propagate_cost_avx2);
+        //p.propagateCost = PFX(mbtree_propagate_cost_avx2);
         p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
         p.fix8Pack = PFX(cutree_fix8_pack_avx2);
 
@@ -2356,7 +2356,7 @@
         ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
         ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
         ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
-        p.propagateCost = PFX(mbtree_propagate_cost_sse2);
+        //p.propagateCost = PFX(mbtree_propagate_cost_sse2);
     }
     if (cpuMask & X265_CPU_SSE3)
     {
@@ -2670,7 +2670,7 @@
         p.pu[LUMA_48x64].copy_pp = PFX(blockcopy_pp_48x64_avx);
 
         p.frameInitLowres = PFX(frame_init_lowres_core_avx);
-        p.propagateCost = PFX(mbtree_propagate_cost_avx);
+        //p.propagateCost = PFX(mbtree_propagate_cost_avx);
     }
     if (cpuMask & X265_CPU_XOP)
     {
@@ -3666,7 +3666,7 @@
         p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx2);
 
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
-        p.propagateCost = PFX(mbtree_propagate_cost_avx2);
+        //p.propagateCost = PFX(mbtree_propagate_cost_avx2);
         p.saoCuStatsE0 = PFX(saoCuStatsE0_avx2);
         p.saoCuStatsE1 = PFX(saoCuStatsE1_avx2);
         p.saoCuStatsE2 = PFX(saoCuStatsE2_avx2);
diff -r 215eedc9ecc0 -r ab205f07f87b source/common/x86/mc-a2.asm
--- a/source/common/x86/mc-a2.asm	Wed Aug 24 13:17:45 2016 +0530
+++ b/source/common/x86/mc-a2.asm	Wed Aug 17 10:47:11 2016 +0530
@@ -994,7 +994,7 @@
 
 ;-----------------------------------------------------------------------------
 ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs,
-;                             uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len )
+;                             uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len, uint32_t qgSize)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal mbtree_propagate_cost, 7,7,7
diff -r 215eedc9ecc0 -r ab205f07f87b source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp	Wed Aug 24 13:17:45 2016 +0530
+++ b/source/encoder/analysis.cpp	Wed Aug 17 10:47:11 2016 +0530
@@ -2637,7 +2637,11 @@
 {
     FrameData& curEncData = *m_frame->m_encData;
     double qp = baseQp >= 0 ? baseQp : curEncData.m_cuStat[ctu.m_cuAddr].baseQp;
-
+    int loopIncr;
+    if (m_param->rc.qgSize == 8)
+        loopIncr = 8;
+    else
+        loopIncr = 16;
     /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
     bool isReferenced = IS_REFERENCED(m_frame);
     double *qpoffs = (isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
@@ -2647,17 +2651,17 @@
         uint32_t height = m_frame->m_fencPic->m_picHeight;
         uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
         uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
-        uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16;
+        uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
         uint32_t blockSize = g_maxCUSize >> cuGeom.depth;
         double qp_offset = 0;
         uint32_t cnt = 0;
         uint32_t idx;
 
-        for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += 16)
+        for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
         {
-            for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += 16)
+            for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
             {
-                idx = ((block_yy / 16) * (maxCols)) + (block_xx / 16);
+                idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
                 qp_offset += qpoffs[idx];
                 cnt++;
             }
diff -r 215eedc9ecc0 -r ab205f07f87b source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Wed Aug 24 13:17:45 2016 +0530
+++ b/source/encoder/encoder.cpp	Wed Aug 17 10:47:11 2016 +0530
@@ -605,7 +605,11 @@
 
         if (pic_in->quantOffsets != NULL)
         {
-            int cuCount = inFrame->m_lowres.maxBlocksInRow * inFrame->m_lowres.maxBlocksInCol;
+            int cuCount;
+            if (m_param->rc.qgSize == 8)
+                cuCount = inFrame->m_lowres.maxBlocksInRowFullRes * inFrame->m_lowres.maxBlocksInColFullRes;
+            else
+                cuCount = inFrame->m_lowres.maxBlocksInRow * inFrame->m_lowres.maxBlocksInCol;
             memcpy(inFrame->m_quantOffsets, pic_in->quantOffsets, cuCount * sizeof(float));
         }
 
@@ -790,7 +794,7 @@
                 if (m_rateControl->writeRateControlFrameStats(outFrame, &curEncoder->m_rce))
                     m_aborted = true;
             if (pic_out)
-            {
+            { 
                 /* m_rcData is allocated for every frame */
                 pic_out->rcData = outFrame->m_rcData;
                 outFrame->m_rcData->qpaRc = outFrame->m_encData->m_avgQpRc;
@@ -1590,7 +1594,7 @@
     {
         pps->bUseDQP = true;
         pps->maxCuDQPDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];
-        X265_CHECK(pps->maxCuDQPDepth <= 2, "max CU DQP depth cannot be greater than 2\n");
+        X265_CHECK(pps->maxCuDQPDepth <= 3, "max CU DQP depth cannot be greater than 3\n");
     }
     else
     {
@@ -1874,10 +1878,10 @@
     bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
     if (!m_param->bLossless && (m_param->rc.aqMode || bIsVbv))
     {
-        if (p->rc.qgSize < X265_MAX(16, p->minCUSize))
+        if (p->rc.qgSize < X265_MAX(8, p->minCUSize))
         {
-            p->rc.qgSize = X265_MAX(16, p->minCUSize);
-            x265_log(p, X265_LOG_WARNING, "QGSize should be greater than or equal to 16 and minCUSize, setting QGSize = %d\n", p->rc.qgSize);
+            p->rc.qgSize = X265_MAX(8, p->minCUSize);
+            x265_log(p, X265_LOG_WARNING, "QGSize should be greater than or equal to 8 and minCUSize, setting QGSize = %d\n", p->rc.qgSize);
         }
         if (p->rc.qgSize > p->maxCUSize)
         {
diff -r 215eedc9ecc0 -r ab205f07f87b source/encoder/ratecontrol.cpp
--- a/source/encoder/ratecontrol.cpp	Wed Aug 24 13:17:45 2016 +0530
+++ b/source/encoder/ratecontrol.cpp	Wed Aug 17 10:47:11 2016 +0530
@@ -615,9 +615,18 @@
         }
         if (m_param->rc.cuTree)
         {
-            m_cuTreeStats.qpBuffer[0] = X265_MALLOC(uint16_t, m_ncu * sizeof(uint16_t));
-            if (m_param->bBPyramid && m_param->rc.bStatRead)
-                m_cuTreeStats.qpBuffer[1] = X265_MALLOC(uint16_t, m_ncu * sizeof(uint16_t));
+            if (m_param->rc.qgSize == 8)
+            {
+                m_cuTreeStats.qpBuffer[0] = X265_MALLOC(uint16_t, m_ncu * 4 * sizeof(uint16_t));
+                if (m_param->bBPyramid && m_param->rc.bStatRead)
+                    m_cuTreeStats.qpBuffer[1] = X265_MALLOC(uint16_t, m_ncu * 4 * sizeof(uint16_t));
+            }
+            else
+            {
+                m_cuTreeStats.qpBuffer[0] = X265_MALLOC(uint16_t, m_ncu * sizeof(uint16_t));
+                if (m_param->bBPyramid && m_param->rc.bStatRead)
+                    m_cuTreeStats.qpBuffer[1] = X265_MALLOC(uint16_t, m_ncu * sizeof(uint16_t));
+            }
             m_cuTreeStats.qpBufPos = -1;
         }
     }
@@ -1424,6 +1433,11 @@
 {
     int index = m_encOrder[frame->m_poc];
     uint8_t sliceTypeActual = (uint8_t)m_rce2Pass[index].sliceType;
+    int ncu;
+    if (m_param->rc.qgSize == 8)
+        ncu = m_ncu * 4;
+    else
+        ncu = m_ncu;
     if (m_rce2Pass[index].keptAsRef)
     {
         /* TODO: We don't need pre-lookahead to measure AQ offsets, but there is currently
@@ -1437,7 +1451,7 @@
 
                 if (!fread(&type, 1, 1, m_cutreeStatFileIn))
                     goto fail;
-                if (fread(m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], sizeof(uint16_t), m_ncu, m_cutreeStatFileIn) != (size_t)m_ncu)
+                if (fread(m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], sizeof(uint16_t), ncu, m_cutreeStatFileIn) != (size_t)ncu)
                     goto fail;
 
                 if (type != sliceTypeActual && m_cuTreeStats.qpBufPos == 1)
@@ -1448,8 +1462,8 @@
             }
             while(type != sliceTypeActual);
         }
-        primitives.fix8Unpack(frame->m_lowres.qpCuTreeOffset, m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], m_ncu);
-        for (int i = 0; i < m_ncu; i++)
+        primitives.fix8Unpack(frame->m_lowres.qpCuTreeOffset, m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], ncu);
+        for (int i = 0; i < ncu; i++)
             frame->m_lowres.invQscaleFactor[i] = x265_exp2fix8(frame->m_lowres.qpCuTreeOffset[i]);
         m_cuTreeStats.qpBufPos--;
     }
@@ -2593,6 +2607,11 @@
 int RateControl::writeRateControlFrameStats(Frame* curFrame, RateControlEntry* rce)
 {
     FrameData& curEncData = *curFrame->m_encData;
+    int ncu;
+    if (m_param->rc.qgSize == 8)
+        ncu = m_ncu * 4;
+    else
+        ncu = m_ncu;
     char cType = rce->sliceType == I_SLICE ? (rce->poc > 0 && m_param->bOpenGOP ? 'i' : 'I')
         : rce->sliceType == P_SLICE ? 'P'
         : IS_REFERENCED(curFrame) ? 'B' : 'b';
@@ -2612,10 +2631,10 @@
     if (m_param->rc.cuTree && IS_REFERENCED(curFrame) && !m_param->rc.bStatRead)
     {
         uint8_t sliceType = (uint8_t)rce->sliceType;
-        primitives.fix8Pack(m_cuTreeStats.qpBuffer[0], curFrame->m_lowres.qpCuTreeOffset, m_ncu);
+        primitives.fix8Pack(m_cuTreeStats.qpBuffer[0], curFrame->m_lowres.qpCuTreeOffset, ncu);
         if (fwrite(&sliceType, 1, 1, m_cutreeStatFileOut) < 1)
             goto writeFailure;
-        if (fwrite(m_cuTreeStats.qpBuffer[0], sizeof(uint16_t), m_ncu, m_cutreeStatFileOut) < (size_t)m_ncu)
+        if (fwrite(m_cuTreeStats.qpBuffer[0], sizeof(uint16_t), ncu, m_cutreeStatFileOut) < (size_t)ncu)
             goto writeFailure;
     }
     return 0;
diff -r 215eedc9ecc0 -r ab205f07f87b source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Wed Aug 24 13:17:45 2016 +0530
+++ b/source/encoder/slicetype.cpp	Wed Aug 17 10:47:11 2016 +0530
@@ -56,22 +56,36 @@
 }
 
 /* Find the energy of each block in Y/Cb/Cr plane */
-inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcStride, int plane, int colorFormat)
+inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcStride, int plane, int colorFormat, uint32_t qgSize) // qgSize == 8 selects the half-size blocks below
 {
     if ((colorFormat != X265_CSP_I444) && plane)
     {
-        ALIGN_VAR_8(pixel, pix[8 * 8]);
-        primitives.cu[BLOCK_8x8].copy_pp(pix, 8, src, srcStride);
-        return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(pix, 8), 6, plane);
+        if (qgSize == 8) // non-4:4:4 format, plane != 0, qg-size 8: measure a 4x4 block
+        {
+            ALIGN_VAR_4(pixel, pix[4 * 4]); // local copy so var() reads a buffer with stride 4
+            primitives.cu[BLOCK_4x4].copy_pp(pix, 4, src, srcStride);
+            return acEnergyVar(curFrame, primitives.cu[BLOCK_4x4].var(pix, 4), 4, plane); // 4: presumably log2(4 * 4) -- confirm against acEnergyVar
+        }
+        else // any other qgSize keeps the 8x8 block
+        {
+            ALIGN_VAR_8(pixel, pix[8 * 8]);
+            primitives.cu[BLOCK_8x8].copy_pp(pix, 8, src, srcStride);
+            return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(pix, 8), 6, plane);
+        }
     }
     else
-        return acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(src, srcStride), 8, plane);
+    {
+        if (qgSize == 8) // plane 0, or any plane of a 4:4:4 picture: 8x8 block read directly from src
+            return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(src, srcStride), 6, plane);
+        else // any other qgSize keeps the 16x16 block
+            return acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(src, srcStride), 8, plane);
+    }
 }
 
 } // end anonymous namespace
 
 /* Find the total AC energy of each block in all planes */
-uint32_t LookaheadTLD::acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp)
+uint32_t LookaheadTLD::acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp, uint32_t qgSize)
 {
     intptr_t stride = curFrame->m_fencPic->m_stride;
     intptr_t cStride = curFrame->m_fencPic->m_strideC;
@@ -82,11 +96,11 @@
 
     uint32_t var;
 
-    var  = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, csp);
+    var  = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, csp, qgSize);
     if (csp != X265_CSP_I400 && curFrame->m_fencPic->m_picCsp != X265_CSP_I400)
     {
-        var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp);
-        var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp);
+        var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp, qgSize);
+        var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp, qgSize);
     }
     x265_emms();
     return var;
@@ -97,7 +111,23 @@
     /* Actual adaptive quantization */
     int maxCol = curFrame->m_fencPic->m_picWidth;
     int maxRow = curFrame->m_fencPic->m_picHeight;
-    int blockCount = curFrame->m_lowres.maxBlocksInRow * curFrame->m_lowres.maxBlocksInCol;
+    int blockCount, loopIncr;
+    float modeOneConst, modeTwoConst;
+    if (param->rc.qgSize == 8)
+    {
+        blockCount = curFrame->m_lowres.maxBlocksInRowFullRes * curFrame->m_lowres.maxBlocksInColFullRes;
+        modeOneConst = 11.427f;
+        modeTwoConst = 8.f;
+        loopIncr = 8;
+    }
+    else
+    {
+        blockCount = widthInCU * heightInCU;
+        modeOneConst = 14.427f;
+        modeTwoConst = 11.f;
+        loopIncr = 16;
+    }
+    //int blockCount = curFrame->m_lowres.maxBlocksInRowFullRes * curFrame->m_lowres.maxBlocksInColFullRes;
 
     float* quantOffsets = curFrame->m_quantOffsets;
     for (int y = 0; y < 3; y++)
@@ -106,14 +136,14 @@
         curFrame->m_lowres.wp_sum[y] = 0;
     }
 
-    /* Calculate Qp offset for each 16x16 block in the frame */
+    /* Calculate Qp offset for each 16x16 or 8x8 block in the frame */
     int blockXY = 0;
     int blockX = 0, blockY = 0;
     double strength = 0.f;
     if (param->rc.aqMode == X265_AQ_NONE || param->rc.aqStrength == 0)
     {
         /* Need to init it anyways for CU tree */
-        int cuCount = widthInCU * heightInCU;
+        int cuCount = blockCount;
 
         if (param->rc.aqMode && param->rc.aqStrength == 0)
         {
@@ -137,9 +167,9 @@
         /* Need variance data for weighted prediction */
         if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
         {
-            for (blockY = 0; blockY < maxRow; blockY += 16)
-                for (blockX = 0; blockX < maxCol; blockX += 16)
-                    acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
+            for (blockY = 0; blockY < maxRow; blockY += loopIncr)
+                for (blockX = 0; blockX < maxCol; blockX += loopIncr)
+                    acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
         }
     }
     else
@@ -152,12 +182,12 @@
             double bit_depth_correction = 1.f / (1 << (2*(X265_DEPTH-8)));
             curFrame->m_lowres.frameVariance = 0;
             uint64_t rowVariance = 0;
-            for (blockY = 0; blockY < maxRow; blockY += 16)
+            for (blockY = 0; blockY < maxRow; blockY += loopIncr)
             {
                 rowVariance = 0;
-                for (blockX = 0; blockX < maxCol; blockX += 16)
+                for (blockX = 0; blockX < maxCol; blockX += loopIncr)
                 {
-                    uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
+                    uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
                     curFrame->m_lowres.blockVariance[blockXY] = energy;
                     rowVariance += energy;
                     qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
@@ -172,21 +202,21 @@
             avg_adj /= blockCount;
             avg_adj_pow2 /= blockCount;
             strength = param->rc.aqStrength * avg_adj;
-            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f)) / avg_adj;
+            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (modeTwoConst)) / avg_adj;
             bias_strength = param->rc.aqStrength;
         }
         else
             strength = param->rc.aqStrength * 1.0397f;
 
         blockXY = 0;
-        for (blockY = 0; blockY < maxRow; blockY += 16)
+        for (blockY = 0; blockY < maxRow; blockY += loopIncr)
         {
-            for (blockX = 0; blockX < maxCol; blockX += 16)
+            for (blockX = 0; blockX < maxCol; blockX += loopIncr)
             {
                 if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
                 {
                     qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
-                    qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - 11.f / (qp_adj * qp_adj));
+                    qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - modeTwoConst / (qp_adj * qp_adj));
                 }
                 else if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
                 {
@@ -195,8 +225,8 @@
                 }
                 else
                 {
-                    uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
-                    qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (14.427f + 2 * (X265_DEPTH - 8)));
+                    uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp,param->rc.qgSize);
+                    qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8)));
                 }
                 if (quantOffsets != NULL)
                     qp_adj += quantOffsets[blockXY];
@@ -227,7 +257,7 @@
     }
 }
 
-void LookaheadTLD::lowresIntraEstimate(Lowres& fenc)
+void LookaheadTLD::lowresIntraEstimate(Lowres& fenc, uint32_t qgSize)
 {
     ALIGN_VAR_32(pixel, prediction[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
     pixel fencIntra[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];
@@ -314,12 +344,21 @@
             fenc.lowresCosts[0][0][cuXY] = (uint16_t)(X265_MIN(icost, LOWRES_COST_MASK) | (0 << LOWRES_COST_SHIFT));
             fenc.intraCost[cuXY] = icost;
             fenc.intraMode[cuXY] = (uint8_t)ilowmode;
-
-            /* do not include edge blocks in the frame cost estimates, they are not very accurate */
+            /* do not include edge blocks in the frame cost estimates,
+             * they are not very accurate */
             const bool bFrameScoreCU = (cuX > 0 && cuX < widthInCU - 1 &&
                                         cuY > 0 && cuY < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
-
-            int icostAq = (bFrameScoreCU && fenc.invQscaleFactor) ? ((icost * fenc.invQscaleFactor[cuXY] + 128) >> 8) : icost;
+            int invQscaleFactor, icostAq;
+            if (qgSize == 8)
+            {
+                invQscaleFactor = (fenc.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4] +
+                                  fenc.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + 1] +
+                                  fenc.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + fenc.maxBlocksInRowFullRes] +
+                                  fenc.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + fenc.maxBlocksInRowFullRes + 1]) / 4;
+                icostAq = (bFrameScoreCU && fenc.invQscaleFactor) ? ((icost * invQscaleFactor + 128) >> 8) : icost;
+            }
+            else
+                icostAq = (bFrameScoreCU && fenc.invQscaleFactor) ? ((icost * fenc.invQscaleFactor[cuXY] +128) >> 8) : icost;
 
             if (bFrameScoreCU)
             {
@@ -812,9 +851,17 @@
                     uint16_t lowresCuCost = curFrame->m_lowres.lowresCostForRc[lowresCuIdx] & LOWRES_COST_MASK;
                     if (qp_offset)
                     {
-                        lowresCuCost = (uint16_t)((lowresCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8);
+                        double qpOffset;
+                        if (m_param->rc.qgSize == 8)
+                            qpOffset = (qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4] +
+                                       qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4 + 1] +
+                                       qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4 + curFrame->m_lowres.maxBlocksInRowFullRes] +
+                                       qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4 + curFrame->m_lowres.maxBlocksInRowFullRes + 1]) / 4;
+                        else
+                            qpOffset = qp_offset[lowresCuIdx];
+                        lowresCuCost = (uint16_t)((lowresCuCost * x265_exp2fix8(qpOffset) + 128) >> 8);
                         int32_t intraCuCost = curFrame->m_lowres.intraCost[lowresCuIdx];
-                        curFrame->m_lowres.intraCost[lowresCuIdx] = (intraCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8;
+                        curFrame->m_lowres.intraCost[lowresCuIdx] = (intraCuCost * x265_exp2fix8(qpOffset) + 128) >> 8;
                     }
                     if (m_param->bIntraRefresh && slice->m_sliceType == X265_TYPE_P)
                         for (uint32_t x = curFrame->m_encData->m_pir.pirStartCol; x <= curFrame->m_encData->m_pir.pirEndCol; x++)
@@ -850,7 +897,7 @@
             /* cu-tree offsets were read from stats file */;
         else if (m_lookahead.m_bAdaptiveQuant)
             tld.calcAdaptiveQuantFrame(preFrame, m_lookahead.m_param);
-        tld.lowresIntraEstimate(preFrame->m_lowres);
+        tld.lowresIntraEstimate(preFrame->m_lowres, m_lookahead.m_param->rc.qgSize);
         preFrame->m_lowresInit = true;
 
         m_lock.acquire();
@@ -1669,7 +1716,10 @@
         if (bIntra)
         {
             memset(frames[0]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
-            memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, m_cuCount * sizeof(double));
+            if (m_param->rc.qgSize == 8)
+                memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, m_cuCount * 4 * sizeof(double));
+            else
+                memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, m_cuCount * sizeof(double));
             return;
         }
         std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
@@ -1764,9 +1814,14 @@
     for (uint16_t blocky = 0; blocky < m_8x8Height; blocky++)
     {
         int cuIndex = blocky * strideInCU;
-        primitives.propagateCost(m_scratch, propagateCost,
-                                 frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
-                                 frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_8x8Width);
+        if (m_param->rc.qgSize == 8)
+            primitives.propagateCost(m_scratch, propagateCost,
+                       frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
+                       frames[b]->invQscaleFactor + (cuIndex * 4), &fpsFactor, m_8x8Width, m_param->rc.qgSize);
+        else
+            primitives.propagateCost(m_scratch, propagateCost,
+                       frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
+                       frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_8x8Width, m_param->rc.qgSize);
 
         if (referenced)
             propagateCost += m_8x8Width;
@@ -1852,14 +1907,43 @@
     if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0)
         weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]);
 
-    for (int cuIndex = 0; cuIndex < m_cuCount; cuIndex++)
+    if (m_param->rc.qgSize == 8)
     {
-        int intracost = (frame->intraCost[cuIndex] * frame->invQscaleFactor[cuIndex] + 128) >> 8;
-        if (intracost)
+        for (int cuY = 0; cuY < m_8x8Height; cuY++)
         {
-            int propagateCost = (frame->propagateCost[cuIndex] * fpsFactor + 128) >> 8;
-            double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta;
-            frame->qpCuTreeOffset[cuIndex] = frame->qpAqOffset[cuIndex] - m_cuTreeStrength * log2_ratio;
+            for (int cuX = 0; cuX < m_8x8Width; cuX++)
+            {
+                const int cuXY = cuX + cuY * m_8x8Width;
+                int invQscaleFactor = (frame->invQscaleFactor[cuX * 2 + cuY * m_8x8Width * 4] +
+                    frame->invQscaleFactor[cuX * 2 + cuY * m_8x8Width * 4 + 1] +
+                    frame->invQscaleFactor[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes] +
+                    frame->invQscaleFactor[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes + 1]) / 4;
+
+                int intracost = ((frame->intraCost[cuXY]) / 4 * invQscaleFactor + 128) >> 8;
+                if (intracost)
+                {
+                    int propagateCost = ((frame->propagateCost[cuXY]) / 4 * fpsFactor + 128) >> 8;
+                    double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta;
+                    frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4] - m_cuTreeStrength * (log2_ratio);
+                    frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4 + 1] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4 + 1] - m_cuTreeStrength * (log2_ratio);
+                    frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes] - m_cuTreeStrength * (log2_ratio);
+                    frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes + 1] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes + 1] - m_cuTreeStrength * (log2_ratio);
+                }
+
+            }
+        }
+    }
+    else
+    {
+        for (int cuIndex = 0; cuIndex < m_cuCount; cuIndex++)
+        {
+            int intracost = (frame->intraCost[cuIndex] * frame->invQscaleFactor[cuIndex] + 128) >> 8;
+            if (intracost)
+            {
+                int propagateCost = (frame->propagateCost[cuIndex] * fpsFactor + 128) >> 8;
+                double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta;
+                frame->qpCuTreeOffset[cuIndex] = frame->qpAqOffset[cuIndex] - m_cuTreeStrength * log2_ratio;
+            }
         }
     }
 }
@@ -1883,7 +1967,14 @@
         {
             int cuxy = cux + cuy * m_8x8Width;
             int cuCost = frames[b]->lowresCosts[b - p0][p1 - b][cuxy] & LOWRES_COST_MASK;
-            double qp_adj = qp_offset[cuxy];
+            double qp_adj;
+            if (m_param->rc.qgSize == 8)
+                qp_adj = (qp_offset[cux * 2 + cuy * m_8x8Width * 4] +
+                         qp_offset[cux * 2 + cuy * m_8x8Width * 4 + 1] +
+                         qp_offset[cux * 2 + cuy * m_8x8Width * 4 + frames[b]->maxBlocksInRowFullRes] +
+                         qp_offset[cux * 2 + cuy * m_8x8Width * 4 + frames[b]->maxBlocksInRowFullRes + 1]) / 4;
+            else 
+                qp_adj = qp_offset[cuxy];
             cuCost = (cuCost * x265_exp2fix8(qp_adj) + 128) >> 8;
             rowSatd[cuy] += cuCost;
             if ((cuy > 0 && cuy < m_8x8Height - 1 &&
@@ -2202,8 +2293,17 @@
     /* do not include edge blocks in the frame cost estimates, they are not very accurate */
     const bool bFrameScoreCU = (cuX > 0 && cuX < widthInCU - 1 &&
                                 cuY > 0 && cuY < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
-
-    int bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * fenc->invQscaleFactor[cuXY] + 128) >> 8) : bcost;
+    int invQscaleFactor, bcostAq;
+    if (m_lookahead.m_param->rc.qgSize == 8)
+    {
+        invQscaleFactor = (fenc->invQscaleFactor[cuX * 2 + cuY * widthInCU * 4] +
+                          fenc->invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + 1] +
+                          fenc->invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + fenc->maxBlocksInRowFullRes] +
+                          fenc->invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + fenc->maxBlocksInRowFullRes + 1]) / 4;
+        bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * invQscaleFactor + 128) >> 8) : bcost;
+    }
+    else
+        bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * fenc->invQscaleFactor[cuXY] +128) >> 8) : bcost;
 
     if (bFrameScoreCU)
     {
diff -r 215eedc9ecc0 -r ab205f07f87b source/encoder/slicetype.h
--- a/source/encoder/slicetype.h	Wed Aug 24 13:17:45 2016 +0530
+++ b/source/encoder/slicetype.h	Wed Aug 17 10:47:11 2016 +0530
@@ -84,13 +84,13 @@
     ~LookaheadTLD() { X265_FREE(wbuffer[0]); }
 
     void calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param);
-    void lowresIntraEstimate(Lowres& fenc);
+    void lowresIntraEstimate(Lowres& fenc, uint32_t qgSize);
 
     void weightsAnalyse(Lowres& fenc, Lowres& ref);
 
 protected:
 
-    uint32_t acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp);
+    uint32_t acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp, uint32_t qgSize);
     uint32_t weightCostLuma(Lowres& fenc, Lowres& ref, WeightParam& wp);
     bool     allocWeightedRef(Lowres& fenc);
 };
diff -r 215eedc9ecc0 -r ab205f07f87b source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Wed Aug 24 13:17:45 2016 +0530
+++ b/source/test/pixelharness.cpp	Wed Aug 17 10:47:11 2016 +0530
@@ -1387,8 +1387,8 @@
     {
         int width = 16 + rand() % 64;
         int index = i % TEST_CASES;
-        checked(opt, opt_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
-        ref(ref_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
+        checked(opt, opt_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width, 32);
+        ref(ref_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width, 32);
 
         if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
             return false;
@@ -3102,7 +3102,7 @@
     if (opt.propagateCost)
     {
         HEADER0("propagateCost");
-        REPORT_SPEEDUP(opt.propagateCost, ref.propagateCost, ibuf1, ushort_test_buff[0], int_test_buff[0], ushort_test_buff[0], int_test_buff[0], double_test_buff[0], 80);
+        REPORT_SPEEDUP(opt.propagateCost, ref.propagateCost, ibuf1, ushort_test_buff[0], int_test_buff[0], ushort_test_buff[0], int_test_buff[0], double_test_buff[0], 80, 32);
     }
 
     if (opt.fix8Pack)
diff -r 215eedc9ecc0 -r ab205f07f87b source/x265.h
--- a/source/x265.h	Wed Aug 24 13:17:45 2016 +0530
+++ b/source/x265.h	Wed Aug 17 10:47:11 2016 +0530
@@ -1155,7 +1155,7 @@
 
         /* Enable adaptive quantization at CU granularity. This parameter specifies
          * the minimum CU size at which QP can be adjusted, i.e. Quantization Group
-         * (QG) size. Allowed values are 64, 32, 16 provided it falls within the
+         * (QG) size. Allowed values are 64, 32, 16, 8 provided it falls within the
          * inclusuve range [maxCUSize, minCUSize]. Experimental, default: maxCUSize */
         uint32_t qgSize;
 
diff -r 215eedc9ecc0 -r ab205f07f87b source/x265cli.h
--- a/source/x265cli.h	Wed Aug 24 13:17:45 2016 +0530
+++ b/source/x265cli.h	Wed Aug 17 10:47:11 2016 +0530
@@ -386,7 +386,7 @@
     H0("   --analysis-file <filename>    Specify file name used for either dumping or reading analysis data.\n");
     H0("   --aq-mode <integer>           Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance 3:auto variance with bias to dark scenes. Default %d\n", param->rc.aqMode);
     H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
-    H0("   --qg-size <int>               Specifies the size of the quantization group (64, 32, 16). Default %d\n", param->rc.qgSize);
+    H0("   --qg-size <int>               Specifies the size of the quantization group (64, 32, 16, 8). Default %d\n", param->rc.qgSize);
     H0("   --[no-]cutree                 Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
     H0("   --[no-]rc-grain               Enable ratecontrol mode to handle grains specifically. turned on with tune grain. Default %s\n", OPT(param->rc.bEnableGrain));
     H1("   --ipratio <float>             QP factor between I and P. Default %.2f\n", param->rc.ipFactor);