[x265] [PATCH] new aq implementation

pooja at multicorewareinc.com pooja at multicorewareinc.com
Fri Dec 28 06:30:26 CET 2018

# HG changeset patch
# User Ashok Kumar Mishra <ashok at multicorewareinc.com>
# Date 1545902034 -19800
#      Thu Dec 27 14:43:54 2018 +0530
# Node ID 3cd0b5ed0b91bcb3d5d6cfa1395cb502fc6d01ca
# Parent  129416ec047966f7d7e7898fbe16110444b9a183
new aq implementation

This patch adds a new adaptive quantization (AQ) method. It scales the
quantization step size according to the spatial activity of a coding unit
relative to the frame's average spatial activity. The method uses the minimum
variance among the sub-units of each coding unit to represent that coding
unit's spatial complexity.

diff -r 129416ec0479 -r 3cd0b5ed0b91 doc/reST/cli.rst
--- a/doc/reST/cli.rst	Fri Dec 28 09:03:26 2018 +0530
+++ b/doc/reST/cli.rst	Thu Dec 27 14:43:54 2018 +0530
@@ -1646,6 +1646,21 @@
 	Default 1.0.
 	**Range of values:** 0.0 to 3.0
+.. option:: --hevc-aq
+	Enable adaptive quantization. Default disabled.
+	It scales the quantization step size according to the spatial activity of a
+	coding unit relative to the frame's average spatial activity. This AQ method uses
+	the minimum variance among the sub-units of each coding unit to represent that
+	coding unit's spatial complexity.
+.. option:: --qp-adaptation-range
+	Range of the delta QP applied by QP adaptation based on a psycho-visual model.
+	Default 1.0.
+	**Range of values:** 1.0 to 6.0
 .. option:: --aq-motion, --no-aq-motion
 	Adjust the AQ offsets based on the relative motion of each block with
diff -r 129416ec0479 -r 3cd0b5ed0b91 source/common/lowres.cpp
--- a/source/common/lowres.cpp	Fri Dec 28 09:03:26 2018 +0530
+++ b/source/common/lowres.cpp	Thu Dec 27 14:43:54 2018 +0530
@@ -2,6 +2,7 @@
  * Copyright (C) 2013-2017 MulticoreWare, Inc
  * Authors: Gopu Govindaswamy <gopu at multicorewareinc.com>
+ *          Ashok Kumar Mishra <ashok at multicorewareinc.com>
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -27,10 +28,31 @@
 using namespace X265_NS;
+bool PicQPAdaptationLayer::create(uint32_t width, uint32_t height, uint32_t partWidth, uint32_t partHeight, uint32_t numAQPartInWidthExt, uint32_t numAQPartInHeightExt)
+    aqPartWidth = partWidth;
+    aqPartHeight = partHeight;
+    numAQPartInWidth = (width + partWidth - 1) / partWidth;
+    numAQPartInHeight = (height + partHeight - 1) / partHeight;
+    CHECKED_MALLOC_ZERO(dActivity, double, numAQPartInWidthExt * numAQPartInHeightExt);
+    CHECKED_MALLOC_ZERO(dQpOffset, double, numAQPartInWidthExt * numAQPartInHeightExt);
+    CHECKED_MALLOC_ZERO(dCuTreeOffset, double, numAQPartInWidthExt * numAQPartInHeightExt);
+    if (bQpSize)
+        CHECKED_MALLOC_ZERO(dCuTreeOffset8x8, double, numAQPartInWidthExt * numAQPartInHeightExt);
+    return true;
+    return false;
 bool Lowres::create(x265_param* param, PicYuv *origPic, uint32_t qgSize)
     isLowres = true;
     bframes = param->bframes;
+    widthFullRes = origPic->m_picWidth;
+    heightFullRes = origPic->m_picHeight;
     width = origPic->m_picWidth / 2;
     lines = origPic->m_picHeight / 2;
     lumaStride = width + 2 * origPic->m_lumaMarginX;
@@ -49,7 +71,7 @@
     size_t planesize = lumaStride * (lines + 2 * origPic->m_lumaMarginY);
     size_t padoffset = lumaStride * origPic->m_lumaMarginY + origPic->m_lumaMarginX;
-    if (!!param->rc.aqMode)
+    if (!!param->rc.aqMode || !!param->rc.hevcAq)
         CHECKED_MALLOC_ZERO(qpAqOffset, double, cuCountFullRes);
         CHECKED_MALLOC_ZERO(invQscaleFactor, int, cuCountFullRes);
@@ -57,10 +79,50 @@
         if (qgSize == 8)
             CHECKED_MALLOC_ZERO(invQscaleFactor8x8, int, cuCount);
     if (origPic->m_param->bAQMotion)
         CHECKED_MALLOC_ZERO(qpAqMotionOffset, double, cuCountFullRes);
     if (origPic->m_param->bDynamicRefine)
         CHECKED_MALLOC_ZERO(blockVariance, uint32_t, cuCountFullRes);
+    if (!!param->rc.hevcAq)
+    {
+        m_maxCUSize = param->maxCUSize;
+        m_qgSize = qgSize;
+        uint32_t partWidth, partHeight, nAQPartInWidth, nAQPartInHeight;
+        pAQLayer = new PicQPAdaptationLayer[4];
+        maxAQDepth = 0;
+        for (uint32_t d = 0; d < 4; d++)
+        {
+            int ctuSizeIdx = 6 - g_log2Size[param->maxCUSize];
+            int aqDepth = g_log2Size[param->maxCUSize] - g_log2Size[qgSize];
+            if (!aqLayerDepth[ctuSizeIdx][aqDepth][d])
+                continue;
+            pAQLayer->minAQDepth = d;
+            partWidth = param->maxCUSize >> d;
+            partHeight = param->maxCUSize >> d;
+            if (minAQSize[ctuSizeIdx] == d)
+            {
+                pAQLayer[d].bQpSize = true;
+                nAQPartInWidth = maxBlocksInRow * 2;
+                nAQPartInHeight = maxBlocksInCol * 2;
+            }
+            else
+            {
+                pAQLayer[d].bQpSize = false;
+                nAQPartInWidth = (origPic->m_picWidth + partWidth - 1) / partWidth;
+                nAQPartInHeight = (origPic->m_picHeight + partHeight - 1) / partHeight;
+            }
+            maxAQDepth++;
+            pAQLayer[d].create(origPic->m_picWidth, origPic->m_picHeight, partWidth, partHeight, nAQPartInWidth, nAQPartInHeight);
+        }
+    }
     CHECKED_MALLOC(propagateCost, uint16_t, cuCount);
     /* allocate lowres buffers */
@@ -130,6 +192,25 @@
+    if (maxAQDepth > 0)
+    {
+        for (uint32_t d = 0; d < 4; d++)
+        {
+            int ctuSizeIdx = 6 - g_log2Size[m_maxCUSize];
+            int aqDepth = g_log2Size[m_maxCUSize] - g_log2Size[m_qgSize];
+            if (!aqLayerDepth[ctuSizeIdx][aqDepth][d])
+                continue;
+            X265_FREE(pAQLayer[d].dActivity);
+            X265_FREE(pAQLayer[d].dQpOffset);
+            X265_FREE(pAQLayer[d].dCuTreeOffset);
+            if (pAQLayer[d].bQpSize == true)
+                X265_FREE(pAQLayer[d].dCuTreeOffset8x8);
+        }
+        delete[] pAQLayer;
+    }
 // (re) initialize lowres state
 void Lowres::init(PicYuv *origPic, int poc)
diff -r 129416ec0479 -r 3cd0b5ed0b91 source/common/lowres.h
--- a/source/common/lowres.h	Fri Dec 28 09:03:26 2018 +0530
+++ b/source/common/lowres.h	Thu Dec 27 14:43:54 2018 +0530
@@ -103,6 +103,49 @@
+static const uint32_t aqLayerDepth[3][4][4] = {
+    {  // ctu size 64
+        { 1, 0, 1, 0 },
+        { 1, 1, 1, 0 },
+        { 1, 1, 1, 0 },
+        { 1, 1, 1, 1 }
+    },
+    {  // ctu size 32
+        { 1, 1, 0, 0 },
+        { 1, 1, 0, 0 },
+        { 1, 1, 1, 0 },
+        { 0, 0, 0, 0 },
+    },
+    {  // ctu size 16
+        { 1, 0, 0, 0 },
+        { 1, 1, 0, 0 },
+        { 0, 0, 0, 0 },
+        { 0, 0, 0, 0 }
+    }
+// min aq size for ctu size 64, 32 and 16
+static const uint32_t minAQSize[3] = { 3, 2, 1 };
+struct PicQPAdaptationLayer
+    uint32_t aqPartWidth;
+    uint32_t aqPartHeight;
+    uint32_t numAQPartInWidth;
+    uint32_t numAQPartInHeight;
+    uint32_t minAQDepth;
+    double*  dActivity;
+    double*  dQpOffset;
+    double*  dCuTreeOffset;
+    double*  dCuTreeOffset8x8;
+    double   dAvgActivity;
+    bool     bQpSize;
+    bool  create(uint32_t width, uint32_t height, uint32_t aqPartWidth, uint32_t aqPartHeight, uint32_t numAQPartInWidthExt, uint32_t numAQPartInHeightExt);
+    void  destroy();
 /* lowres buffers, sizes and strides */
 struct Lowres : public ReferencePlanes
@@ -154,6 +197,13 @@
     uint64_t  wp_sum[3];
     /* cutree intermediate data */
+    PicQPAdaptationLayer* pAQLayer;
+    uint32_t maxAQDepth;
+    uint32_t widthFullRes;
+    uint32_t heightFullRes;
+    uint32_t m_maxCUSize;
+    uint32_t m_qgSize;
     uint16_t* propagateCost;
     double    weightedCostDelta[X265_BFRAME_MAX + 2];
     ReferencePlanes weightedRef[X265_BFRAME_MAX + 2];
diff -r 129416ec0479 -r 3cd0b5ed0b91 source/common/param.cpp
--- a/source/common/param.cpp	Fri Dec 28 09:03:26 2018 +0530
+++ b/source/common/param.cpp	Thu Dec 27 14:43:54 2018 +0530
@@ -233,8 +233,10 @@
     param->rc.rateControlMode = X265_RC_CRF;
     param->rc.qp = 32;
     param->rc.aqMode = X265_AQ_AUTO_VARIANCE;
+    param->rc.hevcAq = 0;
     param->rc.qgSize = 32;
     param->rc.aqStrength = 1.0;
+    param->rc.qpAdaptationRange = 1.0;
     param->rc.cuTree = 1;
     param->rc.rfConstantMax = 0;
     param->rc.rfConstantMin = 0;
@@ -528,6 +530,7 @@
             param->rc.pbFactor = 1.0;
             param->rc.cuTree = 0;
             param->rc.aqMode = 0;
+            param->rc.hevcAq = 0;
             param->rc.qpStep = 1;
             param->rc.bEnableGrain = 1;
             param->bEnableRecursionSkip = 0;
@@ -1186,6 +1189,8 @@
         OPT("hrd-concat") p->bEnableHRDConcatFlag = atobool(value);
         OPT("refine-ctu-distortion") p->ctuDistortionRefine = atoi(value);
+        OPT("hevc-aq") p->rc.hevcAq = atobool(value);
+        OPT("qp-adaptation-range") p->rc.qpAdaptationRange = atof(value);
             return X265_PARAM_BAD_NAME;
@@ -1430,6 +1435,8 @@
           "Aq-Mode is out of range");
     CHECK(param->rc.aqStrength < 0 || param->rc.aqStrength > 3,
           "Aq-Strength is out of range");
+    CHECK(param->rc.qpAdaptationRange < 1.0f || param->rc.qpAdaptationRange > 6.0f,
+        "qp adaptation range is out of range");
     CHECK(param->deblockingFilterTCOffset < -6 || param->deblockingFilterTCOffset > 6,
           "deblocking filter tC offset must be in the range of -6 to +6");
     CHECK(param->deblockingFilterBetaOffset < -6 || param->deblockingFilterBetaOffset > 6,
@@ -1956,6 +1963,8 @@
     s += sprintf(s, " max-ausize-factor=%.1f", p->maxAUSizeFactor);
     BOOL(p->bDynamicRefine, "dynamic-refine");
     BOOL(p->bSingleSeiNal, "single-sei");
+    BOOL(p->rc.hevcAq, "hevc-aq");
+    s += sprintf(s, " qp-adaptation-range=%.2f", p->rc.qpAdaptationRange);
 #undef BOOL
     return buf;
diff -r 129416ec0479 -r 3cd0b5ed0b91 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp	Fri Dec 28 09:03:26 2018 +0530
+++ b/source/encoder/analysis.cpp	Thu Dec 27 14:43:54 2018 +0530
@@ -3556,10 +3556,39 @@
     return cuVariance / cnt;
+double Analysis::aqQPOffset(const CUData& ctu, const CUGeom& cuGeom)
+    uint32_t aqDepth = X265_MIN(cuGeom.depth, m_frame->m_lowres.maxAQDepth - 1);
+    PicQPAdaptationLayer* pQPLayer = &m_frame->m_lowres.pAQLayer[aqDepth];
+    uint32_t aqPosX = (ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]) / pQPLayer->aqPartWidth;
+    uint32_t aqPosY = (ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]) / pQPLayer->aqPartHeight;
+    uint32_t aqStride = pQPLayer->numAQPartInWidth;
+    double dQpOffset = pQPLayer->dQpOffset[aqPosY * aqStride + aqPosX];
+    return dQpOffset;
+double Analysis::cuTreeQPOffset(const CUData& ctu, const CUGeom& cuGeom)
+    uint32_t aqDepth = X265_MIN(cuGeom.depth, m_frame->m_lowres.maxAQDepth - 1);
+    PicQPAdaptationLayer* pcAQLayer = &m_frame->m_lowres.pAQLayer[aqDepth];
+    uint32_t aqPosX = (ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]) / pcAQLayer->aqPartWidth;
+    uint32_t aqPosY = (ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]) / pcAQLayer->aqPartHeight;
+    uint32_t aqStride = pcAQLayer->numAQPartInWidth;
+    double dQpOffset = pcAQLayer->dCuTreeOffset[aqPosY * aqStride + aqPosX];
+    return dQpOffset;
 int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, int32_t complexCheck, double baseQp)
     FrameData& curEncData = *m_frame->m_encData;
     double qp = baseQp >= 0 ? baseQp : curEncData.m_cuStat[ctu.m_cuAddr].baseQp;
+    bool bCuTreeOffset = IS_REFERENCED(m_frame) && m_param->rc.cuTree && !complexCheck;
     if ((m_param->analysisMultiPassDistortion && m_param->rc.bStatRead) || (m_param->ctuDistortionRefine && m_param->analysisLoad))
@@ -3577,40 +3606,60 @@
             return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int32_t)(qp + 0.5 + ((x265_analysis_inter_data*)m_frame->m_analysisData.interData)->cuQPOff[cuIdx]));
-    int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
-    /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
-    bool isReferenced = IS_REFERENCED(m_frame);
-    double *qpoffs = (isReferenced && m_param->rc.cuTree && !complexCheck) ? m_frame->m_lowres.qpCuTreeOffset :
-                                                                             m_frame->m_lowres.qpAqOffset;
-    if (qpoffs)
+    if (m_param->rc.hevcAq)
-        uint32_t width = m_frame->m_fencPic->m_picWidth;
-        uint32_t height = m_frame->m_fencPic->m_picHeight;
-        uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
-        uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
-        uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
-        uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
-        double qp_offset = 0;
-        uint32_t cnt = 0;
-        for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
+        /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
+        double dQpOffset = 0;
+        if (bCuTreeOffset)
-            for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
+            dQpOffset = cuTreeQPOffset(ctu, cuGeom);
+        }
+        else
+        {
+            dQpOffset = aqQPOffset(ctu, cuGeom);
+            if (complexCheck)
-                uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
-                qp_offset += qpoffs[idx];
-                cnt++;
+                int32_t offset = (int32_t)(dQpOffset * 100 + .5);
+                double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
+                int32_t max_threshold = (int32_t)(threshold * 100 + .5);
+                return (offset < max_threshold);
-        qp_offset /= cnt;
-        qp += qp_offset;
-        if (complexCheck)
+        qp += dQpOffset;
+    }
+    else
+    {
+        int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
+        /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
+        double *qpoffs = bCuTreeOffset ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
+        if (qpoffs)
-            int32_t offset = (int32_t)(qp_offset * 100 + .5);
-            double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
-            int32_t max_threshold = (int32_t)(threshold * 100 + .5);
-            return (offset < max_threshold);
+            uint32_t width = m_frame->m_fencPic->m_picWidth;
+            uint32_t height = m_frame->m_fencPic->m_picHeight;
+            uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
+            uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
+            uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
+            uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
+            double dQpOffset = 0;
+            uint32_t cnt = 0;
+            for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
+            {
+                for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
+                {
+                    uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
+                    dQpOffset += qpoffs[idx];
+                    cnt++;
+                }
+            }
+            dQpOffset /= cnt;
+            qp += dQpOffset;
+            if (complexCheck)
+            {
+                int32_t offset = (int32_t)(dQpOffset * 100 + .5);
+                double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
+                int32_t max_threshold = (int32_t)(threshold * 100 + .5);
+                return (offset < max_threshold);
+            }
diff -r 129416ec0479 -r 3cd0b5ed0b91 source/encoder/analysis.h
--- a/source/encoder/analysis.h	Fri Dec 28 09:03:26 2018 +0530
+++ b/source/encoder/analysis.h	Thu Dec 27 14:43:54 2018 +0530
@@ -201,7 +201,8 @@
     void classifyCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData);
     void trainCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData);
+    double aqQPOffset(const CUData& ctu, const CUGeom& cuGeom);
+    double cuTreeQPOffset(const CUData& ctu, const CUGeom& cuGeom);
     void calculateNormFactor(CUData& ctu, int qp);
     void normFactor(const pixel* src, uint32_t blockSize, CUData& ctu, int qp, TextType ttype);
diff -r 129416ec0479 -r 3cd0b5ed0b91 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Fri Dec 28 09:03:26 2018 +0530
+++ b/source/encoder/encoder.cpp	Thu Dec 27 14:43:54 2018 +0530
@@ -2839,6 +2839,12 @@
     if (p->rc.aqMode == X265_AQ_NONE && p->rc.cuTree == 0)
         p->rc.aqStrength = 0;
+    if (p->rc.hevcAq && p->rc.aqMode)
+    {
+        p->rc.aqMode = X265_AQ_NONE;
+        x265_log(p, X265_LOG_WARNING, "hevc-aq enabled, disabling other aq-modes\n");
+    }
     if (p->totalFrames && p->totalFrames <= 2 * ((float)p->fpsNum) / p->fpsDenom && p->rc.bStrictCbr)
         p->lookaheadDepth = p->totalFrames;
     if (p->bIntraRefresh)
diff -r 129416ec0479 -r 3cd0b5ed0b91 source/encoder/ratecontrol.cpp
--- a/source/encoder/ratecontrol.cpp	Fri Dec 28 09:03:26 2018 +0530
+++ b/source/encoder/ratecontrol.cpp	Thu Dec 27 14:43:54 2018 +0530
@@ -153,10 +153,7 @@
     int lowresCuHeight = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     m_ncu = lowresCuWidth * lowresCuHeight;
-    if (m_param->rc.cuTree)
-        m_qCompress = 1;
-    else
-        m_qCompress = m_param->rc.qCompress;
+    m_qCompress = (m_param->rc.cuTree && !m_param->rc.hevcAq) ? 1 : m_param->rc.qCompress;
     // validate for param->rc, maybe it is need to add a function like x265_parameters_valiate()
     m_residualFrames = 0;
@@ -381,13 +378,14 @@
     m_isGrainEnabled = false;
     if(m_param->rc.bEnableGrain) // tune for grainy content OR equal p-b frame sizes
-    m_isGrainEnabled = true;
+        m_isGrainEnabled = true;
     for (int i = 0; i < 3; i++)
-    m_lastQScaleFor[i] = x265_qp2qScale(m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN);
+        m_lastQScaleFor[i] = x265_qp2qScale(m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN);
     m_avgPFrameQp = 0 ;
     /* 720p videos seem to be a good cutoff for cplxrSum */
-    double tuneCplxFactor = (m_ncu > 3600 && m_param->rc.cuTree) ? 2.5 : m_isGrainEnabled ? 1.9 : 1;
+    double tuneCplxFactor = (m_ncu > 3600 && m_param->rc.cuTree && !m_param->rc.hevcAq) ? 2.5 : m_param->rc.hevcAq ? 1.5 : m_isGrainEnabled ? 1.9 : 1.0;
     /* estimated ratio that produces a reasonable QP for the first I-frame */
     m_cplxrSum = .01 * pow(7.0e5, m_qCompress) * pow(m_ncu, 0.5) * tuneCplxFactor;
     m_wantedBitsWindow = m_bitrate * m_frameDuration;
@@ -2563,7 +2561,7 @@
     double q;
-    if (m_param->rc.cuTree)
+    if (m_param->rc.cuTree && !m_param->rc.hevcAq)
         // Scale and units are obtained from rateNum and rateDenom for videos with fixed frame rates.
         double timescale = (double)m_param->fpsDenom / (2 * m_param->fpsNum);
@@ -2571,6 +2569,7 @@
         q = pow(rce->blurredComplexity, 1 - m_param->rc.qCompress);
     // avoid NaN's in the Rceq
     if (rce->coeffBits + rce->mvBits == 0)
         q = m_lastQScaleFor[rce->sliceType];
diff -r 129416ec0479 -r 3cd0b5ed0b91 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Fri Dec 28 09:03:26 2018 +0530
+++ b/source/encoder/slicetype.cpp	Thu Dec 27 14:43:54 2018 +0530
@@ -3,6 +3,7 @@
  * Authors: Gopu Govindaswamy <gopu at multicorewareinc.com>
  *          Steve Borho <steve at borho.org>
+ *          Ashok Kumar Mishra <ashok at multicorewareinc.com>
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -105,6 +106,7 @@
     return var;
 /* Find the sum of pixels of each block for luma plane */
 uint32_t LookaheadTLD::lumaSumCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, uint32_t qgSize)
@@ -121,6 +123,157 @@
     return (uint32_t)sum_ssd;
+void LookaheadTLD::xPreanalyzeQp(Frame* curFrame)
+    const uint32_t width = curFrame->m_fencPic->m_picWidth;
+    const uint32_t height = curFrame->m_fencPic->m_picHeight;
+    for (uint32_t d = 0; d < 4; d++)
+    {
+        int ctuSizeIdx = 6 - g_log2Size[curFrame->m_param->maxCUSize];
+        int aqDepth = g_log2Size[curFrame->m_param->maxCUSize] - g_log2Size[curFrame->m_param->rc.qgSize];
+        if (!aqLayerDepth[ctuSizeIdx][aqDepth][d])
+            continue;
+        PicQPAdaptationLayer* pcAQLayer = &curFrame->m_lowres.pAQLayer[d];
+        const uint32_t aqPartWidth = pcAQLayer->aqPartWidth;
+        const uint32_t aqPartHeight = pcAQLayer->aqPartHeight;
+        double* pcAQU = pcAQLayer->dActivity;
+        double* pcQP = pcAQLayer->dQpOffset;
+        double* pcCuTree = pcAQLayer->dCuTreeOffset;
+        for (uint32_t y = 0; y < height; y += aqPartHeight)
+        {
+            for (uint32_t x = 0; x < width; x += aqPartWidth, pcAQU++, pcQP++, pcCuTree++)
+            {
+                double dMaxQScale = pow(2.0, curFrame->m_param->rc.qpAdaptationRange / 6.0);
+                double dCUAct = *pcAQU;
+                double dAvgAct = pcAQLayer->dAvgActivity;
+                double dNormAct = (dMaxQScale*dCUAct + dAvgAct) / (dCUAct + dMaxQScale*dAvgAct);
+                double dQpOffset = (X265_LOG2(dNormAct) / X265_LOG2(2.0)) * 6.0;
+                *pcQP = dQpOffset;
+                *pcCuTree = dQpOffset;
+            }
+        }
+    }
+void LookaheadTLD::xPreanalyze(Frame* curFrame)
+    const uint32_t width = curFrame->m_fencPic->m_picWidth;
+    const uint32_t height = curFrame->m_fencPic->m_picHeight;
+    const intptr_t stride = curFrame->m_fencPic->m_stride;
+    for (uint32_t d = 0; d < 4; d++)
+    {
+        int ctuSizeIdx = 6 - g_log2Size[curFrame->m_param->maxCUSize];
+        int aqDepth = g_log2Size[curFrame->m_param->maxCUSize] - g_log2Size[curFrame->m_param->rc.qgSize];
+        if (!aqLayerDepth[ctuSizeIdx][aqDepth][d])
+            continue;
+        const pixel* src = curFrame->m_fencPic->m_picOrg[0];;
+        PicQPAdaptationLayer* pQPLayer = &curFrame->m_lowres.pAQLayer[d];
+        const uint32_t aqPartWidth = pQPLayer->aqPartWidth;
+        const uint32_t aqPartHeight = pQPLayer->aqPartHeight;
+        double* pcAQU = pQPLayer->dActivity;
+        double dSumAct = 0.0;
+        for (uint32_t y = 0; y < height; y += aqPartHeight)
+        {
+            const uint32_t currAQPartHeight = X265_MIN(aqPartHeight, height - y);
+            for (uint32_t x = 0; x < width; x += aqPartWidth, pcAQU++)
+            {
+                const uint32_t currAQPartWidth = X265_MIN(aqPartWidth, width - x);
+                const pixel* pBlkY = &src[x];
+                uint64_t sum[4] = { 0, 0, 0, 0 };
+                uint64_t sumSq[4] = { 0, 0, 0, 0 };
+                uint32_t by = 0;
+                for (; by < currAQPartHeight >> 1; by++)
+                {
+                    uint32_t bx = 0;
+                    for (; bx < currAQPartWidth >> 1; bx++)
+                    {
+                        sum[0] += pBlkY[bx];
+                        sumSq[0] += pBlkY[bx] * pBlkY[bx];
+                    }
+                    for (; bx < currAQPartWidth; bx++)
+                    {
+                        sum[1] += pBlkY[bx];
+                        sumSq[1] += pBlkY[bx] * pBlkY[bx];
+                    }
+                    pBlkY += stride;
+                }
+                for (; by < currAQPartHeight; by++)
+                {
+                    uint32_t bx = 0;
+                    for (; bx < currAQPartWidth >> 1; bx++)
+                    {
+                        sum[2] += pBlkY[bx];
+                        sumSq[2] += pBlkY[bx] * pBlkY[bx];
+                    }
+                    for (; bx < currAQPartWidth; bx++)
+                    {
+                        sum[3] += pBlkY[bx];
+                        sumSq[3] += pBlkY[bx] * pBlkY[bx];
+                    }
+                    pBlkY += stride;
+                }
+                assert((currAQPartWidth & 1) == 0);
+                assert((currAQPartHeight & 1) == 0);
+                const uint32_t pixelWidthOfQuadrants = currAQPartWidth >> 1;
+                const uint32_t pixelHeightOfQuadrants = currAQPartHeight >> 1;
+                const uint32_t numPixInAQPart = pixelWidthOfQuadrants * pixelHeightOfQuadrants;
+                double dMinVar = MAX_DOUBLE;
+                if (numPixInAQPart != 0)
+                {
+                    for (int i = 0; i < 4; i++)
+                    {
+                        const double dAverage = double(sum[i]) / numPixInAQPart;
+                        const double dVariance = double(sumSq[i]) / numPixInAQPart - dAverage * dAverage;
+                        dMinVar = X265_MIN(dMinVar, dVariance);
+                    }
+                }
+                else
+                {
+                    dMinVar = 0.0;
+                }
+                double dActivity = 1.0 + dMinVar;
+                *pcAQU = dActivity;
+                dSumAct += dActivity;
+            }
+            src += stride * currAQPartHeight;
+        }
+        const double dAvgAct = dSumAct / (pQPLayer->numAQPartInWidth * pQPLayer->numAQPartInHeight);
+        pQPLayer->dAvgActivity = dAvgAct;
+    }
+    xPreanalyzeQp(curFrame);
+    int minAQDepth = curFrame->m_lowres.pAQLayer->minAQDepth;
+    PicQPAdaptationLayer* pQPLayer = &curFrame->m_lowres.pAQLayer[minAQDepth];
+    const uint32_t aqPartWidth = pQPLayer->aqPartWidth;
+    const uint32_t aqPartHeight = pQPLayer->aqPartHeight;
+    double* pcQP = pQPLayer->dQpOffset;
+    // Use new qp offset values for qpAqOffset, qpCuTreeOffset and invQscaleFactor buffer
+    int blockXY = 0;
+    for (uint32_t y = 0; y < height; y += aqPartHeight)
+    {
+        for (uint32_t x = 0; x < width; x += aqPartWidth, pcQP++)
+        {
+            curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(*pcQP);
+            blockXY++;
+            acEnergyCu(curFrame, x, y, curFrame->m_param->internalCsp, curFrame->m_param->rc.qgSize);
+        }
+    }
 void LookaheadTLD::calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param)
     /* Actual adaptive quantization */
@@ -176,90 +329,99 @@
         if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
             for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
-                for (int blockX = 0; blockX < maxCol; blockX += loopIncr)                
-                    acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);                
+                for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
+                    acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
-        int blockXY = 0;
-        double avg_adj_pow2 = 0.f, avg_adj = 0.f, qp_adj = 0.f;
-        double bias_strength = 0.f, strength = 0.f;
-        if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
+        if (param->rc.hevcAq)
-            double bit_depth_correction = 1.f / (1 << (2*(X265_DEPTH-8)));            
-            for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
-            {                
-                for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
-                {
-                    uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);                    
-                    qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
-                    curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
-                    avg_adj += qp_adj;
-                    avg_adj_pow2 += qp_adj * qp_adj;
-                    blockXY++;
-                }
-            }
-            avg_adj /= blockCount;
-            avg_adj_pow2 /= blockCount;
-            strength = param->rc.aqStrength * avg_adj;
-            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - modeTwoConst) / avg_adj;
-            bias_strength = param->rc.aqStrength;
+            // New method for calculating variance and qp offset
+            xPreanalyze(curFrame);
-            strength = param->rc.aqStrength * 1.0397f;
-        blockXY = 0;
-        for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
-            for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
+            int blockXY = 0;
+            double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
+            double bias_strength = 0.f;
+            double strength = 0.f;
+            if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
-                if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
-                {
-                    qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
-                    qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - modeTwoConst / (qp_adj * qp_adj));
-                }
-                else if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
-                {
-                    qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
-                    qp_adj = strength * (qp_adj - avg_adj);
-                }
-                else
-                {
-                    uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp,param->rc.qgSize);
-                    qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8)));                    
-                }
-                if (param->bHDROpt)
+                double bit_depth_correction = 1.f / (1 << (2 * (X265_DEPTH - 8)));
+                for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
-                    uint32_t sum = lumaSumCu(curFrame, blockX, blockY, param->rc.qgSize);
-                    uint32_t lumaAvg = sum / (loopIncr * loopIncr);
-                    if (lumaAvg < 301)
-                        qp_adj += 3;
-                    else if (lumaAvg >= 301 && lumaAvg < 367)
-                        qp_adj += 2;
-                    else if (lumaAvg >= 367 && lumaAvg < 434)
-                        qp_adj += 1;
-                    else if (lumaAvg >= 501 && lumaAvg < 567)
-                        qp_adj -= 1;
-                    else if (lumaAvg >= 567 && lumaAvg < 634)
-                        qp_adj -= 2;
-                    else if (lumaAvg >= 634 && lumaAvg < 701)
-                        qp_adj -= 3;
-                    else if (lumaAvg >= 701 && lumaAvg < 767)
-                        qp_adj -= 4;
-                    else if (lumaAvg >= 767 && lumaAvg < 834)
-                        qp_adj -= 5;
-                    else if (lumaAvg >= 834)
-                        qp_adj -= 6;
+                    for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
+                    {
+                        uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
+                        qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
+                        curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
+                        avg_adj += qp_adj;
+                        avg_adj_pow2 += qp_adj * qp_adj;
+                        blockXY++;
+                    }
-                if (quantOffsets != NULL)
-                    qp_adj += quantOffsets[blockXY];
-                curFrame->m_lowres.qpAqOffset[blockXY] = qp_adj;
-                curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
-                curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(qp_adj);
-                blockXY++;
+                avg_adj /= blockCount;
+                avg_adj_pow2 /= blockCount;
+                strength = param->rc.aqStrength * avg_adj;
+                avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - modeTwoConst) / avg_adj;
+                bias_strength = param->rc.aqStrength;
+            }
+            else
+                strength = param->rc.aqStrength * 1.0397f;
+            blockXY = 0;
+            for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
+            {
+                for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
+                {
+                    if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
+                    {
+                        qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
+                        qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - modeTwoConst / (qp_adj * qp_adj));
+                    }
+                    else if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
+                    {
+                        qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
+                        qp_adj = strength * (qp_adj - avg_adj);
+                    }
+                    else
+                    {
+                        uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
+                        qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8)));
+                    }
+                    if (param->bHDROpt)
+                    {
+                        uint32_t sum = lumaSumCu(curFrame, blockX, blockY, param->rc.qgSize);
+                        uint32_t lumaAvg = sum / (loopIncr * loopIncr);
+                        if (lumaAvg < 301)
+                            qp_adj += 3;
+                        else if (lumaAvg >= 301 && lumaAvg < 367)
+                            qp_adj += 2;
+                        else if (lumaAvg >= 367 && lumaAvg < 434)
+                            qp_adj += 1;
+                        else if (lumaAvg >= 501 && lumaAvg < 567)
+                            qp_adj -= 1;
+                        else if (lumaAvg >= 567 && lumaAvg < 634)
+                            qp_adj -= 2;
+                        else if (lumaAvg >= 634 && lumaAvg < 701)
+                            qp_adj -= 3;
+                        else if (lumaAvg >= 701 && lumaAvg < 767)
+                            qp_adj -= 4;
+                        else if (lumaAvg >= 767 && lumaAvg < 834)
+                            qp_adj -= 5;
+                        else if (lumaAvg >= 834)
+                            qp_adj -= 6;
+                    }
+                    if (quantOffsets != NULL)
+                        qp_adj += quantOffsets[blockXY];
+                    curFrame->m_lowres.qpAqOffset[blockXY] = qp_adj;
+                    curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
+                    curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(qp_adj);
+                    blockXY++;
+                }
@@ -301,11 +463,13 @@
         int blockXY = 0;
         for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
+        {
             for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
                 curFrame->m_lowres.blockVariance[blockXY] = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
+        }
@@ -596,13 +760,16 @@
     /* Allow the strength to be adjusted via qcompress, since the two concepts
      * are very similar. */
-    m_cuTreeStrength = 5.0 * (1.0 - m_param->rc.qCompress);
+    m_cuTreeStrength = (m_param->rc.hevcAq ? 6.0 : 5.0) * (1.0 - m_param->rc.qCompress);
     m_lastKeyframe = -m_param->keyframeMax;
     m_sliceTypeBusy = false;
     m_fullQueueSize = X265_MAX(1, m_param->lookaheadDepth);
-    m_bAdaptiveQuant = m_param->rc.aqMode || m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred || m_param->bAQMotion;
+    m_bAdaptiveQuant = m_param->rc.aqMode ||
+                       m_param->bEnableWeightedPred ||
+                       m_param->bEnableWeightedBiPred ||
+                       m_param->bAQMotion ||
+                       m_param->rc.hevcAq;
     /* If we have a thread pool and are using --b-adapt 2, it is generally
      * preferable to perform all motion searches for each lowres frame in large
@@ -919,6 +1086,7 @@
     if (!m_param->analysisLoad || !m_param->bDisableLookahead)
         X265_CHECK(curFrame->m_lowres.costEst[b - p0][p1 - b] > 0, "Slice cost not estimated\n")
         if (m_param->rc.cuTree && !m_param->rc.bStatRead)
             /* update row satds based on cutree offsets */
             curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b);
@@ -1695,6 +1863,7 @@
     if (m_param->rc.cuTree)
         cuTree(frames, X265_MIN(numFrames, m_param->keyframeMax), bKeyframe);
     if (m_param->gopLookahead && (keyFrameLimit >= 0) && (keyFrameLimit <= m_param->bframes + 1) && !m_extendGopBoundary)
         keyintLimit = keyFrameLimit;
@@ -1928,6 +2097,7 @@
     return cost;
 void Lookahead::aqMotion(Lowres **frames, bool bIntra)
     if (!bIntra)
@@ -2223,44 +2393,191 @@
         cuTreeFinish(frames[b], averageDuration, b == p1 ? b - p0 : 0);
-void Lookahead::cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance)
+void Lookahead::computeCUTreeQpOffset(Lowres *frame, double averageDuration, int ref0Distance)
     int fpsFactor = (int)(CLIP_DURATION(averageDuration) / CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) * 256);
+    uint32_t loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
     double weightdelta = 0.0;
     if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0)
         weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]);
+    uint32_t widthFullRes = frame->widthFullRes;
+    uint32_t heightFullRes = frame->heightFullRes;
     if (m_param->rc.qgSize == 8)
+        int minAQDepth = frame->pAQLayer->minAQDepth;
+        PicQPAdaptationLayer* pQPLayerMin = &frame->pAQLayer[minAQDepth];
+        double* pcCuTree8x8 = pQPLayerMin->dCuTreeOffset8x8;
         for (int cuY = 0; cuY < m_8x8Height; cuY++)
             for (int cuX = 0; cuX < m_8x8Width; cuX++)
                 const int cuXY = cuX + cuY * m_8x8Width;
-                int intracost = ((frame->intraCost[cuXY]) / 4 * frame->invQscaleFactor8x8[cuXY] + 128) >> 8;
+                int intracost = ((frame->intraCost[cuXY] / 4) * frame->invQscaleFactor8x8[cuXY] + 128) >> 8;
                 if (intracost)
-                    int propagateCost = ((frame->propagateCost[cuXY]) / 4 * fpsFactor + 128) >> 8;
+                    int propagateCost = ((frame->propagateCost[cuXY] / 4)  * fpsFactor + 128) >> 8;
                     double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta;
-                    frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4] - m_cuTreeStrength * (log2_ratio);
-                    frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4 + 1] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4 + 1] - m_cuTreeStrength * (log2_ratio);
-                    frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes] - m_cuTreeStrength * (log2_ratio);
-                    frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes + 1] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes + 1] - m_cuTreeStrength * (log2_ratio);
+                    pcCuTree8x8[cuX * 2 + cuY * m_8x8Width * 4] = log2_ratio;
+                    pcCuTree8x8[cuX * 2 + cuY * m_8x8Width * 4 + 1] = log2_ratio;
+                    pcCuTree8x8[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes] = log2_ratio;
+                    pcCuTree8x8[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes + 1] = log2_ratio;
+                }
+            }
+        }
+        for (uint32_t d = 0; d < 4; d++)
+        {
+            int ctuSizeIdx = 6 - g_log2Size[m_param->maxCUSize];
+            int aqDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];
+            if (!aqLayerDepth[ctuSizeIdx][aqDepth][d])
+                continue;
+            PicQPAdaptationLayer* pQPLayer = &frame->pAQLayer[d];
+            const uint32_t aqPartWidth = pQPLayer->aqPartWidth;
+            const uint32_t aqPartHeight = pQPLayer->aqPartHeight;
+            const uint32_t numAQPartInWidth = pQPLayer->numAQPartInWidth;
+            const uint32_t numAQPartInHeight = pQPLayer->numAQPartInHeight;
+            double* pcQP = pQPLayer->dQpOffset;
+            double* pcCuTree = pQPLayer->dCuTreeOffset;
+            uint32_t maxCols = frame->maxBlocksInRowFullRes;
+            for (uint32_t y = 0; y < numAQPartInHeight; y++)
+            {
+                for (uint32_t x = 0; x < numAQPartInWidth; x++, pcQP++, pcCuTree++)
+                {
+                    uint32_t block_x = x * aqPartWidth;
+                    uint32_t block_y = y * aqPartHeight;
+                    uint32_t blockXY = 0;
+                    double log2_ratio = 0;
+                    for (uint32_t block_yy = block_y; block_yy < block_y + aqPartHeight && block_yy < heightFullRes; block_yy += loopIncr)
+                    {
+                        for (uint32_t block_xx = block_x; block_xx < block_x + aqPartWidth && block_xx < widthFullRes; block_xx += loopIncr)
+                        {
+                            uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
+                            log2_ratio += *(pcCuTree8x8 + idx);
+                            blockXY++;
+                        }
+                    }
+                    double qp_offset = (m_cuTreeStrength * log2_ratio) / blockXY;
+                    *pcCuTree = *pcQP - qp_offset;
-        for (int cuIndex = 0; cuIndex < m_cuCount; cuIndex++)
+        for (uint32_t d = 0; d < 4; d++)
-            int intracost = (frame->intraCost[cuIndex] * frame->invQscaleFactor[cuIndex] + 128) >> 8;
-            if (intracost)
+            int ctuSizeIdx = 6 - g_log2Size[m_param->maxCUSize];
+            int aqDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];
+            if (!aqLayerDepth[ctuSizeIdx][aqDepth][d])
+                continue;
+            PicQPAdaptationLayer* pQPLayer = &frame->pAQLayer[d];
+            const uint32_t aqPartWidth = pQPLayer->aqPartWidth;
+            const uint32_t aqPartHeight = pQPLayer->aqPartHeight;
+            const uint32_t numAQPartInWidth = pQPLayer->numAQPartInWidth;
+            const uint32_t numAQPartInHeight = pQPLayer->numAQPartInHeight;
+            double* pcQP = pQPLayer->dQpOffset;
+            double* pcCuTree = pQPLayer->dCuTreeOffset;
+            uint32_t maxCols = frame->maxBlocksInRow;
+            for (uint32_t y = 0; y < numAQPartInHeight; y++)
-                int propagateCost = (frame->propagateCost[cuIndex] * fpsFactor + 128) >> 8;
-                double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta;
-                frame->qpCuTreeOffset[cuIndex] = frame->qpAqOffset[cuIndex] - m_cuTreeStrength * log2_ratio;
+                for (uint32_t x = 0; x < numAQPartInWidth; x++, pcQP++, pcCuTree++)
+                {
+                    uint32_t block_x = x * aqPartWidth;
+                    uint32_t block_y = y * aqPartHeight;
+                    uint32_t blockXY = 0;
+                    double log2_ratio = 0;
+                    for (uint32_t block_yy = block_y; block_yy < block_y + aqPartHeight && block_yy < heightFullRes; block_yy += loopIncr)
+                    {
+                        for (uint32_t block_xx = block_x; block_xx < block_x + aqPartWidth && block_xx < widthFullRes; block_xx += loopIncr)
+                        {
+                            uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
+                            int intraCost = (frame->intraCost[idx] * frame->invQscaleFactor[idx] + 128) >> 8;
+                            int propagateCost = (frame->propagateCost[idx] * fpsFactor + 128) >> 8;
+                            log2_ratio += (X265_LOG2(intraCost + propagateCost) - X265_LOG2(intraCost) + weightdelta);
+                            blockXY++;
+                        }
+                    }
+                    double qp_offset = (m_cuTreeStrength * log2_ratio) / blockXY;
+                    *pcCuTree = *pcQP - qp_offset;
+                }
+            }
+        }
+    }
+void Lookahead::cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance)
+    if (m_param->rc.hevcAq)
+    {
+        computeCUTreeQpOffset(frame, averageDuration, ref0Distance);
+    }
+    else
+    {
+        int fpsFactor = (int)(CLIP_DURATION(averageDuration) / CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) * 256);
+        double weightdelta = 0.0;
+        if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0)
+            weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]);
+        if (m_param->rc.qgSize == 8)
+        {
+            for (int cuY = 0; cuY < m_8x8Height; cuY++)
+            {
+                for (int cuX = 0; cuX < m_8x8Width; cuX++)
+                {
+                    const int cuXY = cuX + cuY * m_8x8Width;
+                    int intracost = ((frame->intraCost[cuXY]) / 4 * frame->invQscaleFactor8x8[cuXY] + 128) >> 8;
+                    if (intracost)
+                    {
+                        int propagateCost = ((frame->propagateCost[cuXY]) / 4 * fpsFactor + 128) >> 8;
+                        double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta;
+                        frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4] - m_cuTreeStrength * (log2_ratio);
+                        frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4 + 1] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4 + 1] - m_cuTreeStrength * (log2_ratio);
+                        frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes] - m_cuTreeStrength * (log2_ratio);
+                        frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes + 1] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes + 1] - m_cuTreeStrength * (log2_ratio);
+                    }
+                }
+            }
+        }
+        else
+        {
+            for (int cuIndex = 0; cuIndex < m_cuCount; cuIndex++)
+            {
+                int intracost = (frame->intraCost[cuIndex] * frame->invQscaleFactor[cuIndex] + 128) >> 8;
+                if (intracost)
+                {
+                    int propagateCost = (frame->propagateCost[cuIndex] * fpsFactor + 128) >> 8;
+                    double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta;
+                    frame->qpCuTreeOffset[cuIndex] = frame->qpAqOffset[cuIndex] - m_cuTreeStrength * log2_ratio;
+                }
@@ -2275,31 +2592,71 @@
     int64_t score = 0;
     int *rowSatd = frames[b]->rowSatds[b - p0][p1 - b];
-    double *qp_offset = frames[b]->qpCuTreeOffset;
-    for (int cuy = m_8x8Height - 1; cuy >= 0; cuy--)
+    if (m_param->rc.hevcAq)
-        rowSatd[cuy] = 0;
-        for (int cux = m_8x8Width - 1; cux >= 0; cux--)
+        int minAQDepth = frames[b]->pAQLayer->minAQDepth;
+        PicQPAdaptationLayer* pQPLayer = &frames[b]->pAQLayer[minAQDepth];
+        double* pcQPCuTree = pQPLayer->dCuTreeOffset;
+        // Use new qp offset values for qpAqOffset, qpCuTreeOffset and invQscaleFactor buffer
+        for (int cuy = m_8x8Height - 1; cuy >= 0; cuy--)
-            int cuxy = cux + cuy * m_8x8Width;
-            int cuCost = frames[b]->lowresCosts[b - p0][p1 - b][cuxy] & LOWRES_COST_MASK;
-            double qp_adj;
-            if (m_param->rc.qgSize == 8)
-                qp_adj = (qp_offset[cux * 2 + cuy * m_8x8Width * 4] +
-                          qp_offset[cux * 2 + cuy * m_8x8Width * 4 + 1] +
-                          qp_offset[cux * 2 + cuy * m_8x8Width * 4 + frames[b]->maxBlocksInRowFullRes] +
-                          qp_offset[cux * 2 + cuy * m_8x8Width * 4 + frames[b]->maxBlocksInRowFullRes + 1]) / 4;
-            else 
-                qp_adj = qp_offset[cuxy];
-            cuCost = (cuCost * x265_exp2fix8(qp_adj) + 128) >> 8;
-            rowSatd[cuy] += cuCost;
-            if ((cuy > 0 && cuy < m_8x8Height - 1 &&
-                 cux > 0 && cux < m_8x8Width - 1) ||
-                m_8x8Width <= 2 || m_8x8Height <= 2)
+            rowSatd[cuy] = 0;
+            for (int cux = m_8x8Width - 1; cux >= 0; cux--)
-                score += cuCost;
+                int cuxy = cux + cuy * m_8x8Width;
+                int cuCost = frames[b]->lowresCosts[b - p0][p1 - b][cuxy] & LOWRES_COST_MASK;
+                double qp_adj;
+                if (m_param->rc.qgSize == 8)
+                    qp_adj = (pcQPCuTree[cux * 2 + cuy * m_8x8Width * 4] +
+                    pcQPCuTree[cux * 2 + cuy * m_8x8Width * 4 + 1] +
+                    pcQPCuTree[cux * 2 + cuy * m_8x8Width * 4 + frames[b]->maxBlocksInRowFullRes] +
+                    pcQPCuTree[cux * 2 + cuy * m_8x8Width * 4 + frames[b]->maxBlocksInRowFullRes + 1]) / 4;
+                else
+                    qp_adj = *(pcQPCuTree + cuxy);
+                cuCost = (cuCost * x265_exp2fix8(qp_adj) + 128) >> 8;
+                rowSatd[cuy] += cuCost;
+                if ((cuy > 0 && cuy < m_8x8Height - 1 &&
+                    cux > 0 && cux < m_8x8Width - 1) ||
+                    m_8x8Width <= 2 || m_8x8Height <= 2)
+                {
+                    score += cuCost;
+                }
+            }
+        }
+    }
+    else
+    {
+        double *qp_offset = frames[b]->qpCuTreeOffset;
+        for (int cuy = m_8x8Height - 1; cuy >= 0; cuy--)
+        {
+            rowSatd[cuy] = 0;
+            for (int cux = m_8x8Width - 1; cux >= 0; cux--)
+            {
+                int cuxy = cux + cuy * m_8x8Width;
+                int cuCost = frames[b]->lowresCosts[b - p0][p1 - b][cuxy] & LOWRES_COST_MASK;
+                double qp_adj;
+                if (m_param->rc.qgSize == 8)
+                    qp_adj = (qp_offset[cux * 2 + cuy * m_8x8Width * 4] +
+                    qp_offset[cux * 2 + cuy * m_8x8Width * 4 + 1] +
+                    qp_offset[cux * 2 + cuy * m_8x8Width * 4 + frames[b]->maxBlocksInRowFullRes] +
+                    qp_offset[cux * 2 + cuy * m_8x8Width * 4 + frames[b]->maxBlocksInRowFullRes + 1]) / 4;
+                else 
+                    qp_adj = qp_offset[cuxy];
+                cuCost = (cuCost * x265_exp2fix8(qp_adj) + 128) >> 8;
+                rowSatd[cuy] += cuCost;
+                if ((cuy > 0 && cuy < m_8x8Height - 1 &&
+                    cux > 0 && cux < m_8x8Width - 1) ||
+                    m_8x8Width <= 2 || m_8x8Height <= 2)
+                {
+                    score += cuCost;
+                }
diff -r 129416ec0479 -r 3cd0b5ed0b91 source/encoder/slicetype.h
--- a/source/encoder/slicetype.h	Fri Dec 28 09:03:26 2018 +0530
+++ b/source/encoder/slicetype.h	Thu Dec 27 14:43:54 2018 +0530
@@ -87,7 +87,8 @@
     void lowresIntraEstimate(Lowres& fenc, uint32_t qgSize);
     void weightsAnalyse(Lowres& fenc, Lowres& ref);
+    void xPreanalyze(Frame* curFrame);
+    void xPreanalyzeQp(Frame* curFrame);
     uint32_t acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp, uint32_t qgSize);
@@ -175,6 +176,7 @@
     void    cuTree(Lowres **frames, int numframes, bool bintra);
     void    estimateCUPropagate(Lowres **frames, double average_duration, int p0, int p1, int b, int referenced);
     void    cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance);
+    void    computeCUTreeQpOffset(Lowres *frame, double averageDuration, int ref0Distance);
     /* called by getEstimatedPictureCost() to finalize cuTree costs */
     int64_t frameCostRecalculate(Lowres **frames, int p0, int p1, int b);
diff -r 129416ec0479 -r 3cd0b5ed0b91 source/x265.h
--- a/source/x265.h	Fri Dec 28 09:03:26 2018 +0530
+++ b/source/x265.h	Thu Dec 27 14:43:54 2018 +0530
@@ -1347,10 +1347,22 @@
          * generally improves. Default: X265_AQ_AUTO_VARIANCE */
         int       aqMode;
+        /*
+         * Enable HEVC adaptive quantization.
+         * It scales the quantization step size according to the spatial activity of a
+         * coding unit relative to the frame's average spatial activity. This AQ method
+         * uses the minimum variance among the sub-units of each coding unit to represent
+         * that coding unit's spatial complexity. */
+        int       hevcAq;
         /* Sets the strength of AQ bias towards low detail CTUs. Valid only if
          * AQ is enabled. Default value: 1.0. Acceptable values between 0.0 and 3.0 */
         double    aqStrength;
+        /* Delta-QP range applied by QP adaptation based on a psycho-visual model.
+         * Default value: 1.0. Acceptable values between 1.0 and 6.0 */
+        double    qpAdaptationRange;
         /* Sets the maximum rate the VBV buffer should be assumed to refill at
          * Default is zero */
         int       vbvMaxBitrate;
diff -r 129416ec0479 -r 3cd0b5ed0b91 source/x265cli.h
--- a/source/x265cli.h	Fri Dec 28 09:03:26 2018 +0530
+++ b/source/x265cli.h	Thu Dec 27 14:43:54 2018 +0530
@@ -311,6 +311,9 @@
     { "dolby-vision-rpu", required_argument, NULL, 0 },
     { "hrd-concat",          no_argument, NULL, 0},
     { "no-hrd-concat",       no_argument, NULL, 0 },
+    { "hevc-aq", no_argument, NULL, 0 },
+    { "no-hevc-aq", no_argument, NULL, 0 },
+    { "qp-adaptation-range", required_argument, NULL, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
@@ -361,7 +364,7 @@
     H0("   --dhdr10-info <filename>      JSON file containing the Creative Intent Metadata to be encoded as Dynamic Tone Mapping\n");
     H0("   --[no-]dhdr10-opt             Insert tone mapping SEI only for IDR frames and when the tone mapping information changes. Default disabled\n");
-    H0("   --dolby-vision-profile <float|integer> Specifies Dolby Vision profile ID. Currently only profile 5, profile 8.1 and profile 8.2 enabled. Specified as '5' or '50'. Default 0 (disabled).\n");
+    H0("   --dolby-vision-profile <float|integer> Specifies Dolby Vision profile ID. Currently only profile 5, profile 8.1 and profile 8.2 enabled. Specified as '5' or '50'. Default 0 (disabled).\n");
     H0("   --dolby-vision-rpu <filename> File containing Dolby Vision RPU metadata.\n"
        "                                 If given, x265's Dolby Vision metadata parser will fill the RPU field of input pictures with the metadata read from the file. Default NULL(disabled).\n");
     H0("   --nalu-file <filename>        Text file containing SEI messages in the following format : <POC><space><PREFIX><space><NAL UNIT TYPE>/<SEI TYPE><space><SEI Payload>\n");
@@ -521,7 +524,9 @@
         "                                    - 1 : Store/Load ctu distortion to/from the file specified in analysis-save/load.\n"
         "                                Default 0 - Disabled\n");
     H0("   --aq-mode <integer>           Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance 3:auto variance with bias to dark scenes. Default %d\n", param->rc.aqMode);
+    H0("   --[no-]hevc-aq                Mode for HEVC Adaptive Quantization. Default %s\n", OPT(param->rc.hevcAq));
     H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
+    H0("   --qp-adaptation-range <float> Delta QP range by QP adaptation based on a psycho-visual model (1.0 to 6.0). Default %.2f\n", param->rc.qpAdaptationRange);
     H0("   --[no-]aq-motion              Adaptive Quantization based on the relative motion of each CU w.r.t., frame. Default %s\n", OPT(param->bOptCUDeltaQP));
     H0("   --qg-size <int>               Specifies the size of the quantization group (64, 32, 16, 8). Default %d\n", param->rc.qgSize);
     H0("   --[no-]cutree                 Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
-------------- next part --------------
A non-text attachment was scrubbed...
Name: x265_stable.patch
Type: text/x-patch
Size: 57784 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20181228/4e57e7af/attachment-0001.bin>

More information about the x265-devel mailing list