[x265] [PATCH] lowres: Enhanced scaling

Tue Jul 22 11:55:11 CEST 2014

# HG changeset patch
# User Nicolas Morey-Chaisemartin <nmorey at kalray.eu>
# Date 1406020650 -7200
#      Tue Jul 22 11:17:30 2014 +0200
# Node ID fc75f5f4f85e0d9441dc73b09ec6aaaa0a36c20f
# Parent  4c9ce4db74d1c9768abc61290bd1bda002b79f4e
lowres: Enhanced scaling

  * Replace hard-coded values with X265_LOWRES_CU_SIZE
  * Add X265_LOWRES_SCALE define to tweak divider for LowRes

Note: X265_LOWRES_SCALE * X265_LOWRES_CU_SIZE must be less than or equal to 64 so that the standard filters can be used for weight prediction

Performance Impact:
Command Line:
./x265/build/x265     --preset medium  --accel=none  red_kayak_1080p-420.y4m kayak.hevc --bitrate=$BITRATE --ssim

- BITRATE=4000 X265_LOWRES_SCALE=2
encoded 570 frames in 105.56s (5.40 fps), 3334.27 kb/s, SSIM Mean Y: 0.8900527 ( 9.588 dB)
- BITRATE=4000 X265_LOWRES_SCALE=4
encoded 570 frames in 87.11s (6.54 fps), 3398.38 kb/s, SSIM Mean Y: 0.8836753 ( 9.343 dB)
- BITRATE=4000 X265_LOWRES_SCALE=8
encoded 570 frames in 79.71s (7.15 fps), 3437.19 kb/s, SSIM Mean Y: 0.8765783 ( 9.086 dB)

- BITRATE=9000 X265_LOWRES_SCALE=2
encoded 570 frames in 115.32s (4.94 fps), 7263.50 kb/s, SSIM Mean Y: 0.9272905 (11.384 dB)
- BITRATE=9000 X265_LOWRES_SCALE=4
encoded 570 frames in 101.53s (5.61 fps), 7439.24 kb/s, SSIM Mean Y: 0.9209998 (11.024 dB)
- BITRATE=9000 X265_LOWRES_SCALE=8
encoded 570 frames in 92.98s (6.13 fps), 7549.41 kb/s, SSIM Mean Y: 0.9160721 (10.761 dB)

diff --git a/source/common/common.h b/source/common/common.h
--- a/source/common/common.h
+++ b/source/common/common.h
@@ -153,16 +153,17 @@ typedef int32_t  coeff_t;      // transf
  // arbitrary, but low because SATD scores are 1/4 normal
  #define X265_LOOKAHEAD_QP (12 + QP_BD_OFFSET)
  #define X265_LOOKAHEAD_MAX 250
  
  // Use the same size blocks as x264.  Using larger blocks seems to give artificially
  // high cost estimates (intra and inter both suffer)
  #define X265_LOWRES_CU_SIZE   8
  #define X265_LOWRES_CU_BITS   3
+#define X265_LOWRES_SCALE     2
  
  #define X265_MALLOC(type, count)    (type*)x265_malloc(sizeof(type) * (count))
  #define X265_FREE(ptr)              x265_free(ptr)
  #define CHECKED_MALLOC(var, type, count) \
      { \
          var = (type*)x265_malloc(sizeof(type) * (count)); \
          if (!var) \
          { \
diff --git a/source/common/lowres.cpp b/source/common/lowres.cpp
--- a/source/common/lowres.cpp
+++ b/source/common/lowres.cpp
@@ -24,20 +24,21 @@
  #include "TLibCommon/TComPicYuv.h"
  #include "lowres.h"
  #include "mv.h"
  
  using namespace x265;
  
  bool Lowres::create(TComPicYuv *orig, int _bframes, bool bAQEnabled)
  {
+	X265_CHECK(X265_LOWRES_SCALE * X265_LOWRES_CU_SIZE <= 64, "Invalid LowRes scaling\n");
      isLowres = true;
      bframes = _bframes;
-    width = orig->getWidth() / 2;
-    lines = orig->getHeight() / 2;
+    width = orig->getWidth() / X265_LOWRES_SCALE;
+    lines = orig->getHeight() / X265_LOWRES_SCALE;
      lumaStride = width + 2 * orig->getLumaMarginX();
      if (lumaStride & 31)
          lumaStride += 32 - (lumaStride & 31);
      int cuWidth = (width + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
      int cuHeight = (lines + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
      int cuCount = cuWidth * cuHeight;
  
      /* rounding the width to multiple of lowres CU size */
diff --git a/source/encoder/frameencoder.cpp b/source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp
+++ b/source/encoder/frameencoder.cpp
@@ -984,19 +984,20 @@ int FrameEncoder::calcQpForCu(uint32_t c
      if (bIsVbv)
      {
          m_frame->m_cuCostsForVbv[cuAddr] = 0;
          m_frame->m_intraCuCostsForVbv[cuAddr] = 0;
      }
  
      /* Derive qpOffet for each CU by averaging offsets for all 16x16 blocks in the cu. */
      double qp_offset = 0;
-    int maxBlockCols = (m_frame->getPicYuvOrg()->getWidth() + (16 - 1)) / 16;
-    int maxBlockRows = (m_frame->getPicYuvOrg()->getHeight() + (16 - 1)) / 16;
-    int noOfBlocks = g_maxCUSize / 16;
+    int lowResCu = (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE);
+    int maxBlockCols = (m_frame->getPicYuvOrg()->getWidth() + (lowResCu - 1)) / lowResCu;
+    int maxBlockRows = (m_frame->getPicYuvOrg()->getHeight() + (lowResCu - 1)) / lowResCu;
+    int noOfBlocks = g_maxCUSize / lowResCu;
      int block_y = (cuAddr / m_frame->getPicSym()->getFrameWidthInCU()) * noOfBlocks;
      int block_x = (cuAddr * noOfBlocks) - block_y * m_frame->getPicSym()->getFrameWidthInCU();
  
      /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
      double *qpoffs = (m_isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
  
      int cnt = 0, idx = 0;
      for (int h = 0; h < noOfBlocks && block_y < maxBlockRows; h++, block_y++)
diff --git a/source/encoder/ratecontrol.cpp b/source/encoder/ratecontrol.cpp
--- a/source/encoder/ratecontrol.cpp
+++ b/source/encoder/ratecontrol.cpp
@@ -198,18 +198,18 @@ void RateControl::calcAdaptiveQuantFrame
  
      /* Calculate Qp offset for each 16x16 block in the frame */
      int block_xy = 0;
      int block_x = 0, block_y = 0;
      double strength = 0.f;
      if (m_param->rc.aqMode == X265_AQ_NONE || m_param->rc.aqStrength == 0)
      {
          /* Need to init it anyways for CU tree */
-        int cuWidth = ((maxCol / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-        int cuHeight = ((maxRow / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+        int cuWidth = ((maxCol / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+        int cuHeight = ((maxRow / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
          int cuCount = cuWidth * cuHeight;
  
          if (m_param->rc.aqMode && m_param->rc.aqStrength == 0)
          {
              memset(pic->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
              memset(pic->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
              for (int cuxy = 0; cuxy < cuCount; cuxy++)
              {
@@ -231,19 +231,19 @@ void RateControl::calcAdaptiveQuantFrame
      }
      else
      {
          block_xy = 0;
          double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
          if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
          {
              double bit_depth_correction = pow(1 << (X265_DEPTH - 8), 0.5);
-            for (block_y = 0; block_y < maxRow; block_y += 16)
+            for (block_y = 0; block_y < maxRow; block_y += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
              {
-                for (block_x = 0; block_x < maxCol; block_x += 16)
+                for (block_x = 0; block_x < maxCol; block_x += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
                  {
                      uint32_t energy = acEnergyCu(pic, block_x, block_y);
                      qp_adj = pow(energy + 1, 0.1);
                      pic->m_lowres.qpCuTreeOffset[block_xy] = qp_adj;
                      avg_adj += qp_adj;
                      avg_adj_pow2 += qp_adj * qp_adj;
                      block_xy++;
                  }
@@ -253,19 +253,19 @@ void RateControl::calcAdaptiveQuantFrame
              avg_adj_pow2 /= m_ncu;
              strength = m_param->rc.aqStrength * avg_adj / bit_depth_correction;
              avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f * bit_depth_correction)) / avg_adj;
          }
          else
              strength = m_param->rc.aqStrength * 1.0397f;
  
          block_xy = 0;
-        for (block_y = 0; block_y < maxRow; block_y += 16)
+        for (block_y = 0; block_y < maxRow; block_y += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
          {
-            for (block_x = 0; block_x < maxCol; block_x += 16)
+            for (block_x = 0; block_x < maxCol; block_x +=  (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
              {
                  if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
                  {
                      qp_adj = pic->m_lowres.qpCuTreeOffset[block_xy];
                      qp_adj = strength * (qp_adj - avg_adj);
                  }
                  else
                  {
@@ -297,18 +297,18 @@ void RateControl::calcAdaptiveQuantFrame
              pic->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]);
          }
      }
  }
  
  RateControl::RateControl(x265_param *p)
  {
      m_param = p;
-    int lowresCuWidth = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-    int lowresCuHeight = ((m_param->sourceHeight / 2)  + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    int lowresCuWidth = ((m_param->sourceWidth / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    int lowresCuHeight = ((m_param->sourceHeight / X265_LOWRES_SCALE)  + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
      m_ncu = lowresCuWidth * lowresCuHeight;
  
      if (m_param->rc.cuTree)
          m_qCompress = 1;
      else
          m_qCompress = m_param->rc.qCompress;
  
      // validate for param->rc, maybe it is need to add a function like x265_parameters_valiate()
diff --git a/source/encoder/slicetype.cpp b/source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp
+++ b/source/encoder/slicetype.cpp
@@ -61,18 +61,18 @@ Lookahead::Lookahead(x265_param *param,
  {
      m_bReady = 0;
      m_param = param;
      m_top = enc;
      m_lastKeyframe = -m_param->keyframeMax;
      m_lastNonB = NULL;
      m_bFilling = true;
      m_bFlushed = false;
-    m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-    m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    m_widthInCU = ((m_param->sourceWidth / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    m_heightInCU = ((m_param->sourceHeight / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
      m_scratch = (int*)x265_malloc(m_widthInCU * sizeof(int));
      memset(m_histogram, 0, sizeof(m_histogram));
  }
  
  Lookahead::~Lookahead() { }
  
  void Lookahead::init()
  {
@@ -1205,18 +1205,18 @@ CostEstimate::~CostEstimate()
      }
  
      delete[] m_rows;
  }
  
  void CostEstimate::init(x265_param *_param, Frame *pic)
  {
      m_param = _param;
-    m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-    m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    m_widthInCU = ((m_param->sourceWidth / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    m_heightInCU = ((m_param->sourceHeight / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
  
      m_rows = new EstimateRow[m_heightInCU];
      for (int i = 0; i < m_heightInCU; i++)
      {
          m_rows[i].m_widthInCU = m_widthInCU;
          m_rows[i].m_heightInCU = m_heightInCU;
      }
  
diff --git a/source/encoder/weightPrediction.cpp b/source/encoder/weightPrediction.cpp
--- a/source/encoder/weightPrediction.cpp
+++ b/source/encoder/weightPrediction.cpp
@@ -52,39 +52,40 @@ int sliceHeaderCost(WeightParam *w, int
      return lambda * (10 + denomCost + 2 * (bs_size_se(w[0].inputWeight) + bs_size_se(w[0].inputOffset)));
  }
  
  /* make a motion compensated copy of lowres ref into mcout with the same stride.
   * The borders of mcout are not extended */
  void mcLuma(pixel* mcout, Lowres& ref, const MV * mvs)
  {
      int stride = ref.lumaStride;
-    const int cuSize = 8;
+    const int cuSize = X265_LOWRES_CU_SIZE;
+    const int partSize = partitionFromSizes(cuSize, cuSize);
      MV mvmin, mvmax;
  
      int cu = 0;
  
      for (int y = 0; y < ref.lines; y += cuSize)
      {
          int pixoff = y * stride;
          mvmin.y = (int16_t)((-y - 8) << 2);
          mvmax.y = (int16_t)((ref.lines - y - 1 + 8) << 2);
  
          for (int x = 0; x < ref.width; x += cuSize, pixoff += cuSize, cu++)
          {
-            ALIGN_VAR_16(pixel, buf8x8[8 * 8]);
-            intptr_t bstride = 8;
+            ALIGN_VAR_16(pixel, buf[cuSize * cuSize]);
+            intptr_t bstride = cuSize;
              mvmin.x = (int16_t)((-x - 8) << 2);
              mvmax.x = (int16_t)((ref.width - x - 1 + 8) << 2);
  
              /* clip MV to available pixels */
              MV mv = mvs[cu];
              mv = mv.clipped(mvmin, mvmax);
-            pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride);
-            primitives.luma_copy_pp[LUMA_8x8](mcout + pixoff, stride, tmp, bstride);
+            pixel *tmp = ref.lowresMC(pixoff, mv, buf, bstride);
+            primitives.luma_copy_pp[partSize](mcout + pixoff, stride, tmp, bstride);
          }
      }
  
      x265_emms();
  }
  
  /* use lowres MVs from lookahead to generate a motion compensated chroma plane.
   * if a block had cheaper lowres cost as intra, we treat it as MV 0 */
@@ -94,71 +95,73 @@ void mcChroma(pixel *      mcout,
                const MV *   mvs,
                const Cache& cache,
                int          height,
                int          width)
  {
      /* the motion vectors correspond to 8x8 lowres luma blocks, or 16x16 fullres
       * luma blocks. We have to adapt block size to chroma csp */
      int csp = cache.csp;
-    int bw = 16 >> cache.hshift;
-    int bh = 16 >> cache.vshift;
+    int cuSize = X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE;
+    const int partSize = partitionFromSizes(cuSize, cuSize);
+    int bw = cuSize >> cache.hshift;
+    int bh = cuSize >> cache.vshift;
      MV mvmin, mvmax;
  
      for (int y = 0; y < height; y += bh)
      {
          /* note: lowres block count per row might be different from chroma block
           * count per row because of rounding issues, so be very careful with indexing
           * into the lowres structures */
          int cu = y * cache.lowresWidthInCU;
          int pixoff = y * stride;
          mvmin.y = (int16_t)((-y - 8) << 2);
          mvmax.y = (int16_t)((height - y - 1 + 8) << 2);
  
          for (int x = 0; x < width; x += bw, cu++, pixoff += bw)
          {
              if (x < cache.lowresWidthInCU && y < cache.lowresHeightInCU)
              {
-                MV mv = mvs[cu]; // lowres MV
-                mv <<= 1;        // fullres MV
+                MV mv = mvs[cu];         // lowres MV
+                mv *= X265_LOWRES_SCALE; // fullres MV
                  mv.x >>= cache.hshift;
                  mv.y >>= cache.vshift;
  
                  /* clip MV to available pixels */
                  mvmin.x = (int16_t)((-x - 8) << 2);
                  mvmax.x = (int16_t)((width - x - 1 + 8) << 2);
                  mv = mv.clipped(mvmin, mvmax);
  
                  int fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2);
                  pixel *temp = src + pixoff + fpeloffset;
  
                  int xFrac = mv.x & 0x7;
                  int yFrac = mv.y & 0x7;
                  if ((yFrac | xFrac) == 0)
                  {
-                    primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, temp, stride);
+                    primitives.chroma[csp].copy_pp[partSize](mcout + pixoff, stride, temp, stride);
                  }
                  else if (yFrac == 0)
                  {
-                    primitives.chroma[csp].filter_hpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, xFrac);
+                    primitives.chroma[csp].filter_hpp[partSize](temp, stride, mcout + pixoff, stride, xFrac);
                  }
                  else if (xFrac == 0)
                  {
-                    primitives.chroma[csp].filter_vpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, yFrac);
+                    primitives.chroma[csp].filter_vpp[partSize](temp, stride, mcout + pixoff, stride, yFrac);
                  }
                  else
                  {
-                    ALIGN_VAR_16(int16_t, imm[16 * (16 + NTAPS_CHROMA)]);
-                    primitives.chroma[csp].filter_hps[LUMA_16x16](temp, stride, imm, bw, xFrac, 1);
-                    primitives.chroma[csp].filter_vsp[LUMA_16x16](imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
+                    ALIGN_VAR_16(int16_t, imm[cuSize * (cuSize + NTAPS_CHROMA)]);
+                    primitives.chroma[csp].filter_hps[partSize](temp, stride, imm, bw, xFrac, 1);
+                    primitives.chroma[csp].filter_vsp[partSize](imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
                  }
              }
              else
              {
-                primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, src + pixoff, stride);
+                primitives.chroma[csp].copy_pp[partSize](mcout + pixoff, stride, src + pixoff, stride);
              }
          }
      }
  
      x265_emms();
  }
  
  /* Measure sum of 8x8 satd costs between source frame and reference
@@ -191,21 +194,24 @@ uint32_t weightCost(pixel *         fenc
      }
  
      uint32_t cost = 0;
      pixel *f = fenc, *r = ref;
  
      if (bLuma)
      {
          int cu = 0;
-        for (int y = 8; y < height; y += 8, r += 8 * stride, f += 8 * stride)
+        int cuSize = X265_LOWRES_CU_SIZE;
+        int partSize = partitionFromSizes(cuSize, cuSize);
+
+        for (int y = cuSize; y < height; y += cuSize, r += cuSize * stride, f += cuSize * stride)
          {
-            for (int x = 8; x < width; x += 8, cu++)
+            for (int x = cuSize; x < width; x += cuSize, cu++)
              {
-                int cmp = primitives.satd[LUMA_8x8](r + x, stride, f + x, stride);
+                int cmp = primitives.satd[partSize](r + x, stride, f + x, stride);
                  cost += X265_MIN(cmp, cache.intraCost[cu]);
              }
          }
      }
      else if (cache.csp == X265_CSP_I444)
          for (int y = 16; y < height; y += 16, r += 16 * stride, f += 16 * stride)
          {
              for (int x = 16; x < width; x += 16)
@@ -235,18 +241,18 @@ void weightAnalyse(Slice& slice, x265_pa
      TComPicYuv *fencYuv = slice.m_pic->getPicYuvOrg();
      Lowres& fenc        = slice.m_pic->m_lowres;
  
      Cache cache;
  
      memset(&cache, 0, sizeof(cache));
      cache.intraCost = fenc.intraCost;
      cache.numPredDir = slice.isInterP() ? 1 : 2;
-    cache.lowresWidthInCU = fenc.width >> 3;
-    cache.lowresHeightInCU = fenc.lines >> 3;
+    cache.lowresWidthInCU = fenc.width >> X265_LOWRES_CU_BITS;
+    cache.lowresHeightInCU = fenc.lines >> X265_LOWRES_CU_BITS;
      cache.csp = fencYuv->m_picCsp;
      cache.hshift = CHROMA_H_SHIFT(cache.csp);
      cache.vshift = CHROMA_V_SHIFT(cache.csp);
  
      /* Use single allocation for motion compensated ref and weight buffers */
      pixel *mcbuf = X265_MALLOC(pixel, 2 * fencYuv->getStride() * fencYuv->getHeight());
      if (!mcbuf)
      {

-- 
Nicolas Morey Chaisemartin
Phone : +33 6 42 46 68 87