[x265] [PATCH] Implementation of low-pass subband dct approximation

Fri Nov 3 16:35:58 CET 2017

# HG changeset patch
# User hribeiro
# Date 1507997943 25200
#      Sat Oct 14 09:19:03 2017 -0700
# Node ID 893b36b82133a2bc4d3cfd6aa3a18c544ce0bf94
# Parent  6a310b24c6a2d831ef08bbda1bdcf9d929daa308
Implementation of low-pass subband dct approximation.

diff -r 6a310b24c6a2 -r 893b36b82133 doc/reST/cli.rst

--- a/doc/reST/cli.rst	Thu Nov 02 12:17:29 2017 +0530
+++ b/doc/reST/cli.rst	Sat Oct 14 09:19:03 2017 -0700
@@ -2142,6 +2142,18 @@
 
 	Only effective at RD levels 5 and 6
 
+DCT Approximations
+==================
+
+.. option:: --lowpass-dct
+
+    If enabled, x265 will use a low-pass truncated DCT approximation instead of the
+    standard DCT. This approximation is less computationally intensive but it generates
+    truncated coefficient matrices for the transformed block. Empirical analysis shows
+    this approximation gives good PSNR results for QP>=23.
+
+    This approximation should be considered for platforms with performance and time
+    constraints.
 
 Debugging options
 =================
diff -r 6a310b24c6a2 -r 893b36b82133 source/CMakeLists.txt
--- a/source/CMakeLists.txt	Thu Nov 02 12:17:29 2017 +0530
+++ b/source/CMakeLists.txt	Sat Oct 14 09:19:03 2017 -0700
@@ -29,7 +29,7 @@
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 136)
+set(X265_BUILD 137)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
diff -r 6a310b24c6a2 -r 893b36b82133 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt	Thu Nov 02 12:17:29 2017 +0530
+++ b/source/common/CMakeLists.txt	Sat Oct 14 09:19:03 2017 -0700
@@ -131,7 +131,7 @@
 add_library(common OBJECT
     ${ASM_PRIMITIVES} ${VEC_PRIMITIVES} ${ALTIVEC_PRIMITIVES} ${WINXP}
     primitives.cpp primitives.h
-    pixel.cpp dct.cpp ipfilter.cpp intrapred.cpp loopfilter.cpp
+    pixel.cpp dct.cpp lowpassdct.cpp ipfilter.cpp intrapred.cpp loopfilter.cpp
     constants.cpp constants.h
     cpu.cpp cpu.h version.cpp
     threading.cpp threading.h
diff -r 6a310b24c6a2 -r 893b36b82133 source/common/lowpassdct.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/lowpassdct.cpp	Sat Oct 14 09:19:03 2017 -0700
@@ -0,0 +1,127 @@
+/*****************************************************************************
+ * Copyright (C) 2017 
+ *
+ * Authors: Humberto Ribeiro Filho <mont3z.claro5 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "primitives.h"
+
+using namespace X265_NS;
+
+/* Addresses of the standard_dct slots used as inner transforms; assigned in setupLowPassPrimitives_c */
+static dct_t* s_dct4x4;   // called by lowPassDct8_c on the averaged 4x4 block
+static dct_t* s_dct8x8;   // called by lowPassDct16_c on the averaged 8x8 block
+static dct_t* s_dct16x16; // called by lowPassDct32_c on the averaged 16x16 block
+
+// Low-pass 8x8 forward DCT approximation: 4x4 DCT of the 2x2-averaged source, other coefficients zero.
+static void lowPassDct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
+{
+    ALIGN_VAR_32(int16_t, coef[4 * 4]);
+    ALIGN_VAR_32(int16_t, avgBlock[4 * 4]);
+    int32_t totalSum = 0; // 32-bit, as in lowPassDct16_c/32_c: a sum of 64 int16_t samples may not fit in int16_t
+    int16_t sum = 0;
+    for (int i = 0; i < 4; i++)
+        for (int j = 0; j < 4; j++)
+        {
+            // Sum one 2x2 cell of src; a quarter of it is the averaged sample
+            sum = src[2*i*srcStride + 2*j] + src[2*i*srcStride + 2*j + 1]
+                    + src[(2*i+1)*srcStride + 2*j] + src[(2*i+1)*srcStride + 2*j + 1];
+            avgBlock[i*4 + j] = sum >> 2;
+
+            totalSum += sum; // accumulates the sum of all 64 source samples
+        }
+
+    // standard 4x4 DCT of the averaged block, copied row by row into the top-left corner of the zeroed 8x8 dst
+    (*s_dct4x4)(avgBlock, coef, 4);
+    memset(dst, 0, 64 * sizeof(int16_t));
+    for (int i = 0; i < 4; i++)
+    {
+        memcpy(&dst[i * 8], &coef[i * 4], 4 * sizeof(int16_t));
+    }
+
+    // replace the first coefficient with twice the sum of all source samples;
+    // multiply rather than '<<' because left-shifting a negative value is undefined before C++20
+    dst[0] = static_cast<int16_t>(totalSum * 2);
+}
+
+// Low-pass 16x16 forward DCT approximation: the source is averaged 2x2 down to 8x8, transformed
+// with the standard 8x8 DCT, and stored in the top-left 8x8 corner of the otherwise zeroed dst.
+static void lowPassDct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
+{
+    ALIGN_VAR_32(int16_t, halfCoef[8 * 8]);
+    ALIGN_VAR_32(int16_t, halfBlock[8 * 8]);
+    int32_t blockSum = 0;
+    for (int row = 0; row < 8; row++)
+        for (int col = 0; col < 8; col++)
+        {
+            const int16_t* cell = src + 2 * row * srcStride + 2 * col; // top-left sample of the 2x2 cell
+            const int16_t cellSum = static_cast<int16_t>(cell[0] + cell[1] + cell[srcStride] + cell[srcStride + 1]);
+            halfBlock[row * 8 + col] = cellSum >> 2;
+            blockSum += cellSum;
+        }
+
+    (*s_dct8x8)(halfBlock, halfCoef, 8);
+    memset(dst, 0, 256 * sizeof(int16_t));
+    for (int row = 0; row < 8; row++)
+        memcpy(dst + row * 16, halfCoef + row * 8, 8 * sizeof(int16_t));
+
+    // the first coefficient is overwritten with half the sum of all 256 source samples
+    dst[0] = static_cast<int16_t>(blockSum >> 1);
+}
+
+// Low-pass 32x32 forward DCT approximation: the source is averaged 2x2 down to 16x16, transformed
+// with the standard 16x16 DCT, and stored in the top-left 16x16 corner of the otherwise zeroed dst.
+static void lowPassDct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
+{
+    ALIGN_VAR_32(int16_t, halfCoef[16 * 16]);
+    ALIGN_VAR_32(int16_t, halfBlock[16 * 16]);
+    int32_t blockSum = 0;
+    for (int row = 0; row < 16; row++)
+        for (int col = 0; col < 16; col++)
+        {
+            const int16_t* cell = src + 2 * row * srcStride + 2 * col; // top-left sample of the 2x2 cell
+            const int16_t cellSum = static_cast<int16_t>(cell[0] + cell[1] + cell[srcStride] + cell[srcStride + 1]);
+            halfBlock[row * 16 + col] = cellSum >> 2;
+            blockSum += cellSum;
+        }
+
+    (*s_dct16x16)(halfBlock, halfCoef, 16);
+    memset(dst, 0, 1024 * sizeof(int16_t));
+    for (int row = 0; row < 16; row++)
+        memcpy(dst + row * 32, halfCoef + row * 16, 16 * sizeof(int16_t));
+
+    // the first coefficient is overwritten with one eighth of the sum of all 1024 source samples
+    dst[0] = static_cast<int16_t>(blockSum >> 3);
+}
+
+namespace X265_NS {
+// x265 private namespace
+// Registers the C low-pass DCT approximations in p and binds the file-level s_dct* pointers to p.
+void setupLowPassPrimitives_c(EncoderPrimitives& p)
+{
+    s_dct4x4 = &(p.cu[BLOCK_4x4].standard_dct);     // address of the slot, not its current value
+    s_dct8x8 = &(p.cu[BLOCK_8x8].standard_dct);     // so the slot's value is read at call time
+    s_dct16x16 = &(p.cu[BLOCK_16x16].standard_dct); // the slots themselves are not assigned here
+    // NOTE(review): standard_dct is filled by enableLowpassDCTPrimitives; confirm lowpass_dct is never called before that
+    p.cu[BLOCK_8x8].lowpass_dct = lowPassDct8_c;
+    p.cu[BLOCK_16x16].lowpass_dct = lowPassDct16_c;
+    p.cu[BLOCK_32x32].lowpass_dct = lowPassDct32_c;
+}
+}
diff -r 6a310b24c6a2 -r 893b36b82133 source/common/param.cpp
--- a/source/common/param.cpp	Thu Nov 02 12:17:29 2017 +0530
+++ b/source/common/param.cpp	Sat Oct 14 09:19:03 2017 -0700
@@ -288,6 +288,9 @@
     param->csvfpt = NULL;
     param->forceFlush = 0;
     param->bDisableLookahead = 0;
+
+    /* DCT Approximations */
+    param->bLowPassDct = 0;
 }
 
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
@@ -927,6 +930,7 @@
     OPT("max-cll") bError |= sscanf(value, "%hu,%hu", &p->maxCLL, &p->maxFALL) != 2;
     OPT("min-luma") p->minLuma = (uint16_t)atoi(value);
     OPT("max-luma") p->maxLuma = (uint16_t)atoi(value);
+    OPT("lowpass-dct") p->bLowPassDct = atobool(value);
     OPT("uhd-bd") p->uhdBluray = atobool(value);
     else
         bExtraParams = true;
@@ -1676,6 +1680,7 @@
     s += sprintf(s, " refine-mv=%d", p->mvRefine);
     BOOL(p->bLimitSAO, "limit-sao");
     s += sprintf(s, " ctu-info=%d", p->bCTUInfo);
+    BOOL(p->bLowPassDct, "lowpass-dct");
 #undef BOOL
     return buf;
 }
diff -r 6a310b24c6a2 -r 893b36b82133 source/common/primitives.cpp
--- a/source/common/primitives.cpp	Thu Nov 02 12:17:29 2017 +0530
+++ b/source/common/primitives.cpp	Sat Oct 14 09:19:03 2017 -0700
@@ -58,11 +58,13 @@
 void setupLoopFilterPrimitives_c(EncoderPrimitives &p);
 void setupSaoPrimitives_c(EncoderPrimitives &p);
 void setupSeaIntegralPrimitives_c(EncoderPrimitives &p);
+void setupLowPassPrimitives_c(EncoderPrimitives& p);
 
 void setupCPrimitives(EncoderPrimitives &p)
 {
     setupPixelPrimitives_c(p);      // pixel.cpp
     setupDCTPrimitives_c(p);        // dct.cpp
+    setupLowPassPrimitives_c(p);    // lowpassdct.cpp
     setupFilterPrimitives_c(p);     // ipfilter.cpp
     setupIntraPrimitives_c(p);      // intrapred.cpp
     setupLoopFilterPrimitives_c(p); // loopfilter.cpp
@@ -70,6 +72,19 @@
     setupSeaIntegralPrimitives_c(p);  // framefilter.cpp
 }
 
+// Makes the low-pass approximations the active 16x16 and 32x32 forward DCTs of p.
+void enableLowpassDCTPrimitives(EncoderPrimitives &p)
+{
+    // snapshot the currently active dct of each size first; the low-pass functions call these copies
+    const int sizes[] = { BLOCK_4x4, BLOCK_8x8, BLOCK_16x16, BLOCK_32x32 };
+    for (size_t k = 0; k < sizeof(sizes) / sizeof(sizes[0]); k++)
+        p.cu[sizes[k]].standard_dct = p.cu[sizes[k]].dct;
+
+    // only the two largest sizes visible here are switched; 4x4 and 8x8 keep their active dct
+    p.cu[BLOCK_16x16].dct = p.cu[BLOCK_16x16].lowpass_dct;
+    p.cu[BLOCK_32x32].dct = p.cu[BLOCK_32x32].lowpass_dct;
+}
+
 void setupAliasPrimitives(EncoderPrimitives &p)
 {
 #if HIGH_BIT_DEPTH
@@ -256,6 +271,11 @@
 #endif
 
         setupAliasPrimitives(primitives);
+
+        if (param->bLowPassDct && param->rc.qp > 20)
+        {
+            enableLowpassDCTPrimitives(primitives); 
+        }
     }
 
     x265_report_simd(param);
diff -r 6a310b24c6a2 -r 893b36b82133 source/common/primitives.h
--- a/source/common/primitives.h	Thu Nov 02 12:17:29 2017 +0530
+++ b/source/common/primitives.h	Sat Oct 14 09:19:03 2017 -0700
@@ -259,8 +259,12 @@
      * primitives will leave 64x64 pointers NULL.  Indexed by LumaCU */
     struct CU
     {
-        dct_t           dct;
-        idct_t          idct;
+        dct_t           dct;    // active dct transformation
+        idct_t          idct;   // active idct transformation
+
+        dct_t           standard_dct;   // original dct function, used by lowpass_dct
+        dct_t           lowpass_dct;    // lowpass dct approximation
+
         calcresidual_t  calcresidual;
         pixel_sub_ps_t  sub_ps;
         pixel_add_ps_t  add_ps;
diff -r 6a310b24c6a2 -r 893b36b82133 source/x265.h
--- a/source/x265.h	Thu Nov 02 12:17:29 2017 +0530
+++ b/source/x265.h	Sat Oct 14 09:19:03 2017 -0700
@@ -1505,6 +1505,11 @@
 
     /* Disable lookahead */
     int       bDisableLookahead;
+
+    /* Use low-pass truncated dct approximation
+    *  This DCT approximation is less computationally intensive and gives results close to
+    *  standard DCT for QP >= 23 */
+    int       bLowPassDct;
 } x265_param;
 
 /* x265_param_alloc:
diff -r 6a310b24c6a2 -r 893b36b82133 source/x265cli.h
--- a/source/x265cli.h	Thu Nov 02 12:17:29 2017 +0530
+++ b/source/x265cli.h	Sat Oct 14 09:19:03 2017 -0700
@@ -282,6 +282,7 @@
     { "force-flush",    required_argument, NULL, 0 },
     { "splitrd-skip",         no_argument, NULL, 0 },
     { "no-splitrd-skip",      no_argument, NULL, 0 },
+    { "lowpass-dct",          no_argument, NULL, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
@@ -543,6 +544,7 @@
     H1("-r/--recon <filename>            Reconstructed raw image YUV or Y4M output file name\n");
     H1("   --recon-depth <integer>       Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n");
     H1("   --recon-y4m-exec <string>     pipe reconstructed frames to Y4M viewer, ex:\"ffplay -i pipe:0 -autoexit\"\n");
+    H0("   --lowpass-dct                 Use low-pass subband dct approximation. Default %s\n", OPT(param->bLowPassDct));
     H1("\nExecutable return codes:\n");
     H1("    0 - encode successful\n");
     H1("    1 - unable to parse command line\n");
-------------- next part --------------
A non-text attachment was scrubbed...
Name: videolan.patch
Type: text/x-patch
Size: 11822 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20171103/d501f5a7/attachment-0001.bin>