[x265-commits] [x265] motion: use fast weighted subpel refine

Fri Nov 22 06:41:44 CET 2013

details:   http://hg.videolan.org/x265/rev/8f156b97360b
branches:  
changeset: 5255:8f156b97360b
user:      Steve Borho <steve at borho.org>
date:      Thu Nov 21 17:29:26 2013 -0600
description:
motion: use fast weighted subpel refine

Don't do the full-blown weighted motion compensation for ME; just interpolate
the weighted full-pel pixels. This is not 100% accurate to the pixels that will
be used to encode the final prediction, but it is close enough for ME.

Testing with sintel_trailer_2k_720p24.y4m at medium preset and all defaults
x265 [info]: 651 of 1124 (57.92%) P frames weighted

before: 1253 frames in 512.74s (2.44 fps), 223.51 kb/s, Global PSNR: 50.552
after:  1253 frames in 410.25s (3.05 fps), 223.59 kb/s, Global PSNR: 50.589
Subject: [x265] cmake: almost revive Xcode support

details:   http://hg.videolan.org/x265/rev/f4e10e4d3f0d
branches:  
changeset: 5256:f4e10e4d3f0d
user:      Steve Borho <steve at borho.org>
date:      Thu Nov 21 18:03:46 2013 -0600
description:
cmake: almost revive Xcode support

# Homebrew-based instructions
brew install cmake --HEAD
cmake -G Xcode ../source
open x265.xcodeproj

> cmake --version
cmake version 2.8.12.20131121

The static library is still not linking properly, so the CLI does not link
either; the shared library, however, does build.

diffstat:

 source/CMakeLists.txt                 |  26 ++++++++++---
 source/Lib/TLibEncoder/TEncSearch.cpp |  15 +++----
 source/common/CMakeLists.txt          |   2 +-
 source/encoder/motion.cpp             |  68 +++++++++++-----------------------
 source/encoder/motion.h               |   3 -
 5 files changed, 50 insertions(+), 64 deletions(-)

diffs (221 lines):

diff -r b172259c07f1 -r f4e10e4d3f0d source/CMakeLists.txt

--- a/source/CMakeLists.txt	Thu Nov 21 17:07:34 2013 -0600
+++ b/source/CMakeLists.txt	Thu Nov 21 18:03:46 2013 -0600
@@ -5,6 +5,7 @@ if(NOT CMAKE_BUILD_TYPE)
         "Choose the type of build, options are: None(CMAKE_CXX_FLAGS or CMAKE_C_FLAGS used) Debug Release RelWithDebInfo MinSizeRel."
         FORCE)
 endif()
+cmake_policy(SET CMP0025 OLD) # CMAKE_CXX_COMPILER_ID is Clang on Mac OS X (not AppleClang)
 
 project (x265)
 cmake_minimum_required (VERSION 2.8.8) # OBJECT libraries require 2.8.8
@@ -151,14 +152,27 @@ include_directories(. Lib common encoder
 add_subdirectory(common)
 add_subdirectory(encoder)
 
-if(MSVC_IDE AND ENABLE_PRIMITIVES_ASM)
+if((MSVC_IDE OR XCODE) AND ENABLE_PRIMITIVES_ASM)
     # this is horrible. ugly, and hacky, and it reproduces logic found
     # in the yasm CMake modules, but this is required because of this cmake bug
     # http://www.cmake.org/Bug/print_bug_page.php?bug_id=8170
     if (X64)
-        set(FLAGS -f win64 -m amd64 -DARCH_X86_64=1 -DHAVE_ALIGNED_STACK=0)
+        if(APPLE)
+            set(FLAGS -f macho64 -m amd64 -DPREFIX -DPIC -DARCH_X86_64=1 -DHAVE_ALIGNED_STACK=0)
+        else()
+            set(FLAGS -f win64 -m amd64 -DARCH_X86_64=1 -DHAVE_ALIGNED_STACK=0)
+        endif()
     else()
-        set(FLAGS -f win32 -DARCH_X86_64=0 -DHAVE_ALIGNED_STACK=0 -DPREFIX)
+        if(APPLE)
+            set(FLAGS -f macho -DARCH_X86_64=0 -DHAVE_ALIGNED_STACK=0 -DPREFIX)
+        else()
+            set(FLAGS -f win32 -DARCH_X86_64=0 -DHAVE_ALIGNED_STACK=0 -DPREFIX)
+        endif()
+    endif()
+    if(WIN32)
+        set(SUFFIX obj)
+    else()
+        set(SUFFIX o)
     endif()
     if (HIGH_BIT_DEPTH)
         set(FLAGS ${FLAGS} -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10)
@@ -168,10 +182,10 @@ if(MSVC_IDE AND ENABLE_PRIMITIVES_ASM)
     foreach(ASM ${MSVC_ASMS})
         set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM})
         set(YASM_SRCS ${YASM_SRCS} ${YASM_SRC})
-        set(YASM_OBJS ${YASM_OBJS} ${ASM}.obj)
+        set(YASM_OBJS ${YASM_OBJS} ${ASM}.${SUFFIX})
         add_custom_command(
-            OUTPUT ${ASM}.obj
-            COMMAND ${YASM_EXECUTABLE} ARGS ${FLAGS} ${YASM_SRC} -o ${ASM}.obj
+            OUTPUT ${ASM}.${SUFFIX}
+            COMMAND ${YASM_EXECUTABLE} ARGS ${FLAGS} ${YASM_SRC} -o ${ASM}.${SUFFIX}
             DEPENDS ${YASM_SRC})
     endforeach()
 endif()
diff -r b172259c07f1 -r f4e10e4d3f0d source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Thu Nov 21 17:07:34 2013 -0600
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Thu Nov 21 18:03:46 2013 -0600
@@ -458,9 +458,9 @@ void TEncSearch::xIntraCodingLumaBlk(TCo
     }
 
     //===== get residual signal =====
-    assert(!((uint32_t)fenc & (width - 1)));
-    assert(!((uint32_t)pred & (width - 1)));
-    assert(!((uint32_t)residual & (width - 1)));
+    assert(!((uint32_t)(size_t)fenc & (width - 1)));
+    assert(!((uint32_t)(size_t)pred & (width - 1)));
+    assert(!((uint32_t)(size_t)residual & (width - 1)));
     primitives.calcresidual[(int)g_convertToBit[width]](fenc, pred, residual, stride);
 
     //===== transform and quantization =====
@@ -499,7 +499,6 @@ void TEncSearch::xIntraCodingLumaBlk(TCo
     }
 
     //===== reconstruction =====
-    assert(((uint32_t)residual & (width - 1)) == 0);
     assert(width <= 32);
     primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
 
@@ -587,9 +586,9 @@ void TEncSearch::xIntraCodingChromaBlk(T
     }
 
     //===== get residual signal =====
-    assert(!((uint32_t)fenc & (width - 1)));
-    assert(!((uint32_t)pred & (width - 1)));
-    assert(!((uint32_t)residual & (width - 1)));
+    assert(!((uint32_t)(size_t)fenc & (width - 1)));
+    assert(!((uint32_t)(size_t)pred & (width - 1)));
+    assert(!((uint32_t)(size_t)residual & (width - 1)));
     int size = g_convertToBit[width];
     primitives.calcresidual[size](fenc, pred, residual, stride);
 
@@ -638,7 +637,7 @@ void TEncSearch::xIntraCodingChromaBlk(T
     }
 
     //===== reconstruction =====
-    assert(((uint32_t)residual & (width - 1)) == 0);
+    assert(((uint32_t)(size_t)residual & (width - 1)) == 0);
     assert(width <= 32);
     primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, MAX_CU_SIZE / 2, reconIPredStride);
 
diff -r b172259c07f1 -r f4e10e4d3f0d source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt	Thu Nov 21 17:07:34 2013 -0600
+++ b/source/common/CMakeLists.txt	Thu Nov 21 18:03:46 2013 -0600
@@ -119,7 +119,7 @@ if(ENABLE_PRIMITIVES_ASM)
         set(A_SRCS ${A_SRCS} pixel-32.asm)
     endif()
 
-    if(MSVC_IDE)
+    if(MSVC_IDE OR XCODE)
         # MSVC requires custom build rules in the main cmake script for yasm
         set(MSVC_ASMS "${A_SRCS}" CACHE INTERNAL "yasm sources")
         set(A_SRCS)
diff -r b172259c07f1 -r f4e10e4d3f0d source/encoder/motion.cpp
--- a/source/encoder/motion.cpp	Thu Nov 21 17:07:34 2013 -0600
+++ b/source/encoder/motion.cpp	Thu Nov 21 18:03:46 2013 -0600
@@ -104,17 +104,11 @@ MotionEstimate::MotionEstimate()
         init_scales();
 
     fenc = (pixel*)X265_MALLOC(pixel, MAX_CU_SIZE * MAX_CU_SIZE);
-    subpelbuf = (pixel*)X265_MALLOC(pixel, (MAX_CU_SIZE + 1) * (MAX_CU_SIZE + 1));
-    immedVal = (int16_t*)X265_MALLOC(int16_t, (MAX_CU_SIZE + 1) * (MAX_CU_SIZE + 1 + NTAPS_LUMA - 1));
-    immedVal2 = (int16_t*)X265_MALLOC(int16_t, (MAX_CU_SIZE + 1) * (MAX_CU_SIZE + 1 + NTAPS_LUMA - 1));
 }
 
 MotionEstimate::~MotionEstimate()
 {
     X265_FREE(fenc);
-    X265_FREE(subpelbuf);
-    X265_FREE(immedVal);
-    X265_FREE(immedVal2);
 }
 
 void MotionEstimate::setSourcePU(int offset, int width, int height)
@@ -1137,50 +1131,32 @@ int MotionEstimate::subpelCompare(Refere
     }
     else
     {
-        if (ref->isWeighted)
+        /* We are taking a short-cut here if the reference is weighted. To be
+         * accurate we should be interpolating unweighted pixels and weighting
+         * the final 16bit values prior to rounding and downshifting. Instead we
+         * are simply interpolating the weighted full-pel pixels. Not 100%
+         * accurate but good enough for fast qpel ME */
+        ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
+        pixel *fref = ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride;
+        if (yFrac == 0)
         {
-            int shiftNum = IF_INTERNAL_PREC - X265_DEPTH;
-            int shift = ref->shift + shiftNum;
-            int round = shift ? (1 << (shift - 1)) : 0;
-            pixel *fref = ref->unweightedFPelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride;
-
-            if (yFrac == 0)
-            {
-                primitives.ipfilter_ps[FILTER_H_P_S_8](fref, ref->lumaStride, immedVal, FENC_STRIDE, blockwidth, blockheight, g_lumaFilter[xFrac]);
-                primitives.weight_sp(immedVal, subpelbuf, FENC_STRIDE, FENC_STRIDE, blockwidth, blockheight, ref->weight, round, shift, ref->offset);
-            }
-            else if (xFrac == 0)
-            {
-                primitives.ipfilter_ps[FILTER_V_P_S_8](fref, ref->lumaStride, immedVal, FENC_STRIDE, blockwidth, blockheight, g_lumaFilter[yFrac]);
-                primitives.weight_sp(immedVal, subpelbuf, FENC_STRIDE, FENC_STRIDE, blockwidth, blockheight, ref->weight, round, shift, ref->offset);
-            }
-            else
-            {
-                int filterSize = NTAPS_LUMA;
-                int halfFilterSize = (filterSize >> 1);
-                primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * ref->lumaStride, ref->lumaStride, immedVal, blockwidth, blockwidth, blockheight + filterSize - 1, g_lumaFilter[xFrac]);
-                primitives.ipfilter_ss[FILTER_V_S_S_8](immedVal + (halfFilterSize - 1) * blockwidth, blockwidth, immedVal2, FENC_STRIDE, blockwidth, blockheight, yFrac);
-                primitives.weight_sp(immedVal2, subpelbuf, FENC_STRIDE, FENC_STRIDE, blockwidth, blockheight, ref->weight, round, shift, ref->offset);
-            }
+            primitives.luma_hpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, xFrac);
+        }
+        else if (xFrac == 0)
+        {
+            primitives.luma_vpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, yFrac);
         }
         else
         {
-            pixel *fref = ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride;
-            if (yFrac == 0)
-            {
-                primitives.luma_hpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, xFrac);
-            }
-            else if (xFrac == 0)
-            {
-                primitives.luma_vpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, yFrac);
-            }
-            else
-            {
-                int filterSize = NTAPS_LUMA;
-                int halfFilterSize = (filterSize >> 1);
-                primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * ref->lumaStride, ref->lumaStride, immedVal, blockwidth, blockwidth, blockheight + filterSize - 1, g_lumaFilter[xFrac]);
-                primitives.luma_vsp[partEnum](immedVal + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, FENC_STRIDE, yFrac);
-            }
+            ALIGN_VAR_32(int16_t, immed[64 * (64 + 8)]);
+
+            int filterSize = NTAPS_LUMA;
+            int halfFilterSize = filterSize >> 1;
+            primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize - 1) * ref->lumaStride, ref->lumaStride,
+                                                   immed, blockwidth,
+                                                   blockwidth, blockheight + filterSize - 1,
+                                                   g_lumaFilter[xFrac]);
+            primitives.luma_vsp[partEnum](immed + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, FENC_STRIDE, yFrac);
         }
         return cmp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE);
     }
diff -r b172259c07f1 -r f4e10e4d3f0d source/encoder/motion.h
--- a/source/encoder/motion.h	Thu Nov 21 17:07:34 2013 -0600
+++ b/source/encoder/motion.h	Thu Nov 21 18:03:46 2013 -0600
@@ -52,9 +52,6 @@ protected:
     int subpelRefine;
 
     /* subpel generation buffers */
-    pixel *subpelbuf;
-    int16_t *immedVal;
-    int16_t *immedVal2;
     int blockwidth;
     int blockheight;