[x265-commits] [x265] TEncSearch: fix comment for TEncSearch::predInterSearch

Tue Oct 15 08:17:21 CEST 2013

details:   http://hg.videolan.org/x265/rev/062c51758069
branches:  
changeset: 4448:062c51758069
user:      Steve Borho <steve at borho.org>
date:      Mon Oct 14 21:55:04 2013 -0500
description:
TEncSearch: fix comment for TEncSearch::predInterSearch

Subject: [x265] cmake: do not query clang version, it is not used

details:   http://hg.videolan.org/x265/rev/fa90c915a323
branches:  
changeset: 4449:fa90c915a323
user:      Steve Borho <steve at borho.org>
date:      Tue Oct 15 00:37:37 2013 -0500
description:
cmake: do not query clang version, it is not used

Subject: [x265] cmake: give 16bpp vector sad primitives their own C++ file

details:   http://hg.videolan.org/x265/rev/764c0e9984f0
branches:  
changeset: 4450:764c0e9984f0
user:      Steve Borho <steve at borho.org>
date:      Tue Oct 15 00:50:51 2013 -0500
description:
cmake: give 16bpp vector sad primitives their own C++ file

diffstat:

 source/CMakeLists.txt                 |    10 +-
 source/Lib/TLibEncoder/TEncSearch.cpp |     7 +-
 source/common/CMakeLists.txt          |     5 +-
 source/common/vec/pixel-sse41.cpp     |    14 +-
 source/common/vec/pixel16-sse41.cpp   |  1866 +++++++++++++++++++++++++++++++++
 source/common/vec/pixel16.inc         |  1805 -------------------------------
 6 files changed, 1881 insertions(+), 1826 deletions(-)

diffs (truncated from 3785 to 300 lines):

diff -r abae6903e0af -r 764c0e9984f0 source/CMakeLists.txt
--- a/source/CMakeLists.txt	Mon Oct 14 13:12:22 2013 -0500
+++ b/source/CMakeLists.txt	Tue Oct 15 00:50:51 2013 -0500
@@ -64,19 +64,19 @@ if(INTEL_CXX AND UNIX)
     # treat icpc roughly like gcc
     set(GCC 1)
     add_definitions(-Wall -Wextra -Wshadow -no-vec)
+elseif(CLANG)
+    # treat clang roughly like gcc
+    set(GCC 1)
+    add_definitions(-Wall -Wextra -Wshadow -mstackrealign -ffast-math)
 elseif(CMAKE_COMPILER_IS_GNUCXX)
+    add_definitions(-Wall -Wextra -Wshadow -mstackrealign -ffast-math)
     execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
-    add_definitions(-Wall -Wextra -Wshadow -mstackrealign -ffast-math)
     if(NOT GCC_VERSION VERSION_LESS 4.7)
         # this is necessary to avoid name conflicts in vector class
         # library.  if vector classes are removed/replaced this can
         # likely be removed as well.
         add_definitions(-fabi-version=6)
     endif()
-elseif(CLANG)
-    execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
-    set(GCC 1)
-    add_definitions(-Wall -Wextra -Wshadow -ffast-math -mstackrealign)
 endif()
 if (GCC)
     option(WARNINGS_AS_ERRORS "Stop compiles on first warning" OFF)
diff -r abae6903e0af -r 764c0e9984f0 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Mon Oct 14 13:12:22 2013 -0500
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Tue Oct 15 00:50:51 2013 -0500
@@ -2218,11 +2218,8 @@ void TEncSearch::xRestrictBipredMergeCan
 
 /** search of the best candidate for inter prediction
  * \param cu
- * \param fencYuv
- * \param rpcPredYuv
- * \param rpcResiYuv
- * \param rpcRecoYuv
- * \param bUseRes
+ * \param predYuv
+ * \param bUseMRG
  * \returns void
  */
 void TEncSearch::predInterSearch(TComDataCU* cu, TComYuv* predYuv, bool bUseMRG)
diff -r abae6903e0af -r 764c0e9984f0 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt	Mon Oct 14 13:12:22 2013 -0500
+++ b/source/common/CMakeLists.txt	Tue Oct 15 00:50:51 2013 -0500
@@ -89,14 +89,14 @@ if(ENABLE_PRIMITIVES_VEC)
             vec/pixel-sse3.cpp vec/pixel-ssse3.cpp vec/pixel-sse41.cpp
             vec/dct-sse3.cpp vec/dct-ssse3.cpp vec/dct-sse41.cpp
             vec/ipfilter-ssse3.cpp vec/ipfilter-sse41.cpp
-            vec/intra-sse3.cpp vec/intra-sse41.cpp)
+            vec/pixel16-sse41.cpp vec/intra-sse3.cpp vec/intra-sse41.cpp)
         if (NOT X64)
             # x64 implies SSE4, so this flag would have no effect (and it issues a warning)
             set_source_files_properties(vec/blockcopy-sse3.cpp
                 vec/pixel-sse3.cpp vec/pixel-ssse3.cpp vec/pixel-sse41.cpp
                 vec/dct-sse3.cpp vec/dct-ssse3.cpp vec/dct-sse41.cpp
                 vec/ipfilter-ssse3.cpp vec/ipfilter-sse41.cpp
-                vec/intra-sse3.cpp vec/intra-sse41.cpp
+                vec/intra-sse3.cpp vec/intra-sse41.cpp vec/pixel16-sse41.cpp
                 PROPERTIES COMPILE_FLAGS /arch:SSE2)
         endif()
         if (MSVC_VERSION EQUAL 1700 OR INTEL_CXX)
@@ -129,6 +129,7 @@ if(ENABLE_PRIMITIVES_VEC)
                 PROPERTIES COMPILE_FLAGS "-mssse3")
             set_source_files_properties(
                 vec/pixel-sse41.cpp vec/ipfilter-sse41.cpp vec/dct-sse41.cpp vec/intra-sse41.cpp
+                vec/pixel16-sse41.cpp
                 PROPERTIES COMPILE_FLAGS "-msse4.1")
         endif()
         if(INTEL_CXX OR CLANG OR (NOT GCC_VERSION VERSION_LESS 4.7))
diff -r abae6903e0af -r 764c0e9984f0 source/common/vec/pixel-sse41.cpp
--- a/source/common/vec/pixel-sse41.cpp	Mon Oct 14 13:12:22 2013 -0500
+++ b/source/common/vec/pixel-sse41.cpp	Tue Oct 15 00:50:51 2013 -0500
@@ -5527,16 +5527,10 @@ int sse_ss64(short* fenc, intptr_t strid
 }
 }
 
-#if HIGH_BIT_DEPTH
-#define INSTRSET 5
-#include "vectorclass.h"
-namespace {
-#include "pixel16.inc"
-}
-#endif
-
 namespace x265 {
 
+extern void Setup_Vec_Pixel16Primitives_sse41(EncoderPrimitives &p);
+
 #if HIGH_BIT_DEPTH
 #define SETUP_PARTITION(W, H) \
     p.sad[PARTITION_##W##x##H] = sad_##W<H>; \
@@ -5594,7 +5588,9 @@ void Setup_Vec_PixelPrimitives_sse41(Enc
     SETUP_NONSAD(4, 4); // 4x4 SAD covered by assembly
     /* 4x4 is too small for any sub partitions */
 
-#if !HIGH_BIT_DEPTH
+#if HIGH_BIT_DEPTH
+    Setup_Vec_Pixel16Primitives_sse41(p);
+#else
     // These are the only SSE primitives uncovered by assembly
     p.sad_x3[PARTITION_4x16] = sad_x3_4x16;
     p.sad_x4[PARTITION_4x16] = sad_x4_4x16;
diff -r abae6903e0af -r 764c0e9984f0 source/common/vec/pixel16-sse41.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/vec/pixel16-sse41.cpp	Tue Oct 15 00:50:51 2013 -0500
@@ -0,0 +1,1866 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Steve Borho <steve at borho.org>
+ *          Mandar Gurav <mandar at multicorewareinc.com>
+ *          Mahesh Pittala <mahesh at multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at multicorewareinc.com.
+ *****************************************************************************/
+
+#include "TLibCommon/TComRom.h"
+#include "primitives.h"
+#include <assert.h>
+#include <xmmintrin.h> // SSE
+#include <smmintrin.h> // SSE4.1
+
+using namespace x265;
+
+/* intrinsics for when pixel type is short */
+#if HIGH_BIT_DEPTH
+
+#define INSTRSET 5
+#include "vectorclass.h"
+
+namespace {
+template<int ly>
+int sad_4(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
+{
+    Vec8s m1, n1;
+
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int max_iterators = (ly >> 4) << 4;
+    int row;
+
+    for (row = 0; row < max_iterators; row += 16)
+    {
+        for (int i = 0; i < 16; i++)
+        {
+            m1.load_a(fenc);
+            n1.load(fref);
+            sad += abs(m1 - n1);
+
+            fenc += fencstride;
+            fref += frefstride;
+        }
+
+        sum += extend_low(sad);
+        sad = 0;
+    }
+
+    while (row++ < ly)
+    {
+        m1.load_a(fenc);
+        n1.load(fref);
+        sad += abs(m1 - n1);
+
+        fenc += fencstride;
+        fref += frefstride;
+    }
+
+    sum += extend_low(sad);
+
+    return horizontal_add(sum);
+}
+
+template<int ly>
+int sad_8(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
+{
+    Vec8s m1, n1;
+
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int max_iterators = (ly >> 4) << 4;
+    int row;
+
+    for (row = 0; row < max_iterators; row += 16)
+    {
+        for (int i = 0; i < 16; i++)
+        {
+            m1.load_a(fenc);
+            n1.load(fref);
+            sad += abs(m1 - n1);
+
+            fenc += fencstride;
+            fref += frefstride;
+        }
+
+        sum += extend_low(sad) + extend_high(sad);
+        sad = 0;
+    }
+
+    while (row++ < ly)
+    {
+        m1.load_a(fenc);
+        n1.load(fref);
+        sad += abs(m1 - n1);
+
+        fenc += fencstride;
+        fref += frefstride;
+    }
+
+    sum += extend_low(sad) + extend_high(sad);
+
+    return horizontal_add(sum);
+}
+
+template<int ly>
+int sad_12(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
+{
+    Vec8s m1, n1;
+
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int max_iterators = (ly >> 4) << 4;
+    int row;
+
+    for (row = 0; row < max_iterators; row += 16)
+    {
+        for (int i = 0; i < 16; i++)
+        {
+            m1.load_a(fenc);
+            n1.load(fref);
+            sad += abs(m1 - n1);
+
+            m1.load_a(fenc + 8);
+            m1.cutoff(4);
+            n1.load(fref + 8);
+            n1.cutoff(4);
+            sad += abs(m1 - n1);
+
+            fenc += fencstride;
+            fref += frefstride;
+        }
+
+        sum += extend_low(sad) + extend_high(sad);
+        sad = 0;
+    }
+
+    while (row++ < ly)
+    {
+        m1.load_a(fenc);
+        n1.load(fref);
+        sad += abs(m1 - n1);
+
+        m1.load_a(fenc + 8);
+        m1.cutoff(4);
+        n1.load(fref + 8);
+        n1.cutoff(4);
+        sad += abs(m1 - n1);
+
+        fenc += fencstride;
+        fref += frefstride;
+    }
+
+    sum += extend_low(sad) + extend_high(sad);
+
+    return horizontal_add(sum);
+}
+
+template<int ly>
+int sad_16(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
+{
+    Vec8s m1, n1;
+
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int max_iterators = (ly >> 3) << 3;
+    int row;
+
+    for (row = 0; row < max_iterators; row += 8)
+    {
+        for (int i = 0; i < 8; i++)
+        {
+            m1.load_a(fenc);