[x265-commits] [x265] blockcopy-sse3.cpp: Replace pixeladd_pp vector class func...

Thu Oct 10 21:28:22 CEST 2013

details:   http://hg.videolan.org/x265/rev/12d098e5d907
branches:  
changeset: 4361:12d098e5d907
user:      Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
date:      Thu Oct 10 15:20:57 2013 +0530
description:
blockcopy-sse3.cpp: Replace pixeladd_pp vector class function with intrinsic.
Subject: [x265] blockcopy: move intrinsic function out of vector-class section

details:   http://hg.videolan.org/x265/rev/7dbbbb2a42bc
branches:  
changeset: 4362:7dbbbb2a42bc
user:      Steve Borho <steve at borho.org>
date:      Thu Oct 10 11:49:27 2013 -0500
description:
blockcopy: move intrinsic function out of vector-class section
Subject: [x265] intra-sse3.cpp: Replace PredIntraAng4_32 vector class function with intrinsic.

details:   http://hg.videolan.org/x265/rev/8b49d3995f0c
branches:  
changeset: 4363:8b49d3995f0c
user:      Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
date:      Thu Oct 10 16:15:27 2013 +0530
description:
intra-sse3.cpp: Replace PredIntraAng4_32 vector class function with intrinsic.
Subject: [x265] dct: replace dequant vector class function with intrinsic

details:   http://hg.videolan.org/x265/rev/b77a66b6b93d
branches:  
changeset: 4364:b77a66b6b93d
user:      Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
date:      Thu Oct 10 16:57:47 2013 +0530
description:
dct: replace dequant vector class function with intrinsic
Subject: [x265] dct: replaced partialButterfly8 vector class function with intrinsic

details:   http://hg.videolan.org/x265/rev/6fa763ba9da8
branches:  
changeset: 4365:6fa763ba9da8
user:      Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
date:      Thu Oct 10 18:31:20 2013 +0530
description:
dct: replaced partialButterfly8 vector class function with intrinsic
Subject: [x265] dct: move last vector dct function into its own section

details:   http://hg.videolan.org/x265/rev/b0b5c22f5a34
branches:  
changeset: 4366:b0b5c22f5a34
user:      Steve Borho <steve at borho.org>
date:      Thu Oct 10 12:04:45 2013 -0500
description:
dct: move last vector dct function into its own section
Subject: [x265] dct: move functions which require SSE4.1 from dct-sse3.cpp to dct-sse41.cpp

details:   http://hg.videolan.org/x265/rev/be7c6c42566a
branches:  
changeset: 4367:be7c6c42566a
user:      Steve Borho <steve at borho.org>
date:      Thu Oct 10 12:07:50 2013 -0500
description:
dct: move functions which require SSE4.1 from dct-sse3.cpp to dct-sse41.cpp
Subject: [x265] dct: remove vector class includes from dct-sse41.cpp, it is clean

details:   http://hg.videolan.org/x265/rev/3be4451ea3aa
branches:  
changeset: 4368:3be4451ea3aa
user:      Steve Borho <steve at borho.org>
date:      Thu Oct 10 12:09:11 2013 -0500
description:
dct: remove vector class includes from dct-sse41.cpp, it is clean
Subject: [x265] dct: add comments for future opts/code reuse

details:   http://hg.videolan.org/x265/rev/7b4a6a5f8efc
branches:  
changeset: 4369:7b4a6a5f8efc
user:      Steve Borho <steve at borho.org>
date:      Thu Oct 10 12:18:42 2013 -0500
description:
dct: add comments for future opts/code reuse
Subject: [x265] cmake: link PPA and other libs into x265-shared and x265-static

details:   http://hg.videolan.org/x265/rev/499ef0e4e254
branches:  
changeset: 4370:499ef0e4e254
user:      Steve Borho <steve at borho.org>
date:      Thu Oct 10 13:40:46 2013 -0500
description:
cmake: link PPA and other libs into x265-shared and x265-static
Subject: [x265] cmake: merge TLibEncoderH source group into TLibEncoder

details:   http://hg.videolan.org/x265/rev/bfdfeb2fd817
branches:  
changeset: 4371:bfdfeb2fd817
user:      Steve Borho <steve at borho.org>
date:      Thu Oct 10 14:08:33 2013 -0500
description:
cmake: merge TLibEncoderH source group into TLibEncoder

They are both small enough now that they are manageable as a single unit
Subject: [x265] NALwrite: reintroduce include of cstring, required for memcpy on Linux

details:   http://hg.videolan.org/x265/rev/d6b9cc9c402f
branches:  
changeset: 4372:d6b9cc9c402f
user:      Steve Borho <steve at borho.org>
date:      Thu Oct 10 14:18:40 2013 -0500
description:
NALwrite: reintroduce include of cstring, required for memcpy on Linux
Subject: [x265] TComSlice: remove unreferenced member variable

details:   http://hg.videolan.org/x265/rev/614a68ab4703
branches:  
changeset: 4373:614a68ab4703
user:      Steve Borho <steve at borho.org>
date:      Thu Oct 10 14:25:08 2013 -0500
description:
TComSlice: remove unreferenced member variable

diffstat:

 source/CMakeLists.txt                |     7 +-
 source/Lib/TLibCommon/TComSlice.h    |     1 -
 source/Lib/TLibEncoder/NALwrite.cpp  |     2 +
 source/common/vec/blockcopy-sse3.cpp |   144 +-
 source/common/vec/dct-sse3.cpp       |  1299 ++++++++++++++-------------------
 source/common/vec/dct-sse41.cpp      |   214 +++++-
 source/common/vec/intra-sse3.cpp     |    20 +-
 source/encoder/CMakeLists.txt        |     3 +-
 8 files changed, 850 insertions(+), 840 deletions(-)

diffs (truncated from 1838 to 300 lines):

diff -r a79ecf3a7875 -r 614a68ab4703 source/CMakeLists.txt

--- a/source/CMakeLists.txt	Thu Oct 10 12:29:41 2013 +0530
+++ b/source/CMakeLists.txt	Thu Oct 10 14:25:08 2013 -0500
@@ -113,7 +113,7 @@ option(ENABLE_PPA "Enable PPA profiling 
 if(ENABLE_PPA)
     add_definitions(-DENABLE_PPA)
     add_subdirectory(PPA)
-    SET(PLATFORM_LIBS ${PLATFORM_LIBS} PPA)
+    SET(EXTRA_LIBS ${EXTRA_LIBS} PPA)
     if(UNIX)
         SET(PLATFORM_LIBS ${PLATFORM_LIBS} dl)
     endif(UNIX)
@@ -142,9 +142,10 @@ add_subdirectory(encoder)
 add_library(x265-shared SHARED dllmain.cpp x265.def $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common>)
 add_library(x265-static STATIC $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common>)
 if(ENABLE_PRIMITIVES_ASM AND (MSVC OR XCODE))
-    target_link_libraries(x265-shared assembly)
-    target_link_libraries(x265-static assembly)
+    SET(EXTRA_LIBS ${EXTRA_LIBS} assembly)
 endif()
+target_link_libraries(x265-shared ${EXTRA_LIBS})
+target_link_libraries(x265-static ${EXTRA_LIBS})
 set_target_properties(x265-shared PROPERTIES OUTPUT_NAME x265)
 if(NOT WIN32)
     set_target_properties(x265-static PROPERTIES OUTPUT_NAME x265)
diff -r a79ecf3a7875 -r 614a68ab4703 source/Lib/TLibCommon/TComSlice.h
--- a/source/Lib/TLibCommon/TComSlice.h	Thu Oct 10 12:29:41 2013 +0530
+++ b/source/Lib/TLibCommon/TComSlice.h	Thu Oct 10 14:25:08 2013 -0500
@@ -307,7 +307,6 @@ private:
     UInt m_initialCpbRemovalDelayLengthMinus1;
     UInt m_cpbRemovalDelayLengthMinus1;
     UInt m_dpbOutputDelayLengthMinus1;
-    UInt m_numDU;
     HrdSubLayerInfo m_HRD[MAX_TLAYER];
 
 public:
diff -r a79ecf3a7875 -r 614a68ab4703 source/Lib/TLibEncoder/NALwrite.cpp
--- a/source/Lib/TLibEncoder/NALwrite.cpp	Thu Oct 10 12:29:41 2013 +0530
+++ b/source/Lib/TLibEncoder/NALwrite.cpp	Thu Oct 10 14:25:08 2013 -0500
@@ -36,6 +36,8 @@
 #include "NALwrite.h"
 #include "common.h"
 
+#include <cstring>
+
 namespace x265 {
 
 //! \ingroup TLibEncoder
diff -r a79ecf3a7875 -r 614a68ab4703 source/common/vec/blockcopy-sse3.cpp
--- a/source/common/vec/blockcopy-sse3.cpp	Thu Oct 10 12:29:41 2013 +0530
+++ b/source/common/vec/blockcopy-sse3.cpp	Thu Oct 10 14:25:08 2013 -0500
@@ -99,6 +99,78 @@ void blockcopy_p_s(int bx, int by, pixel
         }
     }
 }
+
+/* dst[x] = clip(src0[x] + src1[x], 0, (1 << X265_DEPTH) - 1) over a block bx pixels wide and
+ * by rows tall.  The SIMD paths treat each pixel as one unsigned byte (16 per 128-bit load). */
+void pixeladd_pp(int bx, int by, pixel *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1)
+{
+    // src1 is read with aligned loads on the first path, so it must take part in the check too
+    size_t aligncheck = (size_t)dst | (size_t)src0 | (size_t)src1 | bx | sstride0 | sstride1 | dstride;
+    __m128i maxval = _mm_set1_epi8((char)((1 << X265_DEPTH) - 1));
+    __m128i zero = _mm_setzero_si128();
+
+    if (!(aligncheck & 15))
+    {
+        // fast path, multiples of 16 pixel wide blocks, every pointer and stride 16-byte aligned
+        for (int y = 0; y < by; y++)
+        {
+            for (int x = 0; x < bx; x += 16)
+            {
+                __m128i word0, word1, sum;
+                word0 = _mm_load_si128((__m128i const*)(src0 + x));
+                word1 = _mm_load_si128((__m128i const*)(src1 + x));
+                sum = _mm_adds_epu8(word0, word1);
+                sum = _mm_max_epu8(sum, zero);
+                sum = _mm_min_epu8(sum, maxval);
+                _mm_storeu_si128((__m128i*)&dst[x], sum);
+            }
+            src0 += sstride0;
+            src1 += sstride1;
+            dst += dstride;
+        }
+    }
+    else if (!(bx & 15))
+    {
+        // fast path, multiples of 16 pixel wide blocks but pointers/strides require unaligned accesses;
+        // loadu is mandatory here: _mm_load_si128 faults when the address is not 16-byte aligned
+        for (int y = 0; y < by; y++)
+        {
+            for (int x = 0; x < bx; x += 16)
+            {
+                __m128i word0, word1, sum;
+                word0 = _mm_loadu_si128((__m128i const*)(src0 + x));
+                word1 = _mm_loadu_si128((__m128i const*)(src1 + x));
+                sum = _mm_adds_epu8(word0, word1);
+                sum = _mm_max_epu8(sum, zero);
+                sum = _mm_min_epu8(sum, maxval);
+                _mm_storeu_si128((__m128i*)&dst[x], sum);
+            }
+            src0 += sstride0;
+            src1 += sstride1;
+            dst += dstride;
+        }
+    }
+    else
+    {
+        int tmp;
+        int max = (1 << X265_DEPTH) - 1;
+        // slow path, irregular memory alignments or sizes: scalar add with explicit clipping
+        for (int y = 0; y < by; y++)
+        {
+            for (int x = 0; x < bx; x++)
+            {
+                tmp = src0[x] + src1[x];
+                tmp = tmp < 0 ? 0 : tmp;
+                tmp = tmp > max ? max : tmp;
+                dst[x] = (pixel)tmp;
+            }
+
+            src0 += sstride0;
+            src1 += sstride1;
+            dst += dstride;
+        }
+    }
+}
 #endif /* if HIGH_BIT_DEPTH */
 
 void blockcopy_s_p(int bx, int by, short *dst, intptr_t dstride, uint8_t *src, intptr_t sstride)
@@ -301,78 +373,6 @@ void blockcopy_p_p(int bx, int by, pixel
         }
     }
 }
-#else
-void pixeladd_pp(int bx, int by, pixel *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1)
-{
-    size_t aligncheck = (size_t)dst | (size_t)src0 | bx | sstride0 | sstride1 | dstride;
-
-    if (!(aligncheck & 15))
-    {
-        Vec16uc zero(0), maxval((1 << X265_DEPTH) - 1); 
-        // fast path, multiples of 16 pixel wide blocks
-        for (int y = 0; y < by; y++)
-        {
-            for (int x = 0; x < bx; x += 16)
-            {
-                Vec16uc vecsrc0, vecsrc1, vecsum;
-                vecsrc0.load_a(src0 + x);
-                vecsrc1.load_a(src1 + x);
-                vecsum = add_saturated(vecsrc0, vecsrc1);
-                vecsum = max(vecsum, zero);
-                vecsum = min(vecsum, maxval);
-
-                vecsum.store(dst + x);
-            }
-
-            src0 += sstride0;
-            src1 += sstride1;
-            dst += dstride;
-        }
-    }
-    else if (!(bx & 15))
-    {
-        Vec16uc zero(0), maxval((1 << X265_DEPTH) - 1); 
-        // fast path, multiples of 16 pixel wide blocks but pointers/strides require unaligned accesses
-        for (int y = 0; y < by; y++)
-        {
-            for (int x = 0; x < bx; x += 16)
-            {
-                Vec16uc vecsrc0, vecsrc1, vecsum;
-                vecsrc0.load(src0 + x);
-                vecsrc1.load(src1 + x);
-                vecsum = add_saturated(vecsrc0, vecsrc1);
-                vecsum = max(vecsum, zero);
-                vecsum = min(vecsum, maxval);
-
-                vecsum.store(dst + x);
-            }
-
-            src0 += sstride0;
-            src1 += sstride1;
-            dst += dstride;
-        }
-    }
-    else
-    {
-        int tmp;
-        int max = (1 << X265_DEPTH) - 1;
-        // slow path, irregular memory alignments or sizes
-        for (int y = 0; y < by; y++)
-        {
-            for (int x = 0; x < bx; x++)
-            {
-                tmp = src0[x] + src1[x];
-                tmp = tmp < 0 ? 0 : tmp;
-                tmp = tmp > max ? max : tmp;
-                dst[x] = (pixel)tmp;
-            }
-
-            src0 += sstride0;
-            src1 += sstride1;
-            dst += dstride;
-        }
-    }
-}
 #endif
 }
 
diff -r a79ecf3a7875 -r 614a68ab4703 source/common/vec/dct-sse3.cpp
--- a/source/common/vec/dct-sse3.cpp	Thu Oct 10 12:29:41 2013 +0530
+++ b/source/common/vec/dct-sse3.cpp	Thu Oct 10 14:25:08 2013 -0500
@@ -28,12 +28,11 @@
  * For more information, contact us at licensing at multicorewareinc.com.
  *****************************************************************************/
 
-#define INSTRSET 3
-#include "vectorclass.h"
-
 #include "primitives.h"
 #include "TLibCommon/TypeDef.h"    // TCoeff, int, UInt
 #include "TLibCommon/TComRom.h"
+#include <xmmintrin.h> // SSE
+#include <pmmintrin.h> // SSE3
 
 #include <assert.h>
 #include <string.h>
@@ -41,124 +40,6 @@
 using namespace x265;
 
 namespace {
-void dequant(const int* quantCoef, int* coef, int width, int height, int per, int rem, bool useScalingList, unsigned int log2TrSize, int *deQuantCoef)
-{
-    int invQuantScales[6] = { 40, 45, 51, 57, 64, 72 };
-
-    if (width > 32)
-    {
-        width  = 32;
-        height = 32;
-    }
-
-    int valueToAdd;
-    int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
-    int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
-
-    if (useScalingList)
-    {
-        shift += 4;
-
-        if (shift > per)
-        {
-            valueToAdd = 1 << (shift - per - 1);
-            Vec4i IAdd(valueToAdd);
-
-            for (int n = 0; n < width * height; n = n + 8)
-            {
-                Vec4i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2;
-
-                quantCoef1.load(quantCoef + n);
-                quantCoef2.load(quantCoef + n + 4);
-
-                deQuantCoef1.load(deQuantCoef + n);
-                deQuantCoef2.load(deQuantCoef + n + 4);
-
-                Vec8s quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
-
-                quantCoef1 = extend_low(quantCoef12);
-                quantCoef2 = extend_high(quantCoef12);
-
-                quantCoef1 =  (quantCoef1 *  deQuantCoef1 + IAdd) >> (shift - per);
-                quantCoef2 =  (quantCoef2 *  deQuantCoef2 + IAdd) >> (shift - per);
-
-                quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
-
-                quantCoef1 = extend_low(quantCoef12);
-                quantCoef1.store(coef + n);
-                quantCoef2 = extend_high(quantCoef12);
-                quantCoef2.store(coef + n + 4);
-            }
-        }
-        else
-        {
-            for (int n = 0; n < width * height; n = n + 8)
-            {
-                Vec4i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2;
-
-                quantCoef1.load(quantCoef + n);
-                quantCoef2.load(quantCoef + n + 4);
-
-                deQuantCoef1.load(deQuantCoef + n);
-                deQuantCoef2.load(deQuantCoef + n + 4);
-
-                Vec8s quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
-
-                quantCoef1 = extend_low(quantCoef12);
-                quantCoef2 = extend_high(quantCoef12);
-
-                quantCoef1 = quantCoef1 * deQuantCoef1;
-                quantCoef2 = quantCoef2 * deQuantCoef2;
-