[x265-commits] [x265] blockcopy-sse3.cpp: Replace pixeladd_pp vector class function with intrinsic.
Dnyaneshwar Gorade
dnyaneshwar at multicorewareinc.com
Thu Oct 10 21:28:22 CEST 2013
details: http://hg.videolan.org/x265/rev/12d098e5d907
branches:
changeset: 4361:12d098e5d907
user: Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
date: Thu Oct 10 15:20:57 2013 +0530
description:
blockcopy-sse3.cpp: Replace pixeladd_pp vector class function with intrinsic.
Subject: [x265] blockcopy: move intrinsic function out of vector-class section
details: http://hg.videolan.org/x265/rev/7dbbbb2a42bc
branches:
changeset: 4362:7dbbbb2a42bc
user: Steve Borho <steve at borho.org>
date: Thu Oct 10 11:49:27 2013 -0500
description:
blockcopy: move intrinsic function out of vector-class section
Subject: [x265] intra-sse3.cpp: Replace PredIntraAng4_32 vector class function with intrinsic.
details: http://hg.videolan.org/x265/rev/8b49d3995f0c
branches:
changeset: 4363:8b49d3995f0c
user: Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
date: Thu Oct 10 16:15:27 2013 +0530
description:
intra-sse3.cpp: Replace PredIntraAng4_32 vector class function with intrinsic.
Subject: [x265] dct: replace dequant vector class function with intrinsic
details: http://hg.videolan.org/x265/rev/b77a66b6b93d
branches:
changeset: 4364:b77a66b6b93d
user: Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
date: Thu Oct 10 16:57:47 2013 +0530
description:
dct: replace dequant vector class function with intrinsic
Subject: [x265] dct: replaced partialButterfly8 vector class function with intrinsic
details: http://hg.videolan.org/x265/rev/6fa763ba9da8
branches:
changeset: 4365:6fa763ba9da8
user: Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
date: Thu Oct 10 18:31:20 2013 +0530
description:
dct: replaced partialButterfly8 vector class function with intrinsic
Subject: [x265] dct: move last vector dct function into its own section
details: http://hg.videolan.org/x265/rev/b0b5c22f5a34
branches:
changeset: 4366:b0b5c22f5a34
user: Steve Borho <steve at borho.org>
date: Thu Oct 10 12:04:45 2013 -0500
description:
dct: move last vector dct function into its own section
Subject: [x265] dct: move functions which require SSE4.1 from dct-sse3.cpp to dct-sse41.cpp
details: http://hg.videolan.org/x265/rev/be7c6c42566a
branches:
changeset: 4367:be7c6c42566a
user: Steve Borho <steve at borho.org>
date: Thu Oct 10 12:07:50 2013 -0500
description:
dct: move functions which require SSE4.1 from dct-sse3.cpp to dct-sse41.cpp
Subject: [x265] dct: remove vector class includes from dct-sse41.cpp, it is clean
details: http://hg.videolan.org/x265/rev/3be4451ea3aa
branches:
changeset: 4368:3be4451ea3aa
user: Steve Borho <steve at borho.org>
date: Thu Oct 10 12:09:11 2013 -0500
description:
dct: remove vector class includes from dct-sse41.cpp, it is clean
Subject: [x265] dct: add comments for future opts/code reuse
details: http://hg.videolan.org/x265/rev/7b4a6a5f8efc
branches:
changeset: 4369:7b4a6a5f8efc
user: Steve Borho <steve at borho.org>
date: Thu Oct 10 12:18:42 2013 -0500
description:
dct: add comments for future opts/code reuse
Subject: [x265] cmake: link PPA and other libs into x265-shared and x265-static
details: http://hg.videolan.org/x265/rev/499ef0e4e254
branches:
changeset: 4370:499ef0e4e254
user: Steve Borho <steve at borho.org>
date: Thu Oct 10 13:40:46 2013 -0500
description:
cmake: link PPA and other libs into x265-shared and x265-static
Subject: [x265] cmake: merge TLibEncoderH source group into TLibEncoder
details: http://hg.videolan.org/x265/rev/bfdfeb2fd817
branches:
changeset: 4371:bfdfeb2fd817
user: Steve Borho <steve at borho.org>
date: Thu Oct 10 14:08:33 2013 -0500
description:
cmake: merge TLibEncoderH source group into TLibEncoder
They are both small enough now that they are manageable as a single unit
Subject: [x265] NALwrite: reintroduce include of cstring, required for memcpy on Linux
details: http://hg.videolan.org/x265/rev/d6b9cc9c402f
branches:
changeset: 4372:d6b9cc9c402f
user: Steve Borho <steve at borho.org>
date: Thu Oct 10 14:18:40 2013 -0500
description:
NALwrite: reintroduce include of cstring, required for memcpy on Linux
Subject: [x265] TComSlice: remove unreferenced member variable
details: http://hg.videolan.org/x265/rev/614a68ab4703
branches:
changeset: 4373:614a68ab4703
user: Steve Borho <steve at borho.org>
date: Thu Oct 10 14:25:08 2013 -0500
description:
TComSlice: remove unreferenced member variable
diffstat:
source/CMakeLists.txt | 7 +-
source/Lib/TLibCommon/TComSlice.h | 1 -
source/Lib/TLibEncoder/NALwrite.cpp | 2 +
source/common/vec/blockcopy-sse3.cpp | 144 +-
source/common/vec/dct-sse3.cpp | 1299 ++++++++++++++-------------------
source/common/vec/dct-sse41.cpp | 214 +++++-
source/common/vec/intra-sse3.cpp | 20 +-
source/encoder/CMakeLists.txt | 3 +-
8 files changed, 850 insertions(+), 840 deletions(-)
diffs (truncated from 1838 to 300 lines):
diff -r a79ecf3a7875 -r 614a68ab4703 source/CMakeLists.txt
--- a/source/CMakeLists.txt Thu Oct 10 12:29:41 2013 +0530
+++ b/source/CMakeLists.txt Thu Oct 10 14:25:08 2013 -0500
@@ -113,7 +113,7 @@ option(ENABLE_PPA "Enable PPA profiling
if(ENABLE_PPA)
add_definitions(-DENABLE_PPA)
add_subdirectory(PPA)
- SET(PLATFORM_LIBS ${PLATFORM_LIBS} PPA)
+ SET(EXTRA_LIBS ${EXTRA_LIBS} PPA)
if(UNIX)
SET(PLATFORM_LIBS ${PLATFORM_LIBS} dl)
endif(UNIX)
@@ -142,9 +142,10 @@ add_subdirectory(encoder)
add_library(x265-shared SHARED dllmain.cpp x265.def $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common>)
add_library(x265-static STATIC $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common>)
if(ENABLE_PRIMITIVES_ASM AND (MSVC OR XCODE))
- target_link_libraries(x265-shared assembly)
- target_link_libraries(x265-static assembly)
+ SET(EXTRA_LIBS ${EXTRA_LIBS} assembly)
endif()
+target_link_libraries(x265-shared ${EXTRA_LIBS})
+target_link_libraries(x265-static ${EXTRA_LIBS})
set_target_properties(x265-shared PROPERTIES OUTPUT_NAME x265)
if(NOT WIN32)
set_target_properties(x265-static PROPERTIES OUTPUT_NAME x265)
diff -r a79ecf3a7875 -r 614a68ab4703 source/Lib/TLibCommon/TComSlice.h
--- a/source/Lib/TLibCommon/TComSlice.h Thu Oct 10 12:29:41 2013 +0530
+++ b/source/Lib/TLibCommon/TComSlice.h Thu Oct 10 14:25:08 2013 -0500
@@ -307,7 +307,6 @@ private:
UInt m_initialCpbRemovalDelayLengthMinus1;
UInt m_cpbRemovalDelayLengthMinus1;
UInt m_dpbOutputDelayLengthMinus1;
- UInt m_numDU;
HrdSubLayerInfo m_HRD[MAX_TLAYER];
public:
diff -r a79ecf3a7875 -r 614a68ab4703 source/Lib/TLibEncoder/NALwrite.cpp
--- a/source/Lib/TLibEncoder/NALwrite.cpp Thu Oct 10 12:29:41 2013 +0530
+++ b/source/Lib/TLibEncoder/NALwrite.cpp Thu Oct 10 14:25:08 2013 -0500
@@ -36,6 +36,8 @@
#include "NALwrite.h"
#include "common.h"
+#include <cstring>
+
namespace x265 {
//! \ingroup TLibEncoder
diff -r a79ecf3a7875 -r 614a68ab4703 source/common/vec/blockcopy-sse3.cpp
--- a/source/common/vec/blockcopy-sse3.cpp Thu Oct 10 12:29:41 2013 +0530
+++ b/source/common/vec/blockcopy-sse3.cpp Thu Oct 10 14:25:08 2013 -0500
@@ -99,6 +99,78 @@ void blockcopy_p_s(int bx, int by, pixel
}
}
}
+
+void pixeladd_pp(int bx, int by, pixel *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1)
+{
+ size_t aligncheck = (size_t)dst | (size_t)src0 | bx | sstride0 | sstride1 | dstride;
+ int i = 1;
+ if (!(aligncheck & 15))
+ {
+ __m128i maxval = _mm_set1_epi8((i << X265_DEPTH) - 1);
+ __m128i zero = _mm_setzero_si128();
+
+ // fast path, multiples of 16 pixel wide blocks
+ for (int y = 0; y < by; y++)
+ {
+ for (int x = 0; x < bx; x += 16)
+ {
+ __m128i word0, word1, sum;
+ word0 = _mm_load_si128((__m128i const*)(src0 + x));
+ word1 = _mm_load_si128((__m128i const*)(src1 + x));
+ sum = _mm_adds_epu8(word0, word1);
+ sum = _mm_max_epu8(sum, zero);
+ sum = _mm_min_epu8(sum, maxval);
+ _mm_storeu_si128((__m128i*)&dst[x], sum);
+ }
+ src0 += sstride0;
+ src1 += sstride1;
+ dst += dstride;
+ }
+ }
+ else if (!(bx & 15))
+ {
+ __m128i maxval = _mm_set1_epi8((i << X265_DEPTH) - 1);
+ __m128i zero = _mm_setzero_si128();
+
+ // fast path, multiples of 16 pixel wide blocks but pointers/strides require unaligned accesses
+ for (int y = 0; y < by; y++)
+ {
+ for (int x = 0; x < bx; x += 16)
+ {
+ __m128i word0, word1, sum;
+ word0 = _mm_load_si128((__m128i const*)(src0 + x));
+ word1 = _mm_load_si128((__m128i const*)(src1 + x));
+ sum = _mm_adds_epu8(word0, word1);
+ sum = _mm_max_epu8(sum, zero);
+ sum = _mm_min_epu8(sum, maxval);
+ _mm_storeu_si128((__m128i*)&dst[x], sum);
+ }
+ src0 += sstride0;
+ src1 += sstride1;
+ dst += dstride;
+ }
+ }
+ else
+ {
+ int tmp;
+ int max = (1 << X265_DEPTH) - 1;
+ // slow path, irregular memory alignments or sizes
+ for (int y = 0; y < by; y++)
+ {
+ for (int x = 0; x < bx; x++)
+ {
+ tmp = src0[x] + src1[x];
+ tmp = tmp < 0 ? 0 : tmp;
+ tmp = tmp > max ? max : tmp;
+ dst[x] = (pixel)tmp;
+ }
+
+ src0 += sstride0;
+ src1 += sstride1;
+ dst += dstride;
+ }
+ }
+}
#endif /* if HIGH_BIT_DEPTH */
void blockcopy_s_p(int bx, int by, short *dst, intptr_t dstride, uint8_t *src, intptr_t sstride)
@@ -301,78 +373,6 @@ void blockcopy_p_p(int bx, int by, pixel
}
}
}
-#else
-void pixeladd_pp(int bx, int by, pixel *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1)
-{
- size_t aligncheck = (size_t)dst | (size_t)src0 | bx | sstride0 | sstride1 | dstride;
-
- if (!(aligncheck & 15))
- {
- Vec16uc zero(0), maxval((1 << X265_DEPTH) - 1);
- // fast path, multiples of 16 pixel wide blocks
- for (int y = 0; y < by; y++)
- {
- for (int x = 0; x < bx; x += 16)
- {
- Vec16uc vecsrc0, vecsrc1, vecsum;
- vecsrc0.load_a(src0 + x);
- vecsrc1.load_a(src1 + x);
- vecsum = add_saturated(vecsrc0, vecsrc1);
- vecsum = max(vecsum, zero);
- vecsum = min(vecsum, maxval);
-
- vecsum.store(dst + x);
- }
-
- src0 += sstride0;
- src1 += sstride1;
- dst += dstride;
- }
- }
- else if (!(bx & 15))
- {
- Vec16uc zero(0), maxval((1 << X265_DEPTH) - 1);
- // fast path, multiples of 16 pixel wide blocks but pointers/strides require unaligned accesses
- for (int y = 0; y < by; y++)
- {
- for (int x = 0; x < bx; x += 16)
- {
- Vec16uc vecsrc0, vecsrc1, vecsum;
- vecsrc0.load(src0 + x);
- vecsrc1.load(src1 + x);
- vecsum = add_saturated(vecsrc0, vecsrc1);
- vecsum = max(vecsum, zero);
- vecsum = min(vecsum, maxval);
-
- vecsum.store(dst + x);
- }
-
- src0 += sstride0;
- src1 += sstride1;
- dst += dstride;
- }
- }
- else
- {
- int tmp;
- int max = (1 << X265_DEPTH) - 1;
- // slow path, irregular memory alignments or sizes
- for (int y = 0; y < by; y++)
- {
- for (int x = 0; x < bx; x++)
- {
- tmp = src0[x] + src1[x];
- tmp = tmp < 0 ? 0 : tmp;
- tmp = tmp > max ? max : tmp;
- dst[x] = (pixel)tmp;
- }
-
- src0 += sstride0;
- src1 += sstride1;
- dst += dstride;
- }
- }
-}
#endif
}
diff -r a79ecf3a7875 -r 614a68ab4703 source/common/vec/dct-sse3.cpp
--- a/source/common/vec/dct-sse3.cpp Thu Oct 10 12:29:41 2013 +0530
+++ b/source/common/vec/dct-sse3.cpp Thu Oct 10 14:25:08 2013 -0500
@@ -28,12 +28,11 @@
* For more information, contact us at licensing at multicorewareinc.com.
*****************************************************************************/
-#define INSTRSET 3
-#include "vectorclass.h"
-
#include "primitives.h"
#include "TLibCommon/TypeDef.h" // TCoeff, int, UInt
#include "TLibCommon/TComRom.h"
+#include <xmmintrin.h> // SSE
+#include <pmmintrin.h> // SSE3
#include <assert.h>
#include <string.h>
@@ -41,124 +40,6 @@
using namespace x265;
namespace {
-void dequant(const int* quantCoef, int* coef, int width, int height, int per, int rem, bool useScalingList, unsigned int log2TrSize, int *deQuantCoef)
-{
- int invQuantScales[6] = { 40, 45, 51, 57, 64, 72 };
-
- if (width > 32)
- {
- width = 32;
- height = 32;
- }
-
- int valueToAdd;
- int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
- int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
-
- if (useScalingList)
- {
- shift += 4;
-
- if (shift > per)
- {
- valueToAdd = 1 << (shift - per - 1);
- Vec4i IAdd(valueToAdd);
-
- for (int n = 0; n < width * height; n = n + 8)
- {
- Vec4i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2;
-
- quantCoef1.load(quantCoef + n);
- quantCoef2.load(quantCoef + n + 4);
-
- deQuantCoef1.load(deQuantCoef + n);
- deQuantCoef2.load(deQuantCoef + n + 4);
-
- Vec8s quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
-
- quantCoef1 = extend_low(quantCoef12);
- quantCoef2 = extend_high(quantCoef12);
-
- quantCoef1 = (quantCoef1 * deQuantCoef1 + IAdd) >> (shift - per);
- quantCoef2 = (quantCoef2 * deQuantCoef2 + IAdd) >> (shift - per);
-
- quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
-
- quantCoef1 = extend_low(quantCoef12);
- quantCoef1.store(coef + n);
- quantCoef2 = extend_high(quantCoef12);
- quantCoef2.store(coef + n + 4);
- }
- }
- else
- {
- for (int n = 0; n < width * height; n = n + 8)
- {
- Vec4i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2;
-
- quantCoef1.load(quantCoef + n);
- quantCoef2.load(quantCoef + n + 4);
-
- deQuantCoef1.load(deQuantCoef + n);
- deQuantCoef2.load(deQuantCoef + n + 4);
-
- Vec8s quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
-
- quantCoef1 = extend_low(quantCoef12);
- quantCoef2 = extend_high(quantCoef12);
-
- quantCoef1 = quantCoef1 * deQuantCoef1;
- quantCoef2 = quantCoef2 * deQuantCoef2;
-
More information about the x265-commits
mailing list