[x265-commits] [x265] asm: fix Main12 assembly up to SSSE3
Min Chen
chenm003 at 163.com
Sat Jul 11 19:52:43 CEST 2015
details: http://hg.videolan.org/x265/rev/b89e593b9a07
branches:
changeset: 10793:b89e593b9a07
user: Min Chen <chenm003 at 163.com>
date: Fri Jul 10 15:57:44 2015 -0700
description:
asm: fix Main12 assembly up to SSSE3
Subject: [x265] Main12: comment on constant SAO_BIT_INC
details: http://hg.videolan.org/x265/rev/c296e281c75e
branches:
changeset: 10794:c296e281c75e
user: Min Chen <chenm003 at 163.com>
date: Fri Jul 10 16:25:51 2015 -0700
description:
Main12: comment on constant SAO_BIT_INC
Subject: [x265] Main12: fix up sample fault on 10b -> 12b input convert
details: http://hg.videolan.org/x265/rev/cebf086ceac0
branches:
changeset: 10795:cebf086ceac0
user: Min Chen <chenm003 at 163.com>
date: Fri Jul 10 19:20:56 2015 -0700
description:
Main12: fix up sample fault on 10b -> 12b input convert
Subject: [x265] fix pixelcmp testbench on 12bpp
details: http://hg.videolan.org/x265/rev/88f9d8752054
branches:
changeset: 10796:88f9d8752054
user: Min Chen <chenm003 at 163.com>
date: Fri Jul 10 19:21:02 2015 -0700
description:
fix pixelcmp testbench on 12bpp
Subject: [x265] test: support test-bench builds with EXPORT_C_API=OFF (special namespaces)
details: http://hg.videolan.org/x265/rev/133ee202f6db
branches:
changeset: 10797:133ee202f6db
user: Steve Borho <steve at borho.org>
date: Sat Jul 11 12:29:09 2015 -0500
description:
test: support test-bench builds with EXPORT_C_API=OFF (special namespaces)
Going forward a lot of developers will probably use a multilib setup, so this
makes it possible for them to enable the test-bench for all bit-depth builds
Subject: [x265] test: correctly report 12bit builds
details: http://hg.videolan.org/x265/rev/79f4906e9cb8
branches:
changeset: 10798:79f4906e9cb8
user: Steve Borho <steve at borho.org>
date: Sat Jul 11 12:31:21 2015 -0500
description:
test: correctly report 12bit builds
diffstat:
source/common/picyuv.cpp | 105 +++--
source/common/pixel.cpp | 13 +
source/common/primitives.h | 1 +
source/common/vec/dct-sse3.cpp | 9 +-
source/common/vec/dct-ssse3.cpp | 130 +++----
source/common/x86/const-a.asm | 1 +
source/common/x86/dct8.asm | 351 ++++++------------
source/common/x86/intrapred16.asm | 30 +-
source/common/x86/ipfilter16.asm | 682 +++++++++++++++++++------------------
source/encoder/sao.h | 2 +-
source/test/mbdstharness.cpp | 2 +-
source/test/testbench.cpp | 5 +-
source/test/testharness.h | 29 +-
13 files changed, 626 insertions(+), 734 deletions(-)
diffs (truncated from 3024 to 300 lines):
diff -r d62f941f3f88 -r 79f4906e9cb8 source/common/picyuv.cpp
--- a/source/common/picyuv.cpp Fri Jul 10 11:52:31 2015 -0500
+++ b/source/common/picyuv.cpp Sat Jul 11 12:31:21 2015 -0500
@@ -148,52 +148,62 @@ void PicYuv::copyFromPicture(const x265_
padx++;
pady++;
- if (pic.bitDepth < X265_DEPTH)
+ X265_CHECK(pic.bitDepth >= 8, "pic.bitDepth check failure");
+
+ if (pic.bitDepth == 8)
{
- pixel *yPixel = m_picOrg[0];
- pixel *uPixel = m_picOrg[1];
- pixel *vPixel = m_picOrg[2];
+#if (X265_DEPTH > 8)
+ {
+ pixel *yPixel = m_picOrg[0];
+ pixel *uPixel = m_picOrg[1];
+ pixel *vPixel = m_picOrg[2];
- uint8_t *yChar = (uint8_t*)pic.planes[0];
- uint8_t *uChar = (uint8_t*)pic.planes[1];
- uint8_t *vChar = (uint8_t*)pic.planes[2];
- int shift = X265_MAX(0, X265_DEPTH - pic.bitDepth);
+ uint8_t *yChar = (uint8_t*)pic.planes[0];
+ uint8_t *uChar = (uint8_t*)pic.planes[1];
+ uint8_t *vChar = (uint8_t*)pic.planes[2];
+ int shift = (X265_DEPTH - 8);
- primitives.planecopy_cp(yChar, pic.stride[0] / sizeof(*yChar), yPixel, m_stride, width, height, shift);
- primitives.planecopy_cp(uChar, pic.stride[1] / sizeof(*uChar), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
- primitives.planecopy_cp(vChar, pic.stride[2] / sizeof(*vChar), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
- }
- else if (pic.bitDepth == 8)
- {
- pixel *yPixel = m_picOrg[0];
- pixel *uPixel = m_picOrg[1];
- pixel *vPixel = m_picOrg[2];
+ primitives.planecopy_cp(yChar, pic.stride[0] / sizeof(*yChar), yPixel, m_stride, width, height, shift);
+ primitives.planecopy_cp(uChar, pic.stride[1] / sizeof(*uChar), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
+ primitives.planecopy_cp(vChar, pic.stride[2] / sizeof(*vChar), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
+ }
+#else /* Case for (X265_DEPTH == 8) */
+ // TODO: Do we need this path? It may be merged into the one above in the future
+ {
+ pixel *yPixel = m_picOrg[0];
+ pixel *uPixel = m_picOrg[1];
+ pixel *vPixel = m_picOrg[2];
- uint8_t *yChar = (uint8_t*)pic.planes[0];
- uint8_t *uChar = (uint8_t*)pic.planes[1];
- uint8_t *vChar = (uint8_t*)pic.planes[2];
+ uint8_t *yChar = (uint8_t*)pic.planes[0];
+ uint8_t *uChar = (uint8_t*)pic.planes[1];
+ uint8_t *vChar = (uint8_t*)pic.planes[2];
- for (int r = 0; r < height; r++)
- {
- memcpy(yPixel, yChar, width * sizeof(pixel));
+ for (int r = 0; r < height; r++)
+ {
+ memcpy(yPixel, yChar, width * sizeof(pixel));
- yPixel += m_stride;
- yChar += pic.stride[0] / sizeof(*yChar);
+ yPixel += m_stride;
+ yChar += pic.stride[0] / sizeof(*yChar);
+ }
+
+ for (int r = 0; r < height >> m_vChromaShift; r++)
+ {
+ memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
+ memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
+
+ uPixel += m_strideC;
+ vPixel += m_strideC;
+ uChar += pic.stride[1] / sizeof(*uChar);
+ vChar += pic.stride[2] / sizeof(*vChar);
+ }
}
-
- for (int r = 0; r < height >> m_vChromaShift; r++)
- {
- memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
- memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
-
- uPixel += m_strideC;
- vPixel += m_strideC;
- uChar += pic.stride[1] / sizeof(*uChar);
- vChar += pic.stride[2] / sizeof(*vChar);
- }
+#endif /* (X265_DEPTH > 8) */
}
else /* pic.bitDepth > 8 */
{
+ /* defensive programming, mask off bits that are supposed to be zero */
+ uint16_t mask = (1 << X265_DEPTH) - 1;
+ int shift = abs(pic.bitDepth - X265_DEPTH);
pixel *yPixel = m_picOrg[0];
pixel *uPixel = m_picOrg[1];
pixel *vPixel = m_picOrg[2];
@@ -202,15 +212,20 @@ void PicYuv::copyFromPicture(const x265_
uint16_t *uShort = (uint16_t*)pic.planes[1];
uint16_t *vShort = (uint16_t*)pic.planes[2];
- /* defensive programming, mask off bits that are supposed to be zero */
- uint16_t mask = (1 << X265_DEPTH) - 1;
- int shift = X265_MAX(0, pic.bitDepth - X265_DEPTH);
-
- /* shift and mask pixels to final size */
-
- primitives.planecopy_sp(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
- primitives.planecopy_sp(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
- primitives.planecopy_sp(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+ if (pic.bitDepth > X265_DEPTH)
+ {
+ /* shift right and mask pixels to final size */
+ primitives.planecopy_sp(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
+ primitives.planecopy_sp(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+ primitives.planecopy_sp(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+ }
+ else /* Case for (pic.bitDepth <= X265_DEPTH) */
+ {
+ /* shift left and mask pixels to final size */
+ primitives.planecopy_sp_shl(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
+ primitives.planecopy_sp_shl(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+ primitives.planecopy_sp_shl(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+ }
}
/* extend the right edge if width was not multiple of the minimum CU size */
diff -r d62f941f3f88 -r 79f4906e9cb8 source/common/pixel.cpp
--- a/source/common/pixel.cpp Fri Jul 10 11:52:31 2015 -0500
+++ b/source/common/pixel.cpp Sat Jul 11 12:31:21 2015 -0500
@@ -945,6 +945,18 @@ static void planecopy_sp_c(const uint16_
}
}
+static void planecopy_sp_shl_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
+{
+ for (int r = 0; r < height; r++)
+ {
+ for (int c = 0; c < width; c++)
+ dst[c] = (pixel)((src[c] << shift) & mask);
+
+ dst += dstStride;
+ src += srcStride;
+ }
+}
+
/* Estimate the total amount of influence on future quality that could be had if we
* were to improve the reference samples used to inter predict any given CU. */
static void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
@@ -1245,6 +1257,7 @@ void setupPixelPrimitives_c(EncoderPrimi
p.planecopy_cp = planecopy_cp_c;
p.planecopy_sp = planecopy_sp_c;
+ p.planecopy_sp_shl = planecopy_sp_shl_c;
p.propagateCost = estimateCUPropagateCost;
}
}
diff -r d62f941f3f88 -r 79f4906e9cb8 source/common/primitives.h
--- a/source/common/primitives.h Fri Jul 10 11:52:31 2015 -0500
+++ b/source/common/primitives.h Sat Jul 11 12:31:21 2015 -0500
@@ -312,6 +312,7 @@ struct EncoderPrimitives
extendCURowBorder_t extendRowBorder;
planecopy_cp_t planecopy_cp;
planecopy_sp_t planecopy_sp;
+ planecopy_sp_t planecopy_sp_shl;
weightp_sp_t weight_sp;
weightp_pp_t weight_pp;
diff -r d62f941f3f88 -r 79f4906e9cb8 source/common/vec/dct-sse3.cpp
--- a/source/common/vec/dct-sse3.cpp Fri Jul 10 11:52:31 2015 -0500
+++ b/source/common/vec/dct-sse3.cpp Sat Jul 11 12:31:21 2015 -0500
@@ -38,13 +38,8 @@ using namespace X265_NS;
#define SHIFT1 7
#define ADD1 64
-#if HIGH_BIT_DEPTH
-#define SHIFT2 10
-#define ADD2 512
-#else
-#define SHIFT2 12
-#define ADD2 2048
-#endif
+#define SHIFT2 (12 - (X265_DEPTH - 8))
+#define ADD2 (1 << ((SHIFT2) - 1))
ALIGN_VAR_32(static const int16_t, tab_idct_8x8[12][8]) =
{
diff -r d62f941f3f88 -r 79f4906e9cb8 source/common/vec/dct-ssse3.cpp
--- a/source/common/vec/dct-ssse3.cpp Fri Jul 10 11:52:31 2015 -0500
+++ b/source/common/vec/dct-ssse3.cpp Sat Jul 11 12:31:21 2015 -0500
@@ -34,6 +34,18 @@
#include <pmmintrin.h> // SSE3
#include <tmmintrin.h> // SSSE3
+#define DCT16_SHIFT1 (3 + X265_DEPTH - 8)
+#define DCT16_ADD1 (1 << ((DCT16_SHIFT1) - 1))
+
+#define DCT16_SHIFT2 10
+#define DCT16_ADD2 (1 << ((DCT16_SHIFT2) - 1))
+
+#define DCT32_SHIFT1 (DCT16_SHIFT1 + 1)
+#define DCT32_ADD1 (1 << ((DCT32_SHIFT1) - 1))
+
+#define DCT32_SHIFT2 (DCT16_SHIFT2 + 1)
+#define DCT32_ADD2 (1 << ((DCT32_SHIFT2) - 1))
+
using namespace X265_NS;
ALIGN_VAR_32(static const int16_t, tab_dct_8[][8]) =
@@ -100,20 +112,9 @@ ALIGN_VAR_32(static const int16_t, tab_d
static void dct16(const int16_t *src, int16_t *dst, intptr_t stride)
{
-#if HIGH_BIT_DEPTH
-#define SHIFT1 5
-#define ADD1 16
-#else
-#define SHIFT1 3
-#define ADD1 4
-#endif
-
-#define SHIFT2 10
-#define ADD2 512
-
// Const
- __m128i c_4 = _mm_set1_epi32(ADD1);
- __m128i c_512 = _mm_set1_epi32(ADD2);
+ __m128i c_4 = _mm_set1_epi32(DCT16_ADD1);
+ __m128i c_512 = _mm_set1_epi32(DCT16_ADD2);
int i;
@@ -201,29 +202,29 @@ static void dct16(const int16_t *src, in
T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[1]));
T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[1]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[0 * 16 + i], T70);
T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[2]));
T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[2]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[8 * 16 + i], T70);
T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[3]));
T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[3]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[4 * 16 + i], T70);
T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[4]));
T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[4]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[12 * 16 + i], T70);
@@ -233,8 +234,8 @@ static void dct16(const int16_t *src, in
T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[5]));
T60 = _mm_hadd_epi32(T60, T61);
T61 = _mm_hadd_epi32(T62, T63);
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[2 * 16 + i], T70);
@@ -244,8 +245,8 @@ static void dct16(const int16_t *src, in
T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[6]));
T60 = _mm_hadd_epi32(T60, T61);
T61 = _mm_hadd_epi32(T62, T63);
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[6 * 16 + i], T70);
@@ -255,8 +256,8 @@ static void dct16(const int16_t *src, in
T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[7]));
T60 = _mm_hadd_epi32(T60, T61);
More information about the x265-commits
mailing list