[x265] [PATCH] Added 10bit support to ssse3 dct16 and dct32 intrinsics
dtyx265 at gmail.com
Wed Jan 21 23:06:24 CET 2015
# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1421877896 28800
# Node ID ebbcf28b6d78afe0781516523c6f961e4404581c
# Parent 66f85a0519e2e881b3ecd0026b3fabfc46926293
Added 10bit support to ssse3 dct16 and dct32 intrinsics
WARNING: My system is old and limited to SSE3, so this change is untested!
I will be happy to fix any errors that anyone else finds.
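
For reviewers: the new SHIFT/ADD values follow the usual HEVC forward-transform scaling. The small standalone program below is not part of the patch; it only assumes the standard shift formulas, shift1 = log2(trSize) - 1 + (bitDepth - 8) and shift2 = log2(trSize) + 6, with a rounding offset of 1 << (shift - 1), and prints the constants I used for both bit depths:

#include <cstdio>

// Prints the first- and second-pass shift/round constants for a given
// transform size and bit depth, using the standard HEVC forward-transform
// scaling. The rounding offset is always 1 << (shift - 1).
static void show(int log2TrSize, int bitDepth)
{
    int shift1 = log2TrSize - 1 + (bitDepth - 8); // first (horizontal) pass
    int shift2 = log2TrSize + 6;                  // second (vertical) pass
    printf("dct%-2d %2d-bit: SHIFT1=%d ADD1=%d  SHIFT2=%d ADD2=%d\n",
           1 << log2TrSize, bitDepth,
           shift1, 1 << (shift1 - 1), shift2, 1 << (shift2 - 1));
}

int main()
{
    show(4, 8);  // dct16,  8-bit: SHIFT1=3 ADD1=4   SHIFT2=10 ADD2=512
    show(4, 10); // dct16, 10-bit: SHIFT1=5 ADD1=16  SHIFT2=10 ADD2=512
    show(5, 8);  // dct32,  8-bit: SHIFT1=4 ADD1=8   SHIFT2=11 ADD2=1024
    show(5, 10); // dct32, 10-bit: SHIFT1=6 ADD1=32  SHIFT2=11 ADD2=1024
    return 0;
}

The 8-bit values match the literals the file already used, so the non-HIGH_BIT_DEPTH build should stay bit-exact with the old code.
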
diff -r 66f85a0519e2 -r ebbcf28b6d78 source/common/vec/dct-ssse3.cpp
--- a/source/common/vec/dct-ssse3.cpp Fri Dec 19 18:32:20 2014 +0100
+++ b/source/common/vec/dct-ssse3.cpp Wed Jan 21 14:04:56 2015 -0800
@@ -36,7 +36,6 @@
using namespace x265;
-#if !HIGH_BIT_DEPTH
namespace {
ALIGN_VAR_32(static const int16_t, tab_dct_8[][8]) =
{
@@ -102,9 +101,20 @@
void dct16(const int16_t *src, int16_t *dst, intptr_t stride)
{
+#if HIGH_BIT_DEPTH
+#define SHIFT1 5
+#define ADD1 16
+#else
+#define SHIFT1 3
+#define ADD1 4
+#endif
+
+#define SHIFT2 10
+#define ADD2 512
+
// Const
- __m128i c_4 = _mm_set1_epi32(4);
- __m128i c_512 = _mm_set1_epi32(512);
+ __m128i c_4 = _mm_set1_epi32(ADD1);
+ __m128i c_512 = _mm_set1_epi32(ADD2);
int i;
@@ -192,29 +202,29 @@
T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[1]));
T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[1]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[0 * 16 + i], T70);
T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[2]));
T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[2]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[8 * 16 + i], T70);
T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[3]));
T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[3]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[4 * 16 + i], T70);
T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[4]));
T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[4]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[12 * 16 + i], T70);
@@ -224,8 +234,8 @@
T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[5]));
T60 = _mm_hadd_epi32(T60, T61);
T61 = _mm_hadd_epi32(T62, T63);
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[2 * 16 + i], T70);
@@ -235,8 +245,8 @@
T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[6]));
T60 = _mm_hadd_epi32(T60, T61);
T61 = _mm_hadd_epi32(T62, T63);
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[6 * 16 + i], T70);
@@ -246,8 +256,8 @@
T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[7]));
T60 = _mm_hadd_epi32(T60, T61);
T61 = _mm_hadd_epi32(T62, T63);
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[10 * 16 + i], T70);
@@ -257,8 +267,8 @@
T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[8]));
T60 = _mm_hadd_epi32(T60, T61);
T61 = _mm_hadd_epi32(T62, T63);
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[14 * 16 + i], T70);
@@ -277,8 +287,8 @@
T63 = _mm_hadd_epi32(T66, T67); \
T60 = _mm_hadd_epi32(T60, T61); \
T61 = _mm_hadd_epi32(T62, T63); \
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3); \
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3); \
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1); \
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1); \
T70 = _mm_packs_epi32(T60, T61); \
_mm_store_si128((__m128i*)&tmp[(dstPos) * 16 + i], T70);
@@ -342,8 +352,8 @@
T40 = _mm_hadd_epi32(T30, T31);
T41 = _mm_hsub_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
- T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_512), 10);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
+ T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_512), SHIFT2);
T40 = _mm_packs_epi32(T40, T40);
T41 = _mm_packs_epi32(T41, T41);
_mm_storel_epi64((__m128i*)&dst[0 * 16 + i], T40);
@@ -367,7 +377,7 @@
T31 = _mm_hadd_epi32(T32, T33);
T40 = _mm_hadd_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
T40 = _mm_packs_epi32(T40, T40);
_mm_storel_epi64((__m128i*)&dst[4 * 16 + i], T40);
@@ -389,7 +399,7 @@
T31 = _mm_hadd_epi32(T32, T33);
T40 = _mm_hadd_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
T40 = _mm_packs_epi32(T40, T40);
_mm_storel_epi64((__m128i*)&dst[12 * 16 + i], T40);
@@ -411,7 +421,7 @@
T31 = _mm_hadd_epi32(T32, T33);
T40 = _mm_hadd_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
T40 = _mm_packs_epi32(T40, T40);
_mm_storel_epi64((__m128i*)&dst[2 * 16 + i], T40);
@@ -433,7 +443,7 @@
T31 = _mm_hadd_epi32(T32, T33);
T40 = _mm_hadd_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
T40 = _mm_packs_epi32(T40, T40);
_mm_storel_epi64((__m128i*)&dst[6 * 16 + i], T40);
@@ -455,7 +465,7 @@
T31 = _mm_hadd_epi32(T32, T33);
T40 = _mm_hadd_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
T40 = _mm_packs_epi32(T40, T40);
_mm_storel_epi64((__m128i*)&dst[10 * 16 + i], T40);
@@ -477,7 +487,7 @@
T31 = _mm_hadd_epi32(T32, T33);
T40 = _mm_hadd_epi32(T30, T31);
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2);
T40 = _mm_packs_epi32(T40, T40);
_mm_storel_epi64((__m128i*)&dst[14 * 16 + i], T40);
@@ -500,7 +510,7 @@
T31 = _mm_hadd_epi32(T32, T33); \
\
T40 = _mm_hadd_epi32(T30, T31); \
- T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); \
+ T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), SHIFT2); \
T40 = _mm_packs_epi32(T40, T40); \
_mm_storel_epi64((__m128i*)&dst[(dstPos) * 16 + i], T40);
@@ -514,6 +524,10 @@
MAKE_ODD(28, 15);
#undef MAKE_ODD
}
+#undef SHIFT1
+#undef ADD1
+#undef SHIFT2
+#undef ADD2
}
ALIGN_VAR_32(static const int16_t, tab_dct_32_0[][8]) =
@@ -668,9 +682,20 @@
void dct32(const int16_t *src, int16_t *dst, intptr_t stride)
{
+#if HIGH_BIT_DEPTH
+#define SHIFT1 6
+#define ADD1 32
+#else
+#define SHIFT1 4
+#define ADD1 8
+#endif
+
+#define SHIFT2 11
+#define ADD2 1024
+
// Const
- __m128i c_8 = _mm_set1_epi32(8);
- __m128i c_1024 = _mm_set1_epi32(1024);
+ __m128i c_8 = _mm_set1_epi32(ADD1);
+ __m128i c_1024 = _mm_set1_epi32(ADD2);
int i;
@@ -815,15 +840,15 @@
T50 = _mm_hadd_epi32(T40, T41);
T51 = _mm_hadd_epi32(T42, T43);
- T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);
- T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
+ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
+ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
T60 = _mm_packs_epi32(T50, T51);
im[0][i] = T60;
T50 = _mm_hsub_epi32(T40, T41);
T51 = _mm_hsub_epi32(T42, T43);
- T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);
- T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
+ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
+ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
T60 = _mm_packs_epi32(T50, T51);
im[16][i] = T60;
@@ -843,8 +868,8 @@
T50 = _mm_hadd_epi32(T40, T41);
T51 = _mm_hadd_epi32(T42, T43);
- T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);
- T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
+ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
+ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
T60 = _mm_packs_epi32(T50, T51);
im[8][i] = T60;
@@ -864,8 +889,8 @@
T50 = _mm_hadd_epi32(T40, T41);
T51 = _mm_hadd_epi32(T42, T43);
- T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4);
- T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4);
+ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1);
+ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1);
T60 = _mm_packs_epi32(T50, T51);
im[24][i] = T60;
@@ -886,8 +911,8 @@
\
T50 = _mm_hadd_epi32(T40, T41); \
T51 = _mm_hadd_epi32(T42, T43); \
- T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); \
- T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); \
+ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1); \
+ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1); \
T60 = _mm_packs_epi32(T50, T51); \
im[(dstPos)][i] = T60;
@@ -949,8 +974,8 @@
\
T50 = _mm_hadd_epi32(T50, T51); \
T51 = _mm_hadd_epi32(T52, T53); \
- T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); \
- T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); \
+ T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), SHIFT1); \
+ T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), SHIFT1); \
T60 = _mm_packs_epi32(T50, T51); \
im[(dstPos)][i] = T60;
@@ -1058,7 +1083,7 @@
\
T60 = _mm_hadd_epi32(T60, T61); \
\
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), 11); \
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), SHIFT2); \
T60 = _mm_packs_epi32(T60, T60); \
_mm_storel_epi64((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \
@@ -1100,9 +1125,12 @@
MAKE_ODD(158, 159, 160, 161, 31);
#undef MAKE_ODD
}
+#undef SHIFT1
+#undef ADD1
+#undef SHIFT2
+#undef ADD2
}
}
-#endif // if !HIGH_BIT_DEPTH
namespace x265 {
void setupIntrinsicDCT_ssse3(EncoderPrimitives &p)
@@ -1110,9 +1138,7 @@
/* Note: We have AVX2 assembly for these two functions, but since AVX2 is
* still somewhat rare on end-user PCs we still compile and link these SSSE3
* intrinsic SIMD functions */
-#if !HIGH_BIT_DEPTH
p.cu[BLOCK_16x16].dct = dct16;
p.cu[BLOCK_32x32].dct = dct32;
-#endif
}
}