[x265] [PATCH 4 of 6] idct: primitive blockfil_s for dc fill
Min Chen
chenm003 at 163.com
Fri Aug 16 12:52:34 CEST 2013
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1376650220 -28800
# Node ID 23b8aafb1fded20981d39cf5cae48c59f041bef8
# Parent 0b225ee24b5d5271e0bbdc366e2bcb2c320bffe4
idct: primitive blockfil_s for dc fill
diff -r 0b225ee24b5d -r 23b8aafb1fde source/Lib/TLibCommon/TComTrQuant.cpp
--- a/source/Lib/TLibCommon/TComTrQuant.cpp Fri Aug 16 18:49:58 2013 +0800
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp Fri Aug 16 18:50:20 2013 +0800
@@ -477,26 +477,20 @@
// CHECK_ME: we should never get here when there are no coefficients
assert(lastPos >= 0);
+ const UInt log2BlockSize = g_convertToBit[width];
+
#if !HIGH_BIT_DEPTH
// DC only
if (lastPos == 0 && !((width == 4) && (mode != REG_DCT)))
{
- int dc = (((m_tmpCoeff[0] * 64 + 64) >> 7) * 64 + 2048) >> 12;
-
- for(int i = 0; i < width; i++)
- {
- for(int j = 0; j < width; j++)
- {
- residual[i * stride + j] = dc;
- }
- }
+ int dc_val = (((m_tmpCoeff[0] * 64 + 64) >> 7) * 64 + 2048) >> 12;
+ x265::primitives.blockfil_s[log2BlockSize](residual, stride, dc_val);
return;
}
#endif
// TODO: this may need larger data types for X265_DEPTH > 8
- const UInt log2BlockSize = g_convertToBit[width];
x265::primitives.idct[x265::IDCT_4x4 + log2BlockSize - ((width == 4) && (mode != REG_DCT))](m_tmpCoeff, residual, stride);
}
}
diff -r 0b225ee24b5d -r 23b8aafb1fde source/common/pixel.cpp
--- a/source/common/pixel.cpp Fri Aug 16 18:49:58 2013 +0800
+++ b/source/common/pixel.cpp Fri Aug 16 18:50:20 2013 +0800
@@ -416,6 +416,18 @@
}
}
+template <int size> // size: edge length of the square block, fixed at compile time
+void blockfil_s_c(short *dst, intptr_t dstride, short val) // plain C++ fill: writes val to every element of a size x size block of shorts
+{
+ for (int y = 0; y < size; y++) // y selects the row; consecutive rows start dstride elements apart
+ {
+ for (int x = 0; x < size; x++) // x selects the column within the current row
+ {
+ dst[y * dstride + x] = val;
+ }
+ }
+}
+
void convert16to32(short *src, int *dst, int num)
{
for (int i = 0; i < num; i++)
@@ -723,6 +735,12 @@
p.blockcpy_sp = blockcopy_s_p;
p.blockcpy_sc = blockcopy_s_c;
+ p.blockfil_s[BLOCK_4x4] = blockfil_s_c<4>;
+ p.blockfil_s[BLOCK_8x8] = blockfil_s_c<8>;
+ p.blockfil_s[BLOCK_16x16] = blockfil_s_c<16>;
+ p.blockfil_s[BLOCK_32x32] = blockfil_s_c<32>;
+ p.blockfil_s[BLOCK_64x64] = blockfil_s_c<64>;
+
p.cvt16to32 = convert16to32;
p.cvt16to32_shl = convert16to32_shl;
p.cvt32to16 = convert32to16;
diff -r 0b225ee24b5d -r 23b8aafb1fde source/common/primitives.h
--- a/source/common/primitives.h Fri Aug 16 18:49:58 2013 +0800
+++ b/source/common/primitives.h Fri Aug 16 18:50:20 2013 +0800
@@ -196,6 +196,7 @@
typedef void (*pixelsub_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
typedef void (*pixeladd_ss_t)(int bx, int by, short *dst, intptr_t dstride, short *src0, short *src1, intptr_t sstride0, intptr_t sstride1);
typedef void (*pixeladd_pp_t)(int bx, int by, pixel *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
+typedef void (*blockfil_s_t)(short *dst, intptr_t dstride, short val);
typedef void (*intra_dc_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int bFilter);
typedef void (*intra_planar_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width);
@@ -250,6 +251,7 @@
blockcpy_ps_t blockcpy_ps; // block copy pixel from short
blockcpy_sp_t blockcpy_sp; // block copy short from pixel
blockcpy_sc_t blockcpy_sc; // block copy short from unsigned char
+ blockfil_s_t blockfil_s[NUM_SQUARE_BLOCKS]; // block fill with value
cvt16to32_t cvt16to32;
cvt16to32_shl_t cvt16to32_shl;
cvt16to16_shl_t cvt16to16_shl;
diff -r 0b225ee24b5d -r 23b8aafb1fde source/common/vec/pixel.inc
--- a/source/common/vec/pixel.inc Fri Aug 16 18:49:58 2013 +0800
+++ b/source/common/vec/pixel.inc Fri Aug 16 18:50:20 2013 +0800
@@ -310,6 +310,74 @@
transpose16_dummy(dst + 16, 32, src + 16 * srcstride, srcstride);
}
+void blockfil_s_4(short *dst, intptr_t dstride, short val) // intrinsic fill of a 4x4 block of shorts with val; rows start dstride elements apart
+{
+ __m128i T00;
+ T00 = _mm_cvtsi32_si128(val); // put val into the low 32 bits of the register, remaining bits zero
+ T00 = _mm_shufflelo_epi16(T00, 0); // replicate 16-bit word 0 across the four low words
+
+ _mm_storel_epi64((__m128i*)&dst[0 * dstride], T00); // each store writes the low 64 bits, i.e. one row of four shorts
+ _mm_storel_epi64((__m128i*)&dst[1 * dstride], T00);
+ _mm_storel_epi64((__m128i*)&dst[2 * dstride], T00);
+ _mm_storel_epi64((__m128i*)&dst[3 * dstride], T00);
+}
+
+void blockfil_s_8(short *dst, intptr_t dstride, short val) // intrinsic fill of an 8x8 block of shorts with val; rows start dstride elements apart
+{
+ __m128i T00;
+ T00 = _mm_cvtsi32_si128(val); // put val into the low 32 bits of the register, remaining bits zero
+ T00 = _mm_shufflelo_epi16(T00, 0); // replicate 16-bit word 0 across the four low words
+ T00 = _mm_shuffle_epi32(T00, 0); // replicate 32-bit lane 0 across all four lanes, so all eight words hold val
+
+ _mm_storeu_si128((__m128i*)&dst[0 * dstride], T00); // each unaligned 128-bit store writes one full row of eight shorts
+ _mm_storeu_si128((__m128i*)&dst[1 * dstride], T00);
+ _mm_storeu_si128((__m128i*)&dst[2 * dstride], T00);
+ _mm_storeu_si128((__m128i*)&dst[3 * dstride], T00);
+ _mm_storeu_si128((__m128i*)&dst[4 * dstride], T00);
+ _mm_storeu_si128((__m128i*)&dst[5 * dstride], T00);
+ _mm_storeu_si128((__m128i*)&dst[6 * dstride], T00);
+ _mm_storeu_si128((__m128i*)&dst[7 * dstride], T00);
+}
+
+void blockfil_s_16(short *dst, intptr_t dstride, short val) // intrinsic fill of a 16x16 block of shorts with val; rows start dstride elements apart
+{
+ __m128i T00;
+ T00 = _mm_cvtsi32_si128(val); // put val into the low 32 bits of the register, remaining bits zero
+ T00 = _mm_shufflelo_epi16(T00, 0); // replicate 16-bit word 0 across the four low words
+ T00 = _mm_shuffle_epi32(T00, 0); // replicate 32-bit lane 0 across all four lanes, so all eight words hold val
+
+ for(int i = 0; i < 16; i += 4) // four rows per iteration, manually unrolled
+ {
+ _mm_storeu_si128((__m128i*)&dst[(i + 0) * dstride ], T00); // columns 0-7 of row i
+ _mm_storeu_si128((__m128i*)&dst[(i + 0) * dstride + 8], T00); // columns 8-15 of row i
+ _mm_storeu_si128((__m128i*)&dst[(i + 1) * dstride ], T00);
+ _mm_storeu_si128((__m128i*)&dst[(i + 1) * dstride + 8], T00);
+ _mm_storeu_si128((__m128i*)&dst[(i + 2) * dstride ], T00);
+ _mm_storeu_si128((__m128i*)&dst[(i + 2) * dstride + 8], T00);
+ _mm_storeu_si128((__m128i*)&dst[(i + 3) * dstride ], T00);
+ _mm_storeu_si128((__m128i*)&dst[(i + 3) * dstride + 8], T00);
+ }
+}
+
+void blockfil_s_32(short *dst, intptr_t dstride, short val) // intrinsic fill of a 32x32 block of shorts with val; rows start dstride elements apart
+{
+ __m128i T00;
+ T00 = _mm_cvtsi32_si128(val); // put val into the low 32 bits of the register, remaining bits zero
+ T00 = _mm_shufflelo_epi16(T00, 0); // replicate 16-bit word 0 across the four low words
+ T00 = _mm_shuffle_epi32(T00, 0); // replicate 32-bit lane 0 across all four lanes, so all eight words hold val
+
+ for(int i = 0; i < 32; i += 2) // two rows per iteration, manually unrolled
+ {
+ _mm_storeu_si128((__m128i*)&dst[(i + 0) * dstride ], T00); // columns 0-7 of row i
+ _mm_storeu_si128((__m128i*)&dst[(i + 0) * dstride + 8], T00); // columns 8-15 of row i
+ _mm_storeu_si128((__m128i*)&dst[(i + 0) * dstride + 16], T00); // columns 16-23 of row i
+ _mm_storeu_si128((__m128i*)&dst[(i + 0) * dstride + 24], T00); // columns 24-31 of row i
+ _mm_storeu_si128((__m128i*)&dst[(i + 1) * dstride ], T00);
+ _mm_storeu_si128((__m128i*)&dst[(i + 1) * dstride + 8], T00);
+ _mm_storeu_si128((__m128i*)&dst[(i + 1) * dstride + 16], T00);
+ _mm_storeu_si128((__m128i*)&dst[(i + 1) * dstride + 24], T00);
+ }
+}
#endif // !HIGH_BIT_DEPTH
} // end anonymous namespace
@@ -394,6 +462,13 @@
p.weightpUni = weightUnidir;
#endif
+#if !HIGH_BIT_DEPTH
+ p.blockfil_s[BLOCK_4x4] = blockfil_s_4;
+ p.blockfil_s[BLOCK_8x8] = blockfil_s_8;
+ p.blockfil_s[BLOCK_16x16] = blockfil_s_16;
+ p.blockfil_s[BLOCK_32x32] = blockfil_s_32;
+#endif
+
#if !HIGH_BIT_DEPTH && (INSTRSET >= X265_CPU_LEVEL_SSSE3)
p.scale1D_128to64 = scale1D_128to64;
p.scale2D_64to32 = scale2D_64to32;
More information about the x265-devel
mailing list