[x265] [PATCH 4 of 6] idct: primitive blockfil_s for dc fill

Min Chen chenm003 at 163.com
Fri Aug 16 12:52:34 CEST 2013


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1376650220 -28800
# Node ID 23b8aafb1fded20981d39cf5cae48c59f041bef8
# Parent  0b225ee24b5d5271e0bbdc366e2bcb2c320bffe4
idct: primitive blockfil_s for dc fill

diff -r 0b225ee24b5d -r 23b8aafb1fde source/Lib/TLibCommon/TComTrQuant.cpp
--- a/source/Lib/TLibCommon/TComTrQuant.cpp	Fri Aug 16 18:49:58 2013 +0800
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp	Fri Aug 16 18:50:20 2013 +0800
@@ -477,26 +477,20 @@
         // CHECK_ME: we can't here when no any coeff
         assert(lastPos >= 0);
 
+        const UInt log2BlockSize = g_convertToBit[width];
+
 #if !HIGH_BIT_DEPTH
         // DC only
         if (lastPos == 0 && !((width == 4) && (mode != REG_DCT)))
         {
-            int dc = (((m_tmpCoeff[0] * 64 + 64) >> 7) * 64 + 2048) >> 12;
-
-            for(int i = 0; i < width; i++)
-            {
-                for(int j = 0; j < width; j++)
-                {
-                    residual[i * stride + j] = dc;
-                }
-            }
+            int dc_val = (((m_tmpCoeff[0] * 64 + 64) >> 7) * 64 + 2048) >> 12;
+            x265::primitives.blockfil_s[log2BlockSize](residual, stride, dc_val);
 
             return;
         }
 #endif
 
         // TODO: this may need larger data types for X265_DEPTH > 8
-        const UInt log2BlockSize = g_convertToBit[width];
         x265::primitives.idct[x265::IDCT_4x4 + log2BlockSize - ((width == 4) && (mode != REG_DCT))](m_tmpCoeff, residual, stride);
     }
 }
diff -r 0b225ee24b5d -r 23b8aafb1fde source/common/pixel.cpp
--- a/source/common/pixel.cpp	Fri Aug 16 18:49:58 2013 +0800
+++ b/source/common/pixel.cpp	Fri Aug 16 18:50:20 2013 +0800
@@ -416,6 +416,18 @@
     }
 }
 
+template <int size> // size: edge length of the square block, fixed at compile time
+void blockfil_s_c(short *dst, intptr_t dstride, short val) // scalar version: set every element of a size x size block to val
+{
+    for (int y = 0; y < size; y++) // one pass per row
+    {
+        for (int x = 0; x < size; x++)
+        {
+            dst[y * dstride + x] = val; // dstride is the row pitch counted in shorts, not bytes
+        }
+    }
+}
+
 void convert16to32(short *src, int *dst, int num)
 {
     for (int i = 0; i < num; i++)
@@ -723,6 +735,12 @@
     p.blockcpy_sp = blockcopy_s_p;
     p.blockcpy_sc = blockcopy_s_c;
 
+    p.blockfil_s[BLOCK_4x4]   = blockfil_s_c<4>;
+    p.blockfil_s[BLOCK_8x8]   = blockfil_s_c<8>;
+    p.blockfil_s[BLOCK_16x16] = blockfil_s_c<16>;
+    p.blockfil_s[BLOCK_32x32] = blockfil_s_c<32>;
+    p.blockfil_s[BLOCK_64x64] = blockfil_s_c<64>;
+
     p.cvt16to32     = convert16to32;
     p.cvt16to32_shl = convert16to32_shl;
     p.cvt32to16     = convert32to16;
diff -r 0b225ee24b5d -r 23b8aafb1fde source/common/primitives.h
--- a/source/common/primitives.h	Fri Aug 16 18:49:58 2013 +0800
+++ b/source/common/primitives.h	Fri Aug 16 18:50:20 2013 +0800
@@ -196,6 +196,7 @@
 typedef void (*pixelsub_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
 typedef void (*pixeladd_ss_t)(int bx, int by, short *dst, intptr_t dstride, short *src0, short *src1, intptr_t sstride0, intptr_t sstride1);
 typedef void (*pixeladd_pp_t)(int bx, int by, pixel *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
+typedef void (*blockfil_s_t)(short *dst, intptr_t dstride, short val); // fill a block of shorts at dst (row pitch dstride) with val
 
 typedef void (*intra_dc_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int bFilter);
 typedef void (*intra_planar_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width);
@@ -250,6 +251,7 @@
     blockcpy_ps_t   blockcpy_ps;                // block copy pixel from short
     blockcpy_sp_t   blockcpy_sp;                // block copy short from pixel
     blockcpy_sc_t   blockcpy_sc;                // block copy short from unsigned char
+    blockfil_s_t    blockfil_s[NUM_SQUARE_BLOCKS];  // block fill with value
     cvt16to32_t     cvt16to32;
     cvt16to32_shl_t cvt16to32_shl;
     cvt16to16_shl_t cvt16to16_shl;
diff -r 0b225ee24b5d -r 23b8aafb1fde source/common/vec/pixel.inc
--- a/source/common/vec/pixel.inc	Fri Aug 16 18:49:58 2013 +0800
+++ b/source/common/vec/pixel.inc	Fri Aug 16 18:50:20 2013 +0800
@@ -310,6 +310,74 @@
     transpose16_dummy(dst + 16,           32, src + 16 * srcstride,      srcstride);
 }
 
+void blockfil_s_4(short *dst, intptr_t dstride, short val) // SIMD fill of a 4x4 block of shorts with val
+{
+    __m128i T00;
+    T00 = _mm_cvtsi32_si128(val); // val goes into the low 32 bits; its 16-bit pattern is word 0
+    T00 = _mm_shufflelo_epi16(T00, 0); // copy word 0 into the four low words (only the low 64 bits are used below)
+
+    _mm_storel_epi64((__m128i*)&dst[0 * dstride], T00); // each store writes the low 64 bits = 4 shorts = one row
+    _mm_storel_epi64((__m128i*)&dst[1 * dstride], T00);
+    _mm_storel_epi64((__m128i*)&dst[2 * dstride], T00);
+    _mm_storel_epi64((__m128i*)&dst[3 * dstride], T00);
+}
+
+void blockfil_s_8(short *dst, intptr_t dstride, short val) // SIMD fill of an 8x8 block of shorts with val
+{
+    __m128i T00;
+    T00 = _mm_cvtsi32_si128(val); // val goes into the low 32 bits; its 16-bit pattern is word 0
+    T00 = _mm_shufflelo_epi16(T00, 0); // copy word 0 into the four low words
+    T00 = _mm_shuffle_epi32(T00, 0); // copy dword 0 into all four dwords, so all eight words hold val
+
+    _mm_storeu_si128((__m128i*)&dst[0 * dstride], T00); // unaligned 128-bit store = 8 shorts = one full row
+    _mm_storeu_si128((__m128i*)&dst[1 * dstride], T00);
+    _mm_storeu_si128((__m128i*)&dst[2 * dstride], T00);
+    _mm_storeu_si128((__m128i*)&dst[3 * dstride], T00);
+    _mm_storeu_si128((__m128i*)&dst[4 * dstride], T00);
+    _mm_storeu_si128((__m128i*)&dst[5 * dstride], T00);
+    _mm_storeu_si128((__m128i*)&dst[6 * dstride], T00);
+    _mm_storeu_si128((__m128i*)&dst[7 * dstride], T00);
+}
+
+void blockfil_s_16(short *dst, intptr_t dstride, short val) // SIMD fill of a 16x16 block of shorts with val
+{
+    __m128i T00;
+    T00 = _mm_cvtsi32_si128(val); // val goes into the low 32 bits; its 16-bit pattern is word 0
+    T00 = _mm_shufflelo_epi16(T00, 0); // copy word 0 into the four low words
+    T00 = _mm_shuffle_epi32(T00, 0); // copy dword 0 into all four dwords, so all eight words hold val
+
+    for(int i = 0; i < 16; i += 4) // four rows per iteration
+    {
+        _mm_storeu_si128((__m128i*)&dst[(i + 0) * dstride    ], T00); // columns 0..7 of the row
+        _mm_storeu_si128((__m128i*)&dst[(i + 0) * dstride + 8], T00); // columns 8..15 of the row
+        _mm_storeu_si128((__m128i*)&dst[(i + 1) * dstride    ], T00);
+        _mm_storeu_si128((__m128i*)&dst[(i + 1) * dstride + 8], T00);
+        _mm_storeu_si128((__m128i*)&dst[(i + 2) * dstride    ], T00);
+        _mm_storeu_si128((__m128i*)&dst[(i + 2) * dstride + 8], T00);
+        _mm_storeu_si128((__m128i*)&dst[(i + 3) * dstride    ], T00);
+        _mm_storeu_si128((__m128i*)&dst[(i + 3) * dstride + 8], T00);
+    }
+}
+
+void blockfil_s_32(short *dst, intptr_t dstride, short val) // SIMD fill of a 32x32 block of shorts with val
+{
+    __m128i T00;
+    T00 = _mm_cvtsi32_si128(val); // val goes into the low 32 bits; its 16-bit pattern is word 0
+    T00 = _mm_shufflelo_epi16(T00, 0); // copy word 0 into the four low words
+    T00 = _mm_shuffle_epi32(T00, 0); // copy dword 0 into all four dwords, so all eight words hold val
+
+    for(int i = 0; i < 32; i += 2) // two rows per iteration
+    {
+        _mm_storeu_si128((__m128i*)&dst[(i + 0) * dstride     ], T00); // four 8-short stores cover the 32 columns of a row
+        _mm_storeu_si128((__m128i*)&dst[(i + 0) * dstride +  8], T00);
+        _mm_storeu_si128((__m128i*)&dst[(i + 0) * dstride + 16], T00);
+        _mm_storeu_si128((__m128i*)&dst[(i + 0) * dstride + 24], T00);
+        _mm_storeu_si128((__m128i*)&dst[(i + 1) * dstride     ], T00);
+        _mm_storeu_si128((__m128i*)&dst[(i + 1) * dstride +  8], T00);
+        _mm_storeu_si128((__m128i*)&dst[(i + 1) * dstride + 16], T00);
+        _mm_storeu_si128((__m128i*)&dst[(i + 1) * dstride + 24], T00);
+    }
+}
 #endif // !HIGH_BIT_DEPTH
 }  // end anonymous namespace
 
@@ -394,6 +462,13 @@
     p.weightpUni = weightUnidir;
 #endif
 
+#if !HIGH_BIT_DEPTH
+    p.blockfil_s[BLOCK_4x4]   = blockfil_s_4;
+    p.blockfil_s[BLOCK_8x8]   = blockfil_s_8;
+    p.blockfil_s[BLOCK_16x16] = blockfil_s_16;
+    p.blockfil_s[BLOCK_32x32] = blockfil_s_32;
+#endif
+
 #if !HIGH_BIT_DEPTH && (INSTRSET >= X265_CPU_LEVEL_SSSE3)
     p.scale1D_128to64 = scale1D_128to64;
     p.scale2D_64to32 = scale2D_64to32;



More information about the x265-devel mailing list