[x265] [PATCH 2 of 3] ipfilter8.inc: vectorized vertical weighted filter
deepthidevaki at multicorewareinc.com
Tue Aug 6 13:56:09 CEST 2013
# HG changeset patch
# User Deepthi Devaki
# Date 1375788151 -19800
# Node ID a4b3c63cb2495a6c8369d314937a2b2cbf3e9c65
# Parent 9ff9eba7d6d625d9ce70dc7c1cd6881754955808
ipfilter8.inc: vectorized vertical weighted filter
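For reference, below is a minimal scalar sketch of what each lane of the new PROCESSROWWGHTD macro computes: an 8-tap vertical filter on the 16-bit intermediate plane for the three vertical fractional positions (the sume/sumi/sump outputs), followed by the weighted-prediction rounding. The coefficient tables are the standard HEVC 8-tap luma filters; constant values assume an 8-bit build; helper names such as filterOneWeighted are illustrative only and not part of the patch; wround/wshift are assumed to already include the IF_INTERNAL_PREC - bit-depth headroom adjustment that the function body performs.

    #include <stdint.h>

    #define IF_FILTER_PREC    6
    #define IF_INTERNAL_PREC  14
    #define IF_INTERNAL_OFFS  (1 << (IF_INTERNAL_PREC - 1))

    /* HEVC 8-tap luma coefficients for the three vertical fractional
     * positions produced as sume, sumi and sump respectively. */
    static const int coefE[8] = { -1, 4, -10, 58, 17,  -5,  1,  0 };
    static const int coefI[8] = { -1, 4, -11, 40, 40, -11,  4, -1 };
    static const int coefP[8] = {  0, 1,  -5, 17, 58, -10,  4, -1 };

    static uint8_t clip8(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }

    /* src points at the top tap (the vectorized function rewinds src by
     * (8/2 - 1) rows before the main loops). */
    static uint8_t filterOneWeighted(const int16_t *src, int srcStride,
                                     const int coef[8], int scale,
                                     int wround, int wshift, int woffset)
    {
        int sum = 0;
        for (int k = 0; k < 8; k++)
            sum += coef[k] * src[k * srcStride];

        /* shift out the filter precision and truncate to 16 bits,
         * matching the pack/unpack sequence in the macro */
        int16_t val = (int16_t)(sum >> IF_FILTER_PREC);

        /* weighting: (val + IF_INTERNAL_OFFS) * scale, round, shift,
         * add the offset, then clip to the 8-bit pixel range */
        int w = ((val + IF_INTERNAL_OFFS) * scale + wround) >> wshift;
        return clip8(w + woffset);
    }

The vectorized code evaluates all three filters for four columns at a time, sharing the common partial products (the 4x, -10x, 17x and 40x terms) across the e/i/p positions before applying the weighting to each result.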
diff -r 9ff9eba7d6d6 -r a4b3c63cb249 source/common/vec/ipfilter.inc
--- a/source/common/vec/ipfilter.inc Tue Aug 06 16:49:54 2013 +0530
+++ b/source/common/vec/ipfilter.inc Tue Aug 06 16:52:31 2013 +0530
@@ -62,6 +62,7 @@
#if !HIGH_BIT_DEPTH && INSTRSET >= X265_CPU_LEVEL_SSE41
p.filterVmulti = filterVerticalMultiplaneExtend;
+ p.filterVwghtd = filterVerticalWeighted;
#if !(defined(_MSC_VER) && _MSC_VER == 1500 && X86_64)
p.filterHmulti = filterHorizontalMultiplaneExtend;
p.filterHwghtd = filterHorizontalWeighted;
diff -r 9ff9eba7d6d6 -r a4b3c63cb249 source/common/vec/ipfilter8.inc
--- a/source/common/vec/ipfilter8.inc Tue Aug 06 16:49:54 2013 +0530
+++ b/source/common/vec/ipfilter8.inc Tue Aug 06 16:52:31 2013 +0530
@@ -445,6 +445,226 @@
}
}
+#define PROCESSROWWGHTD(a0, a1, a2, a3, a4, a5, a6, a7) { \
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + (row + 7) * srcStride)); \
+ a7 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15)); \
+ exp1 = _mm_sub_epi32(_mm_sub_epi32(_mm_sll_epi32(a1, _mm_cvtsi32_si128(2)), a0), _mm_mullo_epi32(a2, _mm_set1_epi32(10))); \
+ exp2 = _mm_mullo_epi32(a3, _mm_set1_epi32(40)); \
+ exp3 = _mm_mullo_epi32(a3, _mm_set1_epi32(17)); \
+ exp4 = _mm_mullo_epi32(a4, _mm_set1_epi32(17)); \
+ exp5 = _mm_mullo_epi32(a4, _mm_set1_epi32(40)); \
+ exp6 = _mm_sub_epi32(_mm_sub_epi32(_mm_sll_epi32(a6, _mm_cvtsi32_si128(2)), a7), _mm_mullo_epi32(a5, _mm_set1_epi32(10))); \
+ sume = _mm_add_epi32(exp1, _mm_add_epi32(_mm_add_epi32(exp2, exp3), \
+ _mm_add_epi32(_mm_add_epi32(a3, exp4), \
+ _mm_add_epi32(_mm_mullo_epi32(a5, _mm_set1_epi32(-5)), \
+ a6) \
+ ) \
+ ) \
+ ); \
+ sumi = _mm_sub_epi32(_mm_add_epi32(_mm_add_epi32(exp1, exp2), _mm_add_epi32(exp5, exp6)), \
+ _mm_add_epi32(a2, a5)); \
+ sump = _mm_add_epi32(a1, _mm_add_epi32(_mm_add_epi32(exp3, exp4), \
+ _mm_add_epi32(_mm_add_epi32(exp5, exp6), \
+ _mm_add_epi32(_mm_mullo_epi32(a2, _mm_set1_epi32(-5)), \
+ a4) \
+ ) \
+ ) \
+ ); \
+ /* store results */ \
+ sumi = _mm_srai_epi32(sumi, IF_FILTER_PREC); \
+ tmp = _mm_packus_epi32(_mm_and_si128(sumi, _mm_set1_epi32(0x0000FFFF)), _mm_set1_epi32(0)); \
+ sumi = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15)); \
+ /*Apply weight*/ \
+ sumi = _mm_mullo_epi32(sumi, vscale); \
+ sumi = _mm_add_epi32(sumi, vround); \
+ sumi = _mm_srai_epi32(sumi, wshift); \
+ sumi = _mm_add_epi32(sumi, ofs); \
+ tmp = _mm_packs_epi32(sumi, _mm_setzero_si128()); \
+ sumi = _mm_packus_epi16(tmp, _mm_setzero_si128()); \
+ *(uint32_t*)(dstI + row * dstStride + col) = _mm_cvtsi128_si32(sumi); \
+ sume = _mm_srai_epi32(sume, IF_FILTER_PREC); \
+ tmp = _mm_packus_epi32(_mm_and_si128(sume, _mm_set1_epi32(0x0000FFFF)), _mm_set1_epi32(0)); \
+ sume = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15)); \
+ /*Apply weight*/ \
+ sume = _mm_mullo_epi32(sume, vscale); \
+ sume = _mm_add_epi32(sume, vround); \
+ sume = _mm_srai_epi32(sume, wshift); \
+ sume = _mm_add_epi32(sume, ofs); \
+ tmp = _mm_packs_epi32(sume, _mm_setzero_si128()); \
+ sume = _mm_packus_epi16(tmp, _mm_setzero_si128()); \
+ *(uint32_t*)(dstE + row * dstStride + col) = _mm_cvtsi128_si32(sume); \
+ sump = _mm_srai_epi32(sump, IF_FILTER_PREC); \
+ tmp = _mm_packus_epi32(_mm_and_si128(sump, _mm_set1_epi32(0x0000FFFF)), _mm_set1_epi32(0)); \
+ sump = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15)); \
+ /*Apply weight*/ \
+ sump = _mm_mullo_epi32(sump, vscale); \
+ sump = _mm_add_epi32(sump, vround); \
+ sump = _mm_srai_epi32(sump, wshift); \
+ sump = _mm_add_epi32(sump, ofs); \
+ tmp = _mm_packs_epi32(sump, _mm_setzero_si128()); \
+ sump = _mm_packus_epi16(tmp, _mm_setzero_si128()); \
+ *(uint32_t*)(dstP + row * dstStride + col) = _mm_cvtsi128_si32(sump); \
+}
+
+void filterVerticalWeighted(short *src, int srcStride,
+ pixel *dstE, pixel *dstI, pixel *dstP, int dstStride,
+ int block_width, int block_height,
+ int marginX, int marginY,
+ int scale, int wround, int wshift, int woffset)
+{
+ int row, col;
+
+ int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
+ int shift = IF_FILTER_PREC + headRoom;
+
+ int offset = 1 << (shift - 1);
+
+ offset += IF_INTERNAL_OFFS << IF_FILTER_PREC;
+
+ src -= (8 / 2 - 1) * srcStride;
+
+ __m128i tmp16e, tmp16i, tmp16p;
+ __m128i a0, a1, a2, a3, a4, a5, a6, a7;
+ __m128i tmp;
+ __m128i sume, sumi, sump;
+ __m128i exp1, exp2, exp3, exp4, exp5, exp6;
+
+ Int shiftNum = IF_INTERNAL_PREC - X265_DEPTH;
+ wshift = wshift + shiftNum;
+ wround = wshift ? (1 << (wshift - 1)) : 0;
+
+ __m128i vround = _mm_set1_epi32(wround + scale * IF_INTERNAL_OFFS);
+ __m128i ofs = _mm_set1_epi32(woffset);
+ __m128i vscale = _mm_set1_epi32(scale);
+
+ col = 0;
+
+ tmp = _mm_loadu_si128((__m128i const*)(src + col));
+ a0 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + srcStride));
+ a1 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 2 * srcStride));
+ a2 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 3 * srcStride));
+ a3 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 4 * srcStride));
+ a4 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 5 * srcStride));
+ a5 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 6 * srcStride));
+ a6 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+
+ for (row = 0; row < block_height; row++)
+ {
+ PROCESSROWWGHTD(a0, a1, a2, a3, a4, a5, a6, a7) EXTENDCOL(0, 0) row++;
+ PROCESSROWWGHTD(a1, a2, a3, a4, a5, a6, a7, a0) EXTENDCOL(0, 0) row++;
+ PROCESSROWWGHTD(a2, a3, a4, a5, a6, a7, a0, a1) EXTENDCOL(0, 0) row++;
+ PROCESSROWWGHTD(a3, a4, a5, a6, a7, a0, a1, a2) EXTENDCOL(0, 0) row++;
+ PROCESSROWWGHTD(a4, a5, a6, a7, a0, a1, a2, a3) EXTENDCOL(0, 0) row++;
+ PROCESSROWWGHTD(a5, a6, a7, a0, a1, a2, a3, a4) EXTENDCOL(0, 0) row++;
+ PROCESSROWWGHTD(a6, a7, a0, a1, a2, a3, a4, a5) EXTENDCOL(0, 0) row++;
+ PROCESSROWWGHTD(a7, a0, a1, a2, a3, a4, a5, a6) EXTENDCOL(0, 0)
+ }
+
+ col += 4;
+
+ for (; col < block_width - 4; col += 4) // Considering block width is always a multiple of 4
+ {
+ tmp = _mm_loadu_si128((__m128i const*)(src + col));
+ a0 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + srcStride));
+ a1 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 2 * srcStride));
+ a2 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 3 * srcStride));
+ a3 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 4 * srcStride));
+ a4 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 5 * srcStride));
+ a5 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 6 * srcStride));
+ a6 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+
+ for (row = 0; row < block_height; row++)
+ {
+ PROCESSROWWGHTD(a0, a1, a2, a3, a4, a5, a6, a7) row++;
+ PROCESSROWWGHTD(a1, a2, a3, a4, a5, a6, a7, a0) row++;
+ PROCESSROWWGHTD(a2, a3, a4, a5, a6, a7, a0, a1) row++;
+ PROCESSROWWGHTD(a3, a4, a5, a6, a7, a0, a1, a2) row++;
+ PROCESSROWWGHTD(a4, a5, a6, a7, a0, a1, a2, a3) row++;
+ PROCESSROWWGHTD(a5, a6, a7, a0, a1, a2, a3, a4) row++;
+ PROCESSROWWGHTD(a6, a7, a0, a1, a2, a3, a4, a5) row++;
+ PROCESSROWWGHTD(a7, a0, a1, a2, a3, a4, a5, a6)
+ }
+ }
+
+ tmp = _mm_loadu_si128((__m128i const*)(src + col));
+ a0 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + srcStride));
+ a1 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 2 * srcStride));
+ a2 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 3 * srcStride));
+ a3 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 4 * srcStride));
+ a4 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 5 * srcStride));
+ a5 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 6 * srcStride));
+ a6 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+
+ for (row = 0; row < block_height; row++)
+ {
+ PROCESSROWWGHTD(a0, a1, a2, a3, a4, a5, a6, a7) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROWWGHTD(a1, a2, a3, a4, a5, a6, a7, a0) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROWWGHTD(a2, a3, a4, a5, a6, a7, a0, a1) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROWWGHTD(a3, a4, a5, a6, a7, a0, a1, a2) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROWWGHTD(a4, a5, a6, a7, a0, a1, a2, a3) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROWWGHTD(a5, a6, a7, a0, a1, a2, a3, a4) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROWWGHTD(a6, a7, a0, a1, a2, a3, a4, a5) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROWWGHTD(a7, a0, a1, a2, a3, a4, a5, a6) EXTENDCOL((block_width + marginX), 3)
+ }
+
+ // Extending bottom rows
+ pixel *pe, *pi, *pp;
+ pe = dstE + (block_height - 1) * dstStride - marginX;
+ pi = dstI + (block_height - 1) * dstStride - marginX;
+ pp = dstP + (block_height - 1) * dstStride - marginX;
+ for (int y = 1; y <= marginY; y++)
+ {
+ memcpy(pe + y * dstStride, pe, block_width + marginX * 2);
+ }
+
+ for (int y = 1; y <= marginY; y++)
+ {
+ memcpy(pi + y * dstStride, pi, block_width + marginX * 2);
+ }
+
+ for (int y = 1; y <= marginY; y++)
+ {
+ memcpy(pp + y * dstStride, pp, block_width + marginX * 2);
+ }
+
+ // Extending top rows
+ pe -= ((block_height - 1) * dstStride);
+ pi -= ((block_height - 1) * dstStride);
+ pp -= ((block_height - 1) * dstStride);
+ for (int y = 1; y <= marginY; y++)
+ {
+ memcpy(pe - y * dstStride, pe, block_width + marginX * 2);
+ }
+
+ for (int y = 1; y <= marginY; y++)
+ {
+ memcpy(pi - y * dstStride, pi, block_width + marginX * 2);
+ }
+
+ for (int y = 1; y <= marginY; y++)
+ {
+ memcpy(pp - y * dstStride, pp, block_width + marginX * 2);
+ }
+}
+
template<int N>
void filterVertical_p_p(pixel *src, int srcStride,
pixel *dst, int dstStride,