[x265] [PATCH] intrinsic version loopfilter FilterLumaV
Min Chen
chenm003 at 163.com
Sat Sep 7 15:02:04 CEST 2013
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1378558915 -28800
# Node ID c0a3789a29f1cc2651274fda99ca330352d950be
# Parent 413573beeef82c0196b88488271f459e28f711e2
intrinsic version loopfilter FilterLumaV
diff -r 413573beeef8 -r c0a3789a29f1 source/Lib/TLibCommon/TComLoopFilter.cpp
--- a/source/Lib/TLibCommon/TComLoopFilter.cpp Sat Sep 07 13:12:21 2013 +0800
+++ b/source/Lib/TLibCommon/TComLoopFilter.cpp Sat Sep 07 21:01:55 2013 +0800
@@ -630,9 +630,19 @@
bool sw = xUseStrongFiltering(offset, 2 * d0, beta, tc, tmpsrc + srcStep * (idx * pelsInPart + blkIdx * 4 + 0))
&& xUseStrongFiltering(offset, 2 * d3, beta, tc, tmpsrc + srcStep * (idx * pelsInPart + blkIdx * 4 + 3));
- for (int i = 0; i < DEBLOCK_SMALLEST_BLOCK / 2; i++)
+ if (dir == EDGE_VER)
{
- xPelFilterLuma(tmpsrc + srcStep * (idx * pelsInPart + blkIdx * 4 + i), offset, tc, sw, bPartPNoFilter, bPartQNoFilter, thrCut, bFilterP, bFilterQ);
+ // the primitive filters all DEBLOCK_SMALLEST_BLOCK / 2 lines of this edge segment in one call
+ primitives.filterLumaV(tmpsrc + srcStep * (idx * pelsInPart + blkIdx * 4), srcStep, tc, sw, bPartPNoFilter, bPartQNoFilter, thrCut, bFilterP, bFilterQ);
+ }
+ else
+ {
+ for (int i = 0; i < DEBLOCK_SMALLEST_BLOCK / 2; i++)
+ {
+ xPelFilterLuma(tmpsrc + srcStep * (idx * pelsInPart + blkIdx * 4 + i), offset, tc, sw, bPartPNoFilter, bPartQNoFilter, thrCut, bFilterP, bFilterQ);
+ }
}
}
}
diff -r 413573beeef8 -r c0a3789a29f1 source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp Sat Sep 07 13:12:21 2013 +0800
+++ b/source/common/ipfilter.cpp Sat Sep 07 21:01:55 2013 +0800
@@ -753,6 +753,67 @@
}
}
+void FilterLumaV(pixel* src, intptr_t stride, int tc, bool sw, bool bPartPNoFilter, bool bPartQNoFilter, int thrCut, bool bFilterSecondP, bool bFilterSecondQ)
+{
+    // C reference for the vertical-edge luma deblocking primitive: the signature
+    // matches FilterLumaV_t, and all DEBLOCK_SMALLEST_BLOCK / 2 (= 4) lines of the
+    // edge segment are filtered here. For a vertical edge the samples across the
+    // edge are horizontally adjacent, so the in-line offset is 1 and successive
+    // lines are 'stride' apart.
+    for (int i = 0; i < 4; i++)
+    {
+        short m0 = (short)src[-4];
+        short m1 = (short)src[-3];
+        short m2 = (short)src[-2];
+        short m3 = (short)src[-1];
+        short m4 = (short)src[0];
+        short m5 = (short)src[1];
+        short m6 = (short)src[2];
+        short m7 = (short)src[3];
+
+        if (sw)
+        {
+            /* Strong filter */
+            src[-3] = (pixel)Clip3(m1 - 2 * tc, m1 + 2 * tc, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3));
+            src[-2] = (pixel)Clip3(m2 - 2 * tc, m2 + 2 * tc, ((m1 + m2 + m3 + m4 + 2) >> 2));
+            src[-1] = (pixel)Clip3(m3 - 2 * tc, m3 + 2 * tc, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3));
+            src[0]  = (pixel)Clip3(m4 - 2 * tc, m4 + 2 * tc, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3));
+            src[1]  = (pixel)Clip3(m5 - 2 * tc, m5 + 2 * tc, ((m3 + m4 + m5 + m6 + 2) >> 2));
+            src[2]  = (pixel)Clip3(m6 - 2 * tc, m6 + 2 * tc, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3));
+        }
+        else
+        {
+            /* Weak filter */
+            int delta = (9 * (m4 - m3) - 3 * (m5 - m2) + 8) >> 4;
+
+            if (abs(delta) < thrCut)
+            {
+                delta = Clip3(-tc, tc, delta);
+                src[-1] = (pixel)ClipY(m3 + delta);
+                src[0] = (pixel)ClipY(m4 - delta);
+
+                int tc2 = tc >> 1;
+                if (bFilterSecondP)
+                {
+                    int delta1 = Clip3(-tc2, tc2, ((((m1 + m3 + 1) >> 1) - m2 + delta) >> 1));
+                    src[-2] = (pixel)ClipY(m2 + delta1);
+                }
+                if (bFilterSecondQ)
+                {
+                    int delta2 = Clip3(-tc2, tc2, ((((m6 + m4 + 1) >> 1) - m5 - delta) >> 1));
+                    src[1] = (pixel)ClipY(m5 + delta2);
+                }
+            }
+        }
+
+        /* restore the original samples on any side excluded from filtering */
+        if (bPartPNoFilter)
+        {
+            src[-1] = (pixel)m3;
+            src[-2] = (pixel)m2;
+            src[-3] = (pixel)m1;
+        }
+        if (bPartQNoFilter)
+        {
+            src[0] = (pixel)m4;
+            src[1] = (pixel)m5;
+            src[2] = (pixel)m6;
+        }
+        src += stride;
+    }
+}
+
namespace x265 {
// x265 private namespace
@@ -782,5 +843,7 @@
p.filterHwghtd = filterHorizontalWeighted;
p.extendRowBorder = extendCURowColBorder;
+
+ p.filterLumaV = FilterLumaV;
}
}
diff -r 413573beeef8 -r c0a3789a29f1 source/common/primitives.h
--- a/source/common/primitives.h Sat Sep 07 13:12:21 2013 +0800
+++ b/source/common/primitives.h Sat Sep 07 21:01:55 2013 +0800
@@ -234,6 +234,8 @@
typedef void (*downscale_t)(pixel *src0, pixel *dstf, pixel *dsth, pixel *dstv, pixel *dstc,
intptr_t src_stride, intptr_t dst_stride, int width, int height);
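+// vertical-edge luma deblocking: filters one 4-sample edge segment, DEBLOCK_SMALLEST_BLOCK / 2 lines per call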
+typedef void (*FilterLumaV_t)(pixel* src, intptr_t stride, int tc, bool sw, bool bPartPNoFilter, bool bPartQNoFilter, int thrCut, bool bFilterSecondP, bool bFilterSecondQ);
+
/* Define a structure containing function pointers to optimized encoder
* primitives. Each pointer can reference either an assembly routine,
* a vectorized primitive, or a C function. */
@@ -297,6 +299,8 @@
scale_t scale1D_128to64;
scale_t scale2D_64to32;
downscale_t frame_init_lowres_core;
+
+ FilterLumaV_t filterLumaV;
};
/* This copy of the table is what gets used by the encoder.
diff -r 413573beeef8 -r c0a3789a29f1 source/common/vec/ipfilter.inc
--- a/source/common/vec/ipfilter.inc Sat Sep 07 13:12:21 2013 +0800
+++ b/source/common/vec/ipfilter.inc Sat Sep 07 21:01:55 2013 +0800
@@ -66,5 +66,9 @@
p.filterHwghtd = filterHorizontalWeighted;
#endif
#endif
+
+#if !HIGH_BIT_DEPTH && INSTRSET >= X265_CPU_LEVEL_SSSE3
+ p.filterLumaV = FilterLumaV;
+#endif
}
}
diff -r 413573beeef8 -r c0a3789a29f1 source/common/vec/ipfilter8.inc
--- a/source/common/vec/ipfilter8.inc Sat Sep 07 13:12:21 2013 +0800
+++ b/source/common/vec/ipfilter8.inc Sat Sep 07 21:01:55 2013 +0800
@@ -1868,3 +1868,133 @@
}
}
}
+
+#if INSTRSET >= X265_CPU_LEVEL_SSSE3
+#ifndef DEBLOCK_SMALLEST_BLOCK
+#define DEBLOCK_SMALLEST_BLOCK 8
+#endif
+
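+// Tap weights for the HEVC strong luma filter: row k holds the byte weights
+// applied to src[-4..3] for the k-th output (p2', p1', p0', q0', q1', q2'),
+// repeated in both 8-byte halves so one PMADDUBSW covers two lines at once.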
+ALIGN_VAR_32(static const uint8_t, FilterLumaV_0[][16]) =
+{
+ {2, 3, 1, 1, 1, 0, 0, 0, 2, 3, 1, 1, 1, 0, 0, 0},
+ {0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0},
+ {0, 1, 2, 2, 2, 1, 0, 0, 0, 1, 2, 2, 2, 1, 0, 0},
+ {0, 0, 1, 2, 2, 2, 1, 0, 0, 0, 1, 2, 2, 2, 1, 0},
+ {0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0},
+ {0, 0, 0, 1, 1, 1, 3, 2, 0, 0, 0, 1, 1, 1, 3, 2},
+};
+void FilterLumaV(pixel* src, intptr_t stride, int tc, bool sw, bool bPartPNoFilter, bool bPartQNoFilter, int thrCut, bool bFilterSecondP, bool bFilterSecondQ)
+{
+ int delta;
+
+ if (sw)
+ {
+ uint64_t mask0 = UINT64_C(0xFFFFFFFFFFFF);
+ uint64_t maskP = bPartPNoFilter ? UINT64_C(0xFFFFFF000000) : ~(uint64_t)0;
+ uint64_t maskQ = bPartQNoFilter ? UINT64_C(0x000000FFFFFF) : ~(uint64_t)0;
+ mask0 = mask0 & maskP & maskQ;
+ const __m128i mask = _mm_setr_epi32((int)mask0, (int)(mask0 >> 32), 0, 0);
+ const __m128i c4 = _mm_set1_epi32(0x00040004);
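+ // 'mask' selects the six output bytes p2..q2 and clears the P or Q half
+ // when that side must stay unfiltered; c4 is the +4 rounding term.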
+
+ // NOTE: DONT unroll this loop, VC9 BUG!
+ for (int i = 0; i < 2; i++)
+ {
+ __m128i T00 = _mm_loadl_epi64((__m128i*)&src[-4 ]); // [- - - - - - - - 7 6 5 4 3 2 1 0]
+ __m128i T01 = _mm_loadl_epi64((__m128i*)&src[-4 + stride]);
+ __m128i T02 = _mm_unpacklo_epi64(T00, T01);
+
+ // NOTE: DONT use PMOVZXBW here, VC9 BUG!
+ __m128i T03 = _mm_unpacklo_epi8(T00, _mm_setzero_si128());
+ __m128i T04 = _mm_unpacklo_epi8(T01, _mm_setzero_si128());
+ __m128i tc2 = _mm_shuffle_epi32(_mm_cvtsi32_si128((tc << 17) | (tc << 1)), 0);
+ __m128i tcL0 = _mm_sub_epi16(_mm_srli_si128(T03, 2), tc2);
+ __m128i tcH0 = _mm_add_epi16(_mm_srli_si128(T03, 2), tc2);
+ __m128i tcL1 = _mm_sub_epi16(_mm_srli_si128(T04, 2), tc2);
+ __m128i tcH1 = _mm_add_epi16(_mm_srli_si128(T04, 2), tc2);
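+ // tc2 broadcasts 2*tc to every 16-bit lane; tcL*/tcH* clamp each filtered
+ // sample to [orig - 2*tc, orig + 2*tc] (the byte shift aligns m1..m6 with the outputs)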
+
+ __m128i T10 = _mm_maddubs_epi16(T02, _mm_load_si128((__m128i*)FilterLumaV_0[0]));
+ __m128i T11 = _mm_maddubs_epi16(T02, _mm_load_si128((__m128i*)FilterLumaV_0[1]));
+ __m128i T12 = _mm_maddubs_epi16(T02, _mm_load_si128((__m128i*)FilterLumaV_0[2]));
+ __m128i T13 = _mm_maddubs_epi16(T02, _mm_load_si128((__m128i*)FilterLumaV_0[3]));
+ __m128i T14 = _mm_maddubs_epi16(T02, _mm_load_si128((__m128i*)FilterLumaV_0[4]));
+ __m128i T15 = _mm_maddubs_epi16(T02, _mm_load_si128((__m128i*)FilterLumaV_0[5]));
+
+ __m128i T20 = _mm_unpacklo_epi64(T10, T11);
+ __m128i T21 = _mm_unpacklo_epi64(T12, T13);
+ __m128i T22 = _mm_unpacklo_epi64(T14, T15);
+ __m128i T23 = _mm_unpackhi_epi64(T10, T11);
+ __m128i T24 = _mm_unpackhi_epi64(T12, T13);
+ __m128i T25 = _mm_unpackhi_epi64(T14, T15);
+
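+ // collapse the partial sums into the six filtered values for the first line
+ // (output order p2', p1', p0', q0', q1', q2'), add the rounding term and shift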
+ __m128i T30 = _mm_hadd_epi16(T20, T21);
+ __m128i T31 = _mm_hadd_epi16(T22, T22);
+ __m128i T40 = _mm_hadd_epi16(T30, T31);
+ __m128i T50 = _mm_srai_epi16(_mm_add_epi16(T40, c4), 3);
+
+ T50 = _mm_max_epi16(T50, tcL0);
+ T50 = _mm_min_epi16(T50, tcH0);
+ T50 = _mm_packus_epi16(T50, T50);
+
+ _mm_maskmoveu_si128(T50, mask, (char*)&src[-3]);
+
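+ // same reduction for the second line, stored at src[-3 + stride]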
+ T30 = _mm_hadd_epi16(T23, T24);
+ T31 = _mm_hadd_epi16(T25, T25);
+ T40 = _mm_hadd_epi16(T30, T31);
+ T50 = _mm_srai_epi16(_mm_add_epi16(T40, c4), 3);
+
+ T50 = _mm_max_epi16(T50, tcL1);
+ T50 = _mm_min_epi16(T50, tcH1);
+ T50 = _mm_packus_epi16(T50, T50);
+
+ _mm_maskmoveu_si128(T50, mask, (char*)&src[-3 + stride]);
+ src += 2 * stride;
+ }
+ }
+ else
+ {
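+ // weak filter: kept as scalar per-line code, same math as the C reference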
+ for (int i = 0; i < DEBLOCK_SMALLEST_BLOCK / 2; i++)
+ {
+ short m1 = (short)src[-3];
+ short m2 = (short)src[-2];
+ short m3 = (short)src[-1];
+ short m4 = (short)src[ 0];
+ short m5 = (short)src[ 1];
+ short m6 = (short)src[ 2];
+
+ /* Weak filter */
+ delta = (9 * (m4 - m3) - 3 * (m5 - m2) + 8) >> 4;
+
+ if (abs(delta) < thrCut)
+ {
+ delta = x265::Clip3(-tc, tc, delta);
+ src[-1] = (pixel)x265::ClipY((m3 + delta));
+ src[ 0] = (pixel)x265::ClipY((m4 - delta));
+
+ int tc2 = tc >> 1;
+ if (bFilterSecondP)
+ {
+ int delta1 = x265::Clip3(-tc2, tc2, ((((m1 + m3 + 1) >> 1) - m2 + delta) >> 1));
+ src[-2] = (pixel)x265::ClipY((m2 + delta1));
+ }
+ if (bFilterSecondQ)
+ {
+ int delta2 = x265::Clip3(-tc2, tc2, ((((m6 + m4 + 1) >> 1) - m5 - delta) >> 1));
+ src[1] = (pixel)x265::ClipY((m5 + delta2));
+ }
+ }
+
+ if (bPartPNoFilter)
+ {
+ src[-2] = (pixel)m2;
+ src[-1] = (pixel)m3;
+ }
+ if (bPartQNoFilter)
+ {
+ src[0] = (pixel)m4;
+ src[1] = (pixel)m5;
+ }
+ src += stride;
+ }
+ }
+}
+#endif
diff -r 413573beeef8 -r c0a3789a29f1 source/x265.h
--- a/source/x265.h Sat Sep 07 13:12:21 2013 +0800
+++ b/source/x265.h Sat Sep 07 21:01:55 2013 +0800
@@ -25,6 +25,13 @@
#define _X265_H_
#include <stdint.h>
+#ifndef UINT64_C
+ #ifdef _MSC_VER
+ /* unsigned long is only 32 bits with MSVC, so use its 64-bit ui64 suffix */
+ #define UINT64_C(x) (x##ui64)
+ #else
+ #define UINT64_C(x) (x##ULL)
+ #endif
+#endif
#if __cplusplus
extern "C" {