[x265] [PATCH] intrinsic version loopfilter FilterLumaV
Steve Borho
steve at borho.org
Sun Sep 8 03:01:33 CEST 2013
On Sat, Sep 7, 2013 at 8:02 AM, Min Chen <chenm003 at 163.com> wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1378558915 -28800
> # Node ID c0a3789a29f1cc2651274fda99ca330352d950be
> # Parent 413573beeef82c0196b88488271f459e28f711e2
> intrinsic version loopfilter FilterLumaV
>
This looks reasonable, but a few requests (see below)
>
> diff -r 413573beeef8 -r c0a3789a29f1
> source/Lib/TLibCommon/TComLoopFilter.cpp
> --- a/source/Lib/TLibCommon/TComLoopFilter.cpp Sat Sep 07 13:12:21 2013
> +0800
> +++ b/source/Lib/TLibCommon/TComLoopFilter.cpp Sat Sep 07 21:01:55 2013
> +0800
> @@ -630,9 +630,19 @@
> bool sw = xUseStrongFiltering(offset, 2 * d0, beta,
> tc, tmpsrc + srcStep * (idx * pelsInPart + blkIdx * 4 + 0))
> && xUseStrongFiltering(offset, 2 * d3, beta, tc,
> tmpsrc + srcStep * (idx * pelsInPart + blkIdx * 4 + 3));
>
> - for (int i = 0; i < DEBLOCK_SMALLEST_BLOCK / 2; i++)
> + if (dir == EDGE_VER)
> {
> - xPelFilterLuma(tmpsrc + srcStep * (idx *
> pelsInPart + blkIdx * 4 + i), offset, tc, sw, bPartPNoFilter,
> bPartQNoFilter, thrCut, bFilterP, bFilterQ);
> + //for (int i = 0; i < DEBLOCK_SMALLEST_BLOCK / 2;
> i+=2)
> + {
> + primitives.filterLumaV(tmpsrc + srcStep *
> (idx * pelsInPart + blkIdx * 4)/* + srcStep * i*/, srcStep, tc, sw,
> bPartPNoFilter, bPartQNoFilter, thrCut, bFilterP, bFilterQ);
> + }
>
Please clean these up
> + }
> + else
> + {
> + for (int i = 0; i < DEBLOCK_SMALLEST_BLOCK / 2;
> i++)
> + {
> + xPelFilterLuma(tmpsrc + srcStep * (idx *
> pelsInPart + blkIdx * 4 + i), offset, tc, sw, bPartPNoFilter,
> bPartQNoFilter, thrCut, bFilterP, bFilterQ);
> + }
> }
> }
> }
> diff -r 413573beeef8 -r c0a3789a29f1 source/common/ipfilter.cpp
> --- a/source/common/ipfilter.cpp Sat Sep 07 13:12:21 2013 +0800
> +++ b/source/common/ipfilter.cpp Sat Sep 07 21:01:55 2013 +0800
> @@ -753,6 +753,67 @@
> }
> }
>
> +void FilterLumaV(pixel* src, int offset, int tc, bool sw, bool
> bPartPNoFilter, bool bPartQNoFilter, int thrCut, bool bFilterSecondP, bool
> bFilterSecondQ)
> +{
>
int offset needs to be intptr_t stride
> + int delta;
> +
> + short m4 = (short)src[0];
> + short m3 = (short)src[-1];
> + short m5 = (short)src[1];
> + short m2 = (short)src[-2];
> + short m6 = (short)src[2];
> + short m1 = (short)src[-3];
> + short m7 = (short)src[3];
> + short m0 = (short)src[-4];
> +
> + if (sw)
> + {
> + src[-offset] = (pixel)Clip3(m3 - 2 * tc, m3 + 2 * tc, ((m1 +
> 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3));
> + src[0] = (pixel)Clip3(m4 - 2 * tc, m4 + 2 * tc, ((m2 +
> 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3));
> + src[-offset * 2] = (pixel)Clip3(m2 - 2 * tc, m2 + 2 * tc, ((m1 +
> m2 + m3 + m4 + 2) >> 2));
> + src[offset] = (pixel)Clip3(m5 - 2 * tc, m5 + 2 * tc, ((m3 +
> m4 + m5 + m6 + 2) >> 2));
> + src[-offset * 3] = (pixel)Clip3(m1 - 2 * tc, m1 + 2 * tc, ((2 *
> m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3));
> + src[offset * 2] = (pixel)Clip3(m6 - 2 * tc, m6 + 2 * tc, ((m3 +
> m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3));
> + }
> + else
> + {
> + /* Weak filter */
> + delta = (9 * (m4 - m3) - 3 * (m5 - m2) + 8) >> 4;
> +
> + if (abs(delta) < thrCut)
> + {
> + delta = Clip3(-tc, tc, delta);
> + src[-1] = (pixel)ClipY((m3 + delta));
> + src[0] = (pixel)ClipY((m4 - delta));
> +
> + int tc2 = tc >> 1;
> + if (bFilterSecondP)
> + {
> + int delta1 = Clip3(-tc2, tc2, ((((m1 + m3 + 1) >> 1) - m2
> + delta) >> 1));
> + src[-2] = (pixel)ClipY((m2 + delta1));
> + }
> + if (bFilterSecondQ)
> + {
> + int delta2 = Clip3(-tc2, tc2, ((((m6 + m4 + 1) >> 1) - m5
> - delta) >> 1));
> + src[1] = (pixel)ClipY((m5 + delta2));
> + }
> + }
> + }
> +
> + if (bPartPNoFilter)
> + {
> + src[-1] = (Pel)m3;
> + src[-2] = (Pel)m2;
> + src[-3] = (Pel)m1;
> + }
> + if (bPartQNoFilter)
> + {
> + src[0] = (Pel)m4;
> + src[1] = (Pel)m5;
> + src[2] = (Pel)m6;
>
we should use pixel in our own code, instead of Pel
> + }
> +}
> +
> namespace x265 {
> // x265 private namespace
>
> @@ -782,5 +843,7 @@
> p.filterHwghtd = filterHorizontalWeighted;
>
> p.extendRowBorder = extendCURowColBorder;
> +
> + p.filterLumaV = FilterLumaV;
> }
> }
> diff -r 413573beeef8 -r c0a3789a29f1 source/common/primitives.h
> --- a/source/common/primitives.h Sat Sep 07 13:12:21 2013 +0800
> +++ b/source/common/primitives.h Sat Sep 07 21:01:55 2013 +0800
> @@ -234,6 +234,8 @@
> typedef void (*downscale_t)(pixel *src0, pixel *dstf, pixel *dsth, pixel
> *dstv, pixel *dstc,
> intptr_t src_stride, intptr_t dst_stride, int
> width, int height);
>
> +typedef void (*FilterLumaV_t)(pixel* src, intptr_t stride, int tc, bool
> sw, bool bPartPNoFilter, bool bPartQNoFilter, int thrCut, bool
> bFilterSecondP, bool bFilterSecondQ);
> +
> /* Define a structure containing function pointers to optimized encoder
> * primitives. Each pointer can reference either an assembly routine,
> * a vectorized primitive, or a C function. */
> @@ -297,6 +299,8 @@
> scale_t scale1D_128to64;
> scale_t scale2D_64to32;
> downscale_t frame_init_lowres_core;
> +
> + FilterLumaV_t filterLumaV;
> };
>
> /* This copy of the table is what gets used by the encoder.
> diff -r 413573beeef8 -r c0a3789a29f1 source/common/vec/ipfilter.inc
> --- a/source/common/vec/ipfilter.inc Sat Sep 07 13:12:21 2013 +0800
> +++ b/source/common/vec/ipfilter.inc Sat Sep 07 21:01:55 2013 +0800
> @@ -66,5 +66,9 @@
> p.filterHwghtd = filterHorizontalWeighted;
> #endif
> #endif
> +
> +#if !HIGH_BIT_DEPTH && INSTRSET >= X265_CPU_LEVEL_SSSE3
> + p.filterLumaV = FilterLumaV;
> +#endif
> }
> }
> diff -r 413573beeef8 -r c0a3789a29f1 source/common/vec/ipfilter8.inc
> --- a/source/common/vec/ipfilter8.inc Sat Sep 07 13:12:21 2013 +0800
> +++ b/source/common/vec/ipfilter8.inc Sat Sep 07 21:01:55 2013 +0800
> @@ -1868,3 +1868,133 @@
> }
> }
> }
> +
> +#if INSTRSET >= X265_CPU_LEVEL_SSSE3
> +#ifndef DEBLOCK_SMALLEST_BLOCK
> +#define DEBLOCK_SMALLEST_BLOCK 8
> +#endif
> +
> +ALIGN_VAR_32(static const uint8_t, FilterLumaV_0[][16]) =
> +{
> + {2, 3, 1, 1, 1, 0, 0, 0, 2, 3, 1, 1, 1, 0, 0, 0},
> + {0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0},
> + {0, 1, 2, 2, 2, 1, 0, 0, 0, 1, 2, 2, 2, 1, 0, 0},
> + {0, 0, 1, 2, 2, 2, 1, 0, 0, 0, 1, 2, 2, 2, 1, 0},
> + {0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0},
> + {0, 0, 0, 1, 1, 1, 3, 2, 0, 0, 0, 1, 1, 1, 3, 2},
> +};
> +void FilterLumaV(pixel* src, intptr_t stride, int tc, bool sw, bool
> bPartPNoFilter, bool bPartQNoFilter, int thrCut, bool bFilterSecondP, bool
> bFilterSecondQ)
> +{
> + int delta;
> +
> + if (sw)
> + {
> + uint64_t mask0 = UINT64_C(0xFFFFFFFFFFFF);
> + uint64_t maskP = bPartPNoFilter ? UINT64_C(0xFFFFFF000000) :
> ~(uint64_t)0;
> + uint64_t maskQ = bPartQNoFilter ? UINT64_C(0x000000FFFFFF) :
> ~(uint64_t)0;
> + mask0 = mask0 & maskP & maskQ;
> + const __m128i mask = _mm_setr_epi32((int)mask0, (int)(mask0 >>
> 32), 0, 0);
> + const __m128i c4 = _mm_set1_epi32(0x00040004);
> +
> + // NOTE: DONT unroll this loop, VC9 BUG!
> + for(int i=0; i<2;i++)
> + {
> + __m128i T00 = _mm_loadl_epi64((__m128i*)&src[-4 ]);
> // [- - - - - - - - 7 6 5 4 3 2 1 0]
> + __m128i T01 = _mm_loadl_epi64((__m128i*)&src[-4 + stride]);
> + __m128i T02 = _mm_unpacklo_epi64(T00, T01);
> +
> + // NOTE: DONT use PMOVZXBW here, VC9 BUG!
> + __m128i T03 = _mm_unpacklo_epi8(T00, _mm_setzero_si128());
> + __m128i T04 = _mm_unpacklo_epi8(T01, _mm_setzero_si128());
> + __m128i tc2 = _mm_shuffle_epi32(_mm_cvtsi32_si128((tc << 17)
> | (tc << 1)), 0);
> + __m128i tcL0 = _mm_sub_epi16(_mm_srli_si128(T03, 2), tc2);
> + __m128i tcH0 = _mm_add_epi16(_mm_srli_si128(T03, 2), tc2);
> + __m128i tcL1 = _mm_sub_epi16(_mm_srli_si128(T04, 2), tc2);
> + __m128i tcH1 = _mm_add_epi16(_mm_srli_si128(T04, 2), tc2);
>
This part could use some ascii art to make these variable names meaningful.
Also, this new primitive needs a test case
> +
> + __m128i T10 = _mm_maddubs_epi16(T02,
> _mm_load_si128((__m128i*)FilterLumaV_0[0]));
> + __m128i T11 = _mm_maddubs_epi16(T02,
> _mm_load_si128((__m128i*)FilterLumaV_0[1]));
> + __m128i T12 = _mm_maddubs_epi16(T02,
> _mm_load_si128((__m128i*)FilterLumaV_0[2]));
> + __m128i T13 = _mm_maddubs_epi16(T02,
> _mm_load_si128((__m128i*)FilterLumaV_0[3]));
> + __m128i T14 = _mm_maddubs_epi16(T02,
> _mm_load_si128((__m128i*)FilterLumaV_0[4]));
> + __m128i T15 = _mm_maddubs_epi16(T02,
> _mm_load_si128((__m128i*)FilterLumaV_0[5]));
> +
> + __m128i T20 = _mm_unpacklo_epi64(T10, T11);
> + __m128i T21 = _mm_unpacklo_epi64(T12, T13);
> + __m128i T22 = _mm_unpacklo_epi64(T14, T15);
> + __m128i T23 = _mm_unpackhi_epi64(T10, T11);
> + __m128i T24 = _mm_unpackhi_epi64(T12, T13);
> + __m128i T25 = _mm_unpackhi_epi64(T14, T15);
> +
> + __m128i T30 = _mm_hadd_epi16(T20, T21);
> + __m128i T31 = _mm_hadd_epi16(T22, T22);
> + __m128i T40 = _mm_hadd_epi16(T30, T31);
> + __m128i T50 = _mm_srai_epi16(_mm_add_epi16(T40, c4), 3);
> +
> + T50 = _mm_max_epi16(T50, tcL0);
> + T50 = _mm_min_epi16(T50, tcH0);
> + T50 = _mm_packus_epi16(T50, T50);
> +
> + _mm_maskmoveu_si128(T50, mask, (char*)&src[-3]);
> +
> + T30 = _mm_hadd_epi16(T23, T24);
> + T31 = _mm_hadd_epi16(T25, T25);
> + T40 = _mm_hadd_epi16(T30, T31);
> + T50 = _mm_srai_epi16(_mm_add_epi16(T40, c4), 3);
> +
> + T50 = _mm_max_epi16(T50, tcL1);
> + T50 = _mm_min_epi16(T50, tcH1);
> + T50 = _mm_packus_epi16(T50, T50);
> +
> + _mm_maskmoveu_si128(T50, mask, (char*)&src[-3 + stride]);
> + src += 2 * stride;
> + }
> + }
> + else
> + {
> + for(int i = 0; i < DEBLOCK_SMALLEST_BLOCK / 2; i++)
> + {
> + short m1 = (short)src[-3];
> + short m2 = (short)src[-2];
> + short m3 = (short)src[-1];
> + short m4 = (short)src[ 0];
> + short m5 = (short)src[ 1];
> + short m6 = (short)src[ 2];
>
the casts to short are probably unnecessary?
> +
> + /* Weak filter */
> + delta = (9 * (m4 - m3) - 3 * (m5 - m2) + 8) >> 4;
> +
> + if (abs(delta) < thrCut)
> + {
> + delta = x265::Clip3(-tc, tc, delta);
> + src[-1] = (pixel)x265::ClipY((m3 + delta));
> + src[ 0] = (pixel)x265::ClipY((m4 - delta));
> +
> + int tc2 = tc >> 1;
> + if (bFilterSecondP)
> + {
> + int delta1 = x265::Clip3(-tc2, tc2, ((((m1 + m3 + 1)
> >> 1) - m2 + delta) >> 1));
> + src[-2] = (pixel)x265::ClipY((m2 + delta1));
> + }
> + if (bFilterSecondQ)
> + {
> + int delta2 = x265::Clip3(-tc2, tc2, ((((m6 + m4 + 1)
> >> 1) - m5 - delta) >> 1));
> + src[1] = (pixel)x265::ClipY((m5 + delta2));
> + }
> + }
> +
> + if (bPartPNoFilter)
> + {
> + src[-2] = (pixel)m2;
> + src[-1] = (pixel)m3;
> + }
> + if (bPartQNoFilter)
> + {
> + src[0] = (pixel)m4;
> + src[1] = (pixel)m5;
> + }
> + src += stride;
> + }
> + }
> +}
> +#endif
> diff -r 413573beeef8 -r c0a3789a29f1 source/x265.h
> --- a/source/x265.h Sat Sep 07 13:12:21 2013 +0800
> +++ b/source/x265.h Sat Sep 07 21:01:55 2013 +0800
> @@ -25,6 +25,13 @@
> #define _X265_H_
>
> #include <stdint.h>
> +#ifndef UINT64_C
> + #ifdef _MSC_VER
> + #define UINT64_C(x) (x##UL)
> + #else
> + #define UINT64_C(x) (x##ULL)
> + #endif
> +#endif
>
unless we need to define 64bit constants in our public API, this should go
into common.h or TypeDefs.h
> #if __cplusplus
> extern "C" {
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
--
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://mailman.videolan.org/private/x265-devel/attachments/20130907/d4c7162e/attachment-0001.html>
More information about the x265-devel
mailing list