[x265] [PATCH] intrinsic version loopfilter FilterLumaV

Steve Borho steve at borho.org
Sun Sep 8 03:01:33 CEST 2013


On Sat, Sep 7, 2013 at 8:02 AM, Min Chen <chenm003 at 163.com> wrote:

> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1378558915 -28800
> # Node ID c0a3789a29f1cc2651274fda99ca330352d950be
> # Parent  413573beeef82c0196b88488271f459e28f711e2
> intrinsic version loopfilter FilterLumaV
>

This looks reasonable, but a few requests (see below)


>
> diff -r 413573beeef8 -r c0a3789a29f1
> source/Lib/TLibCommon/TComLoopFilter.cpp
> --- a/source/Lib/TLibCommon/TComLoopFilter.cpp  Sat Sep 07 13:12:21 2013
> +0800
> +++ b/source/Lib/TLibCommon/TComLoopFilter.cpp  Sat Sep 07 21:01:55 2013
> +0800
> @@ -630,9 +630,19 @@
>                      bool sw =  xUseStrongFiltering(offset, 2 * d0, beta,
> tc, tmpsrc + srcStep * (idx * pelsInPart + blkIdx * 4 + 0))
>                          && xUseStrongFiltering(offset, 2 * d3, beta, tc,
> tmpsrc + srcStep * (idx * pelsInPart + blkIdx * 4 + 3));
>
> -                    for (int i = 0; i < DEBLOCK_SMALLEST_BLOCK / 2; i++)
> +                    if (dir == EDGE_VER)
>                      {
> -                        xPelFilterLuma(tmpsrc + srcStep * (idx *
> pelsInPart + blkIdx * 4 + i), offset, tc, sw, bPartPNoFilter,
> bPartQNoFilter, thrCut, bFilterP, bFilterQ);
> +                        //for (int i = 0; i < DEBLOCK_SMALLEST_BLOCK / 2;
> i+=2)
> +                        {
> +                            primitives.filterLumaV(tmpsrc + srcStep *
> (idx * pelsInPart + blkIdx * 4)/* + srcStep * i*/, srcStep, tc, sw,
> bPartPNoFilter, bPartQNoFilter, thrCut, bFilterP, bFilterQ);
> +                        }
>

Please clean these up


> +                    }
> +                    else
> +                    {
> +                        for (int i = 0; i < DEBLOCK_SMALLEST_BLOCK / 2;
> i++)
> +                        {
> +                            xPelFilterLuma(tmpsrc + srcStep * (idx *
> pelsInPart + blkIdx * 4 + i), offset, tc, sw, bPartPNoFilter,
> bPartQNoFilter, thrCut, bFilterP, bFilterQ);
> +                        }
>                      }
>                  }
>              }
> diff -r 413573beeef8 -r c0a3789a29f1 source/common/ipfilter.cpp
> --- a/source/common/ipfilter.cpp        Sat Sep 07 13:12:21 2013 +0800
> +++ b/source/common/ipfilter.cpp        Sat Sep 07 21:01:55 2013 +0800
> @@ -753,6 +753,67 @@
>      }
>  }
>
> +void FilterLumaV(pixel* src, int offset, int tc, bool sw, bool
> bPartPNoFilter, bool bPartQNoFilter, int thrCut, bool bFilterSecondP, bool
> bFilterSecondQ)
> +{
>

int offset needs to be intptr_t stride


> +    int delta;
> +
> +    short m4  = (short)src[0];
> +    short m3  = (short)src[-1];
> +    short m5  = (short)src[1];
> +    short m2  = (short)src[-2];
> +    short m6  = (short)src[2];
> +    short m1  = (short)src[-3];
> +    short m7  = (short)src[3];
> +    short m0  = (short)src[-4];
> +
> +    if (sw)
> +    {
> +        src[-offset]     = (pixel)Clip3(m3 - 2 * tc, m3 + 2 * tc, ((m1 +
> 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3));
> +        src[0]           = (pixel)Clip3(m4 - 2 * tc, m4 + 2 * tc, ((m2 +
> 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3));
> +        src[-offset * 2] = (pixel)Clip3(m2 - 2 * tc, m2 + 2 * tc, ((m1 +
> m2 + m3 + m4 + 2) >> 2));
> +        src[offset]      = (pixel)Clip3(m5 - 2 * tc, m5 + 2 * tc, ((m3 +
> m4 + m5 + m6 + 2) >> 2));
> +        src[-offset * 3] = (pixel)Clip3(m1 - 2 * tc, m1 + 2 * tc, ((2 *
> m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3));
> +        src[offset * 2]  = (pixel)Clip3(m6 - 2 * tc, m6 + 2 * tc, ((m3 +
> m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3));
> +    }
> +    else
> +    {
> +        /* Weak filter */
> +        delta = (9 * (m4 - m3) - 3 * (m5 - m2) + 8) >> 4;
> +
> +        if (abs(delta) < thrCut)
> +        {
> +            delta = Clip3(-tc, tc, delta);
> +            src[-1] = (pixel)ClipY((m3 + delta));
> +            src[0] = (pixel)ClipY((m4 - delta));
> +
> +            int tc2 = tc >> 1;
> +            if (bFilterSecondP)
> +            {
> +                int delta1 = Clip3(-tc2, tc2, ((((m1 + m3 + 1) >> 1) - m2
> + delta) >> 1));
> +                src[-2] = (pixel)ClipY((m2 + delta1));
> +            }
> +            if (bFilterSecondQ)
> +            {
> +                int delta2 = Clip3(-tc2, tc2, ((((m6 + m4 + 1) >> 1) - m5
> - delta) >> 1));
> +                src[1] = (pixel)ClipY((m5 + delta2));
> +            }
> +        }
> +    }
> +
> +    if (bPartPNoFilter)
> +    {
> +        src[-1] = (Pel)m3;
> +        src[-2] = (Pel)m2;
> +        src[-3] = (Pel)m1;
> +    }
> +    if (bPartQNoFilter)
> +    {
> +        src[0] = (Pel)m4;
> +        src[1] = (Pel)m5;
> +        src[2] = (Pel)m6;
>

we should use pixel in our own code, instead of Pel


> +    }
> +}
> +
>  namespace x265 {
>  // x265 private namespace
>
> @@ -782,5 +843,7 @@
>      p.filterHwghtd = filterHorizontalWeighted;
>
>      p.extendRowBorder = extendCURowColBorder;
> +
> +    p.filterLumaV = FilterLumaV;
>  }
>  }
> diff -r 413573beeef8 -r c0a3789a29f1 source/common/primitives.h
> --- a/source/common/primitives.h        Sat Sep 07 13:12:21 2013 +0800
> +++ b/source/common/primitives.h        Sat Sep 07 21:01:55 2013 +0800
> @@ -234,6 +234,8 @@
>  typedef void (*downscale_t)(pixel *src0, pixel *dstf, pixel *dsth, pixel
> *dstv, pixel *dstc,
>                              intptr_t src_stride, intptr_t dst_stride, int
> width, int height);
>
> +typedef void (*FilterLumaV_t)(pixel* src, intptr_t stride, int tc, bool
> sw, bool bPartPNoFilter, bool bPartQNoFilter, int thrCut, bool
> bFilterSecondP, bool bFilterSecondQ);
> +
>  /* Define a structure containing function pointers to optimized encoder
>   * primitives.  Each pointer can reference either an assembly routine,
>   * a vectorized primitive, or a C function. */
> @@ -297,6 +299,8 @@
>      scale_t         scale1D_128to64;
>      scale_t         scale2D_64to32;
>      downscale_t     frame_init_lowres_core;
> +
> +    FilterLumaV_t   filterLumaV;
>  };
>
>  /* This copy of the table is what gets used by the encoder.
> diff -r 413573beeef8 -r c0a3789a29f1 source/common/vec/ipfilter.inc
> --- a/source/common/vec/ipfilter.inc    Sat Sep 07 13:12:21 2013 +0800
> +++ b/source/common/vec/ipfilter.inc    Sat Sep 07 21:01:55 2013 +0800
> @@ -66,5 +66,9 @@
>      p.filterHwghtd = filterHorizontalWeighted;
>  #endif
>  #endif
> +
> +#if !HIGH_BIT_DEPTH && INSTRSET >= X265_CPU_LEVEL_SSSE3
> +    p.filterLumaV = FilterLumaV;
> +#endif
>  }
>  }
> diff -r 413573beeef8 -r c0a3789a29f1 source/common/vec/ipfilter8.inc
> --- a/source/common/vec/ipfilter8.inc   Sat Sep 07 13:12:21 2013 +0800
> +++ b/source/common/vec/ipfilter8.inc   Sat Sep 07 21:01:55 2013 +0800
> @@ -1868,3 +1868,133 @@
>          }
>      }
>  }
> +
> +#if INSTRSET >= X265_CPU_LEVEL_SSSE3
> +#ifndef DEBLOCK_SMALLEST_BLOCK
> +#define DEBLOCK_SMALLEST_BLOCK  8
> +#endif
> +
> +ALIGN_VAR_32(static const uint8_t, FilterLumaV_0[][16]) =
> +{
> +    {2, 3, 1, 1, 1, 0, 0, 0, 2, 3, 1, 1, 1, 0, 0, 0},
> +    {0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0},
> +    {0, 1, 2, 2, 2, 1, 0, 0, 0, 1, 2, 2, 2, 1, 0, 0},
> +    {0, 0, 1, 2, 2, 2, 1, 0, 0, 0, 1, 2, 2, 2, 1, 0},
> +    {0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0},
> +    {0, 0, 0, 1, 1, 1, 3, 2, 0, 0, 0, 1, 1, 1, 3, 2},
> +};
> +void FilterLumaV(pixel* src, intptr_t stride, int tc, bool sw, bool
> bPartPNoFilter, bool bPartQNoFilter, int thrCut, bool bFilterSecondP, bool
> bFilterSecondQ)
> +{
> +    int delta;
> +
> +    if (sw)
> +    {
> +        uint64_t mask0 = UINT64_C(0xFFFFFFFFFFFF);
> +        uint64_t maskP = bPartPNoFilter ? UINT64_C(0xFFFFFF000000) :
> ~(uint64_t)0;
> +        uint64_t maskQ = bPartQNoFilter ? UINT64_C(0x000000FFFFFF) :
> ~(uint64_t)0;
> +        mask0 = mask0 & maskP & maskQ;
> +        const __m128i mask = _mm_setr_epi32((int)mask0, (int)(mask0 >>
> 32), 0, 0);
> +        const __m128i c4 = _mm_set1_epi32(0x00040004);
> +
> +        // NOTE: DONT unroll this loop, VC9 BUG!
> +        for(int i=0; i<2;i++)
> +        {
> +            __m128i T00 = _mm_loadl_epi64((__m128i*)&src[-4         ]);
>  // [- - - - - - - - 7 6 5 4 3 2 1 0]
> +            __m128i T01 = _mm_loadl_epi64((__m128i*)&src[-4 + stride]);
> +            __m128i T02 = _mm_unpacklo_epi64(T00, T01);
> +
> +            // NOTE: DONT use PMOVZXBW here, VC9 BUG!
> +            __m128i T03 = _mm_unpacklo_epi8(T00, _mm_setzero_si128());
> +            __m128i T04 = _mm_unpacklo_epi8(T01, _mm_setzero_si128());
> +            __m128i tc2 = _mm_shuffle_epi32(_mm_cvtsi32_si128((tc << 17)
> | (tc << 1)), 0);
> +            __m128i tcL0 = _mm_sub_epi16(_mm_srli_si128(T03, 2), tc2);
> +            __m128i tcH0 = _mm_add_epi16(_mm_srli_si128(T03, 2), tc2);
> +            __m128i tcL1 = _mm_sub_epi16(_mm_srli_si128(T04, 2), tc2);
> +            __m128i tcH1 = _mm_add_epi16(_mm_srli_si128(T04, 2), tc2);
>

This part could use some ascii art to make these variable names meaningful.

Also, this new primitive needs a test case


> +
> +            __m128i T10 = _mm_maddubs_epi16(T02,
> _mm_load_si128((__m128i*)FilterLumaV_0[0]));
> +            __m128i T11 = _mm_maddubs_epi16(T02,
> _mm_load_si128((__m128i*)FilterLumaV_0[1]));
> +            __m128i T12 = _mm_maddubs_epi16(T02,
> _mm_load_si128((__m128i*)FilterLumaV_0[2]));
> +            __m128i T13 = _mm_maddubs_epi16(T02,
> _mm_load_si128((__m128i*)FilterLumaV_0[3]));
> +            __m128i T14 = _mm_maddubs_epi16(T02,
> _mm_load_si128((__m128i*)FilterLumaV_0[4]));
> +            __m128i T15 = _mm_maddubs_epi16(T02,
> _mm_load_si128((__m128i*)FilterLumaV_0[5]));
> +
> +            __m128i T20 = _mm_unpacklo_epi64(T10, T11);
> +            __m128i T21 = _mm_unpacklo_epi64(T12, T13);
> +            __m128i T22 = _mm_unpacklo_epi64(T14, T15);
> +            __m128i T23 = _mm_unpackhi_epi64(T10, T11);
> +            __m128i T24 = _mm_unpackhi_epi64(T12, T13);
> +            __m128i T25 = _mm_unpackhi_epi64(T14, T15);
> +
> +            __m128i T30 = _mm_hadd_epi16(T20, T21);
> +            __m128i T31 = _mm_hadd_epi16(T22, T22);
> +            __m128i T40 = _mm_hadd_epi16(T30, T31);
> +            __m128i T50 = _mm_srai_epi16(_mm_add_epi16(T40, c4), 3);
> +
> +            T50 = _mm_max_epi16(T50, tcL0);
> +            T50 = _mm_min_epi16(T50, tcH0);
> +            T50 = _mm_packus_epi16(T50, T50);
> +
> +            _mm_maskmoveu_si128(T50, mask, (char*)&src[-3]);
> +
> +            T30 = _mm_hadd_epi16(T23, T24);
> +            T31 = _mm_hadd_epi16(T25, T25);
> +            T40 = _mm_hadd_epi16(T30, T31);
> +            T50 = _mm_srai_epi16(_mm_add_epi16(T40, c4), 3);
> +
> +            T50 = _mm_max_epi16(T50, tcL1);
> +            T50 = _mm_min_epi16(T50, tcH1);
> +            T50 = _mm_packus_epi16(T50, T50);
> +
> +            _mm_maskmoveu_si128(T50, mask, (char*)&src[-3 + stride]);
> +            src += 2 * stride;
> +        }
> +    }
> +    else
> +    {
> +        for(int i = 0; i < DEBLOCK_SMALLEST_BLOCK / 2; i++)
> +        {
> +            short m1  = (short)src[-3];
> +            short m2  = (short)src[-2];
> +            short m3  = (short)src[-1];
> +            short m4  = (short)src[ 0];
> +            short m5  = (short)src[ 1];
> +            short m6  = (short)src[ 2];
>

the casts to short are probably unnecessary?


> +
> +            /* Weak filter */
> +            delta = (9 * (m4 - m3) - 3 * (m5 - m2) + 8) >> 4;
> +
> +            if (abs(delta) < thrCut)
> +            {
> +                delta = x265::Clip3(-tc, tc, delta);
> +                src[-1] = (pixel)x265::ClipY((m3 + delta));
> +                src[ 0] = (pixel)x265::ClipY((m4 - delta));
> +
> +                int tc2 = tc >> 1;
> +                if (bFilterSecondP)
> +                {
> +                    int delta1 = x265::Clip3(-tc2, tc2, ((((m1 + m3 + 1)
> >> 1) - m2 + delta) >> 1));
> +                    src[-2] = (pixel)x265::ClipY((m2 + delta1));
> +                }
> +                if (bFilterSecondQ)
> +                {
> +                    int delta2 = x265::Clip3(-tc2, tc2, ((((m6 + m4 + 1)
> >> 1) - m5 - delta) >> 1));
> +                    src[1] = (pixel)x265::ClipY((m5 + delta2));
> +                }
> +            }
> +
> +            if (bPartPNoFilter)
> +            {
> +                src[-2] = (pixel)m2;
> +                src[-1] = (pixel)m3;
> +            }
> +            if (bPartQNoFilter)
> +            {
> +                src[0] = (pixel)m4;
> +                src[1] = (pixel)m5;
> +            }
> +            src += stride;
> +        }
> +    }
> +}
> +#endif
> diff -r 413573beeef8 -r c0a3789a29f1 source/x265.h
> --- a/source/x265.h     Sat Sep 07 13:12:21 2013 +0800
> +++ b/source/x265.h     Sat Sep 07 21:01:55 2013 +0800
> @@ -25,6 +25,13 @@
>  #define _X265_H_
>
>  #include <stdint.h>
> +#ifndef UINT64_C
> +    #ifdef _MSC_VER
> +        #define UINT64_C(x)     (x##UL)
> +    #else
> +        #define UINT64_C(x)     (x##ULL)
> +    #endif
> +#endif
>

unless we need to define 64bit constants in our public API, this should go
into common.h or TypeDefs.h


>  #if __cplusplus
>  extern "C" {
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>



-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://mailman.videolan.org/private/x265-devel/attachments/20130907/d4c7162e/attachment-0001.html>


More information about the x265-devel mailing list