[x265] [PATCH 4 of 4] asm: chroma_p2s to replace ipfilter_p2s

Steve Borho steve at borho.org
Thu Oct 31 18:13:24 CET 2013


On Thu, Oct 31, 2013 at 8:03 AM, Min Chen <chenm003 at 163.com> wrote:

> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1383224503 -28800
> # Node ID 4a40c4069ad12bc72a1c443b45a91c65d319d35d
> # Parent  21dbf988079b0e33265ae48578c26347cc779fbe
> asm: chroma_p2s to replace ipfilter_p2s
>

The testbench is reporting failures after this change. I'll see if I can
patch this myself.


>
> diff -r 21dbf988079b -r 4a40c4069ad1
> source/Lib/TLibCommon/TComPrediction.cpp
> --- a/source/Lib/TLibCommon/TComPrediction.cpp  Thu Oct 31 21:01:29 2013
> +0800
> +++ b/source/Lib/TLibCommon/TComPrediction.cpp  Thu Oct 31 21:01:43 2013
> +0800
> @@ -619,10 +619,13 @@
>      uint32_t cxWidth = width >> 1;
>      uint32_t cxHeight = height >> 1;
>
> +    assert(dstStride == MAX_CU_SIZE / 2);
> +    assert(((cxWidth | cxHeight) % 2) == 0);
> +
>      if ((yFrac | xFrac) == 0)
>      {
> -        primitives.ipfilter_p2s(refCb, refStride, dstCb, dstStride,
> cxWidth, cxHeight);
> -        primitives.ipfilter_p2s(refCr, refStride, dstCr, dstStride,
> cxWidth, cxHeight);
> +        primitives.chroma_p2s(refCb, refStride, dstCb, cxWidth, cxHeight);
> +        primitives.chroma_p2s(refCr, refStride, dstCr, cxWidth, cxHeight);
>      }
>      else if (yFrac == 0)
>      {
> diff -r 21dbf988079b -r 4a40c4069ad1 source/common/ipfilter.cpp
> --- a/source/common/ipfilter.cpp        Thu Oct 31 21:01:29 2013 +0800
> +++ b/source/common/ipfilter.cpp        Thu Oct 31 21:01:43 2013 +0800
> @@ -264,6 +264,7 @@
>      }
>  }
>
> +template<int dstStride>
>  void filterConvertPelToShort_c(pixel *src, intptr_t srcStride, int16_t
> *dst, int width, int height)
>  {
>      int shift = IF_INTERNAL_PREC - X265_DEPTH;
> @@ -278,7 +279,7 @@
>          }
>
>          src += srcStride;
> -        dst += MAX_CU_SIZE;
> +        dst += dstStride;
>      }
>  }
>
> @@ -489,7 +490,8 @@
>
>      p.ipfilter_p2s = filterConvertPelToShort_c;
>      p.ipfilter_s2p = filterConvertShortToPel_c;
> -    p.luma_p2s = filterConvertPelToShort_c;
> +    p.luma_p2s = filterConvertPelToShort_c<MAX_CU_SIZE>;
> +    p.chroma_p2s = filterConvertPelToShort_c<MAX_CU_SIZE/2>;
>
>      p.extendRowBorder = extendCURowColBorder;
>  }
> diff -r 21dbf988079b -r 4a40c4069ad1 source/common/primitives.h
> --- a/source/common/primitives.h        Thu Oct 31 21:01:29 2013 +0800
> +++ b/source/common/primitives.h        Thu Oct 31 21:01:43 2013 +0800
> @@ -254,6 +254,7 @@
>      filter_pp_t     luma_vpp[NUM_LUMA_PARTITIONS];
>      filter_hv_pp_t  luma_hvpp[NUM_LUMA_PARTITIONS];
>      filter_p2s_t    luma_p2s;
> +    filter_p2s_t    chroma_p2s;
>
>      intra_dc_t      intra_pred_dc;
>      intra_planar_t  intra_pred_planar;
> diff -r 21dbf988079b -r 4a40c4069ad1 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp      Thu Oct 31 21:01:29 2013
> +0800
> +++ b/source/common/x86/asm-primitives.cpp      Thu Oct 31 21:01:43 2013
> +0800
> @@ -318,6 +318,7 @@
>          p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
>          p.ipfilter_sp[FILTER_V_S_P_8] = x265_interp_8tap_v_sp_ssse3;
>          p.luma_p2s = x265_luma_p2s_ssse3;
> +        p.chroma_p2s = x265_chroma_p2s_ssse3;
>      }
>      if (cpuMask & X265_CPU_SSE4)
>      {
> diff -r 21dbf988079b -r 4a40c4069ad1 source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm   Thu Oct 31 21:01:29 2013 +0800
> +++ b/source/common/x86/ipfilter8.asm   Thu Oct 31 21:01:43 2013 +0800
> @@ -2124,3 +2124,84 @@
>      jnz         .loopH
>
>      RET
> +
> +
> +;-----------------------------------------------------------------------------
> +; void chroma_p2s(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
> +;
> +; Converts 8-bit chroma pixels to 16-bit intermediates, two rows per pass.
> +; Each (pixel, 128) byte pair is multiplied by (64, -64) and summed by
> +; pmaddubsw, i.e. dst[x] = src[x] * 64 - 8192, assuming tab_c_128 is 128 in
> +; every byte and tab_c_64_n64 is alternating 64, -64 bytes (both tables are
> +; defined outside this hunk -- confirm against their definitions).
> +;
> +; In:    r0 = src, r1 = srcStride (bytes), r2 = dst, r3m = width, r4m = height
> +; Dst:   consecutive rows are FENC_STRIDE / 2 * 2 bytes apart
> +; Needs: width and height positive and even
> +; Note:  8 source bytes are loaded per step, so up to 6 bytes past 'width'
> +;        are read (but never stored) at the end of a row
> +;
> +; TODO: processing U and V together would be faster, but needs more registers
> +; TODO: separate paths for heights that are / are not a multiple of 4 may be
> +;       about 10% faster, but the code is more complex, so it is disabled
> +;-----------------------------------------------------------------------------
> +INIT_XMM ssse3
> +cglobal chroma_p2s, 3, 7, 4
> +
> +    ; load width and height
> +    mov         r3d, r3m
> +    mov         r4d, r4m
> +
> +    ; load constant
> +    mova        m2, [tab_c_128]
> +    mova        m3, [tab_c_64_n64]
> +
> +.loopH:
> +
> +    xor         r5d, r5d                    ; r5 = x, column offset in pixels
> +.loopW:
> +    lea         r6, [r0 + r5]
> +
> +    ; row 0: 8 pixels -> 8 words
> +    movh        m0, [r6]
> +    punpcklbw   m0, m2
> +    pmaddubsw   m0, m3
> +
> +    ; row 1: same constants as row 0
> +    movh        m1, [r6 + r1]
> +    punpcklbw   m1, m2
> +    pmaddubsw   m1, m3
> +
> +    add         r5d, 8
> +    cmp         r5d, r3d
> +    lea         r6, [r2 + r5 * 2]           ; lea keeps the cmp flags
> +    jg          .width4                     ; fewer than 8 pixels left
> +    movu        [r6 + FENC_STRIDE / 2 * 0 - 16], m0
> +    movu        [r6 + FENC_STRIDE / 2 * 2 - 16], m1
> +    je          .nextH                      ; row ended on an 8-pixel boundary
> +    jmp         .loopW
> +
> +    ; tail: width & 7 is 2, 4 or 6 pixels
> +.width4:
> +    test        r3d, 4
> +    jz          .width2                     ; tail is exactly 2 pixels
> +    test        r3d, 2                      ; ZF = 1 when tail is exactly 4
> +    movh        [r6 + FENC_STRIDE / 2 * 0 - 16], m0
> +    movh        [r6 + FENC_STRIDE / 2 * 2 - 16], m1
> +    lea         r6, [r6 + 8]
> +    pshufd      m0, m0, 2                   ; move pixels 4-5 to the low dword
> +    pshufd      m1, m1, 2
> +    jz          .nextH                      ; flags still from 'test r3d, 2'
> +
> +.width2:
> +    movd        [r6 + FENC_STRIDE / 2 * 0 - 16], m0
> +    movd        [r6 + FENC_STRIDE / 2 * 2 - 16], m1
> +
> +.nextH:
> +    lea         r0, [r0 + r1 * 2]           ; src += 2 rows
> +    add         r2, FENC_STRIDE / 2 * 4     ; dst += 2 rows (bytes)
> +
> +    sub         r4d, 2
> +    jnz         .loopH
> +
> +    RET
> diff -r 21dbf988079b -r 4a40c4069ad1 source/common/x86/ipfilter8.h
> --- a/source/common/x86/ipfilter8.h     Thu Oct 31 21:01:29 2013 +0800
> +++ b/source/common/x86/ipfilter8.h     Thu Oct 31 21:01:43 2013 +0800
> @@ -91,6 +91,7 @@
>  void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride,
> pixel * dst, intptr_t dstStride, int idxX, int idxY);
>  void x265_interp_8tap_v_sp_ssse3(int16_t *src, intptr_t srcStride, pixel
> *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
>  void x265_luma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst,
> int width, int height);
> +void x265_chroma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst,
> int width, int height);
>
>  #undef SETUP_CHROMA_FUNC_DEF
>  #undef SETUP_LUMA_FUNC_DEF
> diff -r 21dbf988079b -r 4a40c4069ad1 source/test/ipfilterharness.cpp
> --- a/source/test/ipfilterharness.cpp   Thu Oct 31 21:01:29 2013 +0800
> +++ b/source/test/ipfilterharness.cpp   Thu Oct 31 21:01:43 2013 +0800
> @@ -240,14 +240,15 @@
>      return true;
>  }
>
> -bool IPFilterHarness::check_IPFilter_primitive(filter_p2s_t ref,
> filter_p2s_t opt)
> +bool IPFilterHarness::check_IPFilter_primitive(filter_p2s_t ref,
> filter_p2s_t opt, int isChroma)
>  {
> -    int16_t rand_srcStride;
> +    intptr_t rand_srcStride;
> +    const int min_size = isChroma ? 2 : 4;
>
>      for (int i = 0; i <= 1000; i++)
>      {
> -        int16_t rand_height = (int16_t)rand() % 100;                 //
> Randomly generated Height
> -        int16_t rand_width = (int16_t)rand() % 100;                  //
> Randomly generated Width
> +        int rand_height = (int16_t)rand() % 100;                 //
> Randomly generated Height
> +        int rand_width = (int16_t)rand() % 100;                  //
> Randomly generated Width
>
>          memset(IPF_vec_output_s, 0, ipf_t_size);      // Initialize
> output buffer to zero
>          memset(IPF_C_output_s, 0, ipf_t_size);        // Initialize
> output buffer to zero
> @@ -256,13 +257,13 @@
>          if (rand_srcStride < rand_width)
>              rand_srcStride = rand_width;
>
> -        rand_width %= 4;
> -        if (rand_width < 4)
> -            rand_width = 4;
> +        rand_width %= min_size;
> +        if (rand_width < min_size)
> +            rand_width = min_size;
>
> -        rand_height %= 4;
> -        if (rand_height < 4)
> -            rand_height = 4;
> +        rand_height %= min_size;
> +        if (rand_height < min_size)
> +            rand_height = min_size;
>
>          ref(pixel_buff,
>              rand_srcStride,
> @@ -461,7 +462,16 @@
>
>      if (opt.luma_p2s)
>      {
> -        if (!check_IPFilter_primitive(ref.luma_p2s, opt.luma_p2s))
> +        if (!check_IPFilter_primitive(ref.luma_p2s, opt.luma_p2s, 0))
> +        {
> +            printf("ipfilter_p2s failed\n");
> +            return false;
> +        }
> +    }
> +
> +    if (opt.chroma_p2s)
> +    {
> +        if (!check_IPFilter_primitive(ref.chroma_p2s, opt.chroma_p2s, 1))
>          {
>              printf("ipfilter_p2s failed\n");
>              return false;
> @@ -586,6 +596,13 @@
>                         pixel_buff, srcStride, IPF_vec_output_s, width,
> height);
>      }
>
> +    if (opt.chroma_p2s)
> +    {
> +        printf("chroma_p2s\t");
> +        REPORT_SPEEDUP(opt.chroma_p2s, ref.chroma_p2s,
> +                       pixel_buff, srcStride, IPF_vec_output_s, width,
> height);
> +    }
> +
>      if (opt.ipfilter_s2p)
>      {
>          printf("ipfilter_s2p\t");
> diff -r 21dbf988079b -r 4a40c4069ad1 source/test/ipfilterharness.h
> --- a/source/test/ipfilterharness.h     Thu Oct 31 21:01:29 2013 +0800
> +++ b/source/test/ipfilterharness.h     Thu Oct 31 21:01:43 2013 +0800
> @@ -45,7 +45,7 @@
>      bool check_IPFilter_primitive(ipfilter_ps_t ref, ipfilter_ps_t opt);
>      bool check_IPFilter_primitive(ipfilter_sp_t ref, ipfilter_sp_t opt);
>      bool check_IPFilter_primitive(ipfilter_p2s_t ref, ipfilter_p2s_t opt);
> -    bool check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt);
> +    bool check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt, int
> isChroma);
>      bool check_IPFilter_primitive(ipfilter_s2p_t ref, ipfilter_s2p_t opt);
>      bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt);
>      bool check_IPFilterLuma_primitive(filter_pp_t ref, filter_pp_t opt);
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>



-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131031/04a3cf5a/attachment-0001.html>


More information about the x265-devel mailing list