[x265] [PATCH Update] asm: interp_8tap_hv_pp_8x8() for InterpolateHV_8x8

Fri Oct 25 18:56:40 CEST 2013

On Fri, Oct 25, 2013 at 7:25 AM, Min Chen <chenm003 at 163.com> wrote:

> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1382703678 -28800
> # Node ID 2221e3abb479b1e9a586d80d769373d13c7f7980
> # Parent  4ca4da7bdd36fbef00b9eefe54c0a56bf11633f3
> asm: interp_8tap_hv_pp_8x8() for InterpolateHV_8x8
>

How does this compare, performance-wise, to the combined h_ps + v_sp
intrinsic functions?

> diff -r 4ca4da7bdd36 -r 2221e3abb479 source/common/ipfilter.cpp
> --- a/source/common/ipfilter.cpp        Fri Oct 25 12:11:31 2013 +0530
> +++ b/source/common/ipfilter.cpp        Fri Oct 25 20:21:18 2013 +0800
> @@ -401,6 +401,17 @@
>          dst += dstStride;
>      }
>  }
> +typedef void (*ipfilter_ps_t)(pixel *src, intptr_t srcStride, short *dst,
> intptr_t dstStride, int width, int height, const short *coeff);
> +typedef void (*ipfilter_sp_t)(short *src, intptr_t srcStride, pixel *dst,
> intptr_t dstStride, int width, int height, const short *coeff);
> +
> +template<int N, int width, int height>
> +void interp_hv_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t
> dstStride, int idxX, int idxY)
> +{
> +    short m_immedVals[(64 + 8) * (64 + 8)];
> +    filterHorizontal_ps_c<N>(src - 3 * srcStride, srcStride, m_immedVals,
> width, width, height + 7, g_lumaFilter[idxX]);
> +    filterVertical_sp_c<N>(m_immedVals + 3 * width, width, dst,
> dstStride, width, height, g_lumaFilter[idxY]);
> +}
>

The intermediate buffer (m_immedVals) should be an argument.

> +
>  }
>
>  namespace x265 {
> @@ -411,7 +422,8 @@
>      p.chroma_vpp[CHROMA_ ## W ## x ## H] = interp_vert_pp_c<4, W, H>
>
>  #define LUMA(W, H) \
> -    p.luma_hpp[LUMA_ ## W ## x ## H]     = interp_horiz_pp_c<8, W, H>
> +    p.luma_hpp[LUMA_ ## W ## x ## H]     = interp_horiz_pp_c<8, W, H>; \
> +    p.luma_hvpp[LUMA_ ## W ## x ## H]    = interp_hv_pp_c<8, W, H>
>
>  void Setup_C_IPFilterPrimitives(EncoderPrimitives& p)
>  {
> diff -r 4ca4da7bdd36 -r 2221e3abb479 source/common/primitives.h
> --- a/source/common/primitives.h        Fri Oct 25 12:11:31 2013 +0530
> +++ b/source/common/primitives.h        Fri Oct 25 20:21:18 2013 +0800
> @@ -199,6 +199,7 @@
>  typedef void (*plane_copy_deinterleave_t)(pixel *dstu, intptr_t
> dstuStride, pixel *dstv, intptr_t dstvStride, pixel *src,  intptr_t
> srcStride, int w, int h);
>
>  typedef void (*filter_pp_t) (pixel *src, intptr_t srcStride, pixel *dst,
> intptr_t dstStride, int coeffIdx);
> +typedef void (*filter_hv_pp_t) (pixel *src, intptr_t srcStride, pixel
> *dst, intptr_t dstStride, int idxX, int idxY);
>
>  /* Define a structure containing function pointers to optimized encoder
>   * primitives.  Each pointer can reference either an assembly routine,
> @@ -235,6 +236,7 @@
>      filter_pp_t     luma_hpp[NUM_LUMA_PARTITIONS];
>      filter_pp_t     chroma_vpp[NUM_CHROMA_PARTITIONS];
>      filter_pp_t     luma_vpp[NUM_LUMA_PARTITIONS];
> +    filter_hv_pp_t  luma_hvpp[NUM_LUMA_PARTITIONS];
>
>      intra_dc_t      intra_pred_dc;
>      intra_planar_t  intra_pred_planar;
> diff -r 4ca4da7bdd36 -r 2221e3abb479 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp      Fri Oct 25 12:11:31 2013
> +0530
> +++ b/source/common/x86/asm-primitives.cpp      Fri Oct 25 20:21:18 2013
> +0800
> @@ -278,6 +278,8 @@
>          p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_ssse3;
>          p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_ssse3;
>          p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ssse3;
> +
> +        p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
>      }
>      if (cpuMask & X265_CPU_SSE4)
>      {
> diff -r 4ca4da7bdd36 -r 2221e3abb479 source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm   Fri Oct 25 12:11:31 2013 +0530
> +++ b/source/common/x86/ipfilter8.asm   Fri Oct 25 20:21:18 2013 +0800
> @@ -35,7 +35,9 @@
>             db 4, 5, 6, 7, 8,  9,  10, 11, 5, 6, 7, 8,  9,  10, 11, 12
>             db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14
>
> -tab_c_512:  times 8 dw 512
> +tab_c_512:      times 8 dw 512
> +tab_c_8192:     times 8 dw 8192
> +tab_c_526336:   times 4 dd 8192*64+2048
>
>  tab_ChromaCoeff: db  0, 64,  0,  0
>                   db -2, 58, 10, -2
> @@ -51,6 +53,25 @@
>                   db  -1, 4, -11, 40,  40, -11, 4, -1
>                   db   0, 1, -5,  17,  58, -10, 4, -1
>
> +tab_LumaCoeffV: times 4 dw 0, 0
> +                times 4 dw 0, 64
> +                times 4 dw 0, 0
> +                times 4 dw 0, 0
> +
> +                times 4 dw -1, 4
> +                times 4 dw -10, 58
> +                times 4 dw 17, -5
> +                times 4 dw 1, 0
> +
> +                times 4 dw -1, 4
> +                times 4 dw -11, 40
> +                times 4 dw 40, -11
> +                times 4 dw 4, -1
> +
> +                times 4 dw 0, 1
> +                times 4 dw -5, 17
> +                times 4 dw 58, -10
> +                times 4 dw 4, -1
>
>  SECTION .text
>
> @@ -523,8 +544,8 @@
>      pmaddubsw   %1, %5
>      phaddw      %4, %1
>      phaddw      %2, %4
> +  %if %0 == 8
>      pmulhrsw    %2, %6
> -  %if %0 == 8
>      packuswb    %2, %2
>      movh        %8, %2
>    %endif
> @@ -623,3 +644,135 @@
>      IPFILTER_LUMA 48, 64
>      IPFILTER_LUMA 64, 16
>      IPFILTER_LUMA 16, 64
> +
> +
>
> +;-----------------------------------------------------------------------------
> +; Interpolate HV
>
> +;-----------------------------------------------------------------------------
> +%macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) ->
> (t3, t5), (t4, t1), [2]
> +    mova        %5, [r0 +  (%6 + 0) * 16]
> +    mova        %1, [r0 +  (%6 + 1) * 16]
> +    mova        %2, [r0 +  (%6 + 2) * 16]
> +    punpcklwd   %3, %5, %1
> +    punpckhwd   %5, %1
> +    pmaddwd     %3, [r5 + (%7) * 16]   ; R3 = L[0+1] -- Row 0
> +    pmaddwd     %5, [r5 + (%7) * 16]   ; R0 = H[0+1]
> +    punpcklwd   %4, %1, %2
> +    punpckhwd   %1, %2
> +    pmaddwd     %4, [r5 + (%7) * 16]   ; R4 = L[1+2] -- Row 1
> +    pmaddwd     %1, [r5 + (%7) * 16]   ; R1 = H[1+2]
> +%endmacro ; FILTER_HV8_START
> +
> +%macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H,
> t6, t7, off_src, off_coeff) -> [6]
> +    mova        %8, [r0 +  (%9 + 0) * 16]
> +    mova        %1, [r0 +  (%9 + 1) * 16]
> +    punpcklwd   %7, %2, %8
> +    punpckhwd   %2, %8
> +    pmaddwd     %7, [r5 + %10 * 16]
> +    pmaddwd     %2, [r5 + %10 * 16]
> +    paddd       %3, %7              ; R3 = L[0+1+2+3] -- Row 0
> +    paddd       %5, %2              ; R0 = H[0+1+2+3]
> +    punpcklwd   %7, %8, %1
> +    punpckhwd   %8, %1
> +    pmaddwd     %7, [r5 + %10 * 16]
> +    pmaddwd     %8, [r5 + %10 * 16]
> +    paddd       %4, %7              ; R4 = L[1+2+3+4] -- Row 1
> +    paddd       %6, %8              ; R1 = H[1+2+3+4]
> +%endmacro ; FILTER_HV8_START
> +
> +; Round and Saturate
> +%macro FILTER_HV8_END 4 ; output in [1, 3]
> +    paddd       %1, [tab_c_526336]
> +    paddd       %2, [tab_c_526336]
> +    paddd       %3, [tab_c_526336]
> +    paddd       %4, [tab_c_526336]
> +    psrad       %1, 12
> +    psrad       %2, 12
> +    psrad       %3, 12
> +    psrad       %4, 12
> +    packssdw    %1, %2
> +    packssdw    %3, %4
> +
> +    ; TODO: is merge better? I think this way is short dependency link
> +    packuswb    %1, %1
> +    packuswb    %3, %3
> +%endmacro ; FILTER_HV8_END
> +
>
> +;-----------------------------------------------------------------------------
> +; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel
> *dst, intptr_t dstStride, int idxX, int idxY)
>
> +;-----------------------------------------------------------------------------
> +INIT_XMM ssse3
> +cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
> +%define coef        m7
> +%define stk_buf     rsp
> +
> +    mov         r4d,        r4m
> +    mov         r5d,        r5m
> +
> +%ifdef PIC
> +    lea         r6,         [tab_LumaCoeff]
> +    movh        coef,       [r6 + r4 * 8]
> +%else
> +    movh        coef,       [tab_LumaCoeff + r4 * 8]
> +%endif
> +    punpcklqdq  coef,       coef
> +
> +; FILTER_H8_W8 7-8   ; t0, t1, t2, t3, coef, c512, src, dst
> +
> +    ; move to row -3
> +    lea         r6,         [r1 + r1 * 2]
> +    sub         r0,         r6
> +
> +    xor         r6,         r6
> +    mov         r4,         rsp
> +
> +.loopH:
> +    FILTER_H8_W8 m0, m1, m2, m3, coef, [tab_c_512], [r0 - 3]
> +    psubw       m1,         [tab_c_8192]
> +    mova        [r4],       m1
> +
> +    add         r0,         r1
> +    add         r4,         16
> +    inc         r6
> +    cmp         r6,         8+7
> +    jnz         .loopH
> +
> +    ; ready to phase V
> +    ; Here all of mN is free
> +
> +    ; load coeff table
> +    shl         r5,         6
> +    lea         r6,         [tab_LumaCoeffV]
> +    lea         r5,         [r5 + r6]
> +
> +    ; load intermedia buffer
> +    mov         r0,         stk_buf
> +
> +    ; register mapping
> +    ; r0 - src
> +    ; r5 - coeff
> +    ; r6 - loop_i
> +
> +    ; let's go
> +    xor         r6,         r6
> +
> +    ; TODO: this loop have more than 70 instructions, I think it is more
> than Intel loop decode cache
> +.loopV:
> +
> +    FILTER_HV8_START    m1, m2, m3, m4, m0,             0, 0
> +    FILTER_HV8_MID      m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
> +    FILTER_HV8_MID      m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
> +    FILTER_HV8_MID      m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
> +    FILTER_HV8_END      m3, m0, m4, m1
> +
> +    movq        [r2],       m3
> +    movq        [r2 + r3],  m4
> +
> +    lea         r0,         [r0 + 16 * 2]
> +    lea         r2,         [r2 + r3 * 2]
> +
> +    inc         r6
> +    cmp         r6,         8/2
> +    jnz         .loopV
> +
> +    RET
> diff -r 4ca4da7bdd36 -r 2221e3abb479 source/common/x86/ipfilter8.h
> --- a/source/common/x86/ipfilter8.h     Fri Oct 25 12:11:31 2013 +0530
> +++ b/source/common/x86/ipfilter8.h     Fri Oct 25 20:21:18 2013 +0800
> @@ -87,6 +87,8 @@
>  CHROMA_FILTERS(_sse4);
>  LUMA_FILTERS(_sse4);
>
> +void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride,
> pixel * dst, intptr_t dstStride, int idxX, int idxY);
> +
>  #undef SETUP_CHROMA_FUNC_DEF
>  #undef SETUP_LUMA_FUNC_DEF
>  #undef CHROMA_FILTERS
> diff -r 4ca4da7bdd36 -r 2221e3abb479 source/test/ipfilterharness.cpp
> --- a/source/test/ipfilterharness.cpp   Fri Oct 25 12:11:31 2013 +0530
> +++ b/source/test/ipfilterharness.cpp   Fri Oct 25 20:21:18 2013 +0800
> @@ -325,6 +325,40 @@
>      return true;
>  }
>
> +bool IPFilterHarness::check_IPFilterLumaHV_primitive(filter_hv_pp_t ref,
> filter_hv_pp_t opt)
> +{
> +    int rand_srcStride, rand_dstStride, rand_coeffIdxX, rand_coeffIdxY;
> +
> +    for (int i = 0; i <= 1000; i++)
> +    {
> +        rand_coeffIdxX = rand() % 3;                // Random coeffIdex
> in the filter
> +        rand_coeffIdxY = rand() % 3;                // Random coeffIdex
> in the filter
> +
> +        rand_srcStride = rand() % 100;             // Randomly generated
> srcStride
> +        rand_dstStride = rand() % 100;             // Randomly generated
> dstStride
> +
> +        ref(pixel_buff + 3 * rand_srcStride,
> +            rand_srcStride,
> +            IPF_C_output_p,
> +            rand_dstStride,
> +            rand_coeffIdxX,
> +            rand_coeffIdxY
> +        );
> +        opt(pixel_buff + 3 * rand_srcStride,
> +            rand_srcStride,
> +            IPF_vec_output_p,
> +            rand_dstStride,
> +            rand_coeffIdxX,
> +            rand_coeffIdxY
> +        );
> +
> +        if (memcmp(IPF_vec_output_p, IPF_C_output_p, ipf_t_size))
> +            return false;
> +    }
> +
> +    return true;
> +}
> +
>  bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const
> EncoderPrimitives& opt)
>  {
>      for (int value = 0; value < NUM_IPFILTER_P_P; value++)
> @@ -421,6 +455,18 @@
>          }
>      }
>
> +    for (int value = 0; value < NUM_LUMA_PARTITIONS; value++)
> +    {
> +        if (opt.luma_hvpp[value])
> +        {
> +            if (!check_IPFilterLumaHV_primitive(ref.luma_hvpp[value],
> opt.luma_hvpp[value]))
> +            {
> +                printf("luma_hvpp[%s]", lumaPartStr[value]);
> +                return false;
> +            }
> +        }
> +    }
> +
>

You can merge this with the existing loop over luma partition sizes.

>      return true;
>  }
>
> @@ -486,6 +532,7 @@
>              REPORT_SPEEDUP(opt.luma_hpp[value], ref.luma_hpp[value],
>                             pixel_buff + srcStride, srcStride,
> IPF_vec_output_p, dstStride, 1);
>          }
> +
>          if (opt.luma_vpp[value])
>          {
>              printf("luma_vpp[%s]\t", lumaPartStr[value]);
> @@ -493,6 +540,13 @@
>                             pixel_buff + maxVerticalfilterHalfDistance *
> srcStride, srcStride,
>                             IPF_vec_output_p, dstStride, 1);
>          }
> +
> +        if (opt.luma_hvpp[value])
> +        {
> +            printf("luma_hv [%s]\t", lumaPartStr[value]);
> +            REPORT_SPEEDUP(opt.luma_hvpp[value], ref.luma_hvpp[value],
> +                           pixel_buff + srcStride, srcStride,
> IPF_vec_output_p, dstStride, 1, 3);
> +        }
>      }
>
>      for (int value = 0; value < NUM_CHROMA_PARTITIONS; value++)
> diff -r 4ca4da7bdd36 -r 2221e3abb479 source/test/ipfilterharness.h
> --- a/source/test/ipfilterharness.h     Fri Oct 25 12:11:31 2013 +0530
> +++ b/source/test/ipfilterharness.h     Fri Oct 25 20:21:18 2013 +0800
> @@ -48,6 +48,7 @@
>      bool check_IPFilter_primitive(ipfilter_s2p_t ref, ipfilter_s2p_t opt);
>      bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt);
>      bool check_IPFilterLuma_primitive(filter_pp_t ref, filter_pp_t opt);
> +    bool check_IPFilterLumaHV_primitive(filter_hv_pp_t ref,
> filter_hv_pp_t opt);
>
>  public:
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>

-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131025/925a5951/attachment-0001.html>