[x265] [PATCH 4 of 4] asm: interp_8tap_v_sp for ipfilter_sp[FILTER_V_S_P_8]

Mon Oct 28 19:25:21 CET 2013

On Mon, Oct 28, 2013 at 9:24 AM, Min Chen <chenm003 at 163.com> wrote:

> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1382970234 -28800
> # Node ID 41425f18efe14be468715bfa68fdebbb9a49145f
> # Parent  5f7b3d06d94c6aec44bfd4a7bfb6f6751182b4ed
> asm: interp_8tap_v_sp for ipfilter_sp[FILTER_V_S_P_8]
>

I'm getting link errors on x86_64 from this series:

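error LNK2017: 'ADDR32' relocation to 'tab_LumaCoeffV' invalid without
/LARGEADDRESSAWARE:NO

(Presumably this comes from the "lea r6, [tab_LumaCoeffV + r6]" in the
ipfilter8.asm hunk below: a symbol combined with a register cannot be
addressed RIP-relative, so a 32-bit absolute relocation is emitted.)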

In general, I think we should drop all of the interpolation merging while
we get all the assembly completed for motion compensation.  When the
assembly is all together, we can experiment and figure out if it makes sense
to re-merge some of them back together.

> diff -r 5f7b3d06d94c -r 41425f18efe1 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp      Mon Oct 28 22:23:29 2013
> +0800
> +++ b/source/common/x86/asm-primitives.cpp      Mon Oct 28 22:23:54 2013
> +0800
> @@ -280,6 +280,7 @@
>          p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ssse3;
>
>          p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
> +        p.ipfilter_sp[FILTER_V_S_P_8] = x265_interp_8tap_v_sp_ssse3;
>      }
>      if (cpuMask & X265_CPU_SSE4)
>      {
> diff -r 5f7b3d06d94c -r 41425f18efe1 source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm   Mon Oct 28 22:23:29 2013 +0800
> +++ b/source/common/x86/ipfilter8.asm   Mon Oct 28 22:23:54 2013 +0800
> @@ -774,3 +774,114 @@
>      jnz         .loopV
>
>      RET
> +
> +
>
> +;-----------------------------------------------------------------------------
> +; void interp_8tap_v_sp(int16_t *src, intptr_t srcStride, pixel *dst,
> intptr_t dstStride, int width, int height, const int coeffIdx);
>
> +;-----------------------------------------------------------------------------
> +INIT_XMM ssse3
> +cglobal interp_8tap_v_sp, 4, 7, 8, 0-(2*4 + 3*gprsize)
> +%define old_r0      (rsp + 2 * 4 + 0 * gprsize)
> +%define old_r2      (rsp + 2 * 4 + 1 * gprsize)
> +%define old_r3      (rsp + 2 * 4 + 2 * gprsize)
> +%define old_r4d     (rsp + 0 * 4)
> +%define old_6rows   (rsp + 1 * 4)
> +
> +    mov         r4d,        r4m
> +    mov         r5d,        r5m
> +
> +    ; load coeff table
> +    mov         r6d,        r6m
> +    shl         r6,         6
> +    lea         r6,         [tab_LumaCoeffV + r6]
> +
> +    mov         [old_r4d], r4d
> +    mov         [old_r2], r2
> +
> +    ; move to -3
> +    lea         r1, [r1 * 2]
> +    lea         r4, [r1 + r1 * 2]
> +    sub         r0, r4
> +    lea         r4, [r4 * 2]
> +    mov         [old_6rows], r4
> +
> +.loopH:
> +
> +    ; load width
> +    mov         r4d, [old_r4d]
> +
> +    ; save old src
> +    mov         [old_r0], r0
> +
> +.loopW:
> +
> +    movu        m0, [r0]
> +    movu        m1, [r0 + r1]
> +    lea         r0, [r0 + r1 * 2]
> +    punpcklwd   m2, m0, m1
> +    pmaddwd     m2, [r6 + 0 * 16]
> +    punpckhwd   m0, m1
> +    pmaddwd     m0, [r6 + 0 * 16]
> +
> +    movu        m3, [r0]
> +    movu        m4, [r0 + r1]
> +    lea         r0, [r0 + r1 * 2]
> +    punpcklwd   m1, m3, m4
> +    pmaddwd     m1, [r6 + 1 * 16]
> +    paddd       m2, m1
> +    punpckhwd   m3, m4
> +    pmaddwd     m3, [r6 + 1 * 16]
> +    paddd       m0, m3
> +
> +    movu        m3, [r0]
> +    movu        m4, [r0 + r1]
> +    lea         r0, [r0 + r1 * 2]
> +    punpcklwd   m1, m3, m4
> +    pmaddwd     m1, [r6 + 2 * 16]
> +    paddd       m2, m1
> +    punpckhwd   m3, m4
> +    pmaddwd     m3, [r6 + 2 * 16]
> +    paddd       m0, m3
> +
> +    movu        m3, [r0]
> +    movu        m4, [r0 + r1]
> +    punpcklwd   m1, m3, m4
> +    pmaddwd     m1, [r6 + 3 * 16]
> +    paddd       m2, m1
> +    punpckhwd   m3, m4
> +    pmaddwd     m3, [r6 + 3 * 16]
> +    paddd       m0, m3
> +
> +    paddd       m2, [tab_c_526336]
> +    paddd       m0, [tab_c_526336]
> +    psrad       m2, 12
> +    psrad       m0, 12
> +    packssdw    m2, m0
> +    packuswb    m2, m2
> +
> +    ; move to next 8 col
> +    sub         r0, [old_6rows]
> +
> +    sub         r4, 8
> +    jl          .width4
> +    movq        [r2], m2
> +    je          .nextH
> +    lea         r0, [r0 + 16]
> +    lea         r2, [r2 + 8]
> +    jmp         .loopW
> +
> +.width4:
> +    movd        [r2], m2
> +    lea         r0, [r0 + 4]
> +
> +.nextH:
> +    ; move to next row
> +    mov         r0, [old_r0]
> +    lea         r0, [r0 + r1]
> +    add         [old_r2], r3d
> +    mov         r2, [old_r2]
> +
> +    dec         r5d
> +    jnz         .loopH
> +
> +    RET
> diff -r 5f7b3d06d94c -r 41425f18efe1 source/common/x86/ipfilter8.h
> --- a/source/common/x86/ipfilter8.h     Mon Oct 28 22:23:29 2013 +0800
> +++ b/source/common/x86/ipfilter8.h     Mon Oct 28 22:23:54 2013 +0800
> @@ -89,6 +89,7 @@
>  LUMA_FILTERS(_sse4);
>
>  void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride,
> pixel * dst, intptr_t dstStride, int idxX, int idxY);
> +void x265_interp_8tap_v_sp_ssse3(int16_t *src, intptr_t srcStride, pixel
> *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
>
>  #undef SETUP_CHROMA_FUNC_DEF
>  #undef SETUP_LUMA_FUNC_DEF
> diff -r 5f7b3d06d94c -r 41425f18efe1 source/test/ipfilterharness.cpp
> --- a/source/test/ipfilterharness.cpp   Mon Oct 28 22:23:29 2013 +0800
> +++ b/source/test/ipfilterharness.cpp   Mon Oct 28 22:23:54 2013 +0800
> @@ -164,6 +164,8 @@
>      int rand_width = rand() % 100;                  // Randomly generated
> Width
>      int16_t rand_val, rand_srcStride, rand_dstStride;
>
> +    rand_width &= ~3;
> +
>      for (int i = 0; i <= 100; i++)
>      {
>          memset(IPF_vec_output_p, 0, ipf_t_size);      // Initialize
> output buffer to zero
> @@ -173,16 +175,16 @@
>          rand_srcStride = rand() % 100;              // Randomly generated
> srcStride
>          rand_dstStride = rand() % 100;              // Randomly generated
> dstStride
>
> -        opt(short_buff + 3 * rand_srcStride,
> +        ref(short_buff + 3 * rand_srcStride,
>              rand_srcStride,
> -            IPF_vec_output_p,
> +            IPF_C_output_p,
>              rand_dstStride,
>              rand_width,
>              rand_height, rand_val
>              );
> -        ref(short_buff + 3 * rand_srcStride,
> +        opt(short_buff + 3 * rand_srcStride,
>              rand_srcStride,
> -            IPF_C_output_p,
> +            IPF_vec_output_p,
>              rand_dstStride,
>              rand_width,
>              rand_height, rand_val
> diff -r 5f7b3d06d94c -r 41425f18efe1 source/test/testbench.cpp
> --- a/source/test/testbench.cpp Mon Oct 28 22:23:29 2013 +0800
> +++ b/source/test/testbench.cpp Mon Oct 28 22:23:54 2013 +0800
> @@ -74,7 +74,7 @@
>          }
>      }
>
> -    int seed = (int)time(NULL);
> +    int seed = 0x526E629B;//(int)time(NULL);
>      const char *bpp[] = { "8bpp", "16bpp" };
>      printf("Using random seed %X %s\n", seed, bpp[HIGH_BIT_DEPTH]);
>      srand(seed);
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>

-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131028/cf811fc7/attachment.html>