[x265] [PATCH] primitives pointer initialization and chroma asm code for all width sizes
Steve Borho
steve at borho.org
Tue Oct 15 19:26:31 CEST 2013
On Tue, Oct 15, 2013 at 12:00 PM, <praveen at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Praveen Tiwari
> # Date 1381856430 -19800
> # Node ID 203a9b334293c50a3e8741352726f2eef71dddb3
> # Parent 39fc3c36e1b1b5fcaa7a7f65ddd21a2ecba1fc06
> primitives pointer initialization and chroma asm code for all width sizes
>
holding off on this patch until the testbench is ready for these.
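
For reference, the check I have in mind is the usual ref-vs-opt comparison over
every coeffIdx and every partition width. A rough fragment for one case,
assuming the new chroma_hpp primitives all share the (src, srcStride, dst,
dstStride, height, coeffIdx) signature declared below; the buffer sizes and
flat layout here are only illustrative, not the final testbench code:

    // compare the C template against the SSE4 kernel for the 8-wide partition
    pixel src[64 * 64], ref_dst[64 * 64], opt_dst[64 * 64];
    for (int i = 0; i < 64 * 64; i++)
        src[i] = rand() & 0xff;                 // 8bpp input samples

    intptr_t stride = 64;
    int height = 16;
    for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
    {
        memset(ref_dst, 0, sizeof(ref_dst));    // so unwritten columns compare equal
        memset(opt_dst, 0, sizeof(opt_dst));

        // start one column in so the [src - 1] load stays inside the buffer
        interp_horiz_pp<4, 8>(src + 1, stride, ref_dst, stride, height, coeffIdx);
        x265_interp_4tap_horiz_pp_w8_sse4(src + 1, stride, opt_dst, stride, height, coeffIdx);

        if (memcmp(ref_dst, opt_dst, sizeof(ref_dst)))
            return false;                       // mismatch against the C reference
    }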
>
> diff -r 39fc3c36e1b1 -r 203a9b334293 source/common/ipfilter.cpp
> --- a/source/common/ipfilter.cpp Tue Oct 15 21:22:03 2013 +0530
> +++ b/source/common/ipfilter.cpp Tue Oct 15 22:30:30 2013 +0530
> @@ -497,8 +497,20 @@
> namespace x265 {
> // x265 private namespace
>
> + #define SETUP_PARTITION(W) \
> + p.chroma_hpp[CHROMA_PARTITION_W##W] = interp_horiz_pp<4, W>;
> +
> void Setup_C_IPFilterPrimitives(EncoderPrimitives& p)
> {
> +
> + SETUP_PARTITION(2);
> + SETUP_PARTITION(4);
> + SETUP_PARTITION(6);
> + SETUP_PARTITION(8);
> + SETUP_PARTITION(12);
> + SETUP_PARTITION(16);
> + SETUP_PARTITION(32);
> +
> p.ipfilter_pp[FILTER_H_P_P_8] = filterHorizontal_p_p<8>;
> p.ipfilter_ps[FILTER_H_P_S_8] = filterHorizontal_p_s<8>;
> p.ipfilter_ps[FILTER_V_P_S_8] = filterVertical_p_s<8>;
> diff -r 39fc3c36e1b1 -r 203a9b334293 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Tue Oct 15 21:22:03 2013 +0530
> +++ b/source/common/x86/asm-primitives.cpp Tue Oct 15 22:30:30 2013 +0530
> @@ -2,6 +2,7 @@
> * Copyright (C) 2013 x265 project
> *
> * Authors: Steve Borho <steve at borho.org>
> + * Praveen Kumar Tiwari <praveen at multicorewareinc.com>
> *
> * This program is free software; you can redistribute it and/or modify
> * it under the terms of the GNU General Public License as published by
> @@ -91,7 +92,13 @@
> DECL_SUF( x265_pixel_avg_4x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
> DECL_SUF( x265_pixel_avg_4x4, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
>
> -void x265_filterHorizontal_p_p_4_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff);
> +void x265_interp_4tap_horiz_pp_w2_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
> +void x265_interp_4tap_horiz_pp_w4_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
> +void x265_interp_4tap_horiz_pp_w6_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
> +void x265_interp_4tap_horiz_pp_w8_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
> +void x265_interp_4tap_horiz_pp_w12_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
> +void x265_interp_4tap_horiz_pp_w16_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
> +void x265_interp_4tap_horiz_pp_w32_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
> }
>
> using namespace x265;
> @@ -273,9 +280,14 @@
> p.sa8d[BLOCK_8x8] = x265_pixel_sa8d_8x8_sse4;
> p.sa8d[BLOCK_16x16] = x265_pixel_sa8d_16x16_sse4;
> SA8D_INTER_FROM_BLOCK(sse4);
> -
> -#if !defined(X86_64)
> - p.ipfilter_pp[FILTER_H_P_P_4] = x265_filterHorizontal_p_p_4_sse4;
> +#if !defined(X86_64) // will go away tomorrow once PIC issue is fixed for 64-bit build
> + p.chroma_hpp[CHROMA_PARTITION_W2] = x265_interp_4tap_horiz_pp_w2_sse4;
> + p.chroma_hpp[CHROMA_PARTITION_W4] = x265_interp_4tap_horiz_pp_w4_sse4;
> + p.chroma_hpp[CHROMA_PARTITION_W6] = x265_interp_4tap_horiz_pp_w6_sse4;
> + p.chroma_hpp[CHROMA_PARTITION_W8] = x265_interp_4tap_horiz_pp_w8_sse4;
> + p.chroma_hpp[CHROMA_PARTITION_W12] = x265_interp_4tap_horiz_pp_w12_sse4;
> + p.chroma_hpp[CHROMA_PARTITION_W16] = x265_interp_4tap_horiz_pp_w16_sse4;
> + p.chroma_hpp[CHROMA_PARTITION_W32] = x265_interp_4tap_horiz_pp_w32_sse4;
> #endif
> }
> if (cpuMask & X265_CPU_AVX)
> diff -r 39fc3c36e1b1 -r 203a9b334293 source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm Tue Oct 15 21:22:03 2013 +0530
> +++ b/source/common/x86/ipfilter8.asm Tue Oct 15 22:30:30 2013 +0530
> @@ -26,109 +26,208 @@
> %include "x86inc.asm"
> %include "x86util.asm"
>
> -%if ARCH_X86_64 == 0
> -
> SECTION_RODATA 32
> -tab_leftmask: db -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0
> -
> -tab_Tm: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
> +tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
> + db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
>
> tab_c_512: times 8 dw 512
>
> +tab_coeff: db 0, 64, 0, 0
> + db -2, 58, 10, -2
> + db -4, 54, 16, -2
> + db -6, 46, 28, -4
> + db -4, 36, 36, -4
> + db -4, 28, 46, -6
> + db -2, 16, 54, -4
> + db -2, 10, 58, -2
> +
> SECTION .text
>
> -%macro FILTER_H4 2
> - movu %1, [src + col - 1]
> - pshufb %1, Tm4
> - pmaddubsw %1, coef2
> - movu %2, [src + col + 1]
> - pshufb %2, Tm4
> - pmaddubsw %2, coef3
> - paddw %1, %2
> - pmulhrsw %1, c512
> - packuswb %1, %1
> +%macro FILTER_H4_w2 3
> + movu %1, [srcq - 1]
> + pshufb %2, %1, Tm0
> + pmaddubsw %2, coef2
> + phaddw %2, %1
> + pmulhrsw %2, %3
> + packuswb %2, %2
> + pextrw [dstq], %2, 0
> +%endmacro
> +
> +%macro FILTER_H4_w4 3
> + movu %1, [srcq - 1]
> + pshufb %2, %1, Tm0
> + pmaddubsw %2, coef2
> + phaddw %2, %1
> + pmulhrsw %2, %3
> + packuswb %2, %2
> + movd [dstq], %2
> +%endmacro
> +
> +%macro FILTER_H4_w6 3
> + movu %1, [srcq - 1]
> + pshufb %2, %1, Tm0
> + pmaddubsw %2, coef2
> + pshufb %1, %1, Tm1
> + pmaddubsw %1, coef2
> + phaddw %2, %1
> + pmulhrsw %2, %3
> + packuswb %2, %2
> + movd [dstq], %2
> + pextrw [dstq + 4], %2, 2
> +%endmacro
> +
> +%macro FILTER_H4_w8 3
> + movu %1, [srcq - 1]
> + pshufb %2, %1, Tm0
> + pmaddubsw %2, coef2
> + pshufb %1, %1, Tm1
> + pmaddubsw %1, coef2
> + phaddw %2, %1
> + pmulhrsw %2, %3
> + packuswb %2, %2
> + movh [dstq], %2
> +%endmacro
> +
> +%macro FILTER_H4_w12 3
> + movu %1, [srcq - 1]
> + pshufb %2, %1, Tm0
> + pmaddubsw %2, coef2
> + pshufb %1, %1, Tm1
> + pmaddubsw %1, coef2
> + phaddw %2, %1
> + pmulhrsw %2, %3
> + movu %1, [srcq - 1 + 8]
> + pshufb %1, %1, Tm0
> + pmaddubsw %1, coef2
> + phaddw %1, %1
> + pmulhrsw %1, %3
> + packuswb %2, %1
> + movh [dstq], %2
> + pextrd [dstq + 8], %2, 2
> +%endmacro
> +
> +%macro FILTER_H4_w16 4
> + movu %1, [srcq - 1]
> + pshufb %2, %1, Tm0
> + pmaddubsw %2, coef2
> + pshufb %1, %1, Tm1
> + pmaddubsw %1, coef2
> + phaddw %2, %1
> + movu %1, [srcq - 1 + 8]
> + pshufb %4, %1, Tm0
> + pmaddubsw %4, coef2
> + pshufb %1, %1, Tm1
> + pmaddubsw %1, coef2
> + phaddw %4, %1
> + pmulhrsw %2, %3
> + pmulhrsw %4, %3
> + packuswb %2, %4
> + movu [dstq], %2
> +%endmacro
> +
> +%macro FILTER_H4_w32 4
> + movu %1, [srcq - 1]
> + pshufb %2, %1, Tm0
> + pmaddubsw %2, coef2
> + pshufb %1, %1, Tm1
> + pmaddubsw %1, coef2
> + phaddw %2, %1
> + movu %1, [srcq - 1 + 8]
> + pshufb %4, %1, Tm0
> + pmaddubsw %4, coef2
> + pshufb %1, %1, Tm1
> + pmaddubsw %1, coef2
> + phaddw %4, %1
> + pmulhrsw %2, %3
> + pmulhrsw %4, %3
> + packuswb %2, %4
> + movu [dstq], %2
> + movu %1, [srcq - 1 + 16]
> + pshufb %2, %1, Tm0
> + pmaddubsw %2, coef2
> + pshufb %1, %1, Tm1
> + pmaddubsw %1, coef2
> + phaddw %2, %1
> + movu %1, [srcq - 1 + 24]
> + pshufb %4, %1, Tm0
> + pmaddubsw %4, coef2
> + pshufb %1, %1, Tm1
> + pmaddubsw %1, coef2
> + phaddw %4, %1
> + pmulhrsw %2, %3
> + pmulhrsw %4, %3
> + packuswb %2, %4
> + movu [dstq + 16], %2
> %endmacro
>
>
> ;-----------------------------------------------------------------------------
> -; void filterHorizontal_p_p_4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff)
> +; void interp_4tap_horiz_pp_w%1(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx)
>
> ;-----------------------------------------------------------------------------
> +%macro IPFILTER_CHROMA 1
> INIT_XMM sse4
> -cglobal filterHorizontal_p_p_4, 0, 7, 8
> -%define src r0
> -%define dst r1
> -%define row r2
> -%define col r3
> -%define width r4
> -%define widthleft r5
> -%define mask_offset r6
> -%define coef2 m7
> -%define coef3 m6
> -%define Tm4 m5
> -%define c512 m4
> -%define x2 m3
> -%define x1 m2
> -%define x0 m1
> -%define leftmask m0
> -%define tmp r0
> -%define tmp1 r1
> -
> - mov tmp, r6m
> - movd coef2, [tmp ]
> - movd coef3, [tmp + 4]
> - pshufd coef2, coef2, 0
> - pshufd coef3, coef3, 0
> - packsswb coef2, coef2
> - packsswb coef3, coef3
> +cglobal interp_4tap_horiz_pp_w%1, 6, 7, 6, src, srcstride, dst, dststride, height, coeffIdx
> +%define coef2 m5
> +%define Tm0 m4
> +%define Tm1 m3
> +%define x2 m2
> +%define x1 m1
> +%define x0 m0
>
> - mov width, r4m
> - mov widthleft, width
> - and width, ~7
> - and widthleft, 7
> - mov mask_offset, widthleft
> - neg mask_offset
> +movd coef2, [tab_coeff + r5d * 4]
>
> - movq leftmask, [tab_leftmask + (7 + mask_offset)]
> - mova Tm4, [tab_Tm]
> - mova c512, [tab_c_512]
> +pshufd coef2, coef2, 0
> +mova x2, [tab_c_512]
> +mova Tm0, [tab_Tm]
> +mova Tm1, [tab_Tm + 16]
>
> - mov src, r0m
> - mov dst, r2m
> - mov row, r5m
> +.loop
> +FILTER_H4_w%1 x0, x1, x2
> +add srcq, srcstrideq
> +add dstq, dststrideq
>
> -_loop_row:
> - xor col, col
> -
> -_loop_col:
> - cmp col, width
> - jge _end_col
> +dec r4d
> +jnz .loop
> +
> +RET
> +%endmacro
>
> - FILTER_H4 x0, x1
> - movh [dst + col], x0
> +IPFILTER_CHROMA 2
> +IPFILTER_CHROMA 4
> +IPFILTER_CHROMA 6
> +IPFILTER_CHROMA 8
> +IPFILTER_CHROMA 12
>
> - add col, 8
> - jmp _loop_col
>
> +;-----------------------------------------------------------------------------
> +; void interp_4tap_horiz_pp_w%1(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx)
>
> +;-----------------------------------------------------------------------------
> +%macro IPFILTER_CHROMA_W 1
> +INIT_XMM sse4
> +cglobal interp_4tap_horiz_pp_w%1, 6, 7, 7, src, srcstride, dst, dststride, height, coeffIdx
> +%define coef2 m6
> +%define Tm0 m5
> +%define Tm1 m4
> +%define x3 m3
> +%define x2 m2
> +%define x1 m1
> +%define x0 m0
>
> -_end_col:
> - test widthleft, widthleft
> - jz _next_row
> +movd coef2, [tab_coeff + r5d * 4]
>
> - movq x2, [dst + col]
> - FILTER_H4 x0, x1
> - pblendvb x2, x2, x0, leftmask
> - movh [dst + col], x2
> +pshufd coef2, coef2, 0
> +mova x2, [tab_c_512]
> +mova Tm0, [tab_Tm]
> +mova Tm1, [tab_Tm + 16]
>
> -_next_row:
> - add src, r1m
> - add dst, r3m
> - dec row
> +.loop
> +FILTER_H4_w%1 x0, x1, x2, x3
> +add srcq, srcstrideq
> +add dstq, dststrideq
>
> - test row, row
> - jz _end_row
> +dec r4d
> +jnz .loop
> +
> +RET
> +%endmacro
>
> - jmp _loop_row
> -
> -_end_row:
> -
> - RET
> -
> -%endif ; ARCH_X86_64 == 0
> +IPFILTER_CHROMA_W 16
> +IPFILTER_CHROMA_W 32
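
One more note on the interface change: the kernels are now selected by chroma
partition width and take the coefficient row index instead of a coefficient
pointer, so a call site ends up looking roughly like this (the width-to-index
helper here is hypothetical, only to illustrate the lookup):

    // pick the 4-tap horizontal chroma filter matching this block width
    int part = chromaPartitionFromWidth(width);   // hypothetical mapping to CHROMA_PARTITION_W*
    primitives.chroma_hpp[part](src, srcStride, dst, dstStride, height, coeffIdx);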
--
Steve Borho