[x265] [PATCH 2 of 3] asm: ipfilter_ss[FILTER_V_S_S_8]

Steve Borho steve at borho.org
Tue Nov 5 01:46:56 CET 2013


On Mon, Nov 4, 2013 at 5:05 AM, Min Chen <chenm003 at 163.com> wrote:

> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1383563104 -28800
> # Node ID 539ad4851359f96591f612f8b7b6fb0483e5a48c
> # Parent  2a7a5766fbd84436cdbbd5018a34db92e62553a1
> asm: ipfilter_ss[FILTER_V_S_S_8]
>

When this patch is applied, the testbench enters an infinite loop in this
function (ipfilter_ss[FILTER_V_S_S_8]).

>
> diff -r 2a7a5766fbd8 -r 539ad4851359
> source/Lib/TLibCommon/TComPrediction.cpp
> --- a/source/Lib/TLibCommon/TComPrediction.cpp  Mon Nov 04 19:04:43 2013
> +0800
> +++ b/source/Lib/TLibCommon/TComPrediction.cpp  Mon Nov 04 19:05:04 2013
> +0800
> @@ -537,7 +537,7 @@
>          int filterSize = NTAPS_LUMA;
>          int halfFilterSize = (filterSize >> 1);
>          primitives.ipfilter_ps[FILTER_H_P_S_8](ref - (halfFilterSize - 1)
> * refStride, refStride, m_immedVals, tmpStride, width, height + filterSize
> - 1, g_lumaFilter[xFrac]);
> -        primitives.ipfilter_ss[FILTER_V_S_S_8](m_immedVals +
> (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, width, height,
> g_lumaFilter[yFrac]);
> +        primitives.ipfilter_ss[FILTER_V_S_S_8](m_immedVals +
> (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, width, height,
> yFrac);
>      }
>  }
>
> @@ -643,9 +643,9 @@
>          int filterSize = NTAPS_CHROMA;
>          int halfFilterSize = (filterSize >> 1);
>          primitives.ipfilter_ps[FILTER_H_P_S_4](refCb - (halfFilterSize -
> 1) * refStride, refStride, m_immedVals, extStride, cxWidth, cxHeight +
> filterSize - 1, g_chromaFilter[xFrac]);
> -        primitives.ipfilter_ss[FILTER_V_S_S_4](m_immedVals +
> (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, cxWidth,
> cxHeight, g_chromaFilter[yFrac]);
> +        primitives.ipfilter_ss[FILTER_V_S_S_4](m_immedVals +
> (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, cxWidth,
> cxHeight, yFrac);
>          primitives.ipfilter_ps[FILTER_H_P_S_4](refCr - (halfFilterSize -
> 1) * refStride, refStride, m_immedVals, extStride, cxWidth, cxHeight +
> filterSize - 1, g_chromaFilter[xFrac]);
> -        primitives.ipfilter_ss[FILTER_V_S_S_4](m_immedVals +
> (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, cxWidth,
> cxHeight, g_chromaFilter[yFrac]);
> +        primitives.ipfilter_ss[FILTER_V_S_S_4](m_immedVals +
> (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, cxWidth,
> cxHeight, yFrac);
>      }
>  }
>
> diff -r 2a7a5766fbd8 -r 539ad4851359 source/common/ipfilter.cpp
> --- a/source/common/ipfilter.cpp        Mon Nov 04 19:04:43 2013 +0800
> +++ b/source/common/ipfilter.cpp        Mon Nov 04 19:05:04 2013 +0800
> @@ -120,8 +120,9 @@
>  }
>
>  template<int N>
> -void filterVertical_ss_c(int16_t *src, intptr_t srcStride, int16_t *dst,
> intptr_t dstStride, int width, int height, int16_t const *c)
> +void filterVertical_ss_c(int16_t *src, intptr_t srcStride, int16_t *dst,
> intptr_t dstStride, int width, int height, const int coefIdx)
>  {
> +    const int16_t *const c = (N == 8 ? g_lumaFilter[coefIdx] :
> g_chromaFilter[coefIdx]);
>      int shift = IF_FILTER_PREC;
>      int row, col;
>      src -= (N / 2 - 1) * srcStride;
> diff -r 2a7a5766fbd8 -r 539ad4851359 source/common/primitives.h
> --- a/source/common/primitives.h        Mon Nov 04 19:04:43 2013 +0800
> +++ b/source/common/primitives.h        Mon Nov 04 19:05:04 2013 +0800
> @@ -166,7 +166,7 @@
>  typedef void (*ipfilter_pp_t)(pixel *src, intptr_t srcStride, pixel *dst,
> intptr_t dstStride, int width, int height, const int16_t *coeff);
>  typedef void (*ipfilter_ps_t)(pixel *src, intptr_t srcStride, int16_t
> *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
>  typedef void (*ipfilter_sp_t)(int16_t *src, intptr_t srcStride, pixel
> *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
> -typedef void (*ipfilter_ss_t)(int16_t *src, intptr_t srcStride, int16_t
> *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
> +typedef void (*ipfilter_ss_t)(int16_t *src, intptr_t srcStride, int16_t
> *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
>  typedef void (*ipfilter_p2s_t)(pixel *src, intptr_t srcStride, int16_t
> *dst, intptr_t dstStride, int width, int height);
>  typedef void (*ipfilter_s2p_t)(int16_t *src, intptr_t srcStride, pixel
> *dst, intptr_t dstStride, int width, int height);
>  typedef void (*blockcpy_pp_t)(int bx, int by, pixel *dst, intptr_t
> dstride, pixel *src, intptr_t sstride); // dst is aligned
> diff -r 2a7a5766fbd8 -r 539ad4851359 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp      Mon Nov 04 19:04:43 2013
> +0800
> +++ b/source/common/x86/asm-primitives.cpp      Mon Nov 04 19:05:04 2013
> +0800
> @@ -279,6 +279,7 @@
>          SA8D_INTER_FROM_BLOCK(sse2);
>
>          p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
> +        p.ipfilter_ss[FILTER_V_S_S_8] = x265_interp_8tap_v_ss_sse2;
>      }
>      if (cpuMask & X265_CPU_SSSE3)
>      {
> diff -r 2a7a5766fbd8 -r 539ad4851359 source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm   Mon Nov 04 19:04:43 2013 +0800
> +++ b/source/common/x86/ipfilter8.asm   Mon Nov 04 19:05:04 2013 +0800
> @@ -2593,3 +2593,144 @@
>      jnz         .loopH
>
>      RET
> +
> +
>
> +;-------------------------------------------------------------------------------------------------------------
> +; void interp_8tap_v_ss(int16_t *src, intptr_t srcStride, int16_t *dst,
> intptr_t dstStride, int width, int height, const int coefIdx)
>
> +;-------------------------------------------------------------------------------------------------------------
> +INIT_XMM sse2
> +
> +%if ARCH_X86_64
> +cglobal interp_8tap_v_ss, 4, 7+1, 8
> +%define tmp_r4d     r7d
> +%define tmp_r5d     r8d
> +%else
> +cglobal interp_8tap_v_ss, 4, 7, 8, 0-2*4
> +%define tmp_r4d     dword [rsp + 0*4]
> +%define tmp_r5d     dword [rsp + 1*4]
> +%endif
> +
> +    ; load width, height and filterIdx
> +    mov         r4d, r4m
> +    mov         r5d, r5m
> +    mov         r6d, r6m
> +
> +    ; convert to word stride
> +    add         r1, r1
> +    add         r3, r3
> +
> +    ; stort to temporary memory or register
> +    shr         r4d, 2
> +    mov         tmp_r4d, r4d
> +    shr         r5d, 2
> +    mov         tmp_r5d, r5d
> +
> +    shl         r6d, 6
> +%ifdef PIC
> +    lea         r5, [tab_LumaCoeffV]
> +    lea         r6, [r5 + r6]
> +%else
> +    lea         r6, [tab_LumaCoeffV + r6]
> +%endif
> +
> +    lea         r4, [r1 * 3]
> +    sub         r0, r4
> +
> +.loopH:
> +    ; load width
> +    mov         r4d, tmp_r4d
> +
> +.loopW:
> +
> +    movh        m0, [r0]                    ; m0 = [0]
> +    movh        m1, [r0 + r1]               ; m1 = [1]
> +    lea         r0, [r0 + r1 * 2]
> +    punpcklwd   m0, m1
> +    pmaddwd     m0, [r6 + 0 * 16]           ; m0 = [0+1]            = R0
> +
> +    movh        m2, [r0]                    ; m2 = [2]
> +    movh        m3, [r0 + r1]               ; m3 = [3]
> +    lea         r0, [r0 + r1 * 2]
> +    punpcklwd   m1, m2
> +    pmaddwd     m1, [r6 + 0 * 16]           ; m1 = [1+2]            = R1
> +    punpcklwd   m2, m3                      ; m2 = [2 3]
> +    pmaddwd     m7, m2, [r6 + 1 * 16]       ;
> +    paddd       m0, m7                      ; m0 = [0+1+2+3]        = R0
> +    pmaddwd     m2, [r6 + 0 * 16]           ; m2 = [2+3]            = R2
> +
> +    movh        m4, [r0]                    ; m4 = [4]
> +    movh        m5, [r0 + r1]               ; m5 = [5]
> +    lea         r0, [r0 + r1 * 2]
> +    punpcklwd   m3, m4                      ; m3 = [3 4]
> +    pmaddwd     m7, m3, [r6 + 1 * 16]
> +    paddd       m1, m7                      ; m1 = [1+2+3+4]        = R1
> +    pmaddwd     m3, [r6 + 0 * 16]           ; m3 = [3+4]            = R3
> +    punpcklwd   m4, m5                      ; m4 = [4 5]
> +    pmaddwd     m7, m4, [r6 + 2 * 16]
> +    paddd       m0, m7                      ; m0 = [0+1+2+3+4+5]    = R0
> +    pmaddwd     m4, [r6 + 1 * 16]
> +    paddd       m2, m4                      ; m2 = [2+3+4+5]        = R2
> +
> +    movh        m6, [r0]                    ; m6 = [6]
> +    movh        m7, [r0 + r1]               ; m7 = [7]
> +    lea         r0, [r0 + r1 * 2]
> +    punpcklwd   m5, m6                      ; m5 = [5 6]
> +    pmaddwd     m4, m5, [r6 + 2 * 16]
> +    paddd       m1, m4                      ; m1 = [1+2+3+4+5+6]    = R1
> +    pmaddwd     m5, [r6 + 1 * 16]
> +    paddd       m3, m5                      ; m3 = [3+4+5+6]        = R3
> +    punpcklwd   m6, m7                      ; m6 = [6 7]
> +    pmaddwd     m4, m6, [r6 + 3 * 16]
> +    paddd       m0, m4                      ; m0 = [0+1+2+3+4+5+6+7]= R0
> +    pmaddwd     m6, [r6 + 2 * 16]
> +    paddd       m2, m6                      ; m2 = [2+3+4+5+6+7]    = R2
> +    psrad       m0, 6
> +    packssdw    m0, m0
> +    movh        [r2], m0                    ; store [0]
> +
> +    movh        m4, [r0]                    ; m4 = [8]
> +    movh        m5, [r0 + r1]               ; m5 = [9]
> +    punpcklwd   m7, m4                      ; m7 = [7 8]
> +    pmaddwd     m6, m7, [r6 + 3 * 16]
> +    paddd       m1, m6                      ; m1 = [1+2+3+4+5+6+7+8]= R1
> +    pmaddwd     m7, [r6 + 2 * 16]
> +    paddd       m3, m7                      ; m3 = [3+4+5+6+7+8]    = R3
> +    psrad       m1, 6
> +    packssdw    m1, m1
> +    movh        [r2 + r3], m1               ; store [1]
> +    punpcklwd   m4, m5                      ; m4 = [8 9]
> +    pmaddwd     m4, [r6 + 3 * 16]
> +    paddd       m2, m4                      ; m2 = [2+3+4+5+6+7+8+9]= R2
> +    psrad       m2, 6
> +    packssdw    m2, m2
> +    movh        [r2 + r3 * 2], m2           ; store [2]
> +    lea         r2, [r2 + r3 * 2]
> +
> +    movh        m4, [r0 + r1 * 2]           ; m4 = [10]
> +    punpcklwd   m5, m4                      ; m5 = [9 10]
> +    pmaddwd     m5, [r6 + 3 * 16]
> +    paddd       m3, m5                      ; m3 = [3+4+5+6+7+8+9+10]=R3
> +    psrad       m3, 6
> +    packssdw    m3, m3
> +    movh        [r2 + r3], m3               ; store [3]
> +
> +    lea         r5, [r1 * 8 - 8]
> +    sub         r0, r5
> +    lea         r5, [r3 * 2 - 8]
> +    sub         r2, r5
> +
> +    dec         r4d
> +    jnz         .loopW
> +
> +    ; move to next row
> +    mov         r4d, tmp_r4d
> +    shl         r4d, 3
> +    lea         r0, [r0 + r1 * 4]
> +    sub         r0, r4
> +    lea         r2, [r2 + r3 * 4]
> +    sub         r2, r4
> +
> +    dec         tmp_r5d
> +    jnz         .loopH
> +
> +    RET
> diff -r 2a7a5766fbd8 -r 539ad4851359 source/common/x86/pixel.h
> --- a/source/common/x86/pixel.h Mon Nov 04 19:04:43 2013 +0800
> +++ b/source/common/x86/pixel.h Mon Nov 04 19:05:04 2013 +0800
> @@ -214,6 +214,7 @@
>  uint64_t x265_pixel_sa8d_satd_16x16_avx2(pixel *pix1, intptr_t stride1,
> pixel *pix2, intptr_t stride2);
>
>  void x265_cvt32to16_shr_sse2(int16_t *dst, int *src, intptr_t, int, int);
> +void x265_interp_8tap_v_ss_sse2(int16_t *src, intptr_t srcStride, int16_t
> *dst, intptr_t dstStride, int width, int height, const int coefIdx);
>
>  #define DECL_HEVC_SSD(suffix) \
>      int x265_pixel_ssd_32x64_ ## suffix(pixel *, intptr_t, pixel *,
> intptr_t); \
> diff -r 2a7a5766fbd8 -r 539ad4851359 source/encoder/motion.cpp
> --- a/source/encoder/motion.cpp Mon Nov 04 19:04:43 2013 +0800
> +++ b/source/encoder/motion.cpp Mon Nov 04 19:05:04 2013 +0800
> @@ -1213,7 +1213,7 @@
>              int filterSize = NTAPS_LUMA;
>              int halfFilterSize = (filterSize >> 1);
>              primitives.ipfilter_ps[FILTER_H_P_S_8](fref - (halfFilterSize
> - 1) * ref->lumaStride, ref->lumaStride, immedVal, blockwidth, blockwidth,
> realHeight + filterSize - 1, g_lumaFilter[xFrac]);
> -            primitives.ipfilter_ss[FILTER_V_S_S_8](immedVal +
> (halfFilterSize - 1) * blockwidth, blockwidth, immedVal2, FENC_STRIDE,
> blockwidth, realHeight, g_lumaFilter[yFrac]);
> +            primitives.ipfilter_ss[FILTER_V_S_S_8](immedVal +
> (halfFilterSize - 1) * blockwidth, blockwidth, immedVal2, FENC_STRIDE,
> blockwidth, realHeight, yFrac);
>              primitives.weightpUni(immedVal2, subpelbuf, FENC_STRIDE,
> FENC_STRIDE, blockwidth, realHeight, ref->weight, local_round, local_shift,
> ref->offset);
>          }
>      }
> diff -r 2a7a5766fbd8 -r 539ad4851359 source/test/ipfilterharness.cpp
> --- a/source/test/ipfilterharness.cpp   Mon Nov 04 19:04:43 2013 +0800
> +++ b/source/test/ipfilterharness.cpp   Mon Nov 04 19:05:04 2013 +0800
> @@ -318,6 +318,68 @@
>      return true;
>  }
>
> +bool IPFilterHarness::check_IPFilter_primitive(ipfilter_ss_t ref,
> ipfilter_ss_t opt, int isChroma)
> +{
> +    int rand_val, rand_srcStride, rand_dstStride;
> +    const int min_size = isChroma ? 2 : 4;
> +
> +    // NOTE: refill data to avoid overflow
> +    const int max_filter_val = 64 * (1 << 8);
> +    for (int i = 0; i < ipf_t_size; i++)
> +    {
> +        short_buff[i] = rand() % (2 * max_filter_val) - max_filter_val;
> +    }
> +
> +    for (int i = 0; i <= 1000; i++)
> +    {
> +        int rand_height = rand() % 100;                 // Randomly
> generated Height
> +        int rand_width = rand() % 100;                  // Randomly
> generated Width
> +
> +        memset(IPF_vec_output_s, 0xCD, ipf_t_size);      // Initialize
> output buffer to zero
> +        memset(IPF_C_output_s, 0xCD, ipf_t_size);        // Initialize
> output buffer to zero
> +
> +        rand_val = rand() % 4;                      // Random offset in
> the filter
> +        rand_srcStride = rand() % 100;              // Randomly generated
> srcStride
> +        rand_dstStride = rand() % 100;              // Randomly generated
> dstStride
> +
> +        rand_width &= ~(min_size - 1);
> +        if (rand_width < min_size)
> +            rand_width = min_size;
> +
> +        rand_height &= ~(min_size - 1);
> +        if (rand_height < min_size)
> +            rand_height = min_size;
> +
> +        if (rand_srcStride < rand_width)
> +            rand_srcStride = rand_width;
> +
> +        if (rand_dstStride < rand_width)
> +            rand_dstStride = rand_width;
> +
> +        ref(short_buff + 3 * rand_srcStride,
> +            rand_srcStride,
> +            IPF_C_output_s,
> +            rand_dstStride,
> +            rand_width,
> +            rand_height, rand_val
> +            );
> +        opt(short_buff + 3 * rand_srcStride,
> +            rand_srcStride,
> +            IPF_vec_output_s,
> +            rand_dstStride,
> +            rand_width,
> +            rand_height, rand_val
> +            );
> +
> +        if (memcmp(IPF_C_output_s, IPF_vec_output_s, ipf_t_size *
> sizeof(int16_t)))
> +        {
> +            return false;
> +        }
> +    }
> +
> +    return true;
> +}
> +
>  bool IPFilterHarness::check_IPFilterChroma_primitive(filter_pp_t ref,
> filter_pp_t opt)
>  {
>      int rand_srcStride, rand_dstStride, rand_coeffIdx;
> @@ -452,6 +514,18 @@
>          }
>      }
>
> +    for (int value = 0; value < NUM_IPFILTER_S_S; value++)
> +    {
> +        if (opt.ipfilter_ss[value])
> +        {
> +            if (!check_IPFilter_primitive(ref.ipfilter_ss[value],
> opt.ipfilter_ss[value], (value == FILTER_V_S_S_4)))
> +            {
> +                printf("ipfilter_ss %d failed\n", 8 / (value + 1));
> +                return false;
> +            }
> +        }
> +    }
> +
>      if (opt.ipfilter_p2s)
>      {
>          if (!check_IPFilter_primitive(ref.ipfilter_p2s, opt.ipfilter_p2s))
> @@ -583,6 +657,17 @@
>          }
>      }
>
> +    for (int value = 0; value < NUM_IPFILTER_S_S; value++)
> +    {
> +        if (opt.ipfilter_ss[value])
> +        {
> +            printf("ipfilter_ss %d\t", 8 / (value + 1));
> +            REPORT_SPEEDUP(opt.ipfilter_ss[value], ref.ipfilter_ss[value],
> +                           short_buff + maxVerticalfilterHalfDistance *
> srcStride, srcStride,
> +                           IPF_vec_output_s, dstStride, width, height,
> val);
> +        }
> +    }
> +
>      if (opt.ipfilter_p2s)
>      {
>          printf("ipfilter_p2s\t");
> diff -r 2a7a5766fbd8 -r 539ad4851359 source/test/ipfilterharness.h
> --- a/source/test/ipfilterharness.h     Mon Nov 04 19:04:43 2013 +0800
> +++ b/source/test/ipfilterharness.h     Mon Nov 04 19:05:04 2013 +0800
> @@ -47,6 +47,7 @@
>      bool check_IPFilter_primitive(ipfilter_p2s_t ref, ipfilter_p2s_t opt);
>      bool check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt, int
> isChroma);
>      bool check_IPFilter_primitive(ipfilter_s2p_t ref, ipfilter_s2p_t opt);
> +    bool check_IPFilter_primitive(ipfilter_ss_t ref, ipfilter_ss_t opt,
> int isChroma);
>      bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt);
>      bool check_IPFilterLuma_primitive(filter_pp_t ref, filter_pp_t opt);
>      bool check_IPFilterLumaHV_primitive(filter_hv_pp_t ref,
> filter_hv_pp_t opt);
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>



-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131104/9814ed18/attachment-0001.html>


More information about the x265-devel mailing list