[x265] [PATCH] add testbench for psyCost_ss and asm for psyCost_ss_4x4: improve 1989c->515c

Praveen Tiwari praveen at multicorewareinc.com
Fri Jan 9 12:26:56 CET 2015


If only 64x64 fails, then it is definitely a range issue in the final
accumulation of the sum of all the SAD calculations. It is more apparent
with 64x64 because more accumulations are performed there. An algorithmic
issue would have shown up in the other partitions as well.

Regards,
Praveen

On Fri, Jan 9, 2015 at 4:05 PM, Steve Borho <steve at borho.org> wrote:

> On 01/09, Divya Manivannan wrote:
> > # HG changeset patch
> > # User Divya Manivannan <divya at multicorewareinc.com>
> > # Date 1420790181 -19800
> > #      Fri Jan 09 13:26:21 2015 +0530
> > # Node ID 0f4b677cea64254d0b8f77ccc84c785bf832698d
> > # Parent  c99e1a309bd1690be9a0a407050d97d95ccab05a
> > add testbench for psyCost_ss and asm for psyCost_ss_4x4: improve
> 1989c->515c
>
> I get an error with a 10bit build:
>
> steve at zeppelin> ./test/TestBench
> Using random seed 54AFAEC9 16bpp
> Testing primitives: SSE2
> Testing primitives: SSE3
> Testing primitives: SSSE3
> Testing primitives: SSE4
>
> psy_cost_ss[64x64] failed!
>
> > diff -r c99e1a309bd1 -r 0f4b677cea64 source/common/x86/asm-primitives.cpp
> > --- a/source/common/x86/asm-primitives.cpp    Fri Jan 09 13:09:39 2015
> +0530
> > +++ b/source/common/x86/asm-primitives.cpp    Fri Jan 09 13:26:21 2015
> +0530
> > @@ -1430,6 +1430,7 @@
> >          p.psy_cost_pp[BLOCK_32x32] = x265_psyCost_pp_32x32_sse4;
> >          p.psy_cost_pp[BLOCK_64x64] = x265_psyCost_pp_64x64_sse4;
> >  #endif
> > +        p.psy_cost_ss[BLOCK_4x4] = x265_psyCost_ss_4x4_sse4;
> >      }
> >      if (cpuMask & X265_CPU_XOP)
> >      {
> > @@ -1716,6 +1717,7 @@
> >          p.psy_cost_pp[BLOCK_32x32] = x265_psyCost_pp_32x32_sse4;
> >          p.psy_cost_pp[BLOCK_64x64] = x265_psyCost_pp_64x64_sse4;
> >  #endif
> > +        p.psy_cost_ss[BLOCK_4x4] = x265_psyCost_ss_4x4_sse4;
> >      }
> >      if (cpuMask & X265_CPU_AVX)
> >      {
> > diff -r c99e1a309bd1 -r 0f4b677cea64 source/common/x86/pixel-a.asm
> > --- a/source/common/x86/pixel-a.asm   Fri Jan 09 13:09:39 2015 +0530
> > +++ b/source/common/x86/pixel-a.asm   Fri Jan 09 13:26:21 2015 +0530
> > @@ -7569,3 +7569,157 @@
> >      RET
> >  %endif ; HIGH_BIT_DEPTH
> >  %endif
> > +
> >
> +;---------------------------------------------------------------------------------------------------------------------
> > +;int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t*
> recon, intptr_t rstride)
> >
> +;---------------------------------------------------------------------------------------------------------------------
> > +INIT_XMM sse4
> > +cglobal psyCost_ss_4x4, 4, 5, 8
> > +
> > +    add             r1, r1
> > +    lea             r4, [3 * r1]
> > +    movddup         m0, [r0]
> > +    movddup         m1, [r0 + r1]
> > +    movddup         m2, [r0 + r1 * 2]
> > +    movddup         m3, [r0 + r4]
> > +
> > +    pabsw           m4, m0
> > +    pabsw           m5, m1
> > +    paddw           m5, m4
> > +    pabsw           m4, m2
> > +    paddw           m5, m4
> > +    pabsw           m4, m3
> > +    paddw           m5, m4
> > +    pmaddwd         m5, [pw_1]
> > +    psrldq          m4, m5, 4
> > +    paddd           m5, m4
> > +    psrld           m6, m5, 2
> > +
> > +    mova            m4, [hmul_8w]
> > +    pmaddwd         m0, m4
> > +    pmaddwd         m1, m4
> > +    pmaddwd         m2, m4
> > +    pmaddwd         m3, m4
> > +
> > +    psrldq          m4, m0, 4
> > +    psubd           m5, m0, m4
> > +    paddd           m0, m4
> > +    shufps          m0, m5, 10001000b
> > +
> > +    psrldq          m4, m1, 4
> > +    psubd           m5, m1, m4
> > +    paddd           m1, m4
> > +    shufps          m1, m5, 10001000b
> > +
> > +    psrldq          m4, m2, 4
> > +    psubd           m5, m2, m4
> > +    paddd           m2, m4
> > +    shufps          m2, m5, 10001000b
> > +
> > +    psrldq          m4, m3, 4
> > +    psubd           m5, m3, m4
> > +    paddd           m3, m4
> > +    shufps          m3, m5, 10001000b
> > +
> > +    mova            m4, m0
> > +    paddd           m0, m1
> > +    psubd           m1, m4
> > +    mova            m4, m2
> > +    paddd           m2, m3
> > +    psubd           m3, m4
> > +    mova            m4, m0
> > +    paddd           m0, m2
> > +    psubd           m2, m4
> > +    mova            m4, m1
> > +    paddd           m1, m3
> > +    psubd           m3, m4
> > +
> > +    pabsd           m0, m0
> > +    pabsd           m2, m2
> > +    pabsd           m1, m1
> > +    pabsd           m3, m3
> > +    paddd           m0, m2
> > +    paddd           m1, m3
> > +    paddd           m0, m1
> > +    movhlps         m1, m0
> > +    paddd           m0, m1
> > +    psrldq          m1, m0, 4
> > +    paddd           m0, m1
> > +    psrld           m0, 1
> > +    psubd           m7, m0, m6
> > +
> > +    add             r3, r3
> > +    lea             r4, [3 * r3]
> > +    movddup         m0, [r2]
> > +    movddup         m1, [r2 + r3]
> > +    movddup         m2, [r2 + r3 * 2]
> > +    movddup         m3, [r2 + r4]
> > +
> > +    pabsw           m4, m0
> > +    pabsw           m5, m1
> > +    paddw           m5, m4
> > +    pabsw           m4, m2
> > +    paddw           m5, m4
> > +    pabsw           m4, m3
> > +    paddw           m5, m4
> > +    pmaddwd         m5, [pw_1]
> > +    psrldq          m4, m5, 4
> > +    paddd           m5, m4
> > +    psrld           m6, m5, 2
> > +
> > +    mova            m4, [hmul_8w]
> > +    pmaddwd         m0, m4
> > +    pmaddwd         m1, m4
> > +    pmaddwd         m2, m4
> > +    pmaddwd         m3, m4
> > +
> > +    psrldq          m4, m0, 4
> > +    psubd           m5, m0, m4
> > +    paddd           m0, m4
> > +    shufps          m0, m5, 10001000b
> > +
> > +    psrldq          m4, m1, 4
> > +    psubd           m5, m1, m4
> > +    paddd           m1, m4
> > +    shufps          m1, m5, 10001000b
> > +
> > +    psrldq          m4, m2, 4
> > +    psubd           m5, m2, m4
> > +    paddd           m2, m4
> > +    shufps          m2, m5, 10001000b
> > +
> > +    psrldq          m4, m3, 4
> > +    psubd           m5, m3, m4
> > +    paddd           m3, m4
> > +    shufps          m3, m5, 10001000b
> > +
> > +    mova            m4, m0
> > +    paddd           m0, m1
> > +    psubd           m1, m4
> > +    mova            m4, m2
> > +    paddd           m2, m3
> > +    psubd           m3, m4
> > +    mova            m4, m0
> > +    paddd           m0, m2
> > +    psubd           m2, m4
> > +    mova            m4, m1
> > +    paddd           m1, m3
> > +    psubd           m3, m4
> > +
> > +    pabsd           m0, m0
> > +    pabsd           m2, m2
> > +    pabsd           m1, m1
> > +    pabsd           m3, m3
> > +    paddd           m0, m2
> > +    paddd           m1, m3
> > +    paddd           m0, m1
> > +    movhlps         m1, m0
> > +    paddd           m0, m1
> > +    psrldq          m1, m0, 4
> > +    paddd           m0, m1
> > +    psrld           m0, 1
> > +    psubd           m0, m6
> > +    psubd           m7, m0
> > +    pabsd           m0, m7
> > +    movd            eax, m0
> > +    RET
> > diff -r c99e1a309bd1 -r 0f4b677cea64 source/common/x86/pixel.h
> > --- a/source/common/x86/pixel.h       Fri Jan 09 13:09:39 2015 +0530
> > +++ b/source/common/x86/pixel.h       Fri Jan 09 13:26:21 2015 +0530
> > @@ -223,6 +223,7 @@
> >  int x265_psyCost_pp_16x16_sse4(const pixel* source, intptr_t sstride,
> const pixel* recon, intptr_t rstride);
> >  int x265_psyCost_pp_32x32_sse4(const pixel* source, intptr_t sstride,
> const pixel* recon, intptr_t rstride);
> >  int x265_psyCost_pp_64x64_sse4(const pixel* source, intptr_t sstride,
> const pixel* recon, intptr_t rstride);
> > +int x265_psyCost_ss_4x4_sse4(const int16_t* source, intptr_t sstride,
> const int16_t* recon, intptr_t rstride);
> >
> >  #undef DECL_PIXELS
> >  #undef DECL_HEVC_SSD
> > diff -r c99e1a309bd1 -r 0f4b677cea64 source/test/pixelharness.cpp
> > --- a/source/test/pixelharness.cpp    Fri Jan 09 13:09:39 2015 +0530
> > +++ b/source/test/pixelharness.cpp    Fri Jan 09 13:26:21 2015 +0530
> > @@ -1089,6 +1089,28 @@
> >      return true;
> >  }
> >
> > +bool PixelHarness::check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t
> opt)
> > +{
> > +    int j = 0, index1, index2, optres, refres;
> > +    intptr_t stride = STRIDE;
> > +
> > +    for (int i = 0; i < ITERS; i++)
> > +    {
> > +        index1 = rand() % TEST_CASES;
> > +        index2 = rand() % TEST_CASES;
> > +        optres = (int)checked(opt, short_test_buff[index1], stride,
> short_test_buff[index2] + j, stride);
> > +        refres = ref(short_test_buff[index1], stride,
> short_test_buff[index2] + j, stride);
> > +
> > +        if (optres != refres)
> > +            return false;
> > +
> > +        reportfail();
> > +        j += INCR;
> > +    }
> > +
> > +    return true;
> > +}
> > +
> >  bool PixelHarness::check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t
> opt)
> >  {
> >      ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
> > @@ -1470,6 +1492,15 @@
> >                  return false;
> >              }
> >          }
> > +
> > +        if (opt.psy_cost_ss[i])
> > +        {
> > +            if (!check_psyCost_ss(ref.psy_cost_ss[i],
> opt.psy_cost_ss[i]))
> > +            {
> > +                printf("\npsy_cost_ss[%dx%d] failed!\n", 4 << i, 4 <<
> i);
> > +                return false;
> > +            }
> > +        }
> >      }
> >
> >      if (opt.weight_pp)
> > @@ -1862,6 +1893,12 @@
> >              HEADER("psy_cost_pp[%dx%d]", 4 << i, 4 << i);
> >              REPORT_SPEEDUP(opt.psy_cost_pp[i], ref.psy_cost_pp[i],
> pbuf1, STRIDE, pbuf2, STRIDE);
> >          }
> > +
> > +        if (opt.psy_cost_ss[i])
> > +        {
> > +            HEADER("psy_cost_ss[%dx%d]", 4 << i, 4 << i);
> > +            REPORT_SPEEDUP(opt.psy_cost_ss[i], ref.psy_cost_ss[i],
> sbuf1, STRIDE, sbuf2, STRIDE);
> > +        }
> >      }
> >
> >      if (opt.weight_pp)
> > diff -r c99e1a309bd1 -r 0f4b677cea64 source/test/pixelharness.h
> > --- a/source/test/pixelharness.h      Fri Jan 09 13:09:39 2015 +0530
> > +++ b/source/test/pixelharness.h      Fri Jan 09 13:26:21 2015 +0530
> > @@ -101,6 +101,7 @@
> >      bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
> >      bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
> >      bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
> > +    bool check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);
> >      bool check_calSign(sign_t ref, sign_t opt);
> >
> >  public:
> > _______________________________________________
> > x265-devel mailing list
> > x265-devel at videolan.org
> > https://mailman.videolan.org/listinfo/x265-devel
>
> --
> Steve Borho
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150109/e88559d2/attachment-0001.html>


More information about the x265-devel mailing list