[x265] [PATCH] add testbench for psyCost_ss and asm for psyCost_ss_4x4: improve 1989c->515c

Steve Borho steve at borho.org
Fri Jan 9 11:35:49 CET 2015


On 01/09, Divya Manivannan wrote:
> # HG changeset patch
> # User Divya Manivannan <divya at multicorewareinc.com>
> # Date 1420790181 -19800
> #      Fri Jan 09 13:26:21 2015 +0530
> # Node ID 0f4b677cea64254d0b8f77ccc84c785bf832698d
> # Parent  c99e1a309bd1690be9a0a407050d97d95ccab05a
> add testbench for psyCost_ss and asm for psyCost_ss_4x4: improve 1989c->515c

I get a TestBench failure with a 10-bit build (the run below reports 16bpp):

steve at zeppelin> ./test/TestBench
Using random seed 54AFAEC9 16bpp
Testing primitives: SSE2
Testing primitives: SSE3
Testing primitives: SSSE3
Testing primitives: SSE4

psy_cost_ss[64x64] failed!

> diff -r c99e1a309bd1 -r 0f4b677cea64 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp	Fri Jan 09 13:09:39 2015 +0530
> +++ b/source/common/x86/asm-primitives.cpp	Fri Jan 09 13:26:21 2015 +0530
> @@ -1430,6 +1430,7 @@
>          p.psy_cost_pp[BLOCK_32x32] = x265_psyCost_pp_32x32_sse4;
>          p.psy_cost_pp[BLOCK_64x64] = x265_psyCost_pp_64x64_sse4;
>  #endif
> +        p.psy_cost_ss[BLOCK_4x4] = x265_psyCost_ss_4x4_sse4;
>      }
>      if (cpuMask & X265_CPU_XOP)
>      {
> @@ -1716,6 +1717,7 @@
>          p.psy_cost_pp[BLOCK_32x32] = x265_psyCost_pp_32x32_sse4;
>          p.psy_cost_pp[BLOCK_64x64] = x265_psyCost_pp_64x64_sse4;
>  #endif
> +        p.psy_cost_ss[BLOCK_4x4] = x265_psyCost_ss_4x4_sse4;
>      }
>      if (cpuMask & X265_CPU_AVX)
>      {
> diff -r c99e1a309bd1 -r 0f4b677cea64 source/common/x86/pixel-a.asm
> --- a/source/common/x86/pixel-a.asm	Fri Jan 09 13:09:39 2015 +0530
> +++ b/source/common/x86/pixel-a.asm	Fri Jan 09 13:26:21 2015 +0530
> @@ -7569,3 +7569,157 @@
>      RET
>  %endif ; HIGH_BIT_DEPTH
>  %endif
> +
> +;---------------------------------------------------------------------------------------------------------------------
> +;int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
> +;---------------------------------------------------------------------------------------------------------------------
> +INIT_XMM sse4
> +cglobal psyCost_ss_4x4, 4, 5, 8
> +
> +    add             r1, r1
> +    lea             r4, [3 * r1]
> +    movddup         m0, [r0]
> +    movddup         m1, [r0 + r1]
> +    movddup         m2, [r0 + r1 * 2]
> +    movddup         m3, [r0 + r4]
> +
> +    pabsw           m4, m0
> +    pabsw           m5, m1
> +    paddw           m5, m4
> +    pabsw           m4, m2
> +    paddw           m5, m4
> +    pabsw           m4, m3
> +    paddw           m5, m4
> +    pmaddwd         m5, [pw_1]
> +    psrldq          m4, m5, 4
> +    paddd           m5, m4
> +    psrld           m6, m5, 2
> +
> +    mova            m4, [hmul_8w]
> +    pmaddwd         m0, m4
> +    pmaddwd         m1, m4
> +    pmaddwd         m2, m4
> +    pmaddwd         m3, m4
> +
> +    psrldq          m4, m0, 4
> +    psubd           m5, m0, m4
> +    paddd           m0, m4
> +    shufps          m0, m5, 10001000b
> +
> +    psrldq          m4, m1, 4
> +    psubd           m5, m1, m4
> +    paddd           m1, m4
> +    shufps          m1, m5, 10001000b
> +
> +    psrldq          m4, m2, 4
> +    psubd           m5, m2, m4
> +    paddd           m2, m4
> +    shufps          m2, m5, 10001000b
> +
> +    psrldq          m4, m3, 4
> +    psubd           m5, m3, m4
> +    paddd           m3, m4
> +    shufps          m3, m5, 10001000b
> +
> +    mova            m4, m0
> +    paddd           m0, m1
> +    psubd           m1, m4
> +    mova            m4, m2
> +    paddd           m2, m3
> +    psubd           m3, m4
> +    mova            m4, m0
> +    paddd           m0, m2
> +    psubd           m2, m4
> +    mova            m4, m1
> +    paddd           m1, m3
> +    psubd           m3, m4
> +
> +    pabsd           m0, m0
> +    pabsd           m2, m2
> +    pabsd           m1, m1
> +    pabsd           m3, m3
> +    paddd           m0, m2
> +    paddd           m1, m3
> +    paddd           m0, m1
> +    movhlps         m1, m0
> +    paddd           m0, m1
> +    psrldq          m1, m0, 4
> +    paddd           m0, m1
> +    psrld           m0, 1
> +    psubd           m7, m0, m6
> +
> +    add             r3, r3
> +    lea             r4, [3 * r3]
> +    movddup         m0, [r2]
> +    movddup         m1, [r2 + r3]
> +    movddup         m2, [r2 + r3 * 2]
> +    movddup         m3, [r2 + r4]
> +
> +    pabsw           m4, m0
> +    pabsw           m5, m1
> +    paddw           m5, m4
> +    pabsw           m4, m2
> +    paddw           m5, m4
> +    pabsw           m4, m3
> +    paddw           m5, m4
> +    pmaddwd         m5, [pw_1]
> +    psrldq          m4, m5, 4
> +    paddd           m5, m4
> +    psrld           m6, m5, 2
> +
> +    mova            m4, [hmul_8w]
> +    pmaddwd         m0, m4
> +    pmaddwd         m1, m4
> +    pmaddwd         m2, m4
> +    pmaddwd         m3, m4
> +
> +    psrldq          m4, m0, 4
> +    psubd           m5, m0, m4
> +    paddd           m0, m4
> +    shufps          m0, m5, 10001000b
> +
> +    psrldq          m4, m1, 4
> +    psubd           m5, m1, m4
> +    paddd           m1, m4
> +    shufps          m1, m5, 10001000b
> +
> +    psrldq          m4, m2, 4
> +    psubd           m5, m2, m4
> +    paddd           m2, m4
> +    shufps          m2, m5, 10001000b
> +
> +    psrldq          m4, m3, 4
> +    psubd           m5, m3, m4
> +    paddd           m3, m4
> +    shufps          m3, m5, 10001000b
> +
> +    mova            m4, m0
> +    paddd           m0, m1
> +    psubd           m1, m4
> +    mova            m4, m2
> +    paddd           m2, m3
> +    psubd           m3, m4
> +    mova            m4, m0
> +    paddd           m0, m2
> +    psubd           m2, m4
> +    mova            m4, m1
> +    paddd           m1, m3
> +    psubd           m3, m4
> +
> +    pabsd           m0, m0
> +    pabsd           m2, m2
> +    pabsd           m1, m1
> +    pabsd           m3, m3
> +    paddd           m0, m2
> +    paddd           m1, m3
> +    paddd           m0, m1
> +    movhlps         m1, m0
> +    paddd           m0, m1
> +    psrldq          m1, m0, 4
> +    paddd           m0, m1
> +    psrld           m0, 1
> +    psubd           m0, m6
> +    psubd           m7, m0
> +    pabsd           m0, m7
> +    movd            eax, m0
> +    RET
> diff -r c99e1a309bd1 -r 0f4b677cea64 source/common/x86/pixel.h
> --- a/source/common/x86/pixel.h	Fri Jan 09 13:09:39 2015 +0530
> +++ b/source/common/x86/pixel.h	Fri Jan 09 13:26:21 2015 +0530
> @@ -223,6 +223,7 @@
>  int x265_psyCost_pp_16x16_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
>  int x265_psyCost_pp_32x32_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
>  int x265_psyCost_pp_64x64_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
> +int x265_psyCost_ss_4x4_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
>  
>  #undef DECL_PIXELS
>  #undef DECL_HEVC_SSD
> diff -r c99e1a309bd1 -r 0f4b677cea64 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp	Fri Jan 09 13:09:39 2015 +0530
> +++ b/source/test/pixelharness.cpp	Fri Jan 09 13:26:21 2015 +0530
> @@ -1089,6 +1089,28 @@
>      return true;
>  }
>  
> +bool PixelHarness::check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt)
> +{
> +    int j = 0, index1, index2, optres, refres;
> +    intptr_t stride = STRIDE;
> +
> +    for (int i = 0; i < ITERS; i++)
> +    {
> +        index1 = rand() % TEST_CASES;
> +        index2 = rand() % TEST_CASES;
> +        optres = (int)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
> +        refres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
> +
> +        if (optres != refres)
> +            return false;
> +
> +        reportfail();
> +        j += INCR;
> +    }
> +
> +    return true;
> +}
> +
>  bool PixelHarness::check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt)
>  {
>      ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
> @@ -1470,6 +1492,15 @@
>                  return false;
>              }
>          }
> +
> +        if (opt.psy_cost_ss[i])
> +        {
> +            if (!check_psyCost_ss(ref.psy_cost_ss[i], opt.psy_cost_ss[i]))
> +            {
> +                printf("\npsy_cost_ss[%dx%d] failed!\n", 4 << i, 4 << i);
> +                return false;
> +            }
> +        }
>      }
>  
>      if (opt.weight_pp)
> @@ -1862,6 +1893,12 @@
>              HEADER("psy_cost_pp[%dx%d]", 4 << i, 4 << i);
>              REPORT_SPEEDUP(opt.psy_cost_pp[i], ref.psy_cost_pp[i], pbuf1, STRIDE, pbuf2, STRIDE);
>          }
> +
> +        if (opt.psy_cost_ss[i])
> +        {
> +            HEADER("psy_cost_ss[%dx%d]", 4 << i, 4 << i);
> +            REPORT_SPEEDUP(opt.psy_cost_ss[i], ref.psy_cost_ss[i], sbuf1, STRIDE, sbuf2, STRIDE);
> +        }
>      }
>  
>      if (opt.weight_pp)
> diff -r c99e1a309bd1 -r 0f4b677cea64 source/test/pixelharness.h
> --- a/source/test/pixelharness.h	Fri Jan 09 13:09:39 2015 +0530
> +++ b/source/test/pixelharness.h	Fri Jan 09 13:26:21 2015 +0530
> @@ -101,6 +101,7 @@
>      bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
>      bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
>      bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
> +    bool check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);
>      bool check_calSign(sign_t ref, sign_t opt);
>  
>  public:
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho


More information about the x265-devel mailing list