[x265] [PATCH] asm: Unit test code for pixelsub_ps function

Steve Borho steve at borho.org
Wed Nov 13 00:20:25 CET 2013


On Tue, Nov 12, 2013 at 7:41 AM, <murugan at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Murugan Vairavel <murugan at multicorewareinc.com>
> # Date 1384263623 -19800
> #      Tue Nov 12 19:10:23 2013 +0530
> # Node ID b1e0fe97bbfa7bf367d7318f057690c64f1f1f19
> # Parent  7a8118d07276312b2971b292d689805074abd28a
> asm: Unit test code for pixelsub_ps function
>

You need to address Min's comments on the asm patch.


>
> diff -r 7a8118d07276 -r b1e0fe97bbfa source/common/pixel.cpp
> --- a/source/common/pixel.cpp   Tue Nov 12 17:06:34 2013 +0530
> +++ b/source/common/pixel.cpp   Tue Nov 12 19:10:23 2013 +0530
> @@ -778,6 +778,22 @@
>          b += strideb;
>      }
>  }
> +
> +template<int bx, int by>
> +void pixel_sub_ps_c(int16_t *a, intptr_t dstride, pixel *b0, pixel *b1,
> intptr_t sstride0, intptr_t sstride1)
> +{
> +    for (int y = 0; y < by; y++)
> +    {
> +        for (int x = 0; x < bx; x++)
> +        {
> +            a[x] = (int16_t)(b0[x] - b1[x]);
> +        }
> +
> +        b0 += sstride0;
> +        b1 += sstride1;
> +        a += dstride;
> +    }
> +}
>  }  // end anonymous namespace
>
>  namespace x265 {
> @@ -821,12 +837,14 @@
>  #define CHROMA(W, H) \
>      p.chroma_copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
>      p.chroma_copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
> -    p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;
> +    p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;\
> +    p.chroma_sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>;
>
>  #define LUMA(W, H) \
>      p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
>      p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
> -    p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;
> +    p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;\
> +    p.luma_sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>;
>
>      LUMA(4, 4);
>      LUMA(8, 8);
> diff -r 7a8118d07276 -r b1e0fe97bbfa source/common/primitives.h
> --- a/source/common/primitives.h        Tue Nov 12 17:06:34 2013 +0530
> +++ b/source/common/primitives.h        Tue Nov 12 19:10:23 2013 +0530
> @@ -207,6 +207,8 @@
>  typedef void (*copy_sp_t)(pixel *dst, intptr_t dstStride, int16_t *src,
> intptr_t srcStride);
>  typedef void (*copy_ps_t)(int16_t *dst, intptr_t dstStride, pixel *src,
> intptr_t srcStride);
>
> +typedef void (*pixel_sub_ps_t)(int16_t *dst, intptr_t dstride, pixel
> *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
> +
>  /* Define a structure containing function pointers to optimized encoder
>   * primitives.  Each pointer can reference either an assembly routine,
>   * a vectorized primitive, or a C function. */
> @@ -237,6 +239,9 @@
>      copy_ps_t       luma_copy_ps[NUM_LUMA_PARTITIONS];
>      copy_ps_t       chroma_copy_ps[NUM_CHROMA_PARTITIONS];
>
> +    pixel_sub_ps_t  luma_sub_ps[NUM_LUMA_PARTITIONS];
> +    pixel_sub_ps_t  chroma_sub_ps[NUM_CHROMA_PARTITIONS];
> +
>      ipfilter_ps_t   ipfilter_ps[NUM_IPFILTER_P_S];
>      ipfilter_sp_t   ipfilter_sp[NUM_IPFILTER_S_P];
>      ipfilter_ss_t   ipfilter_ss[NUM_IPFILTER_S_S];
> diff -r 7a8118d07276 -r b1e0fe97bbfa source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp      Tue Nov 12 17:06:34 2013
> +0530
> +++ b/source/common/x86/asm-primitives.cpp      Tue Nov 12 19:10:23 2013
> +0530
> @@ -133,7 +133,8 @@
>
>  #define SETUP_CHROMA_FUNC_DEF(W, H, cpu) \
>      p.chroma_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ##
> W ## x ## H ## cpu; \
> -    p.chroma_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W
> ## x ## H ## cpu;
> +    p.chroma_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W
> ## x ## H ## cpu;\
> +    p.chroma_sub_ps[CHROMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ##
> x ## H ## cpu;
>
>  #define SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(W, H, cpu) \
>      p.chroma_copy_pp[CHROMA_ ## W ## x ## H] = x265_blockcopy_pp_ ## W ##
> x ## H ## cpu;
> @@ -194,7 +195,8 @@
>      p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ##
> x ## H ## cpu; \
>      p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ##
> x ## H ## cpu; \
>      p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ##
> x ## H ## cpu; \
> -    p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ##
> x ## H ## cpu;
> +    p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ##
> x ## H ## cpu;\
> +    p.luma_sub_ps[LUMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ##
> H ## cpu;
>
>  #define SETUP_LUMA_BLOCKCOPY_FUNC_DEF(W, H, cpu) \
>      p.luma_copy_pp[LUMA_ ## W ## x ## H] = x265_blockcopy_pp_ ## W ## x
> ## H ## cpu;
> diff -r 7a8118d07276 -r b1e0fe97bbfa source/common/x86/pixel.h
> --- a/source/common/x86/pixel.h Tue Nov 12 17:06:34 2013 +0530
> +++ b/source/common/x86/pixel.h Tue Nov 12 19:10:23 2013 +0530
> @@ -266,11 +266,77 @@
>  DECL_ADS(2, avx2)
>  DECL_ADS(1, avx2)
>
> +#define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
> +    void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t *dest, intptr_t
> destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t
> srcstride1);
> +
> +#define CHROMA_PIXELSUB_DEF(cpu) \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 4, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 2, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(2, 4, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 8, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 4, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 8, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 6, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(6, 8, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 2, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(2, 8, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 16, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 8, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 16, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 12, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(12, 16, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 4, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 16, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 32, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 16, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 32, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 24, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(24, 32, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 8, cpu); \
> +    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 32, cpu);
> +
> +#define SETUP_LUMA_PIXELSUB_PS_FUNC(W, H, cpu) \
> +    void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t *dest, intptr_t
> destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t
> srcstride1);
> +
> +#define LUMA_PIXELSUB_DEF(cpu) \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(4,   4, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(8,   8, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(8,   4, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(4,   8, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(16, 16, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(16,  8, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(8,  16, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(16, 12, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(12, 16, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(16,  4, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(4,  16, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(32, 32, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(32, 16, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(16, 32, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(32, 24, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(24, 32, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(32,  8, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(8,  32, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(64, 64, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(64, 32, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(32, 64, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(64, 48, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(48, 64, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(64, 16, cpu); \
> +    SETUP_LUMA_PIXELSUB_PS_FUNC(16, 64, cpu);
> +
> +CHROMA_PIXELSUB_DEF(_sse4);
> +LUMA_PIXELSUB_DEF(_sse4);
> +
>  #undef DECL_PIXELS
>  #undef DECL_SUF
>  #undef DECL_HEVC_SSD
>  #undef DECL_X1
>  #undef DECL_X4
>  #undef DECL_ADS
> +#undef SETUP_CHROMA_PIXELSUB_PS_FUNC
> +#undef SETUP_LUMA_PIXELSUB_PS_FUNC
> +#undef CHROMA_PIXELSUB_DEF
> +#undef LUMA_PIXELSUB_DEF
>
>  #endif // ifndef X265_I386_PIXEL_H
> diff -r 7a8118d07276 -r b1e0fe97bbfa source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp      Tue Nov 12 17:06:34 2013 +0530
> +++ b/source/test/pixelharness.cpp      Tue Nov 12 19:10:23 2013 +0530
> @@ -586,6 +586,29 @@
>      return true;
>  }
>
> +bool PixelHarness::check_pixel_sub_ps(pixel_sub_ps_t ref, pixel_sub_ps_t
> opt)
> +{
> +    ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
> +    ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
> +
> +    memset(ref_dest, 0xCD, sizeof(ref_dest));
> +    memset(opt_dest, 0xCD, sizeof(opt_dest));
> +
> +    int j = 0;
> +    for (int i = 0; i < 1; i++)
> +    {
> +        opt(opt_dest, 64, pbuf2 + j, pbuf1 + j, STRIDE, STRIDE);
> +        ref(ref_dest, 64, pbuf2 + j, pbuf1 + j, STRIDE, STRIDE);
> +
> +        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
> +            return false;
> +
> +        j += INCR;
> +    }
> +
> +    return true;
> +}
> +
>  bool PixelHarness::testPartition(int part, const EncoderPrimitives& ref,
> const EncoderPrimitives& opt)
>  {
>      if (opt.satd[part])
> @@ -722,6 +745,24 @@
>              return false;
>          }
>      }
> +
> +    if (opt.luma_sub_ps[part])
> +    {
> +        if (!check_pixel_sub_ps(ref.luma_sub_ps[part],
> opt.luma_sub_ps[part]))
> +        {
> +            printf("luma_sub_ps[%s] failed\n", lumaPartStr[part]);
> +            return false;
> +        }
> +    }
> +
> +    if (opt.chroma_sub_ps[part])
> +    {
> +        if (!check_pixel_sub_ps(ref.chroma_sub_ps[part],
> opt.chroma_sub_ps[part]))
> +        {
> +            printf("chroma_sub_ps[%s] failed\n", chromaPartStr[part]);
> +            return false;
> +        }
> +    }
>      return true;
>  }
>
> @@ -968,6 +1009,18 @@
>          printf("ccpy_ps[%s]", chromaPartStr[part]);
>          REPORT_SPEEDUP(opt.chroma_copy_ps[part],
> ref.chroma_copy_ps[part], sbuf1, 64, pbuf1, 128);
>      }
> +
> +    if (opt.luma_sub_ps[part])
> +    {
> +        printf("luma_sub_ps[%s]", lumaPartStr[part]);
> +        REPORT_SPEEDUP(opt.luma_sub_ps[part], ref.luma_sub_ps[part],
> (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
> +    }
> +
> +    if (opt.chroma_sub_ps[part])
> +    {
> +        printf("chroma_sub_ps[%s]", chromaPartStr[part]);
> +        REPORT_SPEEDUP(opt.chroma_sub_ps[part], ref.chroma_sub_ps[part],
> (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
> +    }
>  }
>
>  void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const
> EncoderPrimitives& opt)
> diff -r 7a8118d07276 -r b1e0fe97bbfa source/test/pixelharness.h
> --- a/source/test/pixelharness.h        Tue Nov 12 17:06:34 2013 +0530
> +++ b/source/test/pixelharness.h        Tue Nov 12 19:10:23 2013 +0530
> @@ -60,6 +60,8 @@
>      bool check_block_copy_ps(copy_ps_t ref, copy_ps_t opt);
>
>      bool check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt);
> +
> +    bool check_pixel_sub_ps(pixel_sub_ps_t ref, pixel_sub_ps_t opt);
>  public:
>
>      PixelHarness();
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>



-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131112/5b22ac80/attachment.html>


More information about the x265-devel mailing list