[x265] [PATCH] weighted prediction pixel, interface simplification

Mon Oct 20 19:32:10 CEST 2014

On 10/20, praveen at multicorewareinc.com wrote:
> # HG changeset patch
> # User Praveen Tiwari
> # Date 1413793389 -19800
> # Node ID 3366be6ef59eec3d3ca69ed52942708b5d1b3bc6
> # Parent  1e09d0395826bdd01a4b4e46569853a2f04b9e95
> weighted prediction pixel, interface simplification

Queued, thanks

> diff -r 1e09d0395826 -r 3366be6ef59e source/common/pixel.cpp
> --- a/source/common/pixel.cpp	Sun Oct 19 20:53:36 2014 -0500
> +++ b/source/common/pixel.cpp	Mon Oct 20 13:53:09 2014 +0530
> @@ -640,11 +640,13 @@
>      }
>  }
>  
> -void weight_pp_c(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
> +void weight_pp_c(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
>  {
>      int x, y;
>  
>      X265_CHECK(!(width & 15), "weightp alignment error\n");
> +    X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");
> +    X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");
>  
>      for (y = 0; y <= height - 1; y++)
>      {
> @@ -656,8 +658,8 @@
>              x++;
>          }
>  
> -        src += srcStride;
> -        dst += dstStride;
> +        src += stride;
> +        dst += stride;
>      }
>  }
>  
> diff -r 1e09d0395826 -r 3366be6ef59e source/common/primitives.h
> --- a/source/common/primitives.h	Sun Oct 19 20:53:36 2014 -0500
> +++ b/source/common/primitives.h	Mon Oct 20 13:53:09 2014 +0530
> @@ -168,7 +168,7 @@
>  typedef void (*dequant_normal_t)(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
>  typedef int  (*count_nonzero_t)(const int16_t *quantCoeff, int numCoeff);
>  
> -typedef void (*weightp_pp_t)(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
> +typedef void (*weightp_pp_t)(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
>  typedef void (*weightp_sp_t)(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
>  typedef void (*scale_t)(pixel *dst, pixel *src, intptr_t stride);
>  typedef void (*downscale_t)(pixel *src0, pixel *dstf, pixel *dsth, pixel *dstv, pixel *dstc,
> diff -r 1e09d0395826 -r 3366be6ef59e source/common/x86/pixel-util.h
> --- a/source/common/x86/pixel-util.h	Sun Oct 19 20:53:36 2014 -0500
> +++ b/source/common/x86/pixel-util.h	Mon Oct 20 13:53:09 2014 +0530
> @@ -57,7 +57,7 @@
>  void x265_dequant_normal_avx2(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
>  int x265_count_nonzero_ssse3(const int16_t *quantCoeff, int numCoeff);
>  
> -void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
> +void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
>  void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
>  
>  void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t * pix1, intptr_t stride1,
> diff -r 1e09d0395826 -r 3366be6ef59e source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm	Sun Oct 19 20:53:36 2014 -0500
> +++ b/source/common/x86/pixel-util8.asm	Mon Oct 20 13:53:09 2014 +0530
> @@ -1298,36 +1298,29 @@
>  
>  
>  ;-----------------------------------------------------------------------------------------------------------------------------------------------
> -;void weight_pp(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
> +;void weight_pp(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
>  ;-----------------------------------------------------------------------------------------------------------------------------------------------
>  INIT_XMM sse4
>  cglobal weight_pp, 6, 7, 6
>  
> +    shl         r5d, 6      ; r5d = w0 << 6
>      mov         r6d, r6m
> -    shl         r6d, 6
> -    movd        m0, r6d         ; m0 = [w0<<6]
> -
> -    movd        m1, r7m         ; m1 = [round]
> -    punpcklwd   m0, m1          ; assuming both (w0<<6) and round are using maximum of 16 bits each.
> -    pshufd      m0, m0, 0       ; m0 = [w0<<6 round]
> -
> -    movd        m1, r8m
> -
> -    movd        m2, r9m
> +    shl         r6d, 16
> +    or          r6d, r5d    ; assuming both (w0<<6) and round are using maximum of 16 bits each.
> +    movd        m0, r6d
> +    pshufd      m0, m0, 0   ; m0 = [w0<<6, round]
> +    movd        m1, r7m
> +    movd        m2, r8m
>      pshufd      m2, m2, 0
> -
>      mova        m5, [pw_1]
> -
> -    sub         r2d, r4d
> -    sub         r3d, r4d
> +    sub         r2d, r3d
> +    shr         r3d, 4
>  
>  .loopH:
> -    mov         r6d, r4d
> -    shr         r6d, 4
> +    mov         r5d, r3d
> +
>  .loopW:
> -    movh        m4, [r0]
> -    pmovzxbw    m4, m4
> -
> +    pmovzxbw    m4, [r0]
>      punpcklwd   m3, m4, m5
>      pmaddwd     m3, m0
>      psrad       m3, m1
> @@ -1340,12 +1333,9 @@
>  
>      packssdw    m3, m4
>      packuswb    m3, m3
> -
>      movh        [r1], m3
>  
> -    movh        m4, [r0 + 8]
> -    pmovzxbw    m4, m4
> -
> +    pmovzxbw    m4, [r0 + 8]
>      punpcklwd   m3, m4, m5
>      pmaddwd     m3, m0
>      psrad       m3, m1
> @@ -1358,21 +1348,19 @@
>  
>      packssdw    m3, m4
>      packuswb    m3, m3
> -
>      movh        [r1 + 8], m3
>  
>      add         r0, 16
>      add         r1, 16
>  
> -    dec         r6d
> +    dec         r5d
>      jnz         .loopW
>  
>      lea         r0, [r0 + r2]
> -    lea         r1, [r1 + r3]
> -
> -    dec         r5d
> +    lea         r1, [r1 + r2]
> +
> +    dec         r4d
>      jnz         .loopH
> -
>      RET
>  
>  ;-------------------------------------------------------------------------------------------------------------------------------------------------
> diff -r 1e09d0395826 -r 3366be6ef59e source/encoder/reference.cpp
> --- a/source/encoder/reference.cpp	Sun Oct 19 20:53:36 2014 -0500
> +++ b/source/encoder/reference.cpp	Mon Oct 20 13:53:09 2014 +0530
> @@ -92,7 +92,7 @@
>      // Computing weighted CU rows
>      int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
>      int padwidth = (width + 15) & ~15;  // weightp assembly needs even 16 byte widths
> -    primitives.weight_pp(src, dst, lumaStride, lumaStride, padwidth, height,
> +    primitives.weight_pp(src, dst, lumaStride, padwidth, height,
>                           weight, round << correction, shift + correction, offset);
>  
>      // Extending Left & Right
> diff -r 1e09d0395826 -r 3366be6ef59e source/encoder/slicetype.cpp
> --- a/source/encoder/slicetype.cpp	Sun Oct 19 20:53:36 2014 -0500
> +++ b/source/encoder/slicetype.cpp	Mon Oct 20 13:53:09 2014 +0530
> @@ -1383,7 +1383,7 @@
>          int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
>          int widthHeight = (int)stride;
>  
> -        primitives.weight_pp(ref->buffer[0], m_wbuffer[0], stride, widthHeight, widthHeight, m_paddedLines,
> +        primitives.weight_pp(ref->buffer[0], m_wbuffer[0], stride, widthHeight, m_paddedLines,
>                               scale, round << correction, denom + correction, offset);
>          src = m_weightedRef.fpelPlane;
>      }
> @@ -1481,7 +1481,7 @@
>          int widthHeight = (int)stride;
>  
>          for (int i = 0; i < 4; i++)
> -            primitives.weight_pp(ref->buffer[i], m_wbuffer[i], stride, widthHeight, widthHeight, m_paddedLines,
> +            primitives.weight_pp(ref->buffer[i], m_wbuffer[i], stride, widthHeight, m_paddedLines,
>                                   scale, round << correction, denom + correction, offset);
>  
>          m_weightedRef.isWeighted = true;
> diff -r 1e09d0395826 -r 3366be6ef59e source/encoder/weightPrediction.cpp
> --- a/source/encoder/weightPrediction.cpp	Sun Oct 19 20:53:36 2014 -0500
> +++ b/source/encoder/weightPrediction.cpp	Mon Oct 20 13:53:09 2014 +0530
> @@ -186,7 +186,7 @@
>          int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */
>          int pwidth = ((width + 15) >> 4) << 4;
>  
> -        primitives.weight_pp(ref, weightTemp, stride, stride, pwidth, height,
> +        primitives.weight_pp(ref, weightTemp, stride, pwidth, height,
>                               weight, round << correction, denom + correction, offset);
>          ref = weightTemp;
>      }
> diff -r 1e09d0395826 -r 3366be6ef59e source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp	Sun Oct 19 20:53:36 2014 -0500
> +++ b/source/test/pixelharness.cpp	Mon Oct 20 13:53:09 2014 +0530
> @@ -334,8 +334,8 @@
>      for (int i = 0; i < ITERS; i++)
>      {
>          int index = i % TEST_CASES;
> -        checked(opt, pixel_test_buff[index] + j, opt_dest, stride, stride, width, height, w0, round, shift, offset);
> -        ref(pixel_test_buff[index] + j, ref_dest, stride, stride, width, height, w0, round, shift, offset);
> +        checked(opt, pixel_test_buff[index] + j, opt_dest, stride, width, height, w0, round, shift, offset);
> +        ref(pixel_test_buff[index] + j, ref_dest, stride, width, height, w0, round, shift, offset);
>  
>          if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
>              return false;
> @@ -1775,7 +1775,7 @@
>      if (opt.weight_pp)
>      {
>          HEADER0("weight_pp");
> -        REPORT_SPEEDUP(opt.weight_pp, ref.weight_pp, pbuf1, pbuf2, 64, 64, 32, 32, 128, 1 << 9, 10, 100);
> +        REPORT_SPEEDUP(opt.weight_pp, ref.weight_pp, pbuf1, pbuf2, 64, 32, 32, 128, 1 << 9, 10, 100);
>      }
>  
>      if (opt.weight_sp)
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho