[x265] [PATCH] count_nonzero primitive, downscaling quantCoeff from int32_t* to int16_t*

Wed Aug 13 22:31:29 CEST 2014

On 08/12, praveen at multicorewareinc.com wrote:
> # HG changeset patch
> # User Praveen Tiwari
> # Date 1407834530 -19800
> # Node ID bb4d44663964237e4b66af6d92b2f13dbcf4f9b9
> # Parent  8a7f4bb1d1be32fe668d410450c2e320ccae6098
> count_nonzero primitive, downscaling quantCoeff from int32_t* to int16_t*

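There's not much point in applying these patches until all of the quant
primitives are using short ints for coefficients. As-is, this will just
be a slow-down, because each call site first has to copy its int32_t
coefficients into a temporary int16_t buffer before calling count_nonzero.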

> diff -r 8a7f4bb1d1be -r bb4d44663964 source/common/dct.cpp
> --- a/source/common/dct.cpp	Tue Aug 12 01:11:39 2014 -0500
> +++ b/source/common/dct.cpp	Tue Aug 12 14:38:50 2014 +0530
> @@ -815,7 +815,7 @@
>      return numSig;
>  }
>  
> -int  count_nonzero_c(const int32_t *quantCoeff, int numCoeff)
> +int  count_nonzero_c(const int16_t *quantCoeff, int numCoeff)
>  {
>      X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
>      X265_CHECK(numCoeff > 0 && (numCoeff & 15) == 0, "numCoeff invalid %d\n", numCoeff);
> diff -r 8a7f4bb1d1be -r bb4d44663964 source/common/primitives.h
> --- a/source/common/primitives.h	Tue Aug 12 01:11:39 2014 -0500
> +++ b/source/common/primitives.h	Tue Aug 12 14:38:50 2014 +0530
> @@ -163,7 +163,7 @@
>  typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
>  typedef void (*dequant_scaling_t)(const int32_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
>  typedef void (*dequant_normal_t)(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
> -typedef int  (*count_nonzero_t)(const int32_t *quantCoeff, int numCoeff);
> +typedef int  (*count_nonzero_t)(const int16_t *quantCoeff, int numCoeff);
>  
>  typedef void (*weightp_pp_t)(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
>  typedef void (*weightp_sp_t)(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
> diff -r 8a7f4bb1d1be -r bb4d44663964 source/common/quant.cpp
> --- a/source/common/quant.cpp	Tue Aug 12 01:11:39 2014 -0500
> +++ b/source/common/quant.cpp	Tue Aug 12 14:38:50 2014 +0530
> @@ -2,6 +2,7 @@
>   * Copyright (C) 2014 x265 project
>   *
>   * Authors: Steve Borho <steve at borho.org>
> + *          Praveen Kumar Tiwari <praveen at multicorewareinc.com>
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
> @@ -463,7 +464,17 @@
>          const uint32_t sizeIdx = log2TrSize - 2;
>          int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra;
>  
> -        X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 << log2TrSize * 2), "numSig differ\n");
> +        /* This section of code is to safely convert int32_t coefficients to int16_t, once the caller function is
> +         * optimize to take coefficients as int16_t*, it will be cleanse.*/
> +        int numCoeff = (1 << (log2TrSize * 2));
> +        assert(numCoeff <= 1024);
> +        ALIGN_VAR_16(int16_t, qCoeff[32 * 32]);
> +        for (int i = 0; i < numCoeff; i++)
> +        {
> +            qCoeff[i] = (coeff[i] & 0xFFFF);
> +        }
> +
> +        X265_CHECK((int)numSig == primitives.count_nonzero(qCoeff, 1 << log2TrSize * 2), "numSig differ\n");
>  
>          // DC only
>          if (numSig == 1 && coeff[0] != 0 && !useDST)
> @@ -501,7 +512,16 @@
>      int numCoeff = 1 << log2TrSize * 2;
>      uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff);

The two conversion loops in quant.cpp (the one in the hunk above and the
one in the hunk below) are only here for an X265_CHECK statement that is
usually compiled out.  All of this code should have been wrapped within:
#if CHECKED_BUILD || _DEBUG

> -    X265_CHECK((int)numSig == primitives.count_nonzero(dstCoeff, numCoeff), "numSig differ\n");
> +    /* This section of code is to safely convert int32_t coefficients to int16_t, once the caller function is
> +     * optimize to take coefficients as int16_t*, it will be cleanse.*/
> +    assert(numCoeff <= 1024);
> +    ALIGN_VAR_16(int16_t, qCoeff[32 * 32]);
> +    for (int i = 0; i < numCoeff; i++)
> +    {
> +        qCoeff[i] = (dstCoeff[i] & 0xFFFF);
> +    }
> +
> +    X265_CHECK((int)numSig == primitives.count_nonzero(qCoeff, numCoeff), "numSig differ\n");
>      if (!numSig)
>          return 0;
>  
> diff -r 8a7f4bb1d1be -r bb4d44663964 source/common/x86/pixel-util.h
> --- a/source/common/x86/pixel-util.h	Tue Aug 12 01:11:39 2014 -0500
> +++ b/source/common/x86/pixel-util.h	Tue Aug 12 14:38:50 2014 +0530
> @@ -2,6 +2,7 @@
>   * Copyright (C) 2013 x265 project
>   *
>   * Authors: Steve Borho <steve at borho.org>
> + *          Praveen Kumar Tiwari <praveen at multicorewareinc.com>
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
> @@ -47,7 +48,7 @@
>  uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
>  uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
>  void x265_dequant_normal_sse4(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
> -int x265_count_nonzero_ssse3(const int32_t *quantCoeff, int numCoeff);
> +int x265_count_nonzero_ssse3(const int16_t *quantCoeff, int numCoeff);
>  
>  void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
>  void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
> diff -r 8a7f4bb1d1be -r bb4d44663964 source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm	Tue Aug 12 01:11:39 2014 -0500
> +++ b/source/common/x86/pixel-util8.asm	Tue Aug 12 14:38:50 2014 +0530
> @@ -3,6 +3,7 @@
>  ;*
>  ;* Authors: Min Chen <chenm003 at 163.com> <min.chen at multicorewareinc.com>
>  ;*          Nabajit Deka <nabajit at multicorewareinc.com>
> +;*          Praveen Kumar Tiwari <praveen at multicorewareinc.com>
>  ;*
>  ;* This program is free software; you can redistribute it and/or modify
>  ;* it under the terms of the GNU General Public License as published by
> @@ -1091,10 +1092,10 @@
>  
>  
>  ;-----------------------------------------------------------------------------
> -; int count_nonzero(const int32_t *quantCoeff, int numCoeff);
> +; int count_nonzero(const int16_t *quantCoeff, int numCoeff);
>  ;-----------------------------------------------------------------------------
>  INIT_XMM ssse3
> -cglobal count_nonzero, 2,2,5
> +cglobal count_nonzero, 2,2,4
>      pxor        m0, m0
>      shr         r1d, 4
>      movd        m1, r1d
> @@ -1103,12 +1104,8 @@
>  .loop:
>      mova        m2, [r0 +  0]
>      mova        m3, [r0 + 16]
> -    packssdw    m2, m3
> -    mova        m3, [r0 + 32]
> -    mova        m4, [r0 + 48]
> -    add         r0, 64
> -    packssdw    m3, m4
>      packsswb    m2, m3
> +    add         r0, 32
>      pcmpeqb     m2, m0
>      paddb       m1, m2
>      dec         r1d
> diff -r 8a7f4bb1d1be -r bb4d44663964 source/encoder/entropy.cpp
> --- a/source/encoder/entropy.cpp	Tue Aug 12 01:11:39 2014 -0500
> +++ b/source/encoder/entropy.cpp	Tue Aug 12 14:38:50 2014 +0530
> @@ -2,6 +2,7 @@
>  * Copyright (C) 2013 x265 project
>  *
>  * Authors: Steve Borho <steve at borho.org>
> +*          Praveen Kumar Tiwari <praveen at multicorewareinc.com>
>  *
>  * This program is free software; you can redistribute it and/or modify
>  * it under the terms of the GNU General Public License as published by
> @@ -1488,8 +1489,18 @@
>  {
>      uint32_t trSize = 1 << log2TrSize;
>  
> +    /* This section of code is to safely convert int32_t coefficients to int16_t, once the caller function is
> +     * optimize to take coefficients as int16_t*, it will be cleanse.*/
> +    int numCoeff = (1 << (log2TrSize << 1));
> +    assert(numCoeff <= 1024);
> +    ALIGN_VAR_16(int16_t, qCoeff[32 * 32]);
> +    for (int i = 0; i < numCoeff; i++)
> +    {
> +        qCoeff[i] = (coeff[i] & 0xFFFF);
> +    }
> +
>      // compute number of significant coefficients
> -    uint32_t numSig = primitives.count_nonzero(coeff, (1 << (log2TrSize << 1)));
> +    uint32_t numSig = primitives.count_nonzero(qCoeff, (1 << (log2TrSize << 1)));
>  
>      X265_CHECK(numSig > 0, "cbf check fail\n");
>  
> diff -r 8a7f4bb1d1be -r bb4d44663964 source/test/mbdstharness.cpp
> --- a/source/test/mbdstharness.cpp	Tue Aug 12 01:11:39 2014 -0500
> +++ b/source/test/mbdstharness.cpp	Tue Aug 12 14:38:50 2014 +0530
> @@ -366,7 +366,7 @@
>  
>  bool MBDstHarness::check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt)
>  {
> -    ALIGN_VAR_32(int32_t, qcoeff[32 * 32]);
> +    ALIGN_VAR_32(int16_t, qcoeff[32 * 32]);
>  
>      for (int i = 0; i < 4; i++)
>      {
> @@ -376,7 +376,7 @@
>  
>          for (int n = 0; n <= num; n++)
>          {
> -            memset(qcoeff, 0, num * sizeof(int32_t));
> +            memset(qcoeff, 0, num * sizeof(int16_t));
>  
>              for (int j = 0; j < n; j++)
>              {
> @@ -386,7 +386,7 @@
>                      k = (k + 11) & mask;
>                  }
>  
> -                qcoeff[k] = rand() - RAND_MAX / 2;
> +                qcoeff[k] = (int16_t)rand() - RAND_MAX / 2;
>              }
>  
>              int refval = ref(qcoeff, num);
> @@ -516,7 +516,7 @@
>          for (int i = 4; i <= 32; i <<= 1)
>          {
>              printf("count_nonzero[%dx%d]", i, i);
> -            REPORT_SPEEDUP(opt.count_nonzero, ref.count_nonzero, mbufidct, i * i)
> +            REPORT_SPEEDUP(opt.count_nonzero, ref.count_nonzero, mbuf1, i * i)
>          }
>      }
>  }
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho