[x265] quant: remove scaledCoeff from nquant()

Mon Aug 11 05:42:01 CEST 2014

On 08/10, Satoshi Nakagawa wrote:
> # HG changeset patch
> # User Satoshi Nakagawa <nakagawa424 at oki.com>
> # Date 1407658928 -32400
> #      Sun Aug 10 17:22:08 2014 +0900
> # Node ID d1dad09266327d40b6c2372f9916f7fcf288c2f0
> # Parent  6e4eb854220350cf0c980fc02cc11109c506585f
> quant: remove scaledCoeff from nquant()

Queued, thanks

> diff -r 6e4eb8542203 -r d1dad0926632 source/common/dct.cpp
> --- a/source/common/dct.cpp	Sat Aug 09 19:43:23 2014 -0500
> +++ b/source/common/dct.cpp	Sun Aug 10 17:22:08 2014 +0900
> @@ -795,7 +795,7 @@
>      return numSig;
>  }
>  
> -uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int32_t* scaledCoeff, int32_t* qCoef, int qBits, int add, int numCoeff)
> +uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int32_t* qCoef, int qBits, int add, int numCoeff)
>  {
>      uint32_t numSig = 0;
>  
> @@ -805,7 +805,6 @@
>          int sign  = (level < 0 ? -1 : 1);
>  
>          int tmplevel = abs(level) * quantCoeff[blockpos];
> -        scaledCoeff[blockpos] = tmplevel;
>          level = ((tmplevel + add) >> qBits);
>          if (level)
>              ++numSig;
> diff -r 6e4eb8542203 -r d1dad0926632 source/common/primitives.h
> --- a/source/common/primitives.h	Sat Aug 09 19:43:23 2014 -0500
> +++ b/source/common/primitives.h	Sun Aug 10 17:22:08 2014 +0900
> @@ -160,7 +160,7 @@
>  typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
>  typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
>  typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
> -typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *scaledCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
> +typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
>  typedef void (*dequant_scaling_t)(const int32_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
>  typedef void (*dequant_normal_t)(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
>  typedef int  (*count_nonzero_t)(const int32_t *quantCoeff, int numCoeff);
> diff -r 6e4eb8542203 -r d1dad0926632 source/common/quant.cpp
> --- a/source/common/quant.cpp	Sat Aug 09 19:43:23 2014 -0500
> +++ b/source/common/quant.cpp	Sun Aug 10 17:22:08 2014 +0900
> @@ -487,7 +487,6 @@
>   * probability models like CABAC */
>  uint32_t Quant::rdoQuant(TComDataCU* cu, coeff_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy)
>  {
> -    uint32_t trSize = 1 << log2TrSize;
>      int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
>      int scalingListType = (cu->isIntra(absPartIdx) ? 0 : 3) + ttype;
>  
> @@ -500,14 +499,13 @@
>      int32_t *qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
>  
>      int numCoeff = 1 << log2TrSize * 2;
> -    int scaledCoeff[32 * 32];
> -    uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, scaledCoeff, dstCoeff, qbits, add, numCoeff);
> +    uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff);
>  
>      X265_CHECK((int)numSig == primitives.count_nonzero(dstCoeff, numCoeff), "numSig differ\n");
>      if (!numSig)
>          return 0;
>  
> -    x265_emms();
> +    uint32_t trSize = 1 << log2TrSize;
>  
>      /* unquant constants for psy-rdoq. The dequant coefficients have a (1<<4) scale applied that
>       * must be removed during unquant.  This may be larger than the QP upshift, which would turn
> diff -r 6e4eb8542203 -r d1dad0926632 source/common/x86/pixel-util.h
> --- a/source/common/x86/pixel-util.h	Sat Aug 09 19:43:23 2014 -0500
> +++ b/source/common/x86/pixel-util.h	Sun Aug 10 17:22:08 2014 +0900
> @@ -45,7 +45,7 @@
>  void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);
>  
>  uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
> -uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *scaledCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
> +uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
>  void x265_dequant_normal_sse4(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
>  int x265_count_nonzero_ssse3(const int32_t *quantCoeff, int numCoeff);
>  
> diff -r 6e4eb8542203 -r d1dad0926632 source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm	Sat Aug 09 19:43:23 2014 -0500
> +++ b/source/common/x86/pixel-util8.asm	Sun Aug 10 17:22:08 2014 +0900
> @@ -938,72 +938,63 @@
>  
>  
>  ;-----------------------------------------------------------------------------
> -; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int32_t *scaledCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
> +; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
> -cglobal nquant, 5,6,8
> -
> -    ; fill qbits
> -    movd        m4, r4d         ; m4 = qbits
> -
> -    ; fill offset
> -    movd        m5, r5m
> -    pshufd      m5, m5, 0       ; m5 = add
> -
> -    mov         r4d, r6m
> +cglobal nquant, 4,5,8
> +    movd        m6, r4m
> +    mov         r4d, r5m
> +    pxor        m7, m7          ; m7 = numZero
> +    movd        m5, r3d         ; m5 = qbits
> +    pshufd      m6, m6, 0       ; m6 = add
> +    mov         r3d, r4d        ; r3 = numCoeff
>      shr         r4d, 3
> -    pxor        m7, m7          ; m7 = numZero
>  .loop:
> -    ; 4 coeff
>      movu        m0, [r0]        ; m0 = level
> -    pxor        m1, m1
> -    pcmpgtd     m1, m0          ; m1 = sign
> +    movu        m1, [r0 + 16]   ; m1 = level
>      movu        m2, [r1]        ; m2 = qcoeff
> +    movu        m3, [r1 + 16]   ; m3 = qcoeff
> +    add         r0, 32
> +    add         r1, 32
> +
> +    pxor        m4, m4
> +    pcmpgtd     m4, m0          ; m4 = sign
>      pabsd       m0, m0
>      pmulld      m0, m2          ; m0 = tmpLevel1
> -    movu        [r2], m0        ; m0 = scaledCoeff
> -    paddd       m2, m0, m5
> -    psrad       m2, m4          ; m2 = level1
> -    pxor        m0, m0
> -    pcmpeqd     m0, m2          ; m0 = mask4
> -    psubd       m7, m0
> -
> -    pxor        m2, m1
> -    psubd       m2, m1
> -    packssdw    m2, m2
> -    pmovsxwd    m2, m2
> -    movu        [r3], m2
> -    ; 4 coeff
> -    movu        m0, [r0 + 16]   ; m0 = level
> -    pxor        m1, m1
> -    pcmpgtd     m1, m0          ; m1 = sign
> -    movu        m2, [r1 + 16]   ; m2 = qcoeff
> -    pabsd       m0, m0
> -    pmulld      m0, m2          ; m0 = tmpLevel1
> -    movu        [r2 + 16], m0   ; m0 = scaledCoeff
> -    paddd       m2, m0, m5
> -    psrad       m2, m4          ; m2 = level1
> -    pxor        m0, m0
> -    pcmpeqd     m0, m2          ; m0 = mask4
> -    psubd       m7, m0
> -
> -    pxor        m2, m1
> -    psubd       m2, m1
> -    packssdw    m2, m2
> -    pmovsxwd    m2, m2
> -    movu        [r3 + 16], m2
> -
> -    add         r0, 32
> -    add         r1, 32
> +    paddd       m0, m6
> +    psrad       m0, m5          ; m0 = level1
> +    pxor        m0, m4
> +    psubd       m0, m4
> +
> +    pxor        m4, m4
> +    pcmpgtd     m4, m1          ; m4 = sign
> +    pabsd       m1, m1
> +    pmulld      m1, m3          ; m1 = tmpLevel1
> +    paddd       m1, m6
> +    psrad       m1, m5          ; m1 = level1
> +    pxor        m1, m4
> +    psubd       m1, m4
> +
> +    packssdw    m0, m0
> +    packssdw    m1, m1
> +    pmovsxwd    m0, m0
> +    pmovsxwd    m1, m1
> +
> +    movu        [r2], m0
> +    movu        [r2 + 16], m1
>      add         r2, 32
> -    add         r3, 32
> -
>      dec         r4d
> -    jnz        .loop
> -
> -    phaddd      m7, m7
> -    phaddd      m7, m7
> -    mov         eax, r6m
> +
> +    packssdw    m0, m1
> +    pxor        m4, m4
> +    pcmpeqw     m0, m4
> +    psubw       m7, m0
> +
> +    jnz         .loop
> +
> +    packuswb    m7, m7
> +    psadbw      m7, m4
> +    mov         eax, r3d
>      movd        r4d, m7
>      sub         eax, r4d        ; numSig
>  
> diff -r 6e4eb8542203 -r d1dad0926632 source/test/mbdstharness.cpp
> --- a/source/test/mbdstharness.cpp	Sat Aug 09 19:43:23 2014 -0500
> +++ b/source/test/mbdstharness.cpp	Sun Aug 10 17:22:08 2014 +0900
> @@ -348,11 +348,8 @@
>          int index1 = rand() % TEST_CASES;
>          int index2 = rand() % TEST_CASES;
>  
> -        refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf5, mintbuf6, bits, valueToAdd, numCoeff);
> -        optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mintbuf4, bits, valueToAdd, numCoeff);
> -
> -        if (memcmp(mintbuf3, mintbuf5, cmp_size))
> -            return false;
> +        refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf6, bits, valueToAdd, numCoeff);
> +        optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf4, bits, valueToAdd, numCoeff);
>  
>          if (memcmp(mintbuf4, mintbuf6, cmp_size))
>              return false;
> @@ -511,7 +508,7 @@
>      if (opt.nquant)
>      {
>          printf("nquant\t\t");
> -        REPORT_SPEEDUP(opt.nquant, ref.nquant, mintbuf1, mintbuf2, mintbuf3, mintbuf4, 23, 23785, 32 * 32);
> +        REPORT_SPEEDUP(opt.nquant, ref.nquant, mintbuf1, mintbuf2, mintbuf3, 23, 23785, 32 * 32);
>      }
>  
>      if (opt.count_nonzero)
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho