[x265] [PATCH 1 of 2] asm: integrate denoise_dct

Thu Aug 14 01:15:32 CEST 2014

On 08/13, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1407976146 25200
> # Node ID 41de0838d88bf89b9156e44e31772273df24c070
> # Parent  d43e9a6a7cced5b60284c25bd987c55c522c1212
> asm: integrate denoise_dct
> 
> diff -r d43e9a6a7cce -r 41de0838d88b source/common/common.h
> --- a/source/common/common.h	Wed Aug 13 00:19:31 2014 -0500
> +++ b/source/common/common.h	Wed Aug 13 17:29:06 2014 -0700
> @@ -179,15 +179,16 @@
>  #define X265_LOG2(x)  log2(x)
>  #endif
>  
> +// NOTE: this struct MUST be aligned to 16 or 32 bytes for the asm code
>  struct NoiseReduction
>  {
> -    bool bNoiseReduction;
> -
>      /* 0 = luma 4x4, 1 = luma 8x8, 2 = luma 16x16, 3 = luma 32x32
>       * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32 */
>      uint16_t offsetDenoise[8][1024];
>      uint32_t residualSum[8][1024];
>      uint32_t count[8];
> +
> +    bool bNoiseReduction;

Since the frame encoder is now malloc'ing this structure (I presume for
memory alignment), the bNoiseReduction flag is unnecessary; we can simply
check whether the pointer is NULL.  This also handles malloc failure
cleanly.  I've queued a version of the patch modified in this way.

>  };
>  
>  /* defined in common.cpp */
> diff -r d43e9a6a7cce -r 41de0838d88b source/common/dct.cpp
> --- a/source/common/dct.cpp	Wed Aug 13 00:19:31 2014 -0500
> +++ b/source/common/dct.cpp	Wed Aug 13 17:29:06 2014 -0700
> @@ -845,6 +845,20 @@
>  
>      return numSig;
>  }
> +
> +void denoiseDct_c(coeff_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff)
> +{
> +    for (int i = 0; i < numCoeff; i++)
> +    {
> +        int level = dctCoef[i];
> +        int sign = level >> 31;
> +        level = (level + sign) ^ sign;
> +        resSum[i] += level;
> +        level -= offset[i];
> +        dctCoef[i] = level < 0 ? 0 : (level ^ sign) - sign;
> +    }
> +}
> +
>  }  // closing - anonymous file-static namespace
>  
>  namespace x265 {
> @@ -867,6 +881,7 @@
>      p.idct[IDCT_16x16] = idct16_c;
>      p.idct[IDCT_32x32] = idct32_c;
>      p.count_nonzero = count_nonzero_c;
> +    p.denoiseDct = denoiseDct_c;
>  
>      p.cvt16to32_cnt[BLOCK_4x4] = conv16to32_count<4>;
>      p.cvt16to32_cnt[BLOCK_8x8] = conv16to32_count<8>;
> diff -r d43e9a6a7cce -r 41de0838d88b source/common/primitives.h
> --- a/source/common/primitives.h	Wed Aug 13 00:19:31 2014 -0500
> +++ b/source/common/primitives.h	Wed Aug 13 17:29:06 2014 -0700
> @@ -7,6 +7,7 @@
>   *          Mahesh Pittala <mahesh at multicorewareinc.com>
>   *          Rajesh Paulraj <rajesh at multicorewareinc.com>
>   *          Praveen Kumar Tiwari <praveen at multicorewareinc.com>
> + *          Min Chen <chenm003 at 163.com>
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
> @@ -156,6 +157,8 @@
>  
>  typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);
>  typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);
> +typedef void (*denoiseDct_t)(coeff_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff);
> +
>  typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
>  typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
>  typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
> @@ -264,6 +267,7 @@
>      dequant_scaling_t dequant_scaling;
>      dequant_normal_t dequant_normal;
>      count_nonzero_t count_nonzero;
> +    denoiseDct_t    denoiseDct;
>  
>      calcresidual_t  calcresidual[NUM_SQUARE_BLOCKS];
>      calcrecon_t     calcrecon[NUM_SQUARE_BLOCKS];
> diff -r d43e9a6a7cce -r 41de0838d88b source/common/quant.cpp
> --- a/source/common/quant.cpp	Wed Aug 13 00:19:31 2014 -0500
> +++ b/source/common/quant.cpp	Wed Aug 13 17:29:06 2014 -0700
> @@ -49,19 +49,6 @@
>      return y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y)
>  }
>  
> -inline void denoiseDct(coeff_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff)
> -{
> -    for (int i = 0; i < numCoeff; i++)
> -    {
> -        int level = dctCoef[i];
> -        int sign = level >> 31;
> -        level = (level + sign) ^ sign;
> -        resSum[i] += level;
> -        level -= offset[i];
> -        dctCoef[i] = level < 0 ? 0 : (level ^ sign) - sign;
> -    }
> -}
> -
>  inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
>  {
>      X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n");
> @@ -380,7 +367,7 @@
>              /* denoise is not applied to intra residual, so DST can be ignored */
>              int cat = sizeIdx + 4 * !isLuma;
>              int numCoeff = 1 << log2TrSize * 2;
> -            denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
> +            primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
>              m_nr->count[cat]++;
>          }
>      }
> diff -r d43e9a6a7cce -r 41de0838d88b source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp	Wed Aug 13 00:19:31 2014 -0500
> +++ b/source/common/x86/asm-primitives.cpp	Wed Aug 13 17:29:06 2014 -0700
> @@ -1550,6 +1550,7 @@
>          p.idct[IDCT_4x4] = x265_idct4_sse2;
>          p.idct[IDST_4x4] = x265_idst4_sse2;
>          p.planecopy_sp = x265_downShift_16_sse2;
> +        p.denoiseDct = x265_denoise_dct_sse2;
>      }
>      if (cpuMask & X265_CPU_SSSE3)
>      {
> @@ -1585,6 +1586,7 @@
>          p.dct[DST_4x4] = x265_dst4_ssse3;
>          p.idct[IDCT_8x8] = x265_idct8_ssse3;
>          p.count_nonzero = x265_count_nonzero_ssse3;
> +        p.denoiseDct = x265_denoise_dct_ssse3;
>      }
>      if (cpuMask & X265_CPU_SSE4)
>      {
> @@ -1687,6 +1689,7 @@
>  
>          p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_avx;
>          p.ssim_end_4 = x265_pixel_ssim_end4_avx;
> +        p.denoiseDct = x265_denoise_dct_avx;
>      }
>      if (cpuMask & X265_CPU_XOP)
>      {
> @@ -1712,6 +1715,7 @@
>          p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
>          p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
>          p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
> +        p.denoiseDct = x265_denoise_dct_avx2;
>      }
>  #endif // if HIGH_BIT_DEPTH
>  }
> diff -r d43e9a6a7cce -r 41de0838d88b source/common/x86/dct8.asm
> --- a/source/common/x86/dct8.asm	Wed Aug 13 00:19:31 2014 -0500
> +++ b/source/common/x86/dct8.asm	Wed Aug 13 17:29:06 2014 -0700
> @@ -874,5 +874,125 @@
>  
>      ; restore origin stack pointer
>      mov         rsp, [rsp + 16*mmsize]
> +    RET
>  
> +
> +; TODO: split into two versions after coeff_t is changed
> +%if 1 ;HIGH_BIT_DEPTH
> +;-----------------------------------------------------------------------------
> +; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
> +;-----------------------------------------------------------------------------
> +%macro DENOISE_DCT 0
> +cglobal denoise_dct, 4,4,6
> +    pxor      m5, m5
> +    movsxdifnidn r3, r3d
> +.loop:
> +    mova      m2, [r0+r3*4-2*mmsize]
> +    mova      m3, [r0+r3*4-1*mmsize]
> +    ABSD      m0, m2
> +    ABSD      m1, m3
> +    paddd     m4, m0, [r1+r3*4-2*mmsize]
> +    psubd     m0, [r2+r3*4-2*mmsize]
> +    mova      [r1+r3*4-2*mmsize], m4
> +    paddd     m4, m1, [r1+r3*4-1*mmsize]
> +    psubd     m1, [r2+r3*4-1*mmsize]
> +    mova      [r1+r3*4-1*mmsize], m4
> +    pcmpgtd   m4, m0, m5
> +    pand      m0, m4
> +    pcmpgtd   m4, m1, m5
> +    pand      m1, m4
> +    PSIGND    m0, m2
> +    PSIGND    m1, m3
> +    mova      [r0+r3*4-2*mmsize], m0
> +    mova      [r0+r3*4-1*mmsize], m1
> +    sub      r3d, mmsize/2
> +    jg .loop
>      RET
> +%endmacro
> +
> +%if ARCH_X86_64 == 0
> +INIT_MMX mmx
> +DENOISE_DCT
> +%endif
> +INIT_XMM sse2
> +DENOISE_DCT
> +INIT_XMM ssse3
> +DENOISE_DCT
> +INIT_XMM avx
> +DENOISE_DCT
> +INIT_YMM avx2
> +DENOISE_DCT
> +
> +%else ; !HIGH_BIT_DEPTH
> +
> +;-----------------------------------------------------------------------------
> +; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
> +;-----------------------------------------------------------------------------
> +%macro DENOISE_DCT 0
> +cglobal denoise_dct, 4,4,7
> +    pxor      m6, m6
> +    movsxdifnidn r3, r3d
> +.loop:
> +    mova      m2, [r0+r3*2-2*mmsize]
> +    mova      m3, [r0+r3*2-1*mmsize]
> +    ABSW      m0, m2, sign
> +    ABSW      m1, m3, sign
> +    psubusw   m4, m0, [r2+r3*2-2*mmsize]
> +    psubusw   m5, m1, [r2+r3*2-1*mmsize]
> +    PSIGNW    m4, m2
> +    PSIGNW    m5, m3
> +    mova      [r0+r3*2-2*mmsize], m4
> +    mova      [r0+r3*2-1*mmsize], m5
> +    punpcklwd m2, m0, m6
> +    punpcklwd m3, m1, m6
> +    punpckhwd m0, m6
> +    punpckhwd m1, m6
> +    paddd     m2, [r1+r3*4-4*mmsize]
> +    paddd     m0, [r1+r3*4-3*mmsize]
> +    paddd     m3, [r1+r3*4-2*mmsize]
> +    paddd     m1, [r1+r3*4-1*mmsize]
> +    mova      [r1+r3*4-4*mmsize], m2
> +    mova      [r1+r3*4-3*mmsize], m0
> +    mova      [r1+r3*4-2*mmsize], m3
> +    mova      [r1+r3*4-1*mmsize], m1
> +    sub       r3, mmsize
> +    jg .loop
> +%if (mmsize == 8)
> +    EMMS
> +%endif
> +    RET
> +%endmacro
> +
> +%if ARCH_X86_64 == 0
> +INIT_MMX mmx
> +DENOISE_DCT
> +%endif
> +INIT_XMM sse2
> +DENOISE_DCT
> +INIT_XMM ssse3
> +DENOISE_DCT
> +INIT_XMM avx
> +DENOISE_DCT
> +
> +INIT_YMM avx2
> +cglobal denoise_dct, 4,4,4
> +    pxor      m3, m3
> +    movsxdifnidn r3, r3d
> +.loop:
> +    mova      m1, [r0+r3*2-mmsize]
> +    pabsw     m0, m1
> +    psubusw   m2, m0, [r2+r3*2-mmsize]
> +    vpermq    m0, m0, q3120
> +    psignw    m2, m1
> +    mova [r0+r3*2-mmsize], m2
> +    punpcklwd m1, m0, m3
> +    punpckhwd m0, m3
> +    paddd     m1, [r1+r3*4-2*mmsize]
> +    paddd     m0, [r1+r3*4-1*mmsize]
> +    mova      [r1+r3*4-2*mmsize], m1
> +    mova      [r1+r3*4-1*mmsize], m0
> +    sub       r3, mmsize/2
> +    jg .loop
> +    RET
> +
> +%endif ; !HIGH_BIT_DEPTH
> diff -r d43e9a6a7cce -r 41de0838d88b source/common/x86/dct8.h
> --- a/source/common/x86/dct8.h	Wed Aug 13 00:19:31 2014 -0500
> +++ b/source/common/x86/dct8.h	Wed Aug 13 17:29:06 2014 -0700
> @@ -31,4 +31,10 @@
>  void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
>  void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride);
>  
> +void x265_denoise_dct_mmx(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
> +void x265_denoise_dct_sse2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
> +void x265_denoise_dct_ssse3(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
> +void x265_denoise_dct_avx(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
> +void x265_denoise_dct_avx2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
> +
>  #endif // ifndef X265_DCT8_H
> diff -r d43e9a6a7cce -r 41de0838d88b source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp	Wed Aug 13 00:19:31 2014 -0500
> +++ b/source/encoder/frameencoder.cpp	Wed Aug 13 17:29:06 2014 -0700
> @@ -73,6 +73,9 @@
>      X265_FREE(m_substreamSizes);
>      m_frameFilter.destroy();
>  
> +    if (m_nr)
> +        X265_FREE(m_nr);
> +
>      // wait for worker thread to exit
>      stop();
>  }
> @@ -116,8 +119,9 @@
>      }
>  
>      memset(&m_frameStats, 0, sizeof(m_frameStats));
> -    memset(&m_nr, 0, sizeof(m_nr));
> -    m_nr.bNoiseReduction = !!m_param->noiseReduction;
> +    m_nr = X265_MALLOC(NoiseReduction, 1);
> +    memset(m_nr, 0, sizeof(NoiseReduction));
> +    m_nr->bNoiseReduction = !!m_param->noiseReduction;
>  
>      start();
>      return ok;
> @@ -640,7 +644,7 @@
>  
>      // setup thread-local data
>      TComPicYuv* fenc = m_frame->getPicYuvOrg();
> -    tld.m_cuCoder.m_quant.m_nr = &m_nr;
> +    tld.m_cuCoder.m_quant.m_nr = m_nr;
>      tld.m_cuCoder.m_me.setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
>      tld.m_cuCoder.m_log = &tld.m_cuCoder.m_sliceTypeLog[m_frame->m_picSym->m_slice->m_sliceType];
>      setLambda(m_frame->m_picSym->m_slice->m_sliceQp, tld);
> @@ -872,7 +876,7 @@
>  /* DCT-domain noise reduction / adaptive deadzone from libavcodec */
>  void FrameEncoder::noiseReductionUpdate()
>  {
> -    if (!m_nr.bNoiseReduction)
> +    if (!m_nr->bNoiseReduction)
>          return;
>  
>      static const uint32_t maxBlocksPerTrSize[4] = {1 << 18, 1 << 16, 1 << 14, 1 << 12};
> @@ -882,24 +886,24 @@
>          int trSize = cat & 3;
>          int coefCount = 1 << ((trSize + 2) * 2);
>  
> -        if (m_nr.count[cat] > maxBlocksPerTrSize[trSize])
> +        if (m_nr->count[cat] > maxBlocksPerTrSize[trSize])
>          {
>              for (int i = 0; i < coefCount; i++)
> -                m_nr.residualSum[cat][i] >>= 1;
> -            m_nr.count[cat] >>= 1;
> +                m_nr->residualSum[cat][i] >>= 1;
> +            m_nr->count[cat] >>= 1;
>          }
>  
> -        uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_nr.count[cat];
> +        uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_nr->count[cat];
>  
>          for (int i = 0; i < coefCount; i++)
>          {
> -            uint64_t value = scaledCount + m_nr.residualSum[cat][i] / 2;
> -            uint64_t denom = m_nr.residualSum[cat][i] + 1;
> -            m_nr.offsetDenoise[cat][i] = (uint16_t)(value / denom);
> +            uint64_t value = scaledCount + m_nr->residualSum[cat][i] / 2;
> +            uint64_t denom = m_nr->residualSum[cat][i] + 1;
> +            m_nr->offsetDenoise[cat][i] = (uint16_t)(value / denom);
>          }
>  
>          // Don't denoise DC coefficients
> -        m_nr.offsetDenoise[cat][0] = 0;
> +        m_nr->offsetDenoise[cat][0] = 0;
>      }
>  }
>  
> diff -r d43e9a6a7cce -r 41de0838d88b source/encoder/frameencoder.h
> --- a/source/encoder/frameencoder.h	Wed Aug 13 00:19:31 2014 -0500
> +++ b/source/encoder/frameencoder.h	Wed Aug 13 17:29:06 2014 -0700
> @@ -142,7 +142,7 @@
>      Bitstream                m_bs;
>      Bitstream*               m_outStreams;
>      uint32_t*                m_substreamSizes;
> -    NoiseReduction           m_nr;
> +    NoiseReduction*          m_nr;
>      NALList                  m_nalList;
>      ThreadLocalData          m_tld; /* for --no-wpp */
>  
> 
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho