[x265] [PATCH 1 of 2] asm: integrate denoise_dct
Steve Borho
steve at borho.org
Thu Aug 14 01:15:32 CEST 2014
On 08/13, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1407976146 25200
> # Node ID 41de0838d88bf89b9156e44e31772273df24c070
> # Parent d43e9a6a7cced5b60284c25bd987c55c522c1212
> asm: integrate denoise_dct
>
> diff -r d43e9a6a7cce -r 41de0838d88b source/common/common.h
> --- a/source/common/common.h Wed Aug 13 00:19:31 2014 -0500
> +++ b/source/common/common.h Wed Aug 13 17:29:06 2014 -0700
> @@ -179,15 +179,16 @@
> #define X265_LOG2(x) log2(x)
> #endif
>
> +// NOTE: MUST be aligned to 16 or 32 bytes for asm code
> struct NoiseReduction
> {
> - bool bNoiseReduction;
> -
> /* 0 = luma 4x4, 1 = luma 8x8, 2 = luma 16x16, 3 = luma 32x32
> * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32 */
> uint16_t offsetDenoise[8][1024];
> uint32_t residualSum[8][1024];
> uint32_t count[8];
> +
> + bool bNoiseReduction;
Since the frame encoder is now malloc'ing this structure (I presume for
memory alignment), the bNoiseReduction flag is unnecessary; we can simply
check whether the pointer is NULL. This also handles malloc failure
cleanly. I've queued a version of the patch modified in this
way.
> };
>
> /* defined in common.cpp */
> diff -r d43e9a6a7cce -r 41de0838d88b source/common/dct.cpp
> --- a/source/common/dct.cpp Wed Aug 13 00:19:31 2014 -0500
> +++ b/source/common/dct.cpp Wed Aug 13 17:29:06 2014 -0700
> @@ -845,6 +845,20 @@
>
> return numSig;
> }
> +
> +void denoiseDct_c(coeff_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff)
> +{
> + for (int i = 0; i < numCoeff; i++)
> + {
> + int level = dctCoef[i];
> + int sign = level >> 31;
> + level = (level + sign) ^ sign;
> + resSum[i] += level;
> + level -= offset[i];
> + dctCoef[i] = level < 0 ? 0 : (level ^ sign) - sign;
> + }
> +}
> +
> } // closing - anonymous file-static namespace
>
> namespace x265 {
> @@ -867,6 +881,7 @@
> p.idct[IDCT_16x16] = idct16_c;
> p.idct[IDCT_32x32] = idct32_c;
> p.count_nonzero = count_nonzero_c;
> + p.denoiseDct = denoiseDct_c;
>
> p.cvt16to32_cnt[BLOCK_4x4] = conv16to32_count<4>;
> p.cvt16to32_cnt[BLOCK_8x8] = conv16to32_count<8>;
> diff -r d43e9a6a7cce -r 41de0838d88b source/common/primitives.h
> --- a/source/common/primitives.h Wed Aug 13 00:19:31 2014 -0500
> +++ b/source/common/primitives.h Wed Aug 13 17:29:06 2014 -0700
> @@ -7,6 +7,7 @@
> * Mahesh Pittala <mahesh at multicorewareinc.com>
> * Rajesh Paulraj <rajesh at multicorewareinc.com>
> * Praveen Kumar Tiwari <praveen at multicorewareinc.com>
> + * Min Chen <chenm003 at 163.com>
> *
> * This program is free software; you can redistribute it and/or modify
> * it under the terms of the GNU General Public License as published by
> @@ -156,6 +157,8 @@
>
> typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);
> typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);
> +typedef void (*denoiseDct_t)(coeff_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff);
> +
> typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
> typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
> typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
> @@ -264,6 +267,7 @@
> dequant_scaling_t dequant_scaling;
> dequant_normal_t dequant_normal;
> count_nonzero_t count_nonzero;
> + denoiseDct_t denoiseDct;
>
> calcresidual_t calcresidual[NUM_SQUARE_BLOCKS];
> calcrecon_t calcrecon[NUM_SQUARE_BLOCKS];
> diff -r d43e9a6a7cce -r 41de0838d88b source/common/quant.cpp
> --- a/source/common/quant.cpp Wed Aug 13 00:19:31 2014 -0500
> +++ b/source/common/quant.cpp Wed Aug 13 17:29:06 2014 -0700
> @@ -49,19 +49,6 @@
> return y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y)
> }
>
> -inline void denoiseDct(coeff_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff)
> -{
> - for (int i = 0; i < numCoeff; i++)
> - {
> - int level = dctCoef[i];
> - int sign = level >> 31;
> - level = (level + sign) ^ sign;
> - resSum[i] += level;
> - level -= offset[i];
> - dctCoef[i] = level < 0 ? 0 : (level ^ sign) - sign;
> - }
> -}
> -
> inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
> {
> X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n");
> @@ -380,7 +367,7 @@
> /* denoise is not applied to intra residual, so DST can be ignored */
> int cat = sizeIdx + 4 * !isLuma;
> int numCoeff = 1 << log2TrSize * 2;
> - denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
> + primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
> m_nr->count[cat]++;
> }
> }
> diff -r d43e9a6a7cce -r 41de0838d88b source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Wed Aug 13 00:19:31 2014 -0500
> +++ b/source/common/x86/asm-primitives.cpp Wed Aug 13 17:29:06 2014 -0700
> @@ -1550,6 +1550,7 @@
> p.idct[IDCT_4x4] = x265_idct4_sse2;
> p.idct[IDST_4x4] = x265_idst4_sse2;
> p.planecopy_sp = x265_downShift_16_sse2;
> + p.denoiseDct = x265_denoise_dct_sse2;
> }
> if (cpuMask & X265_CPU_SSSE3)
> {
> @@ -1585,6 +1586,7 @@
> p.dct[DST_4x4] = x265_dst4_ssse3;
> p.idct[IDCT_8x8] = x265_idct8_ssse3;
> p.count_nonzero = x265_count_nonzero_ssse3;
> + p.denoiseDct = x265_denoise_dct_ssse3;
> }
> if (cpuMask & X265_CPU_SSE4)
> {
> @@ -1687,6 +1689,7 @@
>
> p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_avx;
> p.ssim_end_4 = x265_pixel_ssim_end4_avx;
> + p.denoiseDct = x265_denoise_dct_avx;
> }
> if (cpuMask & X265_CPU_XOP)
> {
> @@ -1712,6 +1715,7 @@
> p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
> p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
> p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
> + p.denoiseDct = x265_denoise_dct_avx2;
> }
> #endif // if HIGH_BIT_DEPTH
> }
> diff -r d43e9a6a7cce -r 41de0838d88b source/common/x86/dct8.asm
> --- a/source/common/x86/dct8.asm Wed Aug 13 00:19:31 2014 -0500
> +++ b/source/common/x86/dct8.asm Wed Aug 13 17:29:06 2014 -0700
> @@ -874,5 +874,125 @@
>
> ; restore origin stack pointer
> mov rsp, [rsp + 16*mmsize]
> + RET
>
> +
> +; TODO: split into two versions after coeff_t is changed
> +%if 1 ;HIGH_BIT_DEPTH
> +;-----------------------------------------------------------------------------
> +; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
> +;-----------------------------------------------------------------------------
> +%macro DENOISE_DCT 0
> +cglobal denoise_dct, 4,4,6
> + pxor m5, m5
> + movsxdifnidn r3, r3d
> +.loop:
> + mova m2, [r0+r3*4-2*mmsize]
> + mova m3, [r0+r3*4-1*mmsize]
> + ABSD m0, m2
> + ABSD m1, m3
> + paddd m4, m0, [r1+r3*4-2*mmsize]
> + psubd m0, [r2+r3*4-2*mmsize]
> + mova [r1+r3*4-2*mmsize], m4
> + paddd m4, m1, [r1+r3*4-1*mmsize]
> + psubd m1, [r2+r3*4-1*mmsize]
> + mova [r1+r3*4-1*mmsize], m4
> + pcmpgtd m4, m0, m5
> + pand m0, m4
> + pcmpgtd m4, m1, m5
> + pand m1, m4
> + PSIGND m0, m2
> + PSIGND m1, m3
> + mova [r0+r3*4-2*mmsize], m0
> + mova [r0+r3*4-1*mmsize], m1
> + sub r3d, mmsize/2
> + jg .loop
> RET
> +%endmacro
> +
> +%if ARCH_X86_64 == 0
> +INIT_MMX mmx
> +DENOISE_DCT
> +%endif
> +INIT_XMM sse2
> +DENOISE_DCT
> +INIT_XMM ssse3
> +DENOISE_DCT
> +INIT_XMM avx
> +DENOISE_DCT
> +INIT_YMM avx2
> +DENOISE_DCT
> +
> +%else ; !HIGH_BIT_DEPTH
> +
> +;-----------------------------------------------------------------------------
> +; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
> +;-----------------------------------------------------------------------------
> +%macro DENOISE_DCT 0
> +cglobal denoise_dct, 4,4,7
> + pxor m6, m6
> + movsxdifnidn r3, r3d
> +.loop:
> + mova m2, [r0+r3*2-2*mmsize]
> + mova m3, [r0+r3*2-1*mmsize]
> + ABSW m0, m2, sign
> + ABSW m1, m3, sign
> + psubusw m4, m0, [r2+r3*2-2*mmsize]
> + psubusw m5, m1, [r2+r3*2-1*mmsize]
> + PSIGNW m4, m2
> + PSIGNW m5, m3
> + mova [r0+r3*2-2*mmsize], m4
> + mova [r0+r3*2-1*mmsize], m5
> + punpcklwd m2, m0, m6
> + punpcklwd m3, m1, m6
> + punpckhwd m0, m6
> + punpckhwd m1, m6
> + paddd m2, [r1+r3*4-4*mmsize]
> + paddd m0, [r1+r3*4-3*mmsize]
> + paddd m3, [r1+r3*4-2*mmsize]
> + paddd m1, [r1+r3*4-1*mmsize]
> + mova [r1+r3*4-4*mmsize], m2
> + mova [r1+r3*4-3*mmsize], m0
> + mova [r1+r3*4-2*mmsize], m3
> + mova [r1+r3*4-1*mmsize], m1
> + sub r3, mmsize
> + jg .loop
> +%if (mmsize == 8)
> + EMMS
> +%endif
> + RET
> +%endmacro
> +
> +%if ARCH_X86_64 == 0
> +INIT_MMX mmx
> +DENOISE_DCT
> +%endif
> +INIT_XMM sse2
> +DENOISE_DCT
> +INIT_XMM ssse3
> +DENOISE_DCT
> +INIT_XMM avx
> +DENOISE_DCT
> +
> +INIT_YMM avx2
> +cglobal denoise_dct, 4,4,4
> + pxor m3, m3
> + movsxdifnidn r3, r3d
> +.loop:
> + mova m1, [r0+r3*2-mmsize]
> + pabsw m0, m1
> + psubusw m2, m0, [r2+r3*2-mmsize]
> + vpermq m0, m0, q3120
> + psignw m2, m1
> + mova [r0+r3*2-mmsize], m2
> + punpcklwd m1, m0, m3
> + punpckhwd m0, m3
> + paddd m1, [r1+r3*4-2*mmsize]
> + paddd m0, [r1+r3*4-1*mmsize]
> + mova [r1+r3*4-2*mmsize], m1
> + mova [r1+r3*4-1*mmsize], m0
> + sub r3, mmsize/2
> + jg .loop
> + RET
> +
> +%endif ; !HIGH_BIT_DEPTH
> diff -r d43e9a6a7cce -r 41de0838d88b source/common/x86/dct8.h
> --- a/source/common/x86/dct8.h Wed Aug 13 00:19:31 2014 -0500
> +++ b/source/common/x86/dct8.h Wed Aug 13 17:29:06 2014 -0700
> @@ -31,4 +31,10 @@
> void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
> void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride);
>
> +void x265_denoise_dct_mmx(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
> +void x265_denoise_dct_sse2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
> +void x265_denoise_dct_ssse3(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
> +void x265_denoise_dct_avx(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
> +void x265_denoise_dct_avx2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
> +
> #endif // ifndef X265_DCT8_H
> diff -r d43e9a6a7cce -r 41de0838d88b source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp Wed Aug 13 00:19:31 2014 -0500
> +++ b/source/encoder/frameencoder.cpp Wed Aug 13 17:29:06 2014 -0700
> @@ -73,6 +73,9 @@
> X265_FREE(m_substreamSizes);
> m_frameFilter.destroy();
>
> + if (m_nr)
> + X265_FREE(m_nr);
> +
> // wait for worker thread to exit
> stop();
> }
> @@ -116,8 +119,9 @@
> }
>
> memset(&m_frameStats, 0, sizeof(m_frameStats));
> - memset(&m_nr, 0, sizeof(m_nr));
> - m_nr.bNoiseReduction = !!m_param->noiseReduction;
> + m_nr = X265_MALLOC(NoiseReduction, 1);
> + memset(m_nr, 0, sizeof(NoiseReduction));
> + m_nr->bNoiseReduction = !!m_param->noiseReduction;
>
> start();
> return ok;
> @@ -640,7 +644,7 @@
>
> // setup thread-local data
> TComPicYuv* fenc = m_frame->getPicYuvOrg();
> - tld.m_cuCoder.m_quant.m_nr = &m_nr;
> + tld.m_cuCoder.m_quant.m_nr = m_nr;
> tld.m_cuCoder.m_me.setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
> tld.m_cuCoder.m_log = &tld.m_cuCoder.m_sliceTypeLog[m_frame->m_picSym->m_slice->m_sliceType];
> setLambda(m_frame->m_picSym->m_slice->m_sliceQp, tld);
> @@ -872,7 +876,7 @@
> /* DCT-domain noise reduction / adaptive deadzone from libavcodec */
> void FrameEncoder::noiseReductionUpdate()
> {
> - if (!m_nr.bNoiseReduction)
> + if (!m_nr->bNoiseReduction)
> return;
>
> static const uint32_t maxBlocksPerTrSize[4] = {1 << 18, 1 << 16, 1 << 14, 1 << 12};
> @@ -882,24 +886,24 @@
> int trSize = cat & 3;
> int coefCount = 1 << ((trSize + 2) * 2);
>
> - if (m_nr.count[cat] > maxBlocksPerTrSize[trSize])
> + if (m_nr->count[cat] > maxBlocksPerTrSize[trSize])
> {
> for (int i = 0; i < coefCount; i++)
> - m_nr.residualSum[cat][i] >>= 1;
> - m_nr.count[cat] >>= 1;
> + m_nr->residualSum[cat][i] >>= 1;
> + m_nr->count[cat] >>= 1;
> }
>
> - uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_nr.count[cat];
> + uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_nr->count[cat];
>
> for (int i = 0; i < coefCount; i++)
> {
> - uint64_t value = scaledCount + m_nr.residualSum[cat][i] / 2;
> - uint64_t denom = m_nr.residualSum[cat][i] + 1;
> - m_nr.offsetDenoise[cat][i] = (uint16_t)(value / denom);
> + uint64_t value = scaledCount + m_nr->residualSum[cat][i] / 2;
> + uint64_t denom = m_nr->residualSum[cat][i] + 1;
> + m_nr->offsetDenoise[cat][i] = (uint16_t)(value / denom);
> }
>
> // Don't denoise DC coefficients
> - m_nr.offsetDenoise[cat][0] = 0;
> + m_nr->offsetDenoise[cat][0] = 0;
> }
> }
>
> diff -r d43e9a6a7cce -r 41de0838d88b source/encoder/frameencoder.h
> --- a/source/encoder/frameencoder.h Wed Aug 13 00:19:31 2014 -0500
> +++ b/source/encoder/frameencoder.h Wed Aug 13 17:29:06 2014 -0700
> @@ -142,7 +142,7 @@
> Bitstream m_bs;
> Bitstream* m_outStreams;
> uint32_t* m_substreamSizes;
> - NoiseReduction m_nr;
> + NoiseReduction* m_nr;
> NALList m_nalList;
> ThreadLocalData m_tld; /* for --no-wpp */
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel
mailing list