[x265] [PATCH 1 of 2] asm: integrate denoise_dct
Min Chen
chenm003 at 163.com
Thu Aug 14 02:29:28 CEST 2014
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1407976146 25200
# Node ID 41de0838d88bf89b9156e44e31772273df24c070
# Parent d43e9a6a7cced5b60284c25bd987c55c522c1212
asm: integrate denoise_dct
diff -r d43e9a6a7cce -r 41de0838d88b source/common/common.h
--- a/source/common/common.h Wed Aug 13 00:19:31 2014 -0500
+++ b/source/common/common.h Wed Aug 13 17:29:06 2014 -0700
@@ -179,15 +179,16 @@
#define X265_LOG2(x) log2(x)
#endif
+// NOTE: this struct MUST be aligned to 16 or 32 bytes for the asm code
struct NoiseReduction
{
- bool bNoiseReduction;
-
/* 0 = luma 4x4, 1 = luma 8x8, 2 = luma 16x16, 3 = luma 32x32
* 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32 */
uint16_t offsetDenoise[8][1024];
uint32_t residualSum[8][1024];
uint32_t count[8];
+
+ bool bNoiseReduction;
};
/* defined in common.cpp */
diff -r d43e9a6a7cce -r 41de0838d88b source/common/dct.cpp
--- a/source/common/dct.cpp Wed Aug 13 00:19:31 2014 -0500
+++ b/source/common/dct.cpp Wed Aug 13 17:29:06 2014 -0700
@@ -845,6 +845,20 @@
return numSig;
}
+
+/* Reference C implementation of the denoiseDct primitive.
+ * For each of the numCoeff coefficients: add its magnitude to resSum[i],
+ * reduce the magnitude by offset[i] (clamping at zero) and write the result
+ * back to dctCoef[i] with the original sign. */
+void denoiseDct_c(coeff_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff)
+{
+ for (int i = 0; i < numCoeff; i++)
+ {
+ int level = dctCoef[i];
+ int sign = level >> 31; // -1 for negative levels, 0 otherwise (assumes an arithmetic shift of a 32-bit int)
+ level = (level + sign) ^ sign; // level = abs(level)
+ resSum[i] += level;
+ level -= offset[i];
+ dctCoef[i] = level < 0 ? 0 : (level ^ sign) - sign; // clamp at zero, then reapply the sign
+ }
+}
+
} // closing - anonymous file-static namespace
namespace x265 {
@@ -867,6 +881,7 @@
p.idct[IDCT_16x16] = idct16_c;
p.idct[IDCT_32x32] = idct32_c;
p.count_nonzero = count_nonzero_c;
+ p.denoiseDct = denoiseDct_c;
p.cvt16to32_cnt[BLOCK_4x4] = conv16to32_count<4>;
p.cvt16to32_cnt[BLOCK_8x8] = conv16to32_count<8>;
diff -r d43e9a6a7cce -r 41de0838d88b source/common/primitives.h
--- a/source/common/primitives.h Wed Aug 13 00:19:31 2014 -0500
+++ b/source/common/primitives.h Wed Aug 13 17:29:06 2014 -0700
@@ -7,6 +7,7 @@
* Mahesh Pittala <mahesh at multicorewareinc.com>
* Rajesh Paulraj <rajesh at multicorewareinc.com>
* Praveen Kumar Tiwari <praveen at multicorewareinc.com>
+ * Min Chen <chenm003 at 163.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -156,6 +157,8 @@
typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);
typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);
+typedef void (*denoiseDct_t)(coeff_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff);
+
typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
@@ -264,6 +267,7 @@
dequant_scaling_t dequant_scaling;
dequant_normal_t dequant_normal;
count_nonzero_t count_nonzero;
+ denoiseDct_t denoiseDct;
calcresidual_t calcresidual[NUM_SQUARE_BLOCKS];
calcrecon_t calcrecon[NUM_SQUARE_BLOCKS];
diff -r d43e9a6a7cce -r 41de0838d88b source/common/quant.cpp
--- a/source/common/quant.cpp Wed Aug 13 00:19:31 2014 -0500
+++ b/source/common/quant.cpp Wed Aug 13 17:29:06 2014 -0700
@@ -49,19 +49,6 @@
return y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y)
}
-inline void denoiseDct(coeff_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff)
-{
- for (int i = 0; i < numCoeff; i++)
- {
- int level = dctCoef[i];
- int sign = level >> 31;
- level = (level + sign) ^ sign;
- resSum[i] += level;
- level -= offset[i];
- dctCoef[i] = level < 0 ? 0 : (level ^ sign) - sign;
- }
-}
-
inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
{
X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n");
@@ -380,7 +367,7 @@
/* denoise is not applied to intra residual, so DST can be ignored */
int cat = sizeIdx + 4 * !isLuma;
int numCoeff = 1 << log2TrSize * 2;
- denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
+ primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
m_nr->count[cat]++;
}
}
diff -r d43e9a6a7cce -r 41de0838d88b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Aug 13 00:19:31 2014 -0500
+++ b/source/common/x86/asm-primitives.cpp Wed Aug 13 17:29:06 2014 -0700
@@ -1550,6 +1550,7 @@
p.idct[IDCT_4x4] = x265_idct4_sse2;
p.idct[IDST_4x4] = x265_idst4_sse2;
p.planecopy_sp = x265_downShift_16_sse2;
+ p.denoiseDct = x265_denoise_dct_sse2;
}
if (cpuMask & X265_CPU_SSSE3)
{
@@ -1585,6 +1586,7 @@
p.dct[DST_4x4] = x265_dst4_ssse3;
p.idct[IDCT_8x8] = x265_idct8_ssse3;
p.count_nonzero = x265_count_nonzero_ssse3;
+ p.denoiseDct = x265_denoise_dct_ssse3;
}
if (cpuMask & X265_CPU_SSE4)
{
@@ -1687,6 +1689,7 @@
p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_avx;
p.ssim_end_4 = x265_pixel_ssim_end4_avx;
+ p.denoiseDct = x265_denoise_dct_avx;
}
if (cpuMask & X265_CPU_XOP)
{
@@ -1712,6 +1715,7 @@
p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
+ p.denoiseDct = x265_denoise_dct_avx2;
}
#endif // if HIGH_BIT_DEPTH
}
diff -r d43e9a6a7cce -r 41de0838d88b source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Wed Aug 13 00:19:31 2014 -0500
+++ b/source/common/x86/dct8.asm Wed Aug 13 17:29:06 2014 -0700
@@ -874,5 +874,125 @@
; restore origin stack pointer
mov rsp, [rsp + 16*mmsize]
+ RET
+
+; TODO: split into two versions after coeff_t is changed
+%if 1 ;HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void denoise_dct( int32_t *dct, uint32_t *sum, uint16_t *offset, int size )
+;
+; For each of the `size` coefficients: add |dct[i]| to sum[i], subtract
+; offset[i] from |dct[i]|, force non-positive results to zero and store the
+; value back to dct[i] with the original sign (same arithmetic as denoiseDct_c).
+; r0 = dct, r1 = sum, r2 = offset, r3 = size. The arrays are walked from the
+; end towards the start, mmsize/2 coefficients per iteration.
+;
+; offset[] holds 16-bit entries (uint16_t in the C prototype and in
+; NoiseReduction::offsetDenoise), so they are zero-extended to dwords before
+; the 32-bit subtract instead of being read as dwords directly.
+;-----------------------------------------------------------------------------
+%macro DENOISE_DCT 0
+cglobal denoise_dct, 4,4,7
+ pxor m5, m5 ; m5 = 0: compare operand and zero-extension source
+ movsxdifnidn r3, r3d
+.loop:
+ mova m2, [r0+r3*4-2*mmsize] ; signed coefficients
+ mova m3, [r0+r3*4-1*mmsize]
+ ABSD m0, m2 ; m0/m1 = |coefficient|
+ ABSD m1, m3
+ paddd m4, m0, [r1+r3*4-2*mmsize] ; sum[i] += |dct[i]|
+ mova [r1+r3*4-2*mmsize], m4
+ paddd m4, m1, [r1+r3*4-1*mmsize]
+ mova [r1+r3*4-1*mmsize], m4
+%if mmsize == 32
+ vpermq m4, [r2+r3*2-mmsize], q3120 ; pre-swizzle qwords: the unpacks below work per 128-bit lane
+%else
+ mova m4, [r2+r3*2-mmsize] ; mmsize/2 16-bit offsets
+%endif
+ punpcklwd m6, m4, m5 ; offsets for the first vector, widened to dwords
+ punpckhwd m4, m5 ; offsets for the second vector, widened to dwords
+ psubd m0, m6
+ psubd m1, m4
+ pcmpgtd m4, m0, m5 ; mask of lanes that are still positive
+ pand m0, m4 ; all other lanes become 0
+ pcmpgtd m4, m1, m5
+ pand m1, m4
+ PSIGND m0, m2 ; restore the original sign
+ PSIGND m1, m3
+ mova [r0+r3*4-2*mmsize], m0
+ mova [r0+r3*4-1*mmsize], m1
+ sub r3d, mmsize/2
+ jg .loop
RET
+%endmacro
+
+%if ARCH_X86_64 == 0
+INIT_MMX mmx
+DENOISE_DCT
+%endif
+INIT_XMM sse2
+DENOISE_DCT
+INIT_XMM ssse3
+DENOISE_DCT
+INIT_XMM avx
+DENOISE_DCT
+INIT_YMM avx2
+DENOISE_DCT
+
+%else ; !HIGH_BIT_DEPTH
+
+;-----------------------------------------------------------------------------
+; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
+;
+; 16-bit coefficient variant. For each coefficient: subtract offset[i] from
+; |dct[i]| with unsigned saturation (so the magnitude stops at zero), store it
+; back with the original sign, and add |dct[i]| to the 32-bit sum[i].
+; r0 = dct, r1 = sum, r2 = offset, r3 = size. The arrays are walked from the
+; end towards the start, mmsize coefficients per iteration.
+; NOTE(review): this branch is currently not assembled because the enclosing
+; conditional is hard-coded to "%if 1".
+;-----------------------------------------------------------------------------
+%macro DENOISE_DCT 0
+cglobal denoise_dct, 4,4,7
+ pxor m6, m6 ; m6 = 0, used to zero-extend words to dwords
+ movsxdifnidn r3, r3d
+.loop:
+ mova m2, [r0+r3*2-2*mmsize] ; signed 16-bit coefficients
+ mova m3, [r0+r3*2-1*mmsize]
+ ABSW m0, m2, sign ; m0/m1 = |coefficient|
+ ABSW m1, m3, sign
+ psubusw m4, m0, [r2+r3*2-2*mmsize] ; |coefficient| - offset, saturating at 0
+ psubusw m5, m1, [r2+r3*2-1*mmsize]
+ PSIGNW m4, m2 ; restore the original sign
+ PSIGNW m5, m3
+ mova [r0+r3*2-2*mmsize], m4
+ mova [r0+r3*2-1*mmsize], m5
+ punpcklwd m2, m0, m6 ; widen the magnitudes to 32 bits for the sums
+ punpcklwd m3, m1, m6
+ punpckhwd m0, m6
+ punpckhwd m1, m6
+ paddd m2, [r1+r3*4-4*mmsize] ; sum[i] += |dct[i]|
+ paddd m0, [r1+r3*4-3*mmsize]
+ paddd m3, [r1+r3*4-2*mmsize]
+ paddd m1, [r1+r3*4-1*mmsize]
+ mova [r1+r3*4-4*mmsize], m2
+ mova [r1+r3*4-3*mmsize], m0
+ mova [r1+r3*4-2*mmsize], m3
+ mova [r1+r3*4-1*mmsize], m1
+ sub r3, mmsize
+ jg .loop
+%if (mmsize == 8)
+ EMMS ; only emitted for the MMX (mmsize == 8) build of this macro
+%endif
+ RET
+%endmacro
+
+%if ARCH_X86_64 == 0
+INIT_MMX mmx
+DENOISE_DCT
+%endif
+INIT_XMM sse2
+DENOISE_DCT
+INIT_XMM ssse3
+DENOISE_DCT
+INIT_XMM avx
+DENOISE_DCT
+
+; AVX2 version of the 16-bit denoise_dct: same per-coefficient arithmetic as the
+; DENOISE_DCT macro of this branch, handling one ymm vector of coefficients
+; (mmsize/2 of them) per iteration. r0 = dct, r1 = sum, r2 = offset, r3 = size.
+INIT_YMM avx2
+cglobal denoise_dct, 4,4,4
+ pxor m3, m3 ; m3 = 0, used to zero-extend words to dwords
+ movsxdifnidn r3, r3d
+.loop:
+ mova m1, [r0+r3*2-mmsize] ; signed 16-bit coefficients
+ pabsw m0, m1 ; m0 = |coefficient|
+ psubusw m2, m0, [r2+r3*2-mmsize] ; |coefficient| - offset, saturating at 0
+ vpermq m0, m0, q3120 ; reorder qwords so the per-lane unpacks below yield dwords in memory order
+ psignw m2, m1 ; restore the original sign
+ mova [r0+r3*2-mmsize], m2
+ punpcklwd m1, m0, m3 ; widen the magnitudes to 32 bits for the sums
+ punpckhwd m0, m3
+ paddd m1, [r1+r3*4-2*mmsize] ; sum[i] += |dct[i]|
+ paddd m0, [r1+r3*4-1*mmsize]
+ mova [r1+r3*4-2*mmsize], m1
+ mova [r1+r3*4-1*mmsize], m0
+ sub r3, mmsize/2
+ jg .loop
+ RET
+
+%endif ; !HIGH_BIT_DEPTH
diff -r d43e9a6a7cce -r 41de0838d88b source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Wed Aug 13 00:19:31 2014 -0500
+++ b/source/common/x86/dct8.h Wed Aug 13 17:29:06 2014 -0700
@@ -31,4 +31,10 @@
void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride);
+/* Assembly implementations of the denoiseDct primitive (DENOISE_DCT in dct8.asm),
+ * one per instruction set. The mmx variant is only assembled when
+ * ARCH_X86_64 == 0. */
+void x265_denoise_dct_mmx(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
+void x265_denoise_dct_sse2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
+void x265_denoise_dct_ssse3(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
+void x265_denoise_dct_avx(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
+void x265_denoise_dct_avx2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
+
#endif // ifndef X265_DCT8_H
diff -r d43e9a6a7cce -r 41de0838d88b source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp Wed Aug 13 00:19:31 2014 -0500
+++ b/source/encoder/frameencoder.cpp Wed Aug 13 17:29:06 2014 -0700
@@ -73,6 +73,9 @@
X265_FREE(m_substreamSizes);
m_frameFilter.destroy();
+ if (m_nr)
+ X265_FREE(m_nr);
+
// wait for worker thread to exit
stop();
}
@@ -116,8 +119,9 @@
}
memset(&m_frameStats, 0, sizeof(m_frameStats));
- memset(&m_nr, 0, sizeof(m_nr));
- m_nr.bNoiseReduction = !!m_param->noiseReduction;
+ /* The noise-reduction state is now heap-allocated so that it is suitably
+ * aligned for the asm primitive. The allocation can fail, so only touch the
+ * block when it exists and report the failure through `ok` otherwise. */
+ m_nr = X265_MALLOC(NoiseReduction, 1);
+ if (m_nr)
+ {
+ memset(m_nr, 0, sizeof(NoiseReduction));
+ m_nr->bNoiseReduction = !!m_param->noiseReduction;
+ }
+ else
+ ok = false;
start();
return ok;
@@ -640,7 +644,7 @@
// setup thread-local data
TComPicYuv* fenc = m_frame->getPicYuvOrg();
- tld.m_cuCoder.m_quant.m_nr = &m_nr;
+ tld.m_cuCoder.m_quant.m_nr = m_nr;
tld.m_cuCoder.m_me.setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
tld.m_cuCoder.m_log = &tld.m_cuCoder.m_sliceTypeLog[m_frame->m_picSym->m_slice->m_sliceType];
setLambda(m_frame->m_picSym->m_slice->m_sliceQp, tld);
@@ -872,7 +876,7 @@
/* DCT-domain noise reduction / adaptive deadzone from libavcodec */
void FrameEncoder::noiseReductionUpdate()
{
- if (!m_nr.bNoiseReduction)
+ if (!m_nr->bNoiseReduction)
return;
static const uint32_t maxBlocksPerTrSize[4] = {1 << 18, 1 << 16, 1 << 14, 1 << 12};
@@ -882,24 +886,24 @@
int trSize = cat & 3;
int coefCount = 1 << ((trSize + 2) * 2);
- if (m_nr.count[cat] > maxBlocksPerTrSize[trSize])
+ if (m_nr->count[cat] > maxBlocksPerTrSize[trSize])
{
for (int i = 0; i < coefCount; i++)
- m_nr.residualSum[cat][i] >>= 1;
- m_nr.count[cat] >>= 1;
+ m_nr->residualSum[cat][i] >>= 1;
+ m_nr->count[cat] >>= 1;
}
- uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_nr.count[cat];
+ uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_nr->count[cat];
for (int i = 0; i < coefCount; i++)
{
- uint64_t value = scaledCount + m_nr.residualSum[cat][i] / 2;
- uint64_t denom = m_nr.residualSum[cat][i] + 1;
- m_nr.offsetDenoise[cat][i] = (uint16_t)(value / denom);
+ uint64_t value = scaledCount + m_nr->residualSum[cat][i] / 2;
+ uint64_t denom = m_nr->residualSum[cat][i] + 1;
+ m_nr->offsetDenoise[cat][i] = (uint16_t)(value / denom);
}
// Don't denoise DC coefficients
- m_nr.offsetDenoise[cat][0] = 0;
+ m_nr->offsetDenoise[cat][0] = 0;
}
}
diff -r d43e9a6a7cce -r 41de0838d88b source/encoder/frameencoder.h
--- a/source/encoder/frameencoder.h Wed Aug 13 00:19:31 2014 -0500
+++ b/source/encoder/frameencoder.h Wed Aug 13 17:29:06 2014 -0700
@@ -142,7 +142,7 @@
Bitstream m_bs;
Bitstream* m_outStreams;
uint32_t* m_substreamSizes;
- NoiseReduction m_nr;
+ NoiseReduction* m_nr;
NALList m_nalList;
ThreadLocalData m_tld; /* for --no-wpp */
More information about the x265-devel
mailing list