[x265] [PATCH 1 of 2] asm: integrate denoise_dct

Thu Aug 14 02:29:28 CEST 2014

# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1407976146 25200
# Node ID 41de0838d88bf89b9156e44e31772273df24c070
# Parent  d43e9a6a7cced5b60284c25bd987c55c522c1212
asm: integrate denoise_dct

diff -r d43e9a6a7cce -r 41de0838d88b source/common/common.h

--- a/source/common/common.h	Wed Aug 13 00:19:31 2014 -0500
+++ b/source/common/common.h	Wed Aug 13 17:29:06 2014 -0700
@@ -179,15 +179,16 @@
 #define X265_LOG2(x)  log2(x)
 #endif
 
+// NOTE: MUST be aligned to 16 or 32 bytes for asm code
 struct NoiseReduction
 {
-    bool bNoiseReduction;
-
     /* 0 = luma 4x4, 1 = luma 8x8, 2 = luma 16x16, 3 = luma 32x32
      * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32 */
     uint16_t offsetDenoise[8][1024];
     uint32_t residualSum[8][1024];
     uint32_t count[8];
+
+    bool bNoiseReduction;
 };
 
 /* defined in common.cpp */
diff -r d43e9a6a7cce -r 41de0838d88b source/common/dct.cpp
--- a/source/common/dct.cpp	Wed Aug 13 00:19:31 2014 -0500
+++ b/source/common/dct.cpp	Wed Aug 13 17:29:06 2014 -0700
@@ -845,6 +845,20 @@
 
     return numSig;
 }
+
+void denoiseDct_c(coeff_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff) // C reference: add |coef| to resSum, then shrink each coef toward zero by offset
+{
+    for (int i = 0; i < numCoeff; i++)
+    {
+        int level = dctCoef[i];
+        int sign = level >> 31;                 // 0 for non-negative, all-ones for negative (assumes 32-bit int and arithmetic right shift)
+        level = (level + sign) ^ sign;          // level becomes the magnitude of the coefficient via the sign mask
+        resSum[i] += level;                     // accumulate the magnitude before the offset is applied
+        level -= offset[i];
+        dctCoef[i] = level < 0 ? 0 : (level ^ sign) - sign; // zero if the offset exceeded the magnitude, otherwise restore the original sign
+    }
+}
+
 }  // closing - anonymous file-static namespace
 
 namespace x265 {
@@ -867,6 +881,7 @@
     p.idct[IDCT_16x16] = idct16_c;
     p.idct[IDCT_32x32] = idct32_c;
     p.count_nonzero = count_nonzero_c;
+    p.denoiseDct = denoiseDct_c;
 
     p.cvt16to32_cnt[BLOCK_4x4] = conv16to32_count<4>;
     p.cvt16to32_cnt[BLOCK_8x8] = conv16to32_count<8>;
diff -r d43e9a6a7cce -r 41de0838d88b source/common/primitives.h
--- a/source/common/primitives.h	Wed Aug 13 00:19:31 2014 -0500
+++ b/source/common/primitives.h	Wed Aug 13 17:29:06 2014 -0700
@@ -7,6 +7,7 @@
  *          Mahesh Pittala <mahesh at multicorewareinc.com>
  *          Rajesh Paulraj <rajesh at multicorewareinc.com>
  *          Praveen Kumar Tiwari <praveen at multicorewareinc.com>
+ *          Min Chen <chenm003 at 163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -156,6 +157,8 @@
 
 typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);
 typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);
+typedef void (*denoiseDct_t)(coeff_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff);
+
 typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
 typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
 typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
@@ -264,6 +267,7 @@
     dequant_scaling_t dequant_scaling;
     dequant_normal_t dequant_normal;
     count_nonzero_t count_nonzero;
+    denoiseDct_t    denoiseDct;
 
     calcresidual_t  calcresidual[NUM_SQUARE_BLOCKS];
     calcrecon_t     calcrecon[NUM_SQUARE_BLOCKS];
diff -r d43e9a6a7cce -r 41de0838d88b source/common/quant.cpp
--- a/source/common/quant.cpp	Wed Aug 13 00:19:31 2014 -0500
+++ b/source/common/quant.cpp	Wed Aug 13 17:29:06 2014 -0700
@@ -49,19 +49,6 @@
     return y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y)
 }
 
-inline void denoiseDct(coeff_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff)
-{
-    for (int i = 0; i < numCoeff; i++)
-    {
-        int level = dctCoef[i];
-        int sign = level >> 31;
-        level = (level + sign) ^ sign;
-        resSum[i] += level;
-        level -= offset[i];
-        dctCoef[i] = level < 0 ? 0 : (level ^ sign) - sign;
-    }
-}
-
 inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
 {
     X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n");
@@ -380,7 +367,7 @@
             /* denoise is not applied to intra residual, so DST can be ignored */
             int cat = sizeIdx + 4 * !isLuma;
             int numCoeff = 1 << log2TrSize * 2;
-            denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
+            primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
             m_nr->count[cat]++;
         }
     }
diff -r d43e9a6a7cce -r 41de0838d88b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Aug 13 00:19:31 2014 -0500
+++ b/source/common/x86/asm-primitives.cpp	Wed Aug 13 17:29:06 2014 -0700
@@ -1550,6 +1550,7 @@
         p.idct[IDCT_4x4] = x265_idct4_sse2;
         p.idct[IDST_4x4] = x265_idst4_sse2;
         p.planecopy_sp = x265_downShift_16_sse2;
+        p.denoiseDct = x265_denoise_dct_sse2;
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
@@ -1585,6 +1586,7 @@
         p.dct[DST_4x4] = x265_dst4_ssse3;
         p.idct[IDCT_8x8] = x265_idct8_ssse3;
         p.count_nonzero = x265_count_nonzero_ssse3;
+        p.denoiseDct = x265_denoise_dct_ssse3;
     }
     if (cpuMask & X265_CPU_SSE4)
     {
@@ -1687,6 +1689,7 @@
 
         p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_avx;
         p.ssim_end_4 = x265_pixel_ssim_end4_avx;
+        p.denoiseDct = x265_denoise_dct_avx;
     }
     if (cpuMask & X265_CPU_XOP)
     {
@@ -1712,6 +1715,7 @@
         p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
         p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
         p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
+        p.denoiseDct = x265_denoise_dct_avx2;
     }
 #endif // if HIGH_BIT_DEPTH
 }
diff -r d43e9a6a7cce -r 41de0838d88b source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Wed Aug 13 00:19:31 2014 -0500
+++ b/source/common/x86/dct8.asm	Wed Aug 13 17:29:06 2014 -0700
@@ -874,5 +874,125 @@
 
     ; restore origin stack pointer
     mov         rsp, [rsp + 16*mmsize]
+    RET
 
+
+; TODO: split into two versions after coeff_t is changed
+%if 1 ;HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
+;-----------------------------------------------------------------------------
+%macro DENOISE_DCT 0
+cglobal denoise_dct, 4,4,6
+    pxor      m5, m5                        ; m5 = 0, reference for the "> 0" compares below
+    movsxdifnidn r3, r3d
+.loop:                                      ; walks backwards from the end: r3 = coefficients remaining, two vectors of dwords per pass
+    mova      m2, [r0+r3*4-2*mmsize]        ; m2/m3 = original signed 32-bit coefficients
+    mova      m3, [r0+r3*4-1*mmsize]
+    ABSD      m0, m2                        ; presumably m0/m1 = |coef| per dword (ABSD is defined outside this patch)
+    ABSD      m1, m3
+    paddd     m4, m0, [r1+r3*4-2*mmsize]    ; running sum + magnitude
+    psubd     m0, [r2+r3*4-2*mmsize]        ; NOTE(review): offset is read as 32-bit words here, but dct8.h declares uint16_t *offset -- confirm
+    mova      [r1+r3*4-2*mmsize], m4        ; store updated sums
+    paddd     m4, m1, [r1+r3*4-1*mmsize]
+    psubd     m1, [r2+r3*4-1*mmsize]
+    mova      [r1+r3*4-1*mmsize], m4
+    pcmpgtd   m4, m0, m5                    ; mask of lanes where (magnitude - offset) > 0
+    pand      m0, m4                        ; zero every lane that did not stay positive
+    pcmpgtd   m4, m1, m5
+    pand      m1, m4
+    PSIGND    m0, m2                        ; presumably re-applies the sign of the original coefficient
+    PSIGND    m1, m3
+    mova      [r0+r3*4-2*mmsize], m0        ; write denoised coefficients back in place
+    mova      [r0+r3*4-1*mmsize], m1
+    sub      r3d, mmsize/2                  ; 2*mmsize bytes of dwords consumed = mmsize/2 coefficients
+    jg .loop
     RET
+%endmacro
+
+%if ARCH_X86_64 == 0
+INIT_MMX mmx
+DENOISE_DCT
+%endif
+INIT_XMM sse2
+DENOISE_DCT
+INIT_XMM ssse3
+DENOISE_DCT
+INIT_XMM avx
+DENOISE_DCT
+INIT_YMM avx2
+DENOISE_DCT
+
+%else ; !HIGH_BIT_DEPTH
+
+;-----------------------------------------------------------------------------
+; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
+;-----------------------------------------------------------------------------
+%macro DENOISE_DCT 0
+cglobal denoise_dct, 4,4,7
+    pxor      m6, m6                        ; m6 = 0, interleaved below to widen words to dwords
+    movsxdifnidn r3, r3d
+.loop:                                      ; NOTE: this variant is in the %else arm of "%if 1", so it is not assembled as the patch stands
+    mova      m2, [r0+r3*2-2*mmsize]        ; m2/m3 = original signed 16-bit coefficients
+    mova      m3, [r0+r3*2-1*mmsize]
+    ABSW      m0, m2, sign                  ; presumably m0/m1 = |coef| per word (ABSW is defined outside this patch)
+    ABSW      m1, m3, sign
+    psubusw   m4, m0, [r2+r3*2-2*mmsize]    ; unsigned saturating subtract: magnitude - offset, floored at 0
+    psubusw   m5, m1, [r2+r3*2-1*mmsize]
+    PSIGNW    m4, m2                        ; presumably re-applies the sign of the original coefficient
+    PSIGNW    m5, m3
+    mova      [r0+r3*2-2*mmsize], m4        ; write denoised coefficients back in place
+    mova      [r0+r3*2-1*mmsize], m5
+    punpcklwd m2, m0, m6                    ; interleave magnitudes with zero words -> 32-bit values
+    punpcklwd m3, m1, m6
+    punpckhwd m0, m6
+    punpckhwd m1, m6
+    paddd     m2, [r1+r3*4-4*mmsize]        ; add the widened magnitudes to the 32-bit sums
+    paddd     m0, [r1+r3*4-3*mmsize]
+    paddd     m3, [r1+r3*4-2*mmsize]
+    paddd     m1, [r1+r3*4-1*mmsize]
+    mova      [r1+r3*4-4*mmsize], m2
+    mova      [r1+r3*4-3*mmsize], m0
+    mova      [r1+r3*4-2*mmsize], m3
+    mova      [r1+r3*4-1*mmsize], m1
+    sub       r3, mmsize                    ; 2*mmsize bytes of words consumed = mmsize coefficients
+    jg .loop
+%if (mmsize == 8)
+    EMMS
+%endif
+    RET
+%endmacro
+
+%if ARCH_X86_64 == 0
+INIT_MMX mmx
+DENOISE_DCT
+%endif
+INIT_XMM sse2
+DENOISE_DCT
+INIT_XMM ssse3
+DENOISE_DCT
+INIT_XMM avx
+DENOISE_DCT
+
+INIT_YMM avx2
+cglobal denoise_dct, 4,4,4
+    pxor      m3, m3                        ; m3 = 0, interleaved below to widen words to dwords
+    movsxdifnidn r3, r3d
+.loop:                                      ; NOTE: this variant is in the %else arm of "%if 1", so it is not assembled as the patch stands
+    mova      m1, [r0+r3*2-mmsize]          ; m1 = original signed 16-bit coefficients
+    pabsw     m0, m1                        ; m0 = |coef|
+    psubusw   m2, m0, [r2+r3*2-mmsize]      ; unsigned saturating subtract: magnitude - offset, floored at 0
+    vpermq    m0, m0, q3120                 ; presumably swaps the two middle qwords so the per-128-bit-lane unpacks below yield dwords in memory order
+    psignw    m2, m1                        ; give the result the sign of the original coefficient
+    mova [r0+r3*2-mmsize], m2
+    punpcklwd m1, m0, m3                    ; interleave magnitudes with zero words -> 32-bit values
+    punpckhwd m0, m3
+    paddd     m1, [r1+r3*4-2*mmsize]        ; add the widened magnitudes to the 32-bit sums
+    paddd     m0, [r1+r3*4-1*mmsize]
+    mova      [r1+r3*4-2*mmsize], m1
+    mova      [r1+r3*4-1*mmsize], m0
+    sub       r3, mmsize/2                  ; mmsize bytes of words consumed = mmsize/2 coefficients
+    jg .loop
+    RET
+
+%endif ; !HIGH_BIT_DEPTH
diff -r d43e9a6a7cce -r 41de0838d88b source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Wed Aug 13 00:19:31 2014 -0500
+++ b/source/common/x86/dct8.h	Wed Aug 13 17:29:06 2014 -0700
@@ -31,4 +31,10 @@
 void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
 void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride);
 
+void x265_denoise_dct_mmx(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
+void x265_denoise_dct_sse2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
+void x265_denoise_dct_ssse3(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
+void x265_denoise_dct_avx(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
+void x265_denoise_dct_avx2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
+
 #endif // ifndef X265_DCT8_H
diff -r d43e9a6a7cce -r 41de0838d88b source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp	Wed Aug 13 00:19:31 2014 -0500
+++ b/source/encoder/frameencoder.cpp	Wed Aug 13 17:29:06 2014 -0700
@@ -73,6 +73,9 @@
     X265_FREE(m_substreamSizes);
     m_frameFilter.destroy();
 
+    if (m_nr)
+        X265_FREE(m_nr);
+
     // wait for worker thread to exit
     stop();
 }
@@ -116,8 +119,9 @@
     }
 
     memset(&m_frameStats, 0, sizeof(m_frameStats));
-    memset(&m_nr, 0, sizeof(m_nr));
-    m_nr.bNoiseReduction = !!m_param->noiseReduction;
+    m_nr = X265_MALLOC(NoiseReduction, 1);
+    memset(m_nr, 0, sizeof(NoiseReduction));
+    m_nr->bNoiseReduction = !!m_param->noiseReduction;
 
     start();
     return ok;
@@ -640,7 +644,7 @@
 
     // setup thread-local data
     TComPicYuv* fenc = m_frame->getPicYuvOrg();
-    tld.m_cuCoder.m_quant.m_nr = &m_nr;
+    tld.m_cuCoder.m_quant.m_nr = m_nr;
     tld.m_cuCoder.m_me.setSourcePlane(fenc->getLumaAddr(), fenc->getStride());
     tld.m_cuCoder.m_log = &tld.m_cuCoder.m_sliceTypeLog[m_frame->m_picSym->m_slice->m_sliceType];
     setLambda(m_frame->m_picSym->m_slice->m_sliceQp, tld);
@@ -872,7 +876,7 @@
 /* DCT-domain noise reduction / adaptive deadzone from libavcodec */
 void FrameEncoder::noiseReductionUpdate()
 {
-    if (!m_nr.bNoiseReduction)
+    if (!m_nr->bNoiseReduction)
         return;
 
     static const uint32_t maxBlocksPerTrSize[4] = {1 << 18, 1 << 16, 1 << 14, 1 << 12};
@@ -882,24 +886,24 @@
         int trSize = cat & 3;
         int coefCount = 1 << ((trSize + 2) * 2);
 
-        if (m_nr.count[cat] > maxBlocksPerTrSize[trSize])
+        if (m_nr->count[cat] > maxBlocksPerTrSize[trSize])
         {
             for (int i = 0; i < coefCount; i++)
-                m_nr.residualSum[cat][i] >>= 1;
-            m_nr.count[cat] >>= 1;
+                m_nr->residualSum[cat][i] >>= 1;
+            m_nr->count[cat] >>= 1;
         }
 
-        uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_nr.count[cat];
+        uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_nr->count[cat];
 
         for (int i = 0; i < coefCount; i++)
         {
-            uint64_t value = scaledCount + m_nr.residualSum[cat][i] / 2;
-            uint64_t denom = m_nr.residualSum[cat][i] + 1;
-            m_nr.offsetDenoise[cat][i] = (uint16_t)(value / denom);
+            uint64_t value = scaledCount + m_nr->residualSum[cat][i] / 2;
+            uint64_t denom = m_nr->residualSum[cat][i] + 1;
+            m_nr->offsetDenoise[cat][i] = (uint16_t)(value / denom);
         }
 
         // Don't denoise DC coefficients
-        m_nr.offsetDenoise[cat][0] = 0;
+        m_nr->offsetDenoise[cat][0] = 0;
     }
 }
 
diff -r d43e9a6a7cce -r 41de0838d88b source/encoder/frameencoder.h
--- a/source/encoder/frameencoder.h	Wed Aug 13 00:19:31 2014 -0500
+++ b/source/encoder/frameencoder.h	Wed Aug 13 17:29:06 2014 -0700
@@ -142,7 +142,7 @@
     Bitstream                m_bs;
     Bitstream*               m_outStreams;
     uint32_t*                m_substreamSizes;
-    NoiseReduction           m_nr;
+    NoiseReduction*          m_nr;
     NALList                  m_nalList;
     ThreadLocalData          m_tld; /* for --no-wpp */