[x265] cleanup m_sharedPredTransformSkip[]
Steve Borho
steve at borho.org
Thu Mar 6 07:04:43 CET 2014
On Tue, Mar 4, 2014 at 4:40 AM, Satoshi Nakagawa <nakagawa424 at oki.com> wrote:
> # HG changeset patch
> # User Satoshi Nakagawa <nakagawa424 at oki.com>
> # Date 1393929339 -32400
> # Tue Mar 04 19:35:39 2014 +0900
> # Node ID 7a61566806f691ddff84cbbc42801f6c2d46df88
> # Parent 3cbde0b893e34e5770cc311d3f4b6fe064c27774
> cleanup m_sharedPredTransformSkip[]
>
> The NEW_CALCRECON macro is a TODO marker for asm experts to optimize register assignment.
Sorry I haven't responded to this yet; I would like Min to review it
before I push it.
> diff -r 3cbde0b893e3 -r 7a61566806f6 source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp Mon Mar 03 13:37:35 2014 -0600
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp Tue Mar 04 19:35:39 2014 +0900
> @@ -63,7 +63,6 @@
> m_qtTempTUCoeffCr = NULL;
> for (int i = 0; i < 3; i++)
> {
> - m_sharedPredTransformSkip[i] = NULL;
> m_qtTempTransformSkipFlag[i] = NULL;
> m_qtTempCbf[i] = NULL;
> }
> @@ -96,7 +95,6 @@
> for (uint32_t i = 0; i < 3; ++i)
> {
> X265_FREE(m_qtTempCbf[i]);
> - X265_FREE(m_sharedPredTransformSkip[i]);
> X265_FREE(m_qtTempTransformSkipFlag[i]);
> }
>
> @@ -153,9 +151,6 @@
> CHECKED_MALLOC(m_qtTempTransformSkipFlag[1], uint8_t, numPartitions);
> CHECKED_MALLOC(m_qtTempTransformSkipFlag[2], uint8_t, numPartitions);
>
> - CHECKED_MALLOC(m_sharedPredTransformSkip[0], pixel, MAX_TS_WIDTH * MAX_TS_HEIGHT);
> - CHECKED_MALLOC(m_sharedPredTransformSkip[1], pixel, MAX_TS_WIDTH * MAX_TS_HEIGHT);
> - CHECKED_MALLOC(m_sharedPredTransformSkip[2], pixel, MAX_TS_WIDTH * MAX_TS_HEIGHT);
> CHECKED_MALLOC(m_qtTempTUCoeffY, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT);
> CHECKED_MALLOC(m_qtTempTUCoeffCb, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT);
> CHECKED_MALLOC(m_qtTempTUCoeffCr, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT);
> @@ -414,7 +409,6 @@
> Pel* fenc = fencYuv->getLumaAddr(absPartIdx);
> Pel* pred = predYuv->getLumaAddr(absPartIdx);
> int16_t* residual = resiYuv->getLumaAddr(absPartIdx);
> - Pel* recon = predYuv->getLumaAddr(absPartIdx);
> int chFmt = cu->getChromaFormat();
> int part = partitionFromSizes(width, height);
>
> @@ -439,15 +433,6 @@
> cu->getPattern()->initAdiPattern(cu, absPartIdx, trDepth, m_predBuf, m_predBufStride, m_predBufHeight, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt);
> //===== get prediction signal =====
> predIntraLumaAng(lumaPredMode, pred, stride, width);
> - // save prediction
> - if (default0Save1Load2 == 1)
> - {
> - primitives.luma_copy_pp[part](m_sharedPredTransformSkip[0], width, pred, stride);
> - }
> - }
> - else
> - {
> - primitives.luma_copy_pp[part](pred, stride, m_sharedPredTransformSkip[0], width);
> }
>
> //===== get residual signal =====
> @@ -491,12 +476,19 @@
> primitives.blockfill_s[size](resiTmp, stride, 0);
> }
>
> + assert(width <= 32);
> +#if NEW_CALCRECON
> //===== reconstruction =====
> - assert(width <= 32);
> + primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
> + //===== update distortion =====
> + outDist += primitives.sse_sp[part](reconQt, MAX_CU_SIZE, fenc, stride);
> +#else
> + ALIGN_VAR_32(pixel, recon[MAX_CU_SIZE * MAX_CU_SIZE]);
> + //===== reconstruction =====
> primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
> -
> //===== update distortion =====
> outDist += primitives.sse_pp[part](fenc, stride, recon, stride);
> +#endif
> }
>
> void TEncSearch::xIntraCodingChromaBlk(TComDataCU* cu,
> @@ -534,7 +526,6 @@
> Pel* fenc = (chromaId > 0 ? fencYuv->getCrAddr(absPartIdx) : fencYuv->getCbAddr(absPartIdx));
> Pel* pred = (chromaId > 0 ? predYuv->getCrAddr(absPartIdx) : predYuv->getCbAddr(absPartIdx));
> int16_t* residual = (chromaId > 0 ? resiYuv->getCrAddr(absPartIdx) : resiYuv->getCbAddr(absPartIdx));
> - Pel* recon = (chromaId > 0 ? predYuv->getCrAddr(absPartIdx) : predYuv->getCbAddr(absPartIdx));
>
> uint32_t qtlayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
> uint32_t numCoeffPerInc = (cu->getSlice()->getSPS()->getMaxCUWidth() * cu->getSlice()->getSPS()->getMaxCUHeight() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1)) >> (m_hChromaShift + m_vChromaShift);
> @@ -561,19 +552,6 @@
>
> //===== get prediction signal =====
> predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, width, height, chFmt);
> -
> - // save prediction
> - if (default0Save1Load2 == 1)
> - {
> - Pel* predbuf = m_sharedPredTransformSkip[1 + chromaId];
> - primitives.luma_copy_pp[part](predbuf, width, pred, stride);
> - }
> - }
> - else
> - {
> - // load prediction
> - Pel* predbuf = m_sharedPredTransformSkip[1 + chromaId];
> - primitives.luma_copy_pp[part](pred, stride, predbuf, width);
> }
>
> //===== get residual signal =====
> @@ -627,12 +605,20 @@
> }
> }
>
> + assert(((intptr_t)residual & (width - 1)) == 0);
> + assert(width <= 32);
> +#if NEW_CALCRECON
> //===== reconstruction =====
> - assert(((uint32_t)(size_t)residual & (width - 1)) == 0);
> - assert(width <= 32);
> + primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
> + //===== update distortion =====
> + uint32_t dist = primitives.sse_sp[part](reconQt, reconQtStride, fenc, stride);
> +#else
> + ALIGN_VAR_32(pixel, recon[MAX_CU_SIZE * MAX_CU_SIZE]);
> + //===== reconstruction =====
> primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
> //===== update distortion =====
> uint32_t dist = primitives.sse_pp[part](fenc, stride, recon, stride);
> +#endif
> if (ttype == TEXT_CHROMA_U)
> {
> outDist += m_rdCost->scaleChromaDistCb(dist);
> diff -r 3cbde0b893e3 -r 7a61566806f6 source/Lib/TLibEncoder/TEncSearch.h
> --- a/source/Lib/TLibEncoder/TEncSearch.h Mon Mar 03 13:37:35 2014 -0600
> +++ b/source/Lib/TLibEncoder/TEncSearch.h Tue Mar 04 19:35:39 2014 +0900
> @@ -84,7 +84,6 @@
> protected:
>
> ShortYuv* m_qtTempShortYuv;
> - pixel* m_sharedPredTransformSkip[3];
>
> TCoeff** m_qtTempCoeffY;
> TCoeff** m_qtTempCoeffCb;
> diff -r 3cbde0b893e3 -r 7a61566806f6 source/common/pixel.cpp
> --- a/source/common/pixel.cpp Mon Mar 03 13:37:35 2014 -0600
> +++ b/source/common/pixel.cpp Tue Mar 04 19:35:39 2014 +0900
> @@ -460,20 +460,33 @@
> }
>
> template<int blockSize>
> -void calcRecons(pixel* pred, int16_t* residual, pixel* recon, int16_t* recqt, pixel* recipred, int stride, int qtstride, int ipredstride)
> +void calcRecons(pixel* pred, int16_t* residual,
> +#if NEW_CALCRECON
> + pixel*,
> +#else
> + pixel* recon,
> +#endif
> + int16_t* recqt, pixel* recipred, int stride, int qtstride, int ipredstride)
> {
> for (int uiY = 0; uiY < blockSize; uiY++)
> {
> for (int uiX = 0; uiX < blockSize; uiX++)
> {
> +#if NEW_CALCRECON
> + recqt[uiX] = (int16_t)ClipY(static_cast<int16_t>(pred[uiX]) + residual[uiX]);
> + recipred[uiX] = (pixel)recqt[uiX];
> +#else
> recon[uiX] = (pixel)ClipY(static_cast<int16_t>(pred[uiX]) + residual[uiX]);
> recqt[uiX] = (int16_t)recon[uiX];
> recipred[uiX] = recon[uiX];
> +#endif
> }
>
> pred += stride;
> residual += stride;
> +#if !NEW_CALCRECON
> recon += stride;
> +#endif
> recqt += qtstride;
> recipred += ipredstride;
> }
> diff -r 3cbde0b893e3 -r 7a61566806f6 source/common/primitives.h
> --- a/source/common/primitives.h Mon Mar 03 13:37:35 2014 -0600
> +++ b/source/common/primitives.h Tue Mar 04 19:35:39 2014 +0900
> @@ -34,6 +34,8 @@
> #include "cpu.h"
> #include "x265.h"
>
> +#define NEW_CALCRECON 1 // TODO: remove recon[] arg
> +
> #define FENC_STRIDE 64
>
> #define NUM_INTRA_MODE 35
> diff -r 3cbde0b893e3 -r 7a61566806f6 source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm Mon Mar 03 13:37:35 2014 -0600
> +++ b/source/common/x86/pixel-util8.asm Tue Mar 04 19:35:39 2014 +0900
> @@ -57,6 +57,7 @@
> cextern pw_2000
> cextern pw_pixel_max
>
> +%define NEW_CALCRECON 1 ; TODO: remove recon[] arg
> ;-----------------------------------------------------------------------------
> ; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
> ;-----------------------------------------------------------------------------
> @@ -101,7 +102,9 @@
> CLIPW m0, m4, m5
>
> ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
> movh [t2], m0
> +%endif
> movh [t4], m0
> %if ARCH_X86_64 == 0
> add t4, t7
> @@ -113,7 +116,9 @@
> movhps [t4 + t7], m0
> lea t4, [t4 + t7 * 2]
> %endif
> +%if NEW_CALCRECON == 0
> movhps [t2 + t5], m0
> +%endif
>
> ; store recqt[]
> movh [t3], m0
> @@ -123,7 +128,9 @@
>
> lea t0, [t0 + t5 * 2]
> lea t1, [t1 + t5 * 2]
> +%if NEW_CALCRECON == 0
> lea t2, [t2 + t5 * 2]
> +%endif
>
> dec t8d
> jnz .loop
> @@ -165,11 +172,15 @@
> packuswb m1, m1
>
> ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
> movd [t2], m1
> +%endif
> movd [t4], m1
> add t4, t7
> pshufd m2, m1, 1
> +%if NEW_CALCRECON == 0
> movd [t2 + t5], m2
> +%endif
> movd [t4], m2
> add t4, t7
>
> @@ -182,7 +193,9 @@
>
> lea t0, [t0 + t5 * 2]
> lea t1, [t1 + t5 * 4]
> +%if NEW_CALCRECON == 0
> lea t2, [t2 + t5 * 2]
> +%endif
>
> dec t8d
> jnz .loop
> @@ -231,8 +244,10 @@
> CLIPW m1, m4, m5
>
> ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
> movu [t2], m0
> movu [t2 + t5], m1
> +%endif
> movu [t4], m0
> %if ARCH_X86_64 == 0
> add t4, t7
> @@ -253,7 +268,9 @@
>
> lea t0, [t0 + t5 * 2]
> lea t1, [t1 + t5 * 2]
> +%if NEW_CALCRECON == 0
> lea t2, [t2 + t5 * 2]
> +%endif
>
> dec t8d
> jnz .loop
> @@ -295,8 +312,10 @@
> packuswb m1, m2
>
> ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
> movlps [t2], m1
> movhps [t2 + t5], m1
> +%endif
> movlps [t4], m1
> %if ARCH_X86_64 == 0
> add t4, t7
> @@ -317,7 +336,9 @@
>
> lea t0, [t0 + t5 * 2]
> lea t1, [t1 + t5 * 4]
> +%if NEW_CALCRECON == 0
> lea t2, [t2 + t5 * 2]
> +%endif
>
> dec t8d
> jnz .loop
> @@ -367,8 +388,10 @@
> CLIPW m1, m4, m5
>
> ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
> movu [t2], m0
> movu [t2 + 16], m1
> +%endif
> movu [t4], m0
> movu [t4 + 16], m1
> %if ARCH_X86_64 == 0
> @@ -391,8 +414,10 @@
> CLIPW m1, m4, m5
>
> ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
> movu [t2 + t5], m0
> movu [t2 + t5 + 16], m1
> +%endif
> %if ARCH_X86_64 == 0
> movu [t4], m0
> movu [t4 + 16], m1
> @@ -411,7 +436,9 @@
>
> lea t0, [t0 + t5 * 2]
> lea t1, [t1 + t5 * 2]
> +%if NEW_CALCRECON == 0
> lea t2, [t2 + t5 * 2]
> +%endif
>
> dec t8d
> jnz .loop
> @@ -451,7 +478,9 @@
> packuswb m1, m2
>
> ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
> movu [t2], m1
> +%endif
> movu [t4], m1
>
> ; store recqt[]
> @@ -464,7 +493,9 @@
> add t4, t7
> add t0, t5
> lea t1, [t1 + t5 * 2]
> +%if NEW_CALCRECON == 0
> add t2, t5
> +%endif
>
> dec t8d
> jnz .loop
> @@ -513,8 +544,10 @@
> CLIPW m1, m4, m5
>
> ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
> movu [t2], m0
> movu [t2 + 16], m1
> +%endif
> movu [t4], m0
> movu [t4 + 16], m1
>
> @@ -532,8 +565,10 @@
> CLIPW m1, m4, m5
>
> ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
> movu [t2 + 32], m0
> movu [t2 + 48], m1
> +%endif
> movu [t4 + 32], m0
> movu [t4 + 48], m1
> %if ARCH_X86_64 == 0
> @@ -556,8 +591,10 @@
> CLIPW m1, m4, m5
>
> ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
> movu [t2 + t5], m0
> movu [t2 + t5 + 16], m1
> +%endif
> %if ARCH_X86_64 == 0
> movu [t4], m0
> movu [t4 + 16], m1
> @@ -580,8 +617,10 @@
> CLIPW m1, m4, m5
>
> ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
> movu [t2 + t5 + 32], m0
> movu [t2 + t5 + 48], m1
> +%endif
> %if ARCH_X86_64 == 0
> movu [t4 + 32], m0
> movu [t4 + 48], m1
> @@ -600,7 +639,9 @@
>
> lea t0, [t0 + t5 * 2]
> lea t1, [t1 + t5 * 2]
> +%if NEW_CALCRECON == 0
> lea t2, [t2 + t5 * 2]
> +%endif
>
> dec t8d
> jnz .loop
> @@ -648,8 +689,10 @@
> packuswb m3, m4
>
> ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
> movu [t2], m1
> movu [t2 + 16], m3
> +%endif
> movu [t4], m1
> movu [t4 + 16], m3
>
> @@ -667,7 +710,9 @@
> add t4, t7
> add t0, t5
> lea t1, [t1 + t5 * 2]
> +%if NEW_CALCRECON == 0
> add t2, t5
> +%endif
>
> dec t8d
> jnz .loop
> diff -r 3cbde0b893e3 -r 7a61566806f6 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp Mon Mar 03 13:37:35 2014 -0600
> +++ b/source/test/pixelharness.cpp Tue Mar 04 19:35:39 2014 +0900
> @@ -351,10 +351,12 @@
> {
> return false;
> }
> +#if !NEW_CALCRECON
> if (memcmp(ref_reco, opt_reco, 64 * 64 * sizeof(pixel)))
> {
> return false;
> }
> +#endif
> if (memcmp(ref_pred, opt_pred, 64 * 64 * sizeof(pixel)))
> {
> return false;
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel mailing list