[x265] cleanup m_sharedPredTransformSkip[]
Satoshi Nakagawa
nakagawa424 at oki.com
Tue Mar 4 11:40:23 CET 2014
# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1393929339 -32400
# Tue Mar 04 19:35:39 2014 +0900
# Node ID 7a61566806f691ddff84cbbc42801f6c2d46df88
# Parent 3cbde0b893e34e5770cc311d3f4b6fe064c27774
cleanup m_sharedPredTransformSkip[]
The NEW_CALCRECON macro is a TODO marker for asm experts: it flags the now-unused recon[] argument of calcrecon, which can be removed so that register assignment can be optimized.
diff -r 3cbde0b893e3 -r 7a61566806f6 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Mon Mar 03 13:37:35 2014 -0600
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Tue Mar 04 19:35:39 2014 +0900
@@ -63,7 +63,6 @@
m_qtTempTUCoeffCr = NULL;
for (int i = 0; i < 3; i++)
{
- m_sharedPredTransformSkip[i] = NULL;
m_qtTempTransformSkipFlag[i] = NULL;
m_qtTempCbf[i] = NULL;
}
@@ -96,7 +95,6 @@
for (uint32_t i = 0; i < 3; ++i)
{
X265_FREE(m_qtTempCbf[i]);
- X265_FREE(m_sharedPredTransformSkip[i]);
X265_FREE(m_qtTempTransformSkipFlag[i]);
}
@@ -153,9 +151,6 @@
CHECKED_MALLOC(m_qtTempTransformSkipFlag[1], uint8_t, numPartitions);
CHECKED_MALLOC(m_qtTempTransformSkipFlag[2], uint8_t, numPartitions);
- CHECKED_MALLOC(m_sharedPredTransformSkip[0], pixel, MAX_TS_WIDTH * MAX_TS_HEIGHT);
- CHECKED_MALLOC(m_sharedPredTransformSkip[1], pixel, MAX_TS_WIDTH * MAX_TS_HEIGHT);
- CHECKED_MALLOC(m_sharedPredTransformSkip[2], pixel, MAX_TS_WIDTH * MAX_TS_HEIGHT);
CHECKED_MALLOC(m_qtTempTUCoeffY, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT);
CHECKED_MALLOC(m_qtTempTUCoeffCb, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT);
CHECKED_MALLOC(m_qtTempTUCoeffCr, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT);
@@ -414,7 +409,6 @@
Pel* fenc = fencYuv->getLumaAddr(absPartIdx);
Pel* pred = predYuv->getLumaAddr(absPartIdx);
int16_t* residual = resiYuv->getLumaAddr(absPartIdx);
- Pel* recon = predYuv->getLumaAddr(absPartIdx);
int chFmt = cu->getChromaFormat();
int part = partitionFromSizes(width, height);
@@ -439,15 +433,6 @@
cu->getPattern()->initAdiPattern(cu, absPartIdx, trDepth, m_predBuf, m_predBufStride, m_predBufHeight, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt);
//===== get prediction signal =====
predIntraLumaAng(lumaPredMode, pred, stride, width);
- // save prediction
- if (default0Save1Load2 == 1)
- {
- primitives.luma_copy_pp[part](m_sharedPredTransformSkip[0], width, pred, stride);
- }
- }
- else
- {
- primitives.luma_copy_pp[part](pred, stride, m_sharedPredTransformSkip[0], width);
}
//===== get residual signal =====
@@ -491,12 +476,19 @@
primitives.blockfill_s[size](resiTmp, stride, 0);
}
+ assert(width <= 32);
+#if NEW_CALCRECON
//===== reconstruction =====
- assert(width <= 32);
+ primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
+ //===== update distortion =====
+ outDist += primitives.sse_sp[part](reconQt, MAX_CU_SIZE, fenc, stride);
+#else
+ ALIGN_VAR_32(pixel, recon[MAX_CU_SIZE * MAX_CU_SIZE]);
+ //===== reconstruction =====
primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
-
//===== update distortion =====
outDist += primitives.sse_pp[part](fenc, stride, recon, stride);
+#endif
}
void TEncSearch::xIntraCodingChromaBlk(TComDataCU* cu,
@@ -534,7 +526,6 @@
Pel* fenc = (chromaId > 0 ? fencYuv->getCrAddr(absPartIdx) : fencYuv->getCbAddr(absPartIdx));
Pel* pred = (chromaId > 0 ? predYuv->getCrAddr(absPartIdx) : predYuv->getCbAddr(absPartIdx));
int16_t* residual = (chromaId > 0 ? resiYuv->getCrAddr(absPartIdx) : resiYuv->getCbAddr(absPartIdx));
- Pel* recon = (chromaId > 0 ? predYuv->getCrAddr(absPartIdx) : predYuv->getCbAddr(absPartIdx));
uint32_t qtlayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
uint32_t numCoeffPerInc = (cu->getSlice()->getSPS()->getMaxCUWidth() * cu->getSlice()->getSPS()->getMaxCUHeight() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1)) >> (m_hChromaShift + m_vChromaShift);
@@ -561,19 +552,6 @@
//===== get prediction signal =====
predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, width, height, chFmt);
-
- // save prediction
- if (default0Save1Load2 == 1)
- {
- Pel* predbuf = m_sharedPredTransformSkip[1 + chromaId];
- primitives.luma_copy_pp[part](predbuf, width, pred, stride);
- }
- }
- else
- {
- // load prediction
- Pel* predbuf = m_sharedPredTransformSkip[1 + chromaId];
- primitives.luma_copy_pp[part](pred, stride, predbuf, width);
}
//===== get residual signal =====
@@ -627,12 +605,20 @@
}
}
+ assert(((intptr_t)residual & (width - 1)) == 0);
+ assert(width <= 32);
+#if NEW_CALCRECON
//===== reconstruction =====
- assert(((uint32_t)(size_t)residual & (width - 1)) == 0);
- assert(width <= 32);
+ primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
+ //===== update distortion =====
+ uint32_t dist = primitives.sse_sp[part](reconQt, reconQtStride, fenc, stride);
+#else
+ ALIGN_VAR_32(pixel, recon[MAX_CU_SIZE * MAX_CU_SIZE]);
+ //===== reconstruction =====
primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
//===== update distortion =====
uint32_t dist = primitives.sse_pp[part](fenc, stride, recon, stride);
+#endif
if (ttype == TEXT_CHROMA_U)
{
outDist += m_rdCost->scaleChromaDistCb(dist);
diff -r 3cbde0b893e3 -r 7a61566806f6 source/Lib/TLibEncoder/TEncSearch.h
--- a/source/Lib/TLibEncoder/TEncSearch.h Mon Mar 03 13:37:35 2014 -0600
+++ b/source/Lib/TLibEncoder/TEncSearch.h Tue Mar 04 19:35:39 2014 +0900
@@ -84,7 +84,6 @@
protected:
ShortYuv* m_qtTempShortYuv;
- pixel* m_sharedPredTransformSkip[3];
TCoeff** m_qtTempCoeffY;
TCoeff** m_qtTempCoeffCb;
diff -r 3cbde0b893e3 -r 7a61566806f6 source/common/pixel.cpp
--- a/source/common/pixel.cpp Mon Mar 03 13:37:35 2014 -0600
+++ b/source/common/pixel.cpp Tue Mar 04 19:35:39 2014 +0900
@@ -460,20 +460,33 @@
}
template<int blockSize>
-void calcRecons(pixel* pred, int16_t* residual, pixel* recon, int16_t* recqt, pixel* recipred, int stride, int qtstride, int ipredstride)
+void calcRecons(pixel* pred, int16_t* residual,
+#if NEW_CALCRECON
+ pixel*,
+#else
+ pixel* recon,
+#endif
+ int16_t* recqt, pixel* recipred, int stride, int qtstride, int ipredstride)
{
for (int uiY = 0; uiY < blockSize; uiY++)
{
for (int uiX = 0; uiX < blockSize; uiX++)
{
+#if NEW_CALCRECON
+ recqt[uiX] = (int16_t)ClipY(static_cast<int16_t>(pred[uiX]) + residual[uiX]);
+ recipred[uiX] = (pixel)recqt[uiX];
+#else
recon[uiX] = (pixel)ClipY(static_cast<int16_t>(pred[uiX]) + residual[uiX]);
recqt[uiX] = (int16_t)recon[uiX];
recipred[uiX] = recon[uiX];
+#endif
}
pred += stride;
residual += stride;
+#if !NEW_CALCRECON
recon += stride;
+#endif
recqt += qtstride;
recipred += ipredstride;
}
diff -r 3cbde0b893e3 -r 7a61566806f6 source/common/primitives.h
--- a/source/common/primitives.h Mon Mar 03 13:37:35 2014 -0600
+++ b/source/common/primitives.h Tue Mar 04 19:35:39 2014 +0900
@@ -34,6 +34,8 @@
#include "cpu.h"
#include "x265.h"
+#define NEW_CALCRECON 1 // TODO: remove recon[] arg
+
#define FENC_STRIDE 64
#define NUM_INTRA_MODE 35
diff -r 3cbde0b893e3 -r 7a61566806f6 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Mon Mar 03 13:37:35 2014 -0600
+++ b/source/common/x86/pixel-util8.asm Tue Mar 04 19:35:39 2014 +0900
@@ -57,6 +57,7 @@
cextern pw_2000
cextern pw_pixel_max
+%define NEW_CALCRECON 1 ; TODO: remove recon[] arg
;-----------------------------------------------------------------------------
; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
;-----------------------------------------------------------------------------
@@ -101,7 +102,9 @@
CLIPW m0, m4, m5
; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
movh [t2], m0
+%endif
movh [t4], m0
%if ARCH_X86_64 == 0
add t4, t7
@@ -113,7 +116,9 @@
movhps [t4 + t7], m0
lea t4, [t4 + t7 * 2]
%endif
+%if NEW_CALCRECON == 0
movhps [t2 + t5], m0
+%endif
; store recqt[]
movh [t3], m0
@@ -123,7 +128,9 @@
lea t0, [t0 + t5 * 2]
lea t1, [t1 + t5 * 2]
+%if NEW_CALCRECON == 0
lea t2, [t2 + t5 * 2]
+%endif
dec t8d
jnz .loop
@@ -165,11 +172,15 @@
packuswb m1, m1
; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
movd [t2], m1
+%endif
movd [t4], m1
add t4, t7
pshufd m2, m1, 1
+%if NEW_CALCRECON == 0
movd [t2 + t5], m2
+%endif
movd [t4], m2
add t4, t7
@@ -182,7 +193,9 @@
lea t0, [t0 + t5 * 2]
lea t1, [t1 + t5 * 4]
+%if NEW_CALCRECON == 0
lea t2, [t2 + t5 * 2]
+%endif
dec t8d
jnz .loop
@@ -231,8 +244,10 @@
CLIPW m1, m4, m5
; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
movu [t2], m0
movu [t2 + t5], m1
+%endif
movu [t4], m0
%if ARCH_X86_64 == 0
add t4, t7
@@ -253,7 +268,9 @@
lea t0, [t0 + t5 * 2]
lea t1, [t1 + t5 * 2]
+%if NEW_CALCRECON == 0
lea t2, [t2 + t5 * 2]
+%endif
dec t8d
jnz .loop
@@ -295,8 +312,10 @@
packuswb m1, m2
; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
movlps [t2], m1
movhps [t2 + t5], m1
+%endif
movlps [t4], m1
%if ARCH_X86_64 == 0
add t4, t7
@@ -317,7 +336,9 @@
lea t0, [t0 + t5 * 2]
lea t1, [t1 + t5 * 4]
+%if NEW_CALCRECON == 0
lea t2, [t2 + t5 * 2]
+%endif
dec t8d
jnz .loop
@@ -367,8 +388,10 @@
CLIPW m1, m4, m5
; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
movu [t2], m0
movu [t2 + 16], m1
+%endif
movu [t4], m0
movu [t4 + 16], m1
%if ARCH_X86_64 == 0
@@ -391,8 +414,10 @@
CLIPW m1, m4, m5
; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
movu [t2 + t5], m0
movu [t2 + t5 + 16], m1
+%endif
%if ARCH_X86_64 == 0
movu [t4], m0
movu [t4 + 16], m1
@@ -411,7 +436,9 @@
lea t0, [t0 + t5 * 2]
lea t1, [t1 + t5 * 2]
+%if NEW_CALCRECON == 0
lea t2, [t2 + t5 * 2]
+%endif
dec t8d
jnz .loop
@@ -451,7 +478,9 @@
packuswb m1, m2
; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
movu [t2], m1
+%endif
movu [t4], m1
; store recqt[]
@@ -464,7 +493,9 @@
add t4, t7
add t0, t5
lea t1, [t1 + t5 * 2]
+%if NEW_CALCRECON == 0
add t2, t5
+%endif
dec t8d
jnz .loop
@@ -513,8 +544,10 @@
CLIPW m1, m4, m5
; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
movu [t2], m0
movu [t2 + 16], m1
+%endif
movu [t4], m0
movu [t4 + 16], m1
@@ -532,8 +565,10 @@
CLIPW m1, m4, m5
; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
movu [t2 + 32], m0
movu [t2 + 48], m1
+%endif
movu [t4 + 32], m0
movu [t4 + 48], m1
%if ARCH_X86_64 == 0
@@ -556,8 +591,10 @@
CLIPW m1, m4, m5
; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
movu [t2 + t5], m0
movu [t2 + t5 + 16], m1
+%endif
%if ARCH_X86_64 == 0
movu [t4], m0
movu [t4 + 16], m1
@@ -580,8 +617,10 @@
CLIPW m1, m4, m5
; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
movu [t2 + t5 + 32], m0
movu [t2 + t5 + 48], m1
+%endif
%if ARCH_X86_64 == 0
movu [t4 + 32], m0
movu [t4 + 48], m1
@@ -600,7 +639,9 @@
lea t0, [t0 + t5 * 2]
lea t1, [t1 + t5 * 2]
+%if NEW_CALCRECON == 0
lea t2, [t2 + t5 * 2]
+%endif
dec t8d
jnz .loop
@@ -648,8 +689,10 @@
packuswb m3, m4
; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
movu [t2], m1
movu [t2 + 16], m3
+%endif
movu [t4], m1
movu [t4 + 16], m3
@@ -667,7 +710,9 @@
add t4, t7
add t0, t5
lea t1, [t1 + t5 * 2]
+%if NEW_CALCRECON == 0
add t2, t5
+%endif
dec t8d
jnz .loop
diff -r 3cbde0b893e3 -r 7a61566806f6 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Mar 03 13:37:35 2014 -0600
+++ b/source/test/pixelharness.cpp Tue Mar 04 19:35:39 2014 +0900
@@ -351,10 +351,12 @@
{
return false;
}
+#if !NEW_CALCRECON
if (memcmp(ref_reco, opt_reco, 64 * 64 * sizeof(pixel)))
{
return false;
}
+#endif
if (memcmp(ref_pred, opt_pred, 64 * 64 * sizeof(pixel)))
{
return false;
More information about the x265-devel mailing list