[x265] asm: cleanup unused sub_ps and add_ps
Satoshi Nakagawa
nakagawa424 at oki.com
Thu Aug 7 12:29:06 CEST 2014
# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1407407018 -32400
# Thu Aug 07 19:23:38 2014 +0900
# Node ID 5cf494d8591b20ce203b225600c6da50eeb312b8
# Parent 619633a933f6502bff56038997e33e7e2eba75f9
asm: cleanup unused sub_ps and add_ps
sub_ps and add_ps are used at the CU or TU level, not at the PU level, so only the square block sizes are needed: the tables shrink from NUM_LUMA_PARTITIONS to NUM_SQUARE_BLOCKS and are indexed by log2Size - 2 instead of partitionFromLog2Size().
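In outline, the indexing change looks like this minimal C++ sketch (an illustration only, not code from the patch; the enum mirrors the NUM_SQUARE_BLOCKS layout in primitives.h, and sizeIdxFromLog2Size is a hypothetical helper name):

#include <cassert>

// Square CU/TU sizes run 4x4 .. 64x64, i.e. log2Size in [2, 6], so the
// per-size function tables need only five entries (NUM_SQUARE_BLOCKS)
// and can be indexed directly by log2Size - 2, replacing the old
// partitionFromLog2Size() lookup into the larger NUM_LUMA_PARTITIONS table.
enum { BLOCK_4x4, BLOCK_8x8, BLOCK_16x16, BLOCK_32x32, BLOCK_64x64, NUM_SQUARE_BLOCKS };

// Hypothetical helper, shown only to make the mapping explicit.
static inline int sizeIdxFromLog2Size(int log2Size)
{
    assert(log2Size >= 2 && log2Size <= 6);
    return log2Size - 2; // e.g. log2Size == 5 (32x32) -> BLOCK_32x32 == 3
}

For 4:2:0 and 4:4:4 the chroma blocks stay square, so the chroma sub_ps/add_ps tables share the same index; for 4:2:2 the patch registers the W x 2W sizes (4x8 .. 32x64) under the matching CHROMA422 indices, as the CHROMA_PIXELSUB_422 macro in the diff below shows.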
diff -r 619633a933f6 -r 5cf494d8591b source/Lib/TLibCommon/TComYuv.cpp
--- a/source/Lib/TLibCommon/TComYuv.cpp Wed Aug 06 17:03:38 2014 -0500
+++ b/source/Lib/TLibCommon/TComYuv.cpp Thu Aug 07 19:23:38 2014 +0900
@@ -167,13 +167,11 @@
void TComYuv::addClip(TComYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t log2Size)
{
- int part = partitionFromLog2Size(log2Size);
-
- addClipLuma(srcYuv0, srcYuv1, part);
- addClipChroma(srcYuv0, srcYuv1, part);
+ addClipLuma(srcYuv0, srcYuv1, log2Size);
+ addClipChroma(srcYuv0, srcYuv1, log2Size);
}
-void TComYuv::addClipLuma(TComYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t part)
+void TComYuv::addClipLuma(TComYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t log2Size)
{
pixel* src0 = srcYuv0->getLumaAddr();
int16_t* src1 = srcYuv1->getLumaAddr();
@@ -183,14 +181,14 @@
uint32_t src1Stride = srcYuv1->m_width;
uint32_t dststride = getStride();
- primitives.luma_add_ps[part](dst, dststride, src0, src1, src0Stride, src1Stride);
+ primitives.luma_add_ps[log2Size - 2](dst, dststride, src0, src1, src0Stride, src1Stride);
}
-void TComYuv::addClipChroma(TComYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t part)
+void TComYuv::addClipChroma(TComYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t log2Size)
{
pixel* srcU0 = srcYuv0->getCbAddr();
+ pixel* srcV0 = srcYuv0->getCrAddr();
int16_t* srcU1 = srcYuv1->getCbAddr();
- pixel* srcV0 = srcYuv0->getCrAddr();
int16_t* srcV1 = srcYuv1->getCrAddr();
pixel* dstU = getCbAddr();
pixel* dstV = getCrAddr();
@@ -199,8 +197,8 @@
uint32_t src1Stride = srcYuv1->m_cwidth;
uint32_t dststride = getCStride();
- primitives.chroma[m_csp].add_ps[part](dstU, dststride, srcU0, srcU1, src0Stride, src1Stride);
- primitives.chroma[m_csp].add_ps[part](dstV, dststride, srcV0, srcV1, src0Stride, src1Stride);
+ primitives.chroma[m_csp].add_ps[log2Size - 2](dstU, dststride, srcU0, srcU1, src0Stride, src1Stride);
+ primitives.chroma[m_csp].add_ps[log2Size - 2](dstV, dststride, srcV0, srcV1, src0Stride, src1Stride);
}
void TComYuv::addAvg(ShortYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t partUnitIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma)
diff -r 619633a933f6 -r 5cf494d8591b source/Lib/TLibCommon/TComYuv.h
--- a/source/Lib/TLibCommon/TComYuv.h Wed Aug 06 17:03:38 2014 -0500
+++ b/source/Lib/TLibCommon/TComYuv.h Thu Aug 07 19:23:38 2014 +0900
@@ -136,8 +136,8 @@
// Clip(srcYuv0 + srcYuv1) -> m_apiBuf
void addClip(TComYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t log2Size);
- void addClipLuma(TComYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t part);
- void addClipChroma(TComYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t part);
+ void addClipLuma(TComYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t log2Size);
+ void addClipChroma(TComYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t log2Size);
// (srcYuv0 + srcYuv1)/2 for YUV partition
void addAvg(ShortYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t partUnitIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma);
diff -r 619633a933f6 -r 5cf494d8591b source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Wed Aug 06 17:03:38 2014 -0500
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Thu Aug 07 19:23:38 2014 +0900
@@ -864,16 +864,14 @@
//--- set coded block flag ---
cu->setCbfSubParts((numSig ? 1 : 0) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
- int part = partitionFromLog2Size(log2TrSize);
-
if (numSig)
{
//--- inverse transform ---
m_quant.invtransformNxN(cu->getCUTransquantBypass(absPartIdx), residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTransformSkip, numSig);
// Generate Recon
- primitives.luma_add_ps[part](recon, stride, pred, residual, stride, stride);
- primitives.luma_copy_pp[part](reconIPred, reconIPredStride, recon, stride);
+ primitives.luma_add_ps[sizeIdx](recon, stride, pred, residual, stride, stride);
+ primitives.square_copy_pp[sizeIdx](reconIPred, reconIPredStride, recon, stride);
}
else
{
@@ -882,8 +880,8 @@
#endif
// Generate Recon
- primitives.luma_copy_pp[part](recon, stride, pred, stride);
- primitives.luma_copy_pp[part](reconIPred, reconIPredStride, pred, stride);
+ primitives.square_copy_pp[sizeIdx](recon, stride, pred, stride);
+ primitives.square_copy_pp[sizeIdx](reconIPred, reconIPredStride, pred, stride);
}
}
@@ -1283,8 +1281,7 @@
uint32_t tuSize = 1 << log2TrSizeC;
uint32_t stride = fencYuv->getCStride();
const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
- int sizeIdxC = log2TrSizeC - 2;
- int part = partitionFromLog2Size(log2TrSizeC);
+ const int sizeIdxC = log2TrSizeC - 2;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
@@ -1344,7 +1341,7 @@
//===== reconstruction =====
// use square primitives
- primitives.chroma[X265_CSP_I444].add_ps[part](recon, stride, pred, residual, stride, stride);
+ primitives.chroma[X265_CSP_I444].add_ps[sizeIdxC](recon, stride, pred, residual, stride, stride);
primitives.square_copy_pp[sizeIdxC](reconIPred, reconIPredStride, recon, stride);
}
else
@@ -2820,7 +2817,7 @@
uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getStride();
uint32_t stride = fencYuv->getStride();
//===== reconstruction =====
- primitives.luma_add_ps[partSize](reconIPred, reconIPredStride, pred, curResiY, stride, strideResiY);
+ primitives.luma_add_ps[sizeIdx](reconIPred, reconIPredStride, pred, curResiY, stride, strideResiY);
int size = log2TrSize - 2;
nonZeroPsyEnergyY = m_rdCost.psyCost(size, fencYuv->getLumaAddr(absPartIdx), fencYuv->getStride(),
cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder), cu->m_pic->getPicYuvRec()->getStride());
@@ -2919,8 +2916,8 @@
uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getCStride();
uint32_t stride = fencYuv->getCStride();
//===== reconstruction =====
- primitives.luma_add_ps[partSizeC](reconIPred, reconIPredStride, pred, curResiU, stride, strideResiC);
int size = log2TrSizeC - 2;
+ primitives.luma_add_ps[size](reconIPred, reconIPredStride, pred, curResiU, stride, strideResiC);
nonZeroPsyEnergyU = m_rdCost.psyCost(size, fencYuv->getCbAddr(absPartIdxC), fencYuv->getCStride(),
cu->m_pic->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder),
cu->m_pic->getPicYuvRec()->getCStride());
@@ -3001,8 +2998,8 @@
uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getCStride();
uint32_t stride = fencYuv->getCStride();
//===== reconstruction =====
- primitives.luma_add_ps[partSizeC](reconIPred, reconIPredStride, pred, curResiV, stride, strideResiC);
int size = log2TrSizeC - 2;
+ primitives.luma_add_ps[size](reconIPred, reconIPredStride, pred, curResiV, stride, strideResiC);
nonZeroPsyEnergyV = m_rdCost.psyCost(size, fencYuv->getCrAddr(absPartIdxC), fencYuv->getCStride(),
cu->m_pic->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder),
cu->m_pic->getPicYuvRec()->getCStride());
@@ -3107,8 +3104,8 @@
uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getStride();
uint32_t stride = fencYuv->getStride();
//===== reconstruction =====
- primitives.luma_add_ps[partSize](reconIPred, reconIPredStride, pred, tsResiY, stride, trSize);
int size = log2TrSize - 2;
+ primitives.luma_add_ps[size](reconIPred, reconIPredStride, pred, tsResiY, stride, trSize);
nonZeroPsyEnergyY = m_rdCost.psyCost(size, fencYuv->getLumaAddr(absPartIdx), fencYuv->getStride(),
cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder),
cu->m_pic->getPicYuvRec()->getStride());
@@ -3195,8 +3192,8 @@
uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getCStride();
uint32_t stride = fencYuv->getCStride();
//===== reconstruction =====
- primitives.luma_add_ps[partSizeC](reconIPred, reconIPredStride, pred, tsResiU, stride, trSizeC);
int size = log2TrSizeC - 2;
+ primitives.luma_add_ps[size](reconIPred, reconIPredStride, pred, tsResiU, stride, trSizeC);
nonZeroPsyEnergyU = m_rdCost.psyCost(size, fencYuv->getCbAddr(absPartIdxC), fencYuv->getCStride(),
cu->m_pic->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder), cu->m_pic->getPicYuvRec()->getCStride());
singleCostU = m_rdCost.calcPsyRdCost(nonZeroDistU, singleBitsComp[TEXT_CHROMA_U][tuIterator.section], nonZeroPsyEnergyU);
@@ -3236,8 +3233,8 @@
uint32_t reconIPredStride = cu->m_pic->getPicYuvRec()->getCStride();
uint32_t stride = fencYuv->getCStride();
//===== reconstruction =====
- primitives.luma_add_ps[partSizeC](reconIPred, reconIPredStride, pred, tsResiV, stride, trSizeC);
int size = log2TrSizeC - 2;
+ primitives.luma_add_ps[size](reconIPred, reconIPredStride, pred, tsResiV, stride, trSizeC);
nonZeroPsyEnergyV = m_rdCost.psyCost(size, fencYuv->getCrAddr(absPartIdxC), fencYuv->getCStride(),
cu->m_pic->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder), cu->m_pic->getPicYuvRec()->getCStride());
singleCostV = m_rdCost.calcPsyRdCost(nonZeroDistV, singleBitsComp[TEXT_CHROMA_V][tuIterator.section], nonZeroPsyEnergyV);
diff -r 619633a933f6 -r 5cf494d8591b source/common/pixel.cpp
--- a/source/common/pixel.cpp Wed Aug 06 17:03:38 2014 -0500
+++ b/source/common/pixel.cpp Thu Aug 07 19:23:38 2014 +0900
@@ -1002,37 +1002,47 @@
p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
p.chroma[X265_CSP_I420].copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
- p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>; \
- p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
- p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
+ p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
#define CHROMA_422(W, H) \
p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg<W, H>; \
p.chroma[X265_CSP_I422].copy_pp[CHROMA422_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
p.chroma[X265_CSP_I422].copy_ps[CHROMA422_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
- p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c<W, H>; \
- p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
- p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
+ p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
#define CHROMA_444(W, H) \
p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
p.chroma[X265_CSP_I444].copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
- p.chroma[X265_CSP_I444].copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>; \
- p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
- p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
+ p.chroma[X265_CSP_I444].copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
#define LUMA(W, H) \
p.luma_addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
- p.luma_copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>; \
+ p.luma_copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
+
+#define LUMA_PIXELSUB(W, H) \
p.luma_sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
p.luma_add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
+#define CHROMA_PIXELSUB_420(W, H) \
+ p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
+ p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
+
+#define CHROMA_PIXELSUB_422(W, H) \
+ p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
+ p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
+
+#define CHROMA_PIXELSUB_444(W, H) \
+ p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
+ p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
+
+
+
LUMA(4, 4);
LUMA(8, 8);
CHROMA_420(4, 4);
@@ -1083,6 +1093,24 @@
LUMA(16, 64);
CHROMA_420(8, 32);
+ LUMA_PIXELSUB(4, 4);
+ LUMA_PIXELSUB(8, 8);
+ LUMA_PIXELSUB(16, 16);
+ LUMA_PIXELSUB(32, 32);
+ LUMA_PIXELSUB(64, 64);
+ CHROMA_PIXELSUB_420(4, 4)
+ CHROMA_PIXELSUB_420(8, 8)
+ CHROMA_PIXELSUB_420(16, 16)
+ CHROMA_PIXELSUB_420(32, 32)
+ CHROMA_PIXELSUB_422(4, 8)
+ CHROMA_PIXELSUB_422(8, 16)
+ CHROMA_PIXELSUB_422(16, 32)
+ CHROMA_PIXELSUB_422(32, 64)
+ CHROMA_PIXELSUB_444(8, 8)
+ CHROMA_PIXELSUB_444(16, 16)
+ CHROMA_PIXELSUB_444(32, 32)
+ CHROMA_PIXELSUB_444(64, 64)
+
CHROMA_422(4, 8);
CHROMA_422(4, 4);
CHROMA_422(2, 8);
diff -r 619633a933f6 -r 5cf494d8591b source/common/primitives.cpp
--- a/source/common/primitives.cpp Wed Aug 06 17:03:38 2014 -0500
+++ b/source/common/primitives.cpp Thu Aug 07 19:23:38 2014 +0900
@@ -76,9 +76,13 @@
p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i];
p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i];
p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i];
+ p.chroma[X265_CSP_I444].addAvg[i] = p.luma_addAvg[i];
+ }
+
+ for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
+ {
p.chroma[X265_CSP_I444].add_ps[i] = p.luma_add_ps[i];
p.chroma[X265_CSP_I444].sub_ps[i] = p.luma_sub_ps[i];
- p.chroma[X265_CSP_I444].addAvg[i] = p.luma_addAvg[i];
}
for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
diff -r 619633a933f6 -r 5cf494d8591b source/common/primitives.h
--- a/source/common/primitives.h Wed Aug 06 17:03:38 2014 -0500
+++ b/source/common/primitives.h Thu Aug 07 19:23:38 2014 +0900
@@ -225,8 +225,8 @@
copy_sp_t luma_copy_sp[NUM_LUMA_PARTITIONS];
copy_ps_t luma_copy_ps[NUM_LUMA_PARTITIONS];
copy_ss_t luma_copy_ss[NUM_LUMA_PARTITIONS];
- pixel_sub_ps_t luma_sub_ps[NUM_LUMA_PARTITIONS];
- pixel_add_ps_t luma_add_ps[NUM_LUMA_PARTITIONS];
+ pixel_sub_ps_t luma_sub_ps[NUM_SQUARE_BLOCKS];
+ pixel_add_ps_t luma_add_ps[NUM_SQUARE_BLOCKS];
copy_pp_t square_copy_pp[NUM_SQUARE_BLOCKS];
copy_sp_t square_copy_sp[NUM_SQUARE_BLOCKS];
copy_ps_t square_copy_ps[NUM_SQUARE_BLOCKS];
@@ -287,13 +287,13 @@
filter_ss_t filter_vss[NUM_LUMA_PARTITIONS];
filter_pp_t filter_hpp[NUM_LUMA_PARTITIONS];
filter_hps_t filter_hps[NUM_LUMA_PARTITIONS];
+ addAvg_t addAvg[NUM_LUMA_PARTITIONS];
copy_pp_t copy_pp[NUM_LUMA_PARTITIONS];
copy_sp_t copy_sp[NUM_LUMA_PARTITIONS];
copy_ps_t copy_ps[NUM_LUMA_PARTITIONS];
copy_ss_t copy_ss[NUM_LUMA_PARTITIONS];
- pixel_sub_ps_t sub_ps[NUM_LUMA_PARTITIONS];
- pixel_add_ps_t add_ps[NUM_LUMA_PARTITIONS];
- addAvg_t addAvg[NUM_LUMA_PARTITIONS];
+ pixel_sub_ps_t sub_ps[NUM_SQUARE_BLOCKS];
+ pixel_add_ps_t add_ps[NUM_SQUARE_BLOCKS];
} chroma[4]; // X265_CSP_COUNT - do not want to include x265.h here
};
diff -r 619633a933f6 -r 5cf494d8591b source/common/shortyuv.cpp
--- a/source/common/shortyuv.cpp Wed Aug 06 17:03:38 2014 -0500
+++ b/source/common/shortyuv.cpp Thu Aug 07 19:23:38 2014 +0900
@@ -87,20 +87,20 @@
void ShortYuv::subtract(TComYuv* srcYuv0, TComYuv* srcYuv1, uint32_t log2Size)
{
- int part = partitionFromLog2Size(log2Size);
+ const int sizeIdx = log2Size - 2;
pixel* srcY0 = srcYuv0->getLumaAddr();
pixel* srcY1 = srcYuv1->getLumaAddr();
- primitives.luma_sub_ps[part](getLumaAddr(), m_width, srcY0, srcY1, srcYuv0->getStride(), srcYuv1->getStride());
+ primitives.luma_sub_ps[sizeIdx](getLumaAddr(), m_width, srcY0, srcY1, srcYuv0->getStride(), srcYuv1->getStride());
pixel* srcU0 = srcYuv0->getCbAddr();
pixel* srcU1 = srcYuv1->getCbAddr();
- primitives.chroma[m_csp].sub_ps[part](getCbAddr(), m_cwidth, srcU0, srcU1, srcYuv0->getCStride(), srcYuv1->getCStride());
+ primitives.chroma[m_csp].sub_ps[sizeIdx](getCbAddr(), m_cwidth, srcU0, srcU1, srcYuv0->getCStride(), srcYuv1->getCStride());
pixel* srcV0 = srcYuv0->getCrAddr();
pixel* srcV1 = srcYuv1->getCrAddr();
- primitives.chroma[m_csp].sub_ps[part](getCrAddr(), m_cwidth, srcV0, srcV1, srcYuv0->getCStride(), srcYuv1->getCStride());
+ primitives.chroma[m_csp].sub_ps[sizeIdx](getCrAddr(), m_cwidth, srcV0, srcV1, srcYuv0->getCStride(), srcYuv1->getCStride());
}
void ShortYuv::copyPartToPartLuma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t log2Size)
diff -r 619633a933f6 -r 5cf494d8591b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Aug 06 17:03:38 2014 -0500
+++ b/source/common/x86/asm-primitives.cpp Thu Aug 07 19:23:38 2014 +0900
@@ -701,29 +701,9 @@
#define CHROMA_PIXELSUB_PS(cpu) \
SETUP_CHROMA_PIXELSUB(4, 4, cpu); \
- SETUP_CHROMA_PIXELSUB(4, 2, cpu); \
- SETUP_CHROMA_PIXELSUB(2, 4, cpu); \
SETUP_CHROMA_PIXELSUB(8, 8, cpu); \
- SETUP_CHROMA_PIXELSUB(8, 4, cpu); \
- SETUP_CHROMA_PIXELSUB(4, 8, cpu); \
- SETUP_CHROMA_PIXELSUB(8, 6, cpu); \
- SETUP_CHROMA_PIXELSUB(6, 8, cpu); \
- SETUP_CHROMA_PIXELSUB(8, 2, cpu); \
- SETUP_CHROMA_PIXELSUB(2, 8, cpu); \
SETUP_CHROMA_PIXELSUB(16, 16, cpu); \
- SETUP_CHROMA_PIXELSUB(16, 8, cpu); \
- SETUP_CHROMA_PIXELSUB(8, 16, cpu); \
- SETUP_CHROMA_PIXELSUB(16, 12, cpu); \
- SETUP_CHROMA_PIXELSUB(12, 16, cpu); \
- SETUP_CHROMA_PIXELSUB(16, 4, cpu); \
- SETUP_CHROMA_PIXELSUB(4, 16, cpu); \
- SETUP_CHROMA_PIXELSUB(32, 32, cpu); \
- SETUP_CHROMA_PIXELSUB(32, 16, cpu); \
- SETUP_CHROMA_PIXELSUB(16, 32, cpu); \
- SETUP_CHROMA_PIXELSUB(32, 24, cpu); \
- SETUP_CHROMA_PIXELSUB(24, 32, cpu); \
- SETUP_CHROMA_PIXELSUB(32, 8, cpu); \
- SETUP_CHROMA_PIXELSUB(8, 32, cpu);
+ SETUP_CHROMA_PIXELSUB(32, 32, cpu);
#define SETUP_CHROMA_PIXELSUB_422(W, H, cpu) \
p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
@@ -731,29 +711,9 @@
#define CHROMA_PIXELSUB_PS_422(cpu) \
SETUP_CHROMA_PIXELSUB_422(4, 8, cpu); \
- SETUP_CHROMA_PIXELSUB_422(4, 4, cpu); \
- SETUP_CHROMA_PIXELSUB_422(2, 8, cpu); \
SETUP_CHROMA_PIXELSUB_422(8, 16, cpu); \
- SETUP_CHROMA_PIXELSUB_422(8, 8, cpu); \
- SETUP_CHROMA_PIXELSUB_422(4, 16, cpu); \
- SETUP_CHROMA_PIXELSUB_422(8, 12, cpu); \
- SETUP_CHROMA_PIXELSUB_422(6, 16, cpu); \
- SETUP_CHROMA_PIXELSUB_422(8, 4, cpu); \
- SETUP_CHROMA_PIXELSUB_422(2, 16, cpu); \
SETUP_CHROMA_PIXELSUB_422(16, 32, cpu); \
- SETUP_CHROMA_PIXELSUB_422(16, 16, cpu); \
- SETUP_CHROMA_PIXELSUB_422(8, 32, cpu); \
- SETUP_CHROMA_PIXELSUB_422(16, 24, cpu); \
- SETUP_CHROMA_PIXELSUB_422(12, 32, cpu); \
- SETUP_CHROMA_PIXELSUB_422(16, 8, cpu); \
- SETUP_CHROMA_PIXELSUB_422(4, 32, cpu); \
- SETUP_CHROMA_PIXELSUB_422(32, 64, cpu); \
- SETUP_CHROMA_PIXELSUB_422(32, 32, cpu); \
- SETUP_CHROMA_PIXELSUB_422(16, 64, cpu); \
- SETUP_CHROMA_PIXELSUB_422(32, 48, cpu); \
- SETUP_CHROMA_PIXELSUB_422(24, 64, cpu); \
- SETUP_CHROMA_PIXELSUB_422(32, 16, cpu); \
- SETUP_CHROMA_PIXELSUB_422(8, 64, cpu);
+ SETUP_CHROMA_PIXELSUB_422(32, 64, cpu);
#define LUMA_FILTERS(cpu) \
SETUP_LUMA_FUNC_DEF(4, 4, cpu); \
@@ -785,29 +745,9 @@
#define LUMA_PIXELSUB(cpu) \
SETUP_LUMA_SUB_FUNC_DEF(4, 4, cpu); \
SETUP_LUMA_SUB_FUNC_DEF(8, 8, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(8, 4, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(4, 8, cpu); \
SETUP_LUMA_SUB_FUNC_DEF(16, 16, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(16, 8, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(8, 16, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(16, 12, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(12, 16, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(16, 4, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(4, 16, cpu); \
SETUP_LUMA_SUB_FUNC_DEF(32, 32, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(32, 16, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(16, 32, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(32, 24, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(24, 32, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(32, 8, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(8, 32, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(64, 64, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(64, 32, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(32, 64, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(64, 48, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(48, 64, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(64, 16, cpu); \
- SETUP_LUMA_SUB_FUNC_DEF(16, 64, cpu);
+ SETUP_LUMA_SUB_FUNC_DEF(64, 64, cpu);
#define LUMA_SP_FILTERS(cpu) \
SETUP_LUMA_SP_FUNC_DEF(4, 4, cpu); \
diff -r 619633a933f6 -r 5cf494d8591b source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Wed Aug 06 17:03:38 2014 -0500
+++ b/source/common/x86/ipfilter16.asm Thu Aug 07 19:23:38 2014 +0900
@@ -1672,8 +1672,7 @@
packssdw m0, m1
packssdw m2, m3
pxor m5, m5
- CLIPW m0, m5, m7
- CLIPW m2, m5, m7
+ CLIPW2 m0, m2, m5, m7
%endif
movh [r2], m0
@@ -2125,8 +2124,7 @@
packssdw m0, m1
packssdw m2, m3
pxor m5, m5
- CLIPW m0, m5, m7
- CLIPW m2, m5, m7
+ CLIPW2 m0, m2, m5, m7
%endif
movh [r2], m0
@@ -2294,8 +2292,7 @@
packssdw m2, m3
pxor m5, m5
mova m6, [pw_pixel_max]
- CLIPW m0, m5, m6
- CLIPW m2, m5, m6
+ CLIPW2 m0, m2, m5, m6
%endif
movu [r2], m0
@@ -2512,8 +2509,7 @@
packssdw m2, m3
pxor m1, m1
- CLIPW m0, m1, [pw_pixel_max]
- CLIPW m2, m1, [pw_pixel_max]
+ CLIPW2 m0, m2, m1, [pw_pixel_max]
movh [r2], m0
movhps [r2 + r3], m0
@@ -2700,8 +2696,7 @@
packssdw m2, m3
pxor m1, m1
- CLIPW m0, m1, [pw_pixel_max]
- CLIPW m2, m1, [pw_pixel_max]
+ CLIPW2 m0, m2, m1, [pw_pixel_max]
movh [r2], m0
movhps [r2 + r3], m0
diff -r 619633a933f6 -r 5cf494d8591b source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Wed Aug 06 17:03:38 2014 -0500
+++ b/source/common/x86/pixel-util.h Thu Aug 07 19:23:38 2014 +0900
@@ -70,86 +70,25 @@
#define CHROMA_PIXELSUB_DEF(cpu) \
SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 4, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 2, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(2, 4, cpu); \
SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 8, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 4, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 8, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 6, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(6, 8, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 2, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(2, 8, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 16, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 8, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 16, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 12, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(12, 16, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 4, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 16, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 32, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 16, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 32, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 24, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(24, 32, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 8, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 32, cpu);
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 16, cpu); \
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 32, cpu);
#define CHROMA_PIXELSUB_DEF_422(cpu) \
SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 8, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 4, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(2, 8, cpu); \
SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 16, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 8, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 16, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 12, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(6, 16, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 4, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(2, 16, cpu); \
SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 32, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 16, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 32, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 24, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(12, 32, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 8, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 32, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 64, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 32, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 64, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 48, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(24, 64, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 16, cpu); \
- SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 64, cpu);
+ SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 64, cpu);
#define SETUP_LUMA_PIXELSUB_PS_FUNC(W, H, cpu) \
void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t * dest, intptr_t destride, pixel * src0, pixel * src1, intptr_t srcstride0, intptr_t srcstride1); \
void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel * dest, intptr_t destride, pixel * src0, int16_t * scr1, intptr_t srcStride0, intptr_t srcStride1);
#define LUMA_PIXELSUB_DEF(cpu) \
- SETUP_LUMA_PIXELSUB_PS_FUNC(4, 4, cpu); \
SETUP_LUMA_PIXELSUB_PS_FUNC(8, 8, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(8, 4, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(4, 8, cpu); \
SETUP_LUMA_PIXELSUB_PS_FUNC(16, 16, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(16, 8, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(8, 16, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(16, 12, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(12, 16, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(16, 4, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(4, 16, cpu); \
SETUP_LUMA_PIXELSUB_PS_FUNC(32, 32, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(32, 16, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(16, 32, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(32, 24, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(24, 32, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(32, 8, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(8, 32, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(64, 64, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(64, 32, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(32, 64, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(64, 48, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(48, 64, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(64, 16, cpu); \
- SETUP_LUMA_PIXELSUB_PS_FUNC(16, 64, cpu);
+ SETUP_LUMA_PIXELSUB_PS_FUNC(64, 64, cpu);
CHROMA_PIXELSUB_DEF(_sse4);
LUMA_PIXELSUB_DEF(_sse4);
diff -r 619633a933f6 -r 5cf494d8591b source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Aug 06 17:03:38 2014 -0500
+++ b/source/common/x86/pixel-util8.asm Thu Aug 07 19:23:38 2014 +0900
@@ -137,7 +137,7 @@
; store recqt[]
punpcklbw m1, m0
- movlps [r2], m1
+ movh [r2], m1
movhps [r2 + r5], m1
lea r0, [r0 + r4 * 2]
@@ -178,8 +178,7 @@
movu m3, [r1 + r4]
paddw m0, m2
paddw m1, m3
- CLIPW m0, m4, m5
- CLIPW m1, m4, m5
+ CLIPW2 m0, m1, m4, m5
; store recipred[]
movu [r3], m0
@@ -218,7 +217,7 @@
packuswb m1, m2
; store recon[] and recipred[]
- movlps [r3], m1
+ movh [r3], m1
movhps [r3 + r6], m1
; store recqt[]
@@ -266,8 +265,7 @@
movu m3, [r1 + 16]
paddw m0, m2
paddw m1, m3
- CLIPW m0, m4, m5
- CLIPW m1, m4, m5
+ CLIPW2 m0, m1, m4, m5
; store recipred[]
movu [r3], m0
@@ -283,8 +281,7 @@
movu m3, [r1 + r4 + 16]
paddw m0, m2
paddw m1, m3
- CLIPW m0, m4, m5
- CLIPW m1, m4, m5
+ CLIPW2 m0, m1, m4, m5
; store recon[] and recipred[]
movu [r3 + r6], m0
@@ -375,8 +372,7 @@
movu m3, [r1 + 16]
paddw m0, m2
paddw m1, m3
- CLIPW m0, m4, m5
- CLIPW m1, m4, m5
+ CLIPW2 m0, m1, m4, m5
; store recipred[]
movu [r3], m0
@@ -392,8 +388,7 @@
movu m3, [r1 + 48]
paddw m0, m2
paddw m1, m3
- CLIPW m0, m4, m5
- CLIPW m1, m4, m5
+ CLIPW2 m0, m1, m4, m5
; store recon[] and recipred[]
movu [r3 + 32], m0
@@ -410,8 +405,7 @@
movu m3, [r1 + r4 + 16]
paddw m0, m2
paddw m1, m3
- CLIPW m0, m4, m5
- CLIPW m1, m4, m5
+ CLIPW2 m0, m1, m4, m5
; store recon[] and recipred[]
movu [r3 + r6], m0
@@ -427,8 +421,7 @@
movu m3, [r1 + r4 + 48]
paddw m0, m2
paddw m1, m3
- CLIPW m0, m4, m5
- CLIPW m1, m4, m5
+ CLIPW2 m0, m1, m4, m5
; store recon[] and recipred[]
movu [r3 + r6 + 32], m0
@@ -552,7 +545,7 @@
punpckldq m3, m4
punpcklbw m3, m0
psubw m1, m3
- movlps [r2], m1
+ movh [r2], m1
movhps [r2 + r3 * 2], m1
lea r0, [r0 + r3 * 2]
lea r1, [r1 + r3 * 2]
@@ -568,7 +561,7 @@
punpckldq m3, m4
punpcklbw m3, m0
psubw m1, m3
- movlps [r2], m1
+ movh [r2], m1
movhps [r2 + r3 * 2], m1
%endif
RET
@@ -1346,9 +1339,9 @@
punpcklwd m2, m3
punpckhdq m1, m0, m2
punpckldq m0, m2
- movlps [r0], m0
- movhps [r0 + %1], m0
- movlps [r0 + 2 * %1], m1
+ movh [r0], m0
+ movhps [r0 + %1], m0
+ movh [r0 + 2 * %1], m1
lea r0, [r0 + 2 * %1]
movhps [r0 + %1], m1
%endmacro
@@ -1435,15 +1428,15 @@
punpckhdq m3, m1, m5
punpckldq m1, m5
- movlps [r0], m0
+ movh [r0], m0
movhps [r0 + %1], m0
- movlps [r0 + 2 * %1], m2
+ movh [r0 + 2 * %1], m2
lea r0, [r0 + 2 * %1]
movhps [r0 + %1], m2
- movlps [r0 + 2 * %1], m1
+ movh [r0 + 2 * %1], m1
lea r0, [r0 + 2 * %1]
movhps [r0 + %1], m1
- movlps [r0 + 2 * %1], m3
+ movh [r0 + 2 * %1], m3
lea r0, [r0 + 2 * %1]
movhps [r0 + %1], m3
@@ -2549,370 +2542,217 @@
;-----------------------------------------------------------------------------
-; void pixel_sub_ps_c_2x4(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
+; void pixel_sub_ps_4x4(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
INIT_XMM sse2
-cglobal pixel_sub_ps_2x4, 6, 6, 2
- add r1, r1
- add r4, r4
- add r5, r5
-
- movd m0, [r2]
- movd m1, [r3]
- psubw m0, m1
- movd [r0], m0
-
- movd m0, [r2 + r4]
- movd m1, [r3 + r5]
- psubw m0, m1
- movd [r0 + r1], m0
-
- movd m0, [r2 + 2 * r4]
- movd m1, [r3 + 2 * r5]
- psubw m0, m1
- movd [r0 + 2 * r1], m0
-
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
-
- movd m0, [r2 + r4]
- movd m1, [r3 + r5]
- psubw m0, m1
- movd [r0 + r1], m0
+cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1
+ add r4, r4
+ add r5, r5
+ add r1, r1
+ movh m0, [r2]
+ movh m2, [r2 + r4]
+ movh m1, [r3]
+ movh m3, [r3 + r5]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+ movh m4, [r2]
+ movh m6, [r2 + r4]
+ movh m5, [r3]
+ movh m7, [r3 + r5]
+
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+ psubw m6, m7
+
+ movh [r0], m0
+ movh [r0 + r1], m2
+ lea r0, [r0 + r1 * 2]
+ movh [r0], m4
+ movh [r0 + r1], m6
+
+ RET
%else
INIT_XMM sse4
-%if ARCH_X86_64
- cglobal pixel_sub_ps_2x4, 6, 8, 0
-
- %define tmp_r1 r1
- DECLARE_REG_TMP 6, 7
-%else
- cglobal pixel_sub_ps_2x4, 6, 7, 0, 0-4
-
- %define tmp_r1 dword [rsp]
- DECLARE_REG_TMP 6, 1
-%endif ; ARCH_X86_64
-
- add r1, r1
-
-%if ARCH_X86_64 == 0
- mov tmp_r1, r1
-
+cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1
+ add r1, r1
+ movd m0, [r2]
+ movd m2, [r2 + r4]
+ movd m1, [r3]
+ movd m3, [r3 + r5]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+ movd m4, [r2]
+ movd m6, [r2 + r4]
+ movd m5, [r3]
+ movd m7, [r3 + r5]
+ punpckldq m0, m2
+ punpckldq m1, m3
+ punpckldq m4, m6
+ punpckldq m5, m7
+ pmovzxbw m0, m0
+ pmovzxbw m1, m1
+ pmovzxbw m4, m4
+ pmovzxbw m5, m5
+
+ psubw m0, m1
+ psubw m4, m5
+
+ movh [r0], m0
+ movhps [r0 + r1], m0
+ movh [r0 + r1 * 2], m4
+ lea r0, [r0 + r1 * 2]
+ movhps [r0 + r1], m4
+
+ RET
%endif
-movzx t0d, byte [r2]
-movzx t1d, byte [r3]
-
-sub t0d, t1d
-
-mov [r0], t0w
-
-movzx t0d, byte [r2 + 1]
-movzx t1d, byte [r3 + 1]
-
-sub t0d, t1d
-
-mov [r0 + 2], t0w
-
-add r0, tmp_r1
-
-movzx t0d, byte [r2 + r4]
-movzx t1d, byte [r3 + r5]
-
-sub t0d, t1d
-
-mov [r0], t0w
-
-movzx t0d, byte [r2 + r4 + 1]
-movzx t1d, byte [r3 + r5 + 1]
-
-sub t0d, t1d
-
-mov [r0 + 2], t0w
-
-add r0, tmp_r1
-
-movzx t0d, byte [r2 + r4 * 2]
-movzx t1d, byte [r3 + r5 * 2]
-
-sub t0d, t1d
-
-mov [r0], t0w
-
-movzx t0d, byte [r2 + r4 * 2 + 1]
-movzx t1d, byte [r3 + r5 * 2 + 1]
-
-sub t0d, t1d
-
-mov [r0 + 2], t0w
-
-add r0, tmp_r1
-
-lea r2, [r2 + r4 * 2]
-lea r3, [r3 + r5 * 2]
-
-movzx t0d, byte [r2 + r4]
-movzx t1d, byte [r3 + r5]
-
-sub t0d, t1d
-
-mov [r0], t0w
-
-movzx t0d, byte [r2 + r4 + 1]
-movzx t1d, byte [r3 + r5 + 1]
-
-sub t0d, t1d
-
-mov [r0 + 2], t0w
-%endif
-RET
;-----------------------------------------------------------------------------
-; void pixel_sub_ps_c_2x8(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
+; void pixel_sub_ps_4x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
+%macro PIXELSUB_PS_W4_H4 2
%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-cglobal pixel_sub_ps_2x8, 6, 6, 2
- add r1, r1
- add r4, r4
- add r5, r5
-
- movd m0, [r2]
- movd m1, [r3]
- psubw m0, m1
- movd [r0], m0
-
- movd m0, [r2 + r4]
- movd m1, [r3 + r5]
- psubw m0, m1
- movd [r0 + r1], m0
-
- movd m0, [r2 + 2 * r4]
- movd m1, [r3 + 2 * r5]
- psubw m0, m1
- movd [r0 + 2 * r1], m0
-
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- movd m0, [r2 + r4]
- movd m1, [r3 + r5]
- psubw m0, m1
- lea r0, [r0 + 2 * r1]
- movd [r0 + r1], m0
-
- movd m0, [r2 + 2 * r4]
- movd m1, [r3 + 2 * r5]
- psubw m0, m1
- movd [r0 + 2 * r1], m0
-
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- movd m0, [r2 + r4]
- movd m1, [r3 + r5]
- psubw m0, m1
- lea r0, [r0 + 2 * r1]
- movd [r0 + r1], m0
-
- movd m0, [r2 + 2 * r4]
- movd m1, [r3 + 2 * r5]
- psubw m0, m1
- movd [r0 + 2 * r1], m0
-
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- movd m0, [r2 + r4]
- movd m1, [r3 + r5]
- psubw m0, m1
- lea r0, [r0 + 2 * r1]
- movd [r0 + r1], m0
-%else
-INIT_XMM sse4
-%if ARCH_X86_64
- cglobal pixel_sub_ps_2x8, 6, 8, 0
-
- %define tmp_r1 r1
- DECLARE_REG_TMP 6, 7
-%else
- cglobal pixel_sub_ps_2x8, 6, 7, 0, 0-4
-
- %define tmp_r1 dword [rsp]
- DECLARE_REG_TMP 6, 1
-%endif ; ARCH_X86_64
-
- add r1, r1
-
-%if ARCH_X86_64 == 0
- mov tmp_r1, r1
-
-%endif
-
- movzx t0d, byte [r2]
- movzx t1d, byte [r3]
-
- sub t0d, t1d
-
- mov [r0], t0w
- movzx t0d, byte [r2 + 1]
- movzx t1d, byte [r3 + 1]
-
- sub t0d, t1d
-
- mov [r0 + 2], t0w
-
- add r0, tmp_r1
-
- movzx t0d, byte [r2 + r4]
- movzx t1d, byte [r3 + r5]
-
- sub t0d, t1d
-
- mov [r0], t0w
- movzx t0d, byte [r2 + r4 + 1]
- movzx t1d, byte [r3 + r5 + 1]
-
- sub t0d, t1d
-
- mov [r0 + 2], t0w
-
- add r0, tmp_r1
-
- movzx t0d, byte [r2 + r4 * 2]
- movzx t1d, byte [r3 + r5 * 2]
-
- sub t0d, t1d
-
- mov [r0], t0w
- movzx t0d, byte [r2 + r4 * 2 + 1]
- movzx t1d, byte [r3 + r5 * 2 + 1]
-
- sub t0d, t1d
-
- mov [r0 + 2], t0w
-
- add r0, tmp_r1
-
- lea r2, [r2 + r4 * 2]
- lea r3, [r3 + r5 * 2]
-
- movzx t0d, byte [r2 + r4]
- movzx t1d, byte [r3 + r5]
-
- sub t0d, t1d
-
- mov [r0], t0w
- movzx t0d, byte [r2 + r4 + 1]
- movzx t1d, byte [r3 + r5 + 1]
-
- sub t0d, t1d
-
- mov [r0 + 2], t0w
-
- add r0, tmp_r1
-
- movzx t0d, byte [r2 + r4 * 2]
- movzx t1d, byte [r3 + r5 * 2]
-
- sub t0d, t1d
-
- mov [r0], t0w
- movzx t0d, byte [r2 + r4 * 2 + 1]
- movzx t1d, byte [r3 + r5 * 2 + 1]
-
- sub t0d, t1d
-
- mov [r0 + 2], t0w
-
- add r0, tmp_r1
-
- lea r2, [r2 + r4 * 2]
- lea r3, [r3 + r5 * 2]
-
- movzx t0d, byte [r2 + r4]
- movzx t1d, byte [r3 + r5]
-
- sub t0d, t1d
-
- mov [r0], t0w
- movzx t0d, byte [r2 + r4 + 1]
- movzx t1d, byte [r3 + r5 + 1]
-
- sub t0d, t1d
-
- mov [r0 + 2], t0w
-
- add r0, tmp_r1
-
- movzx t0d, byte [r2 + r4 * 2]
- movzx t1d, byte [r3 + r5 * 2]
-
- sub t0d, t1d
-
- mov [r0], t0w
- movzx t0d, byte [r2 + r4 * 2 + 1]
- movzx t1d, byte [r3 + r5 * 2 + 1]
-
- sub t0d, t1d
-
- mov [r0 + 2], t0w
-
- add r0, tmp_r1
-
- lea r2, [r2 + r4 * 2]
- lea r3, [r3 + r5 * 2]
-
- movzx t0d, byte [r2 + r4]
- movzx t1d, byte [r3 + r5]
-
- sub t0d, t1d
-
- mov [r0], t0w
- movzx t0d, byte [r2 + r4 + 1]
- movzx t1d, byte [r3 + r5 + 1]
-
- sub t0d, t1d
-
- mov [r0 + 2], t0w
-%endif
-RET
-
-;-----------------------------------------------------------------------------
-; void pixel_sub_ps_2x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
-;-----------------------------------------------------------------------------
-%macro PIXEL_SUB_PS_W2_H2 2
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-cglobal pixel_sub_ps_2x%2, 6, 7, 4, dest, destride, src0, scr1, srcStride0, srcStride1
- add r1, r1
+cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
+ mov r6d, %2/4
add r4, r4
add r5, r5
- mov r6d, %2/2
+ add r1, r1
.loop:
- movd m0, [r2]
- movd m1, [r3]
- movd m2, [r2 + r4]
- movd m3, [r3 + r5]
+ movh m0, [r2]
+ movh m2, [r2 + r4]
+ movh m1, [r3]
+ movh m3, [r3 + r5]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+ movh m4, [r2]
+ movh m6, [r2 + r4]
+ movh m5, [r3]
+ movh m7, [r3 + r5]
dec r6d
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
+
psubw m0, m1
psubw m2, m3
- movd [r0], m0
- movd [r0 + r1], m2
- lea r0, [r0 + 2 * r1]
+ psubw m4, m5
+ psubw m6, m7
+
+ movh [r0], m0
+ movh [r0 + r1], m2
+ movh [r0 + r1 * 2], m4
+ lea r0, [r0 + r1 * 2]
+ movh [r0 + r1], m6
+ lea r0, [r0 + r1 * 2]
+
jnz .loop
RET
%else
+cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
+ mov r6d, %2/4
+ add r1, r1
+.loop:
+ movd m0, [r2]
+ movd m2, [r2 + r4]
+ movd m1, [r3]
+ movd m3, [r3 + r5]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+ movd m4, [r2]
+ movd m6, [r2 + r4]
+ movd m5, [r3]
+ movd m7, [r3 + r5]
+ dec r6d
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+ punpckldq m0, m2
+ punpckldq m1, m3
+ punpckldq m4, m6
+ punpckldq m5, m7
+ pmovzxbw m0, m0
+ pmovzxbw m1, m1
+ pmovzxbw m4, m4
+ pmovzxbw m5, m5
+
+ psubw m0, m1
+ psubw m4, m5
+
+ movh [r0], m0
+ movhps [r0 + r1], m0
+ movh [r0 + r1 * 2], m4
+ lea r0, [r0 + r1 * 2]
+ movhps [r0 + r1], m4
+ lea r0, [r0 + r1 * 2]
+
+ jnz .loop
+ RET
+%endif
+%endmacro
+
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+PIXELSUB_PS_W4_H4 4, 8
+%else
INIT_XMM sse4
-cglobal pixel_sub_ps_2x%2, 6, 7, 4, dest, destride, src0, scr1, srcStride0, srcStride1
+PIXELSUB_PS_W4_H4 4, 8
+%endif
+
+
+;-----------------------------------------------------------------------------
+; void pixel_sub_ps_8x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
+;-----------------------------------------------------------------------------
+%macro PIXELSUB_PS_W8_H4 2
+%if HIGH_BIT_DEPTH
+cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
+ mov r6d, %2/4
+ add r4, r4
+ add r5, r5
+ add r1, r1
+.loop:
+ movu m0, [r2]
+ movu m2, [r2 + r4]
+ movu m1, [r3]
+ movu m3, [r3 + r5]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+ movu m4, [r2]
+ movu m6, [r2 + r4]
+ movu m5, [r3]
+ movu m7, [r3 + r5]
+ dec r6d
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+ psubw m6, m7
+
+ movu [r0], m0
+ movu [r0 + r1], m2
+ movu [r0 + r1 * 2], m4
+ lea r0, [r0 + r1 * 2]
+ movu [r0 + r1], m6
+ lea r0, [r0 + r1 * 2]
+
+ jnz .loop
+ RET
+%else
+cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
+ mov r6d, %2/4
add r1, r1
- mov r6d, %2/2
.loop:
- pinsrw m0, [r2], 0
- pinsrw m1, [r3], 0
- pinsrw m2, [r2 + r4], 0
- pinsrw m3, [r3 + r5], 0
+ movh m0, [r2]
+ movh m2, [r2 + r4]
+ movh m1, [r3]
+ movh m3, [r3 + r5]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+ movh m4, [r2]
+ movh m6, [r2 + r4]
+ movh m5, [r3]
+ movh m7, [r3 + r5]
dec r6d
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
@@ -2920,1304 +2760,529 @@
pmovzxbw m1, m1
pmovzxbw m2, m2
pmovzxbw m3, m3
+ pmovzxbw m4, m4
+ pmovzxbw m5, m5
+ pmovzxbw m6, m6
+ pmovzxbw m7, m7
+
psubw m0, m1
psubw m2, m3
- movd [r0], m0
- movd [r0 + r1], m2
+ psubw m4, m5
+ psubw m6, m7
+
+ movu [r0], m0
+ movu [r0 + r1], m2
+ movu [r0 + r1 * 2], m4
lea r0, [r0 + r1 * 2]
+ movu [r0 + r1], m6
+ lea r0, [r0 + r1 * 2]
+
jnz .loop
RET
%endif
%endmacro
-PIXEL_SUB_PS_W2_H2 2, 16
-
-;-----------------------------------------------------------------------------
-; void pixel_sub_sp_c_4x2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
-;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
INIT_XMM sse2
-cglobal pixel_sub_ps_4x2, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1
- add r1, r1
- add r4, r4
- add r5, r5
- movh m0, [r2]
- movh m1, [r3]
- movh m2, [r2 + r4]
- movh m3, [r3 + r5]
- psubw m0, m1
- psubw m2, m3
-
- movh [r0], m0
- movh [r0 + r1], m2
+PIXELSUB_PS_W8_H4 8, 8
+PIXELSUB_PS_W8_H4 8, 16
%else
INIT_XMM sse4
-cglobal pixel_sub_ps_4x2, 6, 6, 4, dest, deststride, src0, src1, srcstride0, srcstride1
-
-add r1, r1
-
-movd m0, [r2]
-movd m1, [r3]
-
-movd m2, [r2 + r4]
-movd m3, [r3 + r5]
-
-punpckldq m0, m2
-punpckldq m1, m3
-pmovzxbw m0, m0
-pmovzxbw m1, m1
-
-psubw m0, m1
-
-movlps [r0], m0
-movhps [r0 + r1], m0
-%endif
-RET
-
-;-----------------------------------------------------------------------------
-; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
-;-----------------------------------------------------------------------------
-%macro PIXELSUB_PS_W4_H4 2
-cglobal pixel_sub_ps_%1x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
- mov r6d, %2/4
- add r1, r1
-%if HIGH_BIT_DEPTH
- add r4, r4
- add r5, r5
-.loop:
- movh m0, [r2]
- movh m1, [r3]
- movh m2, [r2 + r4]
- movh m3, [r3 + r5]
- movh m4, [r2 + 2 * r4]
- movh m5, [r3 + 2 * r5]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- movh m6, [r2 + r4]
- movh m7, [r3 + r5]
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
- psubw m6, m7
-
- movh [r0], m0
- movh [r0 + r1], m2
- movh [r0 + 2 * r1], m4
- lea r0, [r0 + 2 * r1]
- movh [r0 + r1], m6
-%else
-.loop:
- movd m0, [r2]
- movd m1, [r3]
- movd m2, [r2 + r4]
- movd m3, [r3 + r5]
- movd m4, [r2 + 2 * r4]
- movd m5, [r3 + 2 * r5]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- movd m6, [r2 + r4]
- movd m7, [r3 + r5]
- punpckldq m0, m2
- punpckldq m1, m3
- punpckldq m4, m6
- punpckldq m5, m7
- pmovzxbw m0, m0
- pmovzxbw m1, m1
- pmovzxbw m4, m4
- pmovzxbw m5, m5
- psubw m0, m1
- psubw m4, m5
-
- movlps [r0], m0
- movhps [r0 + r1], m0
- movlps [r0 + 2 * r1], m4
- lea r0, [r0 + 2 * r1]
- movhps [r0 + r1], m4
-%endif
- dec r6d
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
- jnz .loop
-RET
-%endmacro
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-PIXELSUB_PS_W4_H4 4, 4
-PIXELSUB_PS_W4_H4 4, 8
-PIXELSUB_PS_W4_H4 4, 16
-;
-PIXELSUB_PS_W4_H4 4, 12
-PIXELSUB_PS_W4_H4 4, 32
-%else
-INIT_XMM sse4
-PIXELSUB_PS_W4_H4 4, 4
-PIXELSUB_PS_W4_H4 4, 8
-PIXELSUB_PS_W4_H4 4, 16
-;
-PIXELSUB_PS_W4_H4 4, 12
-PIXELSUB_PS_W4_H4 4, 32
-%endif
-;-----------------------------------------------------------------------------
-; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
-;-----------------------------------------------------------------------------
-%macro PIXELSUB_PS_W6_H4 2
-cglobal pixel_sub_ps_%1x%2, 6, 7, 2
- add r1, r1
- mov r6d, %2/4
-%if HIGH_BIT_DEPTH
- add r4, r4
- add r5, r5
-.loop:
- movu m0, [r2]
- movu m1, [r3]
- psubw m0, m1
- movh [r0], m0
- movhlps m0, m0
- movd [r0 + 8], m0
-
- movu m0, [r2 + r4]
- movu m1, [r3 + r5]
- psubw m0, m1
- movh [r0 + r1], m0
- movhlps m0, m0
- movd [r0 + r1 + 8], m0
-
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
-
- movu m0, [r2]
- movu m1, [r3]
- psubw m0, m1
- movh [r0], m0
- movhlps m0, m0
- movd [r0 + 8], m0
-
- movu m0, [r2 + r4]
- movu m1, [r3 + r5]
- psubw m0, m1
- movh [r0 + r1], m0
- movhlps m0, m0
- movd [r0 + r1 + 8], m0
-%else
-.loop:
- movh m0, [r2]
- pmovzxbw m0, m0
- movh m1, [r3]
- pmovzxbw m1, m1
- psubw m0, m1
- movh [r0], m0
- pextrd [r0 + 8], m0, 2
-
- movh m0, [r2 + r4]
- pmovzxbw m0, m0
- movh m1, [r3 + r5]
- pmovzxbw m1, m1
- psubw m0, m1
- movh [r0 + r1], m0
- pextrd [r0 + r1 + 8], m0, 2
-
- movh m0, [r2 + 2 * r4]
- pmovzxbw m0, m0
- movh m1, [r3 + 2 * r5]
- pmovzxbw m1, m1
- psubw m0, m1
- movh [r0 + 2* r1], m0
- pextrd [r0 + 2 * r1 + 8], m0, 2
-
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- movh m0, [r2 + r4]
- pmovzxbw m0, m0
- movh m1, [r3 + r5]
- pmovzxbw m1, m1
- psubw m0, m1
- lea r0, [r0 + 2 * r1]
- movh [r0 + r1], m0
- pextrd [r0 + r1 + 8], m0, 2
-%endif
- dec r6d
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
- jnz .loop
- RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-PIXELSUB_PS_W6_H4 6, 8
-;
-PIXELSUB_PS_W6_H4 6, 16
-%else
-INIT_XMM sse4
-PIXELSUB_PS_W6_H4 6, 8
-;
-PIXELSUB_PS_W6_H4 6, 16
-%endif
-;-----------------------------------------------------------------------------
-; void pixel_sub_ps_c_8x2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
-;-----------------------------------------------------------------------------
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-cglobal pixel_sub_ps_8x2, 6, 6, 4, dest, deststride, src0, src1, srcstride0, srcstride1
- add r1, r1
- add r4, r4
- add r5, r5
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + r4]
- movu m3, [r3 + r5]
-%else
-INIT_XMM sse4
-cglobal pixel_sub_ps_8x2, 6, 6, 4, dest, deststride, src0, src1, srcstride0, srcstride1
- add r1, r1
- movh m0, [r2]
- movh m1, [r3]
- pmovzxbw m0, m0
- pmovzxbw m1, m1
- movh m2, [r2 + r4]
- movh m3, [r3 + r5]
- pmovzxbw m2, m2
- pmovzxbw m3, m3
-%endif
- psubw m0, m1
- psubw m2, m3
-
- movu [r0], m0
- movu [r0 + r1], m2
- RET
-
-;-----------------------------------------------------------------------------
-; void pixel_sub_ps_c_8x6(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
-;-----------------------------------------------------------------------------
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-cglobal pixel_sub_ps_8x6, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1
- add r1, r1
- add r4, r4
- add r5, r5
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + r4]
- movu m3, [r3 + r5]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- movu m4, [r2]
- movu m5, [r3]
- movu m6, [r2 + r4]
- movu m7, [r3 + r5]
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
- psubw m6, m7
-
- movu [r0], m0
- movu [r0 + r1], m2
- lea r0, [r0 + 2 * r1]
- movu [r0], m4
- movu [r0 + r1], m6
-
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + r4]
- movu m3, [r3 + r5]
- psubw m0, m1
- psubw m2, m3
-
- lea r0, [r0 + 2 * r1]
- movu [r0], m0
- movu [r0 + r1], m2
-%else
-
-INIT_XMM sse4
-cglobal pixel_sub_ps_8x6, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1
- add r1, r1
- movh m0, [r2]
- movh m1, [r3]
- pmovzxbw m0, m0
- pmovzxbw m1, m1
- movh m2, [r2 + r4]
- movh m3, [r3 + r5]
- pmovzxbw m2, m2
- pmovzxbw m3, m3
- movh m4, [r2 + 2 * r4]
- movh m5, [r3 + 2 * r5]
- pmovzxbw m4, m4
- pmovzxbw m5, m5
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- movh m6, [r2 + r4]
- movh m7, [r3 + r5]
- pmovzxbw m6, m6
- pmovzxbw m7, m7
- movh m1, [r2 + 2 * r4]
- movh m3, [r3 + 2 * r5]
- pmovzxbw m1, m1
- pmovzxbw m3, m3
- psubw m6, m7
- psubw m1, m3
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- movh m3, [r2 + r4]
- movh m5, [r3 + r5]
- pmovzxbw m3, m3
- pmovzxbw m5, m5
- psubw m3, m5
-
- movu [r0], m0
- movu [r0 + r1], m2
- movu [r0 + 2 * r1], m4
- lea r0, [r0 + 2 * r1]
- movu [r0 + r1], m6
- movu [r0 + 2 * r1], m1
- lea r0, [r0 + 2 * r1]
- movu [r0 + r1], m3
-%endif
- RET
-
-;-----------------------------------------------------------------------------
-; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
-;-----------------------------------------------------------------------------
-%macro PIXELSUB_PS_W8_H4 2
-cglobal pixel_sub_ps_%1x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
- add r1, r1
- mov r6d, %2/4
-%if HIGH_BIT_DEPTH
- add r4, r4
- add r5, r5
-.loop:
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + r4]
- movu m3, [r3 + r5]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- movu m4, [r2]
- movu m5, [r3]
- movu m6, [r2 + r4]
- movu m7, [r3 + r5]
-%else
-
-.loop:
- movh m0, [r2]
- movh m1, [r3]
- pmovzxbw m0, m0
- pmovzxbw m1, m1
- movh m2, [r2 + r4]
- movh m3, [r3 + r5]
- pmovzxbw m2, m2
- pmovzxbw m3, m3
- movh m4, [r2 + 2 * r4]
- movh m5, [r3 + 2 * r5]
- pmovzxbw m4, m4
- pmovzxbw m5, m5
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- movh m6, [r2 + r4]
- movh m7, [r3 + r5]
- pmovzxbw m6, m6
- pmovzxbw m7, m7
-%endif
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
- psubw m6, m7
-
- movu [r0], m0
- movu [r0 + r1], m2
- movu [r0 + 2 * r1], m4
- lea r0, [r0 + 2 * r1]
- movu [r0 + r1], m6
-
- dec r6d
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
- jnz .loop
- RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-PIXELSUB_PS_W8_H4 8, 4
PIXELSUB_PS_W8_H4 8, 8
PIXELSUB_PS_W8_H4 8, 16
-PIXELSUB_PS_W8_H4 8, 32
-;
-PIXELSUB_PS_W8_H4 8, 12
-PIXELSUB_PS_W8_H4 8, 64
-%else
-INIT_XMM sse4
-PIXELSUB_PS_W8_H4 8, 4
-PIXELSUB_PS_W8_H4 8, 8
-PIXELSUB_PS_W8_H4 8, 16
-PIXELSUB_PS_W8_H4 8, 32
-;
-PIXELSUB_PS_W8_H4 8, 12
-PIXELSUB_PS_W8_H4 8, 64
%endif
+
;-----------------------------------------------------------------------------
-; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
-;-----------------------------------------------------------------------------
-%macro PIXELSUB_PS_W12_H4 2
-cglobal pixel_sub_ps_%1x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
- add r1, r1
- mov r6d, %2/4
-%if HIGH_BIT_DEPTH
- add r4, r4
- add r5, r5
-.loop:
- movu m0, [r2]
- movu m1, [r3]
- movh m2, [r2 + 16]
- movh m3, [r3 + 16]
- movu m4, [r2 + r4]
- movu m5, [r3 + r5]
- movh m6, [r2 + r4 + 16]
- movh m7, [r3 + r5 + 16]
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
- psubw m6, m7
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
-
- movu [r0], m0
- movh [r0 + 16], m2
- movu [r0 + r1], m4
- movh [r0 + r1 + 16], m6
-
- movu m0, [r2]
- movu m1, [r3]
- movh m2, [r2 + 16]
- movh m3, [r3 + 16]
- movu m4, [r2 + r4]
- movu m5, [r3 + r5]
- movh m6, [r2 + r4 + 16]
- movh m7, [r3 + r5 + 16]
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
- psubw m6, m7
- lea r0, [r0 + 2 * r1]
-
- movu [r0], m0
- movh [r0 + 16], m2
- movu [r0 + r1], m4
- movh [r0 + r1 + 16], m6
-%else
-.loop:
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + r4]
- movu m3, [r3 + r5]
- mova m4, m0
- mova m5, m1
- punpckhdq m4, m2
- punpckhdq m5, m3
- pmovzxbw m0, m0
- pmovzxbw m1, m1
- pmovzxbw m2, m2
- pmovzxbw m3, m3
- pmovzxbw m4, m4
- pmovzxbw m5, m5
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
-
- movu [r0], m0
- movlps [r0 + 16], m4
- movu [r0 + r1], m2
- movhps [r0 + r1 + 16], m4
- movu m0, [r2 + 2 * r4]
- movu m1, [r3 + 2 * r5]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
-
- movu m2, [r2 + r4]
- movu m3, [r3 + r5]
- mova m4, m0
- mova m5, m1
- punpckhdq m4, m2
- punpckhdq m5, m3
- pmovzxbw m0, m0
- pmovzxbw m1, m1
- pmovzxbw m2, m2
- pmovzxbw m3, m3
- pmovzxbw m4, m4
- pmovzxbw m5, m5
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
-
- movu [r0 + 2 * r1], m0
- movlps [r0 + 2 * r1 + 16], m4
- lea r0, [r0 + 2 * r1]
- movu [r0 + r1], m2
- movhps [r0 + r1 + 16], m4
-%endif
- dec r6d
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
- jnz .loop
- RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-PIXELSUB_PS_W12_H4 12, 16
-;
-PIXELSUB_PS_W12_H4 12, 32
-%else
-INIT_XMM sse4
-PIXELSUB_PS_W12_H4 12, 16
-;
-PIXELSUB_PS_W12_H4 12, 32
-%endif
-
-;-----------------------------------------------------------------------------
-; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
+; void pixel_sub_ps_16x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%macro PIXELSUB_PS_W16_H4 2
%if HIGH_BIT_DEPTH
-cglobal pixel_sub_ps_%1x%2, 6, 7, 6, dest, deststride, src0, src1, srcstride0, srcstride1
- add r1, r1
- mov r6d, %2/4
- add r4, r4
- add r5, r5
+cglobal pixel_sub_ps_16x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
+ mov r6d, %2/4
+ add r4, r4
+ add r5, r5
+ add r1, r1
.loop:
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + 16]
- movu m3, [r3 + 16]
- psubw m0, m1
- psubw m2, m3
- movu m4, [r2 + r4]
- movu m5, [r3 + r5]
- movu m1, [r2 + r4 + 16]
- movu m3, [r3 + r5 + 16]
- psubw m4, m5
- psubw m1, m3
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
-
- movu [r0], m0
- movu [r0 + 16], m2
- movu [r0 + r1], m4
- movu [r0 + r1 + 16], m1
-
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + 16]
- movu m3, [r3 + 16]
- psubw m0, m1
- psubw m2, m3
- movu m4, [r2 + r4]
- movu m5, [r3 + r5]
- movu m1, [r2 + r4 + 16]
- movu m3, [r3 + r5 + 16]
- psubw m4, m5
- psubw m1, m3
- lea r0, [r0 + 2 * r1]
-
- movu [r0], m0
- movu [r0 + 16], m2
- movu [r0 + r1], m4
- movu [r0 + r1 + 16], m1
+ movu m0, [r2]
+ movu m2, [r2 + 16]
+ movu m1, [r3]
+ movu m3, [r3 + 16]
+ movu m4, [r2 + r4]
+ movu m6, [r2 + r4 + 16]
+ movu m5, [r3 + r5]
+ movu m7, [r3 + r5 + 16]
+ dec r6d
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+ psubw m6, m7
+
+ movu [r0], m0
+ movu [r0 + 16], m2
+ movu [r0 + r1], m4
+ movu [r0 + r1 + 16], m6
+
+ movu m0, [r2]
+ movu m2, [r2 + 16]
+ movu m1, [r3]
+ movu m3, [r3 + 16]
+ movu m4, [r2 + r4]
+ movu m5, [r3 + r5]
+ movu m6, [r2 + r4 + 16]
+ movu m7, [r3 + r5 + 16]
+ lea r0, [r0 + r1 * 2]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+ psubw m6, m7
+
+ movu [r0], m0
+ movu [r0 + 16], m2
+ movu [r0 + r1], m4
+ movu [r0 + r1 + 16], m6
+ lea r0, [r0 + r1 * 2]
+
+ jnz .loop
+ RET
%else
-cglobal pixel_sub_ps_%1x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
- add r1, r1
- mov r6d, %2/4
- pxor m6, m6
+cglobal pixel_sub_ps_16x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
+ mov r6d, %2/4
+ pxor m6, m6
+ add r1, r1
.loop:
- movu m1, [r2]
- pmovzxbw m0, m1
- punpckhbw m1, m6
- movu m3, [r3]
- pmovzxbw m2, m3
- punpckhbw m3, m6
- psubw m0, m2
- psubw m1, m3
-
- movu m5, [r2 + r4]
- pmovzxbw m4, m5
- punpckhbw m5, m6
- movu m3, [r3 + r5]
- pmovzxbw m2, m3
- punpckhbw m3, m6
- psubw m4, m2
- psubw m5, m3
-
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + r1], m4
- movu [r0 + r1 + 16], m5
-
- movu m1, [r2 + 2 * r4]
- pmovzxbw m0, m1
- punpckhbw m1, m6
- movu m3, [r3 + 2 * r5]
- pmovzxbw m2, m3
- punpckhbw m3, m6
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- psubw m0, m2
- psubw m1, m3
- movu m5, [r2 + r4]
- pmovzxbw m4, m5
- punpckhbw m5, m6
- movu m3, [r3 + r5]
- pmovzxbw m2, m3
- punpckhbw m3, m6
- psubw m4, m2
- psubw m5, m3
-
- movu [r0 + 2 * r1], m0
- movu [r0 + 2 * r1 + 16], m1
- lea r0, [r0 + 2 * r1]
- movu [r0 + r1], m4
- movu [r0 + r1 + 16], m5
+ movu m1, [r2]
+ movu m3, [r3]
+ pmovzxbw m0, m1
+ pmovzxbw m2, m3
+ punpckhbw m1, m6
+ punpckhbw m3, m6
+
+ psubw m0, m2
+ psubw m1, m3
+
+ movu m5, [r2 + r4]
+ movu m3, [r3 + r5]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+ pmovzxbw m4, m5
+ pmovzxbw m2, m3
+ punpckhbw m5, m6
+ punpckhbw m3, m6
+
+ psubw m4, m2
+ psubw m5, m3
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + r1], m4
+ movu [r0 + r1 + 16], m5
+
+ movu m1, [r2]
+ movu m3, [r3]
+ pmovzxbw m0, m1
+ pmovzxbw m2, m3
+ punpckhbw m1, m6
+ punpckhbw m3, m6
+
+ psubw m0, m2
+ psubw m1, m3
+
+ movu m5, [r2 + r4]
+ movu m3, [r3 + r5]
+ dec r6d
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+ lea r0, [r0 + r1 * 2]
+ pmovzxbw m4, m5
+ pmovzxbw m2, m3
+ punpckhbw m5, m6
+ punpckhbw m3, m6
+
+ psubw m4, m2
+ psubw m5, m3
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + r1], m4
+ movu [r0 + r1 + 16], m5
+ lea r0, [r0 + r1 * 2]
+
+ jnz .loop
+ RET
%endif
- dec r6d
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
- jnz .loop
-
-RET
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM sse2
-PIXELSUB_PS_W16_H4 16, 4
-PIXELSUB_PS_W16_H4 16, 8
-PIXELSUB_PS_W16_H4 16, 12
PIXELSUB_PS_W16_H4 16, 16
PIXELSUB_PS_W16_H4 16, 32
-PIXELSUB_PS_W16_H4 16, 64
-;
-PIXELSUB_PS_W16_H4 16, 24
%else
INIT_XMM sse4
-PIXELSUB_PS_W16_H4 16, 4
-PIXELSUB_PS_W16_H4 16, 8
-PIXELSUB_PS_W16_H4 16, 12
PIXELSUB_PS_W16_H4 16, 16
PIXELSUB_PS_W16_H4 16, 32
-PIXELSUB_PS_W16_H4 16, 64
-;
-PIXELSUB_PS_W16_H4 16, 24
%endif
+
;-----------------------------------------------------------------------------
-; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
+; void pixel_sub_ps_32x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
-%macro PIXELSUB_PS_W24_H2 2
+%macro PIXELSUB_PS_W32_H2 2
%if HIGH_BIT_DEPTH
-cglobal pixel_sub_ps_%1x%2, 6, 7, 6, dest, deststride, src0, src1, srcstride0, srcstride1
- add r1, r1
+cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
mov r6d, %2/2
add r4, r4
add r5, r5
+ add r1, r1
.loop:
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + 16]
- movu m3, [r3 + 16]
- movu m4, [r2 + 32]
- movu m5, [r3 + 32]
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
-
- movu [r0], m0
- movu [r0 + 16], m2
- movu [r0 + 32], m4
-
- movu m0, [r2 + r4]
- movu m1, [r3 + r5]
- movu m2, [r2 + r4 + 16]
- movu m3, [r3 + r5 + 16]
- movu m4, [r2 + r4 + 32]
- movu m5, [r3 + r5 + 32]
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
-
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m2
- movu [r0 + r1 + 32], m4
+ movu m0, [r2]
+ movu m2, [r2 + 16]
+ movu m4, [r2 + 32]
+ movu m6, [r2 + 48]
+ movu m1, [r3]
+ movu m3, [r3 + 16]
+ movu m5, [r3 + 32]
+ movu m7, [r3 + 48]
+ dec r6d
+
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+ psubw m6, m7
+
+ movu [r0], m0
+ movu [r0 + 16], m2
+ movu [r0 + 32], m4
+ movu [r0 + 48], m6
+
+ movu m0, [r2 + r4]
+ movu m2, [r2 + r4 + 16]
+ movu m4, [r2 + r4 + 32]
+ movu m6, [r2 + r4 + 48]
+ movu m1, [r3 + r5]
+ movu m3, [r3 + r5 + 16]
+ movu m5, [r3 + r5 + 32]
+ movu m7, [r3 + r5 + 48]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+ psubw m6, m7
+
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 16], m2
+ movu [r0 + r1 + 32], m4
+ movu [r0 + r1 + 48], m6
+ lea r0, [r0 + r1 * 2]
+
+ jnz .loop
+ RET
%else
-cglobal pixel_sub_ps_%1x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
- add r1, r1
- mov r6d, %2/2
- pxor m6, m6
+cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
+ mov r6d, %2/2
+ add r1, r1
.loop:
- movu m1, [r2]
- pmovzxbw m0, m1
- punpckhbw m1, m6
- movh m2, [r2 + 16]
- pmovzxbw m2, m2
- movu m4, [r3]
- pmovzxbw m3, m4
- punpckhbw m4, m6
- movh m5, [r3 + 16]
- pmovzxbw m5, m5
- psubw m0, m3
- psubw m1, m4
- psubw m2, m5
-
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + 32], m2
-
- movu m1, [r2 + r4]
- pmovzxbw m0, m1
- punpckhbw m1, m6
- movh m2, [r2 + r4 + 16]
- pmovzxbw m2, m2
- movu m4, [r3 + r5]
- pmovzxbw m3, m4
- punpckhbw m4, m6
- movh m5, [r3 + r5 + 16]
- pmovzxbw m5, m5
- psubw m0, m3
- psubw m1, m4
- psubw m2, m5
-
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m1
- movu [r0 + r1 + 32], m2
+ movh m0, [r2]
+ movh m1, [r2 + 8]
+ movh m2, [r2 + 16]
+ movh m6, [r2 + 24]
+ movh m3, [r3]
+ movh m4, [r3 + 8]
+ movh m5, [r3 + 16]
+ movh m7, [r3 + 24]
+ dec r6d
+ pmovzxbw m0, m0
+ pmovzxbw m1, m1
+ pmovzxbw m2, m2
+ pmovzxbw m6, m6
+ pmovzxbw m3, m3
+ pmovzxbw m4, m4
+ pmovzxbw m5, m5
+ pmovzxbw m7, m7
+
+ psubw m0, m3
+ psubw m1, m4
+ psubw m2, m5
+ psubw m6, m7
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + 32], m2
+ movu [r0 + 48], m6
+
+ movh m0, [r2 + r4]
+ movh m1, [r2 + r4 + 8]
+ movh m2, [r2 + r4 + 16]
+ movh m6, [r2 + r4 + 24]
+ movh m3, [r3 + r5]
+ movh m4, [r3 + r5 + 8]
+ movh m5, [r3 + r5 + 16]
+ movh m7, [r3 + r5 + 24]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+ pmovzxbw m0, m0
+ pmovzxbw m1, m1
+ pmovzxbw m2, m2
+ pmovzxbw m6, m6
+ pmovzxbw m3, m3
+ pmovzxbw m4, m4
+ pmovzxbw m5, m5
+ pmovzxbw m7, m7
+
+ psubw m0, m3
+ psubw m1, m4
+ psubw m2, m5
+ psubw m6, m7
+
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 16], m1
+ movu [r0 + r1 + 32], m2
+ movu [r0 + r1 + 48], m6
+ lea r0, [r0 + r1 * 2]
+
+ jnz .loop
+ RET
%endif
-
- dec r6d
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
- jnz .loop
-
-RET
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM sse2
-PIXELSUB_PS_W24_H2 24, 32
-;
-PIXELSUB_PS_W24_H2 24, 64
+PIXELSUB_PS_W32_H2 32, 32
+PIXELSUB_PS_W32_H2 32, 64
%else
INIT_XMM sse4
-PIXELSUB_PS_W24_H2 24, 32
-;
-PIXELSUB_PS_W24_H2 24, 64
-%endif
-
-;-----------------------------------------------------------------------------
-; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
-;-----------------------------------------------------------------------------
-%macro PIXELSUB_PS_W32_H2 2
-cglobal pixel_sub_ps_%1x%2, 6, 7, 6, dest, deststride, src0, src1, srcstride0, srcstride1
- add r1, r1
- mov r6d, %2/2
-%if HIGH_BIT_DEPTH
- add r4, r4
- add r5, r5
-.loop:
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + 16]
- movu m3, [r3 + 16]
- movu m4, [r2 + 32]
- movu m5, [r3 + 32]
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
- movu m3, [r2 + 48]
- movu m5, [r3 + 48]
- psubw m3, m5
-
- movu [r0], m0
- movu [r0 + 16], m2
- movu [r0 + 32], m4
- movu [r0 + 48], m3
-
- movu m0, [r2 + r4]
- movu m1, [r3 + r5]
- movu m2, [r2 + r4 + 16]
- movu m3, [r3 + r5 + 16]
- movu m4, [r2 + r4 + 32]
- movu m5, [r3 + r5 + 32]
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
- movu m3, [r2 + r4 + 48]
- movu m5, [r3 + r5 + 48]
- psubw m3, m5
-
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m2
- movu [r0 + r1 + 32], m4
- movu [r0 + r1 + 48], m3
-%else
-.loop:
- movh m0, [r2]
- movh m1, [r2 + 8]
- movh m2, [r2 + 16]
- movh m3, [r3]
- movh m4, [r3 + 8]
- movh m5, [r3 + 16]
- pmovzxbw m0, m0
- pmovzxbw m1, m1
- pmovzxbw m2, m2
- pmovzxbw m3, m3
- pmovzxbw m4, m4
- pmovzxbw m5, m5
- psubw m0, m3
- psubw m1, m4
- psubw m2, m5
- movh m3, [r2 + 24]
- movh m4, [r3 + 24]
- pmovzxbw m4, m4
- pmovzxbw m3, m3
- psubw m3, m4
-
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + 32], m2
- movu [r0 + 48], m3
-
- movh m0, [r2 + r4]
- movh m1, [r2 + r4 + 8]
- movh m2, [r2 + r4 + 16]
- movh m3, [r3 + r5]
- movh m4, [r3 + r5 + 8]
- movh m5, [r3 + r5 + 16]
- pmovzxbw m0, m0
- pmovzxbw m1, m1
- pmovzxbw m2, m2
- pmovzxbw m3, m3
- pmovzxbw m4, m4
- pmovzxbw m5, m5
- psubw m0, m3
- psubw m1, m4
- psubw m2, m5
- movh m3, [r2 + r4 + 24]
- movh m4, [r3 + r5 + 24]
- pmovzxbw m3, m3
- pmovzxbw m4, m4
- psubw m3, m4
-
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m1
- movu [r0 + r1 + 32], m2
- movu [r0 + r1 + 48], m3
-%endif
- dec r6d
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
- jnz .loop
- RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-PIXELSUB_PS_W32_H2 32, 8
-PIXELSUB_PS_W32_H2 32, 16
-PIXELSUB_PS_W32_H2 32, 24
PIXELSUB_PS_W32_H2 32, 32
PIXELSUB_PS_W32_H2 32, 64
-;
-PIXELSUB_PS_W32_H2 32, 48
-%else
-INIT_XMM sse4
-PIXELSUB_PS_W32_H2 32, 8
-PIXELSUB_PS_W32_H2 32, 16
-PIXELSUB_PS_W32_H2 32, 24
-PIXELSUB_PS_W32_H2 32, 32
-PIXELSUB_PS_W32_H2 32, 64
-;
-PIXELSUB_PS_W32_H2 32, 48
%endif
+
;-----------------------------------------------------------------------------
-; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
+; void pixel_sub_ps_64x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
-%macro PIXELSUB_PS_W48_H2 2
+%macro PIXELSUB_PS_W64_H2 2
%if HIGH_BIT_DEPTH
-cglobal pixel_sub_ps_%1x%2, 6, 7, 6, dest, deststride, src0, src1, srcstride0, srcstride1
- add r1, r1
+cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
mov r6d, %2/2
add r4, r4
add r5, r5
+ add r1, r1
.loop:
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + 16]
- movu m3, [r3 + 16]
- movu m4, [r2 + 32]
- movu m5, [r3 + 32]
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
-
- movu [r0], m0
- movu [r0 + 16], m2
- movu [r0 + 32], m4
-
- movu m0, [r2 + 48]
- movu m1, [r3 + 48]
- movu m2, [r2 + 64]
- movu m3, [r3 + 64]
- movu m4, [r2 + 80]
- movu m5, [r3 + 80]
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
-
- movu [r0 + 48], m0
- movu [r0 + 64], m2
- movu [r0 + 80], m4
-
- movu m0, [r2 + r4]
- movu m1, [r3 + r5]
- movu m2, [r2 + r4 + 16]
- movu m3, [r3 + r5 + 16]
- movu m4, [r2 + r4 + 32]
- movu m5, [r3 + r5 + 32]
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
-
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m2
- movu [r0 + r1 + 32], m4
-
- movu m0, [r2 + r4 + 48]
- movu m1, [r3 + r5 + 48]
- movu m2, [r2 + r4 + 64]
- movu m3, [r3 + r5 + 64]
- movu m4, [r2 + r4 + 80]
- movu m5, [r3 + r5 + 80]
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
-
- movu [r0 + r1 + 48], m0
- movu [r0 + r1 + 64], m2
- movu [r0 + r1 + 80], m4
-%else
-
-cglobal pixel_sub_ps_%1x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
- add r1, r1
- mov r6d, %2/2
- pxor m6, m6
-.loop:
- movu m1, [r2]
- pmovzxbw m0, m1
- punpckhbw m1, m6
- movu m3, [r3]
- pmovzxbw m2, m3
- punpckhbw m3, m6
- movu m5, [r2 + 16]
- pmovzxbw m4, m5
- punpckhbw m5, m6
- psubw m0, m2
- psubw m1, m3
-
- movu [r0], m0
- movu [r0 + 16], m1
-
- movu m3, [r3 + 16]
- pmovzxbw m2, m3
- punpckhbw m3, m6
- psubw m4, m2
- psubw m5, m3
-
- movu [r0 + 32], m4
- movu [r0 + 48], m5
-
- movu m1, [r2 + 32]
- pmovzxbw m0, m1
- punpckhbw m1, m6
- movu m3, [r3 + 32]
- pmovzxbw m2, m3
- punpckhbw m3, m6
- psubw m0, m2
- psubw m1, m3
-
- movu [r0 + 64], m0
- movu [r0 + 80], m1
-
- movu m1, [r2 + r4]
- pmovzxbw m0, m1
- punpckhbw m1, m6
- movu m3, [r3 + r5]
- pmovzxbw m2, m3
- punpckhbw m3, m6
- movu m5, [r2 + r5 + 16]
- pmovzxbw m4, m5
- punpckhbw m5, m6
- psubw m0, m2
- psubw m1, m3
-
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m1
-
- movu m3, [r3 + r4 + 16]
- pmovzxbw m2, m3
- punpckhbw m3, m6
- psubw m4, m2
- psubw m5, m3
-
- movu [r0 + r1 + 32], m4
- movu [r0 + r1 + 48], m5
-
- movu m1, [r2 + r4 + 32]
- pmovzxbw m0, m1
- punpckhbw m1, m6
- movu m3, [r3 + r5 + 32]
- pmovzxbw m2, m3
- punpckhbw m3, m6
- psubw m0, m2
- psubw m1, m3
-
- movu [r0 + r1 + 64], m0
- movu [r0 + r1 + 80], m1
-%endif
- dec r6d
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
- jnz .loop
-
-RET
-%endmacro
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-PIXELSUB_PS_W48_H2 48, 64
-%else
-INIT_XMM sse4
-PIXELSUB_PS_W48_H2 48, 64
-%endif
-
-;-----------------------------------------------------------------------------
-; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
-;-----------------------------------------------------------------------------
-%macro PIXELSUB_PS_W64_H2 2
-%if HIGH_BIT_DEPTH
-cglobal pixel_sub_ps_%1x%2, 6, 7, 6, dest, deststride, src0, src1, srcstride0, srcstride1
- add r1, r1
- mov r6d, %2/2
- add r4, r4
- add r5, r5
-.loop:
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + 16]
- movu m3, [r3 + 16]
- movu m4, [r2 + 32]
- movu m5, [r3 + 32]
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
- movu m3, [r2 + 48]
- movu m5, [r3 + 48]
- psubw m3, m5
-
- movu [r0], m0
- movu [r0 + 16], m2
- movu [r0 + 32], m4
- movu [r0 + 48], m3
-
- movu m0, [r2 + 64]
- movu m1, [r3 + 64]
- movu m2, [r2 + 80]
- movu m3, [r3 + 80]
- movu m4, [r2 + 96]
- movu m5, [r3 + 96]
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
- movu m3, [r2 + 112]
- movu m5, [r3 + 112]
- psubw m3, m5
-
- movu [r0 + 64], m0
- movu [r0 + 80], m2
- movu [r0 + 96], m4
- movu [r0 + 112], m3
-
- movu m0, [r2 + r4]
- movu m1, [r3 + r5]
- movu m2, [r2 + r4 + 16]
- movu m3, [r3 + r5 + 16]
- movu m4, [r2 + r4 + 32]
- movu m5, [r3 + r5 + 32]
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
- movu m3, [r2 + r4 + 48]
- movu m5, [r3 + r5 + 48]
- psubw m3, m5
-
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m2
- movu [r0 + r1 + 32], m4
- movu [r0 + r1 + 48], m3
-
- movu m0, [r2 + r4 + 64]
- movu m1, [r3 + r5 + 64]
- movu m2, [r2 + r4 + 80]
- movu m3, [r3 + r5 + 80]
- movu m4, [r2 + r4 + 96]
- movu m5, [r3 + r5 + 96]
- psubw m0, m1
- psubw m2, m3
- psubw m4, m5
- movu m3, [r2 + r4 + 112]
- movu m5, [r3 + r5 + 112]
- psubw m3, m5
+ movu m0, [r2]
+ movu m2, [r2 + 16]
+ movu m4, [r2 + 32]
+ movu m6, [r2 + 48]
+ movu m1, [r3]
+ movu m3, [r3 + 16]
+ movu m5, [r3 + 32]
+ movu m7, [r3 + 48]
+
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+ psubw m6, m7
+
+ movu [r0], m0
+ movu [r0 + 16], m2
+ movu [r0 + 32], m4
+ movu [r0 + 48], m6
+
+ movu m0, [r2 + 64]
+ movu m2, [r2 + 80]
+ movu m4, [r2 + 96]
+ movu m6, [r2 + 112]
+ movu m1, [r3 + 64]
+ movu m3, [r3 + 80]
+ movu m5, [r3 + 96]
+ movu m7, [r3 + 112]
+
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+ psubw m6, m7
+
+ movu [r0 + 64], m0
+ movu [r0 + 80], m2
+ movu [r0 + 96], m4
+ movu [r0 + 112], m6
+
+ movu m0, [r2 + r4]
+ movu m2, [r2 + r4 + 16]
+ movu m4, [r2 + r4 + 32]
+ movu m6, [r2 + r4 + 48]
+ movu m1, [r3 + r5]
+ movu m3, [r3 + r5 + 16]
+ movu m5, [r3 + r5 + 32]
+ movu m7, [r3 + r5 + 48]
+
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+ psubw m6, m7
+
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 16], m2
+ movu [r0 + r1 + 32], m4
+ movu [r0 + r1 + 48], m6
+
+ movu m0, [r2 + r4 + 64]
+ movu m2, [r2 + r4 + 80]
+ movu m4, [r2 + r4 + 96]
+ movu m6, [r2 + r4 + 112]
+ movu m1, [r3 + r5 + 64]
+ movu m3, [r3 + r5 + 80]
+ movu m5, [r3 + r5 + 96]
+ movu m7, [r3 + r5 + 112]
+ dec r6d
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+ psubw m6, m7
movu [r0 + r1 + 64], m0
movu [r0 + r1 + 80], m2
movu [r0 + r1 + 96], m4
- movu [r0 + r1 + 112], m3
-
+ movu [r0 + r1 + 112], m6
+ lea r0, [r0 + r1 * 2]
+
+ jnz .loop
+ RET
%else
-
-cglobal pixel_sub_ps_%1x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
- add r1, r1
- mov r6d, %2/2
- pxor m6, m6
+cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
+ mov r6d, %2/2
+ pxor m6, m6
+ add r1, r1
.loop:
- movu m1, [r2]
- pmovzxbw m0, m1
- punpckhbw m1, m6
- movu m3, [r3]
- pmovzxbw m2, m3
- punpckhbw m3, m6
- movu m5, [r2 + 16]
- pmovzxbw m4, m5
- punpckhbw m5, m6
- psubw m0, m2
- psubw m1, m3
-
- movu [r0], m0
- movu [r0 + 16], m1
-
- movu m1, [r3 + 16]
- pmovzxbw m0, m1
- punpckhbw m1, m6
- movu m3, [r2 + 32]
- pmovzxbw m2, m3
- punpckhbw m3, m6
- psubw m4, m0
- psubw m5, m1
-
- movu [r0 + 32], m4
- movu [r0 + 48], m5
-
- movu m5, [r3 + 32]
- pmovzxbw m4, m5
- punpckhbw m5, m6
- movu m1, [r2 + 48]
- pmovzxbw m0, m1
- punpckhbw m1, m6
- psubw m2, m4
- psubw m3, m5
-
- movu [r0 + 64], m2
- movu [r0 + 80], m3
-
- movu m3, [r3 + 48]
- pmovzxbw m2, m3
- punpckhbw m3, m6
- movu m5, [r2 + r4]
- pmovzxbw m4, m5
- punpckhbw m5, m6
- psubw m0, m2
- psubw m1, m3
-
- movu [r0 + 96], m0
- movu [r0 + 112], m1
-
- movu m1, [r3 + r5]
- pmovzxbw m0, m1
- punpckhbw m1, m6
- movu m3, [r2 + r4 + 16]
- pmovzxbw m2, m3
- punpckhbw m3, m6
- psubw m4, m0
- psubw m5, m1
-
- movu [r0 + r1], m4
- movu [r0 + r1 + 16], m5
-
- movu m5, [r3 + r5 + 16]
- pmovzxbw m4, m5
- punpckhbw m5, m6
- movu m1, [r2 + r4 + 32]
- pmovzxbw m0, m1
- punpckhbw m1, m6
- psubw m2, m4
- psubw m3, m5
-
- movu [r0 + r1 + 32], m2
- movu [r0 + r1 + 48], m3
-
- movu m3, [r3 + r5 + 32]
- pmovzxbw m2, m3
- punpckhbw m3, m6
- movu m5, [r2 + r4 + 48]
- pmovzxbw m4, m5
- punpckhbw m5, m6
- psubw m0, m2
- psubw m1, m3
-
- movu [r0 + r1 + 64], m0
- movu [r0 + r1 + 80], m1
-
- movu m1, [r3 + r5 + 48]
- pmovzxbw m0, m1
- punpckhbw m1, m6
- psubw m4, m0
- psubw m5, m1
-
- movu [r0 + r1 + 96], m4
- movu [r0 + r1 + 112], m5
+ movu m1, [r2]
+ movu m5, [r2 + 16]
+ movu m3, [r3]
+ movu m7, [r3 + 16]
+
+ pmovzxbw m0, m1
+ pmovzxbw m4, m5
+ pmovzxbw m2, m3
+ punpckhbw m1, m6
+ punpckhbw m3, m6
+ punpckhbw m5, m6
+
+ psubw m0, m2
+ psubw m1, m3
+ pmovzxbw m2, m7
+ punpckhbw m7, m6
+ psubw m4, m2
+ psubw m5, m7
+
+ movu m3, [r2 + 32]
+ movu m7, [r3 + 32]
+ pmovzxbw m2, m3
+ punpckhbw m3, m6
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + 32], m4
+ movu [r0 + 48], m5
+
+ movu m1, [r2 + 48]
+ movu m5, [r3 + 48]
+ pmovzxbw m0, m1
+ pmovzxbw m4, m7
+ punpckhbw m1, m6
+ punpckhbw m7, m6
+
+ psubw m2, m4
+ psubw m3, m7
+
+ movu [r0 + 64], m2
+ movu [r0 + 80], m3
+
+ movu m7, [r2 + r4]
+ movu m3, [r3 + r5]
+ pmovzxbw m2, m5
+ pmovzxbw m4, m7
+ punpckhbw m5, m6
+ punpckhbw m7, m6
+
+ psubw m0, m2
+ psubw m1, m5
+
+ movu [r0 + 96], m0
+ movu [r0 + 112], m1
+
+ movu m2, [r2 + r4 + 16]
+ movu m5, [r3 + r5 + 16]
+ pmovzxbw m0, m3
+ pmovzxbw m1, m2
+ punpckhbw m3, m6
+ punpckhbw m2, m6
+
+ psubw m4, m0
+ psubw m7, m3
+
+ movu [r0 + r1], m4
+ movu [r0 + r1 + 16], m7
+
+ movu m0, [r2 + r4 + 32]
+ movu m3, [r3 + r5 + 32]
+ dec r6d
+ pmovzxbw m4, m5
+ pmovzxbw m7, m0
+ punpckhbw m5, m6
+ punpckhbw m0, m6
+
+ psubw m1, m4
+ psubw m2, m5
+
+ movu [r0 + r1 + 32], m1
+ movu [r0 + r1 + 48], m2
+
+ movu m4, [r2 + r4 + 48]
+ movu m5, [r3 + r5 + 48]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+ pmovzxbw m1, m3
+ pmovzxbw m2, m4
+ punpckhbw m3, m6
+ punpckhbw m4, m6
+
+ psubw m7, m1
+ psubw m0, m3
+
+ movu [r0 + r1 + 64], m7
+ movu [r0 + r1 + 80], m0
+
+ pmovzxbw m7, m5
+ punpckhbw m5, m6
+ psubw m2, m7
+ psubw m4, m5
+
+ movu [r0 + r1 + 96], m2
+ movu [r0 + r1 + 112], m4
+ lea r0, [r0 + r1 * 2]
+
+ jnz .loop
+ RET
%endif
- dec r6d
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
- jnz .loop
- RET
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM sse2
-PIXELSUB_PS_W64_H2 64, 16
-PIXELSUB_PS_W64_H2 64, 32
-PIXELSUB_PS_W64_H2 64, 48
PIXELSUB_PS_W64_H2 64, 64
%else
INIT_XMM sse4
-PIXELSUB_PS_W64_H2 64, 16
-PIXELSUB_PS_W64_H2 64, 32
-PIXELSUB_PS_W64_H2 64, 48
PIXELSUB_PS_W64_H2 64, 64
%endif
+
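For reference, every sub_ps kernel in this file computes the same per-pixel difference; only the block geometry and the widening strategy differ. A scalar sketch (8-bit build shown, where pixel is uint8_t; under HIGH_BIT_DEPTH the sources are already uint16_t and no widening is needed):

    #include <cstdint>

    // Scalar reference for pixel_sub_ps_WxH: residual = src0 - src1,
    // widened to int16_t. All strides are in units of the element type.
    static void pixel_sub_ps_ref(int16_t* dst, intptr_t dstStride,
                                 const uint8_t* src0, const uint8_t* src1,
                                 intptr_t srcStride0, intptr_t srcStride1,
                                 int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (int16_t)(src0[x] - src1[x]);
            dst += dstStride;
            src0 += srcStride0;
            src1 += srcStride1;
        }
    }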
;=============================================================================
; variance
;=============================================================================
diff -r 619633a933f6 -r 5cf494d8591b source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm Wed Aug 06 17:03:38 2014 -0500
+++ b/source/common/x86/pixeladd8.asm Thu Aug 07 19:23:38 2014 +0900
@@ -31,1641 +31,710 @@
cextern pw_pixel_max
;-----------------------------------------------------------------------------
-; void pixel_add_ps_2x4(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
+; void pixel_add_ps_4x4(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
INIT_XMM sse2
-cglobal pixel_add_ps_2x4, 6, 6, 6, dest, destride, src0, scr1, srcStride0, srcStride1
- add r1, r1
- add r4, r4
- add r5, r5
- pxor m4, m4
- mova m5, [pw_pixel_max]
+cglobal pixel_add_ps_4x4, 6, 6, 6, dest, destride, src0, scr1, srcStride0, srcStride1
+ mova m1, [pw_pixel_max]
+ pxor m0, m0
+ add r4, r4
+ add r5, r5
+ add r1, r1
+ movh m2, [r2]
+ movhps m2, [r2 + r4]
+ movh m3, [r3]
+ movhps m3, [r3 + r5]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+ movh m4, [r2]
+ movhps m4, [r2 + r4]
+ movh m5, [r3]
+ movhps m5, [r3 + r5]
- movd m0, [r2]
- movd m1, [r3]
- movd m2, [r2 + r4]
- movd m3, [r3 + r5]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
+ paddw m2, m3
+ paddw m4, m5
+ CLIPW2 m2, m4, m0, m1
- movd [r0], m0
- movd [r0 + r1], m2
+ movh [r0], m2
+ movhps [r0 + r1], m2
+ lea r0, [r0 + r1 * 2]
+ movh [r0], m4
+ movhps [r0 + r1], m4
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
-
- movd m0, [r2]
- movd m1, [r3]
- movd m2, [r2 + r4]
- movd m3, [r3 + r5]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movd [r0], m0
- movd [r0 + r1], m2
+ RET
%else
INIT_XMM sse4
-cglobal pixel_add_ps_2x4, 6, 6, 2, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_4x4, 6, 6, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+ add r5, r5
+ pmovzxbw m0, [r2]
+ pmovzxbw m2, [r2 + r4]
+ movh m1, [r3]
+ movh m3, [r3 + r5]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+ pmovzxbw m4, [r2]
+ pmovzxbw m6, [r2 + r4]
+ movh m5, [r3]
+ movh m7, [r3 + r5]
-add r5, r5
+ paddw m0, m1
+ paddw m2, m3
+ paddw m4, m5
+ paddw m6, m7
+ packuswb m0, m0
+ packuswb m2, m2
+ packuswb m4, m4
+ packuswb m6, m6
-pmovzxbw m0, [r2]
-movd m1, [r3]
+ movd [r0], m0
+ movd [r0 + r1], m2
+ lea r0, [r0 + r1 * 2]
+ movd [r0], m4
+ movd [r0 + r1], m6
-paddw m0, m1
-packuswb m0, m0
-
-pextrw [r0], m0, 0
-
-pmovzxbw m0, [r2 + r4]
-movd m1, [r3 + r5]
-
-paddw m0, m1
-packuswb m0, m0
-
-pextrw [r0 + r1], m0, 0
-
-pmovzxbw m0, [r2 + 2 * r4]
-movd m1, [r3 + 2 * r5]
-
-paddw m0, m1
-packuswb m0, m0
-
-pextrw [r0 + 2 * r1], m0, 0
-
-lea r0, [r0 + 2 * r1]
-lea r2, [r2 + 2 * r4]
-lea r3, [r3 + 2 * r5]
-
-pmovzxbw m0, [r2 + r4]
-movd m1, [r3 + r5]
-
-paddw m0, m1
-packuswb m0, m0
-
-pextrw [r0 + r1], m0, 0
+ RET
%endif
-RET
;-----------------------------------------------------------------------------
-; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
-;-----------------------------------------------------------------------------
-%macro PIXEL_ADD_PS_W2_H4 2
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-cglobal pixel_add_ps_%1x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
- add r1, r1
- add r4, r4
- add r5, r5
- pxor m4, m4
- mov r6d, %2/4
- mova m5, [pw_pixel_max]
-.loop:
- movd m0, [r2]
- movd m1, [r3]
- movd m2, [r2 + r4]
- movd m3, [r3 + r5]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movd [r0], m0
- movd [r0 + r1], m2
-
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
-
- movd m0, [r2]
- movd m1, [r3]
- movd m2, [r2 + r4]
- movd m3, [r3 + r5]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movd [r0], m0
- movd [r0 + r1], m2
-%else
-INIT_XMM sse4
-cglobal pixel_add_ps_%1x%2, 6, 7, 2, dest, destride, src0, scr1, srcStride0, srcStride1
-
-add r5, r5
-
-mov r6d, %2/4
-
-.loop:
- pmovzxbw m0, [r2]
- movd m1, [r3]
-
- paddw m0, m1
- packuswb m0, m0
-
- pextrw [r0], m0, 0
-
- pmovzxbw m0, [r2 + r4]
- movd m1, [r3 + r5]
-
- paddw m0, m1
- packuswb m0, m0
-
- pextrw [r0 + r1], m0, 0
-
- pmovzxbw m0, [r2 + 2 * r4]
- movd m1, [r3 + 2 * r5]
-
- paddw m0, m1
- packuswb m0, m0
-
- pextrw [r0 + 2 * r1], m0, 0
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
-
- pmovzxbw m0, [r2 + r4]
- movd m1, [r3 + r5]
-
- paddw m0, m1
- packuswb m0, m0
-
- pextrw [r0 + r1], m0, 0
-%endif
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
-
- dec r6d
- jnz .loop
-
-RET
-%endmacro
-
-PIXEL_ADD_PS_W2_H4 2, 8
-
-PIXEL_ADD_PS_W2_H4 2, 16
-
-;-----------------------------------------------------------------------------
-; void pixel_add_ps_4x2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
-;-----------------------------------------------------------------------------
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-cglobal pixel_add_ps_4x2, 6, 6, 4
- add r1, r1
- add r4, r4
- add r5, r5
- pxor m0, m0
- mova m1, [pw_pixel_max]
-
- movh m2, [r2]
- movhps m2, [r2 + r4]
-
- movh m3, [r3]
- movhps m3, [r3 + r5]
-
- paddw m2, m3
- CLIPW m2, m0, m1
-
- movh [r0], m2
- movhps [r0 + r1], m2
-%else
-INIT_XMM sse4
-cglobal pixel_add_ps_4x2, 6, 6, 2, dest, destride, src0, scr1, srcStride0, srcStride1
-
-add r5, r5
-
-pmovzxbw m0, [r2]
-movh m1, [r3]
-
-paddw m0, m1
-packuswb m0, m0
-
-movd [r0], m0
-
-pmovzxbw m0, [r2 + r4]
-movh m1, [r3 + r5]
-
-paddw m0, m1
-packuswb m0, m0
-
-movd [r0 + r1], m0
-%endif
-RET
-
-;-----------------------------------------------------------------------------
-; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
+; void pixel_add_ps_4x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W4_H4 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
-cglobal pixel_add_ps_%1x%2, 6, 7, 4
- mov r6d, %2/4
- add r1, r1
- add r4, r4
- add r5, r5
- pxor m0, m0
- mova m1, [pw_pixel_max]
+cglobal pixel_add_ps_4x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
+ mova m1, [pw_pixel_max]
+ pxor m0, m0
+ mov r6d, %2/4
+ add r4, r4
+ add r5, r5
+ add r1, r1
.loop:
- movh m2, [r2]
- movhps m2, [r2 + r4]
+ movh m2, [r2]
+ movhps m2, [r2 + r4]
+ movh m3, [r3]
+ movhps m3, [r3 + r5]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+ movh m4, [r2]
+ movhps m4, [r2 + r4]
+ movh m5, [r3]
+ movhps m5, [r3 + r5]
+ dec r6d
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
- movh m3, [r3]
- movhps m3, [r3 + r5]
+ paddw m2, m3
+ paddw m4, m5
+ CLIPW2 m2, m4, m0, m1
- paddw m2, m3
- CLIPW m2, m0, m1
+ movh [r0], m2
+ movhps [r0 + r1], m2
+ lea r0, [r0 + r1 * 2]
+ movh [r0], m4
+ movhps [r0 + r1], m4
+ lea r0, [r0 + r1 * 2]
- movlps [r0], m2
- movhps [r0 + r1], m2
-
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
-
- movh m2, [r2]
- movhps m2, [r2 + r4]
-
- movh m3, [r3]
- movhps m3, [r3 + r5]
-
- paddw m2, m3
- CLIPW m2, m0, m1
-
- movh [r0], m2
- movhps [r0 + r1], m2
+ jnz .loop
+ RET
%else
INIT_XMM sse4
-cglobal pixel_add_ps_%1x%2, 6, 7, 2, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_4x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+ mov r6d, %2/4
+ add r5, r5
+.loop:
+ pmovzxbw m0, [r2]
+ pmovzxbw m2, [r2 + r4]
+ movh m1, [r3]
+ movh m3, [r3 + r5]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+ pmovzxbw m4, [r2]
+ pmovzxbw m6, [r2 + r4]
+ movh m5, [r3]
+ movh m7, [r3 + r5]
+ dec r6d
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
-add r5, r5
+ paddw m0, m1
+ paddw m2, m3
+ paddw m4, m5
+ paddw m6, m7
+ packuswb m0, m0
+ packuswb m2, m2
+ packuswb m4, m4
+ packuswb m6, m6
-mov r6d, %2/4
+ movd [r0], m0
+ movd [r0 + r1], m2
+ lea r0, [r0 + r1 * 2]
+ movd [r0], m4
+ movd [r0 + r1], m6
+ lea r0, [r0 + r1 * 2]
-.loop:
-
- pmovzxbw m0, [r2]
- movh m1, [r3]
-
- paddw m0, m1
- packuswb m0, m0
-
- movd [r0], m0
-
- pmovzxbw m0, [r2 + r4]
- movh m1, [r3 + r5]
-
- paddw m0, m1
- packuswb m0, m0
-
- movd [r0 + r1], m0
-
- pmovzxbw m0, [r2 + 2 * r4]
- movh m1, [r3 + 2 * r5]
-
- paddw m0, m1
- packuswb m0, m0
-
- movd [r0 + 2 * r1], m0
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
-
- pmovzxbw m0, [r2 + r4]
- movh m1, [r3 + r5]
-
- paddw m0, m1
- packuswb m0, m0
-
- movd [r0 + r1], m0
+ jnz .loop
+ RET
%endif
- dec r6d
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- jnz .loop
-
-RET
%endmacro
-PIXEL_ADD_PS_W4_H4 4, 4
PIXEL_ADD_PS_W4_H4 4, 8
-PIXEL_ADD_PS_W4_H4 4, 16
-PIXEL_ADD_PS_W4_H4 4, 32
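In the 8-bit add_ps kernels the clip back to pixel range comes for free: packuswb saturates every word to [0, 255] as it narrows, so no explicit CLIPW step is needed on that path. A sketch of one 8-pixel step (illustrative helper, not from the tree):

    #include <cstdint>
    #include <emmintrin.h>   // SSE2
    #include <smmintrin.h>   // SSE4.1: _mm_cvtepu8_epi16

    // Add 8 int16_t residuals to 8 widened pixels, then narrow with
    // unsigned saturation; the low 8 bytes of the result are the output.
    static inline __m128i add_row8(const uint8_t* src0, const int16_t* src1)
    {
        __m128i p = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*)src0));
        __m128i r = _mm_loadu_si128((const __m128i*)src1);
        __m128i s = _mm_add_epi16(p, r);           // sum may leave [0, 255]
        return _mm_packus_epi16(s, s);             // packuswb: clip + narrow
    }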
;-----------------------------------------------------------------------------
-; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
-;-----------------------------------------------------------------------------
-%macro PIXEL_ADD_PS_W6_H4 2
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-cglobal pixel_add_ps_%1x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
- mov r6d, %2/4
- add r1, r1
- add r4, r4
- add r5, r5
- pxor m4, m4
- mova m5, [pw_pixel_max]
-.loop:
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + r4]
- movu m3, [r3 + r5]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movh [r0], m0
- pshufd m1, m0, 2
- movd [r0 + 8], m1
- movh [r0 + r1], m2
- pshufd m3, m2, 2
- movd [r0 + r1 + 8], m3
-
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + r4]
- movu m3, [r3 + r5]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movh [r0], m0
- pshufd m1, m0, 2
- movd [r0 + 8], m1
- movh [r0 + r1], m2
- pshufd m3, m2, 2
- movd [r0 + r1 + 8], m3
-%else
-INIT_XMM sse4
-cglobal pixel_add_ps_%1x%2, 6, 7, 2, dest, destride, src0, scr1, srcStride0, srcStride1
-
-add r5, r5
-
-mov r6d, %2/4
-
-.loop:
- pmovzxbw m0, [r2]
- movu m1, [r3]
-
- paddw m0, m1
- packuswb m0, m0
-
- movd [r0], m0
- pextrw [r0 + 4], m0, 2
-
- pmovzxbw m0, [r2 + r4]
- movu m1, [r3 + r5]
-
- paddw m0, m1
- packuswb m0, m0
-
- movd [r0 + r1], m0
- pextrw [r0 + r1 + 4], m0, 2
-
- pmovzxbw m0, [r2 + 2 * r4]
- movu m1, [r3 + 2 * r5]
-
- paddw m0, m1
- packuswb m0, m0
-
- movd [r0 + 2 * r1], m0
- pextrw [r0 + 2 * r1 + 4], m0, 2
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
-
- pmovzxbw m0, [r2 + r4]
- movu m1, [r3 + r5]
-
- paddw m0, m1
- packuswb m0, m0
-
- movd [r0 + r1], m0
- pextrw [r0 + r1 + 4], m0, 2
-%endif
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
-
- dec r6d
- jnz .loop
-
-RET
-%endmacro
-
-PIXEL_ADD_PS_W6_H4 6, 8
-
-PIXEL_ADD_PS_W6_H4 6, 16
-
-;-----------------------------------------------------------------------------
-; void pixel_add_ps_8x2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
-;-----------------------------------------------------------------------------
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-cglobal pixel_add_ps_8x2, 6, 6, 6, dest, destride, src0, scr1, srcStride0, srcStride1
- add r1, r1
- add r4, r4
- add r5, r5
- pxor m4, m4
- mova m5, [pw_pixel_max]
-
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + r4]
- movu m3, [r3 + r5]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movu [r0], m0
- movu [r0 + r1], m2
-%else
-INIT_XMM sse4
-cglobal pixel_add_ps_8x2, 6, 6, 2, dest, destride, src0, scr1, srcStride0, srcStride1
-
-add r5, r5
-
-pmovzxbw m0, [r2]
-movu m1, [r3]
-
-paddw m0, m1
-packuswb m0, m0
-
-movh [r0], m0
-
-pmovzxbw m0, [r2 + r4]
-movu m1, [r3 + r5]
-
-paddw m0, m1
-packuswb m0, m0
-
-movh [r0 + r1], m0
-%endif
-RET
-
-;-----------------------------------------------------------------------------
-; void pixel_add_ps_8x6(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
-;-----------------------------------------------------------------------------
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-cglobal pixel_add_ps_8x6, 6, 6, 6, dest, destride, src0, scr1, srcStride0, srcStride1
- add r1, r1
- add r4, r4
- add r5, r5
- pxor m4, m4
- mova m5, [pw_pixel_max]
-
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + r4]
- movu m3, [r3 + r5]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movu [r0], m0
- movu [r0 + r1], m2
-
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
-
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + r4]
- movu m3, [r3 + r5]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movu [r0], m0
- movu [r0 + r1], m2
-
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
-
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + r4]
- movu m3, [r3 + r5]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movu [r0], m0
- movu [r0 + r1], m2
-%else
-INIT_XMM sse4
-cglobal pixel_add_ps_8x6, 6, 6, 2, dest, destride, src0, scr1, srcStride0, srcStride1
-
-add r5, r5
-
-pmovzxbw m0, [r2]
-movu m1, [r3]
-
-paddw m0, m1
-packuswb m0, m0
-
-movh [r0], m0
-
-pmovzxbw m0, [r2 + r4]
-movu m1, [r3 + r5]
-
-paddw m0, m1
-packuswb m0, m0
-
-movh [r0 + r1], m0
-
-pmovzxbw m0, [r2 + 2 * r4]
-movu m1, [r3 + 2 * r5]
-
-paddw m0, m1
-packuswb m0, m0
-
-movh [r0 + 2 * r1], m0
-
-lea r0, [r0 + 2 * r1]
-lea r2, [r2 + 2 * r4]
-lea r3, [r3 + 2 * r5]
-
-pmovzxbw m0, [r2 + r4]
-movu m1, [r3 + r5]
-
-paddw m0, m1
-packuswb m0, m0
-
-movh [r0 + r1], m0
-
-pmovzxbw m0, [r2 + 2 * r4]
-movu m1, [r3 + 2 * r5]
-
-paddw m0, m1
-packuswb m0, m0
-
-movh [r0 + 2 * r1], m0
-
-lea r0, [r0 + 2 * r1]
-lea r2, [r2 + 2 * r4]
-lea r3, [r3 + 2 * r5]
-
-pmovzxbw m0, [r2 + r4]
-movu m1, [r3 + r5]
-
-paddw m0, m1
-packuswb m0, m0
-
-movh [r0 + r1], m0
-%endif
-RET
-
-;-----------------------------------------------------------------------------
-; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
+; void pixel_add_ps_8x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W8_H4 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
-cglobal pixel_add_ps_%1x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_8x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
+ mova m5, [pw_pixel_max]
+ pxor m4, m4
mov r6d, %2/4
- add r1, r1
- add r4, r4
- add r5, r5
- pxor m4, m4
- mova m5, [pw_pixel_max]
+ add r4, r4
+ add r5, r5
+ add r1, r1
.loop:
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + r4]
- movu m3, [r3 + r5]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
+ movu m0, [r2]
+ movu m2, [r2 + r4]
+ movu m1, [r3]
+ movu m3, [r3 + r5]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
- movu [r0], m0
- movu [r0 + r1], m2
+ paddw m0, m1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
+ movu [r0], m0
+ movu [r0 + r1], m2
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + r4]
- movu m3, [r3 + r5]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
+ movu m0, [r2]
+ movu m2, [r2 + r4]
+ movu m1, [r3]
+ movu m3, [r3 + r5]
+ dec r6d
+ lea r0, [r0 + r1 * 2]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
- movu [r0], m0
- movu [r0 + r1], m2
+ paddw m0, m1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5
+
+ movu [r0], m0
+ movu [r0 + r1], m2
+ lea r0, [r0 + r1 * 2]
+
+ jnz .loop
+ RET
%else
INIT_XMM sse4
-cglobal pixel_add_ps_%1x%2, 6, 7, 2, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_8x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+ mov r6d, %2/4
+ add r5, r5
+.loop:
+ pmovzxbw m0, [r2]
+ pmovzxbw m2, [r2 + r4]
+ movu m1, [r3]
+ movu m3, [r3 + r5]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+ pmovzxbw m4, [r2]
+ pmovzxbw m6, [r2 + r4]
+ movu m5, [r3]
+ movu m7, [r3 + r5]
+ dec r6d
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
-add r5, r5
+ paddw m0, m1
+ paddw m2, m3
+ paddw m4, m5
+ paddw m6, m7
+ packuswb m0, m0
+ packuswb m2, m2
+ packuswb m4, m4
+ packuswb m6, m6
-mov r6d, %2/4
+ movh [r0], m0
+ movh [r0 + r1], m2
+ lea r0, [r0 + r1 * 2]
+ movh [r0], m4
+ movh [r0 + r1], m6
+ lea r0, [r0 + r1 * 2]
-.loop:
- pmovzxbw m0, [r2]
- movu m1, [r3]
-
- paddw m0, m1
- packuswb m0, m0
-
- movh [r0], m0
-
- pmovzxbw m0, [r2 + r4]
- movu m1, [r3 + r5]
-
- paddw m0, m1
- packuswb m0, m0
-
- movh [r0 + r1], m0
-
- pmovzxbw m0, [r2 + 2 * r4]
- movu m1, [r3 + 2 * r5]
-
- paddw m0, m1
- packuswb m0, m0
-
- movh [r0 + 2 * r1], m0
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
-
- pmovzxbw m0, [r2 + r4]
- movu m1, [r3 + r5]
-
- paddw m0, m1
- packuswb m0, m0
-
- movh [r0 + r1], m0
+ jnz .loop
+ RET
%endif
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
-
- dec r6d
- jnz .loop
-
-RET
%endmacro
-PIXEL_ADD_PS_W8_H4 8, 4
PIXEL_ADD_PS_W8_H4 8, 8
PIXEL_ADD_PS_W8_H4 8, 16
-PIXEL_ADD_PS_W8_H4 8, 32
-PIXEL_ADD_PS_W8_H4 8, 12
-PIXEL_ADD_PS_W8_H4 8, 64
;-----------------------------------------------------------------------------
-; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
-;-----------------------------------------------------------------------------
-%macro PIXEL_ADD_PS_W12_H4 2
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-cglobal pixel_add_ps_%1x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
- mov r6d, %2/4
- add r1, r1
- add r4, r4
- add r5, r5
- pxor m4, m4
- mova m5, [pw_pixel_max]
-.loop:
- movu m0, [r2]
- movu m1, [r3]
- movh m2, [r2 + 16]
- movh m3, [r3 + 16]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movu [r0], m0
- movh [r0 + 16], m2
-
- movu m0, [r2 + r4]
- movu m1, [r3 + r5]
- movh m2, [r2 + r4 + 16]
- movh m3, [r3 + r5 + 16]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movu [r0 + r1], m0
- movh [r0 + r1 + 16], m2
-
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
- movu m0, [r2]
- movu m1, [r3]
- movh m2, [r2 + 16]
- movh m3, [r3 + 16]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movu [r0], m0
- movh [r0 + 16], m2
-
- movu m0, [r2 + r4]
- movu m1, [r3 + r5]
- movh m2, [r2 + r4 + 16]
- movh m3, [r3 + r5 + 16]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movu [r0 + r1], m0
- movh [r0 + r1 + 16], m2
-%else
-INIT_XMM sse4
-cglobal pixel_add_ps_%1x%2, 6, 7, 4, dest, destride, src0, scr1, srcStride0, srcStride1
-
-add r5, r5
-
-mov r6d, %2/4
-
-.loop:
- pmovzxbw m0, [r2]
- pmovzxbw m1, [r2 + 8]
-
- movu m2, [r3]
- movh m3, [r3 + 16]
-
- paddw m0, m2
- paddw m1, m3
-
- packuswb m0, m1
-
- movh [r0], m0
- movhlps m0, m0
- movd [r0 + 8], m0
-
- pmovzxbw m0, [r2 + r4]
- pmovzxbw m1, [r2 + r4 + 8]
-
- movu m2, [r3 + r5]
- movh m3, [r3 + r5 + 16]
-
- paddw m0, m2
- paddw m1, m3
-
- packuswb m0, m1
-
- movh [r0 + r1], m0
- movhlps m0, m0
- movd [r0 + r1 + 8], m0
-
- pmovzxbw m0, [r2 + 2 * r4]
- pmovzxbw m1, [r2 + 2 * r4 + 8]
-
- movu m2, [r3 + 2 * r5]
- movh m3, [r3 + 2 * r5 + 16]
-
- paddw m0, m2
- paddw m1, m3
-
- packuswb m0, m1
-
- movh [r0 + 2 * r1], m0
- movhlps m0, m0
- movd [r0 + 2 * r1 + 8], m0
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
-
- pmovzxbw m0, [r2 + r4]
- pmovzxbw m1, [r2 + r4 + 8]
-
- movu m2, [r3 + r5]
- movh m3, [r3 + r5 + 16]
-
- paddw m0, m2
- paddw m1, m3
-
- packuswb m0, m1
-
- movh [r0 + r1], m0
- movhlps m0, m0
- movd [r0 + r1 + 8], m0
-%endif
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
-
- dec r6d
- jnz .loop
-
-RET
-%endmacro
-
-PIXEL_ADD_PS_W12_H4 12, 16
-
-PIXEL_ADD_PS_W12_H4 12, 32
-
-;-----------------------------------------------------------------------------
-; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
+; void pixel_add_ps_16x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W16_H4 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
-cglobal pixel_add_ps_%1x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_16x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
+ mova m5, [pw_pixel_max]
+ pxor m4, m4
mov r6d, %2/4
- add r1, r1
- add r4, r4
- add r5, r5
- pxor m4, m4
- mova m5, [pw_pixel_max]
+ add r4, r4
+ add r5, r5
+ add r1, r1
.loop:
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + 16]
- movu m3, [r3 + 16]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
+ movu m0, [r2]
+ movu m2, [r2 + 16]
+ movu m1, [r3]
+ movu m3, [r3 + 16]
- movu [r0], m0
- movu [r0 + 16], m2
+ paddw m0, m1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5
- movu m0, [r2 + r4]
- movu m1, [r3 + r5]
- movu m2, [r2 + r4 + 16]
- movu m3, [r3 + r5 + 16]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
+ movu [r0], m0
+ movu [r0 + 16], m2
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m2
+ movu m0, [r2 + r4]
+ movu m2, [r2 + r4 + 16]
+ movu m1, [r3 + r5]
+ movu m3, [r3 + r5 + 16]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
+ paddw m0, m1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + 16]
- movu m3, [r3 + 16]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 16], m2
- movu [r0], m0
- movu [r0 + 16], m2
+ movu m0, [r2]
+ movu m2, [r2 + 16]
+ movu m1, [r3]
+ movu m3, [r3 + 16]
+ lea r0, [r0 + r1 * 2]
- movu m0, [r2 + r4]
- movu m1, [r3 + r5]
- movu m2, [r2 + r4 + 16]
- movu m3, [r3 + r5 + 16]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
+ paddw m0, m1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m2
+ movu [r0], m0
+ movu [r0 + 16], m2
+
+ movu m0, [r2 + r4]
+ movu m2, [r2 + r4 + 16]
+ movu m1, [r3 + r5]
+ movu m3, [r3 + r5 + 16]
+ dec r6d
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+
+ paddw m0, m1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5
+
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 16], m2
+ lea r0, [r0 + r1 * 2]
+
+ jnz .loop
+ RET
%else
INIT_XMM sse4
-cglobal pixel_add_ps_%1x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_16x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+ mov r6d, %2/4
+ add r5, r5
+.loop:
+ pmovzxbw m0, [r2]
+ pmovzxbw m1, [r2 + 8]
+ pmovzxbw m4, [r2 + r4]
+ pmovzxbw m5, [r2 + r4 + 8]
+ movu m2, [r3]
+ movu m3, [r3 + 16]
+ movu m6, [r3 + r5]
+ movu m7, [r3 + r5 + 16]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
-add r5, r5
+ paddw m0, m2
+ paddw m1, m3
+ paddw m4, m6
+ paddw m5, m7
+ packuswb m0, m1
+ packuswb m4, m5
-mov r6d, %2/4
+ movu [r0], m0
+ movu [r0 + r1], m4
-.loop:
- pmovzxbw m0, [r2]
- pmovzxbw m1, [r2 + 8]
+ pmovzxbw m0, [r2]
+ pmovzxbw m1, [r2 + 8]
+ pmovzxbw m4, [r2 + r4]
+ pmovzxbw m5, [r2 + r4 + 8]
+ movu m2, [r3]
+ movu m3, [r3 + 16]
+ movu m6, [r3 + r5]
+ movu m7, [r3 + r5 + 16]
+ dec r6d
+ lea r0, [r0 + r1 * 2]
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
- movu m2, [r3]
- movu m3, [r3 + 16]
+ paddw m0, m2
+ paddw m1, m3
+ paddw m4, m6
+ paddw m5, m7
+ packuswb m0, m1
+ packuswb m4, m5
- paddw m0, m2
- paddw m1, m3
+ movu [r0], m0
+ movu [r0 + r1], m4
+ lea r0, [r0 + r1 * 2]
- packuswb m0, m1
-
- movu [r0], m0
-
- pmovzxbw m0, [r2 + r4]
- pmovzxbw m1, [r2 + r4 + 8]
-
- movu m2, [r3 + r5]
- movu m3, [r3 + r5 + 16]
-
- paddw m0, m2
- paddw m1, m3
-
- packuswb m0, m1
-
- movu [r0 + r1], m0
-
- pmovzxbw m0, [r2 + 2 * r4]
- pmovzxbw m1, [r2 + 2 * r4 + 8]
-
- movu m2, [r3 + 2 * r5]
- movu m3, [r3 + 2 * r5 + 16]
-
- paddw m0, m2
- paddw m1, m3
-
- packuswb m0, m1
-
- movu [r0 + 2 * r1], m0
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
-
- pmovzxbw m0, [r2 + r4]
- pmovzxbw m1, [r2 + r4 + 8]
-
- movu m2, [r3 + r5]
- movu m3, [r3 + r5 + 16]
-
- paddw m0, m2
- paddw m1, m3
-
- packuswb m0, m1
-
- movu [r0 + r1], m0
+ jnz .loop
+ RET
%endif
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
-
- dec r6d
- jnz .loop
-
-RET
%endmacro
-PIXEL_ADD_PS_W16_H4 16, 4
-PIXEL_ADD_PS_W16_H4 16, 8
-PIXEL_ADD_PS_W16_H4 16, 12
PIXEL_ADD_PS_W16_H4 16, 16
PIXEL_ADD_PS_W16_H4 16, 32
-PIXEL_ADD_PS_W16_H4 16, 64
-PIXEL_ADD_PS_W16_H4 16, 24
;-----------------------------------------------------------------------------
-; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
-;-----------------------------------------------------------------------------
-%macro PIXEL_ADD_PS_W24_H2 2
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-cglobal pixel_add_ps_%1x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
- mov r6d, %2/2
- add r1, r1
- add r4, r4
- add r5, r5
- pxor m4, m4
- mova m5, [pw_pixel_max]
-.loop:
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + 16]
- movu m3, [r3 + 16]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movu [r0], m0
- movu [r0 + 16], m2
-
- movu m0, [r2 + 32]
- movu m1, [r3 + 32]
- movu m2, [r2 + r4]
- movu m3, [r3 + r5]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movu [r0 + 32], m0
- movu [r0 + r1], m2
-
- movu m0, [r2 + r4 + 16]
- movu m1, [r3 + r5 + 16]
- movu m2, [r2 + r4 + 32]
- movu m3, [r3 + r5 + 32]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movu [r0 + r1 + 16], m0
- movu [r0 + r1 + 32], m2
-%else
-INIT_XMM sse4
-cglobal pixel_add_ps_%1x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
-
- add r5, r5
- mov r6d, %2/2
-
-.loop:
- pmovzxbw m0, [r2]
- pmovzxbw m1, [r2 + 8]
- pmovzxbw m2, [r2 + 16]
-
- movu m3, [r3]
- movu m4, [r3 + 16]
- movu m5, [r3 + 32]
-
- paddw m0, m3
- paddw m1, m4
- paddw m2, m5
-
- packuswb m0, m1
- packuswb m2, m2
-
- movu [r0], m0
- movh [r0 + 16], m2
-
- pmovzxbw m0, [r2 + r4]
- pmovzxbw m1, [r2 + r4 + 8]
- pmovzxbw m2, [r2 + r4 + 16]
-
- movu m3, [r3 + r5]
- movu m4, [r3 + r5 + 16]
- movu m5, [r3 + r5 + 32]
-
- paddw m0, m3
- paddw m1, m4
- paddw m2, m5
-
- packuswb m0, m1
- packuswb m2, m2
-
- movu [r0 + r1], m0
- movh [r0 + r1 + 16], m2
-%endif
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
-
- dec r6d
- jnz .loop
-
- RET
-%endmacro
-
-PIXEL_ADD_PS_W24_H2 24, 32
-
-PIXEL_ADD_PS_W24_H2 24, 64
-
-;-----------------------------------------------------------------------------
-; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
+; void pixel_add_ps_32x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W32_H2 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
-cglobal pixel_add_ps_%1x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_32x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
+ mova m5, [pw_pixel_max]
+ pxor m4, m4
mov r6d, %2/2
- add r1, r1
- add r4, r4
- add r5, r5
- pxor m4, m4
- mova m5, [pw_pixel_max]
+ add r4, r4
+ add r5, r5
+ add r1, r1
.loop:
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + 16]
- movu m3, [r3 + 16]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
+ movu m0, [r2]
+ movu m2, [r2 + 16]
+ movu m1, [r3]
+ movu m3, [r3 + 16]
- movu [r0], m0
- movu [r0 + 16], m2
+ paddw m0, m1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5
- movu m0, [r2 + 32]
- movu m1, [r3 + 32]
- movu m2, [r2 + 48]
- movu m3, [r3 + 48]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
+ movu [r0], m0
+ movu [r0 + 16], m2
- movu [r0 + 32], m0
- movu [r0 + 48], m2
+ movu m0, [r2 + 32]
+ movu m2, [r2 + 48]
+ movu m1, [r3 + 32]
+ movu m3, [r3 + 48]
- movu m0, [r2 + r4]
- movu m1, [r3 + r5]
- movu m2, [r2 + r4 + 16]
- movu m3, [r3 + r5 + 16]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
+ paddw m0, m1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m2
+ movu [r0 + 32], m0
+ movu [r0 + 48], m2
- movu m0, [r2 + r4 + 32]
- movu m1, [r3 + r5 + 32]
- movu m2, [r2 + r4 + 48]
- movu m3, [r3 + r5 + 48]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
+ movu m0, [r2 + r4]
+ movu m2, [r2 + r4 + 16]
+ movu m1, [r3 + r5]
+ movu m3, [r3 + r5 + 16]
- movu [r0 + r1 + 32], m0
- movu [r0 + r1 + 48], m2
+ paddw m0, m1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5
+
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 16], m2
+
+ movu m0, [r2 + r4 + 32]
+ movu m2, [r2 + r4 + 48]
+ movu m1, [r3 + r5 + 32]
+ movu m3, [r3 + r5 + 48]
+ dec r6d
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+
+ paddw m0, m1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5
+
+ movu [r0 + r1 + 32], m0
+ movu [r0 + r1 + 48], m2
+ lea r0, [r0 + r1 * 2]
+
+ jnz .loop
+ RET
%else
INIT_XMM sse4
-cglobal pixel_add_ps_%1x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_32x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+ mov r6d, %2/2
+ add r5, r5
+.loop:
+ pmovzxbw m0, [r2]
+ pmovzxbw m1, [r2 + 8]
+ pmovzxbw m2, [r2 + 16]
+ pmovzxbw m3, [r2 + 24]
+ movu m4, [r3]
+ movu m5, [r3 + 16]
+ movu m6, [r3 + 32]
+ movu m7, [r3 + 48]
- add r5, r5
- mov r6d, %2/2
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ packuswb m0, m1
+ packuswb m2, m3
-.loop:
- pmovzxbw m0, [r2]
- pmovzxbw m1, [r2 + 8]
- pmovzxbw m2, [r2 + 16]
- pmovzxbw m3, [r2 + 24]
+ movu [r0], m0
+ movu [r0 + 16], m2
- movu m4, [r3]
- movu m5, [r3 + 16]
- movu m6, [r3 + 32]
- movu m7, [r3 + 48]
+ pmovzxbw m0, [r2 + r4]
+ pmovzxbw m1, [r2 + r4 + 8]
+ pmovzxbw m2, [r2 + r4 + 16]
+ pmovzxbw m3, [r2 + r4 + 24]
+ movu m4, [r3 + r5]
+ movu m5, [r3 + r5 + 16]
+ movu m6, [r3 + r5 + 32]
+ movu m7, [r3 + r5 + 48]
+ dec r6d
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
- paddw m0, m4
- paddw m1, m5
- paddw m2, m6
- paddw m3, m7
-
- packuswb m0, m1
- packuswb m2, m3
-
- movu [r0], m0
- movu [r0 + 16], m2
-
- pmovzxbw m0, [r2 + r4]
- pmovzxbw m1, [r2 + r4 + 8]
- pmovzxbw m2, [r2 + r4 + 16]
- pmovzxbw m3, [r2 + r4 + 24]
-
- movu m4, [r3 + r5]
- movu m5, [r3 + r5 + 16]
- movu m6, [r3 + r5 + 32]
- movu m7, [r3 + r5 + 48]
-
- paddw m0, m4
- paddw m1, m5
- paddw m2, m6
- paddw m3, m7
-
- packuswb m0, m1
- packuswb m2, m3
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ packuswb m0, m1
+ packuswb m2, m3
movu [r0 + r1], m0
movu [r0 + r1 + 16], m2
+ lea r0, [r0 + r1 * 2]
+
+ jnz .loop
+ RET
%endif
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
-
- dec r6d
- jnz .loop
-
- RET
%endmacro
-PIXEL_ADD_PS_W32_H2 32, 8
-PIXEL_ADD_PS_W32_H2 32, 16
-PIXEL_ADD_PS_W32_H2 32, 24
PIXEL_ADD_PS_W32_H2 32, 32
PIXEL_ADD_PS_W32_H2 32, 64
-PIXEL_ADD_PS_W32_H2 32, 48
;-----------------------------------------------------------------------------
-; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
-;-----------------------------------------------------------------------------
-%macro PIXEL_ADD_PS_W48_H2 2
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-cglobal pixel_add_ps_%1x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
- mov r6d, %2/2
- add r1, r1
- add r4, r4
- add r5, r5
- pxor m4, m4
- mova m5, [pw_pixel_max]
-.loop:
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + 16]
- movu m3, [r3 + 16]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movu [r0], m0
- movu [r0 + 16], m2
-
- movu m0, [r2 + 32]
- movu m1, [r3 + 32]
- movu m2, [r2 + 48]
- movu m3, [r3 + 48]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movu [r0 + 32], m0
- movu [r0 + 48], m2
-
- movu m0, [r2 + 64]
- movu m1, [r3 + 64]
- movu m2, [r2 + 80]
- movu m3, [r3 + 80]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movu [r0 + 64], m0
- movu [r0 + 80], m2
-
- movu m0, [r2 + r4]
- movu m1, [r3 + r5]
- movu m2, [r2 + r4 + 16]
- movu m3, [r3 + r5 + 16]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m2
-
- movu m0, [r2 + r4 + 32]
- movu m1, [r3 + r5 + 32]
- movu m2, [r2 + r4 + 48]
- movu m3, [r3 + r5 + 48]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movu [r0 + r1 + 32], m0
- movu [r0 + r1 + 48], m2
-
- movu m0, [r2 + r4 + 64]
- movu m1, [r3 + r5 + 64]
- movu m2, [r2 + r4 + 80]
- movu m3, [r3 + r5 + 80]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
-
- movu [r0 + r1 + 64], m0
- movu [r0 + r1 + 80], m2
-%else
-INIT_XMM sse4
-cglobal pixel_add_ps_%1x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
-
-add r5, r5
-
-mov r6d, %2/2
-
-.loop:
- pmovzxbw m0, [r2]
- pmovzxbw m1, [r2 + 8]
- pmovzxbw m2, [r2 + 16]
- pmovzxbw m3, [r2 + 24]
-
- movu m4, [r3]
- movu m5, [r3 + 16]
- movu m6, [r3 + 32]
- movu m7, [r3 + 48]
-
- paddw m0, m4
- paddw m1, m5
- paddw m2, m6
- paddw m3, m7
-
- packuswb m0, m1
- packuswb m2, m3
-
- movu [r0], m0
- movu [r0 + 16], m2
-
- pmovzxbw m0, [r2 + 32]
- pmovzxbw m1, [r2 + 40]
-
- movu m2, [r3 + 64]
- movu m3, [r3 + 80]
-
- paddw m0, m2
- paddw m1, m3
-
- packuswb m0, m1
-
- movu [r0 + 32], m0
-
- pmovzxbw m0, [r2 + r4]
- pmovzxbw m1, [r2 + r4 + 8]
- pmovzxbw m2, [r2 + r4 + 16]
- pmovzxbw m3, [r2 + r4 + 24]
-
- movu m4, [r3 + r5]
- movu m5, [r3 + r5 + 16]
- movu m6, [r3 + r5 + 32]
- movu m7, [r3 + r5 + 48]
-
- paddw m0, m4
- paddw m1, m5
- paddw m2, m6
- paddw m3, m7
-
- packuswb m0, m1
- packuswb m2, m3
-
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m2
-
- pmovzxbw m0, [r2 + r4 + 32]
- pmovzxbw m1, [r2 + r4 + 40]
-
- movu m2, [r3 + r5 + 64]
- movu m3, [r3 + r5 + 80]
-
- paddw m0, m2
- paddw m1, m3
-
- packuswb m0, m1
-
- movu [r0 + r1 + 32], m0
-%endif
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
-
- dec r6d
- jnz .loop
-
-RET
-%endmacro
-
-PIXEL_ADD_PS_W48_H2 48, 64
-
-;-----------------------------------------------------------------------------
-; void pixel_add_ps_%1x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
+; void pixel_add_ps_64x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W64_H2 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
-cglobal pixel_add_ps_%1x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_64x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
+ mova m5, [pw_pixel_max]
+ pxor m4, m4
mov r6d, %2/2
- add r1, r1
- add r4, r4
- add r5, r5
- pxor m4, m4
- mova m5, [pw_pixel_max]
+ add r4, r4
+ add r5, r5
+ add r1, r1
.loop:
- movu m0, [r2]
- movu m1, [r3]
- movu m2, [r2 + 16]
- movu m3, [r3 + 16]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
+ movu m0, [r2]
+ movu m2, [r2 + 16]
+ movu m1, [r3]
+ movu m3, [r3 + 16]
- movu [r0], m0
- movu [r0 + 16], m2
+ paddw m0, m1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5
- movu m0, [r2 + 32]
- movu m1, [r3 + 32]
- movu m2, [r2 + 48]
- movu m3, [r3 + 48]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
+ movu [r0], m0
+ movu [r0 + 16], m2
- movu [r0 + 32], m0
- movu [r0 + 48], m2
+ movu m0, [r2 + 32]
+ movu m2, [r2 + 48]
+ movu m1, [r3 + 32]
+ movu m3, [r3 + 48]
- movu m0, [r2 + 64]
- movu m1, [r3 + 64]
- movu m2, [r2 + 80]
- movu m3, [r3 + 80]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
+ paddw m0, m1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5
- movu [r0 + 64], m0
- movu [r0 + 80], m2
+ movu [r0 + 32], m0
+ movu [r0 + 48], m2
- movu m0, [r2 + 96]
- movu m1, [r3 + 96]
- movu m2, [r2 + 112]
- movu m3, [r3 + 112]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
+ movu m0, [r2 + 64]
+ movu m2, [r2 + 80]
+ movu m1, [r3 + 64]
+ movu m3, [r3 + 80]
- movu [r0 + 96], m0
- movu [r0 + 112], m2
+ paddw m0, m1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5
- movu m0, [r2 + r4]
- movu m1, [r3 + r5]
- movu m2, [r2 + r4 + 16]
- movu m3, [r3 + r5 + 16]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
+ movu [r0 + 64], m0
+ movu [r0 + 80], m2
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m2
+ movu m0, [r2 + 96]
+ movu m2, [r2 + 112]
+ movu m1, [r3 + 96]
+ movu m3, [r3 + 112]
- movu m0, [r2 + r4 + 32]
- movu m1, [r3 + r5 + 32]
- movu m2, [r2 + r4 + 48]
- movu m3, [r3 + r5 + 48]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
+ paddw m0, m1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5
- movu [r0 + r1 + 32], m0
- movu [r0 + r1 + 48], m2
+ movu [r0 + 96], m0
+ movu [r0 + 112], m2
- movu m0, [r2 + r4 + 64]
- movu m1, [r3 + r5 + 64]
- movu m2, [r2 + r4 + 80]
- movu m3, [r3 + r5 + 80]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
+ movu m0, [r2 + r4]
+ movu m2, [r2 + r4 + 16]
+ movu m1, [r3 + r5]
+ movu m3, [r3 + r5 + 16]
- movu [r0 + r1 + 64], m0
- movu [r0 + r1 + 80], m2
+ paddw m0, m1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5
- movu m0, [r2 + r4 + 96]
- movu m1, [r3 + r5 + 96]
- movu m2, [r2 + r4 + 112]
- movu m3, [r3 + r5 + 112]
- paddw m0, m1
- paddw m2, m3
- CLIPW m0, m4, m5
- CLIPW m2, m4, m5
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 16], m2
- movu [r0 + r1 + 96], m0
- movu [r0 + r1 + 112], m2
+ movu m0, [r2 + r4 + 32]
+ movu m2, [r2 + r4 + 48]
+ movu m1, [r3 + r5 + 32]
+ movu m3, [r3 + r5 + 48]
+
+ paddw m0, m1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5
+
+ movu [r0 + r1 + 32], m0
+ movu [r0 + r1 + 48], m2
+
+ movu m0, [r2 + r4 + 64]
+ movu m2, [r2 + r4 + 80]
+ movu m1, [r3 + r5 + 64]
+ movu m3, [r3 + r5 + 80]
+
+ paddw m0, m1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5
+
+ movu [r0 + r1 + 64], m0
+ movu [r0 + r1 + 80], m2
+
+ movu m0, [r2 + r4 + 96]
+ movu m2, [r2 + r4 + 112]
+ movu m1, [r3 + r5 + 96]
+ movu m3, [r3 + r5 + 112]
+ dec r6d
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+
+ paddw m0, m1
+ paddw m2, m3
+ CLIPW2 m0, m2, m4, m5
+
+ movu [r0 + r1 + 96], m0
+ movu [r0 + r1 + 112], m2
+ lea r0, [r0 + r1 * 2]
+
+ jnz .loop
+ RET
%else
INIT_XMM sse4
-cglobal pixel_add_ps_%1x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_64x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+ mov r6d, %2/2
+ add r5, r5
+.loop:
+ pmovzxbw m0, [r2]
+ pmovzxbw m1, [r2 + 8]
+ pmovzxbw m2, [r2 + 16]
+ pmovzxbw m3, [r2 + 24]
+ movu m4, [r3]
+ movu m5, [r3 + 16]
+ movu m6, [r3 + 32]
+ movu m7, [r3 + 48]
- add r5, r5
- mov r6d, %2/2
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ packuswb m0, m1
+ packuswb m2, m3
-.loop:
- pmovzxbw m0, [r2]
- pmovzxbw m1, [r2 + 8]
- pmovzxbw m2, [r2 + 16]
- pmovzxbw m3, [r2 + 24]
+ movu [r0], m0
+ movu [r0 + 16], m2
- movu m4, [r3]
- movu m5, [r3 + 16]
- movu m6, [r3 + 32]
- movu m7, [r3 + 48]
+ pmovzxbw m0, [r2 + 32]
+ pmovzxbw m1, [r2 + 40]
+ pmovzxbw m2, [r2 + 48]
+ pmovzxbw m3, [r2 + 56]
+ movu m4, [r3 + 64]
+ movu m5, [r3 + 80]
+ movu m6, [r3 + 96]
+ movu m7, [r3 + 112]
- paddw m0, m4
- paddw m1, m5
- paddw m2, m6
- paddw m3, m7
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ packuswb m0, m1
+ packuswb m2, m3
- packuswb m0, m1
- packuswb m2, m3
+ movu [r0 + 32], m0
+ movu [r0 + 48], m2
- movu [r0], m0
- movu [r0 + 16], m2
+ pmovzxbw m0, [r2 + r4]
+ pmovzxbw m1, [r2 + r4 + 8]
+ pmovzxbw m2, [r2 + r4 + 16]
+ pmovzxbw m3, [r2 + r4 + 24]
+ movu m4, [r3 + r5]
+ movu m5, [r3 + r5 + 16]
+ movu m6, [r3 + r5 + 32]
+ movu m7, [r3 + r5 + 48]
- pmovzxbw m0, [r2 + 32]
- pmovzxbw m1, [r2 + 40]
- pmovzxbw m2, [r2 + 48]
- pmovzxbw m3, [r2 + 56]
-
- movu m4, [r3 + 64]
- movu m5, [r3 + 80]
- movu m6, [r3 + 96]
- movu m7, [r3 + 112]
-
- paddw m0, m4
- paddw m1, m5
- paddw m2, m6
- paddw m3, m7
-
- packuswb m0, m1
- packuswb m2, m3
-
- movu [r0 + 32], m0
- movu [r0 + 48], m2
-
- pmovzxbw m0, [r2 + r4]
- pmovzxbw m1, [r2 + r4 + 8]
- pmovzxbw m2, [r2 + r4 + 16]
- pmovzxbw m3, [r2 + r4 + 24]
-
- movu m4, [r3 + r5]
- movu m5, [r3 + r5 + 16]
- movu m6, [r3 + r5 + 32]
- movu m7, [r3 + r5 + 48]
-
- paddw m0, m4
- paddw m1, m5
- paddw m2, m6
- paddw m3, m7
-
- packuswb m0, m1
- packuswb m2, m3
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ packuswb m0, m1
+ packuswb m2, m3
movu [r0 + r1], m0
movu [r0 + r1 + 16], m2
- pmovzxbw m0, [r2 + r4 + 32]
- pmovzxbw m1, [r2 + r4 + 40]
- pmovzxbw m2, [r2 + r4 + 48]
- pmovzxbw m3, [r2 + r4 + 56]
+ pmovzxbw m0, [r2 + r4 + 32]
+ pmovzxbw m1, [r2 + r4 + 40]
+ pmovzxbw m2, [r2 + r4 + 48]
+ pmovzxbw m3, [r2 + r4 + 56]
+ movu m4, [r3 + r5 + 64]
+ movu m5, [r3 + r5 + 80]
+ movu m6, [r3 + r5 + 96]
+ movu m7, [r3 + r5 + 112]
+ dec r6d
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
- movu m4, [r3 + r5 + 64]
- movu m5, [r3 + r5 + 80]
- movu m6, [r3 + r5 + 96]
- movu m7, [r3 + r5 + 112]
-
- paddw m0, m4
- paddw m1, m5
- paddw m2, m6
- paddw m3, m7
-
- packuswb m0, m1
- packuswb m2, m3
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ packuswb m0, m1
+ packuswb m2, m3
movu [r0 + r1 + 32], m0
movu [r0 + r1 + 48], m2
+ lea r0, [r0 + r1 * 2]
+
+ jnz .loop
+ RET
%endif
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
-
- dec r6d
- jnz .loop
-
- RET
%endmacro
-PIXEL_ADD_PS_W64_H2 64, 16
-PIXEL_ADD_PS_W64_H2 64, 32
-PIXEL_ADD_PS_W64_H2 64, 48
PIXEL_ADD_PS_W64_H2 64, 64
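
For readers following the asm: every pixel_add_ps_WxH kernel above computes the same per-pixel operation, add the int16_t residual plane to the pixel plane and clip the result back to pixel range. A minimal C sketch of that behaviour (illustrative only, not the x265 C primitive; uint8_t stands in for the pixel typedef, strides here are in elements where the asm doubles the int16_t stride into bytes with "add r5, r5", and the HIGH_BIT_DEPTH path clips with CLIPW/CLIPW2 instead of relying on packuswb saturation):

#include <stdint.h>

/* Hedged reference sketch of the add_ps operation, assuming 8-bit pixels. */
static void add_ps_ref(uint8_t* dst, intptr_t dstStride,
                       const uint8_t* src0, const int16_t* src1,
                       intptr_t stride0, intptr_t stride1,
                       int width, int height)
{
    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
        {
            int sum = src0[x] + src1[x];  /* predicted pixel + residual */
            /* clip to [0,255]; this is exactly packuswb's unsigned saturation */
            dst[x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
        }
        dst  += dstStride;
        src0 += stride0;
        src1 += stride1;
    }
}
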
diff -r 619633a933f6 -r 5cf494d8591b source/common/x86/x86util.asm
--- a/source/common/x86/x86util.asm Wed Aug 06 17:03:38 2014 -0500
+++ b/source/common/x86/x86util.asm Thu Aug 07 19:23:38 2014 +0900
@@ -290,6 +290,13 @@
pminsw %1, %3
%endmacro
+%macro CLIPW2 4 ;(dst0, dst1, min, max)
+ pmaxsw %1, %3
+ pmaxsw %2, %3
+ pminsw %1, %4
+ pminsw %2, %4
+%endmacro
+
%macro HADDD 2 ; sum junk
%if sizeof%1 == 32
%define %2 xmm%2
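
CLIPW2 is the two-register form of the existing CLIPW macro: it clamps the signed 16-bit lanes of two registers against the same min/max bounds, and grouping the two pmaxsw ops before the two pminsw ops keeps the dependency chains independent so they can overlap on superscalar cores. A hedged per-lane model in C (in the asm, lo/hi are broadcast across whole registers; in the HIGH_BIT_DEPTH add_ps loops above, m4 and m5 presumably hold zero and the bit-depth maximum):

#include <stdint.h>

static inline int16_t clip_word(int16_t v, int16_t lo, int16_t hi)
{
    v = (v > lo) ? v : lo;      /* pmaxsw */
    return (v < hi) ? v : hi;   /* pminsw */
}

So "CLIPW2 m0, m2, m4, m5" produces the same result as back-to-back "CLIPW m0, m4, m5" and "CLIPW m2, m4, m5".
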
diff -r 619633a933f6 -r 5cf494d8591b source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp Wed Aug 06 17:03:38 2014 -0500
+++ b/source/encoder/analysis.cpp Thu Aug 07 19:23:38 2014 +0900
@@ -1840,9 +1840,10 @@
if (lcu->getPredictionMode(absPartIdx) == MODE_INTER)
{
- int part = partitionFromLog2Size(cu->getLog2CUSize(0));
+ int log2CUSize = cu->getLog2CUSize(0);
if (!lcu->getSkipFlag(absPartIdx))
{
+ const int sizeIdx = log2CUSize - 2;
// Calculate Residue
pixel* src2 = m_bestPredYuv[0]->getLumaAddr(absPartIdx);
pixel* src1 = m_origYuv[0]->getLumaAddr(absPartIdx);
@@ -1850,7 +1851,7 @@
uint32_t src2stride = m_bestPredYuv[0]->getStride();
uint32_t src1stride = m_origYuv[0]->getStride();
uint32_t dststride = m_tmpResiYuv[depth]->m_width;
- primitives.luma_sub_ps[part](dst, dststride, src1, src2, src1stride, src2stride);
+ primitives.luma_sub_ps[sizeIdx](dst, dststride, src1, src2, src1stride, src2stride);
src2 = m_bestPredYuv[0]->getCbAddr(absPartIdx);
src1 = m_origYuv[0]->getCbAddr(absPartIdx);
@@ -1858,13 +1859,13 @@
src2stride = m_bestPredYuv[0]->getCStride();
src1stride = m_origYuv[0]->getCStride();
dststride = m_tmpResiYuv[depth]->m_cwidth;
- primitives.chroma[m_param->internalCsp].sub_ps[part](dst, dststride, src1, src2, src1stride, src2stride);
+ primitives.chroma[m_param->internalCsp].sub_ps[sizeIdx](dst, dststride, src1, src2, src1stride, src2stride);
src2 = m_bestPredYuv[0]->getCrAddr(absPartIdx);
src1 = m_origYuv[0]->getCrAddr(absPartIdx);
dst = m_tmpResiYuv[depth]->getCrAddr();
dststride = m_tmpResiYuv[depth]->m_cwidth;
- primitives.chroma[m_param->internalCsp].sub_ps[part](dst, dststride, src1, src2, src1stride, src2stride);
+ primitives.chroma[m_param->internalCsp].sub_ps[sizeIdx](dst, dststride, src1, src2, src1stride, src2stride);
// Residual encoding
residualTransformQuantInter(cu, 0, m_origYuv[0], m_tmpResiYuv[depth], cu->getDepth(0));
@@ -1886,7 +1887,7 @@
dststride = m_bestRecoYuv[depth]->getStride();
src1stride = m_bestPredYuv[0]->getStride();
src2stride = m_tmpResiYuv[depth]->m_width;
- primitives.luma_add_ps[part](reco, dststride, pred, res, src1stride, src2stride);
+ primitives.luma_add_ps[sizeIdx](reco, dststride, pred, res, src1stride, src2stride);
pred = m_bestPredYuv[0]->getCbAddr(absPartIdx);
res = m_tmpResiYuv[depth]->getCbAddr();
@@ -1894,19 +1895,20 @@
dststride = m_bestRecoYuv[depth]->getCStride();
src1stride = m_bestPredYuv[0]->getCStride();
src2stride = m_tmpResiYuv[depth]->m_cwidth;
- primitives.chroma[m_param->internalCsp].add_ps[part](reco, dststride, pred, res, src1stride, src2stride);
+ primitives.chroma[m_param->internalCsp].add_ps[sizeIdx](reco, dststride, pred, res, src1stride, src2stride);
pred = m_bestPredYuv[0]->getCrAddr(absPartIdx);
res = m_tmpResiYuv[depth]->getCrAddr();
reco = m_bestRecoYuv[depth]->getCrAddr();
- primitives.chroma[m_param->internalCsp].add_ps[part](reco, dststride, pred, res, src1stride, src2stride);
+ primitives.chroma[m_param->internalCsp].add_ps[sizeIdx](reco, dststride, pred, res, src1stride, src2stride);
m_bestRecoYuv[depth]->copyToPicYuv(pic->getPicYuvRec(), lcu->getAddr(), absPartIdx);
return;
}
}
// Generate Recon
+ int part = partitionFromLog2Size(log2CUSize);
TComPicYuv* rec = pic->getPicYuvRec();
pixel* src = m_bestPredYuv[0]->getLumaAddr(absPartIdx);
pixel* dst = rec->getLumaAddr(cu->getAddr(), absPartIdx);
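
The C++ side of the cleanup: since sub_ps/add_ps are only invoked at CU/TU level, their function tables are now indexed by a square-size index computed directly from the log2 block size, rather than by the PU partition enum from partitionFromLog2Size(). A sketch of the mapping, assuming the x265 convention that square-block index 0 is 4x4:

#include <stdint.h>

/* sizeIdx = log2Size - 2:
     log2Size  2    3    4      5      6
     block     4x4  8x8  16x16  32x32  64x64
     sizeIdx   0    1    2      3      4   */
static inline int squareSizeIdx(uint32_t log2Size)
{
    return (int)log2Size - 2;
}

The retained partitionFromLog2Size() call above feeds the PU-indexed primitives still used in the Generate Recon path.
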
diff -r 619633a933f6 -r 5cf494d8591b source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Wed Aug 06 17:03:38 2014 -0500
+++ b/source/test/pixelharness.cpp Thu Aug 07 19:23:38 2014 +0900
@@ -1217,24 +1217,6 @@
}
}
- if (opt.luma_sub_ps[part])
- {
- if (!check_pixel_sub_ps(ref.luma_sub_ps[part], opt.luma_sub_ps[part]))
- {
- printf("luma_sub_ps[%s] failed\n", lumaPartStr[part]);
- return false;
- }
- }
-
- if (opt.luma_add_ps[part])
- {
- if (!check_pixel_add_ps(ref.luma_add_ps[part], opt.luma_add_ps[part]))
- {
- printf("luma_add_ps[%s] failed\n", lumaPartStr[part]);
- return false;
- }
- }
-
if (opt.luma_addAvg[part])
{
if (!check_addAvg(ref.luma_addAvg[part], opt.luma_addAvg[part]))
@@ -1244,6 +1226,27 @@
}
}
+ if (part < NUM_SQUARE_BLOCKS)
+ {
+ if (opt.luma_sub_ps[part])
+ {
+ if (!check_pixel_sub_ps(ref.luma_sub_ps[part], opt.luma_sub_ps[part]))
+ {
+ printf("luma_sub_ps[%s] failed\n", lumaPartStr[part]);
+ return false;
+ }
+ }
+
+ if (opt.luma_add_ps[part])
+ {
+ if (!check_pixel_add_ps(ref.luma_add_ps[part], opt.luma_add_ps[part]))
+ {
+ printf("luma_add_ps[%s] failed\n", lumaPartStr[part]);
+ return false;
+ }
+ }
+ }
+
for (int i = 0; i < X265_CSP_COUNT; i++)
{
if (opt.chroma[i].copy_pp[part])
@@ -1278,22 +1281,6 @@
return false;
}
}
- if (opt.chroma[i].sub_ps[part])
- {
- if (!check_pixel_sub_ps(ref.chroma[i].sub_ps[part], opt.chroma[i].sub_ps[part]))
- {
- printf("chroma_sub_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
- return false;
- }
- }
- if (opt.chroma[i].add_ps[part])
- {
- if (!check_pixel_add_ps(ref.chroma[i].add_ps[part], opt.chroma[i].add_ps[part]))
- {
- printf("chroma_add_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
- return false;
- }
- }
if (opt.chroma[i].addAvg[part])
{
if (!check_addAvg(ref.chroma[i].addAvg[part], opt.chroma[i].addAvg[part]))
@@ -1302,6 +1289,25 @@
return false;
}
}
+ if (part < NUM_SQUARE_BLOCKS)
+ {
+ if (opt.chroma[i].sub_ps[part])
+ {
+ if (!check_pixel_sub_ps(ref.chroma[i].sub_ps[part], opt.chroma[i].sub_ps[part]))
+ {
+ printf("chroma_sub_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
+ return false;
+ }
+ }
+ if (opt.chroma[i].add_ps[part])
+ {
+ if (!check_pixel_add_ps(ref.chroma[i].add_ps[part], opt.chroma[i].add_ps[part]))
+ {
+ printf("chroma_add_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
+ return false;
+ }
+ }
+ }
}
return true;
@@ -1629,23 +1635,24 @@
HEADER("luma_copy_ss[%s]", lumaPartStr[part]);
REPORT_SPEEDUP(opt.luma_copy_ss[part], ref.luma_copy_ss[part], sbuf1, 64, sbuf2, 128);
}
- if (opt.luma_sub_ps[part])
- {
- HEADER("luma_sub_ps[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.luma_sub_ps[part], ref.luma_sub_ps[part], (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
- }
-
- if (opt.luma_add_ps[part])
- {
- HEADER("luma_add_ps[%s]", lumaPartStr[part]);
- REPORT_SPEEDUP(opt.luma_add_ps[part], ref.luma_add_ps[part], pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
- }
-
if (opt.luma_addAvg[part])
{
HEADER("luma_addAvg[%s]", lumaPartStr[part]);
REPORT_SPEEDUP(opt.luma_addAvg[part], ref.luma_addAvg[part], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
}
+ if (part < NUM_SQUARE_BLOCKS)
+ {
+ if (opt.luma_sub_ps[part])
+ {
+ HEADER("luma_sub_ps[%s]", lumaPartStr[part]);
+ REPORT_SPEEDUP(opt.luma_sub_ps[part], ref.luma_sub_ps[part], (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
+ }
+ if (opt.luma_add_ps[part])
+ {
+ HEADER("luma_add_ps[%s]", lumaPartStr[part]);
+ REPORT_SPEEDUP(opt.luma_add_ps[part], ref.luma_add_ps[part], pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
+ }
+ }
for (int i = 0; i < X265_CSP_COUNT; i++)
{
@@ -1669,21 +1676,24 @@
HEADER("[%s] copy_ss[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
REPORT_SPEEDUP(opt.chroma[i].copy_ss[part], ref.chroma[i].copy_ss[part], sbuf1, 64, sbuf2, 128);
}
- if (opt.chroma[i].sub_ps[part])
- {
- HEADER("[%s] sub_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
- REPORT_SPEEDUP(opt.chroma[i].sub_ps[part], ref.chroma[i].sub_ps[part], (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
- }
- if (opt.chroma[i].add_ps[part])
- {
- HEADER("[%s] add_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
- REPORT_SPEEDUP(opt.chroma[i].add_ps[part], ref.chroma[i].add_ps[part], pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
- }
if (opt.chroma[i].addAvg[part])
{
HEADER("[%s] addAvg[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
REPORT_SPEEDUP(opt.chroma[i].addAvg[part], ref.chroma[i].addAvg[part], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
}
+ if (part < NUM_SQUARE_BLOCKS)
+ {
+ if (opt.chroma[i].sub_ps[part])
+ {
+ HEADER("[%s] sub_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
+ REPORT_SPEEDUP(opt.chroma[i].sub_ps[part], ref.chroma[i].sub_ps[part], (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
+ }
+ if (opt.chroma[i].add_ps[part])
+ {
+ HEADER("[%s] add_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
+ REPORT_SPEEDUP(opt.chroma[i].add_ps[part], ref.chroma[i].add_ps[part], pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
+ }
+ }
}
#undef HEADER
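
In the harness, the outer loops still walk every luma and chroma partition, so the sub_ps/add_ps correctness checks and speed tests move under a "part < NUM_SQUARE_BLOCKS" guard: only square indices remain valid now that the tables no longer carry rectangular/AMP entries. A hedged sketch of the resulting pattern (testAddPs is a hypothetical stand-in for the check/REPORT_SPEEDUP calls above):

for (int part = 0; part < NUM_LUMA_PARTITIONS; part++)
{
    /* add_ps/sub_ps tables only have NUM_SQUARE_BLOCKS entries now, so
       any other partition index would read past the end of the array */
    if (part < NUM_SQUARE_BLOCKS && opt.luma_add_ps[part])
        testAddPs(part);   /* hypothetical helper */
}
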