[x265] [PATCH] asm: assembly code for IntraPred_DC[4x4]
Min Chen
chenm003 at 163.com
Wed Nov 20 05:47:42 CET 2013
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1384922749 -28800
# Node ID 400ab5fa31730fe395e981e45e54a051a6651fbf
# Parent 17e5d27ae03452ef9d6c0a8adf26e6c6a93d6751
asm: assembly code for IntraPred_DC[4x4]
diff -r 17e5d27ae034 -r 400ab5fa3173 source/Lib/TLibCommon/TComPrediction.cpp
--- a/source/Lib/TLibCommon/TComPrediction.cpp Wed Nov 20 12:45:28 2013 +0800
+++ b/source/Lib/TLibCommon/TComPrediction.cpp Wed Nov 20 12:45:49 2013 +0800
@@ -130,7 +130,7 @@
assert(g_convertToBit[size] >= 0); // 4x 4
assert(g_convertToBit[size] <= 5); // 128x128
- char log2BlkSize = g_convertToBit[size] + 2;
+ int log2BlkSize = g_convertToBit[size] + 2;
Pel *src = m_predBuf;
assert(log2BlkSize >= 2 && log2BlkSize < 7);
@@ -164,7 +164,7 @@
}
else if (dirMode == DC_IDX)
{
- primitives.intra_pred_dc((pixel*)refAbv + 1, (pixel*)refLft + 1, (pixel*)dst, stride, size, bFilter);
+ primitives.intra_pred_dc[log2BlkSize - 2]((pixel*)refAbv + 1, (pixel*)refLft + 1, (pixel*)dst, stride, bFilter);
}
else
{
@@ -175,6 +175,8 @@
// Angular chroma
void TComPrediction::predIntraChromaAng(Pel* src, uint32_t dirMode, Pel* dst, uint32_t stride, int width)
{
+ int log2BlkSize = g_convertToBit[width];
+
// Create the prediction
Pel refAbv[3 * MAX_CU_SIZE];
Pel refLft[3 * MAX_CU_SIZE];
@@ -193,7 +195,7 @@
}
else if (dirMode == DC_IDX)
{
- primitives.intra_pred_dc(refAbv + width - 1 + 1, refLft + width - 1 + 1, dst, stride, width, false);
+ primitives.intra_pred_dc[log2BlkSize](refAbv + width - 1 + 1, refLft + width - 1 + 1, dst, stride, false);
}
else
{
diff -r 17e5d27ae034 -r 400ab5fa3173 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Wed Nov 20 12:45:28 2013 +0800
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Wed Nov 20 12:45:49 2013 +0800
@@ -1622,7 +1622,7 @@
pixelcmp_t sa8d = primitives.sa8d[log2SizeMinus2];
// DC
- primitives.intra_pred_dc(above + 1, left + 1, tmp, scaleStride, scaleWidth, (scaleWidth <= 16));
+ primitives.intra_pred_dc[log2SizeMinus2](above + 1, left + 1, tmp, scaleStride, (scaleWidth <= 16));
modeCosts[DC_IDX] = costMultiplier * sa8d(fenc, scaleStride, tmp, scaleStride);
Pel *abovePlanar = above;
diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt Wed Nov 20 12:45:28 2013 +0800
+++ b/source/common/CMakeLists.txt Wed Nov 20 12:45:49 2013 +0800
@@ -113,7 +113,7 @@
if(ENABLE_PRIMITIVES_ASM)
set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h)
- set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a.asm mc-a2.asm ipfilter8.asm pixel-util.asm blockcopy8.asm)
+ set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a.asm mc-a2.asm ipfilter8.asm pixel-util.asm blockcopy8.asm intrapred.asm)
if (NOT X64)
set(A_SRCS ${A_SRCS} pixel-32.asm)
endif()
diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/intrapred.cpp
--- a/source/common/intrapred.cpp Wed Nov 20 12:45:28 2013 +0800
+++ b/source/common/intrapred.cpp Wed Nov 20 12:45:49 2013 +0800
@@ -80,7 +80,8 @@
}
}
-void PredIntraDC(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int bFilter)
+template<int width>
+void PredIntraDC(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int bFilter)
{
int k, l;
int blkSize = width;
@@ -300,7 +301,10 @@
void Setup_C_IPredPrimitives(EncoderPrimitives& p)
{
- p.intra_pred_dc = PredIntraDC;
+ p.intra_pred_dc[BLOCK_4x4] = PredIntraDC<4>;
+ p.intra_pred_dc[BLOCK_8x8] = PredIntraDC<8>;
+ p.intra_pred_dc[BLOCK_16x16] = PredIntraDC<16>;
+ p.intra_pred_dc[BLOCK_32x32] = PredIntraDC<32>;
p.intra_pred_planar = PredIntraPlanar;
p.intra_pred_ang = PredIntraAngBufRef;
p.intra_pred_allangs[0] = PredIntraAngs_C<4>;
diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/primitives.h
--- a/source/common/primitives.h Wed Nov 20 12:45:28 2013 +0800
+++ b/source/common/primitives.h Wed Nov 20 12:45:49 2013 +0800
@@ -177,7 +177,7 @@
typedef void (*pixelavg_pp_t)(pixel *dst, intptr_t dstride, pixel *src0, intptr_t sstride0, pixel *src1, intptr_t sstride1, int weight);
typedef void (*blockfill_s_t)(int16_t *dst, intptr_t dstride, int16_t val);
-typedef void (*intra_dc_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int bFilter);
+typedef void (*intra_dc_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int bFilter);
typedef void (*intra_planar_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width);
typedef void (*intra_ang_t)(pixel* dst, int dstStride, int width, int dirMode, bool bFilter, pixel *refLeft, pixel *refAbove);
typedef void (*intra_allangs_t)(pixel *dst, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma);
@@ -274,7 +274,7 @@
filter_p2s_t chroma_p2s;
extendCURowBorder_t extendRowBorder;
- intra_dc_t intra_pred_dc;
+ intra_dc_t intra_pred_dc[NUM_SQUARE_BLOCKS];
intra_planar_t intra_pred_planar;
intra_ang_t intra_pred_ang;
intra_allangs_t intra_pred_allangs[NUM_SQUARE_BLOCKS];
diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/vec/intra-sse41.cpp
--- a/source/common/vec/intra-sse41.cpp Wed Nov 20 12:45:28 2013 +0800
+++ b/source/common/vec/intra-sse41.cpp Wed Nov 20 12:45:49 2013 +0800
@@ -102,7 +102,8 @@
}
}
-void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int filter)
+template<int width>
+void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)
{
int sum;
int logSize = g_convertToBit[width] + 2;
@@ -8708,7 +8709,10 @@
initFileStaticVars();
p.intra_pred_planar = intra_pred_planar;
- p.intra_pred_dc = intra_pred_dc;
+ p.intra_pred_dc[BLOCK_4x4] = intra_pred_dc<4>;
+ p.intra_pred_dc[BLOCK_8x8] = intra_pred_dc<8>;
+ p.intra_pred_dc[BLOCK_16x16] = intra_pred_dc<16>;
+ p.intra_pred_dc[BLOCK_32x32] = intra_pred_dc<32>;
#if defined(__GNUC__) || defined(__INTEL_COMPILER) || (defined(_MSC_VER) && (_MSC_VER == 1500))
p.intra_pred_allangs[0] = predIntraAngs4;
diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Nov 20 12:45:28 2013 +0800
+++ b/source/common/x86/asm-primitives.cpp Wed Nov 20 12:45:49 2013 +0800
@@ -634,6 +634,7 @@
p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;
p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse4;
p.quant = x265_quant_sse4;
+ p.intra_pred_dc[BLOCK_4x4] = x265_intra_pred_dc4_sse4;
}
if (cpuMask & X265_CPU_AVX)
{
diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/x86/intrapred.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/x86/intrapred.asm Wed Nov 20 12:45:49 2013 +0800
@@ -0,0 +1,95 @@
+;*****************************************************************************
+;* Copyright (C) 2013 x265 project
+;*
+;* Authors: Min Chen <chenm003 at 163.com> <min.chen at multicorewareinc.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licensing at multicorewareinc.com.
+;*****************************************************************************/
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA 32
+
+
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; void intra_pred_dc4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_dc4, 5,6,3       ; r0 = above, r1 = left, r2 = dst, r3 = dstStride, r4d = filter; only m0-m2 are used
+    pxor        m0, m0
+    movd        m1, [r0]            ; m1 = above[0..3]
+    movd        m2, [r1]            ; m2 = left[0..3]
+    punpckldq   m1, m2
+    psadbw      m1, m0              ; m1 = sum of the 8 neighbour pixels
+
+    test        r4d, r4d            ; ZF = (filter == 0); consumed by jz below, nothing in between writes flags
+
+    mov         r4d, 4096
+    movd        m2, r4d
+    pmulhrsw    m1, m2              ; m1 = (sum + 4) / 8
+    movd        r4d, m1             ; r4d = dc_val
+    pshufb      m1, m0              ; m1 = byte [dc_val ...]
+
+    ; store DC 4x4
+    lea         r5, [r3 * 3]
+    movd        [r2], m1
+    movd        [r2 + r3], m1
+    movd        [r2 + r3 * 2], m1
+    movd        [r2 + r5], m1
+
+    ; do DC filter
+    jz          .end
+    lea         r5d, [r4d * 2 + 2]  ; r5d = DC * 2 + 2
+    add         r4d, r5d            ; r4d = DC * 3 + 2
+    movd        m1, r4d
+    pshuflw     m1, m1, 0           ; m1 = pixDCx3
+
+    ; filter top
+    pmovzxbw    m2, [r0]
+    paddw       m2, m1
+    psraw       m2, 2
+    packuswb    m2, m2
+    movd        [r2], m2            ; overwrite top-left pixel, we will update it later
+
+    ; filter top-left
+    movzx       r0d, byte [r0]      ; above[0]; the above pointer is not needed after this
+    add         r5d, r0d
+    movzx       r0d, byte [r1]      ; left[0]
+    add         r0d, r5d            ; r0d = above[0] + left[0] + DC * 2 + 2
+    shr         r0d, 2
+    mov         [r2], r0b
+
+    ; filter left
+    add         r2, r3
+    pmovzxbw    m2, [r1 + 1]
+    paddw       m2, m1
+    psraw       m2, 2
+    packuswb    m2, m2
+    ; store bytes with pextrb instead of r0b/r0h: a high-byte register is only
+    ; usable on x86-32 (rdi has none; ch cannot be encoded next to a REX prefix)
+    pextrb      [r2], m2, 0             ; row 1, column 0
+    pextrb      [r2 + r3], m2, 1        ; row 2, column 0
+    pextrb      [r2 + r3 * 2], m2, 2    ; row 3, column 0
+
+.end:
+
+    RET
diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Wed Nov 20 12:45:28 2013 +0800
+++ b/source/common/x86/pixel.h Wed Nov 20 12:45:49 2013 +0800
@@ -365,5 +365,6 @@
void x265_getResidual16_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
+void x265_intra_pred_dc4_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter);
#endif // ifndef X265_I386_PIXEL_H
diff -r 17e5d27ae034 -r 400ab5fa3173 source/encoder/compress.cpp
--- a/source/encoder/compress.cpp Wed Nov 20 12:45:28 2013 +0800
+++ b/source/encoder/compress.cpp Wed Nov 20 12:45:49 2013 +0800
@@ -145,7 +145,7 @@
pixelcmp_t sa8d = primitives.sa8d[log2SizeMinus2];
// DC
- primitives.intra_pred_dc(above + 1, left + 1, tmp, scaleStride, scaleWidth, (scaleWidth <= 16));
+ primitives.intra_pred_dc[log2SizeMinus2](above + 1, left + 1, tmp, scaleStride, (scaleWidth <= 16));
sad = costMultiplier * sa8d(fenc, scaleStride, tmp, scaleStride);
bmode = mode = DC_IDX;
bits = m_search->xModeBitsIntra(cu, mode, partOffset, depth, initTrDepth);
diff -r 17e5d27ae034 -r 400ab5fa3173 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp Wed Nov 20 12:45:28 2013 +0800
+++ b/source/encoder/slicetype.cpp Wed Nov 20 12:45:49 2013 +0800
@@ -566,7 +566,7 @@
int predsize = cuSize * cuSize;
// generate 35 intra predictions into tmp
- primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, predictions, cuSize, cuSize, (cuSize <= 16));
+ primitives.intra_pred_dc[nLog2SizeMinus2](pAbove0 + 1, pLeft0 + 1, predictions, cuSize, (cuSize <= 16));
pixel *above = (cuSize >= 8) ? pAbove1 : pAbove0;
pixel *left = (cuSize >= 8) ? pLeft1 : pLeft0;
primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + 1, predictions + predsize, cuSize, cuSize);
diff -r 17e5d27ae034 -r 400ab5fa3173 source/test/intrapredharness.cpp
--- a/source/test/intrapredharness.cpp Wed Nov 20 12:45:28 2013 +0800
+++ b/source/test/intrapredharness.cpp Wed Nov 20 12:45:49 2013 +0800
@@ -68,17 +68,16 @@
X265_FREE(pixel_out_33_vec);
}
-bool IntraPredHarness::check_dc_primitive(intra_dc_t ref, intra_dc_t opt)
+bool IntraPredHarness::check_dc_primitive(intra_dc_t ref, intra_dc_t opt, int width)
{
int j = ADI_BUF_STRIDE;
for (int i = 0; i <= 100; i++)
{
- int rand_width = 1 << ((rand() % 4) + 2); // Randomly generated Width
int rand_filter = rand() & 1;
pixel left[MAX_CU_SIZE * 2 + 1];
- for (int k = 0; k < rand_width * 2 + 1; k++)
+ for (int k = 0; k < width * 2 + 1; k++)
{
left[k] = pixel_buff[j - 1 + k * ADI_BUF_STRIDE];
}
@@ -87,17 +86,16 @@
memset(pixel_out_vec, 0xCD, out_size);
memset(pixel_out_c, 0xCD, out_size);
#endif
+ ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c, FENC_STRIDE, rand_filter);
+ opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, rand_filter);
- ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c, FENC_STRIDE, rand_width, rand_filter);
- opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, rand_width, rand_filter);
-
- for (int k = 0; k < rand_width; k++)
+ for (int k = 0; k < width; k++)
{
- if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, rand_width))
+ if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width))
{
#if _DEBUG
- ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c, FENC_STRIDE, rand_width, rand_filter);
- opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, rand_width, rand_filter);
+ ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c, FENC_STRIDE, rand_filter);
+ opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, rand_filter);
#endif
return false;
}
@@ -245,12 +243,16 @@
bool IntraPredHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
- if (opt.intra_pred_dc)
+ for(int i = 0; i < NUM_SQUARE_BLOCKS; i++)
{
- if (!check_dc_primitive(ref.intra_pred_dc, opt.intra_pred_dc))
+ if (opt.intra_pred_dc[i])
{
- printf("intra_dc failed\n");
- return false;
+ const int size = (1 << (i + 2));
+ if (!check_dc_primitive(ref.intra_pred_dc[i], opt.intra_pred_dc[i], size))
+ {
+ printf("intra_dc %dx%d failed\n", size, size);
+ return false;
+ }
}
}
if (opt.intra_pred_planar)
@@ -286,14 +288,18 @@
int width = 64;
uint16_t srcStride = 96;
- if (opt.intra_pred_dc)
+ for(int i = 0; i < NUM_SQUARE_BLOCKS; i++)
{
- printf("intra_dc[filter=0]");
- REPORT_SPEEDUP(opt.intra_pred_dc, ref.intra_pred_dc,
- pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, width, 0);
- printf("intra_dc[filter=1]");
- REPORT_SPEEDUP(opt.intra_pred_dc, ref.intra_pred_dc,
- pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, width, 1);
+ if (opt.intra_pred_dc[i])
+ {
+ const int size = (1 << (i + 2));
+ printf("intra_dc_%dx%d[filter=0]", size, size);
+ REPORT_SPEEDUP(opt.intra_pred_dc[i], ref.intra_pred_dc[i],
+ pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, 0);
+ printf("intra_dc_%dx%d[filter=1]", size, size);
+ REPORT_SPEEDUP(opt.intra_pred_dc[i], ref.intra_pred_dc[i],
+ pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, 1);
+ }
}
if (opt.intra_pred_planar)
{
diff -r 17e5d27ae034 -r 400ab5fa3173 source/test/intrapredharness.h
--- a/source/test/intrapredharness.h Wed Nov 20 12:45:28 2013 +0800
+++ b/source/test/intrapredharness.h Wed Nov 20 12:45:49 2013 +0800
@@ -43,7 +43,7 @@
static const int out_size = 64 * FENC_STRIDE;
static const int out_size_33 = 33 * 64 * FENC_STRIDE;
- bool check_dc_primitive(intra_dc_t ref, intra_dc_t opt);
+ bool check_dc_primitive(intra_dc_t ref, intra_dc_t opt, int width);
bool check_planar_primitive(intra_planar_t ref, intra_planar_t opt);
bool check_angular_primitive(intra_ang_t ref, intra_ang_t opt);
bool check_allangs_primitive(const intra_allangs_t ref[], const intra_allangs_t opt[]);
More information about the x265-devel
mailing list