[x265] [PATCH] asm: assembly code for IntraPred_DC[4x4]
Steve Borho
steve at borho.org
Wed Nov 20 06:57:05 CET 2013
Queued, but see the comments inline below.
On Nov 19, 2013, at 10:47 PM, Min Chen <chenm003 at 163.com> wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1384922749 -28800
> # Node ID 400ab5fa31730fe395e981e45e54a051a6651fbf
> # Parent 17e5d27ae03452ef9d6c0a8adf26e6c6a93d6751
> asm: assembly code for IntraPred_DC[4x4]
>
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/Lib/TLibCommon/TComPrediction.cpp
> --- a/source/Lib/TLibCommon/TComPrediction.cpp Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/Lib/TLibCommon/TComPrediction.cpp Wed Nov 20 12:45:49 2013 +0800
> @@ -130,7 +130,7 @@
> assert(g_convertToBit[size] >= 0); // 4x 4
> assert(g_convertToBit[size] <= 5); // 128x128
>
> - char log2BlkSize = g_convertToBit[size] + 2;
> + int log2BlkSize = g_convertToBit[size] + 2;
>
> Pel *src = m_predBuf;
> assert(log2BlkSize >= 2 && log2BlkSize < 7);
> @@ -164,7 +164,7 @@
> }
> else if (dirMode == DC_IDX)
> {
> - primitives.intra_pred_dc((pixel*)refAbv + 1, (pixel*)refLft + 1, (pixel*)dst, stride, size, bFilter);
> + primitives.intra_pred_dc[log2BlkSize - 2]((pixel*)refAbv + 1, (pixel*)refLft + 1, (pixel*)dst, stride, bFilter);
It's no longer necessary to cast Pel* to pixel*; they are always the same type now.
> }
> else
> {
> @@ -175,6 +175,8 @@
> // Angular chroma
> void TComPrediction::predIntraChromaAng(Pel* src, uint32_t dirMode, Pel* dst, uint32_t stride, int width)
> {
> + int log2BlkSize = g_convertToBit[width];
> +
> // Create the prediction
> Pel refAbv[3 * MAX_CU_SIZE];
> Pel refLft[3 * MAX_CU_SIZE];
> @@ -193,7 +195,7 @@
> }
> else if (dirMode == DC_IDX)
> {
> - primitives.intra_pred_dc(refAbv + width - 1 + 1, refLft + width - 1 + 1, dst, stride, width, false);
> + primitives.intra_pred_dc[log2BlkSize](refAbv + width - 1 + 1, refLft + width - 1 + 1, dst, stride, false);
> }
> else
> {
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp Wed Nov 20 12:45:49 2013 +0800
> @@ -1622,7 +1622,7 @@
> pixelcmp_t sa8d = primitives.sa8d[log2SizeMinus2];
>
> // DC
> - primitives.intra_pred_dc(above + 1, left + 1, tmp, scaleStride, scaleWidth, (scaleWidth <= 16));
> + primitives.intra_pred_dc[log2SizeMinus2](above + 1, left + 1, tmp, scaleStride, (scaleWidth <= 16));
> modeCosts[DC_IDX] = costMultiplier * sa8d(fenc, scaleStride, tmp, scaleStride);
>
> Pel *abovePlanar = above;
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/CMakeLists.txt
> --- a/source/common/CMakeLists.txt Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/common/CMakeLists.txt Wed Nov 20 12:45:49 2013 +0800
> @@ -113,7 +113,7 @@
>
> if(ENABLE_PRIMITIVES_ASM)
> set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h)
> - set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a.asm mc-a2.asm ipfilter8.asm pixel-util.asm blockcopy8.asm)
> + set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a.asm mc-a2.asm ipfilter8.asm pixel-util.asm blockcopy8.asm intrapred.asm)
> if (NOT X64)
> set(A_SRCS ${A_SRCS} pixel-32.asm)
> endif()
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/intrapred.cpp
> --- a/source/common/intrapred.cpp Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/common/intrapred.cpp Wed Nov 20 12:45:49 2013 +0800
> @@ -80,7 +80,8 @@
> }
> }
>
> -void PredIntraDC(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int bFilter)
> +template<int width>
> +void PredIntraDC(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int bFilter)
> {
> int k, l;
> int blkSize = width;
> @@ -300,7 +301,10 @@
>
> void Setup_C_IPredPrimitives(EncoderPrimitives& p)
> {
> - p.intra_pred_dc = PredIntraDC;
> + p.intra_pred_dc[BLOCK_4x4] = PredIntraDC<4>;
> + p.intra_pred_dc[BLOCK_8x8] = PredIntraDC<8>;
> + p.intra_pred_dc[BLOCK_16x16] = PredIntraDC<16>;
> + p.intra_pred_dc[BLOCK_32x32] = PredIntraDC<32>;
> p.intra_pred_planar = PredIntraPlanar;
> p.intra_pred_ang = PredIntraAngBufRef;
> p.intra_pred_allangs[0] = PredIntraAngs_C<4>;
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/primitives.h
> --- a/source/common/primitives.h Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/common/primitives.h Wed Nov 20 12:45:49 2013 +0800
> @@ -177,7 +177,7 @@
> typedef void (*pixelavg_pp_t)(pixel *dst, intptr_t dstride, pixel *src0, intptr_t sstride0, pixel *src1, intptr_t sstride1, int weight);
> typedef void (*blockfill_s_t)(int16_t *dst, intptr_t dstride, int16_t val);
>
> -typedef void (*intra_dc_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int bFilter);
> +typedef void (*intra_dc_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int bFilter);
> typedef void (*intra_planar_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width);
> typedef void (*intra_ang_t)(pixel* dst, int dstStride, int width, int dirMode, bool bFilter, pixel *refLeft, pixel *refAbove);
> typedef void (*intra_allangs_t)(pixel *dst, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma);
> @@ -274,7 +274,7 @@
> filter_p2s_t chroma_p2s;
> extendCURowBorder_t extendRowBorder;
>
> - intra_dc_t intra_pred_dc;
> + intra_dc_t intra_pred_dc[NUM_SQUARE_BLOCKS];
> intra_planar_t intra_pred_planar;
> intra_ang_t intra_pred_ang;
> intra_allangs_t intra_pred_allangs[NUM_SQUARE_BLOCKS];
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/vec/intra-sse41.cpp
> --- a/source/common/vec/intra-sse41.cpp Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/common/vec/intra-sse41.cpp Wed Nov 20 12:45:49 2013 +0800
> @@ -102,7 +102,8 @@
> }
> }
>
> -void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int filter)
> +template<int width>
> +void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)
> {
> int sum;
> int logSize = g_convertToBit[width] + 2;
> @@ -8708,7 +8709,10 @@
> initFileStaticVars();
>
> p.intra_pred_planar = intra_pred_planar;
> - p.intra_pred_dc = intra_pred_dc;
> + p.intra_pred_dc[BLOCK_4x4] = intra_pred_dc<4>;
> + p.intra_pred_dc[BLOCK_8x8] = intra_pred_dc<8>;
> + p.intra_pred_dc[BLOCK_16x16] = intra_pred_dc<16>;
> + p.intra_pred_dc[BLOCK_32x32] = intra_pred_dc<32>;
>
> #if defined(__GNUC__) || defined(__INTEL_COMPILER) || (defined(_MSC_VER) && (_MSC_VER == 1500))
> p.intra_pred_allangs[0] = predIntraAngs4;
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/common/x86/asm-primitives.cpp Wed Nov 20 12:45:49 2013 +0800
> @@ -634,6 +634,7 @@
> p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;
> p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse4;
> p.quant = x265_quant_sse4;
> + p.intra_pred_dc[BLOCK_4x4] = x265_intra_pred_dc4_sse4;
> }
> if (cpuMask & X265_CPU_AVX)
> {
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/x86/intrapred.asm
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/source/common/x86/intrapred.asm Wed Nov 20 12:45:49 2013 +0800
> @@ -0,0 +1,95 @@
> +;*****************************************************************************
> +;* Copyright (C) 2013 x265 project
> +;*
> +;* Authors: Min Chen <chenm003 at 163.com> <min.chen at multicorewareinc.com>
> +;*
> +;* This program is free software; you can redistribute it and/or modify
> +;* it under the terms of the GNU General Public License as published by
> +;* the Free Software Foundation; either version 2 of the License, or
> +;* (at your option) any later version.
> +;*
> +;* This program is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> +;* GNU General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU General Public License
> +;* along with this program; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
> +;*
> +;* This program is also available under a commercial proprietary license.
> +;* For more information, contact us at licensing at multicorewareinc.com.
> +;*****************************************************************************/
> +
> +%include "x86inc.asm"
> +%include "x86util.asm"
> +
> +SECTION_RODATA 32
> +
> +
> +
> +SECTION .text
> +
> +;-----------------------------------------------------------------------------
> +; void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)
> +;-----------------------------------------------------------------------------
> +INIT_XMM sse4
> +cglobal intra_pred_dc4, 5,6,8
> + pxor m0, m0
> + movd m1, [r0]
> + movd m2, [r1]
> + punpckldq m1, m2
> + psadbw m1, m0 ; m1 = sum
> +
> + test r4d, r4d
> +
> + mov r4d, 4096
> + movd m2, r4d
> + pmulhrsw m1, m2 ; m1 = (sum + 4) / 8
> + movd r4d, m1 ; r4d = dc_val
> + pshufb m1, m0 ; m1 = byte [dc_val ...]
> +
> + ; store DC 4x4
> + lea r5, [r3 * 3]
> + movd [r2], m1
> + movd [r2 + r3], m1
> + movd [r2 + r3 * 2], m1
> + movd [r2 + r5], m1
> +
> + ; do DC filter
> + jz .end
> + lea r5d, [r4d * 2 + 2] ; r5d = DC * 2 + 2
> + add r4d, r5d ; r4d = DC * 3 + 2
> + movd m1, r4d
> + pshuflw m1, m1, 0 ; m1 = pixDCx3
> +
> + ; filter top
> + pmovzxbw m2, [r0]
> + paddw m2, m1
> + psraw m2, 2
> + packuswb m2, m2
> + movd [r2], m2 ; overwrite top-left pixel, we will update it later
> +
> + ; filter top-left
> + movzx r0d, byte [r0]
> + add r5d, r0d
> + movzx r0d, byte [r1]
> + add r0d, r5d
> + shr r0d, 2
> + mov [r2], r0b
> +
> + ; filter left
> + add r2, r3
> + pmovzxbw m2, [r1 + 1]
> + paddw m2, m1
> + psraw m2, 2
> + packuswb m2, m2
> + movd r0d, m2
> + mov [r2], r0b
> + mov [r2 + r3], r0h
> + shr r0d, 16
> + mov [r2 + r3 * 2], r0b
> +
> +.end
> +
> + RET
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/x86/pixel.h
> --- a/source/common/x86/pixel.h Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/common/x86/pixel.h Wed Nov 20 12:45:49 2013 +0800
> @@ -365,5 +365,6 @@
> void x265_getResidual16_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
> void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
> uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
> +void x265_intra_pred_dc4_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter);
>
> #endif // ifndef X265_I386_PIXEL_H
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/encoder/compress.cpp
> --- a/source/encoder/compress.cpp Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/encoder/compress.cpp Wed Nov 20 12:45:49 2013 +0800
> @@ -145,7 +145,7 @@
> pixelcmp_t sa8d = primitives.sa8d[log2SizeMinus2];
>
> // DC
> - primitives.intra_pred_dc(above + 1, left + 1, tmp, scaleStride, scaleWidth, (scaleWidth <= 16));
> + primitives.intra_pred_dc[log2SizeMinus2](above + 1, left + 1, tmp, scaleStride, (scaleWidth <= 16));
> sad = costMultiplier * sa8d(fenc, scaleStride, tmp, scaleStride);
> bmode = mode = DC_IDX;
> bits = m_search->xModeBitsIntra(cu, mode, partOffset, depth, initTrDepth);
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/encoder/slicetype.cpp
> --- a/source/encoder/slicetype.cpp Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/encoder/slicetype.cpp Wed Nov 20 12:45:49 2013 +0800
> @@ -566,7 +566,7 @@
> int predsize = cuSize * cuSize;
>
> // generate 35 intra predictions into tmp
> - primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, predictions, cuSize, cuSize, (cuSize <= 16));
> + primitives.intra_pred_dc[nLog2SizeMinus2](pAbove0 + 1, pLeft0 + 1, predictions, cuSize, (cuSize <= 16));
> pixel *above = (cuSize >= 8) ? pAbove1 : pAbove0;
> pixel *left = (cuSize >= 8) ? pLeft1 : pLeft0;
> primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + 1, predictions + predsize, cuSize, cuSize);
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/test/intrapredharness.cpp
> --- a/source/test/intrapredharness.cpp Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/test/intrapredharness.cpp Wed Nov 20 12:45:49 2013 +0800
> @@ -68,17 +68,16 @@
> X265_FREE(pixel_out_33_vec);
> }
>
> -bool IntraPredHarness::check_dc_primitive(intra_dc_t ref, intra_dc_t opt)
> +bool IntraPredHarness::check_dc_primitive(intra_dc_t ref, intra_dc_t opt, int width)
> {
> int j = ADI_BUF_STRIDE;
>
> for (int i = 0; i <= 100; i++)
> {
> - int rand_width = 1 << ((rand() % 4) + 2); // Randomly generated Width
> int rand_filter = rand() & 1;
>
> pixel left[MAX_CU_SIZE * 2 + 1];
> - for (int k = 0; k < rand_width * 2 + 1; k++)
> + for (int k = 0; k < width * 2 + 1; k++)
> {
> left[k] = pixel_buff[j - 1 + k * ADI_BUF_STRIDE];
> }
> @@ -87,17 +86,16 @@
> memset(pixel_out_vec, 0xCD, out_size);
> memset(pixel_out_c, 0xCD, out_size);
> #endif
> + ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c, FENC_STRIDE, rand_filter);
> + opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, rand_filter);
>
> - ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c, FENC_STRIDE, rand_width, rand_filter);
> - opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, rand_width, rand_filter);
> -
> - for (int k = 0; k < rand_width; k++)
> + for (int k = 0; k < width; k++)
> {
> - if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, rand_width))
> + if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width))
> {
> #if _DEBUG
> - ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c, FENC_STRIDE, rand_width, rand_filter);
> - opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, rand_width, rand_filter);
> + ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c, FENC_STRIDE, rand_filter);
> + opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, rand_filter);
> #endif
> return false;
> }
> @@ -245,12 +243,16 @@
>
> bool IntraPredHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
> {
> - if (opt.intra_pred_dc)
> + for(int i = 0; i < NUM_SQUARE_BLOCKS; i++)
> {
> - if (!check_dc_primitive(ref.intra_pred_dc, opt.intra_pred_dc))
> + if (opt.intra_pred_dc[i])
> {
> - printf("intra_dc failed\n");
> - return false;
> + const int size = (1 << (i + 2));
> + if (!check_dc_primitive(ref.intra_pred_dc[i], opt.intra_pred_dc[i], size))
> + {
> + printf("intra_dc %dx%d failed\n", size, size);
> + return false;
> + }
> }
> }
> if (opt.intra_pred_planar)
> @@ -286,14 +288,18 @@
> int width = 64;
> uint16_t srcStride = 96;
>
> - if (opt.intra_pred_dc)
> + for(int i = 0; i < NUM_SQUARE_BLOCKS; i++)
> {
> - printf("intra_dc[filter=0]");
> - REPORT_SPEEDUP(opt.intra_pred_dc, ref.intra_pred_dc,
> - pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, width, 0);
> - printf("intra_dc[filter=1]");
> - REPORT_SPEEDUP(opt.intra_pred_dc, ref.intra_pred_dc,
> - pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, width, 1);
> + if (opt.intra_pred_dc[i])
> + {
> + const int size = (1 << (i + 2));
> + printf("intra_dc_%dx%d[filter=0]", size, size);
> + REPORT_SPEEDUP(opt.intra_pred_dc[i], ref.intra_pred_dc[i],
> + pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, 0);
> + printf("intra_dc_%dx%d[filter=1]", size, size);
> + REPORT_SPEEDUP(opt.intra_pred_dc[i], ref.intra_pred_dc[i],
> + pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, 1);
> + }
> }
> if (opt.intra_pred_planar)
> {
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/test/intrapredharness.h
> --- a/source/test/intrapredharness.h Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/test/intrapredharness.h Wed Nov 20 12:45:49 2013 +0800
> @@ -43,7 +43,7 @@
> static const int out_size = 64 * FENC_STRIDE;
> static const int out_size_33 = 33 * 64 * FENC_STRIDE;
>
> - bool check_dc_primitive(intra_dc_t ref, intra_dc_t opt);
> + bool check_dc_primitive(intra_dc_t ref, intra_dc_t opt, int width);
> bool check_planar_primitive(intra_planar_t ref, intra_planar_t opt);
> bool check_angular_primitive(intra_ang_t ref, intra_ang_t opt);
> bool check_allangs_primitive(const intra_allangs_t ref[], const intra_allangs_t opt[]);
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 842 bytes
Desc: Message signed with OpenPGP using GPGMail
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131119/1a58787a/attachment.sig>
More information about the x265-devel mailing list