[x265] [PATCH] asm: assembly code for IntraPred_DC[4x4]

Wed Nov 20 06:57:05 CET 2013

Queued, but see some points below.

On Nov 19, 2013, at 10:47 PM, Min Chen <chenm003 at 163.com> wrote:

> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1384922749 -28800
> # Node ID 400ab5fa31730fe395e981e45e54a051a6651fbf
> # Parent  17e5d27ae03452ef9d6c0a8adf26e6c6a93d6751
> asm: assembly code for IntraPred_DC[4x4]
> 
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/Lib/TLibCommon/TComPrediction.cpp
> --- a/source/Lib/TLibCommon/TComPrediction.cpp	Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/Lib/TLibCommon/TComPrediction.cpp	Wed Nov 20 12:45:49 2013 +0800
> @@ -130,7 +130,7 @@
>     assert(g_convertToBit[size] >= 0);   //   4x  4
>     assert(g_convertToBit[size] <= 5);   // 128x128
> 
> -    char log2BlkSize = g_convertToBit[size] + 2;
> +    int log2BlkSize = g_convertToBit[size] + 2;
> 
>     Pel *src = m_predBuf;
>     assert(log2BlkSize >= 2 && log2BlkSize < 7);
> @@ -164,7 +164,7 @@
>     }
>     else if (dirMode == DC_IDX)
>     {
> -        primitives.intra_pred_dc((pixel*)refAbv + 1, (pixel*)refLft + 1, (pixel*)dst, stride, size, bFilter);
> +        primitives.intra_pred_dc[log2BlkSize - 2]((pixel*)refAbv + 1, (pixel*)refLft + 1, (pixel*)dst, stride, bFilter);

It's no longer necessary to cast Pel* to pixel*; they are always the same type now.

>     }
>     else
>     {
> @@ -175,6 +175,8 @@
> // Angular chroma
> void TComPrediction::predIntraChromaAng(Pel* src, uint32_t dirMode, Pel* dst, uint32_t stride, int width)
> {
> +    int log2BlkSize = g_convertToBit[width];
> +
>     // Create the prediction
>     Pel refAbv[3 * MAX_CU_SIZE];
>     Pel refLft[3 * MAX_CU_SIZE];
> @@ -193,7 +195,7 @@
>     }
>     else if (dirMode == DC_IDX)
>     {
> -        primitives.intra_pred_dc(refAbv + width - 1 + 1, refLft + width - 1 + 1, dst, stride, width, false);
> +        primitives.intra_pred_dc[log2BlkSize](refAbv + width - 1 + 1, refLft + width - 1 + 1, dst, stride, false);
>     }
>     else
>     {
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp	Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Wed Nov 20 12:45:49 2013 +0800
> @@ -1622,7 +1622,7 @@
>             pixelcmp_t sa8d = primitives.sa8d[log2SizeMinus2];
> 
>             // DC
> -            primitives.intra_pred_dc(above + 1, left + 1, tmp, scaleStride, scaleWidth, (scaleWidth <= 16));
> +            primitives.intra_pred_dc[log2SizeMinus2](above + 1, left + 1, tmp, scaleStride, (scaleWidth <= 16));
>             modeCosts[DC_IDX] = costMultiplier * sa8d(fenc, scaleStride, tmp, scaleStride);
> 
>             Pel *abovePlanar   = above;
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/CMakeLists.txt
> --- a/source/common/CMakeLists.txt	Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/common/CMakeLists.txt	Wed Nov 20 12:45:49 2013 +0800
> @@ -113,7 +113,7 @@
> 
> if(ENABLE_PRIMITIVES_ASM)
>     set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h)
> -    set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a.asm mc-a2.asm ipfilter8.asm pixel-util.asm blockcopy8.asm)
> +    set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a.asm mc-a2.asm ipfilter8.asm pixel-util.asm blockcopy8.asm intrapred.asm)
>     if (NOT X64)
>         set(A_SRCS ${A_SRCS} pixel-32.asm)
>     endif()
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/intrapred.cpp
> --- a/source/common/intrapred.cpp	Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/common/intrapred.cpp	Wed Nov 20 12:45:49 2013 +0800
> @@ -80,7 +80,8 @@
>     }
> }
> 
> -void PredIntraDC(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int bFilter)
> +template<int width>
> +void PredIntraDC(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int bFilter)
> {
>     int k, l;
>     int blkSize = width;
> @@ -300,7 +301,10 @@
> 
> void Setup_C_IPredPrimitives(EncoderPrimitives& p)
> {
> -    p.intra_pred_dc = PredIntraDC;
> +    p.intra_pred_dc[BLOCK_4x4] = PredIntraDC<4>;
> +    p.intra_pred_dc[BLOCK_8x8] = PredIntraDC<8>;
> +    p.intra_pred_dc[BLOCK_16x16] = PredIntraDC<16>;
> +    p.intra_pred_dc[BLOCK_32x32] = PredIntraDC<32>;
>     p.intra_pred_planar = PredIntraPlanar;
>     p.intra_pred_ang = PredIntraAngBufRef;
>     p.intra_pred_allangs[0] = PredIntraAngs_C<4>;
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/primitives.h
> --- a/source/common/primitives.h	Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/common/primitives.h	Wed Nov 20 12:45:49 2013 +0800
> @@ -177,7 +177,7 @@
> typedef void (*pixelavg_pp_t)(pixel *dst, intptr_t dstride, pixel *src0, intptr_t sstride0, pixel *src1, intptr_t sstride1, int weight);
> typedef void (*blockfill_s_t)(int16_t *dst, intptr_t dstride, int16_t val);
> 
> -typedef void (*intra_dc_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int bFilter);
> +typedef void (*intra_dc_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int bFilter);
> typedef void (*intra_planar_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width);
> typedef void (*intra_ang_t)(pixel* dst, int dstStride, int width, int dirMode, bool bFilter, pixel *refLeft, pixel *refAbove);
> typedef void (*intra_allangs_t)(pixel *dst, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma);
> @@ -274,7 +274,7 @@
>     filter_p2s_t    chroma_p2s;
>     extendCURowBorder_t extendRowBorder;
> 
> -    intra_dc_t      intra_pred_dc;
> +    intra_dc_t      intra_pred_dc[NUM_SQUARE_BLOCKS];
>     intra_planar_t  intra_pred_planar;
>     intra_ang_t     intra_pred_ang;
>     intra_allangs_t intra_pred_allangs[NUM_SQUARE_BLOCKS];
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/vec/intra-sse41.cpp
> --- a/source/common/vec/intra-sse41.cpp	Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/common/vec/intra-sse41.cpp	Wed Nov 20 12:45:49 2013 +0800
> @@ -102,7 +102,8 @@
>     }
> }
> 
> -void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int filter)
> +template<int width>
> +void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)
> {
>     int sum;
>     int logSize = g_convertToBit[width] + 2;
> @@ -8708,7 +8709,10 @@
>     initFileStaticVars();
> 
>     p.intra_pred_planar = intra_pred_planar;
> -    p.intra_pred_dc = intra_pred_dc;
> +    p.intra_pred_dc[BLOCK_4x4] = intra_pred_dc<4>;
> +    p.intra_pred_dc[BLOCK_8x8] = intra_pred_dc<8>;
> +    p.intra_pred_dc[BLOCK_16x16] = intra_pred_dc<16>;
> +    p.intra_pred_dc[BLOCK_32x32] = intra_pred_dc<32>;
> 
> #if defined(__GNUC__) || defined(__INTEL_COMPILER) || (defined(_MSC_VER) && (_MSC_VER == 1500))
>     p.intra_pred_allangs[0] = predIntraAngs4;
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp	Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/common/x86/asm-primitives.cpp	Wed Nov 20 12:45:49 2013 +0800
> @@ -634,6 +634,7 @@
>         p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;
>         p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse4;
>         p.quant = x265_quant_sse4;
> +        p.intra_pred_dc[BLOCK_4x4] = x265_intra_pred_dc4_sse4;
>     }
>     if (cpuMask & X265_CPU_AVX)
>     {
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/x86/intrapred.asm
> --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
> +++ b/source/common/x86/intrapred.asm	Wed Nov 20 12:45:49 2013 +0800
> @@ -0,0 +1,95 @@
> +;*****************************************************************************
> +;* Copyright (C) 2013 x265 project
> +;*
> +;* Authors: Min Chen <chenm003 at 163.com> <min.chen at multicorewareinc.com>
> +;*
> +;* This program is free software; you can redistribute it and/or modify
> +;* it under the terms of the GNU General Public License as published by
> +;* the Free Software Foundation; either version 2 of the License, or
> +;* (at your option) any later version.
> +;*
> +;* This program is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +;* GNU General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU General Public License
> +;* along with this program; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
> +;*
> +;* This program is also available under a commercial proprietary license.
> +;* For more information, contact us at licensing at multicorewareinc.com.
> +;*****************************************************************************/
> +
> +%include "x86inc.asm"
> +%include "x86util.asm"
> +
> +SECTION_RODATA 32
> +
> +
> +
> +SECTION .text
> +
> +;-----------------------------------------------------------------------------
> +; void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)
> +;-----------------------------------------------------------------------------
> +INIT_XMM sse4
> +cglobal intra_pred_dc4, 5,6,8
> +    pxor        m0, m0
> +    movd        m1, [r0]
> +    movd        m2, [r1]
> +    punpckldq   m1, m2
> +    psadbw      m1, m0              ; m1 = sum
> +
> +    test        r4d, r4d
> +
> +    mov         r4d, 4096
> +    movd        m2, r4d
> +    pmulhrsw    m1, m2              ; m1 = (sum + 4) / 8
> +    movd        r4d, m1             ; r4d = dc_val
> +    pshufb      m1, m0              ; m1 = byte [dc_val ...]
> +
> +    ; store DC 4x4
> +    lea         r5, [r3 * 3]
> +    movd        [r2], m1
> +    movd        [r2 + r3], m1
> +    movd        [r2 + r3 * 2], m1
> +    movd        [r2 + r5], m1
> +
> +    ; do DC filter
> +    jz         .end
> +    lea         r5d, [r4d * 2 + 2]  ; r5d = DC * 2 + 2
> +    add         r4d, r5d            ; r4d = DC * 3 + 2
> +    movd        m1, r4d
> +    pshuflw     m1, m1, 0           ; m1 = pixDCx3
> +
> +    ; filter top
> +    pmovzxbw    m2, [r0]
> +    paddw       m2, m1
> +    psraw       m2, 2
> +    packuswb    m2, m2
> +    movd        [r2], m2            ; overwrite top-left pixel, we will update it later
> +
> +    ; filter top-left
> +    movzx       r0d, byte [r0]
> +    add         r5d, r0d
> +    movzx       r0d, byte [r1]
> +    add         r0d, r5d
> +    shr         r0d, 2
> +    mov         [r2], r0b
> +
> +    ; filter left
> +    add         r2, r3
> +    pmovzxbw    m2, [r1 + 1]
> +    paddw       m2, m1
> +    psraw       m2, 2
> +    packuswb    m2, m2
> +    movd        r0d, m2
> +    mov         [r2], r0b
> +    mov         [r2 + r3], r0h
> +    shr         r0d, 16
> +    mov         [r2 + r3 * 2], r0b
> +
> +.end
> +
> +    RET
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/x86/pixel.h
> --- a/source/common/x86/pixel.h	Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/common/x86/pixel.h	Wed Nov 20 12:45:49 2013 +0800
> @@ -365,5 +365,6 @@
> void x265_getResidual16_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
> void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
> uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
> +void x265_intra_pred_dc4_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter);
> 
> #endif // ifndef X265_I386_PIXEL_H
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/encoder/compress.cpp
> --- a/source/encoder/compress.cpp	Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/encoder/compress.cpp	Wed Nov 20 12:45:49 2013 +0800
> @@ -145,7 +145,7 @@
>     pixelcmp_t sa8d = primitives.sa8d[log2SizeMinus2];
> 
>     // DC
> -    primitives.intra_pred_dc(above + 1, left + 1, tmp, scaleStride, scaleWidth, (scaleWidth <= 16));
> +    primitives.intra_pred_dc[log2SizeMinus2](above + 1, left + 1, tmp, scaleStride, (scaleWidth <= 16));
>     sad = costMultiplier * sa8d(fenc, scaleStride, tmp, scaleStride);
>     bmode = mode = DC_IDX;
>     bits  = m_search->xModeBitsIntra(cu, mode, partOffset, depth, initTrDepth);
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/encoder/slicetype.cpp
> --- a/source/encoder/slicetype.cpp	Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/encoder/slicetype.cpp	Wed Nov 20 12:45:49 2013 +0800
> @@ -566,7 +566,7 @@
>         int predsize = cuSize * cuSize;
> 
>         // generate 35 intra predictions into tmp
> -        primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, predictions, cuSize, cuSize, (cuSize <= 16));
> +        primitives.intra_pred_dc[nLog2SizeMinus2](pAbove0 + 1, pLeft0 + 1, predictions, cuSize, (cuSize <= 16));
>         pixel *above = (cuSize >= 8) ? pAbove1 : pAbove0;
>         pixel *left  = (cuSize >= 8) ? pLeft1 : pLeft0;
>         primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + 1, predictions + predsize, cuSize, cuSize);
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/test/intrapredharness.cpp
> --- a/source/test/intrapredharness.cpp	Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/test/intrapredharness.cpp	Wed Nov 20 12:45:49 2013 +0800
> @@ -68,17 +68,16 @@
>     X265_FREE(pixel_out_33_vec);
> }
> 
> -bool IntraPredHarness::check_dc_primitive(intra_dc_t ref, intra_dc_t opt)
> +bool IntraPredHarness::check_dc_primitive(intra_dc_t ref, intra_dc_t opt, int width)
> {
>     int j = ADI_BUF_STRIDE;
> 
>     for (int i = 0; i <= 100; i++)
>     {
> -        int rand_width = 1 << ((rand() % 4) + 2);                  // Randomly generated Width
>         int rand_filter = rand() & 1;
> 
>         pixel left[MAX_CU_SIZE * 2 + 1];
> -        for (int k = 0; k < rand_width * 2 + 1; k++)
> +        for (int k = 0; k < width * 2 + 1; k++)
>         {
>             left[k] = pixel_buff[j - 1 + k * ADI_BUF_STRIDE];
>         }
> @@ -87,17 +86,16 @@
>         memset(pixel_out_vec, 0xCD, out_size);
>         memset(pixel_out_c, 0xCD, out_size);
> #endif
> +        ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c,   FENC_STRIDE, rand_filter);
> +        opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, rand_filter);
> 
> -        ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c,   FENC_STRIDE, rand_width, rand_filter);
> -        opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, rand_width, rand_filter);
> -
> -        for (int k = 0; k < rand_width; k++)
> +        for (int k = 0; k < width; k++)
>         {
> -            if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, rand_width))
> +            if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width))
>             {
> #if _DEBUG
> -                ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c,   FENC_STRIDE, rand_width, rand_filter);
> -                opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, rand_width, rand_filter);
> +                ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c,   FENC_STRIDE, rand_filter);
> +                opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, rand_filter);
> #endif
>                 return false;
>             }
> @@ -245,12 +243,16 @@
> 
> bool IntraPredHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
> {
> -    if (opt.intra_pred_dc)
> +    for(int i = 0; i < NUM_SQUARE_BLOCKS; i++)
>     {
> -        if (!check_dc_primitive(ref.intra_pred_dc, opt.intra_pred_dc))
> +        if (opt.intra_pred_dc[i])
>         {
> -            printf("intra_dc failed\n");
> -            return false;
> +            const int size = (1 << (i + 2));
> +            if (!check_dc_primitive(ref.intra_pred_dc[i], opt.intra_pred_dc[i], size))
> +            {
> +                printf("intra_dc %dx%d failed\n", size, size);
> +                return false;
> +            }
>         }
>     }
>     if (opt.intra_pred_planar)
> @@ -286,14 +288,18 @@
>     int width = 64;
>     uint16_t srcStride = 96;
> 
> -    if (opt.intra_pred_dc)
> +    for(int i = 0; i < NUM_SQUARE_BLOCKS; i++)
>     {
> -        printf("intra_dc[filter=0]");
> -        REPORT_SPEEDUP(opt.intra_pred_dc, ref.intra_pred_dc,
> -                       pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, width, 0);
> -        printf("intra_dc[filter=1]");
> -        REPORT_SPEEDUP(opt.intra_pred_dc, ref.intra_pred_dc,
> -                       pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, width, 1);
> +        if (opt.intra_pred_dc[i])
> +        {
> +            const int size = (1 << (i + 2));
> +            printf("intra_dc_%dx%d[filter=0]", size, size);
> +            REPORT_SPEEDUP(opt.intra_pred_dc[i], ref.intra_pred_dc[i],
> +                           pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, 0);
> +            printf("intra_dc_%dx%d[filter=1]", size, size);
> +            REPORT_SPEEDUP(opt.intra_pred_dc[i], ref.intra_pred_dc[i],
> +                           pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, 1);
> +        }
>     }
>     if (opt.intra_pred_planar)
>     {
> diff -r 17e5d27ae034 -r 400ab5fa3173 source/test/intrapredharness.h
> --- a/source/test/intrapredharness.h	Wed Nov 20 12:45:28 2013 +0800
> +++ b/source/test/intrapredharness.h	Wed Nov 20 12:45:49 2013 +0800
> @@ -43,7 +43,7 @@
>     static const int out_size = 64 * FENC_STRIDE;
>     static const int out_size_33 = 33 * 64 * FENC_STRIDE;
> 
> -    bool check_dc_primitive(intra_dc_t ref, intra_dc_t opt);
> +    bool check_dc_primitive(intra_dc_t ref, intra_dc_t opt, int width);
>     bool check_planar_primitive(intra_planar_t ref, intra_planar_t opt);
>     bool check_angular_primitive(intra_ang_t ref, intra_ang_t opt);
>     bool check_allangs_primitive(const intra_allangs_t ref[], const intra_allangs_t opt[]);
> 
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 842 bytes
Desc: Message signed with OpenPGP using GPGMail
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131119/1a58787a/attachment.sig>