[x265] [PATCH] asm: residual buffer is aligned to its size, so we can use aligned load instructions

Steve Borho steve at borho.org
Thu Nov 14 15:05:03 CET 2013


On Thu, Nov 14, 2013 at 2:48 AM, Min Chen <chenm003 at 163.com> wrote:

> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1384418720 -28800
> # Node ID 493981f517c44293fd1134707a910b53cc688015
> # Parent  8e22129119d6d8049996ed5f487625e4801b0a50
> asm: residual buffer is aligned to its size, so we can use aligned load
> instructions
>

An older version of this was pushed a couple of days ago.


>
> diff -r 8e22129119d6 -r 493981f517c4 source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp     Thu Nov 14 16:45:03 2013 +0800
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp     Thu Nov 14 16:45:20 2013 +0800
> @@ -501,6 +501,8 @@
>          primitives.blockfill_s[size](resiTmp, stride, 0);
>      }
>
> +    assert(((uint32_t)residual & (width - 1)) == 0);
> +    assert(width <= 32);
>      //===== reconstruction =====
>      primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
>
> diff -r 8e22129119d6 -r 493981f517c4 source/common/x86/pixel-util.asm
> --- a/source/common/x86/pixel-util.asm  Thu Nov 14 16:45:03 2013 +0800
> +++ b/source/common/x86/pixel-util.asm  Thu Nov 14 16:45:20 2013 +0800
> @@ -239,10 +239,10 @@
>  cglobal calcRecons16
>  %if ARCH_X86_64 == 1
>      DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> -    PROLOGUE 6,9,5
> +    PROLOGUE 6,9,3
>  %else
>      DECLARE_REG_TMP 0,1,2,3,4,5
> -    PROLOGUE 6,7,5
> +    PROLOGUE 6,7,3
>      %define t6      r6m
>      %define t6d     r6d
>      %define t7      r7m
> @@ -265,10 +265,8 @@
>      movu        m2, [t0]
>      pmovzxbw    m1, m2
>      punpckhbw   m2, m0
> -    movu        m3, [t1]
> -    movu        m4, [t1 + 16]
> -    paddw       m1, m3
> -    paddw       m2, m4
> +    paddw       m1, [t1]
> +    paddw       m2, [t1 + 16]
>      packuswb    m1, m2
>
>      ; store recon[] and recipred[]
> @@ -296,10 +294,10 @@
>  cglobal calcRecons32
>  %if ARCH_X86_64 == 1
>      DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> -    PROLOGUE 6,9,7
> +    PROLOGUE 6,9,5
>  %else
>      DECLARE_REG_TMP 0,1,2,3,4,5
> -    PROLOGUE 6,7,7
> +    PROLOGUE 6,7,5
>      %define t6      r6m
>      %define t6d     r6d
>      %define t7      r7m
> @@ -326,16 +324,12 @@
>      pmovzxbw    m3, m4
>      punpckhbw   m4, m0
>
> -    movu        m5, [t1 + 0 * 16]
> -    movu        m6, [t1 + 1 * 16]
> -    paddw       m1, m5
> -    paddw       m2, m6
> +    paddw       m1, [t1 + 0 * 16]
> +    paddw       m2, [t1 + 1 * 16]
>      packuswb    m1, m2
>
> -    movu        m5, [t1 + 2 * 16]
> -    movu        m6, [t1 + 3 * 16]
> -    paddw       m3, m5
> -    paddw       m4, m6
> +    paddw       m3, [t1 + 2 * 16]
> +    paddw       m4, [t1 + 3 * 16]
>      packuswb    m3, m4
>
>      ; store recon[] and recipred[]
> @@ -369,10 +363,10 @@
>  cglobal calcRecons64
>  %if ARCH_X86_64 == 1
>      DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> -    PROLOGUE 6,9,7
> +    PROLOGUE 6,9,5
>  %else
>      DECLARE_REG_TMP 0,1,2,3,4,5
> -    PROLOGUE 6,7,7
> +    PROLOGUE 6,7,5
>      %define t6      r6m
>      %define t6d     r6d
>      %define t7      r7m
> @@ -400,16 +394,12 @@
>      pmovzxbw    m3, m4
>      punpckhbw   m4, m0
>
> -    movu        m5, [t1 + 0 * 16]
> -    movu        m6, [t1 + 1 * 16]
> -    paddw       m1, m5
> -    paddw       m2, m6
> +    paddw       m1, [t1 + 0 * 16]
> +    paddw       m2, [t1 + 1 * 16]
>      packuswb    m1, m2
>
> -    movu        m5, [t1 + 2 * 16]
> -    movu        m6, [t1 + 3 * 16]
> -    paddw       m3, m5
> -    paddw       m4, m6
> +    paddw       m3, [t1 + 2 * 16]
> +    paddw       m4, [t1 + 3 * 16]
>      packuswb    m3, m4
>
>      ; store recon[] and recipred[]
> @@ -436,16 +426,12 @@
>      pmovzxbw    m3, m4
>      punpckhbw   m4, m0
>
> -    movu        m5, [t1 + 4 * 16]
> -    movu        m6, [t1 + 5 * 16]
> -    paddw       m1, m5
> -    paddw       m2, m6
> +    paddw       m1, [t1 + 4 * 16]
> +    paddw       m2, [t1 + 5 * 16]
>      packuswb    m1, m2
>
> -    movu        m5, [t1 + 6 * 16]
> -    movu        m6, [t1 + 7 * 16]
> -    paddw       m3, m5
> -    paddw       m4, m6
> +    paddw       m3, [t1 + 6 * 16]
> +    paddw       m4, [t1 + 7 * 16]
>      packuswb    m3, m4
>
>      ; store recon[] and recipred[]
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>



-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131114/392e222b/attachment.html>


More information about the x265-devel mailing list