[x265] [PATCH] asm: residual buffer is aligned to its size, so we can use aligned load instructions
Steve Borho
steve at borho.org
Thu Nov 14 15:05:03 CET 2013
On Thu, Nov 14, 2013 at 2:48 AM, Min Chen <chenm003 at 163.com> wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1384418720 -28800
> # Node ID 493981f517c44293fd1134707a910b53cc688015
> # Parent 8e22129119d6d8049996ed5f487625e4801b0a50
> asm: residual buffer is aligned to its size, so we can use aligned load instructions
>
An older version of this patch was pushed a couple of days ago.
>
> diff -r 8e22129119d6 -r 493981f517c4 source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp Thu Nov 14 16:45:03 2013 +0800
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp Thu Nov 14 16:45:20 2013 +0800
> @@ -501,6 +501,8 @@
> primitives.blockfill_s[size](resiTmp, stride, 0);
> }
>
> + assert(((uint32_t)residual & (width - 1)) == 0);
> + assert(width <= 32);
> //===== reconstruction =====
> primitives.calcrecon[size](pred, residual, recon, reconQt,
> reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
>
> diff -r 8e22129119d6 -r 493981f517c4 source/common/x86/pixel-util.asm
> --- a/source/common/x86/pixel-util.asm Thu Nov 14 16:45:03 2013 +0800
> +++ b/source/common/x86/pixel-util.asm Thu Nov 14 16:45:20 2013 +0800
> @@ -239,10 +239,10 @@
> cglobal calcRecons16
> %if ARCH_X86_64 == 1
> DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> - PROLOGUE 6,9,5
> + PROLOGUE 6,9,3
> %else
> DECLARE_REG_TMP 0,1,2,3,4,5
> - PROLOGUE 6,7,5
> + PROLOGUE 6,7,3
> %define t6 r6m
> %define t6d r6d
> %define t7 r7m
> @@ -265,10 +265,8 @@
> movu m2, [t0]
> pmovzxbw m1, m2
> punpckhbw m2, m0
> - movu m3, [t1]
> - movu m4, [t1 + 16]
> - paddw m1, m3
> - paddw m2, m4
> + paddw m1, [t1]
> + paddw m2, [t1 + 16]
> packuswb m1, m2
>
> ; store recon[] and recipred[]
> @@ -296,10 +294,10 @@
> cglobal calcRecons32
> %if ARCH_X86_64 == 1
> DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> - PROLOGUE 6,9,7
> + PROLOGUE 6,9,5
> %else
> DECLARE_REG_TMP 0,1,2,3,4,5
> - PROLOGUE 6,7,7
> + PROLOGUE 6,7,5
> %define t6 r6m
> %define t6d r6d
> %define t7 r7m
> @@ -326,16 +324,12 @@
> pmovzxbw m3, m4
> punpckhbw m4, m0
>
> - movu m5, [t1 + 0 * 16]
> - movu m6, [t1 + 1 * 16]
> - paddw m1, m5
> - paddw m2, m6
> + paddw m1, [t1 + 0 * 16]
> + paddw m2, [t1 + 1 * 16]
> packuswb m1, m2
>
> - movu m5, [t1 + 2 * 16]
> - movu m6, [t1 + 3 * 16]
> - paddw m3, m5
> - paddw m4, m6
> + paddw m3, [t1 + 2 * 16]
> + paddw m4, [t1 + 3 * 16]
> packuswb m3, m4
>
> ; store recon[] and recipred[]
> @@ -369,10 +363,10 @@
> cglobal calcRecons64
> %if ARCH_X86_64 == 1
> DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> - PROLOGUE 6,9,7
> + PROLOGUE 6,9,5
> %else
> DECLARE_REG_TMP 0,1,2,3,4,5
> - PROLOGUE 6,7,7
> + PROLOGUE 6,7,5
> %define t6 r6m
> %define t6d r6d
> %define t7 r7m
> @@ -400,16 +394,12 @@
> pmovzxbw m3, m4
> punpckhbw m4, m0
>
> - movu m5, [t1 + 0 * 16]
> - movu m6, [t1 + 1 * 16]
> - paddw m1, m5
> - paddw m2, m6
> + paddw m1, [t1 + 0 * 16]
> + paddw m2, [t1 + 1 * 16]
> packuswb m1, m2
>
> - movu m5, [t1 + 2 * 16]
> - movu m6, [t1 + 3 * 16]
> - paddw m3, m5
> - paddw m4, m6
> + paddw m3, [t1 + 2 * 16]
> + paddw m4, [t1 + 3 * 16]
> packuswb m3, m4
>
> ; store recon[] and recipred[]
> @@ -436,16 +426,12 @@
> pmovzxbw m3, m4
> punpckhbw m4, m0
>
> - movu m5, [t1 + 4 * 16]
> - movu m6, [t1 + 5 * 16]
> - paddw m1, m5
> - paddw m2, m6
> + paddw m1, [t1 + 4 * 16]
> + paddw m2, [t1 + 5 * 16]
> packuswb m1, m2
>
> - movu m5, [t1 + 6 * 16]
> - movu m6, [t1 + 7 * 16]
> - paddw m3, m5
> - paddw m4, m6
> + paddw m3, [t1 + 6 * 16]
> + paddw m4, [t1 + 7 * 16]
> packuswb m3, m4
>
> ; store recon[] and recipred[]
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
--
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131114/392e222b/attachment.html>
More information about the x265-devel mailing list