[x265] [PATCH 2 of 2] Fix : Primitive: Performance Primitives for Pixel add Clip - TcomYuv and TshortYuv
Steve Borho
steve at borho.org
Thu Jul 18 19:45:36 CEST 2013
These two patches need to be submitted as one combined patch, since the
first one is functionally broken.
On Thu, Jul 18, 2013 at 4:52 AM, <gopu at multicorewareinc.com> wrote:
> # HG changeset patch
> # User ggopu
> # Date 1374141101 -19800
> # Node ID eef3745d2cfea9ded3659de1a7392146d9ec3187
> # Parent d93bf22889f8a58c3b3a03733c8e031ffe192fc3
> Fix : Primitive: Performance Primitives for Pixel add Clip - TcomYuv and
> TshortYuv
>
> diff -r d93bf22889f8 -r eef3745d2cfe source/common/vec/blockcopy.inc
> --- a/source/common/vec/blockcopy.inc Thu Jul 18 12:55:02 2013 +0530
> +++ b/source/common/vec/blockcopy.inc Thu Jul 18 15:21:41 2013 +0530
> @@ -231,7 +231,6 @@
> if (!(aligncheck & 31))
> {
> // fast path, multiples of 32 pixel wide blocks
> - // fast path, multiples of 16 pixel wide blocks
> for (int y = 0; y < by; y++)
> {
> for (int x = 0; x < bx; x += 32)
> @@ -294,19 +293,18 @@
>
> void pixeladd_ss(int bx, int by, short *dst, intptr_t dstride, short
> *src0, short *src1, intptr_t sstride0, intptr_t sstride1)
> {
> - size_t aligncheck = (size_t)dst | (size_t)src0 | bx | sstride0 |
> dstride;
> + size_t aligncheck = (size_t)dst | (size_t)src0 | sstride0 | dstride;
>
> #if INSTRSET >= 8 && 0
> - if (!(aligncheck & 31))
> + if (!(aligncheck & 31) && !(bx & 15))
> {
> - // fast path, multiples of 32 pixel wide blocks
> + Vec16s zero(0), maxval((1 << X265_DEPTH) - 1);
> // fast path, multiples of 16 pixel wide blocks
> for (int y = 0; y < by; y++)
> {
> - for (int x = 0; x < bx; x += 32)
> + for (int x = 0; x < bx; x += 16)
> {
> - Vec32s vecsrc0, vecsrc1, vecsum;
> - Vec32s zero(0), maxval((1 << X265_DEPTH) - 1); //
> Currently g_bitDepthY = 8 and g_bitDepthC = 8
> + Vec16s vecsrc0, vecsrc1, vecsum;
> vecsrc0.load_a(src0 + x);
> vecsrc1.load_a(src1 + x);
>
This looks better
>
> @@ -324,15 +322,15 @@
> }
> else
> #endif /* if INSTRSET >= 8 && 0 */
> - if (!(aligncheck & 15))
> + if ( !(aligncheck & 15) && !(bx & 7))
> {
> - // fast path, multiples of 16 pixel wide blocks
> + Vec8s zero(0), maxval((1 << X265_DEPTH) - 1);
> + // fast path, multiples of 8 pixel wide blocks
> for (int y = 0; y < by; y++)
> {
> for (int x = 0; x < bx; x += 8)
> {
> Vec8s vecsrc0, vecsrc1, vecsum;
> - Vec8s zero(0), maxval((1 << X265_DEPTH) - 1); //
> Currently g_bitDepthY = 8 and g_bitDepthC = 8
> vecsrc0.load_a(src0 + x);
> vecsrc1.load_a(src1 + x);
>
good
> @@ -348,6 +346,29 @@
> dst += dstride;
> }
> }
> + else if (!(bx & 7))
> + {
> + Vec8s zero(0), maxval((1 << X265_DEPTH) - 1);
> + for (int y = 0; y < by; y++)
> + {
> + for (int x = 0; x < bx; x += 8)
> + {
> + Vec8s vecsrc0, vecsrc1, vecsum;
> + vecsrc0.load(src0 + x);
> + vecsrc1.load(src1 + x);
> +
> + vecsum = add_saturated(vecsrc0, vecsrc1);
> + vecsum = max(vecsum, zero);
> + vecsum = min(vecsum, maxval);
> +
> + vecsum.store(dst + x);
> + }
> +
> + src0 += sstride0;
> + src1 += sstride1;
> + dst += dstride;
> + }
>
good
> + }
> else
> {
> int tmp;
> @@ -370,24 +391,23 @@
> }
> }
>
> +#if !HIGH_BIT_DEPTH
> void pixeladd_pp(int bx, int by, pixel *dst, intptr_t dstride, pixel
> *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1)
> {
> size_t aligncheck = (size_t)dst | (size_t)src0 | bx | sstride0 |
> dstride;
>
> #if INSTRSET >= 8 && 0
> - if (!(aligncheck & 31))
> + if (!(aligncheck & 31) && !(bx & 31))
>
For 8bpp functions, bx does not have to be checked separately from
aligncheck: the block width is a byte count, just like the strides.
> {
> + Vec32uc zero(0), maxval((1 << X265_DEPTH) - 1);
> // fast path, multiples of 32 pixel wide blocks
> - // fast path, multiples of 16 pixel wide blocks
> for (int y = 0; y < by; y++)
> {
> for (int x = 0; x < bx; x += 32)
> {
> - Vec32s vecsrc0, vecsrc1, vecsum;
> - Vec32s zero(0), maxval((1 << X265_DEPTH) - 1); //
> Currently g_bitDepthY = 8 and g_bitDepthC = 8
> + Vec32uc vecsrc0, vecsrc1, vecsum;
> vecsrc0.load_a(src0 + x);
> vecsrc1.load_a(src1 + x);
> -
> vecsum = vecsrc0 + vecsrc1;
> vecsum = max(vecsum, zero);
> vecsum = min(vecsum, maxval);
> @@ -402,18 +422,40 @@
> }
> else
> #endif /* if INSTRSET >= 8 && 0 */
> - if (!(aligncheck & 15))
> + if (!(aligncheck & 15) && !(bx & 15))
> {
> + Vec16uc zero(0), maxval((1 << X265_DEPTH) - 1);
> // fast path, multiples of 16 pixel wide blocks
> for (int y = 0; y < by; y++)
> {
> for (int x = 0; x < bx; x += 16)
> {
> Vec16uc vecsrc0, vecsrc1, vecsum;
> - Vec16uc zero(0), maxval((1 << X265_DEPTH) - 1); //
> Currently g_bitDepthY = 8 and g_bitDepthC = 8
> vecsrc0.load_a(src0 + x);
> vecsrc1.load_a(src1 + x);
> + vecsum = add_saturated(vecsrc0, vecsrc1);
> + vecsum = max(vecsum, zero);
> + vecsum = min(vecsum, maxval);
>
> + vecsum.store(dst + x);
> + }
> +
> + src0 += sstride0;
> + src1 += sstride1;
> + dst += dstride;
> + }
> + }
> + else if (!(bx & 15))
> + {
> + Vec16uc zero(0), maxval((1 << X265_DEPTH) - 1);
> + // fast path, multiples of 16 pixel wide blocks
>
This comment should say that the data width is a multiple of the SIMD
register width, but the pointers/strides require unaligned accesses.
> + for (int y = 0; y < by; y++)
> + {
> + for (int x = 0; x < bx; x += 16)
> + {
> + Vec16uc vecsrc0, vecsrc1, vecsum;
> + vecsrc0.load(src0 + x);
> + vecsrc1.load(src1 + x);
> vecsum = add_saturated(vecsrc0, vecsrc1);
> vecsum = max(vecsum, zero);
> vecsum = min(vecsum, maxval);
> @@ -447,6 +489,7 @@
> }
> }
> }
> +#endif
>
> void Setup_Vec_BlockCopyPrimitives(EncoderPrimitives &p)
> {
> diff -r d93bf22889f8 -r eef3745d2cfe source/common/vec/vecprimitives.inc
> --- a/source/common/vec/vecprimitives.inc Thu Jul 18 12:55:02 2013
> +0530
> +++ b/source/common/vec/vecprimitives.inc Thu Jul 18 15:21:41 2013
> +0530
> @@ -28,8 +28,8 @@
> #include "utils.h"
> #include <string.h>
>
> -#include "TLibCommon\TComRom.h"
> -#include "TLibCommon\TypeDef.h"
> +#include "TLibCommon/TComRom.h"
> +#include "TLibCommon/TypeDef.h"
>
> using namespace x265;
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> http://mailman.videolan.org/listinfo/x265-devel
>
>
--
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/private/x265-devel/attachments/20130718/63170e5a/attachment.html>
More information about the x265-devel
mailing list