[x265] [PATCH 2 of 2] Fix : Primitive: Performance Primitives for Pixel add Clip - TcomYuv and TshortYuv
gopu at multicorewareinc.com
gopu at multicorewareinc.com
Thu Jul 18 11:52:26 CEST 2013
# HG changeset patch
# User ggopu
# Date 1374141101 -19800
# Node ID eef3745d2cfea9ded3659de1a7392146d9ec3187
# Parent d93bf22889f8a58c3b3a03733c8e031ffe192fc3
Fix : Primitive: Performance Primitives for Pixel add Clip - TcomYuv and TshortYuv
diff -r d93bf22889f8 -r eef3745d2cfe source/common/vec/blockcopy.inc
--- a/source/common/vec/blockcopy.inc Thu Jul 18 12:55:02 2013 +0530
+++ b/source/common/vec/blockcopy.inc Thu Jul 18 15:21:41 2013 +0530
@@ -231,7 +231,6 @@
if (!(aligncheck & 31))
{
// fast path, multiples of 32 pixel wide blocks
- // fast path, multiples of 16 pixel wide blocks
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x += 32)
@@ -294,19 +293,18 @@
void pixeladd_ss(int bx, int by, short *dst, intptr_t dstride, short *src0, short *src1, intptr_t sstride0, intptr_t sstride1)
{
- size_t aligncheck = (size_t)dst | (size_t)src0 | bx | sstride0 | dstride;
+ size_t aligncheck = (size_t)dst | (size_t)src0 | sstride0 | dstride;
#if INSTRSET >= 8 && 0
- if (!(aligncheck & 31))
+ if (!(aligncheck & 31) && !(bx & 15))
{
- // fast path, multiples of 32 pixel wide blocks
+ Vec16s zero(0), maxval((1 << X265_DEPTH) - 1);
// fast path, multiples of 16 pixel wide blocks
for (int y = 0; y < by; y++)
{
- for (int x = 0; x < bx; x += 32)
+ for (int x = 0; x < bx; x += 16)
{
- Vec32s vecsrc0, vecsrc1, vecsum;
- Vec32s zero(0), maxval((1 << X265_DEPTH) - 1); // Currently g_bitDepthY = 8 and g_bitDepthC = 8
+ Vec16s vecsrc0, vecsrc1, vecsum;
vecsrc0.load_a(src0 + x);
vecsrc1.load_a(src1 + x);
@@ -324,15 +322,15 @@
}
else
#endif /* if INSTRSET >= 8 && 0 */
- if (!(aligncheck & 15))
+ if ( !(aligncheck & 15) && !(bx & 7))
{
- // fast path, multiples of 16 pixel wide blocks
+ Vec8s zero(0), maxval((1 << X265_DEPTH) - 1);
+ // fast path, multiples of 8 pixel wide blocks
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x += 8)
{
Vec8s vecsrc0, vecsrc1, vecsum;
- Vec8s zero(0), maxval((1 << X265_DEPTH) - 1); // Currently g_bitDepthY = 8 and g_bitDepthC = 8
vecsrc0.load_a(src0 + x);
vecsrc1.load_a(src1 + x);
@@ -348,6 +346,29 @@
dst += dstride;
}
}
+ else if (!(bx & 7))
+ {
+ Vec8s zero(0), maxval((1 << X265_DEPTH) - 1);
+ for (int y = 0; y < by; y++)
+ {
+ for (int x = 0; x < bx; x += 8)
+ {
+ Vec8s vecsrc0, vecsrc1, vecsum;
+ vecsrc0.load(src0 + x);
+ vecsrc1.load(src1 + x);
+
+ vecsum = add_saturated(vecsrc0, vecsrc1);
+ vecsum = max(vecsum, zero);
+ vecsum = min(vecsum, maxval);
+
+ vecsum.store(dst + x);
+ }
+
+ src0 += sstride0;
+ src1 += sstride1;
+ dst += dstride;
+ }
+ }
else
{
int tmp;
@@ -370,24 +391,23 @@
}
}
+#if !HIGH_BIT_DEPTH
void pixeladd_pp(int bx, int by, pixel *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1)
{
size_t aligncheck = (size_t)dst | (size_t)src0 | bx | sstride0 | dstride;
#if INSTRSET >= 8 && 0
- if (!(aligncheck & 31))
+ if (!(aligncheck & 31) && !(bx & 31))
{
+ Vec32uc zero(0), maxval((1 << X265_DEPTH) - 1);
// fast path, multiples of 32 pixel wide blocks
- // fast path, multiples of 16 pixel wide blocks
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x += 32)
{
- Vec32s vecsrc0, vecsrc1, vecsum;
- Vec32s zero(0), maxval((1 << X265_DEPTH) - 1); // Currently g_bitDepthY = 8 and g_bitDepthC = 8
+ Vec32uc vecsrc0, vecsrc1, vecsum;
vecsrc0.load_a(src0 + x);
vecsrc1.load_a(src1 + x);
-
vecsum = vecsrc0 + vecsrc1;
vecsum = max(vecsum, zero);
vecsum = min(vecsum, maxval);
@@ -402,18 +422,40 @@
}
else
#endif /* if INSTRSET >= 8 && 0 */
- if (!(aligncheck & 15))
+ if (!(aligncheck & 15) && !(bx & 15))
{
+ Vec16uc zero(0), maxval((1 << X265_DEPTH) - 1);
// fast path, multiples of 16 pixel wide blocks
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x += 16)
{
Vec16uc vecsrc0, vecsrc1, vecsum;
- Vec16uc zero(0), maxval((1 << X265_DEPTH) - 1); // Currently g_bitDepthY = 8 and g_bitDepthC = 8
vecsrc0.load_a(src0 + x);
vecsrc1.load_a(src1 + x);
+ vecsum = add_saturated(vecsrc0, vecsrc1);
+ vecsum = max(vecsum, zero);
+ vecsum = min(vecsum, maxval);
+ vecsum.store(dst + x);
+ }
+
+ src0 += sstride0;
+ src1 += sstride1;
+ dst += dstride;
+ }
+ }
+ else if (!(bx & 15))
+ {
+ Vec16uc zero(0), maxval((1 << X265_DEPTH) - 1);
+ // unaligned path (load instead of load_a), multiples of 16 pixel wide blocks
+ for (int y = 0; y < by; y++)
+ {
+ for (int x = 0; x < bx; x += 16)
+ {
+ Vec16uc vecsrc0, vecsrc1, vecsum;
+ vecsrc0.load(src0 + x);
+ vecsrc1.load(src1 + x);
vecsum = add_saturated(vecsrc0, vecsrc1);
vecsum = max(vecsum, zero);
vecsum = min(vecsum, maxval);
@@ -447,6 +489,7 @@
}
}
}
+#endif
void Setup_Vec_BlockCopyPrimitives(EncoderPrimitives &p)
{
diff -r d93bf22889f8 -r eef3745d2cfe source/common/vec/vecprimitives.inc
--- a/source/common/vec/vecprimitives.inc Thu Jul 18 12:55:02 2013 +0530
+++ b/source/common/vec/vecprimitives.inc Thu Jul 18 15:21:41 2013 +0530
@@ -28,8 +28,8 @@
#include "utils.h"
#include <string.h>
-#include "TLibCommon\TComRom.h"
-#include "TLibCommon\TypeDef.h"
+#include "TLibCommon/TComRom.h"
+#include "TLibCommon/TypeDef.h"
using namespace x265;
-------------- next part --------------
A non-text attachment was scrubbed...
Name: x265-2.patch
Type: text/x-patch
Size: 6323 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20130718/5527d394/attachment.bin>
More information about the x265-devel mailing list