[x265] [PATCH 2 of 2] Fix : Primitive: Performance Primitives for Pixel add Clip - TcomYuv and TshortYuv

gopu at multicorewareinc.com gopu at multicorewareinc.com
Thu Jul 18 11:52:26 CEST 2013


# HG changeset patch
# User ggopu
# Date 1374141101 -19800
# Node ID eef3745d2cfea9ded3659de1a7392146d9ec3187
# Parent  d93bf22889f8a58c3b3a03733c8e031ffe192fc3
Fix : Primitive: Performance Primitives for Pixel add Clip - TcomYuv and TshortYuv

diff -r d93bf22889f8 -r eef3745d2cfe source/common/vec/blockcopy.inc
--- a/source/common/vec/blockcopy.inc	Thu Jul 18 12:55:02 2013 +0530
+++ b/source/common/vec/blockcopy.inc	Thu Jul 18 15:21:41 2013 +0530
@@ -231,7 +231,6 @@
     if (!(aligncheck & 31))
     {
         // fast path, multiples of 32 pixel wide blocks
-        // fast path, multiples of 16 pixel wide blocks
         for (int y = 0; y < by; y++)
         {
             for (int x = 0; x < bx; x += 32)
@@ -294,19 +293,18 @@
 
 void pixeladd_ss(int bx, int by, short *dst, intptr_t dstride, short *src0, short *src1, intptr_t sstride0, intptr_t sstride1)
 {
-    size_t aligncheck = (size_t)dst | (size_t)src0 | bx | sstride0 | dstride;
+    size_t aligncheck = (size_t)dst | (size_t)src0 | sstride0 | dstride;
 
 #if INSTRSET >= 8 && 0
-    if (!(aligncheck & 31))
+    if (!(aligncheck & 31) && !(bx & 15))
     {
-        // fast path, multiples of 32 pixel wide blocks
+        Vec16s zero(0), maxval((1 << X265_DEPTH) - 1); 
         // fast path, multiples of 16 pixel wide blocks
         for (int y = 0; y < by; y++)
         {
-            for (int x = 0; x < bx; x += 32)
+            for (int x = 0; x < bx; x += 16)
             {
-                Vec32s vecsrc0, vecsrc1, vecsum;
-                Vec32s zero(0), maxval((1 << X265_DEPTH) - 1); // Currently g_bitDepthY = 8 and g_bitDepthC = 8
+                Vec16s vecsrc0, vecsrc1, vecsum;
                 vecsrc0.load_a(src0 + x);
                 vecsrc1.load_a(src1 + x);
 
@@ -324,15 +322,15 @@
     }
     else
 #endif /* if INSTRSET >= 8 && 0 */
-    if (!(aligncheck & 15))
+    if ( !(aligncheck & 15) && !(bx & 7))
     {
-        // fast path, multiples of 16 pixel wide blocks
+        Vec8s zero(0), maxval((1 << X265_DEPTH) - 1);
+        // fast path, multiples of 8 pixel wide blocks
         for (int y = 0; y < by; y++)
         {
             for (int x = 0; x < bx; x += 8)
             {
                 Vec8s vecsrc0, vecsrc1, vecsum;
-                Vec8s zero(0), maxval((1 << X265_DEPTH) - 1); // Currently g_bitDepthY = 8 and g_bitDepthC = 8
                 vecsrc0.load_a(src0 + x);
                 vecsrc1.load_a(src1 + x);
 
@@ -348,6 +346,29 @@
             dst += dstride;
         }
     }
+    else if (!(bx & 7))
+    {
+        Vec8s zero(0), maxval((1 << X265_DEPTH) - 1);
+        for (int y = 0; y < by; y++)
+        {
+            for (int x = 0; x < bx; x += 8)
+            {
+                Vec8s vecsrc0, vecsrc1, vecsum;
+                vecsrc0.load(src0 + x);
+                vecsrc1.load(src1 + x);
+
+                vecsum = add_saturated(vecsrc0, vecsrc1);
+                vecsum = max(vecsum, zero);
+                vecsum = min(vecsum, maxval);
+
+                vecsum.store(dst + x);
+            }
+
+            src0 += sstride0;
+            src1 += sstride1;
+            dst += dstride;
+        }
+    }
     else
     {
         int tmp;
@@ -370,24 +391,23 @@
     }
 }
 
+#if !HIGH_BIT_DEPTH
 void pixeladd_pp(int bx, int by, pixel *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1)
 {
     size_t aligncheck = (size_t)dst | (size_t)src0 | bx | sstride0 | dstride;
 
 #if INSTRSET >= 8 && 0
-    if (!(aligncheck & 31))
+    if (!(aligncheck & 31) && !(bx & 31))
     {
+         Vec32uc zero(0), maxval((1 << X265_DEPTH) - 1); 
         // fast path, multiples of 32 pixel wide blocks
-        // fast path, multiples of 16 pixel wide blocks
         for (int y = 0; y < by; y++)
         {
             for (int x = 0; x < bx; x += 32)
             {
-                Vec32s vecsrc0, vecsrc1, vecsum;
-                Vec32s zero(0), maxval((1 << X265_DEPTH) - 1); // Currently g_bitDepthY = 8 and g_bitDepthC = 8
+                Vec32uc vecsrc0, vecsrc1, vecsum;
                 vecsrc0.load_a(src0 + x);
                 vecsrc1.load_a(src1 + x);
-
                 vecsum = vecsrc0 + vecsrc1;
                 vecsum = max(vecsum, zero);
                 vecsum = min(vecsum, maxval);
@@ -402,18 +422,40 @@
     }
     else
 #endif /* if INSTRSET >= 8 && 0 */
-    if (!(aligncheck & 15))
+    if (!(aligncheck & 15) && !(bx & 15))
     {
+        Vec16uc zero(0), maxval((1 << X265_DEPTH) - 1); 
         // fast path, multiples of 16 pixel wide blocks
         for (int y = 0; y < by; y++)
         {
             for (int x = 0; x < bx; x += 16)
             {
                 Vec16uc vecsrc0, vecsrc1, vecsum;
-                Vec16uc zero(0), maxval((1 << X265_DEPTH) - 1); // Currently g_bitDepthY = 8 and g_bitDepthC = 8
                 vecsrc0.load_a(src0 + x);
                 vecsrc1.load_a(src1 + x);
+                vecsum = add_saturated(vecsrc0, vecsrc1);
+                vecsum = max(vecsum, zero);
+                vecsum = min(vecsum, maxval);
 
+                vecsum.store(dst + x);
+            }
+
+            src0 += sstride0;
+            src1 += sstride1;
+            dst += dstride;
+        }
+    }
+    else if (!(bx & 15))
+    {
+        Vec16uc zero(0), maxval((1 << X265_DEPTH) - 1); 
+        // unaligned path (load instead of load_a), multiples of 16 pixel wide blocks
+        for (int y = 0; y < by; y++)
+        {
+            for (int x = 0; x < bx; x += 16)
+            {
+                Vec16uc vecsrc0, vecsrc1, vecsum;
+                vecsrc0.load(src0 + x);
+                vecsrc1.load(src1 + x);
                 vecsum = add_saturated(vecsrc0, vecsrc1);
                 vecsum = max(vecsum, zero);
                 vecsum = min(vecsum, maxval);
@@ -447,6 +489,7 @@
         }
     }
 }
+#endif
 
 void Setup_Vec_BlockCopyPrimitives(EncoderPrimitives &p)
 {
diff -r d93bf22889f8 -r eef3745d2cfe source/common/vec/vecprimitives.inc
--- a/source/common/vec/vecprimitives.inc	Thu Jul 18 12:55:02 2013 +0530
+++ b/source/common/vec/vecprimitives.inc	Thu Jul 18 15:21:41 2013 +0530
@@ -28,8 +28,8 @@
 #include "utils.h"
 #include <string.h>
 
-#include "TLibCommon\TComRom.h"
-#include "TLibCommon\TypeDef.h"
+#include "TLibCommon/TComRom.h"
+#include "TLibCommon/TypeDef.h"
 
 using namespace x265;
 
-------------- next part --------------
A non-text attachment was scrubbed...
Name: x265-2.patch
Type: text/x-patch
Size: 6323 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20130718/5527d394/attachment.bin>


More information about the x265-devel mailing list