[x265] [PATCH] Primitive: Performance Primitives for Pixel Add Clip - TComYuv and TShortYUV
gopu at multicorewareinc.com
Fri Jul 19 07:16:26 CEST 2013
# HG changeset patch
# User ggopu
# Date 1374210970 -19800
# Node ID 6a864fc57a58e6988969faf3b5b919dd3defa0c1
# Parent a4c00c3f1897d9af8239bacf5f56621297e9785b
Primitive: Performance Primitives for Pixel Add Clip - TComYuv and TShortYUV
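
For context, a minimal sketch of how a caller such as TComYuv::addClip() could dispatch to the new primitive through the primitives table instead of an open-coded add-and-clip loop. The wrapper name (addClipBlock) and its arguments are illustrative assumptions, not part of this patch; only the pixeladd_pp / pixeladd_ss entries come from the change itself.

    // Illustrative sketch only: the wrapper below is hypothetical.
    #include "primitives.h"    // for the global x265::primitives table

    static void addClipBlock(pixel *dst, intptr_t dstStride,
                             pixel *src0, intptr_t srcStride0,
                             pixel *src1, intptr_t srcStride1,
                             int width, int height)
    {
        // one call replaces the per-pixel (src0[x] + src1[x]) + ClipY() loop
        x265::primitives.pixeladd_pp(width, height, dst, dstStride,
                                     src0, src1, srcStride0, srcStride1);
    }

The short-buffer variant, pixeladd_ss, would be used the same way from TShortYUV::addClip() with short* buffers.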
diff -r a4c00c3f1897 -r 6a864fc57a58 source/common/pixel.cpp
--- a/source/common/pixel.cpp Thu Jul 18 23:06:16 2013 -0500
+++ b/source/common/pixel.cpp Fri Jul 19 10:46:10 2013 +0530
@@ -535,6 +535,36 @@
}
}
+void pixeladd_ss_c(int bx, int by, short *a, intptr_t dstride, short *b0, short *b1, intptr_t sstride0, intptr_t sstride1)
+{
+ for (int y = 0; y < by; y++)
+ {
+ for (int x = 0; x < bx; x++)
+ {
+ a[x] = (short)ClipY(b0[x] + b1[x]);
+ }
+
+ b0 += sstride0;
+ b1 += sstride1;
+ a += dstride;
+ }
+}
+
+void pixeladd_pp_c(int bx, int by, pixel *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t sstride0, intptr_t sstride1)
+{
+ for (int y = 0; y < by; y++)
+ {
+ for (int x = 0; x < bx; x++)
+ {
+ a[x] = (pixel)ClipY(b0[x] + b1[x]);
+ }
+
+ b0 += sstride0;
+ b1 += sstride1;
+ a += dstride;
+ }
+}
+
} // end anonymous namespace
namespace x265 {
@@ -738,5 +768,7 @@
p.weightpUni = weightUnidir;
p.pixelsub_sp = pixelsub_sp_c;
+ p.pixeladd_pp = pixeladd_pp_c;
+ p.pixeladd_ss = pixeladd_ss_c;
}
}
diff -r a4c00c3f1897 -r 6a864fc57a58 source/common/primitives.h
--- a/source/common/primitives.h Thu Jul 18 23:06:16 2013 -0500
+++ b/source/common/primitives.h Fri Jul 19 10:46:10 2013 +0530
@@ -192,7 +192,9 @@
typedef void (*blockcpy_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned
typedef void (*blockcpy_ps_t)(int bx, int by, pixel *dst, intptr_t dstride, short *src, intptr_t sstride); // dst is aligned
typedef void (*blockcpy_sc_t)(int bx, int by, short *dst, intptr_t dstride, uint8_t *src, intptr_t sstride); // dst is aligned
-typedef void (*pixelsub_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
+typedef void (*pixelsub_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
+typedef void (*pixeladd_ss_t)(int bx, int by, short *dst, intptr_t dstride, short *src0, short *src1, intptr_t sstride0, intptr_t sstride1);
+typedef void (*pixeladd_pp_t)(int bx, int by, pixel *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
typedef void (*intra_dc_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int bFilter);
typedef void (*intra_planar_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width);
@@ -275,6 +277,8 @@
weightpUni_t weightpUni;
pixelsub_sp_t pixelsub_sp;
+ pixeladd_ss_t pixeladd_ss;
+ pixeladd_pp_t pixeladd_pp;
filterVwghtd_t filterVwghtd;
filterHwghtd_t filterHwghtd;
diff -r a4c00c3f1897 -r 6a864fc57a58 source/common/vec/blockcopy.inc
--- a/source/common/vec/blockcopy.inc Thu Jul 18 23:06:16 2013 -0500
+++ b/source/common/vec/blockcopy.inc Fri Jul 19 10:46:10 2013 +0530
@@ -27,6 +27,9 @@
#include <string.h>
#include "utils.h"
+#include "TLibCommon/TComRom.h"
+#include "TLibCommon/TypeDef.h"
+
namespace {
#if HIGH_BIT_DEPTH
@@ -237,7 +240,6 @@
if (!(aligncheck & 31))
{
// fast path, multiples of 32 pixel wide blocks
- // fast path, multiples of 16 pixel wide blocks
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x += 32)
@@ -297,6 +299,206 @@
}
}
}
+
+void pixeladd_ss(int bx, int by, short *dst, intptr_t dstride, short *src0, short *src1, intptr_t sstride0, intptr_t sstride1)
+{
+ // include src1/sstride1 so the aligned (load_a) paths below are safe for both sources
+ size_t aligncheck = (size_t)dst | (size_t)src0 | (size_t)src1 | sstride0 | sstride1 | dstride;
+
+#if INSTRSET >= 8 && 0
+ if (!(aligncheck & 31) && !(bx & 15))
+ {
+ Vec16s zero(0), maxval((1 << X265_DEPTH) - 1);
+ // fast path, multiples of 16 pixel wide blocks
+ for (int y = 0; y < by; y++)
+ {
+ for (int x = 0; x < bx; x += 16)
+ {
+ Vec16s vecsrc0, vecsrc1, vecsum;
+ vecsrc0.load_a(src0 + x);
+ vecsrc1.load_a(src1 + x);
+
+ vecsum = vecsrc0 + vecsrc1;
+ vecsum = max(vecsum, zero);
+ vecsum = min(vecsum, maxval);
+
+ vecsum.store(dst + x);
+ }
+
+ src0 += sstride0;
+ src1 += sstride1;
+ dst += dstride;
+ }
+ }
+ else
+#endif /* if INSTRSET >= 8 && 0 */
+ if (!(aligncheck & 15) && !(bx & 7))
+ {
+ Vec8s zero(0), maxval((1 << X265_DEPTH) - 1);
+ // fast path, multiples of 8 pixel wide blocks
+ for (int y = 0; y < by; y++)
+ {
+ for (int x = 0; x < bx; x += 8)
+ {
+ Vec8s vecsrc0, vecsrc1, vecsum;
+ vecsrc0.load_a(src0 + x);
+ vecsrc1.load_a(src1 + x);
+
+ vecsum = add_saturated(vecsrc0, vecsrc1);
+ vecsum = max(vecsum, zero);
+ vecsum = min(vecsum, maxval);
+
+ vecsum.store(dst + x);
+ }
+
+ src0 += sstride0;
+ src1 += sstride1;
+ dst += dstride;
+ }
+ }
+ else if (!(bx & 7))
+ {
+ Vec8s zero(0), maxval((1 << X265_DEPTH) - 1);
+ for (int y = 0; y < by; y++)
+ {
+ for (int x = 0; x < bx; x += 8)
+ {
+ Vec8s vecsrc0, vecsrc1, vecsum;
+ vecsrc0.load(src0 + x);
+ vecsrc1.load(src1 + x);
+
+ vecsum = add_saturated(vecsrc0, vecsrc1);
+ vecsum = max(vecsum, zero);
+ vecsum = min(vecsum, maxval);
+
+ vecsum.store(dst + x);
+ }
+
+ src0 += sstride0;
+ src1 += sstride1;
+ dst += dstride;
+ }
+ }
+ else
+ {
+ int tmp;
+ int max = (1 << X265_DEPTH) - 1;
+ // slow path, irregular memory alignments or sizes
+ for (int y = 0; y < by; y++)
+ {
+ for (int x = 0; x < bx; x++)
+ {
+ tmp = src0[x] + src1[x];
+ tmp = tmp < 0 ? 0 : tmp;
+ tmp = tmp > max ? max : tmp;
+ dst[x] = (short)tmp;
+ }
+
+ src0 += sstride0;
+ src1 += sstride1;
+ dst += dstride;
+ }
+ }
+}
+
+#if !HIGH_BIT_DEPTH
+void pixeladd_pp(int bx, int by, pixel *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1)
+{
+ // include src1/sstride1 so the aligned (load_a) paths below are safe for both sources
+ size_t aligncheck = (size_t)dst | (size_t)src0 | (size_t)src1 | bx | sstride0 | sstride1 | dstride;
+
+#if INSTRSET >= 8 && 0
+ if (!(aligncheck & 31))
+ {
+ Vec32uc zero(0), maxval((1 << X265_DEPTH) - 1);
+ // fast path, multiples of 32 pixel wide blocks
+ for (int y = 0; y < by; y++)
+ {
+ for (int x = 0; x < bx; x += 32)
+ {
+ Vec32uc vecsrc0, vecsrc1, vecsum;
+ vecsrc0.load_a(src0 + x);
+ vecsrc1.load_a(src1 + x);
+ vecsum = vecsrc0 + vecsrc1;
+ vecsum = max(vecsum, zero);
+ vecsum = min(vecsum, maxval);
+
+ vecsum.store(dst + x);
+ }
+
+ src0 += sstride0;
+ src1 += sstride1;
+ dst += dstride;
+ }
+ }
+ else
+#endif /* if INSTRSET >= 8 && 0 */
+ if (!(aligncheck & 15))
+ {
+ Vec16uc zero(0), maxval((1 << X265_DEPTH) - 1);
+ // fast path, multiples of 16 pixel wide blocks
+ for (int y = 0; y < by; y++)
+ {
+ for (int x = 0; x < bx; x += 16)
+ {
+ Vec16uc vecsrc0, vecsrc1, vecsum;
+ vecsrc0.load_a(src0 + x);
+ vecsrc1.load_a(src1 + x);
+ vecsum = add_saturated(vecsrc0, vecsrc1);
+ vecsum = max(vecsum, zero);
+ vecsum = min(vecsum, maxval);
+
+ vecsum.store(dst + x);
+ }
+
+ src0 += sstride0;
+ src1 += sstride1;
+ dst += dstride;
+ }
+ }
+ else if (!(bx & 15))
+ {
+ Vec16uc zero(0), maxval((1 << X265_DEPTH) - 1);
+ // fast path, multiples of 16 pixel wide blocks but pointers/strides require unaligned accesses
+ for (int y = 0; y < by; y++)
+ {
+ for (int x = 0; x < bx; x += 16)
+ {
+ Vec16uc vecsrc0, vecsrc1, vecsum;
+ vecsrc0.load(src0 + x);
+ vecsrc1.load(src1 + x);
+ vecsum = add_saturated(vecsrc0, vecsrc1);
+ vecsum = max(vecsum, zero);
+ vecsum = min(vecsum, maxval);
+
+ vecsum.store(dst + x);
+ }
+
+ src0 += sstride0;
+ src1 += sstride1;
+ dst += dstride;
+ }
+ }
+ else
+ {
+ int tmp;
+ int max = (1 << X265_DEPTH) - 1;
+ // slow path, irregular memory alignments or sizes
+ for (int y = 0; y < by; y++)
+ {
+ for (int x = 0; x < bx; x++)
+ {
+ tmp = src0[x] + src1[x];
+ tmp = tmp < 0 ? 0 : tmp;
+ tmp = tmp > max ? max : tmp;
+ dst[x] = (pixel)tmp;
+ }
+
+ src0 += sstride0;
+ src1 += sstride1;
+ dst += dstride;
+ }
+ }
+}
+#endif
}
namespace x265 {
@@ -308,12 +510,16 @@
p.blockcpy_ps = (x265::blockcpy_ps_t)blockcopy_p_p;
p.blockcpy_sp = (x265::blockcpy_sp_t)blockcopy_p_p;
p.blockcpy_sc = (x265::blockcpy_sc_t)blockcopy_s_p;
+ p.pixeladd_pp = (x265::pixeladd_pp_t)pixeladd_ss;
+ p.pixeladd_ss = pixeladd_ss;
#else
p.blockcpy_pp = blockcopy_p_p;
p.blockcpy_ps = blockcopy_p_s;
p.blockcpy_sp = blockcopy_s_p;
p.blockcpy_sc = blockcopy_s_p;
p.pixelsub_sp = pixelsub_sp;
+ p.pixeladd_ss = pixeladd_ss;
+ p.pixeladd_pp = pixeladd_pp;
#endif
}
}
diff -r a4c00c3f1897 -r 6a864fc57a58 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Thu Jul 18 23:06:16 2013 -0500
+++ b/source/test/pixelharness.cpp Fri Jul 19 10:46:10 2013 +0530
@@ -376,6 +376,52 @@
return true;
}
+bool PixelHarness::check_pixeladd_ss(x265::pixeladd_ss_t ref, x265::pixeladd_ss_t opt)
+{
+ ALIGN_VAR_16(short, ref_dest[64 * 64]);
+ ALIGN_VAR_16(short, opt_dest[64 * 64]);
+ int bx = 64;
+ int by = 64;
+ int j = 0;
+ for (int i = 0; i <= 100; i++)
+ {
+ opt(bx, by, opt_dest, 64, (short*)pbuf2 + j, (short*)pbuf1 + j, 128, 128);
+ ref(bx, by, ref_dest, 64, (short*)pbuf2 + j, (short*)pbuf1 + j, 128, 128);
+
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(short)))
+ return false;
+
+ j += 4;
+ bx = 4 * ((rand() & 15) + 1);
+ by = 4 * ((rand() & 15) + 1);
+ }
+
+ return true;
+}
+
+bool PixelHarness::check_pixeladd_pp(x265::pixeladd_pp_t ref, x265::pixeladd_pp_t opt)
+{
+ ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
+ ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+ int bx = 64;
+ int by = 64;
+ int j = 0;
+ for (int i = 0; i <= 100; i++)
+ {
+ opt(bx, by, opt_dest, 64, pbuf2 + j, pbuf1 + j, 128, 128);
+ ref(bx, by, ref_dest, 64, pbuf2 + j, pbuf1 + j, 128, 128);
+
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+ return false;
+
+ j += 4;
+ bx = 4 * ((rand() & 15) + 1);
+ by = 4 * ((rand() & 15) + 1);
+ }
+
+ return true;
+}
+
bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
for (uint16_t curpar = 0; curpar < NUM_PARTITIONS; curpar++)
@@ -535,6 +581,24 @@
}
}
+ if (opt.pixeladd_ss)
+ {
+ if (!check_pixeladd_ss(ref.pixeladd_ss, opt.pixeladd_ss))
+ {
+ printf("pixel add clip failed!\n");
+ return false;
+ }
+ }
+
+ if (opt.pixeladd_pp)
+ {
+ if (!check_pixeladd_pp(ref.pixeladd_pp, opt.pixeladd_pp))
+ {
+ printf("pixel add clip failed!\n");
+ return false;
+ }
+ }
+
return true;
}
@@ -649,4 +713,16 @@
printf("Pixel Sub");
REPORT_SPEEDUP(opt.pixelsub_sp, ref.pixelsub_sp, 64, 64, (short*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
}
+
+ if (opt.pixeladd_ss)
+ {
+ printf("pixel_ss add");
+ REPORT_SPEEDUP(opt.pixeladd_ss, ref.pixeladd_ss, 64, 64, (short*)pbuf1, FENC_STRIDE, (short*)pbuf2, (short*)pbuf1, STRIDE, STRIDE);
+ }
+
+ if (opt.pixeladd_pp)
+ {
+ printf("pixel_pp add");
+ REPORT_SPEEDUP(opt.pixeladd_pp, ref.pixeladd_pp, 64, 64, pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
+ }
}
diff -r a4c00c3f1897 -r 6a864fc57a58 source/test/pixelharness.h
--- a/source/test/pixelharness.h Thu Jul 18 23:06:16 2013 -0500
+++ b/source/test/pixelharness.h Fri Jul 19 10:46:10 2013 +0530
@@ -48,6 +48,8 @@
bool check_calcrecon(x265::calcrecon_t ref, x265::calcrecon_t opt);
bool check_weightpUni(x265::weightpUni_t ref, x265::weightpUni_t opt);
bool check_pixelsub_sp(x265::pixelsub_sp_t ref, x265::pixelsub_sp_t opt);
+ bool check_pixeladd_ss(x265::pixeladd_ss_t ref, x265::pixeladd_ss_t opt);
+ bool check_pixeladd_pp(x265::pixeladd_pp_t ref, x265::pixeladd_pp_t opt);
public: