[x265] [PATCH] TShortYUV : Implemented perfomance Primitives pixeladd (Clipadd)

gopu at multicorewareinc.com gopu at multicorewareinc.com
Wed Jul 17 09:36:20 CEST 2013


# HG changeset patch
# User ggopu
# Date 1374046566 -19800
# Node ID c6c045e9272798ce5268643436759db22fd1950b
# Parent  054d8c409569100c4aacb015ffb1b3281100d993
TShortYUV : Implemented perfomance Primitives pixeladd (Clipadd)

diff -r 054d8c409569 -r c6c045e92727 source/common/TShortYUV.cpp
--- a/source/common/TShortYUV.cpp	Wed Jul 17 11:33:20 2013 +0530
+++ b/source/common/TShortYUV.cpp	Wed Jul 17 13:06:06 2013 +0530
@@ -122,61 +122,38 @@
 #pragma warning (disable: 4244)
 #endif
 
-void TShortYUV::addClipLuma(TShortYUV* pcYuvSrc0, TShortYUV* pcYuvSrc1, unsigned int uiTrUnitIdx, unsigned int uiPartSize)
+void TShortYUV::addClipLuma(TShortYUV* pcYuvSrc0, TShortYUV* pcYuvSrc1, unsigned int trUnitIdx, unsigned int partSize)
 {
-    int x, y;
+    int x = partSize, y = partSize;
 
-    short* pSrc0 = pcYuvSrc0->getLumaAddr(uiTrUnitIdx, uiPartSize);
-    short* pSrc1 = pcYuvSrc1->getLumaAddr(uiTrUnitIdx, uiPartSize);
-    short* pDst  = getLumaAddr(uiTrUnitIdx, uiPartSize);
+    short* src0 = pcYuvSrc0->getLumaAddr(trUnitIdx, partSize);
+    short* src1 = pcYuvSrc1->getLumaAddr(trUnitIdx, partSize);
+    short* dst  = getLumaAddr(trUnitIdx, partSize);
 
-    unsigned int iSrc0Stride = pcYuvSrc0->width;
-    unsigned int iSrc1Stride = pcYuvSrc1->width;
-    unsigned int iDstStride  = width;
+    unsigned int src0Stride = pcYuvSrc0->width;
+    unsigned int src1Stride = pcYuvSrc1->width;
+    unsigned int dstStride  = width;
 
-    for (y = uiPartSize - 1; y >= 0; y--)
-    {
-        for (x = uiPartSize - 1; x >= 0; x--)
-        {
-            pDst[x] = ClipY(pSrc0[x] + pSrc1[x]);
-        }
-
-        pSrc0 += iSrc0Stride;
-        pSrc1 += iSrc1Stride;
-        pDst  += iDstStride;
-    }
+    primitives.pixeladd(x, y, dst, dstStride, src0, src1, src0Stride, src1Stride);
 }
 
-void TShortYUV::addClipChroma(TShortYUV* pcYuvSrc0, TShortYUV* pcYuvSrc1, unsigned int uiTrUnitIdx, unsigned int uiPartSize)
+void TShortYUV::addClipChroma(TShortYUV* pcYuvSrc0, TShortYUV* pcYuvSrc1, unsigned int trUnitIdx, unsigned int partSize)
 {
-    int x, y;
+    int x = partSize, y = partSize;
 
-    short* pSrcU0 = pcYuvSrc0->getCbAddr(uiTrUnitIdx, uiPartSize);
-    short* pSrcU1 = pcYuvSrc1->getCbAddr(uiTrUnitIdx, uiPartSize);
-    short* pSrcV0 = pcYuvSrc0->getCrAddr(uiTrUnitIdx, uiPartSize);
-    short* pSrcV1 = pcYuvSrc1->getCrAddr(uiTrUnitIdx, uiPartSize);
-    short* pDstU = getCbAddr(uiTrUnitIdx, uiPartSize);
-    short* pDstV = getCrAddr(uiTrUnitIdx, uiPartSize);
+    short* srcU0 = pcYuvSrc0->getCbAddr(trUnitIdx, partSize);
+    short* srcU1 = pcYuvSrc1->getCbAddr(trUnitIdx, partSize);
+    short* srcV0 = pcYuvSrc0->getCrAddr(trUnitIdx, partSize);
+    short* srcV1 = pcYuvSrc1->getCrAddr(trUnitIdx, partSize);
+    short* dstU = getCbAddr(trUnitIdx, partSize);
+    short* dstV = getCrAddr(trUnitIdx, partSize);
 
-    unsigned int  iSrc0Stride = pcYuvSrc0->Cwidth;
-    unsigned int  iSrc1Stride = pcYuvSrc1->Cwidth;
-    unsigned int  iDstStride  = Cwidth;
+    unsigned int  src0Stride = pcYuvSrc0->Cwidth;
+    unsigned int  src1Stride = pcYuvSrc1->Cwidth;
+    unsigned int  dstStride  = Cwidth;
 
-    for (y = uiPartSize - 1; y >= 0; y--)
-    {
-        for (x = uiPartSize - 1; x >= 0; x--)
-        {
-            pDstU[x] = ClipC(pSrcU0[x] + pSrcU1[x]);
-            pDstV[x] = ClipC(pSrcV0[x] + pSrcV1[x]);
-        }
-
-        pSrcU0 += iSrc0Stride;
-        pSrcU1 += iSrc1Stride;
-        pSrcV0 += iSrc0Stride;
-        pSrcV1 += iSrc1Stride;
-        pDstU  += iDstStride;
-        pDstV  += iDstStride;
-    }
+    primitives.pixeladd(x, y, dstU, dstStride, srcU0, srcU1, src0Stride, src1Stride);
+    primitives.pixeladd(x, y, dstV, dstStride, srcV0, srcV1, src0Stride, src1Stride);
 }
 
 #if _MSC_VER
diff -r 054d8c409569 -r c6c045e92727 source/common/pixel.cpp
--- a/source/common/pixel.cpp	Wed Jul 17 11:33:20 2013 +0530
+++ b/source/common/pixel.cpp	Wed Jul 17 13:06:06 2013 +0530
@@ -30,7 +30,6 @@
 #include <algorithm>
 #include <stdlib.h> // abs()
 
-
 #define SET_FUNC_PRIMITIVE_TABLE_C_SUBSET(WIDTH, FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \
     p.FUNC_PREFIX[PARTITION_ ## WIDTH ## x4]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<WIDTH, 4,  DATA_TYPE1, DATA_TYPE2>;  \
     p.FUNC_PREFIX[PARTITION_ ## WIDTH ## x8]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<WIDTH, 8,  DATA_TYPE1, DATA_TYPE2>;  \
@@ -504,14 +503,15 @@
 void weightUnidir(short *src, pixel *dst, int srcStride, int dstStride, int width, int height, int w0, int round, int shift, int offset, int bitDepth)
 {
     int x, y;
+
     for (y = height - 1; y >= 0; y--)
     {
         for (x = width - 1; x >= 0; )
         {
             // note: luma min width is 4
-            dst[x] = (pixel) Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
+            dst[x] = (pixel)Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
             x--;
-            dst[x] = (pixel) Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
+            dst[x] = (pixel)Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
             x--;
         }
 
@@ -535,6 +535,20 @@
     }
 }
 
+void pixeladd_ss(int bx, int by, short *a, intptr_t dstride, short *b0, short *b1, intptr_t sstride0, intptr_t sstride1)
+{
+    for (int y = 0; y < by; y++)
+    {
+        for (int x = 0; x < bx; x++)
+        {
+            a[x] = (short)ClipY(b0[x] + b1[x]);
+        }
+
+        b0 += sstride0;
+        b1 += sstride1;
+        a += dstride;
+    }
+}
 }  // end anonymous namespace
 
 namespace x265 {
@@ -738,5 +752,6 @@
     p.weightpUni = weightUnidir;
 
     p.pixelsubsp = pixelsub_sp;
+    p.pixeladd   = pixeladd_ss;
 }
 }
diff -r 054d8c409569 -r c6c045e92727 source/common/primitives.h
--- a/source/common/primitives.h	Wed Jul 17 11:33:20 2013 +0530
+++ b/source/common/primitives.h	Wed Jul 17 13:06:06 2013 +0530
@@ -215,7 +215,8 @@
 typedef void (*dequant_t)(int bitDepth, const int* src, int* dst, int width, int height, int mcqp_miper, int mcqp_mirem, bool useScalingList, unsigned int trSizeLog2, int *dequantCoef);
 typedef uint32_t (*quant_t)(int *coef, int *quantCoeff, int *deltaU, int *qCoef, int qBits, int add, int numCoeff);
 typedef void (*weightpUni_t)(short *src, pixel *dst, int srcStride, int dstStride, int width, int height, int w0, int round, int shift, int offset, int bitDepth);
-typedef void (*pixelsub_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1); 
+typedef void (*pixelsub_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
+typedef void (*pixeladd_t)(int bx, int by, short *dst, intptr_t dstride, short *src0, short *src1, intptr_t sstride0, intptr_t sstride1);
 
 /* Define a structure containing function pointers to optimized encoder
  * primitives.  Each pointer can reference either an assembly routine,
@@ -267,6 +268,7 @@
 
     weightpUni_t    weightpUni;
     pixelsub_sp_t   pixelsubsp;
+    pixeladd_t      pixeladd;
 };
 
 /* This copy of the table is what gets used by the encoder.
diff -r 054d8c409569 -r c6c045e92727 source/common/vec/blockcopy.inc
--- a/source/common/vec/blockcopy.inc	Wed Jul 17 11:33:20 2013 +0530
+++ b/source/common/vec/blockcopy.inc	Wed Jul 17 13:06:06 2013 +0530
@@ -79,7 +79,7 @@
         }
     }
     else
-#endif
+#endif /* if INSTRSET >= 8 */
     if (!(aligncheck & 15))
     {
         // fast path, multiples of 16 pixel wide blocks
@@ -131,7 +131,7 @@
         }
     }
     else
-#endif
+#endif /* if INSTRSET >= 8 && 0 */
     if (!(aligncheck & 15))
     {
         // fast path, multiples of 16 pixel wide blocks
@@ -170,6 +170,7 @@
 void blockcopy_s_p(int bx, int by, short *dst, intptr_t dstride, uint8_t *src, intptr_t sstride)
 {
     size_t aligncheck = (size_t)dst | (size_t)src | bx | sstride | dstride;
+
 #if INSTRSET >= 8 && 0
     if (!(aligncheck & 31))
     {
@@ -189,7 +190,7 @@
         }
     }
     else
-#endif
+#endif /* if INSTRSET >= 8 && 0 */
     if (!(aligncheck & 15))
     {
         // fast path, multiples of 16 pixel wide blocks
@@ -292,6 +293,84 @@
     }
 }
 
+void pixeladd_ss(int bx, int by, short *dst, intptr_t dstride, short *src0, short *src1, intptr_t sstride0, intptr_t sstride1)
+{
+    size_t aligncheck = (size_t)dst | (size_t)src0 | bx | sstride0 | dstride;
+
+#if INSTRSET >= 8 && 0
+    if (!(aligncheck & 31))
+    {
+        // fast path, multiples of 32 pixel wide blocks
+        // fast path, multiples of 16 pixel wide blocks
+        for (int y = 0; y < by; y++)
+        {
+            for (int x = 0; x < bx; x += 8)
+            {
+                Vec8s vecsrc0, vecsrc1, vecsum;
+                Vec8s zero(0), maxval((1 << 8) - 1); // Currently g_bitDepthY = 8 and g_bitDepthC = 8
+                vecsrc0.load_a(src0 + x);
+                vecsrc1.load_a(src1 + x);
+
+                vecsum = vecsrc0 + vecsrc1;
+                vecsum = max(vecsum, zero);
+                vecsum = min(vecsum, maxval);
+
+                vecsum.store(dst + x);
+            }
+
+            src0 += sstride0;
+            src1 += sstride1;
+            dst += dstride;
+        }
+    }
+    else
+#endif /* if INSTRSET >= 8 && 0 */
+    if (!(aligncheck & 15))
+    {
+        // fast path, multiples of 16 pixel wide blocks
+        for (int y = 0; y < by; y++)
+        {
+            for (int x = 0; x < bx; x += 8)
+            {
+                Vec8s vecsrc0, vecsrc1, vecsum;
+                Vec8s zero(0), maxval((1 << 8) - 1); // Currently g_bitDepthY = 8 and g_bitDepthC = 8
+                vecsrc0.load_a(src0 + x);
+                vecsrc1.load_a(src1 + x);
+
+                vecsum = add_saturated(vecsrc0, vecsrc1);
+                vecsum = max(vecsum, zero);
+                vecsum = min(vecsum, maxval);
+
+                vecsum.store(dst + x);
+            }
+
+            src0 += sstride0;
+            src1 += sstride1;
+            dst += dstride;
+        }
+    }
+    else
+    {
+        int tmp;
+        int max = (1 << 8) - 1;
+        // slow path, irregular memory alignments or sizes
+        for (int y = 0; y < by; y++)
+        {
+            for (int x = 0; x < bx; x++)
+            {
+                tmp = src0[x] + src1[x];
+                tmp = tmp < 0 ? 0 : tmp;
+                tmp = tmp > max ? max : tmp;
+                dst[x] = (short)tmp;
+            }
+
+            src0 += sstride0;
+            src1 += sstride1;
+            dst += dstride;
+        }
+    }
+}
+
 void Setup_Vec_BlockCopyPrimitives(EncoderPrimitives &p)
 {
 #if HIGH_BIT_DEPTH
@@ -307,5 +386,6 @@
     p.blockcpy_sp = blockcopy_s_p;
     p.blockcpy_sc = blockcopy_s_p;
     p.pixelsubsp = pixelsub_sp;
-#endif
+    p.pixeladd = pixeladd_ss;
+#endif /* if HIGH_BIT_DEPTH */
 }
diff -r 054d8c409569 -r c6c045e92727 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Wed Jul 17 11:33:20 2013 +0530
+++ b/source/test/pixelharness.cpp	Wed Jul 17 13:06:06 2013 +0530
@@ -376,6 +376,29 @@
     return true;
 }
 
+bool PixelHarness::check_pixeladd_sp(x265::pixeladd_t ref, x265::pixeladd_t opt)
+{
+    ALIGN_VAR_16(short, ref_dest[64 * 64]);
+    ALIGN_VAR_16(short, opt_dest[64 * 64]);
+    int bx = 64;
+    int by = 64;
+    int j = 0;
+    for (int i = 0; i <= 100; i++)
+    {
+        opt(bx, by, opt_dest, 64, (short*)pbuf2 + j, (short*)pbuf1 + j, 128, 128);
+        ref(bx, by, ref_dest, 64, (short*)pbuf2 + j, (short*)pbuf1 + j, 128, 128);
+
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(short)))
+            return false;
+
+        j += 4;
+        bx = 4 * ((rand() & 15) + 1);
+        by = 4 * ((rand() & 15) + 1);
+    }
+
+    return true;
+}
+
 bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     for (uint16_t curpar = 0; curpar < NUM_PARTITIONS; curpar++)
@@ -535,6 +558,14 @@
         }
     }
 
+    if (opt.pixeladd)
+    {
+        if (!check_pixeladd_sp(ref.pixeladd, opt.pixeladd))
+        {
+            printf("Pixel Add failed!\n");
+            return false;
+        }
+    }
     return true;
 }
 
@@ -649,4 +680,10 @@
         printf("Pixel Sub");
         REPORT_SPEEDUP(opt.pixelsubsp, ref.pixelsubsp, 64, 64, (short*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
     }
+
+    if (opt.pixeladd)
+    {
+        printf("Pixel Add");
+        REPORT_SPEEDUP(opt.pixeladd, ref.pixeladd, 64, 64, (short*)pbuf1, FENC_STRIDE, (short*)pbuf2, (short*)pbuf1, STRIDE, STRIDE);
+    }
 }
diff -r 054d8c409569 -r c6c045e92727 source/test/pixelharness.h
--- a/source/test/pixelharness.h	Wed Jul 17 11:33:20 2013 +0530
+++ b/source/test/pixelharness.h	Wed Jul 17 13:06:06 2013 +0530
@@ -48,6 +48,7 @@
     bool check_calcrecon(x265::calcrecon_t ref, x265::calcrecon_t opt);
     bool check_weightpUni(x265::weightpUni_t ref, x265::weightpUni_t opt);
     bool check_pixelsub_sp(x265::pixelsub_sp_t ref, x265::pixelsub_sp_t opt);
+    bool check_pixeladd_sp(x265::pixeladd_t ref, x265::pixeladd_t opt);
 
 public:
 


More information about the x265-devel mailing list