[x265] [PATCH] added calSign primitive, improved 2316.99 -> 233.63 (9.92x) over C code

praveen at multicorewareinc.com praveen at multicorewareinc.com
Tue Dec 30 15:08:08 CET 2014


# HG changeset patch
# User Praveen Tiwari
# Date 1419948480 -19800
# Node ID c107c70e154b18e50b7f29f08724a0d923859cfb
# Parent  10a16175a843d46df9c84523edce2d61d20761fa
added calSign primitive, improved 2316.99 -> 233.63 (9.92x) over C code

The calSign primitive will be used to optimize the various switch cases of the SAO algorithm.

diff -r 10a16175a843 -r c107c70e154b source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp	Tue Dec 23 13:07:09 2014 +0530
+++ b/source/common/loopfilter.cpp	Tue Dec 30 19:38:00 2014 +0530
@@ -28,6 +28,18 @@
 #define PIXEL_MIN 0
 #define PIXEL_MAX ((1 << X265_DEPTH) - 1)
 
+/* Branch-free sign of x: returns -1, 0 or +1 for negative, zero or positive x (TODO: this is a dup, make common) */
+inline int8_t signOf(int x)
+{
+    return (x >> 31) | ((int)((((uint32_t)-x)) >> 31)); // first term: -1 when x < 0 (assumes 32-bit int with arithmetic right shift); second term: 1 when x > 0
+}
+
+void calSign(int8_t *dst, pixel *src1, pixel *src2, int endX)
+{
+    for (int x = 0; x < endX; x++) // element-wise over the first endX entries of dst, src1 and src2
+        dst[x] = signOf(src1[x] - src2[x]); // -1, 0 or +1 as src1[x] is less than, equal to or greater than src2[x]
+}
+
 void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t signLeft)
 {
     int x;
@@ -49,5 +61,6 @@
 void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p)
 {
     p.saoCuOrgE0 = processSaoCUE0;
+    p.sign = calSign; // C implementation of the sign primitive (the reference the SSE4 routine is compared against in the test harness)
 }
 }
diff -r 10a16175a843 -r c107c70e154b source/common/primitives.h
--- a/source/common/primitives.h	Tue Dec 23 13:07:09 2014 +0530
+++ b/source/common/primitives.h	Tue Dec 30 19:38:00 2014 +0530
@@ -191,6 +191,7 @@
 typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
 
 typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
+typedef void (*sign_t)(int8_t *dst, pixel *src1, pixel *src2, int endX);
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
 
@@ -240,6 +241,7 @@
     ssim_4x4x2_core_t     ssim_4x4x2_core;
     ssim_end4_t           ssim_end_4;
 
+    sign_t                sign;
     saoCuOrgE0_t          saoCuOrgE0;
 
     downscale_t           frameInitLowres;
diff -r 10a16175a843 -r c107c70e154b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Dec 23 13:07:09 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Dec 30 19:38:00 2014 +0530
@@ -1642,6 +1642,7 @@
     }
     if (cpuMask & X265_CPU_SSE4)
     {
+        p.sign = x265_calSign_sse4;
         p.saoCuOrgE0 = x265_saoCuOrgE0_sse4;
 
         LUMA_ADDAVG(_sse4);
diff -r 10a16175a843 -r c107c70e154b source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Tue Dec 23 13:07:09 2014 +0530
+++ b/source/common/x86/loopfilter.asm	Tue Dec 30 19:38:00 2014 +0530
@@ -83,3 +83,33 @@
     sub         r2d, 16
     jnz        .loop
     RET
+
+;============================================================================================================
+; void calSign(int8_t *dst, pixel *src1, pixel *src2, int endX)
+;============================================================================================================
+INIT_XMM sse4
+cglobal calSign, 4, 5, 7
+    ; dst[x] = -1/0/+1 as src1[x] <, ==, > src2[x], 16 bytes per iteration; r0 = dst, r1 = src1, r2 = src2, r3 = endX
+    mov         r4,    16          ; r4 = number of bytes consumed/produced per loop iteration
+    mova        m1,    [pb_128]    ; constant defined outside this hunk; presumably 16 x 0x80 (sign-bit mask) - confirm
+    mova        m0,    [pb_1]      ; constant defined outside this hunk; presumably 16 x 0x01 - confirm
+    shr         r3d,   4           ; r3d = endX / 16 iterations. NOTE(review): endX % 16 trailing elements are not processed, and endX < 16 gives 0, which the bottom-tested loop below does not guard against
+.loop
+    movu        m2,    [r1]        ; m2 = src1[x .. x+15]
+    movu        m3,    [r2]        ; m3 = src2[x .. x+15]
+
+    pxor        m4,    m2,    m1   ; m4 = src1 ^ pb_128: with 0x80 bytes this flips the sign bit so the signed compare orders unsigned bytes
+    pxor        m5,    m3,    m1   ; m5 = src2 ^ pb_128, same bias
+    pcmpgtb     m6,    m4,    m5   ; m6 = 0xFF in each byte where src1 > src2, else 0
+    pcmpgtb     m5,    m4          ; m5 = 0xFF (i.e. -1 as int8_t) in each byte where src2 > src1, else 0
+    pand        m6,    m0          ; reduce the "greater" mask to +1 (given pb_1 holds 0x01 bytes)
+    por         m6,    m5          ; merge: the two masks are mutually exclusive, giving +1, -1 or 0 per byte
+
+    movu        [r0],  m6          ; dst[x .. x+15] = signs
+
+    add         r0,    r4          ; advance all three pointers by 16 bytes
+    add         r1,    r4
+    add         r2,    r4
+    dec         r3d                ; one 16-byte group done
+    jnz         .loop              ; repeat until the group counter reaches zero
+    RET
diff -r 10a16175a843 -r c107c70e154b source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Tue Dec 23 13:07:09 2014 +0530
+++ b/source/common/x86/loopfilter.h	Tue Dec 30 19:38:00 2014 +0530
@@ -25,5 +25,6 @@
 #define X265_LOOPFILTER_H
 
 void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t signLeft);
+void x265_calSign_sse4(int8_t *dst, pixel *src1, pixel *src2, int endX);
 
 #endif // ifndef X265_LOOPFILTER_H
diff -r 10a16175a843 -r c107c70e154b source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Tue Dec 23 13:07:09 2014 +0530
+++ b/source/test/pixelharness.cpp	Tue Dec 30 19:38:00 2014 +0530
@@ -856,6 +856,33 @@
     return true;
 }
 
+bool PixelHarness::check_calSign(sign_t ref, sign_t opt) // returns true when opt matches ref on every tested input
+{
+    ALIGN_VAR_16(int8_t, ref_dest[64 * 64]);
+    ALIGN_VAR_16(int8_t, opt_dest[64 * 64]);
+    // Pre-fill both outputs with the same byte so entries that neither call writes still compare equal
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+    int j = 0; // offset into pbuf2/pbuf3, advanced by INCR after each iteration
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        int width = 16 * (rand() % 4 + 1); // 16, 32, 48 or 64: only multiples of 16 are exercised
+
+        ref(ref_dest, pbuf2 + j, pbuf3 + j, width);
+        checked(opt, opt_dest, pbuf2 + j, pbuf3 + j, width);
+        // Compare the complete buffers, so bytes beyond `width` have to match as well
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int8_t)))
+            return false;
+
+        reportfail();
+        j += INCR;
+    }
+
+    return true;
+}
+
 bool PixelHarness::check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt)
 {
     ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
@@ -1386,6 +1413,15 @@
         }
     }
 
+    if (opt.sign)
+    {
+        if (!check_calSign(ref.sign, opt.sign))
+        {
+            printf("calSign failed\n");
+            return false;
+        }
+    }
+
     if (opt.saoCuOrgE0)
     {
         if (!check_saoCuOrgE0_t(ref.saoCuOrgE0, opt.saoCuOrgE0))
@@ -1712,6 +1748,12 @@
         REPORT_SPEEDUP(opt.ssim_end_4, ref.ssim_end_4, (int(*)[4])pbuf2, (int(*)[4])pbuf1, 4);
     }
 
+    if (opt.sign)
+    {
+        HEADER0("calSign");
+        REPORT_SPEEDUP(opt.sign, ref.sign, psbuf1, pbuf1, pbuf2, 64);
+    }
+
     if (opt.saoCuOrgE0)
     {
         HEADER0("SAO_EO_0");
diff -r 10a16175a843 -r c107c70e154b source/test/pixelharness.h
--- a/source/test/pixelharness.h	Tue Dec 23 13:07:09 2014 +0530
+++ b/source/test/pixelharness.h	Tue Dec 30 19:38:00 2014 +0530
@@ -93,6 +93,7 @@
     bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
     bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
     bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
+    bool check_calSign(sign_t ref, sign_t opt);
 
 public:
 


More information about the x265-devel mailing list