[x265] [PATCH] asm: saoCuOrgE1 asm code

nabajit at multicorewareinc.com nabajit at multicorewareinc.com
Wed Jan 7 09:15:49 CET 2015


# HG changeset patch
# User Nabajit Deka
# Date 1420618463 -19800
#      Wed Jan 07 13:44:23 2015 +0530
# Node ID 6cc757f662ed982a2f64122eba8e557d8ef0abba
# Parent  357ec738fb0ccaa678ab548629666b118f9f938f
asm: saoCuOrgE1 asm code

diff -r 357ec738fb0c -r 6cc757f662ed source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp	Tue Jan 06 15:39:58 2015 +0530
+++ b/source/common/loopfilter.cpp	Wed Jan 07 13:44:23 2015 +0530
@@ -57,6 +57,23 @@
     }
 }
 
+void processSaoCUE1(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width)
+{   // C reference for the saoCuOrgE1 primitive: for each of the `width` pixels of one row, adds an offsetEo[] entry chosen from the sign of (pixel - pixel one stride further on) and the sign stored in upBuff1[]; rec[] and upBuff1[] are both updated in place.
+    int x;
+    int8_t signDown;    // signOf(rec[x] - rec[x + stride]) for the current pixel
+    int edgeType;       // index into offsetEo[]
+
+    for (x = 0; x < width; x++)
+    {
+        signDown = signOf(rec[x] - rec[x + stride]);    // reads the pixel `stride` elements after rec[x], so that row must be readable
+        edgeType = signDown + upBuff1[x] + 2;           // +2 bias keeps the index non-negative (assumes signOf and upBuff1 values are in {-1, 0, 1} - TODO confirm)
+        upBuff1[x] = -signDown;                         // stored negated; the caller in sao.cpp passes the same upBuff1 for the next row after advancing rec by stride
+
+        short v = rec[x] + offsetEo[edgeType];
+        rec[x] = (pixel)(v < 0 ? 0 : (v > (PIXEL_MAX)) ? (PIXEL_MAX) : v);    // clamp to [0, PIXEL_MAX] before writing back
+    }
+}
+
 void processSaoCUE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int width, intptr_t stride)
 {
     int x;
@@ -94,6 +111,7 @@
 void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p)
 {
     p.saoCuOrgE0 = processSaoCUE0;
+    p.saoCuOrgE1 = processSaoCUE1;
     p.saoCuOrgE2 = processSaoCUE2;
     p.saoCuOrgB0 = processSaoCUB0;
     p.sign = calSign;
diff -r 357ec738fb0c -r 6cc757f662ed source/common/primitives.h
--- a/source/common/primitives.h	Tue Jan 06 15:39:58 2015 +0530
+++ b/source/common/primitives.h	Wed Jan 07 13:44:23 2015 +0530
@@ -191,6 +191,7 @@
 typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
 
 typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
+typedef void (*saoCuOrgE1_t)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
 typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
@@ -245,6 +246,7 @@
 
     sign_t                sign;
     saoCuOrgE0_t          saoCuOrgE0;
+    saoCuOrgE1_t          saoCuOrgE1;
     saoCuOrgE2_t          saoCuOrgE2;
     saoCuOrgB0_t          saoCuOrgB0;
 
diff -r 357ec738fb0c -r 6cc757f662ed source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jan 06 15:39:58 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Jan 07 13:44:23 2015 +0530
@@ -1650,6 +1650,7 @@
     {
         p.sign = x265_calSign_sse4;
         p.saoCuOrgE0 = x265_saoCuOrgE0_sse4;
+        p.saoCuOrgE1 = x265_saoCuOrgE1_sse4;
         p.saoCuOrgE2 = x265_saoCuOrgE2_sse4;
         p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
 
diff -r 357ec738fb0c -r 6cc757f662ed source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Tue Jan 06 15:39:58 2015 +0530
+++ b/source/common/x86/loopfilter.asm	Wed Jan 07 13:44:23 2015 +0530
@@ -86,6 +86,56 @@
     jnz        .loop
     RET
 
+;==================================================================================================
+; void saoCuOrgE1(pixel *pRec, int8_t *m_iUpBuff1, int8_t *m_iOffsetEo, intptr_t iStride, int iLcuWidth)
+;==================================================================================================
+INIT_XMM sse4                                  ; SSE4 version of the saoCuOrgE1 primitive: handles one row, 16 byte-sized pixels per loop iteration
+cglobal saoCuOrgE1, 3, 5, 8, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuWidth
+    mov         r3d, r3m                       ; r3 = iStride; NOTE(review): only a 32-bit load although the C prototype declares intptr_t - presumably fine for small positive strides, confirm
+    mov         r4d, r4m                       ; r4 = iLcuWidth
+    pxor        m0,    m0                      ; m0 = 0
+    movu        m6,    [pb_2]                  ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+    mova        m7,    [pb_128]                ; m7 = sign-bit mask used to turn the unsigned byte compare into a signed one
+    shr         r4d,   4                       ; r4 = width / 16 = loop count; loop is bottom-tested, so width must be >= 16 (sao.cpp only calls this when ctuWidth is a multiple of 16)
+    .loop
+         movu        m1,    [r0]                    ; m1 = pRec[x]
+         movu        m2,    [r0 + r3]               ; m2 = pRec[x + iStride]
+
+         pxor        m3,    m1,    m7               ; m3 = pRec[x] with sign bit flipped
+         pxor        m4,    m2,    m7               ; m4 = pRec[x + iStride] with sign bit flipped
+         pcmpgtb     m2,    m3,    m4               ; m2 = 0xFF where pRec[x] > pRec[x + iStride]
+         pcmpgtb     m4,    m3                      ; m4 = 0xFF (-1) where pRec[x + iStride] > pRec[x]
+         pand        m2,    [pb_1]                  ; m2 = 1 where pRec[x] is the larger one
+         por         m2,    m4                      ; m2 = signDown per byte: 1, 0 or -1
+
+         movu        m3,    [r1]                    ; m3 = m_iUpBuff1
+
+         paddb       m3,    m2                      ; m3 = signDown + m_iUpBuff1[x]
+         paddb       m3,    m6                      ; m3 = edgeType = signDown + m_iUpBuff1[x] + 2
+
+         movu        m4,    [r2]                    ; m4 = m_iOffsetEo
+         pshufb      m5,    m4,    m3               ; m5 = m_iOffsetEo[edgeType] per byte (table lookup); NOTE(review): the load above reads 16 bytes and does not change between iterations
+
+         psubb       m3,    m0,    m2               ; m3 = 0 - signDown
+         movu        [r1],  m3                      ; m_iUpBuff1[x] = -signDown
+
+         pmovzxbw    m2,    m1                      ; m2 = low 8 pixels zero-extended to words
+         punpckhbw   m1,    m0                      ; m1 = high 8 pixels zero-extended to words
+         pmovsxbw    m3,    m5                      ; m3 = low 8 offsets sign-extended to words
+         punpckhbw   m5,    m5                      ; duplicate each high offset byte into both halves of a word...
+         psraw       m5,    8                       ; ...then arithmetic shift: m5 = high 8 offsets sign-extended to words
+
+         paddw       m2,    m3                      ; low half: pixel + offset
+         paddw       m1,    m5                      ; high half: pixel + offset
+         packuswb    m2,    m1                      ; pack back to bytes with unsigned saturation (clamps to 0..255)
+         movu        [r0],  m2                      ; write the 16 filtered pixels back to pRec
+
+         add         r0,    16                      ; advance pRec by 16 pixels
+         add         r1,    16                      ; advance m_iUpBuff1 by 16 entries
+         dec         r4d
+         jnz         .loop
+    RET
+
 ;======================================================================================================================================================
 ; void saoCuOrgE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int lcuWidth, intptr_t stride)
 ;======================================================================================================================================================
diff -r 357ec738fb0c -r 6cc757f662ed source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Tue Jan 06 15:39:58 2015 +0530
+++ b/source/common/x86/loopfilter.h	Wed Jan 07 13:44:23 2015 +0530
@@ -26,6 +26,7 @@
 #define X265_LOOPFILTER_H
 
 void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t signLeft);
+void x265_saoCuOrgE1_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
 void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
 void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r 357ec738fb0c -r 6cc757f662ed source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Tue Jan 06 15:39:58 2015 +0530
+++ b/source/encoder/sao.cpp	Wed Jan 07 13:44:23 2015 +0530
@@ -326,26 +326,34 @@
         if (!tpely)
             rec += stride;
 
-        if (!(ctuWidth & 15))
-            primitives.sign(upBuff1, rec, tmpU, ctuWidth);
-        else
+        if (ctuWidth & 15)
         {
             for (x = 0; x < ctuWidth; x++)
                 upBuff1[x] = signOf(rec[x] - tmpU[x]);
+
+            for (y = startY; y < endY; y++)
+            {
+                for (x = 0; x < ctuWidth; x++)
+                {
+                    int8_t signDown = signOf(rec[x] - rec[x + stride]);
+                    int edgeType = signDown + upBuff1[x] + 2;
+                    upBuff1[x] = -signDown;
+
+                    rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
+                }
+
+                rec += stride;
+            }
         }
+        else
+        {
+            primitives.sign(upBuff1, rec, tmpU, ctuWidth);
 
-        for (y = startY; y < endY; y++)
-        {
-            for (x = 0; x < ctuWidth; x++)
+            for (y = startY; y < endY; y++)
             {
-                int8_t signDown = signOf(rec[x] - rec[x + stride]);
-                int edgeType = signDown + upBuff1[x] + 2;
-                upBuff1[x] = -signDown;
-
-                rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
+                primitives.saoCuOrgE1(rec, upBuff1, m_offsetEo, stride, ctuWidth);
+                rec += stride;
             }
-
-            rec += stride;
         }
 
         break;
diff -r 357ec738fb0c -r 6cc757f662ed source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Tue Jan 06 15:39:58 2015 +0530
+++ b/source/test/pixelharness.cpp	Wed Jan 07 13:44:23 2015 +0530
@@ -66,7 +66,7 @@
         sbuf2[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
         ibuf1[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1;
         psbuf1[i] = psbuf4[i] = (rand() % 65) - 32;                   // range is between -32 to 32
-        psbuf2[i] = (rand() % 3) - 1;
+        psbuf2[i] = psbuf5[i] = (rand() % 3) - 1;                     // possible values {-1,0,1}
         psbuf3[i] = (rand() % 129) - 128;
         sbuf3[i] = rand() % PIXEL_MAX; // for blockcopy only
     }
@@ -919,6 +919,34 @@
     return true;
 }
 
+bool PixelHarness::check_saoCuOrgE1_t(saoCuOrgE1_t ref, saoCuOrgE1_t opt)
+{   // Runs ref and opt for ITERS rounds on identically initialised data; returns false as soon as the pixel output or the rewritten sign buffers differ, true otherwise.
+    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
+    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+
+    memset(ref_dest, 0xCD, sizeof(ref_dest));    // both pixel buffers start from the same fill pattern and are not reset between rounds
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+    int j = 0;    // offset into the shared input buffers, advanced by INCR after every round
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        int width = 16 * (rand() % 4 + 1);    // 16, 32, 48 or 64 - always a multiple of 16
+        int stride = width + 1;
+
+        ref(ref_dest, psbuf2 + j, psbuf1 + j, stride, width);    // psbuf2 and psbuf5 are filled with identical values at buffer init, so both calls get the same upBuff1 input
+        checked(opt, opt_dest, psbuf5 + j, psbuf1 + j, stride, width);
+
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)) || memcmp(psbuf2, psbuf5, BUFFSIZE))    // both in/out arguments are compared: the pixels and the sign buffer
+            return false;
+
+        reportfail();
+        j += INCR;
+    }
+
+    return true;
+}
+
 bool PixelHarness::check_saoCuOrgE2_t(saoCuOrgE2_t ref, saoCuOrgE2_t opt)
 {
     ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
@@ -1496,6 +1524,15 @@
         }
     }
 
+    if (opt.saoCuOrgE1)
+    {
+        if (!check_saoCuOrgE1_t(ref.saoCuOrgE1, opt.saoCuOrgE1))
+        {
+            printf("SAO_EO_1 failed\n");
+            return false;
+        }
+    }
+
     if (opt.saoCuOrgE2)
     {
         if (!check_saoCuOrgE2_t(ref.saoCuOrgE2, opt.saoCuOrgE2))
@@ -1843,6 +1880,12 @@
         REPORT_SPEEDUP(opt.saoCuOrgE0, ref.saoCuOrgE0, pbuf1, psbuf1, 64, 1);
     }
 
+    if (opt.saoCuOrgE1)
+    {
+        HEADER0("SAO_EO_1");
+        REPORT_SPEEDUP(opt.saoCuOrgE1, ref.saoCuOrgE1, pbuf1, psbuf2, psbuf1, 64, 64);
+    }
+
     if (opt.saoCuOrgE2)
     {
         HEADER0("SAO_EO_2");
diff -r 357ec738fb0c -r 6cc757f662ed source/test/pixelharness.h
--- a/source/test/pixelharness.h	Tue Jan 06 15:39:58 2015 +0530
+++ b/source/test/pixelharness.h	Wed Jan 07 13:44:23 2015 +0530
@@ -50,6 +50,7 @@
     int8_t   psbuf2[BUFFSIZE];
     int8_t   psbuf3[BUFFSIZE];
     int8_t   psbuf4[BUFFSIZE];
+    int8_t   psbuf5[BUFFSIZE];
 
     int16_t  sbuf1[BUFFSIZE];
     int16_t  sbuf2[BUFFSIZE];
@@ -93,6 +94,7 @@
     bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
     bool check_addAvg(addAvg_t, addAvg_t);
     bool check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt);
+    bool check_saoCuOrgE1_t(saoCuOrgE1_t ref, saoCuOrgE1_t opt);
     bool check_saoCuOrgE2_t(saoCuOrgE2_t ref, saoCuOrgE2_t opt);
     bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
     bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);


More information about the x265-devel mailing list