[x265] [PATCH] sao: add C and sse4 code of saoCuOrgE1 to process 2 rows

Divya Manivannan divya at multicorewareinc.com
Wed Apr 8 06:26:41 CEST 2015


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1428406902 -19800
#      Tue Apr 07 17:11:42 2015 +0530
# Node ID 7044924d68147152533fe1502df2c75a3512befb
# Parent  3e416dec8024b8339b18568cf65e48eb3448bed1
sao: add C and sse4 code of saoCuOrgE1 to process 2 rows

diff -r 3e416dec8024 -r 7044924d6814 source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp	Tue Apr 07 16:00:39 2015 -0500
+++ b/source/common/loopfilter.cpp	Tue Apr 07 17:11:42 2015 +0530
@@ -77,6 +77,25 @@
     }
 }
 
+void processSaoCUE1_2Rows(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width)
+{   // Applies offsetEo in place to two consecutive rows of rec; upBuff1[x] is read and rewritten for every column.
+    int x, y;
+    int8_t signDown;    // signOf(current pixel - pixel one stride further on)
+    int edgeType;       // index into offsetEo
+
+    for (y = 0; y < 2; y++)     // exactly two rows per call, first row at rec
+    {
+        for (x = 0; x < width; x++)
+        {
+            signDown = signOf(rec[x] - rec[x + stride]);        // the row at +stride has not been modified yet at this point
+            edgeType = signDown + upBuff1[x] + 2;               // presumably 0..4, if signOf and upBuff1 only yield -1/0/+1 (verify against callers)
+            upBuff1[x] = -signDown;                             // negated: seen from the next row, this is its comparison with the row before it
+            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);    // add the selected offset and clip via x265_clip
+        }
+        rec += stride;  // move to the second row; upBuff1 now holds the signs that row needs
+    }
+}
+
 void processSaoCUE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int width, intptr_t stride)
 {
     int x;
@@ -124,6 +143,7 @@
 {
     p.saoCuOrgE0 = processSaoCUE0;
     p.saoCuOrgE1 = processSaoCUE1;
+    p.saoCuOrgE1_2Rows = processSaoCUE1_2Rows;
     p.saoCuOrgE2 = processSaoCUE2;
     p.saoCuOrgE3 = processSaoCUE3;
     p.saoCuOrgB0 = processSaoCUB0;
diff -r 3e416dec8024 -r 7044924d6814 source/common/primitives.h
--- a/source/common/primitives.h	Tue Apr 07 16:00:39 2015 -0500
+++ b/source/common/primitives.h	Tue Apr 07 17:11:42 2015 +0530
@@ -274,7 +274,7 @@
 
     sign_t                sign;
     saoCuOrgE0_t          saoCuOrgE0;
-    saoCuOrgE1_t          saoCuOrgE1;
+    saoCuOrgE1_t          saoCuOrgE1, saoCuOrgE1_2Rows;
     saoCuOrgE2_t          saoCuOrgE2;
     saoCuOrgE3_t          saoCuOrgE3;
     saoCuOrgB0_t          saoCuOrgB0;
diff -r 3e416dec8024 -r 7044924d6814 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Apr 07 16:00:39 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Tue Apr 07 17:11:42 2015 +0530
@@ -1374,6 +1374,7 @@
         p.sign = x265_calSign_sse4;
         p.saoCuOrgE0 = x265_saoCuOrgE0_sse4;
         p.saoCuOrgE1 = x265_saoCuOrgE1_sse4;
+        p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_sse4;
         p.saoCuOrgE2 = x265_saoCuOrgE2_sse4;
         p.saoCuOrgE3 = x265_saoCuOrgE3_sse4;
         p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
diff -r 3e416dec8024 -r 7044924d6814 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Tue Apr 07 16:00:39 2015 -0500
+++ b/source/common/x86/loopfilter.asm	Tue Apr 07 17:11:42 2015 +0530
@@ -191,46 +191,121 @@
     mov         r3d, r3m
     mov         r4d, r4m
     pxor        m0,    m0                      ; m0 = 0
-    movu        m6,    [pb_2]                  ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+    mova        m6,    [pb_2]                  ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
     mova        m7,    [pb_128]
     shr         r4d,   4
-    .loop
-         movu        m1,    [r0]                    ; m1 = pRec[x]
-         movu        m2,    [r0 + r3]               ; m2 = pRec[x + iStride]
+.loop
+    movu        m1,    [r0]                    ; m1 = pRec[x]
+    movu        m2,    [r0 + r3]               ; m2 = pRec[x + iStride]
 
-         pxor        m3,    m1,    m7
-         pxor        m4,    m2,    m7
-         pcmpgtb     m2,    m3,    m4
-         pcmpgtb     m4,    m3
-         pand        m2,    [pb_1]
-         por         m2,    m4
+    pxor        m3,    m1,    m7
+    pxor        m4,    m2,    m7
+    pcmpgtb     m2,    m3,    m4
+    pcmpgtb     m4,    m3
+    pand        m2,    [pb_1]
+    por         m2,    m4
 
-         movu        m3,    [r1]                    ; m3 = m_iUpBuff1
+    movu        m3,    [r1]                    ; m3 = m_iUpBuff1
 
-         paddb       m3,    m2
-         paddb       m3,    m6
+    paddb       m3,    m2
+    paddb       m3,    m6
 
-         movu        m4,    [r2]                    ; m4 = m_iOffsetEo
-         pshufb      m5,    m4,    m3
+    movu        m4,    [r2]                    ; m4 = m_iOffsetEo
+    pshufb      m5,    m4,    m3
 
-         psubb       m3,    m0,    m2
-         movu        [r1],  m3
+    psubb       m3,    m0,    m2
+    movu        [r1],  m3
 
-         pmovzxbw    m2,    m1
-         punpckhbw   m1,    m0
-         pmovsxbw    m3,    m5
-         punpckhbw   m5,    m5
-         psraw       m5,    8
+    pmovzxbw    m2,    m1
+    punpckhbw   m1,    m0
+    pmovsxbw    m3,    m5
+    punpckhbw   m5,    m5
+    psraw       m5,    8
 
-         paddw       m2,    m3
-         paddw       m1,    m5
-         packuswb    m2,    m1
-         movu        [r0],  m2
+    paddw       m2,    m3
+    paddw       m1,    m5
+    packuswb    m2,    m1
+    movu        [r0],  m2
 
-         add         r0,    16
-         add         r1,    16
-         dec         r4d
-         jnz         .loop
+    add         r0,    16
+    add         r1,    16
+    dec         r4d
+    jnz         .loop
+    RET
+
+;========================================================================================================
+; void saoCuOrgE1_2Rows(pixel *pRec, int8_t *m_iUpBuff1, int8_t *m_iOffsetEo, intptr_t iStride, int iLcuWidth)
+;========================================================================================================
+INIT_XMM sse4
+cglobal saoCuOrgE1_2Rows, 3, 5, 8, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuWidth ; x86inc: 3 args auto-loaded, 5 GPRs, 8 XMM regs
+    mov         r3d,        r3m                     ; r3 = iStride, loaded by hand; NOTE(review): only the low 32 bits are read - confirm the stride is never negative
+    mov         r4d,        r4m                     ; r4d = iLcuWidth, loaded by hand
+    pxor        m0,         m0                      ; m0 = 0
+    mova        m7,         [pb_128]                ; m7 = pb_128 (presumably 0x80 in every byte): xor bias so the signed pcmpgtb orders unsigned pixels
+    shr         r4d,        4                       ; r4d = number of 16-pixel columns; assumes iLcuWidth >= 16 and a multiple of 16 - TODO confirm
+.loop                                               ; each iteration handles 16 pixels of two rows
+    movu        m1,         [r0]                    ; m1 = row 0: pRec[x]
+    movu        m2,         [r0 + r3]               ; m2 = row 1: pRec[x + iStride]
+
+    pxor        m3,         m1,         m7          ; m3 = row 0, biased
+    pxor        m4,         m2,         m7          ; m4 = row 1, biased
+    pcmpgtb     m6,         m3,         m4          ; m6 = 0xFF where row 0 > row 1
+    pcmpgtb     m5,         m4,         m3          ; m5 = 0xFF (-1) where row 1 > row 0
+    pand        m6,         [pb_1]                  ; m6 = +1 where row 0 > row 1 (pb_1 presumably 1 in every byte)
+    por         m6,         m5                      ; m6 = sign(row 0 - row 1): +1 / 0 / -1 per byte
+
+    movu        m5,         [r0 + r3 * 2]           ; m5 = row 2: pRec[x + 2 * iStride], only read, never written
+    pxor        m3,         m5,         m7          ; m3 = row 2, biased
+    pcmpgtb     m5,         m4,         m3          ; m5 = 0xFF where row 1 > row 2
+    pcmpgtb     m3,         m4                      ; m3 = 0xFF (-1) where row 2 > row 1
+    pand        m5,         [pb_1]                  ; m5 = +1 where row 1 > row 2
+    por         m5,         m3                      ; m5 = sign(row 1 - row 2)
+
+    movu        m3,         [r1]                    ; m3 = m_iUpBuff1 as passed in by the caller
+    paddb       m3,         m6                      ; + sign(row 0 - row 1)
+    paddb       m3,         [pb_2]                  ; m3 = edge type for row 0 (sign + m_iUpBuff1 + 2)
+
+    movu        m4,         [r2]                    ; m4 = m_iOffsetEo
+    pshufb      m4,         m3                      ; per-byte table lookup: m4 = m_iOffsetEo[edge type]
+
+    psubb       m3,         m0,         m6          ; m3 = -sign(row 0 - row 1)
+    movu        [r1],       m3                      ; m_iUpBuff1 updated for row 1; read back below because m3 is reused
+
+    pmovzxbw    m6,         m1                      ; m6 = low 8 pixels of row 0, zero-extended to words
+    punpckhbw   m1,         m0                      ; m1 = high 8 pixels of row 0, zero-extended to words
+    pmovsxbw    m3,         m4                      ; m3 = low 8 offsets, sign-extended to words
+    punpckhbw   m4,         m4                      ; high 8 offsets duplicated into both bytes of each word...
+    psraw       m4,         8                       ; ...arithmetic shift leaves them sign-extended
+
+    paddw       m6,         m3                      ; pixel + offset, low half
+    paddw       m1,         m4                      ; pixel + offset, high half
+    packuswb    m6,         m1                      ; repack to bytes with unsigned saturation (0..255)
+    movu        [r0],       m6                      ; write row 0 back
+
+    movu        m3,         [r1]                    ; m3 = m_iUpBuff1 = -sign(row 0 - row 1) stored above
+    paddb       m3,         m5                      ; + sign(row 1 - row 2)
+    paddb       m3,         [pb_2]                  ; m3 = edge type for row 1
+
+    movu        m4,         [r2]                    ; m4 = m_iOffsetEo
+    pshufb      m4,         m3                      ; m4 = m_iOffsetEo[edge type] for row 1
+    psubb       m3,         m0,         m5          ; m3 = -sign(row 1 - row 2)
+    movu        [r1],       m3                      ; m_iUpBuff1 as left for whatever processes row 2
+
+    pmovzxbw    m5,         m2                      ; m5 = low 8 pixels of row 1 (m2 still holds the values loaded at loop start)
+    punpckhbw   m2,         m0                      ; m2 = high 8 pixels of row 1, zero-extended to words
+    pmovsxbw    m3,         m4                      ; m3 = low 8 offsets, sign-extended to words
+    punpckhbw   m4,         m4                      ; high 8 offsets duplicated into both bytes of each word...
+    psraw       m4,         8                       ; ...arithmetic shift leaves them sign-extended
+
+    paddw       m5,         m3                      ; pixel + offset, low half
+    paddw       m2,         m4                      ; pixel + offset, high half
+    packuswb    m5,         m2                      ; repack to bytes with unsigned saturation (0..255)
+    movu        [r0 + r3],  m5                      ; write row 1 back
+
+    add         r0,         16                      ; next 16 pixels
+    add         r1,         16                      ; next 16 m_iUpBuff1 entries
+    dec         r4d                                 ; one 16-pixel column done
+    jnz         .loop                               ; repeat until the column count reaches zero
     RET
 
 ;======================================================================================================================================================
diff -r 3e416dec8024 -r 7044924d6814 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Tue Apr 07 16:00:39 2015 -0500
+++ b/source/common/x86/loopfilter.h	Tue Apr 07 17:11:42 2015 +0530
@@ -28,6 +28,7 @@
 void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride);
 void x265_saoCuOrgE0_avx2(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride);
 void x265_saoCuOrgE1_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
+void x265_saoCuOrgE1_2Rows_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
 void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
 void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
 void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
diff -r 3e416dec8024 -r 7044924d6814 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Tue Apr 07 16:00:39 2015 -0500
+++ b/source/encoder/sao.cpp	Tue Apr 07 17:11:42 2015 +0530
@@ -367,11 +367,14 @@
         {
             primitives.sign(upBuff1, rec, tmpU, ctuWidth);
 
-            for (y = startY; y < endY; y++)
+            int diff = (endY - startY) % 2;
+            for (y = startY; y < endY - diff; y += 2)
             {
+                primitives.saoCuOrgE1_2Rows(rec, upBuff1, m_offsetEo, stride, ctuWidth);
+                rec += 2 * stride;
+            }
+            if (diff & 1)
                 primitives.saoCuOrgE1(rec, upBuff1, m_offsetEo, stride, ctuWidth);
-                rec += stride;
-            }
         }
 
         break;
diff -r 3e416dec8024 -r 7044924d6814 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Tue Apr 07 16:00:39 2015 -0500
+++ b/source/test/pixelharness.cpp	Tue Apr 07 17:11:42 2015 +0530
@@ -1687,6 +1687,15 @@
         }
     }
 
+    if (opt.saoCuOrgE1_2Rows)
+    {
+        if (!check_saoCuOrgE1_t(ref.saoCuOrgE1_2Rows, opt.saoCuOrgE1_2Rows))
+        {
+            printf("SAO_EO_1_2Rows failed\n");
+            return false;
+        }
+    }
+
     if (opt.saoCuOrgE2)
     {
         if (!check_saoCuOrgE2_t(ref.saoCuOrgE2, opt.saoCuOrgE2))
@@ -2065,6 +2074,12 @@
         REPORT_SPEEDUP(opt.saoCuOrgE1, ref.saoCuOrgE1, pbuf1, psbuf2, psbuf1, 64, 64);
     }
 
+    if (opt.saoCuOrgE1_2Rows)
+    {
+        HEADER0("SAO_EO_1_2Rows");
+        REPORT_SPEEDUP(opt.saoCuOrgE1_2Rows, ref.saoCuOrgE1_2Rows, pbuf1, psbuf2, psbuf1, 64, 64);
+    }
+
     if (opt.saoCuOrgE2)
     {
         HEADER0("SAO_EO_2");


More information about the x265-devel mailing list